diff --git a/.bazelrc b/.bazelrc index 8c2be7ea8586..c71651133b52 100644 --- a/.bazelrc +++ b/.bazelrc @@ -1,12 +1,11 @@ # Must be first. Enables build:windows, build:linux, build:macos, build:freebsd, build:openbsd build --enable_platform_specific_config -build:linux --workspace_status_command="bash ./bazel/workspace_status.sh" - -# Provides users an option to turn on strict action env. -# TODO(aslonnie): make this default; fix the python tests.. +build --incompatible_strict_action_env build:strict --incompatible_strict_action_env +build:linux --workspace_status_command="bash ./bazel/workspace_status.sh" + # To distinguish different incompatible environments. build --action_env=RAY_BUILD_ENV @@ -81,8 +80,6 @@ build:iwyu --output_groups=report build:windows --attempt_to_print_relative_paths # Save disk space by hardlinking cache hits instead of copying build:windows --experimental_repository_cache_hardlinks -# Clean the environment before building, to make builds more deterministic -build:windows --incompatible_strict_action_env # For colored output (seems necessary on Windows) build:windows --color=yes # For compiler colored output (seems necessary on Windows) @@ -165,6 +162,18 @@ test:ci-base --test_output=errors test:ci-base --test_verbose_timeout_warnings test:ci-base --flaky_test_attempts=3 +# Sending in PATH is required for tests to run on CI, after we enable +# --incompatible_strict_action_env, until we either convert all Python tests to +# hermetic tests -- which not only requires pinning all Python dependencies with bazel, +# but also requires building ray(test) wheel with bazel. Alternatively, we can +# also stop using bazel test to run ray's Python tests. +# +# This PATH test_env is intentionally not enabled on non-CI so that C/C++ +# tests, which are all hermetic, can build, test and cache as intended, ray +# Python developers do not really use bazel test to run tests locally, but more +# often just run tests with "pytest" directly. 
+test:ci-base --test_env=PATH + build:ci --color=yes build:ci --curses=no build:ci --keep_going @@ -220,6 +229,9 @@ build:cgroup --sandbox_writable_path=/sys/fs/cgroup --config=llvm # ci/env/install-llvm-dependencies.sh try-import %workspace%/.llvm-local.bazelrc +# Allow users to define custom options. +try-import %workspace%/.user.bazelrc + # Even with sandbox mode bazel prioritizes system headers over the ones in the sandbox. # It picks up the system headers when someone has protobuf installed via Homebrew. # Work around for https://github.com/bazelbuild/bazel/issues/8053 diff --git a/.buildkite/_forge.rayci.yml b/.buildkite/_forge.rayci.yml index fcc4a3e770d9..69c066c7bca2 100644 --- a/.buildkite/_forge.rayci.yml +++ b/.buildkite/_forge.rayci.yml @@ -1,101 +1,8 @@ group: forge +sort_key: "_forge" steps: - name: forge wanda: ci/docker/forge.wanda.yaml - name: manylinux wanda: ci/docker/manylinux.wanda.yaml - - - name: raycudabase - label: "wanda: ray.py{{matrix.python}}.cu{{matrix.cuda}}.base" - tags: - - python_dependencies - - docker - wanda: ci/docker/ray.cuda.base.wanda.yaml - matrix: - setup: - python: - - "3.9" - - "3.10" - - "3.11" - - "3.12" - cuda: - - "11.7.1-cudnn8" - - "11.8.0-cudnn8" - - "12.1.1-cudnn8" - - "12.3.2-cudnn9" - - "12.4.1-cudnn" - - "12.5.1-cudnn" - - "12.6.3-cudnn" - - "12.8.1-cudnn" - env: - PYTHON_VERSION: "{{matrix.python}}" - CUDA_VERSION: "{{matrix.cuda}}" - - - - name: raycpubase - label: "wanda: ray.py{{matrix}}.cpu.base" - tags: - - python_dependencies - - python - - docker - - tune - - serve - wanda: ci/docker/ray.cpu.base.wanda.yaml - matrix: - - "3.9" - - "3.10" - - "3.11" - - "3.12" - env: - PYTHON_VERSION: "{{matrix}}" - - - name: ray-llmbase - label: "wanda: ray-llm.py{{matrix.python}}.cu{{matrix.cuda}}.base" - tags: - - python_dependencies - - docker - wanda: ci/docker/ray-llm.base.wanda.yaml - depends_on: raycudabase - matrix: - setup: - python: - - "3.11" - cuda: - - "12.8.1-cudnn" - env: - PYTHON_VERSION: 
"{{matrix.python}}" - CUDA_VERSION: "{{matrix.cuda}}" - - - name: ray-mlcudabase - label: "wanda: ray-ml.py{{matrix.python}}.cu{{matrix.cuda}}.base" - tags: - - python_dependencies - - docker - wanda: ci/docker/ray-ml.cuda.base.wanda.yaml - depends_on: raycudabase - matrix: - setup: - python: - - "3.9" - - "3.10" - - "3.11" - cuda: - - "12.1.1-cudnn8" - env: - PYTHON_VERSION: "{{matrix.python}}" - CUDA_VERSION: "{{matrix.cuda}}" - - - name: ray-mlcpubase - label: "wanda: ray-ml.py{{matrix}}.cpu.base" - tags: - - python_dependencies - - docker - wanda: ci/docker/ray-ml.cpu.base.wanda.yaml - depends_on: raycpubase - matrix: - - "3.9" - - "3.10" - - "3.11" - env: - PYTHON_VERSION: "{{matrix}}" diff --git a/.buildkite/_images.rayci.yml b/.buildkite/_images.rayci.yml new file mode 100644 index 000000000000..caa0b158f52d --- /dev/null +++ b/.buildkite/_images.rayci.yml @@ -0,0 +1,227 @@ +group: images +sort_key: "_images" +steps: + - name: raycpubase + label: "wanda: ray-py{{matrix}}-cpu-base" + tags: + - python_dependencies + - docker + wanda: docker/base-deps/cpu.wanda.yaml + matrix: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + env: + PYTHON_VERSION: "{{matrix}}" + ARCH_SUFFIX: "" + + - name: raycpubaseextra + label: "wanda: ray-py{{matrix}}-cpu-base-extra" + wanda: docker/base-extra/cpu.wanda.yaml + matrix: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + env: + PYTHON_VERSION: "{{matrix}}" + IMAGE_TYPE: "ray" + ARCH_SUFFIX: "" + depends_on: raycpubase + + - name: raycudabase + label: "wanda: ray-py{{matrix.python}}-cu{{matrix.cuda}}-base" + tags: + - python_dependencies + - docker + wanda: docker/base-deps/cuda.wanda.yaml + matrix: + setup: + python: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + cuda: + - "11.7.1-cudnn8" + - "11.8.0-cudnn8" + - "12.1.1-cudnn8" + - "12.3.2-cudnn9" + - "12.4.1-cudnn" + - "12.5.1-cudnn" + - "12.6.3-cudnn" + - "12.8.1-cudnn" + env: + PYTHON_VERSION: "{{matrix.python}}" + CUDA_VERSION: "{{matrix.cuda}}" + ARCH_SUFFIX: "" + + - name: 
raycudabaseextra + label: "wanda: ray-py{{matrix.python}}-cu{{matrix.cuda}}-base-extra" + wanda: docker/base-extra/cuda.wanda.yaml + matrix: + setup: + python: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + cuda: + - "11.7.1-cudnn8" + - "11.8.0-cudnn8" + - "12.1.1-cudnn8" + - "12.3.2-cudnn9" + - "12.4.1-cudnn" + - "12.5.1-cudnn" + - "12.6.3-cudnn" + - "12.8.1-cudnn" + env: + PYTHON_VERSION: "{{matrix.python}}" + CUDA_VERSION: "{{matrix.cuda}}" + IMAGE_TYPE: "ray" + ARCH_SUFFIX: "" + depends_on: raycudabase + + - name: ray-llmbase + label: "wanda: ray-llm-py{{matrix.python}}-cu{{matrix.cuda}}-base" + tags: + - python_dependencies + - docker + wanda: docker/ray-llm/cuda.wanda.yaml + depends_on: raycudabase + matrix: + setup: + python: + - "3.11" + cuda: + - "12.8.1-cudnn" + env: + PYTHON_VERSION: "{{matrix.python}}" + CUDA_VERSION: "{{matrix.cuda}}" + + - name: ray-llmbaseextra + label: "wanda: ray-llm-py{{matrix.python}}-cu{{matrix.cuda}}-base-extra" + wanda: docker/base-extra/cuda.wanda.yaml + matrix: + setup: + python: + - "3.11" + cuda: + - "12.8.1-cudnn" + env: + PYTHON_VERSION: "{{matrix.python}}" + CUDA_VERSION: "{{matrix.cuda}}" + IMAGE_TYPE: "ray-llm" + ARCH_SUFFIX: "" + depends_on: ray-llmbase + + - name: ray-mlcpubase + label: "wanda: ray-ml-py{{matrix}}-cpu-base" + tags: + - python_dependencies + - docker + wanda: docker/ray-ml/cpu.wanda.yaml + depends_on: raycpubase + matrix: + - "3.9" + - "3.10" + - "3.11" + env: + PYTHON_VERSION: "{{matrix}}" + + - name: ray-mlcpubaseextra + label: "wanda: ray-ml-py{{matrix}}-cpu-base-extra" + wanda: docker/base-extra/cpu.wanda.yaml + matrix: + - "3.9" + - "3.10" + - "3.11" + env: + PYTHON_VERSION: "{{matrix}}" + IMAGE_TYPE: "ray-ml" + ARCH_SUFFIX: "" + depends_on: ray-mlcpubase + + - name: ray-mlcudabase + label: "wanda: ray-ml-py{{matrix.python}}-cu{{matrix.cuda}}-base" + tags: + - python_dependencies + - docker + wanda: docker/ray-ml/cuda.wanda.yaml + depends_on: raycudabase + matrix: + setup: + python: + - "3.9" + - 
"3.10" + - "3.11" + cuda: + - "12.1.1-cudnn8" + env: + PYTHON_VERSION: "{{matrix.python}}" + CUDA_VERSION: "{{matrix.cuda}}" + + - name: ray-mlcudabaseextra + label: "wanda: ray-ml-py{{matrix.python}}-cu{{matrix.cuda}}-base-extra" + wanda: docker/base-extra/cuda.wanda.yaml + matrix: + setup: + python: + - "3.9" + - "3.10" + - "3.11" + cuda: + - "12.1.1-cudnn8" + env: + PYTHON_VERSION: "{{matrix.python}}" + CUDA_VERSION: "{{matrix.cuda}}" + IMAGE_TYPE: "ray-ml" + ARCH_SUFFIX: "" + depends_on: ray-mlcudabase + + - name: ray-slimcpubase + label: "wanda: ray-slim-py{{matrix}}-cpu-base" + tags: + - python_dependencies + - docker + - skip-on-release-tests + wanda: docker/base-slim/cpu.wanda.yaml + depends_on: raycpubase + matrix: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + env: + PYTHON_VERSION: "{{matrix}}" + ARCH_SUFFIX: "" + + - name: ray-slimcudabase + label: "wanda: ray-slim-py{{matrix.python}}-cu{{matrix.cuda}}-base" + tags: + - python_dependencies + - docker + - skip-on-release-tests + wanda: docker/base-slim/cuda.wanda.yaml + depends_on: raycudabase + matrix: + setup: + python: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + cuda: + - "11.7.1" + - "11.8.0" + - "12.1.1" + - "12.3.2" + - "12.4.1" + - "12.5.1" + - "12.6.3" + - "12.8.1" + env: + PYTHON_VERSION: "{{matrix.python}}" + CUDA_VERSION: "{{matrix.cuda}}" + ARCH_SUFFIX: "" diff --git a/.buildkite/build.rayci.yml b/.buildkite/build.rayci.yml index dcd69269f6a7..01256a3b63c8 100644 --- a/.buildkite/build.rayci.yml +++ b/.buildkite/build.rayci.yml @@ -19,17 +19,6 @@ steps: - manylinux - forge - - label: ":tapioca: build: debug wheel" - tags: - - linux_wheels - - oss - instance_type: large - commands: - - bazel run //ci/ray_ci:build_in_docker -- wheel --build-type debug --upload - depends_on: - - manylinux - - forge - - label: ":tapioca: build: jar" key: java_wheels tags: diff --git a/.buildkite/core.rayci.yml b/.buildkite/core.rayci.yml index 09542fe0a8f3..71ed4084c0e2 100644 --- a/.buildkite/core.rayci.yml +++ 
b/.buildkite/core.rayci.yml @@ -60,19 +60,7 @@ steps: - bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... //python/ray/_common/tests/... //python/ray/dag/... //python/ray/autoscaler/v2/... core --install-mask all-ray-libraries --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3 - --except-tags debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,tmpfs,container,manual,multi_gpu,spark_on_ray,ray_client,compiled_graphs,dask - --install-mask all-ray-libraries - - - label: ":ray: core: cgraph python tests" - tags: - - compiled_graphs - instance_type: large - commands: - - bazel run //ci/ray_ci:test_in_docker -- //python/ray/dag/... core - --install-mask all-ray-libraries - --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3 - --only-tags compiled_graphs - --except-tags multi_gpu + --except-tags debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,tmpfs,runtime_env_container,manual,multi_gpu,spark_on_ray,ray_client,dask --install-mask all-ray-libraries - label: ":ray: core: python {{matrix.python}} tests ({{matrix.worker_id}})" @@ -86,7 +74,7 @@ steps: --install-mask all-ray-libraries --workers 4 --worker-id "{{matrix.worker_id}}" --parallelism-per-worker 3 --python-version {{matrix.python}} - --except-tags debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,tmpfs,container,manual,multi_gpu,spark_on_ray,ray_client,dask + --except-tags debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,tmpfs,runtime_env_container,manual,multi_gpu,spark_on_ray,ray_client,dask depends_on: corebuild-multipy matrix: setup: @@ -115,7 +103,7 @@ steps: --install-mask all-ray-libraries --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3 --test-env=TEST_EXTERNAL_REDIS=1 - --except-tags 
debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,tmpfs,container,manual,multi_gpu,spark_on_ray,ray_client,dask + --except-tags debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,tmpfs,runtime_env_container,manual,multi_gpu,spark_on_ray,ray_client,dask - label: ":ray: core: memory pressure tests" tags: @@ -304,19 +292,20 @@ steps: - "3.12" - "3.13" - # cpp tests - label: ":ray: core: cgroup tests" tags: core_cpp instance_type: medium commands: - - bazel run //ci/ray_ci:test_in_docker -- //:all //src/... core --only-tags=cgroup --build-type cgroup - --privileged --cache-test-results + - RAYCI_DISABLE_TEST_DB=1 bazel run //ci/ray_ci:test_in_docker -- //:all //src/ray/common/cgroup2/tests/... core --build-type clang --cache-test-results + - docker run --privileged -i --rm --volume /tmp/artifacts:/artifact-mount --shm-size=2.5gb + "$${RAYCI_WORK_REPO}":"$${RAYCI_BUILD_ID}"-corebuild /bin/bash + "./src/ray/common/cgroup2/integration_tests/sysfs_cgroup_driver_integration_test_entrypoint.sh" - label: ":ray: core: cpp tests" tags: core_cpp instance_type: medium commands: - - bazel run //ci/ray_ci:test_in_docker -- //:all //src/... core --except-tags=cgroup --build-type clang + - RAYCI_DISABLE_TEST_DB=1 bazel run //ci/ray_ci:test_in_docker -- //:all //src/... core --except-tags=cgroup --build-type clang --cache-test-results --parallelism-per-worker 2 # block on premerge and microcheck @@ -329,7 +318,7 @@ steps: tags: core_cpp instance_type: medium commands: - - bazel run //ci/ray_ci:test_in_docker -- //:all //src/... core --except-tags=cgroup + - RAYCI_DISABLE_TEST_DB=1 bazel run //ci/ray_ci:test_in_docker -- //:all //src/... core --except-tags=cgroup --build-type asan-clang --cache-test-results --parallelism-per-worker 2 depends_on: - block-core-cpp-sanitizer-tests @@ -339,7 +328,7 @@ steps: tags: core_cpp instance_type: large commands: - - bazel run //ci/ray_ci:test_in_docker -- //:all //src/... 
core + - RAYCI_DISABLE_TEST_DB=1 bazel run //ci/ray_ci:test_in_docker -- //:all //src/... core --build-type ubsan --except-tags no_ubsan,cgroup --cache-test-results --parallelism-per-worker 2 depends_on: @@ -350,27 +339,13 @@ steps: tags: core_cpp instance_type: medium commands: - - bazel run //ci/ray_ci:test_in_docker -- //:all //src/... core + - RAYCI_DISABLE_TEST_DB=1 bazel run //ci/ray_ci:test_in_docker -- //:all //src/... core --build-type tsan-clang --except-tags no_tsan,cgroup --cache-test-results --parallelism-per-worker 2 depends_on: - block-core-cpp-sanitizer-tests - corebuild - - label: ":ray: core: flaky cpp tests" - key: core_flaky_cpp_tests - tags: - - python - - flaky - - skip-on-premerge - instance_type: large - soft_fail: true - commands: - - bazel run //ci/ray_ci:test_in_docker -- //:all //src/... core - --run-flaky-tests --build-type clang - depends_on: - - corebuild - - label: ":ray: core: flaky tests" key: core_flaky_tests tags: @@ -433,10 +408,10 @@ steps: - raycpubase - corebuild - - label: ":ray: core: container tests" + - label: ":ray: core: runtime env container tests" tags: - - python - docker + - runtime_env_container - oss instance_type: medium commands: @@ -447,7 +422,7 @@ steps: # Disable test DB, these tests will never succeed if run in the flaky step. - RAYCI_DISABLE_TEST_DB=1 bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... 
core --install-mask all-ray-libraries - --only-tags container + --only-tags runtime_env_container depends_on: - manylinux - forge @@ -469,7 +444,7 @@ steps: depends_on: - corebuild - # block on premerge and microcheck + # block gpu tests on premerge and microcheck - block: "run multi gpu tests" if: build.env("BUILDKITE_PIPELINE_ID") == "0189942e-0876-4b8f-80a4-617f988ec59b" || build.env("BUILDKITE_PIPELINE_ID") == "018f4f1e-1b73-4906-9802-92422e3badaa" key: block-core-gpu-tests @@ -478,7 +453,7 @@ steps: - label: ":ray: core: multi gpu tests" key: core-multi-gpu-tests tags: - - compiled_graphs + - cgraphs_direct_transport - gpu instance_type: gpu-large # we're running some cgraph doc tests here as well since they need gpus diff --git a/.buildkite/dependencies.rayci.yml b/.buildkite/dependencies.rayci.yml new file mode 100644 index 000000000000..ab8217bfa214 --- /dev/null +++ b/.buildkite/dependencies.rayci.yml @@ -0,0 +1,43 @@ +group: dependencies +depends_on: + - forge +steps: + # dependencies + - label: ":tapioca: build: pip-compile dependencies" + key: pip_compile_dependencies + tags: always + instance_type: small + commands: + # uncomment the following line to update the pinned versions of pip dependencies + # to the latest versions; otherwise, the pinned versions will be re-used as much + # as possible + # - rm ./python/requirements_compiled.txt + - cp ./python/requirements_compiled.txt requirements_compiled_backup.txt + - ./ci/ci.sh compile_pip_dependencies + - cp -f ./python/requirements_compiled.txt /artifact-mount/ + - diff ./python/requirements_compiled.txt requirements_compiled_backup.txt || (echo "requirements_compiled.txt is not up to date. Please download it from Artifacts tab and git push the changes." 
&& exit 1) + job_env: oss-ci-base_test-py3.11 + depends_on: oss-ci-base_test-multipy + + - label: ":tapioca: build: raydepsets: compile LLM dependencies" + key: raydepsets_compile_llm_dependencies + tags: always + instance_type: small + commands: + - bazel run //ci/raydepsets:raydepsets -- build ci/raydepsets/rayllm.depsets.yaml --check + - chown -R 2000:100 /artifact-mount + - cp ./python/deplocks/llm/* /artifact-mount/ + job_env: manylinux + depends_on: manylinux + + - label: ":tapioca: build: raydepsets: compile ray img dependencies" + key: raydepsets_compile_rayimg_dependencies + tags: always + instance_type: medium + commands: + # build placeholder wheel for all python versions + - bash ci/build/build-placeholder-wheel.sh + # compile rayimg dependencies + - bazel run //ci/raydepsets:raydepsets -- build ci/raydepsets/rayimg.depsets.yaml --check + job_env: manylinux + depends_on: manylinux diff --git a/.buildkite/lint.rayci.yml b/.buildkite/lint.rayci.yml index f45c826374c4..9762ce7a8261 100644 --- a/.buildkite/lint.rayci.yml +++ b/.buildkite/lint.rayci.yml @@ -35,11 +35,21 @@ steps: commands: - ./ci/lint/lint.sh pre_commit_pydoclint - - label: ":lint-roller: lint: {{matrix}}" + - label: ":lint-roller: python API: {{matrix}}" tags: - oss - - lint - - always + - python + - dashboard + - ray_client + - data + - serve + - ml + - tune + - train + - llm + - rllib + - rllib_gpu + - doc key: lint-medium instance_type: medium depends_on: docbuild diff --git a/.buildkite/linux_aarch64.rayci.yml b/.buildkite/linux_aarch64.rayci.yml index 8cf1b12ef59a..14c397b3d9db 100644 --- a/.buildkite/linux_aarch64.rayci.yml +++ b/.buildkite/linux_aarch64.rayci.yml @@ -13,13 +13,43 @@ steps: wanda: ci/docker/manylinux.aarch64.wanda.yaml instance_type: builder-arm64 + - name: raycpubase-aarch64 + label: "wanda: ray.py{{matrix}}.cpu.base (aarch64)" + tags: + - python_dependencies + - docker + wanda: docker/base-deps/cpu.wanda.yaml + matrix: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + 
instance_type: builder-arm64 + env: + PYTHON_VERSION: "{{matrix}}" + ARCH_SUFFIX: "-aarch64" + + - name: raycpubaseextra-aarch64 + label: "wanda: ray.py{{matrix}}.cpu.base-extra (aarch64)" + wanda: docker/base-extra/cpu.wanda.yaml + matrix: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + instance_type: builder-arm64 + env: + PYTHON_VERSION: "{{matrix}}" + IMAGE_TYPE: "ray" + ARCH_SUFFIX: "-aarch64" + depends_on: raycpubase-aarch64 + - name: raycudabase-aarch64 label: "wanda: ray.py{{matrix.python}}.cu{{matrix.cuda}}.base (aarch64)" tags: - python_dependencies - docker - - core_cpp - wanda: ci/docker/ray.cuda.base.aarch64.wanda.yaml + wanda: docker/base-deps/cuda.wanda.yaml matrix: setup: python: @@ -34,27 +64,40 @@ steps: - "12.3.2-cudnn9" - "12.4.1-cudnn" - "12.5.1-cudnn" + - "12.6.3-cudnn" - "12.8.1-cudnn" instance_type: builder-arm64 env: PYTHON_VERSION: "{{matrix.python}}" CUDA_VERSION: "{{matrix.cuda}}" + ARCH_SUFFIX: "-aarch64" - - name: raycpubase-aarch64 - label: "wanda: ray.py{{matrix}}.cpu.base (aarch64)" - tags: - - python_dependencies - - docker - - core_cpp - wanda: ci/docker/ray.cpu.base.aarch64.wanda.yaml + - name: raycudabaseextra-aarch64 + label: "wanda: ray.py{{matrix.python}}.cu{{matrix.cuda}}.base-extra (aarch64)" + wanda: docker/base-extra/cuda.wanda.yaml matrix: - - "3.9" - - "3.10" - - "3.11" - - "3.12" + setup: + python: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + cuda: + - "11.7.1-cudnn8" + - "11.8.0-cudnn8" + - "12.1.1-cudnn8" + - "12.3.2-cudnn9" + - "12.4.1-cudnn" + - "12.5.1-cudnn" + - "12.6.3-cudnn" + - "12.8.1-cudnn" instance_type: builder-arm64 env: - PYTHON_VERSION: "{{matrix}}" + PYTHON_VERSION: "{{matrix.python}}" + CUDA_VERSION: "{{matrix.cuda}}" + IMAGE_TYPE: "ray" + ARCH_SUFFIX: "-aarch64" + depends_on: raycudabase-aarch64 - label: ":tapioca: build: wheel {{matrix}} (aarch64)" tags: @@ -80,7 +123,6 @@ steps: tags: - python_dependencies - docker - - core_cpp - oss instance_type: medium-arm64 commands: @@ -88,7 +130,8 @@ steps: 
--platform cu11.7.1-cudnn8 --platform cu11.8.0-cudnn8 --platform cu12.1.1-cudnn8 --platform cu12.3.2-cudnn9 --platform cu12.4.1-cudnn --platform cu12.5.1-cudnn - --platform cu12.8.1-cudnn --platform cpu + --platform cu12.6.3-cudnn --platform cu12.8.1-cudnn + --platform cpu --image-type ray --architecture aarch64 --upload depends_on: - manylinux-aarch64 diff --git a/.buildkite/others.rayci.yml b/.buildkite/others.rayci.yml index 3f4551474e30..08fbf7944b8b 100644 --- a/.buildkite/others.rayci.yml +++ b/.buildkite/others.rayci.yml @@ -2,31 +2,6 @@ group: others depends_on: - forge steps: - # dependencies - - label: ":tapioca: build: pip-compile dependencies" - key: pip_compile_dependencies - tags: always - instance_type: small - commands: - # uncomment the following line to update the pinned versions of pip dependencies - # to the latest versions; otherwise, the pinned versions will be re-used as much - # as possible - # - rm ./python/requirements_compiled.txt - - cp ./python/requirements_compiled.txt requirements_compiled_backup.txt - - ./ci/ci.sh compile_pip_dependencies - - cp -f ./python/requirements_compiled.txt /artifact-mount/ - - diff ./python/requirements_compiled.txt requirements_compiled_backup.txt || (echo "requirements_compiled.txt is not up to date. Please download it from Artifacts tab and git push the changes." 
&& exit 1) - job_env: oss-ci-base_test-py3.11 - depends_on: oss-ci-base_test-multipy - - - label: ":tapioca: build: uv pip compile LLM dependencies" - key: uv_pip_compile_llm_dependencies - tags: always - instance_type: small - command: ./ci/test_compile_llm_requirements.sh - job_env: oss-ci-base_test-py3.11 - depends_on: oss-ci-base_test-multipy - # docs - name: doctestbuild wanda: ci/docker/doctest.build.wanda.yaml diff --git a/.buildkite/release-automation/verify-macos-wheels.sh b/.buildkite/release-automation/verify-macos-wheels.sh index a3a06dafca58..e867ff0d6b74 100755 --- a/.buildkite/release-automation/verify-macos-wheels.sh +++ b/.buildkite/release-automation/verify-macos-wheels.sh @@ -4,7 +4,10 @@ set -euo pipefail set -x -PYTHON_VERSIONS=("3.9" "3.10" "3.11" "3.12" "3.13") +# TODO(#54047): Python 3.13 is skipped due to the bug +# we should re-enable it when the bug is fixed. + +PYTHON_VERSIONS=("3.9" "3.10" "3.11" "3.12") BAZELISK_VERSION="v1.16.0" # Check arguments diff --git a/.buildkite/release/_images.rayci.yml b/.buildkite/release/_images.rayci.yml new file mode 120000 index 000000000000..67fd8382b173 --- /dev/null +++ b/.buildkite/release/_images.rayci.yml @@ -0,0 +1 @@ +../_images.rayci.yml \ No newline at end of file diff --git a/.buildkite/release/build.rayci.yml b/.buildkite/release/build.rayci.yml index f7fdc95375a9..708b7b622aaf 100644 --- a/.buildkite/release/build.rayci.yml +++ b/.buildkite/release/build.rayci.yml @@ -1,9 +1,77 @@ group: release build steps: - - label: ":tapioca: build: anyscale py{{matrix.python}}-{{matrix.platform}} docker" - tags: skip-on-premerge + - name: raycpubaseextra-testdeps + label: "wanda: ray.py{{matrix}}.cpu.base-extra-testdeps" + wanda: docker/base-extra-testdeps/cpu.wanda.yaml + matrix: + - "3.9" + - "3.11" + - "3.12" + env: + PYTHON_VERSION: "{{matrix}}" + IMAGE_TYPE: "ray" + REQUIREMENTS_FILE: "requirements_byod_{{matrix}}.txt" + depends_on: + - raycpubaseextra + + - name: raycudabaseextra-testdeps + 
label: "wanda: ray.py{{matrix.python}}.cu{{matrix.cuda}}.base-extra-testdeps" + wanda: docker/base-extra-testdeps/cuda.wanda.yaml + matrix: + setup: + python: + - "3.9" + - "3.11" + - "3.12" + cuda: + - "12.3.2-cudnn9" + env: + PYTHON_VERSION: "{{matrix.python}}" + CUDA_VERSION: "{{matrix.cuda}}" + IMAGE_TYPE: "ray" + REQUIREMENTS_FILE: "requirements_byod_{{matrix.python}}.txt" + depends_on: + - raycudabaseextra + + - name: ray-llmbaseextra-testdeps + label: "wanda: ray.py{{matrix.python}}.llm.base-extra-testdeps (cuda {{matrix.cuda}})" + wanda: docker/base-extra-testdeps/cuda.wanda.yaml + matrix: + setup: + python: + - "3.11" + cuda: + - "12.8.1-cudnn" + env: + PYTHON_VERSION: "{{matrix.python}}" + CUDA_VERSION: "{{matrix.cuda}}" + IMAGE_TYPE: "ray-llm" + REQUIREMENTS_FILE: "requirements_llm_byod_{{matrix.python}}.txt" + depends_on: + - ray-llmbaseextra + + - name: ray-mlcudabaseextra-testdeps + label: "wanda: ray.py{{matrix.python}}.cu{{matrix.cuda}}.ml.base-extra-testdeps" + wanda: docker/base-extra-testdeps/cuda.wanda.yaml + matrix: + setup: + python: + - "3.9" + cuda: + - "12.1.1-cudnn8" + env: + PYTHON_VERSION: "{{matrix.python}}" + CUDA_VERSION: "{{matrix.cuda}}" + IMAGE_TYPE: "ray-ml" + REQUIREMENTS_FILE: "requirements_ml_byod_{{matrix.python}}.txt" + depends_on: + - ray-mlcudabaseextra + + - label: ":tapioca: build: ray py{{matrix.python}}-{{matrix.platform}} image for release tests" key: anyscalebuild instance_type: release-medium + tags: + - oss commands: - bazel run //ci/ray_ci:build_in_docker -- anyscale --python-version {{matrix.python}} --platform {{matrix.platform}} @@ -11,8 +79,8 @@ steps: depends_on: - manylinux - forge - - raycudabase - - raycpubase + - raycpubaseextra-testdeps + - raycudabaseextra-testdeps matrix: setup: python: @@ -25,31 +93,33 @@ steps: - cu12.3.2-cudnn9 - cpu - - label: ":tapioca: build: anyscale-llm py{{matrix}} docker" - tags: skip-on-premerge + - label: ":tapioca: build: ray-llm py{{matrix}} image for release tests" key: 
anyscalellmbuild instance_type: release-medium + tags: + - oss commands: - bazel run //ci/ray_ci:build_in_docker -- anyscale --python-version {{matrix}} --platform cu12.8.1-cudnn --image-type ray-llm --upload depends_on: - manylinux - forge - - ray-llmbase + - ray-llmbaseextra-testdeps matrix: - "3.11" - - label: ":tapioca: build: anyscale-ml py{{matrix}} docker" - tags: skip-on-premerge + - label: ":tapioca: build: ray-ml py{{matrix}} image for release tests" key: anyscalemlbuild instance_type: release-medium + tags: + - oss commands: - bazel run //ci/ray_ci:build_in_docker -- anyscale --python-version {{matrix}} --platform cu12.1.1-cudnn8 --image-type ray-ml --upload depends_on: - manylinux - forge - - ray-mlcudabase + - ray-mlcudabaseextra-testdeps matrix: # This list should be kept in sync with the list of supported Python in # release test suite. We don't have ray-ml release tests for Python 3.10 and 3.11 diff --git a/.buildkite/release/config.yml b/.buildkite/release/config.yml index 6dffd5492011..30ac2983d3b0 100644 --- a/.buildkite/release/config.yml +++ b/.buildkite/release/config.yml @@ -15,3 +15,11 @@ env: RAYCI_SKIP_UPLOAD: "true" hook_env_keys: - RAYCI_CHECKOUT_DIR +skip_tags: + - disabled + - skip-on-release-tests +build_env_keys: + - AUTOMATIC + - RELEASE_FREQUENCY +docker_plugin: + allow_mount_buildkite_agent: true diff --git a/.buildkite/release/test-init.sh b/.buildkite/release/test-init.sh new file mode 100644 index 000000000000..85282c9002c7 --- /dev/null +++ b/.buildkite/release/test-init.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +set -euo pipefail + +if [[ ${BUILDKITE_COMMIT} == "HEAD" ]]; then + BUILDKITE_COMMIT="$(git rev-parse HEAD)" + export BUILDKITE_COMMIT +fi + +aws ecr get-login-password --region us-west-2 | \ + docker login --username AWS --password-stdin 029272617770.dkr.ecr.us-west-2.amazonaws.com + +bash release/gcloud_docker_login.sh release/aws2gce_iam.json +export PATH="${PWD}/google-cloud-sdk/bin:$PATH" + +if [[ "${AUTOMATIC:-0}" == 
"1" && "${BUILDKITE_BRANCH}" == "master" ]]; then + export REPORT_TO_RAY_TEST_DB=1 +fi + +RUN_FLAGS=() + +if [[ "${AUTOMATIC:-0}" == "0" || "${BUILDKITE_BRANCH}" == "releases/"* ]]; then + RUN_FLAGS+=(--run-jailed-tests) +fi +if [[ "${BUILDKITE_BRANCH}" != "releases/"* ]]; then + RUN_FLAGS+=(--run-unstable-tests) +fi + +echo "---- Build test steps" +bazelisk run //release:build_pipeline -- "${RUN_FLAGS[@]}" \ + | buildkite-agent pipeline upload diff --git a/.buildkite/release/test.rayci.yml b/.buildkite/release/test.rayci.yml new file mode 100644 index 000000000000..dd7d235780b0 --- /dev/null +++ b/.buildkite/release/test.rayci.yml @@ -0,0 +1,15 @@ +group: test init +tags: + - oss +steps: + - label: "test init" + key: test-init + instance_type: release + commands: + - /bin/bash .buildkite/release/test-init.sh + mount_buildkite_agent: true + depends_on: + - forge + - anyscalebuild + - anyscalellmbuild + - anyscalemlbuild diff --git a/.buildkite/releasebuild.rayci.yml b/.buildkite/releasebuild.rayci.yml deleted file mode 120000 index d0497f6db89d..000000000000 --- a/.buildkite/releasebuild.rayci.yml +++ /dev/null @@ -1 +0,0 @@ -release/build.rayci.yml \ No newline at end of file diff --git a/.buildkite/serve.rayci.yml b/.buildkite/serve.rayci.yml index 6b45f68f4430..5a2f5a4c00f6 100644 --- a/.buildkite/serve.rayci.yml +++ b/.buildkite/serve.rayci.yml @@ -54,6 +54,7 @@ steps: tags: - serve - python + - skip-on-premerge instance_type: large soft_fail: true commands: @@ -68,6 +69,7 @@ steps: tags: - serve - python + - skip-on-premerge instance_type: large soft_fail: true commands: diff --git a/.gemini/config.yaml b/.gemini/config.yaml new file mode 100644 index 000000000000..9add3a6c8058 --- /dev/null +++ b/.gemini/config.yaml @@ -0,0 +1,10 @@ +have_fun: false +code_review: + disable: false + comment_severity_threshold: MEDIUM + max_review_comments: -1 + pull_request_opened: + help: false + summary: false + code_review: true +ignore_patterns: [] diff --git 
a/.github/CODEOWNERS b/.github/CODEOWNERS index f692cfbe2ef4..da43c939515c 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -29,6 +29,9 @@ /doc/source/cluster/ @ray-project/ray-core @ray-project/ray-docs /doc/source/ray-core/ @ray-project/ray-core @ray-project/ray-docs +# Public protobuf files. +/src/ray/protobuf/public/ @edoakes @jjyao + # ==== Libraries and frameworks ==== # Dependencies @@ -36,7 +39,7 @@ # Common directory shared by core and the libraries. # @edoakes is the czar for now because the pattern is new. -/python/ray/_common/ @edoakes @aslonnie +/python/ray/_common/ @edoakes @jjyao # Ray data. /python/ray/data/ @ray-project/ray-data @@ -51,16 +54,20 @@ /rllib/ @ray-project/ray-rllib /doc/source/rllib/ @ray-project/ray-rllib @ray-project/ray-docs -# Tune +# Ray Tune /python/ray/tune/ @ray-project/ray-tune /doc/source/tune/ @ray-project/ray-tune @ray-project/ray-docs -# Train +# Ray Train /python/ray/train/ @ray-project/ray-train /doc/source/train/ @ray-project/ray-train @ray-project/ray-docs +# Ray AIR +/python/ray/air/ @ray-project/ray-train + # LLM /python/ray/llm/ @ray-project/ray-llm +/python/ray/data/llm.py @ray-project/ray-llm # Ray Serve /python/ray/serve/ @ray-project/ray-serve @@ -72,8 +79,8 @@ /python/requirements/ml/dl-gpu-requirements.txt @richardliaw @matthewdeng # Ray symbol export -/src/ray/ray_version_script.lds @aslonnie -/src/ray/ray_exported_symbols.lds @aslonnie +/src/ray/ray_version_script.lds @ray-project/ray-core +/src/ray/ray_exported_symbols.lds @ray-project/ray-core # Ray usage stats /python/ray/_private/usage/ @edoakes @richardliaw @jjyao @@ -105,6 +112,8 @@ # on their own. 
/release/ray_release/byod/*.sh -/.github/ISSUE_TEMPLATE/ @aslonnie +/.github/ISSUE_TEMPLATE/ @ray-project/ray-ci /.github/workflows/ @ray-project/ray-ci + +/.gemini/ @edoakes @ray-project/ray-ci diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 25358b043673..5f43c1d9f812 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -21,16 +21,6 @@ updates: open-pull-requests-limit: 5 reviewers: - "ray-project/ray-tune" - # compat requirements should not be updated - - package-ecosystem: "pip" - directory: "/python/requirements/compat" - commit-message: - prefix: "[air/do-not-merge]" - include: "scope" - ignore: * - open-pull-requests-limit: 0 - reviewers: - - "ray-project/ray-tune" # Data Requirements. - package-ecosystem: "pip" directory: "/python/requirements/data_processing" diff --git a/.gitignore b/.gitignore index ae8dd2240350..5e7bbfa27cfa 100644 --- a/.gitignore +++ b/.gitignore @@ -38,7 +38,7 @@ python/ray/autoscaler/kuberay/config # Python byte code files *.pyc python/.eggs - +.eggs # Backup files *.bak @@ -126,6 +126,7 @@ scripts/nodes.txt .idea/**/tasks.xml .idea/dictionaries .llvm-local.bazelrc +.user.bazelrc .aider* # Sensitive or high-churn files: @@ -153,6 +154,10 @@ scripts/nodes.txt .benchmarks python-driver-* +# Ray Train unit test artifacts +lightning_logs/ +hf-internal-testing/ + # Vscode .vscode/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a586e806b6bc..4029869cda16 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -44,12 +44,39 @@ repos: args: [ --fix, --exit-non-zero-on-fix ] - id: ruff args: [ --select, "I", --fix, --exit-non-zero-on-fix ] - files: '^python/ray/serve/|^python/ray/train|^python/ray/data|^python/ray/_private/|^python/ray/llm/|^python/ray/tune/' + # pydoclint-local is for local commits only due to pre-commit-hook only passing + # updated files to the hook and overwriting the baseline text file - repo: https://github.com/jsh9/pydoclint rev: "0.6.6" hooks: - id: 
pydoclint + name: pydoclint-local + stages: [pre-commit, pre-push] + args: [ + --style=google, + --baseline=ci/lint/pydoclint-baseline.txt, + --exclude=thirdparty|^python/ray/serve/tests/test_config_files/syntax_error\.py$|^python/ray/_private/parameter\.py$, + --auto-regenerate-baseline=False, + # Current settings (not because we think they're right, but because we + # don't want a baseline the size of the codebase) + --arg-type-hints-in-docstring=False, + --skip-checking-raises=True, + --check-return-types=False, + --allow-init-docstring=True, + --check-class-attributes=False, + # --check-style-mismatch=True, # Bring this back once things are a bit cleaner + ] + types: [python] + files: '^python/ray/' + + # pydoclint-ci is for CI, overwrites the baseline text file, and is run with the manual stage flag + - repo: https://github.com/jsh9/pydoclint + rev: "0.6.6" + hooks: + - id: pydoclint + name: pydoclint-ci + stages: [manual] args: [ --style=google, --baseline=ci/lint/pydoclint-baseline.txt, @@ -73,7 +100,7 @@ repos: hooks: - id: cpplint args: ["--filter=-whitespace/braces,-whitespace/line_length,-build/c++11,-build/c++14,-build/c++17,-readability/braces,-whitespace/indent_namespace,-runtime/int,-runtime/references,-build/include_order"] - files: ^src/ray/(common/cgroup2|common/scheduling|common/ray_syncer|util|raylet_client|internal|scheduling|pubsub|object_manager|rpc(?:/.*)?|raylet|core_worker)/.*\.(h|cc)$ + files: ^src/ray/(common/cgroup2|common/scheduling|common/ray_syncer|common/test|util|raylet_client|internal|scheduling|pubsub|object_manager|rpc(?:/.*)?|raylet|core_worker|ipc)/.*\.(h|cc)$ exclude: | (?x)^( src/ray/raylet/scheduling/.*\.(h|cc)$ | @@ -148,11 +175,12 @@ repos: # 1091: Not following {file} due to some error # 2207: Prefer mapfile or read -a to split command output (or quote to avoid splitting). 
-- these aren't compatible with macOS's old Bash - - repo: https://github.com/pocc/pre-commit-hooks - rev: v1.3.5 + - repo: https://github.com/pre-commit/mirrors-clang-format + # `rev` specifies a tag on the above repo that mirrors the corresponding clang-format version. + # The version should be kept in sync with the version in `ci/lint/format.sh`. + rev: v12.0.1 hooks: - id: clang-format - args: [--version=12.0.1] - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks rev: v2.11.0 diff --git a/.vale/styles/config/vocabularies/General/accept.txt b/.vale/styles/config/vocabularies/General/accept.txt index 886a23d9b920..dfd100659547 100644 --- a/.vale/styles/config/vocabularies/General/accept.txt +++ b/.vale/styles/config/vocabularies/General/accept.txt @@ -33,6 +33,7 @@ autoscales bool breakpoint BTS +bursty chatbot CLI configs @@ -45,8 +46,10 @@ deserialize deserializes dev dev to prod -disable +[d|D]isable[d] +[d|D]isable DLinear +Dockerfile DPO EKS ETDataset @@ -69,6 +72,7 @@ LMs LSH MCP Megatron +Mixtral MLflow MLOps namespace @@ -76,6 +80,7 @@ NER Nsight NumPy NVIDIA +NVLink OOM open-source PACK @@ -86,6 +91,8 @@ pretraining productionize Pythonic QPS +Qwen +Quantizing retrigger RISECamp RLHF @@ -104,6 +111,7 @@ teardown uncaptured URI(s)? UUID +USD uv verl VM(s)? 
diff --git a/BUILD.bazel b/BUILD.bazel index fb6f2b58e163..9224cf8f2373 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -87,22 +87,19 @@ config_setting( flag_values = {":jemalloc_flag": "true"}, ) +alias( + name = "uv_file", + actual = select({ + "//bazel:linux_x86_64_config": "@uv_x86_64-linux//:file", + "//bazel:osx_arm64_config": "@uv_aarch64-darwin//:file", + "//conditions:default": "@uv_x86_64-linux//:file", + }), +) + # bazel run :refresh_compile_commands for compile_commands generation for clangd # https://github.com/hedronvision/bazel-compile-commands-extractor?tab=readme-ov-file#vscode - directions for clangd config refresh_compile_commands( name = "refresh_compile_commands", - exclude_external_sources = True, # removed below to have lsp index external cc files at the cost of 2x index time - # Specify the targets of interest. - # For example, specify a dict of targets and any flags required to build. - targets = { - "//:ray_pkg_zip": "", - }, - # No need to add flags already in .bazelrc. They're automatically picked up. -) - -# bazel run :refresh_compile_commands_external_sources for generation with external source files (cc files) -refresh_compile_commands( - name = "refresh_compile_commands_external_sources", targets = { "//:ray_pkg_zip": "", }, @@ -110,18 +107,30 @@ refresh_compile_commands( ray_cc_library( name = "ray_mock", + # NOTE(edoakes): we are moving towards fine-grained mock and fake targets. + # Do not include new files in this target, instead make a BUILD.bazel file + # in the subdirectory and exclude it here. hdrs = glob( ["src/mock/**/*.h"], - exclude = ["src/mock/ray/common/ray_syncer/ray_syncer.h"], + exclude = [ + "src/mock/ray/common/pubsub/publisher.h", + "src/mock/ray/common/pubsub/subscriber.h", + "src/mock/ray/common/ray_syncer/ray_syncer.h", + ], ), ) ray_cc_library( name = "ray_fakes", - hdrs = glob(["src/fakes/**/*.h"]), + # NOTE(edoakes): we are moving towards fine-grained mock and fake targets. 
+ # Do not include new files in this target, instead make a BUILD.bazel file + # in the subdirectory and exclude it here. + hdrs = glob( + ["src/fakes/**/*.h"], + ), deps = [ "//src/ray/common:asio", - "//src/ray/raylet_client:raylet_client_lib", + "//src/ray/rpc:raylet_client_interface", ], ) @@ -237,14 +246,15 @@ pyx_library( "//:src/ray/ray_exported_symbols.lds", "//:src/ray/ray_version_script.lds", "//src/ray/core_worker:core_worker_lib", - "//src/ray/gcs:gcs_redis_client", - "//src/ray/gcs/gcs_client:gcs_python_callbacks", - "//src/ray/gcs/gcs_client:global_state_accessor_lib", - "//src/ray/gcs/gcs_server:gcs_server_lib", + "//src/ray/gcs:gcs_server_lib", + "//src/ray/gcs/store_client:redis_store_client", + "//src/ray/gcs_client:gcs_python_callbacks", + "//src/ray/gcs_client:global_state_accessor_lib", "//src/ray/protobuf:serialization_cc_proto", + "//src/ray/pubsub:python_gcs_subscriber", "//src/ray/thirdparty/setproctitle", - "//src/ray/util", "//src/ray/util:memory", + "//src/ray/util:raii", "//src/ray/util:stream_redirection", "//src/ray/util:stream_redirection_options", ], @@ -271,6 +281,9 @@ alias( name = "redis-server", actual = select({ "@platforms//os:windows": "@com_github_tporadowski_redis_bin//:redis-server.exe", + "//bazel:linux_x86_64_config": "@redis_linux_x86_64//:redis-server", + "//bazel:linux_arm64_config": "@redis_linux_arm64//:redis-server", + "//bazel:osx_arm64_config": "@redis_osx_arm64//:redis-server", "//conditions:default": "@com_github_antirez_redis//:redis-server", }), ) @@ -279,6 +292,9 @@ alias( name = "redis-cli", actual = select({ "@platforms//os:windows": "@com_github_tporadowski_redis_bin//:redis-cli.exe", + "//bazel:linux_x86_64_config": "@redis_linux_x86_64//:redis-cli", + "//bazel:linux_arm64_config": "@redis_linux_arm64//:redis-cli", + "//bazel:osx_arm64_config": "@redis_osx_arm64//:redis-cli", "//conditions:default": "@com_github_antirez_redis//:redis-cli", }), ) @@ -369,7 +385,7 @@ pkg_files( pkg_files( name = 
"gcs_server_files", - srcs = ["//src/ray/gcs/gcs_server"], + srcs = ["//src/ray/gcs:gcs_server"], attributes = pkg_attributes(mode = "755"), prefix = "ray/core/src/ray/gcs", visibility = ["//visibility:private"], @@ -420,6 +436,7 @@ genrule( # NOTE(hchen): Protobuf doesn't allow specifying Python package name. So we use this `sed` # command to change the import path in the generated file. + sed -i -E 's/from src.ray.protobuf.public/from ./' "$${files[@]}" sed -i -E 's/from src.ray.protobuf/from ./' "$${files[@]}" # Help the generated serve files to have the correct module serve_files=($$(ls "$$tmpdir"/ray/serve/generated/*_pb2*.py)) @@ -431,6 +448,8 @@ genrule( sed -i -E 's/from opencensus.proto.resource.v1 import/from . import/' "$${files[@]}" $(location //bazel:pyzip) "$$tmpdir" $@ + + rm -rf "$$tmpdir" """, tools = [ "//bazel:pyzip", @@ -475,6 +494,18 @@ genrule( local = 1, ) +py_binary( + name = "gen_py_proto", + srcs = ["gen_py_proto.py"], + data = [ + ":ray_py_proto_zip", + ], + visibility = ["//visibility:private"], + deps = [ + "//bazel:gen_extract", + ], +) + py_binary( name = "gen_ray_pkg", srcs = ["gen_ray_pkg.py"], diff --git a/WORKSPACE b/WORKSPACE index be7a05ef4371..666b38d06797 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,4 +1,4 @@ -workspace(name = "com_github_ray_project_ray") +workspace(name = "io_ray") load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") @@ -117,7 +117,7 @@ filegroup( ) http_archive( - name = "uv_x86_64", + name = "uv_x86_64-linux", build_file_content = """ filegroup( name = "file", @@ -125,8 +125,21 @@ filegroup( visibility = ["//visibility:public"], ) """, - sha256 = "10f204426ff188925d22a53c1d0310d190a8d4d24513712e1b8e2ca9873f0666", - urls = ["https://github.com/astral-sh/uv/releases/download/0.7.20/uv-x86_64-unknown-linux-gnu.tar.gz"], + sha256 = "920cbcaad514cc185634f6f0dcd71df5e8f4ee4456d440a22e0f8c0f142a8203", + urls = 
["https://github.com/astral-sh/uv/releases/download/0.8.17/uv-x86_64-unknown-linux-gnu.tar.gz"], +) + +http_archive( + name = "uv_aarch64-darwin", + build_file_content = """ +filegroup( + name = "file", + srcs = glob(["**"]), + visibility = ["//visibility:public"], +) +""", + sha256 = "e4d4859d7726298daa4c12e114f269ff282b2cfc2b415dc0b2ca44ae2dbd358e", + urls = ["https://github.com/astral-sh/uv/releases/download/0.8.17/uv-aarch64-apple-darwin.tar.gz"], ) http_archive( @@ -138,6 +151,27 @@ http_archive( ], ) +http_archive( + name = "redis_linux_x86_64", + build_file_content = """exports_files(["redis-server", "redis-cli"])""", + sha256 = "4ae33c10059ed52202a12929d269deea46fac81b8e02e722d30cb22ceb3ed678", + urls = ["https://github.com/ray-project/redis/releases/download/7.2.3/redis-linux-x86_64.tar.gz"], +) + +http_archive( + name = "redis_linux_arm64", + build_file_content = """exports_files(["redis-server", "redis-cli"])""", + sha256 = "2d1085a4f69477e1f44cbddd531e593f0712532b1ade9beab0b221a0cb01f298", + urls = ["https://github.com/ray-project/redis/releases/download/7.2.3/redis-linux-arm64.tar.gz"], +) + +http_archive( + name = "redis_osx_arm64", + build_file_content = """exports_files(["redis-server", "redis-cli"])""", + sha256 = "74b76099c3600b538252cdd1731278e087e8e85eecc6c64318c860f3e9462506", + urls = ["https://github.com/ray-project/redis/releases/download/7.2.3/redis-osx-arm64.tar.gz"], +) + load("@com_github_storypku_bazel_iwyu//bazel:dependencies.bzl", "bazel_iwyu_dependencies") bazel_iwyu_dependencies() diff --git a/bazel/BUILD.bazel b/bazel/BUILD.bazel index 2e4ce8f51b18..7aa94b909bc0 100644 --- a/bazel/BUILD.bazel +++ b/bazel/BUILD.bazel @@ -20,3 +20,43 @@ py_library( ], visibility = ["//visibility:public"], ) + +config_setting( + name = "linux_x86_64_config", + constraint_values = [ + "@platforms//os:linux", + "@platforms//cpu:x86_64", + ], +) + +config_setting( + name = "linux_arm64_config", + constraint_values = [ + "@platforms//os:linux", + 
"@platforms//cpu:arm64", + ], +) + +config_setting( + name = "osx_x86_64_config", + constraint_values = [ + "@platforms//os:osx", + "@platforms//cpu:x86_64", + ], +) + +config_setting( + name = "osx_arm64_config", + constraint_values = [ + "@platforms//os:osx", + "@platforms//cpu:arm64", + ], +) + +config_setting( + name = "windows_x86_64_config", + constraint_values = [ + "@platforms//os:windows", + "@platforms//cpu:x86_64", + ], +) diff --git a/bazel/gen_extract.py b/bazel/gen_extract.py index ce89c7e49a3b..a635922011ee 100644 --- a/bazel/gen_extract.py +++ b/bazel/gen_extract.py @@ -1,7 +1,7 @@ -from typing import List, Optional import os import shutil import subprocess +from typing import List, Optional import runfiles @@ -12,11 +12,13 @@ def gen_extract( sub_dir: str = "python", ): r = runfiles.Create() - _repo_name = "com_github_ray_project_ray" + _repo_name = "io_ray" root_dir = os.environ.get("BUILD_WORKSPACE_DIRECTORY") if not root_dir: - raise ValueError("BUILD_WORKSPACE_DIRECTORY not set") + raise ValueError( + "BUILD_WORKSPACE_DIRECTORY not set; please run this script from 'bazelisk run'" + ) if sub_dir: extract_dir = os.path.join(root_dir, sub_dir) diff --git a/bazel/jemalloc.BUILD b/bazel/jemalloc.BUILD index e0be47fd4446..545a557293a2 100644 --- a/bazel/jemalloc.BUILD +++ b/bazel/jemalloc.BUILD @@ -1,5 +1,5 @@ load("@rules_foreign_cc//foreign_cc:configure.bzl", "configure_make") -load("@com_github_ray_project_ray//bazel:ray.bzl", "filter_files_with_suffix") +load("@io_ray//bazel:ray.bzl", "filter_files_with_suffix") filegroup( name = "all", diff --git a/bazel/python.bzl b/bazel/python.bzl index 4ebe4cffdcdc..3c15ae6f527d 100644 --- a/bazel/python.bzl +++ b/bazel/python.bzl @@ -17,6 +17,24 @@ def _convert_target_to_import_path(t): # 3) Replace '/' with '.' to form an import path. 
return t.replace("/", ".") +def doctest_each(files, gpu = False, deps=[], srcs=[], data=[], args=[], size="medium", tags=[], pytest_plugin_file="//bazel:default_doctest_pytest_plugin.py", **kwargs): + # Unlike the `doctest` macro, `doctest_each` runs `pytest` on each file separately. + # This is useful to run tests in parallel and more clearly report the test results. + for file in files: + doctest( + files = [file], + gpu = gpu, + name = paths.split_extension(file)[0], + deps = deps, + srcs = srcs, + data = data, + args = args, + size = size, + tags = tags, + pytest_plugin_file = pytest_plugin_file, + **kwargs + ) + def doctest(files, gpu = False, name="doctest", deps=[], srcs=[], data=[], args=[], size="medium", tags=[], pytest_plugin_file="//bazel:default_doctest_pytest_plugin.py", **kwargs): # NOTE: If you run `pytest` on `__init__.py`, it tries to test all files in that # package. We don't want that, so we exclude it from the list of input files. diff --git a/bazel/ray.bzl b/bazel/ray.bzl index 9ef8b4a9c07b..0fda67ebd6d0 100644 --- a/bazel/ray.bzl +++ b/bazel/ray.bzl @@ -3,7 +3,7 @@ load("@bazel_skylib//rules:copy_file.bzl", "copy_file") load("@com_github_google_flatbuffers//:build_defs.bzl", "flatbuffer_library_public") load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test") -COPTS_WITHOUT_LOG = select({ +COPTS_TESTS = select({ "//:opt": ["-DBAZEL_OPT"], "//conditions:default": [], }) + select({ @@ -25,7 +25,10 @@ COPTS_WITHOUT_LOG = select({ "//conditions:default": [], }) -COPTS = COPTS_WITHOUT_LOG +COPTS = COPTS_TESTS + select({ + "@platforms//os:windows": [""], + "//conditions:default": ["-Wshadow"], +}) PYX_COPTS = select({ "//:msvc-cl": [], @@ -144,7 +147,7 @@ def ray_cc_library(name, strip_include_prefix = "/src", copts = [], visibility = def ray_cc_test(name, linkopts = [], copts = [], **kwargs): cc_test( name = name, - copts = COPTS + copts, + copts = COPTS_TESTS + copts, linkopts = linkopts + ["-pie"], **kwargs ) diff --git 
a/bazel/ray_deps_build_all.bzl b/bazel/ray_deps_build_all.bzl index a8597dd1840f..8d59beab3263 100644 --- a/bazel/ray_deps_build_all.bzl +++ b/bazel/ray_deps_build_all.bzl @@ -1,5 +1,5 @@ load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace") -load("@com_github_ray_project_ray//java:dependencies.bzl", "gen_java_deps") +load("@io_ray//java:dependencies.bzl", "gen_java_deps") load("@com_github_nelhage_rules_boost//:boost/boost.bzl", "boost_deps") load("@com_github_jupp0r_prometheus_cpp//bazel:repositories.bzl", "prometheus_cpp_repositories") load("@com_github_grpc_grpc//third_party/py:python_configure.bzl", "python_configure") diff --git a/bazel/ray_deps_setup.bzl b/bazel/ray_deps_setup.bzl index 210f71c8b1da..13a6a96c162e 100644 --- a/bazel/ray_deps_setup.bzl +++ b/bazel/ray_deps_setup.bzl @@ -53,7 +53,7 @@ def auto_http_archive( # auto appending ray project namespace prefix for 3rd party library reusing. if build_file == True: - build_file = "@com_github_ray_project_ray//%s:%s" % ("bazel", name + ".BUILD") + build_file = "@io_ray//%s:%s" % ("bazel", name + ".BUILD") if urls == True: prefer_url_over_mirrors = is_github @@ -106,41 +106,41 @@ def ray_deps_setup(): # all of http/git_repository should add prefix for patches defined in ray directory. 
auto_http_archive( name = "com_github_antirez_redis", - build_file = "@com_github_ray_project_ray//bazel:redis.BUILD", + build_file = "@io_ray//bazel:redis.BUILD", patch_args = ["-p1"], url = "https://github.com/redis/redis/archive/refs/tags/7.2.3.tar.gz", sha256 = "afd656dbc18a886f9a1cc08a550bf5eb89de0d431e713eba3ae243391fb008a6", patches = [ - "@com_github_ray_project_ray//thirdparty/patches:redis-quiet.patch", + "@io_ray//thirdparty/patches:redis-quiet.patch", ], workspace_file_content = 'workspace(name = "com_github_antirez_redis")', ) auto_http_archive( name = "com_github_redis_hiredis", - build_file = "@com_github_ray_project_ray//bazel:hiredis.BUILD", + build_file = "@io_ray//bazel:hiredis.BUILD", url = "https://github.com/redis/hiredis/archive/60e5075d4ac77424809f855ba3e398df7aacefe8.tar.gz", sha256 = "b6d6f799b7714d85316f9ebfb76a35a78744f42ea3b6774289d882d13a2f0383", patches = [ - "@com_github_ray_project_ray//thirdparty/patches:hiredis-windows-msvc.patch", + "@io_ray//thirdparty/patches:hiredis-windows-msvc.patch", ], ) auto_http_archive( name = "com_github_spdlog", - build_file = "@com_github_ray_project_ray//bazel:spdlog.BUILD", + build_file = "@io_ray//bazel:spdlog.BUILD", urls = ["https://github.com/gabime/spdlog/archive/v1.12.0.zip"], sha256 = "6174bf8885287422a6c6a0312eb8a30e8d22bcfcee7c48a6d02d1835d7769232", # spdlog rotation filename format conflict with ray, update the format. 
patches = [ - "@com_github_ray_project_ray//thirdparty/patches:spdlog-rotation-file-format.patch", + "@io_ray//thirdparty/patches:spdlog-rotation-file-format.patch", ], patch_args = ["-p1"], ) auto_http_archive( name = "com_github_tporadowski_redis_bin", - build_file = "@com_github_ray_project_ray//bazel:redis.BUILD", + build_file = "@io_ray//bazel:redis.BUILD", strip_prefix = None, url = "https://github.com/tporadowski/redis/releases/download/v5.0.9/Redis-x64-5.0.9.zip", sha256 = "b09565b22b50c505a5faa86a7e40b6683afb22f3c17c5e6a5e35fc9b7c03f4c2", @@ -224,8 +224,8 @@ def ray_deps_setup(): url = "https://github.com/census-instrumentation/opencensus-cpp/archive/5e5f2632c84e2230fb7ccb8e336f603d2ec6aa1b.zip", sha256 = "1b88d6663f05c6a56c1604eb2afad22831d5f28a76f6fab8f37187f1e4ace425", patches = [ - "@com_github_ray_project_ray//thirdparty/patches:opencensus-cpp-harvest-interval.patch", - "@com_github_ray_project_ray//thirdparty/patches:opencensus-cpp-shutdown-api.patch", + "@io_ray//thirdparty/patches:opencensus-cpp-harvest-interval.patch", + "@io_ray//thirdparty/patches:opencensus-cpp-shutdown-api.patch", ], patch_args = ["-p1"], ) @@ -255,6 +255,10 @@ def ray_deps_setup(): urls = [ "https://github.com/abseil/abseil-cpp/archive/refs/tags/20230802.1.tar.gz", ], + patches = [ + # TODO (israbbani): #55430 Separate the compiler flags and remove this patch + "@io_ray//thirdparty/patches:abseil-cpp-shadow.patch", + ], ) # OpenCensus depends on jupp0r/prometheus-cpp @@ -263,11 +267,11 @@ def ray_deps_setup(): url = "https://github.com/jupp0r/prometheus-cpp/archive/60eaa4ea47b16751a8e8740b05fe70914c68a480.tar.gz", sha256 = "ec825b802487ac18b0d98e2e8b7961487b12562f8f82e424521d0a891d9e1373", patches = [ - "@com_github_ray_project_ray//thirdparty/patches:prometheus-windows-headers.patch", + "@io_ray//thirdparty/patches:prometheus-windows-headers.patch", # https://github.com/jupp0r/prometheus-cpp/pull/225 - 
"@com_github_ray_project_ray//thirdparty/patches:prometheus-windows-zlib.patch", - "@com_github_ray_project_ray//thirdparty/patches:prometheus-windows-pollfd.patch", - "@com_github_ray_project_ray//thirdparty/patches:prometheus-zlib-fdopen.patch", + "@io_ray//thirdparty/patches:prometheus-windows-zlib.patch", + "@io_ray//thirdparty/patches:prometheus-windows-pollfd.patch", + "@io_ray//thirdparty/patches:prometheus-zlib-fdopen.patch", ], ) @@ -277,9 +281,9 @@ def ray_deps_setup(): url = "https://github.com/grpc/grpc/archive/refs/tags/v1.57.1.tar.gz", sha256 = "0762f809b9de845e6a7c809cabccad6aa4143479fd43b396611fe5a086c0aeeb", patches = [ - "@com_github_ray_project_ray//thirdparty/patches:grpc-cython-copts.patch", - "@com_github_ray_project_ray//thirdparty/patches:grpc-zlib-fdopen.patch", - "@com_github_ray_project_ray//thirdparty/patches:grpc-configurable-thread-count.patch", + "@io_ray//thirdparty/patches:grpc-cython-copts.patch", + "@io_ray//thirdparty/patches:grpc-zlib-fdopen.patch", + "@io_ray//thirdparty/patches:grpc-configurable-thread-count.patch", ], ) @@ -356,7 +360,9 @@ def ray_deps_setup(): url = "https://github.com/msgpack/msgpack-c/archive/8085ab8721090a447cf98bb802d1406ad7afe420.tar.gz", sha256 = "83c37c9ad926bbee68d564d9f53c6cbb057c1f755c264043ddd87d89e36d15bb", patches = [ - "@com_github_ray_project_ray//thirdparty/patches:msgpack-windows-iovec.patch", + "@io_ray//thirdparty/patches:msgpack-windows-iovec.patch", + # TODO (israbbani): #55430 Separate the compiler flags and remove this patch + "@io_ray//thirdparty/patches:msgpack-shadow.patch", ], ) @@ -372,7 +378,7 @@ def ray_deps_setup(): strip_prefix = "json-3.9.1", urls = ["https://github.com/nlohmann/json/archive/v3.9.1.tar.gz"], sha256 = "4cf0df69731494668bdd6460ed8cb269b68de9c19ad8c27abc24cd72605b2d5b", - build_file = "@com_github_ray_project_ray//bazel:nlohmann_json.BUILD", + build_file = "@io_ray//bazel:nlohmann_json.BUILD", ) auto_http_archive( @@ -398,7 +404,7 @@ def ray_deps_setup(): 
http_archive( name = "jemalloc", urls = ["https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2"], - build_file = "@com_github_ray_project_ray//bazel:jemalloc.BUILD", + build_file = "@io_ray//bazel:jemalloc.BUILD", sha256 = "2db82d1e7119df3e71b7640219b6dfe84789bc0537983c3b7ac4f7189aecfeaa", strip_prefix = "jemalloc-5.3.0", ) diff --git a/ci/build/build-anyscale-docker.sh b/ci/build/build-anyscale-docker.sh index 3e4e74ef8a0b..bcee1703e44c 100755 --- a/ci/build/build-anyscale-docker.sh +++ b/ci/build/build-anyscale-docker.sh @@ -3,33 +3,11 @@ set -euo pipefail SOURCE_IMAGE="$1" DEST_IMAGE="$2" -REQUIREMENTS="$3" -ECR="$4" - -DATAPLANE_S3_BUCKET="ray-release-automation-results" -DATAPLANE_FILENAME="dataplane_20250624.tar.gz" -DATAPLANE_DIGEST="3cffb55f1a56f0bc6256cbf1a38bf1e764e202a647a4272b80531760f1250059" - -# download dataplane build file -aws s3api get-object --bucket "${DATAPLANE_S3_BUCKET}" \ - --key "${DATAPLANE_FILENAME}" "${DATAPLANE_FILENAME}" - -# check dataplane build file digest -echo "${DATAPLANE_DIGEST} ${DATAPLANE_FILENAME}" | sha256sum -c - -# build anyscale image -DOCKER_BUILDKIT=1 docker build \ - --build-arg BASE_IMAGE="$SOURCE_IMAGE" \ - -t "$DEST_IMAGE" - < "${DATAPLANE_FILENAME}" - -DOCKER_BUILDKIT=1 docker build \ - --build-arg BASE_IMAGE="$DEST_IMAGE" \ - --build-arg PIP_REQUIREMENTS="$REQUIREMENTS" \ - -t "$DEST_IMAGE" \ - -f release/ray_release/byod/byod.Dockerfile \ - release/ray_release/byod +ECR="$3" # publish anyscale image aws ecr get-login-password --region us-west-2 | \ docker login --username AWS --password-stdin "$ECR" + +docker tag "$SOURCE_IMAGE" "$DEST_IMAGE" docker push "$DEST_IMAGE" diff --git a/ci/build/build-manylinux-forge.sh b/ci/build/build-manylinux-forge.sh index 483aa167ae3e..3553cdac7509 100755 --- a/ci/build/build-manylinux-forge.sh +++ b/ci/build/build-manylinux-forge.sh @@ -3,6 +3,25 @@ set -exuo pipefail +BAZELISK_VERSION="v1.26.0" + +ARCH="$(uname -m)" + +case "$ARCH" in + 
x86_64|amd64) + ARCH="x86_64" + ;; + aarch64|arm64) + ARCH="aarch64" + ;; + *) + echo "Unsupported arch: $ARCH" >&2 + exit 1 + ;; +esac + +echo "Architecture is ${ARCH}" + if [[ ! -e /usr/bin/nproc ]]; then echo -e '#!/bin/bash\necho 10' > "/usr/bin/nproc" chmod +x /usr/bin/nproc @@ -10,13 +29,13 @@ fi # Install ray cpp dependencies. sudo yum -y install unzip zip sudo openssl xz -if [[ "${HOSTTYPE-}" == "x86_64" ]]; then +if [[ "${ARCH}" == "x86_64" ]]; then sudo yum -y install libasan-4.8.5-44.el7.x86_64 libubsan-7.3.1-5.10.el7.x86_64 \ devtoolset-8-libasan-devel.x86_64 fi # Install ray java dependencies. -if [[ "${RAY_INSTALL_JAVA}" == "1" ]]; then +if [[ "${RAYCI_DISABLE_JAVA:-false}" != "true" && "${RAY_INSTALL_JAVA:-1}" == "1" ]]; then sudo yum -y install java-1.8.0-openjdk java-1.8.0-openjdk-devel maven java -version JAVA_BIN="$(readlink -f "$(command -v java)")" @@ -24,21 +43,39 @@ if [[ "${RAY_INSTALL_JAVA}" == "1" ]]; then export JAVA_HOME="${JAVA_BIN%jre/bin/java}" fi -# Install ray dashboard dependencies. 
-curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.34.0/install.sh | bash -source "$HOME"/.nvm/nvm.sh +# Install nodejs +NODE_VERSION_FULL="${NODE_VERSION_FULL:-14.21.3}" -NODE_VERSION="14" -nvm install "$NODE_VERSION" -nvm use "$NODE_VERSION" +if [[ "${ARCH}" == "x86_64" ]]; then + NODE_URL="https://nodejs.org/dist/v${NODE_VERSION_FULL}/node-v${NODE_VERSION_FULL}-linux-x64.tar.xz" + NODE_SHA256="05c08a107c50572ab39ce9e8663a2a2d696b5d262d5bd6f98d84b997ce932d9a" +else # aarch64 + NODE_URL="https://nodejs.org/dist/v${NODE_VERSION_FULL}/node-v${NODE_VERSION_FULL}-linux-arm64.tar.xz" + NODE_SHA256="f06642bfcf0b8cc50231624629bec58b183954641b638e38ed6f94cd39e8a6ef" +fi + +NODE_DIR="/usr/local/node" +curl -fsSL "${NODE_URL}" -o /tmp/node.tar.xz +echo "$NODE_SHA256 /tmp/node.tar.xz" | sha256sum -c - +sudo mkdir -p "$NODE_DIR" +sudo tar -xf /tmp/node.tar.xz -C "$NODE_DIR" --strip-components=1 +rm /tmp/node.tar.xz # Install bazel -npm install -g @bazel/bazelisk mkdir -p "$HOME"/bin -ln -sf "$(which bazelisk)" "$HOME"/bin/bazel +if [[ "${ARCH}" == "x86_64" ]]; then + BAZELISK_URL="https://github.com/bazelbuild/bazelisk/releases/download/${BAZELISK_VERSION}/bazelisk-linux-amd64" +else # aarch64 + BAZELISK_URL="https://github.com/bazelbuild/bazelisk/releases/download/${BAZELISK_VERSION}/bazelisk-linux-arm64" +fi + +curl -sSfL -o /tmp/bazelisk "${BAZELISK_URL}" +chmod +x /tmp/bazelisk +sudo mv /tmp/bazelisk /usr/local/bin/bazelisk +sudo ln -sf /usr/local/bin/bazelisk /usr/local/bin/bazel # Use python3.9 as default python3 -ln -sf /usr/local/bin/python3.9 /usr/local/bin/python3 +sudo ln -sf /usr/local/bin/python3.9 /usr/local/bin/python3 { echo "build --config=ci" diff --git a/ci/build/build-manylinux-ray.sh b/ci/build/build-manylinux-ray.sh index e81eb1da9ea8..c32d23ac6347 100755 --- a/ci/build/build-manylinux-ray.sh +++ b/ci/build/build-manylinux-ray.sh @@ -11,7 +11,11 @@ if [[ "${RAY_INSTALL_JAVA}" == "1" ]]; then bazel build //java:ray_java_pkg fi +export 
PATH="/usr/local/node/bin:$PATH" + # Build ray dashboard -cd python/ray/dashboard/client -npm ci -npm run build +( + cd python/ray/dashboard/client + npm ci + npm run build +) diff --git a/ci/build/build-manylinux-wheel.sh b/ci/build/build-manylinux-wheel.sh index a324091b7903..b2b1abdadaa7 100755 --- a/ci/build/build-manylinux-wheel.sh +++ b/ci/build/build-manylinux-wheel.sh @@ -28,13 +28,13 @@ sudo ln -sf "/opt/python/${PYTHON}/bin/python3" /usr/local/bin/python3 # build ray wheel PATH="/opt/python/${PYTHON}/bin:$PATH" RAY_INSTALL_JAVA=0 \ -"/opt/python/${PYTHON}/bin/python" -m pip wheel -q -w dist . --no-deps +"/opt/python/${PYTHON}/bin/python" -m pip wheel -v -w dist . --no-deps if [[ "${RAY_DISABLE_EXTRA_CPP:-}" != 1 ]]; then # build ray-cpp wheel PATH="/opt/python/${PYTHON}/bin:$PATH" RAY_INSTALL_JAVA=0 \ - RAY_INSTALL_CPP=1 "/opt/python/${PYTHON}/bin/python" -m pip wheel -q -w dist . --no-deps + RAY_INSTALL_CPP=1 "/opt/python/${PYTHON}/bin/python" -m pip wheel -v -w dist . --no-deps fi # Rename the wheels so that they can be uploaded to PyPI. 
TODO(rkn): This is a diff --git a/ci/build/build-placeholder-wheel.sh b/ci/build/build-placeholder-wheel.sh new file mode 100755 index 000000000000..effa117c3da8 --- /dev/null +++ b/ci/build/build-placeholder-wheel.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -exuo pipefail + +export RAY_DEBUG_BUILD=deps-only + +PYTHON_VERSIONS=("3.9" "3.10" "3.11" "3.12") + +for PYTHON_VERSION in "${PYTHON_VERSIONS[@]}"; do + uv build --wheel --directory python/ -o ../.whl/ --force-pep517 --python "$PYTHON_VERSION" +done diff --git a/ci/build/build-ray-docker.sh b/ci/build/build-ray-docker.sh index 59857533d5ee..86325491d97f 100755 --- a/ci/build/build-ray-docker.sh +++ b/ci/build/build-ray-docker.sh @@ -7,6 +7,9 @@ CONSTRAINTS_FILE="$3" DEST_IMAGE="$4" PIP_FREEZE_FILE="$5" +RAY_VERSION="$(python python/ray/_version.py | cut -d' ' -f1)" +RAY_COMMIT="$(git rev-parse HEAD)" + CPU_TMP="$(mktemp -d)" cp -r .whl "${CPU_TMP}/.whl" @@ -20,6 +23,8 @@ tar --mtime="UTC 2020-01-01" -c -f - . \ --build-arg FULL_BASE_IMAGE="$SOURCE_IMAGE" \ --build-arg WHEEL_PATH=".whl/${WHEEL_NAME}" \ --build-arg CONSTRAINTS_FILE="$CONSTRAINTS_FILE" \ + --label "io.ray.ray-version=$RAY_VERSION" \ + --label "io.ray.ray-commit=$RAY_COMMIT" \ -t "$DEST_IMAGE" -f Dockerfile - # Copy the pip freeze file to the artifact mount. diff --git a/ci/build/get_build_info.py b/ci/build/get_build_info.py index b22918551788..ae0758382b26 100755 --- a/ci/build/get_build_info.py +++ b/ci/build/get_build_info.py @@ -10,9 +10,9 @@ } """ +import json import os import platform -import json def gha_get_self_url(): diff --git a/ci/build/test-linux-placeholder-wheel.sh b/ci/build/test-linux-placeholder-wheel.sh new file mode 100755 index 000000000000..921f7ee05b6d --- /dev/null +++ b/ci/build/test-linux-placeholder-wheel.sh @@ -0,0 +1,42 @@ +#!/bin/bash +set -exuo pipefail + +PYTHON="$1" + +if [[ ! "${OSTYPE}" =~ ^linux ]]; then + echo "ERROR: This wheel test script is only for Linux platforms." 
>/dev/stderr + exit 1 +fi + +PYTHON_VERSION="${PYTHON//./}" + +which python + +which pip + +RAY_PLACEHOLDER_VERSION="100.0.0-dev" +MINIFORGE_BIN_PATH="/opt/miniforge/bin" +PYTHON_EXE="${MINIFORGE_BIN_PATH}/python" +PIP_CMD="${MINIFORGE_BIN_PATH}/pip" +PIP_COMPILE_CMD="${MINIFORGE_BIN_PATH}/pip-compile" +# Find the appropriate wheel by grepping for the Python version. +PYTHON_WHEEL=$(find ./.whl -maxdepth 1 -type f -name "*${PYTHON_VERSION}*.whl" -print -quit) + +if [[ -z "$PYTHON_WHEEL" ]]; then + echo "No wheel found for pattern *${PYTHON_VERSION}*.whl" >/dev/stderr + exit 1 +fi + +"$PYTHON_EXE" --version + +"$PIP_CMD" install --upgrade pip + +"$PIP_CMD" install pip-tools + +"$PIP_COMPILE_CMD" --version + +echo "ray[all]==${RAY_PLACEHOLDER_VERSION}" > ray-requirement.txt + +"$PIP_COMPILE_CMD" ray-requirement.txt -o /ray.lock --find-links=.whl/ + +echo "✅ Completed ray placeholder wheel test" diff --git a/ci/ci.sh b/ci/ci.sh index 8afa3ccc78a6..cea4b38f5e00 100755 --- a/ci/ci.sh +++ b/ci/ci.sh @@ -151,8 +151,8 @@ test_macos_wheels() { return "${TEST_WHEEL_RESULT}" } -install_npm_project() { - if [ "${OSTYPE}" = msys ]; then +_install_npm_project() { + if [[ "${OSTYPE}" == msys ]]; then # Not Windows-compatible: https://github.com/npm/cli/issues/558#issuecomment-584673763 { echo "WARNING: Skipping NPM due to module incompatibilities with Windows"; } 2> /dev/null else @@ -170,14 +170,16 @@ build_dashboard_front_end() { cd ray/dashboard/client # skip nvm activation on buildkite linux instances. - if [ -z "${BUILDKITE-}" ] || [[ "${OSTYPE}" != linux* ]]; then - set +x # suppress set -x since it'll get very noisy here - . "${HOME}/.nvm/nvm.sh" - NODE_VERSION="14" - nvm install $NODE_VERSION - nvm use --silent $NODE_VERSION + if [[ -z "${BUILDKITE-}" || "${OSTYPE}" != linux* ]]; then + if [[ -d "${HOME}/.nvm" ]]; then + set +x # suppress set -x since it'll get very noisy here + . 
"${HOME}/.nvm/nvm.sh" + NODE_VERSION="14" + nvm install $NODE_VERSION + nvm use --silent $NODE_VERSION + fi fi - install_npm_project + _install_npm_project npm run build ) fi @@ -242,25 +244,20 @@ install_ray() { ) } -validate_wheels_commit_str() { - if [ "${OSTYPE}" = msys ]; then - echo "Windows builds do not set the commit string, skipping wheel commit validity check." - return 0 - fi - - if [ -n "${BUILDKITE_COMMIT}" ]; then - EXPECTED_COMMIT=${BUILDKITE_COMMIT:-} +_validate_macos_wheels_commit_str() { + if [[ -n "${BUILDKITE_COMMIT}" ]]; then + EXPECTED_COMMIT="${BUILDKITE_COMMIT:-}" else - EXPECTED_COMMIT=${TRAVIS_COMMIT:-} + EXPECTED_COMMIT="$(git rev-parse HEAD)" fi - if [ -z "$EXPECTED_COMMIT" ]; then - echo "Could not validate expected wheel commits: TRAVIS_COMMIT is empty." - return 0 + if [[ -z "$EXPECTED_COMMIT" ]]; then + echo "Could not validate expected wheel commits: BUILDKITE_COMMIT is empty." >&2 + exit 1 fi for whl in .whl/*.whl; do - basename=${whl##*/} + basename="${whl##*/}" if [[ "$basename" =~ "_cpp" ]]; then # cpp wheels cannot be checked this way @@ -281,85 +278,29 @@ validate_wheels_commit_str() { echo "All wheels passed the sanity check and have the correct wheel commit set." } -build_wheels_and_jars() { +build_macos_wheels_and_jars() { + if [[ "${OSTYPE}" != darwin* ]]; then + echo "Not on macOS" + exit 1 + fi + _bazel_build_before_install # Create wheel output directory and empty contents # If buildkite runners are re-used, wheels from previous builds might be here, so we delete them. + rm -rf .whl mkdir -p .whl - rm -rf .whl/* || true - - case "${OSTYPE}" in - linux*) - # Mount bazel cache dir to the docker container. - # For the linux wheel build, we use a shared cache between all - # wheels, but not between different travis runs, because that - # caused timeouts in the past. See the "cache: false" line below. 
- local MOUNT_BAZEL_CACHE=( - -e "TRAVIS=true" - -e "TRAVIS_PULL_REQUEST=${TRAVIS_PULL_REQUEST:-false}" - -e "TRAVIS_COMMIT=${TRAVIS_COMMIT}" - -e "CI=${CI}" - -e "RAY_INSTALL_JAVA=${RAY_INSTALL_JAVA:-1}" - -e "BUILDKITE=${BUILDKITE:-}" - -e "BUILDKITE_PULL_REQUEST=${BUILDKITE_PULL_REQUEST:-}" - -e "BUILDKITE_BAZEL_CACHE_URL=${BUILDKITE_BAZEL_CACHE_URL:-}" - -e "RAY_DEBUG_BUILD=${RAY_DEBUG_BUILD:-}" - -e "BUILD_ONE_PYTHON_ONLY=${BUILD_ONE_PYTHON_ONLY:-}" - ) - - IMAGE_NAME="quay.io/pypa/manylinux2014_${HOSTTYPE}" - IMAGE_TAG="2022-12-20-b4884d9" - - local MOUNT_ENV=() - if [[ "${LINUX_JARS-}" == "1" ]]; then - MOUNT_ENV+=(-e "BUILD_JAR=1") - fi - if [[ -z "${BUILDKITE-}" ]]; then - # This command should be kept in sync with ray/python/README-building-wheels.md, - # except the "${MOUNT_BAZEL_CACHE[@]}" part. - docker run --rm -w /ray -v "${PWD}":/ray "${MOUNT_BAZEL_CACHE[@]}" \ - "${MOUNT_ENV[@]}" "${IMAGE_NAME}:${IMAGE_TAG}" /ray/python/build-wheel-manylinux2014.sh - else - rm -rf /ray-mount/* - rm -rf /ray-mount/.whl || true - rm -rf /ray/.whl || true - cp -rT /ray /ray-mount - ls -a /ray-mount - docker run --rm -w /ray -v /ray:/ray "${MOUNT_BAZEL_CACHE[@]}" \ - "${MOUNT_ENV[@]}" "${IMAGE_NAME}:${IMAGE_TAG}" /ray/python/build-wheel-manylinux2014.sh - cp -rT /ray-mount /ray # copy new files back here - find . | grep whl # testing - - # Sync the directory to buildkite artifacts - rm -rf /artifact-mount/.whl || true - - if [ "${UPLOAD_WHEELS_AS_ARTIFACTS-}" = "1" ]; then - cp -r .whl /artifact-mount/.whl - chmod -R 777 /artifact-mount/.whl - fi + # This command should be kept in sync with ray/python/README-building-wheels.md. + "${WORKSPACE_DIR}"/python/build-wheel-macos.sh - validate_wheels_commit_str - fi - ;; - darwin*) - # This command should be kept in sync with ray/python/README-building-wheels.md. 
- "${WORKSPACE_DIR}"/python/build-wheel-macos.sh - mkdir -p /tmp/artifacts/.whl - rm -rf /tmp/artifacts/.whl || true - - if [[ "${UPLOAD_WHEELS_AS_ARTIFACTS-}" == "1" ]]; then - cp -r .whl /tmp/artifacts/.whl - chmod -R 777 /tmp/artifacts/.whl - fi + mkdir -p /tmp/artifacts + rm -rf /tmp/artifacts/.whl + cp -r .whl /tmp/artifacts/.whl + chmod 755 /tmp/artifacts/.whl + chmod 644 /tmp/artifacts/.whl/* - validate_wheels_commit_str - ;; - msys*) - "${WORKSPACE_DIR}"/python/build-wheel-windows.sh - ;; - esac + _validate_macos_wheels_commit_str } configure_system() { diff --git a/ci/compile_llm_requirements.sh b/ci/compile_llm_requirements.sh index b12e7df6c07d..cf71563bf47c 100755 --- a/ci/compile_llm_requirements.sh +++ b/ci/compile_llm_requirements.sh @@ -2,80 +2,15 @@ set -euo pipefail -PYTHON_CODE="$(python -c "import sys; v=sys.version_info; print(f'py{v.major}{v.minor}')")" -if [[ "${PYTHON_CODE}" != "py311" ]]; then - echo "--- Python version is not 3.11" - echo "--- Current Python version: ${PYTHON_CODE}" - exit 1 -fi +CONFIG_PATH="${1:-ci/raydepsets/rayllm.depsets.yaml}" -for CUDA_CODE in cpu cu121 cu128; do - PYTHON_CUDA_CODE="${PYTHON_CODE}_${CUDA_CODE}" +mkdir -p /tmp/ray-deps - echo "--- Compile dependencies for ${PYTHON_CODE}_${CUDA_CODE}" +# Remove the GPU constraints +cp python/requirements_compiled.txt /tmp/ray-deps/requirements_compiled.txt +sed -e '/^--extra-index-url /d' -e '/^--find-links /d' /tmp/ray-deps/requirements_compiled.txt > /tmp/ray-deps/requirements_compiled.txt.tmp +mv /tmp/ray-deps/requirements_compiled.txt.tmp /tmp/ray-deps/requirements_compiled.txt - UV_PIP_COMPILE=( - uv pip compile --generate-hashes --strip-extras - --unsafe-package ray - # The version we use on python 3.9 is not installable on python 3.11 - --unsafe-package grpcio-tools - # setuptools should not be pinned. 
- --unsafe-package setuptools - --index-url "https://pypi.org/simple" - --extra-index-url "https://download.pytorch.org/whl/${CUDA_CODE}" - --index-strategy unsafe-best-match - --no-strip-markers - --emit-index-url - --emit-find-links - ) - - mkdir -p /tmp/ray-deps - - # Remove the GPU constraints - cp python/requirements_compiled.txt /tmp/ray-deps/requirements_compiled.txt - sed -i '/^--extra-index-url /d' /tmp/ray-deps/requirements_compiled.txt - sed -i '/^--find-links /d' /tmp/ray-deps/requirements_compiled.txt - - # First, extract base test dependencies from the current compiled mono repo one. - # This also expands to the indirect dependencies for this Python version & platform. - # - # Needs to use the exact torch version. - echo "--- Compile ray base test dependencies" - "${UV_PIP_COMPILE[@]}" \ - -c "/tmp/ray-deps/requirements_compiled.txt" \ - "python/requirements.txt" \ - "python/requirements/cloud-requirements.txt" \ - "python/requirements/base-test-requirements.txt" \ - -o "python/requirements_compiled_ray_test_${PYTHON_CUDA_CODE}.txt" - - # Second, expand it into LLM test dependencies - echo "--- Compile LLM test dependencies" - "${UV_PIP_COMPILE[@]}" \ - -c "python/requirements_compiled_ray_test_${PYTHON_CUDA_CODE}.txt" \ - "python/requirements.txt" \ - "python/requirements/cloud-requirements.txt" \ - "python/requirements/base-test-requirements.txt" \ - "python/requirements/llm/llm-requirements.txt" \ - "python/requirements/llm/llm-test-requirements.txt" \ - -o "python/requirements_compiled_rayllm_test_${PYTHON_CUDA_CODE}.txt" - - # Third, extract the ray base dependencies from ray base test dependencies. - # TODO(aslonnie): This should be used for installing ray in the container images. 
- echo "--- Compile ray base dependencies" - "${UV_PIP_COMPILE[@]}" \ - -c "python/requirements_compiled_ray_test_${PYTHON_CUDA_CODE}.txt" \ - "python/requirements.txt" \ - -o "python/requirements_compiled_ray_${PYTHON_CUDA_CODE}.txt" - - # Finally, extract the LLM dependencies from the LLM test dependencies, - # which is also an expansion of the ray base dependencies. - # TODO(aslonnie): This should be used for installing ray[llm] in the container images. - echo "--- Compile LLM dependencies" - "${UV_PIP_COMPILE[@]}" \ - -c "python/requirements_compiled_rayllm_test_${PYTHON_CUDA_CODE}.txt" \ - "python/requirements.txt" \ - "python/requirements/llm/llm-requirements.txt" \ - -o "python/requirements_compiled_rayllm_${PYTHON_CUDA_CODE}.txt" -done +bazel run //ci/raydepsets:raydepsets -- build "${CONFIG_PATH}" echo "--- Done" diff --git a/ci/docker/llm.build.Dockerfile b/ci/docker/llm.build.Dockerfile index 42e1dca1ac03..312d31c5e94b 100644 --- a/ci/docker/llm.build.Dockerfile +++ b/ci/docker/llm.build.Dockerfile @@ -17,6 +17,6 @@ set -euo pipefail SKIP_PYTHON_PACKAGES=1 ./ci/env/install-dependencies.sh -pip install --no-deps -r python/requirements_compiled_rayllm_test_py311_$RAY_CUDA_CODE.txt +pip install --no-deps -r python/deplocks/llm/rayllm_test_py311_${RAY_CUDA_CODE}.lock EOF diff --git a/ci/docker/llm.build.wanda.yaml b/ci/docker/llm.build.wanda.yaml index 5779c145fcf9..6d89370977a3 100644 --- a/ci/docker/llm.build.wanda.yaml +++ b/ci/docker/llm.build.wanda.yaml @@ -5,8 +5,8 @@ srcs: - ci/env/install-dependencies.sh - ci/env/install-llvm-binaries.sh - ci/suppress_output - - python/requirements_compiled_rayllm_test_py311_cpu.txt - - python/requirements_compiled_rayllm_test_py311_cu128.txt + - python/deplocks/llm/rayllm_test_py311_cpu.lock + - python/deplocks/llm/rayllm_test_py311_cu128.lock tags: - cr.ray.io/rayproject/$IMAGE_TO build_args: diff --git a/ci/docker/manylinux.Dockerfile b/ci/docker/manylinux.Dockerfile index d090e53b5e1f..7a243e5033f1 100644 --- 
a/ci/docker/manylinux.Dockerfile +++ b/ci/docker/manylinux.Dockerfile @@ -4,13 +4,18 @@ ARG HOSTTYPE FROM quay.io/pypa/manylinux2014_${HOSTTYPE}:2024-07-02-9ac04ee ARG BUILDKITE_BAZEL_CACHE_URL +ARG RAYCI_DISABLE_JAVA=false ENV BUILD_JAR=1 +ENV RAYCI_DISABLE_JAVA=$RAYCI_DISABLE_JAVA ENV RAY_INSTALL_JAVA=1 ENV BUILDKITE_BAZEL_CACHE_URL=$BUILDKITE_BAZEL_CACHE_URL RUN yum -y install sudo +RUN curl -LsSf https://astral.sh/uv/0.8.17/install.sh | \ + env UV_INSTALL_DIR=/usr/local/bin sh + COPY ci/build/build-manylinux-forge.sh /tmp/build-manylinux-forge.sh RUN ./tmp/build-manylinux-forge.sh diff --git a/ci/docker/manylinux.aarch64.wanda.yaml b/ci/docker/manylinux.aarch64.wanda.yaml index 5b72e6df5bd3..fb5827b560dc 100644 --- a/ci/docker/manylinux.aarch64.wanda.yaml +++ b/ci/docker/manylinux.aarch64.wanda.yaml @@ -5,5 +5,6 @@ srcs: - ci/build/build-manylinux-forge.sh build_args: - BUILDKITE_BAZEL_CACHE_URL + - RAYCI_DISABLE_JAVA - HOSTTYPE=aarch64 dockerfile: ci/docker/manylinux.Dockerfile diff --git a/ci/docker/manylinux.wanda.yaml b/ci/docker/manylinux.wanda.yaml index 3e01ed3a2cd6..5b72115f3cc7 100644 --- a/ci/docker/manylinux.wanda.yaml +++ b/ci/docker/manylinux.wanda.yaml @@ -5,5 +5,6 @@ srcs: - ci/build/build-manylinux-forge.sh build_args: - BUILDKITE_BAZEL_CACHE_URL + - RAYCI_DISABLE_JAVA - HOSTTYPE=x86_64 dockerfile: ci/docker/manylinux.Dockerfile diff --git a/ci/docker/min.build.Dockerfile b/ci/docker/min.build.Dockerfile index 00e6082788d6..fa88dbfb435d 100644 --- a/ci/docker/min.build.Dockerfile +++ b/ci/docker/min.build.Dockerfile @@ -31,6 +31,7 @@ elif [[ "${EXTRA_DEPENDENCY}" == "default" ]]; then pip-compile -o min_requirements.txt python/setup.py --extra default elif [[ "${EXTRA_DEPENDENCY}" == "serve" ]]; then echo "httpx==0.27.2" >> /tmp/min_build_requirements.txt + echo "pytest-asyncio==1.1.0" >> /tmp/min_build_requirements.txt pip-compile -o min_requirements.txt /tmp/min_build_requirements.txt python/setup.py --extra "serve-grpc" rm 
/tmp/min_build_requirements.txt fi diff --git a/ci/docker/ray.cpu.base.aarch64.wanda.yaml b/ci/docker/ray.cpu.base.aarch64.wanda.yaml deleted file mode 100644 index 1726fb261825..000000000000 --- a/ci/docker/ray.cpu.base.aarch64.wanda.yaml +++ /dev/null @@ -1,11 +0,0 @@ -name: "ray-py$PYTHON_VERSION-cpu-base-aarch64" -froms: ["ubuntu:22.04"] -dockerfile: docker/base-deps/Dockerfile -srcs: - - python/requirements_compiled.txt -build_args: - - PYTHON_VERSION - - BASE_IMAGE=ubuntu:22.04 - - HOSTTYPE=aarch64 -tags: - - cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cpu-base-aarch64 diff --git a/ci/docker/ray.cuda.base.aarch64.wanda.yaml b/ci/docker/ray.cuda.base.aarch64.wanda.yaml deleted file mode 100644 index 1d1d6df12787..000000000000 --- a/ci/docker/ray.cuda.base.aarch64.wanda.yaml +++ /dev/null @@ -1,11 +0,0 @@ -name: "ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base-aarch64" -froms: ["nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04"] -dockerfile: docker/base-deps/Dockerfile -srcs: - - python/requirements_compiled.txt -build_args: - - PYTHON_VERSION - - BASE_IMAGE=nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04 - - HOSTTYPE=aarch64 -tags: - - cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base-aarch64 diff --git a/ci/docker/runtime_env_container/Dockerfile b/ci/docker/runtime_env_container/Dockerfile index 45501d3e0f5a..ad7972761bce 100644 --- a/ci/docker/runtime_env_container/Dockerfile +++ b/ci/docker/runtime_env_container/Dockerfile @@ -4,4 +4,5 @@ FROM $BASE_IMAGE COPY python/ray/tests/runtime_env_container/ /home/ray/tests/ # Install podman +RUN pip install --no-cache-dir -c /home/ray/requirements_compiled.txt httpx RUN sudo apt-get update && sudo apt-get install podman -y diff --git a/ci/docker/serve.build.Dockerfile b/ci/docker/serve.build.Dockerfile index 1b38777f0f25..8753d5a2bd37 100644 --- a/ci/docker/serve.build.Dockerfile +++ b/ci/docker/serve.build.Dockerfile @@ -27,7 +27,11 @@ if [[ "${PYTHON-}" != "3.12" ]]; then tensorflow tensorflow-probability torch 
torchvision \ transformers aioboto3 fi -git clone https://github.com/wg/wrk.git /tmp/wrk && pushd /tmp/wrk && make -j && sudo cp wrk /usr/local/bin && popd + +git clone --branch=4.2.0 --depth=1 https://github.com/wg/wrk.git /tmp/wrk +make -C /tmp/wrk -j +sudo cp /tmp/wrk/wrk /usr/local/bin/wrk +rm -rf /tmp/wrk # Install custom Pydantic version if requested. if [[ -n "${PYDANTIC_VERSION-}" ]]; then diff --git a/ci/env/check_minimal_install.py b/ci/env/check_minimal_install.py index 8bf4630ee210..c9ec2255aed6 100644 --- a/ci/env/check_minimal_install.py +++ b/ci/env/check_minimal_install.py @@ -8,9 +8,9 @@ It also ensures the correct Python version. """ -from typing import List import argparse import sys +from typing import List # These are taken from `setup.py` for ray[default] DEFAULT_BLACKLIST = [ diff --git a/ci/env/setup_credentials.py b/ci/env/setup_credentials.py index 2f03b1c766a4..86a886cf75aa 100644 --- a/ci/env/setup_credentials.py +++ b/ci/env/setup_credentials.py @@ -7,10 +7,11 @@ export WANDB_API_KEY=abcd export COMET_API_KEY=efgh """ -import boto3 import json import sys +import boto3 + AWS_AIR_SECRETS_ARN = ( "arn:aws:secretsmanager:us-west-2:029272617770:secret:" "oss-ci/ray-air-test-secrets20221014164754935800000002-UONblX" diff --git a/ci/lint/check_cpp_files_inclusion.py b/ci/lint/check_cpp_files_inclusion.py index b1e4df83d19f..d849b0f765eb 100755 --- a/ci/lint/check_cpp_files_inclusion.py +++ b/ci/lint/check_cpp_files_inclusion.py @@ -2,8 +2,8 @@ """This script checks whether header file inclusion for ray core C++ code is correct. """ -import sys import re +import sys def check_ray_core_inclusion(fname: str): diff --git a/ci/lint/format.sh b/ci/lint/format.sh index a540b3d65e21..b2a98d1527bb 100755 --- a/ci/lint/format.sh +++ b/ci/lint/format.sh @@ -88,6 +88,7 @@ else fi if command -v clang-format >/dev/null; then + # This version should be kept in sync with the clang-format version tag in `.pre-commit-config.yaml`. 
CLANG_FORMAT_VERSION=$(clang-format --version | awk '{print $3}') tool_version_check "clang-format" "$CLANG_FORMAT_VERSION" "12.0.1" else diff --git a/ci/lint/git-clang-format b/ci/lint/git-clang-format index 46b466ee191b..6972b1bf7c6e 100755 --- a/ci/lint/git-clang-format +++ b/ci/lint/git-clang-format @@ -25,6 +25,7 @@ Requires Python 2.7 or Python 3 """ from __future__ import absolute_import, division, print_function + import argparse import collections import contextlib diff --git a/ci/lint/lint.sh b/ci/lint/lint.sh index e8355dda12ba..53d7156e86c8 100755 --- a/ci/lint/lint.sh +++ b/ci/lint/lint.sh @@ -48,7 +48,11 @@ pre_commit() { pre_commit_pydoclint() { # Run pre-commit pydoclint on all files pip install -c python/requirements_compiled.txt pre-commit clang-format - pre-commit run pydoclint --all-files --show-diff-on-failure + pre-commit run pydoclint --hook-stage manual --all-files --show-diff-on-failure + git diff --quiet -- ci/lint/pydoclint-baseline.txt || { + echo "Baseline needs update. Run the CI-style hook: \"pre-commit run pydoclint --hook-stage manual --all-files --show-diff-on-failure\" locally and commit the baseline." + exit 1 + } } code_format() { diff --git a/ci/lint/pydoclint-baseline.txt b/ci/lint/pydoclint-baseline.txt index 99adbd1f2fe2..602d58a3e274 100644 --- a/ci/lint/pydoclint-baseline.txt +++ b/ci/lint/pydoclint-baseline.txt @@ -307,7 +307,7 @@ python/ray/actor.py DOC201: Method `ActorMethod.options` does not have a return section in docstring DOC101: Method `_ActorClassMetadata.__init__`: Docstring contains fewer arguments than in function signature. DOC107: Method `_ActorClassMetadata.__init__`: The option `--arg-type-hints-in-signature` is `True` but not all args in the signature have type hints - DOC103: Method `_ActorClassMetadata.__init__`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). 
Arguments in the function signature but not in the docstring: [accelerator_type: , actor_creation_function_descriptor: , class_id: , concurrency_groups: , enable_tensor_transport: bool, label_selector: , language: , max_restarts: , max_task_retries: , memory: , modified_class: , num_cpus: , num_gpus: , object_store_memory: , resources: , runtime_env: , scheduling_strategy: SchedulingStrategyT]. + DOC103: Method `_ActorClassMetadata.__init__`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [accelerator_type: , actor_creation_function_descriptor: , class_id: , concurrency_groups: , enable_tensor_transport: bool, label_selector: , language: , max_restarts: , max_task_retries: , memory: , method_meta: , modified_class: , num_cpus: , num_gpus: , object_store_memory: , resources: , runtime_env: , scheduling_strategy: SchedulingStrategyT]. DOC101: Method `ActorClass.__init__`: Docstring contains fewer arguments than in function signature. DOC106: Method `ActorClass.__init__`: The option `--arg-type-hints-in-signature` is `True` but there are no argument type hints in the signature DOC107: Method `ActorClass.__init__`: The option `--arg-type-hints-in-signature` is `True` but not all args in the signature have type hints @@ -957,10 +957,10 @@ python/ray/dashboard/modules/reporter/reporter_head.py DOC101: Method `ReportHead.get_task_cpu_profile`: Docstring contains fewer arguments than in function signature. DOC103: Method `ReportHead.get_task_cpu_profile`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [req: aiohttp.web.Request]. 
DOC102: Method `ReportHead.get_traceback`: Docstring contains more arguments than in function signature. - DOC103: Method `ReportHead.get_traceback`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [req: aiohttp.web.Request]. Arguments in the docstring but not in the function signature: [ip: , pid: ]. + DOC103: Method `ReportHead.get_traceback`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [req: aiohttp.web.Request]. Arguments in the docstring but not in the function signature: [ip or node_id: , pid: ]. DOC201: Method `ReportHead.get_traceback` does not have a return section in docstring DOC102: Method `ReportHead.cpu_profile`: Docstring contains more arguments than in function signature. - DOC103: Method `ReportHead.cpu_profile`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [req: aiohttp.web.Request]. Arguments in the docstring but not in the function signature: [duration: , format: , ip: , native: , pid: ]. + DOC103: Method `ReportHead.cpu_profile`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [req: aiohttp.web.Request]. Arguments in the docstring but not in the function signature: [duration: , format: , ip or node_id: , native: , pid: ]. 
DOC201: Method `ReportHead.cpu_profile` does not have a return section in docstring DOC101: Method `ReportHead.memory_profile`: Docstring contains fewer arguments than in function signature. DOC103: Method `ReportHead.memory_profile`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [req: aiohttp.web.Request]. @@ -1098,8 +1098,6 @@ python/ray/data/_internal/execution/interfaces/task_context.py python/ray/data/_internal/execution/operators/base_physical_operator.py DOC101: Method `OneToOneOperator.__init__`: Docstring contains fewer arguments than in function signature. DOC103: Method `OneToOneOperator.__init__`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [data_context: DataContext]. - DOC101: Method `AllToAllOperator.__init__`: Docstring contains fewer arguments than in function signature. - DOC103: Method `AllToAllOperator.__init__`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [data_context: DataContext, target_max_block_size: Optional[int]]. DOC103: Method `NAryOperator.__init__`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [*input_ops: LogicalOperator, data_context: DataContext]. Arguments in the docstring but not in the function signature: [input_op: , name: ]. 
-------------------- python/ray/data/_internal/execution/operators/hash_shuffle.py @@ -1332,10 +1330,6 @@ python/ray/data/datasource/filename_provider.py DOC201: Method `FilenameProvider.get_filename_for_block` does not have a return section in docstring DOC201: Method `FilenameProvider.get_filename_for_row` does not have a return section in docstring -------------------- -python/ray/data/datasource/parquet_meta_provider.py - DOC101: Method `ParquetMetadataProvider.prefetch_file_metadata`: Docstring contains fewer arguments than in function signature. - DOC103: Method `ParquetMetadataProvider.prefetch_file_metadata`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [**ray_remote_args: ]. --------------------- python/ray/data/datasource/path_util.py DOC201: Function `_has_file_extension` does not have a return section in docstring DOC201: Function `_resolve_paths_and_filesystem` does not have a return section in docstring @@ -1484,10 +1478,6 @@ python/ray/llm/_internal/batch/processor/base.py DOC101: Method `ProcessorBuilder.build`: Docstring contains fewer arguments than in function signature. DOC103: Method `ProcessorBuilder.build`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [**kwargs: ]. -------------------- -python/ray/llm/_internal/batch/processor/vllm_engine_proc.py - DOC101: Function `build_vllm_engine_processor`: Docstring contains fewer arguments than in function signature. - DOC103: Function `build_vllm_engine_processor`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). 
Arguments in the function signature but not in the docstring: [telemetry_agent: Optional[TelemetryAgent]]. --------------------- python/ray/llm/_internal/batch/stages/base.py DOC405: Method `StatefulStageUDF.__call__` has both "return" and "yield" statements. Please use Generator[YieldType, SendType, ReturnType] as the return type annotation, and put your yield type in YieldType and return type in ReturnType. More details in https://jsh9.github.io/pydoclint/notes_generator_vs_iterator.html -------------------- @@ -2006,14 +1996,6 @@ python/ray/train/v2/_internal/callbacks/accelerators.py python/ray/train/v2/_internal/execution/checkpoint/checkpoint_manager.py DOC103: Method `CheckpointManager.register_checkpoint`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [checkpoint_result: _TrainingResult]. Arguments in the docstring but not in the function signature: [checkpoint: ]. -------------------- -python/ray/train/v2/_internal/execution/context.py - DOC101: Method `TrainContext._save_checkpoint`: Docstring contains fewer arguments than in function signature. - DOC103: Method `TrainContext._save_checkpoint`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [checkpoint: Optional[Checkpoint], checkpoint_dir_name: str, metrics: Dict[str, Any]]. --------------------- -python/ray/train/v2/_internal/execution/controller/controller.py - DOC101: Method `TrainController._start_worker_group`: Docstring contains fewer arguments than in function signature. - DOC103: Method `TrainController._start_worker_group`: Docstring arguments are different from function arguments. 
(Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [num_workers: int, resources_per_worker: dict]. --------------------- python/ray/train/v2/_internal/execution/storage.py DOC101: Method `_ExcludingLocalFilesystem.__init__`: Docstring contains fewer arguments than in function signature. DOC103: Method `_ExcludingLocalFilesystem.__init__`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [**kwargs: ]. @@ -2833,3 +2815,7 @@ python/ray/widgets/util.py DOC103: Function `_has_missing`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [*deps: Iterable[Union[str, Optional[str]]]]. Arguments in the docstring but not in the function signature: [deps: ]. DOC103: Function `repr_with_fallback`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [*notebook_deps: Iterable[Union[str, Optional[str]]]]. Arguments in the docstring but not in the function signature: [notebook_deps: ]. 
-------------------- +python/ray/_private/serialization.py + DOC106: Function `_gpu_object_ref_deserializer`: The option `--arg-type-hints-in-signature` is `True` but there are no argument type hints in the signature + DOC107: Function `_gpu_object_ref_deserializer`: The option `--arg-type-hints-in-signature` is `True` but not all args in the signature have type hints +-------------------- diff --git a/ci/pipeline/determine_tests_to_run.py b/ci/pipeline/determine_tests_to_run.py index 2b7e394345f5..c29815199cf4 100644 --- a/ci/pipeline/determine_tests_to_run.py +++ b/ci/pipeline/determine_tests_to_run.py @@ -5,19 +5,8 @@ import os import subprocess import sys -from typing import List, Optional, Set, Tuple from pprint import pformat - - -_ALL_TAGS = set( - """ - always - lint python cpp core_cpp java workflow compiled_graphs dashboard ray_client - data dask serve ml tune train llm rllib rllib_gpu rllib_directly - linux_wheels macos_wheels docker doc python_dependencies tools - release_tests compiled_python spark_on_ray - """.split() -) +from typing import List, Optional, Set, Tuple def _list_changed_files(commit_range): @@ -64,11 +53,13 @@ class TagRule: def __init__( self, tags: List[str], + lineno: int, dirs: Optional[List[str]] = None, files: Optional[List[str]] = None, patterns: Optional[List[str]] = None, ): self.tags = set(tags) + self.lineno = lineno self.dirs = dirs or [] self.patterns = patterns or [] self.files = files or [] @@ -91,7 +82,7 @@ def match_tags(self, changed_file: str) -> Tuple[Set[str], bool]: return set(), False -def _parse_rules(rule_content: str) -> List[TagRule]: +def _parse_rules(rule_content: str) -> Tuple[Set[str], List[TagRule]]: """ Parse the rule config content into a list ot TagRule's. 
@@ -113,6 +104,9 @@ def _parse_rules(rule_content: str) -> List[TagRule]: """ rules: List[TagRule] = [] + tag_defs: Set[str] = set() + tag_defs_ended: bool = False + tags: Set[str] = set() dirs: List[str] = [] files: List[str] = [] @@ -130,13 +124,22 @@ def _parse_rules(rule_content: str) -> List[TagRule]: if comment_index != -1: line = line[:comment_index].strip() # Remove comments. + if line.startswith("!"): + if tag_defs_ended: + raise ValueError("Tag must be declared at file start.") + tag_defs.update(line[1:].split()) + continue + + if not tag_defs_ended: + tag_defs_ended = True + if line.startswith("@"): # tags. # Strip the leading '@' and split into tags. tags.update(line[1:].split()) elif line.startswith(";"): # End of a rule. if line != ";": raise ValueError(f"Unexpected tokens after semicolon on line {lineno}.") - rules.append(TagRule(tags, dirs, files, patterns)) + rules.append(TagRule(tags, lineno, dirs, files, patterns)) tags, dirs, files, patterns = set(), [], [], [] else: if line.find("*") != -1: # Patterns. @@ -148,20 +151,33 @@ def _parse_rules(rule_content: str) -> List[TagRule]: # Append the last rule if not empty. if tags or dirs or files or patterns: - rules.append(TagRule(tags, dirs, files, patterns)) + rules.append(TagRule(tags, lineno, dirs, files, patterns)) - return rules + return tag_defs, rules class TagRuleSet: def __init__(self, content: Optional[str] = None): + self.tag_defs = set() + self.rules = [] + if content is not None: - self.rules = _parse_rules(content) - else: - self.rules = [] + self.add_rules(content) def add_rules(self, content: str): - self.rules.extend(_parse_rules(content)) + tag_defs, rules = _parse_rules(content) + self.tag_defs.update(tag_defs) + self.rules.extend(rules) + + def check_rules(self): + for rule in self.rules: + if not rule.tags: + continue + for tag in rule.tags: + if tag not in self.tag_defs: + raise ValueError( + f"Tag {tag} not declared, used in rule at line {rule.lineno}." 
+ ) def match_tags(self, changed_file: str) -> Tuple[Set[str], bool]: for rule in self.rules: @@ -188,6 +204,8 @@ def match_tags(self, changed_file: str) -> Tuple[Set[str], bool]: with open(config) as f: rules.add_rules(f.read()) + rules.check_rules() + tags: Set[str] = set() tags.add("always") @@ -221,7 +239,7 @@ def _emit(line: str): # Log the modified environment variables visible in console. output_string = " ".join(list(tags)) for tag in tags: - assert tag in _ALL_TAGS, f"Unknown tag {tag}" + assert tag in rules.tag_defs, f"Unknown tag {tag}" print(output_string, file=sys.stderr) # Debug purpose print(output_string) diff --git a/ci/pipeline/test_conditional_testing.py b/ci/pipeline/test_conditional_testing.py index 0e8dd6ba8420..d42e44973d5b 100644 --- a/ci/pipeline/test_conditional_testing.py +++ b/ci/pipeline/test_conditional_testing.py @@ -4,13 +4,13 @@ import tempfile from typing import List, Set -import runfiles import pytest +import runfiles import yaml from ci.pipeline.determine_tests_to_run import TagRule, TagRuleSet -_REPO_NAME = "com_github_ray_project_ray" +_REPO_NAME = "io_ray" _runfiles = runfiles.Create() @@ -43,7 +43,9 @@ - lint ml tune train data - python dashboard linux_wheels macos_wheels java python/ray/dag/dag.py: - - lint python compiled_graphs + - lint python cgraphs_direct_transport +python/ray/experimental/gpu_object_manager/gpu_object_manager.py: + - lint python cgraphs_direct_transport .buildkite/core.rayci.yml: lint python core_cpp java/ray.java: lint java @@ -166,6 +168,7 @@ def __init__(self, file: str, tags: Set[str]): def test_tag_rule(): rule = TagRule( tags=["hit"], + lineno=1, dirs=["fancy"], files=["file.txt"], patterns=["python/*.py"], @@ -182,7 +185,7 @@ def test_tag_rule(): assert rule.match_tags("fancy") == ({"hit"}, True) assert rule.match_tags("not_match") == (set(), False) - skip_rule = TagRule(tags=[], files=["skip.txt"]) + skip_rule = TagRule(tags=[], lineno=1, files=["skip.txt"]) assert skip_rule.match("skip.txt") 
assert skip_rule.match_tags("skip.txt") == (set(), True) assert skip_rule.match_tags("not_match") == (set(), False) @@ -193,8 +196,19 @@ def test_tag_rule_set(): assert rule_set.match_tags("fancy/file.txt") == ({"fancy"}, True) rule_set = TagRuleSet( - "\n".join(["fancy/ #dir", "@fancy", ";", "\t\t ", "foobar.txt", "@foobar"]) + "\n".join( + [ + "!fancy foobar", + "fancy/ #dir", + "@fancy", + ";", + "\t\t ", + "foobar.txt", + "@foobar", + ] + ) ) + rule_set.check_rules() assert rule_set.match_tags("fancy/file.txt") == ({"fancy"}, True) assert rule_set.match_tags("foobar.txt") == ({"foobar"}, True) assert rule_set.match_tags("not_a_match") == (set(), False) @@ -203,5 +217,11 @@ def test_tag_rule_set(): assert rule_set.match_tags("anything") == (set(), False) +def test_tag_rule_set_check_rules(): + rule_set = TagRuleSet("\n".join(["!foobar", "fancy/ #dir", "@fancy"])) + with pytest.raises(ValueError): + rule_set.check_rules() + + if __name__ == "__main__": sys.exit(pytest.main(["-vv", __file__])) diff --git a/ci/pipeline/test_rules.txt b/ci/pipeline/test_rules.txt index e13a83b25e12..9bb208c2a0fb 100644 --- a/ci/pipeline/test_rules.txt +++ b/ci/pipeline/test_rules.txt @@ -3,13 +3,23 @@ # Comment content, after '#', will be ignored. # Empty lines will be ignored too. # +# ! tag1 tag2 tag3 # Declares a tag. A tag must be declared first to be used. +# # Tags must be declared at the beginning. +# # dir/ # Directory to match # file # File to match # dir/*.py # Pattern to match, using fnmatch, matches dir/a.py dir/dir/b.py or dir/.py -# @ tag1 tag2 tag3 # Tags to emit for a rule. A rule without tags is a skipping rule. +# @ tag1 tag2 tag3 # Tags to emit for a rule. A rule without tags is a skipping rule. # # ; # Semicolon to separate rules +! always lint +! python cpp core_cpp java workflow cgraphs_direct_transport dashboard +! ray_client runtime_env_container +! data dask serve ml tune train llm rllib rllib_gpu rllib_directly +! 
linux_wheels macos_wheels docker doc python_dependencies tools +! release_tests spark_on_ray + python/ray/air/ @ ml train tune data linux_wheels ; @@ -18,7 +28,7 @@ python/ray/llm/ doc/source/llm/ .buildkite/llm.rayci.yml ci/docker/llm.build.Dockerfile -python/requirements_compiled_*.txt +python/deplocks/llm/*.lock @ llm ; @@ -84,7 +94,10 @@ python/requirements/ python/ray/dag/ python/ray/experimental/channel/ -@ python compiled_graphs +python/ray/experimental/gpu_object_manager/ +python/ray/experimental/collective/ +python/ray/tests/gpu_objects/ +@ python cgraphs_direct_transport ; python/ray/util/client/ @@ -99,6 +112,10 @@ python/ray/util/spark/ @ python spark_on_ray ; +python/ray/runtime_env/ +@ python runtime_env_container +; + python/ @ ml tune train data # Python changes might impact cross language stack in Java. @@ -199,10 +216,6 @@ ci/docker/forge.aarch64.wanda.yaml ci/docker/manylinux.Dockerfile ci/docker/manylinux.wanda.yaml ci/docker/manylinux.aarch64.wanda.yaml -ci/docker/ray.cpu.base.wanda.yaml -ci/docker/ray.cpu.base.aarch64.wanda.yaml -ci/docker/ray.cuda.base.wanda.yaml -ci/docker/ray.cuda.base.aarch64.wanda.yaml ci/docker/windows.build.Dockerfile ci/docker/windows.build.wanda.yaml build-docker.sh @@ -222,7 +235,7 @@ src/ src/ray/core_worker/experimental*.h src/ray/core_worker/experimental*.cc -@ compiled_graphs +@ cgraphs_direct_transport ; .github/ diff --git a/ci/ray_ci/anyscale_docker_container.py b/ci/ray_ci/anyscale_docker_container.py index b2b5aa1bd169..112b520cd6eb 100644 --- a/ci/ray_ci/anyscale_docker_container.py +++ b/ci/ray_ci/anyscale_docker_container.py @@ -1,5 +1,7 @@ -from ci.ray_ci.docker_container import DockerContainer +from ray_release.configs.global_config import get_global_config + from ci.ray_ci.container import _DOCKER_ECR_REPO, _DOCKER_GCP_REGISTRY +from ci.ray_ci.docker_container import DockerContainer class AnyscaleDockerContainer(DockerContainer): @@ -16,14 +18,14 @@ def run(self) -> None: tag = 
self._get_canonical_tag() ray_image = f"rayproject/{self.image_type}:{tag}" anyscale_image = f"{aws_registry}/anyscale/{self.image_type}:{tag}" - requirement = self._get_requirement_file() + gce_credentials = get_global_config()["aws2gce_credentials"] cmds = [ # build docker image - f"./ci/build/build-anyscale-docker.sh " - f"{ray_image} {anyscale_image} {requirement} {aws_registry}", + "./ci/build/build-anyscale-docker.sh " + + f"{ray_image} {anyscale_image} {aws_registry}", # gcloud login - "./release/gcloud_docker_login.sh release/aws2gce_iam.json", + f"./release/gcloud_docker_login.sh {gce_credentials}", "export PATH=$(pwd)/google-cloud-sdk/bin:$PATH", ] # TODO(can): remove the alias when release test infra uses only the canonical @@ -43,14 +45,3 @@ def run(self) -> None: def _should_upload(self) -> bool: return self.upload - - def _get_requirement_file(self) -> str: - if self.image_type == "ray-ml": - prefix = "requirements_ml" - elif self.image_type == "ray-llm": - prefix = "requirements_llm" - else: - prefix = "requirements" - postfix = self.python_version - - return f"{prefix}_byod_{postfix}.txt" diff --git a/ci/ray_ci/automation/determine_microcheck_step_ids.py b/ci/ray_ci/automation/determine_microcheck_step_ids.py index 4d15da5f8e6e..097afee56aca 100644 --- a/ci/ray_ci/automation/determine_microcheck_step_ids.py +++ b/ci/ray_ci/automation/determine_microcheck_step_ids.py @@ -1,14 +1,15 @@ -import click import os -from ci.ray_ci.utils import ci_init +import click from ray_release.test import ( - Test, LINUX_TEST_PREFIX, - WINDOWS_TEST_PREFIX, MACOS_TEST_PREFIX, + WINDOWS_TEST_PREFIX, + Test, ) +from ci.ray_ci.utils import ci_init + BAZEL_WORKSPACE_DIR = os.environ.get("BUILD_WORKSPACE_DIRECTORY", "") diff --git a/ci/ray_ci/automation/determine_microcheck_tests.py b/ci/ray_ci/automation/determine_microcheck_tests.py index ba40521860bb..67f914d4dff4 100644 --- a/ci/ray_ci/automation/determine_microcheck_tests.py +++ 
b/ci/ray_ci/automation/determine_microcheck_tests.py @@ -1,12 +1,13 @@ -import click -from typing import List, Set, Dict +from typing import Dict, List, Set -from ci.ray_ci.utils import logger, ci_init +import click from ray_release.configs.global_config import get_global_config -from ray_release.test import Test from ray_release.result import ResultStatus +from ray_release.test import Test from ray_release.test_automation.ci_state_machine import CITestStateMachine +from ci.ray_ci.utils import ci_init, logger + # The s3 prefix for the tests that run on Linux. It comes from the bazel prefix rule # linux:// with the character "/" replaced by "_" for s3 compatibility LINUX_TEST_PREFIX = "linux:__" diff --git a/ci/ray_ci/automation/docker_tags_lib.py b/ci/ray_ci/automation/docker_tags_lib.py index 99d3b1f3f0a0..6e780b230c4c 100644 --- a/ci/ray_ci/automation/docker_tags_lib.py +++ b/ci/ray_ci/automation/docker_tags_lib.py @@ -1,25 +1,25 @@ -import subprocess -import re -from datetime import datetime -from typing import List, Optional, Callable, Tuple import os -import sys -from dateutil import parser import platform +import re +import subprocess +import sys +from datetime import datetime +from typing import Callable, List, Optional, Tuple -import docker import requests import runfiles +from dateutil import parser +import docker from ci.ray_ci.builder_container import DEFAULT_ARCHITECTURE, DEFAULT_PYTHON_VERSION from ci.ray_ci.docker_container import ( + ARCHITECTURES_RAY, + ARCHITECTURES_RAY_ML, GPU_PLATFORM, - PYTHON_VERSIONS_RAY, - PYTHON_VERSIONS_RAY_ML, PLATFORMS_RAY, PLATFORMS_RAY_ML, - ARCHITECTURES_RAY, - ARCHITECTURES_RAY_ML, + PYTHON_VERSIONS_RAY, + PYTHON_VERSIONS_RAY_ML, RayType, ) from ci.ray_ci.utils import logger diff --git a/ci/ray_ci/automation/filter_tests.py b/ci/ray_ci/automation/filter_tests.py index 2444902515ab..84c053498e89 100644 --- a/ci/ray_ci/automation/filter_tests.py +++ b/ci/ray_ci/automation/filter_tests.py @@ -1,7 +1,8 @@ import sys + 
import click -from ci.ray_ci.utils import filter_tests, ci_init +from ci.ray_ci.utils import ci_init, filter_tests @click.command() diff --git a/ci/ray_ci/automation/generate_index.py b/ci/ray_ci/automation/generate_index.py index 1cea2c5901ff..ad6844b62bec 100644 --- a/ci/ray_ci/automation/generate_index.py +++ b/ci/ray_ci/automation/generate_index.py @@ -1,11 +1,11 @@ import click -from ci.ray_ci.automation.docker_tags_lib import list_image_tags, generate_index +from ci.ray_ci.automation.docker_tags_lib import generate_index, list_image_tags from ci.ray_ci.docker_container import ( - RayType, + ARCHITECTURES_RAY, PLATFORMS_RAY, PYTHON_VERSIONS_RAY, - ARCHITECTURES_RAY, + RayType, ) diff --git a/ci/ray_ci/automation/get_contributors.py b/ci/ray_ci/automation/get_contributors.py index 5f78ac0b47ab..f2c251a1b834 100644 --- a/ci/ray_ci/automation/get_contributors.py +++ b/ci/ray_ci/automation/get_contributors.py @@ -1,7 +1,7 @@ import os import sys -from subprocess import check_output from collections import defaultdict +from subprocess import check_output import click from github import Github diff --git a/ci/ray_ci/automation/list_docker_tags.py b/ci/ray_ci/automation/list_docker_tags.py index 1876a18f4aa6..8a95dc84b223 100644 --- a/ci/ray_ci/automation/list_docker_tags.py +++ b/ci/ray_ci/automation/list_docker_tags.py @@ -1,14 +1,15 @@ -import click import sys +import click + from ci.ray_ci.automation.docker_tags_lib import list_image_tags from ci.ray_ci.docker_container import ( + ARCHITECTURES_RAY, + ARCHITECTURES_RAY_ML, PLATFORMS_RAY, PLATFORMS_RAY_ML, PYTHON_VERSIONS_RAY, PYTHON_VERSIONS_RAY_ML, - ARCHITECTURES_RAY, - ARCHITECTURES_RAY_ML, RayType, ) diff --git a/ci/ray_ci/automation/pypi_lib.py b/ci/ray_ci/automation/pypi_lib.py index 31aeea2aae66..df60d8759928 100644 --- a/ci/ray_ci/automation/pypi_lib.py +++ b/ci/ray_ci/automation/pypi_lib.py @@ -1,7 +1,7 @@ -import subprocess import os -from typing import List +import subprocess import sys +from typing 
import List from ray_release.aws import get_secret_token diff --git a/ci/ray_ci/automation/ray_wheels_lib.py b/ci/ray_ci/automation/ray_wheels_lib.py index 2e16002a105f..aa46ac4f7769 100644 --- a/ci/ray_ci/automation/ray_wheels_lib.py +++ b/ci/ray_ci/automation/ray_wheels_lib.py @@ -1,6 +1,7 @@ -import boto3 -from typing import List import os +from typing import List + +import boto3 from ci.ray_ci.utils import logger diff --git a/ci/ray_ci/automation/test_db_bot.py b/ci/ray_ci/automation/test_db_bot.py index 186d706a739d..5d562309ac63 100644 --- a/ci/ray_ci/automation/test_db_bot.py +++ b/ci/ray_ci/automation/test_db_bot.py @@ -1,10 +1,10 @@ import os import click +from ray_release.configs.global_config import get_global_config -from ci.ray_ci.utils import logger, ci_init from ci.ray_ci.tester_container import TesterContainer -from ray_release.configs.global_config import get_global_config +from ci.ray_ci.utils import ci_init, logger @click.command() diff --git a/ci/ray_ci/automation/test_determine_microcheck_tests.py b/ci/ray_ci/automation/test_determine_microcheck_tests.py index 71f055591902..75083482ca3a 100644 --- a/ci/ray_ci/automation/test_determine_microcheck_tests.py +++ b/ci/ray_ci/automation/test_determine_microcheck_tests.py @@ -1,19 +1,19 @@ -import sys import json +import sys from typing import List import pytest +from ray_release.result import ResultStatus +from ray_release.test import Test, TestResult from ci.ray_ci.automation.determine_microcheck_tests import ( _get_failed_commits, + _get_failed_tests_from_master_branch, _get_flaky_tests, _get_test_with_minimal_coverage, - _get_failed_tests_from_master_branch, _update_high_impact_tests, ) from ci.ray_ci.utils import ci_init -from ray_release.result import ResultStatus -from ray_release.test import TestResult, Test ci_init() diff --git a/ci/ray_ci/automation/test_docker_tags_lib.py b/ci/ray_ci/automation/test_docker_tags_lib.py index e36b9fba1b18..fea87eca42ce 100644 --- 
a/ci/ray_ci/automation/test_docker_tags_lib.py +++ b/ci/ray_ci/automation/test_docker_tags_lib.py @@ -1,36 +1,37 @@ -from unittest import mock +import platform +import random +import shutil +import subprocess import sys +import tempfile +import threading +import time from datetime import datetime, timezone +from unittest import mock + import pytest import requests -import subprocess -import tempfile import runfiles -import platform -import time -import threading -import shutil -import random from ci.ray_ci.automation.docker_tags_lib import ( + AuthTokenException, + DockerHubRateLimitException, + RetrieveImageConfigException, _get_docker_auth_token, _get_docker_hub_auth_token, _get_image_creation_time, + _is_release_tag, + _list_recent_commit_short_shas, backup_release_tags, + call_crane_copy, + check_image_ray_commit, copy_tag_to_aws_ecr, delete_tag, - _list_recent_commit_short_shas, + generate_index, + get_ray_commit, + list_image_tags, query_tags_from_docker_hub, query_tags_from_docker_with_oci, - _is_release_tag, - list_image_tags, - get_ray_commit, - check_image_ray_commit, - generate_index, - AuthTokenException, - RetrieveImageConfigException, - DockerHubRateLimitException, - call_crane_copy, ) diff --git a/ci/ray_ci/automation/test_pypi_lib.py b/ci/ray_ci/automation/test_pypi_lib.py index 9cb23f27c3b4..17e787369024 100644 --- a/ci/ray_ci/automation/test_pypi_lib.py +++ b/ci/ray_ci/automation/test_pypi_lib.py @@ -1,14 +1,15 @@ -import pytest -from unittest import mock -import tempfile import os -import sys import subprocess +import sys +import tempfile +from unittest import mock + +import pytest from ci.ray_ci.automation.pypi_lib import ( - upload_wheels_to_pypi, - _get_pypi_url, _get_pypi_token, + _get_pypi_url, + upload_wheels_to_pypi, ) diff --git a/ci/ray_ci/automation/test_ray_wheels_lib.py b/ci/ray_ci/automation/test_ray_wheels_lib.py index aef36be327ae..0b8d46bcdf8d 100644 --- a/ci/ray_ci/automation/test_ray_wheels_lib.py +++ 
b/ci/ray_ci/automation/test_ray_wheels_lib.py @@ -1,20 +1,21 @@ -from unittest import mock +import os import sys import tempfile -import os -from botocore.exceptions import ClientError +from unittest import mock + import pytest +from botocore.exceptions import ClientError from ci.ray_ci.automation.ray_wheels_lib import ( - _get_wheel_names, - download_wheel_from_s3, - download_ray_wheels_from_s3, - _check_downloaded_wheels, - PYTHON_VERSIONS, ALL_PLATFORMS, + PYTHON_VERSIONS, RAY_TYPES, - add_build_tag_to_wheels, + _check_downloaded_wheels, + _get_wheel_names, add_build_tag_to_wheel, + add_build_tag_to_wheels, + download_ray_wheels_from_s3, + download_wheel_from_s3, ) SAMPLE_WHEELS = [ diff --git a/ci/ray_ci/automation/test_update_version_lib.py b/ci/ray_ci/automation/test_update_version_lib.py index cd43d3142fde..ca51369fd084 100644 --- a/ci/ray_ci/automation/test_update_version_lib.py +++ b/ci/ray_ci/automation/test_update_version_lib.py @@ -1,13 +1,13 @@ -from unittest import mock +import os import sys import tempfile -import os +from unittest import mock import pytest from ci.ray_ci.automation.update_version_lib import ( - list_java_files, get_current_version, + list_java_files, update_file_version, ) diff --git a/ci/ray_ci/automation/update_version.py b/ci/ray_ci/automation/update_version.py index eec6aed47bfe..221e49ca59e2 100644 --- a/ci/ray_ci/automation/update_version.py +++ b/ci/ray_ci/automation/update_version.py @@ -1,7 +1,8 @@ -import click import os from typing import Optional +import click + from ci.ray_ci.automation.update_version_lib import ( get_current_version, update_file_version, diff --git a/ci/ray_ci/automation/upload_wheels_pypi.py b/ci/ray_ci/automation/upload_wheels_pypi.py index 784f57a52453..48859cacb7d9 100644 --- a/ci/ray_ci/automation/upload_wheels_pypi.py +++ b/ci/ray_ci/automation/upload_wheels_pypi.py @@ -1,11 +1,13 @@ -import click import tempfile from typing import Optional + +import click + +from ci.ray_ci.automation.pypi_lib 
import upload_wheels_to_pypi from ci.ray_ci.automation.ray_wheels_lib import ( - download_ray_wheels_from_s3, add_build_tag_to_wheels, + download_ray_wheels_from_s3, ) -from ci.ray_ci.automation.pypi_lib import upload_wheels_to_pypi @click.command() diff --git a/ci/ray_ci/automation/weekly_green_metric.py b/ci/ray_ci/automation/weekly_green_metric.py index ab66bd893662..230a14691dea 100644 --- a/ci/ray_ci/automation/weekly_green_metric.py +++ b/ci/ray_ci/automation/weekly_green_metric.py @@ -1,14 +1,13 @@ import json -import time import sys +import time import boto3 import click - -from ci.ray_ci.utils import logger, ci_init from ray_release.test_automation.state_machine import TestStateMachine from ray_release.util import get_write_state_machine_aws_bucket +from ci.ray_ci.utils import ci_init, logger AWS_WEEKLY_GREEN_METRIC = "ray_weekly_green_metric" diff --git a/ci/ray_ci/bazel_sharding.py b/ci/ray_ci/bazel_sharding.py index 93be178fd5a9..d40683c45f6d 100644 --- a/ci/ray_ci/bazel_sharding.py +++ b/ci/ray_ci/bazel_sharding.py @@ -16,9 +16,6 @@ # BASED ON https://github.com/philwo/bazel-utils/blob/main/sharding/sharding.py -from collections import defaultdict -from dataclasses import dataclass -from typing import Iterable, List, Optional, Set, Tuple import argparse import os import re @@ -26,6 +23,9 @@ import subprocess import sys import xml.etree.ElementTree as ET +from collections import defaultdict +from dataclasses import dataclass +from typing import Iterable, List, Optional, Set, Tuple @dataclass diff --git a/ci/ray_ci/bisect/bisect_test.py b/ci/ray_ci/bisect/bisect_test.py index 6defa6f250b3..2fa8466572dd 100644 --- a/ci/ray_ci/bisect/bisect_test.py +++ b/ci/ray_ci/bisect/bisect_test.py @@ -1,17 +1,17 @@ -import click import json import os -from ci.ray_ci.utils import logger, ci_init -from ci.ray_ci.bisect.macos_validator import MacOSValidator -from ci.ray_ci.bisect.generic_validator import GenericValidator -from ci.ray_ci.bisect.bisector import Bisector 
+import click from ray_release.test import ( Test, TestType, ) from ray_release.test_automation.ci_state_machine import CITestStateMachine +from ci.ray_ci.bisect.bisector import Bisector +from ci.ray_ci.bisect.generic_validator import GenericValidator +from ci.ray_ci.bisect.macos_validator import MacOSValidator +from ci.ray_ci.utils import ci_init, logger # This is the directory where the ray repository is mounted in the container RAYCI_CHECKOUT_DIR_MOUNT = "/ray" diff --git a/ci/ray_ci/bisect/bisector.py b/ci/ray_ci/bisect/bisector.py index 822c391ce5cd..095f48d09876 100644 --- a/ci/ray_ci/bisect/bisector.py +++ b/ci/ray_ci/bisect/bisector.py @@ -1,10 +1,11 @@ import subprocess from typing import List, Optional -from ci.ray_ci.utils import logger -from ci.ray_ci.bisect.validator import Validator from ray_release.test import Test +from ci.ray_ci.bisect.validator import Validator +from ci.ray_ci.utils import logger + class Bisector: def __init__( diff --git a/ci/ray_ci/bisect/generic_validator.py b/ci/ray_ci/bisect/generic_validator.py index 636ed9246dda..3142a0eed463 100644 --- a/ci/ray_ci/bisect/generic_validator.py +++ b/ci/ray_ci/bisect/generic_validator.py @@ -1,12 +1,12 @@ import time from pybuildkite.buildkite import Buildkite +from ray_release.aws import get_secret_token +from ray_release.configs.global_config import get_global_config +from ray_release.test import Test from ci.ray_ci.bisect.validator import Validator from ci.ray_ci.utils import logger -from ray_release.test import Test -from ray_release.aws import get_secret_token -from ray_release.configs.global_config import get_global_config BUILDKITE_ORGANIZATION = "ray-project" BUILDKITE_POSTMERGE_PIPELINE = "postmerge" diff --git a/ci/ray_ci/bisect/macos_validator.py b/ci/ray_ci/bisect/macos_validator.py index 2112b9db0704..e1f7beb4e71f 100644 --- a/ci/ray_ci/bisect/macos_validator.py +++ b/ci/ray_ci/bisect/macos_validator.py @@ -1,10 +1,10 @@ import os import subprocess -from 
ci.ray_ci.bisect.validator import Validator from ray_release.bazel import bazel_runfile from ray_release.test import Test +from ci.ray_ci.bisect.validator import Validator TEST_SCRIPT = "ci/ray_ci/bisect/macos_validator.sh" diff --git a/ci/ray_ci/bisect/test_bisector.py b/ci/ray_ci/bisect/test_bisector.py index 0928be55e2dd..975cda7e18ce 100644 --- a/ci/ray_ci/bisect/test_bisector.py +++ b/ci/ray_ci/bisect/test_bisector.py @@ -1,11 +1,12 @@ import sys -import pytest from unittest import mock +import pytest +from ray_release.test import Test + from ci.ray_ci.bisect.bisector import Bisector -from ci.ray_ci.bisect.validator import Validator from ci.ray_ci.bisect.macos_validator import MacOSValidator -from ray_release.test import Test +from ci.ray_ci.bisect.validator import Validator class MockValidator(Validator): diff --git a/ci/ray_ci/bisect/test_generic_validator.py b/ci/ray_ci/bisect/test_generic_validator.py index e5314a6d68d3..3c3d4ea857c7 100644 --- a/ci/ray_ci/bisect/test_generic_validator.py +++ b/ci/ray_ci/bisect/test_generic_validator.py @@ -1,11 +1,11 @@ -import time import sys -import pytest +import time from unittest import mock +import pytest +from ray_release.test import Test from ci.ray_ci.bisect.generic_validator import WAIT, GenericValidator -from ray_release.test import Test START = time.time() diff --git a/ci/ray_ci/builder.py b/ci/ray_ci/builder.py index 3f3ba27f8a96..7e74a4906b98 100644 --- a/ci/ray_ci/builder.py +++ b/ci/ray_ci/builder.py @@ -2,19 +2,19 @@ import click +from ci.ray_ci.anyscale_docker_container import AnyscaleDockerContainer from ci.ray_ci.builder_container import ( + ARCHITECTURE, + BUILD_TYPES, DEFAULT_PYTHON_VERSION, PYTHON_VERSIONS, - BUILD_TYPES, - ARCHITECTURE, BuilderContainer, ) -from ci.ray_ci.windows_builder_container import WindowsBuilderContainer +from ci.ray_ci.container import _DOCKER_ECR_REPO from ci.ray_ci.docker_container import PLATFORMS_RAY from ci.ray_ci.ray_docker_container import RayDockerContainer -from 
ci.ray_ci.anyscale_docker_container import AnyscaleDockerContainer -from ci.ray_ci.container import _DOCKER_ECR_REPO -from ci.ray_ci.utils import logger, docker_login, ci_init +from ci.ray_ci.utils import ci_init, docker_login, logger +from ci.ray_ci.windows_builder_container import WindowsBuilderContainer @click.command() @@ -172,7 +172,7 @@ def build_anyscale( for p in platform: RayDockerContainer( python_version, p, image_type, architecture, canonical_tag, upload=False - ).run() + ).run(base="base-extra-testdeps") AnyscaleDockerContainer( python_version, p, image_type, architecture, canonical_tag, upload ).run() diff --git a/ci/ray_ci/builder_container.py b/ci/ray_ci/builder_container.py index 84e5c78ed634..f3601d47b779 100644 --- a/ci/ray_ci/builder_container.py +++ b/ci/ray_ci/builder_container.py @@ -62,6 +62,7 @@ def run(self) -> None: f"./ci/build/build-manylinux-wheel.sh {self.bin_path}", "chown -R 2000:100 /artifact-mount", ] + if self.upload: cmds += ["./ci/build/copy_build_artifacts.sh wheel"] self.run_script(cmds) diff --git a/ci/ray_ci/container.py b/ci/ray_ci/container.py index 44bda3117273..d8ec6a37a6bb 100644 --- a/ci/ray_ci/container.py +++ b/ci/ray_ci/container.py @@ -1,11 +1,9 @@ import abc import os +import re import subprocess import sys -import re - -from typing import List, Tuple, Optional - +from typing import List, Optional, Tuple # Regex pattern to match CUDA copyright header with any version _CUDA_COPYRIGHT_PATTERN = r"""========== diff --git a/ci/ray_ci/doc/api.py b/ci/ray_ci/doc/api.py index 265bad77b2a0..f570b15be0e2 100644 --- a/ci/ray_ci/doc/api.py +++ b/ci/ray_ci/doc/api.py @@ -1,11 +1,9 @@ -import re import importlib import inspect - -from enum import Enum +import re from dataclasses import dataclass -from typing import Optional, List, Tuple, Set, Dict - +from enum import Enum +from typing import Dict, List, Optional, Set, Tuple _SPHINX_AUTOSUMMARY_HEADER = ".. autosummary::" _SPHINX_AUTOCLASS_HEADER = ".. 
autoclass::" diff --git a/ci/ray_ci/doc/autodoc.py b/ci/ray_ci/doc/autodoc.py index 9d2f18b8dd78..2f875488d5e8 100644 --- a/ci/ray_ci/doc/autodoc.py +++ b/ci/ray_ci/doc/autodoc.py @@ -3,12 +3,11 @@ from typing import List, Set from ci.ray_ci.doc.api import ( - API, - _SPHINX_AUTOSUMMARY_HEADER, _SPHINX_AUTOCLASS_HEADER, + _SPHINX_AUTOSUMMARY_HEADER, + API, ) - _SPHINX_CURRENTMODULE_HEADER = ".. currentmodule::" _SPHINX_TOCTREE_HEADER = ".. toctree::" _SPHINX_INCLUDE_HEADER = ".. include::" diff --git a/ci/ray_ci/doc/build_cache.py b/ci/ray_ci/doc/build_cache.py index 4301dbc1204a..9a45d98496ae 100644 --- a/ci/ray_ci/doc/build_cache.py +++ b/ci/ray_ci/doc/build_cache.py @@ -1,14 +1,13 @@ -import tempfile -import subprocess import os import pickle +import subprocess +import tempfile from typing import Set import boto3 - -from ci.ray_ci.utils import logger from ray_release.util import get_write_state_machine_aws_bucket +from ci.ray_ci.utils import logger AWS_CACHE_KEY = "doc_build" ENVIRONMENT_PICKLE = "_build/doctrees/environment.pickle" diff --git a/ci/ray_ci/doc/cmd_build.py b/ci/ray_ci/doc/cmd_build.py index fd89bdf32854..8740980910d1 100644 --- a/ci/ray_ci/doc/cmd_build.py +++ b/ci/ray_ci/doc/cmd_build.py @@ -1,12 +1,11 @@ -import subprocess import os +import subprocess import click +from ray_release.configs.global_config import get_global_config -from ci.ray_ci.utils import logger, ci_init from ci.ray_ci.doc.build_cache import BuildCache - -from ray_release.configs.global_config import get_global_config +from ci.ray_ci.utils import ci_init, logger @click.command() diff --git a/ci/ray_ci/doc/cmd_check_api_discrepancy.py b/ci/ray_ci/doc/cmd_check_api_discrepancy.py index ffdbce7792b0..0112a1a3e121 100644 --- a/ci/ray_ci/doc/cmd_check_api_discrepancy.py +++ b/ci/ray_ci/doc/cmd_check_api_discrepancy.py @@ -1,8 +1,8 @@ import click -from ci.ray_ci.doc.module import Module -from ci.ray_ci.doc.autodoc import Autodoc from ci.ray_ci.doc.api import API +from 
ci.ray_ci.doc.autodoc import Autodoc +from ci.ray_ci.doc.module import Module from ci.ray_ci.utils import logger TEAM_API_CONFIGS = { diff --git a/ci/ray_ci/doc/mock/__init__.py b/ci/ray_ci/doc/mock/__init__.py index 8491bdf4eb10..8692093685ca 100644 --- a/ci/ray_ci/doc/mock/__init__.py +++ b/ci/ray_ci/doc/mock/__init__.py @@ -1,5 +1,4 @@ -from ci.ray_ci.doc.mock.mock_module import MockClass -from ci.ray_ci.doc.mock.mock_module import mock_function +from ci.ray_ci.doc.mock.mock_module import MockClass, mock_function # classes and functions __all__ = [ diff --git a/ci/ray_ci/doc/test_api.py b/ci/ray_ci/doc/test_api.py index 490d517ffac3..d95417987ba5 100644 --- a/ci/ray_ci/doc/test_api.py +++ b/ci/ray_ci/doc/test_api.py @@ -1,12 +1,13 @@ import sys + import pytest from ci.ray_ci.doc.api import ( + _SPHINX_AUTOCLASS_HEADER, + _SPHINX_AUTOSUMMARY_HEADER, API, AnnotationType, CodeType, - _SPHINX_AUTOCLASS_HEADER, - _SPHINX_AUTOSUMMARY_HEADER, ) from ci.ray_ci.doc.mock.mock_module import mock_function diff --git a/ci/ray_ci/doc/test_autodoc.py b/ci/ray_ci/doc/test_autodoc.py index e340889e8255..cbd7d54eb4f6 100644 --- a/ci/ray_ci/doc/test_autodoc.py +++ b/ci/ray_ci/doc/test_autodoc.py @@ -1,11 +1,12 @@ import os -import tempfile import sys +import tempfile + import pytest +from ci.ray_ci.doc.api import API, AnnotationType, CodeType from ci.ray_ci.doc.autodoc import Autodoc from ci.ray_ci.doc.mock.mock_module import MockClass, mock_function, mock_w00t -from ci.ray_ci.doc.api import API, AnnotationType, CodeType def test_walk(): diff --git a/ci/ray_ci/doc/test_build_cache.py b/ci/ray_ci/doc/test_build_cache.py index 8c45bc97d932..b1070f03b6f2 100644 --- a/ci/ray_ci/doc/test_build_cache.py +++ b/ci/ray_ci/doc/test_build_cache.py @@ -1,10 +1,11 @@ -import sys import os import pickle -import pytest +import sys import tempfile from unittest import mock +import pytest + from ci.ray_ci.doc.build_cache import BuildCache diff --git a/ci/ray_ci/doc/test_module.py 
b/ci/ray_ci/doc/test_module.py index 3407cfce5fc5..ead02afb7157 100644 --- a/ci/ray_ci/doc/test_module.py +++ b/ci/ray_ci/doc/test_module.py @@ -1,8 +1,9 @@ import sys + import pytest -from ci.ray_ci.doc.module import Module from ci.ray_ci.doc.api import AnnotationType, CodeType +from ci.ray_ci.doc.module import Module def test_walk(): diff --git a/ci/ray_ci/doc/test_update_cache_env.py b/ci/ray_ci/doc/test_update_cache_env.py index a7d2592793d2..88ce8c6894da 100644 --- a/ci/ray_ci/doc/test_update_cache_env.py +++ b/ci/ray_ci/doc/test_update_cache_env.py @@ -1,11 +1,13 @@ -import sys import os import pickle +import sys +import tempfile + import pytest from sphinx.project import Project -import tempfile -from ci.ray_ci.doc.cmd_update_cache_env import update_environment_pickle + from ci.ray_ci.doc.build_cache import ENVIRONMENT_PICKLE +from ci.ray_ci.doc.cmd_update_cache_env import update_environment_pickle class FakeBuildEnv: diff --git a/ci/ray_ci/docker_container.py b/ci/ray_ci/docker_container.py index 6e30335b73e1..9f739466d679 100644 --- a/ci/ray_ci/docker_container.py +++ b/ci/ray_ci/docker_container.py @@ -1,11 +1,10 @@ import os -from typing import List from datetime import datetime from enum import Enum +from typing import List -from ci.ray_ci.linux_container import LinuxContainer from ci.ray_ci.builder_container import DEFAULT_ARCHITECTURE, DEFAULT_PYTHON_VERSION - +from ci.ray_ci.linux_container import LinuxContainer PLATFORMS_RAY = [ "cpu", @@ -22,17 +21,21 @@ "cpu", "cu12.1.1-cudnn8", ] +PLATFORMS_RAY_LLM = ["cu12.8.1-cudnn"] GPU_PLATFORM = "cu12.1.1-cudnn8" PYTHON_VERSIONS_RAY = ["3.9", "3.10", "3.11", "3.12"] PYTHON_VERSIONS_RAY_ML = ["3.9", "3.10", "3.11"] +PYTHON_VERSIONS_RAY_LLM = ["3.11"] ARCHITECTURES_RAY = ["x86_64", "aarch64"] ARCHITECTURES_RAY_ML = ["x86_64"] +ARCHITECTURES_RAY_LLM = ["x86_64"] class RayType(str, Enum): RAY = "ray" RAY_ML = "ray-ml" + RAY_LLM = "ray-llm" class DockerContainer(LinuxContainer): @@ -51,13 +54,18 @@ def __init__( 
) -> None: assert "RAYCI_CHECKOUT_DIR" in os.environ, "RAYCI_CHECKOUT_DIR not set" - assert python_version in PYTHON_VERSIONS_RAY - assert platform in PLATFORMS_RAY - assert architecture in ARCHITECTURES_RAY if image_type == RayType.RAY_ML: assert python_version in PYTHON_VERSIONS_RAY_ML assert platform in PLATFORMS_RAY_ML assert architecture in ARCHITECTURES_RAY_ML + elif image_type == RayType.RAY_LLM: + assert python_version in PYTHON_VERSIONS_RAY_LLM + assert platform in PLATFORMS_RAY_LLM + assert architecture in ARCHITECTURES_RAY_LLM + else: + assert python_version in PYTHON_VERSIONS_RAY + assert platform in PLATFORMS_RAY + assert architecture in ARCHITECTURES_RAY rayci_checkout_dir = os.environ["RAYCI_CHECKOUT_DIR"] self.python_version = python_version @@ -83,24 +91,29 @@ def _get_image_version_tags(self, external: bool) -> List[str]: external: If True, return the external image tags. If False, return the internal image tags. """ - branch = os.environ.get("BUILDKITE_BRANCH") + branch = os.environ.get("BUILDKITE_BRANCH", "") sha_tag = os.environ["BUILDKITE_COMMIT"][:6] + rayci_build_id = os.environ["RAYCI_BUILD_ID"] pr = os.environ.get("BUILDKITE_PULL_REQUEST", "false") formatted_date = datetime.now().strftime("%y%m%d") if branch == "master": if external and os.environ.get("RAYCI_SCHEDULE") == "nightly": return [f"nightly.{formatted_date}.{sha_tag}", "nightly"] - return [sha_tag] + return [sha_tag, rayci_build_id] if branch and branch.startswith("releases/"): release_name = branch[len("releases/") :] - return [f"{release_name}.{sha_tag}"] + release_tag = f"{release_name}.{sha_tag}" + if external: + # Avoid saving build ID ones when saving it on public registries. 
+ return [release_tag] + return [release_tag, rayci_build_id] if pr != "false": - return [f"pr-{pr}.{sha_tag}"] + return [f"pr-{pr}.{sha_tag}", rayci_build_id] - return [sha_tag] + return [sha_tag, rayci_build_id] def _get_canonical_tag(self) -> str: # The canonical tag is the first tag in the list of tags. The list of tag is @@ -110,10 +123,10 @@ def _get_canonical_tag(self) -> str: # e.g. sha-pyversion-platform return self.canonical_tag if self.canonical_tag else self._get_image_tags()[0] - def get_python_version_tag(self) -> str: + def _get_python_version_tag(self) -> str: return f"-py{self.python_version.replace('.', '')}" # 3.x -> py3x - def get_platform_tag(self) -> str: + def _get_platform_tag(self) -> str: if self.platform == "cpu": return "-cpu" versions = self.platform.split(".") @@ -134,7 +147,7 @@ def _get_image_tags(self, external: bool = False) -> List[str]: versions = self._get_image_version_tags(external) - platforms = [self.get_platform_tag()] + platforms = [self._get_platform_tag()] if self.platform == "cpu" and self.image_type == RayType.RAY: # no tag is alias to cpu for ray image platforms.append("") @@ -145,7 +158,7 @@ def _get_image_tags(self, external: bool = False) -> List[str]: # no tag is alias to gpu for ray-ml image platforms.append("") - py_versions = [self.get_python_version_tag()] + py_versions = [self._get_python_version_tag()] if self.python_version == DEFAULT_PYTHON_VERSION: py_versions.append("") diff --git a/ci/ray_ci/linux_container.py b/ci/ray_ci/linux_container.py index 1e865269d25c..44c6d1971d2d 100644 --- a/ci/ray_ci/linux_container.py +++ b/ci/ray_ci/linux_container.py @@ -1,7 +1,7 @@ import os import subprocess import sys -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple from ci.ray_ci.container import Container diff --git a/ci/ray_ci/macos/macos_ci_build.sh b/ci/ray_ci/macos/macos_ci_build.sh index 810b9cc540af..242dbcb618ee 100755 --- a/ci/ray_ci/macos/macos_ci_build.sh +++ 
b/ci/ray_ci/macos/macos_ci_build.sh @@ -32,14 +32,13 @@ build() { export JAVA_HOME=/Library/Java/JavaVirtualMachines/temurin-8.jdk/Contents/Home java -version # Build wheels - export UPLOAD_WHEELS_AS_ARTIFACTS=1 export MAC_WHEELS=1 export MAC_JARS=1 export RAY_INSTALL_JAVA=1 export RAY_ENABLE_WINDOWS_OR_OSX_CLUSTER=1 . ./ci/ci.sh init && source ~/.zshenv source ~/.zshrc - ./ci/ci.sh build_wheels_and_jars + ./ci/ci.sh build_macos_wheels_and_jars # Test wheels ./ci/ci.sh test_macos_wheels # Build jars diff --git a/ci/ray_ci/oss_config.yaml b/ci/ray_ci/oss_config.yaml index cf2b64aa1cc8..ffaa50fa7158 100644 --- a/ci/ray_ci/oss_config.yaml +++ b/ci/ray_ci/oss_config.yaml @@ -4,6 +4,7 @@ release_byod: ray_ml_cr_repo: ray-ml ray_llm_cr_repo: ray-llm byod_ecr: 029272617770.dkr.ecr.us-west-2.amazonaws.com + byod_ecr_region: us-west-2 aws_cr: 029272617770.dkr.ecr.us-west-2.amazonaws.com gcp_cr: us-west1-docker.pkg.dev/anyscale-oss-ci aws2gce_credentials: release/aws2gce_iam.json @@ -22,3 +23,7 @@ state_machine: aws_bucket: ray-ci-pr-results branch: aws_bucket: ray-ci-results +release_image_step: + ray: anyscalebuild + ray_ml: anyscalemlbuild + ray_llm: anyscalellmbuild diff --git a/ci/ray_ci/pipeline/gap_filling_scheduler.py b/ci/ray_ci/pipeline/gap_filling_scheduler.py index 4bcf65cd6b38..52b1d507a977 100644 --- a/ci/ray_ci/pipeline/gap_filling_scheduler.py +++ b/ci/ray_ci/pipeline/gap_filling_scheduler.py @@ -1,10 +1,9 @@ import subprocess from datetime import datetime, timedelta -from typing import List, Dict, Optional, Any, Tuple +from typing import Any, Dict, List, Optional, Tuple from pybuildkite.buildkite import Buildkite - BRANCH = "master" BLOCK_STEP_KEY = "unblock-me" diff --git a/ci/ray_ci/pipeline/scheduler.py b/ci/ray_ci/pipeline/scheduler.py index 75b498fc88ab..a23e271817d6 100644 --- a/ci/ray_ci/pipeline/scheduler.py +++ b/ci/ray_ci/pipeline/scheduler.py @@ -1,10 +1,10 @@ import click - -from ci.ray_ci.utils import ci_init, logger -from 
ci.ray_ci.pipeline.gap_filling_scheduler import GapFillingScheduler from ray_release.aws import get_secret_token from ray_release.configs.global_config import get_global_config +from ci.ray_ci.pipeline.gap_filling_scheduler import GapFillingScheduler +from ci.ray_ci.utils import ci_init, logger + @click.command() @click.argument("buildkite_organization", type=str) diff --git a/ci/ray_ci/pipeline/test_gap_filling_scheduler.py b/ci/ray_ci/pipeline/test_gap_filling_scheduler.py index 669899275aec..c4e667ee6679 100644 --- a/ci/ray_ci/pipeline/test_gap_filling_scheduler.py +++ b/ci/ray_ci/pipeline/test_gap_filling_scheduler.py @@ -3,7 +3,7 @@ import pytest -from ci.ray_ci.pipeline.gap_filling_scheduler import GapFillingScheduler, BLOCK_STEP_KEY +from ci.ray_ci.pipeline.gap_filling_scheduler import BLOCK_STEP_KEY, GapFillingScheduler @mock.patch( diff --git a/ci/ray_ci/ray_docker_container.py b/ci/ray_ci/ray_docker_container.py index 71d67b72e502..b3f9e1758bb6 100644 --- a/ci/ray_ci/ray_docker_container.py +++ b/ci/ray_ci/ray_docker_container.py @@ -1,11 +1,12 @@ import os -from typing import List +from typing import List, Optional +from ray_release.configs.global_config import get_global_config + +from ci.ray_ci.builder_container import DEFAULT_ARCHITECTURE, PYTHON_VERSIONS from ci.ray_ci.container import _DOCKER_ECR_REPO from ci.ray_ci.docker_container import DockerContainer -from ci.ray_ci.builder_container import PYTHON_VERSIONS, DEFAULT_ARCHITECTURE -from ci.ray_ci.utils import docker_pull, RAY_VERSION -from ray_release.configs.global_config import get_global_config +from ci.ray_ci.utils import RAY_VERSION, docker_pull class RayDockerContainer(DockerContainer): @@ -13,20 +14,25 @@ class RayDockerContainer(DockerContainer): Container for building and publishing ray docker images """ - def run(self) -> None: + def run(self, base: Optional[str] = None) -> None: """ Build and publish ray docker images """ assert "RAYCI_BUILD_ID" in os.environ, "RAYCI_BUILD_ID not set" 
rayci_build_id = os.environ["RAYCI_BUILD_ID"] + if base is None: + base = "base" + if self.architecture == DEFAULT_ARCHITECTURE: - suffix = "base" + suffix = base else: - suffix = f"base-{self.architecture}" + suffix = f"{base}-{self.architecture}" + + image_repo = self.image_type base_image = ( f"{_DOCKER_ECR_REPO}:{rayci_build_id}" - f"-{self.image_type}-py{self.python_version}-{self.platform}-{suffix}" + f"-{image_repo}-py{self.python_version}-{self.platform}-{suffix}" ) docker_pull(base_image) @@ -37,7 +43,7 @@ def run(self) -> None: ) constraints_file = "requirements_compiled.txt" tag = self._get_canonical_tag() - ray_image = f"rayproject/{self.image_type}:{tag}" + ray_image = f"rayproject/{image_repo}:{tag}" pip_freeze = f"{self.image_type}:{tag}_pip-freeze.txt" cmds = [ diff --git a/ci/ray_ci/test_anyscale_docker_container.py b/ci/ray_ci/test_anyscale_docker_container.py index 7f09036aa366..6d96cab52cb7 100644 --- a/ci/ray_ci/test_anyscale_docker_container.py +++ b/ci/ray_ci/test_anyscale_docker_container.py @@ -1,13 +1,14 @@ -import sys import os +import sys from typing import List from unittest import mock import pytest +from ray_release.configs.global_config import get_global_config from ci.ray_ci.anyscale_docker_container import AnyscaleDockerContainer +from ci.ray_ci.container import _DOCKER_ECR_REPO, _DOCKER_GCP_REGISTRY from ci.ray_ci.test_base import RayCITestBase -from ci.ray_ci.container import _DOCKER_GCP_REGISTRY, _DOCKER_ECR_REPO class TestAnyscaleDockerContainer(RayCITestBase): @@ -32,41 +33,37 @@ def _mock_run_script(input: List[str]) -> None: aws_ecr = _DOCKER_ECR_REPO.split("/")[0] aws_prj = f"{aws_ecr}/anyscale/ray-ml" gcp_prj = f"{_DOCKER_GCP_REGISTRY}/anyscale/ray-ml" - assert cmd == [ - "./ci/build/build-anyscale-docker.sh " - f"rayproject/ray-ml:123456-{pv}-cu121 " - f"{aws_prj}:123456-{pv}-cu121 requirements_ml_byod_{v}.txt {aws_ecr}", - "./release/gcloud_docker_login.sh release/aws2gce_iam.json", - "export 
PATH=$(pwd)/google-cloud-sdk/bin:$PATH", - f"docker tag {aws_prj}:123456-{pv}-cu121 {aws_prj}:123456-{pv}-cu121", - f"docker push {aws_prj}:123456-{pv}-cu121", - f"docker tag {aws_prj}:123456-{pv}-cu121 {gcp_prj}:123456-{pv}-cu121", - f"docker push {gcp_prj}:123456-{pv}-cu121", - f"docker tag {aws_prj}:123456-{pv}-cu121 {aws_prj}:123456-{pv}-gpu", - f"docker push {aws_prj}:123456-{pv}-gpu", - f"docker tag {aws_prj}:123456-{pv}-cu121 {gcp_prj}:123456-{pv}-gpu", - f"docker push {gcp_prj}:123456-{pv}-gpu", - f"docker tag {aws_prj}:123456-{pv}-cu121 {aws_prj}:123456-{pv}", - f"docker push {aws_prj}:123456-{pv}", - f"docker tag {aws_prj}:123456-{pv}-cu121 {gcp_prj}:123456-{pv}", - f"docker push {gcp_prj}:123456-{pv}", - ] - - def test_requirements_file(self) -> None: - container = AnyscaleDockerContainer("3.11", "cu12.1.1-cudnn8", "ray-ml") - assert container._get_requirement_file() == "requirements_ml_byod_3.11.txt" + gce_credentials = get_global_config()["aws2gce_credentials"] - container = AnyscaleDockerContainer("3.9", "cu12.1.1-cudnn8", "ray-ml") - assert container._get_requirement_file() == "requirements_ml_byod_3.9.txt" - - container = AnyscaleDockerContainer("3.11", "cu12.4.1-cudnn", "ray-llm") - assert container._get_requirement_file() == "requirements_llm_byod_3.11.txt" + tags_want = [ + f"123456-{pv}-cu121", + f"123456-{pv}-gpu", + f"123456-{pv}", + f"a1b2c3d4-{pv}-cu121", + f"a1b2c3d4-{pv}-gpu", + f"a1b2c3d4-{pv}", + ] - container = AnyscaleDockerContainer("3.9", "cpu", "ray") - assert container._get_requirement_file() == "requirements_byod_3.9.txt" + push_cmds_want = [] + for tag in tags_want: + push_cmds_want += [ + f"docker tag {aws_prj}:123456-{pv}-cu121 {aws_prj}:{tag}", + f"docker push {aws_prj}:{tag}", + f"docker tag {aws_prj}:123456-{pv}-cu121 {gcp_prj}:{tag}", + f"docker push {gcp_prj}:{tag}", + ] - container = AnyscaleDockerContainer("3.12", "cpu", "ray") - assert container._get_requirement_file() == "requirements_byod_3.12.txt" + assert ( + cmd + 
== [ + "./ci/build/build-anyscale-docker.sh " + f"rayproject/ray-ml:123456-{pv}-cu121 " + f"{aws_prj}:123456-{pv}-cu121 {aws_ecr}", + f"./release/gcloud_docker_login.sh {gce_credentials}", + "export PATH=$(pwd)/google-cloud-sdk/bin:$PATH", + ] + + push_cmds_want + ) if __name__ == "__main__": diff --git a/ci/ray_ci/test_base.py b/ci/ray_ci/test_base.py index e5c5d0b76679..ec237d64e418 100644 --- a/ci/ray_ci/test_base.py +++ b/ci/ray_ci/test_base.py @@ -2,8 +2,8 @@ import unittest from unittest.mock import patch -from ci.ray_ci.builder_container import PYTHON_VERSIONS from ci.ray_ci.builder import DEFAULT_PYTHON_VERSION +from ci.ray_ci.builder_container import PYTHON_VERSIONS from ci.ray_ci.utils import ci_init @@ -14,7 +14,7 @@ def setUp(self) -> None: os.environ, { "RAYCI_CHECKOUT_DIR": "/ray", - "RAYCI_BUILD_ID": "123", + "RAYCI_BUILD_ID": "a1b2c3d4", "RAYCI_WORK_REPO": "rayproject/citemp", "BUILDKITE_COMMIT": "123456", "BUILDKITE_BRANCH": "master", diff --git a/ci/ray_ci/test_bazel_sharding.py b/ci/ray_ci/test_bazel_sharding.py index 927eb59afe58..74d0e320b3c7 100644 --- a/ci/ray_ci/test_bazel_sharding.py +++ b/ci/ray_ci/test_bazel_sharding.py @@ -1,9 +1,10 @@ -from typing import List -import pytest import os import shutil import sys import tempfile +from typing import List + +import pytest # Required for bazel file_parent = os.path.dirname(__file__) diff --git a/ci/ray_ci/test_builder_container.py b/ci/ray_ci/test_builder_container.py index 47abef000116..eb2e6dc23600 100644 --- a/ci/ray_ci/test_builder_container.py +++ b/ci/ray_ci/test_builder_container.py @@ -1,7 +1,8 @@ import sys -import pytest -from unittest import mock from typing import List +from unittest import mock + +import pytest from ci.ray_ci.builder_container import BuilderContainer diff --git a/ci/ray_ci/test_linux_container.py b/ci/ray_ci/test_linux_container.py index e6c7d693b4bc..3e6e32e1bba3 100644 --- a/ci/ray_ci/test_linux_container.py +++ b/ci/ray_ci/test_linux_container.py @@ -1,4 +1,5 @@ 
import sys + import pytest from ci.ray_ci.linux_container import LinuxContainer diff --git a/ci/ray_ci/test_linux_tester_container.py b/ci/ray_ci/test_linux_tester_container.py index 831eb5e7f594..bf2003bff15e 100644 --- a/ci/ray_ci/test_linux_tester_container.py +++ b/ci/ray_ci/test_linux_tester_container.py @@ -2,17 +2,17 @@ import os import platform import sys -import pytest import tempfile -from unittest import mock from typing import List, Optional +from unittest import mock + +import pytest +from ray_release.configs.global_config import get_global_config +from ci.ray_ci.container import _DOCKER_ECR_REPO, _RAYCI_BUILD_ID from ci.ray_ci.linux_tester_container import LinuxTesterContainer from ci.ray_ci.tester_container import RUN_PER_FLAKY_TEST from ci.ray_ci.utils import chunk_into_n, ci_init -from ci.ray_ci.container import _DOCKER_ECR_REPO, _RAYCI_BUILD_ID -from ray_release.configs.global_config import get_global_config - ci_init() diff --git a/ci/ray_ci/test_privileged.py b/ci/ray_ci/test_privileged.py index c1f71dfe7056..e69d6ad78367 100644 --- a/ci/ray_ci/test_privileged.py +++ b/ci/ray_ci/test_privileged.py @@ -1,9 +1,9 @@ import os -import pytest import sys - from pathlib import Path +import pytest + # In privileged containers, we expect the following # cgroupv1 is disabled # cgroupv2 is enabled and mounted on /sys/fs/cgroup diff --git a/ci/ray_ci/test_ray_docker_container.py b/ci/ray_ci/test_ray_docker_container.py index 6d474129f722..767bfbdd228e 100644 --- a/ci/ray_ci/test_ray_docker_container.py +++ b/ci/ray_ci/test_ray_docker_container.py @@ -1,9 +1,11 @@ import os import sys +from datetime import datetime from typing import List from unittest import mock -from datetime import datetime + import pytest +from ray_release.configs.global_config import get_global_config from ci.ray_ci.builder_container import DEFAULT_PYTHON_VERSION from ci.ray_ci.container import _DOCKER_ECR_REPO @@ -11,7 +13,6 @@ from ci.ray_ci.ray_docker_container import 
RayDockerContainer from ci.ray_ci.test_base import RayCITestBase from ci.ray_ci.utils import RAY_VERSION -from ray_release.configs.global_config import get_global_config class TestRayDockerContainer(RayCITestBase): @@ -28,7 +29,7 @@ def _mock_run_script(input: List[str]) -> None: side_effect=_mock_run_script, ): sha = "123456" - ray_ci_build_id = "123" + ray_ci_build_id = "a1b2c3d4" cuda = "cu12.4.1-cudnn" # Run with default python version and ray image @@ -52,6 +53,7 @@ def _mock_run_script(input: List[str]) -> None: v = "3.11" cv = self.get_cpp_version(v) pv = self.get_python_version(v) + cuda = "cu12.8.1-cudnn" container = RayDockerContainer(v, cuda, "ray-llm") container.run() cmd = self.cmds[-1] @@ -60,14 +62,15 @@ def _mock_run_script(input: List[str]) -> None: f"ray-{RAY_VERSION}-{cv}-{cv}-manylinux2014_x86_64.whl " f"{_DOCKER_ECR_REPO}:{ray_ci_build_id}-ray-llm-py{v}-{cuda}-base " "requirements_compiled.txt " - f"rayproject/ray-llm:{sha}-{pv}-cu124 " - f"ray-llm:{sha}-{pv}-cu124_pip-freeze.txt" + f"rayproject/ray-llm:{sha}-{pv}-cu128 " + f"ray-llm:{sha}-{pv}-cu128_pip-freeze.txt" ) # Run with non-default python version and ray-ml image v = self.get_non_default_python() cv = self.get_cpp_version(v) pv = self.get_python_version(v) + cuda = "cu12.4.1-cudnn" container = RayDockerContainer(v, "cpu", "ray-ml") container.run() cmd = self.cmds[-1] @@ -98,7 +101,7 @@ def _mock_run_script(input: List[str]) -> None: ): formatted_date = datetime.now().strftime("%y%m%d") sha = "123456" - ray_ci_build_id = "123" + ray_ci_build_id = "a1b2c3d4" # Run with default python version and ray image self.cmds = [] @@ -131,7 +134,7 @@ def _mock_run_script(input: List[str]) -> None: v = "3.11" cv = self.get_cpp_version(v) pv = self.get_python_version(v) - cuda = "cu12.4.1-cudnn" + cuda = "cu12.8.1-cudnn" container = RayDockerContainer(v, cuda, "ray-llm") container.run() assert len(self.cmds) == 6 @@ -140,8 +143,8 @@ def _mock_run_script(input: List[str]) -> None: 
f"ray-{RAY_VERSION}-{cv}-{cv}-manylinux2014_x86_64.whl " f"{_DOCKER_ECR_REPO}:{ray_ci_build_id}-ray-llm-py{v}-{cuda}-base " "requirements_compiled.txt " - f"rayproject/ray-llm:{sha}-{pv}-cu124 " - f"ray-llm:{sha}-{pv}-cu124_pip-freeze.txt" + f"rayproject/ray-llm:{sha}-{pv}-cu128 " + f"ray-llm:{sha}-{pv}-cu128_pip-freeze.txt" ) assert ( self.cmds[1] @@ -194,7 +197,7 @@ def _mock_run_script(input: List[str]) -> None: os.environ, {"RAYCI_SCHEDULE": "daytime"} ): sha = "123456" - ray_ci_build_id = "123" + ray_ci_build_id = "a1b2c3d4" cuda = "cu11.8.0-cudnn8" # Run with default python version and ray image @@ -216,8 +219,8 @@ def _mock_run_script(input: List[str]) -> None: # Run with specific python version and ray-llm image self.cmds = [] - v = DEFAULT_PYTHON_VERSION - cuda = "cu12.4.1-cudnn" + v = "3.11" + cuda = "cu12.8.1-cudnn" cv = self.get_cpp_version(v) pv = self.get_python_version(v) container = RayDockerContainer(v, cuda, "ray-llm") @@ -228,8 +231,8 @@ def _mock_run_script(input: List[str]) -> None: f"ray-{RAY_VERSION}-{cv}-{cv}-manylinux2014_x86_64.whl " f"{_DOCKER_ECR_REPO}:{ray_ci_build_id}-ray-llm-py{v}-{cuda}-base " "requirements_compiled.txt " - f"rayproject/ray-llm:{sha}-{pv}-cu124 " - f"ray-llm:{sha}-{pv}-cu124_pip-freeze.txt" + f"rayproject/ray-llm:{sha}-{pv}-cu128 " + f"ray-llm:{sha}-{pv}-cu128_pip-freeze.txt" ) # Run with non-default python version and ray-ml image @@ -279,6 +282,7 @@ def test_get_image_tags(self) -> None: # bulk logic of _get_image_tags is tested in its callers (get_image_name and # get_canonical_tag), so we only test the basic cases here sha = "123456" + rayci_build_id = "a1b2c3d4" v = DEFAULT_PYTHON_VERSION pv = self.get_python_version(v) container = RayDockerContainer(v, "cpu", "ray") @@ -289,6 +293,10 @@ def test_get_image_tags(self) -> None: f"{sha}-cpu", f"{sha}-{pv}", f"{sha}", + f"{rayci_build_id}-{pv}-cpu", + f"{rayci_build_id}-cpu", + f"{rayci_build_id}-{pv}", + f"{rayci_build_id}", ] with mock.patch.dict(os.environ, 
{"RAYCI_SCHEDULE": "nightly"}): assert container._get_image_tags(external=True) == [ @@ -304,6 +312,7 @@ def test_get_image_tags(self) -> None: def test_get_image_name(self) -> None: sha = "123456" + rayci_build_id = "a1b2c3d4" v = DEFAULT_PYTHON_VERSION pv = self.get_python_version(v) formatted_date = datetime.now().strftime("%y%m%d") @@ -314,6 +323,10 @@ def test_get_image_name(self) -> None: f"rayproject/ray:{sha}-cpu", f"rayproject/ray:{sha}-{pv}", f"rayproject/ray:{sha}", + f"rayproject/ray:{rayci_build_id}-{pv}-cpu", + f"rayproject/ray:{rayci_build_id}-cpu", + f"rayproject/ray:{rayci_build_id}-{pv}", + f"rayproject/ray:{rayci_build_id}", ] with mock.patch.dict(os.environ, {"RAYCI_SCHEDULE": "nightly"}): @@ -330,16 +343,17 @@ def test_get_image_name(self) -> None: v = "3.11" pv = self.get_python_version(v) - container = RayDockerContainer(v, "cu12.4.1-cudnn", "ray-llm") + container = RayDockerContainer(v, "cu12.8.1-cudnn", "ray-llm") with mock.patch.dict(os.environ, {"RAYCI_SCHEDULE": "daytime"}): assert container._get_image_names() == [ - f"rayproject/ray-llm:{sha}-{pv}-cu124", + f"rayproject/ray-llm:{sha}-{pv}-cu128", + f"rayproject/ray-llm:{rayci_build_id}-{pv}-cu128", ] with mock.patch.dict(os.environ, {"RAYCI_SCHEDULE": "nightly"}): assert container._get_image_names() == [ - f"rayproject/ray-llm:nightly.{formatted_date}.{sha}-{pv}-cu124", - f"rayproject/ray-llm:nightly-{pv}-cu124", + f"rayproject/ray-llm:nightly.{formatted_date}.{sha}-{pv}-cu128", + f"rayproject/ray-llm:nightly-{pv}-cu128", ] v = self.get_non_default_python() @@ -350,6 +364,9 @@ def test_get_image_name(self) -> None: f"rayproject/ray-ml:{sha}-{pv}-cu121", f"rayproject/ray-ml:{sha}-{pv}-gpu", f"rayproject/ray-ml:{sha}-{pv}", + f"rayproject/ray-ml:{rayci_build_id}-{pv}-cu121", + f"rayproject/ray-ml:{rayci_build_id}-{pv}-gpu", + f"rayproject/ray-ml:{rayci_build_id}-{pv}", ] with mock.patch.dict(os.environ, {"RAYCI_SCHEDULE": "nightly"}): @@ -380,30 +397,30 @@ def 
test_get_python_version_tag(self) -> None: v = DEFAULT_PYTHON_VERSION pv = self.get_python_version(v) container = RayDockerContainer(v, "cpu", "ray") - assert container.get_python_version_tag() == f"-{pv}" + assert container._get_python_version_tag() == f"-{pv}" def test_get_platform_tag(self) -> None: v = DEFAULT_PYTHON_VERSION container = RayDockerContainer(v, "cpu", "ray") - assert container.get_platform_tag() == "-cpu" + assert container._get_platform_tag() == "-cpu" container = RayDockerContainer(v, "cu11.8.0-cudnn8", "ray") - assert container.get_platform_tag() == "-cu118" + assert container._get_platform_tag() == "-cu118" container = RayDockerContainer(v, "cu12.3.2-cudnn9", "ray") - assert container.get_platform_tag() == "-cu123" + assert container._get_platform_tag() == "-cu123" container = RayDockerContainer(v, "cu12.4.1-cudnn", "ray") - assert container.get_platform_tag() == "-cu124" + assert container._get_platform_tag() == "-cu124" container = RayDockerContainer(v, "cu12.5.1-cudnn", "ray") - assert container.get_platform_tag() == "-cu125" + assert container._get_platform_tag() == "-cu125" container = RayDockerContainer(v, "cu12.6.3-cudnn", "ray") - assert container.get_platform_tag() == "-cu126" + assert container._get_platform_tag() == "-cu126" container = RayDockerContainer(v, "cu12.8.1-cudnn", "ray") - assert container.get_platform_tag() == "-cu128" + assert container._get_platform_tag() == "-cu128" def test_should_upload(self) -> None: v = DEFAULT_PYTHON_VERSION diff --git a/ci/ray_ci/test_tester.py b/ci/ray_ci/test_tester.py index 21861e250dad..4f3e8ce06fb6 100644 --- a/ci/ray_ci/test_tester.py +++ b/ci/ray_ci/test_tester.py @@ -5,19 +5,19 @@ from unittest import mock import pytest +from ray_release.test import Test, TestState from ci.ray_ci.linux_tester_container import LinuxTesterContainer -from ci.ray_ci.windows_tester_container import WindowsTesterContainer from ci.ray_ci.tester import ( _add_default_except_tags, - _get_container, 
_get_all_test_query, - _get_test_targets, - _get_new_tests, + _get_container, _get_flaky_test_targets, + _get_new_tests, _get_tag_matcher, + _get_test_targets, ) -from ray_release.test import Test, TestState +from ci.ray_ci.windows_tester_container import WindowsTesterContainer def _stub_test(val: dict) -> Test: diff --git a/ci/ray_ci/test_utils.py b/ci/ray_ci/test_utils.py index f97566d00d19..cd946229612a 100644 --- a/ci/ray_ci/test_utils.py +++ b/ci/ray_ci/test_utils.py @@ -1,16 +1,17 @@ import base64 import io import sys -import pytest -from unittest import mock from typing import List +from unittest import mock +import pytest from ray_release.test import Test + from ci.ray_ci.utils import ( chunk_into_n, docker_login, - get_flaky_test_names, filter_tests, + get_flaky_test_names, ) diff --git a/ci/ray_ci/test_windows_container.py b/ci/ray_ci/test_windows_container.py index 9ea95d212c23..d7527b97b234 100644 --- a/ci/ray_ci/test_windows_container.py +++ b/ci/ray_ci/test_windows_container.py @@ -1,10 +1,11 @@ import sys -import pytest -from unittest import mock from typing import List +from unittest import mock + +import pytest -from ci.ray_ci.windows_container import WindowsContainer from ci.ray_ci.container import _DOCKER_ENV +from ci.ray_ci.windows_container import WindowsContainer def test_install_ray() -> None: diff --git a/ci/ray_ci/test_windows_tester_container.py b/ci/ray_ci/test_windows_tester_container.py index 48667b9265a5..753ffd373d6c 100644 --- a/ci/ray_ci/test_windows_tester_container.py +++ b/ci/ray_ci/test_windows_tester_container.py @@ -1,5 +1,5 @@ -from unittest import mock from typing import List +from unittest import mock from ci.ray_ci.windows_tester_container import WindowsTesterContainer diff --git a/ci/ray_ci/tester.py b/ci/ray_ci/tester.py index c2b700f34db8..339c6aea5844 100644 --- a/ci/ray_ci/tester.py +++ b/ci/ray_ci/tester.py @@ -1,23 +1,23 @@ import os import sys -from typing import List, Set, Tuple, Optional +from typing import List, 
Optional, Set, Tuple -import yaml import click +import yaml +from ray_release.test import Test, TestState -from ci.ray_ci.container import _DOCKER_ECR_REPO from ci.ray_ci.builder_container import ( - BuilderContainer, + DEFAULT_ARCHITECTURE, DEFAULT_BUILD_TYPE, DEFAULT_PYTHON_VERSION, - DEFAULT_ARCHITECTURE, PYTHON_VERSIONS, + BuilderContainer, ) +from ci.ray_ci.container import _DOCKER_ECR_REPO from ci.ray_ci.linux_tester_container import LinuxTesterContainer -from ci.ray_ci.windows_tester_container import WindowsTesterContainer from ci.ray_ci.tester_container import TesterContainer -from ci.ray_ci.utils import docker_login, ci_init -from ray_release.test import Test, TestState +from ci.ray_ci.utils import ci_init, docker_login +from ci.ray_ci.windows_tester_container import WindowsTesterContainer CUDA_COPYRIGHT = """ ========== diff --git a/ci/ray_ci/tester_container.py b/ci/ray_ci/tester_container.py index 5c0078e323d5..0e91ee58a5d0 100644 --- a/ci/ray_ci/tester_container.py +++ b/ci/ray_ci/tester_container.py @@ -5,16 +5,15 @@ import shutil import string import subprocess -from typing import List, Tuple, Optional -from os import path, listdir +from os import listdir, path +from typing import List, Optional, Tuple -from ci.ray_ci.utils import shard_tests, chunk_into_n -from ci.ray_ci.utils import logger -from ci.ray_ci.container import Container -from ray_release.test import TestResult, Test -from ray_release.test_automation.ci_state_machine import CITestStateMachine from ray_release.configs.global_config import get_global_config +from ray_release.test import Test, TestResult +from ray_release.test_automation.ci_state_machine import CITestStateMachine +from ci.ray_ci.container import Container +from ci.ray_ci.utils import chunk_into_n, logger, shard_tests # We will run each flaky test this number of times per CI job independent of pass/fail. 
RUN_PER_FLAKY_TEST = 1 diff --git a/ci/ray_ci/tests.env.Dockerfile b/ci/ray_ci/tests.env.Dockerfile index 7ae17986e618..fb009afa59d4 100644 --- a/ci/ray_ci/tests.env.Dockerfile +++ b/ci/ray_ci/tests.env.Dockerfile @@ -9,6 +9,7 @@ ARG RAY_INSTALL_MASK= ENV CC=clang ENV CXX=clang++-12 +ENV RAY_DISABLE_EXTRA_CPP=1 RUN mkdir /rayci WORKDIR /rayci diff --git a/ci/ray_ci/utils.py b/ci/ray_ci/utils.py index a9d6159aa8a3..90b510fd5379 100644 --- a/ci/ray_ci/utils.py +++ b/ci/ray_ci/utils.py @@ -5,15 +5,15 @@ import subprocess import sys import tempfile - -import boto3 -from typing import List from math import ceil +from typing import List -import ci.ray_ci.bazel_sharding as bazel_sharding +import boto3 from ray_release.bazel import bazel_runfile -from ray_release.test import Test, TestState from ray_release.configs.global_config import init_global_config +from ray_release.test import Test, TestState + +import ci.ray_ci.bazel_sharding as bazel_sharding GLOBAL_CONFIG_FILE = ( os.environ.get("RAYCI_GLOBAL_CONFIG") or "ci/ray_ci/oss_config.yaml" diff --git a/ci/ray_ci/windows/build_ray.sh b/ci/ray_ci/windows/build_ray.sh index 0966becbf3d1..2731118742ce 100644 --- a/ci/ray_ci/windows/build_ray.sh +++ b/ci/ray_ci/windows/build_ray.sh @@ -11,7 +11,8 @@ cd /c/rayci { echo "build --announce_rc"; echo "build --config=ci"; - echo "startup --output_user_root=c:/raytmp"; + # Set a shorter output_base to avoid long file paths that Windows can't handle. 
+ echo "startup --output_base=c:/bzl"; echo "build --remote_cache=${BUILDKITE_BAZEL_CACHE_URL}"; } >> ~/.bazelrc diff --git a/ci/ray_ci/windows/tests.env.Dockerfile b/ci/ray_ci/windows/tests.env.Dockerfile index 0e0cd9eea4ab..cce117b1fe7e 100644 --- a/ci/ray_ci/windows/tests.env.Dockerfile +++ b/ci/ray_ci/windows/tests.env.Dockerfile @@ -12,6 +12,7 @@ ENV PYTHON=3.9 ENV RAY_USE_RANDOM_PORTS=1 ENV RAY_DEFAULT_BUILD=1 ENV RAY_INSTALL_JAVA=0 +ENV RAY_DISABLE_EXTRA_CPP=1 ENV RAY_ENABLE_WINDOWS_OR_OSX_CLUSTER=1 ENV LC_ALL=en_US.UTF-8 ENV LANG=en_US.UTF-8 diff --git a/ci/ray_ci/windows_builder_container.py b/ci/ray_ci/windows_builder_container.py index 0c4f0cf214ee..1ccf2c5078c0 100644 --- a/ci/ray_ci/windows_builder_container.py +++ b/ci/ray_ci/windows_builder_container.py @@ -1,6 +1,6 @@ import os -from ci.ray_ci.windows_container import WindowsContainer, WORKDIR +from ci.ray_ci.windows_container import WORKDIR, WindowsContainer class WindowsBuilderContainer(WindowsContainer): diff --git a/ci/ray_ci/windows_container.py b/ci/ray_ci/windows_container.py index 838c6491b05c..0e9f4b79e0e1 100644 --- a/ci/ray_ci/windows_container.py +++ b/ci/ray_ci/windows_container.py @@ -1,11 +1,10 @@ import os import subprocess import sys -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple from ci.ray_ci.container import Container - WORKDIR = "C:\\rayci" diff --git a/ci/ray_ci/windows_tester_container.py b/ci/ray_ci/windows_tester_container.py index 37ea14f645e6..750f4da112dc 100644 --- a/ci/ray_ci/windows_tester_container.py +++ b/ci/ray_ci/windows_tester_container.py @@ -1,7 +1,7 @@ from typing import List, Optional -from ci.ray_ci.windows_container import WindowsContainer from ci.ray_ci.tester_container import TesterContainer +from ci.ray_ci.windows_container import WindowsContainer class WindowsTesterContainer(TesterContainer, WindowsContainer): diff --git a/ci/raydepsets/BUILD.bazel b/ci/raydepsets/BUILD.bazel index d69d976b5d1f..9802526e950a 100644 
--- a/ci/raydepsets/BUILD.bazel +++ b/ci/raydepsets/BUILD.bazel @@ -14,7 +14,7 @@ py_library( srcs = [ "cli.py", ], - data = ["@uv_x86_64//:file"], + data = ["//:uv_file"], deps = [ ":workspace", ci_require("bazel-runfiles"), @@ -33,14 +33,16 @@ py_binary( py_test( name = "test_cli", - srcs = ["test_cli.py"], + srcs = ["tests/test_cli.py"], data = [ - "test_data/requirement_constraints_test.txt", - "test_data/requirements_compiled_test.txt", - "test_data/requirements_compiled_test_expand.txt", - "test_data/requirements_compiled_test_update.txt", - "test_data/requirements_test.txt", - "test_data/test.depsets.yaml", + "tests/test_data/pre-hook-error-test.sh", + "tests/test_data/pre-hook-test.sh", + "tests/test_data/requirement_constraints_test.txt", + "tests/test_data/requirements_compiled_test.txt", + "tests/test_data/requirements_compiled_test_expand.txt", + "tests/test_data/requirements_compiled_test_update.txt", + "tests/test_data/requirements_test.txt", + "tests/test_data/test.depsets.yaml", ], exec_compatible_with = ["//:hermetic_python"], tags = [ @@ -50,15 +52,32 @@ py_test( deps = [ ci_require("pytest"), ":raydepsets_lib", - ":testing_utils", + ":utils", ], ) py_library( - name = "testing_utils", + name = "utils", testonly = True, - srcs = ["testing_utils.py"], + srcs = ["tests/utils.py"], deps = [ ci_require("bazel-runfiles"), ], ) + +py_test( + name = "test_workspace", + srcs = ["tests/test_workspace.py"], + data = [ + "tests/test_data/test.depsets.yaml", + ], + tags = [ + "ci_unit", + "team:ci", + ], + deps = [ + ci_require("pytest"), + ":utils", + ":workspace", + ], +) diff --git a/ci/raydepsets/cli.py b/ci/raydepsets/cli.py index 41f4e6382685..6e73b7b5b13e 100644 --- a/ci/raydepsets/cli.py +++ b/ci/raydepsets/cli.py @@ -1,25 +1,28 @@ -import click -from pathlib import Path -from ci.raydepsets.workspace import Workspace, Depset -from typing import List -import subprocess +import difflib import platform +import shutil +import subprocess +import sys 
+import tempfile +from pathlib import Path +from typing import List, Optional + +import click import runfiles -from typing import Optional -from networkx import DiGraph, topological_sort +from networkx import DiGraph, ancestors as networkx_ancestors, topological_sort + +from ci.raydepsets.workspace import Depset, Workspace DEFAULT_UV_FLAGS = """ --generate-hashes --strip-extras - --no-strip-markers - --emit-index-url - --emit-find-links - --unsafe-package ray - --unsafe-package grpcio-tools --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-best-match + --no-strip-markers + --emit-index-url + --emit-find-links --quiet """.split() @@ -31,25 +34,51 @@ def cli(): @cli.command() @click.argument("config_path", default="ci/raydepsets/ray.depsets.yaml") -@click.option("--workspace-dir", default=None) -@click.option("--name", default=None) -@click.option("--uv-cache-dir", default=None) -def load( +@click.option( + "--workspace-dir", + default=None, + help="The path to the workspace directory. If not specified, $BUILD_WORKSPACE_DIRECTORY will be used.", +) +@click.option( + "--name", + default=None, + help="The name of the dependency set to load. If not specified, all dependency sets will be loaded.", +) +@click.option( + "--uv-cache-dir", default=None, help="The directory to cache uv dependencies" +) +@click.option( + "--check", + is_flag=True, + help="Check the the compiled dependencies are valid. Only compatible with generating all dependency sets.", +) +def build( config_path: str, workspace_dir: Optional[str], name: Optional[str], uv_cache_dir: Optional[str], + check: Optional[bool], ): - """Load a dependency sets from a config file.""" + """ + Build dependency sets from a config file. + Args: + config_path: The path to the config file. If not specified, ci/raydepsets/ray.depsets.yaml will be used. 
+ """ manager = DependencySetManager( config_path=config_path, workspace_dir=workspace_dir, uv_cache_dir=uv_cache_dir, + check=check, ) - if name: - manager.execute_single(manager.get_depset(name)) - else: - manager.execute() + manager.execute(name) + if check: + try: + manager.diff_lock_files() + except RuntimeError as e: + click.echo(e, err=True) + sys.exit(1) + finally: + manager.cleanup() class DependencySetManager: @@ -58,54 +87,131 @@ def __init__( config_path: str = None, workspace_dir: Optional[str] = None, uv_cache_dir: Optional[str] = None, + check: Optional[bool] = False, ): self.workspace = Workspace(workspace_dir) self.config = self.workspace.load_config(config_path) + if check: + self.temp_dir = tempfile.mkdtemp() + self.output_paths = self.get_output_paths() + self.copy_to_temp_dir() self.build_graph = DiGraph() self._build() self._uv_binary = _uv_binary() self._uv_cache_dir = uv_cache_dir + def get_output_paths(self) -> List[Path]: + output_paths = [] + for depset in self.config.depsets: + output_paths.append(Path(depset.output)) + return output_paths + + def copy_to_temp_dir(self): + """Copy the lock files from source file paths to temp dir.""" + for output_path in self.output_paths: + source_fp, target_fp = self.get_source_and_dest(output_path) + target_fp.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2( + source_fp, + target_fp, + ) + + def get_diffs(self) -> List[str]: + diffs = [] + for output_path in self.output_paths: + new_lock_file_fp, old_lock_file_fp = self.get_source_and_dest(output_path) + old_lock_file_contents = self.read_lock_file(old_lock_file_fp) + new_lock_file_contents = self.read_lock_file(new_lock_file_fp) + for diff in difflib.unified_diff( + old_lock_file_contents, + new_lock_file_contents, + fromfile=new_lock_file_fp.as_posix(), + tofile=old_lock_file_fp.as_posix(), + lineterm="", + ): + diffs.append(diff) + return diffs + + def diff_lock_files(self): + diffs = self.get_diffs() + if len(diffs) > 0: + raise 
RuntimeError( + "Lock files are not up to date. Please update lock files and push the changes.\n" + + "".join(diffs) + ) + click.echo("Lock files are up to date.") + + def get_source_and_dest(self, output_path: str) -> tuple[Path, Path]: + return (self.get_path(output_path), (Path(self.temp_dir) / output_path)) + def _build(self): for depset in self.config.depsets: if depset.operation == "compile": self.build_graph.add_node( - depset.name, operation="compile", depset=depset + depset.name, operation="compile", depset=depset, node_type="depset" ) elif depset.operation == "subset": self.build_graph.add_node( - depset.name, operation="subset", depset=depset + depset.name, operation="subset", depset=depset, node_type="depset" ) self.build_graph.add_edge(depset.source_depset, depset.name) elif depset.operation == "expand": self.build_graph.add_node( - depset.name, operation="expand", depset=depset + depset.name, operation="expand", depset=depset, node_type="depset" ) for depset_name in depset.depsets: self.build_graph.add_edge(depset_name, depset.name) else: raise ValueError(f"Invalid operation: {depset.operation}") + if depset.pre_hooks: + for ind, hook in enumerate(depset.pre_hooks): + hook_name = f"{depset.name}_pre_hook_{ind+1}" + self.build_graph.add_node( + hook_name, + operation="pre_hook", + pre_hook=hook, + node_type="pre_hook", + ) + self.build_graph.add_edge(hook_name, depset.name) - def execute(self): - for node in topological_sort(self.build_graph): - depset = self.build_graph.nodes[node]["depset"] - self.execute_single(depset) + def subgraph_dependency_nodes(self, depset_name: str): + dependency_nodes = networkx_ancestors(self.build_graph, depset_name) + nodes = dependency_nodes | {depset_name} + self.build_graph = self.build_graph.subgraph(nodes).copy() - def get_depset(self, name: str) -> Depset: - for depset in self.config.depsets: - if depset.name == name: - return depset - raise KeyError(f"Dependency set {name} not found") + def execute(self, 
single_depset_name: Optional[str] = None): + if single_depset_name: + # check if the depset exists + _get_depset(self.config.depsets, single_depset_name) + self.subgraph_dependency_nodes(single_depset_name) - def exec_uv_cmd(self, cmd: str, args: List[str]) -> str: + for node in topological_sort(self.build_graph): + node_type = self.build_graph.nodes[node]["node_type"] + if node_type == "pre_hook": + pre_hook = self.build_graph.nodes[node]["pre_hook"] + self.execute_pre_hook(pre_hook) + elif node_type == "depset": + depset = self.build_graph.nodes[node]["depset"] + self.execute_depset(depset) + + def exec_uv_cmd( + self, cmd: str, args: List[str], stdin: Optional[bytes] = None + ) -> str: cmd = [self._uv_binary, "pip", cmd, *args] click.echo(f"Executing command: {cmd}") - status = subprocess.run(cmd, cwd=self.workspace.dir) + status = subprocess.run(cmd, cwd=self.workspace.dir, input=stdin) if status.returncode != 0: raise RuntimeError(f"Failed to execute command: {cmd}") return status.stdout - def execute_single(self, depset: Depset): + def execute_pre_hook(self, pre_hook: str): + status_code = subprocess.call(pre_hook, cwd=self.workspace.dir) + if status_code != 0: + raise RuntimeError(f"Failed to execute pre-hook: {pre_hook}") + click.echo(f"Executed pre-hook: {pre_hook}") + return status_code + + def execute_depset(self, depset: Depset): if depset.operation == "compile": self.compile( constraints=depset.constraints, @@ -114,6 +220,7 @@ def execute_single(self, depset: Depset): output=depset.output, append_flags=depset.append_flags, override_flags=depset.override_flags, + packages=depset.packages, ) elif depset.operation == "subset": self.subset( @@ -139,29 +246,35 @@ def execute_single(self, depset: Depset): def compile( self, constraints: List[str], - requirements: List[str], name: str, output: str, append_flags: Optional[List[str]] = None, override_flags: Optional[List[str]] = None, + packages: Optional[List[str]] = None, + requirements: Optional[List[str]] = 
None, ): """Compile a dependency set.""" args = DEFAULT_UV_FLAGS.copy() + stdin = None if self._uv_cache_dir: args.extend(["--cache-dir", self._uv_cache_dir]) if override_flags: args = _override_uv_flags(override_flags, args) if append_flags: - args = _append_uv_flags(append_flags, args) + args.extend(_flatten_flags(append_flags)) if constraints: for constraint in constraints: - args.extend(["-c", self.get_path(constraint)]) + args.extend(["-c", constraint]) if requirements: for requirement in requirements: - args.extend([self.get_path(requirement)]) + args.extend([requirement]) + if packages: + # need to add a dash to process stdin + args.append("-") + stdin = _get_bytes(packages) if output: - args.extend(["-o", self.get_path(output)]) - self.exec_uv_cmd("compile", args) + args.extend(["-o", output]) + self.exec_uv_cmd("compile", args, stdin) def subset( self, @@ -173,7 +286,7 @@ def subset( override_flags: Optional[List[str]] = None, ): """Subset a dependency set.""" - source_depset = self.get_depset(source_depset) + source_depset = _get_depset(self.config.depsets, source_depset) self.check_subset_exists(source_depset, requirements) self.compile( constraints=[source_depset.output], @@ -198,7 +311,7 @@ def expand( # handle both depsets and requirements depset_req_list = [] for depset_name in depsets: - depset = self.get_depset(depset_name) + depset = _get_depset(self.config.depsets, depset_name) depset_req_list.extend(depset.requirements) if requirements: depset_req_list.extend(requirements) @@ -211,8 +324,14 @@ def expand( override_flags=override_flags, ) - def get_path(self, path: str) -> str: - return (Path(self.workspace.dir) / path).as_posix() + def read_lock_file(self, file_path: Path) -> List[str]: + if not file_path.exists(): + raise RuntimeError(f"Lock file {file_path} does not exist") + with open(file_path, "r") as f: + return f.readlines() + + def get_path(self, path: str) -> Path: + return Path(self.workspace.dir) / path def check_subset_exists(self, 
source_depset: Depset, requirements: List[str]): for req in requirements: @@ -221,6 +340,21 @@ def check_subset_exists(self, source_depset: Depset, requirements: List[str]): f"Requirement {req} is not a subset of {source_depset.name}" ) + def cleanup(self): + if self.temp_dir: + shutil.rmtree(self.temp_dir) + + +def _get_bytes(packages: List[str]) -> bytes: + return ("\n".join(packages) + "\n").encode("utf-8") + + +def _get_depset(depsets: List[Depset], name: str) -> Depset: + for depset in depsets: + if depset.name == name: + return depset + raise KeyError(f"Dependency set {name} not found") + def _flatten_flags(flags: List[str]) -> List[str]: """ @@ -250,16 +384,14 @@ def _override_uv_flags(flags: List[str], args: List[str]) -> List[str]: return new_args + _flatten_flags(flags) -def _append_uv_flags(flags: List[str], args: List[str]) -> List[str]: - args.extend(flags) - return args - - def _uv_binary(): r = runfiles.Create() system = platform.system() - if system != "Linux" or platform.processor() != "x86_64": - raise RuntimeError( - f"Unsupported platform/processor: {system}/{platform.processor()}" - ) - return r.Rlocation("uv_x86_64/uv-x86_64-unknown-linux-gnu/uv") + processor = platform.processor() + + if system == "Linux" and processor == "x86_64": + return r.Rlocation("uv_x86_64-linux/uv-x86_64-unknown-linux-gnu/uv") + elif system == "Darwin" and (processor == "arm" or processor == "aarch64"): + return r.Rlocation("uv_aarch64-darwin/uv-aarch64-apple-darwin/uv") + else: + raise RuntimeError(f"Unsupported platform/processor: {system}/{processor}") diff --git a/ci/raydepsets/pre_hooks/remove-compiled-headers.sh b/ci/raydepsets/pre_hooks/remove-compiled-headers.sh new file mode 100755 index 000000000000..109563fd2be2 --- /dev/null +++ b/ci/raydepsets/pre_hooks/remove-compiled-headers.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -euo pipefail + +mkdir -p /tmp/ray-deps + +# Remove the GPU constraints +cp python/requirements_compiled.txt 
/tmp/ray-deps/requirements_compiled.txt +sed -e '/^--extra-index-url /d' -e '/^--find-links /d' /tmp/ray-deps/requirements_compiled.txt > /tmp/ray-deps/requirements_compiled.txt.tmp +mv /tmp/ray-deps/requirements_compiled.txt.tmp /tmp/ray-deps/requirements_compiled.txt diff --git a/ci/raydepsets/ray.depsets.yaml b/ci/raydepsets/ray.depsets.yaml deleted file mode 100644 index a6b5c0a33a12..000000000000 --- a/ci/raydepsets/ray.depsets.yaml +++ /dev/null @@ -1,27 +0,0 @@ -depsets: - - name: subset_general_depset - operation: subset - source_depset: general_depset - requirements: - - python/requirements/cloud-requirements.txt - output: ci/raydepsets/test/requirements_compiled_subset_general_py311_cpu.txt - - name: ray_base_test_depset - requirements: - - python/requirements.txt - - python/requirements/cloud-requirements.txt - - python/requirements/base-test-requirements.txt - constraints: - - python/requirements_compiled_ray_test_py311_cpu.txt - output: ci/raydepsets/test/requirements_compiled_ray_test_py311_cpu.txt - operation: compile - - name: general_depset - operation: compile - requirements: - - python/requirements.txt - output: python/test/requirements_compiled_general_py311_cpu.txt - - name: subset_general_depset - operation: subset - source_depset: general_depset - requirements: - - ci/raydepsets/cloud-requirements.txt - output: python/test/requirements_compiled_subset_general_py311_cpu.txt diff --git a/ci/raydepsets/rayimg.depsets.yaml b/ci/raydepsets/rayimg.depsets.yaml new file mode 100644 index 000000000000..88b0a3d341de --- /dev/null +++ b/ci/raydepsets/rayimg.depsets.yaml @@ -0,0 +1,31 @@ +build_arg_sets: + py39: + PYTHON_VERSION: "3.9" + PYTHON_SHORT: "39" + py310: + PYTHON_VERSION: "3.10" + PYTHON_SHORT: "310" + py311: + PYTHON_VERSION: "3.11" + PYTHON_SHORT: "311" + py312: + PYTHON_VERSION: "3.12" + PYTHON_SHORT: "312" + + +depsets: + - name: ray_img_depset_${PYTHON_SHORT} + packages: + - ray[all]==100.0.0-dev + constraints: + - 
python/requirements_compiled.txt + output: python/deplocks/ray_img/ray_img_py${PYTHON_SHORT}.lock + operation: compile + append_flags: + - --python-version=${PYTHON_VERSION} + - --find-links=.whl/ + build_arg_sets: + - py39 + - py310 + - py311 + - py312 diff --git a/ci/raydepsets/rayllm.depsets.yaml b/ci/raydepsets/rayllm.depsets.yaml new file mode 100644 index 000000000000..ecb4fea950cf --- /dev/null +++ b/ci/raydepsets/rayllm.depsets.yaml @@ -0,0 +1,72 @@ +build_arg_sets: + cpu: + PYTHON_VERSION: py311 + CUDA_CODE: cpu + cu121: + PYTHON_VERSION: py311 + CUDA_CODE: cu121 + cu128: + PYTHON_VERSION: py311 + CUDA_CODE: cu128 + + +.common_settings: &common_settings + override_flags: + - --extra-index-url https://download.pytorch.org/whl/${CUDA_CODE} + append_flags: + - --python-version=3.11 + - --unsafe-package ray + - --python-platform=linux + build_arg_sets: + - cpu + - cu121 + - cu128 + +depsets: +# First, extract base test dependencies from the current compiled mono repo one. +# This also expands to the indirect dependencies for this Python version & platform. + - name: ray_base_test_depset_${PYTHON_VERSION}_${CUDA_CODE} + operation: compile + <<: *common_settings + requirements: + - python/requirements.txt + - python/requirements/cloud-requirements.txt + - python/requirements/base-test-requirements.txt + constraints: + - /tmp/ray-deps/requirements_compiled.txt + output: python/deplocks/llm/ray_test_${PYTHON_VERSION}_${CUDA_CODE}.lock + pre_hooks: + - ci/raydepsets/pre_hooks/remove-compiled-headers.sh + +# Second, expand it into LLM test dependencies. 
+ - name: compiled_ray_llm_test_depset_${PYTHON_VERSION}_${CUDA_CODE} + <<: *common_settings + operation: expand + requirements: + - python/requirements.txt + - python/requirements/cloud-requirements.txt + - python/requirements/base-test-requirements.txt + - python/requirements/llm/llm-requirements.txt + - python/requirements/llm/llm-test-requirements.txt + constraints: + - python/deplocks/llm/ray_test_${PYTHON_VERSION}_${CUDA_CODE}.lock + output: python/deplocks/llm/rayllm_test_${PYTHON_VERSION}_${CUDA_CODE}.lock + +# Third, subset the base test dependencies into Ray dependencies. + - name: compiled_ray_depset_${PYTHON_VERSION}_${CUDA_CODE} + <<: *common_settings + operation: subset + source_depset: ray_base_test_depset_${PYTHON_VERSION}_${CUDA_CODE} + requirements: + - python/requirements.txt + output: python/deplocks/llm/ray_${PYTHON_VERSION}_${CUDA_CODE}.lock + +# Fourth, subset the LLM test dependencies into RayLLM dependencies. + - name: compiled_ray_llm_depset_${PYTHON_VERSION}_${CUDA_CODE} + <<: *common_settings + operation: subset + source_depset: compiled_ray_llm_test_depset_${PYTHON_VERSION}_${CUDA_CODE} + requirements: + - python/requirements.txt + - python/requirements/llm/llm-requirements.txt + output: python/deplocks/llm/rayllm_${PYTHON_VERSION}_${CUDA_CODE}.lock diff --git a/ci/raydepsets/test_data/test.depsets.yaml b/ci/raydepsets/test_data/test.depsets.yaml deleted file mode 100644 index 5ebd0485746e..000000000000 --- a/ci/raydepsets/test_data/test.depsets.yaml +++ /dev/null @@ -1,42 +0,0 @@ -build_arg_sets: - - name: py311_cpu - build_args: - CUDA_VERSION: cpu - PYTHON_VERSION: py311 - - name: py311_cuda128 - build_args: - CUDA_VERSION: 128 - PYTHON_VERSION: py311 - -depsets: - - name: ray_base_test_depset - operation: compile - requirements: - - requirements_test.txt - constraints: - - requirement_constraints_test.txt - output: requirements_compiled.txt - - name: general_depset - operation: compile - requirements: - - requirements_test.txt - 
output: requirements_compiled_general.txt - - name: subset_general_depset - operation: subset - source_depset: general_depset - requirements: - - requirement_constraints_subset.txt - output: requirements_compiled_subset_general.txt - - name: expanded_depset - operation: compile - requirements: - - requirements_expanded.txt - output: requirements_compiled_expanded.txt - - name: expand_general_depset - operation: expand - depsets: - - general_depset - - expanded_depset - constraints: - - requirement_constraints_expand.txt - output: requirements_compiled_expand_general.txt diff --git a/ci/raydepsets/test_cli.py b/ci/raydepsets/tests/test_cli.py similarity index 50% rename from ci/raydepsets/test_cli.py rename to ci/raydepsets/tests/test_cli.py index b7cb3af62b8d..d44ce195711f 100644 --- a/ci/raydepsets/test_cli.py +++ b/ci/raydepsets/tests/test_cli.py @@ -1,41 +1,41 @@ -import pytest -import sys -from typing import Optional -from pathlib import Path import subprocess -import shutil +import sys import tempfile import unittest +from pathlib import Path +from typing import Optional +import pytest import runfiles +from click.testing import CliRunner from networkx import topological_sort from ci.raydepsets.cli import ( - load, + DEFAULT_UV_FLAGS, DependencySetManager, - _uv_binary, - _override_uv_flags, - _append_uv_flags, _flatten_flags, - Depset, - DEFAULT_UV_FLAGS, + _get_depset, + _override_uv_flags, + _uv_binary, + build, ) -from ci.raydepsets.workspace import Workspace -from click.testing import CliRunner -from ci.raydepsets.testing_utils import ( +from ci.raydepsets.tests.utils import ( + append_to_file, copy_data_to_tmpdir, replace_in_file, - save_packages_to_file, save_file_as, - append_to_file, + save_packages_to_file, +) +from ci.raydepsets.workspace import ( + Depset, ) -_REPO_NAME = "com_github_ray_project_ray" +_REPO_NAME = "io_ray" _runfiles = runfiles.Create() def _create_test_manager( - tmpdir: str, config_path: Optional[str] = None + tmpdir: str, 
config_path: Optional[str] = None, check: bool = False ) -> DependencySetManager: if config_path is None: config_path = "test.depsets.yaml" @@ -44,27 +44,41 @@ def _create_test_manager( config_path=config_path, workspace_dir=tmpdir, uv_cache_dir=uv_cache_dir.as_posix(), + check=check, ) -class TestCli(unittest.TestCase): - def test_workspace_init(self): - with tempfile.TemporaryDirectory() as tmpdir: - workspace = Workspace(tmpdir) - assert workspace.dir is not None +def _overwrite_config_file(tmpdir: str, depset: Depset): + with open(Path(tmpdir) / "test.depsets.yaml", "w") as f: + f.write( + f""" +depsets: + - name: {depset.name} + operation: {depset.operation} + constraints: + - {depset.constraints} + requirements: + - {depset.requirements} + output: {depset.output} + """ + ) + +class TestCli(unittest.TestCase): def test_cli_load_fail_no_config(self): - result = CliRunner().invoke( - load, - [ - "fake_path/test.depsets.yaml", - "--workspace-dir", - "/ci/raydepsets/test_data", - ], - ) - assert result.exit_code == 1 - assert isinstance(result.exception, FileNotFoundError) - assert "No such file or directory" in str(result.exception) + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + result = CliRunner().invoke( + build, + [ + "fake_path/test.depsets.yaml", + "--workspace-dir", + tmpdir, + ], + ) + assert result.exit_code == 1 + assert isinstance(result.exception, FileNotFoundError) + assert "No such file or directory" in str(result.exception) def test_dependency_set_manager_init(self): with tempfile.TemporaryDirectory() as tmpdir: @@ -80,12 +94,12 @@ def test_dependency_set_manager_init(self): ] assert manager.config.depsets[0].output == "requirements_compiled.txt" - def test_dependency_set_manager_get_depset(self): + def test_get_depset(self): with tempfile.TemporaryDirectory() as tmpdir: copy_data_to_tmpdir(tmpdir) manager = _create_test_manager(tmpdir) with self.assertRaises(KeyError): - manager.get_depset("fake_depset") + 
_get_depset(manager.config.depsets, "fake_depset") def test_uv_binary_exists(self): assert _uv_binary() is not None @@ -97,24 +111,16 @@ def test_uv_version(self): stderr=subprocess.PIPE, ) assert result.returncode == 0 - assert "uv 0.7.20" in result.stdout.decode("utf-8") + assert "uv 0.8.17" in result.stdout.decode("utf-8") assert result.stderr.decode("utf-8") == "" def test_compile(self): - compiled_file = Path( - _runfiles.Rlocation( - f"{_REPO_NAME}/ci/raydepsets/test_data/requirements_compiled_test.txt" - ) - ) - output_file = Path( - _runfiles.Rlocation( - f"{_REPO_NAME}/ci/raydepsets/test_data/requirements_compiled.txt" - ) - ) - shutil.copy(compiled_file, output_file) - with tempfile.TemporaryDirectory() as tmpdir: copy_data_to_tmpdir(tmpdir) + save_file_as( + Path(tmpdir) / "requirements_compiled_test.txt", + Path(tmpdir) / "requirements_compiled.txt", + ) manager = _create_test_manager(tmpdir) manager.compile( constraints=["requirement_constraints_test.txt"], @@ -139,7 +145,7 @@ def test_compile_update_package(self): output_file = Path( _runfiles.Rlocation(f"{tmpdir}/requirements_compiled.txt") ) - shutil.copy(compiled_file, output_file) + save_file_as(compiled_file, output_file) manager = _create_test_manager(tmpdir) manager.compile( constraints=["requirement_constraints_test.txt"], @@ -154,13 +160,33 @@ def test_compile_update_package(self): output_text_valid = output_file_valid.read_text() assert output_text == output_text_valid + def test_compile_with_append_and_override_flags(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + manager = _create_test_manager(tmpdir) + manager.compile( + constraints=["requirement_constraints_test.txt"], + requirements=["requirements_test.txt"], + append_flags=["--no-annotate", "--python-version 3.10"], + override_flags=["--extra-index-url https://dummyurl.com"], + name="ray_base_test_depset", + output="requirements_compiled.txt", + ) + output_file = Path(tmpdir) / 
"requirements_compiled.txt" + output_text = output_file.read_text() + assert "--python-version 3.10" in output_text + assert "--extra-index-url https://dummyurl.com" in output_text + assert ( + "--extra-index-url https://download.pytorch.org/whl/cu128" + not in output_text + ) + def test_compile_by_depset_name(self): with tempfile.TemporaryDirectory() as tmpdir: copy_data_to_tmpdir(tmpdir) uv_cache_dir = Path(tmpdir) / "uv_cache" - result = CliRunner().invoke( - load, + build, [ "test.depsets.yaml", "--workspace-dir", @@ -171,7 +197,6 @@ def test_compile_by_depset_name(self): uv_cache_dir.as_posix(), ], ) - output_fp = Path(tmpdir) / "requirements_compiled.txt" assert output_fp.is_file() assert result.exit_code == 0 @@ -195,15 +220,15 @@ def test_subset(self): constraints=["requirement_constraints_test.txt"], requirements=["requirements_test.txt", "requirements_test_subset.txt"], append_flags=["--no-annotate", "--no-header"], - name="general_depset", + name="general_depset__py311_cpu", output="requirements_compiled_general.txt", ) # Subset general_depset with requirements_test.txt (should lock emoji & pyperclip) manager.subset( - source_depset="general_depset", + source_depset="general_depset__py311_cpu", requirements=["requirements_test.txt"], append_flags=["--no-annotate", "--no-header"], - name="subset_general_depset", + name="subset_general_depset__py311_cpu", output="requirements_compiled_subset_general.txt", ) output_file = Path(tmpdir) / "requirements_compiled_subset_general.txt" @@ -226,16 +251,16 @@ def test_subset_does_not_exist(self): constraints=["requirement_constraints_test.txt"], requirements=["requirements_test.txt", "requirements_test_subset.txt"], append_flags=["--no-annotate", "--no-header"], - name="general_depset", + name="general_depset__py311_cpu", output="requirements_compiled_general.txt", ) with self.assertRaises(RuntimeError): manager.subset( - source_depset="general_depset", + source_depset="general_depset__py311_cpu", 
requirements=["requirements_compiled_test.txt"], append_flags=["--no-annotate", "--no-header"], - name="subset_general_depset", + name="subset_general_depset__py311_cpu", output="requirements_compiled_subset_general.txt", ) @@ -244,7 +269,7 @@ def test_check_if_subset_exists(self): copy_data_to_tmpdir(tmpdir) manager = _create_test_manager(tmpdir) source_depset = Depset( - name="general_depset", + name="general_depset__py311_cpu", operation="compile", requirements=["requirements_1.txt", "requirements_2.txt"], constraints=["requirement_constraints_1.txt"], @@ -276,13 +301,38 @@ def test_get_path(self): manager = _create_test_manager(tmpdir) assert ( manager.get_path("requirements_test.txt") - == f"{tmpdir}/requirements_test.txt" + == Path(tmpdir) / "requirements_test.txt" ) - def test_append_uv_flags(self): - assert _append_uv_flags( - ["--no-annotate", "--no-header"], DEFAULT_UV_FLAGS.copy() - ) == DEFAULT_UV_FLAGS.copy() + ["--no-annotate", "--no-header"] + def test_append_uv_flags_exist_in_output(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + manager = _create_test_manager(tmpdir) + manager.compile( + constraints=[], + requirements=["requirements_test.txt"], + name="general_depset", + output="requirements_compiled_general.txt", + append_flags=["--python-version=3.10"], + ) + output_file = Path(tmpdir) / "requirements_compiled_general.txt" + output_text = output_file.read_text() + assert "--python-version=3.10" in output_text + + def test_append_uv_flags_with_space_in_flag(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + manager = _create_test_manager(tmpdir) + manager.compile( + constraints=[], + requirements=["requirements_test.txt"], + name="general_depset", + output="requirements_compiled_general.txt", + append_flags=["--python-version 3.10"], + ) + output_file = Path(tmpdir) / "requirements_compiled_general.txt" + output_text = output_file.read_text() + assert "--python-version 
3.10" in output_text def test_override_uv_flag_single_flag(self): expected_flags = DEFAULT_UV_FLAGS.copy() @@ -302,10 +352,6 @@ def test_override_uv_flag_single_flag(self): def test_override_uv_flag_multiple_flags(self): expected_flags = DEFAULT_UV_FLAGS.copy() expected_flags.remove("--unsafe-package") - expected_flags.remove("ray") - expected_flags.remove("--unsafe-package") - expected_flags.remove("grpcio-tools") - expected_flags.remove("--unsafe-package") expected_flags.remove("setuptools") expected_flags.extend(["--unsafe-package", "dummy"]) assert ( @@ -339,37 +385,62 @@ def test_build_graph(self): copy_data_to_tmpdir(tmpdir) manager = _create_test_manager(tmpdir) assert manager.build_graph is not None - assert len(manager.build_graph.nodes()) == 5 - assert len(manager.build_graph.edges()) == 3 - assert manager.build_graph.nodes["general_depset"]["operation"] == "compile" + assert len(manager.build_graph.nodes()) == 8 + assert len(manager.build_graph.edges()) == 4 + # assert that the compile depsets are first + assert ( + manager.build_graph.nodes["general_depset__py311_cpu"]["operation"] + == "compile" + ) assert ( manager.build_graph.nodes["subset_general_depset"]["operation"] == "subset" ) assert ( - manager.build_graph.nodes["expand_general_depset"]["operation"] + manager.build_graph.nodes["expand_general_depset__py311_cpu"][ + "operation" + ] == "expand" ) - sorted_nodes = list(topological_sort(manager.build_graph)) - assert sorted_nodes[0] == "ray_base_test_depset" - assert sorted_nodes[1] == "general_depset" - assert sorted_nodes[2] == "expanded_depset" + # assert that the root nodes are the compile depsets + assert "ray_base_test_depset" in sorted_nodes[:3] + assert "general_depset__py311_cpu" in sorted_nodes[:3] + assert "build_args_test_depset__py311_cpu" in sorted_nodes[:3] + + def test_build_graph_predecessors(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + manager = _create_test_manager(tmpdir) + assert 
manager.build_graph is not None + assert ( + manager.build_graph.nodes["general_depset__py311_cpu"]["operation"] + == "compile" + ) + assert ( + manager.build_graph.nodes["expanded_depset__py311_cpu"]["operation"] + == "compile" + ) + assert ( + manager.build_graph.nodes["expand_general_depset__py311_cpu"][ + "operation" + ] + == "expand" + ) + assert set( + manager.build_graph.predecessors("expand_general_depset__py311_cpu") + ) == {"general_depset__py311_cpu", "expanded_depset__py311_cpu"} def test_build_graph_bad_operation(self): with tempfile.TemporaryDirectory() as tmpdir: copy_data_to_tmpdir(tmpdir) - with open(Path(tmpdir) / "test.depsets.yaml", "w") as f: - f.write( - """ -depsets: - - name: invalid_op_depset - operation: invalid_op - requirements: - - requirements_test.txt - output: requirements_compiled_invalid_op.txt - """ - ) + depset = Depset( + name="invalid_op_depset", + operation="invalid_op", + requirements=["requirements_test.txt"], + output="requirements_compiled_invalid_op.txt", + ) + _overwrite_config_file(tmpdir, depset) with self.assertRaises(ValueError): _create_test_manager(tmpdir) @@ -377,6 +448,24 @@ def test_execute(self): with tempfile.TemporaryDirectory() as tmpdir: copy_data_to_tmpdir(tmpdir) + def test_execute_single_depset(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + manager = _create_test_manager(tmpdir) + manager.execute(single_depset_name="general_depset__py311_cpu") + assert ( + manager.build_graph.nodes["general_depset__py311_cpu"]["operation"] + == "compile" + ) + assert len(manager.build_graph.nodes()) == 1 + + def test_execute_single_depset_that_does_not_exist(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + manager = _create_test_manager(tmpdir) + with self.assertRaises(KeyError): + manager.execute(single_depset_name="fake_depset") + def test_expand(self): with tempfile.TemporaryDirectory() as tmpdir: copy_data_to_tmpdir(tmpdir) @@ -397,22 
+486,22 @@ def test_expand(self): constraints=["requirement_constraints_test.txt"], requirements=["requirements_test.txt"], append_flags=["--no-annotate", "--no-header"], - name="general_depset", + name="general_depset__py311_cpu", output="requirements_compiled_general.txt", ) manager.compile( constraints=[], requirements=["requirements_expanded.txt"], append_flags=["--no-annotate", "--no-header"], - name="expanded_depset", + name="expanded_depset__py311_cpu", output="requirements_compiled_expanded.txt", ) manager.expand( - depsets=["general_depset", "expanded_depset"], + depsets=["general_depset__py311_cpu", "expanded_depset__py311_cpu"], constraints=["requirement_constraints_expand.txt"], append_flags=["--no-annotate", "--no-header"], requirements=[], - name="expand_general_depset", + name="expand_general_depset__py311_cpu", output="requirements_compiled_expand_general.txt", ) output_file = Path(tmpdir) / "requirements_compiled_expand_general.txt" @@ -441,15 +530,15 @@ def test_expand_with_requirements(self): constraints=["requirement_constraints_test.txt"], requirements=["requirements_test.txt"], append_flags=["--no-annotate", "--no-header"], - name="general_depset", + name="general_depset__py311_cpu", output="requirements_compiled_general.txt", ) manager.expand( - depsets=["general_depset"], + depsets=["general_depset__py311_cpu"], requirements=["requirements_expanded.txt"], constraints=["requirement_constraints_expand.txt"], append_flags=["--no-annotate", "--no-header"], - name="expand_general_depset", + name="expand_general_depset__py311_cpu", output="requirements_compiled_expand_general.txt", ) output_file = Path(tmpdir) / "requirements_compiled_expand_general.txt" @@ -458,21 +547,181 @@ def test_expand_with_requirements(self): output_text_valid = output_file_valid.read_text() assert output_text == output_text_valid - def test_parse_build_arg_sets(self): - with tempfile.TemporaryDirectory() as tmpdir: - copy_data_to_tmpdir(tmpdir) - workspace = 
Workspace(dir=tmpdir) - config = workspace.load_config(path=Path(tmpdir) / "test.depsets.yaml") - assert config.build_arg_sets[0].name == "py311_cpu" - assert config.build_arg_sets[0].build_args == { - "CUDA_VERSION": "cpu", - "PYTHON_VERSION": "py311", - } - assert config.build_arg_sets[1].name == "py311_cuda128" - assert config.build_arg_sets[1].build_args == { - "CUDA_VERSION": 128, - "PYTHON_VERSION": "py311", - } + def test_get_depset_with_build_arg_set(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + manager = DependencySetManager( + config_path="test.depsets.yaml", + workspace_dir=tmpdir, + ) + depset = _get_depset( + manager.config.depsets, "build_args_test_depset__py311_cpu" + ) + assert depset.name == "build_args_test_depset__py311_cpu" + + def test_get_depset_without_build_arg_set(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + manager = DependencySetManager( + config_path="test.depsets.yaml", + workspace_dir=tmpdir, + ) + depset = _get_depset(manager.config.depsets, "ray_base_test_depset") + assert depset.name == "ray_base_test_depset" + + def test_get_depset_with_build_arg_set_and_no_build_arg_set_provided(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + manager = DependencySetManager( + config_path="test.depsets.yaml", + workspace_dir=tmpdir, + ) + with self.assertRaises(KeyError): + _get_depset(manager.config.depsets, "build_args_test_depset_py311") + + def test_execute_single_pre_hook(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + manager = _create_test_manager(tmpdir) + manager.execute_pre_hook("pre-hook-test.sh") + + def test_execute_single_invalid_pre_hook(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + manager = _create_test_manager(tmpdir) + with self.assertRaises(RuntimeError): + manager.execute_pre_hook("pre-hook-error-test.sh") + + def 
test_execute_pre_hooks_failure_in_middle(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + manager = _create_test_manager(tmpdir) + with self.assertRaises(RuntimeError): + manager.execute_pre_hook("pre-hook-test.sh") + manager.execute_pre_hook("pre-hook-error-test.sh") + manager.execute_pre_hook("pre-hook-test.sh") + + def test_copy_lock_files_to_temp_dir(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + depset = Depset( + name="check_depset", + operation="compile", + constraints=["requirement_constraints_test.txt"], + requirements=["requirements_test.txt"], + output="requirements_compiled_test.txt", + ) + _overwrite_config_file(tmpdir, depset) + manager = _create_test_manager(tmpdir, check=True) + manager.compile( + constraints=["requirement_constraints_test.txt"], + requirements=["requirements_test.txt"], + append_flags=["--no-annotate", "--no-header"], + name="check_depset", + output="requirements_compiled_test.txt", + ) + assert ( + Path(manager.workspace.dir) / "requirements_compiled_test.txt" + ).exists() + assert (Path(manager.temp_dir) / "requirements_compiled_test.txt").exists() + + def test_diff_lock_files_out_of_date(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + depset = Depset( + name="check_depset", + operation="compile", + constraints=["requirement_constraints_test.txt"], + requirements=["requirements_test.txt"], + output="requirements_compiled_test.txt", + ) + _overwrite_config_file(tmpdir, depset) + manager = _create_test_manager(tmpdir, check=True) + manager.compile( + constraints=["requirement_constraints_test.txt"], + requirements=["requirements_test.txt"], + append_flags=["--no-annotate", "--no-header"], + name="check_depset", + output="requirements_compiled_test.txt", + ) + replace_in_file( + Path(manager.workspace.dir) / "requirements_compiled_test.txt", + "emoji==2.9.0", + "emoji==2.8.0", + ) + + with 
self.assertRaises(RuntimeError) as e: + manager.diff_lock_files() + assert ( + "Lock files are not up to date. Please update lock files and push the changes." + in str(e.exception) + ) + assert "+emoji==2.8.0" in str(e.exception) + assert "-emoji==2.9.0" in str(e.exception) + + def test_diff_lock_files_up_to_date(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + depset = Depset( + name="check_depset", + operation="compile", + constraints=["requirement_constraints_test.txt"], + requirements=["requirements_test.txt"], + output="requirements_compiled_test.txt", + ) + _overwrite_config_file(tmpdir, depset) + manager = _create_test_manager(tmpdir, check=True) + manager.compile( + constraints=["requirement_constraints_test.txt"], + requirements=["requirements_test.txt"], + append_flags=["--no-annotate", "--no-header"], + name="check_depset", + output="requirements_compiled_test.txt", + ) + manager.diff_lock_files() + + def test_compile_with_packages(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + save_file_as( + Path(tmpdir) / "requirements_compiled_test.txt", + Path(tmpdir) / "requirements_compiled_test_packages.txt", + ) + manager = _create_test_manager(tmpdir) + manager.compile( + constraints=["requirement_constraints_test.txt"], + packages=["emoji==2.9.0", "pyperclip==1.6.0"], + append_flags=["--no-annotate", "--no-header"], + name="packages_test_depset", + output="requirements_compiled_test_packages.txt", + ) + output_file = Path(tmpdir) / "requirements_compiled_test_packages.txt" + output_text = output_file.read_text() + output_file_valid = Path(tmpdir) / "requirements_compiled_test.txt" + output_text_valid = output_file_valid.read_text() + assert output_text == output_text_valid + + def test_compile_with_packages_and_requirements(self): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + save_file_as( + Path(tmpdir) / "requirements_compiled_test.txt", + 
Path(tmpdir) / "requirements_compiled_test_packages.txt", + ) + manager = _create_test_manager(tmpdir) + manager.compile( + constraints=["requirement_constraints_test.txt"], + packages=["emoji==2.9.0", "pyperclip==1.6.0"], + requirements=["requirements_test.txt"], + append_flags=["--no-annotate", "--no-header"], + name="packages_test_depset", + output="requirements_compiled_test_packages.txt", + ) + output_file = Path(tmpdir) / "requirements_compiled_test_packages.txt" + output_text = output_file.read_text() + output_file_valid = Path(tmpdir) / "requirements_compiled_test.txt" + output_text_valid = output_file_valid.read_text() + assert output_text == output_text_valid if __name__ == "__main__": diff --git a/ci/raydepsets/tests/test_data/pre-hook-error-test.sh b/ci/raydepsets/tests/test_data/pre-hook-error-test.sh new file mode 100755 index 000000000000..4196354f3deb --- /dev/null +++ b/ci/raydepsets/tests/test_data/pre-hook-error-test.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +set -euo pipefail + +echo "Pre-hook test error" + +exit 1 diff --git a/ci/raydepsets/tests/test_data/pre-hook-test.sh b/ci/raydepsets/tests/test_data/pre-hook-test.sh new file mode 100755 index 000000000000..bd86a37a2e34 --- /dev/null +++ b/ci/raydepsets/tests/test_data/pre-hook-test.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +set -euo pipefail + +echo "Pre-hook test" diff --git a/ci/raydepsets/test_data/requirement_constraints_test.txt b/ci/raydepsets/tests/test_data/requirement_constraints_test.txt similarity index 100% rename from ci/raydepsets/test_data/requirement_constraints_test.txt rename to ci/raydepsets/tests/test_data/requirement_constraints_test.txt diff --git a/ci/raydepsets/test_data/requirements_compiled_test.txt b/ci/raydepsets/tests/test_data/requirements_compiled_test.txt similarity index 100% rename from ci/raydepsets/test_data/requirements_compiled_test.txt rename to ci/raydepsets/tests/test_data/requirements_compiled_test.txt diff --git 
a/ci/raydepsets/test_data/requirements_compiled_test_expand.txt b/ci/raydepsets/tests/test_data/requirements_compiled_test_expand.txt similarity index 100% rename from ci/raydepsets/test_data/requirements_compiled_test_expand.txt rename to ci/raydepsets/tests/test_data/requirements_compiled_test_expand.txt diff --git a/ci/raydepsets/test_data/requirements_compiled_test_update.txt b/ci/raydepsets/tests/test_data/requirements_compiled_test_update.txt similarity index 100% rename from ci/raydepsets/test_data/requirements_compiled_test_update.txt rename to ci/raydepsets/tests/test_data/requirements_compiled_test_update.txt diff --git a/ci/raydepsets/test_data/requirements_test.txt b/ci/raydepsets/tests/test_data/requirements_test.txt similarity index 100% rename from ci/raydepsets/test_data/requirements_test.txt rename to ci/raydepsets/tests/test_data/requirements_test.txt diff --git a/ci/raydepsets/tests/test_data/test.depsets.yaml b/ci/raydepsets/tests/test_data/test.depsets.yaml new file mode 100644 index 000000000000..97950a046a68 --- /dev/null +++ b/ci/raydepsets/tests/test_data/test.depsets.yaml @@ -0,0 +1,60 @@ +build_arg_sets: + py311_cpu: + CUDA_VERSION: cpu + PYTHON_VERSION: py311 + py311_cuda128: + CUDA_VERSION: 128 + PYTHON_VERSION: py311 + +depsets: + - name: ray_base_test_depset + operation: compile + requirements: + - requirements_test.txt + constraints: + - requirement_constraints_test.txt + output: requirements_compiled.txt + - name: general_depset__${PYTHON_VERSION}_${CUDA_VERSION} + operation: compile + requirements: + - requirements_test.txt + output: requirements_compiled_general.txt + build_arg_sets: + - py311_cpu + - name: build_args_test_depset__${PYTHON_VERSION}_${CUDA_VERSION} + operation: compile + requirements: + - requirements_test.txt + output: requirements_compiled_general_${PYTHON_VERSION}_${CUDA_VERSION}.txt + build_arg_sets: + - py311_cpu + - name: subset_general_depset + operation: subset + source_depset: general_depset__py311_cpu + 
requirements: + - requirement_constraints_subset.txt + output: requirements_compiled_subset_general.txt + - name: expanded_depset__${PYTHON_VERSION}_${CUDA_VERSION} + operation: compile + requirements: + - requirements_expanded.txt + output: requirements_compiled_expanded.txt + build_arg_sets: + - py311_cpu + - name: expand_general_depset__${PYTHON_VERSION}_${CUDA_VERSION} + operation: expand + depsets: + - general_depset__${PYTHON_VERSION}_${CUDA_VERSION} + - expanded_depset__${PYTHON_VERSION}_${CUDA_VERSION} + constraints: + - requirement_constraints_expand.txt + output: requirements_compiled_expand_general.txt + build_arg_sets: + - py311_cpu + - name: pre_hook_test_depset + operation: compile + requirements: + - requirements_test.txt + output: requirements_compiled_pre_hook.txt + pre_hooks: + - pre-hook-test.sh diff --git a/ci/raydepsets/tests/test_workspace.py b/ci/raydepsets/tests/test_workspace.py new file mode 100644 index 000000000000..a79ce82d4b60 --- /dev/null +++ b/ci/raydepsets/tests/test_workspace.py @@ -0,0 +1,81 @@ +import sys +import tempfile +from pathlib import Path + +import pytest + +from ci.raydepsets.tests.utils import copy_data_to_tmpdir, get_depset_by_name +from ci.raydepsets.workspace import BuildArgSet, Workspace, _substitute_build_args + + +def test_workspace_init(): + with tempfile.TemporaryDirectory() as tmpdir: + workspace = Workspace(tmpdir) + assert workspace.dir is not None + + +def test_parse_build_arg_sets(): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + workspace = Workspace(dir=tmpdir) + config = workspace.load_config(path=Path(tmpdir) / "test.depsets.yaml") + assert config.build_arg_sets["py311_cpu"].build_args == { + "CUDA_VERSION": "cpu", + "PYTHON_VERSION": "py311", + } + assert config.build_arg_sets["py311_cuda128"].build_args == { + "CUDA_VERSION": 128, + "PYTHON_VERSION": "py311", + } + + +def test_substitute_build_args(): + build_arg_set = BuildArgSet( + build_args={ + "PYTHON_VERSION": 
"py311", + "CUDA_VERSION": "cu128", + }, + ) + depset_dict = { + "name": "test_depset_${PYTHON_VERSION}_${CUDA_VERSION}", + "operation": "compile", + "requirements": ["requirements_test.txt"], + "output": "requirements_compiled_test_${PYTHON_VERSION}_${CUDA_VERSION}.txt", + } + substituted_depset = _substitute_build_args(depset_dict, build_arg_set) + assert substituted_depset["output"] == "requirements_compiled_test_py311_cu128.txt" + assert substituted_depset["name"] == "test_depset_py311_cu128" + + +def test_invalid_build_arg_set(): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + with open(Path(tmpdir) / "test.depsets.yaml", "w") as f: + f.write( + """ +depsets: + - name: invalid_build_arg_set + operation: compile + requirements: + - requirements_test.txt + output: requirements_compiled_invalid_build_arg_set.txt + build_arg_sets: + - invalid_build_arg_set + """ + ) + with pytest.raises(KeyError): + workspace = Workspace(dir=tmpdir) + workspace.load_config(path=Path(tmpdir) / "test.depsets.yaml") + + +def test_parse_pre_hooks(): + with tempfile.TemporaryDirectory() as tmpdir: + copy_data_to_tmpdir(tmpdir) + workspace = Workspace(dir=tmpdir) + config = workspace.load_config(path=Path(tmpdir) / "test.depsets.yaml") + pre_hook_depset = get_depset_by_name(config.depsets, "pre_hook_test_depset") + assert pre_hook_depset.pre_hooks == ["pre-hook-test.sh"] + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/ci/raydepsets/testing_utils.py b/ci/raydepsets/tests/utils.py similarity index 80% rename from ci/raydepsets/testing_utils.py rename to ci/raydepsets/tests/utils.py index 13d6d9b373a0..5f531cf7c3c0 100644 --- a/ci/raydepsets/testing_utils.py +++ b/ci/raydepsets/tests/utils.py @@ -1,16 +1,17 @@ """Shared test utilities for raydepsets tests.""" import shutil + import runfiles -_REPO_NAME = "com_github_ray_project_ray" +_REPO_NAME = "io_ray" _runfiles = runfiles.Create() def copy_data_to_tmpdir(tmpdir): 
"""Copy test data to a temporary directory.""" shutil.copytree( - _runfiles.Rlocation(f"{_REPO_NAME}/ci/raydepsets/test_data"), + _runfiles.Rlocation(f"{_REPO_NAME}/ci/raydepsets/tests/test_data"), tmpdir, dirs_exist_ok=True, ) @@ -42,3 +43,9 @@ def save_file_as(input_file, output_file): def append_to_file(filepath, new): with open(filepath, "a") as f: f.write(new + "\n") + + +def get_depset_by_name(depsets, name): + for depset in depsets: + if depset.name == name: + return depset diff --git a/ci/raydepsets/workspace.py b/ci/raydepsets/workspace.py index ed3f8652ca40..cbebc79890f8 100644 --- a/ci/raydepsets/workspace.py +++ b/ci/raydepsets/workspace.py @@ -1,63 +1,93 @@ -import yaml -from dataclasses import dataclass, field -from typing import List, Optional import os +from dataclasses import dataclass, field +from string import Template +from typing import Any, Dict, List, Optional + +import yaml @dataclass class BuildArgSet: - name: str - build_args: List[str] + build_args: Dict[str, str] @dataclass class Depset: name: str operation: str - requirements: List[str] - constraints: List[str] output: str - override_flags: List[str] - append_flags: List[str] + constraints: Optional[List[str]] = None + override_flags: Optional[List[str]] = None + append_flags: Optional[List[str]] = None + requirements: Optional[List[str]] = None + packages: Optional[List[str]] = None source_depset: Optional[str] = None depsets: Optional[List[str]] = None + pre_hooks: Optional[List[str]] = None + + +def _substitute_build_args(obj: Any, build_arg_set: BuildArgSet): + if isinstance(obj, str): + return Template(obj).substitute(build_arg_set.build_args) + elif isinstance(obj, dict): + return { + key: _substitute_build_args(value, build_arg_set) + for key, value in obj.items() + } + elif isinstance(obj, list): + return [_substitute_build_args(item, build_arg_set) for item in obj] + else: + return obj + + +def _dict_to_depset(depset: dict) -> Depset: + return Depset( + name=depset.get("name"), 
+ requirements=depset.get("requirements", []), + constraints=depset.get("constraints", []), + operation=depset.get("operation", None), + output=depset.get("output"), + source_depset=depset.get("source_depset"), + depsets=depset.get("depsets", []), + override_flags=depset.get("override_flags", []), + append_flags=depset.get("append_flags", []), + pre_hooks=depset.get("pre_hooks", []), + packages=depset.get("packages", []), + ) @dataclass class Config: depsets: List[Depset] = field(default_factory=list) - build_arg_sets: List[BuildArgSet] = field(default_factory=list) + build_arg_sets: Dict[str, BuildArgSet] = field(default_factory=dict) - @staticmethod - def from_dict(data: dict) -> "Config": + @classmethod + def from_dict(cls, data: dict) -> "Config": + build_arg_sets = cls.parse_build_arg_sets(data.get("build_arg_sets", {})) raw_depsets = data.get("depsets", []) - depsets = [ - Depset( - name=values.get("name"), - requirements=values.get("requirements", []), - constraints=values.get("constraints", []), - operation=values.get("operation", "compile"), - output=values.get("output"), - source_depset=values.get("source_depset"), - override_flags=values.get("override_flags", []), - append_flags=values.get("append_flags", []), - depsets=values.get("depsets", []), - ) - for values in raw_depsets - ] - - build_arg_sets = Config.parse_build_arg_sets(data.get("build_arg_sets", [])) + depsets = [] + for depset in raw_depsets: + build_arg_set_keys = depset.get("build_arg_sets", []) + if build_arg_set_keys: + # Expand the depset for each build arg set + for build_arg_set_key in build_arg_set_keys: + build_arg_set = build_arg_sets[build_arg_set_key] + if build_arg_set is None: + raise KeyError(f"Build arg set {build_arg_set_key} not found") + depset_yaml = _substitute_build_args(depset, build_arg_set) + depsets.append(_dict_to_depset(depset_yaml)) + else: + depsets.append(_dict_to_depset(depset)) return Config(depsets=depsets, build_arg_sets=build_arg_sets) @staticmethod - def 
parse_build_arg_sets(build_arg_sets: List[dict]) -> List[BuildArgSet]: - return [ - BuildArgSet( - name=build_arg_set.get("name", None), - build_args=build_arg_set.get("build_args", []), + def parse_build_arg_sets(build_arg_sets: Dict[str, dict]) -> Dict[str, BuildArgSet]: + return { + key: BuildArgSet( + build_args=build_arg_set, ) - for build_arg_set in build_arg_sets - ] + for key, build_arg_set in build_arg_sets.items() + } class Workspace: diff --git a/ci/repro-ci.py b/ci/repro-ci.py index 7800e71bc1ec..c5b6537ecbbe 100644 --- a/ci/repro-ci.py +++ b/ci/repro-ci.py @@ -37,7 +37,7 @@ import threading import time from numbers import Number -from typing import Any, Dict, List, Optional, Callable +from typing import Any, Callable, Dict, List, Optional import boto3 import click diff --git a/ci/test_compile_llm_requirements.sh b/ci/test_compile_llm_requirements.sh index 6351add67743..7be0634145b8 100755 --- a/ci/test_compile_llm_requirements.sh +++ b/ci/test_compile_llm_requirements.sh @@ -2,11 +2,6 @@ set -e -# Install uv and set up Python -pip install uv -uv python install 3.11 -uv python pin 3.11 - # Create a temporary directory for backup files and setup cleanup trap TEMP_DIR=$(mktemp -d) cleanup() { @@ -23,16 +18,16 @@ VARIANTS=(cpu cu121 cu128) for LOCK_TYPE in "${LOCK_TYPES[@]}"; do for VARIANT in "${VARIANTS[@]}"; do - cp ./python/requirements_compiled_"${LOCK_TYPE}"_py311_"${VARIANT}".txt "$TEMP_DIR/requirements_compiled_${LOCK_TYPE}_py311_${VARIANT}_backup.txt" + cp ./python/deplocks/llm/"${LOCK_TYPE}"_py311_"${VARIANT}".lock "$TEMP_DIR/${LOCK_TYPE}_py311_${VARIANT}_backup.lock" done done -./ci/compile_llm_requirements.sh +bazel run //ci/raydepsets:raydepsets -- build ci/raydepsets/rayllm.depsets.yaml # Copy files to artifact mount on Buildkite for LOCK_TYPE in "${LOCK_TYPES[@]}"; do for VARIANT in "${VARIANTS[@]}"; do - cp ./python/requirements_compiled_"${LOCK_TYPE}"_py311_"${VARIANT}".txt /artifact-mount/ + cp 
./python/deplocks/llm/"${LOCK_TYPE}"_py311_"${VARIANT}".lock /artifact-mount/ done done @@ -40,8 +35,8 @@ done FAILED=0 for LOCK_TYPE in "${LOCK_TYPES[@]}"; do for VARIANT in "${VARIANTS[@]}"; do - diff --color -u ./python/requirements_compiled_"${LOCK_TYPE}"_py311_"${VARIANT}".txt "$TEMP_DIR/requirements_compiled_${LOCK_TYPE}_py311_${VARIANT}_backup.txt" || { - echo "requirements_compiled_${LOCK_TYPE}_py311_${VARIANT}.txt is not up to date. Please download it from Artifacts tab and git push the changes." + diff -u ./python/deplocks/llm/"${LOCK_TYPE}"_py311_"${VARIANT}".lock "$TEMP_DIR/${LOCK_TYPE}_py311_${VARIANT}_backup.lock" || { + echo "${LOCK_TYPE}_py311_${VARIANT}.lock is not up to date. Please download it from Artifacts tab and git push the changes." FAILED=1 } done diff --git a/cpp/BUILD.bazel b/cpp/BUILD.bazel index 3164cf7cacf5..9174d0683dbd 100644 --- a/cpp/BUILD.bazel +++ b/cpp/BUILD.bazel @@ -66,8 +66,7 @@ cc_library( "//src/ray/common:ray_config", "//src/ray/common:task_common", "//src/ray/core_worker:core_worker_lib", - "//src/ray/gcs/gcs_client:global_state_accessor_lib", - "//src/ray/util", + "//src/ray/gcs_client:global_state_accessor_lib", "//src/ray/util:cmd_line_utils", "//src/ray/util:network_util", "//src/ray/util:process", @@ -141,11 +140,6 @@ pkg_files( name = "example_files", srcs = glob(["example/*"]), prefix = "ray/cpp/example/", - renames = { - "example/_WORKSPACE": "WORKSPACE", - "example/_BUILD.bazel": "BUILD.bazel", - "example/_.bazelrc": ".bazelrc", - }, visibility = ["//visibility:private"], ) diff --git a/cpp/include/ray/api/actor_creator.h b/cpp/include/ray/api/actor_creator.h index 0c59b007355c..973068e36766 100644 --- a/cpp/include/ray/api/actor_creator.h +++ b/cpp/include/ray/api/actor_creator.h @@ -92,14 +92,14 @@ ActorHandle, is_x_lang_v> ActorCreator::Remote(Args &&...a if constexpr (is_x_lang_v) { using ArgsTuple = std::tuple; - Arguments::WrapArgs(remote_function_holder_.lang_type, + 
Arguments::WrapArgs(remote_function_holder_.lang_type_, &args_, std::make_index_sequence{}, std::forward(args)...); } else { StaticCheck(); using ArgsTuple = RemoveReference_t>; - Arguments::WrapArgs(remote_function_holder_.lang_type, + Arguments::WrapArgs(remote_function_holder_.lang_type_, &args_, std::make_index_sequence{}, std::forward(args)...); diff --git a/cpp/include/ray/api/actor_task_caller.h b/cpp/include/ray/api/actor_task_caller.h index 9824234357d8..d0c22fabeb01 100644 --- a/cpp/include/ray/api/actor_task_caller.h +++ b/cpp/include/ray/api/actor_task_caller.h @@ -69,14 +69,14 @@ ObjectRef> ActorTaskCaller::Remote( if constexpr (is_x_lang_v) { using ArgsTuple = std::tuple; - Arguments::WrapArgs(remote_function_holder_.lang_type, + Arguments::WrapArgs(remote_function_holder_.lang_type_, &args_, std::make_index_sequence{}, std::forward(args)...); } else { StaticCheck(); using ArgsTuple = RemoveReference_t>>; - Arguments::WrapArgs(remote_function_holder_.lang_type, + Arguments::WrapArgs(remote_function_holder_.lang_type_, &args_, std::make_index_sequence{}, std::forward(args)...); diff --git a/cpp/include/ray/api/ray_runtime.h b/cpp/include/ray/api/ray_runtime.h index 8a8bf35e83ce..a56c95f148d7 100644 --- a/cpp/include/ray/api/ray_runtime.h +++ b/cpp/include/ray/api/ray_runtime.h @@ -32,24 +32,24 @@ struct RemoteFunctionHolder { RemoteFunctionHolder(const std::string &module_name, const std::string &function_name, const std::string &class_name = "", - LangType lang_type = LangType::CPP) { - this->module_name = module_name; - this->function_name = function_name; - this->class_name = class_name; - this->lang_type = lang_type; - } + LangType lang_type = LangType::CPP) + : module_name_(module_name), + function_name_(function_name), + class_name_(class_name), + lang_type_(lang_type) {} + RemoteFunctionHolder(std::string func_name) { if (func_name.empty()) { throw RayException( "Function not found. 
Please use RAY_REMOTE to register this function."); } - function_name = std::move(func_name); + function_name_ = std::move(func_name); } - std::string module_name; - std::string function_name; - std::string class_name; - LangType lang_type = LangType::CPP; + std::string module_name_; + std::string function_name_; + std::string class_name_; + LangType lang_type_ = LangType::CPP; }; class RayRuntime { diff --git a/cpp/include/ray/api/task_caller.h b/cpp/include/ray/api/task_caller.h index ca61c49c594f..c3b24f6dbe8c 100644 --- a/cpp/include/ray/api/task_caller.h +++ b/cpp/include/ray/api/task_caller.h @@ -83,14 +83,14 @@ ObjectRef> TaskCaller::Remote( if constexpr (is_x_lang_v) { using ArgsTuple = std::tuple; - Arguments::WrapArgs(remote_function_holder_.lang_type, + Arguments::WrapArgs(remote_function_holder_.lang_type_, &args_, std::make_index_sequence{}, std::forward(args)...); } else { StaticCheck(); using ArgsTuple = RemoveReference_t>; - Arguments::WrapArgs(remote_function_holder_.lang_type, + Arguments::WrapArgs(remote_function_holder_.lang_type_, &args_, std::make_index_sequence{}, std::forward(args)...); diff --git a/cpp/src/ray/config_internal.cc b/cpp/src/ray/config_internal.cc index 01ee6001502c..b5ae0b2d227b 100644 --- a/cpp/src/ray/config_internal.cc +++ b/cpp/src/ray/config_internal.cc @@ -22,6 +22,7 @@ #include "absl/flags/parse.h" #include "absl/strings/str_split.h" #include "nlohmann/json.hpp" +#include "ray/common/id.h" #include "ray/util/network_util.h" ABSL_FLAG(std::string, ray_address, "", "The address of the Ray cluster to connect to."); @@ -235,7 +236,7 @@ void ConfigInternal::Init(RayConfig &config, int argc, char **argv) { ray_namespace = FLAGS_ray_job_namespace.CurrentValue(); } if (ray_namespace.empty()) { - ray_namespace = GenerateUUIDV4(); + ray_namespace = UniqueID::FromRandom().Hex(); } } diff --git a/cpp/src/ray/runtime/abstract_ray_runtime.cc b/cpp/src/ray/runtime/abstract_ray_runtime.cc index 50b3f9f9073d..0f6ec2e24b4a 100644 --- 
a/cpp/src/ray/runtime/abstract_ray_runtime.cc +++ b/cpp/src/ray/runtime/abstract_ray_runtime.cc @@ -172,7 +172,7 @@ InvocationSpec BuildInvocationSpec1(TaskType task_type, invocation_spec.remote_function_holder = remote_function_holder; invocation_spec.actor_id = actor; invocation_spec.args = - TransformArgs(args, remote_function_holder.lang_type != LangType::CPP); + TransformArgs(args, remote_function_holder.lang_type_ != LangType::CPP); return invocation_spec; } @@ -199,23 +199,23 @@ std::string AbstractRayRuntime::CallActor( std::vector &args, const CallOptions &call_options) { InvocationSpec invocation_spec{}; - if (remote_function_holder.lang_type == LangType::PYTHON) { + if (remote_function_holder.lang_type_ == LangType::PYTHON) { const auto native_actor_handle = CoreWorkerProcess::GetCoreWorker().GetActorHandle( ray::ActorID::FromBinary(actor)); auto function_descriptor = native_actor_handle->ActorCreationTaskFunctionDescriptor(); auto typed_descriptor = function_descriptor->As(); RemoteFunctionHolder func_holder = remote_function_holder; - func_holder.module_name = typed_descriptor->ModuleName(); - func_holder.class_name = typed_descriptor->ClassName(); + func_holder.module_name_ = typed_descriptor->ModuleName(); + func_holder.class_name_ = typed_descriptor->ClassName(); invocation_spec = BuildInvocationSpec1( TaskType::ACTOR_TASK, func_holder, args, ActorID::FromBinary(actor)); - } else if (remote_function_holder.lang_type == LangType::JAVA) { + } else if (remote_function_holder.lang_type_ == LangType::JAVA) { const auto native_actor_handle = CoreWorkerProcess::GetCoreWorker().GetActorHandle( ray::ActorID::FromBinary(actor)); auto function_descriptor = native_actor_handle->ActorCreationTaskFunctionDescriptor(); auto typed_descriptor = function_descriptor->As(); RemoteFunctionHolder func_holder = remote_function_holder; - func_holder.class_name = typed_descriptor->ClassName(); + func_holder.class_name_ = typed_descriptor->ClassName(); invocation_spec = 
BuildInvocationSpec1( TaskType::ACTOR_TASK, func_holder, args, ActorID::FromBinary(actor)); } else { diff --git a/cpp/src/ray/runtime/runtime_env.cc b/cpp/src/ray/runtime/runtime_env.cc index df69dbfd36d3..437238bd2f9c 100644 --- a/cpp/src/ray/runtime/runtime_env.cc +++ b/cpp/src/ray/runtime/runtime_env.cc @@ -16,7 +16,7 @@ #include #include -#include "src/ray/protobuf/runtime_env_common.pb.h" +#include "src/ray/protobuf/public/runtime_environment.pb.h" namespace ray { diff --git a/cpp/src/ray/runtime/task/local_mode_task_submitter.cc b/cpp/src/ray/runtime/task/local_mode_task_submitter.cc index 90cba57d573b..6c91f2516b19 100644 --- a/cpp/src/ray/runtime/task/local_mode_task_submitter.cc +++ b/cpp/src/ray/runtime/task/local_mode_task_submitter.cc @@ -37,7 +37,7 @@ ObjectID LocalModeTaskSubmitter::Submit(InvocationSpec &invocation, /// Maybe some information of TaskSpecification are not reasonable or invalid. /// We will enhance this after implement the cluster mode. auto functionDescriptor = FunctionDescriptorBuilder::BuildCpp( - invocation.remote_function_holder.function_name); + invocation.remote_function_holder.function_name_); rpc::Address address; std::unordered_map required_resources; std::unordered_map required_placement_resources; diff --git a/cpp/src/ray/runtime/task/native_task_submitter.cc b/cpp/src/ray/runtime/task/native_task_submitter.cc index 8983eb857ae4..c42ecf725120 100644 --- a/cpp/src/ray/runtime/task/native_task_submitter.cc +++ b/cpp/src/ray/runtime/task/native_task_submitter.cc @@ -26,23 +26,23 @@ using ray::core::CoreWorkerProcess; using ray::core::TaskOptions; RayFunction BuildRayFunction(InvocationSpec &invocation) { - if (invocation.remote_function_holder.lang_type == LangType::CPP) { + if (invocation.remote_function_holder.lang_type_ == LangType::CPP) { auto function_descriptor = FunctionDescriptorBuilder::BuildCpp( - invocation.remote_function_holder.function_name, + invocation.remote_function_holder.function_name_, "", - 
invocation.remote_function_holder.class_name); + invocation.remote_function_holder.class_name_); return RayFunction(ray::Language::CPP, function_descriptor); - } else if (invocation.remote_function_holder.lang_type == LangType::PYTHON) { + } else if (invocation.remote_function_holder.lang_type_ == LangType::PYTHON) { auto function_descriptor = FunctionDescriptorBuilder::BuildPython( - invocation.remote_function_holder.module_name, - invocation.remote_function_holder.class_name, - invocation.remote_function_holder.function_name, + invocation.remote_function_holder.module_name_, + invocation.remote_function_holder.class_name_, + invocation.remote_function_holder.function_name_, ""); return RayFunction(ray::Language::PYTHON, function_descriptor); - } else if (invocation.remote_function_holder.lang_type == LangType::JAVA) { + } else if (invocation.remote_function_holder.lang_type_ == LangType::JAVA) { auto function_descriptor = FunctionDescriptorBuilder::BuildJava( - invocation.remote_function_holder.class_name, - invocation.remote_function_holder.function_name, + invocation.remote_function_holder.class_name_, + invocation.remote_function_holder.function_name_, ""); return RayFunction(ray::Language::JAVA, function_descriptor); } else { @@ -200,8 +200,7 @@ ray::PlacementGroup NativeTaskSubmitter::CreatePlacementGroup( create_options.name, (ray::core::PlacementStrategy)create_options.strategy, create_options.bundles, - false, - 1.0); + false); ray::PlacementGroupID placement_group_id; auto status = CoreWorkerProcess::GetCoreWorker().CreatePlacementGroup( options, &placement_group_id); diff --git a/cpp/src/ray/test/cluster/cluster_mode_test.cc b/cpp/src/ray/test/cluster/cluster_mode_test.cc index 2e7d2ff9a31a..3bbd0809c393 100644 --- a/cpp/src/ray/test/cluster/cluster_mode_test.cc +++ b/cpp/src/ray/test/cluster/cluster_mode_test.cc @@ -586,20 +586,20 @@ TEST(RayClusterModeTest, GetNamespaceApiTest) { class Pip { public: - std::vector packages; - bool pip_check = false; + 
std::vector packages_; + bool pip_check_ = false; Pip() = default; Pip(const std::vector &packages, bool pip_check) - : packages(packages), pip_check(pip_check) {} + : packages_(packages), pip_check_(pip_check) {} }; void to_json(nlohmann::json &j, const Pip &pip) { - j = nlohmann::json{{"packages", pip.packages}, {"pip_check", pip.pip_check}}; + j = nlohmann::json{{"packages", pip.packages_}, {"pip_check", pip.pip_check_}}; }; void from_json(const nlohmann::json &j, Pip &pip) { - j.at("packages").get_to(pip.packages); - j.at("pip_check").get_to(pip.pip_check); + j.at("packages").get_to(pip.packages_); + j.at("pip_check").get_to(pip.pip_check_); }; TEST(RayClusterModeTest, RuntimeEnvApiTest) { @@ -618,8 +618,8 @@ TEST(RayClusterModeTest, RuntimeEnvApiTest) { // Deserialize auto runtime_env_2 = ray::RuntimeEnv::Deserialize(serialized_runtime_env); auto pip2 = runtime_env_2.Get("pip"); - EXPECT_EQ(pip2.packages, pip.packages); - EXPECT_EQ(pip2.pip_check, pip.pip_check); + EXPECT_EQ(pip2.packages_, pip.packages_); + EXPECT_EQ(pip2.pip_check_, pip.pip_check_); auto working_dir2 = runtime_env_2.Get("working_dir"); EXPECT_EQ(working_dir2, working_dir); diff --git a/cpp/src/ray/test/cluster/counter.cc b/cpp/src/ray/test/cluster/counter.cc index eb77917189d9..7c994d3e0d99 100644 --- a/cpp/src/ray/test/cluster/counter.cc +++ b/cpp/src/ray/test/cluster/counter.cc @@ -84,10 +84,10 @@ bool Counter::CheckRestartInActorCreationTask() { return is_restared; } bool Counter::CheckRestartInActorTask() { return ray::WasCurrentActorRestarted(); } ray::ActorHandle Counter::CreateChildActor(std::string actor_name) { - auto child_actor = + auto new_child_actor = ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName(actor_name).Remote(); - child_actor.Task(&Counter::GetCount).Remote().Get(); - return child_actor; + new_child_actor.Task(&Counter::GetCount).Remote().Get(); + return new_child_actor; } std::string Counter::GetNamespaceInActor() { return ray::GetNamespace(); } diff --git 
a/cpp/src/ray/util/process_helper.cc b/cpp/src/ray/util/process_helper.cc index 25c691894d78..fbddae45f7b9 100644 --- a/cpp/src/ray/util/process_helper.cc +++ b/cpp/src/ray/util/process_helper.cc @@ -21,7 +21,6 @@ #include "ray/util/cmd_line_utils.h" #include "ray/util/network_util.h" #include "ray/util/process.h" -#include "ray/util/util.h" #include "src/ray/protobuf/gcs.pb.h" namespace ray { @@ -150,7 +149,6 @@ void ProcessHelper::RayStart(CoreWorkerOptions::TaskExecutionCallback callback) options.install_failure_signal_handler = true; options.node_ip_address = node_ip; options.node_manager_port = ConfigInternal::Instance().node_manager_port; - options.raylet_ip_address = node_ip; options.driver_name = "cpp_worker"; options.metrics_agent_port = -1; options.task_execution_callback = callback; diff --git a/cpp/src/ray/util/process_helper.h b/cpp/src/ray/util/process_helper.h index 084bbeda93a7..26dfc6ca108c 100644 --- a/cpp/src/ray/util/process_helper.h +++ b/cpp/src/ray/util/process_helper.h @@ -17,7 +17,7 @@ #include "../config_internal.h" #include "ray/core_worker/core_worker.h" -#include "ray/gcs/gcs_client/global_state_accessor.h" +#include "ray/gcs_client/global_state_accessor.h" #include "util.h" namespace ray { diff --git a/cpp/test_submit_cpp_job.py b/cpp/test_submit_cpp_job.py index 5ab6e753bf8f..695079a50c6a 100644 --- a/cpp/test_submit_cpp_job.py +++ b/cpp/test_submit_cpp_job.py @@ -21,9 +21,7 @@ def headers(): @pytest.fixture(scope="module") def job_sdk_client(headers): - with _ray_start( - include_dashboard=True, num_cpus=1, _node_ip_address="0.0.0.0" - ) as ctx: + with _ray_start(include_dashboard=True, num_cpus=1) as ctx: address = ctx.address_info["webui_url"] assert wait_until_server_available(address) yield JobSubmissionClient(format_web_url(address), headers=headers) diff --git a/doc/BUILD.bazel b/doc/BUILD.bazel index 8e0cf7adde4e..2dcca3b6ac1e 100644 --- a/doc/BUILD.bazel +++ b/doc/BUILD.bazel @@ -1,6 +1,6 @@ 
load("@py_deps_buildkite//:requirements.bzl", ci_require = "requirement") load("@rules_python//python:defs.bzl", "py_test") -load("//bazel:python.bzl", "doctest", "py_test_run_all_notebooks", "py_test_run_all_subdirectory") +load("//bazel:python.bzl", "doctest", "doctest_each", "py_test_run_all_notebooks", "py_test_run_all_subdirectory") exports_files(["test_myst_doc.py"]) @@ -202,6 +202,41 @@ py_test( ], ) +py_test( + name = "doc_code_direct_transport_gloo", + size = "small", + srcs = ["source/ray-core/doc_code/direct_transport_gloo.py"], + main = "source/ray-core/doc_code/direct_transport_gloo.py", + tags = [ + "exclusive", + "team:core", + ], +) + +py_test( + name = "doc_code_direct_transport_nccl", + size = "small", + srcs = ["source/ray-core/doc_code/direct_transport_nccl.py"], + main = "source/ray-core/doc_code/direct_transport_nccl.py", + tags = [ + "exclusive", + "multi_gpu", + "team:core", + ], +) + +py_test( + name = "doc_code_direct_transport_nixl", + size = "small", + srcs = ["source/ray-core/doc_code/direct_transport_nixl.py"], + main = "source/ray-core/doc_code/direct_transport_nixl.py", + tags = [ + "exclusive", + "multi_gpu", + "team:core", + ], +) + py_test_run_all_subdirectory( size = "medium", include = ["source/ray-core/doc_code/*.py"], @@ -214,6 +249,8 @@ py_test_run_all_subdirectory( "source/ray-core/doc_code/cgraph_overlap.py", # not testing this as it purposefully segfaults "source/ray-core/doc_code/cgraph_troubleshooting.py", + "source/ray-core/doc_code/direct_transport_nccl.py", + "source/ray-core/doc_code/direct_transport_nixl.py", ], extra_srcs = [], tags = [ @@ -239,6 +276,8 @@ py_test_run_all_subdirectory( "source/serve/doc_code/stable_diffusion.py", "source/serve/doc_code/object_detection.py", "source/serve/doc_code/vllm_example.py", + "source/serve/doc_code/llm/llm_yaml_config_example.py", + "source/serve/doc_code/llm/qwen_example.py", ], extra_srcs = [], tags = [ @@ -270,6 +309,30 @@ py_test_run_all_subdirectory( ], ) +# 
-------------------------------------------------------------------- +# Test all doc/source/llm/doc_code/serve code included in rst/md files. +# -------------------------------------------------------------------- + +filegroup( + name = "serve_llm_examples", + srcs = glob(["source/llm/doc_code/serve/**/*.py"]), + visibility = ["//doc:__subpackages__"], +) + +# GPU Tests +py_test_run_all_subdirectory( + size = "large", + include = ["source/llm/doc_code/serve/**/*.py"], + exclude = [], + extra_srcs = [], + data = ["source/llm/doc_code/serve/qwen/llm_config_example.yaml"], + tags = [ + "exclusive", + "gpu", + "team:llm", + ], +) + # -------------------------------------------------------------------- # Test all doc/source/tune/doc_code code included in rst/md files. # -------------------------------------------------------------------- @@ -454,8 +517,7 @@ doctest( tags = ["team:core"], ) -doctest( - name = "doctest[data]", +doctest_each( files = glob( include = [ "source/data/**/*.md", @@ -466,15 +528,9 @@ doctest( "source/data/batch_inference.rst", "source/data/transforming-data.rst", # These tests are currently failing. - "source/data/loading-data.rst", - "source/data/data-internals.rst", - "source/data/inspecting-data.rst", - "source/data/loading-data.rst", - "source/data/performance-tips.rst", - "source/data/saving-data.rst", - "source/data/working-with-images.rst", "source/data/working-with-llms.rst", - "source/data/working-with-pytorch.rst", + # These don't contain code snippets. 
+ "source/data/api/**/*.rst", ], ), pytest_plugin_file = "//python/ray/data:tests/doctest_pytest_plugin.py", @@ -580,3 +636,9 @@ filegroup( srcs = glob(["source/ray-overview/examples/**/*.yaml"]), visibility = ["//release:__pkg__"], ) + +filegroup( + name = "deployment_serve_llm_example_configs", + srcs = glob(["source/serve/tutorials/deployment-serve-llm/**/*.yaml"]), + visibility = ["//release:__pkg__"], +) diff --git a/doc/external/test_hashes.py b/doc/external/test_hashes.py index 7e98fcf6bc4b..4bd5a62aa203 100644 --- a/doc/external/test_hashes.py +++ b/doc/external/test_hashes.py @@ -6,7 +6,7 @@ import runfiles import pytest -_REPO_NAME = "com_github_ray_project_ray" +_REPO_NAME = "io_ray" _runfiles = runfiles.Create() diff --git a/doc/source/_includes/_help.rst b/doc/source/_includes/_help.rst index 05f46e7dcdf7..aecc526fd21c 100644 --- a/doc/source/_includes/_help.rst +++ b/doc/source/_includes/_help.rst @@ -3,7 +3,7 @@ You can post questions or issues or feedback through the following channels: 1. `Discussion Board`_: For **questions about Ray usage** or **feature requests**. 2. `GitHub Issues`_: For **bug reports**. 3. `Ray Slack`_: For **getting in touch** with Ray maintainers. -4. `StackOverflow`_: Use the [ray] tag **questions about Ray**. +4. `StackOverflow`_: Use the [ray] tag for **questions about Ray**. .. _`Discussion Board`: https://discuss.ray.io/ .. 
_`GitHub Issues`: https://github.com/ray-project/ray/issues diff --git a/doc/source/_static/css/custom.css b/doc/source/_static/css/custom.css index 24ed4992b2ba..4d685fdb4dea 100644 --- a/doc/source/_static/css/custom.css +++ b/doc/source/_static/css/custom.css @@ -408,4 +408,18 @@ readthedocs-flyout { /* Styling the experimental Anyscale upsell CTA */ .anyscale-cta { margin-bottom: 16px; +} + + +/* Prevent text wrapping around left-aligned images on ultra-wide screens */ +@media (min-width: 1600px) { + .bd-content .align-left, + .bd-content .figure.align-left, + .bd-content img.align-left { + float: none !important; + display: block; + clear: both; + margin-left: 0 !important; + margin-right: 0 !important; + } } \ No newline at end of file diff --git a/doc/source/_templates/csat.html b/doc/source/_templates/csat.html index 368af0d322d2..852245d7bd50 100644 --- a/doc/source/_templates/csat.html +++ b/doc/source/_templates/csat.html @@ -8,13 +8,13 @@ - Yes + Yes
- No + No
diff --git a/doc/source/_templates/template.ipynb b/doc/source/_templates/template.ipynb index b1778ed9fb97..6ea5ba018ab4 100644 --- a/doc/source/_templates/template.ipynb +++ b/doc/source/_templates/template.ipynb @@ -15,9 +15,9 @@ "If you want to learn more about the MyST parser, see the\n", "[MyST documentation](https://myst-parser.readthedocs.io/en/latest/).\n", "\n", - "MyST is common markdown compliant, so if you can use plain markdown here.\n", - "In case you need to execute restructured text (`rSt`) directives, you can use `{eval-rst}` to execute the code.\n", - "For instance, a here's a note written in rSt:\n", + "MyST is CommonMark compliant, so you can use plain markdown here.\n", + "In case you need to execute restructured text (rST) directives, you can use `{eval-rst}` to execute the code.\n", + "For instance, here's a note written in rST:\n", "\n", "```{eval-rst}\n", ".. note::\n", @@ -69,10 +69,10 @@ "source": [ "## Hiding and removing cells\n", "\n", - "You can hide cells, so that they will toggle when you click on the cell header.\n", + "You can hide cells, so that they toggle when you click the cell header.\n", "You can use different `:tags:` like `hide-cell`, `hide-input`, or `hide-output` to hide cell content,\n", - "and you can use `remove-cell`, `remove-input`, or `remove-output` to remove the cell completely when rendered.\n", - "Those cells will still show up in the notebook itself." + "and you can use `remove-cell`, `remove-input`, or `remove-output` to completely remove the cell when rendered.\n", + "Those cells still show up in the notebook itself." 
] }, { @@ -107,7 +107,7 @@ "And this is a note.\n", ":::\n", "\n", - "The following cell will be removed and not render:" + "The following cell doesn't render:" ] }, { diff --git a/doc/source/_templates/template.md b/doc/source/_templates/template.md index 8e1cc0c58655..cf330f14eba3 100644 --- a/doc/source/_templates/template.md +++ b/doc/source/_templates/template.md @@ -20,9 +20,9 @@ For more information on MyST notebooks, see the If you want to learn more about the MyST parser, see the [MyST documentation](https://myst-parser.readthedocs.io/en/latest/). -MyST is common markdown compliant, so if you can use plain markdown here. -In case you need to execute restructured text (`rSt`) directives, you can use `{eval-rst}` to execute the code. -For instance, a here's a note written in rSt: +MyST is CommonMark compliant, so you can use plain markdown here. +In case you need to execute restructured text (rST) directives, you can use `{eval-rst}` to execute the code. +For instance, here's a note written in rST: ```{eval-rst} .. note:: @@ -65,10 +65,10 @@ checkpoint_path = train_ppo_model() ## Hiding and removing cells -You can hide cells, so that they will toggle when you click on the cell header. +You can hide cells, so that they toggle when you click the cell header. You can use different `:tags:` like `hide-cell`, `hide-input`, or `hide-output` to hide cell content, -and you can use `remove-cell`, `remove-input`, or `remove-output` to remove the cell completely when rendered. -Those cells will still show up in the notebook itself. +and you can use `remove-cell`, `remove-input`, or `remove-output` to completely remove the cell when rendered. +Those cells still show up in the notebook itself. ```{code-cell} python3 :tags: [hide-cell] @@ -88,7 +88,7 @@ Here's a quick tip. And this is a note. 
::: -The following cell will be removed and not render: +The following cell doesn't render: ```{code-cell} python3 :tags: [remove-cell] diff --git a/doc/source/cluster/configure-manage-dashboard.md b/doc/source/cluster/configure-manage-dashboard.md index b14476ab5f3b..1e996e6b0e5a 100644 --- a/doc/source/cluster/configure-manage-dashboard.md +++ b/doc/source/cluster/configure-manage-dashboard.md @@ -2,7 +2,7 @@ # Configuring and Managing Ray Dashboard {ref}`Ray Dashboard` is one of the most important tools to monitor and debug Ray applications and Clusters. This page describes how to configure Ray Dashboard on your Clusters. -Dashboard configurations may differ depending on how you launch Ray Clusters (e.g., local Ray Cluster v.s. KubeRay). Integrations with Prometheus and Grafana are optional for enhanced Dashboard experience. +Dashboard configurations may differ depending on how you launch Ray Clusters (e.g., local Ray Cluster vs. KubeRay). Integrations with Prometheus and Grafana are optional for enhanced Dashboard experience. :::{note} Ray Dashboard is useful for interactive development and debugging because when clusters terminate, the dashboard UI and the underlying data are no longer accessible. For production monitoring and debugging, you should rely on [persisted logs](../cluster/kubernetes/user-guides/persist-kuberay-custom-resource-logs.md), [persisted metrics](./metrics.md), [persisted Ray states](../ray-observability/user-guides/cli-sdk.rst), and other observability tools. @@ -135,7 +135,7 @@ The Ray Dashboard provides read **and write** access to the Ray Cluster. The rev Dashboard is included if you use `ray[default]` or {ref}`other installation commands ` and automatically started. -To disable Dashboard, use the following arguments `--include-dashboard`. +To disable the Dashboard, use the `--include-dashboard` argument. 
::::{tab-set} @@ -267,7 +267,7 @@ If you have followed the instructions above to set up everything, run the connec ##### Getting an error that says `RAY_GRAFANA_HOST` is not setup -If you have set up Grafana , check that: +If you have set up Grafana, check that: * You've included the protocol in the URL (e.g., `http://your-grafana-url.com` instead of `your-grafana-url.com`). * The URL doesn't have a trailing slash (e.g., `http://your-grafana-url.com` instead of `http://your-grafana-url.com/`). diff --git a/doc/source/cluster/faq.rst b/doc/source/cluster/faq.rst index 19be814cdd81..64f6526415b3 100644 --- a/doc/source/cluster/faq.rst +++ b/doc/source/cluster/faq.rst @@ -43,7 +43,7 @@ connect. Use this command: .. code:: bash - ray start --head --node-ip-address xx.xx.xx.xx --port nnnn`` + ray start --head --node-ip-address xx.xx.xx.xx --port nnnn Then when starting the worker node, use this command to connect to the head node: @@ -66,8 +66,8 @@ debugging routing issues. You may also see failures in the log like: - This node has an IP address of xx.xx.xx.xx, while we can not found the - matched Raylet address. This maybe come from when you connect the Ray + This node has an IP address of xx.xx.xx.xx, while we cannot find the + matched Raylet address. This may come from when you connect the Ray cluster with a different IP address or connect a container. 
The cause of this error may be the head node overloading with too many simultaneous diff --git a/doc/source/cluster/kubernetes/examples.md b/doc/source/cluster/kubernetes/examples.md index 4e683a784a1f..167a1362b221 100644 --- a/doc/source/cluster/kubernetes/examples.md +++ b/doc/source/cluster/kubernetes/examples.md @@ -31,7 +31,7 @@ This section presents example Ray workloads to try out on your Kubernetes cluste - {ref}`kuberay-batch-inference-example` - {ref}`kuberay-kueue-priority-scheduling-example` - {ref}`kuberay-kueue-gang-scheduling-example` -- {ref}`kuberay-distributed-checkpointing-gcsefuse` +- {ref}`kuberay-distributed-checkpointing-gcsfuse` - {ref}`kuberay-modin-example` - {ref}`kuberay-rayservice-llm-example` - {ref}`kuberay-rayservice-deepseek-example` diff --git a/doc/source/cluster/kubernetes/examples/distributed-checkpointing-with-gcsfuse.md b/doc/source/cluster/kubernetes/examples/distributed-checkpointing-with-gcsfuse.md index fceabc555b8b..19fe6572df6a 100644 --- a/doc/source/cluster/kubernetes/examples/distributed-checkpointing-with-gcsfuse.md +++ b/doc/source/cluster/kubernetes/examples/distributed-checkpointing-with-gcsfuse.md @@ -1,4 +1,4 @@ -(kuberay-distributed-checkpointing-gcsefuse)= +(kuberay-distributed-checkpointing-gcsfuse)= # Distributed checkpointing with KubeRay and GCSFuse diff --git a/doc/source/cluster/kubernetes/examples/rayserve-deepseek-example.md b/doc/source/cluster/kubernetes/examples/rayserve-deepseek-example.md index 3b9fff613fd3..8f147de21742 100644 --- a/doc/source/cluster/kubernetes/examples/rayserve-deepseek-example.md +++ b/doc/source/cluster/kubernetes/examples/rayserve-deepseek-example.md @@ -2,7 +2,7 @@ # Serve Deepseek R1 using Ray Serve LLM -This guide provides a step-by-step guide for deploying a Large Language Model (LLM) using Ray Serve LLM on Kubernetes. 
Leveraging KubeRay, Ray Serve, and vLLM, this guide deploys the `deepseek-ai/DeepSeek-R1` model from Hugging Face, enabling scalable, efficient, and OpenAI-compatible LLM serving within a Kubernetes environment. See [Serving LLMs](serving_llms) for information on Ray Serve LLM. +This guide provides step-by-step instructions for deploying a Large Language Model (LLM) using Ray Serve LLM on Kubernetes. Leveraging KubeRay, Ray Serve, and vLLM, this guide deploys the `deepseek-ai/DeepSeek-R1` model from Hugging Face, enabling scalable, efficient, and OpenAI-compatible LLM serving within a Kubernetes environment. See [Serving LLMs](serving-llms) for information on Ray Serve LLM. ## Prerequisites A DeepSeek model requires 2 nodes, each equipped with 8 H100 80 GB GPUs. @@ -108,7 +108,7 @@ In particular, this configuration loads the model from `deepseek-ai/DeepSeek-R1` This setting enables pipeline parallelism, dividing the model's entire set of layers into 2 sequential stages. Adjust this variable according to cluster worker node numbers. -The `deployment_config` section sets the desired number of engine replicas. See [Serving LLMs](serving_llms) and the [Ray Serve config documentation](serve-in-production-config-file) for more information. +The `deployment_config` section sets the desired number of engine replicas. See [Serving LLMs](serving-llms) and the [Ray Serve config documentation](serve-in-production-config-file) for more information. Wait for the RayService resource to become healthy.
You can confirm its status by running the following command: ```sh diff --git a/doc/source/cluster/kubernetes/examples/rayserve-llm-example.md b/doc/source/cluster/kubernetes/examples/rayserve-llm-example.md index b3aa86b24997..77e48dc45064 100644 --- a/doc/source/cluster/kubernetes/examples/rayserve-llm-example.md +++ b/doc/source/cluster/kubernetes/examples/rayserve-llm-example.md @@ -2,7 +2,7 @@ # Serve a Large Language Model using Ray Serve LLM on Kubernetes -This guide provides a step-by-step guide for deploying a Large Language Model (LLM) using Ray Serve LLM on Kubernetes. Leveraging KubeRay, Ray Serve, and vLLM, this guide deploys the `Qwen/Qwen2.5-7B-Instruct` model from Hugging Face, enabling scalable, efficient, and OpenAI-compatible LLM serving within a Kubernetes environment. See [Serving LLMs](serving_llms) for information on Ray Serve LLM. +This guide provides step-by-step instructions for deploying a Large Language Model (LLM) using Ray Serve LLM on Kubernetes. Leveraging KubeRay, Ray Serve, and vLLM, this guide deploys the `Qwen/Qwen2.5-7B-Instruct` model from Hugging Face, enabling scalable, efficient, and OpenAI-compatible LLM serving within a Kubernetes environment. See [Serving LLMs](serving-llms) for information on Ray Serve LLM. ## Prerequisites @@ -72,7 +72,7 @@ serveConfigV2: | max_ongoing_requests: 128 ``` -In particular, this configuration loads the model from `Qwen/Qwen2.5-7B-Instruct` and sets its `model_id` to `qwen2.5-7b-instruct`. The `LLMDeployment` initializes the underlying LLM engine using the `engine_kwargs` field. The `deployment_config` section sets the desired number of engine replicas. By default, each replica requires one GPU. See [Serving LLMs](serving_llms) and the [Ray Serve config documentation](serve-in-production-config-file) for more information. +In particular, this configuration loads the model from `Qwen/Qwen2.5-7B-Instruct` and sets its `model_id` to `qwen2.5-7b-instruct`.
The `LLMDeployment` initializes the underlying LLM engine using the `engine_kwargs` field. The `deployment_config` section sets the desired number of engine replicas. By default, each replica requires one GPU. See [Serving LLMs](serving-llms) and the [Ray Serve config documentation](serve-in-production-config-file) for more information. Wait for the RayService resource to become healthy. You can confirm its status by running the following command: ```sh diff --git a/doc/source/cluster/kubernetes/getting-started/rayjob-quick-start.md b/doc/source/cluster/kubernetes/getting-started/rayjob-quick-start.md index b7102c0852b4..f70a700aa333 100644 --- a/doc/source/cluster/kubernetes/getting-started/rayjob-quick-start.md +++ b/doc/source/cluster/kubernetes/getting-started/rayjob-quick-start.md @@ -49,8 +49,11 @@ To understand the following content better, you should understand the difference * `metadata` (Optional): See {ref}`Ray Jobs CLI API Reference ` for more details about the `--metadata-json` option. * `entrypointNumCpus` / `entrypointNumGpus` / `entrypointResources` (Optional): See {ref}`Ray Jobs CLI API Reference ` for more details. * `backoffLimit` (Optional, added in version 1.2.0): Specifies the number of retries before marking this RayJob failed. Each retry creates a new RayCluster. The default value is 0. -* Submission configuration - * `submissionMode` (Optional): `submissionMode` specifies how RayJob submits the Ray job to the RayCluster. In "K8sJobMode", the KubeRay operator creates a submitter Kubernetes Job to submit the Ray job. In "HTTPMode", the KubeRay operator sends a request to the RayCluster to create a Ray job. The default value is "K8sJobMode". +* Submission configuration + * `submissionMode` (Optional): Specifies how RayJob submits the Ray job to the RayCluster. There are three possible values, with the default being `K8sJobMode`. + * `K8sJobMode`: The KubeRay operator creates a submitter Kubernetes Job to submit the Ray job. 
+ * `HTTPMode`: The KubeRay operator sends a request to the RayCluster to create a Ray job. + * `InteractiveMode`: The KubeRay operator waits for the user to submit a job to the RayCluster. This mode is currently in alpha and the [KubeRay kubectl plugin](kubectl-plugin) relies on it. * `submitterPodTemplate` (Optional): Defines the Pod template for the submitter Kubernetes Job. This field is only effective when `submissionMode` is "K8sJobMode". * `RAY_DASHBOARD_ADDRESS` - The KubeRay operator injects this environment variable to the submitter Pod. The value is `$HEAD_SERVICE:$DASHBOARD_PORT`. * `RAY_JOB_SUBMISSION_ID` - The KubeRay operator injects this environment variable to the submitter Pod. The value is the `RayJob.Status.JobId` of the RayJob. @@ -201,4 +204,4 @@ kind delete cluster * [RayJob Batch Inference Example](kuberay-batch-inference-example) * [Priority Scheduling with RayJob and Kueue](kuberay-kueue-priority-scheduling-example) -* [Gang Scheduling with RayJob and Kueue](kuberay-kueue-gang-scheduling-example) \ No newline at end of file +* [Gang Scheduling with RayJob and Kueue](kuberay-kueue-gang-scheduling-example) diff --git a/doc/source/cluster/kubernetes/k8s-ecosystem/metrics-references.md b/doc/source/cluster/kubernetes/k8s-ecosystem/metrics-references.md index c3f95c771e14..f4a7a6c97ee5 100644 --- a/doc/source/cluster/kubernetes/k8s-ecosystem/metrics-references.md +++ b/doc/source/cluster/kubernetes/k8s-ecosystem/metrics-references.md @@ -28,24 +28,24 @@ curl localhost:8080/metrics | Metric name | Type | Description | Labels | |--------------------------------------------------|-------|----------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------| -| `kuberay_cluster_info` | Gauge | Metadata information about RayCluster custom resources. | `namespace`: <RayCluster-namespace>
`name`: <RayCluster-name>
`owner_kind`: <RayJob\|RayService\|None> | -| `kuberay_cluster_condition_provisioned` | Gauge | Indicates whether the RayCluster is provisioned. See [RayClusterProvisioned](https://github.com/ray-project/kuberay/blob/7c6aedff5b4106281f50e87a7e9e177bf1237ec7/ray-operator/apis/ray/v1/raycluster_types.go#L214) for more information. | `namespace`: <RayCluster-namespace>
`name`: <RayCluster-name>
`condition`: <true\|false> | -| `kuberay_cluster_provisioned_duration_seconds` | Gauge | The time, in seconds, when a RayCluster's `RayClusterProvisioned` status transitions from false (or unset) to true. | `namespace`: <RayCluster-namespace>
`name`: <RayCluster-name> | +| `kuberay_cluster_info` | Gauge | Metadata information about RayCluster custom resources. | `namespace`: <RayCluster-namespace>
`name`: <RayCluster-name>
`owner_kind`: <RayJob\|RayService\|None>
`uid`: <RayCluster-uid> | +| `kuberay_cluster_condition_provisioned` | Gauge | Indicates whether the RayCluster is provisioned. See [RayClusterProvisioned](https://github.com/ray-project/kuberay/blob/7c6aedff5b4106281f50e87a7e9e177bf1237ec7/ray-operator/apis/ray/v1/raycluster_types.go#L214) for more information. | `namespace`: <RayCluster-namespace>
`name`: <RayCluster-name>
`condition`: <true\|false>
`uid`: <RayCluster-uid> | +| `kuberay_cluster_provisioned_duration_seconds` | Gauge | The time, in seconds, when a RayCluster's `RayClusterProvisioned` status transitions from false (or unset) to true. | `namespace`: <RayCluster-namespace>
`name`: <RayCluster-name>
`uid`: <RayCluster-uid> | ### RayService metrics | Metric name | Type | Description | Labels | |--------------------------------------------------|-------|------------------------------------------------------------|--------------------------------------------------------------------| -| `kuberay_service_info` | Gauge | Metadata information about RayService custom resources. | `namespace`: <RayService-namespace>
`name`: <RayService-name> | -| `kuberay_service_condition_ready` | Gauge | Describes whether the RayService is ready. Ready means users can send requests to the underlying cluster and the number of serve endpoints is greater than 0. See [RayServiceReady](https://github.com/ray-project/kuberay/blob/33ee6724ca2a429c77cb7ff5821ba9a3d63f7c34/ray-operator/apis/ray/v1/rayservice_types.go#L135) for more information. | `namespace`: <RayService-namespace>
`name`: <RayService-name> | -| `kuberay_service_condition_upgrade_in_progress` | Gauge | Describes whether the RayService is performing a zero-downtime upgrade. See [UpgradeInProgress](https://github.com/ray-project/kuberay/blob/33ee6724ca2a429c77cb7ff5821ba9a3d63f7c34/ray-operator/apis/ray/v1/rayservice_types.go#L137) for more information. | `namespace`: <RayService-namespace>
`name`: <RayService-name> | +| `kuberay_service_info` | Gauge | Metadata information about RayService custom resources. | `namespace`: <RayService-namespace>
`name`: <RayService-name>
`uid`: <RayService-uid> | +| `kuberay_service_condition_ready` | Gauge | Describes whether the RayService is ready. Ready means users can send requests to the underlying cluster and the number of serve endpoints is greater than 0. See [RayServiceReady](https://github.com/ray-project/kuberay/blob/33ee6724ca2a429c77cb7ff5821ba9a3d63f7c34/ray-operator/apis/ray/v1/rayservice_types.go#L135) for more information. | `namespace`: <RayService-namespace>
`name`: <RayService-name>
`uid`: <RayService-uid> | +| `kuberay_service_condition_upgrade_in_progress` | Gauge | Describes whether the RayService is performing a zero-downtime upgrade. See [UpgradeInProgress](https://github.com/ray-project/kuberay/blob/33ee6724ca2a429c77cb7ff5821ba9a3d63f7c34/ray-operator/apis/ray/v1/rayservice_types.go#L137) for more information. | `namespace`: <RayService-namespace>
`name`: <RayService-name>
`uid`: <RayService-uid> | ### RayJob metrics | Metric name | Type | Description | Labels | |--------------------------------------------------|-------|------------------------------------------------------------|---------------------------------------------------------------------------| -| `kuberay_job_info` | Gauge | Metadata information about RayJob custom resources. | `namespace`: <RayJob-namespace>
`name`: <RayJob-name> | -| `kuberay_job_deployment_status` | Gauge | The RayJob's current deployment status. | `namespace`: <RayJob-namespace>
`name`: <RayJob-name>
`deployment_status`: <New\|Initializing\|Running\|Complete\|Failed\|Suspending\|Suspended\|Retrying\|Waiting> | -| `kuberay_job_execution_duration_seconds` | Gauge | Duration of the RayJob CR’s JobDeploymentStatus transition from `Initializing` to either the `Retrying` state or a terminal state, such as `Complete` or `Failed`. The `Retrying` state indicates that the CR previously failed and that spec.backoffLimit is enabled. | `namespace`: <RayJob-namespace>
`name`: <RayJob-name>
`job_deployment_status`: <Complete\|Failed>
`retry_count`: <count> | +| `kuberay_job_info` | Gauge | Metadata information about RayJob custom resources. | `namespace`: <RayJob-namespace>
`name`: <RayJob-name>
`uid`: <RayJob-uid> | +| `kuberay_job_deployment_status` | Gauge | The RayJob's current deployment status. | `namespace`: <RayJob-namespace>
`name`: <RayJob-name>
`deployment_status`: <New\|Initializing\|Running\|Complete\|Failed\|Suspending\|Suspended\|Retrying\|Waiting>
`uid`: <RayJob-uid> | +| `kuberay_job_execution_duration_seconds` | Gauge | Duration of the RayJob CR’s JobDeploymentStatus transition from `Initializing` to either the `Retrying` state or a terminal state, such as `Complete` or `Failed`. The `Retrying` state indicates that the CR previously failed and that spec.backoffLimit is enabled. | `namespace`: <RayJob-namespace>
`name`: <RayJob-name>
`job_deployment_status`: <Complete\|Failed>
`retry_count`: <count>
`uid`: <RayJob-uid> | diff --git a/doc/source/cluster/kubernetes/troubleshooting.md b/doc/source/cluster/kubernetes/troubleshooting.md index 5bf2257b44f5..b3525993ee88 100644 --- a/doc/source/cluster/kubernetes/troubleshooting.md +++ b/doc/source/cluster/kubernetes/troubleshooting.md @@ -9,5 +9,5 @@ troubleshooting/troubleshooting troubleshooting/rayservice-troubleshooting ``` -- {ref}`kuberay-troubleshootin-guides` +- {ref}`kuberay-troubleshooting-guides` - {ref}`kuberay-raysvc-troubleshoot` diff --git a/doc/source/cluster/kubernetes/troubleshooting/troubleshooting.md b/doc/source/cluster/kubernetes/troubleshooting/troubleshooting.md index f381efe16567..7ec4279f6f81 100644 --- a/doc/source/cluster/kubernetes/troubleshooting/troubleshooting.md +++ b/doc/source/cluster/kubernetes/troubleshooting/troubleshooting.md @@ -1,4 +1,4 @@ -(kuberay-troubleshootin-guides)= +(kuberay-troubleshooting-guides)= # Troubleshooting guide @@ -29,7 +29,7 @@ When a Ray job is created, the Ray dashboard agent process on the head node gets (docker-image-for-apple-macbooks)= ## Use ARM-based docker images for Apple M1 or M2 MacBooks -Ray builds different images for different platforms. Until Ray moves to building multi-architecture images, [tracked by this Github issue](https://github.com/ray-project/ray/issues/39364), use platform-specific docker images in the head and worker group specs of the [RayCluster config](https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#image). +Ray builds different images for different platforms. Until Ray moves to building multi-architecture images, [tracked by this GitHub issue](https://github.com/ray-project/ray/issues/39364), use platform-specific docker images in the head and worker group specs of the [RayCluster config](https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#image). 
Use an image with the tag `aarch64`, for example, `image: rayproject/ray:2.41.0-aarch64`), if you are running KubeRay on a MacBook M1 or M2. diff --git a/doc/source/cluster/kubernetes/user-guides/configuring-autoscaling.md b/doc/source/cluster/kubernetes/user-guides/configuring-autoscaling.md index 1586124868a9..b24b891c7b45 100644 --- a/doc/source/cluster/kubernetes/user-guides/configuring-autoscaling.md +++ b/doc/source/cluster/kubernetes/user-guides/configuring-autoscaling.md @@ -431,7 +431,7 @@ Total Usage: 0B/72.63GiB memory 0B/33.53GiB object_store_memory -Total Demands: +Pending Demands: (no resource demands) Node: 40f427230584b2d9c9f113d8db51d10eaf914aa9bf61f81dc7fabc64 diff --git a/doc/source/cluster/metrics.md b/doc/source/cluster/metrics.md index 7956a74ddcbb..c4ae16abe89e 100644 --- a/doc/source/cluster/metrics.md +++ b/doc/source/cluster/metrics.md @@ -188,6 +188,22 @@ scrape_configs: - '/tmp/ray/prom_metrics_service_discovery.json' ``` +#### HTTP service discovery +Ray also exposes the same list of addresses to scrape over an HTTP endpoint, compatible with [Prometheus HTTP Service Discovery](https://prometheus.io/docs/prometheus/latest/http_sd/). + +Use the following in your Prometheus config to use the HTTP endpoint for service discovery ([HTTP SD docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config)): + +```yaml +scrape_configs: +- job_name: 'ray' + http_sd_configs: + - url: 'http://<head-node-ip>:<dashboard-port>/api/prometheus/sd' + refresh_interval: 60s +``` + +- `<dashboard-port>` is `8265` by default. See [Configuring and Managing Ray Dashboard](https://docs.ray.io/en/latest/cluster/configure-manage-dashboard.html) for more details. +- The endpoint returns a JSON list of targets for Prometheus metrics. When no targets are available, it returns `[]`. + ### Manually discovering metrics endpoints If you know the IP addresses of the nodes in your Ray Cluster, you can configure Prometheus to read metrics from a static list of endpoints.
diff --git a/doc/source/cluster/package-overview.rst b/doc/source/cluster/package-overview.rst index 1aef16167d64..b79b5488d242 100644 --- a/doc/source/cluster/package-overview.rst +++ b/doc/source/cluster/package-overview.rst @@ -4,7 +4,7 @@ Ray Cluster Management API ========================== This section contains a reference for the cluster management API. If there is anything missing, please open an issue -on `Github`_. +on `GitHub`_. .. _`GitHub`: https://github.com/ray-project/ray/issues diff --git a/doc/source/cluster/running-applications/job-submission/quickstart.rst b/doc/source/cluster/running-applications/job-submission/quickstart.rst index c40e344e3b38..3a9efd1043ca 100644 --- a/doc/source/cluster/running-applications/job-submission/quickstart.rst +++ b/doc/source/cluster/running-applications/job-submission/quickstart.rst @@ -68,13 +68,13 @@ If you are using a local Ray Cluster (``ray start --head``), connect directly at If you are using a Ray Cluster started on VMs or Kubernetes, follow the instructions there for setting up network access from a client. See :ref:`Using a Remote Cluster ` for tips. -To tell the Ray Jobs CLI how to find your Ray Cluster, pass the Ray Dashboard address. Set the ``RAY_ADDRESS`` environment variable: +To tell the Ray Jobs CLI how to find your Ray Cluster, pass the Ray Dashboard address. Set the ``RAY_API_SERVER_ADDRESS`` environment variable: .. code-block:: bash - $ export RAY_ADDRESS="http://127.0.0.1:8265" + $ export RAY_API_SERVER_ADDRESS="http://127.0.0.1:8265" -Alternatively, you can also pass the ``--address=http://127.0.0.1:8265`` flag explicitly to each Ray Jobs CLI command, or prepend each command with ``RAY_ADDRESS=http://127.0.0.1:8265``. +Alternatively, you can also pass the ``--address=http://127.0.0.1:8265`` flag explicitly to each Ray Jobs CLI command, or prepend each command with ``RAY_API_SERVER_ADDRESS=http://127.0.0.1:8265``. 
Additionally, if you wish to pass headers per HTTP request to the Cluster, use the `RAY_JOB_HEADERS` environment variable. This environment variable must be in JSON form. @@ -217,7 +217,7 @@ Run the following command on your local machine, where ``cluster.yaml`` is the c ray dashboard cluster.yaml Once this command is running, verify that you can view the Ray Dashboard in your local browser at ``http://127.0.0.1:8265``. -Also, verify that you set the environment variable ``RAY_ADDRESS`` to ``"http://127.0.0.1:8265"``. After this setup, you can use the Jobs CLI on the local machine as in the preceding example to interact with the remote Ray cluster. +Also, verify that you set the environment variable ``RAY_API_SERVER_ADDRESS`` to ``"http://127.0.0.1:8265"``. After this setup, you can use the Jobs CLI on the local machine as in the preceding example to interact with the remote Ray cluster. Using the CLI on Kubernetes ^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/cluster/vms/user-guides/large-cluster-best-practices.rst b/doc/source/cluster/vms/user-guides/large-cluster-best-practices.rst index 278673bf7734..6dcff6758ab0 100644 --- a/doc/source/cluster/vms/user-guides/large-cluster-best-practices.rst +++ b/doc/source/cluster/vms/user-guides/large-cluster-best-practices.rst @@ -127,7 +127,7 @@ General recommendations with AWS instance types: should help with this). If your CPU utilization is low add GPUs, or vice versa. * The exact ratio will be very dependent on your workload. -* Once you find a good ratio, you should be able to scale up and and keep the +* Once you find a good ratio, you should be able to scale up and keep the same ratio. * You can’t infinitely scale forever. Eventually, as you add more machines your performance improvements will become sub-linear/not worth it. 
There may not diff --git a/doc/source/cluster/vms/user-guides/launching-clusters/aws.md b/doc/source/cluster/vms/user-guides/launching-clusters/aws.md index 3f7a3f7aba08..20d8f3298e23 100644 --- a/doc/source/cluster/vms/user-guides/launching-clusters/aws.md +++ b/doc/source/cluster/vms/user-guides/launching-clusters/aws.md @@ -154,7 +154,7 @@ CloudWatch integration with Ray requires an AMI (or Docker image) with the Unifi AMIs with the Unified CloudWatch Agent pre-installed are provided by the Amazon Ray Team, and are currently available in the us-east-1, us-east-2, us-west-1, and us-west-2 regions. Please direct any questions, comments, or issues to the `Amazon Ray Team `_. -The table below lists AMIs with the Unified CloudWatch Agent pre-installed in each region, and you can also find AMIs at `amazon-ray README `_. +The table below lists AMIs with the Unified CloudWatch Agent pre-installed in each region, and you can also find AMIs at `DLAMI Release Notes `_. Each DLAMI (Deep Learning AMI) is pre-installed with the Unified CloudWatch Agent, and its corresponding release notes include AWS CLI commands to query the latest AMI ID. .. 
list-table:: All available unified CloudWatch agent images @@ -162,22 +162,22 @@ The table below lists AMIs with the Unified CloudWatch Agent pre-installed in ea - AMI ID - Region - Unified CloudWatch Agent Version - * - AWS Deep Learning AMI (Ubuntu 18.04, 64-bit) - - ami-069f2811478f86c20 + * - AWS Deep Learning AMI (Ubuntu 24.04, 64-bit) + - ami-087feac195f30e722 - us-east-1 - - v1.247348.0b251302 - * - AWS Deep Learning AMI (Ubuntu 18.04, 64-bit) - - ami-058cc0932940c2b8b + - v1.300057.1b1167 + * - AWS Deep Learning AMI (Ubuntu 24.04, 64-bit) + - ami-0ed6c422a7c93278a - us-east-2 - - v1.247348.0b251302 - * - AWS Deep Learning AMI (Ubuntu 18.04, 64-bit) - - ami-044f95c9ef12883ef + - v1.300057.1b1167 + * - AWS Deep Learning AMI (Ubuntu 24.04, 64-bit) + - ami-0c5ddf2c101267018 - us-west-1 - - v1.247348.0b251302 - * - AWS Deep Learning AMI (Ubuntu 18.04, 64-bit) - - ami-0d88d9cbe28fac870 + - v1.300057.1b1167 + * - AWS Deep Learning AMI (Ubuntu 24.04, 64-bit) + - ami-0cfd95c6c87d00570 - us-west-2 - - v1.247348.0b251302 + - v1.300057.1b1167 .. 
note:: @@ -213,12 +213,12 @@ Getting started ray.head.default: node_config: InstanceType: c5a.large - ImageId: ami-0d88d9cbe28fac870 # Unified CloudWatch agent pre-installed AMI, us-west-2 + ImageId: ami-0cfd95c6c87d00570 # Unified CloudWatch agent pre-installed AMI, us-west-2 resources: {} ray.worker.default: node_config: InstanceType: c5a.large - ImageId: ami-0d88d9cbe28fac870 # Unified CloudWatch agent pre-installed AMI, us-west-2 + ImageId: ami-0cfd95c6c87d00570 # Unified CloudWatch agent pre-installed AMI, us-west-2 IamInstanceProfile: Name: ray-autoscaler-cloudwatch-v1 resources: {} @@ -275,11 +275,11 @@ The following CLI command returns the latest available Unified CloudWatch Agent ray.head.default: node_config: InstanceType: c5a.large - ImageId: ami-0d88d9cbe28fac870 + ImageId: ami-0cfd95c6c87d00570 ray.worker.default: node_config: InstanceType: c5a.large - ImageId: ami-0d88d9cbe28fac870 + ImageId: ami-0cfd95c6c87d00570 To build your own AMI with the Unified CloudWatch Agent installed: diff --git a/doc/source/cluster/vms/user-guides/launching-clusters/azure.md b/doc/source/cluster/vms/user-guides/launching-clusters/azure.md index 7fb6b4402ef2..37242aeb3bd7 100644 --- a/doc/source/cluster/vms/user-guides/launching-clusters/azure.md +++ b/doc/source/cluster/vms/user-guides/launching-clusters/azure.md @@ -8,7 +8,7 @@ There are two ways to start an Azure Ray cluster. - Deploy a cluster using Azure portal. ```{note} -The Azure integration is community-maintained. Please reach out to the integration maintainers on Github if +The Azure integration is community-maintained. Please reach out to the integration maintainers on GitHub if you run into any problems: gramhagen, eisber, ijrsvt. 
``` @@ -60,38 +60,46 @@ Download the reference example locally: wget https://raw.githubusercontent.com/ray-project/ray/master/python/ray/autoscaler/azure/example-full.yaml ``` -To connect to the provisioned head node VM, you need to ensure that you properly configure the `auth.ssh_private_key`, `auth.ssh_public_key`, and `file_mounts` configuration values to point to file paths on your local environment that have a valid key pair. By default the configuration assumes `$HOME/.ssh/id_rsa` and `$HOME/.ssh/id_rsa.pub`. If you have a different set of key pair files you want to use (for example a `ed25519` pair), update the `example-full.yaml` configurations to use them. - -For example a custom-configured `example-full.yaml` file might look like the following if you're using a `ed25519` key pair: - -```sh -$ git diff example-full.yaml -diff --git a/python/ray/autoscaler/azure/example-full.yaml b/python/ray/autoscaler/azure/example-full.yaml -index b25f1b07f1..c65fb77219 100644 ---- a/python/ray/autoscaler/azure/example-full.yaml -+++ b/python/ray/autoscaler/azure/example-full.yaml -@@ -61,9 +61,9 @@ auth: - ssh_user: ubuntu - # You must specify paths to matching private and public key pair files. - # Use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair. -- ssh_private_key: ~/.ssh/id_rsa -+ ssh_private_key: ~/.ssh/id_ed25519 - # Changes to this should match what is specified in file_mounts. -- ssh_public_key: ~/.ssh/id_rsa.pub -+ ssh_public_key: ~/.ssh/id_ed25519.pub - - # You can make more specific customization to node configurations can be made using the ARM template azure-vm-template.json file. 
- # See this documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines -@@ -128,7 +128,7 @@ head_node_type: ray.head.default - file_mounts: { - # "/path1/on/remote/machine": "/path1/on/local/machine", - # "/path2/on/remote/machine": "/path2/on/local/machine", -- "~/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"} -+ "~/.ssh/id_ed25519.pub": "~/.ssh/id_ed25519.pub"} - - # Files or directories to copy from the head node to the worker nodes. The format is a - # list of paths. Ray copies the same path on the head node to the worker node. - ``` + + +##### Automatic SSH Key Generation + +Ray automatically generates an SSH key pair for connecting to the provisioned head node VM if the config doesn't specify one. This is the simplest approach and requires no manual key management. + +The default configuration in `example-full.yaml` uses automatic key generation: + +```yaml +auth: + ssh_user: ubuntu + # SSH keys are auto-generated if not specified + # Uncomment and specify custom paths if you want to use existing keys: + # ssh_private_key: /path/to/your/key.pem + # ssh_public_key: /path/to/your/key.pub +``` + +##### (Optional) Manual SSH Key Configuration + +If you prefer to use your own existing SSH keys, uncomment and specify both of the key paths in the `auth` section. + +For example, to use an existing `ed25519` key pair: + +```yaml +auth: + ssh_user: ubuntu + ssh_private_key: ~/.ssh/id_ed25519 + ssh_public_key: ~/.ssh/id_ed25519.pub +``` + +Or for RSA keys: + +```yaml +auth: + ssh_user: ubuntu + ssh_private_key: ~/.ssh/id_rsa + ssh_public_key: ~/.ssh/id_rsa.pub +``` + +Both methods inject the public key directly into the VM's `~/.ssh/authorized_keys` via Azure ARM templates.
#### Launch the Ray cluster on Azure diff --git a/doc/source/conf.py b/doc/source/conf.py index d72a24fb14eb..02db9e3b801a 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -228,6 +228,8 @@ def __init__(self, version: str): "data/api/ray.data.*.rst", "ray-overview/examples/**/README.md", # Exclude .md files in examples subfolders "train/examples/**/README.md", + "serve/tutorials/deployment-serve-llm/README.*", + "serve/tutorials/deployment-serve-llm/*/notebook.ipynb", ] + autogen_files # If "DOC_LIB" is found, only build that top-level navigation item. diff --git a/doc/source/data/aggregations.rst b/doc/source/data/aggregations.rst index ffaa2263736c..443af7cc1878 100644 --- a/doc/source/data/aggregations.rst +++ b/doc/source/data/aggregations.rst @@ -8,9 +8,14 @@ Ray Data provides a flexible and performant API for performing aggregations on : Basic Aggregations ------------------ -Ray Data provides several built-in aggregation functions like -* :class:`~ray.data.aggregate.Count`, * :class:`~ray.data.aggregate.Sum`, * :class:`~ray.data.aggregate.Mean`, -* :class:`~ray.data.aggregate.Min`, * :class:`~ray.data.aggregate.Max`, * :class:`~ray.data.aggregate.Std`, +Ray Data provides several built-in aggregation functions like: + +* :class:`~ray.data.aggregate.Count` +* :class:`~ray.data.aggregate.Sum` +* :class:`~ray.data.aggregate.Mean` +* :class:`~ray.data.aggregate.Min` +* :class:`~ray.data.aggregate.Max` +* :class:`~ray.data.aggregate.Std` * :class:`~ray.data.aggregate.Quantile` These can be used directly with datasets like shown below: @@ -145,7 +150,7 @@ Here's an example of creating a custom aggregator that calculates the Mean of va .. note:: Internally, aggregations support both the :ref:`hash-shuffle backend ` and the :ref:`range based backend `. - Hash-shuffling can provide better performance for aggregations in certain cases. For more information see `comparision between hash based shuffling and Range Based shuffling approach `_ . 
+ Hash-shuffling can provide better performance for aggregations in certain cases. For more information see `comparison between hash based shuffling and Range Based shuffling approach `_ . To use the hash-shuffle algorithm for aggregations, you need to set the shuffle strategy explicitly: ``ray.data.DataContext.get_current().shuffle_strategy = ShuffleStrategy.HASH_SHUFFLE`` before creating a ``Dataset`` diff --git a/doc/source/data/api/api.rst b/doc/source/data/api/api.rst index e0d0e94d9480..009eafbdc950 100644 --- a/doc/source/data/api/api.rst +++ b/doc/source/data/api/api.rst @@ -13,7 +13,8 @@ Ray Data API aggregate.rst grouped_data.rst expressions.rst + datatype.rst data_context.rst preprocessor.rst llm.rst - from_other_data_libs.rst + from_other_data_libs.rst \ No newline at end of file diff --git a/doc/source/data/api/data_context.rst b/doc/source/data/api/data_context.rst index d86c640b20e2..eac742df73c8 100644 --- a/doc/source/data/api/data_context.rst +++ b/doc/source/data/api/data_context.rst @@ -3,7 +3,7 @@ Global configuration ==================== -.. currentmodule:: ray.data +.. currentmodule:: ray.data.context .. autoclass:: DataContext @@ -12,3 +12,6 @@ Global configuration :toctree: doc/ DataContext.get_current + + +.. autoclass:: AutoscalingConfig diff --git a/doc/source/data/api/dataset.rst b/doc/source/data/api/dataset.rst index 472ffa02b2f2..4e4fc16a04ad 100644 --- a/doc/source/data/api/dataset.rst +++ b/doc/source/data/api/dataset.rst @@ -43,4 +43,3 @@ Deprecated API :toctree: doc/ Dataset.iter_tf_batches - Dataset.to_torch diff --git a/doc/source/data/api/datatype.rst b/doc/source/data/api/datatype.rst new file mode 100644 index 000000000000..4e39831b50a9 --- /dev/null +++ b/doc/source/data/api/datatype.rst @@ -0,0 +1,12 @@ +.. _datatype-api: + +Data types +========== + +.. currentmodule:: ray.data.datatype + +Class +----- + +.. 
autoclass:: DataType + :members: diff --git a/doc/source/data/api/expressions.rst b/doc/source/data/api/expressions.rst index 69c1a50c93d0..3e73a314b3bf 100644 --- a/doc/source/data/api/expressions.rst +++ b/doc/source/data/api/expressions.rst @@ -1,7 +1,7 @@ .. _expressions-api: Expressions API -=============== +================ .. currentmodule:: ray.data.expressions @@ -19,6 +19,8 @@ Public API col lit + udf + download Expression Classes ------------------ diff --git a/doc/source/data/api/input_output.rst b/doc/source/data/api/input_output.rst index 9c4ad3868bb2..0b32168c35f6 100644 --- a/doc/source/data/api/input_output.rst +++ b/doc/source/data/api/input_output.rst @@ -410,7 +410,6 @@ MetadataProvider API datasource.FileMetadataProvider datasource.BaseFileMetadataProvider datasource.DefaultFileMetadataProvider - datasource.ParquetMetadataProvider datasource.FastFileMetadataProvider Shuffling API diff --git a/doc/source/data/batch_inference.rst b/doc/source/data/batch_inference.rst index 1e6b1f9c996b..8d80808e5465 100644 --- a/doc/source/data/batch_inference.rst +++ b/doc/source/data/batch_inference.rst @@ -55,7 +55,7 @@ For how to configure batch inference, see :ref:`the configuration guide`). Ray Data implements two main shuffle algorithms: @@ -179,12 +179,19 @@ To add custom optimization rules, implement a class that extends ``Rule`` and co import ray from ray.data._internal.logical.interfaces import Rule + from ray.data._internal.logical.optimizers import get_logical_ruleset class CustomRule(Rule): def apply(self, plan): ... - ray.data._internal.logical.optimizers.DEFAULT_LOGICAL_RULES.append(CustomRule) + logical_ruleset = get_logical_ruleset() + logical_ruleset.add(CustomRule) + +.. 
testcode:: + :hide: + + logical_ruleset.remove(CustomRule) Types of physical operators ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/data/images/dataset-shuffle.svg b/doc/source/data/images/dataset-shuffle.svg index 6745724c722d..018f877fb728 100644 --- a/doc/source/data/images/dataset-shuffle.svg +++ b/doc/source/data/images/dataset-shuffle.svg @@ -1 +1 @@ - + \ No newline at end of file diff --git a/doc/source/data/inspecting-data.rst b/doc/source/data/inspecting-data.rst index 986b0d82b6e1..0936204fc655 100644 --- a/doc/source/data/inspecting-data.rst +++ b/doc/source/data/inspecting-data.rst @@ -123,12 +123,11 @@ of the returned batch, set ``batch_format``. print(batch) .. testoutput:: - :options: +NORMALIZE_WHITESPACE + :options: +MOCK sepal length (cm) sepal width (cm) ... petal width (cm) target 0 5.1 3.5 ... 0.2 0 1 4.9 3.0 ... 0.2 0 - For more information on working with batches, see :ref:`Transforming batches ` and @@ -143,7 +142,10 @@ Ray Data calculates statistics during execution for each operator, such as wall To view stats about your :class:`Datasets `, call :meth:`Dataset.stats() ` on an executed dataset. The stats are also persisted under `/tmp/ray/session_*/logs/ray-data/ray-data.log`. For more on how to read this output, see :ref:`Monitoring Your Workload with the Ray Data Dashboard `. +.. This snippet below is skipped because of https://github.com/ray-project/ray/issues/54101. + .. testcode:: + :skipif: True import ray import datasets diff --git a/doc/source/data/joining-data.rst b/doc/source/data/joining-data.rst index f9222ae74173..eeac75e1bc9e 100644 --- a/doc/source/data/joining-data.rst +++ b/doc/source/data/joining-data.rst @@ -4,9 +4,9 @@ Joining datasets ================ -.. note:: This is a new feature released in Ray 2.46. Note, this is an experimental feature and some things might not work as expected. +.. note:: This is a new feature released in Ray 2.46. 
Note that this is an experimental feature and some things might not work as expected. -Ray Data allows multiple :class:`~ray.data.dataset.Dataset` instances to be joined using different join types (inner, outer, semi, anti) based on the provided key columns like following: +Ray Data allows multiple :class:`~ray.data.dataset.Dataset` instances to be joined using different join types (inner, outer, semi, anti) based on the provided key columns as follows: .. testcode:: @@ -27,7 +27,7 @@ Ray Data allows multiple :class:`~ray.data.dataset.Dataset` instances to be join on=("id",), ) -Ray Data supports following join types (check out `Dataset.join` docs for up-to-date list): +Ray Data supports the following join types (check out `Dataset.join` docs for up-to-date list): **Inner/Outer Joins:** - Inner, Left Outer, Right Outer, Full Outer @@ -47,7 +47,7 @@ Configuring Joins Joins are generally memory-intensive operations that require accurate memory accounting and projection and hence are sensitive to skews and imbalances in the dataset. -Ray Data provides following levers to allow to tune up performance of joins for your workload: +Ray Data provides the following levers to allow tuning the performance of joins for your workload: - `num_partitions`: (required) specifies number of partitions both incoming datasets will be hash-partitioned into. Check out :ref:`configuring number of partitions ` section for guidance on how to tune this up. - `partition_size_hint`: (optional) Hint to joining operator about the estimated avg expected size of the individual partition (in bytes). If not specified, defaults to DataContext.target_max_block_size (128Mb by default). 
diff --git a/doc/source/data/key-concepts.rst b/doc/source/data/key-concepts.rst index 3be2437b261a..a1a979402ab6 100644 --- a/doc/source/data/key-concepts.rst +++ b/doc/source/data/key-concepts.rst @@ -12,7 +12,7 @@ There are two main concepts in Ray Data: * Datasets * Blocks -`Dataset` is the main user-facing Python API. It represents a distributed data collection and define data loading and processing operations. Users typically use the API by: +`Dataset` is the main user-facing Python API. It represents a distributed data collection and defines data loading and processing operations. Users typically use the API by: 1. Create a :class:`Dataset ` from external storage or in-memory data. 2. Apply transformations to the data. @@ -22,7 +22,7 @@ The Dataset API is lazy, meaning that operations aren't executed until you mater like :meth:`~ray.data.Dataset.show`. This allows Ray Data to optimize the execution plan and execute operations in a pipelined, streaming fashion. -*Block* is a set of rows representing single partition of the dataset. Blocks, as collection of rows represented by columnar formats (like Arrow) +*Block* is a set of rows representing single partition of the dataset. Blocks, as a collection of rows represented by columnar formats (like Arrow) are the basic unit of data processing in Ray Data: 1. Every dataset is partitioned into a number of blocks, then @@ -75,7 +75,7 @@ You can inspect the resulting logical plan by printing the dataset: +- MapBatches(add_column) +- Dataset(schema={...}) -When execution begins, Ray Data optimizes the logical plan, then translate it into a physical plan - a series of operators that implement the actual data transformations. During this translation: +When execution begins, Ray Data optimizes the logical plan, then translates it into a physical plan - a series of operators that implement the actual data transformations. During this translation: 1. A single logical operator may become multiple physical operators. 
For example, ``ReadOp`` becomes both ``InputDataBuffer`` and ``TaskPoolMapOperator``. 2. Both logical and physical plans go through optimization passes. For example, ``OperatorFusionRule`` combines map operators to reduce serialization overhead. diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst index ce80c8f21e9a..3abe4e31fb8e 100644 --- a/doc/source/data/loading-data.rst +++ b/doc/source/data/loading-data.rst @@ -486,13 +486,16 @@ Ray Data interoperates with distributed data processing frameworks like `Daft `__. + .. testcode:: + :skipif: True import daft import ray - ray.init() - df = daft.from_pydict({"int_col": [i for i in range(10000)], "str_col": [str(i) for i in range(10000)]}) ds = ray.data.from_daft(df) @@ -512,7 +515,12 @@ Ray Data interoperates with distributed data processing frameworks like `Daft >> import ray - >>> from pyiceberg.expressions import EqualTo - >>> ds = ray.data.read_iceberg( - ... table_identifier="db_name.table_name", - ... row_filter=EqualTo("column_name", "literal_value"), - ... catalog_kwargs={"name": "default", "type": "glue"} - ... ) + import ray + from pyiceberg.expressions import EqualTo + ds = ray.data.read_iceberg( + table_identifier="db_name.table_name", + row_filter=EqualTo("column_name", "literal_value"), + catalog_kwargs={"name": "default", "type": "glue"} + ) + ds.show(3) .. testoutput:: + :options: +MOCK {'col1': 0, 'col2': '0'} {'col1': 1, 'col2': '1'} @@ -622,6 +630,7 @@ Ray Data interoperates with distributed data processing frameworks like `Daft `_ objects aren't supported. + .. This snippet below is skipped because of https://github.com/ray-project/ray/issues/54837. + .. 
testcode:: + :skipif: True import ray.data from datasets import load_dataset diff --git a/doc/source/data/performance-tips.rst b/doc/source/data/performance-tips.rst index 9657d0235f35..0402842c4795 100644 --- a/doc/source/data/performance-tips.rst +++ b/doc/source/data/performance-tips.rst @@ -51,7 +51,7 @@ For example, the following code batches multiple files into the same read task t ray.init(num_cpus=2) # Repeat the iris.csv file 16 times. - ds = ray.data.read_csv(["example://iris.csv"] * 16) + ds = ray.data.read_csv(["s3://anonymous@ray-example-data/iris.csv"] * 16) print(ds.materialize()) .. testoutput:: @@ -81,7 +81,7 @@ Notice how the number of output blocks is equal to ``override_num_blocks`` in th ray.init(num_cpus=2) # Repeat the iris.csv file 16 times. - ds = ray.data.read_csv(["example://iris.csv"] * 16, override_num_blocks=16) + ds = ray.data.read_csv(["s3://anonymous@ray-example-data/iris.csv"] * 16, override_num_blocks=16) print(ds.materialize()) .. testoutput:: @@ -143,7 +143,7 @@ For example, the following code executes :func:`~ray.data.read_csv` with only on # Pretend there are two CPUs. ray.init(num_cpus=2) - ds = ray.data.read_csv("example://iris.csv").map(lambda row: row) + ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv").map(lambda row: row) print(ds.materialize().stats()) .. testoutput:: @@ -171,7 +171,7 @@ For example, this code sets the number of files equal to ``override_num_blocks`` # Pretend there are two CPUs. ray.init(num_cpus=2) - ds = ray.data.read_csv("example://iris.csv", override_num_blocks=1).map(lambda row: row) + ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv", override_num_blocks=1).map(lambda row: row) print(ds.materialize().stats()) .. testoutput:: @@ -205,15 +205,21 @@ calling :func:`~ray.data.Dataset.select_columns`, since column selection is push .. testcode:: import ray + # Read just two of the five columns of the Iris dataset. 
- ray.data.read_parquet( + ds = ray.data.read_parquet( "s3://anonymous@ray-example-data/iris.parquet", columns=["sepal.length", "variety"], ) + + print(ds.schema()) .. testoutput:: - Dataset(num_rows=150, schema={sepal.length: double, variety: string}) + Column Type + ------ ---- + sepal.length double + variety string .. _data_memory: @@ -411,10 +417,12 @@ You can configure execution options with the global DataContext. The options are .. code-block:: - ctx = ray.data.DataContext.get_current() - ctx.execution_options.resource_limits.cpu = 10 - ctx.execution_options.resource_limits.gpu = 5 - ctx.execution_options.resource_limits.object_store_memory = 10e9 + ctx = ray.data.DataContext.get_current() + ctx.execution_options.resource_limits = ctx.execution_options.resource_limits.copy( + cpu=10, + gpu=5, + object_store_memory=10e9, + ) .. note:: It's **not** recommended to modify the Ray Core object store memory limit, as this can reduce available memory for task execution. The one exception to this is if you are using machines with a very large amount of RAM (1 TB or more each); then it's recommended to set the object store to ~30-40%. diff --git a/doc/source/data/saving-data.rst b/doc/source/data/saving-data.rst index d37215541584..c6c814d04398 100644 --- a/doc/source/data/saving-data.rst +++ b/doc/source/data/saving-data.rst @@ -173,7 +173,7 @@ Writing into Partitioned Dataset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ When writing partitioned dataset (using Hive-style, folder-based partitioning) it's recommended to repartition the dataset by the partition columns prior to writing into it. -This allows you to *have the control over the file-sizes and their number*. When the dataset is repartitioned by the partition columns every block should contain all of the rows corresponding to particular partition, +This allows you to *have control over the file sizes and their number*. 
When the dataset is repartitioned by the partition columns every block should contain all of the rows corresponding to particular partition, meaning that the number of files created should be controlled based on the configuration provided to, for example, `write_parquet` method (such as `min_rows_per_file`, `max_rows_per_file`). Since every block is written out independently, when writing the dataset without prior @@ -228,7 +228,7 @@ number of files & their sizes (since every block could potentially carry the row print_directory_tree("/tmp/sales_partitioned") .. testoutput:: - :options: +NORMALIZE_WHITESPACE + :options: +MOCK sales_partitioned/ city=NYC/ @@ -301,24 +301,10 @@ Ray Data interoperates with distributed data processing frameworks like `Daft `__, call - :meth:`Dataset.to_dask() `. - - .. testcode:: - - import ray - - ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv") - - df = ds.to_dask() - - df + print(df) .. testoutput:: + :options: +MOCK ╭───────────────────┬──────────────────┬───────────────────┬──────────────────┬────────╮ │ sepal length (cm) ┆ sepal width (cm) ┆ petal length (cm) ┆ petal width (cm) ┆ target │ @@ -345,6 +331,25 @@ Ray Data interoperates with distributed data processing frameworks like `Daft `__, call + :meth:`Dataset.to_dask() `. + + .. + We skip the code snippet below because `to_dask` doesn't work with PyArrow + 14 and later. For more information, see https://github.com/ray-project/ray/issues/54837 + + .. testcode:: + :skipif: True + + import ray + + ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv") + + df = ds.to_dask() + .. tab-item:: Spark To convert a :class:`~ray.data.dataset.Dataset` to a `Spark DataFrame @@ -352,6 +357,7 @@ Ray Data interoperates with distributed data processing frameworks like `Daft `. .. testcode:: + :skipif: True import ray import raydp @@ -367,6 +373,7 @@ Ray Data interoperates with distributed data processing frameworks like `Daft `. .. 
testcode:: + :skipif: True import ray diff --git a/doc/source/data/shuffling-data.rst b/doc/source/data/shuffling-data.rst index a146370aeb09..11f720ec8f7e 100644 --- a/doc/source/data/shuffling-data.rst +++ b/doc/source/data/shuffling-data.rst @@ -189,8 +189,8 @@ To try out push-based shuffle, set the environment variable ``RAY_DATA_PUSH_BASE .. code-block:: bash - $ wget https://raw.githubusercontent.com/ray-project/ray/master/release/nightly_tests/dataset/sort.py - $ RAY_DATA_PUSH_BASED_SHUFFLE=1 python sort.py --num-partitions=10 --partition-size=1e7 + $ wget https://raw.githubusercontent.com/ray-project/ray/master/release/nightly_tests/dataset/sort_benchmark.py + $ RAY_DATA_PUSH_BASED_SHUFFLE=1 python sort_benchmark.py --num-partitions=10 --partition-size=1e7 # Dataset size: 10 partitions, 0.01GB partition size, 0.1GB total # [dataset]: Run `pip install tqdm` to enable progress reporting. diff --git a/doc/source/data/user-guide.rst b/doc/source/data/user-guide.rst index a1f450d17282..a83f34a7ab69 100644 --- a/doc/source/data/user-guide.rst +++ b/doc/source/data/user-guide.rst @@ -7,7 +7,7 @@ User Guides If you’re new to Ray Data, start with the :ref:`Ray Data Quickstart `. This user guide helps you navigate the Ray Data project and -show you how achieve several tasks. +shows you how to achieve several tasks. .. 
toctree:: :maxdepth: 2 diff --git a/doc/source/data/working-with-images.rst b/doc/source/data/working-with-images.rst index 1f53fbcdf568..3152d1a90c07 100644 --- a/doc/source/data/working-with-images.rst +++ b/doc/source/data/working-with-images.rst @@ -147,7 +147,7 @@ To view the full list of supported file formats, see the Column Type ------ ---- - image numpy.ndarray(shape=(32, 32, 3), dtype=uint8) + img struct label int64 diff --git a/doc/source/data/working-with-llms.rst b/doc/source/data/working-with-llms.rst index 8b12c4add575..cfd0c4bedf77 100644 --- a/doc/source/data/working-with-llms.rst +++ b/doc/source/data/working-with-llms.rst @@ -9,6 +9,7 @@ This guide shows you how to use :ref:`ray.data.llm ` to: * :ref:`Perform batch inference with LLMs ` * :ref:`Configure vLLM for LLM inference ` +* :ref:`Batch inference with embedding models ` * :ref:`Query deployed models with an OpenAI compatible API endpoint ` .. _batch_inference_llm: @@ -283,6 +284,63 @@ This example applies 2 adjustments on top of the previous example: vision_processed_ds = vision_processor(vision_dataset).materialize() vision_processed_ds.show(3) +.. _embedding_models: + +Batch inference with embedding models +--------------------------------------- + +Ray Data LLM supports batch inference with embedding models using vLLM: + +.. 
testcode:: + + import ray + from ray.data.llm import vLLMEngineProcessorConfig, build_llm_processor + + embedding_config = vLLMEngineProcessorConfig( + model_source="sentence-transformers/all-MiniLM-L6-v2", + task_type="embed", + engine_kwargs=dict( + enable_prefix_caching=False, + enable_chunked_prefill=False, + max_model_len=256, + enforce_eager=True, + ), + batch_size=32, + concurrency=1, + apply_chat_template=False, + detokenize=False, + ) + + embedding_processor = build_llm_processor( + embedding_config, + preprocess=lambda row: dict(prompt=row["text"]), + postprocess=lambda row: { + "text": row["prompt"], + "embedding": row["embeddings"], + }, + ) + + texts = [ + "Hello world", + "This is a test sentence", + "Embedding models convert text to vectors", + ] + ds = ray.data.from_items([{"text": text} for text in texts]) + + embedded_ds = embedding_processor(ds) + embedded_ds.show(limit=1) + +.. testoutput:: + :options: +MOCK + + {'text': 'Hello world', 'embedding': [0.1, -0.2, 0.3, ...]} + +Key differences for embedding models: + +- Set ``task_type="embed"`` +- Set ``apply_chat_template=False`` and ``detokenize=False`` +- Use direct ``prompt`` input instead of ``messages`` +- Access embeddings through ``row["embeddings"]`` .. _openai_compatible_api_endpoint: @@ -343,14 +401,34 @@ Data for the following features and attributes is collected to improve Ray Data If you would like to opt-out from usage data collection, you can follow :ref:`Ray usage stats ` to turn it off. -.. _production_guide: +.. _faqs: -Production guide +Frequently Asked Questions (FAQs) -------------------------------------------------- +.. TODO(#55491): Rewrite this section once the restriction is lifted. +.. _cross_node_parallelism: + +How to configure LLM stage to parallelize across multiple nodes? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +At the moment, Ray Data LLM doesn't support cross-node parallelism (either +tensor parallelism or pipeline parallelism).
+ +The processing pipeline is designed to run on a single node. The number of +GPUs is calculated as the product of the tensor parallel size and the pipeline +parallel size, and the +`STRICT_PACK strategy <https://docs.ray.io/en/latest/ray-core/scheduling/placement-group.html#pgroup-strategy>`__ is applied +to ensure that each replica of the LLM stage is executed on a single node. + +Nevertheless, you can still horizontally scale the LLM stage to multiple nodes +as long as each replica (TP * PP) fits into a single node. The number of +replicas is configured by the ``concurrency`` argument in +:class:`vLLMEngineProcessorConfig `. + .. _model_cache: -Caching model weight to remote object storage +How to cache model weight to remote object storage ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ While deploying Ray Data LLM to large scale clusters, model loading may be rate diff --git a/doc/source/data/working-with-pytorch.rst b/doc/source/data/working-with-pytorch.rst index a101333379b3..f52cc181912e 100644 --- a/doc/source/data/working-with-pytorch.rst +++ b/doc/source/data/working-with-pytorch.rst @@ -229,8 +229,8 @@ You can use built-in Torch transforms from ``torchvision``, ``torchtext``, and ` Column Type ------ ---- - text - tokenized_text + text string + tokenized_text list .. _batch_inference_pytorch: @@ -255,7 +255,7 @@ With Ray Datasets, you can do scalable offline batch inference with Torch models # Step 2: Define a Predictor class for inference. # Use a class to initialize the model just once in `__init__` - # and re-use it for inference across multiple batches. + # and reuse it for inference across multiple batches. class TorchPredictor: def __init__(self): # Load a dummy neural network.
diff --git a/doc/source/llm/doc_code/serve/prefix_aware_router/prefix_aware_example.py b/doc/source/llm/doc_code/serve/prefix_aware_router/prefix_aware_example.py new file mode 100644 index 000000000000..bb6bf734fac2 --- /dev/null +++ b/doc/source/llm/doc_code/serve/prefix_aware_router/prefix_aware_example.py @@ -0,0 +1,90 @@ +""" +This file serves as a documentation example and CI test. + +Structure: +1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. +2. Docs example (between __prefix_aware_example_start/end__): Embedded in Sphinx docs via literalinclude. +3. Test validation (deployment status polling + cleanup) +""" + +import time +from ray import serve +from ray.serve.schema import ApplicationStatus +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME +from ray.serve import llm + +_original_serve_run = serve.run +_original_build_openai_app = llm.build_openai_app + + +def _non_blocking_serve_run(app, **kwargs): + """Forces blocking=False for testing""" + kwargs["blocking"] = False + return _original_serve_run(app, **kwargs) + + +def _testing_build_openai_app(llm_serving_args): + """Removes accelerator requirements for testing""" + for config in llm_serving_args["llm_configs"]: + config.accelerator_type = None + + return _original_build_openai_app(llm_serving_args) + + +serve.run = _non_blocking_serve_run +llm.build_openai_app = _testing_build_openai_app + +# __prefix_aware_example_start__ +from ray import serve +from ray.serve.llm import LLMConfig, build_openai_app +from ray.serve.llm.request_router import PrefixCacheAffinityRouter + +llm_config = LLMConfig( + model_loading_config={ + "model_id": "qwen-0.5b", + "model_source": "Qwen/Qwen2.5-0.5B-Instruct", + }, + deployment_config={ + "autoscaling_config": { + "min_replicas": 4, + "max_replicas": 4, + }, + "request_router_config": { + "request_router_class": PrefixCacheAffinityRouter, + "request_router_kwargs": { + "imbalanced_threshold": 5, # 
More aggressive load balancing + "match_rate_threshold": 0.15, # Require 15% match rate + "do_eviction": True, # Enable memory management + "eviction_threshold_chars": 500_000, + "eviction_target_chars": 400_000, + "eviction_interval_secs": 30, + }, + }, + }, + runtime_env={"env_vars": {"VLLM_USE_V1": "1"}}, +) + +app = build_openai_app({"llm_configs": [llm_config]}) +serve.run(app, blocking=True) +# __prefix_aware_example_end__ + +status = ApplicationStatus.NOT_STARTED +timeout_seconds = 180 +start_time = time.time() + +while ( + status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds +): + status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status + + if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: + raise AssertionError(f"Deployment failed with status: {status}") + + time.sleep(1) + +if status != ApplicationStatus.RUNNING: + raise AssertionError( + f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" + ) + +serve.shutdown() diff --git a/doc/source/llm/doc_code/serve/qwen/llm_config_example.yaml b/doc/source/llm/doc_code/serve/qwen/llm_config_example.yaml new file mode 100644 index 000000000000..cd5302b6f637 --- /dev/null +++ b/doc/source/llm/doc_code/serve/qwen/llm_config_example.yaml @@ -0,0 +1,29 @@ +# config.yaml +applications: +- args: + llm_configs: + - model_loading_config: + model_id: qwen-0.5b + model_source: Qwen/Qwen2.5-0.5B-Instruct + accelerator_type: A10G + deployment_config: + autoscaling_config: + min_replicas: 1 + max_replicas: 2 + runtime_env: + env_vars: + VLLM_USE_V1: "1" + - model_loading_config: + model_id: qwen-1.5b + model_source: Qwen/Qwen2.5-1.5B-Instruct + accelerator_type: A10G + deployment_config: + autoscaling_config: + min_replicas: 1 + max_replicas: 2 + runtime_env: + env_vars: + VLLM_USE_V1: "1" + import_path: ray.serve.llm:build_openai_app + name: llm_app + route_prefix: "/" \ No newline at end of file diff --git 
a/doc/source/llm/doc_code/serve/qwen/llm_yaml_config_example.py b/doc/source/llm/doc_code/serve/qwen/llm_yaml_config_example.py new file mode 100644 index 000000000000..1f921f886716 --- /dev/null +++ b/doc/source/llm/doc_code/serve/qwen/llm_yaml_config_example.py @@ -0,0 +1,47 @@ +""" +This file serves as a documentation example and CI test for YAML config deployment. + +Structure: +1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. +2. Load YAML config and convert to Python using build_openai_app +3. Test validation (deployment status polling + cleanup) +""" + +import time +import os +import yaml +from ray import serve +from ray.serve.schema import ApplicationStatus +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME +from ray.serve import llm + + +config_path = os.path.join(os.path.dirname(__file__), "llm_config_example.yaml") +with open(config_path, "r") as f: + config_dict = yaml.safe_load(f) + +llm_configs = config_dict["applications"][0]["args"]["llm_configs"] +for config in llm_configs: + config.pop("accelerator_type", None) + +app = llm.build_openai_app({"llm_configs": llm_configs}) +serve.run(app, blocking=False) + +status = ApplicationStatus.NOT_STARTED +timeout_seconds = 180 +start_time = time.time() + +while ( + status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds +): + status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status + + if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: + raise AssertionError(f"Deployment failed with status: {status}") + + time.sleep(1) + +if status != ApplicationStatus.RUNNING: + raise AssertionError( + f"Deployment failed to reach RUNNING status within {timeout_seconds}s. 
Current status: {status}" + ) diff --git a/doc/source/llm/doc_code/serve/qwen/qwen_example.py b/doc/source/llm/doc_code/serve/qwen/qwen_example.py new file mode 100644 index 000000000000..791405940351 --- /dev/null +++ b/doc/source/llm/doc_code/serve/qwen/qwen_example.py @@ -0,0 +1,84 @@ +""" +This file serves as a documentation example and CI test. + +Structure: +1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. +2. Docs example (between __qwen_example_start/end__): Embedded in Sphinx docs via literalinclude. +3. Test validation (deployment status polling + cleanup) +""" + +import time +from ray import serve +from ray.serve.schema import ApplicationStatus +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME +from ray.serve import llm + +_original_serve_run = serve.run +_original_build_openai_app = llm.build_openai_app + + +def _non_blocking_serve_run(app, **kwargs): + """Forces blocking=False for testing""" + kwargs["blocking"] = False + return _original_serve_run(app, **kwargs) + + +def _testing_build_openai_app(llm_serving_args): + """Removes accelerator requirements for testing""" + for config in llm_serving_args["llm_configs"]: + config.accelerator_type = None + + return _original_build_openai_app(llm_serving_args) + + +serve.run = _non_blocking_serve_run +llm.build_openai_app = _testing_build_openai_app + +# __qwen_example_start__ +from ray import serve +from ray.serve.llm import LLMConfig, build_openai_app + +llm_config = LLMConfig( + model_loading_config={ + "model_id": "qwen-0.5b", + "model_source": "Qwen/Qwen2.5-0.5B-Instruct", + }, + deployment_config={ + "autoscaling_config": { + "min_replicas": 1, + "max_replicas": 2, + } + }, + # Pass the desired accelerator type (e.g. A10G, L4, etc.) + accelerator_type="A10G", + # You can customize the engine arguments (e.g. 
vLLM engine kwargs) + engine_kwargs={ + "tensor_parallel_size": 2, + }, + runtime_env={"env_vars": {"VLLM_USE_V1": "1"}}, +) + +app = build_openai_app({"llm_configs": [llm_config]}) +serve.run(app, blocking=True) +# __qwen_example_end__ + +status = ApplicationStatus.NOT_STARTED +timeout_seconds = 180 +start_time = time.time() + +while ( + status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds +): + status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status + + if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: + raise AssertionError(f"Deployment failed with status: {status}") + + time.sleep(1) + +if status != ApplicationStatus.RUNNING: + raise AssertionError( + f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" + ) + +serve.shutdown() diff --git a/doc/source/ray-air/deployment.rst b/doc/source/ray-air/deployment.rst index 812e7cf691fd..343618b97a1a 100644 --- a/doc/source/ray-air/deployment.rst +++ b/doc/source/ray-air/deployment.rst @@ -1,14 +1,14 @@ Deploying Ray for ML platforms ============================== -Here, we describe how you might use or deploy Ray in your infrastructure. There are two main deployment patterns -- pick and choose and within existing platforms. +This page describes how you might use or deploy Ray in your infrastructure. There are two main deployment patterns -- pick and choose, and within existing platforms. The core idea is that Ray can be **complementary** to your existing infrastructure and integration tools. Design Principles ----------------- -* Ray and its libraries handles the heavyweight compute aspects of AI apps and services. +* Ray and its libraries handle the heavyweight compute aspects of AI apps and services. * Ray relies on external integrations (e.g., Tecton, MLFlow, W&B) for Storage and Tracking. 
* Workflow Orchestrators (e.g., AirFlow) are an optional component that can be used for scheduling recurring jobs, launching new Ray clusters for jobs, and running non-Ray compute steps. * Lightweight orchestration of task graphs within a single Ray app can be handled using Ray tasks. diff --git a/doc/source/ray-air/getting-started.rst b/doc/source/ray-air/getting-started.rst index 047adbaf55f8..2ed5395fdd38 100644 --- a/doc/source/ray-air/getting-started.rst +++ b/doc/source/ray-air/getting-started.rst @@ -5,9 +5,9 @@ Ray for ML Infrastructure .. tip:: - We'd love to hear from you if you are using Ray to build a ML platform! Fill out `this short form `__ to get involved. + We'd love to hear from you if you are using Ray to build an ML platform! Fill out `this short form `__ to get involved. -Ray and its AI libraries provide unified compute runtime for teams looking to simplify their ML platform. +Ray and its AI libraries provide a unified compute runtime for teams looking to simplify their ML platform. Ray's libraries such as Ray Train, Ray Data, and Ray Serve can be used to compose end-to-end ML workflows, providing features and APIs for data preprocessing as part of training, and transitioning from training to serving. @@ -27,9 +27,9 @@ Ray's AI libraries simplify the ecosystem of machine learning frameworks, platfo .. https://docs.google.com/drawings/d/1oi_JwNHXVgtR_9iTdbecquesUd4hOk0dWgHaTaFj6gk/edit -**1. Seamless Dev to Prod**: Ray's AI libraries reduces friction going from development to production. With Ray and its libraries, the same Python code scales seamlessly from a laptop to a large cluster. +**1. Seamless Dev to Prod**: Ray's AI libraries reduce friction going from development to production. With Ray and its libraries, the same Python code scales seamlessly from a laptop to a large cluster. -**2. 
Unified ML API and Runtime**: Ray's APIs enables swapping between popular frameworks, such as XGBoost, PyTorch, and Hugging Face, with minimal code changes. Everything from training to serving runs on a single runtime (Ray + KubeRay). +**2. Unified ML API and Runtime**: Ray's APIs enable swapping between popular frameworks, such as XGBoost, PyTorch, and Hugging Face, with minimal code changes. Everything from training to serving runs on a single runtime (Ray + KubeRay). **3. Open and Extensible**: Ray is fully open-source and can run on any cluster, cloud, or Kubernetes. Build custom components and integrations on top of scalable developer APIs. @@ -46,7 +46,7 @@ Spotify `uses Ray for advanced applications `_ to discuss Ray! - Star and follow us on `on GitHub`_. -- To post questions or feature requests, check out the `Discussion Board`_! -- Follow us and spread the word on `Twitter`_! -- Join our `Meetup Group`_ to connect with others in the community! -- Use the `[ray]` tag on `StackOverflow`_ to ask and answer questions about Ray usage +- To post questions or feature requests, check out the `Discussion Board`_. +- Follow us and spread the word on `Twitter`_. +- Join our `Meetup Group`_ to connect with others in the community. +- Use the `[ray]` tag on `StackOverflow`_ to ask and answer questions about Ray usage. .. _`Discussion Board`: https://discuss.ray.io/ diff --git a/doc/source/ray-core/actors.rst b/doc/source/ray-core/actors.rst index ebbabda54342..50add30ab2c5 100644 --- a/doc/source/ray-core/actors.rst +++ b/doc/source/ray-core/actors.rst @@ -412,7 +412,7 @@ For tasks classified as a single-threaded Actor or a multi-threaded Actor, Ray offers no mechanism for interruption. **Running async actor tasks**: -For Tasks classified as `async Actors <_async-actors>`, Ray seeks to cancel the associated `asyncio.Task`. +For Tasks classified as :ref:`async Actors `, Ray seeks to cancel the associated `asyncio.Task`. 
This cancellation approach aligns with the standards presented in `asyncio task cancellation `__. Note that `asyncio.Task` won't be interrupted in the middle of execution if you don't `await` within the async function. diff --git a/doc/source/ray-core/actors/async_api.rst b/doc/source/ray-core/actors/async_api.rst index 36f5296fdba4..50427459d433 100644 --- a/doc/source/ray-core/actors/async_api.rst +++ b/doc/source/ray-core/actors/async_api.rst @@ -22,7 +22,7 @@ AsyncIO for Actors Since Python 3.5, it is possible to write concurrent code using the ``async/await`` `syntax `__. -Ray natively integrates with asyncio. You can use ray alongside with popular +Ray natively integrates with asyncio. You can use Ray alongside popular async frameworks like aiohttp, aioredis, etc. .. testcode:: @@ -63,14 +63,14 @@ async frameworks like aiohttp, aioredis, etc. .. testoutput:: :options: +MOCK - (AsyncActor pid=40293) started - (AsyncActor pid=40293) started - (AsyncActor pid=40293) started - (AsyncActor pid=40293) started - (AsyncActor pid=40293) finished - (AsyncActor pid=40293) finished - (AsyncActor pid=40293) finished - (AsyncActor pid=40293) finished + (AsyncActor pid=9064) Waiting for other coroutines to start. + (AsyncActor pid=9064) Waiting for other coroutines to start. + (AsyncActor pid=9064) Waiting for other coroutines to start. + (AsyncActor pid=9064) All coroutines are executing concurrently, unblocking. + (AsyncActor pid=9064) All coroutines ran concurrently. + (AsyncActor pid=9064) All coroutines ran concurrently. + (AsyncActor pid=9064) All coroutines ran concurrently. + (AsyncActor pid=9064) All coroutines ran concurrently. .. testcode:: :hide: @@ -217,7 +217,7 @@ Please note that running blocking ``ray.get`` or ``ray.wait`` inside async actor method is not allowed, because ``ray.get`` will block the execution of the event loop. -In async actors, only one task can be running at any point in time (though tasks can be multi-plexed). 
There will be only one thread in AsyncActor! See :ref:`threaded-actors` if you want a threadpool. +In async actors, only one task can be running at any point in time (though tasks can be multiplexed). There will be only one thread in AsyncActor! See :ref:`threaded-actors` if you want a threadpool. Setting concurrency in Async Actors ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -284,7 +284,7 @@ Sometimes, asyncio is not an ideal solution for your actor. For example, you may have one method that performs some computation heavy task while blocking the event loop, not giving up control via ``await``. This would hurt the performance of an Async Actor because Async Actors can only execute 1 task at a time and rely on ``await`` to context switch. -Instead, you can use the ``max_concurrency`` Actor options without any async methods, allowng you to achieve threaded concurrency (like a thread pool). +Instead, you can use the ``max_concurrency`` Actor options without any async methods, allowing you to achieve threaded concurrency (like a thread pool). .. warning:: diff --git a/doc/source/ray-core/actors/concurrency_group_api.rst b/doc/source/ray-core/actors/concurrency_group_api.rst index 326f3b957bbc..29e9e6d788b7 100644 --- a/doc/source/ray-core/actors/concurrency_group_api.rst +++ b/doc/source/ray-core/actors/concurrency_group_api.rst @@ -1,7 +1,7 @@ Limiting Concurrency Per-Method with Concurrency Groups ======================================================= -Besides setting the max concurrency overall for an actor, Ray allows methods to be separated into *concurrency groups*, each with its own threads(s). This allows you to limit the concurrency per-method, e.g., allow a health-check method to be given its own concurrency quota separate from request serving methods. +Besides setting the max concurrency overall for an actor, Ray allows methods to be separated into *concurrency groups*, each with its own thread(s). 
This allows you to limit the concurrency per-method, e.g., allow a health-check method to be given its own concurrency quota separate from request serving methods. .. tip:: Concurrency groups work with both asyncio and threaded actors. The syntax is the same. @@ -14,7 +14,7 @@ This defines two concurrency groups, "io" with max concurrency = 2 and "compute" with max concurrency = 4. The methods ``f1`` and ``f2`` are placed in the "io" group, and the methods ``f3`` and ``f4`` are placed into the "compute" group. Note that there is always a default -concurrency group for actors, which has a default concurrency of 1000 +concurrency group for actors, which has a default concurrency of 1000 for AsyncIO actors and 1 otherwise. .. tab-set:: @@ -143,10 +143,11 @@ The concurrency of the default group can be changed by setting the ``max_concurr .. code-block:: java - class ConcurrentActor: + class ConcurrentActor { public long f1() { return Thread.currentThread().getId(); } + } ConcurrencyGroup group = new ConcurrencyGroupBuilder() @@ -156,7 +157,7 @@ The concurrency of the default group can be changed by setting the ``max_concurr .build(); ActorHandle myActor = Ray.actor(ConcurrentActor::new) - .setConcurrencyGroups(group1) + .setConcurrencyGroups(group) .setMaxConcurrency(10) .remote(); diff --git a/doc/source/ray-core/actors/named-actors.rst b/doc/source/ray-core/actors/named-actors.rst index 81ec48230010..980a2053c46f 100644 --- a/doc/source/ray-core/actors/named-actors.rst +++ b/doc/source/ray-core/actors/named-actors.rst @@ -52,7 +52,7 @@ exist. See :ref:`actor-lifetimes` for more details. 
// Retrieve the actor later somewhere boost::optional> counter = ray::GetGlobalActor("some_name"); - We also support non-global named actors in C++, which means that the actor name is only valid within the job and the actor cannot be accessed from another job + We also support non-global named actors in C++, which means that the actor name is only valid within the job and the actor cannot be accessed from another job. .. code-block:: c++ @@ -80,7 +80,7 @@ exist. See :ref:`actor-lifetimes` for more details. @ray.remote class Actor: - pass + pass # driver_1.py # Job 1 creates an actor, "orange" in the "colors" namespace. diff --git a/doc/source/ray-core/actors/out-of-band-communication.rst b/doc/source/ray-core/actors/out-of-band-communication.rst index 063b9a26f69d..ba6719046ecc 100644 --- a/doc/source/ray-core/actors/out-of-band-communication.rst +++ b/doc/source/ray-core/actors/out-of-band-communication.rst @@ -19,8 +19,8 @@ See :ref:`Ray Collective ` for more details. HTTP Server ----------- -You can start a http server inside the actor and expose http endpoints to clients -so users outside of the ray cluster can communicate with the actor. +You can start an HTTP server inside the actor and expose HTTP endpoints to clients +so users outside of the Ray cluster can communicate with the actor. .. tab-set:: diff --git a/doc/source/ray-core/actors/terminating-actors.rst b/doc/source/ray-core/actors/terminating-actors.rst index 3ac8dc22eefb..2422ffa88068 100644 --- a/doc/source/ray-core/actors/terminating-actors.rst +++ b/doc/source/ray-core/actors/terminating-actors.rst @@ -64,7 +64,7 @@ Ray to :ref:`automatically restart ` the actor, make sur flag ``no_restart=False`` to ``ray.kill``. For :ref:`named and detached actors `, calling ``ray.kill`` on -an actor handle destroys the actor and allow the name to be reused. +an actor handle destroys the actor and allows the name to be reused. 
Use `ray list actors --detail` from :ref:`State API ` to see the death cause of dead actors: @@ -133,7 +133,7 @@ This will kill the actor process and release resources associated/assigned to th Ray.exitActor(); - Garbage collection for actors haven't been implemented yet, so this is currently the + Garbage collection for actors hasn't been implemented yet, so this is currently the only way to terminate an actor gracefully. The ``ObjectRef`` resulting from the task can be waited on to wait for the actor to exit (calling ``ObjectRef::get`` on it will throw a ``RayActorException``). @@ -144,7 +144,7 @@ This will kill the actor process and release resources associated/assigned to th ray::ExitActor(); - Garbage collection for actors haven't been implemented yet, so this is currently the + Garbage collection for actors hasn't been implemented yet, so this is currently the only way to terminate an actor gracefully. The ``ObjectRef`` resulting from the task can be waited on to wait for the actor to exit (calling ``ObjectRef::Get`` on it will throw a ``RayActorException``). @@ -178,7 +178,7 @@ You could see the actor is dead as a result of the user's `exit_actor()` call: actor_died_error_context: error_message: 'The actor is dead because its worker process has died. Worker exit type: INTENDED_USER_EXIT Worker exit detail: Worker exits - by an user request. exit_actor() is called.' + by a user request. exit_actor() is called.' owner_id: 02000000ffffffffffffffffffffffffffffffffffffffffffffffff owner_ip_address: 127.0.0.1 node_ip_address: 127.0.0.1 diff --git a/doc/source/ray-core/advanced-topics.rst b/doc/source/ray-core/advanced-topics.rst index 7d1344856507..a1e310b96ca5 100644 --- a/doc/source/ray-core/advanced-topics.rst +++ b/doc/source/ray-core/advanced-topics.rst @@ -7,6 +7,7 @@ This section covers extended topics on how to use Ray. 
:maxdepth: -1 tips-for-first-time + type-hint starting-ray ray-generator namespaces diff --git a/doc/source/ray-core/api/core.rst b/doc/source/ray-core/api/core.rst index 1a428adbd8c3..b860c87fa6df 100644 --- a/doc/source/ray-core/api/core.rst +++ b/doc/source/ray-core/api/core.rst @@ -50,6 +50,8 @@ Objects ray.get ray.wait ray.put + ray.util.as_completed + ray.util.map_unordered .. _runtime-context-apis: diff --git a/doc/source/ray-core/api/direct-transport.rst b/doc/source/ray-core/api/direct-transport.rst new file mode 100644 index 000000000000..07815b1f9758 --- /dev/null +++ b/doc/source/ray-core/api/direct-transport.rst @@ -0,0 +1,28 @@ +Ray Direct Transport (RDT) API +============================== + +Usage with Core APIs +-------------------- +Enable RDT for actor tasks with the :func:`@ray.method ` decorator, or pass `_tensor_transport` to :func:`ray.put`. You can then pass the resulting `ray.ObjectRef` to other actor tasks, or use :func:`ray.get` to retrieve the result. See :ref:`Ray Direct Transport (RDT) ` for more details on usage. + + +.. autosummary:: + :nosignatures: + :toctree: doc/ + + ray.method + ray.put + ray.get + +Collective tensor transports +---------------------------- +Collective tensor transports require a collective group to be created before RDT objects can be used. Use these methods to create and manage collective groups for the `gloo` and `nccl` tensor transports. + + +.. 
autosummary:: + :nosignatures: + :toctree: doc/ + + ray.experimental.collective.create_collective_group + ray.experimental.collective.get_collective_groups + ray.experimental.collective.destroy_collective_group \ No newline at end of file diff --git a/doc/source/ray-core/api/exceptions.rst b/doc/source/ray-core/api/exceptions.rst index 104c1c361008..fec840b82eab 100644 --- a/doc/source/ray-core/api/exceptions.rst +++ b/doc/source/ray-core/api/exceptions.rst @@ -39,3 +39,4 @@ Exceptions ray.exceptions.CrossLanguageError ray.exceptions.RaySystemError ray.exceptions.NodeDiedError + ray.exceptions.UnserializableException diff --git a/doc/source/ray-core/api/index.rst b/doc/source/ray-core/api/index.rst index 2845ebe892ef..25b15b766bf1 100644 --- a/doc/source/ray-core/api/index.rst +++ b/doc/source/ray-core/api/index.rst @@ -12,3 +12,4 @@ Ray Core API cli.rst ../../ray-observability/reference/cli.rst ../../ray-observability/reference/api.rst + direct-transport.rst \ No newline at end of file diff --git a/doc/source/ray-core/compiled-graph/profiling.rst b/doc/source/ray-core/compiled-graph/profiling.rst index 8c3ac5d4bfb8..ddfae1313541 100644 --- a/doc/source/ray-core/compiled-graph/profiling.rst +++ b/doc/source/ray-core/compiled-graph/profiling.rst @@ -2,7 +2,7 @@ Profiling ========= Ray Compiled Graph provides both PyTorch-based and Nsight-based profiling functionalities to better understand the performance -of individual tasks, systems overhead, and performance bottlenecks. You can pick your favorite profiler based on your preference. +of individual tasks, system overhead, and performance bottlenecks. You can pick your favorite profiler based on your preference. 
PyTorch profiler ---------------- diff --git a/doc/source/ray-core/compiled-graph/ray-compiled-graph.rst b/doc/source/ray-core/compiled-graph/ray-compiled-graph.rst index bdf22b20733e..b40956af2e94 100644 --- a/doc/source/ray-core/compiled-graph/ray-compiled-graph.rst +++ b/doc/source/ray-core/compiled-graph/ray-compiled-graph.rst @@ -12,7 +12,7 @@ As large language models (LLMs) become common, programming distributed systems w :ref:`Ray Core APIs ` facilitate using multiple GPUs but have limitations such as: * System overhead of ~1 ms per task launch, which is unsuitable for high-performance tasks like LLM inference. -* Lack support for direct GPU-to-GPU communication, requiring manual development with external libraries like NVIDIA Collective Communications Library (`NCCL `_). +* Lack of support for direct GPU-to-GPU communication, requiring manual development with external libraries like NVIDIA Collective Communications Library (`NCCL `_). Ray Compiled Graph gives you a Ray Core-like API but with: @@ -66,7 +66,7 @@ More Resources - `Ray Compiled Graph blog `_ - `Ray Compiled Graph talk at Ray Summit `_ -- `Heterogenous training with Ray Compiled Graph `_ +- `Heterogeneous training with Ray Compiled Graph `_ - `Distributed LLM inference with Ray Compiled Graph `_ Table of Contents diff --git a/doc/source/ray-core/configure.rst b/doc/source/ray-core/configure.rst index 85a960fee13c..060ec42eb79c 100644 --- a/doc/source/ray-core/configure.rst +++ b/doc/source/ray-core/configure.rst @@ -5,7 +5,7 @@ Configuring Ray .. note:: For running Java applications, see `Java Applications`_. -This page discusses the various way to configure Ray, both from the Python API +This page discusses the various ways to configure Ray, both from the Python API and from the command line. Take a look at the ``ray.init`` `documentation `__ for a complete overview of the configurations. 
@@ -96,7 +96,7 @@ Change the *root temporary directory* by passing ``--temp-dir={your temp path}`` There currently isn't a stable way to change the root temporary directory when calling ``ray.init()``, but if you need to, you can provide the ``_temp_dir`` argument to ``ray.init()``. -Look :ref:`Logging Directory Structure ` for more details. +See :ref:`Logging Directory Structure ` for more details. .. _ray-ports: diff --git a/doc/source/ray-core/cross-language.rst b/doc/source/ray-core/cross-language.rst index bde0c2a50cfc..c2bf5dba990c 100644 --- a/doc/source/ray-core/cross-language.rst +++ b/doc/source/ray-core/cross-language.rst @@ -148,7 +148,7 @@ from the preceding Python class. Cross-language data serialization --------------------------------- -Ray automatically serializes and deserializes the arguments and return values of ray call +Ray automatically serializes and deserializes the arguments and return values of Ray calls if their types are the following: - Primitive data types diff --git a/doc/source/ray-core/direct-transport.rst b/doc/source/ray-core/direct-transport.rst new file mode 100644 index 000000000000..d8d08a7c163c --- /dev/null +++ b/doc/source/ray-core/direct-transport.rst @@ -0,0 +1,250 @@ +.. _direct-transport: + +.. TODO: asyncio not yet supported. +.. TODO: wait_tensor_freed + +************************** +Ray Direct Transport (RDT) +************************** + +Ray objects are normally stored in Ray's CPU-based object store and copied and deserialized when accessed by a Ray task or actor. +For GPU data specifically, this can lead to unnecessary and expensive data transfers. +For example, passing a CUDA ``torch.Tensor`` from one Ray task to another would require a copy from GPU to CPU memory, then back again to GPU memory. + +*Ray Direct Transport (RDT)* is a new feature that allows Ray to store and pass objects directly between Ray actors. 
+This feature augments the familiar Ray :class:`ObjectRef ` API by: + +- Keeping GPU data in GPU memory until a transfer is needed +- Avoiding expensive serialization and copies to and from the Ray object store +- Using efficient data transports like collective communication libraries (`Gloo `__ or `NCCL `__) or point-to-point RDMA (via `NVIDIA's NIXL `__) to transfer data directly between devices, including both CPU and GPUs + +.. note:: + RDT is currently in **alpha**. Not all Ray Core APIs are supported yet. Future releases may introduce breaking API changes. See the :ref:`limitations ` section for more details. + +Getting started +=============== + +.. tip:: + RDT currently supports ``torch.Tensor`` objects created by Ray actor tasks. Other datatypes and Ray non-actor tasks may be supported in future releases. + +This walkthrough will show how to create and use RDT with different *tensor transports*, i.e. the mechanism used to transfer the tensor between actors. +Currently, RDT supports the following tensor transports: + +1. `Gloo `__: A collective communication library for PyTorch and CPUs. +2. `NVIDIA NCCL `__: A collective communication library for NVIDIA GPUs. +3. `NVIDIA NIXL `__ (backed by `UCX `__): A library for accelerating point-to-point transfers via RDMA, especially between various types of memory and NVIDIA GPUs. + +For ease of following along, we'll start with the `Gloo `__ transport, which can be used without any physical GPUs. + +.. _direct-transport-gloo: + +Usage with Gloo (CPUs only) +--------------------------- + +Installation +^^^^^^^^^^^^ + +.. note:: + Under construction. + +Walkthrough +^^^^^^^^^^^ + +To get started, define an actor class and a task that returns a ``torch.Tensor``: + +.. 
literalinclude:: doc_code/direct_transport_gloo.py + :language: python + :start-after: __normal_example_start__ + :end-before: __normal_example_end__ + +As written, when the ``torch.Tensor`` is returned, it will be copied into Ray's CPU-based object store. +For CPU-based tensors, this can require an expensive step to copy and serialize the object, while GPU-based tensors additionally require a copy to and from CPU memory. + +To enable RDT, use the ``tensor_transport`` option in the :func:`@ray.method ` decorator. + +.. literalinclude:: doc_code/direct_transport_gloo.py + :language: python + :start-after: __gloo_example_start__ + :end-before: __gloo_example_end__ + +This decorator can be added to any actor tasks that return a ``torch.Tensor``, or that return ``torch.Tensors`` nested inside other Python objects. +Adding this decorator will change Ray's behavior in the following ways: + +1. When returning the tensor, Ray will store a *reference* to the tensor instead of copying it to CPU memory. +2. When the :class:`ray.ObjectRef` is passed to another task, Ray will use Gloo to transfer the tensor to the destination task. + +Note that for (2) to work, the :func:`@ray.method(tensor_transport) ` decorator only needs to be added to the actor task that *returns* the tensor. It should not be added to actor tasks that *consume* the tensor (unless those tasks also return tensors). + +Also, for (2) to work, we must first create a *collective group* of actors. + +Creating a collective group +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To create a collective group for use with RDT: + +1. Create multiple Ray actors. +2. Create a collective group on the actors using the :func:`ray.experimental.collective.create_collective_group ` function. The `backend` specified must match the `tensor_transport` used in the :func:`@ray.method ` decorator. + +Here is an example: + +.. 
literalinclude:: doc_code/direct_transport_gloo.py + :language: python + :start-after: __gloo_group_start__ + :end-before: __gloo_group_end__ + +The actors can now communicate directly via Gloo. +The group can also be destroyed using the :func:`ray.experimental.collective.destroy_collective_group ` function. +After calling this function, a new collective group can be created on the same actors. + +Passing objects to other actors +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Now that we have a collective group, we can create and pass RDT objects between the actors. +Here is a full example: + +.. literalinclude:: doc_code/direct_transport_gloo.py + :language: python + :start-after: __gloo_full_example_start__ + :end-before: __gloo_full_example_end__ + +When the :class:`ray.ObjectRef` is passed to another task, Ray will use Gloo to transfer the tensor directly from the source actor to the destination actor instead of the default object store. +Note that the :func:`@ray.method(tensor_transport) ` decorator is only added to the actor task that *returns* the tensor; once this hint has been added, the receiving actor task ``receiver.sum`` will automatically use Gloo to receive the tensor. +In this example, because ``MyActor.sum`` does not have the :func:`@ray.method(tensor_transport) ` decorator, it will use the default Ray object store transport to return ``torch.sum(tensor)``. + +RDT also supports passing tensors nested inside Python data structures, as well as actor tasks that return multiple tensors, like in this example: + +.. literalinclude:: doc_code/direct_transport_gloo.py + :language: python + :start-after: __gloo_multiple_tensors_example_start__ + :end-before: __gloo_multiple_tensors_example_end__ + +Passing RDT objects to the actor that produced them +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +RDT :class:`ray.ObjectRefs ` can also be passed to the actor that produced them. 
+This avoids any copies and just provides a reference to the same ``torch.Tensor`` that was previously created. +For example: + +.. literalinclude:: doc_code/direct_transport_gloo.py + :language: python + :start-after: __gloo_intra_actor_start__ + :end-before: __gloo_intra_actor_end__ + + +.. note:: + Ray only keeps a reference to the tensor created by the user, so the tensor objects are *mutable*. + If ``sender.sum`` were to modify the tensor in the above example, the changes would also be seen by ``receiver.sum``. + This differs from the normal Ray Core API, which always makes an immutable copy of data returned by actors. + + +``ray.get`` +^^^^^^^^^^^ + +The :func:`ray.get ` function can also be used as usual to retrieve the result of an RDT object, via Ray's object store. + +.. TODO: This example needs to be updated once we change the default transport for ray.get to match the ray.method transport. + +.. literalinclude:: doc_code/direct_transport_gloo.py + :language: python + :start-after: __gloo_get_start__ + :end-before: __gloo_get_end__ + +Usage with NCCL (NVIDIA GPUs only) +---------------------------------- + +RDT requires just a few lines of code change to switch tensor transports. Here is the :ref:`Gloo example `, modified to use NVIDIA GPUs and the `NCCL `__ library for collective GPU communication. + +.. literalinclude:: doc_code/direct_transport_nccl.py + :language: python + :start-after: __nccl_full_example_start__ + :end-before: __nccl_full_example_end__ + +The main code differences are: + +1. The :func:`@ray.method ` uses ``tensor_transport="nccl"`` instead of ``tensor_transport="gloo"``. +2. The :func:`ray.experimental.collective.create_collective_group ` function is used to create a collective group. +3. The tensor is created on the GPU using the ``.cuda()`` method. 
+ +Usage with NIXL (CPUs or NVIDIA GPUs) +------------------------------------- + +NIXL can transfer data between different devices, including CPUs and NVIDIA GPUs, but doesn't require a collective group to be created ahead of time. +This means that any actor that has NIXL installed in its environment can be used to create and pass an RDT object. + +Otherwise, the usage is the same as in the :ref:`Gloo example `. + +Here is an example showing how to use NIXL to transfer an RDT object between two actors: + +.. literalinclude:: doc_code/direct_transport_nixl.py + :language: python + :start-after: __nixl_full_example_start__ + :end-before: __nixl_full_example_end__ + +Compared to the :ref:`Gloo example `, the main code differences are: + +1. The :func:`@ray.method ` uses ``tensor_transport="nixl"`` instead of ``tensor_transport="gloo"``. +2. No collective group is needed. + +.. TODO: ray.get with NIXL + ``ray.get`` + ^^^^^^^^^^^ + + Unlike the collective-based tensor transports (Gloo and NCCL), the :func:`ray.get ` function can use NIXL or the Ray object store to retrieve a copy of the result. + By default, the tensor transport for :func:`ray.get ` will be the one specified in the :func:`@ray.method ` decorator. + + .. literalinclude:: doc_code/direct_transport_nixl.py + :language: python + :start-after: __nixl_get_start__ + :end-before: __nixl_get_end__ + +Summary +------- + +RDT allows Ray to store and pass objects directly between Ray actors, using accelerated transports like Gloo, NCCL, and NIXL. +Here are the main points to keep in mind: + +* If using a collective-based tensor transport (Gloo or NCCL), a collective group must be created ahead of time. NIXL just requires all involved actors to have NIXL installed. +* Unlike objects in the Ray object store, RDT objects are *mutable*, meaning that Ray only holds a reference, not a copy, to the stored tensor(s). +* Otherwise, actors can be used as normal. 
+ +For a full list of limitations, see the :ref:`limitations ` section. + + +Microbenchmarks +=============== + +.. note:: + Under construction. + +.. _limitations: + +Limitations +=========== + +RDT is currently in alpha and has the following limitations, which may be addressed in future releases: + +* Support for ``torch.Tensor`` objects only. +* Support for Ray actors only, not Ray tasks. +* Support for the following transports: Gloo, NCCL, and NIXL. +* Support for CPUs and NVIDIA GPUs only. +* RDT objects are *mutable*. This means that Ray only holds a reference to the tensor, and will not copy it until a transfer is requested. Thus, if the application code also keeps a reference to a tensor before returning it, and modifies the tensor in place, then some or all of the changes may be seen by the receiving actor. + +For collective-based tensor transports (Gloo and NCCL): + +* Only the process that created the collective group can submit actor tasks that return and pass RDT objects. If the creating process passes the actor handles to other processes, those processes can submit actor tasks as usual, but will not be able to use RDT objects. +* Similarly, the process that created the collective group cannot serialize and pass RDT :class:`ray.ObjectRefs ` to other Ray tasks or actors. Instead, the :class:`ray.ObjectRef`\s can only be passed as direct arguments to other actor tasks, and those actors must be in the same collective group. +* Each actor can only be in one collective group per tensor transport at a time. +* No support for :func:`ray.put `. +* If a system-level error occurs during a collective operation, the collective group will be destroyed and the actors will no longer be able to communicate via the collective group. Note that application-level errors, i.e. exceptions raised by user code, will not destroy the collective group and will instead be propagated to any dependent task(s), as for non-RDT Ray objects. 
System-level errors include: + + * Errors internal to the third-party transport, e.g., NCCL network errors + * Actor and node failure + * Tensors returned by the user that are located on an unsupported device, e.g., a CPU tensor when using NCCL + * Any unexpected system bugs + + +Advanced: RDT Internals +======================= + +.. note:: + Under construction. diff --git a/doc/source/ray-core/doc_code/direct_transport_gloo.py b/doc/source/ray-core/doc_code/direct_transport_gloo.py new file mode 100644 index 000000000000..282ee55a8d63 --- /dev/null +++ b/doc/source/ray-core/doc_code/direct_transport_gloo.py @@ -0,0 +1,136 @@ +# flake8: noqa + +# __normal_example_start__ +import torch +import ray + + +@ray.remote +class MyActor: + def random_tensor(self): + return torch.randn(1000, 1000) + + +# __normal_example_end__ + +# __gloo_example_start__ +@ray.remote +class MyActor: + @ray.method(tensor_transport="gloo") + def random_tensor(self): + return torch.randn(1000, 1000) + + +# __gloo_example_end__ + +# __gloo_group_start__ +import torch +import ray +from ray.experimental.collective import create_collective_group + + +@ray.remote +class MyActor: + @ray.method(tensor_transport="gloo") + def random_tensor(self): + return torch.randn(1000, 1000) + + def sum(self, tensor: torch.Tensor): + return torch.sum(tensor) + + +sender, receiver = MyActor.remote(), MyActor.remote() +# The tensor_transport specified here must match the one used in the @ray.method +# decorator. 
+group = create_collective_group([sender, receiver], backend="torch_gloo") +# __gloo_group_end__ + +# __gloo_group_destroy_start__ +from ray.experimental.collective import destroy_collective_group + +destroy_collective_group(group) +# __gloo_group_destroy_end__ + +# __gloo_full_example_start__ +import torch +import ray +from ray.experimental.collective import create_collective_group + + +@ray.remote +class MyActor: + @ray.method(tensor_transport="gloo") + def random_tensor(self): + return torch.randn(1000, 1000) + + def sum(self, tensor: torch.Tensor): + return torch.sum(tensor) + + +sender, receiver = MyActor.remote(), MyActor.remote() +group = create_collective_group([sender, receiver], backend="torch_gloo") + +# The tensor will be stored by the `sender` actor instead of in Ray's object +# store. +tensor = sender.random_tensor.remote() +result = receiver.sum.remote(tensor) +print(ray.get(result)) +# __gloo_full_example_end__ + +# __gloo_multiple_tensors_example_start__ +import torch +import ray +from ray.experimental.collective import create_collective_group + + +@ray.remote +class MyActor: + @ray.method(tensor_transport="gloo") + def random_tensor_dict(self): + return {"tensor1": torch.randn(1000, 1000), "tensor2": torch.randn(1000, 1000)} + + def sum(self, tensor_dict: dict): + return torch.sum(tensor_dict["tensor1"]) + torch.sum(tensor_dict["tensor2"]) + + +sender, receiver = MyActor.remote(), MyActor.remote() +group = create_collective_group([sender, receiver], backend="torch_gloo") + +# Both tensor values in the dictionary will be stored by the `sender` actor +# instead of in Ray's object store. 
+tensor_dict = sender.random_tensor_dict.remote() +result = receiver.sum.remote(tensor_dict) +print(ray.get(result)) +# __gloo_multiple_tensors_example_end__ + +# __gloo_intra_actor_start__ +import torch +import ray +from ray.experimental.collective import create_collective_group + + +@ray.remote +class MyActor: + @ray.method(tensor_transport="gloo") + def random_tensor(self): + return torch.randn(1000, 1000) + + def sum(self, tensor: torch.Tensor): + return torch.sum(tensor) + + +sender, receiver = MyActor.remote(), MyActor.remote() +group = create_collective_group([sender, receiver], backend="torch_gloo") + +tensor = sender.random_tensor.remote() +# Pass the ObjectRef back to the actor that produced it. The tensor will be +# passed back to the same actor without copying. +sum1 = sender.sum.remote(tensor) +sum2 = receiver.sum.remote(tensor) +assert torch.allclose(*ray.get([sum1, sum2])) +# __gloo_intra_actor_end__ + +# __gloo_get_start__ +print(ray.get(tensor)) +# torch.Tensor(...) +# __gloo_get_end__ diff --git a/doc/source/ray-core/doc_code/direct_transport_nccl.py b/doc/source/ray-core/doc_code/direct_transport_nccl.py new file mode 100644 index 000000000000..2296a073ce6b --- /dev/null +++ b/doc/source/ray-core/doc_code/direct_transport_nccl.py @@ -0,0 +1,27 @@ +# flake8: noqa + +# __nccl_full_example_start__ +import torch +import ray +from ray.experimental.collective import create_collective_group + + +@ray.remote(num_gpus=1) +class MyActor: + @ray.method(tensor_transport="nccl") + def random_tensor(self): + return torch.randn(1000, 1000).cuda() + + def sum(self, tensor: torch.Tensor): + return torch.sum(tensor) + + +sender, receiver = MyActor.remote(), MyActor.remote() +group = create_collective_group([sender, receiver], backend="nccl") + +# The tensor will be stored by the `sender` actor instead of in Ray's object +# store. 
+tensor = sender.random_tensor.remote() +result = receiver.sum.remote(tensor) +ray.get(result) +# __nccl_full_example_end__ diff --git a/doc/source/ray-core/doc_code/direct_transport_nixl.py b/doc/source/ray-core/doc_code/direct_transport_nixl.py new file mode 100644 index 000000000000..7952c7f9fb87 --- /dev/null +++ b/doc/source/ray-core/doc_code/direct_transport_nixl.py @@ -0,0 +1,34 @@ +# flake8: noqa + +# __nixl_full_example_start__ +import torch +import ray + + +@ray.remote(num_gpus=1) +class MyActor: + @ray.method(tensor_transport="nixl") + def random_tensor(self): + return torch.randn(1000, 1000).cuda() + + def sum(self, tensor: torch.Tensor): + return torch.sum(tensor) + + +# No collective group is needed. The two actors just need to have NIXL +# installed. +sender, receiver = MyActor.remote(), MyActor.remote() + +# The tensor will be stored by the `sender` actor instead of in Ray's object +# store. +tensor = sender.random_tensor.remote() +result = receiver.sum.remote(tensor) +ray.get(result) +# __nixl_full_example_end__ + +# __nixl_get_start__ +# The :func:`ray.get ` function will also use NIXL to retrieve the +# result. +print(ray.get(tensor)) +# torch.Tensor(...) +# __nixl_get_end__ diff --git a/doc/source/ray-core/fault-tolerance.rst b/doc/source/ray-core/fault-tolerance.rst index d9f9f329e0b6..4de1bab34677 100644 --- a/doc/source/ray-core/fault-tolerance.rst +++ b/doc/source/ray-core/fault-tolerance.rst @@ -69,7 +69,7 @@ It allows you to specify the affinity as a soft constraint so even if the target .. 
literalinclude:: doc_code/fault_tolerance_tips.py :language: python - :start-after: _node_affinity_scheduling_strategy_start__ + :start-after: __node_affinity_scheduling_strategy_start__ :end-before: __node_affinity_scheduling_strategy_end__ diff --git a/doc/source/ray-core/fault_tolerance/actors.rst b/doc/source/ray-core/fault_tolerance/actors.rst index 6f46a6d67412..9dcbc6029a3b 100644 --- a/doc/source/ray-core/fault_tolerance/actors.rst +++ b/doc/source/ray-core/fault_tolerance/actors.rst @@ -163,7 +163,7 @@ If a task has ``max_task_retries > 0`` and it received ``ActorUnavailableError`` Actor method exceptions ----------------------- -Sometime you want to retry when an actor method raises exceptions. Use ``max_task_retries`` with ``retry_exceptions`` to retry. +Sometimes you want to retry when an actor method raises exceptions. Use ``max_task_retries`` with ``retry_exceptions`` to enable this. Note that by default, retrying on user raised exceptions is disabled. To enable it, make sure the method is **idempotent**, that is, invoking it multiple times should be equivalent to invoking it only once. @@ -180,6 +180,6 @@ Retry behavior depends on the value you set ``retry_exceptions`` to: - The method definition's value, for example, `@ray.method(max_task_retries=2)`. Ray ignores this value if you don't set it. - The actor creation call's value, for example, `Actor.options(max_task_retries=2)`. Ray ignores this value if you didn't set it. - The Actor class definition's value, for example, `@ray.remote(max_task_retries=2)` decorator. Ray ignores this value if you didn't set it. -- The default value,`0`. +- The default value, `0`. For example, if a method sets `max_task_retries=5` and `retry_exceptions=True`, and the actor sets `max_restarts=2`, Ray executes the method up to 6 times: once for the initial invocation, and 5 additional retries. The 6 invocations may include 2 actor crashes. 
After the 6th invocation, a `ray.get` call to the result Ray ObjectRef raises the exception raised in the last invocation, or `ray.exceptions.RayActorError` if the actor crashed in the last invocation. diff --git a/doc/source/ray-core/fault_tolerance/nodes.rst b/doc/source/ray-core/fault_tolerance/nodes.rst index 41d8959e3ae7..e106a962797a 100644 --- a/doc/source/ray-core/fault_tolerance/nodes.rst +++ b/doc/source/ray-core/fault_tolerance/nodes.rst @@ -24,6 +24,6 @@ so that when we start a new head node we still have all the cluster-level data. Raylet failure -------------- -When a raylet process fails, the corresponding node will be marked as dead and is treated the same as node failure. +When a raylet process fails, the corresponding node will be marked as dead and is treated the same as a node failure. Each raylet is associated with a unique id, so even if the raylet restarts on the same physical machine, it'll be treated as a new raylet/node to the Ray cluster. diff --git a/doc/source/ray-core/fault_tolerance/objects.rst b/doc/source/ray-core/fault_tolerance/objects.rst index 4c2987d8efd2..b363baebe9af 100644 --- a/doc/source/ray-core/fault_tolerance/objects.rst +++ b/doc/source/ray-core/fault_tolerance/objects.rst @@ -73,8 +73,8 @@ will clean up any remaining copies of the object's value to prevent a memory leak. Any workers that subsequently try to get the object's value will receive an ``OwnerDiedError`` exception, which can be handled manually. -Understanding `ObjectLostErrors` --------------------------------- +Understanding ``ObjectLostErrors`` +---------------------------------- Ray throws an ``ObjectLostError`` to the application when an object cannot be retrieved due to application or system error. 
This can occur during a diff --git a/doc/source/ray-core/handling-dependencies.rst b/doc/source/ray-core/handling-dependencies.rst index 440b0dad0507..993ac51ff644 100644 --- a/doc/source/ray-core/handling-dependencies.rst +++ b/doc/source/ray-core/handling-dependencies.rst @@ -39,7 +39,7 @@ Preparing an environment using the Ray Cluster launcher The first way to set up dependencies is to prepare a single environment across the cluster before starting the Ray runtime. -- You can build all your files and dependencies into a container image and specify this in your your :ref:`Cluster YAML Configuration `. +- You can build all your files and dependencies into a container image and specify this in your :ref:`Cluster YAML Configuration `. - You can also install packages using ``setup_commands`` in the Ray Cluster configuration file (:ref:`reference `); these commands will be run as each node joins the cluster. Note that for production settings, it is recommended to build any necessary packages into a container image instead. @@ -613,7 +613,7 @@ The ``runtime_env`` is a Python dictionary or a Python class :class:`ray.runtime - Example: ``{"LD_LIBRARY_PATH": "${LD_LIBRARY_PATH}:/home/admin/my_lib"}`` - - Non-existant variable example: ``{"ENV_VAR_NOT_EXIST": "${ENV_VAR_NOT_EXIST}:/home/admin/my_lib"}`` -> ``ENV_VAR_NOT_EXIST=":/home/admin/my_lib"``. + - Non-existent variable example: ``{"ENV_VAR_NOT_EXIST": "${ENV_VAR_NOT_EXIST}:/home/admin/my_lib"}`` -> ``ENV_VAR_NOT_EXIST=":/home/admin/my_lib"``. - ``nsight`` (Union[str, Dict[str, str]]): specifies the config for the Nsight System Profiler. The value is either (1) "default", which refers to the `default config `_, or (2) a dict of Nsight System Profiler options and their values. See :ref:`here ` for more details on setup and usage. @@ -847,7 +847,7 @@ Your ``runtime_env`` dictionary should contain: Check for hidden files and metadata directories in zipped dependencies. 
You can inspect a zip file's contents by running the ``zipinfo -1 zip_file_name.zip`` command in the Terminal. Some zipping methods can cause hidden files or metadata directories to appear in the zip file at the top level. - To avoid this, use the ``zip -r`` command directly on the directory you want to compress from its parent's directory. For example, if you have a directory structure such as: ``a/b`` and you what to compress ``b``, issue the ``zip -r b`` command from the directory ``a.`` + To avoid this, use the ``zip -r`` command directly on the directory you want to compress from its parent's directory. For example, if you have a directory structure such as: ``a/b`` and you want to compress ``b``, issue the ``zip -r b`` command from the directory ``a.`` If Ray detects more than a single directory at the top level, it will use the entire zip file instead of the top-level directory, which may lead to unexpected behavior. Currently, four types of remote URIs are supported for hosting ``working_dir`` and ``py_modules`` packages: @@ -859,7 +859,7 @@ Currently, four types of remote URIs are supported for hosting ``working_dir`` a - Example: - - ``runtime_env = {"working_dir": "https://github.com/example_username/example_respository/archive/HEAD.zip"}`` + - ``runtime_env = {"working_dir": "https://github.com/example_username/example_repository/archive/HEAD.zip"}`` - ``S3``: ``S3`` refers to URIs starting with ``s3://`` that point to compressed packages stored in `AWS S3 `_. To use packages via ``S3`` URIs, you must have the ``smart_open`` and ``boto3`` libraries (you can install them using ``pip install smart_open`` and ``pip install boto3``). diff --git a/doc/source/ray-core/internals.rst b/doc/source/ray-core/internals.rst new file mode 100644 index 000000000000..69505a23c1ad --- /dev/null +++ b/doc/source/ray-core/internals.rst @@ -0,0 +1,12 @@ +.. _ray-core-internals: + +Internals +========= + +This section provides a look into some of Ray Core internals. 
It's primarily intended for advanced users and developers of Ray Core. +For the high level architecture overview, please refer to the `whitepaper `__. + +.. toctree:: + :maxdepth: 1 + + internals/task-lifecycle.rst diff --git a/doc/source/ray-core/internals/task-lifecycle.rst b/doc/source/ray-core/internals/task-lifecycle.rst new file mode 100644 index 000000000000..7c0393f09891 --- /dev/null +++ b/doc/source/ray-core/internals/task-lifecycle.rst @@ -0,0 +1,83 @@ +.. _task-lifecycle: + +Task Lifecycle +============== + +This doc talks about the lifecycle of a task in Ray Core, including how tasks are defined, scheduled and executed. +We will use the following code as an example and the internals are based on Ray 2.48. + + +.. testcode:: + + import ray + + @ray.remote + def my_task(arg): + return f"Hello, {arg}!" + + obj_ref = my_task.remote("Ray") + print(ray.get(obj_ref)) + +.. testoutput:: + + Hello, Ray! + + +Defining a remote function +-------------------------- + +The first step in the task lifecycle is defining a remote function using the :func:`ray.remote` decorator. :func:`ray.remote` wraps the Python function and returns an instance of `RemoteFunction `__. +``RemoteFunction`` stores the underlying function and all the user specified Ray task :meth:`options ` such as ``num_cpus``. + + +Invoking a remote function +-------------------------- + +Once a remote function is defined, it can be invoked using the `.remote()` method. Each invocation of a remote function creates a Ray task. This method submits the task for execution and returns an object reference (``ObjectRef``) that can be used to retrieve the result later. +Under the hood, `.remote()` does the following: + +1. `Pickles the underlying function `__ into bytes and `stores the bytes in GCS key-value store `__ with a `key `__ so that, later on, the remote executor (the core worker process that will execute the task) can get the bytes, unpickle, and execute the function. 
This is done once per remote function definition instead of once per invocation. +2. `Calls `__ Cython `submit_task `__ which `prepares `__ the arguments (3 types) and calls the C++ `CoreWorker::SubmitTask `__. + + 1. Pass-by-reference argument: the argument is an ``ObjectRef``. + 2. Pass-by-value inline argument: the argument is a `small `__ Python object and the total size of such arguments so far is below the `threshold `__. In this case, it will be pickled, sent to the remote executor (as part of the ``PushTask`` RPC), and unpickled there. This is called inlining and plasma store is not involved in this case. + 3. Pass-by-value non-inline argument: the argument is a normal Python object but it doesn't meet the inline criteria (e.g. size is too big), it is `put `__ in the local plasma store and the argument is replaced by the generated ``ObjectRef``, so it's effectively equivalent to ``.remote(ray.put(arg))``. + +3. ``CoreWorker`` `builds `__ a `TaskSpecification `__ that contains all the information about the task including the `ID `__ of the function, all the user specified options and the arguments. This spec will be sent to the executor for execution. +4. The TaskSpecification is `submitted `__ to `NormalTaskSubmitter `__ asynchronously. This means the ``.remote()`` call returns immediately and the task is scheduled and executed asynchronously. + +Scheduling a task +----------------- + +Once the task is submitted to ``NormalTaskSubmitter``, a worker process on some Ray node is selected to execute the task and this process is called scheduling. + +1. ``NormalTaskSubmitter`` first `waits `__ for all the ``ObjectRef`` arguments to be available. Available means tasks that produce those ``ObjectRef``\s finished execution and the data is available somewhere in the cluster. + + 1. 
If the object pointed to by the ``ObjectRef`` is in the plasma store, the ``ObjectRef`` itself is sent to the executor and the executor will resolve the ``ObjectRef`` to the actual data (pull from remote plasma store if needed) before calling the user function. + 2. If the object pointed to by the ``ObjectRef`` is in the caller memory store, the data is `inlined `__ and sent to the executor as part of the ``PushTask`` RPC just like other pass-by-value inline arguments. + +2. Once all the arguments are available, ``NormalTaskSubmitter`` will try to find an idle worker to execute the task. ``NormalTaskSubmitter`` gets workers for task execution from raylet via a process called worker lease and this is where scheduling happens. + Specifically, it will `send `__ a ``RequestWorkerLease`` RPC to a `selected `__ (it's either the local raylet or a data-locality-favored raylet) raylet for a worker lease. +3. Raylet `handles `__ the ``RequestWorkerLease`` RPC. +4. When the ``RequestWorkerLease`` RPC returns with a leased worker address in the response, a worker lease is granted to the caller to execute the task. If the ``RequestWorkerLease`` response contains another raylet address instead, ``NormalTaskSubmitter`` will then request a worker lease from the specified raylet. This process continues until a worker lease is obtained. + +Executing a task +---------------- + +Once a leased worker is obtained, the task execution starts. + +1. ``NormalTaskSubmitter`` `sends `__ a ``PushTask`` RPC to the leased worker with the ``TaskSpecification`` to execute. +2. The executor `receives `__ the ``PushTask`` RPC and executes (`1 `__ -> `2 `__ -> `3 `__ -> `4 `__ -> `5 `__) the task. +3. First step of executing the task is `getting `__ all the pass-by-reference arguments from the local plasma store (data is already pulled from remote plasma store to the local plasma store during scheduling). +4. 
Then the executor `gets `__ the pickled function bytes from GCS key-value store and unpickles it. +5. The next step is `unpickling `__ the arguments. +6. Finally, the user function is `called `__. + +Getting the return value +------------------------ + +After the user function is executed, the caller can get the return values. + +1. After the user function returns, the executor `gets and stores `__ all the return values. If the return value is a `small `__ object and the total size of such return values so far is below the `threshold `__, it is returned directly to the caller as part of the ``PushTask`` RPC response. `Otherwise `__, it is put in the local plasma store and the reference is returned to the caller. +2. When the caller `receives `__ the ``PushTask`` RPC response, it `stores `__ the return values (actual data if the return value is small or a special value indicating the data is in plasma store if the return value is big) in the local memory store. +3. When the return value is `added `__ to the local memory store, ``ray.get()`` is `unblocked `__ and returns the value directly if the object is small, or it will `get `__ from the local plasma store (pull from remote plasma store first if needed) if the object is big. diff --git a/doc/source/ray-core/namespaces.rst b/doc/source/ray-core/namespaces.rst index 6a502534bda2..8f0c1a6ab70c 100644 --- a/doc/source/ray-core/namespaces.rst +++ b/doc/source/ray-core/namespaces.rst @@ -161,7 +161,7 @@ the specified namespace, no matter what namespace of the current job is. .. tab-item:: C++ - .. code-block:: + .. code-block:: c++ // `ray start --head` has been run to launch a local cluster. ray::RayConfig config; @@ -169,8 +169,8 @@ the specified namespace, no matter what namespace of the current job is. // Create an actor with specified namespace. ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName("my_actor", "actor_namespace").Remote(); // It is accessible in its namespace. 
- ray::GetActor("orange"); - ray::Shutdown();` + ray::GetActor("my_actor", "actor_namespace"); + ray::Shutdown(); Anonymous namespaces diff --git a/doc/source/ray-core/objects/object-spilling.rst b/doc/source/ray-core/objects/object-spilling.rst index 1875a3229453..8f0ff5c7c857 100644 --- a/doc/source/ray-core/objects/object-spilling.rst +++ b/doc/source/ray-core/objects/object-spilling.rst @@ -31,7 +31,7 @@ For advanced usage and customizations, reach out to the `Ray team `_ to efficiently transfer objects across different processes and different nodes. Numpy arrays in the object store are shared between workers on the same node (zero-copy deserialization). +Since Ray processes do not share memory space, data transferred between workers and nodes will need to be **serialized** and **deserialized**. Ray uses the `Plasma object store `_ to efficiently transfer objects across different processes and different nodes. Numpy arrays in the object store are shared between workers on the same node (zero-copy deserialization). Overview -------- @@ -48,7 +48,7 @@ Numpy Arrays Ray optimizes for numpy arrays by using Pickle protocol 5 with out-of-band data. The numpy array is stored as a read-only object, and all Ray workers on the same node can read the numpy array in the object store without copying (zero-copy reads). Each numpy array object in the worker process holds a pointer to the relevant array held in shared memory. Any writes to the read-only object will require the user to first copy it into the local process memory. -.. tip:: You can often avoid serialization issues by using only native types (e.g., numpy arrays or lists/dicts of numpy arrays and other primitive types), or by using Actors hold objects that cannot be serialized. +.. tip:: You can often avoid serialization issues by using only native types (e.g., numpy arrays or lists/dicts of numpy arrays and other primitive types), or by using Actors to hold objects that cannot be serialized. 
Fixing "assignment destination is read-only" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -202,6 +202,63 @@ There are at least 3 ways to define your custom serialization process: except TypeError: pass +.. _custom-exception-serializer: + +Custom Serializers for Exceptions +---------------------------------- + +When Ray tasks raise exceptions that cannot be serialized with the default pickle mechanism, you can register custom serializers to handle them (Note: the serializer must be registered in the driver and all workers). + +.. testcode:: + + import ray + import threading + + class CustomError(Exception): + def __init__(self, message, data): + self.message = message + self.data = data + self.lock = threading.Lock() # Cannot be serialized + + def custom_serializer(exc): + return {"message": exc.message, "data": str(exc.data)} + + def custom_deserializer(state): + return CustomError(state["message"], state["data"]) + + # Register in the driver + ray.util.register_serializer( + CustomError, + serializer=custom_serializer, + deserializer=custom_deserializer + ) + + @ray.remote + def task_that_registers_serializer_and_raises(): + # Register the custom serializer in the worker + ray.util.register_serializer( + CustomError, + serializer=custom_serializer, + deserializer=custom_deserializer + ) + + # Now raise the custom exception + raise CustomError("Something went wrong", {"complex": "data"}) + + # The custom exception will be properly serialized across worker boundaries + try: + ray.get(task_that_registers_serializer_and_raises.remote()) + except ray.exceptions.RayTaskError as e: + print(f"Caught exception: {e.cause}") # This will be our CustomError + +When a custom exception is raised in a remote task, Ray will: + +1. Serialize the exception using your custom serializer +2. Wrap it in a :class:`RayTaskError ` +3. 
The deserialized exception will be available as ``ray_task_error.cause`` + +Whenever serialization fails, Ray throws an :class:`UnserializableException ` containing the string representation of the original stack trace. + Troubleshooting --------------- diff --git a/doc/source/ray-core/patterns/concurrent-operations-async-actor.rst b/doc/source/ray-core/patterns/concurrent-operations-async-actor.rst index 656c3215243d..fc8e897569c3 100644 --- a/doc/source/ray-core/patterns/concurrent-operations-async-actor.rst +++ b/doc/source/ray-core/patterns/concurrent-operations-async-actor.rst @@ -22,7 +22,7 @@ With the default actor, the code will look like this: :start-after: __sync_actor_start__ :end-before: __sync_actor_end__ -This is problematic because ``TaskExecutor.run`` method runs forever and never yield the control to run other methods. +This is problematic because ``TaskExecutor.run`` method runs forever and never yields control to run other methods. We can solve this problem by using :ref:`async actors ` and use ``await`` to yield control: .. literalinclude:: ../doc_code/pattern_async_actor.py @@ -30,4 +30,4 @@ We can solve this problem by using :ref:`async actors ` and use `` :start-after: __async_actor_start__ :end-before: __async_actor_end__ -Here, instead of using the blocking :func:`ray.get() ` to get the value of an ObjectRef, we use ``await`` so it can yield the control while we are waiting for the object to be fetched. +Here, instead of using the blocking :func:`ray.get() ` to get the value of an ObjectRef, we use ``await`` so it can yield control while we are waiting for the object to be fetched. 
diff --git a/doc/source/ray-core/patterns/fork-new-processes.rst b/doc/source/ray-core/patterns/fork-new-processes.rst index 0ef83b88d274..0323d906dced 100644 --- a/doc/source/ray-core/patterns/fork-new-processes.rst +++ b/doc/source/ray-core/patterns/fork-new-processes.rst @@ -3,21 +3,21 @@ Anti-pattern: Forking new processes in application code ======================================================== -**Summary:** Don't fork new processes in Ray application code-for example, in -driver, tasks or actors. Instead, use "spawn" method to start new processes or use Ray +**Summary:** Don't fork new processes in Ray application code—for example, in +driver, tasks or actors. Instead, use the "spawn" method to start new processes or use Ray tasks and actors to parallelize your workload Ray manages the lifecycle of processes for you. Ray Objects, Tasks, and -Actors manages sockets to communicate with the Raylet and the GCS. If you fork new +Actors manage sockets to communicate with the Raylet and the GCS. If you fork new processes in your application code, the processes could share the same sockets without -any synchronization. This can lead to corrupted message and unexpected +any synchronization. This can lead to corrupted messages and unexpected behavior. The solution is to: -1. use "spawn" method to start new processes so that parent process's -memory space isn't copied to the child processes or +1. use the "spawn" method to start new processes so that the parent process's +memory space is not copied to the child processes or 2. use Ray tasks and -actors to parallelize your workload and let Ray to manage the lifecycle of the +actors to parallelize your workload and let Ray manage the lifecycle of the processes for you. 
Code example diff --git a/doc/source/ray-core/patterns/limit-pending-tasks.rst b/doc/source/ray-core/patterns/limit-pending-tasks.rst index 8a266990c4d4..6bdac69273aa 100644 --- a/doc/source/ray-core/patterns/limit-pending-tasks.rst +++ b/doc/source/ray-core/patterns/limit-pending-tasks.rst @@ -22,7 +22,7 @@ With ``ray.wait()``, we can apply backpressure and limit the number of pending t Example use case ---------------- -You have a worker actor that process tasks at a rate of X tasks per second and you want to submit tasks to it at a rate lower than X to avoid OOM. +You have a worker actor that processes tasks at a rate of X tasks per second and you want to submit tasks to it at a rate lower than X to avoid OOM. For example, Ray Serve uses this pattern to limit the number of pending queries for each worker. diff --git a/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst b/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst index 4b87f16d68fb..6a24ea50b1aa 100644 --- a/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst +++ b/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst @@ -6,14 +6,14 @@ Anti-pattern: Serialize ray.ObjectRef out of band **TLDR:** Avoid serializing ``ray.ObjectRef`` because Ray can't know when to garbage collect the underlying object. Ray's ``ray.ObjectRef`` is distributed reference counted. Ray pins the underlying object until the reference isn't used by the system anymore. -When all references are the pinned object gone, Ray garbage collects the pinned object and cleans it up from the system. -However, if user code serializes ``ray.objectRef``, Ray can't keep track of the reference. +When all references to the pinned object are gone, Ray garbage collects the pinned object and cleans it up from the system. +However, if user code serializes ``ray.ObjectRef``, Ray can't keep track of the reference. 
-To avoid incorrect behavior, if ``ray.cloudpickle`` serializes``ray.ObjectRef``, Ray pins the object for the lifetime of a worker. "Pin" means that object can't be evicted from the object store -until the corresponding owner worker dies. It's prone to Ray object leaks, which can lead disk spilling. See :ref:`this page ` for more details. +To avoid incorrect behavior, if ``ray.cloudpickle`` serializes ``ray.ObjectRef``, Ray pins the object for the lifetime of a worker. "Pin" means that object can't be evicted from the object store +until the corresponding owner worker dies. It's prone to Ray object leaks, which can lead to disk spilling. See :ref:`this page ` for more details. To detect if this pattern exists in your code, you can set an environment variable ``RAY_allow_out_of_band_object_ref_serialization=0``. If Ray detects -that ``ray.cloudpickle`` serialized``ray.ObjectRef``, it raises an exception with helpful messages. +that ``ray.cloudpickle`` serialized ``ray.ObjectRef``, it raises an exception with helpful messages. Code example ------------ diff --git a/doc/source/ray-core/ray-generator.rst b/doc/source/ray-core/ray-generator.rst index da81bc22f7ca..6aba716d4834 100644 --- a/doc/source/ray-core/ray-generator.rst +++ b/doc/source/ray-core/ray-generator.rst @@ -97,7 +97,7 @@ Ray raises the exception. :start-after: __streaming_generator_exception_start__ :end-before: __streaming_generator_exception_end__ -In the above example, if the an application fails the task, Ray returns the object reference with an exception +In the above example, if an application fails the task, Ray returns the object reference with an exception in a correct order. For example, if Ray raises the exception after the second yield, the third ``next(gen)`` returns an object reference with an exception all the time. 
If a system error fails the task, (e.g., a node failure or worker process failure), ``next(gen)`` returns the object reference that contains the system level exception diff --git a/doc/source/ray-core/scheduling/accelerators.rst b/doc/source/ray-core/scheduling/accelerators.rst index 707b5b75a053..47ef5cd80e6e 100644 --- a/doc/source/ray-core/scheduling/accelerators.rst +++ b/doc/source/ray-core/scheduling/accelerators.rst @@ -84,11 +84,11 @@ If you need to, you can :ref:`override ` this. .. tip:: You can set the ``NEURON_RT_VISIBLE_CORES`` environment variable before starting a Ray node - to limit the AWS Neuro Cores that are visible to Ray. + to limit the AWS Neuron Cores that are visible to Ray. For example, ``NEURON_RT_VISIBLE_CORES=1,3 ray start --head --resources='{"neuron_cores": 2}'`` lets Ray only see devices 1 and 3. - See the `Amazon documentation` for more examples of Ray on Neuron with EKS as an orchestration substrate. + See the `Amazon documentation `_ for more examples of Ray on Neuron with EKS as an orchestration substrate. .. tab-item:: Google TPU :sync: Google TPU diff --git a/doc/source/ray-core/scheduling/index.rst b/doc/source/ray-core/scheduling/index.rst index 9fb5bed4f712..e46203c21d3d 100644 --- a/doc/source/ray-core/scheduling/index.rst +++ b/doc/source/ray-core/scheduling/index.rst @@ -22,7 +22,7 @@ Given that, a node can be in one of the following states: - Infeasible: the node doesn't have the required resources. For example a CPU-only node is infeasible for a GPU task. Resource requirements are **hard** requirements meaning that only feasible nodes are eligible to run the task or actor. -If there are feasible nodes, Ray will either choose an available node or wait until a unavailable node to become available +If there are feasible nodes, Ray will either choose an available node or wait until an unavailable node becomes available depending on other factors discussed below.
If all nodes are infeasible, the task or actor cannot be scheduled until feasible nodes are added to the cluster. diff --git a/doc/source/ray-core/scheduling/placement-group.rst b/doc/source/ray-core/scheduling/placement-group.rst index beb00ee77cca..5a4245eeb005 100644 --- a/doc/source/ray-core/scheduling/placement-group.rst +++ b/doc/source/ray-core/scheduling/placement-group.rst @@ -37,7 +37,7 @@ Create a Placement Group (Reserve Resources) You can create a placement group using :func:`ray.util.placement_group`. Placement groups take in a list of bundles and a :ref:`placement strategy `. Note that each bundle must be able to fit on a single node on the Ray cluster. -For example, if you only have a 8 CPU node, and if you have a bundle that requires ``{"CPU": 9}``, +For example, if you only have an 8 CPU node, and if you have a bundle that requires ``{"CPU": 9}``, this bundle cannot be scheduled. Bundles are specified by a list of dictionaries, e.g., ``[{"CPU": 1}, {"CPU": 1, "GPU": 1}]``). diff --git a/doc/source/ray-core/tasks.rst b/doc/source/ray-core/tasks.rst index 76a7173e2100..81c0438a1b11 100644 --- a/doc/source/ray-core/tasks.rst +++ b/doc/source/ray-core/tasks.rst @@ -287,6 +287,8 @@ You can change this behavior by setting in :func:`ray.remote() ` and :meth:`.options() `. See :ref:`Ray fault tolerance ` for more details. +.. _task-events: + Task Events ----------- diff --git a/doc/source/ray-core/type-hint.md b/doc/source/ray-core/type-hint.md new file mode 100644 index 000000000000..536271860a61 --- /dev/null +++ b/doc/source/ray-core/type-hint.md @@ -0,0 +1,90 @@ +# Type hints in Ray + +As of Ray 2.48, Ray provides comprehensive support for Python type hints with both remote functions and actors. This enables better IDE support, static type checking, and improved code maintainability in distributed Ray applications. + +## Overview + +In most cases, Ray applications can use type hints without any modifications to existing code. 
Ray automatically handles type inference for standard remote functions and basic actor usage patterns. For example, remote functions support standard Python type annotations without additional configuration. The `@ray.remote` decorator preserves the original function signature and type information. + +```python +import ray + +@ray.remote +def add_numbers(x: int, y: int) -> int: + return x + y + +# Type hints work seamlessly with remote function calls +a = add_numbers.remote(5, 3) +print(ray.get(a)) +``` + +However, certain patterns, especially when working with actors, require specific approaches to ensure proper type annotation. + +## Pattern 1: Use `ray.remote` as a function to build an actor + +Use the `ray.remote` function directly to create an actor class, instead of using the `@ray.remote` decorator. This will preserve the original class type and allow type inference to work correctly. For example, in this case, the original class type is `DemoRay`, and the actor class type is `ActorClass[DemoRay]`. + +```python +import ray +from ray.actor import ActorClass + +class DemoRay: + def __init__(self, init: int): + self.init = init + + @ray.method + def calculate(self, v1: int, v2: int) -> int: + return self.init + v1 + v2 + +ActorDemoRay: ActorClass[DemoRay] = ray.remote(DemoRay) +# DemoRay is the original class type, ActorDemoRay is the ActorClass[DemoRay] type +``` + +After creating the `ActorClass[DemoRay]` type, we can use it to instantiate an actor by calling `ActorDemoRay.remote(1)`. It returns an `ActorProxy[DemoRay]` type, which represents an actor handle. + +This handle will provide type hints for the actor methods, including their arguments and return types. 
+ +```python + +actor: ActorProxy[DemoRay] = ActorDemoRay.remote(1) + +def func(actor: ActorProxy[DemoRay]) -> int: + b: ObjectRef[int] = actor.calculate.remote(1, 2) + return ray.get(b) + +a = func.remote() +print(ray.get(a)) +``` + +**Why do we need to do this?** + +In Ray, the `@ray.remote` decorator indicates that instances of the class `T` are actors, with each actor running in its own Python process. However, the `@ray.remote` decorator will transform the class `T` into an `ActorClass[T]` type, which is not the original class type. + +Unfortunately, IDEs and static type checkers will not be able to infer the original type `T` of the `ActorClass[T]`. To solve this problem, using `ray.remote(T)` will explicitly return a new generic class `ActorClass[T]` type while preserving the original class type. + +## Pattern 2: Use `@ray.method` decorator for remote methods + +Add the `@ray.method` decorator to the actor methods in order to obtain type hints for the remote methods of the actor through the `ActorProxy[T]` type, including their arguments and return types. + +```python +from ray.actor import ActorClass, ActorProxy + +class DemoRay: + def __init__(self, init: int): + self.init = init + + @ray.method + def calculate(self, v1: int, v2: int) -> int: + return self.init + v1 + v2 + +ActorDemoRay: ActorClass[DemoRay] = ray.remote(DemoRay) +actor: ActorProxy[DemoRay] = ActorDemoRay.remote(1) +# IDEs will be able to correctly list the remote methods of the actor +# and provide type hints for the arguments and return values of the remote methods +a: ObjectRef[int] = actor.calculate.remote(1, 2) +print(ray.get(a)) +``` + +:::{note} +We would love to make the typing of remote methods work without the `@ray.method` decorator. If any community member has an idea, we welcome PRs.
+::: diff --git a/doc/source/ray-core/user-guide.rst b/doc/source/ray-core/user-guide.rst index 89c40d108548..6befdfc519ba 100644 --- a/doc/source/ray-core/user-guide.rst +++ b/doc/source/ray-core/user-guide.rst @@ -17,5 +17,6 @@ If you’re brand new to Ray, we recommend starting with the :ref:`walkthrough < scheduling/index.rst fault-tolerance patterns/index.rst + direct-transport compiled-graph/ray-compiled-graph advanced-topics diff --git a/doc/source/ray-core/user-spawn-processes.rst b/doc/source/ray-core/user-spawn-processes.rst index 1302f0a7b16a..af9dd7b0d072 100644 --- a/doc/source/ray-core/user-spawn-processes.rst +++ b/doc/source/ray-core/user-spawn-processes.rst @@ -1,7 +1,7 @@ Lifetimes of a User-Spawn Process ================================= -When you spawns child processes from Ray workers, you are responsible for managing the lifetime of child processes. However, it is not always possible, especially when worker crashes and child processes are spawned from libraries (torch dataloader). +When you spawn child processes from Ray workers, you are responsible for managing the lifetime of child processes. However, it is not always possible, especially when a worker crashes and child processes are spawned from libraries (torch dataloader). To avoid leaking user-spawned processes, Ray provides mechanisms to kill all user-spawned processes when a worker that starts it exits. This feature prevents GPU memory leaks from child processes (e.g., torch). diff --git a/doc/source/ray-core/walkthrough.rst b/doc/source/ray-core/walkthrough.rst index 1fb7f8a8b3c6..938b219dedc4 100644 --- a/doc/source/ray-core/walkthrough.rst +++ b/doc/source/ray-core/walkthrough.rst @@ -11,6 +11,7 @@ What's Ray Core? User Guides Examples api/index + Internals Ray Core is a powerful distributed computing framework that provides a small set of essential primitives (tasks, actors, and objects) for building and scaling distributed applications.
@@ -58,7 +59,7 @@ Here's a simple example: Calling an Actor ---------------- -While tasks are stateless, Ray actors allow you to create stateful workers that maintain their internal state between method calls. +While tasks are stateless, Ray actors allow you to create stateful workers that maintain their internal state between method calls. When you instantiate a Ray actor: 1. Ray starts a dedicated worker process somewhere in your cluster diff --git a/doc/source/ray-more-libs/dask-on-ray.rst b/doc/source/ray-more-libs/dask-on-ray.rst index 64dc2cdb6a5a..4bd68bce4abd 100644 --- a/doc/source/ray-more-libs/dask-on-ray.rst +++ b/doc/source/ray-more-libs/dask-on-ray.rst @@ -118,13 +118,13 @@ Best Practice for Large Scale workloads For Ray 1.3, the default scheduling policy is to pack tasks to the same node as much as possible. It is more desirable to spread tasks if you run a large scale / memory intensive Dask on Ray workloads. -In this case, there are two recommended setup. +In this case, there are two recommended setups. - Reducing the config flag `scheduler_spread_threshold` to tell the scheduler to prefer spreading tasks across the cluster instead of packing. - Setting the head node's `num-cpus` to 0 so that tasks are not scheduled on a head node. .. code-block:: bash - # Head node. Set `num_cpus=0` to avoid tasks are being scheduled on a head node. + # Head node. Set `num_cpus=0` to avoid tasks being scheduled on a head node. RAY_scheduler_spread_threshold=0.0 ray start --head --num-cpus=0 # Worker node. 
diff --git a/doc/source/ray-more-libs/data_juicer_distributed_data_processing.md b/doc/source/ray-more-libs/data_juicer_distributed_data_processing.md index 926368ffa65d..f48238127a7f 100644 --- a/doc/source/ray-more-libs/data_juicer_distributed_data_processing.md +++ b/doc/source/ray-more-libs/data_juicer_distributed_data_processing.md @@ -18,12 +18,12 @@ See the [Data-Juicer 2.0: Cloud-Scale Adaptive Data Processing for Foundation Mo ### Ray mode in Data-Juicer -- For most implementations of Data-Juicer [operators](https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md), the core processing functions are engine-agnostic. Operators manage interoperability is primarily in [RayDataset](https://github.com/modelscope/data-juicer/blob/main/data_juicer/core/data/ray_dataset.py) and [RayExecutor](https://github.com/modelscope/data-juicer/blob/main/data_juicer/core/executor/ray_executor.py), which are subclasses of the base `DJDataset` and `BaseExecutor`, respectively, and support both Ray [Tasks](ray-remote-functions) and [Actors](actor-guide). -- The exception is the deduplication operators, which are challenging to scale in standalone mode. The names of these operators follow the pattern of [`ray_xx_deduplicator`](https://github.com/modelscope/data-juicer/blob/main//data_juicer/ops/deduplicator/). +- For most implementations of Data-Juicer [operators](https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md), the core processing functions are engine-agnostic. Operator interoperability is managed primarily in [RayDataset](https://github.com/modelscope/data-juicer/blob/main/data_juicer/core/data/ray_dataset.py) and [RayExecutor](https://github.com/modelscope/data-juicer/blob/main/data_juicer/core/executor/ray_executor.py), which are subclasses of the base `DJDataset` and `BaseExecutor`, respectively, and support both Ray [Tasks](ray-remote-functions) and [Actors](actor-guide). 
+- The exception is the deduplication operators, which are challenging to scale in standalone mode. The names of these operators follow the pattern of [`ray_xx_deduplicator`](https://github.com/modelscope/data-juicer/blob/main/data_juicer/ops/deduplicator/). ### Subset splitting -When a cluster has tens of thousands of nodes but only a few dataset files, Ray splits the dataset files according to available resources and distribute the blocks across all nodes, incurring high network communication costs and reduced CPU utilization. For more details, see [Ray's `_autodetect_parallelism` function](https://github.com/ray-project/ray/blob/2dbd08a46f7f08ea614d8dd20fd0bca5682a3078/python/ray/data/_internal/util.py#L201-L205) and [tuning output blocks for Ray](read_output_blocks). +When a cluster has tens of thousands of nodes but only a few dataset files, Ray splits the dataset files according to available resources and distributes the blocks across all nodes, incurring high network communication costs and reducing CPU utilization. For more details, see [Ray's `_autodetect_parallelism` function](https://github.com/ray-project/ray/blob/2dbd08a46f7f08ea614d8dd20fd0bca5682a3078/python/ray/data/_internal/util.py#L201-L205) and [tuning output blocks for Ray](read_output_blocks). This default execution plan can be quite inefficient especially for scenarios with a large number of nodes. To optimize performance for such cases, Data-Juicer automatically splits the original dataset into smaller files in advance, taking into consideration the features of Ray and Arrow. When you encounter such performance issues, you can use this feature or split the dataset according to your own preferences. In this auto-split strategy, the single file size is about 128 MB, and the result should ensure that the number of sub-files after splitting is at least twice the total number of CPU cores available in the cluster. 
@@ -93,7 +93,7 @@ demos/process_on_ray ### Running Example of Ray Mode -In the `demo.yaml` config file, it sets the executor type to "ray" and specify an automatic Ray address. +In the `demo.yaml` config file, it sets the executor type to "ray" and specifies an automatic Ray address. ```yaml ... @@ -115,11 +115,11 @@ python tools/process_data.py --config demos/process_on_ray/configs/demo.yaml dj-process --config demos/process_on_ray/configs/demo.yaml ``` -Data-Juicer processes the demo dataset with the demo config file and export the result datasets to the directory specified by the `export_path` argument in the config file. +Data-Juicer processes the demo dataset with the demo config file and exports the result datasets to the directory specified by the `export_path` argument in the config file. ### Running Example of Distributed Deduplication -In the `dedup.yaml` config file, it sets the executor type to "ray" and specify an automatic Ray address. +In the `dedup.yaml` config file, it sets the executor type to "ray" and specifies an automatic Ray address. And it uses a dedicated distributed version of MinHash deduplication operator to deduplicate the dataset. ```yaml @@ -147,4 +147,4 @@ python tools/process_data.py --config demos/process_on_ray/configs/dedup.yaml dj-process --config demos/process_on_ray/configs/dedup.yaml ``` -Data-Juicer deduplicates the demo dataset with the demo config file and export the result datasets to the directory specified by the `export_path` argument in the config file. +Data-Juicer deduplicates the demo dataset with the demo config file and exports the result datasets to the directory specified by the `export_path` argument in the config file. diff --git a/doc/source/ray-more-libs/joblib.rst b/doc/source/ray-more-libs/joblib.rst index 3d9e5d6f818c..cb3f20c6d2a9 100644 --- a/doc/source/ray-more-libs/joblib.rst +++ b/doc/source/ray-more-libs/joblib.rst @@ -52,7 +52,7 @@ a multi-node Ray cluster instead. 
search.fit(digits.data, digits.target) You can also set the ``ray_remote_args`` argument in ``parallel_backend`` to :func:`configure -the Ray Actors ` making up the Pool. This can be used to eg. :ref:`assign resources +the Ray Actors ` making up the Pool. This can be used, for example, to :ref:`assign resources to Actors, such as GPUs `. .. code-block:: python @@ -67,7 +67,7 @@ Run on a Cluster This section assumes that you have a running Ray cluster. To start a Ray cluster, see the :ref:`cluster setup ` instructions. -To connect a scikit-learn to a running Ray cluster, you have to specify the address of the +To connect scikit-learn to a running Ray cluster, you have to specify the address of the head node by setting the ``RAY_ADDRESS`` environment variable. You can also start Ray manually by calling ``ray.init()`` (with any of its supported diff --git a/doc/source/ray-more-libs/mars-on-ray.rst b/doc/source/ray-more-libs/mars-on-ray.rst index 129fa94724f7..4baf6e38538d 100644 --- a/doc/source/ray-more-libs/mars-on-ray.rst +++ b/doc/source/ray-more-libs/mars-on-ray.rst @@ -6,11 +6,11 @@ Using Mars on Ray .. _`issue on GitHub`: https://github.com/mars-project/mars/issues -`Mars`_ is a tensor-based unified framework for large-scale data computation which scales Numpy, Pandas and Scikit-learn. +`Mars`_ is a tensor-based unified framework for large-scale data computation which scales NumPy, Pandas and Scikit-learn. Mars on Ray makes it easy to scale your programs with a Ray cluster. Currently Mars on Ray supports both Ray actors -and tasks as execution backend. The task will be scheduled by mars scheduler if Ray actors is used. This mode can reuse -all mars scheduler optimizations. If ray tasks mode is used, all tasks will be scheduled by ray, which can reuse failover and -pipeline capabilities provided by ray futures. +and tasks as an execution backend. The task will be scheduled by the Mars scheduler if Ray actors are used. This mode can reuse +all Mars scheduler optimizations. 
If Ray tasks mode is used, all tasks will be scheduled by Ray, which can reuse failover and +pipeline capabilities provided by Ray futures. .. _`Mars`: https://mars-project.readthedocs.io/en/latest/ @@ -75,4 +75,6 @@ Interact with Dataset: df2 = ds.to_mars() print(df2.head(5).execute()) -Refer to _`Mars on Ray`: https://mars-project.readthedocs.io/en/latest/installation/ray.html#mars-ray for more information. +Refer to `Mars on Ray`_ for more information. + +.. _`Mars on Ray`: https://mars-project.readthedocs.io/en/latest/installation/ray.html#mars-ray diff --git a/doc/source/ray-more-libs/modin/index.rst b/doc/source/ray-more-libs/modin/index.rst index 04bfbcb82064..ad88c31d49c6 100644 --- a/doc/source/ray-more-libs/modin/index.rst +++ b/doc/source/ray-more-libs/modin/index.rst @@ -20,7 +20,7 @@ You can use Modin on Ray with your laptop or cluster. In this document, we show instructions for how to set up a Modin compatible Ray cluster and connect Modin to Ray. -.. note:: In previous versions of Modin, you had to initialize Ray before importing Modin. As of Modin 0.9.0, This is no longer the case. +.. note:: In previous versions of Modin, you had to initialize Ray before importing Modin. As of Modin 0.9.0, this is no longer the case. Using Modin with Ray's autoscaler --------------------------------- @@ -54,7 +54,7 @@ and operate on data with Ray. Dataframe operations '''''''''''''''''''' -The Modin Dataframe uses Ray tasks to perform data manipulations. Ray Tasks have +The Modin Dataframe uses Ray Tasks to perform data manipulations. 
Ray Tasks have a number of benefits over the actor model for data manipulation: - Multiple tasks may be manipulating the same objects simultaneously @@ -63,7 +63,7 @@ a number of benefits over the actor model for data manipulation: - As new workers come online the shuffling of data will happen as tasks are scheduled on the new node - Identical partitions need not be replicated, especially beneficial for operations - that selectively mutate the data (e.g. ``fillna``). + that selectively mutate the data (e.g., ``fillna``). - Finer grained parallelism with finer grained placement control Machine Learning @@ -71,7 +71,7 @@ Machine Learning Modin uses Ray Actors for the machine learning support it currently provides. Modin's implementation of XGBoost is able to spin up one actor for each node -and aggregate all of the partitions on that node to the XGBoost Actor. Modin +and aggregate all of the partitions on that node to the XGBoost actor. Modin is able to specify precisely the node IP for each actor on creation, giving fine-grained control over placement - a must for distributed training performance. diff --git a/doc/source/ray-more-libs/multiprocessing.rst b/doc/source/ray-more-libs/multiprocessing.rst index d57c4db54e0a..05c153c0472f 100644 --- a/doc/source/ray-more-libs/multiprocessing.rst +++ b/doc/source/ray-more-libs/multiprocessing.rst @@ -5,7 +5,7 @@ Distributed multiprocessing.Pool .. _`issue on GitHub`: https://github.com/ray-project/ray/issues -Ray supports running distributed python programs with the `multiprocessing.Pool API`_ +Ray supports running distributed Python programs with the `multiprocessing.Pool API`_ using `Ray Actors `__ instead of local processes. This makes it easy to scale existing applications that use ``multiprocessing.Pool`` from a single node to a cluster. 
diff --git a/doc/source/ray-more-libs/ray-collective.rst b/doc/source/ray-more-libs/ray-collective.rst index 46df357faa3f..c0c5d4f6afaf 100644 --- a/doc/source/ray-more-libs/ray-collective.rst +++ b/doc/source/ray-more-libs/ray-collective.rst @@ -185,19 +185,19 @@ remote actors. Refer to `APIs <#api-reference>`_ for the detailed descriptions o results = ray.get([w.compute.remote() for w in workers]) Note that for the same set of actors/task processes, multiple collective groups can be constructed, with ``group_name`` as their unique identifier. -This enables to specify complex communication patterns between different (sub)set of processes. +This enables specifying complex communication patterns between different (sub)sets of processes. Collective Communication ^^^^^^^^^^^^^^^^^^^^^^^^ Check `the support matrix <#collective-primitives-support-matrix>`_ for the current status of supported collective calls and backends. -Note that the current set of collective communication API are imperative, and exhibit the following behaviours: +Note that the current set of collective communication APIs are imperative, and exhibit the following behaviours: * All the collective APIs are synchronous blocking calls * Since each API only specifies a part of the collective communication, the API is expected to be called by each participating process of the (pre-declared) collective group. - Upon all the processes have made the call and rendezvous with each other, the collective communication happens and proceeds. + Once all the processes have made the call and rendezvoused with each other, the collective communication happens and proceeds. * The APIs are imperative and the communication happens out-of-band --- they need to be used inside the collective process (actor/task) code. 
An example of using ``ray.util.collective.allreduce`` is below: @@ -351,7 +351,7 @@ The following links provide helpful resources on how to efficiently leverage the * `More running examples `_ under ``ray.util.collective.examples``. -* `Scaling up the Spacy Name Entity Recognition (NER) pipeline `_ using Ray collective library. +* `Scaling up the spaCy Named Entity Recognition (NER) pipeline `_ using Ray collective library. * `Implementing the AllReduce strategy `_ for data-parallel distributed ML training. API References diff --git a/doc/source/ray-more-libs/raydp.rst b/doc/source/ray-more-libs/raydp.rst index 8ffcc70966d6..a227ef1f6046 100644 --- a/doc/source/ray-more-libs/raydp.rst +++ b/doc/source/ray-more-libs/raydp.rst @@ -8,7 +8,7 @@ RayDP combines your Spark and Ray clusters, making it easy to do large scale data processing using the PySpark API and seamlessly use that data to train your models using TensorFlow and PyTorch. -For more information and examples, see the RayDP Github page: +For more information and examples, see the RayDP GitHub page: https://github.com/oap-project/raydp ================ @@ -17,7 +17,7 @@ Installing RayDP RayDP can be installed from PyPI and supports PySpark 3.0 and 3.1. -.. code-block bash +.. code-block:: bash pip install raydp @@ -31,7 +31,7 @@ RayDP can be installed from PyPI and supports PySpark 3.0 and 3.1. Creating a Spark Session ======================== -To create a spark session, call ``raydp.init_spark`` +To create a Spark session, call ``raydp.init_spark`` For example, @@ -123,7 +123,7 @@ PyTorch. 
from raydp.utils import random_split train_df, test_df = random_split(df, [0.7, 0.3]) - # PyTorch Code + # PyTorch Code import torch class LinearModel(torch.nn.Module): def __init__(self): diff --git a/doc/source/ray-observability/getting-started.rst b/doc/source/ray-observability/getting-started.rst index 69ddbe76e7cb..ba66257146e9 100644 --- a/doc/source/ray-observability/getting-started.rst +++ b/doc/source/ray-observability/getting-started.rst @@ -130,7 +130,7 @@ Task Timeline First, download the chrome tracing file by clicking the download button. Alternatively, you can :ref:`use CLI or SDK to export the tracing file `. -Second, use tools like ``chrome://tracing`` or the `Perfetto UI `_ and drop the downloaded chrome tracing file. We will use the Perfetto as it is the recommendation way to visualize chrome tracing files. +Second, use tools like ``chrome://tracing`` or the `Perfetto UI `_ and drop the downloaded chrome tracing file. We will use Perfetto as it is the recommended way to visualize chrome tracing files. In the timeline visualization of Ray Tasks and Actors, there are Node rows (hardware) and Worker rows (processes). Each Worker rows display a list of Task events (e.g., Task scheduled, Task running, input/output deserialization, etc.) happening from that Worker over time. @@ -311,7 +311,7 @@ Additionally, users can see a snapshot of hardware utilization from the :ref:`Cl View the resource utilization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Ray requires users to specify the number of :ref:`resources ` their Tasks and Actors to use through arguments such as ``num_cpus``, ``num_gpus``, ``memory``, and ``resource``. +Ray requires users to specify the number of :ref:`resources ` their Tasks and Actors use through arguments such as ``num_cpus``, ``num_gpus``, ``memory``, and ``resource``. These values are used for scheduling, but may not always match the actual resource utilization (physical resource utilization). 
- See the logical and physical resource utilization over time from the :ref:`Metrics view `. diff --git a/doc/source/ray-observability/images/post-moretem.gif b/doc/source/ray-observability/images/post-mortem.gif similarity index 100% rename from doc/source/ray-observability/images/post-moretem.gif rename to doc/source/ray-observability/images/post-mortem.gif diff --git a/doc/source/ray-observability/images/ray-event-export.png b/doc/source/ray-observability/images/ray-event-export.png new file mode 100644 index 000000000000..1e81ff197439 Binary files /dev/null and b/doc/source/ray-observability/images/ray-event-export.png differ diff --git a/doc/source/ray-observability/key-concepts.rst b/doc/source/ray-observability/key-concepts.rst index 00723f3056aa..0273a0132791 100644 --- a/doc/source/ray-observability/key-concepts.rst +++ b/doc/source/ray-observability/key-concepts.rst @@ -73,7 +73,7 @@ View :ref:`Ray Debugger ` for more details. Profiling --------- -Profiling is way of analyzing the performance of an application by sampling the resource usage of it. Ray supports various profiling tools: +Profiling is a way of analyzing the performance of an application by sampling the resource usage of it. Ray supports various profiling tools: - CPU profiling for Driver and Worker processes, including integration with :ref:`py-spy ` and :ref:`cProfile ` - Memory profiling for Driver and Worker processes with :ref:`memray ` diff --git a/doc/source/ray-observability/ray-distributed-debugger.rst b/doc/source/ray-observability/ray-distributed-debugger.rst index ca1a545deb7b..461fd7db91a1 100644 --- a/doc/source/ray-observability/ray-distributed-debugger.rst +++ b/doc/source/ray-observability/ray-distributed-debugger.rst @@ -199,7 +199,7 @@ When the app throws an exception: - The paused task is listed in the Ray Debugger extension. - Click the play icon next to the name of the paused task to attach the debugger and start debugging. -.. image:: ./images/post-moretem.gif +.. 
image:: ./images/post-mortem.gif :align: center diff --git a/doc/source/ray-observability/reference/system-metrics.rst b/doc/source/ray-observability/reference/system-metrics.rst index 7cacfe249df5..b7ba9d889f27 100644 --- a/doc/source/ray-observability/reference/system-metrics.rst +++ b/doc/source/ray-observability/reference/system-metrics.rst @@ -6,7 +6,7 @@ Ray exports a number of system metrics, which provide introspection into the sta .. note:: - Certain labels are common across all metrics, such as `SessionName` (uniquely identifies a Ray cluster instance), `instance` (per-node label applied by Prometheus, and `JobId` (Ray job id, as applicable). + Certain labels are common across all metrics, such as `SessionName` (uniquely identifies a Ray cluster instance), `instance` (per-node label applied by Prometheus), and `JobId` (Ray job ID, as applicable). .. list-table:: Ray System Metrics :header-rows: 1 @@ -22,7 +22,7 @@ Ray exports a number of system metrics, which provide introspection into the sta - Current number of actors in a particular state. The State label is described by `rpc::ActorTableData `_ proto in gcs.proto. The actor class name is available in the Name label. * - `ray_resources` - `Name`, `State`, `InstanceId` - - Logical resource usage for each node of the cluster. Each resource has some quantity that is `in either `_ USED state vs AVAILABLE state. The Name label defines the resource name (e.g., CPU, GPU). + - Logical resource usage for each node of the cluster. Each resource has some quantity that is in either `USED or AVAILABLE state `_. The Name label defines the resource name (e.g., CPU, GPU). * - `ray_object_store_memory` - `Location`, `ObjectState`, `InstanceId` - Object store memory usage in bytes, `broken down `_ by logical Location (SPILLED, MMAP_DISK, MMAP_SHM, and WORKER_HEAP). Definitions are as follows. SPILLED--Objects that have spilled to disk or a remote Storage solution (for example, AWS S3). The default is the disk. 
MMAP_DISK--Objects stored on a memory-mapped page on disk. This mode very slow and only happens under severe memory pressure. MMAP_SHM--Objects store on a memory-mapped page in Shared Memory. This mode is the default, in the absence of memory pressure. WORKER_HEAP--Objects, usually smaller, stored in the memory of the Ray Worker process itself. Small objects are stored in the worker heap. diff --git a/doc/source/ray-observability/user-guides/cli-sdk.rst b/doc/source/ray-observability/user-guides/cli-sdk.rst index 11d34759fa81..5be71ed9f913 100644 --- a/doc/source/ray-observability/user-guides/cli-sdk.rst +++ b/doc/source/ray-observability/user-guides/cli-sdk.rst @@ -317,8 +317,11 @@ you can use ``list`` or ``get`` APIs to get more details for an individual abnor .. note:: By default, objects are summarized by callsite. However, callsite is not recorded by Ray by default. - To get callsite info, set env variable `RAY_record_ref_creation_sites=1` when starting the Ray Cluster - RAY_record_ref_creation_sites=1 ray start --head + To get callsite info, set env variable `RAY_record_ref_creation_sites=1` when starting the Ray cluster: + + .. code-block:: bash + + RAY_record_ref_creation_sites=1 ray start --head .. tab-set:: diff --git a/doc/source/ray-observability/user-guides/configure-logging.md b/doc/source/ray-observability/user-guides/configure-logging.md index 0d932be05fd3..d74daf7c442c 100644 --- a/doc/source/ray-observability/user-guides/configure-logging.md +++ b/doc/source/ray-observability/user-guides/configure-logging.md @@ -594,4 +594,4 @@ The max size of a log file, including its backup, is `RAY_ROTATION_MAX_BYTES * R ## Log persistence -To process and export logs to external storage or management systems, view {ref}`log persistence on Kubernetes ` see {ref}`log persistence on VMs ` for more details. 
+To process and export logs to external storage or management systems, see {ref}`log persistence on Kubernetes ` and {ref}`log persistence on VMs ` for more details. diff --git a/doc/source/ray-observability/user-guides/debug-apps/debug-failures.rst b/doc/source/ray-observability/user-guides/debug-apps/debug-failures.rst index b1b12d46b0e6..6ac6a695a26a 100644 --- a/doc/source/ray-observability/user-guides/debug-apps/debug-failures.rst +++ b/doc/source/ray-observability/user-guides/debug-apps/debug-failures.rst @@ -61,8 +61,8 @@ Many Python developers use a debugger to debug Python programs, and `Python pdb Ray has native integration to ``pdb``. You can simply add ``breakpoint()`` to Actors and Tasks code to enable ``pdb``. View :ref:`Ray Debugger ` for more details. -Running out of file descriptors (``Too may open files``) --------------------------------------------------------- +Running out of file descriptors (``Too many open files``) +--------------------------------------------------------- In a Ray cluster, arbitrary two system components can communicate with each other and make 1 or more connections. For example, some workers may need to communicate with GCS to schedule Actors (worker <-> GCS connection). @@ -76,7 +76,7 @@ more than 1024 connections to the component, it can raise error messages below. .. code-block:: bash - Too may open files + Too many open files It is especially common for the head node GCS process because it is a centralized component that many other components in Ray communicate with. When you see this error message, @@ -119,4 +119,4 @@ View :ref:`debugging memory issues ` for more details. This document discusses some common problems that people run into when using Ray as well as some known problems. If you encounter other problems, `let us know`_. -.. _`let us know`: https://github.com/ray-project/ray/issues \ No newline at end of file +.. 
_`let us know`: https://github.com/ray-project/ray/issues diff --git a/doc/source/ray-observability/user-guides/index.md b/doc/source/ray-observability/user-guides/index.md index d50e4b8b0ae8..a39788f1c569 100644 --- a/doc/source/ray-observability/user-guides/index.md +++ b/doc/source/ray-observability/user-guides/index.md @@ -11,6 +11,7 @@ configure-logging profiling add-app-metrics ray-tracing +ray-event-export ``` These guides help you monitor and debug your Ray applications and clusters. @@ -21,3 +22,4 @@ The guides include: * {ref}`configure-logging` * {ref}`application-level-metrics` * {ref}`ray-tracing` +* {ref}`ray-event-export` diff --git a/doc/source/ray-observability/user-guides/profiling.md b/doc/source/ray-observability/user-guides/profiling.md index 6068a97fa1f3..a5814e309163 100644 --- a/doc/source/ray-observability/user-guides/profiling.md +++ b/doc/source/ray-observability/user-guides/profiling.md @@ -102,12 +102,12 @@ ray.init() @ray.remote(num_gpus=1, runtime_env={ "nsight": "default"}) class RayActor: - def run(): - a = torch.tensor([1.0, 2.0, 3.0]).cuda() - b = torch.tensor([4.0, 5.0, 6.0]).cuda() - c = a * b + def run(self): + a = torch.tensor([1.0, 2.0, 3.0]).cuda() + b = torch.tensor([4.0, 5.0, 6.0]).cuda() + c = a * b - print("Result on GPU:", c) + print("Result on GPU:", c) ray_actor = RayActor.remote() # The Actor or Task process runs with : "nsys profile [default options] ..." 
@@ -135,12 +135,12 @@ runtime_env={ "nsight": { "cuda-graph-trace": "graph", }}) class RayActor: - def run(): - a = torch.tensor([1.0, 2.0, 3.0]).cuda() - b = torch.tensor([4.0, 5.0, 6.0]).cuda() - c = a * b + def run(self): + a = torch.tensor([1.0, 2.0, 3.0]).cuda() + b = torch.tensor([4.0, 5.0, 6.0]).cuda() + c = a * b - print("Result on GPU:", c) + print("Result on GPU:", c) ray_actor = RayActor.remote() diff --git a/doc/source/ray-observability/user-guides/ray-event-export.rst b/doc/source/ray-observability/user-guides/ray-event-export.rst new file mode 100644 index 000000000000..89ef27bcc46c --- /dev/null +++ b/doc/source/ray-observability/user-guides/ray-event-export.rst @@ -0,0 +1,134 @@ +.. _ray-event-export: + +Ray Event Export +================ + +Starting from 2.49, Ray supports exporting structured events to a configured HTTP +endpoint. Each node sends events to the endpoint through an HTTP POST request. + +Ray 2.49 supports exporting task events. Future releases include support for other +event types, such as actor events, node events, job events, and more. + +Previously, Ray's :ref:`task events ` were only used internally by the Ray Dashboard +and :ref:`State API ` for monitoring and debugging. With the new event +export feature, you can now send these raw events to external systems for custom analytics, +monitoring, and integration with third-party tools. + +.. note:: + Ray Event Export is still in alpha. The way to configure event + reporting and the format of the events is subject to change. + +Enable event reporting +---------------------- +To enable event reporting, you need to set the ``RAY_enable_core_worker_ray_event_to_aggregator`` environment +variable to ``1`` when starting each Ray worker node. + +To set the target HTTP endpoint, set the ``RAY_DASHBOARD_AGGREGATOR_AGENT_EVENTS_EXPORT_ADDR`` +environment variable to a valid HTTP URL with the ``http://`` URL scheme. 
+ +Event format +------------ + +Events are JSON objects in the POST request body. + +All events contain the same base fields and different event specific fields. +See `src/ray/protobuf/public/events_base_event.proto `_ for the base fields. + +Task events +^^^^^^^^^^^ + +For each task, Ray exports two types of events: Task Definition Event and Task Execution Event. + +* Each task attempt generates one Task Definition Event which contains the metadata of the task. + See `src/ray/protobuf/public/events_task_definition_event.proto `_ + and `src/ray/protobuf/public/events_actor_task_definition_event.proto `_ for the event formats for normal tasks + and actor tasks respectively. +* Task Execution Events contain task state transition information and metadata + generated during task execution. + See `src/ray/protobuf/public/events_task_execution_event.proto `_ for the event format. + +An example of a Task Definition Event and a Task Execution Event: + +.. code-block:: json + + // task definition event + { + "eventId":"N5n229xkwyjlZRFJDF2G1sh6ZNYlqChwJ4WPEQ==", + "sourceType":"CORE_WORKER", + "eventType":"TASK_DEFINITION_EVENT", + "timestamp":"2025-09-03T18:52:14.467290Z", + "severity":"INFO", + "sessionName":"session_2025-09-03_11-52-12_635210_85618", + "taskDefinitionEvent":{ + "taskId":"yO9FzNARJXH///////////////8BAAAA", + "taskFunc":{ + "pythonFunctionDescriptor":{ + "moduleName":"test-tasks", + "functionName":"test_task", + "functionHash":"37ddb110c0514b049bd4db5ab934627b", + "className":"" + } + }, + "taskName":"test_task", + "requiredResources":{ + "CPU":1.0 + }, + "runtimeEnvInfo":{ + "serializedRuntimeEnv":"{}", + "runtimeEnvConfig":{ + "setupTimeoutSeconds":600, + "eagerInstall":true, + "logFiles":[ + + ] + } + }, + "jobId":"AQAAAA==", + "parentTaskId":"//////////////////////////8BAAAA", + "placementGroupId":"////////////////////////", + "taskAttempt":0, + "taskType":"NORMAL_TASK", + "language":"PYTHON", + "refIds":{ + + } + }, + "message":"" + } + + // task 
execution event + { + "eventId":"vkIaAHlQC5KoppGosqs2kBq5k2WzsAAbawDDbQ==", + "sourceType":"CORE_WORKER", + "eventType":"TASK_EXECUTION_EVENT", + "timestamp":"2025-09-03T18:52:14.469074Z", + "severity":"INFO", + "sessionName":"session_2025-09-03_11-52-12_635210_85618", + "taskExecutionEvent":{ + "taskId":"yO9FzNARJXH///////////////8BAAAA", + "taskState":{ + // key is the integer value of TaskStatus enum in common.proto at + // https://github.com/ray-project/ray/blob/master/src/ray/protobuf/common.proto + "2":"2025-09-03T18:52:14.467402Z", // PENDING_NODE_ASSIGNMENT + "1":"2025-09-03T18:52:14.467290Z", // PENDING_ARGS_AVAIL + "5":"2025-09-03T18:52:14.469074Z" // SUBMITTED_TO_WORKER + }, + "nodeId":"ZvxTI6x9dlMFqMlIHErJpg5UEGK1INsKhW2zyg==", + "workerId":"hMybCNYIFi+/yInYYhdc+qH8yMF65j/8+uCTmw==", + "jobId":"AQAAAA==", + "taskAttempt":0, + "workerPid":0 + }, + "message":"" + } + +High-level Architecture +----------------------- + +The following diagram shows the high-level architecture of Ray Event Export. + +.. image:: ../images/ray-event-export.png + +All Ray components send events to an aggregator agent through gRPC. There is an aggregator +agent on each node. The aggregator agent collects all events on that node and sends the +events to the configured HTTP endpoint. 
\ No newline at end of file diff --git a/doc/source/ray-overview/examples/e2e-audio/requirements.txt b/doc/source/ray-overview/examples/e2e-audio/requirements.txt index 4f2606eeaa9e..0c47ff4c5f00 100644 --- a/doc/source/ray-overview/examples/e2e-audio/requirements.txt +++ b/doc/source/ray-overview/examples/e2e-audio/requirements.txt @@ -1,7 +1,7 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --constraint=python/requirements_compiled_rayllm_py311_cu128.txt --no-annotate --no-emit-index-url --no-emit-trusted-host --output-file=requirements.txt --strip-extras doc/source/ray-overview/examples/e2e-audio/requirements.in +# pip-compile --constraint=python/deplocks/llm/rayllm_py311_cu128.lock --no-annotate --no-emit-index-url --no-emit-trusted-host --output-file=requirements.txt --strip-extras doc/source/ray-overview/examples/e2e-audio/requirements.in # # ... and then slimmed down by Ricardo accelerate==1.7.0 diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/.anyscaleignore b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/.anyscaleignore new file mode 100644 index 000000000000..30c59efc983b --- /dev/null +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/.anyscaleignore @@ -0,0 +1,18 @@ +# This file is used to exclude files from Anyscale Workspaces snapshots. +# Use this to prevent large or unnecessary files from being included in your snapshots, +# which helps reduce snapshot size and creation time. See documentation for more details: +# https://docs.anyscale.com/platform/workspaces/workspaces-files/#excluding-files-with-anyscaleignore +# +# Syntax examples: +# *.txt # Ignore files with a .txt extension at the same level as `.anyscaleignore`. +# **/*.txt # Ignore files with a .txt extension in ANY directory. +# folder/ # Ignore all files under "folder/". The slash at the end is optional. +# folder/*.txt # Ignore files with a .txt extension under "folder/". 
+# path/to/filename.py # Ignore a specific file by providing its relative path. +# file_[1,2].txt # Ignore file_1.txt and file_2.txt. + +# Exclude Python virtual environments (.venv/) from snapshots. Virtual environments contain +# all installed Python dependencies, which can be multiple gigabytes in size. These directories +# are typically recreatable from requirements files and don't need to be included in snapshots. +# The ** pattern ensures all .venv directories are excluded regardless of location in your project. +**/.venv/ diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/.gitignore b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/.gitignore new file mode 100644 index 000000000000..a9fffb1a10c6 --- /dev/null +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/.gitignore @@ -0,0 +1,103 @@ +# VSCode +.vscode/ +.idea + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Flask: +instance/ +.webassets-cache + +# Scrapy: +.scrapy + +# Sphinx +docs/_build/ + +# PyBuilder +target/ + +# IPython +.ipynb_checkpoints +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# PEP 582 +__pypackages__/ + +# Celery +celerybeat-schedule +celerybeat.pid + +# Environment +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# mkdocs +site/ + +# Airflow +airflow/airflow.db + +# MacOS +.DS_Store + +# Clean up +.trash/ \ No newline at end of file diff --git 
a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/README.ipynb b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/README.ipynb index 2559cc045ccd..dbf5eeb84d69 100644 --- a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/README.ipynb +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/README.ipynb @@ -18,7 +18,7 @@ "
\n", "\n", "💻 Run this entire tutorial on [Anyscale](https://www.anyscale.com/) for free:\n", - "**https://console.anyscale.com/template-preview/image-search-and-classification**\n", + "**https://console.anyscale.com/template-preview/image-search-and-classification** or access the repository [here](https://github.com/ray-project/ray/tree/master/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads).\n", "\n", "This tutorial focuses on the fundamental challenges of multimodal AI workloads at scale:\n", "\n", @@ -42,6 +42,7 @@ "- **Development tools**: Spin up a remote session from your local IDE (Cursor, VS Code, etc.) and start coding, using the same tools you love but with the power of Anyscale's compute.\n", "- **Dependencies**: Install dependencies using familiar tools like pip or uv. Anyscale propagates all dependencies to the cluster's worker nodes.\n", "- **Compute**: Leverage any reserved instance capacity, spot instance from any compute provider of your choice by deploying Anyscale into your account. Alternatively, you can use the Anyscale cloud for a full serverless experience.\n", + " - Under the hood, a cluster spins up and is efficiently managed by Anyscale.\n", "- **Debugging**: Leverage a [distributed debugger](https://docs.anyscale.com/platform/workspaces/workspaces-debugging/#distributed-debugger) to get the same VS Code-like debugging experience.\n", "\n", "Learn more about Anyscale Workspaces in the [official documentation](https://docs.anyscale.com/platform/workspaces/).\n", @@ -50,11 +51,40 @@ " \n", "\n", "\n", + "### Additional dependencies\n", + "\n", + "You can choose to manage the additional dependencies through `uv` or `pip`. \n", + "\n", + "#### uv\n", + "\n", + "```bash\n", + "# UV setup instructions\n", + "uv init . 
# this creates pyproject.toml, uv lockfile, etc.\n", + "ray_wheel_url=http://localhost:9478/ray/$(pip freeze | grep -oP '^ray @ file:///home/ray/\\.whl/\\K.*')\n", + "uv add \"$ray_wheel_url[data, train, tune, serve]\" # to use anyscale's performant ray runtime\n", + "uv add $(grep -v '^\\s*#' requirements.txt)\n", + "uv add --editable ./doggos\n", + "```\n", + "\n", + "#### Pip\n", + "\n", + "```bash\n", + "# Pip setup instructions\n", + "pip install -q -r /home/ray/default/requirements.txt\n", + "pip install -e ./doggos\n", + "```\n", + "\n", "**Note**: Run the entire tutorial for free on [Anyscale](https://console.anyscale.com/)—all dependencies come pre-installed, and compute autoscales automatically. To run it elsewhere, install the dependencies from the [`containerfile`](https://github.com/anyscale/multimodal-ai/tree/main/containerfile) and provision the appropriate GPU resources.\n", "\n", "## Production\n", "Seamlessly integrate with your existing CI/CD pipelines by leveraging the Anyscale [CLI](https://docs.anyscale.com/reference/quickstart-cli) or [SDK](https://docs.anyscale.com/reference/quickstart-sdk) to deploy [highly available services](https://docs.anyscale.com/platform/services) and run [reliable batch jobs](https://docs.anyscale.com/platform/jobs). Developing in an environment nearly identical to production—a multi-node cluster—drastically accelerates the dev-to-prod transition. This tutorial also introduces proprietary RayTurbo features that optimize workloads for performance, fault tolerance, scale, and observability.\n", "\n", + "```bash\n", + "anyscale job submit -f /home/ray/default/configs/generate_embeddings.yaml\n", + "anyscale job submit -f /home/ray/default/configs/train_model.yaml\n", + "anyscale service deploy -f /home/ray/default/configs/service.yaml\n", + "```\n", + "\n", "## No infrastructure headaches\n", "Abstract away infrastructure from your ML/AI developers so they can focus on their core ML development. 
You can additionally better manage compute resources and costs with [enterprise governance and observability](https://www.anyscale.com/blog/enterprise-governance-observability) and [admin capabilities](https://docs.anyscale.com/administration/overview) so you can set [resource quotas](https://docs.anyscale.com/reference/resource-quotas/), set [priorities for different workloads](https://docs.anyscale.com/administration/cloud-deployment/global-resource-scheduler) and gain [observability of your utilization across your entire compute fleet](https://docs.anyscale.com/administration/resource-management/telescope-dashboard).\n", "Users running on a Kubernetes cloud (EKS, GKE, etc.) can still access the proprietary RayTurbo optimizations demonstrated in this tutorial by deploying the [Anyscale Kubernetes Operator](https://docs.anyscale.com/administration/cloud-deployment/kubernetes/)." diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/README.md b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/README.md index 60b2e693cafa..ad77132b8cf7 100644 --- a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/README.md +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/README.md @@ -14,7 +14,7 @@ notebooks/03-Online-Serving 💻 Run this entire tutorial on [Anyscale](https://www.anyscale.com/) for free: -**https://console.anyscale.com/template-preview/image-search-and-classification** +**https://console.anyscale.com/template-preview/image-search-and-classification** or access the repository [here](https://github.com/ray-project/ray/tree/master/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads). This tutorial focuses on the fundamental challenges of multimodal AI workloads at scale: @@ -47,11 +47,40 @@ Learn more about Anyscale Workspaces in the [official documentation](https://doc +### Additional dependencies + +You can choose to manage the additional dependencies through `uv` or `pip`. 
+ +#### uv + +```bash +# UV setup instructions +uv init . # this creates pyproject.toml, uv lockfile, etc. +ray_wheel_url=http://localhost:9478/ray/$(pip freeze | grep -oP '^ray @ file:///home/ray/\.whl/\K.*') +uv add "$ray_wheel_url[data, train, tune, serve]" # to use anyscale's performant ray runtime +uv add $(grep -v '^\s*#' requirements.txt) +uv add --editable ./doggos +``` + +#### Pip + +```bash +# Pip setup instructions +pip install -q -r /home/ray/default/requirements.txt +pip install -e ./doggos +``` + **Note**: Run the entire tutorial for free on [Anyscale](https://console.anyscale.com/)—all dependencies come pre-installed, and compute autoscales automatically. To run it elsewhere, install the dependencies from the [`containerfile`](https://github.com/anyscale/multimodal-ai/tree/main/containerfile) and provision the appropriate GPU resources. ## Production Seamlessly integrate with your existing CI/CD pipelines by leveraging the Anyscale [CLI](https://docs.anyscale.com/reference/quickstart-cli) or [SDK](https://docs.anyscale.com/reference/quickstart-sdk) to deploy [highly available services](https://docs.anyscale.com/platform/services) and run [reliable batch jobs](https://docs.anyscale.com/platform/jobs). Developing in an environment nearly identical to production—a multi-node cluster—drastically accelerates the dev-to-prod transition. This tutorial also introduces proprietary RayTurbo features that optimize workloads for performance, fault tolerance, scale, and observability. +```bash +anyscale job submit -f /home/ray/default/configs/generate_embeddings.yaml +anyscale job submit -f /home/ray/default/configs/train_model.yaml +anyscale service deploy -f /home/ray/default/configs/service.yaml +``` + ## No infrastructure headaches Abstract away infrastructure from your ML/AI developers so they can focus on their core ML development. 
You can additionally better manage compute resources and costs with [enterprise governance and observability](https://www.anyscale.com/blog/enterprise-governance-observability) and [admin capabilities](https://docs.anyscale.com/administration/overview) so you can set [resource quotas](https://docs.anyscale.com/reference/resource-quotas/), set [priorities for different workloads](https://docs.anyscale.com/administration/cloud-deployment/global-resource-scheduler) and gain [observability of your utilization across your entire compute fleet](https://docs.anyscale.com/administration/resource-management/telescope-dashboard). diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/ci/build.sh b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/ci/build.sh index bda0ef917f47..05ff13248bd7 100755 --- a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/ci/build.sh +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/ci/build.sh @@ -5,7 +5,7 @@ set -exo pipefail # Install Python dependencies pip3 install --no-cache-dir \ "matplotlib==3.10.0" \ - "torch==2.7.0" \ + "torch==2.7.1" \ "transformers==4.52.3" \ "scikit-learn==1.6.0" \ "mlflow==2.19.0" \ diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/configs/generate_embeddings.yaml b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/configs/generate_embeddings.yaml new file mode 100644 index 000000000000..fcb9020fe206 --- /dev/null +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/configs/generate_embeddings.yaml @@ -0,0 +1,71 @@ +# View the docs https://docs.anyscale.com/reference/job-api#jobconfig. + +name: image-batch-embeddings + +# When empty, use the default image. 
This can be an Anyscale-provided base image +# like anyscale/ray:2.43.0-slim-py312-cu125, a user-provided base image (provided +# that it meets certain specs), or you can build new images using the Anyscale +# image builder at https://console.anyscale-staging.com/v2/container-images. +image_uri: anyscale/ray:2.48.0-slim-py312-cu128 +# containerfile: /home/ray/default/containerfile + +# When empty, Anyscale will auto-select the instance types. You can also specify +# minimum and maximum resources. +compute_config: +# head_node: +# instance_type: m5.2xlarge +# worker_nodes: +# - instance_type: m5.16xlarge +# min_nodes: 0 +# max_nodes: 100 +# - instance_type: m7a.24xlarge +# min_nodes: 0 +# max_nodes: 100 +# market_type: PREFER_SPOT # Defaults to ON_DEMAND +# - instance_type: g4dn.2xlarge +# min_nodes: 0 +# max_nodes: 100 +# market_type: PREFER_SPOT # Defaults to ON_DEMAND +# min_resources: +# CPU: 100 +# GPU: 1 +# max_resources: +# CPU: 5000 +# GPU: 100 + +# Path to a local directory or a remote URI to a .zip file (S3, GS, HTTP) that +# will be the working directory for the job. The files in the directory will be +# automatically uploaded to the job environment in Anyscale. +working_dir: /home/ray/default +excludes: # (Optional) List of files to exclude from being packaged up for the job. + - .git + - .env + - .venv + - '**/*.egg-info/**' + - '**/.DS_Store/**' + - '**/__pycache__/**' + +requirements: # (Optional) List of requirements files to install. Can also be a path to a requirements.txt. + - ipywidgets==8.1.3 + - matplotlib==3.10.0 + - mlflow==2.19.0 + - torch==2.7.1 + - transformers==4.52.3 + - scikit-learn==1.6.0 +env_vars: # (Optional) Dictionary of environment variables to set in the job. + # MY_ENV_VAR: my_value + # ANOTHER_ENV_VAR: another_value +py_modules: # (Optional) A list of local directories or remote URIs that will be added to the Python path. 
+ - /home/ray/default/doggos + # - /path/to/my_module + # - s3://my_bucket/my_module + +# When empty, this uses the default Anyscale Cloud in your organization. +cloud: + +# The script to run in your job. You can also do "uv run main.py" if you have a +# pyproject.toml file in your working_dir. +entrypoint: python doggos/doggos/embed.py # uv run doggos/doggos/embed.py # remove the requirements and py_modules + +# If there is an error, do not retry. +max_retries: 0 \ No newline at end of file diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/configs/service.yaml b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/configs/service.yaml new file mode 100644 index 000000000000..c3b1d2abd4cb --- /dev/null +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/configs/service.yaml @@ -0,0 +1,69 @@ +# View the docs https://docs.anyscale.com/reference/service-api#serviceconfig. + +name: doggos-app + +# When empty, use the default image. This can be an Anyscale-provided base image +# like anyscale/ray:2.43.0-slim-py312-cu125, a user-provided base image (provided +# that it meets certain specs), or you can build new images using the Anyscale +# image builder at https://console.anyscale-staging.com/v2/container-images. +image_uri: anyscale/ray:2.48.0-slim-py312-cu128 +# containerfile: /home/ray/default/containerfile + +# When empty, Anyscale will auto-select the instance types. You can also specify +# minimum and maximum resources. 
+compute_config: +# head_node: +# instance_type: m5.2xlarge +# worker_nodes: +# - instance_type: m5.16xlarge +# min_nodes: 0 +# max_nodes: 100 +# - instance_type: m7a.24xlarge +# min_nodes: 0 +# max_nodes: 100 +# market_type: PREFER_SPOT # Defaults to ON_DEMAND +# - instance_type: g4dn.2xlarge +# min_nodes: 0 +# max_nodes: 100 +# market_type: PREFER_SPOT # Defaults to ON_DEMAND +# min_resources: +# CPU: 100 +# GPU: 1 +# max_resources: +# CPU: 5000 +# GPU: 100 + +# Path to a local directory or a remote URI to a .zip file (S3, GS, HTTP) that +# will be the working directory for the job. The files in the directory will be +# automatically uploaded to the job environment in Anyscale. +working_dir: /home/ray/default +excludes: # (Optional) List of files to exclude from being packaged up for the job. + - .git + - .env + - .venv + - '**/*.egg-info/**' + - '**/.DS_Store/**' + - '**/__pycache__/**' + +requirements: # (Optional) List of requirements files to install. Can also be a path to a requirements.txt. + - ipywidgets==8.1.3 + - matplotlib==3.10.0 + - mlflow==2.19.0 + - torch==2.7.1 + - transformers==4.52.3 + - scikit-learn==1.6.0 +env_vars: # (Optional) Dictionary of environment variables to set in the job. + # MY_ENV_VAR: my_value + # ANOTHER_ENV_VAR: another_value +py_modules: # (Optional) A list of local directories or remote URIs that will be added to the Python path. + - /home/ray/default/doggos + # - /path/to/my_module + # - s3://my_bucket/my_module +py_executable: python # uv run # remove the requirements and py_modules + +# When empty, this uses the default Anyscale Cloud in your organization. +cloud: + +# Specify the Ray Serve app to deploy. 
+applications: +- import_path: doggos.serve:app \ No newline at end of file diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/configs/train_model.yaml b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/configs/train_model.yaml new file mode 100644 index 000000000000..28305413a642 --- /dev/null +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/configs/train_model.yaml @@ -0,0 +1,71 @@ +# View the docs https://docs.anyscale.com/reference/job-api#jobconfig. + +name: train-image-model + +# When empty, use the default image. This can be an Anyscale-provided base image +# like anyscale/ray:2.43.0-slim-py312-cu125, a user-provided base image (provided +# that it meets certain specs), or you can build new images using the Anyscale +# image builder at https://console.anyscale-staging.com/v2/container-images. +image_uri: anyscale/ray:2.48.0-slim-py312-cu128 +# containerfile: /home/ray/default/containerfile + +# When empty, Anyscale will auto-select the instance types. You can also specify +# minimum and maximum resources. +compute_config: +# head_node: +# instance_type: m5.2xlarge +# worker_nodes: +# - instance_type: m5.16xlarge +# min_nodes: 0 +# max_nodes: 100 +# - instance_type: m7a.24xlarge +# min_nodes: 0 +# max_nodes: 100 +# market_type: PREFER_SPOT # Defaults to ON_DEMAND +# - instance_type: g4dn.2xlarge +# min_nodes: 0 +# max_nodes: 100 +# market_type: PREFER_SPOT # Defaults to ON_DEMAND +# min_resources: +# CPU: 100 +# GPU: 1 +# max_resources: +# CPU: 5000 +# GPU: 100 + +# Path to a local directory or a remote URI to a .zip file (S3, GS, HTTP) that +# will be the working directory for the job. The files in the directory will be +# automatically uploaded to the job environment in Anyscale. +working_dir: /home/ray/default +excludes: # (Optional) List of files to exclude from being packaged up for the job. 
+ - .git + - .env + - .venv + - '**/*.egg-info/**' + - '**/.DS_Store/**' + - '**/__pycache__/**' + +requirements: # (Optional) List of requirements files to install. Can also be a path to a requirements.txt. + - ipywidgets==8.1.3 + - matplotlib==3.10.0 + - mlflow==2.19.0 + - torch==2.7.1 + - transformers==4.52.3 + - scikit-learn==1.6.0 +env_vars: # (Optional) Dictionary of environment variables to set in the job. + # MY_ENV_VAR: my_value + # ANOTHER_ENV_VAR: another_value +py_modules: # (Optional) A list of local directories or remote URIs that will be added to the Python path. + - /home/ray/default/doggos + # - /path/to/my_module + # - s3://my_bucket/my_module + +# When empty, this uses the default Anyscale Cloud in your organization. +cloud: + +# The script to run in your job. You can also do "uv run main.py" if you have a +# pyproject.toml file in your working_dir. +entrypoint: python doggos/doggos/train.py # uv run doggos/doggos/train.py # remove the requirements and py_modules + +# If there is an error, do not retry. +max_retries: 0 \ No newline at end of file diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/containerfile b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/containerfile index 43b34335edda..a9bc32fc2faa 100644 --- a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/containerfile +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/containerfile @@ -1,6 +1,6 @@ # Start with an Anyscale base image. # Use the drop-down above to browse through all available images. -FROM anyscale/ray:2.47.0-slim-py312-cu128 +FROM anyscale/ray:2.49.0-slim-py312-cu128 # Add your pip dependencies here. Disable cache for a smaller image to optimize build and cluster startup time. # RUN pip install --no-cache-dir --upgrade @@ -13,5 +13,5 @@ FROM anyscale/ray:2.47.0-slim-py312-cu128 # Add other build commands here. # RUN echo "Testing Ray import..." 
&& python -c "import ray" RUN python3 -m pip install --no-cache-dir \ - "matplotlib==3.10.0" "torch==2.7.0" "transformers==4.52.3" \ + "matplotlib==3.10.0" "torch==2.7.1" "transformers==4.52.3" \ "scikit-learn==1.6.0" "mlflow==2.19.0" "ipywidgets==8.1.3" diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/__init__.py b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/__init__.py similarity index 100% rename from doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/__init__.py rename to doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/__init__.py diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/data.py b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/data.py similarity index 83% rename from doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/data.py rename to doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/data.py index 81fd2cbe3b3a..fd672fccb9e8 100644 --- a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/data.py +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/data.py @@ -29,12 +29,15 @@ def transform(self, ds): ) ds = ds.map_batches( EmbedImages, - fn_constructor_kwargs={"model_id": "openai/clip-vit-base-patch32"}, - fn_kwargs={"device": "cuda"}, + fn_constructor_kwargs={ + "model_id": "openai/clip-vit-base-patch32", + "device": "cuda", + }, # class kwargs + fn_kwargs={}, concurrency=4, batch_size=64, num_gpus=1, - accelerator_type="L4", + accelerator_type="T4", ) ds = ds.drop_columns(["image"]) return ds diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/embed.py b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/embed.py similarity index 94% rename from doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/embed.py rename to 
doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/embed.py index 6979139177c2..88a3680dbb4d 100644 --- a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/embed.py +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/embed.py @@ -99,12 +99,15 @@ def display_top_matches(url, matches): # Batch embedding generation embeddings_ds = ds.map_batches( EmbedImages, - fn_constructor_kwargs={"model_id": "openai/clip-vit-base-patch32"}, - fn_kwargs={"device": "cuda"}, + fn_constructor_kwargs={ + "model_id": "openai/clip-vit-base-patch32", + "device": "cuda", + }, # class kwargs + fn_kwargs={}, concurrency=4, batch_size=64, num_gpus=1, - accelerator_type="L4", + accelerator_type="T4", ) embeddings_ds = embeddings_ds.drop_columns(["image"]) # remove image column diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/infer.py b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/infer.py similarity index 100% rename from doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/infer.py rename to doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/infer.py diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/model.py b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/model.py similarity index 100% rename from doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/model.py rename to doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/model.py diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/serve.py b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/serve.py similarity index 82% rename from doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/serve.py rename to doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/serve.py index 
52b85f02defe..da8ff6a1b778 100644 --- a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/serve.py +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/serve.py @@ -13,26 +13,18 @@ from ray import serve -# Define app -api = FastAPI( - title="doggos", - description="classify your dog", - version="0.1", -) - @serve.deployment( num_replicas="1", ray_actor_options={ - "num_cpus": 4, "num_gpus": 1, - "accelerator_type": "L4", + "accelerator_type": "T4", }, ) class ClassPredictor: def __init__(self, model_id, artifacts_dir, device="cuda"): """Initialize the model.""" - # Embdding model + # Embedding model self.processor = CLIPProcessor.from_pretrained(model_id) self.model = CLIPModel.from_pretrained(model_id) self.model.to(device=device) @@ -49,13 +41,21 @@ def get_probabilities(self, url): ) with torch.inference_mode(): embedding = self.model.get_image_features(**inputs).cpu().numpy() - probabilities = self.predictor.predict_probabilities( + outputs = self.predictor.predict_probabilities( collate_fn({"embedding": embedding}) ) - return probabilities + return {"probabilities": outputs["probabilities"][0]} -@serve.deployment(num_replicas="1", ray_actor_options={"num_cpus": 2}) +# Define app +api = FastAPI( + title="doggos", + description="classify your dog", + version="0.1", +) + + +@serve.deployment @serve.ingress(api) class Doggos: def __init__(self, classifier): @@ -65,27 +65,28 @@ def __init__(self, classifier): async def predict(self, request: Request): data = await request.json() probabilities = await self.classifier.get_probabilities.remote(url=data["url"]) - return { - "probabilities": probabilities, - } + return probabilities -# Model registry +# Model registry. model_registry = "/mnt/user_storage/mlflow/doggos" experiment_name = "doggos" mlflow.set_tracking_uri(f"file:{model_registry}") -# Best run +# Get best_run's artifact_dir. 
sorted_runs = mlflow.search_runs( - experiment_names=[experiment_name], - order_by=["metrics.val_loss ASC"], + experiment_names=[experiment_name], order_by=["metrics.val_loss ASC"] ) best_run = sorted_runs.iloc[0] artifacts_dir = urlparse(best_run.artifact_uri).path # Define app app = Doggos.bind( - classifier=ClassPredictor.bind(artifacts_dir=artifacts_dir), + classifier=ClassPredictor.bind( + model_id="openai/clip-vit-base-patch32", + artifacts_dir=artifacts_dir, + device="cuda", + ) ) if __name__ == "__main__": diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/train.py b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/train.py similarity index 99% rename from doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/train.py rename to doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/train.py index 8d060d480c6f..328ce36c91cd 100644 --- a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/train.py +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/train.py @@ -148,7 +148,7 @@ def train_loop_per_worker(config): num_workers=num_workers, use_gpu=True, resources_per_worker={"CPU": 8, "GPU": 2}, - accelerator_type="L4", + accelerator_type="T4", ) # Datasets diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/utils.py b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/utils.py similarity index 100% rename from doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/utils.py rename to doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/utils.py diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/pyproject.toml b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/pyproject.toml new file mode 100644 index 000000000000..5ec5cff96a4b --- /dev/null +++ 
b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/pyproject.toml @@ -0,0 +1,13 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "doggos" +version = "0.1.0" +requires-python = ">=3.12" +description = "doggos multimodal ai package" + +[tool.setuptools.packages.find] +where = ["."] +include = ["doggos*"] \ No newline at end of file diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/notebooks/01-Batch-Inference.ipynb b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/notebooks/01-Batch-Inference.ipynb index 858f43fa7b64..2b083decd67d 100644 --- a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/notebooks/01-Batch-Inference.ipynb +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/notebooks/01-Batch-Inference.ipynb @@ -30,14 +30,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[92mSuccessfully registered `matplotlib, torch` and 4 other packages to be installed on all cluster nodes.\u001b[0m\n", - "\u001b[92mView and update dependencies here: https://console.anyscale.com/cld_kvedZWag2qA8i5BjxUevf5i7/prj_cz951f43jjdybtzkx1s5sjgz99/workspaces/expwrk_eys8cskj5aivghbf773dp2vmcd?workspace-tab=dependencies\u001b[0m\n" + "\u001b[92mSuccessfully registered `ipywidgets, matplotlib` and 4 other packages to be installed on all cluster nodes.\u001b[0m\n", + "\u001b[92mView and update dependencies here: https://console.anyscale.com/cld_kvedZWag2qA8i5BjxUevf5i7/prj_cz951f43jjdybtzkx1s5sjgz99/workspaces/expwrk_23ry3pgfn3jgq2jk3e5z25udhz?workspace-tab=dependencies\u001b[0m\n", + "\u001b[92mSuccessfully registered `doggos` package to be installed on all cluster nodes.\u001b[0m\n", + "\u001b[92mView and update dependencies here: https://console.anyscale.com/cld_kvedZWag2qA8i5BjxUevf5i7/prj_cz951f43jjdybtzkx1s5sjgz99/workspaces/expwrk_23ry3pgfn3jgq2jk3e5z25udhz?workspace-tab=dependencies\u001b[0m\n" ] } ], "source": [ 
"%%bash\n", - "pip install -q \"matplotlib==3.10.0\" \"torch==2.7.1\" \"transformers==4.52.3\" \"scikit-learn==1.6.0\" \"mlflow==2.19.0\" \"ipywidgets==8.1.3\"" + "pip install -q -r /home/ray/default/requirements.txt\n", + "pip install -q -e /home/ray/default/doggos\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: A kernel restart may be required for all dependencies to become available. \n", + "\n", + "If using **uv**, then:\n", + "1. Turn off the runtime dependencies (`Dependencies` tab up top > Toggle off `Pip packages`). And no need to run the `pip install` commands above.\n", + "2. Change the python kernel of this notebook to use the `venv` (Click on `base (Python x.yy.zz)` on top right corner of notebook > `Select another Kernel` > `Python Environments...` > `Create Python Environment` > `Venv` > `Use Existing`) and done! Now all the notebook's cells will use the virtual env.\n", + "3. Change the py executable to use `uv run` instead of `python` by adding this line after importing ray.\n", + "```python\n", + "import os\n", + "os.environ.pop(\"RAY_RUNTIME_ENV_HOOK\", None)\n", + "import ray\n", + "ray.init(runtime_env={\"py_executable\": \"uv run\", \"working_dir\": \"/home/ray/default\"})\n", + "```" ] }, { @@ -47,7 +68,7 @@ "outputs": [], "source": [ "%load_ext autoreload\n", - "%autoreload all" + "%autoreload all\n" ] }, { @@ -59,7 +80,27 @@ "import os\n", "import ray\n", "import sys\n", - "sys.path.append(os.path.abspath(\"..\"))" + "sys.path.append(os.path.abspath(\"../doggos/\"))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If using UV\n", + "# os.environ.pop(\"RAY_RUNTIME_ENV_HOOK\", None)\n", + "# ray.init(runtime_env={\"py_executable\": \"uv run\", \"working_dir\": \"/home/ray/default\"})\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from doggos import utils\n" + ] + }, + { 
@@ -80,20 +121,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-06-23 14:04:39,249\tINFO worker.py:1723 -- Connecting to existing Ray cluster at address: 10.0.52.172:6379...\n", - "2025-06-23 14:04:39,260\tINFO worker.py:1908 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://session-gcwehd9xxjzkv5lxv8lgcdgx2n.i.anyscaleuserdata.com \u001b[39m\u001b[22m\n", - "2025-06-23 14:04:39,266\tINFO packaging.py:380 -- Pushing file package 'gcs://_ray_pkg_a644723e367c78760222a7f2fcce949b2fe72f7b.zip' (1.92MiB) to Ray cluster...\n", - "2025-06-23 14:04:39,275\tINFO packaging.py:393 -- Successfully pushed file package 'gcs://_ray_pkg_a644723e367c78760222a7f2fcce949b2fe72f7b.zip'.\n", - "2025-06-23 14:04:39,581\tINFO dataset.py:3048 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.\n", - "2025-06-23 14:04:39,583\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_11_0\n", - "2025-06-23 14:04:39,594\tINFO streaming_executor.py:117 -- Starting execution of Dataset dataset_11_0. Full logs are in /tmp/ray/session_2025-06-23_13-49-50_102769_2149/logs/ray-data\n", - "2025-06-23 14:04:39,595\tINFO streaming_executor.py:118 -- Execution plan of Dataset dataset_11_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> LimitOperator[limit=1]\n" + "2025-08-28 05:00:43,606\tINFO worker.py:1771 -- Connecting to existing Ray cluster at address: 10.0.17.148:6379...\n", + "2025-08-28 05:00:43,617\tINFO worker.py:1942 -- Connected to Ray cluster. 
View the dashboard at \u001b[1m\u001b[32mhttps://session-jhxhj69d6ttkjctcxfnsfe7gwk.i.anyscaleuserdata.com \u001b[39m\u001b[22m\n", + "2025-08-28 05:00:43,621\tINFO packaging.py:588 -- Creating a file package for local module '/home/ray/default/doggos/doggos'.\n", + "2025-08-28 05:00:43,625\tINFO packaging.py:380 -- Pushing file package 'gcs://_ray_pkg_7400f2bea399eebc.zip' (0.02MiB) to Ray cluster...\n", + "2025-08-28 05:00:43,625\tINFO packaging.py:393 -- Successfully pushed file package 'gcs://_ray_pkg_7400f2bea399eebc.zip'.\n", + "2025-08-28 05:00:43,628\tINFO packaging.py:380 -- Pushing file package 'gcs://_ray_pkg_a31dca6092632244a5c9467084f1b1f8df982200.zip' (1.10MiB) to Ray cluster...\n", + "2025-08-28 05:00:43,634\tINFO packaging.py:393 -- Successfully pushed file package 'gcs://_ray_pkg_a31dca6092632244a5c9467084f1b1f8df982200.zip'.\n", + "2025-08-28 05:00:48,035\tINFO dataset.py:3248 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.\n", + "2025-08-28 05:00:48,039\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_1_0\n", + "2025-08-28 05:00:48,101\tINFO streaming_executor.py:159 -- Starting execution of Dataset dataset_1_0. 
Full logs are in /tmp/ray/session_2025-08-28_04-57-43_348032_12595/logs/ray-data\n", + "2025-08-28 05:00:48,102\tINFO streaming_executor.py:160 -- Execution plan of Dataset dataset_1_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> LimitOperator[limit=1] -> TaskPoolMapOperator[ReadFiles]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9dbfa59a93134b189b928b743d442130", + "model_id": "d08e184535944a6c8ea162eca5674cd1", "version_major": 2, "version_minor": 0 }, @@ -107,7 +151,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "d44b4fa98a6343d9a31b3dba01234981", + "model_id": "ac866daca29b4379b67367b2c50c65f0", "version_major": 2, "version_minor": 0 }, @@ -121,12 +165,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3c0885d0896e4fac8b5a9bcb0f3833f1", + "model_id": "724c3b66392442aebf0d756157799e44", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "- ReadFiles 2: 0.00 row [00:00, ? row/s]" + "- limit=1 2: 0.00 row [00:00, ? row/s]" ] }, "metadata": {}, @@ -135,12 +179,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a05586f2d15948bba416a1200dcd8fa6", + "model_id": "c1d6a10ed6c04fce8135dc2a98b8ebe3", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "- limit=1 3: 0.00 row [00:00, ? row/s]" + "- ReadFiles 3: 0.00 row [00:00, ? row/s]" ] }, "metadata": {}, @@ -150,62 +194,63 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-06-23 14:05:56,467\tINFO streaming_executor.py:227 -- ✔️ Dataset dataset_11_0 execution finished in 76.87 seconds\n" + "2025-08-28 05:00:48,137\tWARNING resource_manager.py:134 -- ⚠️ Ray's object store is configured to use only 27.3% of available memory (8.7GiB out of 32.0GiB total). For optimal Ray Data performance, we recommend setting the object store to at least 50% of available memory. 
You can do this by setting the 'object_store_memory' parameter when calling ray.init() or by setting the RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION environment variable.\n", + "2025-08-28 05:00:52,084\tINFO streaming_executor.py:279 -- ✔️ Dataset dataset_1_0 execution finished in 3.98 seconds\n" ] }, { "data": { "text/plain": [ - "[{'image': array([[[ 32, 52, 77],\n", - " [ 27, 47, 72],\n", - " [ 28, 43, 72],\n", + "[{'image': array([[[ 71, 93, 81],\n", + " [ 71, 93, 81],\n", + " [ 71, 91, 79],\n", " ...,\n", - " [235, 235, 233],\n", - " [236, 236, 234],\n", - " [236, 236, 234]],\n", + " [ 99, 129, 137],\n", + " [101, 131, 139],\n", + " [102, 132, 140]],\n", " \n", - " [[ 34, 51, 77],\n", - " [ 30, 47, 73],\n", - " [ 30, 45, 74],\n", + " [[ 61, 81, 70],\n", + " [ 61, 81, 70],\n", + " [ 61, 81, 69],\n", " ...,\n", - " [233, 233, 231],\n", - " [233, 233, 231],\n", - " [233, 233, 231]],\n", + " [ 93, 123, 131],\n", + " [ 96, 125, 133],\n", + " [ 97, 127, 135]],\n", " \n", - " [[ 35, 50, 79],\n", - " [ 32, 47, 76],\n", - " [ 33, 48, 79],\n", + " [[ 51, 68, 58],\n", + " [ 51, 68, 58],\n", + " [ 50, 68, 56],\n", " ...,\n", - " [237, 237, 237],\n", - " [237, 237, 237],\n", - " [237, 237, 237]],\n", + " [ 82, 111, 117],\n", + " [ 85, 112, 119],\n", + " [ 86, 115, 121]],\n", " \n", " ...,\n", " \n", - " [[ 55, 80, 76],\n", - " [ 65, 90, 86],\n", - " [ 56, 78, 75],\n", + " [[ 83, 101, 103],\n", + " [ 83, 101, 103],\n", + " [ 84, 102, 106],\n", " ...,\n", - " [142, 168, 133],\n", - " [157, 184, 149],\n", - " [140, 170, 132]],\n", + " [ 94, 82, 56],\n", + " [ 97, 85, 59],\n", + " [ 99, 87, 61]],\n", " \n", - " [[ 52, 72, 70],\n", - " [ 77, 97, 95],\n", - " [ 78, 97, 95],\n", + " [[ 82, 100, 102],\n", + " [ 82, 100, 102],\n", + " [ 83, 101, 105],\n", " ...,\n", - " [125, 151, 112],\n", - " [141, 169, 128],\n", - " [180, 211, 167]],\n", + " [ 95, 83, 57],\n", + " [ 98, 86, 60],\n", + " [ 99, 87, 61]],\n", " \n", - " [[ 92, 108, 107],\n", - " [123, 139, 138],\n", - " [135, 149, 
149],\n", + " [[ 85, 100, 103],\n", + " [ 85, 100, 103],\n", + " [ 83, 101, 103],\n", " ...,\n", - " [125, 152, 109],\n", - " [ 87, 116, 68],\n", - " [127, 159, 109]]], dtype=uint8),\n", - " 'path': 'doggos-dataset/train/saint_bernard/saint_bernard_7024.jpg'}]" + " [ 95, 84, 56],\n", + " [ 99, 88, 60],\n", + " [100, 89, 61]]], dtype=uint8),\n", + " 'path': 'doggos-dataset/train/malamute/malamute_11814.jpg'}]" ] }, "execution_count": null, @@ -220,7 +265,7 @@ " include_paths=True, \n", " shuffle=\"files\",\n", ")\n", - "ds.take(1)" + "ds.take(1)\n" ] }, { @@ -265,7 +310,7 @@ "source": [ "def add_class(row):\n", " row[\"class\"] = row[\"path\"].rsplit(\"/\", 3)[-2]\n", - " return row" + " return row\n" ] }, { @@ -278,7 +323,7 @@ "ds = ds.map(add_class,\n", " num_cpus=1,\n", " num_gpus=0,\n", - " concurrency=4)" + " concurrency=4)\n" ] }, { @@ -328,7 +373,7 @@ "import numpy as np\n", "from PIL import Image\n", "import torch\n", - "from transformers import CLIPModel, CLIPProcessor" + "from transformers import CLIPModel, CLIPProcessor\n" ] }, { @@ -354,7 +399,7 @@ " with torch.inference_mode():\n", " batch[\"embedding\"] = self.model.get_image_features(**inputs).cpu().numpy()\n", "\n", - " return batch" + " return batch\n" ] }, { @@ -393,9 +438,9 @@ " concurrency=4,\n", " batch_size=64,\n", " num_gpus=1,\n", - " accelerator_type=\"L4\",\n", + " accelerator_type=\"T4\",\n", ")\n", - "embeddings_ds = embeddings_ds.drop_columns([\"image\"]) # remove image column" + "embeddings_ds = embeddings_ds.drop_columns([\"image\"]) # remove image column\n" ] }, { @@ -441,7 +486,7 @@ "metadata": {}, "outputs": [], "source": [ - "import shutil" + "import shutil\n" ] }, { @@ -453,15 +498,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-06-23 14:06:01,973\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_16_0\n", - "2025-06-23 14:06:02,000\tINFO streaming_executor.py:117 -- Starting execution of Dataset dataset_16_0. 
Full logs are in /tmp/ray/session_2025-06-23_13-49-50_102769_2149/logs/ray-data\n", - "2025-06-23 14:06:02,002\tINFO streaming_executor.py:118 -- Execution plan of Dataset dataset_16_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)] -> ActorPoolMapOperator[MapBatches(EmbedImages)] -> TaskPoolMapOperator[MapBatches(drop_columns)->Write]\n" + "2025-08-28 05:00:55,737\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_6_0\n", + "2025-08-28 05:00:55,756\tINFO streaming_executor.py:159 -- Starting execution of Dataset dataset_6_0. Full logs are in /tmp/ray/session_2025-08-28_04-57-43_348032_12595/logs/ray-data\n", + "2025-08-28 05:00:55,757\tINFO streaming_executor.py:160 -- Execution plan of Dataset dataset_6_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)] -> ActorPoolMapOperator[MapBatches(EmbedImages)] -> TaskPoolMapOperator[MapBatches(drop_columns)->Write]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5933c94751554584b0efe1af1c11b265", + "model_id": "7dbb9f0a9c364c529da80ff9e3266eb4", "version_major": 2, "version_minor": 0 }, @@ -476,13 +521,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-06-23 14:06:02,029\tINFO actor_pool_map_operator.py:633 -- Scaling up actor pool by 4 (reason=scaling to min size, running=0, restarting=0, pending=0)\n" + "{\"asctime\":\"2025-08-28 05:00:55,808\",\"levelname\":\"E\",\"message\":\"Actor with class name: 'MapWorker(MapBatches(EmbedImages))' and ID: '1e923c76f6e2b92256b942a802000000' has constructor arguments in the object store and max_restarts > 0. If the arguments in the object store go out of scope or are lost, the actor restart will fail. 
See https://github.com/ray-project/ray/issues/53727 for more details.\",\"filename\":\"core_worker.cc\",\"lineno\":2254}\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "09923297706a4c6ca9bdc8c217fef9dd", + "model_id": "02bc199f5f074df19b376272e8c29ba8", "version_major": 2, "version_minor": 0 }, @@ -496,7 +541,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "61862104a5dd47b8be8364d4a9f91677", + "model_id": "70ce40105ae34580a6ebb69dfada0de0", "version_major": 2, "version_minor": 0 }, @@ -510,7 +555,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2f089288b9214fde9a8c93dad13fc7ab", + "model_id": "36450487d0614de89dab8cb02e4e7180", "version_major": 2, "version_minor": 0 }, @@ -524,7 +569,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f22efa3031dd4b86903f20c17d00946f", + "model_id": "1197937e481a43bb90094625f2c8a569", "version_major": 2, "version_minor": 0 }, @@ -538,7 +583,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "915c9ece141c44519b93546cc8ab7724", + "model_id": "c6277d187bd345ff9a773b33bbc03ea6", "version_major": 2, "version_minor": 0 }, @@ -549,25 +594,36 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m(autoscaler +20s)\u001b[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.\n", + "\u001b[36m(autoscaler +20s)\u001b[0m [autoscaler] [4xT4:48CPU-192GB] Attempting to add 1 node to the cluster (increasing from 0 to 1).\n", + "\u001b[36m(autoscaler +25s)\u001b[0m [autoscaler] [4xT4:48CPU-192GB|g4dn.12xlarge] [us-west-2a] [on-demand] Launched 1 instance.\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[36m(_MapWorker pid=2910, ip=10.0.69.70)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. 
`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n", - "2025-06-23 14:06:22,379\tINFO actor_pool_map_operator.py:661 -- Scaled down actor pool by 1 (reason=None; running=3, restarting=0, pending=0)\n", - "2025-06-23 14:06:22,744\tINFO streaming_executor.py:227 -- ✔️ Dataset dataset_16_0 execution finished in 20.74 seconds\n", - "2025-06-23 14:06:22,842\tINFO dataset.py:4603 -- Data sink Parquet finished. 2880 rows and 5.8MB data written.\n" + "2025-08-28 05:01:19,478\tWARNING resource_manager.py:551 -- Cluster resources are not engough to run any task from ActorPoolMapOperator[MapBatches(EmbedImages)]. The job may hang forever unless the cluster scales up.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[36m(autoscaler +5m51s)\u001b[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.\n", - "\u001b[36m(autoscaler +5m51s)\u001b[0m [autoscaler] Downscaling node i-018706717a4455b75 (node IP: 10.0.65.200) due to node idle termination.\n", - "\u001b[36m(autoscaler +5m51s)\u001b[0m [autoscaler] Downscaling node i-0e3238b7f703616e7 (node IP: 10.0.127.236) due to node idle termination.\n", - "\u001b[36m(autoscaler +5m51s)\u001b[0m [autoscaler] Downscaling node i-0fcefb76d19edf42b (node IP: 10.0.49.153) due to node idle termination.\n", - "\u001b[36m(autoscaler +5m56s)\u001b[0m [autoscaler] Cluster resized to {8 CPU, 2 GPU}.\n" + "\u001b[36m(autoscaler +1m10s)\u001b[0m [autoscaler] Cluster upscaled to {56 CPU, 4 GPU}.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m(MapWorker(MapBatches(EmbedImages)) pid=3337, ip=10.0.5.252)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. 
`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n", + "2025-08-28 05:03:39,362\tINFO streaming_executor.py:279 -- ✔️ Dataset dataset_6_0 execution finished in 163.60 seconds\n", + "2025-08-28 05:03:39,422\tINFO dataset.py:4871 -- Data sink Parquet finished. 2880 rows and 5.8MB data written.\n" ] } ], @@ -576,7 +632,7 @@ "embeddings_path = os.path.join(\"/mnt/cluster_storage\", \"doggos/embeddings\")\n", "if os.path.exists(embeddings_path): \n", " shutil.rmtree(embeddings_path) # clean up\n", - "embeddings_ds.write_parquet(embeddings_path)" + "embeddings_ds.write_parquet(embeddings_path)\n" ] }, { @@ -662,19 +718,28 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Output\n", + "(anyscale +0.8s) Submitting job with config JobConfig(name='image-batch-embeddings', image_uri='anyscale/ray:2.48.0-slim-py312-cu128', compute_config=None, env_vars=None, py_modules=['/home/ray/default/doggos'], py_executable=None, cloud=None, project=None, ray_version=None, job_queue_config=None).\n", + "(anyscale +7.2s) Uploading local dir '/home/ray/default' to cloud storage.\n", + "(anyscale +7.9s) Uploading local dir '/home/ray/default/doggos' to cloud storage.\n", + "(anyscale +9.2s) Job 'image-batch-embeddings' submitted, ID: 'prodjob_7e1fsj9xzs2iryayj7hgbhifl8'.\n", + "(anyscale +9.2s) View the job in the UI: https://console.anyscale.com/jobs/prodjob_7e1fsj9xzs2iryayj7hgbhifl8\n", + "(anyscale +9.2s) Use `--wait` to wait for the job to run and stream logs.\n" + ] + } + ], "source": [ - "```bash\n", - "# Production batch job.\n", - "anyscale job submit --name=generate-doggos-embeddings \\\n", - " --containerfile=\"/home/ray/default/containerfile\" \\\n", - " 
--compute-config=\"/home/ray/default/configs/aws.yaml\" \\\n", - " --working-dir=\"/home/ray/default\" \\\n", - " --exclude=\"\" \\\n", - " --max-retries=0 \\\n", - " -- python doggos/embed.py\n", - "```" + "%%bash\n", + "# Production batch embedding generation job\n", + "anyscale job submit -f /home/ray/default/configs/generate_embeddings.yaml\n" ] }, { @@ -708,7 +773,7 @@ "from PIL import Image\n", "import numpy as np\n", "import requests\n", - "from doggos.embed import get_top_matches, display_top_matches" + "from doggos.embed import get_top_matches, display_top_matches\n" ] }, { @@ -719,7 +784,7 @@ "source": [ "def url_to_array(url):\n", " return np.array(Image.open(\n", - " BytesIO(requests.get(url).content)).convert(\"RGB\"))" + " BytesIO(requests.get(url).content)).convert(\"RGB\"))\n" ] }, { @@ -727,6 +792,125 @@ "execution_count": null, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "91c47446fb224d72987f0f9b4c9c5e90", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "preprocessor_config.json: 0%| | 0.00/316 [00:00 TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles]\n" + "/home/ray/anaconda3/lib/python3.12/site-packages/ray/data/_internal/datasource/parquet_datasource.py:750: FutureWarning: The default `file_extensions` for `read_parquet` will change from `None` to ['parquet'] after Ray 2.43, and your dataset contains files that don't match the new `file_extensions`. 
To maintain backwards compatibility, set `file_extensions=None` explicitly.\n", + " warnings.warn(\n", + "2025-08-28 05:03:56,303\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_8_0\n", + "2025-08-28 05:03:56,308\tINFO streaming_executor.py:159 -- Starting execution of Dataset dataset_8_0. Full logs are in /tmp/ray/session_2025-08-28_04-57-43_348032_12595/logs/ray-data\n", + "2025-08-28 05:03:56,309\tINFO streaming_executor.py:160 -- Execution plan of Dataset dataset_8_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "95d21fb5261949a69e7b9b52e9c93605", + "model_id": "5d82b793825b412c9ab72693c6fb92ce", "version_major": 2, "version_minor": 0 }, @@ -778,7 +964,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "ef0d324d477c42e587bbc19eed93697d", + "model_id": "bdd005beb194490e8c641ef7548fdf09", "version_major": 2, "version_minor": 0 }, @@ -792,7 +978,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e896cad5fe264723a3cc3cd4f6a64ebf", + "model_id": "884a8230054a42c88e655c178827a68f", "version_major": 2, "version_minor": 0 }, @@ -803,16 +989,30 @@ "metadata": {}, "output_type": "display_data" }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6f2d6a1e74fd41f6a9be6c2fefdadf64", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "model.safetensors: 0%| | 0.00/605M [00:00" ] @@ -820,24 +1020,13 @@ "metadata": {}, "output_type": "display_data" }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[36m(_MapWorker pid=3343, ip=10.0.102.235)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. 
You'll still be able to use a slow processor with `use_fast=False`.\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[36m(autoscaler +16m26s)\u001b[0m [autoscaler] [4xL4:48CPU-192GB] Attempting to add 2 node(s) to the cluster (increasing from 1 to 3).\n", - "\u001b[36m(autoscaler +16m26s)\u001b[0m [autoscaler] [4xL4:48CPU-192GB] Launched 2 instances.\n", - "\u001b[36m(autoscaler +17m11s)\u001b[0m [autoscaler] Cluster upscaled to {152 CPU, 14 GPU}.\n", - "\u001b[33m(raylet)\u001b[0m WARNING: 4 PYTHON worker processes have been started on node: 97b39558bc8a3057162823cead1b8e035f1be130c49bb311e538ed2d with address: 10.0.52.172. This could be a result of using a large number of actors, or due to tasks blocked in ray.get() calls (see https://github.com/ray-project/ray/issues/3644 for some discussion of workarounds).\n", - "\u001b[36m(autoscaler +1h19m21s)\u001b[0m [autoscaler] Downscaling node i-03a133888407b8cf8 (node IP: 10.0.103.152) due to node idle termination.\n", - "\u001b[36m(autoscaler +1h19m21s)\u001b[0m [autoscaler] Downscaling node i-06023e83fb012b7ae (node IP: 10.0.90.122) due to node idle termination.\n", - "\u001b[36m(autoscaler +1h19m26s)\u001b[0m [autoscaler] Cluster resized to {56 CPU, 6 GPU}.\n" + "\u001b[36m(autoscaler +7m14s)\u001b[0m [autoscaler] [4xT4:48CPU-192GB] Attempting to add 1 node to the cluster (increasing from 1 to 2).\n", + "\u001b[36m(autoscaler +7m14s)\u001b[0m [autoscaler] [4xT4:48CPU-192GB|g4dn.12xlarge] [us-west-2a] [on-demand] Launched 1 instance.\n", + "\u001b[36m(autoscaler +8m0s)\u001b[0m [autoscaler] Cluster upscaled to {104 CPU, 8 GPU}.\n" ] } ], @@ -845,7 +1034,7 @@ "# Top matches by embedding similarity.\n", "embeddings_ds = ray.data.read_parquet(embeddings_path)\n", "top_matches = get_top_matches(embedding, embeddings_ds, n=5)\n", - "display_top_matches(url, top_matches)" + "display_top_matches(url, top_matches)\n" ] }, { diff --git 
a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/notebooks/02-Distributed-Training.ipynb b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/notebooks/02-Distributed-Training.ipynb index 59c16348c27a..37cf5b136b24 100644 --- a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/notebooks/02-Distributed-Training.ipynb +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/notebooks/02-Distributed-Training.ipynb @@ -32,14 +32,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[92mSuccessfully registered `matplotlib, torch` and 4 other packages to be installed on all cluster nodes.\u001b[0m\n", - "\u001b[92mView and update dependencies here: https://console.anyscale.com/cld_kvedZWag2qA8i5BjxUevf5i7/prj_cz951f43jjdybtzkx1s5sjgz99/workspaces/expwrk_eys8cskj5aivghbf773dp2vmcd?workspace-tab=dependencies\u001b[0m\n" + "\u001b[92mSuccessfully registered `ipywidgets, matplotlib` and 4 other packages to be installed on all cluster nodes.\u001b[0m\n", + "\u001b[92mView and update dependencies here: https://console.anyscale.com/cld_kvedZWag2qA8i5BjxUevf5i7/prj_cz951f43jjdybtzkx1s5sjgz99/workspaces/expwrk_23ry3pgfn3jgq2jk3e5z25udhz?workspace-tab=dependencies\u001b[0m\n", + "\u001b[92mSuccessfully registered `doggos` package to be installed on all cluster nodes.\u001b[0m\n", + "\u001b[92mView and update dependencies here: https://console.anyscale.com/cld_kvedZWag2qA8i5BjxUevf5i7/prj_cz951f43jjdybtzkx1s5sjgz99/workspaces/expwrk_23ry3pgfn3jgq2jk3e5z25udhz?workspace-tab=dependencies\u001b[0m\n" ] } ], "source": [ "%%bash\n", - "pip install -q \"matplotlib==3.10.0\" \"torch==2.7.0\" \"transformers==4.52.3\" \"scikit-learn==1.6.0\" \"mlflow==2.19.0\" \"ipywidgets==8.1.3\"" + "pip install -q -r /home/ray/default/requirements.txt\n", + "pip install -q -e /home/ray/default/doggos\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: A kernel restart may be required for all dependencies to 
become available. \n", + "\n", + "If using **uv**, then:\n", + "1. Turn off the runtime dependencies (`Dependencies` tab up top > Toggle off `Pip packages`). There is no need to run the `pip install` commands above.\n", + "2. Change the python kernel of this notebook to use the `venv` (Click on `base (Python x.yy.zz)` in the top-right corner of the notebook > `Select another Kernel` > `Python Environments...` > `Create Python Environment` > `Venv` > `Use Existing`) and done! Now all the notebook's cells will use the virtual env.\n", + "3. Change the py executable to use `uv run` instead of `python` by using the following lines when importing and initializing Ray.\n", + "```python\n", + "import os\n", + "os.environ.pop(\"RAY_RUNTIME_ENV_HOOK\", None)\n", + "import ray\n", + "ray.init(runtime_env={\"py_executable\": \"uv run\", \"working_dir\": \"/home/ray/default\"})\n", + "```" ] }, { @@ -49,7 +70,7 @@ "outputs": [], "source": [ "%load_ext autoreload\n", - "%autoreload all" + "%autoreload all\n" ] }, { @@ -61,7 +82,17 @@ "import os\n", "import ray\n", "import sys\n", - "sys.path.append(os.path.abspath(\"..\"))" + "sys.path.append(os.path.abspath(\"../doggos/\"))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If using UV\n", + "# os.environ.pop(\"RAY_RUNTIME_ENV_HOOK\", None)\n" ] }, { @@ -73,17 +104,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-06-23 14:26:58,662\tINFO worker.py:1723 -- Connecting to existing Ray cluster at address: 10.0.52.172:6379...\n", - "2025-06-23 14:26:58,674\tINFO worker.py:1908 -- Connected to Ray cluster. 
View the dashboard at \u001b[1m\u001b[32mhttps://session-gcwehd9xxjzkv5lxv8lgcdgx2n.i.anyscaleuserdata.com \u001b[39m\u001b[22m\n", - "2025-06-23 14:26:58,721\tINFO packaging.py:588 -- Creating a file package for local module '../'.\n", - "2025-06-23 14:26:58,781\tINFO packaging.py:380 -- Pushing file package 'gcs://_ray_pkg_df54fa2aa282ae62.zip' (13.77MiB) to Ray cluster...\n", - "2025-06-23 14:26:58,845\tINFO packaging.py:393 -- Successfully pushed file package 'gcs://_ray_pkg_df54fa2aa282ae62.zip'.\n" + "2025-08-28 05:06:48,041\tINFO worker.py:1771 -- Connecting to existing Ray cluster at address: 10.0.17.148:6379...\n", + "2025-08-28 05:06:48,052\tINFO worker.py:1942 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://session-jhxhj69d6ttkjctcxfnsfe7gwk.i.anyscaleuserdata.com \u001b[39m\u001b[22m\n", + "2025-08-28 05:06:48,061\tINFO packaging.py:588 -- Creating a file package for local module '/home/ray/default/doggos/doggos'.\n", + "2025-08-28 05:06:48,064\tINFO packaging.py:380 -- Pushing file package 'gcs://_ray_pkg_86cc12e3f2760ca4.zip' (0.03MiB) to Ray cluster...\n", + "2025-08-28 05:06:48,065\tINFO packaging.py:393 -- Successfully pushed file package 'gcs://_ray_pkg_86cc12e3f2760ca4.zip'.\n", + "2025-08-28 05:06:48,068\tINFO packaging.py:380 -- Pushing file package 'gcs://_ray_pkg_563e3191c4f9ed5f5d5e8601702cfa5ff10660e4.zip' (1.09MiB) to Ray cluster...\n", + "2025-08-28 05:06:48,073\tINFO packaging.py:393 -- Successfully pushed file package 'gcs://_ray_pkg_563e3191c4f9ed5f5d5e8601702cfa5ff10660e4.zip'.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0ee0ee1a3af84e0481f83f2e8802e581", + "model_id": "c853a046148f42b8baa50711e3057054", "version_major": 2, "version_minor": 0 }, @@ -111,11 +144,11 @@ " \n", " \n", " Ray version:\n", - " 2.47.1\n", + " 2.49.0\n", " \n", " \n", " Dashboard:\n", - " http://session-gcwehd9xxjzkv5lxv8lgcdgx2n.i.anyscaleuserdata.com\n", + " 
http://session-jhxhj69d6ttkjctcxfnsfe7gwk.i.anyscaleuserdata.com\n", "\n", "\n", "\n", @@ -124,7 +157,7 @@ "\n" ], "text/plain": [ - "RayContext(dashboard_url='session-gcwehd9xxjzkv5lxv8lgcdgx2n.i.anyscaleuserdata.com', python_version='3.12.11', ray_version='2.47.1', ray_commit='e06f523c450fb1c99d8f347f8bfcc4085cc68b66')" + "RayContext(dashboard_url='session-jhxhj69d6ttkjctcxfnsfe7gwk.i.anyscaleuserdata.com', python_version='3.12.11', ray_version='2.49.0', ray_commit='8b349d73c5d5c4b56dc719fcc447d18ae8571dd4')" ] }, "execution_count": null, @@ -139,11 +172,11 @@ " # connect to existing ray runtime (from previous notebook if still running)\n", " address=os.environ.get(\"RAY_ADDRESS\", \"auto\"),\n", " runtime_env={\n", - " \"env_vars\": {\"RAY_TRAIN_V2_ENABLED\": \"1\"}, \n", - " # working_dir to import doggos (default working_dir=\".\")\n", - " \"working_dir\": \"../\",\n", + " \"env_vars\": {\"RAY_TRAIN_V2_ENABLED\": \"1\"},\n", + " # \"py_executable\": \"uv run\", # if using uv \n", + " # \"working_dir\": \"/home/ray/default\", # if using uv \n", " },\n", - ")" + ")\n" ] }, { @@ -154,7 +187,7 @@ "source": [ "%%bash\n", "# This will be removed once Ray Train v2 is enabled by default.\n", - "echo \"RAY_TRAIN_V2_ENABLED=1\" > /home/ray/default/.env" + "echo \"RAY_TRAIN_V2_ENABLED=1\" > /home/ray/default/.env\n" ] }, { @@ -176,7 +209,7 @@ "source": [ "# Load env vars in notebooks.\n", "from dotenv import load_dotenv\n", - "load_dotenv()" + "load_dotenv()\n" ] }, { @@ -201,7 +234,7 @@ "source": [ "def add_class(row):\n", " row[\"class\"] = row[\"path\"].rsplit(\"/\", 3)[-2]\n", - " return row" + " return row\n" ] }, { @@ -214,7 +247,7 @@ "train_ds = ray.data.read_images(\"s3://doggos-dataset/train\", include_paths=True, shuffle=\"files\")\n", "train_ds = train_ds.map(add_class)\n", "val_ds = ray.data.read_images(\"s3://doggos-dataset/val\", include_paths=True)\n", - "val_ds = val_ds.map(add_class)" + "val_ds = val_ds.map(add_class)\n" ] }, { @@ -237,7 +270,7 @@ "def 
convert_to_label(row, class_to_label):\n", " if \"class\" in row:\n", " row[\"label\"] = class_to_label[row[\"class\"]]\n", - " return row" + " return row\n" ] }, { @@ -250,7 +283,7 @@ "from PIL import Image\n", "import torch\n", "from transformers import CLIPModel, CLIPProcessor\n", - "from doggos.embed import EmbedImages" + "from doggos.embed import EmbedImages\n" ] }, { @@ -285,14 +318,14 @@ " concurrency=4,\n", " batch_size=64,\n", " num_gpus=1,\n", - " accelerator_type=\"L4\",\n", + " accelerator_type=\"T4\",\n", " )\n", " ds = ds.drop_columns([\"image\"])\n", " return ds\n", "\n", " def save(self, fp):\n", " with open(fp, \"w\") as f:\n", - " json.dump(self.class_to_label, f)" + " json.dump(self.class_to_label, f)\n" ] }, { @@ -304,16 +337,22 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-06-23 14:27:10,597\tINFO dataset.py:3048 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.\n", - "2025-06-23 14:27:10,599\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_65_0\n", - "2025-06-23 14:27:10,612\tINFO streaming_executor.py:117 -- Starting execution of Dataset dataset_65_0. 
Full logs are in /tmp/ray/session_2025-06-23_13-49-50_102769_2149/logs/ray-data\n", - "2025-06-23 14:27:10,613\tINFO streaming_executor.py:118 -- Execution plan of Dataset dataset_65_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)] -> AllToAllOperator[Aggregate] -> LimitOperator[limit=1]\n" + "2025-08-28 05:06:54,182\tINFO dataset.py:3248 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.\n", + "2025-08-28 05:06:54,184\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_14_0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-08-28 05:06:54,206\tINFO streaming_executor.py:159 -- Starting execution of Dataset dataset_14_0. Full logs are in /tmp/ray/session_2025-08-28_04-57-43_348032_12595/logs/ray-data\n", + "2025-08-28 05:06:54,207\tINFO streaming_executor.py:160 -- Execution plan of Dataset dataset_14_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)] -> AllToAllOperator[Aggregate] -> LimitOperator[limit=1]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6c2629752381401bb193d0d84fa68963", + "model_id": "66271b6a5fb7493998bb818d81eb9d12", "version_major": 2, "version_minor": 0 }, @@ -327,7 +366,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "ffc73aed04544803a19756d5fc09c575", + "model_id": "342323504f9046f5ab79cc8fab75fd3d", "version_major": 2, "version_minor": 0 }, @@ -341,7 +380,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7cb26d7641104cfdabb606292026da04", + "model_id": "4ad27f68f4954b839ec614deb470dc2c", "version_major": 2, "version_minor": 0 }, @@ -355,7 +394,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "cbd97058f69741b0a40e8bb312a88065", + "model_id": 
"ec8f2f8c07134885bcf1e339079e5602", "version_major": 2, "version_minor": 0 }, @@ -369,7 +408,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "654ec007f7184ec0a9c2c487dd3df860", + "model_id": "8af9685130484d73bdcc36ff9a7b6742", "version_major": 2, "version_minor": 0 }, @@ -383,7 +422,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "58acb9149bf644be8386a8da980ea125", + "model_id": "8e2ff9a70b3047b78d3421a9c6ba4a2a", "version_major": 2, "version_minor": 0 }, @@ -397,7 +436,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "19656545491b4ae4b239bb7773341210", + "model_id": "5d47b1e760004a8ba310e0cda5de47f0", "version_major": 2, "version_minor": 0 }, @@ -411,7 +450,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "54a23a3f33054741981ad75230221b54", + "model_id": "d4a4b8c2fd3240a1ad46f9abd4869497", "version_major": 2, "version_minor": 0 }, @@ -425,7 +464,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6ec35ea4ab2244748e2c6fc2d1b280d8", + "model_id": "c881bfe545de4d8e9384ad4a4c4a3346", "version_major": 2, "version_minor": 0 }, @@ -440,7 +479,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-06-23 14:27:17,996\tINFO streaming_executor.py:227 -- ✔️ Dataset dataset_65_0 execution finished in 7.38 seconds\n" + "2025-08-28 05:06:54,275\tWARNING resource_manager.py:134 -- ⚠️ Ray's object store is configured to use only 28.5% of available memory (63.9GiB out of 224.0GiB total). For optimal Ray Data performance, we recommend setting the object store to at least 50% of available memory. 
You can do this by setting the 'object_store_memory' parameter when calling ray.init() or by setting the RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION environment variable.\n", + "2025-08-28 05:07:03,480\tINFO streaming_executor.py:279 -- ✔️ Dataset dataset_14_0 execution finished in 9.27 seconds\n" ] } ], @@ -449,7 +489,7 @@ "preprocessor = Preprocessor()\n", "preprocessor = preprocessor.fit(train_ds, column=\"class\")\n", "train_ds = preprocessor.transform(ds=train_ds)\n", - "val_ds = preprocessor.transform(ds=val_ds)" + "val_ds = preprocessor.transform(ds=val_ds)\n" ] }, { @@ -467,7 +507,7 @@ "metadata": {}, "outputs": [], "source": [ - "import shutil" + "import shutil\n" ] }, { @@ -479,15 +519,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-06-23 14:19:45,048\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_40_0\n", - "2025-06-23 14:19:45,067\tINFO streaming_executor.py:117 -- Starting execution of Dataset dataset_40_0. Full logs are in /tmp/ray/session_2025-06-23_13-49-50_102769_2149/logs/ray-data\n", - "2025-06-23 14:19:45,069\tINFO streaming_executor.py:118 -- Execution plan of Dataset dataset_40_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)->Map(convert_to_label)] -> ActorPoolMapOperator[MapBatches(EmbedImages)] -> TaskPoolMapOperator[MapBatches(drop_columns)->Write]\n" + "2025-08-28 05:07:04,254\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_22_0\n", + "2025-08-28 05:07:04,270\tINFO streaming_executor.py:159 -- Starting execution of Dataset dataset_22_0. 
Full logs are in /tmp/ray/session_2025-08-28_04-57-43_348032_12595/logs/ray-data\n", + "2025-08-28 05:07:04,271\tINFO streaming_executor.py:160 -- Execution plan of Dataset dataset_22_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)->Map(convert_to_label)] -> ActorPoolMapOperator[MapBatches(EmbedImages)] -> TaskPoolMapOperator[MapBatches(drop_columns)->Write]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a676da85e459434b82d231b8cf23a213", + "model_id": "05ab1c63234c4c779fbca8267b744477", "version_major": 2, "version_minor": 0 }, @@ -498,17 +538,10 @@ "metadata": {}, "output_type": "display_data" }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-06-23 14:19:45,088\tINFO actor_pool_map_operator.py:633 -- Scaling up actor pool by 4 (reason=scaling to min size, running=0, restarting=0, pending=0)\n" - ] - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f3e0d36a35444d60b6234498fa910777", + "model_id": "310392fbcf7f4b50bc122405fdfdaaac", "version_major": 2, "version_minor": 0 }, @@ -522,7 +555,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "befd7f324d954c7f8ca324b50d807239", + "model_id": "b600d27a67dc42159f5a97a445538560", "version_major": 2, "version_minor": 0 }, @@ -536,7 +569,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "39a445c091c5457ea4bfb31a88b9215b", + "model_id": "fbb9d22bf6c74df7b21af88c7363adad", "version_major": 2, "version_minor": 0 }, @@ -550,7 +583,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "dd118260e28a42e0a4325a8f5036bc85", + "model_id": "5b6f5b9affbb4f458851974ec2811594", "version_major": 2, "version_minor": 0 }, @@ -564,7 +597,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a49e0bbbbf0e424da8b3a514885c0148", + "model_id": "936041324dbb49dda5872a3e6d3fa979", 
"version_major": 2, "version_minor": 0 }, @@ -579,19 +612,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[36m(_MapWorker pid=18628, ip=10.0.102.235)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n", - "2025-06-23 14:19:57,926\tINFO actor_pool_map_operator.py:661 -- Scaled down actor pool by 1 (reason=None; running=3, restarting=0, pending=0)\n", - "2025-06-23 14:19:58,259\tINFO streaming_executor.py:227 -- ✔️ Dataset dataset_40_0 execution finished in 13.19 seconds\n", - "2025-06-23 14:19:58,573\tINFO dataset.py:4603 -- Data sink Parquet finished. 2880 rows and 5.9MB data written.\n", - "2025-06-23 14:19:58,584\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_43_0\n", - "2025-06-23 14:19:58,602\tINFO streaming_executor.py:117 -- Starting execution of Dataset dataset_43_0. Full logs are in /tmp/ray/session_2025-06-23_13-49-50_102769_2149/logs/ray-data\n", - "2025-06-23 14:19:58,603\tINFO streaming_executor.py:118 -- Execution plan of Dataset dataset_43_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)->Map(convert_to_label)] -> ActorPoolMapOperator[MapBatches(EmbedImages)] -> TaskPoolMapOperator[MapBatches(drop_columns)->Write]\n" + "\u001b[36m(MapWorker(MapBatches(EmbedImages)) pid=9215, ip=10.0.5.252)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. 
You'll still be able to use a slow processor with `use_fast=False`.\n", + "2025-08-28 05:07:20,682\tINFO streaming_executor.py:279 -- ✔️ Dataset dataset_22_0 execution finished in 16.41 seconds\n", + "2025-08-28 05:07:20,747\tINFO dataset.py:4871 -- Data sink Parquet finished. 2880 rows and 5.9MB data written.\n", + "2025-08-28 05:07:20,759\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_25_0\n", + "2025-08-28 05:07:20,774\tINFO streaming_executor.py:159 -- Starting execution of Dataset dataset_25_0. Full logs are in /tmp/ray/session_2025-08-28_04-57-43_348032_12595/logs/ray-data\n", + "2025-08-28 05:07:20,775\tINFO streaming_executor.py:160 -- Execution plan of Dataset dataset_25_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)->Map(convert_to_label)] -> ActorPoolMapOperator[MapBatches(EmbedImages)] -> TaskPoolMapOperator[MapBatches(drop_columns)->Write]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8cac399a609346a89cab141cb4bd91af", + "model_id": "cb6abfa554bd48a8a91f6bf67b72321d", "version_major": 2, "version_minor": 0 }, @@ -602,17 +634,10 @@ "metadata": {}, "output_type": "display_data" }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-06-23 14:19:58,620\tINFO actor_pool_map_operator.py:633 -- Scaling up actor pool by 4 (reason=scaling to min size, running=0, restarting=0, pending=0)\n" - ] - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "231149ef4ba34ab9bb7c0346956bfb21", + "model_id": "ebd4d9a7ce0e494cbd256531e82c76a0", "version_major": 2, "version_minor": 0 }, @@ -626,7 +651,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "50bcce7953b944eca37c79f5c024c982", + "model_id": "bbf925aa09a9434d9a0e1b0a0434a977", "version_major": 2, "version_minor": 0 }, @@ -640,7 +665,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": 
"82da80edde0546d5935e0426960a904a", + "model_id": "c061d8e4bee4444fa9d1ddfafa9f99bc", "version_major": 2, "version_minor": 0 }, @@ -654,7 +679,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1ca5f7798fd744e99513b5cdfbf144f4", + "model_id": "47cb0744205149d1860913bf2124338d", "version_major": 2, "version_minor": 0 }, @@ -668,7 +693,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5b1f7618f9e94098a64c33bf49f4d12c", + "model_id": "a68e006347144beca6b0593d148d0f8d", "version_major": 2, "version_minor": 0 }, @@ -683,11 +708,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[36m(_MapWorker pid=33082, ip=10.0.102.235)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\u001b[32m [repeated 4x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n", - "2025-06-23 14:20:07,331\tINFO actor_pool_map_operator.py:661 -- Scaled down actor pool by 1 (reason=None; running=3, restarting=0, pending=0)\n", - "2025-06-23 14:20:07,854\tINFO actor_pool_map_operator.py:661 -- Scaled down actor pool by 1 (reason=None; running=2, restarting=0, pending=0)\n", - "2025-06-23 14:20:08,323\tINFO streaming_executor.py:227 -- ✔️ Dataset dataset_43_0 execution finished in 9.72 seconds\n", - "2025-06-23 14:20:08,372\tINFO dataset.py:4603 -- Data sink Parquet finished. 720 rows and 1.5MB data written.\n" + "2025-08-28 05:07:22,417\tWARNING streaming_executor_state.py:790 -- Operator produced a RefBundle with a different schema than the previous one. 
Previous schema: image: extension>\n", + "path: string, new schema: image: extension>\n", + "path: string. This may lead to unexpected behavior.\n", + "2025-08-28 05:07:22,642\tWARNING streaming_executor_state.py:790 -- Operator produced a RefBundle with a different schema than the previous one. Previous schema: image: extension>\n", + "path: string\n", + "class: string\n", + "label: int64, new schema: image: extension>\n", + "path: string\n", + "class: string\n", + "label: int64. This may lead to unexpected behavior.\n", + "\u001b[36m(MapWorker(MapBatches(EmbedImages)) pid=23307, ip=10.0.5.252)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\u001b[32m [repeated 4x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n", + "2025-08-28 05:07:33,184\tINFO streaming_executor.py:279 -- ✔️ Dataset dataset_25_0 execution finished in 12.41 seconds\n", + "2025-08-28 05:07:33,214\tINFO dataset.py:4871 -- Data sink Parquet finished. 
720 rows and 1.5MB data written.\n" ] } ], @@ -699,7 +732,7 @@ "preprocessed_train_path = os.path.join(preprocessed_data_path, \"preprocessed_train\")\n", "preprocessed_val_path = os.path.join(preprocessed_data_path, \"preprocessed_val\")\n", "train_ds.write_parquet(preprocessed_train_path)\n", - "val_ds.write_parquet(preprocessed_val_path)" + "val_ds.write_parquet(preprocessed_val_path)\n" ] }, { @@ -738,7 +771,7 @@ "from pathlib import Path\n", "import torch\n", "import torch.nn as nn\n", - "import torch.nn.functional as F" + "import torch.nn.functional as F\n" ] }, { @@ -799,7 +832,7 @@ " with open(args_fp, \"r\") as fp:\n", " model = cls(**json.load(fp))\n", " model.load_state_dict(torch.load(state_dict_fp, map_location=device))\n", - " return model" + " return model\n" ] }, { @@ -830,7 +863,7 @@ " dropout_p=0.3, \n", " num_classes=num_classes,\n", ")\n", - "print (model)" + "print (model)\n" ] }, { @@ -853,7 +886,7 @@ "metadata": {}, "outputs": [], "source": [ - "from ray.train.torch import get_device" + "from ray.train.torch import get_device\n" ] }, { @@ -872,7 +905,7 @@ " dtype=dtypes[key],\n", " device=get_device(),\n", " )\n", - " return tensor_batch" + " return tensor_batch\n" ] }, { @@ -884,15 +917,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-06-23 14:27:26,458\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_72_0\n", - "2025-06-23 14:27:26,469\tINFO streaming_executor.py:117 -- Starting execution of Dataset dataset_72_0. 
Full logs are in /tmp/ray/session_2025-06-23_13-49-50_102769_2149/logs/ray-data\n", - "2025-06-23 14:27:26,470\tINFO streaming_executor.py:118 -- Execution plan of Dataset dataset_72_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)->Map(convert_to_label)] -> ActorPoolMapOperator[MapBatches(EmbedImages)] -> TaskPoolMapOperator[MapBatches(drop_columns)] -> LimitOperator[limit=3]\n" + "2025-08-28 05:07:34,380\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_27_0\n", + "2025-08-28 05:07:34,394\tINFO streaming_executor.py:159 -- Starting execution of Dataset dataset_27_0. Full logs are in /tmp/ray/session_2025-08-28_04-57-43_348032_12595/logs/ray-data\n", + "2025-08-28 05:07:34,395\tINFO streaming_executor.py:160 -- Execution plan of Dataset dataset_27_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)->Map(convert_to_label)] -> ActorPoolMapOperator[MapBatches(EmbedImages)] -> TaskPoolMapOperator[MapBatches(drop_columns)] -> LimitOperator[limit=3]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f7d09fbef089477397fd9c9745974185", + "model_id": "a81b12e57aba4dc3b16c2fafcb91cade", "version_major": 2, "version_minor": 0 }, @@ -903,17 +936,10 @@ "metadata": {}, "output_type": "display_data" }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-06-23 14:27:26,489\tINFO actor_pool_map_operator.py:633 -- Scaling up actor pool by 4 (reason=scaling to min size, running=0, restarting=0, pending=0)\n" - ] - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "ae6a71a193b94308be98fe3bb49e830e", + "model_id": "c176ff7a83b54b9b99fc7f5dc12e92c7", "version_major": 2, "version_minor": 0 }, @@ -927,7 +953,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "23fbb838b4f4413188a21f963216d9b3", + "model_id": 
"59f86a3defeb41de9be9874b6ae8a234", "version_major": 2, "version_minor": 0 }, @@ -941,7 +967,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "505daeea69ac49f3b0eb712b855f4dbd", + "model_id": "1acb1ef77faf42119ee3702dcaa2bcd7", "version_major": 2, "version_minor": 0 }, @@ -955,7 +981,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9acc85ac1e8e44a3b672e6bd0bb38995", + "model_id": "59526fe14e3a41cd8abb44b306127a7d", "version_major": 2, "version_minor": 0 }, @@ -969,7 +995,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "620184f23bf74c1c9af7475e1bd291e4", + "model_id": "98c20dcbec0b4cb8871a4635a3d02815", "version_major": 2, "version_minor": 0 }, @@ -983,7 +1009,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "939b699564104c0b8048c3fe78a235bc", + "model_id": "09f2cbe0afe04bc1a046e4e45a387fc1", "version_major": 2, "version_minor": 0 }, @@ -998,37 +1024,30 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[36m(_MapWorker pid=18053, ip=10.0.90.122)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n", - "2025-06-23 14:27:33,774\tINFO streaming_executor.py:227 -- ✔️ Dataset dataset_72_0 execution finished in 7.30 seconds\n", - "/tmp/ipykernel_18629/3214280880.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. 
(Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\n", + "\u001b[36m(MapWorker(MapBatches(EmbedImages)) pid=26114, ip=10.0.5.252)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n", + "2025-08-28 05:07:45,755\tINFO streaming_executor.py:279 -- ✔️ Dataset dataset_27_0 execution finished in 11.36 seconds\n", + "/tmp/ipykernel_31027/417303983.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\n", " tensor_batch[key] = torch.as_tensor(\n" ] }, { "data": { "text/plain": [ - "{'embedding': tensor([[-0.1921, 0.1182, -0.1963, ..., 0.7892, -0.2841, -0.0829],\n", - " [-0.0389, -0.1284, -0.5749, ..., 0.4360, 0.0745, -0.1555],\n", - " [-0.1139, 0.1539, -0.1519, ..., 0.8438, 0.3064, -0.1918]]),\n", - " 'label': tensor([22, 11, 33])}" + "{'embedding': tensor([[ 0.0245, 0.6505, 0.0627, ..., 0.4001, -0.2721, -0.0673],\n", + " [-0.2416, 0.2315, 0.0255, ..., 0.4065, 0.2805, -0.1156],\n", + " [-0.2301, -0.3628, 0.1086, ..., 0.3038, 0.0543, 0.6214]]),\n", + " 'label': tensor([10, 29, 27])}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[36m(autoscaler +35s)\u001b[0m Tip: use `ray status` to view detailed cluster status. 
To disable these messages, set RAY_SCHEDULER_EVENTS=0.\n" - ] } ], "source": [ "# Sample batch\n", "sample_batch = train_ds.take_batch(batch_size=3)\n", - "collate_fn(batch=sample_batch)" + "collate_fn(batch=sample_batch)\n" ] }, { @@ -1049,19 +1068,9 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[36m(autoscaler +57m1s)\u001b[0m [autoscaler] Downscaling node i-03a133888407b8cf8 (node IP: 10.0.103.152) due to node idle termination.\n", - "\u001b[36m(autoscaler +57m1s)\u001b[0m [autoscaler] Downscaling node i-06023e83fb012b7ae (node IP: 10.0.90.122) due to node idle termination.\n", - "\u001b[36m(autoscaler +57m6s)\u001b[0m [autoscaler] Cluster resized to {56 CPU, 6 GPU}.\n" - ] - } - ], + "outputs": [], "source": [ - "import shutil" + "import shutil\n" ] }, { @@ -1073,7 +1082,7 @@ "model_registry = \"/mnt/cluster_storage/mlflow/doggos\"\n", "if os.path.isdir(model_registry):\n", " shutil.rmtree(model_registry) # clean up\n", - "os.makedirs(model_registry, exist_ok=True)" + "os.makedirs(model_registry, exist_ok=True)\n" ] }, { @@ -1120,7 +1129,7 @@ " \"lr_patience\": 3,\n", " \"num_epochs\": 20,\n", " \"batch_size\": 256,\n", - "}" + "}\n" ] }, { @@ -1135,8 +1144,8 @@ " num_workers=num_workers,\n", " use_gpu=True,\n", " resources_per_worker={\"CPU\": 8, \"GPU\": 2},\n", - " accelerator_type=\"L4\",\n", - ")" + " accelerator_type=\"T4\",\n", + ")\n" ] }, { @@ -1148,7 +1157,7 @@ "import tempfile\n", "import mlflow\n", "import numpy as np\n", - "from ray.train.torch import TorchTrainer" + "from ray.train.torch import TorchTrainer\n" ] }, { @@ -1169,7 +1178,7 @@ " J.backward() # Backward pass.\n", " optimizer.step() # Update weights.\n", " loss += (J.detach().item() - loss) / (i + 1) # Cumulative loss\n", - " return loss" + " return loss\n" ] }, { @@ -1191,7 +1200,7 @@ " loss += (J - loss) / (i + 1)\n", " y_trues.extend(batch[\"label\"].cpu().numpy())\n", " 
y_preds.extend(torch.argmax(z, dim=1).cpu().numpy())\n", - " return loss, np.vstack(y_trues), np.vstack(y_preds)" + " return loss, np.vstack(y_trues), np.vstack(y_preds)\n" ] }, { @@ -1266,7 +1275,7 @@ "\n", " # End experiment tracking.\n", " if ray.train.get_context().get_world_rank() == 0:\n", - " mlflow.end_run()" + " mlflow.end_run()\n" ] }, { @@ -1282,11 +1291,20 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ray/anaconda3/lib/python3.12/site-packages/ray/data/_internal/datasource/parquet_datasource.py:750: FutureWarning: The default `file_extensions` for `read_parquet` will change from `None` to ['parquet'] after Ray 2.43, and your dataset contains files that don't match the new `file_extensions`. To maintain backwards compatibility, set `file_extensions=None` explicitly.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "# Load preprocessed datasets.\n", "preprocessed_train_ds = ray.data.read_parquet(preprocessed_train_path)\n", - "preprocessed_val_ds = ray.data.read_parquet(preprocessed_val_path)" + "preprocessed_val_ds = ray.data.read_parquet(preprocessed_val_path)\n" ] }, { @@ -1303,7 +1321,7 @@ " train_loop_config=train_loop_config,\n", " scaling_config=scaling_config,\n", " datasets={\"train\": preprocessed_train_ds, \"val\": preprocessed_val_ds},\n", - ")" + ")\n" ] }, { @@ -1313,7 +1331,7 @@ "outputs": [], "source": [ "# Train.\n", - "results = trainer.fit()" + "results = trainer.fit()\n" ] }, { @@ -1377,31 +1395,31 @@ { "data": { "text/plain": [ - "run_id c65d5aba186c4ee58bf8188493cd047c\n", - "experiment_id 477478897635232497\n", + "run_id d54aa07059384d139ea572123ae9409c\n", + "experiment_id 653138458592289747\n", "status FINISHED\n", - "artifact_uri file:///mnt/cluster_storage/mlflow/doggos/4774...\n", - "start_time 2025-06-23 14:23:03.775000+00:00\n", - "end_time 2025-06-23 14:23:21.440000+00:00\n", - "metrics.train_loss 
0.388298\n", + "artifact_uri file:///mnt/cluster_storage/mlflow/doggos/6531...\n", + "start_time 2025-08-28 05:10:15.049000+00:00\n", + "end_time 2025-08-28 05:10:33.936000+00:00\n", "metrics.lr 0.001\n", - "metrics.val_loss 0.664968\n", - "params.batch_size 256\n", - "params.num_epochs 20\n", - "params.lr 0.001\n", + "metrics.val_loss 0.778273\n", + "metrics.train_loss 0.39104\n", + "params.lr_factor 0.8\n", "params.hidden_dim 256\n", - "params.experiment_name doggos\n", - "params.dropout_p 0.3\n", "params.embedding_dim 512\n", + "params.dropout_p 0.3\n", + "params.experiment_name doggos\n", + "params.batch_size 256\n", + "params.lr 0.001\n", + "params.num_classes 36\n", + "params.class_to_label {'pomeranian': 0, 'rottweiler': 1, 'boxer': 2,...\n", + "params.num_epochs 20\n", "params.lr_patience 3\n", - "params.class_to_label {'doberman': 0, 'collie': 1, 'dingo': 2, 'pome...\n", - "params.lr_factor 0.8\n", "params.model_registry /mnt/cluster_storage/mlflow/doggos\n", - "params.num_classes 36\n", "tags.mlflow.source.name /home/ray/anaconda3/lib/python3.12/site-packag...\n", - "tags.mlflow.user ray\n", "tags.mlflow.source.type LOCAL\n", - "tags.mlflow.runName abrasive-newt-588\n", + "tags.mlflow.runName judicious-panda-916\n", + "tags.mlflow.user ray\n", "Name: 0, dtype: object" ] }, @@ -1417,7 +1435,7 @@ " experiment_names=[experiment_name], \n", " order_by=[\"metrics.val_loss ASC\"])\n", "best_run = sorted_runs.iloc[0]\n", - "best_run" + "best_run\n" ] }, { @@ -1440,19 +1458,28 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Output\n", + "(anyscale +0.8s) Submitting job with config JobConfig(name='train-image-model', image_uri='anyscale/ray:2.48.0-slim-py312-cu128', compute_config=None, env_vars=None, py_modules=['/home/ray/default/doggos'], py_executable=None, cloud=None, project=None, ray_version=None, 
job_queue_config=None).\n", + "(anyscale +3.0s) Uploading local dir '/home/ray/default' to cloud storage.\n", + "(anyscale +3.8s) Uploading local dir '/home/ray/default/doggos' to cloud storage.\n", + "(anyscale +4.9s) Job 'train-image-model' submitted, ID: 'prodjob_zfy5ak9a5masjb4vuidtxvxpqt'.\n", + "(anyscale +4.9s) View the job in the UI: https://console.anyscale.com/jobs/prodjob_zfy5ak9a5masjb4vuidtxvxpqt\n", + "(anyscale +4.9s) Use `--wait` to wait for the job to run and stream logs.\n" + ] + } + ], "source": [ - "```bash\n", - "# Production batch job.\n", - "anyscale job submit --name=train-doggos-model \\\n", - " --containerfile=\"/home/ray/default/containerfile\" \\\n", - " --compute-config=\"/home/ray/default/configs/aws.yaml\" \\\n", - " --working-dir=\"/home/ray/default\" \\\n", - " --exclude=\"\" \\\n", - " --max-retries=0 \\\n", - " -- python doggos/train.py\n", - "```" + "%%bash\n", + "# Production model training job\n", + "anyscale job submit -f /home/ray/default/configs/train_model.yaml\n" ] }, { @@ -1483,7 +1510,7 @@ "outputs": [], "source": [ "from urllib.parse import urlparse\n", - "from sklearn.metrics import multilabel_confusion_matrix" + "from sklearn.metrics import multilabel_confusion_matrix\n" ] }, { @@ -1524,7 +1551,7 @@ " args_fp=os.path.join(artifacts_dir, \"args.json\"), \n", " state_dict_fp=os.path.join(artifacts_dir, \"model.pt\"),\n", " )\n", - " return cls(preprocessor=preprocessor, model=model)" + " return cls(preprocessor=preprocessor, model=model)\n" ] }, { @@ -1538,7 +1565,7 @@ "predictor = TorchPredictor.from_artifacts_dir(artifacts_dir=artifacts_dir)\n", "test_ds = ray.data.read_images(\"s3://doggos-dataset/test\", include_paths=True)\n", "test_ds = test_ds.map(add_class)\n", - "test_ds = predictor.preprocessor.transform(ds=test_ds)" + "test_ds = predictor.preprocessor.transform(ds=test_ds)\n" ] }, { @@ -1550,15 +1577,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-06-23 14:25:17,471\tINFO logging.py:295 -- 
Registered dataset logger for dataset dataset_56_0\n", - "2025-06-23 14:25:17,483\tINFO streaming_executor.py:117 -- Starting execution of Dataset dataset_56_0. Full logs are in /tmp/ray/session_2025-06-23_13-49-50_102769_2149/logs/ray-data\n", - "2025-06-23 14:25:17,484\tINFO streaming_executor.py:118 -- Execution plan of Dataset dataset_56_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)->Map(convert_to_label)] -> ActorPoolMapOperator[MapBatches(EmbedImages)] -> TaskPoolMapOperator[MapBatches(drop_columns)] -> TaskPoolMapOperator[MapBatches(TorchPredictor)] -> LimitOperator[limit=1]\n" + "2025-08-28 05:10:42,369\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_40_0\n", + "2025-08-28 05:10:42,388\tINFO streaming_executor.py:159 -- Starting execution of Dataset dataset_40_0. Full logs are in /tmp/ray/session_2025-08-28_04-57-43_348032_12595/logs/ray-data\n", + "2025-08-28 05:10:42,388\tINFO streaming_executor.py:160 -- Execution plan of Dataset dataset_40_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)->Map(convert_to_label)] -> ActorPoolMapOperator[MapBatches(EmbedImages)] -> TaskPoolMapOperator[MapBatches(drop_columns)] -> TaskPoolMapOperator[MapBatches(TorchPredictor)] -> LimitOperator[limit=1]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9b9a801dfa75419b9f307a00a3397677", + "model_id": "9c8deb98ca3d40cd8aea0fdaaa3abadc", "version_major": 2, "version_minor": 0 }, @@ -1569,17 +1596,10 @@ "metadata": {}, "output_type": "display_data" }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-06-23 14:25:17,504\tINFO actor_pool_map_operator.py:633 -- Scaling up actor pool by 4 (reason=scaling to min size, running=0, restarting=0, pending=0)\n" - ] - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": 
"9354ed2fc7644cb7bacb97aa620d76fa", + "model_id": "34c194e15c044a308d3d89e3c99414be", "version_major": 2, "version_minor": 0 }, @@ -1593,7 +1613,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1980c5b233994a82b79c1b5853333de4", + "model_id": "7c6efd5cc49744a594a647449d67e6c5", "version_major": 2, "version_minor": 0 }, @@ -1607,7 +1627,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a78cd2dd17df4f72b3aa28f40e36a04b", + "model_id": "e9edf72e22d64cf6adaeda87459d0c0b", "version_major": 2, "version_minor": 0 }, @@ -1621,7 +1641,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9a240953f8a6401cb9e060439e3c7432", + "model_id": "1fd89ff72f0e42689114fc21b8748658", "version_major": 2, "version_minor": 0 }, @@ -1635,7 +1655,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c1ead75a9a74448fb96776643b93b769", + "model_id": "984b6bbdeaee4207aaa18b88cbaa2691", "version_major": 2, "version_minor": 0 }, @@ -1649,7 +1669,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "17d189a8c3534c7fbea57d6b4680337c", + "model_id": "6dca7e9346d34396b3d758f7e8eb34f6", "version_major": 2, "version_minor": 0 }, @@ -1663,7 +1683,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "969c3c3cc23946238bae3b1682aa2ade", + "model_id": "cd539308a7ef471f927d664745cebb49", "version_major": 2, "version_minor": 0 }, @@ -1678,148 +1698,148 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[36m(_MapWorker pid=41895, ip=10.0.102.235)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. 
You'll still be able to use a slow processor with `use_fast=False`.\n", - "\u001b[36m(MapBatches(TorchPredictor) pid=7131, ip=10.0.90.122)\u001b[0m /tmp/ipykernel_14938/3214280880.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\n", - "\u001b[36m(_MapWorker pid=6304, ip=10.0.90.122)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\u001b[32m [repeated 3x across cluster]\u001b[0m\n", - "2025-06-23 14:25:31,572\tINFO streaming_executor.py:227 -- ✔️ Dataset dataset_56_0 execution finished in 14.08 seconds\n" + "\u001b[36m(MapWorker(MapBatches(EmbedImages)) pid=33395, ip=10.0.5.252)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n", + "\u001b[36m(MapBatches(TorchPredictor) pid=34104, ip=10.0.5.252)\u001b[0m /tmp/ipykernel_31027/417303983.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. 
This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\n", + "\u001b[36m(MapWorker(MapBatches(EmbedImages)) pid=6674, ip=10.0.5.20)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\u001b[32m [repeated 3x across cluster]\u001b[0m\n", + "2025-08-28 05:10:59,374\tINFO streaming_executor.py:279 -- ✔️ Dataset dataset_40_0 execution finished in 16.98 seconds\n" ] }, { "data": { "text/plain": [ - "[{'path': 'doggos-dataset/test/basset/basset_10288.jpg',\n", + "[{'path': 'doggos-dataset/test/basset/basset_10005.jpg',\n", " 'class': 'basset',\n", - " 'label': 26,\n", - " 'embedding': array([-1.04914151e-01, -2.44789988e-01, -9.95982289e-02, 1.35369569e-01,\n", - " -5.52587211e-02, -5.80722839e-02, 1.91796571e-01, 1.56359702e-01,\n", - " -6.07913733e-01, 2.08769619e-01, -3.80898006e-02, -1.11314066e-01,\n", - " -1.96144834e-01, -6.14988208e-02, 5.18053114e-01, 2.08482340e-01,\n", - " 1.18680000e+00, 2.00228021e-01, -2.38505289e-01, 7.44116083e-02,\n", - " -1.17921010e-01, 1.65986642e-02, 4.06986564e-01, 1.73043087e-02,\n", - " -7.19358325e-02, -2.49894068e-01, 5.69958836e-02, -2.07780451e-02,\n", - " -2.98084527e-01, -1.49073690e-01, 2.44870782e-02, 4.86774921e-01,\n", - " 3.78374428e-01, -2.37518042e-01, 1.26714706e-01, 1.10405624e-01,\n", - " 1.23483673e-01, -2.53296018e-01, -1.41814440e-01, 1.88360083e+00,\n", - " -4.67942834e-01, -1.71202213e-01, 2.93785512e-01, 9.53243077e-02,\n", - " -1.08036891e-01, -1.05388820e+00, 2.12952226e-01, 3.43122423e-01,\n", - " -9.08568352e-02, -6.02110699e-02, 1.57682300e-02, 1.13998428e-01,\n", - " -9.61582065e-02, 1.91040933e-01, 3.62998173e-02, -1.67396963e-02,\n", - " 
4.08946127e-01, 4.58516389e-01, -4.09091681e-01, -3.85877311e-01,\n", - " 9.77702141e-01, -1.69139802e-02, 1.93179488e-01, 1.36374593e-01,\n", - " -2.66537070e-01, -6.00859582e-01, -5.44146113e-02, 1.52056739e-01,\n", - " -2.88875699e-01, 2.30367318e-01, 6.66391551e-02, -3.48750651e-01,\n", - " 1.32896990e-01, 2.43517846e-01, -3.36779654e-03, 2.86127269e-01,\n", - " -3.56745601e-01, -1.14945844e-01, 1.51565939e-01, 4.90366817e-02,\n", - " 7.63746500e-02, -2.27382034e-02, 2.54388422e-01, -5.34341276e-01,\n", - " 3.07917655e-01, 4.43625525e-02, 3.23391706e-02, -3.16016555e-01,\n", - " 3.49402249e-01, 1.40896916e-01, -3.93401146e-01, -6.98464215e-01,\n", - " -7.05318165e+00, -9.64104384e-02, -1.29345521e-01, 1.01153195e-01,\n", - " 1.66721642e-03, 2.46858150e-01, -6.62657797e-01, 8.84700537e-01,\n", - " -2.41105676e-01, -1.67729586e-01, -2.76175410e-01, -1.06329188e-01,\n", - " 4.68529433e-01, -2.96109051e-01, 5.00090122e-01, -1.51693597e-02,\n", - " 1.84735969e-01, -4.76171166e-01, 2.78874516e-01, -7.43267417e-01,\n", - " 3.29548061e-01, 9.67882574e-03, -2.46126920e-01, -2.13637024e-01,\n", - " -5.42725086e-01, 3.51180196e-01, -2.11806729e-01, 3.27730656e-01,\n", - " 1.95189789e-01, 1.26086920e-01, 6.48027122e-01, 2.56954640e-01,\n", - " 4.22701418e-01, -2.30529577e-01, -1.10486835e-01, -1.01444468e-01,\n", - " 7.89555907e-03, -2.47240350e-01, 1.73558876e-01, 3.03944647e-01,\n", - " -5.77825531e-02, 9.45507646e-01, -4.95145559e-01, 2.86680222e-01,\n", - " -7.24357292e-02, -8.29979897e-01, 4.94338155e-01, 2.54262447e-01,\n", - " 2.29299828e-01, -2.25470066e-02, 5.62191963e-01, 3.00550222e-01,\n", - " -2.83117369e-02, 3.84202749e-01, 2.89719075e-01, 3.54923964e-01,\n", - " 2.66314894e-01, -3.58392656e-01, -3.72334182e-01, 5.86691260e-01,\n", - " -1.24578431e-01, -4.04101044e-01, -5.07451952e-01, 5.48313916e-01,\n", - " -3.14691275e-01, -1.80745274e-01, 2.89481759e-01, 5.75179756e-02,\n", - " -1.80967286e-01, 9.15101022e-02, 4.65520680e-01, 7.72555918e-02,\n", - " 
2.23801851e-01, -1.68022275e-01, 1.34750500e-01, 2.97952116e-01,\n", - " 2.26987794e-01, 3.05612266e-01, 8.25502351e-02, 1.27266854e-01,\n", - " 4.45461750e-01, 4.75219965e-01, 2.56610662e-02, -4.94095474e-01,\n", - " 6.80846751e-01, 6.35496229e-02, 2.54889160e-01, -1.44209296e-01,\n", - " -5.48627734e-01, 3.29704136e-02, 4.15674299e-02, -2.43748799e-02,\n", - " -2.19443023e-01, -1.42820716e-01, -2.50694096e-01, -2.07656205e-01,\n", - " -1.79199561e-01, 3.50940913e-01, 6.33473039e-01, 3.80550534e-01,\n", - " -2.89176375e-01, 2.02112049e-01, -4.48559523e-01, 2.72922575e-01,\n", - " 2.24376589e-01, -2.83806473e-01, -4.37651068e-01, -9.45880890e-01,\n", - " 1.22266248e-01, 4.01376486e-02, 3.55452418e-01, 2.14725018e-01,\n", - " -3.82868618e-01, -3.58605623e-01, 1.33403972e-01, 3.17366868e-02,\n", - " 8.55787545e-02, 8.59863982e-02, 9.54705626e-02, -3.47019404e-01,\n", - " -7.17684031e-02, 2.91243881e-01, 2.65088528e-01, -9.42258835e-02,\n", - " -1.77515849e-01, 2.28757620e-01, 9.07460928e-01, -1.03129521e-01,\n", - " 7.33332276e-01, 2.64944017e-01, -1.47793442e-01, 3.05287898e-01,\n", - " -2.62915194e-01, 1.97677180e-01, 6.06525466e-02, -1.16444737e-01,\n", - " 7.31713697e-03, 1.67819709e-01, 9.79746133e-02, 1.47581011e-01,\n", - " -4.00336832e-01, 4.21648145e-01, -8.30136314e-02, -6.39808178e-01,\n", - " -1.41640380e-01, 4.65202779e-02, 7.18399584e-02, -4.38913584e-01,\n", - " 2.07775518e-01, 4.70566414e-02, -8.90242606e-02, -4.53150421e-01,\n", - " -2.14878619e-01, 2.44945884e-01, 3.16962540e-01, -3.41699839e-01,\n", - " -1.91379115e-01, -2.09521651e-02, 2.30608553e-01, 3.33673239e-01,\n", - " 2.77272910e-01, -2.96298712e-01, 1.22105137e-01, -2.16433048e-01,\n", - " 5.48319101e-01, 2.72968113e-01, 1.73093528e-01, 1.80758208e-01,\n", - " -3.40644240e-01, 2.62541264e-01, 1.24807566e-01, -7.05128908e-01,\n", - " -1.10303462e-02, -1.81341395e-01, -1.78187087e-01, 1.32017612e-01,\n", - " -4.31975611e-02, 3.50797176e-03, 1.59508839e-01, 9.21480432e-02,\n", - " 
4.54917192e-01, 2.72805333e-01, -5.77595115e-01, -2.87324011e-01,\n", - " 1.66138291e-01, 8.66501480e-02, 9.02174413e-03, -3.78495932e-01,\n", - " -3.07204783e-01, 1.98499486e-02, -2.17410654e-01, -3.29564735e-02,\n", - " -9.36664641e-03, 1.02078244e-01, -5.64144492e-01, 2.59325683e-01,\n", - " -1.29754335e-01, 1.67371452e-01, 3.65311772e-01, 1.91542730e-02,\n", - " -1.80281848e-01, -1.50442168e-01, 3.04976612e-01, 3.71464863e-02,\n", - " 1.42819434e-02, 1.84083462e-01, 2.46860430e-01, 1.05640769e-01,\n", - " 4.84380722e-02, -3.53347808e-02, -4.98287007e-02, 2.02643886e-01,\n", - " -1.73173457e-01, -3.63763243e-01, -2.20462531e-01, 3.16181600e-01,\n", - " 6.26130402e-02, 7.24823922e-02, -1.47105128e-01, 3.08875024e-01,\n", - " 9.42751825e-01, 1.98151171e-02, -1.21707544e-02, -2.04986826e-01,\n", - " 2.55928785e-01, -9.34749842e-02, -1.57368124e-01, -9.39193606e-01,\n", - " 7.99043655e-01, 7.17637539e-01, -3.75674933e-01, 5.69818616e-01,\n", - " -1.33306235e-02, 5.30459285e-01, -5.34143746e-01, 2.46586412e-01,\n", - " -1.07142270e-01, 3.60272974e-02, -2.97878295e-01, -4.83343840e-01,\n", - " 6.04178667e-01, -5.00948548e-01, 3.49492311e-01, 2.63357386e-02,\n", - " 9.19313729e-02, 4.02335197e-01, 1.58837855e-01, -6.79962993e-01,\n", - " -2.58434951e-01, -4.40313041e-01, 3.03083509e-01, 3.24987084e-01,\n", - " 5.39690614e-01, 5.20520747e-01, 4.50525880e-01, 4.25642878e-01,\n", - " -3.66918445e-01, 3.89405370e-01, -1.27459884e+00, 1.07019678e-01,\n", - " -2.60990173e-01, -1.43924609e-01, 7.54836053e-02, 9.26972032e-01,\n", - " 3.27434987e-01, -1.17758155e+00, 1.98659331e-01, -2.22037435e-02,\n", - " 7.09707081e-01, 2.66087234e-01, 1.21972881e-01, 3.83028030e-01,\n", - " -7.28927612e-01, 2.53533423e-01, -4.85364050e-01, -2.49552578e-01,\n", - " -6.45122454e-02, -7.29703009e-01, 4.32397306e-01, 2.20177278e-01,\n", - " 2.00846434e-01, -9.86097157e-02, -1.90976754e-01, 2.79123753e-01,\n", - " 1.66312551e+00, 4.78211313e-01, -2.51018330e-02, 2.72021592e-01,\n", - " 
7.38141775e-01, -1.70819223e-01, 8.71482790e-02, 5.43940544e-01,\n", - " 1.69077605e-01, -3.87216598e-01, -2.42075190e-01, 2.69218534e-01,\n", - " 3.44690025e-01, -8.90391588e-01, -7.69253790e-01, -3.58836114e-01,\n", - " 5.44936597e-01, -5.26414633e-01, -7.02109337e-02, -9.80197862e-02,\n", - " 1.44381337e-02, 2.74508834e-01, -2.26176381e-01, -4.58218932e-01,\n", - " -1.67408079e-01, 9.71819162e-02, -4.52373654e-01, 2.12075204e-01,\n", - " 3.00378114e-01, -4.85782117e-01, -8.94452184e-02, -3.76136094e-01,\n", - " 6.35548115e-01, -5.96615791e-01, 4.56892580e-01, 8.58041495e-02,\n", - " -4.65728045e-01, 2.77835429e-02, 3.81691009e-02, -2.30244100e-01,\n", - " 2.88146824e-01, 4.18678313e-01, 2.95979947e-01, -3.73036146e-01,\n", - " 2.28022650e-01, 3.33540946e-01, -1.05593085e-01, -3.15681905e-01,\n", - " -1.58446252e-01, -1.87164396e-01, -2.52391577e-01, -2.95362055e-01,\n", - " 8.43314469e-01, 1.14071526e-01, -2.23938376e-02, 1.09957650e-01,\n", - " -3.88728201e-01, 1.39827147e-01, 2.20899284e-03, -1.90839812e-01,\n", - " -9.09137726e-01, 1.57145649e-01, -1.39061660e-02, -2.81439349e-02,\n", - " 1.31379187e-01, 1.93342119e-02, -3.97078514e-01, 4.37840447e-02,\n", - " 5.70612431e-01, -3.71424943e-01, 1.27987966e-01, -1.53837383e-01,\n", - " -1.62056446e-01, -2.61603892e-02, -9.74950790e-01, -2.85338938e-01,\n", - " 1.48266554e-06, -5.19999146e-01, -1.39436916e-01, -1.61675125e-01,\n", - " 2.82035142e-01, 5.65708935e-01, 1.78672537e-01, 2.84627140e-01,\n", - " -1.29202381e-02, -5.35536408e-01, 6.67068288e-02, 1.26034901e-01,\n", - " 4.77381468e-01, 4.13616210e-01, -8.82375419e-01, 2.16037527e-01,\n", - " -7.70060718e-03, -1.17288813e-01, 3.86771172e-01, 3.40055674e-01,\n", - " -3.02813143e-01, -2.90828168e-01, -4.41879481e-01, -3.02490562e-01,\n", - " 1.14623025e-01, 5.78140691e-02, -5.26804924e-01, -1.41756445e-01,\n", - " 2.43902951e-03, 6.49944693e-02, -2.29362592e-01, -5.48198938e-01,\n", - " -7.99068272e-01, -3.52486148e-02, 4.28467467e-02, -5.25768399e-01,\n", - 
" 1.63442969e-01, -2.11263120e-01, -6.78404570e-02, -2.00107336e-01,\n", - " 4.71601546e-01, -4.66121018e-01, 2.91595191e-01, -5.46462014e-02,\n", - " -5.07597744e-01, 6.30303860e-01, -7.32594371e-01, 1.00498527e-01,\n", - " -7.07668364e-01, -8.52217302e-02, -5.60935438e-02, -1.76870823e-03,\n", - " 3.38252485e-01, -1.68113291e-01, -1.64995581e-01, 1.30709872e-01,\n", - " -9.02270138e-01, 1.71258092e-01, -5.64923435e-02, -2.03939527e-01],\n", + " 'label': 30,\n", + " 'embedding': array([ 8.86104554e-02, -5.89382686e-02, 1.15464866e-01, 2.15815112e-01,\n", + " -3.43266308e-01, -3.35150540e-01, 1.48883224e-01, -1.02369718e-01,\n", + " -1.69915810e-01, 4.34856862e-03, 2.41593361e-01, 1.79200619e-01,\n", + " 4.34402555e-01, 4.59785998e-01, 1.59284808e-02, 4.16959971e-01,\n", + " 5.20779848e-01, 1.86366066e-01, -3.43496174e-01, -4.00813907e-01,\n", + " -1.15213782e-01, -3.04853529e-01, 1.77998394e-01, 1.82090014e-01,\n", + " -3.56360346e-01, -2.30711952e-01, 1.69025257e-01, 3.78455579e-01,\n", + " 8.37044120e-02, -4.81875241e-02, 3.17967087e-01, -1.40099749e-01,\n", + " -2.15949178e-01, -4.72761095e-01, -3.01893711e-01, 7.59940967e-02,\n", + " -2.64865339e-01, 5.89084566e-01, -3.75831634e-01, 3.11807573e-01,\n", + " -3.82964134e-01, -1.86417520e-01, 1.07007243e-01, 4.81416702e-01,\n", + " -3.70819569e-01, 9.12090182e-01, 3.13470632e-01, -3.69494259e-02,\n", + " -2.21142501e-01, 3.32214013e-02, 8.51379186e-02, 3.64337176e-01,\n", + " -3.90754700e-01, 4.39904258e-02, 5.39945886e-02, -5.02359867e-01,\n", + " -4.76054996e-02, 3.87604594e-01, -3.71239424e-01, -8.79095644e-02,\n", + " 5.62141061e-01, 1.96927994e-01, 3.54419112e-01, -6.80974126e-03,\n", + " 2.86425143e-01, -3.24660867e-01, -4.56204057e-01, 6.41017914e-01,\n", + " -1.67037442e-01, -2.29641497e-01, 4.71122622e-01, 5.03865302e-01,\n", + " -9.06585157e-03, -1.23926058e-01, -3.32888782e-01, 1.59683321e-02,\n", + " -5.00816345e-01, -3.53796408e-02, -1.60535276e-01, -2.88702995e-01,\n", + " 5.51706925e-02, 
-3.47863048e-01, -3.01085338e-02, -6.00592375e-01,\n", + " 2.04530790e-01, -1.17298350e-01, 8.88321698e-01, -3.18641007e-01,\n", + " 2.02193573e-01, -1.50856599e-01, -2.96603352e-01, -5.45758486e-01,\n", + " -7.55531311e+00, -3.07271361e-01, -7.33374238e-01, 2.76708573e-01,\n", + " -3.76666151e-02, -4.25825119e-01, -5.56892097e-01, 7.15545475e-01,\n", + " 1.02834240e-01, -1.19939610e-01, 1.94998607e-01, -2.46950224e-01,\n", + " 2.61530429e-01, -4.19263542e-01, 1.31001920e-01, -2.49398082e-01,\n", + " -3.26750994e-01, -3.92482489e-01, 3.30219358e-01, -5.78646958e-01,\n", + " 1.53134540e-01, -3.10127169e-01, -3.67199332e-01, -7.94161111e-02,\n", + " -2.93402106e-01, 2.62198240e-01, 2.91103810e-01, 1.32868871e-01,\n", + " -5.78317158e-02, -4.26885992e-01, 2.99195677e-01, 4.23972368e-01,\n", + " 2.30407149e-01, -2.98300147e-01, -1.55886114e-01, -1.24661736e-01,\n", + " -1.17139973e-01, -4.21351314e-01, -1.45010501e-02, -3.06388348e-01,\n", + " 2.89572328e-01, 9.73405361e-01, -5.52814901e-01, 2.36222595e-01,\n", + " -2.13898420e-01, -1.00043082e+00, -3.57041806e-01, -1.50843680e-01,\n", + " 4.69288528e-02, 2.08646134e-01, -2.70194232e-01, 2.63797104e-01,\n", + " 1.31332219e-01, 2.82329589e-01, 2.69341841e-02, -1.21627375e-01,\n", + " 3.80910456e-01, 2.65330970e-01, -3.01948935e-01, -6.39178753e-02,\n", + " -3.13922286e-01, -4.14075851e-01, -2.19056532e-01, 2.22424790e-01,\n", + " 8.13730657e-02, -3.03519934e-01, 9.32400897e-02, -3.76873404e-01,\n", + " 8.34950879e-02, 1.01878762e-01, 2.87054926e-01, 2.09415853e-02,\n", + " -1.22204229e-01, 1.64302550e-02, -2.41174936e-01, 1.78844824e-01,\n", + " 9.15416703e-03, 1.66462481e-01, -1.45732313e-01, -5.85511327e-04,\n", + " 2.25536823e-01, 3.30472469e-01, -1.25101686e-01, 1.13093004e-01,\n", + " 1.52094781e-01, 4.37459409e-01, 3.22061956e-01, 1.37893021e-01,\n", + " -2.53650725e-01, -1.94988877e-01, -2.72130489e-01, -2.57504702e-01,\n", + " 1.92389667e-01, -2.07393348e-01, 1.73574477e-01, 2.59756446e-02,\n", + " 
2.20320046e-01, 6.48344308e-02, 3.96853566e-01, 1.11773282e-01,\n", + " -4.38930988e-01, -5.10937572e-02, 5.92644155e-01, 6.10140711e-03,\n", + " -3.97206768e-02, 7.65584633e-02, -7.68468618e-01, 1.23042464e-01,\n", + " 3.48037392e-01, 1.49242997e-01, 2.86662281e-02, 2.79642552e-01,\n", + " -2.26151049e-01, -6.73239648e-01, -8.07924390e-01, 8.62701386e-02,\n", + " 4.94999364e-02, 1.61207989e-02, -1.30242959e-01, 1.77768275e-01,\n", + " 3.62961054e-01, -3.20745975e-01, 3.67820978e-01, -9.77848917e-02,\n", + " -2.64019221e-01, 6.74475431e-01, 9.26629007e-01, -4.54470068e-02,\n", + " 9.59405363e-01, 3.02993000e-01, -5.81385851e-01, 3.98850322e-01,\n", + " 7.40434751e-02, 1.79926023e-01, 9.12196040e-02, 2.77938917e-02,\n", + " -2.20950916e-02, -1.98561847e-01, -4.33019698e-01, 1.35872006e-01,\n", + " -3.84440348e-02, 1.63487554e-01, 5.38927615e-02, 8.52212310e-01,\n", + " -8.64772916e-01, -3.00439209e-01, 1.66039094e-02, -4.84181255e-01,\n", + " -2.57156193e-01, 4.46582437e-01, 3.71635705e-02, -7.58354291e-02,\n", + " -1.38248950e-02, 1.01295078e+00, 2.14489758e-01, -1.17217854e-01,\n", + " -2.82662451e-01, 7.08411038e-01, 2.08262652e-01, -1.69240460e-02,\n", + " 1.02334268e-01, 4.20059741e-01, 1.07706316e-01, -3.89203757e-01,\n", + " -5.91410846e-02, -1.77690476e-01, -1.26772380e+00, 1.75859511e-01,\n", + " -2.49499828e-01, 1.60166726e-01, 8.72884393e-02, -4.53421593e-01,\n", + " 1.96858853e-01, -2.25365251e-01, -1.31235719e-02, -4.58204031e-01,\n", + " -1.54087022e-01, -1.87472761e-01, 2.73187131e-01, 4.14693624e-01,\n", + " 6.00348413e-01, 5.16499318e-02, -2.52319247e-01, -2.08351701e-01,\n", + " -3.85643661e-01, -6.44139796e-02, -2.70672083e-01, -5.09124994e-02,\n", + " -1.17392734e-01, -1.16136428e-02, -1.69710606e-01, 2.30101690e-01,\n", + " -6.31506741e-02, 2.20495850e-01, 4.81231391e-01, 3.76428038e-01,\n", + " -2.14597031e-01, -4.70009223e-02, 4.38644290e-01, 2.72557199e-01,\n", + " -1.89499091e-02, 6.36664629e-02, -4.86765429e-02, -6.02428794e-01,\n", + " 
5.40002957e-02, -9.60005671e-02, 4.63560931e-02, -3.55034113e-01,\n", + " 2.27724269e-01, -1.30642965e-01, -5.17771959e-01, 7.08835796e-02,\n", + " -2.57462114e-01, -4.82860744e-01, 1.13421358e-01, 9.88648832e-02,\n", + " 6.21988237e-01, 2.64641732e-01, -9.67874378e-03, 1.94528699e-01,\n", + " 9.72453296e-01, -4.36969042e-01, -5.50681949e-02, 1.42934144e-01,\n", + " 1.37221038e-01, 5.63952804e-01, -3.20022464e-01, -5.56031644e-01,\n", + " 9.09894407e-01, 1.02216589e+00, -2.79887915e-01, 1.69066399e-01,\n", + " 6.48921371e-01, 1.68456510e-02, -2.58911937e-01, 4.62736428e-01,\n", + " 8.00172612e-03, 1.66315883e-01, -5.30062854e-01, -3.96020412e-01,\n", + " 4.43380117e-01, -4.35658276e-01, -1.11912012e-01, -5.91614306e-01,\n", + " -7.02220649e-02, 1.41544282e-01, -5.65246567e-02, -1.19229007e+00,\n", + " -1.00026041e-01, 1.35173336e-01, -1.37986809e-01, 4.58395988e-01,\n", + " 2.99769610e-01, 1.13845997e-01, -3.23149785e-02, 4.82394725e-01,\n", + " -6.13934547e-03, 3.68614852e-01, -4.91497517e-01, -4.97332066e-01,\n", + " 8.73729736e-02, 3.60586494e-01, -2.91166097e-01, 1.89481646e-01,\n", + " 2.87948608e-01, 1.90306157e-01, 4.15048778e-01, 3.93784940e-01,\n", + " 6.75817132e-02, 1.18251920e-01, 2.03508779e-01, 3.09830695e-01,\n", + " -1.03927016e+00, 1.00612268e-01, -3.46988708e-01, -7.09752440e-01,\n", + " 2.20241398e-01, -3.74946982e-01, -1.48783788e-01, -1.31232068e-01,\n", + " 3.87498319e-01, 1.67044029e-01, -2.79640555e-01, 3.40543866e-01,\n", + " 1.28378880e+00, 4.47215438e-01, -5.00054121e-01, 6.85076341e-02,\n", + " 1.93691164e-01, -4.66935217e-01, -3.24348718e-01, 4.53348368e-01,\n", + " 6.36629641e-01, -5.52294970e-01, -3.59640062e-01, 2.45728597e-01,\n", + " 4.48195577e-01, -1.36022663e+00, -6.26060665e-01, -4.96963590e-01,\n", + " -2.55071461e-01, -2.31453001e-01, -4.22013104e-01, 5.81141561e-02,\n", + " 1.66424632e-01, -1.81557357e-01, -2.85358205e-02, -1.10628068e+00,\n", + " -2.42026821e-01, -4.49676067e-03, 5.53836450e-02, 4.92810488e-01,\n", + " 
5.83105981e-01, 6.97781667e-02, -1.33217961e-01, -1.25093237e-01,\n", + " 1.17499933e-01, -5.19634366e-01, 1.42042309e-01, 2.34404474e-01,\n", + " -2.55929470e-01, 3.23758684e-02, -2.34450802e-01, -7.54091814e-02,\n", + " 1.83672294e-01, -2.25883007e-01, -4.76478487e-02, -4.84889567e-01,\n", + " 1.12959743e-03, 1.80705532e-01, -5.87785244e-02, 4.82457250e-01,\n", + " -1.88920692e-01, 1.47517592e-01, 1.10182568e-01, -2.28278339e-02,\n", + " 8.62778306e-01, 4.46689427e-02, 4.16403189e-02, -1.07179873e-01,\n", + " -1.42522454e+00, -2.31161788e-02, 3.05959303e-02, -6.58722073e-02,\n", + " -3.69132429e-01, 3.49290550e-01, -1.39178723e-01, -3.51127565e-01,\n", + " 5.00785351e-01, 2.31236637e-01, 6.77590072e-02, -3.59323025e-02,\n", + " 2.69076526e-01, -3.60533416e-01, 1.48107335e-01, -1.11518174e-01,\n", + " 1.65307403e-01, -1.74086124e-01, 6.01880312e-01, -5.95235109e-01,\n", + " 5.29538319e-02, 3.12422097e-01, -1.14403330e-01, 2.30422497e-01,\n", + " -9.48345065e-02, 3.76421027e-02, 4.77573276e-02, 3.89954895e-01,\n", + " -1.91829026e-01, -6.26232028e-01, 1.29549801e-01, -2.84714490e-01,\n", + " 2.88834363e-01, 6.25569642e-01, -2.44193405e-01, 3.08956832e-01,\n", + " -4.79587227e-01, 1.59115836e-01, -1.07442781e-01, 1.57203451e-01,\n", + " -8.51369202e-02, -1.20136715e-01, -2.91232206e-02, 1.08408488e-01,\n", + " -5.97195402e-02, -1.21715315e-01, -5.79822421e-01, 3.90639007e-01,\n", + " -2.83878148e-01, -2.72939146e-01, 3.87672335e-04, -2.62640566e-01,\n", + " -1.67415068e-01, 1.97720259e-01, 3.60535234e-01, -1.85247302e-01,\n", + " -2.80813038e-01, 3.32875013e-01, -3.98125350e-01, -3.53022516e-02,\n", + " 5.48863769e-01, -1.35882646e-01, 2.50048220e-01, -1.27448589e-01,\n", + " -3.03174406e-01, 3.85489166e-02, -7.27320850e-01, 5.22592783e-01,\n", + " -1.97360516e-01, -1.98229402e-01, -1.42074719e-01, 4.11824808e-02,\n", + " -2.92105675e-01, 2.07964912e-01, 4.97746691e-02, 1.48062438e-01,\n", + " -2.94304550e-01, 7.31720269e-01, 1.14105418e-02, 5.50758056e-02],\n", " 
dtype=float32),\n", - " 'prediction': 26}]" + " 'prediction': 8}]" ] }, "execution_count": null, @@ -1834,9 +1854,9 @@ " concurrency=4,\n", " batch_size=64,\n", " num_gpus=1,\n", - " accelerator_type=\"L4\",\n", + " accelerator_type=\"T4\",\n", ")\n", - "pred_ds.take(1)" + "pred_ds.take(1)\n" ] }, { @@ -1867,21 +1887,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-06-23 14:25:31,814\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_59_0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-06-23 14:25:31,828\tINFO streaming_executor.py:117 -- Starting execution of Dataset dataset_59_0. Full logs are in /tmp/ray/session_2025-06-23_13-49-50_102769_2149/logs/ray-data\n", - "2025-06-23 14:25:31,829\tINFO streaming_executor.py:118 -- Execution plan of Dataset dataset_59_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)->Map(convert_to_label)] -> ActorPoolMapOperator[MapBatches(EmbedImages)] -> TaskPoolMapOperator[MapBatches(drop_columns)] -> TaskPoolMapOperator[MapBatches(TorchPredictor)] -> TaskPoolMapOperator[MapBatches(batch_metric)] -> AllToAllOperator[Aggregate] -> LimitOperator[limit=1]\n" + "2025-08-28 05:10:59,627\tINFO logging.py:295 -- Registered dataset logger for dataset dataset_43_0\n", + "2025-08-28 05:10:59,639\tINFO streaming_executor.py:159 -- Starting execution of Dataset dataset_43_0. 
Full logs are in /tmp/ray/session_2025-08-28_04-57-43_348032_12595/logs/ray-data\n", + "2025-08-28 05:10:59,640\tINFO streaming_executor.py:160 -- Execution plan of Dataset dataset_43_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)->Map(convert_to_label)] -> ActorPoolMapOperator[MapBatches(EmbedImages)] -> TaskPoolMapOperator[MapBatches(drop_columns)] -> TaskPoolMapOperator[MapBatches(TorchPredictor)] -> TaskPoolMapOperator[MapBatches(batch_metric)] -> AllToAllOperator[Aggregate] -> LimitOperator[limit=1]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "419085884b1849758482929023d6eb50", + "model_id": "d6accaaab88244f09ad25c06860ef15f", "version_major": 2, "version_minor": 0 }, @@ -1892,17 +1906,10 @@ "metadata": {}, "output_type": "display_data" }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-06-23 14:25:31,856\tINFO actor_pool_map_operator.py:633 -- Scaling up actor pool by 4 (reason=scaling to min size, running=0, restarting=0, pending=0)\n" - ] - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "fbde64d675d4412597d9ace64aa3ac38", + "model_id": "b5716638e4bf437399f7192ed356d610", "version_major": 2, "version_minor": 0 }, @@ -1916,7 +1923,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8325afb46d644ecc9027b7f152341021", + "model_id": "d691aeac306d4249ad7cb71172b81f5c", "version_major": 2, "version_minor": 0 }, @@ -1930,7 +1937,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e6a774a2b1f84b4086e750dc8ac348ed", + "model_id": "f9175ec54ce64fd18afcc7d2b31b2e4b", "version_major": 2, "version_minor": 0 }, @@ -1944,7 +1951,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b4770e750930416cbe1629bbf698f4a2", + "model_id": "ac75a05d32484b918a46a7c082a6c88a", "version_major": 2, "version_minor": 0 }, @@ -1958,7 +1965,7 @@ { 
"data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "89ffb7899dcc47a3b36daa13da9cfe4d", + "model_id": "e0f1ce17c2c54a9ea96762fc0004c543", "version_major": 2, "version_minor": 0 }, @@ -1972,7 +1979,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "85af9e24a5fa41189b94044860db6ee7", + "model_id": "b3fccdab34974505a9b518ff286f390f", "version_major": 2, "version_minor": 0 }, @@ -1986,7 +1993,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5d9111eece5c4d9381953e84c53de7b0", + "model_id": "3d9e617f49b847a0891fbbdb5185aae6", "version_major": 2, "version_minor": 0 }, @@ -2000,7 +2007,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1242eb6344e2407b9c9ca909ffc59816", + "model_id": "52a0cd72c7be435894e9ad1981c50301", "version_major": 2, "version_minor": 0 }, @@ -2014,7 +2021,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "02d595ac51494d12bb2428a788d73b7f", + "model_id": "000b77243d824ddf8a2c199357ce6cf1", "version_major": 2, "version_minor": 0 }, @@ -2028,7 +2035,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2f749b365c194b4a8da80f1c7679a040", + "model_id": "51122559786d45c8856839c1818a7158", "version_major": 2, "version_minor": 0 }, @@ -2042,7 +2049,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "cd5adf33abcd4a6f86a3411387dc62e6", + "model_id": "361ff1a618994ef2819407518534ecd7", "version_major": 2, "version_minor": 0 }, @@ -2056,7 +2063,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6efdb6f918a24510b0d69e060da5e2de", + "model_id": "20681d4878dd485e8ca817c2ee333ccc", "version_major": 2, "version_minor": 0 }, @@ -2071,23 +2078,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[36m(_MapWorker pid=7186, ip=10.0.90.122)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. 
`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n", - "2025-06-23 14:25:43,855\tINFO actor_pool_map_operator.py:661 -- Scaled down actor pool by 1 (reason=None; running=3, restarting=0, pending=0)\n", - "\u001b[36m(MapBatches(TorchPredictor) pid=7259, ip=10.0.90.122)\u001b[0m /tmp/ipykernel_14938/3214280880.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\n", - "\u001b[36m(_MapWorker pid=14469, ip=10.0.103.152)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. 
You'll still be able to use a slow processor with `use_fast=False`.\u001b[32m [repeated 3x across cluster]\u001b[0m\n", - "2025-06-23 14:25:44,370\tINFO actor_pool_map_operator.py:661 -- Scaled down actor pool by 1 (reason=None; running=2, restarting=0, pending=0)\n", - "2025-06-23 14:25:44,899\tINFO actor_pool_map_operator.py:661 -- Scaled down actor pool by 1 (reason=None; running=1, restarting=0, pending=0)\n", - "2025-06-23 14:25:45,419\tINFO actor_pool_map_operator.py:661 -- Scaled down actor pool by 1 (reason=None; running=0, restarting=0, pending=0)\n", - "\u001b[36m(MapBatches(TorchPredictor) pid=7393, ip=10.0.90.122)\u001b[0m /tmp/ipykernel_14938/3214280880.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", - "\u001b[36m(MapBatches(TorchPredictor) pid=46643, ip=10.0.102.235)\u001b[0m /tmp/ipykernel_14938/3214280880.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", - "\u001b[36m(MapBatches(TorchPredictor) pid=15409, ip=10.0.69.70)\u001b[0m /tmp/ipykernel_14938/3214280880.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. 
This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", - "\u001b[36m(MapBatches(TorchPredictor) pid=16788, ip=10.0.90.122)\u001b[0m /tmp/ipykernel_14938/3214280880.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", - "\u001b[36m(MapBatches(TorchPredictor) pid=15462, ip=10.0.67.42)\u001b[0m /tmp/ipykernel_14938/3214280880.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", - "\u001b[36m(MapBatches(TorchPredictor) pid=47017, ip=10.0.102.235)\u001b[0m /tmp/ipykernel_14938/3214280880.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. 
This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", - "\u001b[36m(MapBatches(TorchPredictor) pid=15584, ip=10.0.69.70)\u001b[0m /tmp/ipykernel_14938/3214280880.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", - "\u001b[36m(MapBatches(TorchPredictor) pid=17097, ip=10.0.103.152)\u001b[0m /tmp/ipykernel_14938/3214280880.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", - "\u001b[36m(MapBatches(TorchPredictor) pid=17183, ip=10.0.90.122)\u001b[0m /tmp/ipykernel_14938/3214280880.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. 
(Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", - "2025-06-23 14:26:35,251\tINFO streaming_executor.py:227 -- ✔️ Dataset dataset_59_0 execution finished in 63.42 seconds\n" + "\u001b[36m(MapWorker(MapBatches(EmbedImages)) pid=34103, ip=10.0.5.252)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n", + "\u001b[36m(MapBatches(TorchPredictor) pid=8149, ip=10.0.5.20)\u001b[0m /tmp/ipykernel_31027/417303983.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\n", + "\u001b[36m(MapWorker(MapBatches(EmbedImages)) pid=40389, ip=10.0.5.252)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\u001b[32m [repeated 3x across cluster]\u001b[0m\n", + "\u001b[36m(MapBatches(TorchPredictor) pid=8263, ip=10.0.5.20)\u001b[0m /tmp/ipykernel_31027/417303983.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. 
You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", + "\u001b[36m(MapBatches(TorchPredictor) pid=8340, ip=10.0.5.20)\u001b[0m /tmp/ipykernel_31027/417303983.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", + "\u001b[36m(MapBatches(TorchPredictor) pid=17879, ip=10.0.5.20)\u001b[0m /tmp/ipykernel_31027/417303983.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", + "\u001b[36m(MapBatches(TorchPredictor) pid=18144, ip=10.0.5.20)\u001b[0m /tmp/ipykernel_31027/417303983.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. 
(Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", + "\u001b[36m(MapBatches(TorchPredictor) pid=18411, ip=10.0.5.20)\u001b[0m /tmp/ipykernel_31027/417303983.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", + "\u001b[36m(MapBatches(TorchPredictor) pid=18682, ip=10.0.5.20)\u001b[0m /tmp/ipykernel_31027/417303983.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", + "\u001b[36m(MapBatches(TorchPredictor) pid=18950, ip=10.0.5.20)\u001b[0m /tmp/ipykernel_31027/417303983.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. 
(Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", + "\u001b[36m(MapBatches(TorchPredictor) pid=19219, ip=10.0.5.20)\u001b[0m /tmp/ipykernel_31027/417303983.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", + "\u001b[36m(MapBatches(TorchPredictor) pid=19564, ip=10.0.5.20)\u001b[0m /tmp/ipykernel_31027/417303983.py:6: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. 
(Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)\u001b[32m [repeated 4x across cluster]\u001b[0m\n", + "2025-08-28 05:12:20,741\tINFO streaming_executor.py:279 -- ✔️ Dataset dataset_43_0 execution finished in 81.10 seconds\n" ] } ], @@ -2106,7 +2109,7 @@ "precision = tp / (tp + fp) if (tp + fp) > 0 else 0\n", "recall = tp / (tp + fn) if (tp + fn) > 0 else 0\n", "f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0\n", - "accuracy = (tp + tn) / (tp + tn + fp + fn)" + "accuracy = (tp + tn) / (tp + tn + fp + fn)\n" ] }, { @@ -2129,7 +2132,7 @@ "print(f\"Precision: {precision:.2f}\")\n", "print(f\"Recall: {recall:.2f}\")\n", "print(f\"F1: {f1:.2f}\")\n", - "print(f\"Accuracy: {accuracy:.2f}\")" + "print(f\"Accuracy: {accuracy:.2f}\")\n" ] }, { diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/notebooks/03-Online-Serving.ipynb b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/notebooks/03-Online-Serving.ipynb index 587a257dd3a5..d60d62c52c13 100644 --- a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/notebooks/03-Online-Serving.ipynb +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/notebooks/03-Online-Serving.ipynb @@ -24,14 +24,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[92mSuccessfully registered `matplotlib, torch` and 4 other packages to be installed on all cluster nodes.\u001b[0m\n", - "\u001b[92mView and update dependencies here: https://console.anyscale.com/cld_kvedZWag2qA8i5BjxUevf5i7/prj_cz951f43jjdybtzkx1s5sjgz99/workspaces/expwrk_eys8cskj5aivghbf773dp2vmcd?workspace-tab=dependencies\u001b[0m\n" + "\u001b[92mSuccessfully registered `ipywidgets, matplotlib` and 4 other packages to be installed on all cluster nodes.\u001b[0m\n", + "\u001b[92mView and update dependencies here: 
https://console.anyscale.com/cld_kvedZWag2qA8i5BjxUevf5i7/prj_cz951f43jjdybtzkx1s5sjgz99/workspaces/expwrk_23ry3pgfn3jgq2jk3e5z25udhz?workspace-tab=dependencies\u001b[0m\n", + "\u001b[92mSuccessfully registered `doggos` package to be installed on all cluster nodes.\u001b[0m\n", + "\u001b[92mView and update dependencies here: https://console.anyscale.com/cld_kvedZWag2qA8i5BjxUevf5i7/prj_cz951f43jjdybtzkx1s5sjgz99/workspaces/expwrk_23ry3pgfn3jgq2jk3e5z25udhz?workspace-tab=dependencies\u001b[0m\n" ] } ], "source": [ "%%bash\n", - "pip install -q \"matplotlib==3.10.0\" \"torch==2.7.0\" \"transformers==4.52.3\" \"scikit-learn==1.6.0\" \"mlflow==2.19.0\" \"ipywidgets==8.1.3\"" + "pip install -q -r /home/ray/default/requirements.txt\n", + "pip install -q -e /home/ray/default/doggos\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: A kernel restart may be required for all dependencies to become available. \n", + "\n", + "If using **uv**, then:\n", + "1. Turn off the runtime dependencies (`Dependencies` tab up top > Toggle off `Pip packages`). You then don't need to run the `pip install` commands above.\n", + "2. Change the python kernel of this notebook to use the `venv` (Click on `base (Python x.yy.zz)` in the top-right corner of the notebook > `Select another Kernel` > `Python Environments...` > `Create Python Environment` > `Venv` > `Use Existing`) and done! Now all the notebook's cells will use the virtual env.\n", + "3. 
Change the py executable to use `uv run` instead of `python` by adding this line after importing ray.\n", + "```python\n", + "import os\n", + "os.environ.pop(\"RAY_RUNTIME_ENV_HOOK\", None)\n", + "import ray\n", + "ray.init(runtime_env={\"py_executable\": \"uv run\", \"working_dir\": \"/home/ray/default\"})\n", + "```" ] }, { @@ -41,83 +62,30 @@ "outputs": [], "source": [ "%load_ext autoreload\n", - "%autoreload all" + "%autoreload all\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-06-23 20:03:54,080\tINFO worker.py:1723 -- Connecting to existing Ray cluster at address: 10.0.61.28:6379...\n", - "2025-06-23 20:03:54,091\tINFO worker.py:1908 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://session-gcwehd9xxjzkv5lxv8lgcdgx2n.i.anyscaleuserdata.com \u001b[39m\u001b[22m\n", - "2025-06-23 20:03:54,133\tINFO packaging.py:588 -- Creating a file package for local module '../'.\n", - "2025-06-23 20:03:54,190\tINFO packaging.py:380 -- Pushing file package 'gcs://_ray_pkg_60b8ab9607f9a287.zip' (12.99MiB) to Ray cluster...\n", - "2025-06-23 20:03:54,250\tINFO packaging.py:393 -- Successfully pushed file package 'gcs://_ray_pkg_60b8ab9607f9a287.zip'.\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "aa33be7c5f98450283f661adb61a3c6b", - "version_major": 2, - "version_minor": 0 - }, - "text/html": [ - "
\n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "
Python version:3.12.11
Ray version:2.47.1
Dashboard:http://session-gcwehd9xxjzkv5lxv8lgcdgx2n.i.anyscaleuserdata.com
\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - "RayContext(dashboard_url='session-gcwehd9xxjzkv5lxv8lgcdgx2n.i.anyscaleuserdata.com', python_version='3.12.11', ray_version='2.47.1', ray_commit='e06f523c450fb1c99d8f347f8bfcc4085cc68b66')" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import os\n", "import ray\n", "import sys\n", - "sys.path.append(os.path.abspath(\"..\"))\n", - "ray.init(runtime_env={\"working_dir\": \"../\"})" + "sys.path.append(os.path.abspath(\"../doggos/\"))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If using UV\n", + "# os.environ.pop(\"RAY_RUNTIME_ENV_HOOK\", None)\n", + "# ray.init(runtime_env={\"py_executable\": \"uv run\", \"working_dir\": \"/home/ray/default\"})\n" ] }, { @@ -132,7 +100,7 @@ "import requests\n", "from starlette.requests import Request\n", "from urllib.parse import urlparse\n", - "from ray import serve" + "from ray import serve\n" ] }, { @@ -144,7 +112,7 @@ "import numpy as np\n", "from PIL import Image\n", "import torch\n", - "from transformers import CLIPModel, CLIPProcessor" + "from transformers import CLIPModel, CLIPProcessor\n" ] }, { @@ -155,7 +123,7 @@ "source": [ "from doggos.infer import TorchPredictor\n", "from doggos.model import collate_fn\n", - "from doggos.utils import url_to_array" + "from doggos.utils import url_to_array\n" ] }, { @@ -182,7 +150,7 @@ " num_replicas=\"1\", \n", " ray_actor_options={\n", " \"num_gpus\": 1, \n", - " \"accelerator_type\": \"L4\",\n", + " \"accelerator_type\": \"T4\",\n", " },\n", ")\n", "class ClassPredictor:\n", @@ -205,7 +173,7 @@ " embedding = self.model.get_image_features(**inputs).cpu().numpy()\n", " outputs = self.predictor.predict_probabilities(\n", " collate_fn({\"embedding\": embedding}))\n", - " return {\"probabilities\": outputs[\"probabilities\"][0]}" + " return {\"probabilities\": outputs[\"probabilities\"][0]}\n" ] }, { @@ 
-237,7 +205,7 @@ " title=\"doggos\", \n", " description=\"classify your dog\", \n", " version=\"0.1\",\n", - ")" + ")\n" ] }, { @@ -256,7 +224,7 @@ " async def predict(self, request: Request):\n", " data = await request.json()\n", " probabilities = await self.classifier.get_probabilities.remote(url=data[\"url\"])\n", - " return probabilities" + " return probabilities\n" ] }, { @@ -268,7 +236,7 @@ "# Model registry.\n", "model_registry = \"/mnt/cluster_storage/mlflow/doggos\"\n", "experiment_name = \"doggos\"\n", - "mlflow.set_tracking_uri(f\"file:{model_registry}\")" + "mlflow.set_tracking_uri(f\"file:{model_registry}\")\n" ] }, { @@ -278,12 +246,11 @@ "outputs": [], "source": [ "# Get best_run's artifact_dir.\n", - "mlflow.set_tracking_uri(f\"file:{model_registry}\")\n", "sorted_runs = mlflow.search_runs(\n", " experiment_names=[experiment_name], \n", " order_by=[\"metrics.val_loss ASC\"])\n", "best_run = sorted_runs.iloc[0]\n", - "artifacts_dir = urlparse(best_run.artifact_uri).path" + "artifacts_dir = urlparse(best_run.artifact_uri).path\n" ] }, { @@ -299,7 +266,7 @@ " artifacts_dir=artifacts_dir,\n", " device=\"cuda\"\n", " )\n", - ")" + ")\n" ] }, { @@ -311,27 +278,43 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[36m(ProxyActor pid=75693)\u001b[0m INFO 2025-06-23 20:04:07,726 proxy 10.0.61.28 -- Proxy starting on node b4c1ef3393280e7df5c15725708ef231f52e1e31e050f75f5d32a41a (HTTP port: 8000).\n", - "\u001b[36m(ProxyActor pid=75693)\u001b[0m INFO 2025-06-23 20:04:07,794 proxy 10.0.61.28 -- Got updated endpoints: {}.\n", - "INFO 2025-06-23 20:04:07,815 serve 75456 -- Started Serve in namespace \"serve\".\n", - "\u001b[36m(ServeController pid=75629)\u001b[0m INFO 2025-06-23 20:04:07,905 controller 75629 -- Deploying new version of Deployment(name='ClassPredictor', app='default') (initial target replicas: 1).\n", - "\u001b[36m(ServeController pid=75629)\u001b[0m INFO 2025-06-23 20:04:07,907 controller 75629 -- Deploying new version of 
Deployment(name='Doggos', app='default') (initial target replicas: 1).\n", - "\u001b[36m(ProxyActor pid=75693)\u001b[0m INFO 2025-06-23 20:04:07,910 proxy 10.0.61.28 -- Got updated endpoints: {Deployment(name='Doggos', app='default'): EndpointInfo(route='/', app_is_cross_language=False)}.\n", - "\u001b[36m(ServeController pid=75629)\u001b[0m INFO 2025-06-23 20:04:08,013 controller 75629 -- Adding 1 replica to Deployment(name='ClassPredictor', app='default').\n", - "\u001b[36m(ServeController pid=75629)\u001b[0m INFO 2025-06-23 20:04:08,014 controller 75629 -- Adding 1 replica to Deployment(name='Doggos', app='default').\n", - "\u001b[36m(ProxyActor pid=75693)\u001b[0m INFO 2025-06-23 20:04:07,922 proxy 10.0.61.28 -- Started .\n", - "\u001b[36m(ServeController pid=75629)\u001b[0m WARNING 2025-06-23 20:04:38,040 controller 75629 -- Deployment 'ClassPredictor' in application 'default' has 1 replicas that have taken more than 30s to be scheduled. This may be due to waiting for the cluster to auto-scale or for a runtime environment to be installed. Resources required for each replica: {\"CPU\": 1, \"GPU\": 1, \"accelerator_type:L4\": 0.001}, total resources available: {\"accelerator_type:L4\": 0.999, \"CPU\": 2.0}. Use `ray status` for more details.\n", - "\u001b[36m(ServeController pid=75629)\u001b[0m WARNING 2025-06-23 20:04:38,041 controller 75629 -- Deployment 'Doggos' in application 'default' has 1 replicas that have taken more than 30s to be scheduled. This may be due to waiting for the cluster to auto-scale or for a runtime environment to be installed. Resources required for each replica: {\"CPU\": 1}, total resources available: {\"CPU\": 2.0}. 
Use `ray status` for more details.\n", - "\u001b[36m(ServeReplica:default:Doggos pid=19668, ip=10.0.95.114)\u001b[0m INFO 2025-06-23 20:05:03,231 default_Doggos 21c29nfb -- Direct ingress is disabled, skipping direct ingress server start\n", - "\u001b[36m(ProxyActor pid=19768, ip=10.0.95.114)\u001b[0m INFO 2025-06-23 20:05:05,037 proxy 10.0.95.114 -- Proxy starting on node 760a1c063ba581ef6100d697d1e1d263b0b354b603658541229768ae (HTTP port: 8000).\n", - "\u001b[36m(ProxyActor pid=19768, ip=10.0.95.114)\u001b[0m INFO 2025-06-23 20:05:05,092 proxy 10.0.95.114 -- Got updated endpoints: {Deployment(name='Doggos', app='default'): EndpointInfo(route='/', app_is_cross_language=False)}.\n", - "\u001b[36m(ProxyActor pid=19768, ip=10.0.95.114)\u001b[0m INFO 2025-06-23 20:05:05,105 proxy 10.0.95.114 -- Started .\n", - "\u001b[36m(ServeReplica:default:ClassPredictor pid=19669, ip=10.0.95.114)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. 
You'll still be able to use a slow processor with `use_fast=False`.\n", - "\u001b[36m(ServeController pid=75629)\u001b[0m WARNING 2025-06-23 20:05:08,122 controller 75629 -- Deployment 'ClassPredictor' in application 'default' has 1 replicas that have taken more than 30s to initialize.\n", - "\u001b[36m(ServeController pid=75629)\u001b[0m This may be caused by a slow __init__ or reconfigure method.\n", - "\u001b[36m(ServeReplica:default:ClassPredictor pid=19669, ip=10.0.95.114)\u001b[0m INFO 2025-06-23 20:05:09,415 default_ClassPredictor fyf5xp23 -- Direct ingress is disabled, skipping direct ingress server start\n", - "INFO 2025-06-23 20:05:10,065 serve 75456 -- Application 'default' is ready at http://127.0.0.1:8000/.\n", - "INFO 2025-06-23 20:05:10,071 serve 75456 -- Started .\n" + "2025-08-28 05:15:38,455\tINFO worker.py:1771 -- Connecting to existing Ray cluster at address: 10.0.17.148:6379...\n", + "2025-08-28 05:15:38,465\tINFO worker.py:1942 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://session-jhxhj69d6ttkjctcxfnsfe7gwk.i.anyscaleuserdata.com \u001b[39m\u001b[22m\n", + "2025-08-28 05:15:38,471\tINFO packaging.py:588 -- Creating a file package for local module '/home/ray/default/doggos/doggos'.\n", + "2025-08-28 05:15:38,475\tINFO packaging.py:380 -- Pushing file package 'gcs://_ray_pkg_62e649352ce105b6.zip' (0.04MiB) to Ray cluster...\n", + "2025-08-28 05:15:38,476\tINFO packaging.py:393 -- Successfully pushed file package 'gcs://_ray_pkg_62e649352ce105b6.zip'.\n", + "2025-08-28 05:15:38,478\tINFO packaging.py:380 -- Pushing file package 'gcs://_ray_pkg_c3f5a1927d401ecc93333d17727d37c3401aeed9.zip' (1.08MiB) to Ray cluster...\n", + "2025-08-28 05:15:38,484\tINFO packaging.py:393 -- Successfully pushed file package 'gcs://_ray_pkg_c3f5a1927d401ecc93333d17727d37c3401aeed9.zip'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m(autoscaler +9s)\u001b[0m Tip: use `ray status` to view 
detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m(ProxyActor pid=42150)\u001b[0m INFO 2025-08-28 05:15:42,208 proxy 10.0.17.148 -- Proxy starting on node 524d54fa7a3dfe7fcd55149e6efeaa7a697a4ce87282da72073206b6 (HTTP port: 8000).\n", + "INFO 2025-08-28 05:15:42,290 serve 41929 -- Started Serve in namespace \"serve\".\n", + "\u001b[36m(ProxyActor pid=42150)\u001b[0m INFO 2025-08-28 05:15:42,286 proxy 10.0.17.148 -- Got updated endpoints: {}.\n", + "\u001b[36m(ServeController pid=42086)\u001b[0m INFO 2025-08-28 05:15:47,403 controller 42086 -- Deploying new version of Deployment(name='ClassPredictor', app='default') (initial target replicas: 1).\n", + "\u001b[36m(ServeController pid=42086)\u001b[0m INFO 2025-08-28 05:15:47,404 controller 42086 -- Deploying new version of Deployment(name='Doggos', app='default') (initial target replicas: 1).\n", + "\u001b[36m(ProxyActor pid=42150)\u001b[0m INFO 2025-08-28 05:15:47,423 proxy 10.0.17.148 -- Got updated endpoints: {Deployment(name='Doggos', app='default'): EndpointInfo(route='/', app_is_cross_language=False)}.\n", + "\u001b[36m(ProxyActor pid=42150)\u001b[0m WARNING 2025-08-28 05:15:47,430 proxy 10.0.17.148 -- ANYSCALE_RAY_SERVE_GRPC_RUN_PROXY_ROUTER_SEPARATE_LOOP has been deprecated and will be removed in the ray v2.50.0. 
Please use RAY_SERVE_RUN_ROUTER_IN_SEPARATE_LOOP instead.\n", + "\u001b[36m(ProxyActor pid=42150)\u001b[0m INFO 2025-08-28 05:15:47,434 proxy 10.0.17.148 -- Started .\n", + "\u001b[36m(ServeController pid=42086)\u001b[0m INFO 2025-08-28 05:15:47,524 controller 42086 -- Adding 1 replica to Deployment(name='ClassPredictor', app='default').\n", + "\u001b[36m(ServeController pid=42086)\u001b[0m INFO 2025-08-28 05:15:47,525 controller 42086 -- Adding 1 replica to Deployment(name='Doggos', app='default').\n", + "\u001b[36m(ServeReplica:default:ClassPredictor pid=20055, ip=10.0.5.20)\u001b[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n", + "\u001b[36m(ProxyActor pid=20172, ip=10.0.5.20)\u001b[0m INFO 2025-08-28 05:15:56,055 proxy 10.0.5.20 -- Proxy starting on node b84e244dca75c40ea981202cae7a1a06df9598ac29ad2b18e1bedb99 (HTTP port: 8000).\n", + "\u001b[36m(ProxyActor pid=20172, ip=10.0.5.20)\u001b[0m INFO 2025-08-28 05:15:56,131 proxy 10.0.5.20 -- Got updated endpoints: {Deployment(name='Doggos', app='default'): EndpointInfo(route='/', app_is_cross_language=False)}.\n", + "\u001b[36m(ProxyActor pid=20172, ip=10.0.5.20)\u001b[0m WARNING 2025-08-28 05:15:56,137 proxy 10.0.5.20 -- ANYSCALE_RAY_SERVE_GRPC_RUN_PROXY_ROUTER_SEPARATE_LOOP has been deprecated and will be removed in the ray v2.50.0. 
Please use RAY_SERVE_RUN_ROUTER_IN_SEPARATE_LOOP instead.\n", + "\u001b[36m(ProxyActor pid=20172, ip=10.0.5.20)\u001b[0m INFO 2025-08-28 05:15:56,141 proxy 10.0.5.20 -- Started .\n", + "INFO 2025-08-28 05:15:57,505 serve 41929 -- Application 'default' is ready at http://127.0.0.1:8000/.\n", + "INFO 2025-08-28 05:15:57,511 serve 41929 -- Started .\n" ] }, { @@ -347,7 +330,7 @@ ], "source": [ "# Run service locally.\n", - "serve.run(app, route_prefix=\"/\")" + "serve.run(app, route_prefix=\"/\")\n" ] }, { @@ -355,12 +338,21 @@ "execution_count": null, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m(ServeReplica:default:Doggos pid=42244)\u001b[0m INFO 2025-08-28 05:15:57,646 default_Doggos fs1weamq 31c15b70-89a9-4b2d-b4ab-f8424fe6d8d2 -- Started .\n", + "\u001b[36m(ServeReplica:default:ClassPredictor pid=20055, ip=10.0.5.20)\u001b[0m /home/ray/anaconda3/lib/python3.12/site-packages/ray/serve/_private/replica.py:1397: UserWarning: Calling sync method 'get_probabilities' directly on the asyncio loop. In a future version, sync methods will be run in a threadpool by default. Ensure your sync methods are thread safe or keep the existing behavior by making them `async def`. Opt into the new behavior by setting RAY_SERVE_RUN_SYNC_IN_THREADPOOL=1.\n", + "\u001b[36m(ServeReplica:default:ClassPredictor pid=20055, ip=10.0.5.20)\u001b[0m warnings.warn(\n" + ] + }, { "data": { "text/plain": [ - "[('collie', 0.2568000853061676),\n", - " ('border_collie', 0.16908691823482513),\n", - " ('bernese_mountain_dog', 0.0767023041844368)]" + "[('border_collie', 0.1990548074245453),\n", + " ('collie', 0.1363961398601532),\n", + " ('german_shepherd', 0.07545585185289383)]" ] }, "execution_count": null, @@ -368,10 +360,11 @@ "output_type": "execute_result" }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "\u001b[36m(autoscaler +38m14s)\u001b[0m Tip: use `ray status` to view detailed cluster status. 
To disable these messages, set RAY_SCHEDULER_EVENTS=0.\n" + "\u001b[36m(ServeReplica:default:Doggos pid=42244)\u001b[0m INFO 2025-08-28 05:15:58,150 default_Doggos fs1weamq 31c15b70-89a9-4b2d-b4ab-f8424fe6d8d2 -- POST /predict/ 200 516.2ms\n", + "\u001b[36m(ServeReplica:default:ClassPredictor pid=20055, ip=10.0.5.20)\u001b[0m INFO 2025-08-28 05:15:58,148 default_ClassPredictor y7tebd3e 31c15b70-89a9-4b2d-b4ab-f8424fe6d8d2 -- CALL /predict/ OK 491.4ms\n" ] } ], @@ -382,7 +375,7 @@ "response = requests.post(\"http://127.0.0.1:8000/predict/\", json=data)\n", "probabilities = response.json()[\"probabilities\"]\n", "sorted_probabilities = sorted(probabilities.items(), key=lambda x: x[1], reverse=True)\n", - "sorted_probabilities[0:3]" + "sorted_probabilities[0:3]\n" ] }, { @@ -476,13 +469,8 @@ "source": [ "```bash\n", "# Production online service.\n", - "anyscale service deploy doggos.serve:app --name=doggos-app \\\n", - " --containerfile=\"/home/ray/default/containerfile\" \\\n", - " --compute-config=\"/home/ray/default/configs/aws.yaml\" \\\n", - " --working-dir=\"/home/ray/default\" \\\n", - " --exclude=\"\"\n", + "anyscale service deploy -f /home/ray/default/configs/service.yaml\n", "```\n", - "\n", "```\n", "(anyscale +1.9s) Restarting existing service 'doggos-app'.\n", "(anyscale +3.2s) Uploading local dir '/home/ray/default' to cloud storage.\n", diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/pyproject.toml b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/pyproject.toml new file mode 100644 index 000000000000..b63d44cd0ee8 --- /dev/null +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/pyproject.toml @@ -0,0 +1,22 @@ +[project] +name = "default" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "boto3>=1.40.9", + "doggos", + "ipykernel>=6.30.1", + "ipywidgets==8.1.3", + "matplotlib==3.10.0", + "mlflow==2.19.0", + 
"ray[data,serve,train,tune]", + "scikit-learn==1.6.0", + "torch==2.7.1", + "transformers==4.52.3", +] + +[tool.uv.sources] +ray = { url = "http://localhost:9478/ray/ray-2.48.0-cp312-cp312-manylinux2014_x86_64.whl" } +doggos = { path = "doggos" } diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/requirements.txt b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/requirements.txt index 42a1a0e489ce..603b6a10aafe 100644 --- a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/requirements.txt +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/requirements.txt @@ -1,6 +1,6 @@ +ipywidgets==8.1.3 matplotlib==3.10.0 +mlflow==2.19.0 torch==2.7.1 transformers==4.52.3 -scikit-learn==1.6.0 -mlflow==2.19.0 -ipywidgets==8.1.3 +scikit-learn==1.6.0 \ No newline at end of file diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/uv.lock b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/uv.lock new file mode 100644 index 000000000000..bf67a0f858bc --- /dev/null +++ b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/uv.lock @@ -0,0 +1,3506 @@ +version = 1 +revision = 2 +requires-python = ">=3.12" +resolution-markers = [ + "python_full_version >= '3.13' and sys_platform != 'win32'", + "python_full_version < '3.13' and sys_platform != 'win32'", + "python_full_version >= '3.13' and sys_platform == 'win32'", + "python_full_version < '3.13' and sys_platform == 'win32'", +] + +[[package]] +name = "aiohappyeyeballs" +version = "2.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760, upload-time = "2025-03-12T01:42:48.764Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265, upload-time = "2025-03-12T01:42:47.083Z" }, +] + +[[package]] +name = "aiohttp" +version = "3.12.15" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9b/e7/d92a237d8802ca88483906c388f7c201bbe96cd80a165ffd0ac2f6a8d59f/aiohttp-3.12.15.tar.gz", hash = "sha256:4fc61385e9c98d72fcdf47e6dd81833f47b2f77c114c29cd64a361be57a763a2", size = 7823716, upload-time = "2025-07-29T05:52:32.215Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/63/97/77cb2450d9b35f517d6cf506256bf4f5bda3f93a66b4ad64ba7fc917899c/aiohttp-3.12.15-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:802d3868f5776e28f7bf69d349c26fc0efadb81676d0afa88ed00d98a26340b7", size = 702333, upload-time = "2025-07-29T05:50:46.507Z" }, + { url = "https://files.pythonhosted.org/packages/83/6d/0544e6b08b748682c30b9f65640d006e51f90763b41d7c546693bc22900d/aiohttp-3.12.15-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f2800614cd560287be05e33a679638e586a2d7401f4ddf99e304d98878c29444", size = 476948, upload-time = "2025-07-29T05:50:48.067Z" }, + { url = "https://files.pythonhosted.org/packages/3a/1d/c8c40e611e5094330284b1aea8a4b02ca0858f8458614fa35754cab42b9c/aiohttp-3.12.15-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8466151554b593909d30a0a125d638b4e5f3836e5aecde85b66b80ded1cb5b0d", size = 469787, upload-time = "2025-07-29T05:50:49.669Z" }, + { url = 
"https://files.pythonhosted.org/packages/38/7d/b76438e70319796bfff717f325d97ce2e9310f752a267bfdf5192ac6082b/aiohttp-3.12.15-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e5a495cb1be69dae4b08f35a6c4579c539e9b5706f606632102c0f855bcba7c", size = 1716590, upload-time = "2025-07-29T05:50:51.368Z" }, + { url = "https://files.pythonhosted.org/packages/79/b1/60370d70cdf8b269ee1444b390cbd72ce514f0d1cd1a715821c784d272c9/aiohttp-3.12.15-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6404dfc8cdde35c69aaa489bb3542fb86ef215fc70277c892be8af540e5e21c0", size = 1699241, upload-time = "2025-07-29T05:50:53.628Z" }, + { url = "https://files.pythonhosted.org/packages/a3/2b/4968a7b8792437ebc12186db31523f541943e99bda8f30335c482bea6879/aiohttp-3.12.15-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3ead1c00f8521a5c9070fcb88f02967b1d8a0544e6d85c253f6968b785e1a2ab", size = 1754335, upload-time = "2025-07-29T05:50:55.394Z" }, + { url = "https://files.pythonhosted.org/packages/fb/c1/49524ed553f9a0bec1a11fac09e790f49ff669bcd14164f9fab608831c4d/aiohttp-3.12.15-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6990ef617f14450bc6b34941dba4f12d5613cbf4e33805932f853fbd1cf18bfb", size = 1800491, upload-time = "2025-07-29T05:50:57.202Z" }, + { url = "https://files.pythonhosted.org/packages/de/5e/3bf5acea47a96a28c121b167f5ef659cf71208b19e52a88cdfa5c37f1fcc/aiohttp-3.12.15-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd736ed420f4db2b8148b52b46b88ed038d0354255f9a73196b7bbce3ea97545", size = 1719929, upload-time = "2025-07-29T05:50:59.192Z" }, + { url = "https://files.pythonhosted.org/packages/39/94/8ae30b806835bcd1cba799ba35347dee6961a11bd507db634516210e91d8/aiohttp-3.12.15-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:3c5092ce14361a73086b90c6efb3948ffa5be2f5b6fbcf52e8d8c8b8848bb97c", size = 1635733, upload-time = "2025-07-29T05:51:01.394Z" }, + { url = "https://files.pythonhosted.org/packages/7a/46/06cdef71dd03acd9da7f51ab3a9107318aee12ad38d273f654e4f981583a/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:aaa2234bb60c4dbf82893e934d8ee8dea30446f0647e024074237a56a08c01bd", size = 1696790, upload-time = "2025-07-29T05:51:03.657Z" }, + { url = "https://files.pythonhosted.org/packages/02/90/6b4cfaaf92ed98d0ec4d173e78b99b4b1a7551250be8937d9d67ecb356b4/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:6d86a2fbdd14192e2f234a92d3b494dd4457e683ba07e5905a0b3ee25389ac9f", size = 1718245, upload-time = "2025-07-29T05:51:05.911Z" }, + { url = "https://files.pythonhosted.org/packages/2e/e6/2593751670fa06f080a846f37f112cbe6f873ba510d070136a6ed46117c6/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a041e7e2612041a6ddf1c6a33b883be6a421247c7afd47e885969ee4cc58bd8d", size = 1658899, upload-time = "2025-07-29T05:51:07.753Z" }, + { url = "https://files.pythonhosted.org/packages/8f/28/c15bacbdb8b8eb5bf39b10680d129ea7410b859e379b03190f02fa104ffd/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:5015082477abeafad7203757ae44299a610e89ee82a1503e3d4184e6bafdd519", size = 1738459, upload-time = "2025-07-29T05:51:09.56Z" }, + { url = "https://files.pythonhosted.org/packages/00/de/c269cbc4faa01fb10f143b1670633a8ddd5b2e1ffd0548f7aa49cb5c70e2/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:56822ff5ddfd1b745534e658faba944012346184fbfe732e0d6134b744516eea", size = 1766434, upload-time = "2025-07-29T05:51:11.423Z" }, + { url = "https://files.pythonhosted.org/packages/52/b0/4ff3abd81aa7d929b27d2e1403722a65fc87b763e3a97b3a2a494bfc63bc/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b2acbbfff69019d9014508c4ba0401822e8bae5a5fdc3b6814285b71231b60f3", size = 1726045, upload-time = 
"2025-07-29T05:51:13.689Z" }, + { url = "https://files.pythonhosted.org/packages/71/16/949225a6a2dd6efcbd855fbd90cf476052e648fb011aa538e3b15b89a57a/aiohttp-3.12.15-cp312-cp312-win32.whl", hash = "sha256:d849b0901b50f2185874b9a232f38e26b9b3d4810095a7572eacea939132d4e1", size = 423591, upload-time = "2025-07-29T05:51:15.452Z" }, + { url = "https://files.pythonhosted.org/packages/2b/d8/fa65d2a349fe938b76d309db1a56a75c4fb8cc7b17a398b698488a939903/aiohttp-3.12.15-cp312-cp312-win_amd64.whl", hash = "sha256:b390ef5f62bb508a9d67cb3bba9b8356e23b3996da7062f1a57ce1a79d2b3d34", size = 450266, upload-time = "2025-07-29T05:51:17.239Z" }, + { url = "https://files.pythonhosted.org/packages/f2/33/918091abcf102e39d15aba2476ad9e7bd35ddb190dcdd43a854000d3da0d/aiohttp-3.12.15-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9f922ffd05034d439dde1c77a20461cf4a1b0831e6caa26151fe7aa8aaebc315", size = 696741, upload-time = "2025-07-29T05:51:19.021Z" }, + { url = "https://files.pythonhosted.org/packages/b5/2a/7495a81e39a998e400f3ecdd44a62107254803d1681d9189be5c2e4530cd/aiohttp-3.12.15-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2ee8a8ac39ce45f3e55663891d4b1d15598c157b4d494a4613e704c8b43112cd", size = 474407, upload-time = "2025-07-29T05:51:21.165Z" }, + { url = "https://files.pythonhosted.org/packages/49/fc/a9576ab4be2dcbd0f73ee8675d16c707cfc12d5ee80ccf4015ba543480c9/aiohttp-3.12.15-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3eae49032c29d356b94eee45a3f39fdf4b0814b397638c2f718e96cfadf4c4e4", size = 466703, upload-time = "2025-07-29T05:51:22.948Z" }, + { url = "https://files.pythonhosted.org/packages/09/2f/d4bcc8448cf536b2b54eed48f19682031ad182faa3a3fee54ebe5b156387/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b97752ff12cc12f46a9b20327104448042fce5c33a624f88c18f66f9368091c7", size = 1705532, upload-time = "2025-07-29T05:51:25.211Z" }, + { url = 
"https://files.pythonhosted.org/packages/f1/f3/59406396083f8b489261e3c011aa8aee9df360a96ac8fa5c2e7e1b8f0466/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:894261472691d6fe76ebb7fcf2e5870a2ac284c7406ddc95823c8598a1390f0d", size = 1686794, upload-time = "2025-07-29T05:51:27.145Z" }, + { url = "https://files.pythonhosted.org/packages/dc/71/164d194993a8d114ee5656c3b7ae9c12ceee7040d076bf7b32fb98a8c5c6/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5fa5d9eb82ce98959fc1031c28198b431b4d9396894f385cb63f1e2f3f20ca6b", size = 1738865, upload-time = "2025-07-29T05:51:29.366Z" }, + { url = "https://files.pythonhosted.org/packages/1c/00/d198461b699188a93ead39cb458554d9f0f69879b95078dce416d3209b54/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0fa751efb11a541f57db59c1dd821bec09031e01452b2b6217319b3a1f34f3d", size = 1788238, upload-time = "2025-07-29T05:51:31.285Z" }, + { url = "https://files.pythonhosted.org/packages/85/b8/9e7175e1fa0ac8e56baa83bf3c214823ce250d0028955dfb23f43d5e61fd/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5346b93e62ab51ee2a9d68e8f73c7cf96ffb73568a23e683f931e52450e4148d", size = 1710566, upload-time = "2025-07-29T05:51:33.219Z" }, + { url = "https://files.pythonhosted.org/packages/59/e4/16a8eac9df39b48ae102ec030fa9f726d3570732e46ba0c592aeeb507b93/aiohttp-3.12.15-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:049ec0360f939cd164ecbfd2873eaa432613d5e77d6b04535e3d1fbae5a9e645", size = 1624270, upload-time = "2025-07-29T05:51:35.195Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f8/cd84dee7b6ace0740908fd0af170f9fab50c2a41ccbc3806aabcb1050141/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b52dcf013b57464b6d1e51b627adfd69a8053e84b7103a7cd49c030f9ca44461", size = 1677294, 
upload-time = "2025-07-29T05:51:37.215Z" }, + { url = "https://files.pythonhosted.org/packages/ce/42/d0f1f85e50d401eccd12bf85c46ba84f947a84839c8a1c2c5f6e8ab1eb50/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:9b2af240143dd2765e0fb661fd0361a1b469cab235039ea57663cda087250ea9", size = 1708958, upload-time = "2025-07-29T05:51:39.328Z" }, + { url = "https://files.pythonhosted.org/packages/d5/6b/f6fa6c5790fb602538483aa5a1b86fcbad66244997e5230d88f9412ef24c/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ac77f709a2cde2cc71257ab2d8c74dd157c67a0558a0d2799d5d571b4c63d44d", size = 1651553, upload-time = "2025-07-29T05:51:41.356Z" }, + { url = "https://files.pythonhosted.org/packages/04/36/a6d36ad545fa12e61d11d1932eef273928b0495e6a576eb2af04297fdd3c/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:47f6b962246f0a774fbd3b6b7be25d59b06fdb2f164cf2513097998fc6a29693", size = 1727688, upload-time = "2025-07-29T05:51:43.452Z" }, + { url = "https://files.pythonhosted.org/packages/aa/c8/f195e5e06608a97a4e52c5d41c7927301bf757a8e8bb5bbf8cef6c314961/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:760fb7db442f284996e39cf9915a94492e1896baac44f06ae551974907922b64", size = 1761157, upload-time = "2025-07-29T05:51:45.643Z" }, + { url = "https://files.pythonhosted.org/packages/05/6a/ea199e61b67f25ba688d3ce93f63b49b0a4e3b3d380f03971b4646412fc6/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad702e57dc385cae679c39d318def49aef754455f237499d5b99bea4ef582e51", size = 1710050, upload-time = "2025-07-29T05:51:48.203Z" }, + { url = "https://files.pythonhosted.org/packages/b4/2e/ffeb7f6256b33635c29dbed29a22a723ff2dd7401fff42ea60cf2060abfb/aiohttp-3.12.15-cp313-cp313-win32.whl", hash = "sha256:f813c3e9032331024de2eb2e32a88d86afb69291fbc37a3a3ae81cc9917fb3d0", size = 422647, upload-time = "2025-07-29T05:51:50.718Z" }, + { url = 
"https://files.pythonhosted.org/packages/1b/8e/78ee35774201f38d5e1ba079c9958f7629b1fd079459aea9467441dbfbf5/aiohttp-3.12.15-cp313-cp313-win_amd64.whl", hash = "sha256:1a649001580bdb37c6fdb1bebbd7e3bc688e8ec2b5c6f52edbb664662b17dc84", size = 449067, upload-time = "2025-07-29T05:51:52.549Z" }, +] + +[[package]] +name = "aiohttp-cors" +version = "0.8.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/d89e846a5444b3d5eb8985a6ddb0daef3774928e1bfbce8e84ec97b0ffa7/aiohttp_cors-0.8.1.tar.gz", hash = "sha256:ccacf9cb84b64939ea15f859a146af1f662a6b1d68175754a07315e305fb1403", size = 38626, upload-time = "2025-03-31T14:16:20.048Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/3b/40a68de458904bcc143622015fff2352b6461cd92fd66d3527bf1c6f5716/aiohttp_cors-0.8.1-py3-none-any.whl", hash = "sha256:3180cf304c5c712d626b9162b195b1db7ddf976a2a25172b35bb2448b890a80d", size = 25231, upload-time = "2025-03-31T14:16:18.478Z" }, +] + +[[package]] +name = "aiosignal" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "frozenlist" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, +] + +[[package]] +name = "alembic" +version = "1.16.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + 
{ name = "mako" }, + { name = "sqlalchemy" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/83/52/72e791b75c6b1efa803e491f7cbab78e963695e76d4ada05385252927e76/alembic-1.16.4.tar.gz", hash = "sha256:efab6ada0dd0fae2c92060800e0bf5c1dc26af15a10e02fb4babff164b4725e2", size = 1968161, upload-time = "2025-07-10T16:17:20.192Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/62/96b5217b742805236614f05904541000f55422a6060a90d7fd4ce26c172d/alembic-1.16.4-py3-none-any.whl", hash = "sha256:b05e51e8e82efc1abd14ba2af6392897e145930c3e0a2faf2b0da2f7f7fd660d", size = 247026, upload-time = "2025-07-10T16:17:21.845Z" }, +] + +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, +] + +[[package]] +name = "anyio" +version = "4.10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "sniffio" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f1/b4/636b3b65173d3ce9a38ef5f0522789614e590dab6a8d505340a4efe4c567/anyio-4.10.0.tar.gz", hash = "sha256:3f3fae35c96039744587aa5b8371e7e8e603c0702999535961dd336026973ba6", size = 213252, upload-time = "2025-08-04T08:54:26.451Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/6f/12/e5e0282d673bb9746bacfb6e2dba8719989d3660cdb2ea79aee9a9651afb/anyio-4.10.0-py3-none-any.whl", hash = "sha256:60e474ac86736bbfd6f210f7a61218939c318f43f9972497381f1c5e930ed3d1", size = 107213, upload-time = "2025-08-04T08:54:24.882Z" }, +] + +[[package]] +name = "appnope" +version = "0.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/35/5d/752690df9ef5b76e169e68d6a129fa6d08a7100ca7f754c89495db3c6019/appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee", size = 4170, upload-time = "2024-02-06T09:43:11.258Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321, upload-time = "2024-02-06T09:43:09.663Z" }, +] + +[[package]] +name = "asttokens" +version = "3.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4a/e7/82da0a03e7ba5141f05cce0d302e6eed121ae055e0456ca228bf693984bc/asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7", size = 61978, upload-time = "2024-11-30T04:30:14.439Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2", size = 26918, upload-time = "2024-11-30T04:30:10.946Z" }, +] + +[[package]] +name = "attrs" +version = "25.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5a/b0/1367933a8532ee6ff8d63537de4f1177af4bff9f3e829baf7331f595bb24/attrs-25.3.0.tar.gz", hash = 
"sha256:75d7cefc7fb576747b2c81b4442d4d4a1ce0900973527c011d1030fd3bf4af1b", size = 812032, upload-time = "2025-03-13T11:10:22.779Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3", size = 63815, upload-time = "2025-03-13T11:10:21.14Z" }, +] + +[[package]] +name = "blinker" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460, upload-time = "2024-11-08T17:25:47.436Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" }, +] + +[[package]] +name = "boto3" +version = "1.40.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, + { name = "jmespath" }, + { name = "s3transfer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/63/b263070ba4a2815de633d71dd4c5c04c9eb7000d33c510036c9557692324/boto3-1.40.9.tar.gz", hash = "sha256:af3f77a548b3dd7db5046609598a28a9ad5d062437b1783da9b526cc67c38b79", size = 111953, upload-time = "2025-08-13T19:20:32.495Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b6/6d/79fad38fcd7e1fc6961061b46cc87706c5c946088bc4620abf0d0aa49420/boto3-1.40.9-py3-none-any.whl", hash = "sha256:516f5e3f7552b2a7ca4d2c89b338fb4684998c676b11b906e2ab694c91716ba6", size = 140061, upload-time = "2025-08-13T19:20:30.652Z" }, +] + +[[package]] +name = "botocore" +version = "1.40.9" +source = { registry 
= "https://pypi.org/simple" } +dependencies = [ + { name = "jmespath" }, + { name = "python-dateutil" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ff/f3/7bf4913b4b61416c014cfee38211d071f75894cca37f7234519c4d8676d1/botocore-1.40.9.tar.gz", hash = "sha256:f4a9c6ed08e8637138e1b5534f89d38c02650974b6458a07690493130e295f68", size = 14325768, upload-time = "2025-08-13T19:20:22.393Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/e9/367e81e114deb92a6e0d5740f0bff4548af710be318af65265b9aad72237/botocore-1.40.9-py3-none-any.whl", hash = "sha256:d4960a39aab9658bcd0272490003001cb4a8d12b89bb297ccef994ee023fb638", size = 13990592, upload-time = "2025-08-13T19:20:16.942Z" }, +] + +[[package]] +name = "cachetools" +version = "5.5.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6c/81/3747dad6b14fa2cf53fcf10548cf5aea6913e96fab41a3c198676f8948a5/cachetools-5.5.2.tar.gz", hash = "sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4", size = 28380, upload-time = "2025-02-20T21:01:19.524Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a", size = 10080, upload-time = "2025-02-20T21:01:16.647Z" }, +] + +[[package]] +name = "certifi" +version = "2025.8.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/dc/67/960ebe6bf230a96cda2e0abcf73af550ec4f090005363542f0765df162e0/certifi-2025.8.3.tar.gz", hash = "sha256:e564105f78ded564e3ae7c923924435e1daa7463faeab5bb932bc53ffae63407", size = 162386, upload-time = "2025-08-03T03:07:47.08Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl", hash = "sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5", size = 161216, upload-time = "2025-08-03T03:07:45.777Z" }, +] + +[[package]] +name = "cffi" +version = "1.17.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pycparser" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fc/97/c783634659c2920c3fc70419e3af40972dbaf758daa229a7d6ea6135c90d/cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824", size = 516621, upload-time = "2024-09-04T20:45:21.852Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/84/e94227139ee5fb4d600a7a4927f322e1d4aea6fdc50bd3fca8493caba23f/cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4", size = 183178, upload-time = "2024-09-04T20:44:12.232Z" }, + { url = "https://files.pythonhosted.org/packages/da/ee/fb72c2b48656111c4ef27f0f91da355e130a923473bf5ee75c5643d00cca/cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c", size = 178840, upload-time = "2024-09-04T20:44:13.739Z" }, + { url = "https://files.pythonhosted.org/packages/cc/b6/db007700f67d151abadf508cbfd6a1884f57eab90b1bb985c4c8c02b0f28/cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36", size = 454803, upload-time = "2024-09-04T20:44:15.231Z" }, + { url = "https://files.pythonhosted.org/packages/1a/df/f8d151540d8c200eb1c6fba8cd0dfd40904f1b0682ea705c36e6c2e97ab3/cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5", size = 
478850, upload-time = "2024-09-04T20:44:17.188Z" }, + { url = "https://files.pythonhosted.org/packages/28/c0/b31116332a547fd2677ae5b78a2ef662dfc8023d67f41b2a83f7c2aa78b1/cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff", size = 485729, upload-time = "2024-09-04T20:44:18.688Z" }, + { url = "https://files.pythonhosted.org/packages/91/2b/9a1ddfa5c7f13cab007a2c9cc295b70fbbda7cb10a286aa6810338e60ea1/cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99", size = 471256, upload-time = "2024-09-04T20:44:20.248Z" }, + { url = "https://files.pythonhosted.org/packages/b2/d5/da47df7004cb17e4955df6a43d14b3b4ae77737dff8bf7f8f333196717bf/cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93", size = 479424, upload-time = "2024-09-04T20:44:21.673Z" }, + { url = "https://files.pythonhosted.org/packages/0b/ac/2a28bcf513e93a219c8a4e8e125534f4f6db03e3179ba1c45e949b76212c/cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3", size = 484568, upload-time = "2024-09-04T20:44:23.245Z" }, + { url = "https://files.pythonhosted.org/packages/d4/38/ca8a4f639065f14ae0f1d9751e70447a261f1a30fa7547a828ae08142465/cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8", size = 488736, upload-time = "2024-09-04T20:44:24.757Z" }, + { url = "https://files.pythonhosted.org/packages/86/c5/28b2d6f799ec0bdecf44dced2ec5ed43e0eb63097b0f58c293583b406582/cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65", size = 172448, upload-time = "2024-09-04T20:44:26.208Z" }, + { url = 
"https://files.pythonhosted.org/packages/50/b9/db34c4755a7bd1cb2d1603ac3863f22bcecbd1ba29e5ee841a4bc510b294/cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903", size = 181976, upload-time = "2024-09-04T20:44:27.578Z" }, + { url = "https://files.pythonhosted.org/packages/8d/f8/dd6c246b148639254dad4d6803eb6a54e8c85c6e11ec9df2cffa87571dbe/cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e", size = 182989, upload-time = "2024-09-04T20:44:28.956Z" }, + { url = "https://files.pythonhosted.org/packages/8b/f1/672d303ddf17c24fc83afd712316fda78dc6fce1cd53011b839483e1ecc8/cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2", size = 178802, upload-time = "2024-09-04T20:44:30.289Z" }, + { url = "https://files.pythonhosted.org/packages/0e/2d/eab2e858a91fdff70533cab61dcff4a1f55ec60425832ddfdc9cd36bc8af/cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3", size = 454792, upload-time = "2024-09-04T20:44:32.01Z" }, + { url = "https://files.pythonhosted.org/packages/75/b2/fbaec7c4455c604e29388d55599b99ebcc250a60050610fadde58932b7ee/cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683", size = 478893, upload-time = "2024-09-04T20:44:33.606Z" }, + { url = "https://files.pythonhosted.org/packages/4f/b7/6e4a2162178bf1935c336d4da8a9352cccab4d3a5d7914065490f08c0690/cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5", size = 485810, upload-time = "2024-09-04T20:44:35.191Z" }, + { url = 
"https://files.pythonhosted.org/packages/c7/8a/1d0e4a9c26e54746dc08c2c6c037889124d4f59dffd853a659fa545f1b40/cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4", size = 471200, upload-time = "2024-09-04T20:44:36.743Z" }, + { url = "https://files.pythonhosted.org/packages/26/9f/1aab65a6c0db35f43c4d1b4f580e8df53914310afc10ae0397d29d697af4/cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd", size = 479447, upload-time = "2024-09-04T20:44:38.492Z" }, + { url = "https://files.pythonhosted.org/packages/5f/e4/fb8b3dd8dc0e98edf1135ff067ae070bb32ef9d509d6cb0f538cd6f7483f/cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed", size = 484358, upload-time = "2024-09-04T20:44:40.046Z" }, + { url = "https://files.pythonhosted.org/packages/f1/47/d7145bf2dc04684935d57d67dff9d6d795b2ba2796806bb109864be3a151/cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9", size = 488469, upload-time = "2024-09-04T20:44:41.616Z" }, + { url = "https://files.pythonhosted.org/packages/bf/ee/f94057fa6426481d663b88637a9a10e859e492c73d0384514a17d78ee205/cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d", size = 172475, upload-time = "2024-09-04T20:44:43.733Z" }, + { url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009, upload-time = "2024-09-04T20:44:45.309Z" }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url 
= "https://files.pythonhosted.org/packages/83/2d/5fd176ceb9b2fc619e63405525573493ca23441330fcdaee6bef9460e924/charset_normalizer-3.4.3.tar.gz", hash = "sha256:6fce4b8500244f6fcb71465d4a4930d132ba9ab8e71a7859e6a5d59851068d14", size = 122371, upload-time = "2025-08-09T07:57:28.46Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/5e/14c94999e418d9b87682734589404a25854d5f5d0408df68bc15b6ff54bb/charset_normalizer-3.4.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e28e334d3ff134e88989d90ba04b47d84382a828c061d0d1027b1b12a62b39b1", size = 205655, upload-time = "2025-08-09T07:56:08.475Z" }, + { url = "https://files.pythonhosted.org/packages/7d/a8/c6ec5d389672521f644505a257f50544c074cf5fc292d5390331cd6fc9c3/charset_normalizer-3.4.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0cacf8f7297b0c4fcb74227692ca46b4a5852f8f4f24b3c766dd94a1075c4884", size = 146223, upload-time = "2025-08-09T07:56:09.708Z" }, + { url = "https://files.pythonhosted.org/packages/fc/eb/a2ffb08547f4e1e5415fb69eb7db25932c52a52bed371429648db4d84fb1/charset_normalizer-3.4.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c6fd51128a41297f5409deab284fecbe5305ebd7e5a1f959bee1c054622b7018", size = 159366, upload-time = "2025-08-09T07:56:11.326Z" }, + { url = "https://files.pythonhosted.org/packages/82/10/0fd19f20c624b278dddaf83b8464dcddc2456cb4b02bb902a6da126b87a1/charset_normalizer-3.4.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3cfb2aad70f2c6debfbcb717f23b7eb55febc0bb23dcffc0f076009da10c6392", size = 157104, upload-time = "2025-08-09T07:56:13.014Z" }, + { url = "https://files.pythonhosted.org/packages/16/ab/0233c3231af734f5dfcf0844aa9582d5a1466c985bbed6cedab85af9bfe3/charset_normalizer-3.4.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:1606f4a55c0fd363d754049cdf400175ee96c992b1f8018b993941f221221c5f", size = 151830, upload-time = "2025-08-09T07:56:14.428Z" }, + { url = "https://files.pythonhosted.org/packages/ae/02/e29e22b4e02839a0e4a06557b1999d0a47db3567e82989b5bb21f3fbbd9f/charset_normalizer-3.4.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:027b776c26d38b7f15b26a5da1044f376455fb3766df8fc38563b4efbc515154", size = 148854, upload-time = "2025-08-09T07:56:16.051Z" }, + { url = "https://files.pythonhosted.org/packages/05/6b/e2539a0a4be302b481e8cafb5af8792da8093b486885a1ae4d15d452bcec/charset_normalizer-3.4.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:42e5088973e56e31e4fa58eb6bd709e42fc03799c11c42929592889a2e54c491", size = 160670, upload-time = "2025-08-09T07:56:17.314Z" }, + { url = "https://files.pythonhosted.org/packages/31/e7/883ee5676a2ef217a40ce0bffcc3d0dfbf9e64cbcfbdf822c52981c3304b/charset_normalizer-3.4.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:cc34f233c9e71701040d772aa7490318673aa7164a0efe3172b2981218c26d93", size = 158501, upload-time = "2025-08-09T07:56:18.641Z" }, + { url = "https://files.pythonhosted.org/packages/c1/35/6525b21aa0db614cf8b5792d232021dca3df7f90a1944db934efa5d20bb1/charset_normalizer-3.4.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:320e8e66157cc4e247d9ddca8e21f427efc7a04bbd0ac8a9faf56583fa543f9f", size = 153173, upload-time = "2025-08-09T07:56:20.289Z" }, + { url = "https://files.pythonhosted.org/packages/50/ee/f4704bad8201de513fdc8aac1cabc87e38c5818c93857140e06e772b5892/charset_normalizer-3.4.3-cp312-cp312-win32.whl", hash = "sha256:fb6fecfd65564f208cbf0fba07f107fb661bcd1a7c389edbced3f7a493f70e37", size = 99822, upload-time = "2025-08-09T07:56:21.551Z" }, + { url = "https://files.pythonhosted.org/packages/39/f5/3b3836ca6064d0992c58c7561c6b6eee1b3892e9665d650c803bd5614522/charset_normalizer-3.4.3-cp312-cp312-win_amd64.whl", hash = "sha256:86df271bf921c2ee3818f0522e9a5b8092ca2ad8b065ece5d7d9d0e9f4849bcc", size = 
107543, upload-time = "2025-08-09T07:56:23.115Z" }, + { url = "https://files.pythonhosted.org/packages/65/ca/2135ac97709b400c7654b4b764daf5c5567c2da45a30cdd20f9eefe2d658/charset_normalizer-3.4.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:14c2a87c65b351109f6abfc424cab3927b3bdece6f706e4d12faaf3d52ee5efe", size = 205326, upload-time = "2025-08-09T07:56:24.721Z" }, + { url = "https://files.pythonhosted.org/packages/71/11/98a04c3c97dd34e49c7d247083af03645ca3730809a5509443f3c37f7c99/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:41d1fc408ff5fdfb910200ec0e74abc40387bccb3252f3f27c0676731df2b2c8", size = 146008, upload-time = "2025-08-09T07:56:26.004Z" }, + { url = "https://files.pythonhosted.org/packages/60/f5/4659a4cb3c4ec146bec80c32d8bb16033752574c20b1252ee842a95d1a1e/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1bb60174149316da1c35fa5233681f7c0f9f514509b8e399ab70fea5f17e45c9", size = 159196, upload-time = "2025-08-09T07:56:27.25Z" }, + { url = "https://files.pythonhosted.org/packages/86/9e/f552f7a00611f168b9a5865a1414179b2c6de8235a4fa40189f6f79a1753/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:30d006f98569de3459c2fc1f2acde170b7b2bd265dc1943e87e1a4efe1b67c31", size = 156819, upload-time = "2025-08-09T07:56:28.515Z" }, + { url = "https://files.pythonhosted.org/packages/7e/95/42aa2156235cbc8fa61208aded06ef46111c4d3f0de233107b3f38631803/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:416175faf02e4b0810f1f38bcb54682878a4af94059a1cd63b8747244420801f", size = 151350, upload-time = "2025-08-09T07:56:29.716Z" }, + { url = 
"https://files.pythonhosted.org/packages/c2/a9/3865b02c56f300a6f94fc631ef54f0a8a29da74fb45a773dfd3dcd380af7/charset_normalizer-3.4.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6aab0f181c486f973bc7262a97f5aca3ee7e1437011ef0c2ec04b5a11d16c927", size = 148644, upload-time = "2025-08-09T07:56:30.984Z" }, + { url = "https://files.pythonhosted.org/packages/77/d9/cbcf1a2a5c7d7856f11e7ac2d782aec12bdfea60d104e60e0aa1c97849dc/charset_normalizer-3.4.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:fdabf8315679312cfa71302f9bd509ded4f2f263fb5b765cf1433b39106c3cc9", size = 160468, upload-time = "2025-08-09T07:56:32.252Z" }, + { url = "https://files.pythonhosted.org/packages/f6/42/6f45efee8697b89fda4d50580f292b8f7f9306cb2971d4b53f8914e4d890/charset_normalizer-3.4.3-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:bd28b817ea8c70215401f657edef3a8aa83c29d447fb0b622c35403780ba11d5", size = 158187, upload-time = "2025-08-09T07:56:33.481Z" }, + { url = "https://files.pythonhosted.org/packages/70/99/f1c3bdcfaa9c45b3ce96f70b14f070411366fa19549c1d4832c935d8e2c3/charset_normalizer-3.4.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:18343b2d246dc6761a249ba1fb13f9ee9a2bcd95decc767319506056ea4ad4dc", size = 152699, upload-time = "2025-08-09T07:56:34.739Z" }, + { url = "https://files.pythonhosted.org/packages/a3/ad/b0081f2f99a4b194bcbb1934ef3b12aa4d9702ced80a37026b7607c72e58/charset_normalizer-3.4.3-cp313-cp313-win32.whl", hash = "sha256:6fb70de56f1859a3f71261cbe41005f56a7842cc348d3aeb26237560bfa5e0ce", size = 99580, upload-time = "2025-08-09T07:56:35.981Z" }, + { url = "https://files.pythonhosted.org/packages/9a/8f/ae790790c7b64f925e5c953b924aaa42a243fb778fed9e41f147b2a5715a/charset_normalizer-3.4.3-cp313-cp313-win_amd64.whl", hash = "sha256:cf1ebb7d78e1ad8ec2a8c4732c7be2e736f6e5123a4146c5b89c9d1f585f8cef", size = 107366, upload-time = "2025-08-09T07:56:37.339Z" }, + { url = 
"https://files.pythonhosted.org/packages/8e/91/b5a06ad970ddc7a0e513112d40113e834638f4ca1120eb727a249fb2715e/charset_normalizer-3.4.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:3cd35b7e8aedeb9e34c41385fda4f73ba609e561faedfae0a9e75e44ac558a15", size = 204342, upload-time = "2025-08-09T07:56:38.687Z" }, + { url = "https://files.pythonhosted.org/packages/ce/ec/1edc30a377f0a02689342f214455c3f6c2fbedd896a1d2f856c002fc3062/charset_normalizer-3.4.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b89bc04de1d83006373429975f8ef9e7932534b8cc9ca582e4db7d20d91816db", size = 145995, upload-time = "2025-08-09T07:56:40.048Z" }, + { url = "https://files.pythonhosted.org/packages/17/e5/5e67ab85e6d22b04641acb5399c8684f4d37caf7558a53859f0283a650e9/charset_normalizer-3.4.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2001a39612b241dae17b4687898843f254f8748b796a2e16f1051a17078d991d", size = 158640, upload-time = "2025-08-09T07:56:41.311Z" }, + { url = "https://files.pythonhosted.org/packages/f1/e5/38421987f6c697ee3722981289d554957c4be652f963d71c5e46a262e135/charset_normalizer-3.4.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8dcfc373f888e4fb39a7bc57e93e3b845e7f462dacc008d9749568b1c4ece096", size = 156636, upload-time = "2025-08-09T07:56:43.195Z" }, + { url = "https://files.pythonhosted.org/packages/a0/e4/5a075de8daa3ec0745a9a3b54467e0c2967daaaf2cec04c845f73493e9a1/charset_normalizer-3.4.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18b97b8404387b96cdbd30ad660f6407799126d26a39ca65729162fd810a99aa", size = 150939, upload-time = "2025-08-09T07:56:44.819Z" }, + { url = "https://files.pythonhosted.org/packages/02/f7/3611b32318b30974131db62b4043f335861d4d9b49adc6d57c1149cc49d4/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = 
"sha256:ccf600859c183d70eb47e05a44cd80a4ce77394d1ac0f79dbd2dd90a69a3a049", size = 148580, upload-time = "2025-08-09T07:56:46.684Z" }, + { url = "https://files.pythonhosted.org/packages/7e/61/19b36f4bd67f2793ab6a99b979b4e4f3d8fc754cbdffb805335df4337126/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:53cd68b185d98dde4ad8990e56a58dea83a4162161b1ea9272e5c9182ce415e0", size = 159870, upload-time = "2025-08-09T07:56:47.941Z" }, + { url = "https://files.pythonhosted.org/packages/06/57/84722eefdd338c04cf3030ada66889298eaedf3e7a30a624201e0cbe424a/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:30a96e1e1f865f78b030d65241c1ee850cdf422d869e9028e2fc1d5e4db73b92", size = 157797, upload-time = "2025-08-09T07:56:49.756Z" }, + { url = "https://files.pythonhosted.org/packages/72/2a/aff5dd112b2f14bcc3462c312dce5445806bfc8ab3a7328555da95330e4b/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d716a916938e03231e86e43782ca7878fb602a125a91e7acb8b5112e2e96ac16", size = 152224, upload-time = "2025-08-09T07:56:51.369Z" }, + { url = "https://files.pythonhosted.org/packages/b7/8c/9839225320046ed279c6e839d51f028342eb77c91c89b8ef2549f951f3ec/charset_normalizer-3.4.3-cp314-cp314-win32.whl", hash = "sha256:c6dbd0ccdda3a2ba7c2ecd9d77b37f3b5831687d8dc1b6ca5f56a4880cc7b7ce", size = 100086, upload-time = "2025-08-09T07:56:52.722Z" }, + { url = "https://files.pythonhosted.org/packages/ee/7a/36fbcf646e41f710ce0a563c1c9a343c6edf9be80786edeb15b6f62e17db/charset_normalizer-3.4.3-cp314-cp314-win_amd64.whl", hash = "sha256:73dc19b562516fc9bcf6e5d6e596df0b4eb98d87e4f79f3ae71840e6ed21361c", size = 107400, upload-time = "2025-08-09T07:56:55.172Z" }, + { url = "https://files.pythonhosted.org/packages/8a/1f/f041989e93b001bc4e44bb1669ccdcf54d3f00e628229a85b08d330615c5/charset_normalizer-3.4.3-py3-none-any.whl", hash = "sha256:ce571ab16d890d23b5c278547ba694193a45011ff86a9162a71307ed9f86759a", size = 53175, upload-time = 
"2025-08-09T07:57:26.864Z" }, +] + +[[package]] +name = "click" +version = "8.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/60/6c/8ca2efa64cf75a977a0d7fac081354553ebe483345c734fb6b6515d96bbc/click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202", size = 286342, upload-time = "2025-05-20T23:19:49.832Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b", size = 102215, upload-time = "2025-05-20T23:19:47.796Z" }, +] + +[[package]] +name = "cloudpickle" +version = "3.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/52/39/069100b84d7418bc358d81669d5748efb14b9cceacd2f9c75f550424132f/cloudpickle-3.1.1.tar.gz", hash = "sha256:b216fa8ae4019d5482a8ac3c95d8f6346115d8835911fd4aefd1a445e4242c64", size = 22113, upload-time = "2025-01-14T17:02:05.085Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/e8/64c37fadfc2816a7701fa8a6ed8d87327c7d54eacfbfb6edab14a2f2be75/cloudpickle-3.1.1-py3-none-any.whl", hash = "sha256:c8c5a44295039331ee9dad40ba100a9c7297b6f988e50e87ccdf3765a668350e", size = 20992, upload-time = "2025-01-14T17:02:02.417Z" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "colorful" +version = "0.5.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0c/0c/d180ebf230b771907f46981023a80f62cf592d49673cc5f8a5993aa67bb6/colorful-0.5.7.tar.gz", hash = "sha256:c5452179b56601c178b03d468a5326cc1fe37d9be81d24d0d6bdab36c4b93ad8", size = 209487, upload-time = "2025-06-30T15:24:03.936Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e2/98/0d791b3d1eaed89d7d370b5cf9b8079b124da0545559417f394ba21b5532/colorful-0.5.7-py2.py3-none-any.whl", hash = "sha256:495dd3a23151a9568cee8a90fc1174c902ad7ef06655f50b6bddf9e80008da69", size = 201475, upload-time = "2025-06-30T15:24:02.693Z" }, +] + +[[package]] +name = "comm" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4c/13/7d740c5849255756bc17888787313b61fd38a0a8304fc4f073dfc46122aa/comm-0.2.3.tar.gz", hash = "sha256:2dc8048c10962d55d7ad693be1e7045d891b7ce8d999c97963a5e3e99c055971", size = 6319, upload-time = "2025-07-25T14:02:04.452Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl", hash = "sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417", size = 7294, upload-time = "2025-07-25T14:02:02.896Z" }, +] + +[[package]] +name = "contourpy" +version = "1.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/58/01/1253e6698a07380cd31a736d248a3f2a50a7c88779a1813da27503cadc2a/contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880", size = 13466174, upload-time = "2025-07-26T12:03:12.549Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/45/adfee365d9ea3d853550b2e735f9d66366701c65db7855cd07621732ccfc/contourpy-1.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b08a32ea2f8e42cf1d4be3169a98dd4be32bafe4f22b6c4cb4ba810fa9e5d2cb", size = 293419, upload-time = "2025-07-26T12:01:21.16Z" }, + { url = "https://files.pythonhosted.org/packages/53/3e/405b59cfa13021a56bba395a6b3aca8cec012b45bf177b0eaf7a202cde2c/contourpy-1.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:556dba8fb6f5d8742f2923fe9457dbdd51e1049c4a43fd3986a0b14a1d815fc6", size = 273979, upload-time = "2025-07-26T12:01:22.448Z" }, + { url = "https://files.pythonhosted.org/packages/d4/1c/a12359b9b2ca3a845e8f7f9ac08bdf776114eb931392fcad91743e2ea17b/contourpy-1.3.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92d9abc807cf7d0e047b95ca5d957cf4792fcd04e920ca70d48add15c1a90ea7", size = 332653, upload-time = "2025-07-26T12:01:24.155Z" }, + { url = "https://files.pythonhosted.org/packages/63/12/897aeebfb475b7748ea67b61e045accdfcf0d971f8a588b67108ed7f5512/contourpy-1.3.3-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2e8faa0ed68cb29af51edd8e24798bb661eac3bd9f65420c1887b6ca89987c8", size = 379536, upload-time = "2025-07-26T12:01:25.91Z" }, + { url = "https://files.pythonhosted.org/packages/43/8a/a8c584b82deb248930ce069e71576fc09bd7174bbd35183b7943fb1064fd/contourpy-1.3.3-cp312-cp312-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:626d60935cf668e70a5ce6ff184fd713e9683fb458898e4249b63be9e28286ea", size = 384397, upload-time = "2025-07-26T12:01:27.152Z" }, + { url = 
"https://files.pythonhosted.org/packages/cc/8f/ec6289987824b29529d0dfda0d74a07cec60e54b9c92f3c9da4c0ac732de/contourpy-1.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d00e655fcef08aba35ec9610536bfe90267d7ab5ba944f7032549c55a146da1", size = 362601, upload-time = "2025-07-26T12:01:28.808Z" }, + { url = "https://files.pythonhosted.org/packages/05/0a/a3fe3be3ee2dceb3e615ebb4df97ae6f3828aa915d3e10549ce016302bd1/contourpy-1.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:451e71b5a7d597379ef572de31eeb909a87246974d960049a9848c3bc6c41bf7", size = 1331288, upload-time = "2025-07-26T12:01:31.198Z" }, + { url = "https://files.pythonhosted.org/packages/33/1d/acad9bd4e97f13f3e2b18a3977fe1b4a37ecf3d38d815333980c6c72e963/contourpy-1.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:459c1f020cd59fcfe6650180678a9993932d80d44ccde1fa1868977438f0b411", size = 1403386, upload-time = "2025-07-26T12:01:33.947Z" }, + { url = "https://files.pythonhosted.org/packages/cf/8f/5847f44a7fddf859704217a99a23a4f6417b10e5ab1256a179264561540e/contourpy-1.3.3-cp312-cp312-win32.whl", hash = "sha256:023b44101dfe49d7d53932be418477dba359649246075c996866106da069af69", size = 185018, upload-time = "2025-07-26T12:01:35.64Z" }, + { url = "https://files.pythonhosted.org/packages/19/e8/6026ed58a64563186a9ee3f29f41261fd1828f527dd93d33b60feca63352/contourpy-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:8153b8bfc11e1e4d75bcb0bff1db232f9e10b274e0929de9d608027e0d34ff8b", size = 226567, upload-time = "2025-07-26T12:01:36.804Z" }, + { url = "https://files.pythonhosted.org/packages/d1/e2/f05240d2c39a1ed228d8328a78b6f44cd695f7ef47beb3e684cf93604f86/contourpy-1.3.3-cp312-cp312-win_arm64.whl", hash = "sha256:07ce5ed73ecdc4a03ffe3e1b3e3c1166db35ae7584be76f65dbbe28a7791b0cc", size = 193655, upload-time = "2025-07-26T12:01:37.999Z" }, + { url = 
"https://files.pythonhosted.org/packages/68/35/0167aad910bbdb9599272bd96d01a9ec6852f36b9455cf2ca67bd4cc2d23/contourpy-1.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:177fb367556747a686509d6fef71d221a4b198a3905fe824430e5ea0fda54eb5", size = 293257, upload-time = "2025-07-26T12:01:39.367Z" }, + { url = "https://files.pythonhosted.org/packages/96/e4/7adcd9c8362745b2210728f209bfbcf7d91ba868a2c5f40d8b58f54c509b/contourpy-1.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d002b6f00d73d69333dac9d0b8d5e84d9724ff9ef044fd63c5986e62b7c9e1b1", size = 274034, upload-time = "2025-07-26T12:01:40.645Z" }, + { url = "https://files.pythonhosted.org/packages/73/23/90e31ceeed1de63058a02cb04b12f2de4b40e3bef5e082a7c18d9c8ae281/contourpy-1.3.3-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:348ac1f5d4f1d66d3322420f01d42e43122f43616e0f194fc1c9f5d830c5b286", size = 334672, upload-time = "2025-07-26T12:01:41.942Z" }, + { url = "https://files.pythonhosted.org/packages/ed/93/b43d8acbe67392e659e1d984700e79eb67e2acb2bd7f62012b583a7f1b55/contourpy-1.3.3-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:655456777ff65c2c548b7c454af9c6f33f16c8884f11083244b5819cc214f1b5", size = 381234, upload-time = "2025-07-26T12:01:43.499Z" }, + { url = "https://files.pythonhosted.org/packages/46/3b/bec82a3ea06f66711520f75a40c8fc0b113b2a75edb36aa633eb11c4f50f/contourpy-1.3.3-cp313-cp313-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:644a6853d15b2512d67881586bd03f462c7ab755db95f16f14d7e238f2852c67", size = 385169, upload-time = "2025-07-26T12:01:45.219Z" }, + { url = "https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4debd64f124ca62069f313a9cb86656ff087786016d76927ae2cf37846b006c9", size = 362859, upload-time = "2025-07-26T12:01:46.519Z" }, + { url = 
"https://files.pythonhosted.org/packages/33/71/e2a7945b7de4e58af42d708a219f3b2f4cff7386e6b6ab0a0fa0033c49a9/contourpy-1.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a15459b0f4615b00bbd1e91f1b9e19b7e63aea7483d03d804186f278c0af2659", size = 1332062, upload-time = "2025-07-26T12:01:48.964Z" }, + { url = "https://files.pythonhosted.org/packages/12/fc/4e87ac754220ccc0e807284f88e943d6d43b43843614f0a8afa469801db0/contourpy-1.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca0fdcd73925568ca027e0b17ab07aad764be4706d0a925b89227e447d9737b7", size = 1403932, upload-time = "2025-07-26T12:01:51.979Z" }, + { url = "https://files.pythonhosted.org/packages/a6/2e/adc197a37443f934594112222ac1aa7dc9a98faf9c3842884df9a9d8751d/contourpy-1.3.3-cp313-cp313-win32.whl", hash = "sha256:b20c7c9a3bf701366556e1b1984ed2d0cedf999903c51311417cf5f591d8c78d", size = 185024, upload-time = "2025-07-26T12:01:53.245Z" }, + { url = "https://files.pythonhosted.org/packages/18/0b/0098c214843213759692cc638fce7de5c289200a830e5035d1791d7a2338/contourpy-1.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:1cadd8b8969f060ba45ed7c1b714fe69185812ab43bd6b86a9123fe8f99c3263", size = 226578, upload-time = "2025-07-26T12:01:54.422Z" }, + { url = "https://files.pythonhosted.org/packages/8a/9a/2f6024a0c5995243cd63afdeb3651c984f0d2bc727fd98066d40e141ad73/contourpy-1.3.3-cp313-cp313-win_arm64.whl", hash = "sha256:fd914713266421b7536de2bfa8181aa8c699432b6763a0ea64195ebe28bff6a9", size = 193524, upload-time = "2025-07-26T12:01:55.73Z" }, + { url = "https://files.pythonhosted.org/packages/c0/b3/f8a1a86bd3298513f500e5b1f5fd92b69896449f6cab6a146a5d52715479/contourpy-1.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:88df9880d507169449d434c293467418b9f6cbe82edd19284aa0409e7fdb933d", size = 306730, upload-time = "2025-07-26T12:01:57.051Z" }, + { url = 
"https://files.pythonhosted.org/packages/3f/11/4780db94ae62fc0c2053909b65dc3246bd7cecfc4f8a20d957ad43aa4ad8/contourpy-1.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d06bb1f751ba5d417047db62bca3c8fde202b8c11fb50742ab3ab962c81e8216", size = 287897, upload-time = "2025-07-26T12:01:58.663Z" }, + { url = "https://files.pythonhosted.org/packages/ae/15/e59f5f3ffdd6f3d4daa3e47114c53daabcb18574a26c21f03dc9e4e42ff0/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e4e6b05a45525357e382909a4c1600444e2a45b4795163d3b22669285591c1ae", size = 326751, upload-time = "2025-07-26T12:02:00.343Z" }, + { url = "https://files.pythonhosted.org/packages/0f/81/03b45cfad088e4770b1dcf72ea78d3802d04200009fb364d18a493857210/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ab3074b48c4e2cf1a960e6bbeb7f04566bf36b1861d5c9d4d8ac04b82e38ba20", size = 375486, upload-time = "2025-07-26T12:02:02.128Z" }, + { url = "https://files.pythonhosted.org/packages/0c/ba/49923366492ffbdd4486e970d421b289a670ae8cf539c1ea9a09822b371a/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c3d53c796f8647d6deb1abe867daeb66dcc8a97e8455efa729516b997b8ed99", size = 388106, upload-time = "2025-07-26T12:02:03.615Z" }, + { url = "https://files.pythonhosted.org/packages/9f/52/5b00ea89525f8f143651f9f03a0df371d3cbd2fccd21ca9b768c7a6500c2/contourpy-1.3.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50ed930df7289ff2a8d7afeb9603f8289e5704755c7e5c3bbd929c90c817164b", size = 352548, upload-time = "2025-07-26T12:02:05.165Z" }, + { url = "https://files.pythonhosted.org/packages/32/1d/a209ec1a3a3452d490f6b14dd92e72280c99ae3d1e73da74f8277d4ee08f/contourpy-1.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4feffb6537d64b84877da813a5c30f1422ea5739566abf0bd18065ac040e120a", size = 1322297, upload-time = "2025-07-26T12:02:07.379Z" }, + { url = 
"https://files.pythonhosted.org/packages/bc/9e/46f0e8ebdd884ca0e8877e46a3f4e633f6c9c8c4f3f6e72be3fe075994aa/contourpy-1.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2b7e9480ffe2b0cd2e787e4df64270e3a0440d9db8dc823312e2c940c167df7e", size = 1391023, upload-time = "2025-07-26T12:02:10.171Z" }, + { url = "https://files.pythonhosted.org/packages/b9/70/f308384a3ae9cd2209e0849f33c913f658d3326900d0ff5d378d6a1422d2/contourpy-1.3.3-cp313-cp313t-win32.whl", hash = "sha256:283edd842a01e3dcd435b1c5116798d661378d83d36d337b8dde1d16a5fc9ba3", size = 196157, upload-time = "2025-07-26T12:02:11.488Z" }, + { url = "https://files.pythonhosted.org/packages/b2/dd/880f890a6663b84d9e34a6f88cded89d78f0091e0045a284427cb6b18521/contourpy-1.3.3-cp313-cp313t-win_amd64.whl", hash = "sha256:87acf5963fc2b34825e5b6b048f40e3635dd547f590b04d2ab317c2619ef7ae8", size = 240570, upload-time = "2025-07-26T12:02:12.754Z" }, + { url = "https://files.pythonhosted.org/packages/80/99/2adc7d8ffead633234817ef8e9a87115c8a11927a94478f6bb3d3f4d4f7d/contourpy-1.3.3-cp313-cp313t-win_arm64.whl", hash = "sha256:3c30273eb2a55024ff31ba7d052dde990d7d8e5450f4bbb6e913558b3d6c2301", size = 199713, upload-time = "2025-07-26T12:02:14.4Z" }, + { url = "https://files.pythonhosted.org/packages/72/8b/4546f3ab60f78c514ffb7d01a0bd743f90de36f0019d1be84d0a708a580a/contourpy-1.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fde6c716d51c04b1c25d0b90364d0be954624a0ee9d60e23e850e8d48353d07a", size = 292189, upload-time = "2025-07-26T12:02:16.095Z" }, + { url = "https://files.pythonhosted.org/packages/fd/e1/3542a9cb596cadd76fcef413f19c79216e002623158befe6daa03dbfa88c/contourpy-1.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:cbedb772ed74ff5be440fa8eee9bd49f64f6e3fc09436d9c7d8f1c287b121d77", size = 273251, upload-time = "2025-07-26T12:02:17.524Z" }, + { url = 
"https://files.pythonhosted.org/packages/b1/71/f93e1e9471d189f79d0ce2497007731c1e6bf9ef6d1d61b911430c3db4e5/contourpy-1.3.3-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22e9b1bd7a9b1d652cd77388465dc358dafcd2e217d35552424aa4f996f524f5", size = 335810, upload-time = "2025-07-26T12:02:18.9Z" }, + { url = "https://files.pythonhosted.org/packages/91/f9/e35f4c1c93f9275d4e38681a80506b5510e9327350c51f8d4a5a724d178c/contourpy-1.3.3-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a22738912262aa3e254e4f3cb079a95a67132fc5a063890e224393596902f5a4", size = 382871, upload-time = "2025-07-26T12:02:20.418Z" }, + { url = "https://files.pythonhosted.org/packages/b5/71/47b512f936f66a0a900d81c396a7e60d73419868fba959c61efed7a8ab46/contourpy-1.3.3-cp314-cp314-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:afe5a512f31ee6bd7d0dda52ec9864c984ca3d66664444f2d72e0dc4eb832e36", size = 386264, upload-time = "2025-07-26T12:02:21.916Z" }, + { url = "https://files.pythonhosted.org/packages/04/5f/9ff93450ba96b09c7c2b3f81c94de31c89f92292f1380261bd7195bea4ea/contourpy-1.3.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f64836de09927cba6f79dcd00fdd7d5329f3fccc633468507079c829ca4db4e3", size = 363819, upload-time = "2025-07-26T12:02:23.759Z" }, + { url = "https://files.pythonhosted.org/packages/3e/a6/0b185d4cc480ee494945cde102cb0149ae830b5fa17bf855b95f2e70ad13/contourpy-1.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1fd43c3be4c8e5fd6e4f2baeae35ae18176cf2e5cced681cca908addf1cdd53b", size = 1333650, upload-time = "2025-07-26T12:02:26.181Z" }, + { url = "https://files.pythonhosted.org/packages/43/d7/afdc95580ca56f30fbcd3060250f66cedbde69b4547028863abd8aa3b47e/contourpy-1.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6afc576f7b33cf00996e5c1102dc2a8f7cc89e39c0b55df93a0b78c1bd992b36", size = 1404833, upload-time = "2025-07-26T12:02:28.782Z" }, + { url = 
"https://files.pythonhosted.org/packages/e2/e2/366af18a6d386f41132a48f033cbd2102e9b0cf6345d35ff0826cd984566/contourpy-1.3.3-cp314-cp314-win32.whl", hash = "sha256:66c8a43a4f7b8df8b71ee1840e4211a3c8d93b214b213f590e18a1beca458f7d", size = 189692, upload-time = "2025-07-26T12:02:30.128Z" }, + { url = "https://files.pythonhosted.org/packages/7d/c2/57f54b03d0f22d4044b8afb9ca0e184f8b1afd57b4f735c2fa70883dc601/contourpy-1.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:cf9022ef053f2694e31d630feaacb21ea24224be1c3ad0520b13d844274614fd", size = 232424, upload-time = "2025-07-26T12:02:31.395Z" }, + { url = "https://files.pythonhosted.org/packages/18/79/a9416650df9b525737ab521aa181ccc42d56016d2123ddcb7b58e926a42c/contourpy-1.3.3-cp314-cp314-win_arm64.whl", hash = "sha256:95b181891b4c71de4bb404c6621e7e2390745f887f2a026b2d99e92c17892339", size = 198300, upload-time = "2025-07-26T12:02:32.956Z" }, + { url = "https://files.pythonhosted.org/packages/1f/42/38c159a7d0f2b7b9c04c64ab317042bb6952b713ba875c1681529a2932fe/contourpy-1.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:33c82d0138c0a062380332c861387650c82e4cf1747aaa6938b9b6516762e772", size = 306769, upload-time = "2025-07-26T12:02:34.2Z" }, + { url = "https://files.pythonhosted.org/packages/c3/6c/26a8205f24bca10974e77460de68d3d7c63e282e23782f1239f226fcae6f/contourpy-1.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ea37e7b45949df430fe649e5de8351c423430046a2af20b1c1961cae3afcda77", size = 287892, upload-time = "2025-07-26T12:02:35.807Z" }, + { url = "https://files.pythonhosted.org/packages/66/06/8a475c8ab718ebfd7925661747dbb3c3ee9c82ac834ccb3570be49d129f4/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d304906ecc71672e9c89e87c4675dc5c2645e1f4269a5063b99b0bb29f232d13", size = 326748, upload-time = "2025-07-26T12:02:37.193Z" }, + { url = 
"https://files.pythonhosted.org/packages/b4/a3/c5ca9f010a44c223f098fccd8b158bb1cb287378a31ac141f04730dc49be/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ca658cd1a680a5c9ea96dc61cdbae1e85c8f25849843aa799dfd3cb370ad4fbe", size = 375554, upload-time = "2025-07-26T12:02:38.894Z" }, + { url = "https://files.pythonhosted.org/packages/80/5b/68bd33ae63fac658a4145088c1e894405e07584a316738710b636c6d0333/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ab2fd90904c503739a75b7c8c5c01160130ba67944a7b77bbf36ef8054576e7f", size = 388118, upload-time = "2025-07-26T12:02:40.642Z" }, + { url = "https://files.pythonhosted.org/packages/40/52/4c285a6435940ae25d7410a6c36bda5145839bc3f0beb20c707cda18b9d2/contourpy-1.3.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7301b89040075c30e5768810bc96a8e8d78085b47d8be6e4c3f5a0b4ed478a0", size = 352555, upload-time = "2025-07-26T12:02:42.25Z" }, + { url = "https://files.pythonhosted.org/packages/24/ee/3e81e1dd174f5c7fefe50e85d0892de05ca4e26ef1c9a59c2a57e43b865a/contourpy-1.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2a2a8b627d5cc6b7c41a4beff6c5ad5eb848c88255fda4a8745f7e901b32d8e4", size = 1322295, upload-time = "2025-07-26T12:02:44.668Z" }, + { url = "https://files.pythonhosted.org/packages/3c/b2/6d913d4d04e14379de429057cd169e5e00f6c2af3bb13e1710bcbdb5da12/contourpy-1.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:fd6ec6be509c787f1caf6b247f0b1ca598bef13f4ddeaa126b7658215529ba0f", size = 1391027, upload-time = "2025-07-26T12:02:47.09Z" }, + { url = "https://files.pythonhosted.org/packages/93/8a/68a4ec5c55a2971213d29a9374913f7e9f18581945a7a31d1a39b5d2dfe5/contourpy-1.3.3-cp314-cp314t-win32.whl", hash = "sha256:e74a9a0f5e3fff48fb5a7f2fd2b9b70a3fe014a67522f79b7cca4c0c7e43c9ae", size = 202428, upload-time = "2025-07-26T12:02:48.691Z" }, + { url = 
"https://files.pythonhosted.org/packages/fa/96/fd9f641ffedc4fa3ace923af73b9d07e869496c9cc7a459103e6e978992f/contourpy-1.3.3-cp314-cp314t-win_amd64.whl", hash = "sha256:13b68d6a62db8eafaebb8039218921399baf6e47bf85006fd8529f2a08ef33fc", size = 250331, upload-time = "2025-07-26T12:02:50.137Z" }, + { url = "https://files.pythonhosted.org/packages/ae/8c/469afb6465b853afff216f9528ffda78a915ff880ed58813ba4faf4ba0b6/contourpy-1.3.3-cp314-cp314t-win_arm64.whl", hash = "sha256:b7448cb5a725bb1e35ce88771b86fba35ef418952474492cf7c764059933ff8b", size = 203831, upload-time = "2025-07-26T12:02:51.449Z" }, +] + +[[package]] +name = "cycler" +version = "0.12.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c", size = 7615, upload-time = "2023-10-07T05:32:18.335Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" }, +] + +[[package]] +name = "databricks-sdk" +version = "0.63.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/af/f77be9ed096e7e83732e80d87997256d7bd7d9aff62fa0d51e4068ae786c/databricks_sdk-0.63.0.tar.gz", hash = "sha256:f141bc810b4145e93e628a0e159ea41806440aed0cd17adddd252d65d1968465", size = 732112, upload-time = "2025-08-13T09:00:08.722Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/fd/2bb9bdf083a6943630762108e3e4e24b83f0571dd39cfa4fd467adaa1921/databricks_sdk-0.63.0-py3-none-any.whl", hash = 
"sha256:3ea569dcd0a4395c17221a5da39db4da85c6fc91b5fc14546514e329451d5eb3", size = 688018, upload-time = "2025-08-13T09:00:07.071Z" }, +] + +[[package]] +name = "debugpy" +version = "1.8.16" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ca/d4/722d0bcc7986172ac2ef3c979ad56a1030e3afd44ced136d45f8142b1f4a/debugpy-1.8.16.tar.gz", hash = "sha256:31e69a1feb1cf6b51efbed3f6c9b0ef03bc46ff050679c4be7ea6d2e23540870", size = 1643809, upload-time = "2025-08-06T18:00:02.647Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/61/fb/0387c0e108d842c902801bc65ccc53e5b91d8c169702a9bbf4f7efcedf0c/debugpy-1.8.16-cp312-cp312-macosx_14_0_universal2.whl", hash = "sha256:b202e2843e32e80b3b584bcebfe0e65e0392920dc70df11b2bfe1afcb7a085e4", size = 2511822, upload-time = "2025-08-06T18:00:18.526Z" }, + { url = "https://files.pythonhosted.org/packages/37/44/19e02745cae22bf96440141f94e15a69a1afaa3a64ddfc38004668fcdebf/debugpy-1.8.16-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64473c4a306ba11a99fe0bb14622ba4fbd943eb004847d9b69b107bde45aa9ea", size = 4230135, upload-time = "2025-08-06T18:00:19.997Z" }, + { url = "https://files.pythonhosted.org/packages/f3/0b/19b1ba5ee4412f303475a2c7ad5858efb99c90eae5ec627aa6275c439957/debugpy-1.8.16-cp312-cp312-win32.whl", hash = "sha256:833a61ed446426e38b0dd8be3e9d45ae285d424f5bf6cd5b2b559c8f12305508", size = 5281271, upload-time = "2025-08-06T18:00:21.281Z" }, + { url = "https://files.pythonhosted.org/packages/b1/e0/bc62e2dc141de53bd03e2c7cb9d7011de2e65e8bdcdaa26703e4d28656ba/debugpy-1.8.16-cp312-cp312-win_amd64.whl", hash = "sha256:75f204684581e9ef3dc2f67687c3c8c183fde2d6675ab131d94084baf8084121", size = 5323149, upload-time = "2025-08-06T18:00:23.033Z" }, + { url = 
"https://files.pythonhosted.org/packages/62/66/607ab45cc79e60624df386e233ab64a6d8d39ea02e7f80e19c1d451345bb/debugpy-1.8.16-cp313-cp313-macosx_14_0_universal2.whl", hash = "sha256:85df3adb1de5258dca910ae0bb185e48c98801ec15018a263a92bb06be1c8787", size = 2496157, upload-time = "2025-08-06T18:00:24.361Z" }, + { url = "https://files.pythonhosted.org/packages/4d/a0/c95baae08a75bceabb79868d663a0736655e427ab9c81fb848da29edaeac/debugpy-1.8.16-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bee89e948bc236a5c43c4214ac62d28b29388453f5fd328d739035e205365f0b", size = 4222491, upload-time = "2025-08-06T18:00:25.806Z" }, + { url = "https://files.pythonhosted.org/packages/5b/2f/1c8db6ddd8a257c3cd2c46413b267f1d5fa3df910401c899513ce30392d6/debugpy-1.8.16-cp313-cp313-win32.whl", hash = "sha256:cf358066650439847ec5ff3dae1da98b5461ea5da0173d93d5e10f477c94609a", size = 5281126, upload-time = "2025-08-06T18:00:27.207Z" }, + { url = "https://files.pythonhosted.org/packages/d3/ba/c3e154ab307366d6c5a9c1b68de04914e2ce7fa2f50d578311d8cc5074b2/debugpy-1.8.16-cp313-cp313-win_amd64.whl", hash = "sha256:b5aea1083f6f50023e8509399d7dc6535a351cc9f2e8827d1e093175e4d9fa4c", size = 5323094, upload-time = "2025-08-06T18:00:29.03Z" }, + { url = "https://files.pythonhosted.org/packages/52/57/ecc9ae29fa5b2d90107cd1d9bf8ed19aacb74b2264d986ae9d44fe9bdf87/debugpy-1.8.16-py2.py3-none-any.whl", hash = "sha256:19c9521962475b87da6f673514f7fd610328757ec993bf7ec0d8c96f9a325f9e", size = 5287700, upload-time = "2025-08-06T18:00:42.333Z" }, +] + +[[package]] +name = "decorator" +version = "5.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, +] + +[[package]] +name = "default" +version = "0.1.0" +source = { virtual = "." } +dependencies = [ + { name = "boto3" }, + { name = "doggos" }, + { name = "ipykernel" }, + { name = "ipywidgets" }, + { name = "matplotlib" }, + { name = "mlflow" }, + { name = "ray", extra = ["data", "serve", "train", "tune"] }, + { name = "scikit-learn" }, + { name = "torch" }, + { name = "transformers" }, +] + +[package.metadata] +requires-dist = [ + { name = "boto3", specifier = ">=1.40.9" }, + { name = "doggos", directory = "doggos" }, + { name = "ipykernel", specifier = ">=6.30.1" }, + { name = "ipywidgets", specifier = "==8.1.3" }, + { name = "matplotlib", specifier = "==3.10.0" }, + { name = "mlflow", specifier = "==2.19.0" }, + { name = "ray", extras = ["data", "serve", "train", "tune"], url = "http://localhost:9478/ray/ray-2.48.0-cp312-cp312-manylinux2014_x86_64.whl" }, + { name = "scikit-learn", specifier = "==1.6.0" }, + { name = "torch", specifier = "==2.7.1" }, + { name = "transformers", specifier = "==4.52.3" }, +] + +[[package]] +name = "distlib" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/96/8e/709914eb2b5749865801041647dc7f4e6d00b549cfe88b65ca192995f07c/distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d", size = 614605, upload-time = "2025-07-17T16:52:00.465Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = 
"2025-07-17T16:51:58.613Z" }, +] + +[[package]] +name = "docker" +version = "7.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "requests" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/91/9b/4a2ea29aeba62471211598dac5d96825bb49348fa07e906ea930394a83ce/docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c", size = 117834, upload-time = "2024-05-23T11:13:57.216Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0", size = 147774, upload-time = "2024-05-23T11:13:55.01Z" }, +] + +[[package]] +name = "doggos" +version = "0.1.0" +source = { directory = "doggos" } + +[[package]] +name = "executing" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/91/50/a9d80c47ff289c611ff12e63f7c5d13942c65d68125160cefd768c73e6e4/executing-2.2.0.tar.gz", hash = "sha256:5d108c028108fe2551d1a7b2e8b713341e2cb4fc0aa7dcf966fa4327a5226755", size = 978693, upload-time = "2025-01-22T15:41:29.403Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa", size = 26702, upload-time = "2025-01-22T15:41:25.929Z" }, +] + +[[package]] +name = "fastapi" +version = "0.116.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "starlette" }, + { name = "typing-extensions" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/78/d7/6c8b3bfe33eeffa208183ec037fee0cce9f7f024089ab1c5d12ef04bd27c/fastapi-0.116.1.tar.gz", hash = "sha256:ed52cbf946abfd70c5a0dccb24673f0670deeb517a88b3544d03c2a6bf283143", size = 296485, upload-time = "2025-07-11T16:22:32.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/47/d63c60f59a59467fda0f93f46335c9d18526d7071f025cb5b89d5353ea42/fastapi-0.116.1-py3-none-any.whl", hash = "sha256:c46ac7c312df840f0c9e220f7964bada936781bc4e2e6eb71f1c4d7553786565", size = 95631, upload-time = "2025-07-11T16:22:30.485Z" }, +] + +[[package]] +name = "filelock" +version = "3.19.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/40/bb/0ab3e58d22305b6f5440629d20683af28959bf793d98d11950e305c1c326/filelock-3.19.1.tar.gz", hash = "sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58", size = 17687, upload-time = "2025-08-14T16:56:03.016Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, +] + +[[package]] +name = "flask" +version = "3.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "blinker" }, + { name = "click" }, + { name = "itsdangerous" }, + { name = "jinja2" }, + { name = "markupsafe" }, + { name = "werkzeug" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c0/de/e47735752347f4128bcf354e0da07ef311a78244eba9e3dc1d4a5ab21a98/flask-3.1.1.tar.gz", hash = "sha256:284c7b8f2f58cb737f0cf1c30fd7eaf0ccfcde196099d24ecede3fc2005aa59e", size = 753440, upload-time = "2025-05-13T15:01:17.447Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/3d/68/9d4508e893976286d2ead7f8f571314af6c2037af34853a30fd769c02e9d/flask-3.1.1-py3-none-any.whl", hash = "sha256:07aae2bb5eaf77993ef57e357491839f5fd9f4dc281593a81a9e4d79a24f295c", size = 103305, upload-time = "2025-05-13T15:01:15.591Z" }, +] + +[[package]] +name = "fonttools" +version = "4.59.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/11/7f/29c9c3fe4246f6ad96fee52b88d0dc3a863c7563b0afc959e36d78b965dc/fonttools-4.59.1.tar.gz", hash = "sha256:74995b402ad09822a4c8002438e54940d9f1ecda898d2bb057729d7da983e4cb", size = 3534394, upload-time = "2025-08-14T16:28:14.266Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ac/fe/6e069cc4cb8881d164a9bd956e9df555bc62d3eb36f6282e43440200009c/fonttools-4.59.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:43ab814bbba5f02a93a152ee61a04182bb5809bd2bc3609f7822e12c53ae2c91", size = 2769172, upload-time = "2025-08-14T16:26:45.729Z" }, + { url = "https://files.pythonhosted.org/packages/b9/98/ec4e03f748fefa0dd72d9d95235aff6fef16601267f4a2340f0e16b9330f/fonttools-4.59.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4f04c3ffbfa0baafcbc550657cf83657034eb63304d27b05cff1653b448ccff6", size = 2337281, upload-time = "2025-08-14T16:26:47.921Z" }, + { url = "https://files.pythonhosted.org/packages/8b/b1/890360a7e3d04a30ba50b267aca2783f4c1364363797e892e78a4f036076/fonttools-4.59.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d601b153e51a5a6221f0d4ec077b6bfc6ac35bfe6c19aeaa233d8990b2b71726", size = 4909215, upload-time = "2025-08-14T16:26:49.682Z" }, + { url = "https://files.pythonhosted.org/packages/8a/ec/2490599550d6c9c97a44c1e36ef4de52d6acf742359eaa385735e30c05c4/fonttools-4.59.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:c735e385e30278c54f43a0d056736942023c9043f84ee1021eff9fd616d17693", size = 4951958, upload-time = "2025-08-14T16:26:51.616Z" }, + { url = "https://files.pythonhosted.org/packages/d1/40/bd053f6f7634234a9b9805ff8ae4f32df4f2168bee23cafd1271ba9915a9/fonttools-4.59.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1017413cdc8555dce7ee23720da490282ab7ec1cf022af90a241f33f9a49afc4", size = 4894738, upload-time = "2025-08-14T16:26:53.836Z" }, + { url = "https://files.pythonhosted.org/packages/ac/a1/3cd12a010d288325a7cfcf298a84825f0f9c29b01dee1baba64edfe89257/fonttools-4.59.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5c6d8d773470a5107052874341ed3c487c16ecd179976d81afed89dea5cd7406", size = 5045983, upload-time = "2025-08-14T16:26:56.153Z" }, + { url = "https://files.pythonhosted.org/packages/a2/af/8a2c3f6619cc43cf87951405337cc8460d08a4e717bb05eaa94b335d11dc/fonttools-4.59.1-cp312-cp312-win32.whl", hash = "sha256:2a2d0d33307f6ad3a2086a95dd607c202ea8852fa9fb52af9b48811154d1428a", size = 2203407, upload-time = "2025-08-14T16:26:58.165Z" }, + { url = "https://files.pythonhosted.org/packages/8e/f2/a19b874ddbd3ebcf11d7e25188ef9ac3f68b9219c62263acb34aca8cde05/fonttools-4.59.1-cp312-cp312-win_amd64.whl", hash = "sha256:0b9e4fa7eaf046ed6ac470f6033d52c052481ff7a6e0a92373d14f556f298dc0", size = 2251561, upload-time = "2025-08-14T16:27:00.646Z" }, + { url = "https://files.pythonhosted.org/packages/19/5e/94a4d7f36c36e82f6a81e0064d148542e0ad3e6cf51fc5461ca128f3658d/fonttools-4.59.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:89d9957b54246c6251345297dddf77a84d2c19df96af30d2de24093bbdf0528b", size = 2760192, upload-time = "2025-08-14T16:27:03.024Z" }, + { url = "https://files.pythonhosted.org/packages/ee/a5/f50712fc33ef9d06953c660cefaf8c8fe4b8bc74fa21f44ee5e4f9739439/fonttools-4.59.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8156b11c0d5405810d216f53907bd0f8b982aa5f1e7e3127ab3be1a4062154ff", size = 2332694, upload-time = 
"2025-08-14T16:27:04.883Z" }, + { url = "https://files.pythonhosted.org/packages/e9/a2/5a9fc21c354bf8613215ce233ab0d933bd17d5ff4c29693636551adbc7b3/fonttools-4.59.1-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:8387876a8011caec52d327d5e5bca705d9399ec4b17afb8b431ec50d47c17d23", size = 4889254, upload-time = "2025-08-14T16:27:07.02Z" }, + { url = "https://files.pythonhosted.org/packages/2d/e5/54a6dc811eba018d022ca2e8bd6f2969291f9586ccf9a22a05fc55f91250/fonttools-4.59.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fb13823a74b3a9204a8ed76d3d6d5ec12e64cc5bc44914eb9ff1cdac04facd43", size = 4949109, upload-time = "2025-08-14T16:27:09.3Z" }, + { url = "https://files.pythonhosted.org/packages/db/15/b05c72a248a95bea0fd05fbd95acdf0742945942143fcf961343b7a3663a/fonttools-4.59.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e1ca10da138c300f768bb68e40e5b20b6ecfbd95f91aac4cc15010b6b9d65455", size = 4888428, upload-time = "2025-08-14T16:27:11.514Z" }, + { url = "https://files.pythonhosted.org/packages/63/71/c7d6840f858d695adc0c4371ec45e3fb1c8e060b276ba944e2800495aca4/fonttools-4.59.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2beb5bfc4887a3130f8625349605a3a45fe345655ce6031d1bac11017454b943", size = 5032668, upload-time = "2025-08-14T16:27:13.872Z" }, + { url = "https://files.pythonhosted.org/packages/90/54/57be4aca6f1312e2bc4d811200dd822325794e05bdb26eeff0976edca651/fonttools-4.59.1-cp313-cp313-win32.whl", hash = "sha256:419f16d750d78e6d704bfe97b48bba2f73b15c9418f817d0cb8a9ca87a5b94bf", size = 2201832, upload-time = "2025-08-14T16:27:16.126Z" }, + { url = "https://files.pythonhosted.org/packages/fc/1f/1899a6175a5f900ed8730a0d64f53ca1b596ed7609bfda033cf659114258/fonttools-4.59.1-cp313-cp313-win_amd64.whl", hash = "sha256:c536f8a852e8d3fa71dde1ec03892aee50be59f7154b533f0bf3c1174cfd5126", size = 2250673, upload-time = 
"2025-08-14T16:27:18.033Z" }, + { url = "https://files.pythonhosted.org/packages/15/07/f6ba82c22f118d9985c37fea65d8d715ca71300d78b6c6e90874dc59f11d/fonttools-4.59.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:d5c3bfdc9663f3d4b565f9cb3b8c1efb3e178186435b45105bde7328cfddd7fe", size = 2758606, upload-time = "2025-08-14T16:27:20.064Z" }, + { url = "https://files.pythonhosted.org/packages/3a/81/84aa3d0ce27b0112c28b67b637ff7a47cf401cf5fbfee6476e4bc9777580/fonttools-4.59.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ea03f1da0d722fe3c2278a05957e6550175571a4894fbf9d178ceef4a3783d2b", size = 2330187, upload-time = "2025-08-14T16:27:22.42Z" }, + { url = "https://files.pythonhosted.org/packages/17/41/b3ba43f78afb321e2e50232c87304c8d0f5ab39b64389b8286cc39cdb824/fonttools-4.59.1-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:57a3708ca6bfccb790f585fa6d8f29432ec329618a09ff94c16bcb3c55994643", size = 4832020, upload-time = "2025-08-14T16:27:24.214Z" }, + { url = "https://files.pythonhosted.org/packages/67/b1/3af871c7fb325a68938e7ce544ca48bfd2c6bb7b357f3c8252933b29100a/fonttools-4.59.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:729367c91eb1ee84e61a733acc485065a00590618ca31c438e7dd4d600c01486", size = 4930687, upload-time = "2025-08-14T16:27:26.484Z" }, + { url = "https://files.pythonhosted.org/packages/c5/4f/299fc44646b30d9ef03ffaa78b109c7bd32121f0d8f10009ee73ac4514bc/fonttools-4.59.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8f8ef66ac6db450193ed150e10b3b45dde7aded10c5d279968bc63368027f62b", size = 4875794, upload-time = "2025-08-14T16:27:28.887Z" }, + { url = "https://files.pythonhosted.org/packages/90/cf/a0a3d763ab58f5f81ceff104ddb662fd9da94248694862b9c6cbd509fdd5/fonttools-4.59.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:075f745d539a998cd92cb84c339a82e53e49114ec62aaea8307c80d3ad3aef3a", size = 4985780, 
upload-time = "2025-08-14T16:27:30.858Z" }, + { url = "https://files.pythonhosted.org/packages/72/c5/ba76511aaae143d89c29cd32ce30bafb61c477e8759a1590b8483f8065f8/fonttools-4.59.1-cp314-cp314-win32.whl", hash = "sha256:c2b0597522d4c5bb18aa5cf258746a2d4a90f25878cbe865e4d35526abd1b9fc", size = 2205610, upload-time = "2025-08-14T16:27:32.578Z" }, + { url = "https://files.pythonhosted.org/packages/a9/65/b250e69d6caf35bc65cddbf608be0662d741c248f2e7503ab01081fc267e/fonttools-4.59.1-cp314-cp314-win_amd64.whl", hash = "sha256:e9ad4ce044e3236f0814c906ccce8647046cc557539661e35211faadf76f283b", size = 2255376, upload-time = "2025-08-14T16:27:34.653Z" }, + { url = "https://files.pythonhosted.org/packages/11/f3/0bc63a23ac0f8175e23d82f85d6ee693fbd849de7ad739f0a3622182ad29/fonttools-4.59.1-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:652159e8214eb4856e8387ebcd6b6bd336ee258cbeb639c8be52005b122b9609", size = 2826546, upload-time = "2025-08-14T16:27:36.783Z" }, + { url = "https://files.pythonhosted.org/packages/e9/46/a3968205590e068fdf60e926be329a207782576cb584d3b7dcd2d2844957/fonttools-4.59.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:43d177cd0e847ea026fedd9f099dc917da136ed8792d142298a252836390c478", size = 2359771, upload-time = "2025-08-14T16:27:39.678Z" }, + { url = "https://files.pythonhosted.org/packages/b8/ff/d14b4c283879e8cb57862d9624a34fe6522b6fcdd46ccbfc58900958794a/fonttools-4.59.1-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:e54437651e1440ee53a95e6ceb6ee440b67a3d348c76f45f4f48de1a5ecab019", size = 4831575, upload-time = "2025-08-14T16:27:41.885Z" }, + { url = "https://files.pythonhosted.org/packages/9c/04/a277d9a584a49d98ca12d3b2c6663bdf333ae97aaa83bd0cdabf7c5a6c84/fonttools-4.59.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6065fdec8ff44c32a483fd44abe5bcdb40dd5e2571a5034b555348f2b3a52cea", size = 5069962, upload-time = 
"2025-08-14T16:27:44.284Z" }, + { url = "https://files.pythonhosted.org/packages/16/6f/3d2ae69d96c4cdee6dfe7598ca5519a1514487700ca3d7c49c5a1ad65308/fonttools-4.59.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:42052b56d176f8b315fbc09259439c013c0cb2109df72447148aeda677599612", size = 4942926, upload-time = "2025-08-14T16:27:46.523Z" }, + { url = "https://files.pythonhosted.org/packages/0c/d3/c17379e0048d03ce26b38e4ab0e9a98280395b00529e093fe2d663ac0658/fonttools-4.59.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:bcd52eaa5c4c593ae9f447c1d13e7e4a00ca21d755645efa660b6999425b3c88", size = 4958678, upload-time = "2025-08-14T16:27:48.555Z" }, + { url = "https://files.pythonhosted.org/packages/8c/3f/c5543a1540abdfb4d375e3ebeb84de365ab9b153ec14cb7db05f537dd1e7/fonttools-4.59.1-cp314-cp314t-win32.whl", hash = "sha256:02e4fdf27c550dded10fe038a5981c29f81cb9bc649ff2eaa48e80dab8998f97", size = 2266706, upload-time = "2025-08-14T16:27:50.556Z" }, + { url = "https://files.pythonhosted.org/packages/3e/99/85bff6e674226bc8402f983e365f07e76d990e7220ba72bcc738fef52391/fonttools-4.59.1-cp314-cp314t-win_amd64.whl", hash = "sha256:412a5fd6345872a7c249dac5bcce380393f40c1c316ac07f447bc17d51900922", size = 2329994, upload-time = "2025-08-14T16:27:52.36Z" }, + { url = "https://files.pythonhosted.org/packages/0f/64/9d606e66d498917cd7a2ff24f558010d42d6fd4576d9dd57f0bd98333f5a/fonttools-4.59.1-py3-none-any.whl", hash = "sha256:647db657073672a8330608970a984d51573557f328030566521bc03415535042", size = 1130094, upload-time = "2025-08-14T16:28:12.048Z" }, +] + +[[package]] +name = "frozenlist" +version = "1.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/79/b1/b64018016eeb087db503b038296fd782586432b9c077fc5c7839e9cb6ef6/frozenlist-1.7.0.tar.gz", hash = "sha256:2e310d81923c2437ea8670467121cc3e9b0f76d3043cc1d2331d56c7fb7a3a8f", size = 45078, upload-time = "2025-06-09T23:02:35.538Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/ef/a2/c8131383f1e66adad5f6ecfcce383d584ca94055a34d683bbb24ac5f2f1c/frozenlist-1.7.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:3dbf9952c4bb0e90e98aec1bd992b3318685005702656bc6f67c1a32b76787f2", size = 81424, upload-time = "2025-06-09T23:00:42.24Z" }, + { url = "https://files.pythonhosted.org/packages/4c/9d/02754159955088cb52567337d1113f945b9e444c4960771ea90eb73de8db/frozenlist-1.7.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1f5906d3359300b8a9bb194239491122e6cf1444c2efb88865426f170c262cdb", size = 47952, upload-time = "2025-06-09T23:00:43.481Z" }, + { url = "https://files.pythonhosted.org/packages/01/7a/0046ef1bd6699b40acd2067ed6d6670b4db2f425c56980fa21c982c2a9db/frozenlist-1.7.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3dabd5a8f84573c8d10d8859a50ea2dec01eea372031929871368c09fa103478", size = 46688, upload-time = "2025-06-09T23:00:44.793Z" }, + { url = "https://files.pythonhosted.org/packages/d6/a2/a910bafe29c86997363fb4c02069df4ff0b5bc39d33c5198b4e9dd42d8f8/frozenlist-1.7.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa57daa5917f1738064f302bf2626281a1cb01920c32f711fbc7bc36111058a8", size = 243084, upload-time = "2025-06-09T23:00:46.125Z" }, + { url = "https://files.pythonhosted.org/packages/64/3e/5036af9d5031374c64c387469bfcc3af537fc0f5b1187d83a1cf6fab1639/frozenlist-1.7.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c193dda2b6d49f4c4398962810fa7d7c78f032bf45572b3e04dd5249dff27e08", size = 233524, upload-time = "2025-06-09T23:00:47.73Z" }, + { url = "https://files.pythonhosted.org/packages/06/39/6a17b7c107a2887e781a48ecf20ad20f1c39d94b2a548c83615b5b879f28/frozenlist-1.7.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfe2b675cf0aaa6d61bf8fbffd3c274b3c9b7b1623beb3809df8a81399a4a9c4", size = 248493, upload-time = "2025-06-09T23:00:49.742Z" }, + { url = 
"https://files.pythonhosted.org/packages/be/00/711d1337c7327d88c44d91dd0f556a1c47fb99afc060ae0ef66b4d24793d/frozenlist-1.7.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8fc5d5cda37f62b262405cf9652cf0856839c4be8ee41be0afe8858f17f4c94b", size = 244116, upload-time = "2025-06-09T23:00:51.352Z" }, + { url = "https://files.pythonhosted.org/packages/24/fe/74e6ec0639c115df13d5850e75722750adabdc7de24e37e05a40527ca539/frozenlist-1.7.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b0d5ce521d1dd7d620198829b87ea002956e4319002ef0bc8d3e6d045cb4646e", size = 224557, upload-time = "2025-06-09T23:00:52.855Z" }, + { url = "https://files.pythonhosted.org/packages/8d/db/48421f62a6f77c553575201e89048e97198046b793f4a089c79a6e3268bd/frozenlist-1.7.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:488d0a7d6a0008ca0db273c542098a0fa9e7dfaa7e57f70acef43f32b3f69dca", size = 241820, upload-time = "2025-06-09T23:00:54.43Z" }, + { url = "https://files.pythonhosted.org/packages/1d/fa/cb4a76bea23047c8462976ea7b7a2bf53997a0ca171302deae9d6dd12096/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:15a7eaba63983d22c54d255b854e8108e7e5f3e89f647fc854bd77a237e767df", size = 236542, upload-time = "2025-06-09T23:00:56.409Z" }, + { url = "https://files.pythonhosted.org/packages/5d/32/476a4b5cfaa0ec94d3f808f193301debff2ea42288a099afe60757ef6282/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:1eaa7e9c6d15df825bf255649e05bd8a74b04a4d2baa1ae46d9c2d00b2ca2cb5", size = 249350, upload-time = "2025-06-09T23:00:58.468Z" }, + { url = "https://files.pythonhosted.org/packages/8d/ba/9a28042f84a6bf8ea5dbc81cfff8eaef18d78b2a1ad9d51c7bc5b029ad16/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:e4389e06714cfa9d47ab87f784a7c5be91d3934cd6e9a7b85beef808297cc025", size = 225093, upload-time = 
"2025-06-09T23:01:00.015Z" }, + { url = "https://files.pythonhosted.org/packages/bc/29/3a32959e68f9cf000b04e79ba574527c17e8842e38c91d68214a37455786/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:73bd45e1488c40b63fe5a7df892baf9e2a4d4bb6409a2b3b78ac1c6236178e01", size = 245482, upload-time = "2025-06-09T23:01:01.474Z" }, + { url = "https://files.pythonhosted.org/packages/80/e8/edf2f9e00da553f07f5fa165325cfc302dead715cab6ac8336a5f3d0adc2/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:99886d98e1643269760e5fe0df31e5ae7050788dd288947f7f007209b8c33f08", size = 249590, upload-time = "2025-06-09T23:01:02.961Z" }, + { url = "https://files.pythonhosted.org/packages/1c/80/9a0eb48b944050f94cc51ee1c413eb14a39543cc4f760ed12657a5a3c45a/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:290a172aae5a4c278c6da8a96222e6337744cd9c77313efe33d5670b9f65fc43", size = 237785, upload-time = "2025-06-09T23:01:05.095Z" }, + { url = "https://files.pythonhosted.org/packages/f3/74/87601e0fb0369b7a2baf404ea921769c53b7ae00dee7dcfe5162c8c6dbf0/frozenlist-1.7.0-cp312-cp312-win32.whl", hash = "sha256:426c7bc70e07cfebc178bc4c2bf2d861d720c4fff172181eeb4a4c41d4ca2ad3", size = 39487, upload-time = "2025-06-09T23:01:06.54Z" }, + { url = "https://files.pythonhosted.org/packages/0b/15/c026e9a9fc17585a9d461f65d8593d281fedf55fbf7eb53f16c6df2392f9/frozenlist-1.7.0-cp312-cp312-win_amd64.whl", hash = "sha256:563b72efe5da92e02eb68c59cb37205457c977aa7a449ed1b37e6939e5c47c6a", size = 43874, upload-time = "2025-06-09T23:01:07.752Z" }, + { url = "https://files.pythonhosted.org/packages/24/90/6b2cebdabdbd50367273c20ff6b57a3dfa89bd0762de02c3a1eb42cb6462/frozenlist-1.7.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ee80eeda5e2a4e660651370ebffd1286542b67e268aa1ac8d6dbe973120ef7ee", size = 79791, upload-time = "2025-06-09T23:01:09.368Z" }, + { url = 
"https://files.pythonhosted.org/packages/83/2e/5b70b6a3325363293fe5fc3ae74cdcbc3e996c2a11dde2fd9f1fb0776d19/frozenlist-1.7.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d1a81c85417b914139e3a9b995d4a1c84559afc839a93cf2cb7f15e6e5f6ed2d", size = 47165, upload-time = "2025-06-09T23:01:10.653Z" }, + { url = "https://files.pythonhosted.org/packages/f4/25/a0895c99270ca6966110f4ad98e87e5662eab416a17e7fd53c364bf8b954/frozenlist-1.7.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cbb65198a9132ebc334f237d7b0df163e4de83fb4f2bdfe46c1e654bdb0c5d43", size = 45881, upload-time = "2025-06-09T23:01:12.296Z" }, + { url = "https://files.pythonhosted.org/packages/19/7c/71bb0bbe0832793c601fff68cd0cf6143753d0c667f9aec93d3c323f4b55/frozenlist-1.7.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dab46c723eeb2c255a64f9dc05b8dd601fde66d6b19cdb82b2e09cc6ff8d8b5d", size = 232409, upload-time = "2025-06-09T23:01:13.641Z" }, + { url = "https://files.pythonhosted.org/packages/c0/45/ed2798718910fe6eb3ba574082aaceff4528e6323f9a8570be0f7028d8e9/frozenlist-1.7.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6aeac207a759d0dedd2e40745575ae32ab30926ff4fa49b1635def65806fddee", size = 225132, upload-time = "2025-06-09T23:01:15.264Z" }, + { url = "https://files.pythonhosted.org/packages/ba/e2/8417ae0f8eacb1d071d4950f32f229aa6bf68ab69aab797b72a07ea68d4f/frozenlist-1.7.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bd8c4e58ad14b4fa7802b8be49d47993182fdd4023393899632c88fd8cd994eb", size = 237638, upload-time = "2025-06-09T23:01:16.752Z" }, + { url = "https://files.pythonhosted.org/packages/f8/b7/2ace5450ce85f2af05a871b8c8719b341294775a0a6c5585d5e6170f2ce7/frozenlist-1.7.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04fb24d104f425da3540ed83cbfc31388a586a7696142004c577fa61c6298c3f", size = 233539, upload-time = "2025-06-09T23:01:18.202Z" }, + { url = 
"https://files.pythonhosted.org/packages/46/b9/6989292c5539553dba63f3c83dc4598186ab2888f67c0dc1d917e6887db6/frozenlist-1.7.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a5c505156368e4ea6b53b5ac23c92d7edc864537ff911d2fb24c140bb175e60", size = 215646, upload-time = "2025-06-09T23:01:19.649Z" }, + { url = "https://files.pythonhosted.org/packages/72/31/bc8c5c99c7818293458fe745dab4fd5730ff49697ccc82b554eb69f16a24/frozenlist-1.7.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bd7eb96a675f18aa5c553eb7ddc24a43c8c18f22e1f9925528128c052cdbe00", size = 232233, upload-time = "2025-06-09T23:01:21.175Z" }, + { url = "https://files.pythonhosted.org/packages/59/52/460db4d7ba0811b9ccb85af996019f5d70831f2f5f255f7cc61f86199795/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:05579bf020096fe05a764f1f84cd104a12f78eaab68842d036772dc6d4870b4b", size = 227996, upload-time = "2025-06-09T23:01:23.098Z" }, + { url = "https://files.pythonhosted.org/packages/ba/c9/f4b39e904c03927b7ecf891804fd3b4df3db29b9e487c6418e37988d6e9d/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:376b6222d114e97eeec13d46c486facd41d4f43bab626b7c3f6a8b4e81a5192c", size = 242280, upload-time = "2025-06-09T23:01:24.808Z" }, + { url = "https://files.pythonhosted.org/packages/b8/33/3f8d6ced42f162d743e3517781566b8481322be321b486d9d262adf70bfb/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:0aa7e176ebe115379b5b1c95b4096fb1c17cce0847402e227e712c27bdb5a949", size = 217717, upload-time = "2025-06-09T23:01:26.28Z" }, + { url = "https://files.pythonhosted.org/packages/3e/e8/ad683e75da6ccef50d0ab0c2b2324b32f84fc88ceee778ed79b8e2d2fe2e/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3fbba20e662b9c2130dc771e332a99eff5da078b2b2648153a40669a6d0e36ca", size = 236644, upload-time = "2025-06-09T23:01:27.887Z" }, + { 
url = "https://files.pythonhosted.org/packages/b2/14/8d19ccdd3799310722195a72ac94ddc677541fb4bef4091d8e7775752360/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:f3f4410a0a601d349dd406b5713fec59b4cee7e71678d5b17edda7f4655a940b", size = 238879, upload-time = "2025-06-09T23:01:29.524Z" }, + { url = "https://files.pythonhosted.org/packages/ce/13/c12bf657494c2fd1079a48b2db49fa4196325909249a52d8f09bc9123fd7/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e2cdfaaec6a2f9327bf43c933c0319a7c429058e8537c508964a133dffee412e", size = 232502, upload-time = "2025-06-09T23:01:31.287Z" }, + { url = "https://files.pythonhosted.org/packages/d7/8b/e7f9dfde869825489382bc0d512c15e96d3964180c9499efcec72e85db7e/frozenlist-1.7.0-cp313-cp313-win32.whl", hash = "sha256:5fc4df05a6591c7768459caba1b342d9ec23fa16195e744939ba5914596ae3e1", size = 39169, upload-time = "2025-06-09T23:01:35.503Z" }, + { url = "https://files.pythonhosted.org/packages/35/89/a487a98d94205d85745080a37860ff5744b9820a2c9acbcdd9440bfddf98/frozenlist-1.7.0-cp313-cp313-win_amd64.whl", hash = "sha256:52109052b9791a3e6b5d1b65f4b909703984b770694d3eb64fad124c835d7cba", size = 43219, upload-time = "2025-06-09T23:01:36.784Z" }, + { url = "https://files.pythonhosted.org/packages/56/d5/5c4cf2319a49eddd9dd7145e66c4866bdc6f3dbc67ca3d59685149c11e0d/frozenlist-1.7.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:a6f86e4193bb0e235ef6ce3dde5cbabed887e0b11f516ce8a0f4d3b33078ec2d", size = 84345, upload-time = "2025-06-09T23:01:38.295Z" }, + { url = "https://files.pythonhosted.org/packages/a4/7d/ec2c1e1dc16b85bc9d526009961953df9cec8481b6886debb36ec9107799/frozenlist-1.7.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:82d664628865abeb32d90ae497fb93df398a69bb3434463d172b80fc25b0dd7d", size = 48880, upload-time = "2025-06-09T23:01:39.887Z" }, + { url = 
"https://files.pythonhosted.org/packages/69/86/f9596807b03de126e11e7d42ac91e3d0b19a6599c714a1989a4e85eeefc4/frozenlist-1.7.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:912a7e8375a1c9a68325a902f3953191b7b292aa3c3fb0d71a216221deca460b", size = 48498, upload-time = "2025-06-09T23:01:41.318Z" }, + { url = "https://files.pythonhosted.org/packages/5e/cb/df6de220f5036001005f2d726b789b2c0b65f2363b104bbc16f5be8084f8/frozenlist-1.7.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9537c2777167488d539bc5de2ad262efc44388230e5118868e172dd4a552b146", size = 292296, upload-time = "2025-06-09T23:01:42.685Z" }, + { url = "https://files.pythonhosted.org/packages/83/1f/de84c642f17c8f851a2905cee2dae401e5e0daca9b5ef121e120e19aa825/frozenlist-1.7.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:f34560fb1b4c3e30ba35fa9a13894ba39e5acfc5f60f57d8accde65f46cc5e74", size = 273103, upload-time = "2025-06-09T23:01:44.166Z" }, + { url = "https://files.pythonhosted.org/packages/88/3c/c840bfa474ba3fa13c772b93070893c6e9d5c0350885760376cbe3b6c1b3/frozenlist-1.7.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:acd03d224b0175f5a850edc104ac19040d35419eddad04e7cf2d5986d98427f1", size = 292869, upload-time = "2025-06-09T23:01:45.681Z" }, + { url = "https://files.pythonhosted.org/packages/a6/1c/3efa6e7d5a39a1d5ef0abeb51c48fb657765794a46cf124e5aca2c7a592c/frozenlist-1.7.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2038310bc582f3d6a09b3816ab01737d60bf7b1ec70f5356b09e84fb7408ab1", size = 291467, upload-time = "2025-06-09T23:01:47.234Z" }, + { url = "https://files.pythonhosted.org/packages/4f/00/d5c5e09d4922c395e2f2f6b79b9a20dab4b67daaf78ab92e7729341f61f6/frozenlist-1.7.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b8c05e4c8e5f36e5e088caa1bf78a687528f83c043706640a92cb76cd6999384", size = 
266028, upload-time = "2025-06-09T23:01:48.819Z" }, + { url = "https://files.pythonhosted.org/packages/4e/27/72765be905619dfde25a7f33813ac0341eb6b076abede17a2e3fbfade0cb/frozenlist-1.7.0-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:765bb588c86e47d0b68f23c1bee323d4b703218037765dcf3f25c838c6fecceb", size = 284294, upload-time = "2025-06-09T23:01:50.394Z" }, + { url = "https://files.pythonhosted.org/packages/88/67/c94103a23001b17808eb7dd1200c156bb69fb68e63fcf0693dde4cd6228c/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:32dc2e08c67d86d0969714dd484fd60ff08ff81d1a1e40a77dd34a387e6ebc0c", size = 281898, upload-time = "2025-06-09T23:01:52.234Z" }, + { url = "https://files.pythonhosted.org/packages/42/34/a3e2c00c00f9e2a9db5653bca3fec306349e71aff14ae45ecc6d0951dd24/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:c0303e597eb5a5321b4de9c68e9845ac8f290d2ab3f3e2c864437d3c5a30cd65", size = 290465, upload-time = "2025-06-09T23:01:53.788Z" }, + { url = "https://files.pythonhosted.org/packages/bb/73/f89b7fbce8b0b0c095d82b008afd0590f71ccb3dee6eee41791cf8cd25fd/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:a47f2abb4e29b3a8d0b530f7c3598badc6b134562b1a5caee867f7c62fee51e3", size = 266385, upload-time = "2025-06-09T23:01:55.769Z" }, + { url = "https://files.pythonhosted.org/packages/cd/45/e365fdb554159462ca12df54bc59bfa7a9a273ecc21e99e72e597564d1ae/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:3d688126c242a6fabbd92e02633414d40f50bb6002fa4cf995a1d18051525657", size = 288771, upload-time = "2025-06-09T23:01:57.4Z" }, + { url = "https://files.pythonhosted.org/packages/00/11/47b6117002a0e904f004d70ec5194fe9144f117c33c851e3d51c765962d0/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:4e7e9652b3d367c7bd449a727dc79d5043f48b88d0cbfd4f9f1060cf2b414104", size = 288206, upload-time = 
"2025-06-09T23:01:58.936Z" }, + { url = "https://files.pythonhosted.org/packages/40/37/5f9f3c3fd7f7746082ec67bcdc204db72dad081f4f83a503d33220a92973/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1a85e345b4c43db8b842cab1feb41be5cc0b10a1830e6295b69d7310f99becaf", size = 282620, upload-time = "2025-06-09T23:02:00.493Z" }, + { url = "https://files.pythonhosted.org/packages/0b/31/8fbc5af2d183bff20f21aa743b4088eac4445d2bb1cdece449ae80e4e2d1/frozenlist-1.7.0-cp313-cp313t-win32.whl", hash = "sha256:3a14027124ddb70dfcee5148979998066897e79f89f64b13328595c4bdf77c81", size = 43059, upload-time = "2025-06-09T23:02:02.072Z" }, + { url = "https://files.pythonhosted.org/packages/bb/ed/41956f52105b8dbc26e457c5705340c67c8cc2b79f394b79bffc09d0e938/frozenlist-1.7.0-cp313-cp313t-win_amd64.whl", hash = "sha256:3bf8010d71d4507775f658e9823210b7427be36625b387221642725b515dcf3e", size = 47516, upload-time = "2025-06-09T23:02:03.779Z" }, + { url = "https://files.pythonhosted.org/packages/ee/45/b82e3c16be2182bff01179db177fe144d58b5dc787a7d4492c6ed8b9317f/frozenlist-1.7.0-py3-none-any.whl", hash = "sha256:9a5af342e34f7e97caf8c995864c7a396418ae2859cc6fdf1b1073020d516a7e", size = 13106, upload-time = "2025-06-09T23:02:34.204Z" }, +] + +[[package]] +name = "fsspec" +version = "2025.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8b/02/0835e6ab9cfc03916fe3f78c0956cfcdb6ff2669ffa6651065d5ebf7fc98/fsspec-2025.7.0.tar.gz", hash = "sha256:786120687ffa54b8283d942929540d8bc5ccfa820deb555a2b5d0ed2b737bf58", size = 304432, upload-time = "2025-07-15T16:05:21.19Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2f/e0/014d5d9d7a4564cf1c40b5039bc882db69fd881111e03ab3657ac0b218e2/fsspec-2025.7.0-py3-none-any.whl", hash = "sha256:8b012e39f63c7d5f10474de957f3ab793b47b45ae7d39f2fb735f8bbe25c0e21", size = 199597, upload-time = "2025-07-15T16:05:19.529Z" }, +] + +[[package]] +name = "gitdb" +version = 
"4.0.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "smmap" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/94/63b0fc47eb32792c7ba1fe1b694daec9a63620db1e313033d18140c2320a/gitdb-4.0.12.tar.gz", hash = "sha256:5ef71f855d191a3326fcfbc0d5da835f26b13fbcba60c32c21091c349ffdb571", size = 394684, upload-time = "2025-01-02T07:20:46.413Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/61/5c78b91c3143ed5c14207f463aecfc8f9dbb5092fb2869baf37c273b2705/gitdb-4.0.12-py3-none-any.whl", hash = "sha256:67073e15955400952c6565cc3e707c554a4eea2e428946f7a4c162fab9bd9bcf", size = 62794, upload-time = "2025-01-02T07:20:43.624Z" }, +] + +[[package]] +name = "gitpython" +version = "3.1.45" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "gitdb" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9a/c8/dd58967d119baab745caec2f9d853297cec1989ec1d63f677d3880632b88/gitpython-3.1.45.tar.gz", hash = "sha256:85b0ee964ceddf211c41b9f27a49086010a190fd8132a24e21f362a4b36a791c", size = 215076, upload-time = "2025-07-24T03:45:54.871Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/01/61/d4b89fec821f72385526e1b9d9a3a0385dda4a72b206d28049e2c7cd39b8/gitpython-3.1.45-py3-none-any.whl", hash = "sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77", size = 208168, upload-time = "2025-07-24T03:45:52.517Z" }, +] + +[[package]] +name = "google-api-core" +version = "2.25.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth" }, + { name = "googleapis-common-protos" }, + { name = "proto-plus" }, + { name = "protobuf" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dc/21/e9d043e88222317afdbdb567165fdbc3b0aad90064c7e0c9eb0ad9955ad8/google_api_core-2.25.1.tar.gz", hash = "sha256:d2aaa0b13c78c61cb3f4282c464c046e45fbd75755683c9c525e6e8f7ed0a5e8", size = 165443, upload-time = 
"2025-06-12T20:52:20.439Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/4b/ead00905132820b623732b175d66354e9d3e69fcf2a5dcdab780664e7896/google_api_core-2.25.1-py3-none-any.whl", hash = "sha256:8a2a56c1fef82987a524371f99f3bd0143702fecc670c72e600c1cda6bf8dbb7", size = 160807, upload-time = "2025-06-12T20:52:19.334Z" }, +] + +[[package]] +name = "google-auth" +version = "2.40.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cachetools" }, + { name = "pyasn1-modules" }, + { name = "rsa" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/9b/e92ef23b84fa10a64ce4831390b7a4c2e53c0132568d99d4ae61d04c8855/google_auth-2.40.3.tar.gz", hash = "sha256:500c3a29adedeb36ea9cf24b8d10858e152f2412e3ca37829b3fa18e33d63b77", size = 281029, upload-time = "2025-06-04T18:04:57.577Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca", size = 216137, upload-time = "2025-06-04T18:04:55.573Z" }, +] + +[[package]] +name = "googleapis-common-protos" +version = "1.70.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/39/24/33db22342cf4a2ea27c9955e6713140fedd51e8b141b5ce5260897020f1a/googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257", size = 145903, upload-time = "2025-04-14T10:17:02.924Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8", size = 294530, upload-time = "2025-04-14T10:17:01.271Z" }, +] + +[[package]] +name = "graphene" 
+version = "3.4.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "graphql-core" }, + { name = "graphql-relay" }, + { name = "python-dateutil" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cc/f6/bf62ff950c317ed03e77f3f6ddd7e34aaa98fe89d79ebd660c55343d8054/graphene-3.4.3.tar.gz", hash = "sha256:2a3786948ce75fe7e078443d37f609cbe5bb36ad8d6b828740ad3b95ed1a0aaa", size = 44739, upload-time = "2024-11-09T20:44:25.757Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/66/e0/61d8e98007182e6b2aca7cf65904721fb2e4bce0192272ab9cb6f69d8812/graphene-3.4.3-py2.py3-none-any.whl", hash = "sha256:820db6289754c181007a150db1f7fff544b94142b556d12e3ebc777a7bf36c71", size = 114894, upload-time = "2024-11-09T20:44:23.851Z" }, +] + +[[package]] +name = "graphql-core" +version = "3.2.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c4/16/7574029da84834349b60ed71614d66ca3afe46e9bf9c7b9562102acb7d4f/graphql_core-3.2.6.tar.gz", hash = "sha256:c08eec22f9e40f0bd61d805907e3b3b1b9a320bc606e23dc145eebca07c8fbab", size = 505353, upload-time = "2025-01-26T16:36:27.374Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ae/4f/7297663840621022bc73c22d7d9d80dbc78b4db6297f764b545cd5dd462d/graphql_core-3.2.6-py3-none-any.whl", hash = "sha256:78b016718c161a6fb20a7d97bbf107f331cd1afe53e45566c59f776ed7f0b45f", size = 203416, upload-time = "2025-01-26T16:36:24.868Z" }, +] + +[[package]] +name = "graphql-relay" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "graphql-core" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/13/98fbf8d67552f102488ffc16c6f559ce71ea15f6294728d33928ab5ff14d/graphql-relay-3.2.0.tar.gz", hash = "sha256:1ff1c51298356e481a0be009ccdff249832ce53f30559c1338f22a0e0d17250c", size = 50027, upload-time = "2022-04-16T11:03:45.447Z" } +wheels = [ + { 
url = "https://files.pythonhosted.org/packages/74/16/a4cf06adbc711bd364a73ce043b0b08d8fa5aae3df11b6ee4248bcdad2e0/graphql_relay-3.2.0-py3-none-any.whl", hash = "sha256:c9b22bd28b170ba1fe674c74384a8ff30a76c8e26f88ac3aa1584dd3179953e5", size = 16940, upload-time = "2022-04-16T11:03:43.895Z" }, +] + +[[package]] +name = "greenlet" +version = "3.2.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/03/b8/704d753a5a45507a7aab61f18db9509302ed3d0a27ac7e0359ec2905b1a6/greenlet-3.2.4.tar.gz", hash = "sha256:0dca0d95ff849f9a364385f36ab49f50065d76964944638be9691e1832e9f86d", size = 188260, upload-time = "2025-08-07T13:24:33.51Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, + { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, + { url = "https://files.pythonhosted.org/packages/3b/16/035dcfcc48715ccd345f3a93183267167cdd162ad123cd93067d86f27ce4/greenlet-3.2.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f28588772bb5fb869a8eb331374ec06f24a83a9c25bfa1f38b6993afe9c1e968", size = 655185, upload-time = "2025-08-07T13:45:27.624Z" }, + { url = "https://files.pythonhosted.org/packages/31/da/0386695eef69ffae1ad726881571dfe28b41970173947e7c558d9998de0f/greenlet-3.2.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5c9320971821a7cb77cfab8d956fa8e39cd07ca44b6070db358ceb7f8797c8c9", size = 649926, upload-time = 
"2025-08-07T13:53:15.251Z" }, + { url = "https://files.pythonhosted.org/packages/68/88/69bf19fd4dc19981928ceacbc5fd4bb6bc2215d53199e367832e98d1d8fe/greenlet-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c60a6d84229b271d44b70fb6e5fa23781abb5d742af7b808ae3f6efd7c9c60f6", size = 651839, upload-time = "2025-08-07T13:18:30.281Z" }, + { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, + { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, + { url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" }, + { url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" }, + { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, + { url = 
"https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, + { url = "https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" }, + { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" }, + { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" }, + { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, + { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, + { url = 
"https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" }, + { url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" }, + { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, + { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, + { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" }, + { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, + { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, + { url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" }, +] + +[[package]] +name = "grpcio" +version = "1.74.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/38/b4/35feb8f7cab7239c5b94bd2db71abb3d6adb5f335ad8f131abb6060840b6/grpcio-1.74.0.tar.gz", hash = "sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1", size = 12756048, upload-time = "2025-07-24T18:54:23.039Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4c/5d/e504d5d5c4469823504f65687d6c8fb97b7f7bf0b34873b7598f1df24630/grpcio-1.74.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8", size = 5445551, upload-time = "2025-07-24T18:53:23.641Z" }, + { url = "https://files.pythonhosted.org/packages/43/01/730e37056f96f2f6ce9f17999af1556df62ee8dab7fa48bceeaab5fd3008/grpcio-1.74.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6", size = 10979810, upload-time = "2025-07-24T18:53:25.349Z" }, + { url 
= "https://files.pythonhosted.org/packages/79/3d/09fd100473ea5c47083889ca47ffd356576173ec134312f6aa0e13111dee/grpcio-1.74.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5", size = 5941946, upload-time = "2025-07-24T18:53:27.387Z" }, + { url = "https://files.pythonhosted.org/packages/8a/99/12d2cca0a63c874c6d3d195629dcd85cdf5d6f98a30d8db44271f8a97b93/grpcio-1.74.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49", size = 6621763, upload-time = "2025-07-24T18:53:29.193Z" }, + { url = "https://files.pythonhosted.org/packages/9d/2c/930b0e7a2f1029bbc193443c7bc4dc2a46fedb0203c8793dcd97081f1520/grpcio-1.74.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7", size = 6180664, upload-time = "2025-07-24T18:53:30.823Z" }, + { url = "https://files.pythonhosted.org/packages/db/d5/ff8a2442180ad0867717e670f5ec42bfd8d38b92158ad6bcd864e6d4b1ed/grpcio-1.74.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3", size = 6301083, upload-time = "2025-07-24T18:53:32.454Z" }, + { url = "https://files.pythonhosted.org/packages/b0/ba/b361d390451a37ca118e4ec7dccec690422e05bc85fba2ec72b06cefec9f/grpcio-1.74.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707", size = 6994132, upload-time = "2025-07-24T18:53:34.506Z" }, + { url = "https://files.pythonhosted.org/packages/3b/0c/3a5fa47d2437a44ced74141795ac0251bbddeae74bf81df3447edd767d27/grpcio-1.74.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b", size = 6489616, upload-time = "2025-07-24T18:53:36.217Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/95/ab64703b436d99dc5217228babc76047d60e9ad14df129e307b5fec81fd0/grpcio-1.74.0-cp312-cp312-win32.whl", hash = "sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c", size = 3807083, upload-time = "2025-07-24T18:53:37.911Z" }, + { url = "https://files.pythonhosted.org/packages/84/59/900aa2445891fc47a33f7d2f76e00ca5d6ae6584b20d19af9c06fa09bf9a/grpcio-1.74.0-cp312-cp312-win_amd64.whl", hash = "sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc", size = 4490123, upload-time = "2025-07-24T18:53:39.528Z" }, + { url = "https://files.pythonhosted.org/packages/d4/d8/1004a5f468715221450e66b051c839c2ce9a985aa3ee427422061fcbb6aa/grpcio-1.74.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89", size = 5449488, upload-time = "2025-07-24T18:53:41.174Z" }, + { url = "https://files.pythonhosted.org/packages/94/0e/33731a03f63740d7743dced423846c831d8e6da808fcd02821a4416df7fa/grpcio-1.74.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01", size = 10974059, upload-time = "2025-07-24T18:53:43.066Z" }, + { url = "https://files.pythonhosted.org/packages/0d/c6/3d2c14d87771a421205bdca991467cfe473ee4c6a1231c1ede5248c62ab8/grpcio-1.74.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e", size = 5945647, upload-time = "2025-07-24T18:53:45.269Z" }, + { url = "https://files.pythonhosted.org/packages/c5/83/5a354c8aaff58594eef7fffebae41a0f8995a6258bbc6809b800c33d4c13/grpcio-1.74.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91", size = 6626101, upload-time = "2025-07-24T18:53:47.015Z" }, + { url = 
"https://files.pythonhosted.org/packages/3f/ca/4fdc7bf59bf6994aa45cbd4ef1055cd65e2884de6113dbd49f75498ddb08/grpcio-1.74.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249", size = 6182562, upload-time = "2025-07-24T18:53:48.967Z" }, + { url = "https://files.pythonhosted.org/packages/fd/48/2869e5b2c1922583686f7ae674937986807c2f676d08be70d0a541316270/grpcio-1.74.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362", size = 6303425, upload-time = "2025-07-24T18:53:50.847Z" }, + { url = "https://files.pythonhosted.org/packages/a6/0e/bac93147b9a164f759497bc6913e74af1cb632c733c7af62c0336782bd38/grpcio-1.74.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f", size = 6996533, upload-time = "2025-07-24T18:53:52.747Z" }, + { url = "https://files.pythonhosted.org/packages/84/35/9f6b2503c1fd86d068b46818bbd7329db26a87cdd8c01e0d1a9abea1104c/grpcio-1.74.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20", size = 6491489, upload-time = "2025-07-24T18:53:55.06Z" }, + { url = "https://files.pythonhosted.org/packages/75/33/a04e99be2a82c4cbc4039eb3a76f6c3632932b9d5d295221389d10ac9ca7/grpcio-1.74.0-cp313-cp313-win32.whl", hash = "sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa", size = 3805811, upload-time = "2025-07-24T18:53:56.798Z" }, + { url = "https://files.pythonhosted.org/packages/34/80/de3eb55eb581815342d097214bed4c59e806b05f1b3110df03b2280d6dfd/grpcio-1.74.0-cp313-cp313-win_amd64.whl", hash = "sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24", size = 4489214, upload-time = "2025-07-24T18:53:59.771Z" }, +] + +[[package]] +name = "gunicorn" +version = "23.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = 
"packaging", marker = "sys_platform != 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/34/72/9614c465dc206155d93eff0ca20d42e1e35afc533971379482de953521a4/gunicorn-23.0.0.tar.gz", hash = "sha256:f014447a0101dc57e294f6c18ca6b40227a4c90e9bdb586042628030cba004ec", size = 375031, upload-time = "2024-08-10T20:25:27.378Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/7d/6dac2a6e1eba33ee43f318edbed4ff29151a49b5d37f080aad1e6469bca4/gunicorn-23.0.0-py3-none-any.whl", hash = "sha256:ec400d38950de4dfd418cff8328b2c8faed0edb0d517d3394e457c317908ca4d", size = 85029, upload-time = "2024-08-10T20:25:24.996Z" }, +] + +[[package]] +name = "h11" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, +] + +[[package]] +name = "hf-xet" +version = "1.1.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/0a/a0f56735940fde6dd627602fec9ab3bad23f66a272397560abd65aba416e/hf_xet-1.1.7.tar.gz", hash = "sha256:20cec8db4561338824a3b5f8c19774055b04a8df7fff0cb1ff2cb1a0c1607b80", size = 477719, upload-time = "2025-08-06T00:30:55.741Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/7c/8d7803995caf14e7d19a392a486a040f923e2cfeff824e9b800b92072f76/hf_xet-1.1.7-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:60dae4b44d520819e54e216a2505685248ec0adbdb2dd4848b17aa85a0375cde", size = 
2761743, upload-time = "2025-08-06T00:30:50.634Z" }, + { url = "https://files.pythonhosted.org/packages/51/a3/fa5897099454aa287022a34a30e68dbff0e617760f774f8bd1db17f06bd4/hf_xet-1.1.7-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:b109f4c11e01c057fc82004c9e51e6cdfe2cb230637644ade40c599739067b2e", size = 2624331, upload-time = "2025-08-06T00:30:49.212Z" }, + { url = "https://files.pythonhosted.org/packages/86/50/2446a132267e60b8a48b2e5835d6e24fd988000d0f5b9b15ebd6d64ef769/hf_xet-1.1.7-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6efaaf1a5a9fc3a501d3e71e88a6bfebc69ee3a716d0e713a931c8b8d920038f", size = 3183844, upload-time = "2025-08-06T00:30:47.582Z" }, + { url = "https://files.pythonhosted.org/packages/20/8f/ccc670616bb9beee867c6bb7139f7eab2b1370fe426503c25f5cbb27b148/hf_xet-1.1.7-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:751571540f9c1fbad9afcf222a5fb96daf2384bf821317b8bfb0c59d86078513", size = 3074209, upload-time = "2025-08-06T00:30:45.509Z" }, + { url = "https://files.pythonhosted.org/packages/21/0a/4c30e1eb77205565b854f5e4a82cf1f056214e4dc87f2918ebf83d47ae14/hf_xet-1.1.7-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:18b61bbae92d56ae731b92087c44efcac216071182c603fc535f8e29ec4b09b8", size = 3239602, upload-time = "2025-08-06T00:30:52.41Z" }, + { url = "https://files.pythonhosted.org/packages/f5/1e/fc7e9baf14152662ef0b35fa52a6e889f770a7ed14ac239de3c829ecb47e/hf_xet-1.1.7-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:713f2bff61b252f8523739969f247aa354ad8e6d869b8281e174e2ea1bb8d604", size = 3348184, upload-time = "2025-08-06T00:30:54.105Z" }, + { url = "https://files.pythonhosted.org/packages/a3/73/e354eae84ceff117ec3560141224724794828927fcc013c5b449bf0b8745/hf_xet-1.1.7-cp37-abi3-win_amd64.whl", hash = "sha256:2e356da7d284479ae0f1dea3cf5a2f74fdf925d6dca84ac4341930d892c7cb34", size = 2820008, upload-time = "2025-08-06T00:30:57.056Z" }, +] + +[[package]] +name = "httptools" +version = "0.6.4" +source = { 
registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/9a/ce5e1f7e131522e6d3426e8e7a490b3a01f39a6696602e1c4f33f9e94277/httptools-0.6.4.tar.gz", hash = "sha256:4e93eee4add6493b59a5c514da98c939b244fce4a0d8879cd3f466562f4b7d5c", size = 240639, upload-time = "2024-10-16T19:45:08.902Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/0e/d0b71465c66b9185f90a091ab36389a7352985fe857e352801c39d6127c8/httptools-0.6.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:df017d6c780287d5c80601dafa31f17bddb170232d85c066604d8558683711a2", size = 200683, upload-time = "2024-10-16T19:44:30.175Z" }, + { url = "https://files.pythonhosted.org/packages/e2/b8/412a9bb28d0a8988de3296e01efa0bd62068b33856cdda47fe1b5e890954/httptools-0.6.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:85071a1e8c2d051b507161f6c3e26155b5c790e4e28d7f236422dbacc2a9cc44", size = 104337, upload-time = "2024-10-16T19:44:31.786Z" }, + { url = "https://files.pythonhosted.org/packages/9b/01/6fb20be3196ffdc8eeec4e653bc2a275eca7f36634c86302242c4fbb2760/httptools-0.6.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69422b7f458c5af875922cdb5bd586cc1f1033295aa9ff63ee196a87519ac8e1", size = 508796, upload-time = "2024-10-16T19:44:32.825Z" }, + { url = "https://files.pythonhosted.org/packages/f7/d8/b644c44acc1368938317d76ac991c9bba1166311880bcc0ac297cb9d6bd7/httptools-0.6.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:16e603a3bff50db08cd578d54f07032ca1631450ceb972c2f834c2b860c28ea2", size = 510837, upload-time = "2024-10-16T19:44:33.974Z" }, + { url = "https://files.pythonhosted.org/packages/52/d8/254d16a31d543073a0e57f1c329ca7378d8924e7e292eda72d0064987486/httptools-0.6.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec4f178901fa1834d4a060320d2f3abc5c9e39766953d038f1458cb885f47e81", size = 485289, upload-time = "2024-10-16T19:44:35.111Z" }, + 
{ url = "https://files.pythonhosted.org/packages/5f/3c/4aee161b4b7a971660b8be71a92c24d6c64372c1ab3ae7f366b3680df20f/httptools-0.6.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f9eb89ecf8b290f2e293325c646a211ff1c2493222798bb80a530c5e7502494f", size = 489779, upload-time = "2024-10-16T19:44:36.253Z" }, + { url = "https://files.pythonhosted.org/packages/12/b7/5cae71a8868e555f3f67a50ee7f673ce36eac970f029c0c5e9d584352961/httptools-0.6.4-cp312-cp312-win_amd64.whl", hash = "sha256:db78cb9ca56b59b016e64b6031eda5653be0589dba2b1b43453f6e8b405a0970", size = 88634, upload-time = "2024-10-16T19:44:37.357Z" }, + { url = "https://files.pythonhosted.org/packages/94/a3/9fe9ad23fd35f7de6b91eeb60848986058bd8b5a5c1e256f5860a160cc3e/httptools-0.6.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ade273d7e767d5fae13fa637f4d53b6e961fb7fd93c7797562663f0171c26660", size = 197214, upload-time = "2024-10-16T19:44:38.738Z" }, + { url = "https://files.pythonhosted.org/packages/ea/d9/82d5e68bab783b632023f2fa31db20bebb4e89dfc4d2293945fd68484ee4/httptools-0.6.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:856f4bc0478ae143bad54a4242fccb1f3f86a6e1be5548fecfd4102061b3a083", size = 102431, upload-time = "2024-10-16T19:44:39.818Z" }, + { url = "https://files.pythonhosted.org/packages/96/c1/cb499655cbdbfb57b577734fde02f6fa0bbc3fe9fb4d87b742b512908dff/httptools-0.6.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:322d20ea9cdd1fa98bd6a74b77e2ec5b818abdc3d36695ab402a0de8ef2865a3", size = 473121, upload-time = "2024-10-16T19:44:41.189Z" }, + { url = "https://files.pythonhosted.org/packages/af/71/ee32fd358f8a3bb199b03261f10921716990808a675d8160b5383487a317/httptools-0.6.4-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d87b29bd4486c0093fc64dea80231f7c7f7eb4dc70ae394d70a495ab8436071", size = 473805, upload-time = "2024-10-16T19:44:42.384Z" }, + { url = 
"https://files.pythonhosted.org/packages/8a/0a/0d4df132bfca1507114198b766f1737d57580c9ad1cf93c1ff673e3387be/httptools-0.6.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:342dd6946aa6bda4b8f18c734576106b8a31f2fe31492881a9a160ec84ff4bd5", size = 448858, upload-time = "2024-10-16T19:44:43.959Z" }, + { url = "https://files.pythonhosted.org/packages/1e/6a/787004fdef2cabea27bad1073bf6a33f2437b4dbd3b6fb4a9d71172b1c7c/httptools-0.6.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b36913ba52008249223042dca46e69967985fb4051951f94357ea681e1f5dc0", size = 452042, upload-time = "2024-10-16T19:44:45.071Z" }, + { url = "https://files.pythonhosted.org/packages/4d/dc/7decab5c404d1d2cdc1bb330b1bf70e83d6af0396fd4fc76fc60c0d522bf/httptools-0.6.4-cp313-cp313-win_amd64.whl", hash = "sha256:28908df1b9bb8187393d5b5db91435ccc9c8e891657f9cbb42a2541b44c82fc8", size = 87682, upload-time = "2024-10-16T19:44:46.46Z" }, +] + +[[package]] +name = "huggingface-hub" +version = "0.34.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/45/c9/bdbe19339f76d12985bc03572f330a01a93c04dffecaaea3061bdd7fb892/huggingface_hub-0.34.4.tar.gz", hash = "sha256:a4228daa6fb001be3f4f4bdaf9a0db00e1739235702848df00885c9b5742c85c", size = 459768, upload-time = "2025-08-08T09:14:52.365Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/7b/bb06b061991107cd8783f300adff3e7b7f284e330fd82f507f2a1417b11d/huggingface_hub-0.34.4-py3-none-any.whl", hash = "sha256:9b365d781739c93ff90c359844221beef048403f1bc1f1c123c191257c3c890a", size = 561452, upload-time = "2025-08-08T09:14:50.159Z" 
}, +] + +[[package]] +name = "idna" +version = "3.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = "2024-09-15T18:07:39.745Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, +] + +[[package]] +name = "importlib-metadata" +version = "8.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/76/66/650a33bd90f786193e4de4b3ad86ea60b53c89b669a5c7be931fac31cdb0/importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000", size = 56641, upload-time = "2025-04-27T15:29:01.736Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd", size = 27656, upload-time = "2025-04-27T15:29:00.214Z" }, +] + +[[package]] +name = "ipykernel" +version = "6.30.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "appnope", marker = "sys_platform == 'darwin'" }, + { name = "comm" }, + { name = "debugpy" }, + { name = "ipython" }, + { name = "jupyter-client" }, + { name = "jupyter-core" }, + { name = "matplotlib-inline" }, + { name = "nest-asyncio" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyzmq" }, + { name = "tornado" }, + { name = "traitlets" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/bb/76/11082e338e0daadc89c8ff866185de11daf67d181901038f9e139d109761/ipykernel-6.30.1.tar.gz", hash = "sha256:6abb270161896402e76b91394fcdce5d1be5d45f456671e5080572f8505be39b", size = 166260, upload-time = "2025-08-04T15:47:35.018Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fc/c7/b445faca8deb954fe536abebff4ece5b097b923de482b26e78448c89d1dd/ipykernel-6.30.1-py3-none-any.whl", hash = "sha256:aa6b9fb93dca949069d8b85b6c79b2518e32ac583ae9c7d37c51d119e18b3fb4", size = 117484, upload-time = "2025-08-04T15:47:32.622Z" }, +] + +[[package]] +name = "ipython" +version = "9.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "decorator" }, + { name = "ipython-pygments-lexers" }, + { name = "jedi" }, + { name = "matplotlib-inline" }, + { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, + { name = "prompt-toolkit" }, + { name = "pygments" }, + { name = "stack-data" }, + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/54/80/406f9e3bde1c1fd9bf5a0be9d090f8ae623e401b7670d8f6fdf2ab679891/ipython-9.4.0.tar.gz", hash = "sha256:c033c6d4e7914c3d9768aabe76bbe87ba1dc66a92a05db6bfa1125d81f2ee270", size = 4385338, upload-time = "2025-07-01T11:11:30.606Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/63/f8/0031ee2b906a15a33d6bfc12dd09c3dfa966b3cb5b284ecfb7549e6ac3c4/ipython-9.4.0-py3-none-any.whl", hash = "sha256:25850f025a446d9b359e8d296ba175a36aedd32e83ca9b5060430fe16801f066", size = 611021, upload-time = "2025-07-01T11:11:27.85Z" }, +] + +[[package]] +name = "ipython-pygments-lexers" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pygments" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/ef/4c/5dd1d8af08107f88c7f741ead7a40854b8ac24ddf9ae850afbcf698aa552/ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81", size = 8393, upload-time = "2025-01-17T11:24:34.505Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" }, +] + +[[package]] +name = "ipywidgets" +version = "8.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "comm" }, + { name = "ipython" }, + { name = "jupyterlab-widgets" }, + { name = "traitlets" }, + { name = "widgetsnbextension" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/77/0a/7e2069d2cf55307b37a6a5195e873968dea965252976c32515d4e300efb0/ipywidgets-8.1.3.tar.gz", hash = "sha256:f5f9eeaae082b1823ce9eac2575272952f40d748893972956dc09700a6392d9c", size = 116515, upload-time = "2024-05-28T09:32:19.319Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d4/17/8b2ce5765dd423433d2e0727712629c46152fb0bc706b0977f847480f262/ipywidgets-8.1.3-py3-none-any.whl", hash = "sha256:efafd18f7a142248f7cb0ba890a68b96abd4d6e88ddbda483c9130d12667eaf2", size = 139410, upload-time = "2024-05-28T09:32:16.041Z" }, +] + +[[package]] +name = "itsdangerous" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" }, +] + +[[package]] +name = "jedi" +version = "0.19.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "parso" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = "2024-11-11T01:41:42.873Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" }, +] + +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, +] + +[[package]] +name = "jmespath" +version = "1.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/00/2a/e867e8531cf3e36b41201936b7fa7ba7b5702dbef42922193f05c8976cd6/jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe", size = 25843, upload-time = "2022-06-17T18:00:12.224Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" }, +] + +[[package]] +name = "joblib" +version = "1.5.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/dc/fe/0f5a938c54105553436dbff7a61dc4fed4b1b2c98852f8833beaf4d5968f/joblib-1.5.1.tar.gz", hash = "sha256:f4f86e351f39fe3d0d32a9f2c3d8af1ee4cec285aafcb27003dda5205576b444", size = 330475, upload-time = "2025-05-23T12:04:37.097Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7d/4f/1195bbac8e0c2acc5f740661631d8d750dc38d4a32b23ee5df3cde6f4e0d/joblib-1.5.1-py3-none-any.whl", hash = "sha256:4719a31f054c7d766948dcd83e9613686b27114f190f717cec7eaa2084f8a74a", size = 307746, upload-time = "2025-05-23T12:04:35.124Z" }, +] + +[[package]] +name = "jsonschema" +version = "4.25.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "jsonschema-specifications" }, + { name = "referencing" }, + { name = "rpds-py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d5/00/a297a868e9d0784450faa7365c2172a7d6110c763e30ba861867c32ae6a9/jsonschema-4.25.0.tar.gz", hash = "sha256:e63acf5c11762c0e6672ffb61482bdf57f0876684d8d249c0fe2d730d48bc55f", size = 356830, upload-time = "2025-07-18T15:39:45.11Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/54/c86cd8e011fe98803d7e382fd67c0df5ceab8d2b7ad8c5a81524f791551c/jsonschema-4.25.0-py3-none-any.whl", hash = 
"sha256:24c2e8da302de79c8b9382fee3e76b355e44d2a4364bb207159ce10b517bd716", size = 89184, upload-time = "2025-07-18T15:39:42.956Z" }, +] + +[[package]] +name = "jsonschema-specifications" +version = "2025.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "referencing" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bf/ce/46fbd9c8119cfc3581ee5643ea49464d168028cfb5caff5fc0596d0cf914/jsonschema_specifications-2025.4.1.tar.gz", hash = "sha256:630159c9f4dbea161a6a2205c3011cc4f18ff381b189fff48bb39b9bf26ae608", size = 15513, upload-time = "2025-04-23T12:34:07.418Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl", hash = "sha256:4653bffbd6584f7de83a67e0d620ef16900b390ddc7939d56684d6c81e33f1af", size = 18437, upload-time = "2025-04-23T12:34:05.422Z" }, +] + +[[package]] +name = "jupyter-client" +version = "8.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jupyter-core" }, + { name = "python-dateutil" }, + { name = "pyzmq" }, + { name = "tornado" }, + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/71/22/bf9f12fdaeae18019a468b68952a60fe6dbab5d67cd2a103cac7659b41ca/jupyter_client-8.6.3.tar.gz", hash = "sha256:35b3a0947c4a6e9d589eb97d7d4cd5e90f910ee73101611f01283732bd6d9419", size = 342019, upload-time = "2024-09-17T10:44:17.613Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/85/b0394e0b6fcccd2c1eeefc230978a6f8cb0c5df1e4cd3e7625735a0d7d1e/jupyter_client-8.6.3-py3-none-any.whl", hash = "sha256:e8a19cc986cc45905ac3362915f410f3af85424b4c0905e94fa5f2cb08e8f23f", size = 106105, upload-time = "2024-09-17T10:44:15.218Z" }, +] + +[[package]] +name = "jupyter-core" +version = "5.8.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "platformdirs" }, + { name = "pywin32", 
marker = "platform_python_implementation != 'PyPy' and sys_platform == 'win32'" }, + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/99/1b/72906d554acfeb588332eaaa6f61577705e9ec752ddb486f302dafa292d9/jupyter_core-5.8.1.tar.gz", hash = "sha256:0a5f9706f70e64786b75acba995988915ebd4601c8a52e534a40b51c95f59941", size = 88923, upload-time = "2025-05-27T07:38:16.655Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2f/57/6bffd4b20b88da3800c5d691e0337761576ee688eb01299eae865689d2df/jupyter_core-5.8.1-py3-none-any.whl", hash = "sha256:c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0", size = 28880, upload-time = "2025-05-27T07:38:15.137Z" }, +] + +[[package]] +name = "jupyterlab-widgets" +version = "3.0.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b9/7d/160595ca88ee87ac6ba95d82177d29ec60aaa63821d3077babb22ce031a5/jupyterlab_widgets-3.0.15.tar.gz", hash = "sha256:2920888a0c2922351a9202817957a68c07d99673504d6cd37345299e971bb08b", size = 213149, upload-time = "2025-05-05T12:32:31.004Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/6a/ca128561b22b60bd5a0c4ea26649e68c8556b82bc70a0c396eebc977fe86/jupyterlab_widgets-3.0.15-py3-none-any.whl", hash = "sha256:d59023d7d7ef71400d51e6fee9a88867f6e65e10a4201605d2d7f3e8f012a31c", size = 216571, upload-time = "2025-05-05T12:32:29.534Z" }, +] + +[[package]] +name = "kiwisolver" +version = "1.4.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5c/3c/85844f1b0feb11ee581ac23fe5fce65cd049a200c1446708cc1b7f922875/kiwisolver-1.4.9.tar.gz", hash = "sha256:c3b22c26c6fd6811b0ae8363b95ca8ce4ea3c202d3d0975b2914310ceb1bcc4d", size = 97564, upload-time = "2025-08-10T21:27:49.279Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/86/c9/13573a747838aeb1c76e3267620daa054f4152444d1f3d1a2324b78255b5/kiwisolver-1.4.9-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ac5a486ac389dddcc5bef4f365b6ae3ffff2c433324fb38dd35e3fab7c957999", size = 123686, upload-time = "2025-08-10T21:26:10.034Z" }, + { url = "https://files.pythonhosted.org/packages/51/ea/2ecf727927f103ffd1739271ca19c424d0e65ea473fbaeea1c014aea93f6/kiwisolver-1.4.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f2ba92255faa7309d06fe44c3a4a97efe1c8d640c2a79a5ef728b685762a6fd2", size = 66460, upload-time = "2025-08-10T21:26:11.083Z" }, + { url = "https://files.pythonhosted.org/packages/5b/5a/51f5464373ce2aeb5194508298a508b6f21d3867f499556263c64c621914/kiwisolver-1.4.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a2899935e724dd1074cb568ce7ac0dce28b2cd6ab539c8e001a8578eb106d14", size = 64952, upload-time = "2025-08-10T21:26:12.058Z" }, + { url = "https://files.pythonhosted.org/packages/70/90/6d240beb0f24b74371762873e9b7f499f1e02166a2d9c5801f4dbf8fa12e/kiwisolver-1.4.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f6008a4919fdbc0b0097089f67a1eb55d950ed7e90ce2cc3e640abadd2757a04", size = 1474756, upload-time = "2025-08-10T21:26:13.096Z" }, + { url = "https://files.pythonhosted.org/packages/12/42/f36816eaf465220f683fb711efdd1bbf7a7005a2473d0e4ed421389bd26c/kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67bb8b474b4181770f926f7b7d2f8c0248cbcb78b660fdd41a47054b28d2a752", size = 1276404, upload-time = "2025-08-10T21:26:14.457Z" }, + { url = "https://files.pythonhosted.org/packages/2e/64/bc2de94800adc830c476dce44e9b40fd0809cddeef1fde9fcf0f73da301f/kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2327a4a30d3ee07d2fbe2e7933e8a37c591663b96ce42a00bc67461a87d7df77", size = 1294410, upload-time = "2025-08-10T21:26:15.73Z" }, + { url = 
"https://files.pythonhosted.org/packages/5f/42/2dc82330a70aa8e55b6d395b11018045e58d0bb00834502bf11509f79091/kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7a08b491ec91b1d5053ac177afe5290adacf1f0f6307d771ccac5de30592d198", size = 1343631, upload-time = "2025-08-10T21:26:17.045Z" }, + { url = "https://files.pythonhosted.org/packages/22/fd/f4c67a6ed1aab149ec5a8a401c323cee7a1cbe364381bb6c9c0d564e0e20/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d8fc5c867c22b828001b6a38d2eaeb88160bf5783c6cb4a5e440efc981ce286d", size = 2224963, upload-time = "2025-08-10T21:26:18.737Z" }, + { url = "https://files.pythonhosted.org/packages/45/aa/76720bd4cb3713314677d9ec94dcc21ced3f1baf4830adde5bb9b2430a5f/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:3b3115b2581ea35bb6d1f24a4c90af37e5d9b49dcff267eeed14c3893c5b86ab", size = 2321295, upload-time = "2025-08-10T21:26:20.11Z" }, + { url = "https://files.pythonhosted.org/packages/80/19/d3ec0d9ab711242f56ae0dc2fc5d70e298bb4a1f9dfab44c027668c673a1/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:858e4c22fb075920b96a291928cb7dea5644e94c0ee4fcd5af7e865655e4ccf2", size = 2487987, upload-time = "2025-08-10T21:26:21.49Z" }, + { url = "https://files.pythonhosted.org/packages/39/e9/61e4813b2c97e86b6fdbd4dd824bf72d28bcd8d4849b8084a357bc0dd64d/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ed0fecd28cc62c54b262e3736f8bb2512d8dcfdc2bcf08be5f47f96bf405b145", size = 2291817, upload-time = "2025-08-10T21:26:22.812Z" }, + { url = "https://files.pythonhosted.org/packages/a0/41/85d82b0291db7504da3c2defe35c9a8a5c9803a730f297bd823d11d5fb77/kiwisolver-1.4.9-cp312-cp312-win_amd64.whl", hash = "sha256:f68208a520c3d86ea51acf688a3e3002615a7f0238002cccc17affecc86a8a54", size = 73895, upload-time = "2025-08-10T21:26:24.37Z" }, + { url = 
"https://files.pythonhosted.org/packages/e2/92/5f3068cf15ee5cb624a0c7596e67e2a0bb2adee33f71c379054a491d07da/kiwisolver-1.4.9-cp312-cp312-win_arm64.whl", hash = "sha256:2c1a4f57df73965f3f14df20b80ee29e6a7930a57d2d9e8491a25f676e197c60", size = 64992, upload-time = "2025-08-10T21:26:25.732Z" }, + { url = "https://files.pythonhosted.org/packages/31/c1/c2686cda909742ab66c7388e9a1a8521a59eb89f8bcfbee28fc980d07e24/kiwisolver-1.4.9-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5d0432ccf1c7ab14f9949eec60c5d1f924f17c037e9f8b33352fa05799359b8", size = 123681, upload-time = "2025-08-10T21:26:26.725Z" }, + { url = "https://files.pythonhosted.org/packages/ca/f0/f44f50c9f5b1a1860261092e3bc91ecdc9acda848a8b8c6abfda4a24dd5c/kiwisolver-1.4.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efb3a45b35622bb6c16dbfab491a8f5a391fe0e9d45ef32f4df85658232ca0e2", size = 66464, upload-time = "2025-08-10T21:26:27.733Z" }, + { url = "https://files.pythonhosted.org/packages/2d/7a/9d90a151f558e29c3936b8a47ac770235f436f2120aca41a6d5f3d62ae8d/kiwisolver-1.4.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1a12cf6398e8a0a001a059747a1cbf24705e18fe413bc22de7b3d15c67cffe3f", size = 64961, upload-time = "2025-08-10T21:26:28.729Z" }, + { url = "https://files.pythonhosted.org/packages/e9/e9/f218a2cb3a9ffbe324ca29a9e399fa2d2866d7f348ec3a88df87fc248fc5/kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b67e6efbf68e077dd71d1a6b37e43e1a99d0bff1a3d51867d45ee8908b931098", size = 1474607, upload-time = "2025-08-10T21:26:29.798Z" }, + { url = "https://files.pythonhosted.org/packages/d9/28/aac26d4c882f14de59041636292bc838db8961373825df23b8eeb807e198/kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5656aa670507437af0207645273ccdfee4f14bacd7f7c67a4306d0dcaeaf6eed", size = 1276546, upload-time = "2025-08-10T21:26:31.401Z" }, + { url = 
"https://files.pythonhosted.org/packages/8b/ad/8bfc1c93d4cc565e5069162f610ba2f48ff39b7de4b5b8d93f69f30c4bed/kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:bfc08add558155345129c7803b3671cf195e6a56e7a12f3dde7c57d9b417f525", size = 1294482, upload-time = "2025-08-10T21:26:32.721Z" }, + { url = "https://files.pythonhosted.org/packages/da/f1/6aca55ff798901d8ce403206d00e033191f63d82dd708a186e0ed2067e9c/kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:40092754720b174e6ccf9e845d0d8c7d8e12c3d71e7fc35f55f3813e96376f78", size = 1343720, upload-time = "2025-08-10T21:26:34.032Z" }, + { url = "https://files.pythonhosted.org/packages/d1/91/eed031876c595c81d90d0f6fc681ece250e14bf6998c3d7c419466b523b7/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:497d05f29a1300d14e02e6441cf0f5ee81c1ff5a304b0d9fb77423974684e08b", size = 2224907, upload-time = "2025-08-10T21:26:35.824Z" }, + { url = "https://files.pythonhosted.org/packages/e9/ec/4d1925f2e49617b9cca9c34bfa11adefad49d00db038e692a559454dfb2e/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdd1a81a1860476eb41ac4bc1e07b3f07259e6d55bbf739b79c8aaedcf512799", size = 2321334, upload-time = "2025-08-10T21:26:37.534Z" }, + { url = "https://files.pythonhosted.org/packages/43/cb/450cd4499356f68802750c6ddc18647b8ea01ffa28f50d20598e0befe6e9/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e6b93f13371d341afee3be9f7c5964e3fe61d5fa30f6a30eb49856935dfe4fc3", size = 2488313, upload-time = "2025-08-10T21:26:39.191Z" }, + { url = "https://files.pythonhosted.org/packages/71/67/fc76242bd99f885651128a5d4fa6083e5524694b7c88b489b1b55fdc491d/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d75aa530ccfaa593da12834b86a0724f58bff12706659baa9227c2ccaa06264c", size = 2291970, upload-time = "2025-08-10T21:26:40.828Z" }, + { url = 
"https://files.pythonhosted.org/packages/75/bd/f1a5d894000941739f2ae1b65a32892349423ad49c2e6d0771d0bad3fae4/kiwisolver-1.4.9-cp313-cp313-win_amd64.whl", hash = "sha256:dd0a578400839256df88c16abddf9ba14813ec5f21362e1fe65022e00c883d4d", size = 73894, upload-time = "2025-08-10T21:26:42.33Z" }, + { url = "https://files.pythonhosted.org/packages/95/38/dce480814d25b99a391abbddadc78f7c117c6da34be68ca8b02d5848b424/kiwisolver-1.4.9-cp313-cp313-win_arm64.whl", hash = "sha256:d4188e73af84ca82468f09cadc5ac4db578109e52acb4518d8154698d3a87ca2", size = 64995, upload-time = "2025-08-10T21:26:43.889Z" }, + { url = "https://files.pythonhosted.org/packages/e2/37/7d218ce5d92dadc5ebdd9070d903e0c7cf7edfe03f179433ac4d13ce659c/kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:5a0f2724dfd4e3b3ac5a82436a8e6fd16baa7d507117e4279b660fe8ca38a3a1", size = 126510, upload-time = "2025-08-10T21:26:44.915Z" }, + { url = "https://files.pythonhosted.org/packages/23/b0/e85a2b48233daef4b648fb657ebbb6f8367696a2d9548a00b4ee0eb67803/kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:1b11d6a633e4ed84fc0ddafd4ebfd8ea49b3f25082c04ad12b8315c11d504dc1", size = 67903, upload-time = "2025-08-10T21:26:45.934Z" }, + { url = "https://files.pythonhosted.org/packages/44/98/f2425bc0113ad7de24da6bb4dae1343476e95e1d738be7c04d31a5d037fd/kiwisolver-1.4.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61874cdb0a36016354853593cffc38e56fc9ca5aa97d2c05d3dcf6922cd55a11", size = 66402, upload-time = "2025-08-10T21:26:47.101Z" }, + { url = "https://files.pythonhosted.org/packages/98/d8/594657886df9f34c4177cc353cc28ca7e6e5eb562d37ccc233bff43bbe2a/kiwisolver-1.4.9-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:60c439763a969a6af93b4881db0eed8fadf93ee98e18cbc35bc8da868d0c4f0c", size = 1582135, upload-time = "2025-08-10T21:26:48.665Z" }, + { url = 
"https://files.pythonhosted.org/packages/5c/c6/38a115b7170f8b306fc929e166340c24958347308ea3012c2b44e7e295db/kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92a2f997387a1b79a75e7803aa7ded2cfbe2823852ccf1ba3bcf613b62ae3197", size = 1389409, upload-time = "2025-08-10T21:26:50.335Z" }, + { url = "https://files.pythonhosted.org/packages/bf/3b/e04883dace81f24a568bcee6eb3001da4ba05114afa622ec9b6fafdc1f5e/kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a31d512c812daea6d8b3be3b2bfcbeb091dbb09177706569bcfc6240dcf8b41c", size = 1401763, upload-time = "2025-08-10T21:26:51.867Z" }, + { url = "https://files.pythonhosted.org/packages/9f/80/20ace48e33408947af49d7d15c341eaee69e4e0304aab4b7660e234d6288/kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:52a15b0f35dad39862d376df10c5230155243a2c1a436e39eb55623ccbd68185", size = 1453643, upload-time = "2025-08-10T21:26:53.592Z" }, + { url = "https://files.pythonhosted.org/packages/64/31/6ce4380a4cd1f515bdda976a1e90e547ccd47b67a1546d63884463c92ca9/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a30fd6fdef1430fd9e1ba7b3398b5ee4e2887783917a687d86ba69985fb08748", size = 2330818, upload-time = "2025-08-10T21:26:55.051Z" }, + { url = "https://files.pythonhosted.org/packages/fa/e9/3f3fcba3bcc7432c795b82646306e822f3fd74df0ee81f0fa067a1f95668/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:cc9617b46837c6468197b5945e196ee9ca43057bb7d9d1ae688101e4e1dddf64", size = 2419963, upload-time = "2025-08-10T21:26:56.421Z" }, + { url = "https://files.pythonhosted.org/packages/99/43/7320c50e4133575c66e9f7dadead35ab22d7c012a3b09bb35647792b2a6d/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:0ab74e19f6a2b027ea4f845a78827969af45ce790e6cb3e1ebab71bdf9f215ff", size = 2594639, upload-time = "2025-08-10T21:26:57.882Z" }, + { url = 
"https://files.pythonhosted.org/packages/65/d6/17ae4a270d4a987ef8a385b906d2bdfc9fce502d6dc0d3aea865b47f548c/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dba5ee5d3981160c28d5490f0d1b7ed730c22470ff7f6cc26cfcfaacb9896a07", size = 2391741, upload-time = "2025-08-10T21:26:59.237Z" }, + { url = "https://files.pythonhosted.org/packages/2a/8f/8f6f491d595a9e5912971f3f863d81baddccc8a4d0c3749d6a0dd9ffc9df/kiwisolver-1.4.9-cp313-cp313t-win_arm64.whl", hash = "sha256:0749fd8f4218ad2e851e11cc4dc05c7cbc0cbc4267bdfdb31782e65aace4ee9c", size = 68646, upload-time = "2025-08-10T21:27:00.52Z" }, + { url = "https://files.pythonhosted.org/packages/6b/32/6cc0fbc9c54d06c2969faa9c1d29f5751a2e51809dd55c69055e62d9b426/kiwisolver-1.4.9-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:9928fe1eb816d11ae170885a74d074f57af3a0d65777ca47e9aeb854a1fba386", size = 123806, upload-time = "2025-08-10T21:27:01.537Z" }, + { url = "https://files.pythonhosted.org/packages/b2/dd/2bfb1d4a4823d92e8cbb420fe024b8d2167f72079b3bb941207c42570bdf/kiwisolver-1.4.9-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d0005b053977e7b43388ddec89fa567f43d4f6d5c2c0affe57de5ebf290dc552", size = 66605, upload-time = "2025-08-10T21:27:03.335Z" }, + { url = "https://files.pythonhosted.org/packages/f7/69/00aafdb4e4509c2ca6064646cba9cd4b37933898f426756adb2cb92ebbed/kiwisolver-1.4.9-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2635d352d67458b66fd0667c14cb1d4145e9560d503219034a18a87e971ce4f3", size = 64925, upload-time = "2025-08-10T21:27:04.339Z" }, + { url = "https://files.pythonhosted.org/packages/43/dc/51acc6791aa14e5cb6d8a2e28cefb0dc2886d8862795449d021334c0df20/kiwisolver-1.4.9-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:767c23ad1c58c9e827b649a9ab7809fd5fd9db266a9cf02b0e926ddc2c680d58", size = 1472414, upload-time = "2025-08-10T21:27:05.437Z" }, + { url = 
"https://files.pythonhosted.org/packages/3d/bb/93fa64a81db304ac8a246f834d5094fae4b13baf53c839d6bb6e81177129/kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72d0eb9fba308b8311685c2268cf7d0a0639a6cd027d8128659f72bdd8a024b4", size = 1281272, upload-time = "2025-08-10T21:27:07.063Z" }, + { url = "https://files.pythonhosted.org/packages/70/e6/6df102916960fb8d05069d4bd92d6d9a8202d5a3e2444494e7cd50f65b7a/kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f68e4f3eeca8fb22cc3d731f9715a13b652795ef657a13df1ad0c7dc0e9731df", size = 1298578, upload-time = "2025-08-10T21:27:08.452Z" }, + { url = "https://files.pythonhosted.org/packages/7c/47/e142aaa612f5343736b087864dbaebc53ea8831453fb47e7521fa8658f30/kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d84cd4061ae292d8ac367b2c3fa3aad11cb8625a95d135fe93f286f914f3f5a6", size = 1345607, upload-time = "2025-08-10T21:27:10.125Z" }, + { url = "https://files.pythonhosted.org/packages/54/89/d641a746194a0f4d1a3670fb900d0dbaa786fb98341056814bc3f058fa52/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a60ea74330b91bd22a29638940d115df9dc00af5035a9a2a6ad9399ffb4ceca5", size = 2230150, upload-time = "2025-08-10T21:27:11.484Z" }, + { url = "https://files.pythonhosted.org/packages/aa/6b/5ee1207198febdf16ac11f78c5ae40861b809cbe0e6d2a8d5b0b3044b199/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ce6a3a4e106cf35c2d9c4fa17c05ce0b180db622736845d4315519397a77beaf", size = 2325979, upload-time = "2025-08-10T21:27:12.917Z" }, + { url = "https://files.pythonhosted.org/packages/fc/ff/b269eefd90f4ae14dcc74973d5a0f6d28d3b9bb1afd8c0340513afe6b39a/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:77937e5e2a38a7b48eef0585114fe7930346993a88060d0bf886086d2aa49ef5", size = 2491456, upload-time = "2025-08-10T21:27:14.353Z" }, + { url = 
"https://files.pythonhosted.org/packages/fc/d4/10303190bd4d30de547534601e259a4fbf014eed94aae3e5521129215086/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:24c175051354f4a28c5d6a31c93906dc653e2bf234e8a4bbfb964892078898ce", size = 2294621, upload-time = "2025-08-10T21:27:15.808Z" }, + { url = "https://files.pythonhosted.org/packages/28/e0/a9a90416fce5c0be25742729c2ea52105d62eda6c4be4d803c2a7be1fa50/kiwisolver-1.4.9-cp314-cp314-win_amd64.whl", hash = "sha256:0763515d4df10edf6d06a3c19734e2566368980d21ebec439f33f9eb936c07b7", size = 75417, upload-time = "2025-08-10T21:27:17.436Z" }, + { url = "https://files.pythonhosted.org/packages/1f/10/6949958215b7a9a264299a7db195564e87900f709db9245e4ebdd3c70779/kiwisolver-1.4.9-cp314-cp314-win_arm64.whl", hash = "sha256:0e4e2bf29574a6a7b7f6cb5fa69293b9f96c928949ac4a53ba3f525dffb87f9c", size = 66582, upload-time = "2025-08-10T21:27:18.436Z" }, + { url = "https://files.pythonhosted.org/packages/ec/79/60e53067903d3bc5469b369fe0dfc6b3482e2133e85dae9daa9527535991/kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:d976bbb382b202f71c67f77b0ac11244021cfa3f7dfd9e562eefcea2df711548", size = 126514, upload-time = "2025-08-10T21:27:19.465Z" }, + { url = "https://files.pythonhosted.org/packages/25/d1/4843d3e8d46b072c12a38c97c57fab4608d36e13fe47d47ee96b4d61ba6f/kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2489e4e5d7ef9a1c300a5e0196e43d9c739f066ef23270607d45aba368b91f2d", size = 67905, upload-time = "2025-08-10T21:27:20.51Z" }, + { url = "https://files.pythonhosted.org/packages/8c/ae/29ffcbd239aea8b93108de1278271ae764dfc0d803a5693914975f200596/kiwisolver-1.4.9-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e2ea9f7ab7fbf18fffb1b5434ce7c69a07582f7acc7717720f1d69f3e806f90c", size = 66399, upload-time = "2025-08-10T21:27:21.496Z" }, + { url = 
"https://files.pythonhosted.org/packages/a1/ae/d7ba902aa604152c2ceba5d352d7b62106bedbccc8e95c3934d94472bfa3/kiwisolver-1.4.9-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b34e51affded8faee0dfdb705416153819d8ea9250bbbf7ea1b249bdeb5f1122", size = 1582197, upload-time = "2025-08-10T21:27:22.604Z" }, + { url = "https://files.pythonhosted.org/packages/f2/41/27c70d427eddb8bc7e4f16420a20fefc6f480312122a59a959fdfe0445ad/kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8aacd3d4b33b772542b2e01beb50187536967b514b00003bdda7589722d2a64", size = 1390125, upload-time = "2025-08-10T21:27:24.036Z" }, + { url = "https://files.pythonhosted.org/packages/41/42/b3799a12bafc76d962ad69083f8b43b12bf4fe78b097b12e105d75c9b8f1/kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7cf974dd4e35fa315563ac99d6287a1024e4dc2077b8a7d7cd3d2fb65d283134", size = 1402612, upload-time = "2025-08-10T21:27:25.773Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b5/a210ea073ea1cfaca1bb5c55a62307d8252f531beb364e18aa1e0888b5a0/kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:85bd218b5ecfbee8c8a82e121802dcb519a86044c9c3b2e4aef02fa05c6da370", size = 1453990, upload-time = "2025-08-10T21:27:27.089Z" }, + { url = "https://files.pythonhosted.org/packages/5f/ce/a829eb8c033e977d7ea03ed32fb3c1781b4fa0433fbadfff29e39c676f32/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:0856e241c2d3df4efef7c04a1e46b1936b6120c9bcf36dd216e3acd84bc4fb21", size = 2331601, upload-time = "2025-08-10T21:27:29.343Z" }, + { url = "https://files.pythonhosted.org/packages/e0/4b/b5e97eb142eb9cd0072dacfcdcd31b1c66dc7352b0f7c7255d339c0edf00/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9af39d6551f97d31a4deebeac6f45b156f9755ddc59c07b402c148f5dbb6482a", size = 2422041, upload-time = "2025-08-10T21:27:30.754Z" }, + { url = 
"https://files.pythonhosted.org/packages/40/be/8eb4cd53e1b85ba4edc3a9321666f12b83113a178845593307a3e7891f44/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:bb4ae2b57fc1d8cbd1cf7b1d9913803681ffa903e7488012be5b76dedf49297f", size = 2594897, upload-time = "2025-08-10T21:27:32.803Z" }, + { url = "https://files.pythonhosted.org/packages/99/dd/841e9a66c4715477ea0abc78da039832fbb09dac5c35c58dc4c41a407b8a/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:aedff62918805fb62d43a4aa2ecd4482c380dc76cd31bd7c8878588a61bd0369", size = 2391835, upload-time = "2025-08-10T21:27:34.23Z" }, + { url = "https://files.pythonhosted.org/packages/0c/28/4b2e5c47a0da96896fdfdb006340ade064afa1e63675d01ea5ac222b6d52/kiwisolver-1.4.9-cp314-cp314t-win_amd64.whl", hash = "sha256:1fa333e8b2ce4d9660f2cda9c0e1b6bafcfb2457a9d259faa82289e73ec24891", size = 79988, upload-time = "2025-08-10T21:27:35.587Z" }, + { url = "https://files.pythonhosted.org/packages/80/be/3578e8afd18c88cdf9cb4cffde75a96d2be38c5a903f1ed0ceec061bd09e/kiwisolver-1.4.9-cp314-cp314t-win_arm64.whl", hash = "sha256:4a48a2ce79d65d363597ef7b567ce3d14d68783d2b2263d98db3d9477805ba32", size = 70260, upload-time = "2025-08-10T21:27:36.606Z" }, +] + +[[package]] +name = "mako" +version = "1.3.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/38/bd5b78a920a64d708fe6bc8e0a2c075e1389d53bef8413725c63ba041535/mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28", size = 392474, upload-time = "2025-04-10T12:44:31.16Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/fb/99f81ac72ae23375f22b7afdb7642aba97c00a713c217124420147681a2f/mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59", size = 78509, upload-time = "2025-04-10T12:50:53.297Z" }, +] + +[[package]] +name = 
"markdown" +version = "3.8.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/c2/4ab49206c17f75cb08d6311171f2d65798988db4360c4d1485bd0eedd67c/markdown-3.8.2.tar.gz", hash = "sha256:247b9a70dd12e27f67431ce62523e675b866d254f900c4fe75ce3dda62237c45", size = 362071, upload-time = "2025-06-19T17:12:44.483Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/96/2b/34cc11786bc00d0f04d0f5fdc3a2b1ae0b6239eef72d3d345805f9ad92a1/markdown-3.8.2-py3-none-any.whl", hash = "sha256:5c83764dbd4e00bdd94d85a19b8d55ccca20fe35b2e678a1422b380324dd5f24", size = 106827, upload-time = "2025-06-19T17:12:42.994Z" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/97/5d42485e71dfc078108a86d6de8fa46db44a1a9295e89c5d6d4a06e23a62/markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0", size = 20537, upload-time = "2024-10-18T15:21:54.129Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/09/d1f21434c97fc42f09d290cbb6350d44eb12f09cc62c9476effdb33a18aa/MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf", size = 14274, upload-time = "2024-10-18T15:21:13.777Z" }, + { url = "https://files.pythonhosted.org/packages/6b/b0/18f76bba336fa5aecf79d45dcd6c806c280ec44538b3c13671d49099fdd0/MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225", size = 12348, upload-time = "2024-10-18T15:21:14.822Z" }, + { url = "https://files.pythonhosted.org/packages/e0/25/dd5c0f6ac1311e9b40f4af06c78efde0f3b5cbf02502f8ef9501294c425b/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028", size 
= 24149, upload-time = "2024-10-18T15:21:15.642Z" }, + { url = "https://files.pythonhosted.org/packages/f3/f0/89e7aadfb3749d0f52234a0c8c7867877876e0a20b60e2188e9850794c17/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8", size = 23118, upload-time = "2024-10-18T15:21:17.133Z" }, + { url = "https://files.pythonhosted.org/packages/d5/da/f2eeb64c723f5e3777bc081da884b414671982008c47dcc1873d81f625b6/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c", size = 22993, upload-time = "2024-10-18T15:21:18.064Z" }, + { url = "https://files.pythonhosted.org/packages/da/0e/1f32af846df486dce7c227fe0f2398dc7e2e51d4a370508281f3c1c5cddc/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557", size = 24178, upload-time = "2024-10-18T15:21:18.859Z" }, + { url = "https://files.pythonhosted.org/packages/c4/f6/bb3ca0532de8086cbff5f06d137064c8410d10779c4c127e0e47d17c0b71/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22", size = 23319, upload-time = "2024-10-18T15:21:19.671Z" }, + { url = "https://files.pythonhosted.org/packages/a2/82/8be4c96ffee03c5b4a034e60a31294daf481e12c7c43ab8e34a1453ee48b/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48", size = 23352, upload-time = "2024-10-18T15:21:20.971Z" }, + { url = "https://files.pythonhosted.org/packages/51/ae/97827349d3fcffee7e184bdf7f41cd6b88d9919c80f0263ba7acd1bbcb18/MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30", size = 15097, upload-time = "2024-10-18T15:21:22.646Z" }, + { 
url = "https://files.pythonhosted.org/packages/c1/80/a61f99dc3a936413c3ee4e1eecac96c0da5ed07ad56fd975f1a9da5bc630/MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87", size = 15601, upload-time = "2024-10-18T15:21:23.499Z" }, + { url = "https://files.pythonhosted.org/packages/83/0e/67eb10a7ecc77a0c2bbe2b0235765b98d164d81600746914bebada795e97/MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd", size = 14274, upload-time = "2024-10-18T15:21:24.577Z" }, + { url = "https://files.pythonhosted.org/packages/2b/6d/9409f3684d3335375d04e5f05744dfe7e9f120062c9857df4ab490a1031a/MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430", size = 12352, upload-time = "2024-10-18T15:21:25.382Z" }, + { url = "https://files.pythonhosted.org/packages/d2/f5/6eadfcd3885ea85fe2a7c128315cc1bb7241e1987443d78c8fe712d03091/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094", size = 24122, upload-time = "2024-10-18T15:21:26.199Z" }, + { url = "https://files.pythonhosted.org/packages/0c/91/96cf928db8236f1bfab6ce15ad070dfdd02ed88261c2afafd4b43575e9e9/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396", size = 23085, upload-time = "2024-10-18T15:21:27.029Z" }, + { url = "https://files.pythonhosted.org/packages/c2/cf/c9d56af24d56ea04daae7ac0940232d31d5a8354f2b457c6d856b2057d69/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79", size = 22978, upload-time = "2024-10-18T15:21:27.846Z" }, + { url = 
"https://files.pythonhosted.org/packages/2a/9f/8619835cd6a711d6272d62abb78c033bda638fdc54c4e7f4272cf1c0962b/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a", size = 24208, upload-time = "2024-10-18T15:21:28.744Z" }, + { url = "https://files.pythonhosted.org/packages/f9/bf/176950a1792b2cd2102b8ffeb5133e1ed984547b75db47c25a67d3359f77/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca", size = 23357, upload-time = "2024-10-18T15:21:29.545Z" }, + { url = "https://files.pythonhosted.org/packages/ce/4f/9a02c1d335caabe5c4efb90e1b6e8ee944aa245c1aaaab8e8a618987d816/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c", size = 23344, upload-time = "2024-10-18T15:21:30.366Z" }, + { url = "https://files.pythonhosted.org/packages/ee/55/c271b57db36f748f0e04a759ace9f8f759ccf22b4960c270c78a394f58be/MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1", size = 15101, upload-time = "2024-10-18T15:21:31.207Z" }, + { url = "https://files.pythonhosted.org/packages/29/88/07df22d2dd4df40aba9f3e402e6dc1b8ee86297dddbad4872bd5e7b0094f/MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f", size = 15603, upload-time = "2024-10-18T15:21:32.032Z" }, + { url = "https://files.pythonhosted.org/packages/62/6a/8b89d24db2d32d433dffcd6a8779159da109842434f1dd2f6e71f32f738c/MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c", size = 14510, upload-time = "2024-10-18T15:21:33.625Z" }, + { url = 
"https://files.pythonhosted.org/packages/7a/06/a10f955f70a2e5a9bf78d11a161029d278eeacbd35ef806c3fd17b13060d/MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb", size = 12486, upload-time = "2024-10-18T15:21:34.611Z" }, + { url = "https://files.pythonhosted.org/packages/34/cf/65d4a571869a1a9078198ca28f39fba5fbb910f952f9dbc5220afff9f5e6/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c", size = 25480, upload-time = "2024-10-18T15:21:35.398Z" }, + { url = "https://files.pythonhosted.org/packages/0c/e3/90e9651924c430b885468b56b3d597cabf6d72be4b24a0acd1fa0e12af67/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d", size = 23914, upload-time = "2024-10-18T15:21:36.231Z" }, + { url = "https://files.pythonhosted.org/packages/66/8c/6c7cf61f95d63bb866db39085150df1f2a5bd3335298f14a66b48e92659c/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe", size = 23796, upload-time = "2024-10-18T15:21:37.073Z" }, + { url = "https://files.pythonhosted.org/packages/bb/35/cbe9238ec3f47ac9a7c8b3df7a808e7cb50fe149dc7039f5f454b3fba218/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5", size = 25473, upload-time = "2024-10-18T15:21:37.932Z" }, + { url = "https://files.pythonhosted.org/packages/e6/32/7621a4382488aa283cc05e8984a9c219abad3bca087be9ec77e89939ded9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a", size = 24114, upload-time = "2024-10-18T15:21:39.799Z" }, + { url = 
"https://files.pythonhosted.org/packages/0d/80/0985960e4b89922cb5a0bac0ed39c5b96cbc1a536a99f30e8c220a996ed9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9", size = 24098, upload-time = "2024-10-18T15:21:40.813Z" }, + { url = "https://files.pythonhosted.org/packages/82/78/fedb03c7d5380df2427038ec8d973587e90561b2d90cd472ce9254cf348b/MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6", size = 15208, upload-time = "2024-10-18T15:21:41.814Z" }, + { url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739, upload-time = "2024-10-18T15:21:42.784Z" }, +] + +[[package]] +name = "matplotlib" +version = "3.10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "contourpy" }, + { name = "cycler" }, + { name = "fonttools" }, + { name = "kiwisolver" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pillow" }, + { name = "pyparsing" }, + { name = "python-dateutil" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/68/dd/fa2e1a45fce2d09f4aea3cee169760e672c8262325aa5796c49d543dc7e6/matplotlib-3.10.0.tar.gz", hash = "sha256:b886d02a581b96704c9d1ffe55709e49b4d2d52709ccebc4be42db856e511278", size = 36686418, upload-time = "2024-12-14T06:32:51.547Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/44/c7/6b2d8cb7cc251d53c976799cacd3200add56351c175ba89ab9cbd7c1e68a/matplotlib-3.10.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4659665bc7c9b58f8c00317c3c2a299f7f258eeae5a5d56b4c64226fca2f7c59", size = 8172465, upload-time = "2024-12-14T06:31:24.727Z" }, + { url = 
"https://files.pythonhosted.org/packages/42/2a/6d66d0fba41e13e9ca6512a0a51170f43e7e7ed3a8dfa036324100775612/matplotlib-3.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d44cb942af1693cced2604c33a9abcef6205601c445f6d0dc531d813af8a2f5a", size = 8043300, upload-time = "2024-12-14T06:31:28.55Z" }, + { url = "https://files.pythonhosted.org/packages/90/60/2a60342b27b90a16bada939a85e29589902b41073f59668b904b15ea666c/matplotlib-3.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a994f29e968ca002b50982b27168addfd65f0105610b6be7fa515ca4b5307c95", size = 8448936, upload-time = "2024-12-14T06:31:32.223Z" }, + { url = "https://files.pythonhosted.org/packages/a7/b2/d872fc3d753516870d520595ddd8ce4dd44fa797a240999f125f58521ad7/matplotlib-3.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b0558bae37f154fffda54d779a592bc97ca8b4701f1c710055b609a3bac44c8", size = 8594151, upload-time = "2024-12-14T06:31:34.894Z" }, + { url = "https://files.pythonhosted.org/packages/f4/bd/b2f60cf7f57d014ab33e4f74602a2b5bdc657976db8196bbc022185f6f9c/matplotlib-3.10.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:503feb23bd8c8acc75541548a1d709c059b7184cde26314896e10a9f14df5f12", size = 9400347, upload-time = "2024-12-14T06:31:39.552Z" }, + { url = "https://files.pythonhosted.org/packages/9f/6e/264673e64001b99d747aff5a288eca82826c024437a3694e19aed1decf46/matplotlib-3.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:c40ba2eb08b3f5de88152c2333c58cee7edcead0a2a0d60fcafa116b17117adc", size = 8039144, upload-time = "2024-12-14T06:31:44.128Z" }, + { url = "https://files.pythonhosted.org/packages/72/11/1b2a094d95dcb6e6edd4a0b238177c439006c6b7a9fe8d31801237bf512f/matplotlib-3.10.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:96f2886f5c1e466f21cc41b70c5a0cd47bfa0015eb2d5793c88ebce658600e25", size = 8173073, upload-time = "2024-12-14T06:31:46.592Z" }, + { url = 
"https://files.pythonhosted.org/packages/0d/c4/87b6ad2723070511a411ea719f9c70fde64605423b184face4e94986de9d/matplotlib-3.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:12eaf48463b472c3c0f8dbacdbf906e573013df81a0ab82f0616ea4b11281908", size = 8043892, upload-time = "2024-12-14T06:31:49.14Z" }, + { url = "https://files.pythonhosted.org/packages/57/69/cb0812a136550b21361335e9ffb7d459bf6d13e03cb7b015555d5143d2d6/matplotlib-3.10.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fbbabc82fde51391c4da5006f965e36d86d95f6ee83fb594b279564a4c5d0d2", size = 8450532, upload-time = "2024-12-14T06:31:53.005Z" }, + { url = "https://files.pythonhosted.org/packages/ea/3a/bab9deb4fb199c05e9100f94d7f1c702f78d3241e6a71b784d2b88d7bebd/matplotlib-3.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad2e15300530c1a94c63cfa546e3b7864bd18ea2901317bae8bbf06a5ade6dcf", size = 8593905, upload-time = "2024-12-14T06:31:59.022Z" }, + { url = "https://files.pythonhosted.org/packages/8b/66/742fd242f989adc1847ddf5f445815f73ad7c46aa3440690cc889cfa423c/matplotlib-3.10.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:3547d153d70233a8496859097ef0312212e2689cdf8d7ed764441c77604095ae", size = 9399609, upload-time = "2024-12-14T06:32:05.151Z" }, + { url = "https://files.pythonhosted.org/packages/fa/d6/54cee7142cef7d910a324a7aedf335c0c147b03658b54d49ec48166f10a6/matplotlib-3.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:c55b20591ced744aa04e8c3e4b7543ea4d650b6c3c4b208c08a05b4010e8b442", size = 8039076, upload-time = "2024-12-14T06:32:08.38Z" }, + { url = "https://files.pythonhosted.org/packages/43/14/815d072dc36e88753433bfd0385113405efb947e6895ff7b4d2e8614a33b/matplotlib-3.10.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:9ade1003376731a971e398cc4ef38bb83ee8caf0aee46ac6daa4b0506db1fd06", size = 8211000, upload-time = "2024-12-14T06:32:12.383Z" }, + { url = 
"https://files.pythonhosted.org/packages/9a/76/34e75f364194ec352678adcb540964be6f35ec7d3d8c75ebcb17e6839359/matplotlib-3.10.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:95b710fea129c76d30be72c3b38f330269363fbc6e570a5dd43580487380b5ff", size = 8087707, upload-time = "2024-12-14T06:32:15.773Z" }, + { url = "https://files.pythonhosted.org/packages/c3/2b/b6bc0dff6a72d333bc7df94a66e6ce662d224e43daa8ad8ae4eaa9a77f55/matplotlib-3.10.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cdbaf909887373c3e094b0318d7ff230b2ad9dcb64da7ade654182872ab2593", size = 8477384, upload-time = "2024-12-14T06:32:20.311Z" }, + { url = "https://files.pythonhosted.org/packages/c2/2d/b5949fb2b76e9b47ab05e25a5f5f887c70de20d8b0cbc704a4e2ee71c786/matplotlib-3.10.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d907fddb39f923d011875452ff1eca29a9e7f21722b873e90db32e5d8ddff12e", size = 8610334, upload-time = "2024-12-14T06:32:25.779Z" }, + { url = "https://files.pythonhosted.org/packages/d6/9a/6e3c799d5134d9af44b01c787e1360bee38cf51850506ea2e743a787700b/matplotlib-3.10.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:3b427392354d10975c1d0f4ee18aa5844640b512d5311ef32efd4dd7db106ede", size = 9406777, upload-time = "2024-12-14T06:32:28.919Z" }, + { url = "https://files.pythonhosted.org/packages/0e/dd/e6ae97151e5ed648ab2ea48885bc33d39202b640eec7a2910e2c843f7ac0/matplotlib-3.10.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5fd41b0ec7ee45cd960a8e71aea7c946a28a0b8a4dcee47d2856b2af051f334c", size = 8109742, upload-time = "2024-12-14T06:32:32.115Z" }, +] + +[[package]] +name = "matplotlib-inline" +version = "0.1.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/99/5b/a36a337438a14116b16480db471ad061c36c3694df7c2084a0da7ba538b7/matplotlib_inline-0.1.7.tar.gz", hash = 
"sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90", size = 8159, upload-time = "2024-04-15T13:44:44.803Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899, upload-time = "2024-04-15T13:44:43.265Z" }, +] + +[[package]] +name = "mlflow" +version = "2.19.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "alembic" }, + { name = "docker" }, + { name = "flask" }, + { name = "graphene" }, + { name = "gunicorn", marker = "sys_platform != 'win32'" }, + { name = "jinja2" }, + { name = "markdown" }, + { name = "matplotlib" }, + { name = "mlflow-skinny" }, + { name = "numpy" }, + { name = "pandas" }, + { name = "pyarrow" }, + { name = "scikit-learn" }, + { name = "scipy" }, + { name = "sqlalchemy" }, + { name = "waitress", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cd/76/f623312328a8b642fba8b9683e07904ee9f9c59b9e58528e9a9f5bbdcfea/mlflow-2.19.0.tar.gz", hash = "sha256:b860e9d2599a32460968a0a90efdf960b6a6237a08bff44cc5508830017cf70e", size = 26813362, upload-time = "2024-12-11T09:49:38.38Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/39/e051e58f35077500fea62adb67c0ff32cab768a5bbc1e0d8c682e30d56ee/mlflow-2.19.0-py3-none-any.whl", hash = "sha256:875364a9c37d2e6e5b6256a3cee314e1e6ada0c253f46b6fcb37d986a2dc2514", size = 27397174, upload-time = "2024-12-11T09:49:32.119Z" }, +] + +[[package]] +name = "mlflow-skinny" +version = "2.19.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cachetools" }, + { name = "click" }, + { name = "cloudpickle" }, + { name = "databricks-sdk" }, + { name = "gitpython" }, + { name = "importlib-metadata" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-sdk" 
}, + { name = "packaging" }, + { name = "protobuf" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "sqlparse" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/93/69/53c19be8f05574b9955a2930d0c9b04403d5dd35afce05fbe664b5bfbbfc/mlflow_skinny-2.19.0.tar.gz", hash = "sha256:55a464082ecd48961f73f9a0a58b8d44bf2e77bd32632998f1dffd43ef48623c", size = 5503927, upload-time = "2024-12-11T08:53:47.849Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/95/75f59715e39aa2224e5ecd8c52d5a305467e16a843ade2235a215599a1fa/mlflow_skinny-2.19.0-py3-none-any.whl", hash = "sha256:72c652545460db09dc5716241d2fcd9a211b7875444632fbe2d0b62a1f057694", size = 5854771, upload-time = "2024-12-11T08:53:44.16Z" }, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, +] + +[[package]] +name = "msgpack" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/45/b1/ea4f68038a18c77c9467400d166d74c4ffa536f34761f7983a104357e614/msgpack-1.1.1.tar.gz", hash = "sha256:77b79ce34a2bdab2594f490c8e80dd62a02d650b91a75159a63ec413b8d104cd", size = 173555, upload-time = "2025-06-13T06:52:51.324Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/e3/26/389b9c593eda2b8551b2e7126ad3a06af6f9b44274eb3a4f054d48ff7e47/msgpack-1.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ae497b11f4c21558d95de9f64fff7053544f4d1a17731c866143ed6bb4591238", size = 82359, upload-time = "2025-06-13T06:52:03.909Z" }, + { url = "https://files.pythonhosted.org/packages/ab/65/7d1de38c8a22cf8b1551469159d4b6cf49be2126adc2482de50976084d78/msgpack-1.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:33be9ab121df9b6b461ff91baac6f2731f83d9b27ed948c5b9d1978ae28bf157", size = 79172, upload-time = "2025-06-13T06:52:05.246Z" }, + { url = "https://files.pythonhosted.org/packages/0f/bd/cacf208b64d9577a62c74b677e1ada005caa9b69a05a599889d6fc2ab20a/msgpack-1.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f64ae8fe7ffba251fecb8408540c34ee9df1c26674c50c4544d72dbf792e5ce", size = 425013, upload-time = "2025-06-13T06:52:06.341Z" }, + { url = "https://files.pythonhosted.org/packages/4d/ec/fd869e2567cc9c01278a736cfd1697941ba0d4b81a43e0aa2e8d71dab208/msgpack-1.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a494554874691720ba5891c9b0b39474ba43ffb1aaf32a5dac874effb1619e1a", size = 426905, upload-time = "2025-06-13T06:52:07.501Z" }, + { url = "https://files.pythonhosted.org/packages/55/2a/35860f33229075bce803a5593d046d8b489d7ba2fc85701e714fc1aaf898/msgpack-1.1.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cb643284ab0ed26f6957d969fe0dd8bb17beb567beb8998140b5e38a90974f6c", size = 407336, upload-time = "2025-06-13T06:52:09.047Z" }, + { url = "https://files.pythonhosted.org/packages/8c/16/69ed8f3ada150bf92745fb4921bd621fd2cdf5a42e25eb50bcc57a5328f0/msgpack-1.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d275a9e3c81b1093c060c3837e580c37f47c51eca031f7b5fb76f7b8470f5f9b", size = 409485, upload-time = "2025-06-13T06:52:10.382Z" }, + { url = 
"https://files.pythonhosted.org/packages/c6/b6/0c398039e4c6d0b2e37c61d7e0e9d13439f91f780686deb8ee64ecf1ae71/msgpack-1.1.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4fd6b577e4541676e0cc9ddc1709d25014d3ad9a66caa19962c4f5de30fc09ef", size = 412182, upload-time = "2025-06-13T06:52:11.644Z" }, + { url = "https://files.pythonhosted.org/packages/b8/d0/0cf4a6ecb9bc960d624c93effaeaae75cbf00b3bc4a54f35c8507273cda1/msgpack-1.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bb29aaa613c0a1c40d1af111abf025f1732cab333f96f285d6a93b934738a68a", size = 419883, upload-time = "2025-06-13T06:52:12.806Z" }, + { url = "https://files.pythonhosted.org/packages/62/83/9697c211720fa71a2dfb632cad6196a8af3abea56eece220fde4674dc44b/msgpack-1.1.1-cp312-cp312-win32.whl", hash = "sha256:870b9a626280c86cff9c576ec0d9cbcc54a1e5ebda9cd26dab12baf41fee218c", size = 65406, upload-time = "2025-06-13T06:52:14.271Z" }, + { url = "https://files.pythonhosted.org/packages/c0/23/0abb886e80eab08f5e8c485d6f13924028602829f63b8f5fa25a06636628/msgpack-1.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:5692095123007180dca3e788bb4c399cc26626da51629a31d40207cb262e67f4", size = 72558, upload-time = "2025-06-13T06:52:15.252Z" }, + { url = "https://files.pythonhosted.org/packages/a1/38/561f01cf3577430b59b340b51329803d3a5bf6a45864a55f4ef308ac11e3/msgpack-1.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3765afa6bd4832fc11c3749be4ba4b69a0e8d7b728f78e68120a157a4c5d41f0", size = 81677, upload-time = "2025-06-13T06:52:16.64Z" }, + { url = "https://files.pythonhosted.org/packages/09/48/54a89579ea36b6ae0ee001cba8c61f776451fad3c9306cd80f5b5c55be87/msgpack-1.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8ddb2bcfd1a8b9e431c8d6f4f7db0773084e107730ecf3472f1dfe9ad583f3d9", size = 78603, upload-time = "2025-06-13T06:52:17.843Z" }, + { url = 
"https://files.pythonhosted.org/packages/a0/60/daba2699b308e95ae792cdc2ef092a38eb5ee422f9d2fbd4101526d8a210/msgpack-1.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:196a736f0526a03653d829d7d4c5500a97eea3648aebfd4b6743875f28aa2af8", size = 420504, upload-time = "2025-06-13T06:52:18.982Z" }, + { url = "https://files.pythonhosted.org/packages/20/22/2ebae7ae43cd8f2debc35c631172ddf14e2a87ffcc04cf43ff9df9fff0d3/msgpack-1.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d592d06e3cc2f537ceeeb23d38799c6ad83255289bb84c2e5792e5a8dea268a", size = 423749, upload-time = "2025-06-13T06:52:20.211Z" }, + { url = "https://files.pythonhosted.org/packages/40/1b/54c08dd5452427e1179a40b4b607e37e2664bca1c790c60c442c8e972e47/msgpack-1.1.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4df2311b0ce24f06ba253fda361f938dfecd7b961576f9be3f3fbd60e87130ac", size = 404458, upload-time = "2025-06-13T06:52:21.429Z" }, + { url = "https://files.pythonhosted.org/packages/2e/60/6bb17e9ffb080616a51f09928fdd5cac1353c9becc6c4a8abd4e57269a16/msgpack-1.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e4141c5a32b5e37905b5940aacbc59739f036930367d7acce7a64e4dec1f5e0b", size = 405976, upload-time = "2025-06-13T06:52:22.995Z" }, + { url = "https://files.pythonhosted.org/packages/ee/97/88983e266572e8707c1f4b99c8fd04f9eb97b43f2db40e3172d87d8642db/msgpack-1.1.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b1ce7f41670c5a69e1389420436f41385b1aa2504c3b0c30620764b15dded2e7", size = 408607, upload-time = "2025-06-13T06:52:24.152Z" }, + { url = "https://files.pythonhosted.org/packages/bc/66/36c78af2efaffcc15a5a61ae0df53a1d025f2680122e2a9eb8442fed3ae4/msgpack-1.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4147151acabb9caed4e474c3344181e91ff7a388b888f1e19ea04f7e73dc7ad5", size = 424172, upload-time = "2025-06-13T06:52:25.704Z" }, + { url = 
"https://files.pythonhosted.org/packages/8c/87/a75eb622b555708fe0427fab96056d39d4c9892b0c784b3a721088c7ee37/msgpack-1.1.1-cp313-cp313-win32.whl", hash = "sha256:500e85823a27d6d9bba1d057c871b4210c1dd6fb01fbb764e37e4e8847376323", size = 65347, upload-time = "2025-06-13T06:52:26.846Z" }, + { url = "https://files.pythonhosted.org/packages/ca/91/7dc28d5e2a11a5ad804cf2b7f7a5fcb1eb5a4966d66a5d2b41aee6376543/msgpack-1.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:6d489fba546295983abd142812bda76b57e33d0b9f5d5b71c09a583285506f69", size = 72341, upload-time = "2025-06-13T06:52:27.835Z" }, +] + +[[package]] +name = "multidict" +version = "6.6.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/69/7f/0652e6ed47ab288e3756ea9c0df8b14950781184d4bd7883f4d87dd41245/multidict-6.6.4.tar.gz", hash = "sha256:d2d4e4787672911b48350df02ed3fa3fffdc2f2e8ca06dd6afdf34189b76a9dd", size = 101843, upload-time = "2025-08-11T12:08:48.217Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/f6/512ffd8fd8b37fb2680e5ac35d788f1d71bbaf37789d21a820bdc441e565/multidict-6.6.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0ffb87be160942d56d7b87b0fdf098e81ed565add09eaa1294268c7f3caac4c8", size = 76516, upload-time = "2025-08-11T12:06:53.393Z" }, + { url = "https://files.pythonhosted.org/packages/99/58/45c3e75deb8855c36bd66cc1658007589662ba584dbf423d01df478dd1c5/multidict-6.6.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d191de6cbab2aff5de6c5723101705fd044b3e4c7cfd587a1929b5028b9714b3", size = 45394, upload-time = "2025-08-11T12:06:54.555Z" }, + { url = "https://files.pythonhosted.org/packages/fd/ca/e8c4472a93a26e4507c0b8e1f0762c0d8a32de1328ef72fd704ef9cc5447/multidict-6.6.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:38a0956dd92d918ad5feff3db8fcb4a5eb7dba114da917e1a88475619781b57b", size = 43591, upload-time = "2025-08-11T12:06:55.672Z" }, + { url = 
"https://files.pythonhosted.org/packages/05/51/edf414f4df058574a7265034d04c935aa84a89e79ce90fcf4df211f47b16/multidict-6.6.4-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:6865f6d3b7900ae020b495d599fcf3765653bc927951c1abb959017f81ae8287", size = 237215, upload-time = "2025-08-11T12:06:57.213Z" }, + { url = "https://files.pythonhosted.org/packages/c8/45/8b3d6dbad8cf3252553cc41abea09ad527b33ce47a5e199072620b296902/multidict-6.6.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a2088c126b6f72db6c9212ad827d0ba088c01d951cee25e758c450da732c138", size = 258299, upload-time = "2025-08-11T12:06:58.946Z" }, + { url = "https://files.pythonhosted.org/packages/3c/e8/8ca2e9a9f5a435fc6db40438a55730a4bf4956b554e487fa1b9ae920f825/multidict-6.6.4-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0f37bed7319b848097085d7d48116f545985db988e2256b2e6f00563a3416ee6", size = 242357, upload-time = "2025-08-11T12:07:00.301Z" }, + { url = "https://files.pythonhosted.org/packages/0f/84/80c77c99df05a75c28490b2af8f7cba2a12621186e0a8b0865d8e745c104/multidict-6.6.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:01368e3c94032ba6ca0b78e7ccb099643466cf24f8dc8eefcfdc0571d56e58f9", size = 268369, upload-time = "2025-08-11T12:07:01.638Z" }, + { url = "https://files.pythonhosted.org/packages/0d/e9/920bfa46c27b05fb3e1ad85121fd49f441492dca2449c5bcfe42e4565d8a/multidict-6.6.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8fe323540c255db0bffee79ad7f048c909f2ab0edb87a597e1c17da6a54e493c", size = 269341, upload-time = "2025-08-11T12:07:02.943Z" }, + { url = 
"https://files.pythonhosted.org/packages/af/65/753a2d8b05daf496f4a9c367fe844e90a1b2cac78e2be2c844200d10cc4c/multidict-6.6.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8eb3025f17b0a4c3cd08cda49acf312a19ad6e8a4edd9dbd591e6506d999402", size = 256100, upload-time = "2025-08-11T12:07:04.564Z" }, + { url = "https://files.pythonhosted.org/packages/09/54/655be13ae324212bf0bc15d665a4e34844f34c206f78801be42f7a0a8aaa/multidict-6.6.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bbc14f0365534d35a06970d6a83478b249752e922d662dc24d489af1aa0d1be7", size = 253584, upload-time = "2025-08-11T12:07:05.914Z" }, + { url = "https://files.pythonhosted.org/packages/5c/74/ab2039ecc05264b5cec73eb018ce417af3ebb384ae9c0e9ed42cb33f8151/multidict-6.6.4-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:75aa52fba2d96bf972e85451b99d8e19cc37ce26fd016f6d4aa60da9ab2b005f", size = 251018, upload-time = "2025-08-11T12:07:08.301Z" }, + { url = "https://files.pythonhosted.org/packages/af/0a/ccbb244ac848e56c6427f2392741c06302bbfba49c0042f1eb3c5b606497/multidict-6.6.4-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4fefd4a815e362d4f011919d97d7b4a1e566f1dde83dc4ad8cfb5b41de1df68d", size = 251477, upload-time = "2025-08-11T12:07:10.248Z" }, + { url = "https://files.pythonhosted.org/packages/0e/b0/0ed49bba775b135937f52fe13922bc64a7eaf0a3ead84a36e8e4e446e096/multidict-6.6.4-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:db9801fe021f59a5b375ab778973127ca0ac52429a26e2fd86aa9508f4d26eb7", size = 263575, upload-time = "2025-08-11T12:07:11.928Z" }, + { url = "https://files.pythonhosted.org/packages/3e/d9/7fb85a85e14de2e44dfb6a24f03c41e2af8697a6df83daddb0e9b7569f73/multidict-6.6.4-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:a650629970fa21ac1fb06ba25dabfc5b8a2054fcbf6ae97c758aa956b8dba802", size = 259649, upload-time = "2025-08-11T12:07:13.244Z" }, + { url = 
"https://files.pythonhosted.org/packages/03/9e/b3a459bcf9b6e74fa461a5222a10ff9b544cb1cd52fd482fb1b75ecda2a2/multidict-6.6.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:452ff5da78d4720d7516a3a2abd804957532dd69296cb77319c193e3ffb87e24", size = 251505, upload-time = "2025-08-11T12:07:14.57Z" }, + { url = "https://files.pythonhosted.org/packages/86/a2/8022f78f041dfe6d71e364001a5cf987c30edfc83c8a5fb7a3f0974cff39/multidict-6.6.4-cp312-cp312-win32.whl", hash = "sha256:8c2fcb12136530ed19572bbba61b407f655e3953ba669b96a35036a11a485793", size = 41888, upload-time = "2025-08-11T12:07:15.904Z" }, + { url = "https://files.pythonhosted.org/packages/c7/eb/d88b1780d43a56db2cba24289fa744a9d216c1a8546a0dc3956563fd53ea/multidict-6.6.4-cp312-cp312-win_amd64.whl", hash = "sha256:047d9425860a8c9544fed1b9584f0c8bcd31bcde9568b047c5e567a1025ecd6e", size = 46072, upload-time = "2025-08-11T12:07:17.045Z" }, + { url = "https://files.pythonhosted.org/packages/9f/16/b929320bf5750e2d9d4931835a4c638a19d2494a5b519caaaa7492ebe105/multidict-6.6.4-cp312-cp312-win_arm64.whl", hash = "sha256:14754eb72feaa1e8ae528468f24250dd997b8e2188c3d2f593f9eba259e4b364", size = 43222, upload-time = "2025-08-11T12:07:18.328Z" }, + { url = "https://files.pythonhosted.org/packages/3a/5d/e1db626f64f60008320aab00fbe4f23fc3300d75892a3381275b3d284580/multidict-6.6.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:f46a6e8597f9bd71b31cc708195d42b634c8527fecbcf93febf1052cacc1f16e", size = 75848, upload-time = "2025-08-11T12:07:19.912Z" }, + { url = "https://files.pythonhosted.org/packages/4c/aa/8b6f548d839b6c13887253af4e29c939af22a18591bfb5d0ee6f1931dae8/multidict-6.6.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:22e38b2bc176c5eb9c0a0e379f9d188ae4cd8b28c0f53b52bce7ab0a9e534657", size = 45060, upload-time = "2025-08-11T12:07:21.163Z" }, + { url = 
"https://files.pythonhosted.org/packages/eb/c6/f5e97e5d99a729bc2aa58eb3ebfa9f1e56a9b517cc38c60537c81834a73f/multidict-6.6.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5df8afd26f162da59e218ac0eefaa01b01b2e6cd606cffa46608f699539246da", size = 43269, upload-time = "2025-08-11T12:07:22.392Z" }, + { url = "https://files.pythonhosted.org/packages/dc/31/d54eb0c62516776f36fe67f84a732f97e0b0e12f98d5685bebcc6d396910/multidict-6.6.4-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:49517449b58d043023720aa58e62b2f74ce9b28f740a0b5d33971149553d72aa", size = 237158, upload-time = "2025-08-11T12:07:23.636Z" }, + { url = "https://files.pythonhosted.org/packages/c4/1c/8a10c1c25b23156e63b12165a929d8eb49a6ed769fdbefb06e6f07c1e50d/multidict-6.6.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ae9408439537c5afdca05edd128a63f56a62680f4b3c234301055d7a2000220f", size = 257076, upload-time = "2025-08-11T12:07:25.049Z" }, + { url = "https://files.pythonhosted.org/packages/ad/86/90e20b5771d6805a119e483fd3d1e8393e745a11511aebca41f0da38c3e2/multidict-6.6.4-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:87a32d20759dc52a9e850fe1061b6e41ab28e2998d44168a8a341b99ded1dba0", size = 240694, upload-time = "2025-08-11T12:07:26.458Z" }, + { url = "https://files.pythonhosted.org/packages/e7/49/484d3e6b535bc0555b52a0a26ba86e4d8d03fd5587d4936dc59ba7583221/multidict-6.6.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:52e3c8d43cdfff587ceedce9deb25e6ae77daba560b626e97a56ddcad3756879", size = 266350, upload-time = "2025-08-11T12:07:27.94Z" }, + { url = "https://files.pythonhosted.org/packages/bf/b4/aa4c5c379b11895083d50021e229e90c408d7d875471cb3abf721e4670d6/multidict-6.6.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:ad8850921d3a8d8ff6fbef790e773cecfc260bbfa0566998980d3fa8f520bc4a", size = 267250, upload-time = "2025-08-11T12:07:29.303Z" }, + { url = "https://files.pythonhosted.org/packages/80/e5/5e22c5bf96a64bdd43518b1834c6d95a4922cc2066b7d8e467dae9b6cee6/multidict-6.6.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:497a2954adc25c08daff36f795077f63ad33e13f19bfff7736e72c785391534f", size = 254900, upload-time = "2025-08-11T12:07:30.764Z" }, + { url = "https://files.pythonhosted.org/packages/17/38/58b27fed927c07035abc02befacab42491e7388ca105e087e6e0215ead64/multidict-6.6.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:024ce601f92d780ca1617ad4be5ac15b501cc2414970ffa2bb2bbc2bd5a68fa5", size = 252355, upload-time = "2025-08-11T12:07:32.205Z" }, + { url = "https://files.pythonhosted.org/packages/d0/a1/dad75d23a90c29c02b5d6f3d7c10ab36c3197613be5d07ec49c7791e186c/multidict-6.6.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:a693fc5ed9bdd1c9e898013e0da4dcc640de7963a371c0bd458e50e046bf6438", size = 250061, upload-time = "2025-08-11T12:07:33.623Z" }, + { url = "https://files.pythonhosted.org/packages/b8/1a/ac2216b61c7f116edab6dc3378cca6c70dc019c9a457ff0d754067c58b20/multidict-6.6.4-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:190766dac95aab54cae5b152a56520fd99298f32a1266d66d27fdd1b5ac00f4e", size = 249675, upload-time = "2025-08-11T12:07:34.958Z" }, + { url = "https://files.pythonhosted.org/packages/d4/79/1916af833b800d13883e452e8e0977c065c4ee3ab7a26941fbfdebc11895/multidict-6.6.4-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:34d8f2a5ffdceab9dcd97c7a016deb2308531d5f0fced2bb0c9e1df45b3363d7", size = 261247, upload-time = "2025-08-11T12:07:36.588Z" }, + { url = "https://files.pythonhosted.org/packages/c5/65/d1f84fe08ac44a5fc7391cbc20a7cedc433ea616b266284413fd86062f8c/multidict-6.6.4-cp313-cp313-musllinux_1_2_s390x.whl", hash = 
"sha256:59e8d40ab1f5a8597abcef00d04845155a5693b5da00d2c93dbe88f2050f2812", size = 257960, upload-time = "2025-08-11T12:07:39.735Z" }, + { url = "https://files.pythonhosted.org/packages/13/b5/29ec78057d377b195ac2c5248c773703a6b602e132a763e20ec0457e7440/multidict-6.6.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:467fe64138cfac771f0e949b938c2e1ada2b5af22f39692aa9258715e9ea613a", size = 250078, upload-time = "2025-08-11T12:07:41.525Z" }, + { url = "https://files.pythonhosted.org/packages/c4/0e/7e79d38f70a872cae32e29b0d77024bef7834b0afb406ddae6558d9e2414/multidict-6.6.4-cp313-cp313-win32.whl", hash = "sha256:14616a30fe6d0a48d0a48d1a633ab3b8bec4cf293aac65f32ed116f620adfd69", size = 41708, upload-time = "2025-08-11T12:07:43.405Z" }, + { url = "https://files.pythonhosted.org/packages/9d/34/746696dffff742e97cd6a23da953e55d0ea51fa601fa2ff387b3edcfaa2c/multidict-6.6.4-cp313-cp313-win_amd64.whl", hash = "sha256:40cd05eaeb39e2bc8939451f033e57feaa2ac99e07dbca8afe2be450a4a3b6cf", size = 45912, upload-time = "2025-08-11T12:07:45.082Z" }, + { url = "https://files.pythonhosted.org/packages/c7/87/3bac136181e271e29170d8d71929cdeddeb77f3e8b6a0c08da3a8e9da114/multidict-6.6.4-cp313-cp313-win_arm64.whl", hash = "sha256:f6eb37d511bfae9e13e82cb4d1af36b91150466f24d9b2b8a9785816deb16605", size = 43076, upload-time = "2025-08-11T12:07:46.746Z" }, + { url = "https://files.pythonhosted.org/packages/64/94/0a8e63e36c049b571c9ae41ee301ada29c3fee9643d9c2548d7d558a1d99/multidict-6.6.4-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:6c84378acd4f37d1b507dfa0d459b449e2321b3ba5f2338f9b085cf7a7ba95eb", size = 82812, upload-time = "2025-08-11T12:07:48.402Z" }, + { url = "https://files.pythonhosted.org/packages/25/1a/be8e369dfcd260d2070a67e65dd3990dd635cbd735b98da31e00ea84cd4e/multidict-6.6.4-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0e0558693063c75f3d952abf645c78f3c5dfdd825a41d8c4d8156fc0b0da6e7e", size = 48313, upload-time = "2025-08-11T12:07:49.679Z" }, + { url = 
"https://files.pythonhosted.org/packages/26/5a/dd4ade298674b2f9a7b06a32c94ffbc0497354df8285f27317c66433ce3b/multidict-6.6.4-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3f8e2384cb83ebd23fd07e9eada8ba64afc4c759cd94817433ab8c81ee4b403f", size = 46777, upload-time = "2025-08-11T12:07:51.318Z" }, + { url = "https://files.pythonhosted.org/packages/89/db/98aa28bc7e071bfba611ac2ae803c24e96dd3a452b4118c587d3d872c64c/multidict-6.6.4-cp313-cp313t-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:f996b87b420995a9174b2a7c1a8daf7db4750be6848b03eb5e639674f7963773", size = 229321, upload-time = "2025-08-11T12:07:52.965Z" }, + { url = "https://files.pythonhosted.org/packages/c7/bc/01ddda2a73dd9d167bd85d0e8ef4293836a8f82b786c63fb1a429bc3e678/multidict-6.6.4-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cc356250cffd6e78416cf5b40dc6a74f1edf3be8e834cf8862d9ed5265cf9b0e", size = 249954, upload-time = "2025-08-11T12:07:54.423Z" }, + { url = "https://files.pythonhosted.org/packages/06/78/6b7c0f020f9aa0acf66d0ab4eb9f08375bac9a50ff5e3edb1c4ccd59eafc/multidict-6.6.4-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:dadf95aa862714ea468a49ad1e09fe00fcc9ec67d122f6596a8d40caf6cec7d0", size = 228612, upload-time = "2025-08-11T12:07:55.914Z" }, + { url = "https://files.pythonhosted.org/packages/00/44/3faa416f89b2d5d76e9d447296a81521e1c832ad6e40b92f990697b43192/multidict-6.6.4-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7dd57515bebffd8ebd714d101d4c434063322e4fe24042e90ced41f18b6d3395", size = 257528, upload-time = "2025-08-11T12:07:57.371Z" }, + { url = "https://files.pythonhosted.org/packages/05/5f/77c03b89af0fcb16f018f668207768191fb9dcfb5e3361a5e706a11db2c9/multidict-6.6.4-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:967af5f238ebc2eb1da4e77af5492219fbd9b4b812347da39a7b5f5c72c0fa45", size = 256329, upload-time = "2025-08-11T12:07:58.844Z" }, + { url = "https://files.pythonhosted.org/packages/cf/e9/ed750a2a9afb4f8dc6f13dc5b67b514832101b95714f1211cd42e0aafc26/multidict-6.6.4-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2a4c6875c37aae9794308ec43e3530e4aa0d36579ce38d89979bbf89582002bb", size = 247928, upload-time = "2025-08-11T12:08:01.037Z" }, + { url = "https://files.pythonhosted.org/packages/1f/b5/e0571bc13cda277db7e6e8a532791d4403dacc9850006cb66d2556e649c0/multidict-6.6.4-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:7f683a551e92bdb7fac545b9c6f9fa2aebdeefa61d607510b3533286fcab67f5", size = 245228, upload-time = "2025-08-11T12:08:02.96Z" }, + { url = "https://files.pythonhosted.org/packages/f3/a3/69a84b0eccb9824491f06368f5b86e72e4af54c3067c37c39099b6687109/multidict-6.6.4-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:3ba5aaf600edaf2a868a391779f7a85d93bed147854925f34edd24cc70a3e141", size = 235869, upload-time = "2025-08-11T12:08:04.746Z" }, + { url = "https://files.pythonhosted.org/packages/a9/9d/28802e8f9121a6a0804fa009debf4e753d0a59969ea9f70be5f5fdfcb18f/multidict-6.6.4-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:580b643b7fd2c295d83cad90d78419081f53fd532d1f1eb67ceb7060f61cff0d", size = 243446, upload-time = "2025-08-11T12:08:06.332Z" }, + { url = "https://files.pythonhosted.org/packages/38/ea/6c98add069b4878c1d66428a5f5149ddb6d32b1f9836a826ac764b9940be/multidict-6.6.4-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:37b7187197da6af3ee0b044dbc9625afd0c885f2800815b228a0e70f9a7f473d", size = 252299, upload-time = "2025-08-11T12:08:07.931Z" }, + { url = "https://files.pythonhosted.org/packages/3a/09/8fe02d204473e14c0af3affd50af9078839dfca1742f025cca765435d6b4/multidict-6.6.4-cp313-cp313t-musllinux_1_2_s390x.whl", hash = 
"sha256:e1b93790ed0bc26feb72e2f08299691ceb6da5e9e14a0d13cc74f1869af327a0", size = 246926, upload-time = "2025-08-11T12:08:09.467Z" }, + { url = "https://files.pythonhosted.org/packages/37/3d/7b1e10d774a6df5175ecd3c92bff069e77bed9ec2a927fdd4ff5fe182f67/multidict-6.6.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:a506a77ddee1efcca81ecbeae27ade3e09cdf21a8ae854d766c2bb4f14053f92", size = 243383, upload-time = "2025-08-11T12:08:10.981Z" }, + { url = "https://files.pythonhosted.org/packages/50/b0/a6fae46071b645ae98786ab738447de1ef53742eaad949f27e960864bb49/multidict-6.6.4-cp313-cp313t-win32.whl", hash = "sha256:f93b2b2279883d1d0a9e1bd01f312d6fc315c5e4c1f09e112e4736e2f650bc4e", size = 47775, upload-time = "2025-08-11T12:08:12.439Z" }, + { url = "https://files.pythonhosted.org/packages/b2/0a/2436550b1520091af0600dff547913cb2d66fbac27a8c33bc1b1bccd8d98/multidict-6.6.4-cp313-cp313t-win_amd64.whl", hash = "sha256:6d46a180acdf6e87cc41dc15d8f5c2986e1e8739dc25dbb7dac826731ef381a4", size = 53100, upload-time = "2025-08-11T12:08:13.823Z" }, + { url = "https://files.pythonhosted.org/packages/97/ea/43ac51faff934086db9c072a94d327d71b7d8b40cd5dcb47311330929ef0/multidict-6.6.4-cp313-cp313t-win_arm64.whl", hash = "sha256:756989334015e3335d087a27331659820d53ba432befdef6a718398b0a8493ad", size = 45501, upload-time = "2025-08-11T12:08:15.173Z" }, + { url = "https://files.pythonhosted.org/packages/fd/69/b547032297c7e63ba2af494edba695d781af8a0c6e89e4d06cf848b21d80/multidict-6.6.4-py3-none-any.whl", hash = "sha256:27d8f8e125c07cb954e54d75d04905a9bba8a439c1d84aca94949d4d03d8601c", size = 12313, upload-time = "2025-08-11T12:08:46.891Z" }, +] + +[[package]] +name = "nest-asyncio" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/83/f8/51569ac65d696c8ecbee95938f89d4abf00f47d58d48f6fbabfe8f0baefe/nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe", size = 
7418, upload-time = "2024-01-21T14:25:19.227Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c", size = 5195, upload-time = "2024-01-21T14:25:17.223Z" }, +] + +[[package]] +name = "networkx" +version = "3.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl", hash = "sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec", size = 2034406, upload-time = "2025-05-29T11:35:04.961Z" }, +] + +[[package]] +name = "numpy" +version = "2.3.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/37/7d/3fec4199c5ffb892bed55cff901e4f39a58c81df9c44c280499e92cad264/numpy-2.3.2.tar.gz", hash = "sha256:e0486a11ec30cdecb53f184d496d1c6a20786c81e55e41640270130056f8ee48", size = 20489306, upload-time = "2025-07-24T21:32:07.553Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/6d/745dd1c1c5c284d17725e5c802ca4d45cfc6803519d777f087b71c9f4069/numpy-2.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bc3186bea41fae9d8e90c2b4fb5f0a1f5a690682da79b92574d63f56b529080b", size = 20956420, upload-time = "2025-07-24T20:28:18.002Z" }, + { url = "https://files.pythonhosted.org/packages/bc/96/e7b533ea5740641dd62b07a790af5d9d8fec36000b8e2d0472bd7574105f/numpy-2.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:2f4f0215edb189048a3c03bd5b19345bdfa7b45a7a6f72ae5945d2a28272727f", size = 14184660, upload-time = "2025-07-24T20:28:39.522Z" }, + { url = "https://files.pythonhosted.org/packages/2b/53/102c6122db45a62aa20d1b18c9986f67e6b97e0d6fbc1ae13e3e4c84430c/numpy-2.3.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8b1224a734cd509f70816455c3cffe13a4f599b1bf7130f913ba0e2c0b2006c0", size = 5113382, upload-time = "2025-07-24T20:28:48.544Z" }, + { url = "https://files.pythonhosted.org/packages/2b/21/376257efcbf63e624250717e82b4fae93d60178f09eb03ed766dbb48ec9c/numpy-2.3.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3dcf02866b977a38ba3ec10215220609ab9667378a9e2150615673f3ffd6c73b", size = 6647258, upload-time = "2025-07-24T20:28:59.104Z" }, + { url = "https://files.pythonhosted.org/packages/91/ba/f4ebf257f08affa464fe6036e13f2bf9d4642a40228781dc1235da81be9f/numpy-2.3.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:572d5512df5470f50ada8d1972c5f1082d9a0b7aa5944db8084077570cf98370", size = 14281409, upload-time = "2025-07-24T20:40:30.298Z" }, + { url = "https://files.pythonhosted.org/packages/59/ef/f96536f1df42c668cbacb727a8c6da7afc9c05ece6d558927fb1722693e1/numpy-2.3.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8145dd6d10df13c559d1e4314df29695613575183fa2e2d11fac4c208c8a1f73", size = 16641317, upload-time = "2025-07-24T20:40:56.625Z" }, + { url = "https://files.pythonhosted.org/packages/f6/a7/af813a7b4f9a42f498dde8a4c6fcbff8100eed00182cc91dbaf095645f38/numpy-2.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:103ea7063fa624af04a791c39f97070bf93b96d7af7eb23530cd087dc8dbe9dc", size = 16056262, upload-time = "2025-07-24T20:41:20.797Z" }, + { url = "https://files.pythonhosted.org/packages/8b/5d/41c4ef8404caaa7f05ed1cfb06afe16a25895260eacbd29b4d84dff2920b/numpy-2.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc927d7f289d14f5e037be917539620603294454130b6de200091e23d27dc9be", size = 
18579342, upload-time = "2025-07-24T20:41:50.753Z" }, + { url = "https://files.pythonhosted.org/packages/a1/4f/9950e44c5a11636f4a3af6e825ec23003475cc9a466edb7a759ed3ea63bd/numpy-2.3.2-cp312-cp312-win32.whl", hash = "sha256:d95f59afe7f808c103be692175008bab926b59309ade3e6d25009e9a171f7036", size = 6320610, upload-time = "2025-07-24T20:42:01.551Z" }, + { url = "https://files.pythonhosted.org/packages/7c/2f/244643a5ce54a94f0a9a2ab578189c061e4a87c002e037b0829dd77293b6/numpy-2.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:9e196ade2400c0c737d93465327d1ae7c06c7cb8a1756121ebf54b06ca183c7f", size = 12786292, upload-time = "2025-07-24T20:42:20.738Z" }, + { url = "https://files.pythonhosted.org/packages/54/cd/7b5f49d5d78db7badab22d8323c1b6ae458fbf86c4fdfa194ab3cd4eb39b/numpy-2.3.2-cp312-cp312-win_arm64.whl", hash = "sha256:ee807923782faaf60d0d7331f5e86da7d5e3079e28b291973c545476c2b00d07", size = 10194071, upload-time = "2025-07-24T20:42:36.657Z" }, + { url = "https://files.pythonhosted.org/packages/1c/c0/c6bb172c916b00700ed3bf71cb56175fd1f7dbecebf8353545d0b5519f6c/numpy-2.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c8d9727f5316a256425892b043736d63e89ed15bbfe6556c5ff4d9d4448ff3b3", size = 20949074, upload-time = "2025-07-24T20:43:07.813Z" }, + { url = "https://files.pythonhosted.org/packages/20/4e/c116466d22acaf4573e58421c956c6076dc526e24a6be0903219775d862e/numpy-2.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:efc81393f25f14d11c9d161e46e6ee348637c0a1e8a54bf9dedc472a3fae993b", size = 14177311, upload-time = "2025-07-24T20:43:29.335Z" }, + { url = "https://files.pythonhosted.org/packages/78/45/d4698c182895af189c463fc91d70805d455a227261d950e4e0f1310c2550/numpy-2.3.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:dd937f088a2df683cbb79dda9a772b62a3e5a8a7e76690612c2737f38c6ef1b6", size = 5106022, upload-time = "2025-07-24T20:43:37.999Z" }, + { url = 
"https://files.pythonhosted.org/packages/9f/76/3e6880fef4420179309dba72a8c11f6166c431cf6dee54c577af8906f914/numpy-2.3.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:11e58218c0c46c80509186e460d79fbdc9ca1eb8d8aee39d8f2dc768eb781089", size = 6640135, upload-time = "2025-07-24T20:43:49.28Z" }, + { url = "https://files.pythonhosted.org/packages/34/fa/87ff7f25b3c4ce9085a62554460b7db686fef1e0207e8977795c7b7d7ba1/numpy-2.3.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5ad4ebcb683a1f99f4f392cc522ee20a18b2bb12a2c1c42c3d48d5a1adc9d3d2", size = 14278147, upload-time = "2025-07-24T20:44:10.328Z" }, + { url = "https://files.pythonhosted.org/packages/1d/0f/571b2c7a3833ae419fe69ff7b479a78d313581785203cc70a8db90121b9a/numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:938065908d1d869c7d75d8ec45f735a034771c6ea07088867f713d1cd3bbbe4f", size = 16635989, upload-time = "2025-07-24T20:44:34.88Z" }, + { url = "https://files.pythonhosted.org/packages/24/5a/84ae8dca9c9a4c592fe11340b36a86ffa9fd3e40513198daf8a97839345c/numpy-2.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:66459dccc65d8ec98cc7df61307b64bf9e08101f9598755d42d8ae65d9a7a6ee", size = 16053052, upload-time = "2025-07-24T20:44:58.872Z" }, + { url = "https://files.pythonhosted.org/packages/57/7c/e5725d99a9133b9813fcf148d3f858df98511686e853169dbaf63aec6097/numpy-2.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a7af9ed2aa9ec5950daf05bb11abc4076a108bd3c7db9aa7251d5f107079b6a6", size = 18577955, upload-time = "2025-07-24T20:45:26.714Z" }, + { url = "https://files.pythonhosted.org/packages/ae/11/7c546fcf42145f29b71e4d6f429e96d8d68e5a7ba1830b2e68d7418f0bbd/numpy-2.3.2-cp313-cp313-win32.whl", hash = "sha256:906a30249315f9c8e17b085cc5f87d3f369b35fedd0051d4a84686967bdbbd0b", size = 6311843, upload-time = "2025-07-24T20:49:24.444Z" }, + { url = 
"https://files.pythonhosted.org/packages/aa/6f/a428fd1cb7ed39b4280d057720fed5121b0d7754fd2a9768640160f5517b/numpy-2.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:c63d95dc9d67b676e9108fe0d2182987ccb0f11933c1e8959f42fa0da8d4fa56", size = 12782876, upload-time = "2025-07-24T20:49:43.227Z" }, + { url = "https://files.pythonhosted.org/packages/65/85/4ea455c9040a12595fb6c43f2c217257c7b52dd0ba332c6a6c1d28b289fe/numpy-2.3.2-cp313-cp313-win_arm64.whl", hash = "sha256:b05a89f2fb84d21235f93de47129dd4f11c16f64c87c33f5e284e6a3a54e43f2", size = 10192786, upload-time = "2025-07-24T20:49:59.443Z" }, + { url = "https://files.pythonhosted.org/packages/80/23/8278f40282d10c3f258ec3ff1b103d4994bcad78b0cba9208317f6bb73da/numpy-2.3.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4e6ecfeddfa83b02318f4d84acf15fbdbf9ded18e46989a15a8b6995dfbf85ab", size = 21047395, upload-time = "2025-07-24T20:45:58.821Z" }, + { url = "https://files.pythonhosted.org/packages/1f/2d/624f2ce4a5df52628b4ccd16a4f9437b37c35f4f8a50d00e962aae6efd7a/numpy-2.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:508b0eada3eded10a3b55725b40806a4b855961040180028f52580c4729916a2", size = 14300374, upload-time = "2025-07-24T20:46:20.207Z" }, + { url = "https://files.pythonhosted.org/packages/f6/62/ff1e512cdbb829b80a6bd08318a58698867bca0ca2499d101b4af063ee97/numpy-2.3.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:754d6755d9a7588bdc6ac47dc4ee97867271b17cee39cb87aef079574366db0a", size = 5228864, upload-time = "2025-07-24T20:46:30.58Z" }, + { url = "https://files.pythonhosted.org/packages/7d/8e/74bc18078fff03192d4032cfa99d5a5ca937807136d6f5790ce07ca53515/numpy-2.3.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:a9f66e7d2b2d7712410d3bc5684149040ef5f19856f20277cd17ea83e5006286", size = 6737533, upload-time = "2025-07-24T20:46:46.111Z" }, + { url = 
"https://files.pythonhosted.org/packages/19/ea/0731efe2c9073ccca5698ef6a8c3667c4cf4eea53fcdcd0b50140aba03bc/numpy-2.3.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de6ea4e5a65d5a90c7d286ddff2b87f3f4ad61faa3db8dabe936b34c2275b6f8", size = 14352007, upload-time = "2025-07-24T20:47:07.1Z" }, + { url = "https://files.pythonhosted.org/packages/cf/90/36be0865f16dfed20f4bc7f75235b963d5939707d4b591f086777412ff7b/numpy-2.3.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3ef07ec8cbc8fc9e369c8dcd52019510c12da4de81367d8b20bc692aa07573a", size = 16701914, upload-time = "2025-07-24T20:47:32.459Z" }, + { url = "https://files.pythonhosted.org/packages/94/30/06cd055e24cb6c38e5989a9e747042b4e723535758e6153f11afea88c01b/numpy-2.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:27c9f90e7481275c7800dc9c24b7cc40ace3fdb970ae4d21eaff983a32f70c91", size = 16132708, upload-time = "2025-07-24T20:47:58.129Z" }, + { url = "https://files.pythonhosted.org/packages/9a/14/ecede608ea73e58267fd7cb78f42341b3b37ba576e778a1a06baffbe585c/numpy-2.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:07b62978075b67eee4065b166d000d457c82a1efe726cce608b9db9dd66a73a5", size = 18651678, upload-time = "2025-07-24T20:48:25.402Z" }, + { url = "https://files.pythonhosted.org/packages/40/f3/2fe6066b8d07c3685509bc24d56386534c008b462a488b7f503ba82b8923/numpy-2.3.2-cp313-cp313t-win32.whl", hash = "sha256:c771cfac34a4f2c0de8e8c97312d07d64fd8f8ed45bc9f5726a7e947270152b5", size = 6441832, upload-time = "2025-07-24T20:48:37.181Z" }, + { url = "https://files.pythonhosted.org/packages/0b/ba/0937d66d05204d8f28630c9c60bc3eda68824abde4cf756c4d6aad03b0c6/numpy-2.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:72dbebb2dcc8305c431b2836bcc66af967df91be793d63a24e3d9b741374c450", size = 12927049, upload-time = "2025-07-24T20:48:56.24Z" }, + { url = 
"https://files.pythonhosted.org/packages/e9/ed/13542dd59c104d5e654dfa2ac282c199ba64846a74c2c4bcdbc3a0f75df1/numpy-2.3.2-cp313-cp313t-win_arm64.whl", hash = "sha256:72c6df2267e926a6d5286b0a6d556ebe49eae261062059317837fda12ddf0c1a", size = 10262935, upload-time = "2025-07-24T20:49:13.136Z" }, + { url = "https://files.pythonhosted.org/packages/c9/7c/7659048aaf498f7611b783e000c7268fcc4dcf0ce21cd10aad7b2e8f9591/numpy-2.3.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:448a66d052d0cf14ce9865d159bfc403282c9bc7bb2a31b03cc18b651eca8b1a", size = 20950906, upload-time = "2025-07-24T20:50:30.346Z" }, + { url = "https://files.pythonhosted.org/packages/80/db/984bea9d4ddf7112a04cfdfb22b1050af5757864cfffe8e09e44b7f11a10/numpy-2.3.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:546aaf78e81b4081b2eba1d105c3b34064783027a06b3ab20b6eba21fb64132b", size = 14185607, upload-time = "2025-07-24T20:50:51.923Z" }, + { url = "https://files.pythonhosted.org/packages/e4/76/b3d6f414f4eca568f469ac112a3b510938d892bc5a6c190cb883af080b77/numpy-2.3.2-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:87c930d52f45df092f7578889711a0768094debf73cfcde105e2d66954358125", size = 5114110, upload-time = "2025-07-24T20:51:01.041Z" }, + { url = "https://files.pythonhosted.org/packages/9e/d2/6f5e6826abd6bca52392ed88fe44a4b52aacb60567ac3bc86c67834c3a56/numpy-2.3.2-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:8dc082ea901a62edb8f59713c6a7e28a85daddcb67454c839de57656478f5b19", size = 6642050, upload-time = "2025-07-24T20:51:11.64Z" }, + { url = "https://files.pythonhosted.org/packages/c4/43/f12b2ade99199e39c73ad182f103f9d9791f48d885c600c8e05927865baf/numpy-2.3.2-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:af58de8745f7fa9ca1c0c7c943616c6fe28e75d0c81f5c295810e3c83b5be92f", size = 14296292, upload-time = "2025-07-24T20:51:33.488Z" }, + { url = 
"https://files.pythonhosted.org/packages/5d/f9/77c07d94bf110a916b17210fac38680ed8734c236bfed9982fd8524a7b47/numpy-2.3.2-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed5527c4cf10f16c6d0b6bee1f89958bccb0ad2522c8cadc2efd318bcd545f5", size = 16638913, upload-time = "2025-07-24T20:51:58.517Z" }, + { url = "https://files.pythonhosted.org/packages/9b/d1/9d9f2c8ea399cc05cfff8a7437453bd4e7d894373a93cdc46361bbb49a7d/numpy-2.3.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:095737ed986e00393ec18ec0b21b47c22889ae4b0cd2d5e88342e08b01141f58", size = 16071180, upload-time = "2025-07-24T20:52:22.827Z" }, + { url = "https://files.pythonhosted.org/packages/4c/41/82e2c68aff2a0c9bf315e47d61951099fed65d8cb2c8d9dc388cb87e947e/numpy-2.3.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5e40e80299607f597e1a8a247ff8d71d79c5b52baa11cc1cce30aa92d2da6e0", size = 18576809, upload-time = "2025-07-24T20:52:51.015Z" }, + { url = "https://files.pythonhosted.org/packages/14/14/4b4fd3efb0837ed252d0f583c5c35a75121038a8c4e065f2c259be06d2d8/numpy-2.3.2-cp314-cp314-win32.whl", hash = "sha256:7d6e390423cc1f76e1b8108c9b6889d20a7a1f59d9a60cac4a050fa734d6c1e2", size = 6366410, upload-time = "2025-07-24T20:56:44.949Z" }, + { url = "https://files.pythonhosted.org/packages/11/9e/b4c24a6b8467b61aced5c8dc7dcfce23621baa2e17f661edb2444a418040/numpy-2.3.2-cp314-cp314-win_amd64.whl", hash = "sha256:b9d0878b21e3918d76d2209c924ebb272340da1fb51abc00f986c258cd5e957b", size = 12918821, upload-time = "2025-07-24T20:57:06.479Z" }, + { url = "https://files.pythonhosted.org/packages/0e/0f/0dc44007c70b1007c1cef86b06986a3812dd7106d8f946c09cfa75782556/numpy-2.3.2-cp314-cp314-win_arm64.whl", hash = "sha256:2738534837c6a1d0c39340a190177d7d66fdf432894f469728da901f8f6dc910", size = 10477303, upload-time = "2025-07-24T20:57:22.879Z" }, + { url = 
"https://files.pythonhosted.org/packages/8b/3e/075752b79140b78ddfc9c0a1634d234cfdbc6f9bbbfa6b7504e445ad7d19/numpy-2.3.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:4d002ecf7c9b53240be3bb69d80f86ddbd34078bae04d87be81c1f58466f264e", size = 21047524, upload-time = "2025-07-24T20:53:22.086Z" }, + { url = "https://files.pythonhosted.org/packages/fe/6d/60e8247564a72426570d0e0ea1151b95ce5bd2f1597bb878a18d32aec855/numpy-2.3.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:293b2192c6bcce487dbc6326de5853787f870aeb6c43f8f9c6496db5b1781e45", size = 14300519, upload-time = "2025-07-24T20:53:44.053Z" }, + { url = "https://files.pythonhosted.org/packages/4d/73/d8326c442cd428d47a067070c3ac6cc3b651a6e53613a1668342a12d4479/numpy-2.3.2-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:0a4f2021a6da53a0d580d6ef5db29947025ae8b35b3250141805ea9a32bbe86b", size = 5228972, upload-time = "2025-07-24T20:53:53.81Z" }, + { url = "https://files.pythonhosted.org/packages/34/2e/e71b2d6dad075271e7079db776196829019b90ce3ece5c69639e4f6fdc44/numpy-2.3.2-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:9c144440db4bf3bb6372d2c3e49834cc0ff7bb4c24975ab33e01199e645416f2", size = 6737439, upload-time = "2025-07-24T20:54:04.742Z" }, + { url = "https://files.pythonhosted.org/packages/15/b0/d004bcd56c2c5e0500ffc65385eb6d569ffd3363cb5e593ae742749b2daa/numpy-2.3.2-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f92d6c2a8535dc4fe4419562294ff957f83a16ebdec66df0805e473ffaad8bd0", size = 14352479, upload-time = "2025-07-24T20:54:25.819Z" }, + { url = "https://files.pythonhosted.org/packages/11/e3/285142fcff8721e0c99b51686426165059874c150ea9ab898e12a492e291/numpy-2.3.2-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cefc2219baa48e468e3db7e706305fcd0c095534a192a08f31e98d83a7d45fb0", size = 16702805, upload-time = "2025-07-24T20:54:50.814Z" }, + { url = 
"https://files.pythonhosted.org/packages/33/c3/33b56b0e47e604af2c7cd065edca892d180f5899599b76830652875249a3/numpy-2.3.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:76c3e9501ceb50b2ff3824c3589d5d1ab4ac857b0ee3f8f49629d0de55ecf7c2", size = 16133830, upload-time = "2025-07-24T20:55:17.306Z" }, + { url = "https://files.pythonhosted.org/packages/6e/ae/7b1476a1f4d6a48bc669b8deb09939c56dd2a439db1ab03017844374fb67/numpy-2.3.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:122bf5ed9a0221b3419672493878ba4967121514b1d7d4656a7580cd11dddcbf", size = 18652665, upload-time = "2025-07-24T20:55:46.665Z" }, + { url = "https://files.pythonhosted.org/packages/14/ba/5b5c9978c4bb161034148ade2de9db44ec316fab89ce8c400db0e0c81f86/numpy-2.3.2-cp314-cp314t-win32.whl", hash = "sha256:6f1ae3dcb840edccc45af496f312528c15b1f79ac318169d094e85e4bb35fdf1", size = 6514777, upload-time = "2025-07-24T20:55:57.66Z" }, + { url = "https://files.pythonhosted.org/packages/eb/46/3dbaf0ae7c17cdc46b9f662c56da2054887b8d9e737c1476f335c83d33db/numpy-2.3.2-cp314-cp314t-win_amd64.whl", hash = "sha256:087ffc25890d89a43536f75c5fe8770922008758e8eeeef61733957041ed2f9b", size = 13111856, upload-time = "2025-07-24T20:56:17.318Z" }, + { url = "https://files.pythonhosted.org/packages/c1/9e/1652778bce745a67b5fe05adde60ed362d38eb17d919a540e813d30f6874/numpy-2.3.2-cp314-cp314t-win_arm64.whl", hash = "sha256:092aeb3449833ea9c0bf0089d70c29ae480685dd2377ec9cdbbb620257f84631", size = 10544226, upload-time = "2025-07-24T20:56:34.509Z" }, +] + +[[package]] +name = "nvidia-cublas-cu12" +version = "12.6.4.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/eb/ff4b8c503fa1f1796679dce648854d58751982426e4e4b37d6fce49d259c/nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:08ed2686e9875d01b58e3cb379c6896df8e76c75e0d4a7f7dace3d7b6d9ef8eb", size = 393138322, upload-time = "2024-11-20T17:40:25.65Z" }, +] 
+ +[[package]] +name = "nvidia-cuda-cupti-cu12" +version = "12.6.80" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/49/60/7b6497946d74bcf1de852a21824d63baad12cd417db4195fc1bfe59db953/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6768bad6cab4f19e8292125e5f1ac8aa7d1718704012a0e3272a6f61c4bce132", size = 8917980, upload-time = "2024-11-20T17:36:04.019Z" }, + { url = "https://files.pythonhosted.org/packages/a5/24/120ee57b218d9952c379d1e026c4479c9ece9997a4fb46303611ee48f038/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a3eff6cdfcc6a4c35db968a06fcadb061cbc7d6dde548609a941ff8701b98b73", size = 8917972, upload-time = "2024-10-01T16:58:06.036Z" }, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.6.77" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/75/2e/46030320b5a80661e88039f59060d1790298b4718944a65a7f2aeda3d9e9/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:35b0cc6ee3a9636d5409133e79273ce1f3fd087abb0532d2d2e8fff1fe9efc53", size = 23650380, upload-time = "2024-10-01T17:00:14.643Z" }, +] + +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.6.77" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/23/e717c5ac26d26cf39a27fbc076240fad2e3b817e5889d671b67f4f9f49c5/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ba3b56a4f896141e25e19ab287cd71e52a6a0f4b29d0d31609f60e3b4d5219b7", size = 897690, upload-time = "2024-11-20T17:35:30.697Z" }, + { url = "https://files.pythonhosted.org/packages/f0/62/65c05e161eeddbafeca24dc461f47de550d9fa8a7e04eb213e32b55cfd99/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = 
"sha256:a84d15d5e1da416dd4774cb42edf5e954a3e60cc945698dc1d5be02321c44dc8", size = 897678, upload-time = "2024-10-01T16:57:33.821Z" }, +] + +[[package]] +name = "nvidia-cudnn-cu12" +version = "9.5.1.17" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12", marker = "sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/78/4535c9c7f859a64781e43c969a3a7e84c54634e319a996d43ef32ce46f83/nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:30ac3869f6db17d170e0e556dd6cc5eee02647abc31ca856634d5a40f82c15b2", size = 570988386, upload-time = "2024-10-25T19:54:26.39Z" }, +] + +[[package]] +name = "nvidia-cufft-cu12" +version = "11.3.0.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/16/73727675941ab8e6ffd86ca3a4b7b47065edcca7a997920b831f8147c99d/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ccba62eb9cef5559abd5e0d54ceed2d9934030f51163df018532142a8ec533e5", size = 200221632, upload-time = "2024-11-20T17:41:32.357Z" }, + { url = "https://files.pythonhosted.org/packages/60/de/99ec247a07ea40c969d904fc14f3a356b3e2a704121675b75c366b694ee1/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.whl", hash = "sha256:768160ac89f6f7b459bee747e8d175dbf53619cfe74b2a5636264163138013ca", size = 200221622, upload-time = "2024-10-01T17:03:58.79Z" }, +] + +[[package]] +name = "nvidia-cufile-cu12" +version = "1.11.1.6" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b2/66/cc9876340ac68ae71b15c743ddb13f8b30d5244af344ec8322b449e35426/nvidia_cufile_cu12-1.11.1.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc23469d1c7e52ce6c1d55253273d32c565dd22068647f3aa59b3c6b005bf159", 
size = 1142103, upload-time = "2024-11-20T17:42:11.83Z" }, +] + +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.7.77" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/73/1b/44a01c4e70933637c93e6e1a8063d1e998b50213a6b65ac5a9169c47e98e/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a42cd1344297f70b9e39a1e4f467a4e1c10f1da54ff7a85c12197f6c652c8bdf", size = 56279010, upload-time = "2024-11-20T17:42:50.958Z" }, + { url = "https://files.pythonhosted.org/packages/4a/aa/2c7ff0b5ee02eaef890c0ce7d4f74bc30901871c5e45dee1ae6d0083cd80/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:99f1a32f1ac2bd134897fc7a203f779303261268a65762a623bf30cc9fe79117", size = 56279000, upload-time = "2024-10-01T17:04:45.274Z" }, +] + +[[package]] +name = "nvidia-cusolver-cu12" +version = "11.7.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12", marker = "sys_platform != 'win32'" }, + { name = "nvidia-cusparse-cu12", marker = "sys_platform != 'win32'" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/f0/6e/c2cf12c9ff8b872e92b4a5740701e51ff17689c4d726fca91875b07f655d/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e9e49843a7707e42022babb9bcfa33c29857a93b88020c4e4434656a655b698c", size = 158229790, upload-time = "2024-11-20T17:43:43.211Z" }, + { url = "https://files.pythonhosted.org/packages/9f/81/baba53585da791d043c10084cf9553e074548408e04ae884cfe9193bd484/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:6cf28f17f64107a0c4d7802be5ff5537b2130bfc112f25d5a30df227058ca0e6", size = 158229780, upload-time = "2024-10-01T17:05:39.875Z" }, +] + +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.5.4.2" +source = { 
registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/1e/b8b7c2f4099a37b96af5c9bb158632ea9e5d9d27d7391d7eb8fc45236674/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7556d9eca156e18184b94947ade0fba5bb47d69cec46bf8660fd2c71a4b48b73", size = 216561367, upload-time = "2024-11-20T17:44:54.824Z" }, + { url = "https://files.pythonhosted.org/packages/43/ac/64c4316ba163e8217a99680c7605f779accffc6a4bcd0c778c12948d3707/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:23749a6571191a215cb74d1cdbff4a86e7b19f1200c071b3fcf844a5bea23a2f", size = 216561357, upload-time = "2024-10-01T17:06:29.861Z" }, +] + +[[package]] +name = "nvidia-cusparselt-cu12" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/9a/72ef35b399b0e183bc2e8f6f558036922d453c4d8237dab26c666a04244b/nvidia_cusparselt_cu12-0.6.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e5c8a26c36445dd2e6812f1177978a24e2d37cacce7e090f297a688d1ec44f46", size = 156785796, upload-time = "2024-10-15T21:29:17.709Z" }, +] + +[[package]] +name = "nvidia-nccl-cu12" +version = "2.26.2" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/ca/f42388aed0fddd64ade7493dbba36e1f534d4e6fdbdd355c6a90030ae028/nvidia_nccl_cu12-2.26.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:694cf3879a206553cc9d7dbda76b13efaf610fdb70a50cba303de1b0d1530ac6", size = 201319755, upload-time = "2025-03-13T00:29:55.296Z" }, +] + +[[package]] +name = "nvidia-nvjitlink-cu12" +version = "12.6.85" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/9d/d7/c5383e47c7e9bf1c99d5bd2a8c935af2b6d705ad831a7ec5c97db4d82f4f/nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:eedc36df9e88b682efe4309aa16b5b4e78c2407eac59e8c10a6a47535164369a", size = 19744971, upload-time = "2024-11-20T17:46:53.366Z" }, +] + +[[package]] +name = "nvidia-nvtx-cu12" +version = "12.6.77" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/56/9a/fff8376f8e3d084cd1530e1ef7b879bb7d6d265620c95c1b322725c694f4/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b90bed3df379fa79afbd21be8e04a0314336b8ae16768b58f2d34cb1d04cd7d2", size = 89276, upload-time = "2024-11-20T17:38:27.621Z" }, + { url = "https://files.pythonhosted.org/packages/9e/4e/0d0c945463719429b7bd21dece907ad0bde437a2ff12b9b12fee94722ab0/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:6574241a3ec5fdc9334353ab8c479fe75841dbe8f4532a8fc97ce63503330ba1", size = 89265, upload-time = "2024-10-01T17:00:38.172Z" }, +] + +[[package]] +name = "opencensus" +version = "0.11.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-api-core" }, + { name = "opencensus-context" }, + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/15/a7/a46dcffa1b63084f9f17fe3c8cb20724c4c8f91009fd0b2cfdb27d5d2b35/opencensus-0.11.4.tar.gz", hash = "sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2", size = 64966, upload-time = "2024-01-03T18:04:07.085Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/ed/9fbdeb23a09e430d87b7d72d430484b88184633dc50f6bfb792354b6f661/opencensus-0.11.4-py2.py3-none-any.whl", hash = "sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864", size = 128225, upload-time = "2024-01-03T18:04:05.127Z" }, +] + +[[package]] +name = "opencensus-context" 
+version = "0.1.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4c/96/3b6f638f6275a8abbd45e582448723bffa29c1fb426721dedb5c72f7d056/opencensus-context-0.1.3.tar.gz", hash = "sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c", size = 4066, upload-time = "2022-08-03T22:20:22.359Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/10/68/162c97ea78c957d68ecf78a5c5041d2e25bd5562bdf5d89a6cbf7f8429bf/opencensus_context-0.1.3-py2.py3-none-any.whl", hash = "sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039", size = 5060, upload-time = "2022-08-03T22:20:20.352Z" }, +] + +[[package]] +name = "opentelemetry-api" +version = "1.36.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "importlib-metadata" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/27/d2/c782c88b8afbf961d6972428821c302bd1e9e7bc361352172f0ca31296e2/opentelemetry_api-1.36.0.tar.gz", hash = "sha256:9a72572b9c416d004d492cbc6e61962c0501eaf945ece9b5a0f56597d8348aa0", size = 64780, upload-time = "2025-07-29T15:12:06.02Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/ee/6b08dde0a022c463b88f55ae81149584b125a42183407dc1045c486cc870/opentelemetry_api-1.36.0-py3-none-any.whl", hash = "sha256:02f20bcacf666e1333b6b1f04e647dc1d5111f86b8e510238fcc56d7762cda8c", size = 65564, upload-time = "2025-07-29T15:11:47.998Z" }, +] + +[[package]] +name = "opentelemetry-exporter-prometheus" +version = "0.57b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-sdk" }, + { name = "prometheus-client" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/d8/5f04c6d51c0823c3d8ac973a2a38db6fcf2d040ca3f08fc66b3c14b6e164/opentelemetry_exporter_prometheus-0.57b0.tar.gz", hash = 
"sha256:9eb15bdc189235cf03c3f93abf56f8ff0ab57a493a189263bd7fe77a4249e689", size = 14906, upload-time = "2025-07-29T15:12:09.96Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/1c/40fb93a7b7e495985393bbc734104d5d20e470811644dd56c2402d683739/opentelemetry_exporter_prometheus-0.57b0-py3-none-any.whl", hash = "sha256:c5b893d1cdd593fb022af2c7de3258c2d5a4d04402ae80d9fa35675fed77f05c", size = 12922, upload-time = "2025-07-29T15:11:54.055Z" }, +] + +[[package]] +name = "opentelemetry-proto" +version = "1.36.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fd/02/f6556142301d136e3b7e95ab8ea6a5d9dc28d879a99f3dd673b5f97dca06/opentelemetry_proto-1.36.0.tar.gz", hash = "sha256:0f10b3c72f74c91e0764a5ec88fd8f1c368ea5d9c64639fb455e2854ef87dd2f", size = 46152, upload-time = "2025-07-29T15:12:15.717Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/57/3361e06136225be8180e879199caea520f38026f8071366241ac458beb8d/opentelemetry_proto-1.36.0-py3-none-any.whl", hash = "sha256:151b3bf73a09f94afc658497cf77d45a565606f62ce0c17acb08cd9937ca206e", size = 72537, upload-time = "2025-07-29T15:12:02.243Z" }, +] + +[[package]] +name = "opentelemetry-sdk" +version = "1.36.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4c/85/8567a966b85a2d3f971c4d42f781c305b2b91c043724fa08fd37d158e9dc/opentelemetry_sdk-1.36.0.tar.gz", hash = "sha256:19c8c81599f51b71670661ff7495c905d8fdf6976e41622d5245b791b06fa581", size = 162557, upload-time = "2025-07-29T15:12:16.76Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0b/59/7bed362ad1137ba5886dac8439e84cd2df6d087be7c09574ece47ae9b22c/opentelemetry_sdk-1.36.0-py3-none-any.whl", hash = 
"sha256:19fe048b42e98c5c1ffe85b569b7073576ad4ce0bcb6e9b4c6a39e890a6c45fb", size = 119995, upload-time = "2025-07-29T15:12:03.181Z" }, +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.57b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7e/31/67dfa252ee88476a29200b0255bda8dfc2cf07b56ad66dc9a6221f7dc787/opentelemetry_semantic_conventions-0.57b0.tar.gz", hash = "sha256:609a4a79c7891b4620d64c7aac6898f872d790d75f22019913a660756f27ff32", size = 124225, upload-time = "2025-07-29T15:12:17.873Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/75/7d591371c6c39c73de5ce5da5a2cc7b72d1d1cd3f8f4638f553c01c37b11/opentelemetry_semantic_conventions-0.57b0-py3-none-any.whl", hash = "sha256:757f7e76293294f124c827e514c2a3144f191ef175b069ce8d1211e1e38e9e78", size = 201627, upload-time = "2025-07-29T15:12:04.174Z" }, +] + +[[package]] +name = "packaging" +version = "24.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d0/63/68dbb6eb2de9cb10ee4c9c14a0148804425e13c4fb20d61cce69f53106da/packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f", size = 163950, upload-time = "2024-11-08T09:47:47.202Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451, upload-time = "2024-11-08T09:47:44.722Z" }, +] + +[[package]] +name = "pandas" +version = "2.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "python-dateutil" }, + { name = "pytz" }, + { name = "tzdata" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/d1/6f/75aa71f8a14267117adeeed5d21b204770189c0a0025acbdc03c337b28fc/pandas-2.3.1.tar.gz", hash = "sha256:0a95b9ac964fe83ce317827f80304d37388ea77616b1425f0ae41c9d2d0d7bb2", size = 4487493, upload-time = "2025-07-07T19:20:04.079Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/de/b8445e0f5d217a99fe0eeb2f4988070908979bec3587c0633e5428ab596c/pandas-2.3.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:689968e841136f9e542020698ee1c4fbe9caa2ed2213ae2388dc7b81721510d3", size = 11588172, upload-time = "2025-07-07T19:18:52.054Z" }, + { url = "https://files.pythonhosted.org/packages/1e/e0/801cdb3564e65a5ac041ab99ea6f1d802a6c325bb6e58c79c06a3f1cd010/pandas-2.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:025e92411c16cbe5bb2a4abc99732a6b132f439b8aab23a59fa593eb00704232", size = 10717365, upload-time = "2025-07-07T19:18:54.785Z" }, + { url = "https://files.pythonhosted.org/packages/51/a5/c76a8311833c24ae61a376dbf360eb1b1c9247a5d9c1e8b356563b31b80c/pandas-2.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b7ff55f31c4fcb3e316e8f7fa194566b286d6ac430afec0d461163312c5841e", size = 11280411, upload-time = "2025-07-07T19:18:57.045Z" }, + { url = "https://files.pythonhosted.org/packages/da/01/e383018feba0a1ead6cf5fe8728e5d767fee02f06a3d800e82c489e5daaf/pandas-2.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dcb79bf373a47d2a40cf7232928eb7540155abbc460925c2c96d2d30b006eb4", size = 11988013, upload-time = "2025-07-07T19:18:59.771Z" }, + { url = "https://files.pythonhosted.org/packages/5b/14/cec7760d7c9507f11c97d64f29022e12a6cc4fc03ac694535e89f88ad2ec/pandas-2.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:56a342b231e8862c96bdb6ab97170e203ce511f4d0429589c8ede1ee8ece48b8", size = 12767210, upload-time = "2025-07-07T19:19:02.944Z" }, + { url = 
"https://files.pythonhosted.org/packages/50/b9/6e2d2c6728ed29fb3d4d4d302504fb66f1a543e37eb2e43f352a86365cdf/pandas-2.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ca7ed14832bce68baef331f4d7f294411bed8efd032f8109d690df45e00c4679", size = 13440571, upload-time = "2025-07-07T19:19:06.82Z" }, + { url = "https://files.pythonhosted.org/packages/80/a5/3a92893e7399a691bad7664d977cb5e7c81cf666c81f89ea76ba2bff483d/pandas-2.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:ac942bfd0aca577bef61f2bc8da8147c4ef6879965ef883d8e8d5d2dc3e744b8", size = 10987601, upload-time = "2025-07-07T19:19:09.589Z" }, + { url = "https://files.pythonhosted.org/packages/32/ed/ff0a67a2c5505e1854e6715586ac6693dd860fbf52ef9f81edee200266e7/pandas-2.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9026bd4a80108fac2239294a15ef9003c4ee191a0f64b90f170b40cfb7cf2d22", size = 11531393, upload-time = "2025-07-07T19:19:12.245Z" }, + { url = "https://files.pythonhosted.org/packages/c7/db/d8f24a7cc9fb0972adab0cc80b6817e8bef888cfd0024eeb5a21c0bb5c4a/pandas-2.3.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6de8547d4fdb12421e2d047a2c446c623ff4c11f47fddb6b9169eb98ffba485a", size = 10668750, upload-time = "2025-07-07T19:19:14.612Z" }, + { url = "https://files.pythonhosted.org/packages/0f/b0/80f6ec783313f1e2356b28b4fd8d2148c378370045da918c73145e6aab50/pandas-2.3.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:782647ddc63c83133b2506912cc6b108140a38a37292102aaa19c81c83db2928", size = 11342004, upload-time = "2025-07-07T19:19:16.857Z" }, + { url = "https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ba6aff74075311fc88504b1db890187a3cd0f887a5b10f5525f8e2ef55bfdb9", size = 12050869, upload-time = "2025-07-07T19:19:19.265Z" }, + { url = 
"https://files.pythonhosted.org/packages/55/79/20d746b0a96c67203a5bee5fb4e00ac49c3e8009a39e1f78de264ecc5729/pandas-2.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e5635178b387bd2ba4ac040f82bc2ef6e6b500483975c4ebacd34bec945fda12", size = 12750218, upload-time = "2025-07-07T19:19:21.547Z" }, + { url = "https://files.pythonhosted.org/packages/7c/0f/145c8b41e48dbf03dd18fdd7f24f8ba95b8254a97a3379048378f33e7838/pandas-2.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6f3bf5ec947526106399a9e1d26d40ee2b259c66422efdf4de63c848492d91bb", size = 13416763, upload-time = "2025-07-07T19:19:23.939Z" }, + { url = "https://files.pythonhosted.org/packages/b2/c0/54415af59db5cdd86a3d3bf79863e8cc3fa9ed265f0745254061ac09d5f2/pandas-2.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:1c78cf43c8fde236342a1cb2c34bcff89564a7bfed7e474ed2fffa6aed03a956", size = 10987482, upload-time = "2025-07-07T19:19:42.699Z" }, + { url = "https://files.pythonhosted.org/packages/48/64/2fd2e400073a1230e13b8cd604c9bc95d9e3b962e5d44088ead2e8f0cfec/pandas-2.3.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:8dfc17328e8da77be3cf9f47509e5637ba8f137148ed0e9b5241e1baf526e20a", size = 12029159, upload-time = "2025-07-07T19:19:26.362Z" }, + { url = "https://files.pythonhosted.org/packages/d8/0a/d84fd79b0293b7ef88c760d7dca69828d867c89b6d9bc52d6a27e4d87316/pandas-2.3.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:ec6c851509364c59a5344458ab935e6451b31b818be467eb24b0fe89bd05b6b9", size = 11393287, upload-time = "2025-07-07T19:19:29.157Z" }, + { url = "https://files.pythonhosted.org/packages/50/ae/ff885d2b6e88f3c7520bb74ba319268b42f05d7e583b5dded9837da2723f/pandas-2.3.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:911580460fc4884d9b05254b38a6bfadddfcc6aaef856fb5859e7ca202e45275", size = 11309381, upload-time = "2025-07-07T19:19:31.436Z" }, + { url = 
"https://files.pythonhosted.org/packages/85/86/1fa345fc17caf5d7780d2699985c03dbe186c68fee00b526813939062bb0/pandas-2.3.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f4d6feeba91744872a600e6edbbd5b033005b431d5ae8379abee5bcfa479fab", size = 11883998, upload-time = "2025-07-07T19:19:34.267Z" }, + { url = "https://files.pythonhosted.org/packages/81/aa/e58541a49b5e6310d89474333e994ee57fea97c8aaa8fc7f00b873059bbf/pandas-2.3.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fe37e757f462d31a9cd7580236a82f353f5713a80e059a29753cf938c6775d96", size = 12704705, upload-time = "2025-07-07T19:19:36.856Z" }, + { url = "https://files.pythonhosted.org/packages/d5/f9/07086f5b0f2a19872554abeea7658200824f5835c58a106fa8f2ae96a46c/pandas-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444", size = 13189044, upload-time = "2025-07-07T19:19:39.999Z" }, +] + +[[package]] +name = "parso" +version = "0.8.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/66/94/68e2e17afaa9169cf6412ab0f28623903be73d1b32e208d9e8e541bb086d/parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d", size = 400609, upload-time = "2024-04-05T09:43:55.897Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650, upload-time = "2024-04-05T09:43:53.299Z" }, +] + +[[package]] +name = "pexpect" +version = "4.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ptyprocess", marker = "sys_platform != 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = 
"sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" }, +] + +[[package]] +name = "pillow" +version = "11.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/d0d6dea55cd152ce3d6767bb38a8fc10e33796ba4ba210cbab9354b6d238/pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523", size = 47113069, upload-time = "2025-07-01T09:16:30.666Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/fe/1bc9b3ee13f68487a99ac9529968035cca2f0a51ec36892060edcc51d06a/pillow-11.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdae223722da47b024b867c1ea0be64e0df702c5e0a60e27daad39bf960dd1e4", size = 5278800, upload-time = "2025-07-01T09:14:17.648Z" }, + { url = "https://files.pythonhosted.org/packages/2c/32/7e2ac19b5713657384cec55f89065fb306b06af008cfd87e572035b27119/pillow-11.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:921bd305b10e82b4d1f5e802b6850677f965d8394203d182f078873851dada69", size = 4686296, upload-time = "2025-07-01T09:14:19.828Z" }, + { url = "https://files.pythonhosted.org/packages/8e/1e/b9e12bbe6e4c2220effebc09ea0923a07a6da1e1f1bfbc8d7d29a01ce32b/pillow-11.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb76541cba2f958032d79d143b98a3a6b3ea87f0959bbe256c0b5e416599fd5d", size = 5871726, upload-time = "2025-07-03T13:10:04.448Z" }, + { url = "https://files.pythonhosted.org/packages/8d/33/e9200d2bd7ba00dc3ddb78df1198a6e80d7669cce6c2bdbeb2530a74ec58/pillow-11.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", 
hash = "sha256:67172f2944ebba3d4a7b54f2e95c786a3a50c21b88456329314caaa28cda70f6", size = 7644652, upload-time = "2025-07-03T13:10:10.391Z" }, + { url = "https://files.pythonhosted.org/packages/41/f1/6f2427a26fc683e00d985bc391bdd76d8dd4e92fac33d841127eb8fb2313/pillow-11.3.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97f07ed9f56a3b9b5f49d3661dc9607484e85c67e27f3e8be2c7d28ca032fec7", size = 5977787, upload-time = "2025-07-01T09:14:21.63Z" }, + { url = "https://files.pythonhosted.org/packages/e4/c9/06dd4a38974e24f932ff5f98ea3c546ce3f8c995d3f0985f8e5ba48bba19/pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:676b2815362456b5b3216b4fd5bd89d362100dc6f4945154ff172e206a22c024", size = 6645236, upload-time = "2025-07-01T09:14:23.321Z" }, + { url = "https://files.pythonhosted.org/packages/40/e7/848f69fb79843b3d91241bad658e9c14f39a32f71a301bcd1d139416d1be/pillow-11.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3e184b2f26ff146363dd07bde8b711833d7b0202e27d13540bfe2e35a323a809", size = 6086950, upload-time = "2025-07-01T09:14:25.237Z" }, + { url = "https://files.pythonhosted.org/packages/0b/1a/7cff92e695a2a29ac1958c2a0fe4c0b2393b60aac13b04a4fe2735cad52d/pillow-11.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6be31e3fc9a621e071bc17bb7de63b85cbe0bfae91bb0363c893cbe67247780d", size = 6723358, upload-time = "2025-07-01T09:14:27.053Z" }, + { url = "https://files.pythonhosted.org/packages/26/7d/73699ad77895f69edff76b0f332acc3d497f22f5d75e5360f78cbcaff248/pillow-11.3.0-cp312-cp312-win32.whl", hash = "sha256:7b161756381f0918e05e7cb8a371fff367e807770f8fe92ecb20d905d0e1c149", size = 6275079, upload-time = "2025-07-01T09:14:30.104Z" }, + { url = "https://files.pythonhosted.org/packages/8c/ce/e7dfc873bdd9828f3b6e5c2bbb74e47a98ec23cc5c74fc4e54462f0d9204/pillow-11.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:a6444696fce635783440b7f7a9fc24b3ad10a9ea3f0ab66c5905be1c19ccf17d", size = 6986324, 
upload-time = "2025-07-01T09:14:31.899Z" }, + { url = "https://files.pythonhosted.org/packages/16/8f/b13447d1bf0b1f7467ce7d86f6e6edf66c0ad7cf44cf5c87a37f9bed9936/pillow-11.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:2aceea54f957dd4448264f9bf40875da0415c83eb85f55069d89c0ed436e3542", size = 2423067, upload-time = "2025-07-01T09:14:33.709Z" }, + { url = "https://files.pythonhosted.org/packages/1e/93/0952f2ed8db3a5a4c7a11f91965d6184ebc8cd7cbb7941a260d5f018cd2d/pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:1c627742b539bba4309df89171356fcb3cc5a9178355b2727d1b74a6cf155fbd", size = 2128328, upload-time = "2025-07-01T09:14:35.276Z" }, + { url = "https://files.pythonhosted.org/packages/4b/e8/100c3d114b1a0bf4042f27e0f87d2f25e857e838034e98ca98fe7b8c0a9c/pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:30b7c02f3899d10f13d7a48163c8969e4e653f8b43416d23d13d1bbfdc93b9f8", size = 2170652, upload-time = "2025-07-01T09:14:37.203Z" }, + { url = "https://files.pythonhosted.org/packages/aa/86/3f758a28a6e381758545f7cdb4942e1cb79abd271bea932998fc0db93cb6/pillow-11.3.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:7859a4cc7c9295f5838015d8cc0a9c215b77e43d07a25e460f35cf516df8626f", size = 2227443, upload-time = "2025-07-01T09:14:39.344Z" }, + { url = "https://files.pythonhosted.org/packages/01/f4/91d5b3ffa718df2f53b0dc109877993e511f4fd055d7e9508682e8aba092/pillow-11.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ec1ee50470b0d050984394423d96325b744d55c701a439d2bd66089bff963d3c", size = 5278474, upload-time = "2025-07-01T09:14:41.843Z" }, + { url = "https://files.pythonhosted.org/packages/f9/0e/37d7d3eca6c879fbd9dba21268427dffda1ab00d4eb05b32923d4fbe3b12/pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7db51d222548ccfd274e4572fdbf3e810a5e66b00608862f947b163e613b67dd", size = 4686038, upload-time = "2025-07-01T09:14:44.008Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/b0/3426e5c7f6565e752d81221af9d3676fdbb4f352317ceafd42899aaf5d8a/pillow-11.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2d6fcc902a24ac74495df63faad1884282239265c6839a0a6416d33faedfae7e", size = 5864407, upload-time = "2025-07-03T13:10:15.628Z" }, + { url = "https://files.pythonhosted.org/packages/fc/c1/c6c423134229f2a221ee53f838d4be9d82bab86f7e2f8e75e47b6bf6cd77/pillow-11.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f0f5d8f4a08090c6d6d578351a2b91acf519a54986c055af27e7a93feae6d3f1", size = 7639094, upload-time = "2025-07-03T13:10:21.857Z" }, + { url = "https://files.pythonhosted.org/packages/ba/c9/09e6746630fe6372c67c648ff9deae52a2bc20897d51fa293571977ceb5d/pillow-11.3.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c37d8ba9411d6003bba9e518db0db0c58a680ab9fe5179f040b0463644bc9805", size = 5973503, upload-time = "2025-07-01T09:14:45.698Z" }, + { url = "https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:13f87d581e71d9189ab21fe0efb5a23e9f28552d5be6979e84001d3b8505abe8", size = 6642574, upload-time = "2025-07-01T09:14:47.415Z" }, + { url = "https://files.pythonhosted.org/packages/36/de/d5cc31cc4b055b6c6fd990e3e7f0f8aaf36229a2698501bcb0cdf67c7146/pillow-11.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:023f6d2d11784a465f09fd09a34b150ea4672e85fb3d05931d89f373ab14abb2", size = 6084060, upload-time = "2025-07-01T09:14:49.636Z" }, + { url = "https://files.pythonhosted.org/packages/d5/ea/502d938cbaeec836ac28a9b730193716f0114c41325db428e6b280513f09/pillow-11.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:45dfc51ac5975b938e9809451c51734124e73b04d0f0ac621649821a63852e7b", size = 6721407, upload-time = "2025-07-01T09:14:51.962Z" }, + { url = 
"https://files.pythonhosted.org/packages/45/9c/9c5e2a73f125f6cbc59cc7087c8f2d649a7ae453f83bd0362ff7c9e2aee2/pillow-11.3.0-cp313-cp313-win32.whl", hash = "sha256:a4d336baed65d50d37b88ca5b60c0fa9d81e3a87d4a7930d3880d1624d5b31f3", size = 6273841, upload-time = "2025-07-01T09:14:54.142Z" }, + { url = "https://files.pythonhosted.org/packages/23/85/397c73524e0cd212067e0c969aa245b01d50183439550d24d9f55781b776/pillow-11.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0bce5c4fd0921f99d2e858dc4d4d64193407e1b99478bc5cacecba2311abde51", size = 6978450, upload-time = "2025-07-01T09:14:56.436Z" }, + { url = "https://files.pythonhosted.org/packages/17/d2/622f4547f69cd173955194b78e4d19ca4935a1b0f03a302d655c9f6aae65/pillow-11.3.0-cp313-cp313-win_arm64.whl", hash = "sha256:1904e1264881f682f02b7f8167935cce37bc97db457f8e7849dc3a6a52b99580", size = 2423055, upload-time = "2025-07-01T09:14:58.072Z" }, + { url = "https://files.pythonhosted.org/packages/dd/80/a8a2ac21dda2e82480852978416cfacd439a4b490a501a288ecf4fe2532d/pillow-11.3.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4c834a3921375c48ee6b9624061076bc0a32a60b5532b322cc0ea64e639dd50e", size = 5281110, upload-time = "2025-07-01T09:14:59.79Z" }, + { url = "https://files.pythonhosted.org/packages/44/d6/b79754ca790f315918732e18f82a8146d33bcd7f4494380457ea89eb883d/pillow-11.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5e05688ccef30ea69b9317a9ead994b93975104a677a36a8ed8106be9260aa6d", size = 4689547, upload-time = "2025-07-01T09:15:01.648Z" }, + { url = "https://files.pythonhosted.org/packages/49/20/716b8717d331150cb00f7fdd78169c01e8e0c219732a78b0e59b6bdb2fd6/pillow-11.3.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1019b04af07fc0163e2810167918cb5add8d74674b6267616021ab558dc98ced", size = 5901554, upload-time = "2025-07-03T13:10:27.018Z" }, + { url = 
"https://files.pythonhosted.org/packages/74/cf/a9f3a2514a65bb071075063a96f0a5cf949c2f2fce683c15ccc83b1c1cab/pillow-11.3.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f944255db153ebb2b19c51fe85dd99ef0ce494123f21b9db4877ffdfc5590c7c", size = 7669132, upload-time = "2025-07-03T13:10:33.01Z" }, + { url = "https://files.pythonhosted.org/packages/98/3c/da78805cbdbee9cb43efe8261dd7cc0b4b93f2ac79b676c03159e9db2187/pillow-11.3.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1f85acb69adf2aaee8b7da124efebbdb959a104db34d3a2cb0f3793dbae422a8", size = 6005001, upload-time = "2025-07-01T09:15:03.365Z" }, + { url = "https://files.pythonhosted.org/packages/6c/fa/ce044b91faecf30e635321351bba32bab5a7e034c60187fe9698191aef4f/pillow-11.3.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05f6ecbeff5005399bb48d198f098a9b4b6bdf27b8487c7f38ca16eeb070cd59", size = 6668814, upload-time = "2025-07-01T09:15:05.655Z" }, + { url = "https://files.pythonhosted.org/packages/7b/51/90f9291406d09bf93686434f9183aba27b831c10c87746ff49f127ee80cb/pillow-11.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a7bc6e6fd0395bc052f16b1a8670859964dbd7003bd0af2ff08342eb6e442cfe", size = 6113124, upload-time = "2025-07-01T09:15:07.358Z" }, + { url = "https://files.pythonhosted.org/packages/cd/5a/6fec59b1dfb619234f7636d4157d11fb4e196caeee220232a8d2ec48488d/pillow-11.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:83e1b0161c9d148125083a35c1c5a89db5b7054834fd4387499e06552035236c", size = 6747186, upload-time = "2025-07-01T09:15:09.317Z" }, + { url = "https://files.pythonhosted.org/packages/49/6b/00187a044f98255225f172de653941e61da37104a9ea60e4f6887717e2b5/pillow-11.3.0-cp313-cp313t-win32.whl", hash = "sha256:2a3117c06b8fb646639dce83694f2f9eac405472713fcb1ae887469c0d4f6788", size = 6277546, upload-time = "2025-07-01T09:15:11.311Z" }, + { url = 
"https://files.pythonhosted.org/packages/e8/5c/6caaba7e261c0d75bab23be79f1d06b5ad2a2ae49f028ccec801b0e853d6/pillow-11.3.0-cp313-cp313t-win_amd64.whl", hash = "sha256:857844335c95bea93fb39e0fa2726b4d9d758850b34075a7e3ff4f4fa3aa3b31", size = 6985102, upload-time = "2025-07-01T09:15:13.164Z" }, + { url = "https://files.pythonhosted.org/packages/f3/7e/b623008460c09a0cb38263c93b828c666493caee2eb34ff67f778b87e58c/pillow-11.3.0-cp313-cp313t-win_arm64.whl", hash = "sha256:8797edc41f3e8536ae4b10897ee2f637235c94f27404cac7297f7b607dd0716e", size = 2424803, upload-time = "2025-07-01T09:15:15.695Z" }, + { url = "https://files.pythonhosted.org/packages/73/f4/04905af42837292ed86cb1b1dabe03dce1edc008ef14c473c5c7e1443c5d/pillow-11.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d9da3df5f9ea2a89b81bb6087177fb1f4d1c7146d583a3fe5c672c0d94e55e12", size = 5278520, upload-time = "2025-07-01T09:15:17.429Z" }, + { url = "https://files.pythonhosted.org/packages/41/b0/33d79e377a336247df6348a54e6d2a2b85d644ca202555e3faa0cf811ecc/pillow-11.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0b275ff9b04df7b640c59ec5a3cb113eefd3795a8df80bac69646ef699c6981a", size = 4686116, upload-time = "2025-07-01T09:15:19.423Z" }, + { url = "https://files.pythonhosted.org/packages/49/2d/ed8bc0ab219ae8768f529597d9509d184fe8a6c4741a6864fea334d25f3f/pillow-11.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0743841cabd3dba6a83f38a92672cccbd69af56e3e91777b0ee7f4dba4385632", size = 5864597, upload-time = "2025-07-03T13:10:38.404Z" }, + { url = "https://files.pythonhosted.org/packages/b5/3d/b932bb4225c80b58dfadaca9d42d08d0b7064d2d1791b6a237f87f661834/pillow-11.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2465a69cf967b8b49ee1b96d76718cd98c4e925414ead59fdf75cf0fd07df673", size = 7638246, upload-time = "2025-07-03T13:10:44.987Z" }, + { url = 
"https://files.pythonhosted.org/packages/09/b5/0487044b7c096f1b48f0d7ad416472c02e0e4bf6919541b111efd3cae690/pillow-11.3.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:41742638139424703b4d01665b807c6468e23e699e8e90cffefe291c5832b027", size = 5973336, upload-time = "2025-07-01T09:15:21.237Z" }, + { url = "https://files.pythonhosted.org/packages/a8/2d/524f9318f6cbfcc79fbc004801ea6b607ec3f843977652fdee4857a7568b/pillow-11.3.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:93efb0b4de7e340d99057415c749175e24c8864302369e05914682ba642e5d77", size = 6642699, upload-time = "2025-07-01T09:15:23.186Z" }, + { url = "https://files.pythonhosted.org/packages/6f/d2/a9a4f280c6aefedce1e8f615baaa5474e0701d86dd6f1dede66726462bbd/pillow-11.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7966e38dcd0fa11ca390aed7c6f20454443581d758242023cf36fcb319b1a874", size = 6083789, upload-time = "2025-07-01T09:15:25.1Z" }, + { url = "https://files.pythonhosted.org/packages/fe/54/86b0cd9dbb683a9d5e960b66c7379e821a19be4ac5810e2e5a715c09a0c0/pillow-11.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:98a9afa7b9007c67ed84c57c9e0ad86a6000da96eaa638e4f8abe5b65ff83f0a", size = 6720386, upload-time = "2025-07-01T09:15:27.378Z" }, + { url = "https://files.pythonhosted.org/packages/e7/95/88efcaf384c3588e24259c4203b909cbe3e3c2d887af9e938c2022c9dd48/pillow-11.3.0-cp314-cp314-win32.whl", hash = "sha256:02a723e6bf909e7cea0dac1b0e0310be9d7650cd66222a5f1c571455c0a45214", size = 6370911, upload-time = "2025-07-01T09:15:29.294Z" }, + { url = "https://files.pythonhosted.org/packages/2e/cc/934e5820850ec5eb107e7b1a72dd278140731c669f396110ebc326f2a503/pillow-11.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:a418486160228f64dd9e9efcd132679b7a02a5f22c982c78b6fc7dab3fefb635", size = 7117383, upload-time = "2025-07-01T09:15:31.128Z" }, + { url = 
"https://files.pythonhosted.org/packages/d6/e9/9c0a616a71da2a5d163aa37405e8aced9a906d574b4a214bede134e731bc/pillow-11.3.0-cp314-cp314-win_arm64.whl", hash = "sha256:155658efb5e044669c08896c0c44231c5e9abcaadbc5cd3648df2f7c0b96b9a6", size = 2511385, upload-time = "2025-07-01T09:15:33.328Z" }, + { url = "https://files.pythonhosted.org/packages/1a/33/c88376898aff369658b225262cd4f2659b13e8178e7534df9e6e1fa289f6/pillow-11.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:59a03cdf019efbfeeed910bf79c7c93255c3d54bc45898ac2a4140071b02b4ae", size = 5281129, upload-time = "2025-07-01T09:15:35.194Z" }, + { url = "https://files.pythonhosted.org/packages/1f/70/d376247fb36f1844b42910911c83a02d5544ebd2a8bad9efcc0f707ea774/pillow-11.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f8a5827f84d973d8636e9dc5764af4f0cf2318d26744b3d902931701b0d46653", size = 4689580, upload-time = "2025-07-01T09:15:37.114Z" }, + { url = "https://files.pythonhosted.org/packages/eb/1c/537e930496149fbac69efd2fc4329035bbe2e5475b4165439e3be9cb183b/pillow-11.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ee92f2fd10f4adc4b43d07ec5e779932b4eb3dbfbc34790ada5a6669bc095aa6", size = 5902860, upload-time = "2025-07-03T13:10:50.248Z" }, + { url = "https://files.pythonhosted.org/packages/bd/57/80f53264954dcefeebcf9dae6e3eb1daea1b488f0be8b8fef12f79a3eb10/pillow-11.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c96d333dcf42d01f47b37e0979b6bd73ec91eae18614864622d9b87bbd5bbf36", size = 7670694, upload-time = "2025-07-03T13:10:56.432Z" }, + { url = "https://files.pythonhosted.org/packages/70/ff/4727d3b71a8578b4587d9c276e90efad2d6fe0335fd76742a6da08132e8c/pillow-11.3.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c96f993ab8c98460cd0c001447bff6194403e8b1d7e149ade5f00594918128b", size = 6005888, upload-time = "2025-07-01T09:15:39.436Z" }, + { url = 
"https://files.pythonhosted.org/packages/05/ae/716592277934f85d3be51d7256f3636672d7b1abfafdc42cf3f8cbd4b4c8/pillow-11.3.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:41342b64afeba938edb034d122b2dda5db2139b9a4af999729ba8818e0056477", size = 6670330, upload-time = "2025-07-01T09:15:41.269Z" }, + { url = "https://files.pythonhosted.org/packages/e7/bb/7fe6cddcc8827b01b1a9766f5fdeb7418680744f9082035bdbabecf1d57f/pillow-11.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:068d9c39a2d1b358eb9f245ce7ab1b5c3246c7c8c7d9ba58cfa5b43146c06e50", size = 6114089, upload-time = "2025-07-01T09:15:43.13Z" }, + { url = "https://files.pythonhosted.org/packages/8b/f5/06bfaa444c8e80f1a8e4bff98da9c83b37b5be3b1deaa43d27a0db37ef84/pillow-11.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a1bc6ba083b145187f648b667e05a2534ecc4b9f2784c2cbe3089e44868f2b9b", size = 6748206, upload-time = "2025-07-01T09:15:44.937Z" }, + { url = "https://files.pythonhosted.org/packages/f0/77/bc6f92a3e8e6e46c0ca78abfffec0037845800ea38c73483760362804c41/pillow-11.3.0-cp314-cp314t-win32.whl", hash = "sha256:118ca10c0d60b06d006be10a501fd6bbdfef559251ed31b794668ed569c87e12", size = 6377370, upload-time = "2025-07-01T09:15:46.673Z" }, + { url = "https://files.pythonhosted.org/packages/4a/82/3a721f7d69dca802befb8af08b7c79ebcab461007ce1c18bd91a5d5896f9/pillow-11.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:8924748b688aa210d79883357d102cd64690e56b923a186f35a82cbc10f997db", size = 7121500, upload-time = "2025-07-01T09:15:48.512Z" }, + { url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload-time = "2025-07-01T09:15:50.399Z" }, +] + +[[package]] +name = "platformdirs" +version = "4.3.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362, upload-time = "2025-05-07T22:47:42.121Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567, upload-time = "2025-05-07T22:47:40.376Z" }, +] + +[[package]] +name = "polars" +version = "1.32.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/f2/1a76a8bd902bc4942e435a480f362c8687bba60d438ff3283191e38568fa/polars-1.32.3.tar.gz", hash = "sha256:57c500dc1b5cba49b0589034478db031815f3d57a20cb830b05ecee1a9ba56b1", size = 4838448, upload-time = "2025-08-14T17:28:10.702Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4c/9b/5937ab9f8fa49c8e00617aeb817a5ffa5740434d5bb8a90f2afa657875aa/polars-1.32.3-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c7c472ea1d50a5104079cb64e34f78f85774bcc69b875ba8daf21233f4c70d42", size = 37935794, upload-time = "2025-08-14T17:26:55.565Z" }, + { url = "https://files.pythonhosted.org/packages/6e/e9/88f5332001b9dd5c8e0a4fab51015f740e01715a081c41bc0f7ad2bf76a5/polars-1.32.3-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:fd87275f0cc795e72a2030b58293198cfa748d4b009cf52218e27db5397ed07f", size = 34621102, upload-time = "2025-08-14T17:27:00.521Z" }, + { url = "https://files.pythonhosted.org/packages/ab/8a/6f56af7e535c34c95decc8654786bfce4632ba32817dc2f8bad18571ef9a/polars-1.32.3-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c9a9b9668ef310e5a77a7e7daa9c753874779c8da52e93f654bfd7953eb4b60b", size = 38443071, upload-time = "2025-08-14T17:27:08.382Z" }, + { url = 
"https://files.pythonhosted.org/packages/46/aa/63536ea5780edc0ef6850679dc81d519f3966c7bb11a5cf10ccecb541095/polars-1.32.3-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:c8f5d2f43b80b68e39bfaa2948ce632563633466576f12e74e8560d6481f5851", size = 35639598, upload-time = "2025-08-14T17:27:12.261Z" }, + { url = "https://files.pythonhosted.org/packages/d7/c8/226953cda6cf9ae63aa9714d396a9138029e31db3c504c15d6711b618f8f/polars-1.32.3-cp39-abi3-win_amd64.whl", hash = "sha256:db56a7cb4898e173d62634e182f74bdff744c62be5470e0fe20df8d10f659af7", size = 38038192, upload-time = "2025-08-14T17:27:15.993Z" }, + { url = "https://files.pythonhosted.org/packages/ec/99/6b93c854e602927a778eabd7550204f700cc4e6c07be73372371583dda3e/polars-1.32.3-cp39-abi3-win_arm64.whl", hash = "sha256:a2e3f87c60f54eefe67b1bebd3105918d84df0fd6d59cc6b870c2f16d2d26ca1", size = 34198919, upload-time = "2025-08-14T17:27:21.423Z" }, +] + +[[package]] +name = "prometheus-client" +version = "0.22.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5e/cf/40dde0a2be27cc1eb41e333d1a674a74ce8b8b0457269cc640fd42b07cf7/prometheus_client-0.22.1.tar.gz", hash = "sha256:190f1331e783cf21eb60bca559354e0a4d4378facecf78f5428c39b675d20d28", size = 69746, upload-time = "2025-06-02T14:29:01.152Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/ae/ec06af4fe3ee72d16973474f122541746196aaa16cea6f66d18b963c6177/prometheus_client-0.22.1-py3-none-any.whl", hash = "sha256:cca895342e308174341b2cbf99a56bef291fbc0ef7b9e5412a0f26d653ba7094", size = 58694, upload-time = "2025-06-02T14:29:00.068Z" }, +] + +[[package]] +name = "prompt-toolkit" +version = "3.0.51" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bb/6e/9d084c929dfe9e3bfe0c6a47e31f78a25c54627d64a66e884a8bf5474f1c/prompt_toolkit-3.0.51.tar.gz", hash = 
"sha256:931a162e3b27fc90c86f1b48bb1fb2c528c2761475e57c9c06de13311c7b54ed", size = 428940, upload-time = "2025-04-15T09:18:47.731Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/4f/5249960887b1fbe561d9ff265496d170b55a735b76724f10ef19f9e40716/prompt_toolkit-3.0.51-py3-none-any.whl", hash = "sha256:52742911fde84e2d423e2f9a4cf1de7d7ac4e51958f648d9540e0fb8db077b07", size = 387810, upload-time = "2025-04-15T09:18:44.753Z" }, +] + +[[package]] +name = "propcache" +version = "0.3.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a6/16/43264e4a779dd8588c21a70f0709665ee8f611211bdd2c87d952cfa7c776/propcache-0.3.2.tar.gz", hash = "sha256:20d7d62e4e7ef05f221e0db2856b979540686342e7dd9973b815599c7057e168", size = 44139, upload-time = "2025-06-09T22:56:06.081Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/42/9ca01b0a6f48e81615dca4765a8f1dd2c057e0540f6116a27dc5ee01dfb6/propcache-0.3.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:8de106b6c84506b31c27168582cd3cb3000a6412c16df14a8628e5871ff83c10", size = 73674, upload-time = "2025-06-09T22:54:30.551Z" }, + { url = "https://files.pythonhosted.org/packages/af/6e/21293133beb550f9c901bbece755d582bfaf2176bee4774000bd4dd41884/propcache-0.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:28710b0d3975117239c76600ea351934ac7b5ff56e60953474342608dbbb6154", size = 43570, upload-time = "2025-06-09T22:54:32.296Z" }, + { url = "https://files.pythonhosted.org/packages/0c/c8/0393a0a3a2b8760eb3bde3c147f62b20044f0ddac81e9d6ed7318ec0d852/propcache-0.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce26862344bdf836650ed2487c3d724b00fbfec4233a1013f597b78c1cb73615", size = 43094, upload-time = "2025-06-09T22:54:33.929Z" }, + { url = "https://files.pythonhosted.org/packages/37/2c/489afe311a690399d04a3e03b069225670c1d489eb7b044a566511c1c498/propcache-0.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:bca54bd347a253af2cf4544bbec232ab982f4868de0dd684246b67a51bc6b1db", size = 226958, upload-time = "2025-06-09T22:54:35.186Z" }, + { url = "https://files.pythonhosted.org/packages/9d/ca/63b520d2f3d418c968bf596839ae26cf7f87bead026b6192d4da6a08c467/propcache-0.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:55780d5e9a2ddc59711d727226bb1ba83a22dd32f64ee15594b9392b1f544eb1", size = 234894, upload-time = "2025-06-09T22:54:36.708Z" }, + { url = "https://files.pythonhosted.org/packages/11/60/1d0ed6fff455a028d678df30cc28dcee7af77fa2b0e6962ce1df95c9a2a9/propcache-0.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:035e631be25d6975ed87ab23153db6a73426a48db688070d925aa27e996fe93c", size = 233672, upload-time = "2025-06-09T22:54:38.062Z" }, + { url = "https://files.pythonhosted.org/packages/37/7c/54fd5301ef38505ab235d98827207176a5c9b2aa61939b10a460ca53e123/propcache-0.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee6f22b6eaa39297c751d0e80c0d3a454f112f5c6481214fcf4c092074cecd67", size = 224395, upload-time = "2025-06-09T22:54:39.634Z" }, + { url = "https://files.pythonhosted.org/packages/ee/1a/89a40e0846f5de05fdc6779883bf46ba980e6df4d2ff8fb02643de126592/propcache-0.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ca3aee1aa955438c4dba34fc20a9f390e4c79967257d830f137bd5a8a32ed3b", size = 212510, upload-time = "2025-06-09T22:54:41.565Z" }, + { url = "https://files.pythonhosted.org/packages/5e/33/ca98368586c9566a6b8d5ef66e30484f8da84c0aac3f2d9aec6d31a11bd5/propcache-0.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7a4f30862869fa2b68380d677cc1c5fcf1e0f2b9ea0cf665812895c75d0ca3b8", size = 222949, upload-time = "2025-06-09T22:54:43.038Z" }, + { url = "https://files.pythonhosted.org/packages/ba/11/ace870d0aafe443b33b2f0b7efdb872b7c3abd505bfb4890716ad7865e9d/propcache-0.3.2-cp312-cp312-musllinux_1_2_armv7l.whl", 
hash = "sha256:b77ec3c257d7816d9f3700013639db7491a434644c906a2578a11daf13176251", size = 217258, upload-time = "2025-06-09T22:54:44.376Z" }, + { url = "https://files.pythonhosted.org/packages/5b/d2/86fd6f7adffcfc74b42c10a6b7db721d1d9ca1055c45d39a1a8f2a740a21/propcache-0.3.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:cab90ac9d3f14b2d5050928483d3d3b8fb6b4018893fc75710e6aa361ecb2474", size = 213036, upload-time = "2025-06-09T22:54:46.243Z" }, + { url = "https://files.pythonhosted.org/packages/07/94/2d7d1e328f45ff34a0a284cf5a2847013701e24c2a53117e7c280a4316b3/propcache-0.3.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:0b504d29f3c47cf6b9e936c1852246c83d450e8e063d50562115a6be6d3a2535", size = 227684, upload-time = "2025-06-09T22:54:47.63Z" }, + { url = "https://files.pythonhosted.org/packages/b7/05/37ae63a0087677e90b1d14710e532ff104d44bc1efa3b3970fff99b891dc/propcache-0.3.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:ce2ac2675a6aa41ddb2a0c9cbff53780a617ac3d43e620f8fd77ba1c84dcfc06", size = 234562, upload-time = "2025-06-09T22:54:48.982Z" }, + { url = "https://files.pythonhosted.org/packages/a4/7c/3f539fcae630408d0bd8bf3208b9a647ccad10976eda62402a80adf8fc34/propcache-0.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:62b4239611205294cc433845b914131b2a1f03500ff3c1ed093ed216b82621e1", size = 222142, upload-time = "2025-06-09T22:54:50.424Z" }, + { url = "https://files.pythonhosted.org/packages/7c/d2/34b9eac8c35f79f8a962546b3e97e9d4b990c420ee66ac8255d5d9611648/propcache-0.3.2-cp312-cp312-win32.whl", hash = "sha256:df4a81b9b53449ebc90cc4deefb052c1dd934ba85012aa912c7ea7b7e38b60c1", size = 37711, upload-time = "2025-06-09T22:54:52.072Z" }, + { url = "https://files.pythonhosted.org/packages/19/61/d582be5d226cf79071681d1b46b848d6cb03d7b70af7063e33a2787eaa03/propcache-0.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:7046e79b989d7fe457bb755844019e10f693752d169076138abf17f31380800c", size = 41479, upload-time = "2025-06-09T22:54:53.234Z" }, + { 
url = "https://files.pythonhosted.org/packages/dc/d1/8c747fafa558c603c4ca19d8e20b288aa0c7cda74e9402f50f31eb65267e/propcache-0.3.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ca592ed634a73ca002967458187109265e980422116c0a107cf93d81f95af945", size = 71286, upload-time = "2025-06-09T22:54:54.369Z" }, + { url = "https://files.pythonhosted.org/packages/61/99/d606cb7986b60d89c36de8a85d58764323b3a5ff07770a99d8e993b3fa73/propcache-0.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9ecb0aad4020e275652ba3975740f241bd12a61f1a784df044cf7477a02bc252", size = 42425, upload-time = "2025-06-09T22:54:55.642Z" }, + { url = "https://files.pythonhosted.org/packages/8c/96/ef98f91bbb42b79e9bb82bdd348b255eb9d65f14dbbe3b1594644c4073f7/propcache-0.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7f08f1cc28bd2eade7a8a3d2954ccc673bb02062e3e7da09bc75d843386b342f", size = 41846, upload-time = "2025-06-09T22:54:57.246Z" }, + { url = "https://files.pythonhosted.org/packages/5b/ad/3f0f9a705fb630d175146cd7b1d2bf5555c9beaed54e94132b21aac098a6/propcache-0.3.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1a342c834734edb4be5ecb1e9fb48cb64b1e2320fccbd8c54bf8da8f2a84c33", size = 208871, upload-time = "2025-06-09T22:54:58.975Z" }, + { url = "https://files.pythonhosted.org/packages/3a/38/2085cda93d2c8b6ec3e92af2c89489a36a5886b712a34ab25de9fbca7992/propcache-0.3.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8a544caaae1ac73f1fecfae70ded3e93728831affebd017d53449e3ac052ac1e", size = 215720, upload-time = "2025-06-09T22:55:00.471Z" }, + { url = "https://files.pythonhosted.org/packages/61/c1/d72ea2dc83ac7f2c8e182786ab0fc2c7bd123a1ff9b7975bee671866fe5f/propcache-0.3.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:310d11aa44635298397db47a3ebce7db99a4cc4b9bbdfcf6c98a60c8d5261cf1", size = 215203, upload-time = "2025-06-09T22:55:01.834Z" }, + { url = 
"https://files.pythonhosted.org/packages/af/81/b324c44ae60c56ef12007105f1460d5c304b0626ab0cc6b07c8f2a9aa0b8/propcache-0.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c1396592321ac83157ac03a2023aa6cc4a3cc3cfdecb71090054c09e5a7cce3", size = 206365, upload-time = "2025-06-09T22:55:03.199Z" }, + { url = "https://files.pythonhosted.org/packages/09/73/88549128bb89e66d2aff242488f62869014ae092db63ccea53c1cc75a81d/propcache-0.3.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8cabf5b5902272565e78197edb682017d21cf3b550ba0460ee473753f28d23c1", size = 196016, upload-time = "2025-06-09T22:55:04.518Z" }, + { url = "https://files.pythonhosted.org/packages/b9/3f/3bdd14e737d145114a5eb83cb172903afba7242f67c5877f9909a20d948d/propcache-0.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0a2f2235ac46a7aa25bdeb03a9e7060f6ecbd213b1f9101c43b3090ffb971ef6", size = 205596, upload-time = "2025-06-09T22:55:05.942Z" }, + { url = "https://files.pythonhosted.org/packages/0f/ca/2f4aa819c357d3107c3763d7ef42c03980f9ed5c48c82e01e25945d437c1/propcache-0.3.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:92b69e12e34869a6970fd2f3da91669899994b47c98f5d430b781c26f1d9f387", size = 200977, upload-time = "2025-06-09T22:55:07.792Z" }, + { url = "https://files.pythonhosted.org/packages/cd/4a/e65276c7477533c59085251ae88505caf6831c0e85ff8b2e31ebcbb949b1/propcache-0.3.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:54e02207c79968ebbdffc169591009f4474dde3b4679e16634d34c9363ff56b4", size = 197220, upload-time = "2025-06-09T22:55:09.173Z" }, + { url = "https://files.pythonhosted.org/packages/7c/54/fc7152e517cf5578278b242396ce4d4b36795423988ef39bb8cd5bf274c8/propcache-0.3.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4adfb44cb588001f68c5466579d3f1157ca07f7504fc91ec87862e2b8e556b88", size = 210642, upload-time = "2025-06-09T22:55:10.62Z" }, + { url = 
"https://files.pythonhosted.org/packages/b9/80/abeb4a896d2767bf5f1ea7b92eb7be6a5330645bd7fb844049c0e4045d9d/propcache-0.3.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:fd3e6019dc1261cd0291ee8919dd91fbab7b169bb76aeef6c716833a3f65d206", size = 212789, upload-time = "2025-06-09T22:55:12.029Z" }, + { url = "https://files.pythonhosted.org/packages/b3/db/ea12a49aa7b2b6d68a5da8293dcf50068d48d088100ac016ad92a6a780e6/propcache-0.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4c181cad81158d71c41a2bce88edce078458e2dd5ffee7eddd6b05da85079f43", size = 205880, upload-time = "2025-06-09T22:55:13.45Z" }, + { url = "https://files.pythonhosted.org/packages/d1/e5/9076a0bbbfb65d1198007059c65639dfd56266cf8e477a9707e4b1999ff4/propcache-0.3.2-cp313-cp313-win32.whl", hash = "sha256:8a08154613f2249519e549de2330cf8e2071c2887309a7b07fb56098f5170a02", size = 37220, upload-time = "2025-06-09T22:55:15.284Z" }, + { url = "https://files.pythonhosted.org/packages/d3/f5/b369e026b09a26cd77aa88d8fffd69141d2ae00a2abaaf5380d2603f4b7f/propcache-0.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:e41671f1594fc4ab0a6dec1351864713cb3a279910ae8b58f884a88a0a632c05", size = 40678, upload-time = "2025-06-09T22:55:16.445Z" }, + { url = "https://files.pythonhosted.org/packages/a4/3a/6ece377b55544941a08d03581c7bc400a3c8cd3c2865900a68d5de79e21f/propcache-0.3.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:9a3cf035bbaf035f109987d9d55dc90e4b0e36e04bbbb95af3055ef17194057b", size = 76560, upload-time = "2025-06-09T22:55:17.598Z" }, + { url = "https://files.pythonhosted.org/packages/0c/da/64a2bb16418740fa634b0e9c3d29edff1db07f56d3546ca2d86ddf0305e1/propcache-0.3.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:156c03d07dc1323d8dacaa221fbe028c5c70d16709cdd63502778e6c3ccca1b0", size = 44676, upload-time = "2025-06-09T22:55:18.922Z" }, + { url = 
"https://files.pythonhosted.org/packages/36/7b/f025e06ea51cb72c52fb87e9b395cced02786610b60a3ed51da8af017170/propcache-0.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74413c0ba02ba86f55cf60d18daab219f7e531620c15f1e23d95563f505efe7e", size = 44701, upload-time = "2025-06-09T22:55:20.106Z" }, + { url = "https://files.pythonhosted.org/packages/a4/00/faa1b1b7c3b74fc277f8642f32a4c72ba1d7b2de36d7cdfb676db7f4303e/propcache-0.3.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f066b437bb3fa39c58ff97ab2ca351db465157d68ed0440abecb21715eb24b28", size = 276934, upload-time = "2025-06-09T22:55:21.5Z" }, + { url = "https://files.pythonhosted.org/packages/74/ab/935beb6f1756e0476a4d5938ff44bf0d13a055fed880caf93859b4f1baf4/propcache-0.3.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1304b085c83067914721e7e9d9917d41ad87696bf70f0bc7dee450e9c71ad0a", size = 278316, upload-time = "2025-06-09T22:55:22.918Z" }, + { url = "https://files.pythonhosted.org/packages/f8/9d/994a5c1ce4389610838d1caec74bdf0e98b306c70314d46dbe4fcf21a3e2/propcache-0.3.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ab50cef01b372763a13333b4e54021bdcb291fc9a8e2ccb9c2df98be51bcde6c", size = 282619, upload-time = "2025-06-09T22:55:24.651Z" }, + { url = "https://files.pythonhosted.org/packages/2b/00/a10afce3d1ed0287cef2e09506d3be9822513f2c1e96457ee369adb9a6cd/propcache-0.3.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fad3b2a085ec259ad2c2842666b2a0a49dea8463579c606426128925af1ed725", size = 265896, upload-time = "2025-06-09T22:55:26.049Z" }, + { url = "https://files.pythonhosted.org/packages/2e/a8/2aa6716ffa566ca57c749edb909ad27884680887d68517e4be41b02299f3/propcache-0.3.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:261fa020c1c14deafd54c76b014956e2f86991af198c51139faf41c4d5e83892", size = 252111, upload-time = 
"2025-06-09T22:55:27.381Z" }, + { url = "https://files.pythonhosted.org/packages/36/4f/345ca9183b85ac29c8694b0941f7484bf419c7f0fea2d1e386b4f7893eed/propcache-0.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:46d7f8aa79c927e5f987ee3a80205c987717d3659f035c85cf0c3680526bdb44", size = 268334, upload-time = "2025-06-09T22:55:28.747Z" }, + { url = "https://files.pythonhosted.org/packages/3e/ca/fcd54f78b59e3f97b3b9715501e3147f5340167733d27db423aa321e7148/propcache-0.3.2-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:6d8f3f0eebf73e3c0ff0e7853f68be638b4043c65a70517bb575eff54edd8dbe", size = 255026, upload-time = "2025-06-09T22:55:30.184Z" }, + { url = "https://files.pythonhosted.org/packages/8b/95/8e6a6bbbd78ac89c30c225210a5c687790e532ba4088afb8c0445b77ef37/propcache-0.3.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:03c89c1b14a5452cf15403e291c0ccd7751d5b9736ecb2c5bab977ad6c5bcd81", size = 250724, upload-time = "2025-06-09T22:55:31.646Z" }, + { url = "https://files.pythonhosted.org/packages/ee/b0/0dd03616142baba28e8b2d14ce5df6631b4673850a3d4f9c0f9dd714a404/propcache-0.3.2-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:0cc17efde71e12bbaad086d679ce575268d70bc123a5a71ea7ad76f70ba30bba", size = 268868, upload-time = "2025-06-09T22:55:33.209Z" }, + { url = "https://files.pythonhosted.org/packages/c5/98/2c12407a7e4fbacd94ddd32f3b1e3d5231e77c30ef7162b12a60e2dd5ce3/propcache-0.3.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:acdf05d00696bc0447e278bb53cb04ca72354e562cf88ea6f9107df8e7fd9770", size = 271322, upload-time = "2025-06-09T22:55:35.065Z" }, + { url = "https://files.pythonhosted.org/packages/35/91/9cb56efbb428b006bb85db28591e40b7736847b8331d43fe335acf95f6c8/propcache-0.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4445542398bd0b5d32df908031cb1b30d43ac848e20470a878b770ec2dcc6330", size = 265778, upload-time = "2025-06-09T22:55:36.45Z" }, + { url = 
"https://files.pythonhosted.org/packages/9a/4c/b0fe775a2bdd01e176b14b574be679d84fc83958335790f7c9a686c1f468/propcache-0.3.2-cp313-cp313t-win32.whl", hash = "sha256:f86e5d7cd03afb3a1db8e9f9f6eff15794e79e791350ac48a8c924e6f439f394", size = 41175, upload-time = "2025-06-09T22:55:38.436Z" }, + { url = "https://files.pythonhosted.org/packages/a4/ff/47f08595e3d9b5e149c150f88d9714574f1a7cbd89fe2817158a952674bf/propcache-0.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:9704bedf6e7cbe3c65eca4379a9b53ee6a83749f047808cbb5044d40d7d72198", size = 44857, upload-time = "2025-06-09T22:55:39.687Z" }, + { url = "https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = "sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload-time = "2025-06-09T22:56:04.484Z" }, +] + +[[package]] +name = "proto-plus" +version = "1.26.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f4/ac/87285f15f7cce6d4a008f33f1757fb5a13611ea8914eb58c3d0d26243468/proto_plus-1.26.1.tar.gz", hash = "sha256:21a515a4c4c0088a773899e23c7bbade3d18f9c66c73edd4c7ee3816bc96a012", size = 56142, upload-time = "2025-03-10T15:54:38.843Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl", hash = "sha256:13285478c2dcf2abb829db158e1047e2f1e8d63a077d94263c2b88b043c75a66", size = 50163, upload-time = "2025-03-10T15:54:37.335Z" }, +] + +[[package]] +name = "protobuf" +version = "5.29.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/29/d09e70352e4e88c9c7a198d5645d7277811448d76c23b00345670f7c8a38/protobuf-5.29.5.tar.gz", hash = "sha256:bc1463bafd4b0929216c35f437a8e28731a2b7fe3d98bb77a600efced5a15c84", size = 425226, 
upload-time = "2025-05-28T23:51:59.82Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5f/11/6e40e9fc5bba02988a214c07cf324595789ca7820160bfd1f8be96e48539/protobuf-5.29.5-cp310-abi3-win32.whl", hash = "sha256:3f1c6468a2cfd102ff4703976138844f78ebd1fb45f49011afc5139e9e283079", size = 422963, upload-time = "2025-05-28T23:51:41.204Z" }, + { url = "https://files.pythonhosted.org/packages/81/7f/73cefb093e1a2a7c3ffd839e6f9fcafb7a427d300c7f8aef9c64405d8ac6/protobuf-5.29.5-cp310-abi3-win_amd64.whl", hash = "sha256:3f76e3a3675b4a4d867b52e4a5f5b78a2ef9565549d4037e06cf7b0942b1d3fc", size = 434818, upload-time = "2025-05-28T23:51:44.297Z" }, + { url = "https://files.pythonhosted.org/packages/dd/73/10e1661c21f139f2c6ad9b23040ff36fee624310dc28fba20d33fdae124c/protobuf-5.29.5-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:e38c5add5a311f2a6eb0340716ef9b039c1dfa428b28f25a7838ac329204a671", size = 418091, upload-time = "2025-05-28T23:51:45.907Z" }, + { url = "https://files.pythonhosted.org/packages/6c/04/98f6f8cf5b07ab1294c13f34b4e69b3722bb609c5b701d6c169828f9f8aa/protobuf-5.29.5-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:fa18533a299d7ab6c55a238bf8629311439995f2e7eca5caaff08663606e9015", size = 319824, upload-time = "2025-05-28T23:51:47.545Z" }, + { url = "https://files.pythonhosted.org/packages/85/e4/07c80521879c2d15f321465ac24c70efe2381378c00bf5e56a0f4fbac8cd/protobuf-5.29.5-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:63848923da3325e1bf7e9003d680ce6e14b07e55d0473253a690c3a8b8fd6e61", size = 319942, upload-time = "2025-05-28T23:51:49.11Z" }, + { url = "https://files.pythonhosted.org/packages/7e/cc/7e77861000a0691aeea8f4566e5d3aa716f2b1dece4a24439437e41d3d25/protobuf-5.29.5-py3-none-any.whl", hash = "sha256:6cf42630262c59b2d8de33954443d94b746c952b01434fc58a417fdbd2e84bd5", size = 172823, upload-time = "2025-05-28T23:51:58.157Z" }, +] + +[[package]] +name = "psutil" +version = "7.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = 
{ url = "https://files.pythonhosted.org/packages/2a/80/336820c1ad9286a4ded7e845b2eccfcb27851ab8ac6abece774a6ff4d3de/psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456", size = 497003, upload-time = "2025-02-13T21:54:07.946Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ed/e6/2d26234410f8b8abdbf891c9da62bee396583f713fb9f3325a4760875d22/psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25", size = 238051, upload-time = "2025-02-13T21:54:12.36Z" }, + { url = "https://files.pythonhosted.org/packages/04/8b/30f930733afe425e3cbfc0e1468a30a18942350c1a8816acfade80c005c4/psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da", size = 239535, upload-time = "2025-02-13T21:54:16.07Z" }, + { url = "https://files.pythonhosted.org/packages/2a/ed/d362e84620dd22876b55389248e522338ed1bf134a5edd3b8231d7207f6d/psutil-7.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fcee592b4c6f146991ca55919ea3d1f8926497a713ed7faaf8225e174581e91", size = 275004, upload-time = "2025-02-13T21:54:18.662Z" }, + { url = "https://files.pythonhosted.org/packages/bf/b9/b0eb3f3cbcb734d930fdf839431606844a825b23eaf9a6ab371edac8162c/psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34", size = 277986, upload-time = "2025-02-13T21:54:21.811Z" }, + { url = "https://files.pythonhosted.org/packages/eb/a2/709e0fe2f093556c17fbafda93ac032257242cabcc7ff3369e2cb76a97aa/psutil-7.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f098451abc2828f7dc6b58d44b532b22f2088f4999a937557b603ce72b1993", size = 279544, upload-time = "2025-02-13T21:54:24.68Z" }, + { url = 
"https://files.pythonhosted.org/packages/50/e6/eecf58810b9d12e6427369784efe814a1eec0f492084ce8eb8f4d89d6d61/psutil-7.0.0-cp37-abi3-win32.whl", hash = "sha256:ba3fcef7523064a6c9da440fc4d6bd07da93ac726b5733c29027d7dc95b39d99", size = 241053, upload-time = "2025-02-13T21:54:34.31Z" }, + { url = "https://files.pythonhosted.org/packages/50/1b/6921afe68c74868b4c9fa424dad3be35b095e16687989ebbb50ce4fceb7c/psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553", size = 244885, upload-time = "2025-02-13T21:54:37.486Z" }, +] + +[[package]] +name = "ptyprocess" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" }, +] + +[[package]] +name = "pure-eval" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752, upload-time = "2024-07-21T12:58:21.801Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, +] + 
+[[package]] +name = "py-spy" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/19/e2/ff811a367028b87e86714945bb9ecb5c1cc69114a8039a67b3a862cef921/py_spy-0.4.1.tar.gz", hash = "sha256:e53aa53daa2e47c2eef97dd2455b47bb3a7e7f962796a86cc3e7dbde8e6f4db4", size = 244726, upload-time = "2025-07-31T19:33:25.172Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/e3/3a32500d845bdd94f6a2b4ed6244982f42ec2bc64602ea8fcfe900678ae7/py_spy-0.4.1-py2.py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:809094208c6256c8f4ccadd31e9a513fe2429253f48e20066879239ba12cd8cc", size = 3682508, upload-time = "2025-07-31T19:33:13.753Z" }, + { url = "https://files.pythonhosted.org/packages/4f/bf/e4d280e9e0bec71d39fc646654097027d4bbe8e04af18fb68e49afcff404/py_spy-0.4.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:1fb8bf71ab8df95a95cc387deed6552934c50feef2cf6456bc06692a5508fd0c", size = 1796395, upload-time = "2025-07-31T19:33:15.325Z" }, + { url = "https://files.pythonhosted.org/packages/df/79/9ed50bb0a9de63ed023aa2db8b6265b04a7760d98c61eb54def6a5fddb68/py_spy-0.4.1-py2.py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee776b9d512a011d1ad3907ed53ae32ce2f3d9ff3e1782236554e22103b5c084", size = 2034938, upload-time = "2025-07-31T19:33:17.194Z" }, + { url = "https://files.pythonhosted.org/packages/53/a5/36862e3eea59f729dfb70ee6f9e14b051d8ddce1aa7e70e0b81d9fe18536/py_spy-0.4.1-py2.py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:532d3525538254d1859b49de1fbe9744df6b8865657c9f0e444bf36ce3f19226", size = 2658968, upload-time = "2025-07-31T19:33:18.916Z" }, + { url = "https://files.pythonhosted.org/packages/08/f8/9ea0b586b065a623f591e5e7961282ec944b5fbbdca33186c7c0296645b3/py_spy-0.4.1-py2.py3-none-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:4972c21890b6814017e39ac233c22572c4a61fd874524ebc5ccab0f2237aee0a", size = 2147541, upload-time = "2025-07-31T19:33:20.565Z" }, + { url = "https://files.pythonhosted.org/packages/68/fb/bc7f639aed026bca6e7beb1e33f6951e16b7d315594e7635a4f7d21d63f4/py_spy-0.4.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6a80ec05eb8a6883863a367c6a4d4f2d57de68466f7956b6367d4edd5c61bb29", size = 2763338, upload-time = "2025-07-31T19:33:22.202Z" }, + { url = "https://files.pythonhosted.org/packages/e1/da/fcc9a9fcd4ca946ff402cff20348e838b051d69f50f5d1f5dca4cd3c5eb8/py_spy-0.4.1-py2.py3-none-win_amd64.whl", hash = "sha256:d92e522bd40e9bf7d87c204033ce5bb5c828fca45fa28d970f58d71128069fdc", size = 1818784, upload-time = "2025-07-31T19:33:23.802Z" }, +] + +[[package]] +name = "pyarrow" +version = "18.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7f/7b/640785a9062bb00314caa8a387abce547d2a420cf09bd6c715fe659ccffb/pyarrow-18.1.0.tar.gz", hash = "sha256:9386d3ca9c145b5539a1cfc75df07757dff870168c959b473a0bccbc3abc8c73", size = 1118671, upload-time = "2024-11-26T02:01:48.62Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/50/12829e7111b932581e51dda51d5cb39207a056c30fe31ef43f14c63c4d7e/pyarrow-18.1.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:9f3a76670b263dc41d0ae877f09124ab96ce10e4e48f3e3e4257273cee61ad0d", size = 29514620, upload-time = "2024-11-26T01:59:39.797Z" }, + { url = "https://files.pythonhosted.org/packages/d1/41/468c944eab157702e96abab3d07b48b8424927d4933541ab43788bb6964d/pyarrow-18.1.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:da31fbca07c435be88a0c321402c4e31a2ba61593ec7473630769de8346b54ee", size = 30856494, upload-time = "2024-11-26T01:59:44.725Z" }, + { url = "https://files.pythonhosted.org/packages/68/f9/29fb659b390312a7345aeb858a9d9c157552a8852522f2c8bad437c29c0a/pyarrow-18.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", 
hash = "sha256:543ad8459bc438efc46d29a759e1079436290bd583141384c6f7a1068ed6f992", size = 39203624, upload-time = "2024-11-26T01:59:49.189Z" }, + { url = "https://files.pythonhosted.org/packages/6e/f6/19360dae44200e35753c5c2889dc478154cd78e61b1f738514c9f131734d/pyarrow-18.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0743e503c55be0fdb5c08e7d44853da27f19dc854531c0570f9f394ec9671d54", size = 40139341, upload-time = "2024-11-26T01:59:54.849Z" }, + { url = "https://files.pythonhosted.org/packages/bb/e6/9b3afbbcf10cc724312e824af94a2e993d8ace22994d823f5c35324cebf5/pyarrow-18.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:d4b3d2a34780645bed6414e22dda55a92e0fcd1b8a637fba86800ad737057e33", size = 38618629, upload-time = "2024-11-26T01:59:59.966Z" }, + { url = "https://files.pythonhosted.org/packages/3a/2e/3b99f8a3d9e0ccae0e961978a0d0089b25fb46ebbcfb5ebae3cca179a5b3/pyarrow-18.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:c52f81aa6f6575058d8e2c782bf79d4f9fdc89887f16825ec3a66607a5dd8e30", size = 40078661, upload-time = "2024-11-26T02:00:04.55Z" }, + { url = "https://files.pythonhosted.org/packages/76/52/f8da04195000099d394012b8d42c503d7041b79f778d854f410e5f05049a/pyarrow-18.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:0ad4892617e1a6c7a551cfc827e072a633eaff758fa09f21c4ee548c30bcaf99", size = 25092330, upload-time = "2024-11-26T02:00:09.576Z" }, + { url = "https://files.pythonhosted.org/packages/cb/87/aa4d249732edef6ad88899399047d7e49311a55749d3c373007d034ee471/pyarrow-18.1.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:84e314d22231357d473eabec709d0ba285fa706a72377f9cc8e1cb3c8013813b", size = 29497406, upload-time = "2024-11-26T02:00:14.469Z" }, + { url = "https://files.pythonhosted.org/packages/3c/c7/ed6adb46d93a3177540e228b5ca30d99fc8ea3b13bdb88b6f8b6467e2cb7/pyarrow-18.1.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:f591704ac05dfd0477bb8f8e0bd4b5dc52c1cadf50503858dce3a15db6e46ff2", size = 30835095, 
upload-time = "2024-11-26T02:00:19.347Z" }, + { url = "https://files.pythonhosted.org/packages/41/d7/ed85001edfb96200ff606943cff71d64f91926ab42828676c0fc0db98963/pyarrow-18.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:acb7564204d3c40babf93a05624fc6a8ec1ab1def295c363afc40b0c9e66c191", size = 39194527, upload-time = "2024-11-26T02:00:24.085Z" }, + { url = "https://files.pythonhosted.org/packages/59/16/35e28eab126342fa391593415d79477e89582de411bb95232f28b131a769/pyarrow-18.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:74de649d1d2ccb778f7c3afff6085bd5092aed4c23df9feeb45dd6b16f3811aa", size = 40131443, upload-time = "2024-11-26T02:00:29.483Z" }, + { url = "https://files.pythonhosted.org/packages/0c/95/e855880614c8da20f4cd74fa85d7268c725cf0013dc754048593a38896a0/pyarrow-18.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:f96bd502cb11abb08efea6dab09c003305161cb6c9eafd432e35e76e7fa9b90c", size = 38608750, upload-time = "2024-11-26T02:00:34.069Z" }, + { url = "https://files.pythonhosted.org/packages/54/9d/f253554b1457d4fdb3831b7bd5f8f00f1795585a606eabf6fec0a58a9c38/pyarrow-18.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:36ac22d7782554754a3b50201b607d553a8d71b78cdf03b33c1125be4b52397c", size = 40066690, upload-time = "2024-11-26T02:00:39.603Z" }, + { url = "https://files.pythonhosted.org/packages/2f/58/8912a2563e6b8273e8aa7b605a345bba5a06204549826f6493065575ebc0/pyarrow-18.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:25dbacab8c5952df0ca6ca0af28f50d45bd31c1ff6fcf79e2d120b4a65ee7181", size = 25081054, upload-time = "2024-11-26T02:00:43.611Z" }, + { url = "https://files.pythonhosted.org/packages/82/f9/d06ddc06cab1ada0c2f2fd205ac8c25c2701182de1b9c4bf7a0a44844431/pyarrow-18.1.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:6a276190309aba7bc9d5bd2933230458b3521a4317acfefe69a354f2fe59f2bc", size = 29525542, upload-time = "2024-11-26T02:00:48.094Z" }, + { url = 
"https://files.pythonhosted.org/packages/ab/94/8917e3b961810587ecbdaa417f8ebac0abb25105ae667b7aa11c05876976/pyarrow-18.1.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:ad514dbfcffe30124ce655d72771ae070f30bf850b48bc4d9d3b25993ee0e386", size = 30829412, upload-time = "2024-11-26T02:00:52.458Z" }, + { url = "https://files.pythonhosted.org/packages/5e/e3/3b16c3190f3d71d3b10f6758d2d5f7779ef008c4fd367cedab3ed178a9f7/pyarrow-18.1.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aebc13a11ed3032d8dd6e7171eb6e86d40d67a5639d96c35142bd568b9299324", size = 39119106, upload-time = "2024-11-26T02:00:57.219Z" }, + { url = "https://files.pythonhosted.org/packages/1d/d6/5d704b0d25c3c79532f8c0639f253ec2803b897100f64bcb3f53ced236e5/pyarrow-18.1.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6cf5c05f3cee251d80e98726b5c7cc9f21bab9e9783673bac58e6dfab57ecc8", size = 40090940, upload-time = "2024-11-26T02:01:02.31Z" }, + { url = "https://files.pythonhosted.org/packages/37/29/366bc7e588220d74ec00e497ac6710c2833c9176f0372fe0286929b2d64c/pyarrow-18.1.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:11b676cd410cf162d3f6a70b43fb9e1e40affbc542a1e9ed3681895f2962d3d9", size = 38548177, upload-time = "2024-11-26T02:01:07.371Z" }, + { url = "https://files.pythonhosted.org/packages/c8/11/fabf6ecabb1fe5b7d96889228ca2a9158c4c3bb732e3b8ee3f7f6d40b703/pyarrow-18.1.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:b76130d835261b38f14fc41fdfb39ad8d672afb84c447126b84d5472244cfaba", size = 40043567, upload-time = "2024-11-26T02:01:12.931Z" }, +] + +[[package]] +name = "pyasn1" +version = "0.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ba/e9/01f1a64245b89f039897cb0130016d79f77d52669aae6ee7b159a6c4c018/pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034", size = 145322, upload-time = 
"2024-09-10T22:41:42.55Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629", size = 83135, upload-time = "2024-09-11T16:00:36.122Z" }, +] + +[[package]] +name = "pyasn1-modules" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyasn1" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload-time = "2025-03-28T02:41:22.17Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" }, +] + +[[package]] +name = "pycparser" +version = "2.22" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1d/b2/31537cf4b1ca988837256c910a668b553fceb8f069bedc4b1c826024b52c/pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6", size = 172736, upload-time = "2024-03-30T13:22:22.564Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552, upload-time = "2024-03-30T13:22:20.476Z" }, +] + +[[package]] +name = "pydantic" +version = "2.11.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = 
"typing-extensions" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/00/dd/4325abf92c39ba8623b5af936ddb36ffcfe0beae70405d456ab1fb2f5b8c/pydantic-2.11.7.tar.gz", hash = "sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db", size = 788350, upload-time = "2025-06-14T08:33:17.137Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b", size = 444782, upload-time = "2025-06-14T08:33:14.905Z" }, +] + +[[package]] +name = "pydantic-core" +version = "2.33.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = "sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = "2025-04-23T18:33:52.104Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/8a/2b41c97f554ec8c71f2a8a5f85cb56a8b0956addfe8b0efb5b3d77e8bdc3/pydantic_core-2.33.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc", size = 2009000, upload-time = "2025-04-23T18:31:25.863Z" }, + { url = "https://files.pythonhosted.org/packages/a1/02/6224312aacb3c8ecbaa959897af57181fb6cf3a3d7917fd44d0f2917e6f2/pydantic_core-2.33.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7", size = 1847996, upload-time = "2025-04-23T18:31:27.341Z" }, + { url = "https://files.pythonhosted.org/packages/d6/46/6dcdf084a523dbe0a0be59d054734b86a981726f221f4562aed313dbcb49/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025", size = 1880957, upload-time = "2025-04-23T18:31:28.956Z" }, + { url = "https://files.pythonhosted.org/packages/ec/6b/1ec2c03837ac00886ba8160ce041ce4e325b41d06a034adbef11339ae422/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011", size = 1964199, upload-time = "2025-04-23T18:31:31.025Z" }, + { url = "https://files.pythonhosted.org/packages/2d/1d/6bf34d6adb9debd9136bd197ca72642203ce9aaaa85cfcbfcf20f9696e83/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f", size = 2120296, upload-time = "2025-04-23T18:31:32.514Z" }, + { url = "https://files.pythonhosted.org/packages/e0/94/2bd0aaf5a591e974b32a9f7123f16637776c304471a0ab33cf263cf5591a/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88", size = 2676109, upload-time = "2025-04-23T18:31:33.958Z" }, + { url = "https://files.pythonhosted.org/packages/f9/41/4b043778cf9c4285d59742281a769eac371b9e47e35f98ad321349cc5d61/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1", size = 2002028, upload-time = "2025-04-23T18:31:39.095Z" }, + { url = "https://files.pythonhosted.org/packages/cb/d5/7bb781bf2748ce3d03af04d5c969fa1308880e1dca35a9bd94e1a96a922e/pydantic_core-2.33.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b", size = 2100044, upload-time = "2025-04-23T18:31:41.034Z" }, + { url = 
"https://files.pythonhosted.org/packages/fe/36/def5e53e1eb0ad896785702a5bbfd25eed546cdcf4087ad285021a90ed53/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1", size = 2058881, upload-time = "2025-04-23T18:31:42.757Z" }, + { url = "https://files.pythonhosted.org/packages/01/6c/57f8d70b2ee57fc3dc8b9610315949837fa8c11d86927b9bb044f8705419/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6", size = 2227034, upload-time = "2025-04-23T18:31:44.304Z" }, + { url = "https://files.pythonhosted.org/packages/27/b9/9c17f0396a82b3d5cbea4c24d742083422639e7bb1d5bf600e12cb176a13/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea", size = 2234187, upload-time = "2025-04-23T18:31:45.891Z" }, + { url = "https://files.pythonhosted.org/packages/b0/6a/adf5734ffd52bf86d865093ad70b2ce543415e0e356f6cacabbc0d9ad910/pydantic_core-2.33.2-cp312-cp312-win32.whl", hash = "sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290", size = 1892628, upload-time = "2025-04-23T18:31:47.819Z" }, + { url = "https://files.pythonhosted.org/packages/43/e4/5479fecb3606c1368d496a825d8411e126133c41224c1e7238be58b87d7e/pydantic_core-2.33.2-cp312-cp312-win_amd64.whl", hash = "sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2", size = 1955866, upload-time = "2025-04-23T18:31:49.635Z" }, + { url = "https://files.pythonhosted.org/packages/0d/24/8b11e8b3e2be9dd82df4b11408a67c61bb4dc4f8e11b5b0fc888b38118b5/pydantic_core-2.33.2-cp312-cp312-win_arm64.whl", hash = "sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab", size = 1888894, upload-time = "2025-04-23T18:31:51.609Z" }, + { url = 
"https://files.pythonhosted.org/packages/46/8c/99040727b41f56616573a28771b1bfa08a3d3fe74d3d513f01251f79f172/pydantic_core-2.33.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f", size = 2015688, upload-time = "2025-04-23T18:31:53.175Z" }, + { url = "https://files.pythonhosted.org/packages/3a/cc/5999d1eb705a6cefc31f0b4a90e9f7fc400539b1a1030529700cc1b51838/pydantic_core-2.33.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6", size = 1844808, upload-time = "2025-04-23T18:31:54.79Z" }, + { url = "https://files.pythonhosted.org/packages/6f/5e/a0a7b8885c98889a18b6e376f344da1ef323d270b44edf8174d6bce4d622/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef", size = 1885580, upload-time = "2025-04-23T18:31:57.393Z" }, + { url = "https://files.pythonhosted.org/packages/3b/2a/953581f343c7d11a304581156618c3f592435523dd9d79865903272c256a/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a", size = 1973859, upload-time = "2025-04-23T18:31:59.065Z" }, + { url = "https://files.pythonhosted.org/packages/e6/55/f1a813904771c03a3f97f676c62cca0c0a4138654107c1b61f19c644868b/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916", size = 2120810, upload-time = "2025-04-23T18:32:00.78Z" }, + { url = "https://files.pythonhosted.org/packages/aa/c3/053389835a996e18853ba107a63caae0b9deb4a276c6b472931ea9ae6e48/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a", size = 2676498, upload-time = "2025-04-23T18:32:02.418Z" }, + { 
url = "https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d", size = 2000611, upload-time = "2025-04-23T18:32:04.152Z" }, + { url = "https://files.pythonhosted.org/packages/59/a7/63ef2fed1837d1121a894d0ce88439fe3e3b3e48c7543b2a4479eb99c2bd/pydantic_core-2.33.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56", size = 2107924, upload-time = "2025-04-23T18:32:06.129Z" }, + { url = "https://files.pythonhosted.org/packages/04/8f/2551964ef045669801675f1cfc3b0d74147f4901c3ffa42be2ddb1f0efc4/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5", size = 2063196, upload-time = "2025-04-23T18:32:08.178Z" }, + { url = "https://files.pythonhosted.org/packages/26/bd/d9602777e77fc6dbb0c7db9ad356e9a985825547dce5ad1d30ee04903918/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e", size = 2236389, upload-time = "2025-04-23T18:32:10.242Z" }, + { url = "https://files.pythonhosted.org/packages/42/db/0e950daa7e2230423ab342ae918a794964b053bec24ba8af013fc7c94846/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162", size = 2239223, upload-time = "2025-04-23T18:32:12.382Z" }, + { url = "https://files.pythonhosted.org/packages/58/4d/4f937099c545a8a17eb52cb67fe0447fd9a373b348ccfa9a87f141eeb00f/pydantic_core-2.33.2-cp313-cp313-win32.whl", hash = "sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849", size = 1900473, upload-time = "2025-04-23T18:32:14.034Z" }, + { url = 
"https://files.pythonhosted.org/packages/a0/75/4a0a9bac998d78d889def5e4ef2b065acba8cae8c93696906c3a91f310ca/pydantic_core-2.33.2-cp313-cp313-win_amd64.whl", hash = "sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9", size = 1955269, upload-time = "2025-04-23T18:32:15.783Z" }, + { url = "https://files.pythonhosted.org/packages/f9/86/1beda0576969592f1497b4ce8e7bc8cbdf614c352426271b1b10d5f0aa64/pydantic_core-2.33.2-cp313-cp313-win_arm64.whl", hash = "sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9", size = 1893921, upload-time = "2025-04-23T18:32:18.473Z" }, + { url = "https://files.pythonhosted.org/packages/a4/7d/e09391c2eebeab681df2b74bfe6c43422fffede8dc74187b2b0bf6fd7571/pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac", size = 1806162, upload-time = "2025-04-23T18:32:20.188Z" }, + { url = "https://files.pythonhosted.org/packages/f1/3d/847b6b1fed9f8ed3bb95a9ad04fbd0b212e832d4f0f50ff4d9ee5a9f15cf/pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5", size = 1981560, upload-time = "2025-04-23T18:32:22.354Z" }, + { url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" }, +] + +[[package]] +name = "pygments" +version = "2.19.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, +] + +[[package]] +name = "pyparsing" +version = "3.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bb/22/f1129e69d94ffff626bdb5c835506b3a5b4f3d070f17ea295e12c2c6f60f/pyparsing-3.2.3.tar.gz", hash = "sha256:b9c13f1ab8b3b542f72e28f634bad4de758ab3ce4546e4301970ad6fa77c38be", size = 1088608, upload-time = "2025-03-25T05:01:28.114Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl", hash = "sha256:a749938e02d6fd0b59b356ca504a24982314bb090c383e3cf201c95ef7e2bfcf", size = 111120, upload-time = "2025-03-25T05:01:24.908Z" }, +] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, +] + +[[package]] +name = "python-dotenv" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/f6/b0/4bc07ccd3572a2f9df7e6782f52b0c6c90dcbb803ac4a167702d7d0dfe1e/python_dotenv-1.1.1.tar.gz", hash = "sha256:a8a6399716257f45be6a007360200409fce5cda2661e3dec71d23dc15f6189ab", size = 41978, upload-time = "2025-06-24T04:21:07.341Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5f/ed/539768cf28c661b5b068d66d96a2f155c4971a5d55684a514c1a0e0dec2f/python_dotenv-1.1.1-py3-none-any.whl", hash = "sha256:31f23644fe2602f88ff55e1f5c79ba497e01224ee7737937930c448e4d0e24dc", size = 20556, upload-time = "2025-06-24T04:21:06.073Z" }, +] + +[[package]] +name = "pytz" +version = "2025.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, +] + +[[package]] +name = "pywin32" +version = "311" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/ab/01ea1943d4eba0f850c3c61e78e8dd59757ff815ff3ccd0a84de5f541f42/pywin32-311-cp312-cp312-win32.whl", hash = "sha256:750ec6e621af2b948540032557b10a2d43b0cee2ae9758c54154d711cc852d31", size = 8706543, upload-time = "2025-07-14T20:13:20.765Z" }, + { url = "https://files.pythonhosted.org/packages/d1/a8/a0e8d07d4d051ec7502cd58b291ec98dcc0c3fff027caad0470b72cfcc2f/pywin32-311-cp312-cp312-win_amd64.whl", hash = "sha256:b8c095edad5c211ff31c05223658e71bf7116daa0ecf3ad85f3201ea3190d067", size = 9495040, upload-time = "2025-07-14T20:13:22.543Z" }, + { 
url = "https://files.pythonhosted.org/packages/ba/3a/2ae996277b4b50f17d61f0603efd8253cb2d79cc7ae159468007b586396d/pywin32-311-cp312-cp312-win_arm64.whl", hash = "sha256:e286f46a9a39c4a18b319c28f59b61de793654af2f395c102b4f819e584b5852", size = 8710102, upload-time = "2025-07-14T20:13:24.682Z" }, + { url = "https://files.pythonhosted.org/packages/a5/be/3fd5de0979fcb3994bfee0d65ed8ca9506a8a1260651b86174f6a86f52b3/pywin32-311-cp313-cp313-win32.whl", hash = "sha256:f95ba5a847cba10dd8c4d8fefa9f2a6cf283b8b88ed6178fa8a6c1ab16054d0d", size = 8705700, upload-time = "2025-07-14T20:13:26.471Z" }, + { url = "https://files.pythonhosted.org/packages/e3/28/e0a1909523c6890208295a29e05c2adb2126364e289826c0a8bc7297bd5c/pywin32-311-cp313-cp313-win_amd64.whl", hash = "sha256:718a38f7e5b058e76aee1c56ddd06908116d35147e133427e59a3983f703a20d", size = 9494700, upload-time = "2025-07-14T20:13:28.243Z" }, + { url = "https://files.pythonhosted.org/packages/04/bf/90339ac0f55726dce7d794e6d79a18a91265bdf3aa70b6b9ca52f35e022a/pywin32-311-cp313-cp313-win_arm64.whl", hash = "sha256:7b4075d959648406202d92a2310cb990fea19b535c7f4a78d3f5e10b926eeb8a", size = 8709318, upload-time = "2025-07-14T20:13:30.348Z" }, + { url = "https://files.pythonhosted.org/packages/c9/31/097f2e132c4f16d99a22bfb777e0fd88bd8e1c634304e102f313af69ace5/pywin32-311-cp314-cp314-win32.whl", hash = "sha256:b7a2c10b93f8986666d0c803ee19b5990885872a7de910fc460f9b0c2fbf92ee", size = 8840714, upload-time = "2025-07-14T20:13:32.449Z" }, + { url = "https://files.pythonhosted.org/packages/90/4b/07c77d8ba0e01349358082713400435347df8426208171ce297da32c313d/pywin32-311-cp314-cp314-win_amd64.whl", hash = "sha256:3aca44c046bd2ed8c90de9cb8427f581c479e594e99b5c0bb19b29c10fd6cb87", size = 9656800, upload-time = "2025-07-14T20:13:34.312Z" }, + { url = "https://files.pythonhosted.org/packages/c0/d2/21af5c535501a7233e734b8af901574572da66fcc254cb35d0609c9080dd/pywin32-311-cp314-cp314-win_arm64.whl", hash = 
"sha256:a508e2d9025764a8270f93111a970e1d0fbfc33f4153b388bb649b7eec4f9b42", size = 8932540, upload-time = "2025-07-14T20:13:36.379Z" }, +] + +[[package]] +name = "pyyaml" +version = "6.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631, upload-time = "2024-08-06T20:33:50.674Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", size = 183873, upload-time = "2024-08-06T20:32:25.131Z" }, + { url = "https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", size = 173302, upload-time = "2024-08-06T20:32:26.511Z" }, + { url = "https://files.pythonhosted.org/packages/c3/93/9916574aa8c00aa06bbac729972eb1071d002b8e158bd0e83a3b9a20a1f7/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", size = 739154, upload-time = "2024-08-06T20:32:28.363Z" }, + { url = "https://files.pythonhosted.org/packages/95/0f/b8938f1cbd09739c6da569d172531567dbcc9789e0029aa070856f123984/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425", size = 766223, upload-time = "2024-08-06T20:32:30.058Z" }, + { url = 
"https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476", size = 767542, upload-time = "2024-08-06T20:32:31.881Z" }, + { url = "https://files.pythonhosted.org/packages/d4/00/dd137d5bcc7efea1836d6264f049359861cf548469d18da90cd8216cf05f/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48", size = 731164, upload-time = "2024-08-06T20:32:37.083Z" }, + { url = "https://files.pythonhosted.org/packages/c9/1f/4f998c900485e5c0ef43838363ba4a9723ac0ad73a9dc42068b12aaba4e4/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b", size = 756611, upload-time = "2024-08-06T20:32:38.898Z" }, + { url = "https://files.pythonhosted.org/packages/df/d1/f5a275fdb252768b7a11ec63585bc38d0e87c9e05668a139fea92b80634c/PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4", size = 140591, upload-time = "2024-08-06T20:32:40.241Z" }, + { url = "https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", size = 156338, upload-time = "2024-08-06T20:32:41.93Z" }, + { url = "https://files.pythonhosted.org/packages/ef/e3/3af305b830494fa85d95f6d95ef7fa73f2ee1cc8ef5b495c7c3269fb835f/PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba", size = 181309, upload-time = "2024-08-06T20:32:43.4Z" }, + { url = 
"https://files.pythonhosted.org/packages/45/9f/3b1c20a0b7a3200524eb0076cc027a970d320bd3a6592873c85c92a08731/PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1", size = 171679, upload-time = "2024-08-06T20:32:44.801Z" }, + { url = "https://files.pythonhosted.org/packages/7c/9a/337322f27005c33bcb656c655fa78325b730324c78620e8328ae28b64d0c/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133", size = 733428, upload-time = "2024-08-06T20:32:46.432Z" }, + { url = "https://files.pythonhosted.org/packages/a3/69/864fbe19e6c18ea3cc196cbe5d392175b4cf3d5d0ac1403ec3f2d237ebb5/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484", size = 763361, upload-time = "2024-08-06T20:32:51.188Z" }, + { url = "https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5", size = 759523, upload-time = "2024-08-06T20:32:53.019Z" }, + { url = "https://files.pythonhosted.org/packages/2b/b2/e3234f59ba06559c6ff63c4e10baea10e5e7df868092bf9ab40e5b9c56b6/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc", size = 726660, upload-time = "2024-08-06T20:32:54.708Z" }, + { url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597, upload-time = "2024-08-06T20:32:56.985Z" }, + { url = 
"https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527, upload-time = "2024-08-06T20:33:03.001Z" }, + { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = "2024-08-06T20:33:04.33Z" }, +] + +[[package]] +name = "pyzmq" +version = "27.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "implementation_name == 'pypy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f8/66/159f38d184f08b5f971b467f87b1ab142ab1320d5200825c824b32b84b66/pyzmq-27.0.2.tar.gz", hash = "sha256:b398dd713b18de89730447347e96a0240225e154db56e35b6bb8447ffdb07798", size = 281440, upload-time = "2025-08-21T04:23:26.334Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/69/b3a729e7b03e412bee2b1823ab8d22e20a92593634f664afd04c6c9d9ac0/pyzmq-27.0.2-cp312-abi3-macosx_10_15_universal2.whl", hash = "sha256:5da05e3c22c95e23bfc4afeee6ff7d4be9ff2233ad6cb171a0e8257cd46b169a", size = 1305910, upload-time = "2025-08-21T04:21:27.609Z" }, + { url = "https://files.pythonhosted.org/packages/15/b7/f6a6a285193d489b223c340b38ee03a673467cb54914da21c3d7849f1b10/pyzmq-27.0.2-cp312-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:4e4520577971d01d47e2559bb3175fce1be9103b18621bf0b241abe0a933d040", size = 895507, upload-time = "2025-08-21T04:21:29.005Z" }, + { url = "https://files.pythonhosted.org/packages/17/e6/c4ed2da5ef9182cde1b1f5d0051a986e76339d71720ec1a00be0b49275ad/pyzmq-27.0.2-cp312-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56d7de7bf73165b90bd25a8668659ccb134dd28449116bf3c7e9bab5cf8a8ec9", size = 
652670, upload-time = "2025-08-21T04:21:30.71Z" }, + { url = "https://files.pythonhosted.org/packages/0e/66/d781ab0636570d32c745c4e389b1c6b713115905cca69ab6233508622edd/pyzmq-27.0.2-cp312-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:340e7cddc32f147c6c00d116a3f284ab07ee63dbd26c52be13b590520434533c", size = 840581, upload-time = "2025-08-21T04:21:32.008Z" }, + { url = "https://files.pythonhosted.org/packages/a6/df/f24790caf565d72544f5c8d8500960b9562c1dc848d6f22f3c7e122e73d4/pyzmq-27.0.2-cp312-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ba95693f9df8bb4a9826464fb0fe89033936f35fd4a8ff1edff09a473570afa0", size = 1641931, upload-time = "2025-08-21T04:21:33.371Z" }, + { url = "https://files.pythonhosted.org/packages/65/65/77d27b19fc5e845367f9100db90b9fce924f611b14770db480615944c9c9/pyzmq-27.0.2-cp312-abi3-musllinux_1_2_i686.whl", hash = "sha256:ca42a6ce2d697537da34f77a1960d21476c6a4af3e539eddb2b114c3cf65a78c", size = 2021226, upload-time = "2025-08-21T04:21:35.301Z" }, + { url = "https://files.pythonhosted.org/packages/5b/65/1ed14421ba27a4207fa694772003a311d1142b7f543179e4d1099b7eb746/pyzmq-27.0.2-cp312-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:3e44e665d78a07214b2772ccbd4b9bcc6d848d7895f1b2d7653f047b6318a4f6", size = 1878047, upload-time = "2025-08-21T04:21:36.749Z" }, + { url = "https://files.pythonhosted.org/packages/dd/dc/e578549b89b40dc78a387ec471c2a360766690c0a045cd8d1877d401012d/pyzmq-27.0.2-cp312-abi3-win32.whl", hash = "sha256:272d772d116615397d2be2b1417b3b8c8bc8671f93728c2f2c25002a4530e8f6", size = 558757, upload-time = "2025-08-21T04:21:38.2Z" }, + { url = "https://files.pythonhosted.org/packages/b5/89/06600980aefcc535c758414da969f37a5194ea4cdb73b745223f6af3acfb/pyzmq-27.0.2-cp312-abi3-win_amd64.whl", hash = "sha256:734be4f44efba0aa69bf5f015ed13eb69ff29bf0d17ea1e21588b095a3147b8e", size = 619281, upload-time = "2025-08-21T04:21:39.909Z" }, + { url = 
"https://files.pythonhosted.org/packages/30/84/df8a5c089552d17c9941d1aea4314b606edf1b1622361dae89aacedc6467/pyzmq-27.0.2-cp312-abi3-win_arm64.whl", hash = "sha256:41f0bd56d9279392810950feb2785a419c2920bbf007fdaaa7f4a07332ae492d", size = 552680, upload-time = "2025-08-21T04:21:41.571Z" }, + { url = "https://files.pythonhosted.org/packages/b4/7b/b79e976508517ab80dc800f7021ef1fb602a6d55e4caa2d47fb3dca5d8b6/pyzmq-27.0.2-cp313-cp313-android_24_arm64_v8a.whl", hash = "sha256:7f01118133427cd7f34ee133b5098e2af5f70303fa7519785c007bca5aa6f96a", size = 1122259, upload-time = "2025-08-21T04:21:43.063Z" }, + { url = "https://files.pythonhosted.org/packages/2b/1c/777217b9940ebcb7e71c924184ca5f31e410580a58d9fd93798589f0d31c/pyzmq-27.0.2-cp313-cp313-android_24_x86_64.whl", hash = "sha256:e4b860edf6379a7234ccbb19b4ed2c57e3ff569c3414fadfb49ae72b61a8ef07", size = 1156113, upload-time = "2025-08-21T04:21:44.566Z" }, + { url = "https://files.pythonhosted.org/packages/59/7d/654657a4c6435f41538182e71b61eac386a789a2bbb6f30171915253a9a7/pyzmq-27.0.2-cp313-cp313t-macosx_10_15_universal2.whl", hash = "sha256:cb77923ea163156da14295c941930bd525df0d29c96c1ec2fe3c3806b1e17cb3", size = 1341437, upload-time = "2025-08-21T04:21:46.019Z" }, + { url = "https://files.pythonhosted.org/packages/20/a0/5ed7710037f9c096017adc748bcb1698674a2d297f8b9422d38816f7b56a/pyzmq-27.0.2-cp313-cp313t-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:61678b7407b04df8f9423f188156355dc94d0fb52d360ae79d02ed7e0d431eea", size = 897888, upload-time = "2025-08-21T04:21:47.362Z" }, + { url = "https://files.pythonhosted.org/packages/2c/8a/6e4699a60931c17e7406641d201d7f2c121e2a38979bc83226a6d8f1ba32/pyzmq-27.0.2-cp313-cp313t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e3c824b70925963bdc8e39a642672c15ffaa67e7d4b491f64662dd56d6271263", size = 660727, upload-time = "2025-08-21T04:21:48.734Z" }, + { url = 
"https://files.pythonhosted.org/packages/7b/d8/d761e438c186451bd89ce63a665cde5690c084b61cd8f5d7b51e966e875a/pyzmq-27.0.2-cp313-cp313t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c4833e02fcf2751975457be1dfa2f744d4d09901a8cc106acaa519d868232175", size = 848136, upload-time = "2025-08-21T04:21:50.416Z" }, + { url = "https://files.pythonhosted.org/packages/43/f1/a0f31684efdf3eb92f46b7dd2117e752208115e89d278f8ca5f413c5bb85/pyzmq-27.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b18045668d09cf0faa44918af2a67f0dbbef738c96f61c2f1b975b1ddb92ccfc", size = 1650402, upload-time = "2025-08-21T04:21:52.235Z" }, + { url = "https://files.pythonhosted.org/packages/41/fd/0d7f2a1732812df02c85002770da4a7864c79b210084bcdab01ea57e8d92/pyzmq-27.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:bbbb7e2f3ac5a22901324e7b086f398b8e16d343879a77b15ca3312e8cd8e6d5", size = 2024587, upload-time = "2025-08-21T04:21:54.07Z" }, + { url = "https://files.pythonhosted.org/packages/f1/73/358be69e279a382dd09e46dda29df8446365cddee4f79ef214e71e5b2b5a/pyzmq-27.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:b751914a73604d40d88a061bab042a11d4511b3ddbb7624cd83c39c8a498564c", size = 1885493, upload-time = "2025-08-21T04:21:55.588Z" }, + { url = "https://files.pythonhosted.org/packages/c5/7b/e9951ad53b3dfed8cfb4c2cfd6e0097c9b454e5c0d0e6df5f2b60d7c8c3d/pyzmq-27.0.2-cp313-cp313t-win32.whl", hash = "sha256:3e8f833dd82af11db5321c414638045c70f61009f72dd61c88db4a713c1fb1d2", size = 574934, upload-time = "2025-08-21T04:21:57.52Z" }, + { url = "https://files.pythonhosted.org/packages/55/33/1a7fc3a92f2124a63e6e2a6afa0af471a5c0c713e776b476d4eda5111b13/pyzmq-27.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:5b45153cb8eadcab14139970643a84f7a7b08dda541fbc1f6f4855c49334b549", size = 640932, upload-time = "2025-08-21T04:21:59.527Z" }, + { url = 
"https://files.pythonhosted.org/packages/2a/52/2598a94ac251a7c83f3887866225eea1952b0d4463a68df5032eb00ff052/pyzmq-27.0.2-cp313-cp313t-win_arm64.whl", hash = "sha256:86898f5c9730df23427c1ee0097d8aa41aa5f89539a79e48cd0d2c22d059f1b7", size = 561315, upload-time = "2025-08-21T04:22:01.295Z" }, + { url = "https://files.pythonhosted.org/packages/42/7d/10ef02ea36590b29d48ef88eb0831f0af3eb240cccca2752556faec55f59/pyzmq-27.0.2-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:d2b4b261dce10762be5c116b6ad1f267a9429765b493c454f049f33791dd8b8a", size = 1341463, upload-time = "2025-08-21T04:22:02.712Z" }, + { url = "https://files.pythonhosted.org/packages/94/36/115d18dade9a3d4d3d08dd8bfe5459561b8e02815f99df040555fdd7768e/pyzmq-27.0.2-cp314-cp314t-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:4e4d88b6cff156fed468903006b24bbd85322612f9c2f7b96e72d5016fd3f543", size = 897840, upload-time = "2025-08-21T04:22:04.845Z" }, + { url = "https://files.pythonhosted.org/packages/39/66/083b37839b95c386a95f1537bb41bdbf0c002b7c55b75ee737949cecb11f/pyzmq-27.0.2-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8426c0ebbc11ed8416a6e9409c194142d677c2c5c688595f2743664e356d9e9b", size = 660704, upload-time = "2025-08-21T04:22:06.389Z" }, + { url = "https://files.pythonhosted.org/packages/76/5a/196ab46e549ba35bf3268f575e10cfac0dc86b78dcaa7a3e36407ecda752/pyzmq-27.0.2-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:565bee96a155fe6452caed5fb5f60c9862038e6b51a59f4f632562081cdb4004", size = 848037, upload-time = "2025-08-21T04:22:07.817Z" }, + { url = "https://files.pythonhosted.org/packages/70/ea/a27b9eb44b2e615a9ecb8510ebb023cc1d2d251181e4a1e50366bfbf94d6/pyzmq-27.0.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5de735c745ca5cefe9c2d1547d8f28cfe1b1926aecb7483ab1102fd0a746c093", size = 1650278, upload-time = "2025-08-21T04:22:09.269Z" }, + { url = 
"https://files.pythonhosted.org/packages/62/ac/3e9af036bfaf718ab5e69ded8f6332da392c5450ad43e8e3ca66797f145a/pyzmq-27.0.2-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:ea4f498f8115fd90d7bf03a3e83ae3e9898e43362f8e8e8faec93597206e15cc", size = 2024504, upload-time = "2025-08-21T04:22:10.778Z" }, + { url = "https://files.pythonhosted.org/packages/ae/e9/3202d31788df8ebaa176b23d846335eb9c768d8b43c0506bbd6265ad36a0/pyzmq-27.0.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d00e81cb0afd672915257a3927124ee2ad117ace3c256d39cd97ca3f190152ad", size = 1885381, upload-time = "2025-08-21T04:22:12.718Z" }, + { url = "https://files.pythonhosted.org/packages/4b/ed/42de80b7ab4e8fcf13376f81206cf8041740672ac1fd2e1c598d63f595bf/pyzmq-27.0.2-cp314-cp314t-win32.whl", hash = "sha256:0f6e9b00d81b58f859fffc112365d50413954e02aefe36c5b4c8fb4af79f8cc3", size = 587526, upload-time = "2025-08-21T04:22:14.18Z" }, + { url = "https://files.pythonhosted.org/packages/ed/c8/8f3c72d6f0bfbf090aa5e283576073ca5c59839b85a5cc8c66ddb9b59801/pyzmq-27.0.2-cp314-cp314t-win_amd64.whl", hash = "sha256:2e73cf3b127a437fef4100eb3ac2ebe6b49e655bb721329f667f59eca0a26221", size = 661368, upload-time = "2025-08-21T04:22:15.677Z" }, + { url = "https://files.pythonhosted.org/packages/69/a4/7ee652ea1c77d872f5d99ed937fa8bbd1f6f4b7a39a6d3a0076c286e0c3e/pyzmq-27.0.2-cp314-cp314t-win_arm64.whl", hash = "sha256:4108785f2e5ac865d06f678a07a1901e3465611356df21a545eeea8b45f56265", size = 574901, upload-time = "2025-08-21T04:22:17.423Z" }, +] + +[[package]] +name = "ray" +version = "2.48.0" +source = { url = "http://localhost:9478/ray/ray-2.48.0-cp312-cp312-manylinux2014_x86_64.whl" } +dependencies = [ + { name = "click" }, + { name = "filelock" }, + { name = "jsonschema" }, + { name = "msgpack" }, + { name = "packaging" }, + { name = "protobuf" }, + { name = "pyyaml" }, + { name = "requests" }, +] +wheels = [ + { url = "http://localhost:9478/ray/ray-2.48.0-cp312-cp312-manylinux2014_x86_64.whl", hash = 
"sha256:f93c6f2e7a91d5dfc6442c390005509c933dbc49d544d9b73bd81687529f4b57" }, +] + +[package.optional-dependencies] +data = [ + { name = "fsspec" }, + { name = "numpy" }, + { name = "pandas" }, + { name = "polars" }, + { name = "pyarrow" }, +] +serve = [ + { name = "aiohttp" }, + { name = "aiohttp-cors" }, + { name = "colorful" }, + { name = "fastapi" }, + { name = "grpcio" }, + { name = "opencensus" }, + { name = "opentelemetry-exporter-prometheus" }, + { name = "opentelemetry-proto" }, + { name = "opentelemetry-sdk" }, + { name = "prometheus-client" }, + { name = "py-spy" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "smart-open" }, + { name = "starlette" }, + { name = "uvicorn", extra = ["standard"] }, + { name = "virtualenv" }, + { name = "watchfiles" }, +] +train = [ + { name = "fsspec" }, + { name = "pandas" }, + { name = "pyarrow" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "tensorboardx" }, +] +tune = [ + { name = "fsspec" }, + { name = "pandas" }, + { name = "pyarrow" }, + { name = "requests" }, + { name = "tensorboardx" }, +] + +[package.metadata] +requires-dist = [ + { name = "aiohttp", marker = "extra == 'air'", specifier = ">=3.7" }, + { name = "aiohttp", marker = "extra == 'all'", specifier = ">=3.7" }, + { name = "aiohttp", marker = "extra == 'default'", specifier = ">=3.7" }, + { name = "aiohttp", marker = "extra == 'llm'", specifier = ">=3.7" }, + { name = "aiohttp", marker = "extra == 'serve'", specifier = ">=3.7" }, + { name = "aiohttp", marker = "extra == 'serve-grpc'", specifier = ">=3.7" }, + { name = "aiohttp-cors", marker = "extra == 'air'" }, + { name = "aiohttp-cors", marker = "extra == 'all'" }, + { name = "aiohttp-cors", marker = "extra == 'default'" }, + { name = "aiohttp-cors", marker = "extra == 'llm'" }, + { name = "aiohttp-cors", marker = "extra == 'serve'" }, + { name = "aiohttp-cors", marker = "extra == 'serve-grpc'" }, + { name = "async-timeout", marker = "python_full_version < '3.11' and 
extra == 'llm'" }, + { name = "click", specifier = ">=7.0" }, + { name = "colorful", marker = "extra == 'air'" }, + { name = "colorful", marker = "extra == 'all'" }, + { name = "colorful", marker = "extra == 'default'" }, + { name = "colorful", marker = "extra == 'llm'" }, + { name = "colorful", marker = "extra == 'serve'" }, + { name = "colorful", marker = "extra == 'serve-grpc'" }, + { name = "cupy-cuda12x", marker = "sys_platform != 'darwin' and extra == 'adag'" }, + { name = "cupy-cuda12x", marker = "sys_platform != 'darwin' and extra == 'all'" }, + { name = "cupy-cuda12x", marker = "sys_platform != 'darwin' and extra == 'cgraph'" }, + { name = "dm-tree", marker = "extra == 'all'" }, + { name = "dm-tree", marker = "extra == 'rllib'" }, + { name = "fastapi", marker = "extra == 'air'" }, + { name = "fastapi", marker = "extra == 'all'" }, + { name = "fastapi", marker = "extra == 'llm'" }, + { name = "fastapi", marker = "extra == 'serve'" }, + { name = "fastapi", marker = "extra == 'serve-grpc'" }, + { name = "filelock" }, + { name = "fsspec", marker = "extra == 'air'" }, + { name = "fsspec", marker = "extra == 'all'" }, + { name = "fsspec", marker = "extra == 'data'" }, + { name = "fsspec", marker = "extra == 'llm'" }, + { name = "fsspec", marker = "extra == 'rllib'" }, + { name = "fsspec", marker = "extra == 'train'" }, + { name = "fsspec", marker = "extra == 'tune'" }, + { name = "grpcio", marker = "python_full_version >= '3.10' and extra == 'air'", specifier = ">=1.42.0" }, + { name = "grpcio", marker = "python_full_version >= '3.10' and extra == 'all'", specifier = ">=1.42.0" }, + { name = "grpcio", marker = "python_full_version >= '3.10' and extra == 'default'", specifier = ">=1.42.0" }, + { name = "grpcio", marker = "python_full_version >= '3.10' and extra == 'llm'", specifier = ">=1.42.0" }, + { name = "grpcio", marker = "python_full_version >= '3.10' and extra == 'serve'", specifier = ">=1.42.0" }, + { name = "grpcio", marker = "python_full_version >= 
'3.10' and extra == 'serve-grpc'", specifier = ">=1.42.0" }, + { name = "grpcio", marker = "python_full_version < '3.10' and extra == 'air'", specifier = ">=1.32.0" }, + { name = "grpcio", marker = "python_full_version < '3.10' and extra == 'all'", specifier = ">=1.32.0" }, + { name = "grpcio", marker = "python_full_version < '3.10' and extra == 'default'", specifier = ">=1.32.0" }, + { name = "grpcio", marker = "python_full_version < '3.10' and extra == 'llm'", specifier = ">=1.32.0" }, + { name = "grpcio", marker = "python_full_version < '3.10' and extra == 'serve'", specifier = ">=1.32.0" }, + { name = "grpcio", marker = "python_full_version < '3.10' and extra == 'serve-grpc'", specifier = ">=1.32.0" }, + { name = "grpcio", marker = "sys_platform == 'darwin' and extra == 'all'", specifier = "!=1.56.0" }, + { name = "grpcio", marker = "sys_platform == 'darwin' and extra == 'client'", specifier = "!=1.56.0" }, + { name = "grpcio", marker = "extra == 'all'" }, + { name = "grpcio", marker = "extra == 'client'" }, + { name = "gymnasium", marker = "extra == 'all'", specifier = "==1.0.0" }, + { name = "gymnasium", marker = "extra == 'rllib'", specifier = "==1.0.0" }, + { name = "jsonref", marker = "extra == 'llm'", specifier = ">=1.1.0" }, + { name = "jsonschema" }, + { name = "jsonschema", marker = "extra == 'llm'" }, + { name = "lz4", marker = "extra == 'all'" }, + { name = "lz4", marker = "extra == 'rllib'" }, + { name = "memray", marker = "sys_platform != 'win32' and extra == 'all'" }, + { name = "memray", marker = "sys_platform != 'win32' and extra == 'observability'" }, + { name = "msgpack", specifier = ">=1.0.0,<2.0.0" }, + { name = "ninja", marker = "extra == 'llm'" }, + { name = "numpy", marker = "extra == 'air'", specifier = ">=1.20" }, + { name = "numpy", marker = "extra == 'all'", specifier = ">=1.20" }, + { name = "numpy", marker = "extra == 'data'", specifier = ">=1.20" }, + { name = "numpy", marker = "extra == 'llm'", specifier = ">=1.20" }, + { name = 
"opencensus", marker = "extra == 'air'" }, + { name = "opencensus", marker = "extra == 'all'" }, + { name = "opencensus", marker = "extra == 'default'" }, + { name = "opencensus", marker = "extra == 'llm'" }, + { name = "opencensus", marker = "extra == 'serve'" }, + { name = "opencensus", marker = "extra == 'serve-grpc'" }, + { name = "opentelemetry-exporter-prometheus", marker = "extra == 'air'" }, + { name = "opentelemetry-exporter-prometheus", marker = "extra == 'all'" }, + { name = "opentelemetry-exporter-prometheus", marker = "extra == 'default'" }, + { name = "opentelemetry-exporter-prometheus", marker = "extra == 'llm'" }, + { name = "opentelemetry-exporter-prometheus", marker = "extra == 'serve'" }, + { name = "opentelemetry-exporter-prometheus", marker = "extra == 'serve-grpc'" }, + { name = "opentelemetry-proto", marker = "extra == 'air'" }, + { name = "opentelemetry-proto", marker = "extra == 'all'" }, + { name = "opentelemetry-proto", marker = "extra == 'default'" }, + { name = "opentelemetry-proto", marker = "extra == 'llm'" }, + { name = "opentelemetry-proto", marker = "extra == 'serve'" }, + { name = "opentelemetry-proto", marker = "extra == 'serve-grpc'" }, + { name = "opentelemetry-sdk", marker = "extra == 'air'", specifier = ">=1.30.0" }, + { name = "opentelemetry-sdk", marker = "extra == 'all'", specifier = ">=1.30.0" }, + { name = "opentelemetry-sdk", marker = "extra == 'default'", specifier = ">=1.30.0" }, + { name = "opentelemetry-sdk", marker = "extra == 'llm'", specifier = ">=1.30.0" }, + { name = "opentelemetry-sdk", marker = "extra == 'serve'", specifier = ">=1.30.0" }, + { name = "opentelemetry-sdk", marker = "extra == 'serve-grpc'", specifier = ">=1.30.0" }, + { name = "ormsgpack", marker = "extra == 'all'", specifier = "==1.7.0" }, + { name = "ormsgpack", marker = "extra == 'rllib'", specifier = "==1.7.0" }, + { name = "packaging" }, + { name = "pandas", marker = "extra == 'air'" }, + { name = "pandas", marker = "extra == 'air'", 
specifier = ">=1.3" }, + { name = "pandas", marker = "extra == 'all'" }, + { name = "pandas", marker = "extra == 'all'", specifier = ">=1.3" }, + { name = "pandas", marker = "extra == 'data'", specifier = ">=1.3" }, + { name = "pandas", marker = "extra == 'llm'", specifier = ">=1.3" }, + { name = "pandas", marker = "extra == 'rllib'" }, + { name = "pandas", marker = "extra == 'train'" }, + { name = "pandas", marker = "extra == 'tune'" }, + { name = "polars", marker = "extra == 'air'", specifier = ">=1.30.0,<2.0.0" }, + { name = "polars", marker = "extra == 'all'", specifier = ">=1.30.0,<2.0.0" }, + { name = "polars", marker = "extra == 'data'", specifier = ">=1.30.0,<2.0.0" }, + { name = "polars", marker = "extra == 'llm'", specifier = ">=1.30.0,<2.0.0" }, + { name = "prometheus-client", marker = "extra == 'air'", specifier = ">=0.7.1" }, + { name = "prometheus-client", marker = "extra == 'all'", specifier = ">=0.7.1" }, + { name = "prometheus-client", marker = "extra == 'default'", specifier = ">=0.7.1" }, + { name = "prometheus-client", marker = "extra == 'llm'", specifier = ">=0.7.1" }, + { name = "prometheus-client", marker = "extra == 'serve'", specifier = ">=0.7.1" }, + { name = "prometheus-client", marker = "extra == 'serve-grpc'", specifier = ">=0.7.1" }, + { name = "protobuf", specifier = ">=3.15.3,!=3.19.5" }, + { name = "py-spy", marker = "python_full_version >= '3.12' and extra == 'air'", specifier = ">=0.4.0" }, + { name = "py-spy", marker = "python_full_version >= '3.12' and extra == 'all'", specifier = ">=0.4.0" }, + { name = "py-spy", marker = "python_full_version >= '3.12' and extra == 'default'", specifier = ">=0.4.0" }, + { name = "py-spy", marker = "python_full_version >= '3.12' and extra == 'llm'", specifier = ">=0.4.0" }, + { name = "py-spy", marker = "python_full_version >= '3.12' and extra == 'serve'", specifier = ">=0.4.0" }, + { name = "py-spy", marker = "python_full_version >= '3.12' and extra == 'serve-grpc'", specifier = ">=0.4.0" }, + 
{ name = "py-spy", marker = "python_full_version < '3.12' and extra == 'air'", specifier = ">=0.2.0" }, + { name = "py-spy", marker = "python_full_version < '3.12' and extra == 'all'", specifier = ">=0.2.0" }, + { name = "py-spy", marker = "python_full_version < '3.12' and extra == 'default'", specifier = ">=0.2.0" }, + { name = "py-spy", marker = "python_full_version < '3.12' and extra == 'llm'", specifier = ">=0.2.0" }, + { name = "py-spy", marker = "python_full_version < '3.12' and extra == 'serve'", specifier = ">=0.2.0" }, + { name = "py-spy", marker = "python_full_version < '3.12' and extra == 'serve-grpc'", specifier = ">=0.2.0" }, + { name = "pyarrow", marker = "extra == 'air'", specifier = ">=9.0.0" }, + { name = "pyarrow", marker = "extra == 'all'", specifier = ">=9.0.0" }, + { name = "pyarrow", marker = "extra == 'data'", specifier = ">=9.0.0" }, + { name = "pyarrow", marker = "extra == 'llm'", specifier = ">=9.0.0" }, + { name = "pyarrow", marker = "extra == 'rllib'", specifier = ">=9.0.0" }, + { name = "pyarrow", marker = "extra == 'train'", specifier = ">=9.0.0" }, + { name = "pyarrow", marker = "extra == 'tune'", specifier = ">=9.0.0" }, + { name = "pydantic", marker = "extra == 'air'", specifier = "!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3" }, + { name = "pydantic", marker = "extra == 'all'", specifier = "!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3" }, + { name = "pydantic", marker = "extra == 'default'", specifier = "!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3" }, + { name = "pydantic", marker = "extra == 'llm'", specifier = "!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3" }, + { name = "pydantic", marker = "extra == 'serve'", specifier = "!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3" }, + { name = "pydantic", marker = "extra == 'serve-grpc'", specifier = "!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3" }, + { name = "pydantic", marker = "extra == 'train'", specifier = "!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3" }, + { name = "pyopenssl", marker = "extra == 
'all'" }, + { name = "pyopenssl", marker = "extra == 'serve-grpc'" }, + { name = "pyyaml" }, + { name = "pyyaml", marker = "extra == 'all'" }, + { name = "pyyaml", marker = "extra == 'rllib'" }, + { name = "requests" }, + { name = "requests", marker = "extra == 'air'" }, + { name = "requests", marker = "extra == 'all'" }, + { name = "requests", marker = "extra == 'default'" }, + { name = "requests", marker = "extra == 'llm'" }, + { name = "requests", marker = "extra == 'rllib'" }, + { name = "requests", marker = "extra == 'serve'" }, + { name = "requests", marker = "extra == 'serve-grpc'" }, + { name = "requests", marker = "extra == 'train'" }, + { name = "requests", marker = "extra == 'tune'" }, + { name = "scipy", marker = "extra == 'all'" }, + { name = "scipy", marker = "extra == 'rllib'" }, + { name = "smart-open", marker = "extra == 'air'" }, + { name = "smart-open", marker = "extra == 'all'" }, + { name = "smart-open", marker = "extra == 'default'" }, + { name = "smart-open", marker = "extra == 'llm'" }, + { name = "smart-open", marker = "extra == 'serve'" }, + { name = "smart-open", marker = "extra == 'serve-grpc'" }, + { name = "starlette", marker = "extra == 'air'" }, + { name = "starlette", marker = "extra == 'all'" }, + { name = "starlette", marker = "extra == 'llm'" }, + { name = "starlette", marker = "extra == 'serve'" }, + { name = "starlette", marker = "extra == 'serve-grpc'" }, + { name = "tensorboardx", marker = "extra == 'air'", specifier = ">=1.9" }, + { name = "tensorboardx", marker = "extra == 'all'", specifier = ">=1.9" }, + { name = "tensorboardx", marker = "extra == 'rllib'", specifier = ">=1.9" }, + { name = "tensorboardx", marker = "extra == 'train'", specifier = ">=1.9" }, + { name = "tensorboardx", marker = "extra == 'tune'", specifier = ">=1.9" }, + { name = "typer", marker = "extra == 'llm'" }, + { name = "uvicorn", extras = ["standard"], marker = "extra == 'air'" }, + { name = "uvicorn", extras = ["standard"], marker = "extra == 
'all'" }, + { name = "uvicorn", extras = ["standard"], marker = "extra == 'llm'" }, + { name = "uvicorn", extras = ["standard"], marker = "extra == 'serve'" }, + { name = "uvicorn", extras = ["standard"], marker = "extra == 'serve-grpc'" }, + { name = "virtualenv", marker = "extra == 'air'", specifier = ">=20.0.24,!=20.21.1" }, + { name = "virtualenv", marker = "extra == 'all'", specifier = ">=20.0.24,!=20.21.1" }, + { name = "virtualenv", marker = "extra == 'default'", specifier = ">=20.0.24,!=20.21.1" }, + { name = "virtualenv", marker = "extra == 'llm'", specifier = ">=20.0.24,!=20.21.1" }, + { name = "virtualenv", marker = "extra == 'serve'", specifier = ">=20.0.24,!=20.21.1" }, + { name = "virtualenv", marker = "extra == 'serve-grpc'", specifier = ">=20.0.24,!=20.21.1" }, + { name = "vllm", marker = "extra == 'llm'", specifier = ">=0.9.2" }, + { name = "watchfiles", marker = "extra == 'air'" }, + { name = "watchfiles", marker = "extra == 'all'" }, + { name = "watchfiles", marker = "extra == 'llm'" }, + { name = "watchfiles", marker = "extra == 'serve'" }, + { name = "watchfiles", marker = "extra == 'serve-grpc'" }, +] +provides-extras = ["cgraph", "client", "data", "default", "observability", "serve", "tune", "adag", "serve-grpc", "rllib", "train", "air", "all", "llm"] + +[[package]] +name = "referencing" +version = "0.36.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "rpds-py" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2f/db/98b5c277be99dd18bfd91dd04e1b759cad18d1a338188c936e92f921c7e2/referencing-0.36.2.tar.gz", hash = "sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa", size = 74744, upload-time = "2025-01-25T08:48:16.138Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/c1/b1/3baf80dc6d2b7bc27a95a67752d0208e410351e3feb4eb78de5f77454d8d/referencing-0.36.2-py3-none-any.whl", hash = "sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0", size = 26775, upload-time = "2025-01-25T08:48:14.241Z" }, +] + +[[package]] +name = "regex" +version = "2025.7.34" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0b/de/e13fa6dc61d78b30ba47481f99933a3b49a57779d625c392d8036770a60d/regex-2025.7.34.tar.gz", hash = "sha256:9ead9765217afd04a86822dfcd4ed2747dfe426e887da413b15ff0ac2457e21a", size = 400714, upload-time = "2025-07-31T00:21:16.262Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/f0/31d62596c75a33f979317658e8d261574785c6cd8672c06741ce2e2e2070/regex-2025.7.34-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:7f7211a746aced993bef487de69307a38c5ddd79257d7be83f7b202cb59ddb50", size = 485492, upload-time = "2025-07-31T00:19:35.57Z" }, + { url = "https://files.pythonhosted.org/packages/d8/16/b818d223f1c9758c3434be89aa1a01aae798e0e0df36c1f143d1963dd1ee/regex-2025.7.34-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fb31080f2bd0681484b275461b202b5ad182f52c9ec606052020fe13eb13a72f", size = 290000, upload-time = "2025-07-31T00:19:37.175Z" }, + { url = "https://files.pythonhosted.org/packages/cd/70/69506d53397b4bd6954061bae75677ad34deb7f6ca3ba199660d6f728ff5/regex-2025.7.34-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0200a5150c4cf61e407038f4b4d5cdad13e86345dac29ff9dab3d75d905cf130", size = 286072, upload-time = "2025-07-31T00:19:38.612Z" }, + { url = "https://files.pythonhosted.org/packages/b0/73/536a216d5f66084fb577bb0543b5cb7de3272eb70a157f0c3a542f1c2551/regex-2025.7.34-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:739a74970e736df0773788377969c9fea3876c2fc13d0563f98e5503e5185f46", size = 797341, upload-time = "2025-07-31T00:19:40.119Z" }, + 
{ url = "https://files.pythonhosted.org/packages/26/af/733f8168449e56e8f404bb807ea7189f59507cbea1b67a7bbcd92f8bf844/regex-2025.7.34-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4fef81b2f7ea6a2029161ed6dea9ae13834c28eb5a95b8771828194a026621e4", size = 862556, upload-time = "2025-07-31T00:19:41.556Z" }, + { url = "https://files.pythonhosted.org/packages/19/dd/59c464d58c06c4f7d87de4ab1f590e430821345a40c5d345d449a636d15f/regex-2025.7.34-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ea74cf81fe61a7e9d77989050d0089a927ab758c29dac4e8e1b6c06fccf3ebf0", size = 910762, upload-time = "2025-07-31T00:19:43Z" }, + { url = "https://files.pythonhosted.org/packages/37/a8/b05ccf33ceca0815a1e253693b2c86544932ebcc0049c16b0fbdf18b688b/regex-2025.7.34-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e4636a7f3b65a5f340ed9ddf53585c42e3ff37101d383ed321bfe5660481744b", size = 801892, upload-time = "2025-07-31T00:19:44.645Z" }, + { url = "https://files.pythonhosted.org/packages/5f/9a/b993cb2e634cc22810afd1652dba0cae156c40d4864285ff486c73cd1996/regex-2025.7.34-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6cef962d7834437fe8d3da6f9bfc6f93f20f218266dcefec0560ed7765f5fe01", size = 786551, upload-time = "2025-07-31T00:19:46.127Z" }, + { url = "https://files.pythonhosted.org/packages/2d/79/7849d67910a0de4e26834b5bb816e028e35473f3d7ae563552ea04f58ca2/regex-2025.7.34-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:cbe1698e5b80298dbce8df4d8d1182279fbdaf1044e864cbc9d53c20e4a2be77", size = 856457, upload-time = "2025-07-31T00:19:47.562Z" }, + { url = "https://files.pythonhosted.org/packages/91/c6/de516bc082524b27e45cb4f54e28bd800c01efb26d15646a65b87b13a91e/regex-2025.7.34-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:32b9f9bcf0f605eb094b08e8da72e44badabb63dde6b83bd530580b488d1c6da", size = 848902, upload-time = 
"2025-07-31T00:19:49.312Z" }, + { url = "https://files.pythonhosted.org/packages/7d/22/519ff8ba15f732db099b126f039586bd372da6cd4efb810d5d66a5daeda1/regex-2025.7.34-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:524c868ba527eab4e8744a9287809579f54ae8c62fbf07d62aacd89f6026b282", size = 788038, upload-time = "2025-07-31T00:19:50.794Z" }, + { url = "https://files.pythonhosted.org/packages/3f/7d/aabb467d8f57d8149895d133c88eb809a1a6a0fe262c1d508eb9dfabb6f9/regex-2025.7.34-cp312-cp312-win32.whl", hash = "sha256:d600e58ee6d036081c89696d2bdd55d507498a7180df2e19945c6642fac59588", size = 264417, upload-time = "2025-07-31T00:19:52.292Z" }, + { url = "https://files.pythonhosted.org/packages/3b/39/bd922b55a4fc5ad5c13753274e5b536f5b06ec8eb9747675668491c7ab7a/regex-2025.7.34-cp312-cp312-win_amd64.whl", hash = "sha256:9a9ab52a466a9b4b91564437b36417b76033e8778e5af8f36be835d8cb370d62", size = 275387, upload-time = "2025-07-31T00:19:53.593Z" }, + { url = "https://files.pythonhosted.org/packages/f7/3c/c61d2fdcecb754a40475a3d1ef9a000911d3e3fc75c096acf44b0dfb786a/regex-2025.7.34-cp312-cp312-win_arm64.whl", hash = "sha256:c83aec91af9c6fbf7c743274fd952272403ad9a9db05fe9bfc9df8d12b45f176", size = 268482, upload-time = "2025-07-31T00:19:55.183Z" }, + { url = "https://files.pythonhosted.org/packages/15/16/b709b2119975035169a25aa8e4940ca177b1a2e25e14f8d996d09130368e/regex-2025.7.34-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c3c9740a77aeef3f5e3aaab92403946a8d34437db930a0280e7e81ddcada61f5", size = 485334, upload-time = "2025-07-31T00:19:56.58Z" }, + { url = "https://files.pythonhosted.org/packages/94/a6/c09136046be0595f0331bc58a0e5f89c2d324cf734e0b0ec53cf4b12a636/regex-2025.7.34-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:69ed3bc611540f2ea70a4080f853741ec698be556b1df404599f8724690edbcd", size = 289942, upload-time = "2025-07-31T00:19:57.943Z" }, + { url = 
"https://files.pythonhosted.org/packages/36/91/08fc0fd0f40bdfb0e0df4134ee37cfb16e66a1044ac56d36911fd01c69d2/regex-2025.7.34-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d03c6f9dcd562c56527c42b8530aad93193e0b3254a588be1f2ed378cdfdea1b", size = 285991, upload-time = "2025-07-31T00:19:59.837Z" }, + { url = "https://files.pythonhosted.org/packages/be/2f/99dc8f6f756606f0c214d14c7b6c17270b6bbe26d5c1f05cde9dbb1c551f/regex-2025.7.34-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6164b1d99dee1dfad33f301f174d8139d4368a9fb50bf0a3603b2eaf579963ad", size = 797415, upload-time = "2025-07-31T00:20:01.668Z" }, + { url = "https://files.pythonhosted.org/packages/62/cf/2fcdca1110495458ba4e95c52ce73b361cf1cafd8a53b5c31542cde9a15b/regex-2025.7.34-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1e4f4f62599b8142362f164ce776f19d79bdd21273e86920a7b604a4275b4f59", size = 862487, upload-time = "2025-07-31T00:20:03.142Z" }, + { url = "https://files.pythonhosted.org/packages/90/38/899105dd27fed394e3fae45607c1983e138273ec167e47882fc401f112b9/regex-2025.7.34-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:72a26dcc6a59c057b292f39d41465d8233a10fd69121fa24f8f43ec6294e5415", size = 910717, upload-time = "2025-07-31T00:20:04.727Z" }, + { url = "https://files.pythonhosted.org/packages/ee/f6/4716198dbd0bcc9c45625ac4c81a435d1c4d8ad662e8576dac06bab35b17/regex-2025.7.34-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d5273fddf7a3e602695c92716c420c377599ed3c853ea669c1fe26218867002f", size = 801943, upload-time = "2025-07-31T00:20:07.1Z" }, + { url = "https://files.pythonhosted.org/packages/40/5d/cff8896d27e4e3dd11dd72ac78797c7987eb50fe4debc2c0f2f1682eb06d/regex-2025.7.34-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c1844be23cd40135b3a5a4dd298e1e0c0cb36757364dd6cdc6025770363e06c1", size = 
786664, upload-time = "2025-07-31T00:20:08.818Z" }, + { url = "https://files.pythonhosted.org/packages/10/29/758bf83cf7b4c34f07ac3423ea03cee3eb3176941641e4ccc05620f6c0b8/regex-2025.7.34-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:dde35e2afbbe2272f8abee3b9fe6772d9b5a07d82607b5788e8508974059925c", size = 856457, upload-time = "2025-07-31T00:20:10.328Z" }, + { url = "https://files.pythonhosted.org/packages/d7/30/c19d212b619963c5b460bfed0ea69a092c6a43cba52a973d46c27b3e2975/regex-2025.7.34-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:f3f6e8e7af516a7549412ce57613e859c3be27d55341a894aacaa11703a4c31a", size = 849008, upload-time = "2025-07-31T00:20:11.823Z" }, + { url = "https://files.pythonhosted.org/packages/9e/b8/3c35da3b12c87e3cc00010ef6c3a4ae787cff0bc381aa3d251def219969a/regex-2025.7.34-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:469142fb94a869beb25b5f18ea87646d21def10fbacb0bcb749224f3509476f0", size = 788101, upload-time = "2025-07-31T00:20:13.729Z" }, + { url = "https://files.pythonhosted.org/packages/47/80/2f46677c0b3c2b723b2c358d19f9346e714113865da0f5f736ca1a883bde/regex-2025.7.34-cp313-cp313-win32.whl", hash = "sha256:da7507d083ee33ccea1310447410c27ca11fb9ef18c95899ca57ff60a7e4d8f1", size = 264401, upload-time = "2025-07-31T00:20:15.233Z" }, + { url = "https://files.pythonhosted.org/packages/be/fa/917d64dd074682606a003cba33585c28138c77d848ef72fc77cbb1183849/regex-2025.7.34-cp313-cp313-win_amd64.whl", hash = "sha256:9d644de5520441e5f7e2db63aec2748948cc39ed4d7a87fd5db578ea4043d997", size = 275368, upload-time = "2025-07-31T00:20:16.711Z" }, + { url = "https://files.pythonhosted.org/packages/65/cd/f94383666704170a2154a5df7b16be28f0c27a266bffcd843e58bc84120f/regex-2025.7.34-cp313-cp313-win_arm64.whl", hash = "sha256:7bf1c5503a9f2cbd2f52d7e260acb3131b07b6273c470abb78568174fe6bde3f", size = 268482, upload-time = "2025-07-31T00:20:18.189Z" }, + { url = 
"https://files.pythonhosted.org/packages/ac/23/6376f3a23cf2f3c00514b1cdd8c990afb4dfbac3cb4a68b633c6b7e2e307/regex-2025.7.34-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:8283afe7042d8270cecf27cca558873168e771183d4d593e3c5fe5f12402212a", size = 485385, upload-time = "2025-07-31T00:20:19.692Z" }, + { url = "https://files.pythonhosted.org/packages/73/5b/6d4d3a0b4d312adbfd6d5694c8dddcf1396708976dd87e4d00af439d962b/regex-2025.7.34-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:6c053f9647e3421dd2f5dff8172eb7b4eec129df9d1d2f7133a4386319b47435", size = 289788, upload-time = "2025-07-31T00:20:21.941Z" }, + { url = "https://files.pythonhosted.org/packages/92/71/5862ac9913746e5054d01cb9fb8125b3d0802c0706ef547cae1e7f4428fa/regex-2025.7.34-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a16dd56bbcb7d10e62861c3cd000290ddff28ea142ffb5eb3470f183628011ac", size = 286136, upload-time = "2025-07-31T00:20:26.146Z" }, + { url = "https://files.pythonhosted.org/packages/27/df/5b505dc447eb71278eba10d5ec940769ca89c1af70f0468bfbcb98035dc2/regex-2025.7.34-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:69c593ff5a24c0d5c1112b0df9b09eae42b33c014bdca7022d6523b210b69f72", size = 797753, upload-time = "2025-07-31T00:20:27.919Z" }, + { url = "https://files.pythonhosted.org/packages/86/38/3e3dc953d13998fa047e9a2414b556201dbd7147034fbac129392363253b/regex-2025.7.34-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:98d0ce170fcde1a03b5df19c5650db22ab58af375aaa6ff07978a85c9f250f0e", size = 863263, upload-time = "2025-07-31T00:20:29.803Z" }, + { url = "https://files.pythonhosted.org/packages/68/e5/3ff66b29dde12f5b874dda2d9dec7245c2051f2528d8c2a797901497f140/regex-2025.7.34-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d72765a4bff8c43711d5b0f5b452991a9947853dfa471972169b3cc0ba1d0751", size = 910103, upload-time = 
"2025-07-31T00:20:31.313Z" }, + { url = "https://files.pythonhosted.org/packages/9e/fe/14176f2182125977fba3711adea73f472a11f3f9288c1317c59cd16ad5e6/regex-2025.7.34-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4494f8fd95a77eb434039ad8460e64d57baa0434f1395b7da44015bef650d0e4", size = 801709, upload-time = "2025-07-31T00:20:33.323Z" }, + { url = "https://files.pythonhosted.org/packages/5a/0d/80d4e66ed24f1ba876a9e8e31b709f9fd22d5c266bf5f3ab3c1afe683d7d/regex-2025.7.34-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:4f42b522259c66e918a0121a12429b2abcf696c6f967fa37bdc7b72e61469f98", size = 786726, upload-time = "2025-07-31T00:20:35.252Z" }, + { url = "https://files.pythonhosted.org/packages/12/75/c3ebb30e04a56c046f5c85179dc173818551037daae2c0c940c7b19152cb/regex-2025.7.34-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:aaef1f056d96a0a5d53ad47d019d5b4c66fe4be2da87016e0d43b7242599ffc7", size = 857306, upload-time = "2025-07-31T00:20:37.12Z" }, + { url = "https://files.pythonhosted.org/packages/b1/b2/a4dc5d8b14f90924f27f0ac4c4c4f5e195b723be98adecc884f6716614b6/regex-2025.7.34-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:656433e5b7dccc9bc0da6312da8eb897b81f5e560321ec413500e5367fcd5d47", size = 848494, upload-time = "2025-07-31T00:20:38.818Z" }, + { url = "https://files.pythonhosted.org/packages/0d/21/9ac6e07a4c5e8646a90b56b61f7e9dac11ae0747c857f91d3d2bc7c241d9/regex-2025.7.34-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e91eb2c62c39705e17b4d42d4b86c4e86c884c0d15d9c5a47d0835f8387add8e", size = 787850, upload-time = "2025-07-31T00:20:40.478Z" }, + { url = "https://files.pythonhosted.org/packages/be/6c/d51204e28e7bc54f9a03bb799b04730d7e54ff2718862b8d4e09e7110a6a/regex-2025.7.34-cp314-cp314-win32.whl", hash = "sha256:f978ddfb6216028c8f1d6b0f7ef779949498b64117fc35a939022f67f810bdcb", size = 269730, upload-time = "2025-07-31T00:20:42.253Z" }, + { url = 
"https://files.pythonhosted.org/packages/74/52/a7e92d02fa1fdef59d113098cb9f02c5d03289a0e9f9e5d4d6acccd10677/regex-2025.7.34-cp314-cp314-win_amd64.whl", hash = "sha256:4b7dc33b9b48fb37ead12ffc7bdb846ac72f99a80373c4da48f64b373a7abeae", size = 278640, upload-time = "2025-07-31T00:20:44.42Z" }, + { url = "https://files.pythonhosted.org/packages/d1/78/a815529b559b1771080faa90c3ab401730661f99d495ab0071649f139ebd/regex-2025.7.34-cp314-cp314-win_arm64.whl", hash = "sha256:4b8c4d39f451e64809912c82392933d80fe2e4a87eeef8859fcc5380d0173c64", size = 271757, upload-time = "2025-07-31T00:20:46.355Z" }, +] + +[[package]] +name = "requests" +version = "2.32.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e1/0a/929373653770d8a0d7ea76c37de6e41f11eb07559b103b1c02cafb3f7cf8/requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422", size = 135258, upload-time = "2025-06-09T16:43:07.34Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847, upload-time = "2025-06-09T16:43:05.728Z" }, +] + +[[package]] +name = "rpds-py" +version = "0.27.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1e/d9/991a0dee12d9fc53ed027e26a26a64b151d77252ac477e22666b9688bc16/rpds_py-0.27.0.tar.gz", hash = "sha256:8b23cf252f180cda89220b378d917180f29d313cd6a07b2431c0d3b776aae86f", size = 27420, upload-time = "2025-08-07T08:26:39.624Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cd/17/e67309ca1ac993fa1888a0d9b2f5ccc1f67196ace32e76c9f8e1dbbbd50c/rpds_py-0.27.0-cp312-cp312-macosx_10_12_x86_64.whl", 
hash = "sha256:19c990fdf5acecbf0623e906ae2e09ce1c58947197f9bced6bbd7482662231c4", size = 362611, upload-time = "2025-08-07T08:23:44.773Z" }, + { url = "https://files.pythonhosted.org/packages/93/2e/28c2fb84aa7aa5d75933d1862d0f7de6198ea22dfd9a0cca06e8a4e7509e/rpds_py-0.27.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6c27a7054b5224710fcfb1a626ec3ff4f28bcb89b899148c72873b18210e446b", size = 347680, upload-time = "2025-08-07T08:23:46.014Z" }, + { url = "https://files.pythonhosted.org/packages/44/3e/9834b4c8f4f5fe936b479e623832468aa4bd6beb8d014fecaee9eac6cdb1/rpds_py-0.27.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09965b314091829b378b60607022048953e25f0b396c2b70e7c4c81bcecf932e", size = 384600, upload-time = "2025-08-07T08:23:48Z" }, + { url = "https://files.pythonhosted.org/packages/19/78/744123c7b38865a965cd9e6f691fde7ef989a00a256fa8bf15b75240d12f/rpds_py-0.27.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:14f028eb47f59e9169bfdf9f7ceafd29dd64902141840633683d0bad5b04ff34", size = 400697, upload-time = "2025-08-07T08:23:49.407Z" }, + { url = "https://files.pythonhosted.org/packages/32/97/3c3d32fe7daee0a1f1a678b6d4dfb8c4dcf88197fa2441f9da7cb54a8466/rpds_py-0.27.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6168af0be75bba990a39f9431cdfae5f0ad501f4af32ae62e8856307200517b8", size = 517781, upload-time = "2025-08-07T08:23:50.557Z" }, + { url = "https://files.pythonhosted.org/packages/b2/be/28f0e3e733680aa13ecec1212fc0f585928a206292f14f89c0b8a684cad1/rpds_py-0.27.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ab47fe727c13c09d0e6f508e3a49e545008e23bf762a245b020391b621f5b726", size = 406449, upload-time = "2025-08-07T08:23:51.732Z" }, + { url = "https://files.pythonhosted.org/packages/95/ae/5d15c83e337c082d0367053baeb40bfba683f42459f6ebff63a2fd7e5518/rpds_py-0.27.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:5fa01b3d5e3b7d97efab65bd3d88f164e289ec323a8c033c5c38e53ee25c007e", size = 386150, upload-time = "2025-08-07T08:23:52.822Z" }, + { url = "https://files.pythonhosted.org/packages/bf/65/944e95f95d5931112829e040912b25a77b2e7ed913ea5fe5746aa5c1ce75/rpds_py-0.27.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:6c135708e987f46053e0a1246a206f53717f9fadfba27174a9769ad4befba5c3", size = 406100, upload-time = "2025-08-07T08:23:54.339Z" }, + { url = "https://files.pythonhosted.org/packages/21/a4/1664b83fae02894533cd11dc0b9f91d673797c2185b7be0f7496107ed6c5/rpds_py-0.27.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fc327f4497b7087d06204235199daf208fd01c82d80465dc5efa4ec9df1c5b4e", size = 421345, upload-time = "2025-08-07T08:23:55.832Z" }, + { url = "https://files.pythonhosted.org/packages/7c/26/b7303941c2b0823bfb34c71378249f8beedce57301f400acb04bb345d025/rpds_py-0.27.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7e57906e38583a2cba67046a09c2637e23297618dc1f3caddbc493f2be97c93f", size = 561891, upload-time = "2025-08-07T08:23:56.951Z" }, + { url = "https://files.pythonhosted.org/packages/9b/c8/48623d64d4a5a028fa99576c768a6159db49ab907230edddc0b8468b998b/rpds_py-0.27.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f4f69d7a4300fbf91efb1fb4916421bd57804c01ab938ab50ac9c4aa2212f03", size = 591756, upload-time = "2025-08-07T08:23:58.146Z" }, + { url = "https://files.pythonhosted.org/packages/b3/51/18f62617e8e61cc66334c9fb44b1ad7baae3438662098efbc55fb3fda453/rpds_py-0.27.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b4c4fbbcff474e1e5f38be1bf04511c03d492d42eec0babda5d03af3b5589374", size = 557088, upload-time = "2025-08-07T08:23:59.6Z" }, + { url = "https://files.pythonhosted.org/packages/bd/4c/e84c3a276e2496a93d245516be6b49e20499aa8ca1c94d59fada0d79addc/rpds_py-0.27.0-cp312-cp312-win32.whl", hash = "sha256:27bac29bbbf39601b2aab474daf99dbc8e7176ca3389237a23944b17f8913d97", size = 221926, upload-time = 
"2025-08-07T08:24:00.695Z" }, + { url = "https://files.pythonhosted.org/packages/83/89/9d0fbcef64340db0605eb0a0044f258076f3ae0a3b108983b2c614d96212/rpds_py-0.27.0-cp312-cp312-win_amd64.whl", hash = "sha256:8a06aa1197ec0281eb1d7daf6073e199eb832fe591ffa329b88bae28f25f5fe5", size = 233235, upload-time = "2025-08-07T08:24:01.846Z" }, + { url = "https://files.pythonhosted.org/packages/c9/b0/e177aa9f39cbab060f96de4a09df77d494f0279604dc2f509263e21b05f9/rpds_py-0.27.0-cp312-cp312-win_arm64.whl", hash = "sha256:e14aab02258cb776a108107bd15f5b5e4a1bbaa61ef33b36693dfab6f89d54f9", size = 223315, upload-time = "2025-08-07T08:24:03.337Z" }, + { url = "https://files.pythonhosted.org/packages/81/d2/dfdfd42565a923b9e5a29f93501664f5b984a802967d48d49200ad71be36/rpds_py-0.27.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:443d239d02d9ae55b74015234f2cd8eb09e59fbba30bf60baeb3123ad4c6d5ff", size = 362133, upload-time = "2025-08-07T08:24:04.508Z" }, + { url = "https://files.pythonhosted.org/packages/ac/4a/0a2e2460c4b66021d349ce9f6331df1d6c75d7eea90df9785d333a49df04/rpds_py-0.27.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b8a7acf04fda1f30f1007f3cc96d29d8cf0a53e626e4e1655fdf4eabc082d367", size = 347128, upload-time = "2025-08-07T08:24:05.695Z" }, + { url = "https://files.pythonhosted.org/packages/35/8d/7d1e4390dfe09d4213b3175a3f5a817514355cb3524593380733204f20b9/rpds_py-0.27.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d0f92b78cfc3b74a42239fdd8c1266f4715b573204c234d2f9fc3fc7a24f185", size = 384027, upload-time = "2025-08-07T08:24:06.841Z" }, + { url = "https://files.pythonhosted.org/packages/c1/65/78499d1a62172891c8cd45de737b2a4b84a414b6ad8315ab3ac4945a5b61/rpds_py-0.27.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ce4ed8e0c7dbc5b19352b9c2c6131dd23b95fa8698b5cdd076307a33626b72dc", size = 399973, upload-time = "2025-08-07T08:24:08.143Z" }, + { url = 
"https://files.pythonhosted.org/packages/10/a1/1c67c1d8cc889107b19570bb01f75cf49852068e95e6aee80d22915406fc/rpds_py-0.27.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fde355b02934cc6b07200cc3b27ab0c15870a757d1a72fd401aa92e2ea3c6bfe", size = 515295, upload-time = "2025-08-07T08:24:09.711Z" }, + { url = "https://files.pythonhosted.org/packages/df/27/700ec88e748436b6c7c4a2262d66e80f8c21ab585d5e98c45e02f13f21c0/rpds_py-0.27.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13bbc4846ae4c993f07c93feb21a24d8ec637573d567a924b1001e81c8ae80f9", size = 406737, upload-time = "2025-08-07T08:24:11.182Z" }, + { url = "https://files.pythonhosted.org/packages/33/cc/6b0ee8f0ba3f2df2daac1beda17fde5cf10897a7d466f252bd184ef20162/rpds_py-0.27.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be0744661afbc4099fef7f4e604e7f1ea1be1dd7284f357924af12a705cc7d5c", size = 385898, upload-time = "2025-08-07T08:24:12.798Z" }, + { url = "https://files.pythonhosted.org/packages/e8/7e/c927b37d7d33c0a0ebf249cc268dc2fcec52864c1b6309ecb960497f2285/rpds_py-0.27.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:069e0384a54f427bd65d7fda83b68a90606a3835901aaff42185fcd94f5a9295", size = 405785, upload-time = "2025-08-07T08:24:14.906Z" }, + { url = "https://files.pythonhosted.org/packages/5b/d2/8ed50746d909dcf402af3fa58b83d5a590ed43e07251d6b08fad1a535ba6/rpds_py-0.27.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4bc262ace5a1a7dc3e2eac2fa97b8257ae795389f688b5adf22c5db1e2431c43", size = 419760, upload-time = "2025-08-07T08:24:16.129Z" }, + { url = "https://files.pythonhosted.org/packages/d3/60/2b2071aee781cb3bd49f94d5d35686990b925e9b9f3e3d149235a6f5d5c1/rpds_py-0.27.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2fe6e18e5c8581f0361b35ae575043c7029d0a92cb3429e6e596c2cdde251432", size = 561201, upload-time = "2025-08-07T08:24:17.645Z" }, + { url = 
"https://files.pythonhosted.org/packages/98/1f/27b67304272521aaea02be293fecedce13fa351a4e41cdb9290576fc6d81/rpds_py-0.27.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d93ebdb82363d2e7bec64eecdc3632b59e84bd270d74fe5be1659f7787052f9b", size = 591021, upload-time = "2025-08-07T08:24:18.999Z" }, + { url = "https://files.pythonhosted.org/packages/db/9b/a2fadf823164dd085b1f894be6443b0762a54a7af6f36e98e8fcda69ee50/rpds_py-0.27.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0954e3a92e1d62e83a54ea7b3fdc9efa5d61acef8488a8a3d31fdafbfb00460d", size = 556368, upload-time = "2025-08-07T08:24:20.54Z" }, + { url = "https://files.pythonhosted.org/packages/24/f3/6d135d46a129cda2e3e6d4c5e91e2cc26ea0428c6cf152763f3f10b6dd05/rpds_py-0.27.0-cp313-cp313-win32.whl", hash = "sha256:2cff9bdd6c7b906cc562a505c04a57d92e82d37200027e8d362518df427f96cd", size = 221236, upload-time = "2025-08-07T08:24:22.144Z" }, + { url = "https://files.pythonhosted.org/packages/c5/44/65d7494f5448ecc755b545d78b188440f81da98b50ea0447ab5ebfdf9bd6/rpds_py-0.27.0-cp313-cp313-win_amd64.whl", hash = "sha256:dc79d192fb76fc0c84f2c58672c17bbbc383fd26c3cdc29daae16ce3d927e8b2", size = 232634, upload-time = "2025-08-07T08:24:23.642Z" }, + { url = "https://files.pythonhosted.org/packages/70/d9/23852410fadab2abb611733933401de42a1964ce6600a3badae35fbd573e/rpds_py-0.27.0-cp313-cp313-win_arm64.whl", hash = "sha256:5b3a5c8089eed498a3af23ce87a80805ff98f6ef8f7bdb70bd1b7dae5105f6ac", size = 222783, upload-time = "2025-08-07T08:24:25.098Z" }, + { url = "https://files.pythonhosted.org/packages/15/75/03447917f78512b34463f4ef11066516067099a0c466545655503bed0c77/rpds_py-0.27.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:90fb790138c1a89a2e58c9282fe1089638401f2f3b8dddd758499041bc6e0774", size = 359154, upload-time = "2025-08-07T08:24:26.249Z" }, + { url = "https://files.pythonhosted.org/packages/6b/fc/4dac4fa756451f2122ddaf136e2c6aeb758dc6fdbe9ccc4bc95c98451d50/rpds_py-0.27.0-cp313-cp313t-macosx_11_0_arm64.whl", 
hash = "sha256:010c4843a3b92b54373e3d2291a7447d6c3fc29f591772cc2ea0e9f5c1da434b", size = 343909, upload-time = "2025-08-07T08:24:27.405Z" }, + { url = "https://files.pythonhosted.org/packages/7b/81/723c1ed8e6f57ed9d8c0c07578747a2d3d554aaefc1ab89f4e42cfeefa07/rpds_py-0.27.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9ce7a9e967afc0a2af7caa0d15a3e9c1054815f73d6a8cb9225b61921b419bd", size = 379340, upload-time = "2025-08-07T08:24:28.714Z" }, + { url = "https://files.pythonhosted.org/packages/98/16/7e3740413de71818ce1997df82ba5f94bae9fff90c0a578c0e24658e6201/rpds_py-0.27.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:aa0bf113d15e8abdfee92aa4db86761b709a09954083afcb5bf0f952d6065fdb", size = 391655, upload-time = "2025-08-07T08:24:30.223Z" }, + { url = "https://files.pythonhosted.org/packages/e0/63/2a9f510e124d80660f60ecce07953f3f2d5f0b96192c1365443859b9c87f/rpds_py-0.27.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eb91d252b35004a84670dfeafadb042528b19842a0080d8b53e5ec1128e8f433", size = 513017, upload-time = "2025-08-07T08:24:31.446Z" }, + { url = "https://files.pythonhosted.org/packages/2c/4e/cf6ff311d09776c53ea1b4f2e6700b9d43bb4e99551006817ade4bbd6f78/rpds_py-0.27.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:db8a6313dbac934193fc17fe7610f70cd8181c542a91382531bef5ed785e5615", size = 402058, upload-time = "2025-08-07T08:24:32.613Z" }, + { url = "https://files.pythonhosted.org/packages/88/11/5e36096d474cb10f2a2d68b22af60a3bc4164fd8db15078769a568d9d3ac/rpds_py-0.27.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce96ab0bdfcef1b8c371ada2100767ace6804ea35aacce0aef3aeb4f3f499ca8", size = 383474, upload-time = "2025-08-07T08:24:33.767Z" }, + { url = "https://files.pythonhosted.org/packages/db/a2/3dff02805b06058760b5eaa6d8cb8db3eb3e46c9e452453ad5fc5b5ad9fe/rpds_py-0.27.0-cp313-cp313t-manylinux_2_31_riscv64.whl", 
hash = "sha256:7451ede3560086abe1aa27dcdcf55cd15c96b56f543fb12e5826eee6f721f858", size = 400067, upload-time = "2025-08-07T08:24:35.021Z" }, + { url = "https://files.pythonhosted.org/packages/67/87/eed7369b0b265518e21ea836456a4ed4a6744c8c12422ce05bce760bb3cf/rpds_py-0.27.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:32196b5a99821476537b3f7732432d64d93a58d680a52c5e12a190ee0135d8b5", size = 412085, upload-time = "2025-08-07T08:24:36.267Z" }, + { url = "https://files.pythonhosted.org/packages/8b/48/f50b2ab2fbb422fbb389fe296e70b7a6b5ea31b263ada5c61377e710a924/rpds_py-0.27.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a029be818059870664157194e46ce0e995082ac49926f1423c1f058534d2aaa9", size = 555928, upload-time = "2025-08-07T08:24:37.573Z" }, + { url = "https://files.pythonhosted.org/packages/98/41/b18eb51045d06887666c3560cd4bbb6819127b43d758f5adb82b5f56f7d1/rpds_py-0.27.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3841f66c1ffdc6cebce8aed64e36db71466f1dc23c0d9a5592e2a782a3042c79", size = 585527, upload-time = "2025-08-07T08:24:39.391Z" }, + { url = "https://files.pythonhosted.org/packages/be/03/a3dd6470fc76499959b00ae56295b76b4bdf7c6ffc60d62006b1217567e1/rpds_py-0.27.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:42894616da0fc0dcb2ec08a77896c3f56e9cb2f4b66acd76fc8992c3557ceb1c", size = 554211, upload-time = "2025-08-07T08:24:40.6Z" }, + { url = "https://files.pythonhosted.org/packages/bf/d1/ee5fd1be395a07423ac4ca0bcc05280bf95db2b155d03adefeb47d5ebf7e/rpds_py-0.27.0-cp313-cp313t-win32.whl", hash = "sha256:b1fef1f13c842a39a03409e30ca0bf87b39a1e2a305a9924deadb75a43105d23", size = 216624, upload-time = "2025-08-07T08:24:42.204Z" }, + { url = "https://files.pythonhosted.org/packages/1c/94/4814c4c858833bf46706f87349c37ca45e154da7dbbec9ff09f1abeb08cc/rpds_py-0.27.0-cp313-cp313t-win_amd64.whl", hash = "sha256:183f5e221ba3e283cd36fdfbe311d95cd87699a083330b4f792543987167eff1", size = 230007, upload-time = 
"2025-08-07T08:24:43.329Z" }, + { url = "https://files.pythonhosted.org/packages/0e/a5/8fffe1c7dc7c055aa02df310f9fb71cfc693a4d5ccc5de2d3456ea5fb022/rpds_py-0.27.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:f3cd110e02c5bf17d8fb562f6c9df5c20e73029d587cf8602a2da6c5ef1e32cb", size = 362595, upload-time = "2025-08-07T08:24:44.478Z" }, + { url = "https://files.pythonhosted.org/packages/bc/c7/4e4253fd2d4bb0edbc0b0b10d9f280612ca4f0f990e3c04c599000fe7d71/rpds_py-0.27.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8d0e09cf4863c74106b5265c2c310f36146e2b445ff7b3018a56799f28f39f6f", size = 347252, upload-time = "2025-08-07T08:24:45.678Z" }, + { url = "https://files.pythonhosted.org/packages/f3/c8/3d1a954d30f0174dd6baf18b57c215da03cf7846a9d6e0143304e784cddc/rpds_py-0.27.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64f689ab822f9b5eb6dfc69893b4b9366db1d2420f7db1f6a2adf2a9ca15ad64", size = 384886, upload-time = "2025-08-07T08:24:46.86Z" }, + { url = "https://files.pythonhosted.org/packages/e0/52/3c5835f2df389832b28f9276dd5395b5a965cea34226e7c88c8fbec2093c/rpds_py-0.27.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e36c80c49853b3ffda7aa1831bf175c13356b210c73128c861f3aa93c3cc4015", size = 399716, upload-time = "2025-08-07T08:24:48.174Z" }, + { url = "https://files.pythonhosted.org/packages/40/73/176e46992461a1749686a2a441e24df51ff86b99c2d34bf39f2a5273b987/rpds_py-0.27.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6de6a7f622860af0146cb9ee148682ff4d0cea0b8fd3ad51ce4d40efb2f061d0", size = 517030, upload-time = "2025-08-07T08:24:49.52Z" }, + { url = "https://files.pythonhosted.org/packages/79/2a/7266c75840e8c6e70effeb0d38922a45720904f2cd695e68a0150e5407e2/rpds_py-0.27.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4045e2fc4b37ec4b48e8907a5819bdd3380708c139d7cc358f03a3653abedb89", size = 408448, upload-time = "2025-08-07T08:24:50.727Z" }, + { url 
= "https://files.pythonhosted.org/packages/e6/5f/a7efc572b8e235093dc6cf39f4dbc8a7f08e65fdbcec7ff4daeb3585eef1/rpds_py-0.27.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9da162b718b12c4219eeeeb68a5b7552fbc7aadedf2efee440f88b9c0e54b45d", size = 387320, upload-time = "2025-08-07T08:24:52.004Z" }, + { url = "https://files.pythonhosted.org/packages/a2/eb/9ff6bc92efe57cf5a2cb74dee20453ba444b6fdc85275d8c99e0d27239d1/rpds_py-0.27.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:0665be515767dc727ffa5f74bd2ef60b0ff85dad6bb8f50d91eaa6b5fb226f51", size = 407414, upload-time = "2025-08-07T08:24:53.664Z" }, + { url = "https://files.pythonhosted.org/packages/fb/bd/3b9b19b00d5c6e1bd0f418c229ab0f8d3b110ddf7ec5d9d689ef783d0268/rpds_py-0.27.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:203f581accef67300a942e49a37d74c12ceeef4514874c7cede21b012613ca2c", size = 420766, upload-time = "2025-08-07T08:24:55.917Z" }, + { url = "https://files.pythonhosted.org/packages/17/6b/521a7b1079ce16258c70805166e3ac6ec4ee2139d023fe07954dc9b2d568/rpds_py-0.27.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7873b65686a6471c0037139aa000d23fe94628e0daaa27b6e40607c90e3f5ec4", size = 562409, upload-time = "2025-08-07T08:24:57.17Z" }, + { url = "https://files.pythonhosted.org/packages/8b/bf/65db5bfb14ccc55e39de8419a659d05a2a9cd232f0a699a516bb0991da7b/rpds_py-0.27.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:249ab91ceaa6b41abc5f19513cb95b45c6f956f6b89f1fe3d99c81255a849f9e", size = 590793, upload-time = "2025-08-07T08:24:58.388Z" }, + { url = "https://files.pythonhosted.org/packages/db/b8/82d368b378325191ba7aae8f40f009b78057b598d4394d1f2cdabaf67b3f/rpds_py-0.27.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d2f184336bc1d6abfaaa1262ed42739c3789b1e3a65a29916a615307d22ffd2e", size = 558178, upload-time = "2025-08-07T08:24:59.756Z" }, + { url = 
"https://files.pythonhosted.org/packages/f6/ff/f270bddbfbc3812500f8131b1ebbd97afd014cd554b604a3f73f03133a36/rpds_py-0.27.0-cp314-cp314-win32.whl", hash = "sha256:d3c622c39f04d5751408f5b801ecb527e6e0a471b367f420a877f7a660d583f6", size = 222355, upload-time = "2025-08-07T08:25:01.027Z" }, + { url = "https://files.pythonhosted.org/packages/bf/20/fdab055b1460c02ed356a0e0b0a78c1dd32dc64e82a544f7b31c9ac643dc/rpds_py-0.27.0-cp314-cp314-win_amd64.whl", hash = "sha256:cf824aceaeffff029ccfba0da637d432ca71ab21f13e7f6f5179cd88ebc77a8a", size = 234007, upload-time = "2025-08-07T08:25:02.268Z" }, + { url = "https://files.pythonhosted.org/packages/4d/a8/694c060005421797a3be4943dab8347c76c2b429a9bef68fb2c87c9e70c7/rpds_py-0.27.0-cp314-cp314-win_arm64.whl", hash = "sha256:86aca1616922b40d8ac1b3073a1ead4255a2f13405e5700c01f7c8d29a03972d", size = 223527, upload-time = "2025-08-07T08:25:03.45Z" }, + { url = "https://files.pythonhosted.org/packages/1e/f9/77f4c90f79d2c5ca8ce6ec6a76cb4734ee247de6b3a4f337e289e1f00372/rpds_py-0.27.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:341d8acb6724c0c17bdf714319c393bb27f6d23d39bc74f94221b3e59fc31828", size = 359469, upload-time = "2025-08-07T08:25:04.648Z" }, + { url = "https://files.pythonhosted.org/packages/c0/22/b97878d2f1284286fef4172069e84b0b42b546ea7d053e5fb7adb9ac6494/rpds_py-0.27.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6b96b0b784fe5fd03beffff2b1533dc0d85e92bab8d1b2c24ef3a5dc8fac5669", size = 343960, upload-time = "2025-08-07T08:25:05.863Z" }, + { url = "https://files.pythonhosted.org/packages/b1/b0/dfd55b5bb480eda0578ae94ef256d3061d20b19a0f5e18c482f03e65464f/rpds_py-0.27.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c431bfb91478d7cbe368d0a699978050d3b112d7f1d440a41e90faa325557fd", size = 380201, upload-time = "2025-08-07T08:25:07.513Z" }, + { url = 
"https://files.pythonhosted.org/packages/28/22/e1fa64e50d58ad2b2053077e3ec81a979147c43428de9e6de68ddf6aff4e/rpds_py-0.27.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:20e222a44ae9f507d0f2678ee3dd0c45ec1e930f6875d99b8459631c24058aec", size = 392111, upload-time = "2025-08-07T08:25:09.149Z" }, + { url = "https://files.pythonhosted.org/packages/49/f9/43ab7a43e97aedf6cea6af70fdcbe18abbbc41d4ae6cdec1bfc23bbad403/rpds_py-0.27.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:184f0d7b342967f6cda94a07d0e1fae177d11d0b8f17d73e06e36ac02889f303", size = 515863, upload-time = "2025-08-07T08:25:10.431Z" }, + { url = "https://files.pythonhosted.org/packages/38/9b/9bd59dcc636cd04d86a2d20ad967770bf348f5eb5922a8f29b547c074243/rpds_py-0.27.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a00c91104c173c9043bc46f7b30ee5e6d2f6b1149f11f545580f5d6fdff42c0b", size = 402398, upload-time = "2025-08-07T08:25:11.819Z" }, + { url = "https://files.pythonhosted.org/packages/71/bf/f099328c6c85667aba6b66fa5c35a8882db06dcd462ea214be72813a0dd2/rpds_py-0.27.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7a37dd208f0d658e0487522078b1ed68cd6bce20ef4b5a915d2809b9094b410", size = 384665, upload-time = "2025-08-07T08:25:13.194Z" }, + { url = "https://files.pythonhosted.org/packages/a9/c5/9c1f03121ece6634818490bd3c8be2c82a70928a19de03467fb25a3ae2a8/rpds_py-0.27.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:92f3b3ec3e6008a1fe00b7c0946a170f161ac00645cde35e3c9a68c2475e8156", size = 400405, upload-time = "2025-08-07T08:25:14.417Z" }, + { url = "https://files.pythonhosted.org/packages/b5/b8/e25d54af3e63ac94f0c16d8fe143779fe71ff209445a0c00d0f6984b6b2c/rpds_py-0.27.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a1b3db5fae5cbce2131b7420a3f83553d4d89514c03d67804ced36161fe8b6b2", size = 413179, upload-time = "2025-08-07T08:25:15.664Z" }, + { url = 
"https://files.pythonhosted.org/packages/f9/d1/406b3316433fe49c3021546293a04bc33f1478e3ec7950215a7fce1a1208/rpds_py-0.27.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5355527adaa713ab693cbce7c1e0ec71682f599f61b128cf19d07e5c13c9b1f1", size = 556895, upload-time = "2025-08-07T08:25:17.061Z" }, + { url = "https://files.pythonhosted.org/packages/5f/bc/3697c0c21fcb9a54d46ae3b735eb2365eea0c2be076b8f770f98e07998de/rpds_py-0.27.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:fcc01c57ce6e70b728af02b2401c5bc853a9e14eb07deda30624374f0aebfe42", size = 585464, upload-time = "2025-08-07T08:25:18.406Z" }, + { url = "https://files.pythonhosted.org/packages/63/09/ee1bb5536f99f42c839b177d552f6114aa3142d82f49cef49261ed28dbe0/rpds_py-0.27.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3001013dae10f806380ba739d40dee11db1ecb91684febb8406a87c2ded23dae", size = 555090, upload-time = "2025-08-07T08:25:20.461Z" }, + { url = "https://files.pythonhosted.org/packages/7d/2c/363eada9e89f7059199d3724135a86c47082cbf72790d6ba2f336d146ddb/rpds_py-0.27.0-cp314-cp314t-win32.whl", hash = "sha256:0f401c369186a5743694dd9fc08cba66cf70908757552e1f714bfc5219c655b5", size = 218001, upload-time = "2025-08-07T08:25:21.761Z" }, + { url = "https://files.pythonhosted.org/packages/e2/3f/d6c216ed5199c9ef79e2a33955601f454ed1e7420a93b89670133bca5ace/rpds_py-0.27.0-cp314-cp314t-win_amd64.whl", hash = "sha256:8a1dca5507fa1337f75dcd5070218b20bc68cf8844271c923c1b79dfcbc20391", size = 230993, upload-time = "2025-08-07T08:25:23.34Z" }, +] + +[[package]] +name = "rsa" +version = "4.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyasn1" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/da/8a/22b7beea3ee0d44b1916c0c1cb0ee3af23b700b6da9f04991899d0c555d4/rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75", size = 29034, upload-time = "2025-04-16T09:51:18.218Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" }, +] + +[[package]] +name = "s3transfer" +version = "0.13.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/05/d52bf1e65044b4e5e27d4e63e8d1579dbdec54fce685908ae09bc3720030/s3transfer-0.13.1.tar.gz", hash = "sha256:c3fdba22ba1bd367922f27ec8032d6a1cf5f10c934fb5d68cf60fd5a23d936cf", size = 150589, upload-time = "2025-07-18T19:22:42.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6d/4f/d073e09df851cfa251ef7840007d04db3293a0482ce607d2b993926089be/s3transfer-0.13.1-py3-none-any.whl", hash = "sha256:a981aa7429be23fe6dfc13e80e4020057cbab622b08c0315288758d67cabc724", size = 85308, upload-time = "2025-07-18T19:22:40.947Z" }, +] + +[[package]] +name = "safetensors" +version = "0.6.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ac/cc/738f3011628920e027a11754d9cae9abec1aed00f7ae860abbf843755233/safetensors-0.6.2.tar.gz", hash = "sha256:43ff2aa0e6fa2dc3ea5524ac7ad93a9839256b8703761e76e2d0b2a3fa4f15d9", size = 197968, upload-time = "2025-08-08T13:13:58.654Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/b1/3f5fd73c039fc87dba3ff8b5d528bfc5a32b597fea8e7a6a4800343a17c7/safetensors-0.6.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:9c85ede8ec58f120bad982ec47746981e210492a6db876882aa021446af8ffba", size = 454797, upload-time = "2025-08-08T13:13:52.066Z" }, + { url = "https://files.pythonhosted.org/packages/8c/c9/bb114c158540ee17907ec470d01980957fdaf87b4aa07914c24eba87b9c6/safetensors-0.6.2-cp38-abi3-macosx_11_0_arm64.whl", hash = 
"sha256:d6675cf4b39c98dbd7d940598028f3742e0375a6b4d4277e76beb0c35f4b843b", size = 432206, upload-time = "2025-08-08T13:13:50.931Z" }, + { url = "https://files.pythonhosted.org/packages/d3/8e/f70c34e47df3110e8e0bb268d90db8d4be8958a54ab0336c9be4fe86dac8/safetensors-0.6.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d2d2b3ce1e2509c68932ca03ab8f20570920cd9754b05063d4368ee52833ecd", size = 473261, upload-time = "2025-08-08T13:13:41.259Z" }, + { url = "https://files.pythonhosted.org/packages/2a/f5/be9c6a7c7ef773e1996dc214e73485286df1836dbd063e8085ee1976f9cb/safetensors-0.6.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:93de35a18f46b0f5a6a1f9e26d91b442094f2df02e9fd7acf224cfec4238821a", size = 485117, upload-time = "2025-08-08T13:13:43.506Z" }, + { url = "https://files.pythonhosted.org/packages/c9/55/23f2d0a2c96ed8665bf17a30ab4ce5270413f4d74b6d87dd663258b9af31/safetensors-0.6.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:89a89b505f335640f9120fac65ddeb83e40f1fd081cb8ed88b505bdccec8d0a1", size = 616154, upload-time = "2025-08-08T13:13:45.096Z" }, + { url = "https://files.pythonhosted.org/packages/98/c6/affb0bd9ce02aa46e7acddbe087912a04d953d7a4d74b708c91b5806ef3f/safetensors-0.6.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc4d0d0b937e04bdf2ae6f70cd3ad51328635fe0e6214aa1fc811f3b576b3bda", size = 520713, upload-time = "2025-08-08T13:13:46.25Z" }, + { url = "https://files.pythonhosted.org/packages/fe/5d/5a514d7b88e310c8b146e2404e0dc161282e78634d9358975fd56dfd14be/safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8045db2c872db8f4cbe3faa0495932d89c38c899c603f21e9b6486951a5ecb8f", size = 485835, upload-time = "2025-08-08T13:13:49.373Z" }, + { url = 
"https://files.pythonhosted.org/packages/7a/7b/4fc3b2ba62c352b2071bea9cfbad330fadda70579f617506ae1a2f129cab/safetensors-0.6.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:81e67e8bab9878bb568cffbc5f5e655adb38d2418351dc0859ccac158f753e19", size = 521503, upload-time = "2025-08-08T13:13:47.651Z" }, + { url = "https://files.pythonhosted.org/packages/5a/50/0057e11fe1f3cead9254315a6c106a16dd4b1a19cd247f7cc6414f6b7866/safetensors-0.6.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b0e4d029ab0a0e0e4fdf142b194514695b1d7d3735503ba700cf36d0fc7136ce", size = 652256, upload-time = "2025-08-08T13:13:53.167Z" }, + { url = "https://files.pythonhosted.org/packages/e9/29/473f789e4ac242593ac1656fbece6e1ecd860bb289e635e963667807afe3/safetensors-0.6.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:fa48268185c52bfe8771e46325a1e21d317207bcabcb72e65c6e28e9ffeb29c7", size = 747281, upload-time = "2025-08-08T13:13:54.656Z" }, + { url = "https://files.pythonhosted.org/packages/68/52/f7324aad7f2df99e05525c84d352dc217e0fa637a4f603e9f2eedfbe2c67/safetensors-0.6.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:d83c20c12c2d2f465997c51b7ecb00e407e5f94d7dec3ea0cc11d86f60d3fde5", size = 692286, upload-time = "2025-08-08T13:13:55.884Z" }, + { url = "https://files.pythonhosted.org/packages/ad/fe/cad1d9762868c7c5dc70c8620074df28ebb1a8e4c17d4c0cb031889c457e/safetensors-0.6.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d944cea65fad0ead848b6ec2c37cc0b197194bec228f8020054742190e9312ac", size = 655957, upload-time = "2025-08-08T13:13:57.029Z" }, + { url = "https://files.pythonhosted.org/packages/59/a7/e2158e17bbe57d104f0abbd95dff60dda916cf277c9f9663b4bf9bad8b6e/safetensors-0.6.2-cp38-abi3-win32.whl", hash = "sha256:cab75ca7c064d3911411461151cb69380c9225798a20e712b102edda2542ddb1", size = 308926, upload-time = "2025-08-08T13:14:01.095Z" }, + { url = 
"https://files.pythonhosted.org/packages/2c/c3/c0be1135726618dc1e28d181b8c442403d8dbb9e273fd791de2d4384bcdd/safetensors-0.6.2-cp38-abi3-win_amd64.whl", hash = "sha256:c7b214870df923cbc1593c3faee16bec59ea462758699bd3fee399d00aac072c", size = 320192, upload-time = "2025-08-08T13:13:59.467Z" }, +] + +[[package]] +name = "scikit-learn" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numpy" }, + { name = "scipy" }, + { name = "threadpoolctl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fa/19/5aa2002044afc297ecaf1e3517ed07bba4aece3b5613b5160c1212995fc8/scikit_learn-1.6.0.tar.gz", hash = "sha256:9d58481f9f7499dff4196927aedd4285a0baec8caa3790efbe205f13de37dd6e", size = 7074944, upload-time = "2024-12-09T16:02:23.639Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/0c/a5de627aa57b028aea7026cb3bbeaf63be3158adc118212d6cc7843d939a/scikit_learn-1.6.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:04a5ba45c12a5ff81518aa4f1604e826a45d20e53da47b15871526cda4ff5174", size = 12096999, upload-time = "2024-12-09T16:01:31.659Z" }, + { url = "https://files.pythonhosted.org/packages/a3/7d/02a96e6fb28ddb213e84b1b4a44148d26ec96fc9db9c74e050277e009892/scikit_learn-1.6.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:21fadfc2ad7a1ce8bd1d90f23d17875b84ec765eecbbfc924ff11fb73db582ce", size = 11160579, upload-time = "2024-12-09T16:01:34.693Z" }, + { url = "https://files.pythonhosted.org/packages/70/28/77b071f541d75247e6c3403f19aaa634371e972691f6aa1838ca9fd4cc52/scikit_learn-1.6.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30f34bb5fde90e020653bb84dcb38b6c83f90c70680dbd8c38bd9becbad7a127", size = 12246543, upload-time = "2024-12-09T16:01:37.241Z" }, + { url = 
"https://files.pythonhosted.org/packages/17/0e/e6bb84074f1081245a165c0ee775ecef24beae9d2f2e24bcac0c9f155f13/scikit_learn-1.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1dad624cffe3062276a0881d4e441bc9e3b19d02d17757cd6ae79a9d192a0027", size = 13140402, upload-time = "2024-12-09T16:01:40.15Z" }, + { url = "https://files.pythonhosted.org/packages/21/1d/3df58df8bd425f425df9f90b316618ace62b7f1f838ac1580191025cc735/scikit_learn-1.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:2fce7950a3fad85e0a61dc403df0f9345b53432ac0e47c50da210d22c60b6d85", size = 11103596, upload-time = "2024-12-09T16:01:43.205Z" }, + { url = "https://files.pythonhosted.org/packages/2e/f4/c3b51920cf310169d19d07855a7bdf51a9b065314877d9a58c0c60d08eea/scikit_learn-1.6.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e5453b2e87ef8accedc5a8a4e6709f887ca01896cd7cc8a174fe39bd4bb00aef", size = 12002532, upload-time = "2024-12-09T16:01:46.199Z" }, + { url = "https://files.pythonhosted.org/packages/e4/76/cfb0778a84c30df272f1c41fc7b3bd3ffac6e8b02ee6a078a592d35cf73f/scikit_learn-1.6.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:5fe11794236fb83bead2af26a87ced5d26e3370b8487430818b915dafab1724e", size = 11088997, upload-time = "2024-12-09T16:01:48.57Z" }, + { url = "https://files.pythonhosted.org/packages/2b/8d/4563419d742b852e50871fa3494a8dd0304610601359209a2e614e200260/scikit_learn-1.6.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61fe3dcec0d82ae280877a818ab652f4988371e32dd5451e75251bece79668b1", size = 12203192, upload-time = "2024-12-09T16:01:52.024Z" }, + { url = "https://files.pythonhosted.org/packages/15/a4/f4fdcdd11d82837804c888097ad02aa6381c4bbd57b9d3074ecf9eba8f42/scikit_learn-1.6.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b44e3a51e181933bdf9a4953cc69c6025b40d2b49e238233f149b98849beb4bf", size = 13164436, upload-time = "2024-12-09T16:01:54.447Z" }, + { url = 
"https://files.pythonhosted.org/packages/1a/e1/32bdcf8f918de5a156da6886aba24a3b5718d267954bd34555be896289f0/scikit_learn-1.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:a17860a562bac54384454d40b3f6155200c1c737c9399e6a97962c63fce503ac", size = 11064779, upload-time = "2024-12-09T16:01:56.756Z" }, + { url = "https://files.pythonhosted.org/packages/c6/8d/14464bea220bc02879f9e8d905c4b0a44b5c12afde6c375720b6f41d9407/scikit_learn-1.6.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:98717d3c152f6842d36a70f21e1468fb2f1a2f8f2624d9a3f382211798516426", size = 11962472, upload-time = "2024-12-09T16:01:59.129Z" }, + { url = "https://files.pythonhosted.org/packages/b4/69/66899cdc65986188e0e255e52ee93dee5101a72f139ee05f263dfff2053a/scikit_learn-1.6.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:34e20bfac8ff0ebe0ff20fb16a4d6df5dc4cc9ce383e00c2ab67a526a3c67b18", size = 11104864, upload-time = "2024-12-09T16:02:01.457Z" }, + { url = "https://files.pythonhosted.org/packages/3c/32/2c63bc108cc5438b116a0c6fd25c6126dd14c03118724385f10a3d218ee8/scikit_learn-1.6.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eba06d75815406091419e06dd650b91ebd1c5f836392a0d833ff36447c2b1bfa", size = 12435734, upload-time = "2024-12-09T16:02:04.317Z" }, + { url = "https://files.pythonhosted.org/packages/0c/f5/9434dff19e04a334bfb30df90511904263c48a422a9952d91d8de5c3aa62/scikit_learn-1.6.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b6916d1cec1ff163c7d281e699d7a6a709da2f2c5ec7b10547e08cc788ddd3ae", size = 11329803, upload-time = "2024-12-09T16:02:07.43Z" }, +] + +[[package]] +name = "scipy" +version = "1.16.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f5/4a/b927028464795439faec8eaf0b03b011005c487bb2d07409f28bf30879c4/scipy-1.16.1.tar.gz", hash = "sha256:44c76f9e8b6e8e488a586190ab38016e4ed2f8a038af7cd3defa903c0a2238b3", size = 30580861, upload-time = 
"2025-07-27T16:33:30.834Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f8/d9/ec4864f5896232133f51382b54a08de91a9d1af7a76dfa372894026dfee2/scipy-1.16.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:81b433bbeaf35728dad619afc002db9b189e45eebe2cd676effe1fb93fef2b9c", size = 36575194, upload-time = "2025-07-27T16:27:41.321Z" }, + { url = "https://files.pythonhosted.org/packages/5c/6d/40e81ecfb688e9d25d34a847dca361982a6addf8e31f0957b1a54fbfa994/scipy-1.16.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:886cc81fdb4c6903a3bb0464047c25a6d1016fef77bb97949817d0c0d79f9e04", size = 28594590, upload-time = "2025-07-27T16:27:49.204Z" }, + { url = "https://files.pythonhosted.org/packages/0e/37/9f65178edfcc629377ce9a64fc09baebea18c80a9e57ae09a52edf84880b/scipy-1.16.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:15240c3aac087a522b4eaedb09f0ad061753c5eebf1ea430859e5bf8640d5919", size = 20866458, upload-time = "2025-07-27T16:27:54.98Z" }, + { url = "https://files.pythonhosted.org/packages/2c/7b/749a66766871ea4cb1d1ea10f27004db63023074c22abed51f22f09770e0/scipy-1.16.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:65f81a25805f3659b48126b5053d9e823d3215e4a63730b5e1671852a1705921", size = 23539318, upload-time = "2025-07-27T16:28:01.604Z" }, + { url = "https://files.pythonhosted.org/packages/c4/db/8d4afec60eb833a666434d4541a3151eedbf2494ea6d4d468cbe877f00cd/scipy-1.16.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6c62eea7f607f122069b9bad3f99489ddca1a5173bef8a0c75555d7488b6f725", size = 33292899, upload-time = "2025-07-27T16:28:09.147Z" }, + { url = "https://files.pythonhosted.org/packages/51/1e/79023ca3bbb13a015d7d2757ecca3b81293c663694c35d6541b4dca53e98/scipy-1.16.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f965bbf3235b01c776115ab18f092a95aa74c271a52577bcb0563e85738fd618", size = 35162637, upload-time = "2025-07-27T16:28:17.535Z" }, + { url = 
"https://files.pythonhosted.org/packages/b6/49/0648665f9c29fdaca4c679182eb972935b3b4f5ace41d323c32352f29816/scipy-1.16.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f006e323874ffd0b0b816d8c6a8e7f9a73d55ab3b8c3f72b752b226d0e3ac83d", size = 35490507, upload-time = "2025-07-27T16:28:25.705Z" }, + { url = "https://files.pythonhosted.org/packages/62/8f/66cbb9d6bbb18d8c658f774904f42a92078707a7c71e5347e8bf2f52bb89/scipy-1.16.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e8fd15fc5085ab4cca74cb91fe0a4263b1f32e4420761ddae531ad60934c2119", size = 37923998, upload-time = "2025-07-27T16:28:34.339Z" }, + { url = "https://files.pythonhosted.org/packages/14/c3/61f273ae550fbf1667675701112e380881905e28448c080b23b5a181df7c/scipy-1.16.1-cp312-cp312-win_amd64.whl", hash = "sha256:f7b8013c6c066609577d910d1a2a077021727af07b6fab0ee22c2f901f22352a", size = 38508060, upload-time = "2025-07-27T16:28:43.242Z" }, + { url = "https://files.pythonhosted.org/packages/93/0b/b5c99382b839854a71ca9482c684e3472badc62620287cbbdab499b75ce6/scipy-1.16.1-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:5451606823a5e73dfa621a89948096c6528e2896e40b39248295d3a0138d594f", size = 36533717, upload-time = "2025-07-27T16:28:51.706Z" }, + { url = "https://files.pythonhosted.org/packages/eb/e5/69ab2771062c91e23e07c12e7d5033a6b9b80b0903ee709c3c36b3eb520c/scipy-1.16.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:89728678c5ca5abd610aee148c199ac1afb16e19844401ca97d43dc548a354eb", size = 28570009, upload-time = "2025-07-27T16:28:57.017Z" }, + { url = "https://files.pythonhosted.org/packages/f4/69/bd75dbfdd3cf524f4d753484d723594aed62cfaac510123e91a6686d520b/scipy-1.16.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:e756d688cb03fd07de0fffad475649b03cb89bee696c98ce508b17c11a03f95c", size = 20841942, upload-time = "2025-07-27T16:29:01.152Z" }, + { url = 
"https://files.pythonhosted.org/packages/ea/74/add181c87663f178ba7d6144b370243a87af8476664d5435e57d599e6874/scipy-1.16.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:5aa2687b9935da3ed89c5dbed5234576589dd28d0bf7cd237501ccfbdf1ad608", size = 23498507, upload-time = "2025-07-27T16:29:05.202Z" }, + { url = "https://files.pythonhosted.org/packages/1d/74/ece2e582a0d9550cee33e2e416cc96737dce423a994d12bbe59716f47ff1/scipy-1.16.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0851f6a1e537fe9399f35986897e395a1aa61c574b178c0d456be5b1a0f5ca1f", size = 33286040, upload-time = "2025-07-27T16:29:10.201Z" }, + { url = "https://files.pythonhosted.org/packages/e4/82/08e4076df538fb56caa1d489588d880ec7c52d8273a606bb54d660528f7c/scipy-1.16.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fedc2cbd1baed37474b1924c331b97bdff611d762c196fac1a9b71e67b813b1b", size = 35176096, upload-time = "2025-07-27T16:29:17.091Z" }, + { url = "https://files.pythonhosted.org/packages/fa/79/cd710aab8c921375711a8321c6be696e705a120e3011a643efbbcdeeabcc/scipy-1.16.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2ef500e72f9623a6735769e4b93e9dcb158d40752cdbb077f305487e3e2d1f45", size = 35490328, upload-time = "2025-07-27T16:29:22.928Z" }, + { url = "https://files.pythonhosted.org/packages/71/73/e9cc3d35ee4526d784520d4494a3e1ca969b071fb5ae5910c036a375ceec/scipy-1.16.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:978d8311674b05a8f7ff2ea6c6bce5d8b45a0cb09d4c5793e0318f448613ea65", size = 37939921, upload-time = "2025-07-27T16:29:29.108Z" }, + { url = "https://files.pythonhosted.org/packages/21/12/c0efd2941f01940119b5305c375ae5c0fcb7ec193f806bd8f158b73a1782/scipy-1.16.1-cp313-cp313-win_amd64.whl", hash = "sha256:81929ed0fa7a5713fcdd8b2e6f73697d3b4c4816d090dd34ff937c20fa90e8ab", size = 38479462, upload-time = "2025-07-27T16:30:24.078Z" }, + { url = 
"https://files.pythonhosted.org/packages/7a/19/c3d08b675260046a991040e1ea5d65f91f40c7df1045fffff412dcfc6765/scipy-1.16.1-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:bcc12db731858abda693cecdb3bdc9e6d4bd200213f49d224fe22df82687bdd6", size = 36938832, upload-time = "2025-07-27T16:29:35.057Z" }, + { url = "https://files.pythonhosted.org/packages/81/f2/ce53db652c033a414a5b34598dba6b95f3d38153a2417c5a3883da429029/scipy-1.16.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:744d977daa4becb9fc59135e75c069f8d301a87d64f88f1e602a9ecf51e77b27", size = 29093084, upload-time = "2025-07-27T16:29:40.201Z" }, + { url = "https://files.pythonhosted.org/packages/a9/ae/7a10ff04a7dc15f9057d05b33737ade244e4bd195caa3f7cc04d77b9e214/scipy-1.16.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:dc54f76ac18073bcecffb98d93f03ed6b81a92ef91b5d3b135dcc81d55a724c7", size = 21365098, upload-time = "2025-07-27T16:29:44.295Z" }, + { url = "https://files.pythonhosted.org/packages/36/ac/029ff710959932ad3c2a98721b20b405f05f752f07344622fd61a47c5197/scipy-1.16.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:367d567ee9fc1e9e2047d31f39d9d6a7a04e0710c86e701e053f237d14a9b4f6", size = 23896858, upload-time = "2025-07-27T16:29:48.784Z" }, + { url = "https://files.pythonhosted.org/packages/71/13/d1ef77b6bd7898720e1f0b6b3743cb945f6c3cafa7718eaac8841035ab60/scipy-1.16.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4cf5785e44e19dcd32a0e4807555e1e9a9b8d475c6afff3d21c3c543a6aa84f4", size = 33438311, upload-time = "2025-07-27T16:29:54.164Z" }, + { url = "https://files.pythonhosted.org/packages/2d/e0/e64a6821ffbb00b4c5b05169f1c1fddb4800e9307efe3db3788995a82a2c/scipy-1.16.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3d0b80fb26d3e13a794c71d4b837e2a589d839fd574a6bbb4ee1288c213ad4a3", size = 35279542, upload-time = "2025-07-27T16:30:00.249Z" }, + { url = 
"https://files.pythonhosted.org/packages/57/59/0dc3c8b43e118f1e4ee2b798dcc96ac21bb20014e5f1f7a8e85cc0653bdb/scipy-1.16.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:8503517c44c18d1030d666cb70aaac1cc8913608816e06742498833b128488b7", size = 35667665, upload-time = "2025-07-27T16:30:05.916Z" }, + { url = "https://files.pythonhosted.org/packages/45/5f/844ee26e34e2f3f9f8febb9343748e72daeaec64fe0c70e9bf1ff84ec955/scipy-1.16.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:30cc4bb81c41831ecfd6dc450baf48ffd80ef5aed0f5cf3ea775740e80f16ecc", size = 38045210, upload-time = "2025-07-27T16:30:11.655Z" }, + { url = "https://files.pythonhosted.org/packages/8d/d7/210f2b45290f444f1de64bc7353aa598ece9f0e90c384b4a156f9b1a5063/scipy-1.16.1-cp313-cp313t-win_amd64.whl", hash = "sha256:c24fa02f7ed23ae514460a22c57eca8f530dbfa50b1cfdbf4f37c05b5309cc39", size = 38593661, upload-time = "2025-07-27T16:30:17.825Z" }, + { url = "https://files.pythonhosted.org/packages/81/ea/84d481a5237ed223bd3d32d6e82d7a6a96e34756492666c260cef16011d1/scipy-1.16.1-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:796a5a9ad36fa3a782375db8f4241ab02a091308eb079746bc0f874c9b998318", size = 36525921, upload-time = "2025-07-27T16:30:30.081Z" }, + { url = "https://files.pythonhosted.org/packages/4e/9f/d9edbdeff9f3a664807ae3aea383e10afaa247e8e6255e6d2aa4515e8863/scipy-1.16.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:3ea0733a2ff73fd6fdc5fecca54ee9b459f4d74f00b99aced7d9a3adb43fb1cc", size = 28564152, upload-time = "2025-07-27T16:30:35.336Z" }, + { url = "https://files.pythonhosted.org/packages/3b/95/8125bcb1fe04bc267d103e76516243e8d5e11229e6b306bda1024a5423d1/scipy-1.16.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:85764fb15a2ad994e708258bb4ed8290d1305c62a4e1ef07c414356a24fcfbf8", size = 20836028, upload-time = "2025-07-27T16:30:39.421Z" }, + { url = 
"https://files.pythonhosted.org/packages/77/9c/bf92e215701fc70bbcd3d14d86337cf56a9b912a804b9c776a269524a9e9/scipy-1.16.1-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:ca66d980469cb623b1759bdd6e9fd97d4e33a9fad5b33771ced24d0cb24df67e", size = 23489666, upload-time = "2025-07-27T16:30:43.663Z" }, + { url = "https://files.pythonhosted.org/packages/5e/00/5e941d397d9adac41b02839011594620d54d99488d1be5be755c00cde9ee/scipy-1.16.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e7cc1ffcc230f568549fc56670bcf3df1884c30bd652c5da8138199c8c76dae0", size = 33358318, upload-time = "2025-07-27T16:30:48.982Z" }, + { url = "https://files.pythonhosted.org/packages/0e/87/8db3aa10dde6e3e8e7eb0133f24baa011377d543f5b19c71469cf2648026/scipy-1.16.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3ddfb1e8d0b540cb4ee9c53fc3dea3186f97711248fb94b4142a1b27178d8b4b", size = 35185724, upload-time = "2025-07-27T16:30:54.26Z" }, + { url = "https://files.pythonhosted.org/packages/89/b4/6ab9ae443216807622bcff02690262d8184078ea467efee2f8c93288a3b1/scipy-1.16.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:4dc0e7be79e95d8ba3435d193e0d8ce372f47f774cffd882f88ea4e1e1ddc731", size = 35554335, upload-time = "2025-07-27T16:30:59.765Z" }, + { url = "https://files.pythonhosted.org/packages/9c/9a/d0e9dc03c5269a1afb60661118296a32ed5d2c24298af61b676c11e05e56/scipy-1.16.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:f23634f9e5adb51b2a77766dac217063e764337fbc816aa8ad9aaebcd4397fd3", size = 37960310, upload-time = "2025-07-27T16:31:06.151Z" }, + { url = "https://files.pythonhosted.org/packages/5e/00/c8f3130a50521a7977874817ca89e0599b1b4ee8e938bad8ae798a0e1f0d/scipy-1.16.1-cp314-cp314-win_amd64.whl", hash = "sha256:57d75524cb1c5a374958a2eae3d84e1929bb971204cc9d52213fb8589183fc19", size = 39319239, upload-time = "2025-07-27T16:31:59.942Z" }, + { url = 
"https://files.pythonhosted.org/packages/f2/f2/1ca3eda54c3a7e4c92f6acef7db7b3a057deb135540d23aa6343ef8ad333/scipy-1.16.1-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:d8da7c3dd67bcd93f15618938f43ed0995982eb38973023d46d4646c4283ad65", size = 36939460, upload-time = "2025-07-27T16:31:11.865Z" }, + { url = "https://files.pythonhosted.org/packages/80/30/98c2840b293a132400c0940bb9e140171dcb8189588619048f42b2ce7b4f/scipy-1.16.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:cc1d2f2fd48ba1e0620554fe5bc44d3e8f5d4185c8c109c7fbdf5af2792cfad2", size = 29093322, upload-time = "2025-07-27T16:31:17.045Z" }, + { url = "https://files.pythonhosted.org/packages/c1/e6/1e6e006e850622cf2a039b62d1a6ddc4497d4851e58b68008526f04a9a00/scipy-1.16.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:21a611ced9275cb861bacadbada0b8c0623bc00b05b09eb97f23b370fc2ae56d", size = 21365329, upload-time = "2025-07-27T16:31:21.188Z" }, + { url = "https://files.pythonhosted.org/packages/8e/02/72a5aa5b820589dda9a25e329ca752842bfbbaf635e36bc7065a9b42216e/scipy-1.16.1-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:8dfbb25dffc4c3dd9371d8ab456ca81beeaf6f9e1c2119f179392f0dc1ab7695", size = 23897544, upload-time = "2025-07-27T16:31:25.408Z" }, + { url = "https://files.pythonhosted.org/packages/2b/dc/7122d806a6f9eb8a33532982234bed91f90272e990f414f2830cfe656e0b/scipy-1.16.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f0ebb7204f063fad87fc0a0e4ff4a2ff40b2a226e4ba1b7e34bf4b79bf97cd86", size = 33442112, upload-time = "2025-07-27T16:31:30.62Z" }, + { url = "https://files.pythonhosted.org/packages/24/39/e383af23564daa1021a5b3afbe0d8d6a68ec639b943661841f44ac92de85/scipy-1.16.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f1b9e5962656f2734c2b285a8745358ecb4e4efbadd00208c80a389227ec61ff", size = 35286594, upload-time = "2025-07-27T16:31:36.112Z" }, + { url = 
"https://files.pythonhosted.org/packages/95/47/1a0b0aff40c3056d955f38b0df5d178350c3d74734ec54f9c68d23910be5/scipy-1.16.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5e1a106f8c023d57a2a903e771228bf5c5b27b5d692088f457acacd3b54511e4", size = 35665080, upload-time = "2025-07-27T16:31:42.025Z" }, + { url = "https://files.pythonhosted.org/packages/64/df/ce88803e9ed6e27fe9b9abefa157cf2c80e4fa527cf17ee14be41f790ad4/scipy-1.16.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:709559a1db68a9abc3b2c8672c4badf1614f3b440b3ab326d86a5c0491eafae3", size = 38050306, upload-time = "2025-07-27T16:31:48.109Z" }, + { url = "https://files.pythonhosted.org/packages/6e/6c/a76329897a7cae4937d403e623aa6aaea616a0bb5b36588f0b9d1c9a3739/scipy-1.16.1-cp314-cp314t-win_amd64.whl", hash = "sha256:c0c804d60492a0aad7f5b2bb1862f4548b990049e27e828391ff2bf6f7199998", size = 39427705, upload-time = "2025-07-27T16:31:53.96Z" }, +] + +[[package]] +name = "setuptools" +version = "80.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/18/5d/3bf57dcd21979b887f014ea83c24ae194cfcd12b9e0fda66b957c69d1fca/setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c", size = 1319958, upload-time = "2025-05-27T00:56:51.443Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a3/dc/17031897dae0efacfea57dfd3a82fdd2a2aeb58e0ff71b77b87e44edc772/setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922", size = 1201486, upload-time = "2025-05-27T00:56:49.664Z" }, +] + +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = 
"2024-12-04T17:35:28.174Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, +] + +[[package]] +name = "smart-open" +version = "7.3.0.post1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/18/2b/5e7234c68ed5bc872ad6ae77b8a421c2ed70dcb1190b44dc1abdeed5e347/smart_open-7.3.0.post1.tar.gz", hash = "sha256:ce6a3d9bc1afbf6234ad13c010b77f8cd36d24636811e3c52c3b5160f5214d1e", size = 51557, upload-time = "2025-07-03T10:06:31.271Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/5b/a2a3d4514c64818925f4e886d39981f1926eeb5288a4549c6b3c17ed66bb/smart_open-7.3.0.post1-py3-none-any.whl", hash = "sha256:c73661a2c24bf045c1e04e08fffc585b59af023fe783d57896f590489db66fb4", size = 61946, upload-time = "2025-07-03T10:06:29.599Z" }, +] + +[[package]] +name = "smmap" +version = "5.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/44/cd/a040c4b3119bbe532e5b0732286f805445375489fceaec1f48306068ee3b/smmap-5.0.2.tar.gz", hash = "sha256:26ea65a03958fa0c8a1c7e8c7a58fdc77221b8910f6be2131affade476898ad5", size = 22329, upload-time = "2025-01-02T07:14:40.909Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e", size = 24303, upload-time = "2025-01-02T07:14:38.724Z" }, +] + +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + +[[package]] +name = "sqlalchemy" +version = "2.0.43" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "greenlet", marker = "(python_full_version < '3.14' and platform_machine == 'AMD64') or (python_full_version < '3.14' and platform_machine == 'WIN32') or (python_full_version < '3.14' and platform_machine == 'aarch64') or (python_full_version < '3.14' and platform_machine == 'amd64') or (python_full_version < '3.14' and platform_machine == 'ppc64le') or (python_full_version < '3.14' and platform_machine == 'win32') or (python_full_version < '3.14' and platform_machine == 'x86_64')" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d7/bc/d59b5d97d27229b0e009bd9098cd81af71c2fa5549c580a0a67b9bed0496/sqlalchemy-2.0.43.tar.gz", hash = "sha256:788bfcef6787a7764169cfe9859fe425bf44559619e1d9f56f5bddf2ebf6f417", size = 9762949, upload-time = "2025-08-11T14:24:58.438Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/61/db/20c78f1081446095450bdc6ee6cc10045fce67a8e003a5876b6eaafc5cc4/sqlalchemy-2.0.43-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:20d81fc2736509d7a2bd33292e489b056cbae543661bb7de7ce9f1c0cd6e7f24", size = 2134891, upload-time = "2025-08-11T15:51:13.019Z" }, + { url = 
"https://files.pythonhosted.org/packages/45/0a/3d89034ae62b200b4396f0f95319f7d86e9945ee64d2343dcad857150fa2/sqlalchemy-2.0.43-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:25b9fc27650ff5a2c9d490c13c14906b918b0de1f8fcbb4c992712d8caf40e83", size = 2123061, upload-time = "2025-08-11T15:51:14.319Z" }, + { url = "https://files.pythonhosted.org/packages/cb/10/2711f7ff1805919221ad5bee205971254845c069ee2e7036847103ca1e4c/sqlalchemy-2.0.43-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6772e3ca8a43a65a37c88e2f3e2adfd511b0b1da37ef11ed78dea16aeae85bd9", size = 3320384, upload-time = "2025-08-11T15:52:35.088Z" }, + { url = "https://files.pythonhosted.org/packages/6e/0e/3d155e264d2ed2778484006ef04647bc63f55b3e2d12e6a4f787747b5900/sqlalchemy-2.0.43-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a113da919c25f7f641ffbd07fbc9077abd4b3b75097c888ab818f962707eb48", size = 3329648, upload-time = "2025-08-11T15:56:34.153Z" }, + { url = "https://files.pythonhosted.org/packages/5b/81/635100fb19725c931622c673900da5efb1595c96ff5b441e07e3dd61f2be/sqlalchemy-2.0.43-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4286a1139f14b7d70141c67a8ae1582fc2b69105f1b09d9573494eb4bb4b2687", size = 3258030, upload-time = "2025-08-11T15:52:36.933Z" }, + { url = "https://files.pythonhosted.org/packages/0c/ed/a99302716d62b4965fded12520c1cbb189f99b17a6d8cf77611d21442e47/sqlalchemy-2.0.43-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:529064085be2f4d8a6e5fab12d36ad44f1909a18848fcfbdb59cc6d4bbe48efe", size = 3294469, upload-time = "2025-08-11T15:56:35.553Z" }, + { url = "https://files.pythonhosted.org/packages/5d/a2/3a11b06715149bf3310b55a98b5c1e84a42cfb949a7b800bc75cb4e33abc/sqlalchemy-2.0.43-cp312-cp312-win32.whl", hash = "sha256:b535d35dea8bbb8195e7e2b40059e2253acb2b7579b73c1b432a35363694641d", size = 2098906, upload-time = "2025-08-11T15:55:00.645Z" }, + { url = 
"https://files.pythonhosted.org/packages/bc/09/405c915a974814b90aa591280623adc6ad6b322f61fd5cff80aeaef216c9/sqlalchemy-2.0.43-cp312-cp312-win_amd64.whl", hash = "sha256:1c6d85327ca688dbae7e2b06d7d84cfe4f3fffa5b5f9e21bb6ce9d0e1a0e0e0a", size = 2126260, upload-time = "2025-08-11T15:55:02.965Z" }, + { url = "https://files.pythonhosted.org/packages/41/1c/a7260bd47a6fae7e03768bf66451437b36451143f36b285522b865987ced/sqlalchemy-2.0.43-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e7c08f57f75a2bb62d7ee80a89686a5e5669f199235c6d1dac75cd59374091c3", size = 2130598, upload-time = "2025-08-11T15:51:15.903Z" }, + { url = "https://files.pythonhosted.org/packages/8e/84/8a337454e82388283830b3586ad7847aa9c76fdd4f1df09cdd1f94591873/sqlalchemy-2.0.43-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:14111d22c29efad445cd5021a70a8b42f7d9152d8ba7f73304c4d82460946aaa", size = 2118415, upload-time = "2025-08-11T15:51:17.256Z" }, + { url = "https://files.pythonhosted.org/packages/cf/ff/22ab2328148492c4d71899d62a0e65370ea66c877aea017a244a35733685/sqlalchemy-2.0.43-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21b27b56eb2f82653168cefe6cb8e970cdaf4f3a6cb2c5e3c3c1cf3158968ff9", size = 3248707, upload-time = "2025-08-11T15:52:38.444Z" }, + { url = "https://files.pythonhosted.org/packages/dc/29/11ae2c2b981de60187f7cbc84277d9d21f101093d1b2e945c63774477aba/sqlalchemy-2.0.43-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c5a9da957c56e43d72126a3f5845603da00e0293720b03bde0aacffcf2dc04f", size = 3253602, upload-time = "2025-08-11T15:56:37.348Z" }, + { url = "https://files.pythonhosted.org/packages/b8/61/987b6c23b12c56d2be451bc70900f67dd7d989d52b1ee64f239cf19aec69/sqlalchemy-2.0.43-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5d79f9fdc9584ec83d1b3c75e9f4595c49017f5594fee1a2217117647225d738", size = 3183248, upload-time = "2025-08-11T15:52:39.865Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/85/29d216002d4593c2ce1c0ec2cec46dda77bfbcd221e24caa6e85eff53d89/sqlalchemy-2.0.43-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9df7126fd9db49e3a5a3999442cc67e9ee8971f3cb9644250107d7296cb2a164", size = 3219363, upload-time = "2025-08-11T15:56:39.11Z" }, + { url = "https://files.pythonhosted.org/packages/b6/e4/bd78b01919c524f190b4905d47e7630bf4130b9f48fd971ae1c6225b6f6a/sqlalchemy-2.0.43-cp313-cp313-win32.whl", hash = "sha256:7f1ac7828857fcedb0361b48b9ac4821469f7694089d15550bbcf9ab22564a1d", size = 2096718, upload-time = "2025-08-11T15:55:05.349Z" }, + { url = "https://files.pythonhosted.org/packages/ac/a5/ca2f07a2a201f9497de1928f787926613db6307992fe5cda97624eb07c2f/sqlalchemy-2.0.43-cp313-cp313-win_amd64.whl", hash = "sha256:971ba928fcde01869361f504fcff3b7143b47d30de188b11c6357c0505824197", size = 2123200, upload-time = "2025-08-11T15:55:07.932Z" }, + { url = "https://files.pythonhosted.org/packages/b8/d9/13bdde6521f322861fab67473cec4b1cc8999f3871953531cf61945fad92/sqlalchemy-2.0.43-py3-none-any.whl", hash = "sha256:1681c21dd2ccee222c2fe0bef671d1aef7c504087c9c4e800371cfcc8ac966fc", size = 1924759, upload-time = "2025-08-11T15:39:53.024Z" }, +] + +[[package]] +name = "sqlparse" +version = "0.5.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e5/40/edede8dd6977b0d3da179a342c198ed100dd2aba4be081861ee5911e4da4/sqlparse-0.5.3.tar.gz", hash = "sha256:09f67787f56a0b16ecdbde1bfc7f5d9c3371ca683cfeaa8e6ff60b4807ec9272", size = 84999, upload-time = "2024-12-10T12:05:30.728Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a9/5c/bfd6bd0bf979426d405cc6e71eceb8701b148b16c21d2dc3c261efc61c7b/sqlparse-0.5.3-py3-none-any.whl", hash = "sha256:cf2196ed3418f3ba5de6af7e82c694a9fbdbfecccdfc72e281548517081f16ca", size = 44415, upload-time = "2024-12-10T12:05:27.824Z" }, +] + +[[package]] +name = "stack-data" +version = "0.6.3" +source = { registry = 
"https://pypi.org/simple" } +dependencies = [ + { name = "asttokens" }, + { name = "executing" }, + { name = "pure-eval" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" }, +] + +[[package]] +name = "starlette" +version = "0.47.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/04/57/d062573f391d062710d4088fa1369428c38d51460ab6fedff920efef932e/starlette-0.47.2.tar.gz", hash = "sha256:6ae9aa5db235e4846decc1e7b79c4f346adf41e9777aebeb49dfd09bbd7023d8", size = 2583948, upload-time = "2025-07-20T17:31:58.522Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/1f/b876b1f83aef204198a42dc101613fefccb32258e5428b5f9259677864b4/starlette-0.47.2-py3-none-any.whl", hash = "sha256:c5847e96134e5c5371ee9fac6fdf1a67336d5815e09eb2a01fdb57a351ef915b", size = 72984, upload-time = "2025-07-20T17:31:56.738Z" }, +] + +[[package]] +name = "sympy" +version = "1.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } +wheels = [ + { 
url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, +] + +[[package]] +name = "tensorboardx" +version = "2.6.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "packaging" }, + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2b/c5/d4cc6e293fb837aaf9f76dd7745476aeba8ef7ef5146c3b3f9ee375fe7a5/tensorboardx-2.6.4.tar.gz", hash = "sha256:b163ccb7798b31100b9f5fa4d6bc22dad362d7065c2f24b51e50731adde86828", size = 4769801, upload-time = "2025-06-10T22:37:07.419Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/1d/b5d63f1a6b824282b57f7b581810d20b7a28ca951f2d5b59f1eb0782c12b/tensorboardx-2.6.4-py3-none-any.whl", hash = "sha256:5970cf3a1f0a6a6e8b180ccf46f3fe832b8a25a70b86e5a237048a7c0beb18e2", size = 87201, upload-time = "2025-06-10T22:37:05.44Z" }, +] + +[[package]] +name = "threadpoolctl" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274, upload-time = "2025-03-13T13:49:23.031Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, +] + +[[package]] +name = "tokenizers" +version = "0.21.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/c2/2f/402986d0823f8d7ca139d969af2917fefaa9b947d1fb32f6168c509f2492/tokenizers-0.21.4.tar.gz", hash = "sha256:fa23f85fbc9a02ec5c6978da172cdcbac23498c3ca9f3645c5c68740ac007880", size = 351253, upload-time = "2025-07-28T15:48:54.325Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/c6/fdb6f72bf6454f52eb4a2510be7fb0f614e541a2554d6210e370d85efff4/tokenizers-0.21.4-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:2ccc10a7c3bcefe0f242867dc914fc1226ee44321eb618cfe3019b5df3400133", size = 2863987, upload-time = "2025-07-28T15:48:44.877Z" }, + { url = "https://files.pythonhosted.org/packages/8d/a6/28975479e35ddc751dc1ddc97b9b69bf7fcf074db31548aab37f8116674c/tokenizers-0.21.4-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:5e2f601a8e0cd5be5cc7506b20a79112370b9b3e9cb5f13f68ab11acd6ca7d60", size = 2732457, upload-time = "2025-07-28T15:48:43.265Z" }, + { url = "https://files.pythonhosted.org/packages/aa/8f/24f39d7b5c726b7b0be95dca04f344df278a3fe3a4deb15a975d194cbb32/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39b376f5a1aee67b4d29032ee85511bbd1b99007ec735f7f35c8a2eb104eade5", size = 3012624, upload-time = "2025-07-28T13:22:43.895Z" }, + { url = "https://files.pythonhosted.org/packages/58/47/26358925717687a58cb74d7a508de96649544fad5778f0cd9827398dc499/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2107ad649e2cda4488d41dfd031469e9da3fcbfd6183e74e4958fa729ffbf9c6", size = 2939681, upload-time = "2025-07-28T13:22:47.499Z" }, + { url = "https://files.pythonhosted.org/packages/99/6f/cc300fea5db2ab5ddc2c8aea5757a27b89c84469899710c3aeddc1d39801/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c73012da95afafdf235ba80047699df4384fdc481527448a078ffd00e45a7d9", size = 3247445, upload-time = "2025-07-28T15:48:39.711Z" }, + { url = 
"https://files.pythonhosted.org/packages/be/bf/98cb4b9c3c4afd8be89cfa6423704337dc20b73eb4180397a6e0d456c334/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f23186c40395fc390d27f519679a58023f368a0aad234af145e0f39ad1212732", size = 3428014, upload-time = "2025-07-28T13:22:49.569Z" }, + { url = "https://files.pythonhosted.org/packages/75/c7/96c1cc780e6ca7f01a57c13235dd05b7bc1c0f3588512ebe9d1331b5f5ae/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cc88bb34e23a54cc42713d6d98af5f1bf79c07653d24fe984d2d695ba2c922a2", size = 3193197, upload-time = "2025-07-28T13:22:51.471Z" }, + { url = "https://files.pythonhosted.org/packages/f2/90/273b6c7ec78af547694eddeea9e05de771278bd20476525ab930cecaf7d8/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51b7eabb104f46c1c50b486520555715457ae833d5aee9ff6ae853d1130506ff", size = 3115426, upload-time = "2025-07-28T15:48:41.439Z" }, + { url = "https://files.pythonhosted.org/packages/91/43/c640d5a07e95f1cf9d2c92501f20a25f179ac53a4f71e1489a3dcfcc67ee/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:714b05b2e1af1288bd1bc56ce496c4cebb64a20d158ee802887757791191e6e2", size = 9089127, upload-time = "2025-07-28T15:48:46.472Z" }, + { url = "https://files.pythonhosted.org/packages/44/a1/dd23edd6271d4dca788e5200a807b49ec3e6987815cd9d0a07ad9c96c7c2/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:1340ff877ceedfa937544b7d79f5b7becf33a4cfb58f89b3b49927004ef66f78", size = 9055243, upload-time = "2025-07-28T15:48:48.539Z" }, + { url = "https://files.pythonhosted.org/packages/21/2b/b410d6e9021c4b7ddb57248304dc817c4d4970b73b6ee343674914701197/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:3c1f4317576e465ac9ef0d165b247825a2a4078bcd01cba6b54b867bdf9fdd8b", size = 9298237, upload-time = "2025-07-28T15:48:50.443Z" }, + { url = 
"https://files.pythonhosted.org/packages/b7/0a/42348c995c67e2e6e5c89ffb9cfd68507cbaeb84ff39c49ee6e0a6dd0fd2/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:c212aa4e45ec0bb5274b16b6f31dd3f1c41944025c2358faaa5782c754e84c24", size = 9461980, upload-time = "2025-07-28T15:48:52.325Z" }, + { url = "https://files.pythonhosted.org/packages/3d/d3/dacccd834404cd71b5c334882f3ba40331ad2120e69ded32cf5fda9a7436/tokenizers-0.21.4-cp39-abi3-win32.whl", hash = "sha256:6c42a930bc5f4c47f4ea775c91de47d27910881902b0f20e4990ebe045a415d0", size = 2329871, upload-time = "2025-07-28T15:48:56.841Z" }, + { url = "https://files.pythonhosted.org/packages/41/f2/fd673d979185f5dcbac4be7d09461cbb99751554ffb6718d0013af8604cb/tokenizers-0.21.4-cp39-abi3-win_amd64.whl", hash = "sha256:475d807a5c3eb72c59ad9b5fcdb254f6e17f53dfcbb9903233b0dfa9c943b597", size = 2507568, upload-time = "2025-07-28T15:48:55.456Z" }, +] + +[[package]] +name = "torch" +version = "2.7.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "jinja2" }, + { name = "networkx" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = 
"nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "setuptools" }, + { name = "sympy" }, + { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "typing-extensions" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/93/fb505a5022a2e908d81fe9a5e0aa84c86c0d5f408173be71c6018836f34e/torch-2.7.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:27ea1e518df4c9de73af7e8a720770f3628e7f667280bce2be7a16292697e3fa", size = 98948276, upload-time = "2025-06-04T17:39:12.852Z" }, + { url = "https://files.pythonhosted.org/packages/56/7e/67c3fe2b8c33f40af06326a3d6ae7776b3e3a01daa8f71d125d78594d874/torch-2.7.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:c33360cfc2edd976c2633b3b66c769bdcbbf0e0b6550606d188431c81e7dd1fc", size = 821025792, upload-time = "2025-06-04T17:34:58.747Z" }, + { url = "https://files.pythonhosted.org/packages/a1/37/a37495502bc7a23bf34f89584fa5a78e25bae7b8da513bc1b8f97afb7009/torch-2.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:d8bf6e1856ddd1807e79dc57e54d3335f2b62e6f316ed13ed3ecfe1fc1df3d8b", size = 216050349, upload-time = "2025-06-04T17:38:59.709Z" }, + { url = "https://files.pythonhosted.org/packages/3a/60/04b77281c730bb13460628e518c52721257814ac6c298acd25757f6a175c/torch-2.7.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:787687087412c4bd68d315e39bc1223f08aae1d16a9e9771d95eabbb04ae98fb", size = 
68645146, upload-time = "2025-06-04T17:38:52.97Z" }, + { url = "https://files.pythonhosted.org/packages/66/81/e48c9edb655ee8eb8c2a6026abdb6f8d2146abd1f150979ede807bb75dcb/torch-2.7.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:03563603d931e70722dce0e11999d53aa80a375a3d78e6b39b9f6805ea0a8d28", size = 98946649, upload-time = "2025-06-04T17:38:43.031Z" }, + { url = "https://files.pythonhosted.org/packages/3a/24/efe2f520d75274fc06b695c616415a1e8a1021d87a13c68ff9dce733d088/torch-2.7.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:d632f5417b6980f61404a125b999ca6ebd0b8b4bbdbb5fbbba44374ab619a412", size = 821033192, upload-time = "2025-06-04T17:38:09.146Z" }, + { url = "https://files.pythonhosted.org/packages/dd/d9/9c24d230333ff4e9b6807274f6f8d52a864210b52ec794c5def7925f4495/torch-2.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:23660443e13995ee93e3d844786701ea4ca69f337027b05182f5ba053ce43b38", size = 216055668, upload-time = "2025-06-04T17:38:36.253Z" }, + { url = "https://files.pythonhosted.org/packages/95/bf/e086ee36ddcef9299f6e708d3b6c8487c1651787bb9ee2939eb2a7f74911/torch-2.7.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:0da4f4dba9f65d0d203794e619fe7ca3247a55ffdcbd17ae8fb83c8b2dc9b585", size = 68925988, upload-time = "2025-06-04T17:38:29.273Z" }, + { url = "https://files.pythonhosted.org/packages/69/6a/67090dcfe1cf9048448b31555af6efb149f7afa0a310a366adbdada32105/torch-2.7.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:e08d7e6f21a617fe38eeb46dd2213ded43f27c072e9165dc27300c9ef9570934", size = 99028857, upload-time = "2025-06-04T17:37:50.956Z" }, + { url = "https://files.pythonhosted.org/packages/90/1c/48b988870823d1cc381f15ec4e70ed3d65e043f43f919329b0045ae83529/torch-2.7.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:30207f672328a42df4f2174b8f426f354b2baa0b7cca3a0adb3d6ab5daf00dc8", size = 821098066, upload-time = "2025-06-04T17:37:33.939Z" }, + { url = 
"https://files.pythonhosted.org/packages/7b/eb/10050d61c9d5140c5dc04a89ed3257ef1a6b93e49dd91b95363d757071e0/torch-2.7.1-cp313-cp313t-win_amd64.whl", hash = "sha256:79042feca1c634aaf6603fe6feea8c6b30dfa140a6bbc0b973e2260c7e79a22e", size = 216336310, upload-time = "2025-06-04T17:36:09.862Z" }, + { url = "https://files.pythonhosted.org/packages/b1/29/beb45cdf5c4fc3ebe282bf5eafc8dfd925ead7299b3c97491900fe5ed844/torch-2.7.1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:988b0cbc4333618a1056d2ebad9eb10089637b659eb645434d0809d8d937b946", size = 68645708, upload-time = "2025-06-04T17:34:39.852Z" }, +] + +[[package]] +name = "tornado" +version = "6.5.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/09/ce/1eb500eae19f4648281bb2186927bb062d2438c2e5093d1360391afd2f90/tornado-6.5.2.tar.gz", hash = "sha256:ab53c8f9a0fa351e2c0741284e06c7a45da86afb544133201c5cc8578eb076a0", size = 510821, upload-time = "2025-08-08T18:27:00.78Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/48/6a7529df2c9cc12efd2e8f5dd219516184d703b34c06786809670df5b3bd/tornado-6.5.2-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:2436822940d37cde62771cff8774f4f00b3c8024fe482e16ca8387b8a2724db6", size = 442563, upload-time = "2025-08-08T18:26:42.945Z" }, + { url = "https://files.pythonhosted.org/packages/f2/b5/9b575a0ed3e50b00c40b08cbce82eb618229091d09f6d14bce80fc01cb0b/tornado-6.5.2-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:583a52c7aa94ee046854ba81d9ebb6c81ec0fd30386d96f7640c96dad45a03ef", size = 440729, upload-time = "2025-08-08T18:26:44.473Z" }, + { url = "https://files.pythonhosted.org/packages/1b/4e/619174f52b120efcf23633c817fd3fed867c30bff785e2cd5a53a70e483c/tornado-6.5.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b0fe179f28d597deab2842b86ed4060deec7388f1fd9c1b4a41adf8af058907e", size = 444295, upload-time = "2025-08-08T18:26:46.021Z" }, + { url = 
"https://files.pythonhosted.org/packages/95/fa/87b41709552bbd393c85dd18e4e3499dcd8983f66e7972926db8d96aa065/tornado-6.5.2-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b186e85d1e3536d69583d2298423744740986018e393d0321df7340e71898882", size = 443644, upload-time = "2025-08-08T18:26:47.625Z" }, + { url = "https://files.pythonhosted.org/packages/f9/41/fb15f06e33d7430ca89420283a8762a4e6b8025b800ea51796ab5e6d9559/tornado-6.5.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e792706668c87709709c18b353da1f7662317b563ff69f00bab83595940c7108", size = 443878, upload-time = "2025-08-08T18:26:50.599Z" }, + { url = "https://files.pythonhosted.org/packages/11/92/fe6d57da897776ad2e01e279170ea8ae726755b045fe5ac73b75357a5a3f/tornado-6.5.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:06ceb1300fd70cb20e43b1ad8aaee0266e69e7ced38fa910ad2e03285009ce7c", size = 444549, upload-time = "2025-08-08T18:26:51.864Z" }, + { url = "https://files.pythonhosted.org/packages/9b/02/c8f4f6c9204526daf3d760f4aa555a7a33ad0e60843eac025ccfd6ff4a93/tornado-6.5.2-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:74db443e0f5251be86cbf37929f84d8c20c27a355dd452a5cfa2aada0d001ec4", size = 443973, upload-time = "2025-08-08T18:26:53.625Z" }, + { url = "https://files.pythonhosted.org/packages/ae/2d/f5f5707b655ce2317190183868cd0f6822a1121b4baeae509ceb9590d0bd/tornado-6.5.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b5e735ab2889d7ed33b32a459cac490eda71a1ba6857b0118de476ab6c366c04", size = 443954, upload-time = "2025-08-08T18:26:55.072Z" }, + { url = "https://files.pythonhosted.org/packages/e8/59/593bd0f40f7355806bf6573b47b8c22f8e1374c9b6fd03114bd6b7a3dcfd/tornado-6.5.2-cp39-abi3-win32.whl", hash = "sha256:c6f29e94d9b37a95013bb669616352ddb82e3bfe8326fccee50583caebc8a5f0", size = 445023, upload-time = "2025-08-08T18:26:56.677Z" }, + { url = 
"https://files.pythonhosted.org/packages/c7/2a/f609b420c2f564a748a2d80ebfb2ee02a73ca80223af712fca591386cafb/tornado-6.5.2-cp39-abi3-win_amd64.whl", hash = "sha256:e56a5af51cc30dd2cae649429af65ca2f6571da29504a07995175df14c18f35f", size = 445427, upload-time = "2025-08-08T18:26:57.91Z" }, + { url = "https://files.pythonhosted.org/packages/5e/4f/e1f65e8f8c76d73658b33d33b81eed4322fb5085350e4328d5c956f0c8f9/tornado-6.5.2-cp39-abi3-win_arm64.whl", hash = "sha256:d6c33dc3672e3a1f3618eb63b7ef4683a7688e7b9e6e8f0d9aa5726360a004af", size = 444456, upload-time = "2025-08-08T18:26:59.207Z" }, +] + +[[package]] +name = "tqdm" +version = "4.67.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, +] + +[[package]] +name = "traitlets" +version = "5.14.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = 
"sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, +] + +[[package]] +name = "transformers" +version = "4.52.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/07/42/271bcf364788337ac24e7f200005ac7142aaf022206bd6119d2daca22c04/transformers-4.52.3.tar.gz", hash = "sha256:2e1de29374f27920aaf6d589d4e6339f33def2fb08809e1a1d792e040e9fbce7", size = 8951324, upload-time = "2025-05-22T14:40:52.888Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/36/f8/1f086942bc6a044e4e68dacf6de761a45367795efd5f57ad356765691c79/transformers-4.52.3-py3-none-any.whl", hash = "sha256:cd04059da50e7cf2a617ce3143ba8beffbf119f8c25a0717c3454fd9d0f19609", size = 10460322, upload-time = "2025-05-22T14:40:49.583Z" }, +] + +[[package]] +name = "triton" +version = "3.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "setuptools", marker = "sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/24/5f/950fb373bf9c01ad4eb5a8cd5eaf32cdf9e238c02f9293557a2129b9c4ac/triton-3.3.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9999e83aba21e1a78c1f36f21bce621b77bcaa530277a50484a7cb4a822f6e43", size = 155669138, upload-time = "2025-05-29T23:39:51.771Z" }, + { url = "https://files.pythonhosted.org/packages/74/1f/dfb531f90a2d367d914adfee771babbd3f1a5b26c3f5fbc458dee21daa78/triton-3.3.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b89d846b5a4198317fec27a5d3a609ea96b6d557ff44b56c23176546023c4240", size = 155673035, upload-time = "2025-05-29T23:40:02.468Z" }, 
+ { url = "https://files.pythonhosted.org/packages/28/71/bd20ffcb7a64c753dc2463489a61bf69d531f308e390ad06390268c4ea04/triton-3.3.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3198adb9d78b77818a5388bff89fa72ff36f9da0bc689db2f0a651a67ce6a42", size = 155735832, upload-time = "2025-05-29T23:40:10.522Z" }, +] + +[[package]] +name = "typing-extensions" +version = "4.14.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/98/5a/da40306b885cc8c09109dc2e1abd358d5684b1425678151cdaed4731c822/typing_extensions-4.14.1.tar.gz", hash = "sha256:38b39f4aeeab64884ce9f74c94263ef78f3c22467c8724005483154c26648d36", size = 107673, upload-time = "2025-07-04T13:28:34.16Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl", hash = "sha256:d1e1e3b58374dc93031d6eda2420a48ea44a36c2b4766a4fdeb3710755731d76", size = 43906, upload-time = "2025-07-04T13:28:32.743Z" }, +] + +[[package]] +name = "typing-inspection" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f8/b1/0c11f5058406b3af7609f121aaa6b609744687f1d158b3c3a5bf4cc94238/typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28", size = 75726, upload-time = "2025-05-21T18:55:23.885Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51", size = 14552, upload-time = "2025-05-21T18:55:22.152Z" }, +] + +[[package]] +name = "tzdata" +version = "2025.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" }, +] + +[[package]] +name = "urllib3" +version = "2.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, +] + +[[package]] +name = "uvicorn" +version = "0.35.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5e/42/e0e305207bb88c6b8d3061399c6a961ffe5fbb7e2aa63c9234df7259e9cd/uvicorn-0.35.0.tar.gz", hash = "sha256:bc662f087f7cf2ce11a1d7fd70b90c9f98ef2e2831556dd078d131b96cc94a01", size = 78473, upload-time = "2025-06-28T16:15:46.058Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/e2/dc81b1bd1dcfe91735810265e9d26bc8ec5da45b4c0f6237e286819194c3/uvicorn-0.35.0-py3-none-any.whl", hash = "sha256:197535216b25ff9b785e29a0b79199f55222193d47f820816e7da751e9bc8d4a", size = 66406, 
upload-time = "2025-06-28T16:15:44.816Z" }, +] + +[package.optional-dependencies] +standard = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "httptools" }, + { name = "python-dotenv" }, + { name = "pyyaml" }, + { name = "uvloop", marker = "platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32'" }, + { name = "watchfiles" }, + { name = "websockets" }, +] + +[[package]] +name = "uvloop" +version = "0.21.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/af/c0/854216d09d33c543f12a44b393c402e89a920b1a0a7dc634c42de91b9cf6/uvloop-0.21.0.tar.gz", hash = "sha256:3bf12b0fda68447806a7ad847bfa591613177275d35b6724b1ee573faa3704e3", size = 2492741, upload-time = "2024-10-14T23:38:35.489Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8c/4c/03f93178830dc7ce8b4cdee1d36770d2f5ebb6f3d37d354e061eefc73545/uvloop-0.21.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:359ec2c888397b9e592a889c4d72ba3d6befba8b2bb01743f72fffbde663b59c", size = 1471284, upload-time = "2024-10-14T23:37:47.833Z" }, + { url = "https://files.pythonhosted.org/packages/43/3e/92c03f4d05e50f09251bd8b2b2b584a2a7f8fe600008bcc4523337abe676/uvloop-0.21.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2", size = 821349, upload-time = "2024-10-14T23:37:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/a6/ef/a02ec5da49909dbbfb1fd205a9a1ac4e88ea92dcae885e7c961847cd51e2/uvloop-0.21.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:baa4dcdbd9ae0a372f2167a207cd98c9f9a1ea1188a8a526431eef2f8116cc8d", size = 4580089, upload-time = "2024-10-14T23:37:51.703Z" }, + { url = "https://files.pythonhosted.org/packages/06/a7/b4e6a19925c900be9f98bec0a75e6e8f79bb53bdeb891916609ab3958967/uvloop-0.21.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:86975dca1c773a2c9864f4c52c5a55631038e387b47eaf56210f873887b6c8dc", size = 4693770, upload-time = "2024-10-14T23:37:54.122Z" }, + { url = "https://files.pythonhosted.org/packages/ce/0c/f07435a18a4b94ce6bd0677d8319cd3de61f3a9eeb1e5f8ab4e8b5edfcb3/uvloop-0.21.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:461d9ae6660fbbafedd07559c6a2e57cd553b34b0065b6550685f6653a98c1cb", size = 4451321, upload-time = "2024-10-14T23:37:55.766Z" }, + { url = "https://files.pythonhosted.org/packages/8f/eb/f7032be105877bcf924709c97b1bf3b90255b4ec251f9340cef912559f28/uvloop-0.21.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:183aef7c8730e54c9a3ee3227464daed66e37ba13040bb3f350bc2ddc040f22f", size = 4659022, upload-time = "2024-10-14T23:37:58.195Z" }, + { url = "https://files.pythonhosted.org/packages/3f/8d/2cbef610ca21539f0f36e2b34da49302029e7c9f09acef0b1c3b5839412b/uvloop-0.21.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:bfd55dfcc2a512316e65f16e503e9e450cab148ef11df4e4e679b5e8253a5281", size = 1468123, upload-time = "2024-10-14T23:38:00.688Z" }, + { url = "https://files.pythonhosted.org/packages/93/0d/b0038d5a469f94ed8f2b2fce2434a18396d8fbfb5da85a0a9781ebbdec14/uvloop-0.21.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:787ae31ad8a2856fc4e7c095341cccc7209bd657d0e71ad0dc2ea83c4a6fa8af", size = 819325, upload-time = "2024-10-14T23:38:02.309Z" }, + { url = "https://files.pythonhosted.org/packages/50/94/0a687f39e78c4c1e02e3272c6b2ccdb4e0085fda3b8352fecd0410ccf915/uvloop-0.21.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ee4d4ef48036ff6e5cfffb09dd192c7a5027153948d85b8da7ff705065bacc6", size = 4582806, upload-time = "2024-10-14T23:38:04.711Z" }, + { url = "https://files.pythonhosted.org/packages/d2/19/f5b78616566ea68edd42aacaf645adbf71fbd83fc52281fba555dc27e3f1/uvloop-0.21.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816", size = 4701068, upload-time = "2024-10-14T23:38:06.385Z" }, + { url = "https://files.pythonhosted.org/packages/47/57/66f061ee118f413cd22a656de622925097170b9380b30091b78ea0c6ea75/uvloop-0.21.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd53ecc9a0f3d87ab847503c2e1552b690362e005ab54e8a48ba97da3924c0dc", size = 4454428, upload-time = "2024-10-14T23:38:08.416Z" }, + { url = "https://files.pythonhosted.org/packages/63/9a/0962b05b308494e3202d3f794a6e85abe471fe3cafdbcf95c2e8c713aabd/uvloop-0.21.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a5c39f217ab3c663dc699c04cbd50c13813e31d917642d459fdcec07555cc553", size = 4660018, upload-time = "2024-10-14T23:38:10.888Z" }, +] + +[[package]] +name = "virtualenv" +version = "20.34.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "distlib" }, + { name = "filelock" }, + { name = "platformdirs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1c/14/37fcdba2808a6c615681cd216fecae00413c9dab44fb2e57805ecf3eaee3/virtualenv-20.34.0.tar.gz", hash = "sha256:44815b2c9dee7ed86e387b842a84f20b93f7f417f95886ca1996a72a4138eb1a", size = 6003808, upload-time = "2025-08-13T14:24:07.464Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/06/04c8e804f813cf972e3262f3f8584c232de64f0cde9f703b46cf53a45090/virtualenv-20.34.0-py3-none-any.whl", hash = "sha256:341f5afa7eee943e4984a9207c025feedd768baff6753cd660c857ceb3e36026", size = 5983279, upload-time = "2025-08-13T14:24:05.111Z" }, +] + +[[package]] +name = "waitress" +version = "3.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bf/cb/04ddb054f45faa306a230769e868c28b8065ea196891f09004ebace5b184/waitress-3.0.2.tar.gz", hash = "sha256:682aaaf2af0c44ada4abfb70ded36393f0e307f4ab9456a215ce0020baefc31f", size = 179901, upload-time = "2024-11-16T20:02:35.195Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/8d/57/a27182528c90ef38d82b636a11f606b0cbb0e17588ed205435f8affe3368/waitress-3.0.2-py3-none-any.whl", hash = "sha256:c56d67fd6e87c2ee598b76abdd4e96cfad1f24cacdea5078d382b1f9d7b5ed2e", size = 56232, upload-time = "2024-11-16T20:02:33.858Z" }, +] + +[[package]] +name = "watchfiles" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2a/9a/d451fcc97d029f5812e898fd30a53fd8c15c7bbd058fd75cfc6beb9bd761/watchfiles-1.1.0.tar.gz", hash = "sha256:693ed7ec72cbfcee399e92c895362b6e66d63dac6b91e2c11ae03d10d503e575", size = 94406, upload-time = "2025-06-15T19:06:59.42Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/b8/858957045a38a4079203a33aaa7d23ea9269ca7761c8a074af3524fbb240/watchfiles-1.1.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9dc001c3e10de4725c749d4c2f2bdc6ae24de5a88a339c4bce32300a31ede179", size = 402339, upload-time = "2025-06-15T19:05:24.516Z" }, + { url = "https://files.pythonhosted.org/packages/80/28/98b222cca751ba68e88521fabd79a4fab64005fc5976ea49b53fa205d1fa/watchfiles-1.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d9ba68ec283153dead62cbe81872d28e053745f12335d037de9cbd14bd1877f5", size = 394409, upload-time = "2025-06-15T19:05:25.469Z" }, + { url = "https://files.pythonhosted.org/packages/86/50/dee79968566c03190677c26f7f47960aff738d32087087bdf63a5473e7df/watchfiles-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:130fc497b8ee68dce163e4254d9b0356411d1490e868bd8790028bc46c5cc297", size = 450939, upload-time = "2025-06-15T19:05:26.494Z" }, + { url = "https://files.pythonhosted.org/packages/40/45/a7b56fb129700f3cfe2594a01aa38d033b92a33dddce86c8dfdfc1247b72/watchfiles-1.1.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:50a51a90610d0845a5931a780d8e51d7bd7f309ebc25132ba975aca016b576a0", size = 457270, 
upload-time = "2025-06-15T19:05:27.466Z" }, + { url = "https://files.pythonhosted.org/packages/b5/c8/fa5ef9476b1d02dc6b5e258f515fcaaecf559037edf8b6feffcbc097c4b8/watchfiles-1.1.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc44678a72ac0910bac46fa6a0de6af9ba1355669b3dfaf1ce5f05ca7a74364e", size = 483370, upload-time = "2025-06-15T19:05:28.548Z" }, + { url = "https://files.pythonhosted.org/packages/98/68/42cfcdd6533ec94f0a7aab83f759ec11280f70b11bfba0b0f885e298f9bd/watchfiles-1.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a543492513a93b001975ae283a51f4b67973662a375a403ae82f420d2c7205ee", size = 598654, upload-time = "2025-06-15T19:05:29.997Z" }, + { url = "https://files.pythonhosted.org/packages/d3/74/b2a1544224118cc28df7e59008a929e711f9c68ce7d554e171b2dc531352/watchfiles-1.1.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ac164e20d17cc285f2b94dc31c384bc3aa3dd5e7490473b3db043dd70fbccfd", size = 478667, upload-time = "2025-06-15T19:05:31.172Z" }, + { url = "https://files.pythonhosted.org/packages/8c/77/e3362fe308358dc9f8588102481e599c83e1b91c2ae843780a7ded939a35/watchfiles-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7590d5a455321e53857892ab8879dce62d1f4b04748769f5adf2e707afb9d4f", size = 452213, upload-time = "2025-06-15T19:05:32.299Z" }, + { url = "https://files.pythonhosted.org/packages/6e/17/c8f1a36540c9a1558d4faf08e909399e8133599fa359bf52ec8fcee5be6f/watchfiles-1.1.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:37d3d3f7defb13f62ece99e9be912afe9dd8a0077b7c45ee5a57c74811d581a4", size = 626718, upload-time = "2025-06-15T19:05:33.415Z" }, + { url = "https://files.pythonhosted.org/packages/26/45/fb599be38b4bd38032643783d7496a26a6f9ae05dea1a42e58229a20ac13/watchfiles-1.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7080c4bb3efd70a07b1cc2df99a7aa51d98685be56be6038c3169199d0a1c69f", size = 623098, upload-time = 
"2025-06-15T19:05:34.534Z" }, + { url = "https://files.pythonhosted.org/packages/a1/e7/fdf40e038475498e160cd167333c946e45d8563ae4dd65caf757e9ffe6b4/watchfiles-1.1.0-cp312-cp312-win32.whl", hash = "sha256:cbcf8630ef4afb05dc30107bfa17f16c0896bb30ee48fc24bf64c1f970f3b1fd", size = 279209, upload-time = "2025-06-15T19:05:35.577Z" }, + { url = "https://files.pythonhosted.org/packages/3f/d3/3ae9d5124ec75143bdf088d436cba39812122edc47709cd2caafeac3266f/watchfiles-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:cbd949bdd87567b0ad183d7676feb98136cde5bb9025403794a4c0db28ed3a47", size = 292786, upload-time = "2025-06-15T19:05:36.559Z" }, + { url = "https://files.pythonhosted.org/packages/26/2f/7dd4fc8b5f2b34b545e19629b4a018bfb1de23b3a496766a2c1165ca890d/watchfiles-1.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:0a7d40b77f07be87c6faa93d0951a0fcd8cbca1ddff60a1b65d741bac6f3a9f6", size = 284343, upload-time = "2025-06-15T19:05:37.5Z" }, + { url = "https://files.pythonhosted.org/packages/d3/42/fae874df96595556a9089ade83be34a2e04f0f11eb53a8dbf8a8a5e562b4/watchfiles-1.1.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:5007f860c7f1f8df471e4e04aaa8c43673429047d63205d1630880f7637bca30", size = 402004, upload-time = "2025-06-15T19:05:38.499Z" }, + { url = "https://files.pythonhosted.org/packages/fa/55/a77e533e59c3003d9803c09c44c3651224067cbe7fb5d574ddbaa31e11ca/watchfiles-1.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:20ecc8abbd957046f1fe9562757903f5eaf57c3bce70929fda6c7711bb58074a", size = 393671, upload-time = "2025-06-15T19:05:39.52Z" }, + { url = "https://files.pythonhosted.org/packages/05/68/b0afb3f79c8e832e6571022611adbdc36e35a44e14f129ba09709aa4bb7a/watchfiles-1.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2f0498b7d2a3c072766dba3274fe22a183dbea1f99d188f1c6c72209a1063dc", size = 449772, upload-time = "2025-06-15T19:05:40.897Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/05/46dd1f6879bc40e1e74c6c39a1b9ab9e790bf1f5a2fe6c08b463d9a807f4/watchfiles-1.1.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:239736577e848678e13b201bba14e89718f5c2133dfd6b1f7846fa1b58a8532b", size = 456789, upload-time = "2025-06-15T19:05:42.045Z" }, + { url = "https://files.pythonhosted.org/packages/8b/ca/0eeb2c06227ca7f12e50a47a3679df0cd1ba487ea19cf844a905920f8e95/watchfiles-1.1.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eff4b8d89f444f7e49136dc695599a591ff769300734446c0a86cba2eb2f9895", size = 482551, upload-time = "2025-06-15T19:05:43.781Z" }, + { url = "https://files.pythonhosted.org/packages/31/47/2cecbd8694095647406645f822781008cc524320466ea393f55fe70eed3b/watchfiles-1.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12b0a02a91762c08f7264e2e79542f76870c3040bbc847fb67410ab81474932a", size = 597420, upload-time = "2025-06-15T19:05:45.244Z" }, + { url = "https://files.pythonhosted.org/packages/d9/7e/82abc4240e0806846548559d70f0b1a6dfdca75c1b4f9fa62b504ae9b083/watchfiles-1.1.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:29e7bc2eee15cbb339c68445959108803dc14ee0c7b4eea556400131a8de462b", size = 477950, upload-time = "2025-06-15T19:05:46.332Z" }, + { url = "https://files.pythonhosted.org/packages/25/0d/4d564798a49bf5482a4fa9416dea6b6c0733a3b5700cb8a5a503c4b15853/watchfiles-1.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9481174d3ed982e269c090f780122fb59cee6c3796f74efe74e70f7780ed94c", size = 451706, upload-time = "2025-06-15T19:05:47.459Z" }, + { url = "https://files.pythonhosted.org/packages/81/b5/5516cf46b033192d544102ea07c65b6f770f10ed1d0a6d388f5d3874f6e4/watchfiles-1.1.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:80f811146831c8c86ab17b640801c25dc0a88c630e855e2bef3568f30434d52b", size = 625814, upload-time = "2025-06-15T19:05:48.654Z" }, + { url = 
"https://files.pythonhosted.org/packages/0c/dd/7c1331f902f30669ac3e754680b6edb9a0dd06dea5438e61128111fadd2c/watchfiles-1.1.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:60022527e71d1d1fda67a33150ee42869042bce3d0fcc9cc49be009a9cded3fb", size = 622820, upload-time = "2025-06-15T19:05:50.088Z" }, + { url = "https://files.pythonhosted.org/packages/1b/14/36d7a8e27cd128d7b1009e7715a7c02f6c131be9d4ce1e5c3b73d0e342d8/watchfiles-1.1.0-cp313-cp313-win32.whl", hash = "sha256:32d6d4e583593cb8576e129879ea0991660b935177c0f93c6681359b3654bfa9", size = 279194, upload-time = "2025-06-15T19:05:51.186Z" }, + { url = "https://files.pythonhosted.org/packages/25/41/2dd88054b849aa546dbeef5696019c58f8e0774f4d1c42123273304cdb2e/watchfiles-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:f21af781a4a6fbad54f03c598ab620e3a77032c5878f3d780448421a6e1818c7", size = 292349, upload-time = "2025-06-15T19:05:52.201Z" }, + { url = "https://files.pythonhosted.org/packages/c8/cf/421d659de88285eb13941cf11a81f875c176f76a6d99342599be88e08d03/watchfiles-1.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:5366164391873ed76bfdf618818c82084c9db7fac82b64a20c44d335eec9ced5", size = 283836, upload-time = "2025-06-15T19:05:53.265Z" }, + { url = "https://files.pythonhosted.org/packages/45/10/6faf6858d527e3599cc50ec9fcae73590fbddc1420bd4fdccfebffeedbc6/watchfiles-1.1.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:17ab167cca6339c2b830b744eaf10803d2a5b6683be4d79d8475d88b4a8a4be1", size = 400343, upload-time = "2025-06-15T19:05:54.252Z" }, + { url = "https://files.pythonhosted.org/packages/03/20/5cb7d3966f5e8c718006d0e97dfe379a82f16fecd3caa7810f634412047a/watchfiles-1.1.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:328dbc9bff7205c215a7807da7c18dce37da7da718e798356212d22696404339", size = 392916, upload-time = "2025-06-15T19:05:55.264Z" }, + { url = 
"https://files.pythonhosted.org/packages/8c/07/d8f1176328fa9e9581b6f120b017e286d2a2d22ae3f554efd9515c8e1b49/watchfiles-1.1.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f7208ab6e009c627b7557ce55c465c98967e8caa8b11833531fdf95799372633", size = 449582, upload-time = "2025-06-15T19:05:56.317Z" }, + { url = "https://files.pythonhosted.org/packages/66/e8/80a14a453cf6038e81d072a86c05276692a1826471fef91df7537dba8b46/watchfiles-1.1.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a8f6f72974a19efead54195bc9bed4d850fc047bb7aa971268fd9a8387c89011", size = 456752, upload-time = "2025-06-15T19:05:57.359Z" }, + { url = "https://files.pythonhosted.org/packages/5a/25/0853b3fe0e3c2f5af9ea60eb2e781eade939760239a72c2d38fc4cc335f6/watchfiles-1.1.0-cp313-cp313t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d181ef50923c29cf0450c3cd47e2f0557b62218c50b2ab8ce2ecaa02bd97e670", size = 481436, upload-time = "2025-06-15T19:05:58.447Z" }, + { url = "https://files.pythonhosted.org/packages/fe/9e/4af0056c258b861fbb29dcb36258de1e2b857be4a9509e6298abcf31e5c9/watchfiles-1.1.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:adb4167043d3a78280d5d05ce0ba22055c266cf8655ce942f2fb881262ff3cdf", size = 596016, upload-time = "2025-06-15T19:05:59.59Z" }, + { url = "https://files.pythonhosted.org/packages/c5/fa/95d604b58aa375e781daf350897aaaa089cff59d84147e9ccff2447c8294/watchfiles-1.1.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8c5701dc474b041e2934a26d31d39f90fac8a3dee2322b39f7729867f932b1d4", size = 476727, upload-time = "2025-06-15T19:06:01.086Z" }, + { url = "https://files.pythonhosted.org/packages/65/95/fe479b2664f19be4cf5ceeb21be05afd491d95f142e72d26a42f41b7c4f8/watchfiles-1.1.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b067915e3c3936966a8607f6fe5487df0c9c4afb85226613b520890049deea20", size = 451864, upload-time = 
"2025-06-15T19:06:02.144Z" }, + { url = "https://files.pythonhosted.org/packages/d3/8a/3c4af14b93a15ce55901cd7a92e1a4701910f1768c78fb30f61d2b79785b/watchfiles-1.1.0-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:9c733cda03b6d636b4219625a4acb5c6ffb10803338e437fb614fef9516825ef", size = 625626, upload-time = "2025-06-15T19:06:03.578Z" }, + { url = "https://files.pythonhosted.org/packages/da/f5/cf6aa047d4d9e128f4b7cde615236a915673775ef171ff85971d698f3c2c/watchfiles-1.1.0-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:cc08ef8b90d78bfac66f0def80240b0197008e4852c9f285907377b2947ffdcb", size = 622744, upload-time = "2025-06-15T19:06:05.066Z" }, + { url = "https://files.pythonhosted.org/packages/2c/00/70f75c47f05dea6fd30df90f047765f6fc2d6eb8b5a3921379b0b04defa2/watchfiles-1.1.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:9974d2f7dc561cce3bb88dfa8eb309dab64c729de85fba32e98d75cf24b66297", size = 402114, upload-time = "2025-06-15T19:06:06.186Z" }, + { url = "https://files.pythonhosted.org/packages/53/03/acd69c48db4a1ed1de26b349d94077cca2238ff98fd64393f3e97484cae6/watchfiles-1.1.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c68e9f1fcb4d43798ad8814c4c1b61547b014b667216cb754e606bfade587018", size = 393879, upload-time = "2025-06-15T19:06:07.369Z" }, + { url = "https://files.pythonhosted.org/packages/2f/c8/a9a2a6f9c8baa4eceae5887fecd421e1b7ce86802bcfc8b6a942e2add834/watchfiles-1.1.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:95ab1594377effac17110e1352989bdd7bdfca9ff0e5eeccd8c69c5389b826d0", size = 450026, upload-time = "2025-06-15T19:06:08.476Z" }, + { url = "https://files.pythonhosted.org/packages/fe/51/d572260d98388e6e2b967425c985e07d47ee6f62e6455cefb46a6e06eda5/watchfiles-1.1.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fba9b62da882c1be1280a7584ec4515d0a6006a94d6e5819730ec2eab60ffe12", size = 457917, upload-time = "2025-06-15T19:06:09.988Z" }, + { url = 
"https://files.pythonhosted.org/packages/c6/2d/4258e52917bf9f12909b6ec314ff9636276f3542f9d3807d143f27309104/watchfiles-1.1.0-cp314-cp314-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3434e401f3ce0ed6b42569128b3d1e3af773d7ec18751b918b89cd49c14eaafb", size = 483602, upload-time = "2025-06-15T19:06:11.088Z" }, + { url = "https://files.pythonhosted.org/packages/84/99/bee17a5f341a4345fe7b7972a475809af9e528deba056f8963d61ea49f75/watchfiles-1.1.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fa257a4d0d21fcbca5b5fcba9dca5a78011cb93c0323fb8855c6d2dfbc76eb77", size = 596758, upload-time = "2025-06-15T19:06:12.197Z" }, + { url = "https://files.pythonhosted.org/packages/40/76/e4bec1d59b25b89d2b0716b41b461ed655a9a53c60dc78ad5771fda5b3e6/watchfiles-1.1.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fd1b3879a578a8ec2076c7961076df540b9af317123f84569f5a9ddee64ce92", size = 477601, upload-time = "2025-06-15T19:06:13.391Z" }, + { url = "https://files.pythonhosted.org/packages/1f/fa/a514292956f4a9ce3c567ec0c13cce427c158e9f272062685a8a727d08fc/watchfiles-1.1.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:62cc7a30eeb0e20ecc5f4bd113cd69dcdb745a07c68c0370cea919f373f65d9e", size = 451936, upload-time = "2025-06-15T19:06:14.656Z" }, + { url = "https://files.pythonhosted.org/packages/32/5d/c3bf927ec3bbeb4566984eba8dd7a8eb69569400f5509904545576741f88/watchfiles-1.1.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:891c69e027748b4a73847335d208e374ce54ca3c335907d381fde4e41661b13b", size = 626243, upload-time = "2025-06-15T19:06:16.232Z" }, + { url = "https://files.pythonhosted.org/packages/e6/65/6e12c042f1a68c556802a84d54bb06d35577c81e29fba14019562479159c/watchfiles-1.1.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:12fe8eaffaf0faa7906895b4f8bb88264035b3f0243275e0bf24af0436b27259", size = 623073, upload-time = "2025-06-15T19:06:17.457Z" }, + { url = 
"https://files.pythonhosted.org/packages/89/ab/7f79d9bf57329e7cbb0a6fd4c7bd7d0cee1e4a8ef0041459f5409da3506c/watchfiles-1.1.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:bfe3c517c283e484843cb2e357dd57ba009cff351edf45fb455b5fbd1f45b15f", size = 400872, upload-time = "2025-06-15T19:06:18.57Z" }, + { url = "https://files.pythonhosted.org/packages/df/d5/3f7bf9912798e9e6c516094db6b8932df53b223660c781ee37607030b6d3/watchfiles-1.1.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a9ccbf1f129480ed3044f540c0fdbc4ee556f7175e5ab40fe077ff6baf286d4e", size = 392877, upload-time = "2025-06-15T19:06:19.55Z" }, + { url = "https://files.pythonhosted.org/packages/0d/c5/54ec7601a2798604e01c75294770dbee8150e81c6e471445d7601610b495/watchfiles-1.1.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba0e3255b0396cac3cc7bbace76404dd72b5438bf0d8e7cefa2f79a7f3649caa", size = 449645, upload-time = "2025-06-15T19:06:20.66Z" }, + { url = "https://files.pythonhosted.org/packages/0a/04/c2f44afc3b2fce21ca0b7802cbd37ed90a29874f96069ed30a36dfe57c2b/watchfiles-1.1.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4281cd9fce9fc0a9dbf0fc1217f39bf9cf2b4d315d9626ef1d4e87b84699e7e8", size = 457424, upload-time = "2025-06-15T19:06:21.712Z" }, + { url = "https://files.pythonhosted.org/packages/9f/b0/eec32cb6c14d248095261a04f290636da3df3119d4040ef91a4a50b29fa5/watchfiles-1.1.0-cp314-cp314t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6d2404af8db1329f9a3c9b79ff63e0ae7131986446901582067d9304ae8aaf7f", size = 481584, upload-time = "2025-06-15T19:06:22.777Z" }, + { url = "https://files.pythonhosted.org/packages/d1/e2/ca4bb71c68a937d7145aa25709e4f5d68eb7698a25ce266e84b55d591bbd/watchfiles-1.1.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e78b6ed8165996013165eeabd875c5dfc19d41b54f94b40e9fff0eb3193e5e8e", size = 596675, upload-time = "2025-06-15T19:06:24.226Z" }, + { url = 
"https://files.pythonhosted.org/packages/a1/dd/b0e4b7fb5acf783816bc950180a6cd7c6c1d2cf7e9372c0ea634e722712b/watchfiles-1.1.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:249590eb75ccc117f488e2fabd1bfa33c580e24b96f00658ad88e38844a040bb", size = 477363, upload-time = "2025-06-15T19:06:25.42Z" }, + { url = "https://files.pythonhosted.org/packages/69/c4/088825b75489cb5b6a761a4542645718893d395d8c530b38734f19da44d2/watchfiles-1.1.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d05686b5487cfa2e2c28ff1aa370ea3e6c5accfe6435944ddea1e10d93872147", size = 452240, upload-time = "2025-06-15T19:06:26.552Z" }, + { url = "https://files.pythonhosted.org/packages/10/8c/22b074814970eeef43b7c44df98c3e9667c1f7bf5b83e0ff0201b0bd43f9/watchfiles-1.1.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:d0e10e6f8f6dc5762adee7dece33b722282e1f59aa6a55da5d493a97282fedd8", size = 625607, upload-time = "2025-06-15T19:06:27.606Z" }, + { url = "https://files.pythonhosted.org/packages/32/fa/a4f5c2046385492b2273213ef815bf71a0d4c1943b784fb904e184e30201/watchfiles-1.1.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:af06c863f152005c7592df1d6a7009c836a247c9d8adb78fef8575a5a98699db", size = 623315, upload-time = "2025-06-15T19:06:29.076Z" }, +] + +[[package]] +name = "wcwidth" +version = "0.2.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6c/63/53559446a878410fc5a5974feb13d31d78d752eb18aeba59c7fef1af7598/wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5", size = 101301, upload-time = "2024-01-06T02:10:57.829Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166, upload-time = "2024-01-06T02:10:55.763Z" }, 
+] + +[[package]] +name = "websockets" +version = "15.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/21/e6/26d09fab466b7ca9c7737474c52be4f76a40301b08362eb2dbc19dcc16c1/websockets-15.0.1.tar.gz", hash = "sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee", size = 177016, upload-time = "2025-03-05T20:03:41.606Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/6b/4545a0d843594f5d0771e86463606a3988b5a09ca5123136f8a76580dd63/websockets-15.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:3e90baa811a5d73f3ca0bcbf32064d663ed81318ab225ee4f427ad4e26e5aff3", size = 175437, upload-time = "2025-03-05T20:02:16.706Z" }, + { url = "https://files.pythonhosted.org/packages/f4/71/809a0f5f6a06522af902e0f2ea2757f71ead94610010cf570ab5c98e99ed/websockets-15.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:592f1a9fe869c778694f0aa806ba0374e97648ab57936f092fd9d87f8bc03665", size = 173096, upload-time = "2025-03-05T20:02:18.832Z" }, + { url = "https://files.pythonhosted.org/packages/3d/69/1a681dd6f02180916f116894181eab8b2e25b31e484c5d0eae637ec01f7c/websockets-15.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0701bc3cfcb9164d04a14b149fd74be7347a530ad3bbf15ab2c678a2cd3dd9a2", size = 173332, upload-time = "2025-03-05T20:02:20.187Z" }, + { url = "https://files.pythonhosted.org/packages/a6/02/0073b3952f5bce97eafbb35757f8d0d54812b6174ed8dd952aa08429bcc3/websockets-15.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8b56bdcdb4505c8078cb6c7157d9811a85790f2f2b3632c7d1462ab5783d215", size = 183152, upload-time = "2025-03-05T20:02:22.286Z" }, + { url = "https://files.pythonhosted.org/packages/74/45/c205c8480eafd114b428284840da0b1be9ffd0e4f87338dc95dc6ff961a1/websockets-15.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:0af68c55afbd5f07986df82831c7bff04846928ea8d1fd7f30052638788bc9b5", size = 182096, upload-time = "2025-03-05T20:02:24.368Z" }, + { url = "https://files.pythonhosted.org/packages/14/8f/aa61f528fba38578ec553c145857a181384c72b98156f858ca5c8e82d9d3/websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dee438fed052b52e4f98f76c5790513235efaa1ef7f3f2192c392cd7c91b65", size = 182523, upload-time = "2025-03-05T20:02:25.669Z" }, + { url = "https://files.pythonhosted.org/packages/ec/6d/0267396610add5bc0d0d3e77f546d4cd287200804fe02323797de77dbce9/websockets-15.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d5f6b181bb38171a8ad1d6aa58a67a6aa9d4b38d0f8c5f496b9e42561dfc62fe", size = 182790, upload-time = "2025-03-05T20:02:26.99Z" }, + { url = "https://files.pythonhosted.org/packages/02/05/c68c5adbf679cf610ae2f74a9b871ae84564462955d991178f95a1ddb7dd/websockets-15.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5d54b09eba2bada6011aea5375542a157637b91029687eb4fdb2dab11059c1b4", size = 182165, upload-time = "2025-03-05T20:02:30.291Z" }, + { url = "https://files.pythonhosted.org/packages/29/93/bb672df7b2f5faac89761cb5fa34f5cec45a4026c383a4b5761c6cea5c16/websockets-15.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3be571a8b5afed347da347bfcf27ba12b069d9d7f42cb8c7028b5e98bbb12597", size = 182160, upload-time = "2025-03-05T20:02:31.634Z" }, + { url = "https://files.pythonhosted.org/packages/ff/83/de1f7709376dc3ca9b7eeb4b9a07b4526b14876b6d372a4dc62312bebee0/websockets-15.0.1-cp312-cp312-win32.whl", hash = "sha256:c338ffa0520bdb12fbc527265235639fb76e7bc7faafbb93f6ba80d9c06578a9", size = 176395, upload-time = "2025-03-05T20:02:33.017Z" }, + { url = "https://files.pythonhosted.org/packages/7d/71/abf2ebc3bbfa40f391ce1428c7168fb20582d0ff57019b69ea20fa698043/websockets-15.0.1-cp312-cp312-win_amd64.whl", hash = 
"sha256:fcd5cf9e305d7b8338754470cf69cf81f420459dbae8a3b40cee57417f4614a7", size = 176841, upload-time = "2025-03-05T20:02:34.498Z" }, + { url = "https://files.pythonhosted.org/packages/cb/9f/51f0cf64471a9d2b4d0fc6c534f323b664e7095640c34562f5182e5a7195/websockets-15.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ee443ef070bb3b6ed74514f5efaa37a252af57c90eb33b956d35c8e9c10a1931", size = 175440, upload-time = "2025-03-05T20:02:36.695Z" }, + { url = "https://files.pythonhosted.org/packages/8a/05/aa116ec9943c718905997412c5989f7ed671bc0188ee2ba89520e8765d7b/websockets-15.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5a939de6b7b4e18ca683218320fc67ea886038265fd1ed30173f5ce3f8e85675", size = 173098, upload-time = "2025-03-05T20:02:37.985Z" }, + { url = "https://files.pythonhosted.org/packages/ff/0b/33cef55ff24f2d92924923c99926dcce78e7bd922d649467f0eda8368923/websockets-15.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:746ee8dba912cd6fc889a8147168991d50ed70447bf18bcda7039f7d2e3d9151", size = 173329, upload-time = "2025-03-05T20:02:39.298Z" }, + { url = "https://files.pythonhosted.org/packages/31/1d/063b25dcc01faa8fada1469bdf769de3768b7044eac9d41f734fd7b6ad6d/websockets-15.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:595b6c3969023ecf9041b2936ac3827e4623bfa3ccf007575f04c5a6aa318c22", size = 183111, upload-time = "2025-03-05T20:02:40.595Z" }, + { url = "https://files.pythonhosted.org/packages/93/53/9a87ee494a51bf63e4ec9241c1ccc4f7c2f45fff85d5bde2ff74fcb68b9e/websockets-15.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c714d2fc58b5ca3e285461a4cc0c9a66bd0e24c5da9911e30158286c9b5be7f", size = 182054, upload-time = "2025-03-05T20:02:41.926Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/b2/83a6ddf56cdcbad4e3d841fcc55d6ba7d19aeb89c50f24dd7e859ec0805f/websockets-15.0.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f3c1e2ab208db911594ae5b4f79addeb3501604a165019dd221c0bdcabe4db8", size = 182496, upload-time = "2025-03-05T20:02:43.304Z" }, + { url = "https://files.pythonhosted.org/packages/98/41/e7038944ed0abf34c45aa4635ba28136f06052e08fc2168520bb8b25149f/websockets-15.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:229cf1d3ca6c1804400b0a9790dc66528e08a6a1feec0d5040e8b9eb14422375", size = 182829, upload-time = "2025-03-05T20:02:48.812Z" }, + { url = "https://files.pythonhosted.org/packages/e0/17/de15b6158680c7623c6ef0db361da965ab25d813ae54fcfeae2e5b9ef910/websockets-15.0.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:756c56e867a90fb00177d530dca4b097dd753cde348448a1012ed6c5131f8b7d", size = 182217, upload-time = "2025-03-05T20:02:50.14Z" }, + { url = "https://files.pythonhosted.org/packages/33/2b/1f168cb6041853eef0362fb9554c3824367c5560cbdaad89ac40f8c2edfc/websockets-15.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:558d023b3df0bffe50a04e710bc87742de35060580a293c2a984299ed83bc4e4", size = 182195, upload-time = "2025-03-05T20:02:51.561Z" }, + { url = "https://files.pythonhosted.org/packages/86/eb/20b6cdf273913d0ad05a6a14aed4b9a85591c18a987a3d47f20fa13dcc47/websockets-15.0.1-cp313-cp313-win32.whl", hash = "sha256:ba9e56e8ceeeedb2e080147ba85ffcd5cd0711b89576b83784d8605a7df455fa", size = 176393, upload-time = "2025-03-05T20:02:53.814Z" }, + { url = "https://files.pythonhosted.org/packages/1b/6c/c65773d6cab416a64d191d6ee8a8b1c68a09970ea6909d16965d26bfed1e/websockets-15.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:e09473f095a819042ecb2ab9465aee615bd9c2028e4ef7d933600a8401c79561", size = 176837, upload-time = "2025-03-05T20:02:55.237Z" }, + { url = 
"https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload-time = "2025-03-05T20:03:39.41Z" }, +] + +[[package]] +name = "werkzeug" +version = "3.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9f/69/83029f1f6300c5fb2471d621ab06f6ec6b3324685a2ce0f9777fd4a8b71e/werkzeug-3.1.3.tar.gz", hash = "sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746", size = 806925, upload-time = "2024-11-08T15:52:18.093Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/52/24/ab44c871b0f07f491e5d2ad12c9bd7358e527510618cb1b803a88e986db1/werkzeug-3.1.3-py3-none-any.whl", hash = "sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e", size = 224498, upload-time = "2024-11-08T15:52:16.132Z" }, +] + +[[package]] +name = "widgetsnbextension" +version = "4.0.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/53/2e0253c5efd69c9656b1843892052a31c36d37ad42812b5da45c62191f7e/widgetsnbextension-4.0.14.tar.gz", hash = "sha256:a3629b04e3edb893212df862038c7232f62973373869db5084aed739b437b5af", size = 1097428, upload-time = "2025-04-10T13:01:25.628Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl", hash = "sha256:4875a9eaf72fbf5079dc372a51a9f268fc38d46f767cbf85c43a36da5cb9b575", size = 2196503, upload-time = "2025-04-10T13:01:23.086Z" }, +] + +[[package]] +name = "wrapt" +version = "1.17.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/95/8f/aeb76c5b46e273670962298c23e7ddde79916cb74db802131d49a85e4b7d/wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0", size = 55547, upload-time = "2025-08-12T05:53:21.714Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/41/cad1aba93e752f1f9268c77270da3c469883d56e2798e7df6240dcb2287b/wrapt-1.17.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ab232e7fdb44cdfbf55fc3afa31bcdb0d8980b9b95c38b6405df2acb672af0e0", size = 53998, upload-time = "2025-08-12T05:51:47.138Z" }, + { url = "https://files.pythonhosted.org/packages/60/f8/096a7cc13097a1869fe44efe68dace40d2a16ecb853141394047f0780b96/wrapt-1.17.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9baa544e6acc91130e926e8c802a17f3b16fbea0fd441b5a60f5cf2cc5c3deba", size = 39020, upload-time = "2025-08-12T05:51:35.906Z" }, + { url = "https://files.pythonhosted.org/packages/33/df/bdf864b8997aab4febb96a9ae5c124f700a5abd9b5e13d2a3214ec4be705/wrapt-1.17.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6b538e31eca1a7ea4605e44f81a48aa24c4632a277431a6ed3f328835901f4fd", size = 39098, upload-time = "2025-08-12T05:51:57.474Z" }, + { url = "https://files.pythonhosted.org/packages/9f/81/5d931d78d0eb732b95dc3ddaeeb71c8bb572fb01356e9133916cd729ecdd/wrapt-1.17.3-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:042ec3bb8f319c147b1301f2393bc19dba6e176b7da446853406d041c36c7828", size = 88036, upload-time = "2025-08-12T05:52:34.784Z" }, + { url = "https://files.pythonhosted.org/packages/ca/38/2e1785df03b3d72d34fc6252d91d9d12dc27a5c89caef3335a1bbb8908ca/wrapt-1.17.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3af60380ba0b7b5aeb329bc4e402acd25bd877e98b3727b0135cb5c2efdaefe9", size = 88156, upload-time = "2025-08-12T05:52:13.599Z" }, + { url = 
"https://files.pythonhosted.org/packages/b3/8b/48cdb60fe0603e34e05cffda0b2a4adab81fd43718e11111a4b0100fd7c1/wrapt-1.17.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0b02e424deef65c9f7326d8c19220a2c9040c51dc165cddb732f16198c168396", size = 87102, upload-time = "2025-08-12T05:52:14.56Z" }, + { url = "https://files.pythonhosted.org/packages/3c/51/d81abca783b58f40a154f1b2c56db1d2d9e0d04fa2d4224e357529f57a57/wrapt-1.17.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:74afa28374a3c3a11b3b5e5fca0ae03bef8450d6aa3ab3a1e2c30e3a75d023dc", size = 87732, upload-time = "2025-08-12T05:52:36.165Z" }, + { url = "https://files.pythonhosted.org/packages/9e/b1/43b286ca1392a006d5336412d41663eeef1ad57485f3e52c767376ba7e5a/wrapt-1.17.3-cp312-cp312-win32.whl", hash = "sha256:4da9f45279fff3543c371d5ababc57a0384f70be244de7759c85a7f989cb4ebe", size = 36705, upload-time = "2025-08-12T05:53:07.123Z" }, + { url = "https://files.pythonhosted.org/packages/28/de/49493f962bd3c586ab4b88066e967aa2e0703d6ef2c43aa28cb83bf7b507/wrapt-1.17.3-cp312-cp312-win_amd64.whl", hash = "sha256:e71d5c6ebac14875668a1e90baf2ea0ef5b7ac7918355850c0908ae82bcb297c", size = 38877, upload-time = "2025-08-12T05:53:05.436Z" }, + { url = "https://files.pythonhosted.org/packages/f1/48/0f7102fe9cb1e8a5a77f80d4f0956d62d97034bbe88d33e94699f99d181d/wrapt-1.17.3-cp312-cp312-win_arm64.whl", hash = "sha256:604d076c55e2fdd4c1c03d06dc1a31b95130010517b5019db15365ec4a405fc6", size = 36885, upload-time = "2025-08-12T05:52:54.367Z" }, + { url = "https://files.pythonhosted.org/packages/fc/f6/759ece88472157acb55fc195e5b116e06730f1b651b5b314c66291729193/wrapt-1.17.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a47681378a0439215912ef542c45a783484d4dd82bac412b71e59cf9c0e1cea0", size = 54003, upload-time = "2025-08-12T05:51:48.627Z" }, + { url = "https://files.pythonhosted.org/packages/4f/a9/49940b9dc6d47027dc850c116d79b4155f15c08547d04db0f07121499347/wrapt-1.17.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:54a30837587c6ee3cd1a4d1c2ec5d24e77984d44e2f34547e2323ddb4e22eb77", size = 39025, upload-time = "2025-08-12T05:51:37.156Z" }, + { url = "https://files.pythonhosted.org/packages/45/35/6a08de0f2c96dcdd7fe464d7420ddb9a7655a6561150e5fc4da9356aeaab/wrapt-1.17.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:16ecf15d6af39246fe33e507105d67e4b81d8f8d2c6598ff7e3ca1b8a37213f7", size = 39108, upload-time = "2025-08-12T05:51:58.425Z" }, + { url = "https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6fd1ad24dc235e4ab88cda009e19bf347aabb975e44fd5c2fb22a3f6e4141277", size = 88072, upload-time = "2025-08-12T05:52:37.53Z" }, + { url = "https://files.pythonhosted.org/packages/78/f2/efe19ada4a38e4e15b6dff39c3e3f3f73f5decf901f66e6f72fe79623a06/wrapt-1.17.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ed61b7c2d49cee3c027372df5809a59d60cf1b6c2f81ee980a091f3afed6a2d", size = 88214, upload-time = "2025-08-12T05:52:15.886Z" }, + { url = "https://files.pythonhosted.org/packages/40/90/ca86701e9de1622b16e09689fc24b76f69b06bb0150990f6f4e8b0eeb576/wrapt-1.17.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:423ed5420ad5f5529db9ce89eac09c8a2f97da18eb1c870237e84c5a5c2d60aa", size = 87105, upload-time = "2025-08-12T05:52:17.914Z" }, + { url = "https://files.pythonhosted.org/packages/fd/e0/d10bd257c9a3e15cbf5523025252cc14d77468e8ed644aafb2d6f54cb95d/wrapt-1.17.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e01375f275f010fcbf7f643b4279896d04e571889b8a5b3f848423d91bf07050", size = 87766, upload-time = "2025-08-12T05:52:39.243Z" }, + { url = "https://files.pythonhosted.org/packages/e8/cf/7d848740203c7b4b27eb55dbfede11aca974a51c3d894f6cc4b865f42f58/wrapt-1.17.3-cp313-cp313-win32.whl", hash = 
"sha256:53e5e39ff71b3fc484df8a522c933ea2b7cdd0d5d15ae82e5b23fde87d44cbd8", size = 36711, upload-time = "2025-08-12T05:53:10.074Z" }, + { url = "https://files.pythonhosted.org/packages/57/54/35a84d0a4d23ea675994104e667ceff49227ce473ba6a59ba2c84f250b74/wrapt-1.17.3-cp313-cp313-win_amd64.whl", hash = "sha256:1f0b2f40cf341ee8cc1a97d51ff50dddb9fcc73241b9143ec74b30fc4f44f6cb", size = 38885, upload-time = "2025-08-12T05:53:08.695Z" }, + { url = "https://files.pythonhosted.org/packages/01/77/66e54407c59d7b02a3c4e0af3783168fff8e5d61def52cda8728439d86bc/wrapt-1.17.3-cp313-cp313-win_arm64.whl", hash = "sha256:7425ac3c54430f5fc5e7b6f41d41e704db073309acfc09305816bc6a0b26bb16", size = 36896, upload-time = "2025-08-12T05:52:55.34Z" }, + { url = "https://files.pythonhosted.org/packages/02/a2/cd864b2a14f20d14f4c496fab97802001560f9f41554eef6df201cd7f76c/wrapt-1.17.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:cf30f6e3c077c8e6a9a7809c94551203c8843e74ba0c960f4a98cd80d4665d39", size = 54132, upload-time = "2025-08-12T05:51:49.864Z" }, + { url = "https://files.pythonhosted.org/packages/d5/46/d011725b0c89e853dc44cceb738a307cde5d240d023d6d40a82d1b4e1182/wrapt-1.17.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e228514a06843cae89621384cfe3a80418f3c04aadf8a3b14e46a7be704e4235", size = 39091, upload-time = "2025-08-12T05:51:38.935Z" }, + { url = "https://files.pythonhosted.org/packages/2e/9e/3ad852d77c35aae7ddebdbc3b6d35ec8013af7d7dddad0ad911f3d891dae/wrapt-1.17.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:5ea5eb3c0c071862997d6f3e02af1d055f381b1d25b286b9d6644b79db77657c", size = 39172, upload-time = "2025-08-12T05:51:59.365Z" }, + { url = "https://files.pythonhosted.org/packages/c3/f7/c983d2762bcce2326c317c26a6a1e7016f7eb039c27cdf5c4e30f4160f31/wrapt-1.17.3-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:281262213373b6d5e4bb4353bc36d1ba4084e6d6b5d242863721ef2bf2c2930b", size = 87163, upload-time = 
"2025-08-12T05:52:40.965Z" }, + { url = "https://files.pythonhosted.org/packages/e4/0f/f673f75d489c7f22d17fe0193e84b41540d962f75fce579cf6873167c29b/wrapt-1.17.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dc4a8d2b25efb6681ecacad42fca8859f88092d8732b170de6a5dddd80a1c8fa", size = 87963, upload-time = "2025-08-12T05:52:20.326Z" }, + { url = "https://files.pythonhosted.org/packages/df/61/515ad6caca68995da2fac7a6af97faab8f78ebe3bf4f761e1b77efbc47b5/wrapt-1.17.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:373342dd05b1d07d752cecbec0c41817231f29f3a89aa8b8843f7b95992ed0c7", size = 86945, upload-time = "2025-08-12T05:52:21.581Z" }, + { url = "https://files.pythonhosted.org/packages/d3/bd/4e70162ce398462a467bc09e768bee112f1412e563620adc353de9055d33/wrapt-1.17.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d40770d7c0fd5cbed9d84b2c3f2e156431a12c9a37dc6284060fb4bec0b7ffd4", size = 86857, upload-time = "2025-08-12T05:52:43.043Z" }, + { url = "https://files.pythonhosted.org/packages/2b/b8/da8560695e9284810b8d3df8a19396a6e40e7518059584a1a394a2b35e0a/wrapt-1.17.3-cp314-cp314-win32.whl", hash = "sha256:fbd3c8319de8e1dc79d346929cd71d523622da527cca14e0c1d257e31c2b8b10", size = 37178, upload-time = "2025-08-12T05:53:12.605Z" }, + { url = "https://files.pythonhosted.org/packages/db/c8/b71eeb192c440d67a5a0449aaee2310a1a1e8eca41676046f99ed2487e9f/wrapt-1.17.3-cp314-cp314-win_amd64.whl", hash = "sha256:e1a4120ae5705f673727d3253de3ed0e016f7cd78dc463db1b31e2463e1f3cf6", size = 39310, upload-time = "2025-08-12T05:53:11.106Z" }, + { url = "https://files.pythonhosted.org/packages/45/20/2cda20fd4865fa40f86f6c46ed37a2a8356a7a2fde0773269311f2af56c7/wrapt-1.17.3-cp314-cp314-win_arm64.whl", hash = "sha256:507553480670cab08a800b9463bdb881b2edeed77dc677b0a5915e6106e91a58", size = 37266, upload-time = "2025-08-12T05:52:56.531Z" }, + { url = 
"https://files.pythonhosted.org/packages/77/ed/dd5cf21aec36c80443c6f900449260b80e2a65cf963668eaef3b9accce36/wrapt-1.17.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:ed7c635ae45cfbc1a7371f708727bf74690daedc49b4dba310590ca0bd28aa8a", size = 56544, upload-time = "2025-08-12T05:51:51.109Z" }, + { url = "https://files.pythonhosted.org/packages/8d/96/450c651cc753877ad100c7949ab4d2e2ecc4d97157e00fa8f45df682456a/wrapt-1.17.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:249f88ed15503f6492a71f01442abddd73856a0032ae860de6d75ca62eed8067", size = 40283, upload-time = "2025-08-12T05:51:39.912Z" }, + { url = "https://files.pythonhosted.org/packages/d1/86/2fcad95994d9b572db57632acb6f900695a648c3e063f2cd344b3f5c5a37/wrapt-1.17.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5a03a38adec8066d5a37bea22f2ba6bbf39fcdefbe2d91419ab864c3fb515454", size = 40366, upload-time = "2025-08-12T05:52:00.693Z" }, + { url = "https://files.pythonhosted.org/packages/64/0e/f4472f2fdde2d4617975144311f8800ef73677a159be7fe61fa50997d6c0/wrapt-1.17.3-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5d4478d72eb61c36e5b446e375bbc49ed002430d17cdec3cecb36993398e1a9e", size = 108571, upload-time = "2025-08-12T05:52:44.521Z" }, + { url = "https://files.pythonhosted.org/packages/cc/01/9b85a99996b0a97c8a17484684f206cbb6ba73c1ce6890ac668bcf3838fb/wrapt-1.17.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223db574bb38637e8230eb14b185565023ab624474df94d2af18f1cdb625216f", size = 113094, upload-time = "2025-08-12T05:52:22.618Z" }, + { url = "https://files.pythonhosted.org/packages/25/02/78926c1efddcc7b3aa0bc3d6b33a822f7d898059f7cd9ace8c8318e559ef/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e405adefb53a435f01efa7ccdec012c016b5a1d3f35459990afc39b6be4d5056", size = 110659, upload-time = "2025-08-12T05:52:24.057Z" }, + { url = 
"https://files.pythonhosted.org/packages/dc/ee/c414501ad518ac3e6fe184753632fe5e5ecacdcf0effc23f31c1e4f7bfcf/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:88547535b787a6c9ce4086917b6e1d291aa8ed914fdd3a838b3539dc95c12804", size = 106946, upload-time = "2025-08-12T05:52:45.976Z" }, + { url = "https://files.pythonhosted.org/packages/be/44/a1bd64b723d13bb151d6cc91b986146a1952385e0392a78567e12149c7b4/wrapt-1.17.3-cp314-cp314t-win32.whl", hash = "sha256:41b1d2bc74c2cac6f9074df52b2efbef2b30bdfe5f40cb78f8ca22963bc62977", size = 38717, upload-time = "2025-08-12T05:53:15.214Z" }, + { url = "https://files.pythonhosted.org/packages/79/d9/7cfd5a312760ac4dd8bf0184a6ee9e43c33e47f3dadc303032ce012b8fa3/wrapt-1.17.3-cp314-cp314t-win_amd64.whl", hash = "sha256:73d496de46cd2cdbdbcce4ae4bcdb4afb6a11234a1df9c085249d55166b95116", size = 41334, upload-time = "2025-08-12T05:53:14.178Z" }, + { url = "https://files.pythonhosted.org/packages/46/78/10ad9781128ed2f99dbc474f43283b13fea8ba58723e98844367531c18e9/wrapt-1.17.3-cp314-cp314t-win_arm64.whl", hash = "sha256:f38e60678850c42461d4202739f9bf1e3a737c7ad283638251e79cc49effb6b6", size = 38471, upload-time = "2025-08-12T05:52:57.784Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" }, +] + +[[package]] +name = "yarl" +version = "1.20.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3c/fb/efaa23fa4e45537b827620f04cf8f3cd658b76642205162e072703a5b963/yarl-1.20.1.tar.gz", hash = "sha256:d017a4997ee50c91fd5466cef416231bb82177b93b029906cefc542ce14c35ac", size = 186428, upload-time = "2025-06-10T00:46:09.923Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/5f/9a/cb7fad7d73c69f296eda6815e4a2c7ed53fc70c2f136479a91c8e5fbdb6d/yarl-1.20.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:bdcc4cd244e58593a4379fe60fdee5ac0331f8eb70320a24d591a3be197b94a9", size = 133667, upload-time = "2025-06-10T00:43:44.369Z" }, + { url = "https://files.pythonhosted.org/packages/67/38/688577a1cb1e656e3971fb66a3492501c5a5df56d99722e57c98249e5b8a/yarl-1.20.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b29a2c385a5f5b9c7d9347e5812b6f7ab267193c62d282a540b4fc528c8a9d2a", size = 91025, upload-time = "2025-06-10T00:43:46.295Z" }, + { url = "https://files.pythonhosted.org/packages/50/ec/72991ae51febeb11a42813fc259f0d4c8e0507f2b74b5514618d8b640365/yarl-1.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1112ae8154186dfe2de4732197f59c05a83dc814849a5ced892b708033f40dc2", size = 89709, upload-time = "2025-06-10T00:43:48.22Z" }, + { url = "https://files.pythonhosted.org/packages/99/da/4d798025490e89426e9f976702e5f9482005c548c579bdae792a4c37769e/yarl-1.20.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:90bbd29c4fe234233f7fa2b9b121fb63c321830e5d05b45153a2ca68f7d310ee", size = 352287, upload-time = "2025-06-10T00:43:49.924Z" }, + { url = "https://files.pythonhosted.org/packages/1a/26/54a15c6a567aac1c61b18aa0f4b8aa2e285a52d547d1be8bf48abe2b3991/yarl-1.20.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:680e19c7ce3710ac4cd964e90dad99bf9b5029372ba0c7cbfcd55e54d90ea819", size = 345429, upload-time = "2025-06-10T00:43:51.7Z" }, + { url = "https://files.pythonhosted.org/packages/d6/95/9dcf2386cb875b234353b93ec43e40219e14900e046bf6ac118f94b1e353/yarl-1.20.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4a979218c1fdb4246a05efc2cc23859d47c89af463a90b99b7c56094daf25a16", size = 365429, upload-time = "2025-06-10T00:43:53.494Z" }, + { url = 
"https://files.pythonhosted.org/packages/91/b2/33a8750f6a4bc224242a635f5f2cff6d6ad5ba651f6edcccf721992c21a0/yarl-1.20.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:255b468adf57b4a7b65d8aad5b5138dce6a0752c139965711bdcb81bc370e1b6", size = 363862, upload-time = "2025-06-10T00:43:55.766Z" }, + { url = "https://files.pythonhosted.org/packages/98/28/3ab7acc5b51f4434b181b0cee8f1f4b77a65919700a355fb3617f9488874/yarl-1.20.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a97d67108e79cfe22e2b430d80d7571ae57d19f17cda8bb967057ca8a7bf5bfd", size = 355616, upload-time = "2025-06-10T00:43:58.056Z" }, + { url = "https://files.pythonhosted.org/packages/36/a3/f666894aa947a371724ec7cd2e5daa78ee8a777b21509b4252dd7bd15e29/yarl-1.20.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8570d998db4ddbfb9a590b185a0a33dbf8aafb831d07a5257b4ec9948df9cb0a", size = 339954, upload-time = "2025-06-10T00:43:59.773Z" }, + { url = "https://files.pythonhosted.org/packages/f1/81/5f466427e09773c04219d3450d7a1256138a010b6c9f0af2d48565e9ad13/yarl-1.20.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:97c75596019baae7c71ccf1d8cc4738bc08134060d0adfcbe5642f778d1dca38", size = 365575, upload-time = "2025-06-10T00:44:02.051Z" }, + { url = "https://files.pythonhosted.org/packages/2e/e3/e4b0ad8403e97e6c9972dd587388940a032f030ebec196ab81a3b8e94d31/yarl-1.20.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:1c48912653e63aef91ff988c5432832692ac5a1d8f0fb8a33091520b5bbe19ef", size = 365061, upload-time = "2025-06-10T00:44:04.196Z" }, + { url = "https://files.pythonhosted.org/packages/ac/99/b8a142e79eb86c926f9f06452eb13ecb1bb5713bd01dc0038faf5452e544/yarl-1.20.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4c3ae28f3ae1563c50f3d37f064ddb1511ecc1d5584e88c6b7c63cf7702a6d5f", size = 364142, upload-time = "2025-06-10T00:44:06.527Z" }, + { url = 
"https://files.pythonhosted.org/packages/34/f2/08ed34a4a506d82a1a3e5bab99ccd930a040f9b6449e9fd050320e45845c/yarl-1.20.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c5e9642f27036283550f5f57dc6156c51084b458570b9d0d96100c8bebb186a8", size = 381894, upload-time = "2025-06-10T00:44:08.379Z" }, + { url = "https://files.pythonhosted.org/packages/92/f8/9a3fbf0968eac704f681726eff595dce9b49c8a25cd92bf83df209668285/yarl-1.20.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:2c26b0c49220d5799f7b22c6838409ee9bc58ee5c95361a4d7831f03cc225b5a", size = 383378, upload-time = "2025-06-10T00:44:10.51Z" }, + { url = "https://files.pythonhosted.org/packages/af/85/9363f77bdfa1e4d690957cd39d192c4cacd1c58965df0470a4905253b54f/yarl-1.20.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564ab3d517e3d01c408c67f2e5247aad4019dcf1969982aba3974b4093279004", size = 374069, upload-time = "2025-06-10T00:44:12.834Z" }, + { url = "https://files.pythonhosted.org/packages/35/99/9918c8739ba271dcd935400cff8b32e3cd319eaf02fcd023d5dcd487a7c8/yarl-1.20.1-cp312-cp312-win32.whl", hash = "sha256:daea0d313868da1cf2fac6b2d3a25c6e3a9e879483244be38c8e6a41f1d876a5", size = 81249, upload-time = "2025-06-10T00:44:14.731Z" }, + { url = "https://files.pythonhosted.org/packages/eb/83/5d9092950565481b413b31a23e75dd3418ff0a277d6e0abf3729d4d1ce25/yarl-1.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:48ea7d7f9be0487339828a4de0360d7ce0efc06524a48e1810f945c45b813698", size = 86710, upload-time = "2025-06-10T00:44:16.716Z" }, + { url = "https://files.pythonhosted.org/packages/8a/e1/2411b6d7f769a07687acee88a062af5833cf1966b7266f3d8dfb3d3dc7d3/yarl-1.20.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:0b5ff0fbb7c9f1b1b5ab53330acbfc5247893069e7716840c8e7d5bb7355038a", size = 131811, upload-time = "2025-06-10T00:44:18.933Z" }, + { url = "https://files.pythonhosted.org/packages/b2/27/584394e1cb76fb771371770eccad35de400e7b434ce3142c2dd27392c968/yarl-1.20.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:14f326acd845c2b2e2eb38fb1346c94f7f3b01a4f5c788f8144f9b630bfff9a3", size = 90078, upload-time = "2025-06-10T00:44:20.635Z" }, + { url = "https://files.pythonhosted.org/packages/bf/9a/3246ae92d4049099f52d9b0fe3486e3b500e29b7ea872d0f152966fc209d/yarl-1.20.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f60e4ad5db23f0b96e49c018596707c3ae89f5d0bd97f0ad3684bcbad899f1e7", size = 88748, upload-time = "2025-06-10T00:44:22.34Z" }, + { url = "https://files.pythonhosted.org/packages/a3/25/35afe384e31115a1a801fbcf84012d7a066d89035befae7c5d4284df1e03/yarl-1.20.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:49bdd1b8e00ce57e68ba51916e4bb04461746e794e7c4d4bbc42ba2f18297691", size = 349595, upload-time = "2025-06-10T00:44:24.314Z" }, + { url = "https://files.pythonhosted.org/packages/28/2d/8aca6cb2cabc8f12efcb82749b9cefecbccfc7b0384e56cd71058ccee433/yarl-1.20.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:66252d780b45189975abfed839616e8fd2dbacbdc262105ad7742c6ae58f3e31", size = 342616, upload-time = "2025-06-10T00:44:26.167Z" }, + { url = "https://files.pythonhosted.org/packages/0b/e9/1312633d16b31acf0098d30440ca855e3492d66623dafb8e25b03d00c3da/yarl-1.20.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:59174e7332f5d153d8f7452a102b103e2e74035ad085f404df2e40e663a22b28", size = 361324, upload-time = "2025-06-10T00:44:27.915Z" }, + { url = "https://files.pythonhosted.org/packages/bc/a0/688cc99463f12f7669eec7c8acc71ef56a1521b99eab7cd3abb75af887b0/yarl-1.20.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e3968ec7d92a0c0f9ac34d5ecfd03869ec0cab0697c91a45db3fbbd95fe1b653", size = 359676, upload-time = "2025-06-10T00:44:30.041Z" }, + { url = "https://files.pythonhosted.org/packages/af/44/46407d7f7a56e9a85a4c207724c9f2c545c060380718eea9088f222ba697/yarl-1.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:d1a4fbb50e14396ba3d375f68bfe02215d8e7bc3ec49da8341fe3157f59d2ff5", size = 352614, upload-time = "2025-06-10T00:44:32.171Z" }, + { url = "https://files.pythonhosted.org/packages/b1/91/31163295e82b8d5485d31d9cf7754d973d41915cadce070491778d9c9825/yarl-1.20.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11a62c839c3a8eac2410e951301309426f368388ff2f33799052787035793b02", size = 336766, upload-time = "2025-06-10T00:44:34.494Z" }, + { url = "https://files.pythonhosted.org/packages/b4/8e/c41a5bc482121f51c083c4c2bcd16b9e01e1cf8729e380273a952513a21f/yarl-1.20.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:041eaa14f73ff5a8986b4388ac6bb43a77f2ea09bf1913df7a35d4646db69e53", size = 364615, upload-time = "2025-06-10T00:44:36.856Z" }, + { url = "https://files.pythonhosted.org/packages/e3/5b/61a3b054238d33d70ea06ebba7e58597891b71c699e247df35cc984ab393/yarl-1.20.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:377fae2fef158e8fd9d60b4c8751387b8d1fb121d3d0b8e9b0be07d1b41e83dc", size = 360982, upload-time = "2025-06-10T00:44:39.141Z" }, + { url = "https://files.pythonhosted.org/packages/df/a3/6a72fb83f8d478cb201d14927bc8040af901811a88e0ff2da7842dd0ed19/yarl-1.20.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1c92f4390e407513f619d49319023664643d3339bd5e5a56a3bebe01bc67ec04", size = 369792, upload-time = "2025-06-10T00:44:40.934Z" }, + { url = "https://files.pythonhosted.org/packages/7c/af/4cc3c36dfc7c077f8dedb561eb21f69e1e9f2456b91b593882b0b18c19dc/yarl-1.20.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:d25ddcf954df1754ab0f86bb696af765c5bfaba39b74095f27eececa049ef9a4", size = 382049, upload-time = "2025-06-10T00:44:42.854Z" }, + { url = "https://files.pythonhosted.org/packages/19/3a/e54e2c4752160115183a66dc9ee75a153f81f3ab2ba4bf79c3c53b33de34/yarl-1.20.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:909313577e9619dcff8c31a0ea2aa0a2a828341d92673015456b3ae492e7317b", size = 
384774, upload-time = "2025-06-10T00:44:45.275Z" }, + { url = "https://files.pythonhosted.org/packages/9c/20/200ae86dabfca89060ec6447649f219b4cbd94531e425e50d57e5f5ac330/yarl-1.20.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:793fd0580cb9664548c6b83c63b43c477212c0260891ddf86809e1c06c8b08f1", size = 374252, upload-time = "2025-06-10T00:44:47.31Z" }, + { url = "https://files.pythonhosted.org/packages/83/75/11ee332f2f516b3d094e89448da73d557687f7d137d5a0f48c40ff211487/yarl-1.20.1-cp313-cp313-win32.whl", hash = "sha256:468f6e40285de5a5b3c44981ca3a319a4b208ccc07d526b20b12aeedcfa654b7", size = 81198, upload-time = "2025-06-10T00:44:49.164Z" }, + { url = "https://files.pythonhosted.org/packages/ba/ba/39b1ecbf51620b40ab402b0fc817f0ff750f6d92712b44689c2c215be89d/yarl-1.20.1-cp313-cp313-win_amd64.whl", hash = "sha256:495b4ef2fea40596bfc0affe3837411d6aa3371abcf31aac0ccc4bdd64d4ef5c", size = 86346, upload-time = "2025-06-10T00:44:51.182Z" }, + { url = "https://files.pythonhosted.org/packages/43/c7/669c52519dca4c95153c8ad96dd123c79f354a376346b198f438e56ffeb4/yarl-1.20.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:f60233b98423aab21d249a30eb27c389c14929f47be8430efa7dbd91493a729d", size = 138826, upload-time = "2025-06-10T00:44:52.883Z" }, + { url = "https://files.pythonhosted.org/packages/6a/42/fc0053719b44f6ad04a75d7f05e0e9674d45ef62f2d9ad2c1163e5c05827/yarl-1.20.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:6f3eff4cc3f03d650d8755c6eefc844edde99d641d0dcf4da3ab27141a5f8ddf", size = 93217, upload-time = "2025-06-10T00:44:54.658Z" }, + { url = "https://files.pythonhosted.org/packages/4f/7f/fa59c4c27e2a076bba0d959386e26eba77eb52ea4a0aac48e3515c186b4c/yarl-1.20.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:69ff8439d8ba832d6bed88af2c2b3445977eba9a4588b787b32945871c2444e3", size = 92700, upload-time = "2025-06-10T00:44:56.784Z" }, + { url = 
"https://files.pythonhosted.org/packages/2f/d4/062b2f48e7c93481e88eff97a6312dca15ea200e959f23e96d8ab898c5b8/yarl-1.20.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cf34efa60eb81dd2645a2e13e00bb98b76c35ab5061a3989c7a70f78c85006d", size = 347644, upload-time = "2025-06-10T00:44:59.071Z" }, + { url = "https://files.pythonhosted.org/packages/89/47/78b7f40d13c8f62b499cc702fdf69e090455518ae544c00a3bf4afc9fc77/yarl-1.20.1-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:8e0fe9364ad0fddab2688ce72cb7a8e61ea42eff3c7caeeb83874a5d479c896c", size = 323452, upload-time = "2025-06-10T00:45:01.605Z" }, + { url = "https://files.pythonhosted.org/packages/eb/2b/490d3b2dc66f52987d4ee0d3090a147ea67732ce6b4d61e362c1846d0d32/yarl-1.20.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8f64fbf81878ba914562c672024089e3401974a39767747691c65080a67b18c1", size = 346378, upload-time = "2025-06-10T00:45:03.946Z" }, + { url = "https://files.pythonhosted.org/packages/66/ad/775da9c8a94ce925d1537f939a4f17d782efef1f973039d821cbe4bcc211/yarl-1.20.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f6342d643bf9a1de97e512e45e4b9560a043347e779a173250824f8b254bd5ce", size = 353261, upload-time = "2025-06-10T00:45:05.992Z" }, + { url = "https://files.pythonhosted.org/packages/4b/23/0ed0922b47a4f5c6eb9065d5ff1e459747226ddce5c6a4c111e728c9f701/yarl-1.20.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56dac5f452ed25eef0f6e3c6a066c6ab68971d96a9fb441791cad0efba6140d3", size = 335987, upload-time = "2025-06-10T00:45:08.227Z" }, + { url = "https://files.pythonhosted.org/packages/3e/49/bc728a7fe7d0e9336e2b78f0958a2d6b288ba89f25a1762407a222bf53c3/yarl-1.20.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7d7f497126d65e2cad8dc5f97d34c27b19199b6414a40cb36b52f41b79014be", size = 329361, 
upload-time = "2025-06-10T00:45:10.11Z" }, + { url = "https://files.pythonhosted.org/packages/93/8f/b811b9d1f617c83c907e7082a76e2b92b655400e61730cd61a1f67178393/yarl-1.20.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:67e708dfb8e78d8a19169818eeb5c7a80717562de9051bf2413aca8e3696bf16", size = 346460, upload-time = "2025-06-10T00:45:12.055Z" }, + { url = "https://files.pythonhosted.org/packages/70/fd/af94f04f275f95da2c3b8b5e1d49e3e79f1ed8b6ceb0f1664cbd902773ff/yarl-1.20.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:595c07bc79af2494365cc96ddeb772f76272364ef7c80fb892ef9d0649586513", size = 334486, upload-time = "2025-06-10T00:45:13.995Z" }, + { url = "https://files.pythonhosted.org/packages/84/65/04c62e82704e7dd0a9b3f61dbaa8447f8507655fd16c51da0637b39b2910/yarl-1.20.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:7bdd2f80f4a7df852ab9ab49484a4dee8030023aa536df41f2d922fd57bf023f", size = 342219, upload-time = "2025-06-10T00:45:16.479Z" }, + { url = "https://files.pythonhosted.org/packages/91/95/459ca62eb958381b342d94ab9a4b6aec1ddec1f7057c487e926f03c06d30/yarl-1.20.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:c03bfebc4ae8d862f853a9757199677ab74ec25424d0ebd68a0027e9c639a390", size = 350693, upload-time = "2025-06-10T00:45:18.399Z" }, + { url = "https://files.pythonhosted.org/packages/a6/00/d393e82dd955ad20617abc546a8f1aee40534d599ff555ea053d0ec9bf03/yarl-1.20.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:344d1103e9c1523f32a5ed704d576172d2cabed3122ea90b1d4e11fe17c66458", size = 355803, upload-time = "2025-06-10T00:45:20.677Z" }, + { url = "https://files.pythonhosted.org/packages/9e/ed/c5fb04869b99b717985e244fd93029c7a8e8febdfcffa06093e32d7d44e7/yarl-1.20.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:88cab98aa4e13e1ade8c141daeedd300a4603b7132819c484841bb7af3edce9e", size = 341709, upload-time = "2025-06-10T00:45:23.221Z" }, + { url = 
"https://files.pythonhosted.org/packages/24/fd/725b8e73ac2a50e78a4534ac43c6addf5c1c2d65380dd48a9169cc6739a9/yarl-1.20.1-cp313-cp313t-win32.whl", hash = "sha256:b121ff6a7cbd4abc28985b6028235491941b9fe8fe226e6fdc539c977ea1739d", size = 86591, upload-time = "2025-06-10T00:45:25.793Z" }, + { url = "https://files.pythonhosted.org/packages/94/c3/b2e9f38bc3e11191981d57ea08cab2166e74ea770024a646617c9cddd9f6/yarl-1.20.1-cp313-cp313t-win_amd64.whl", hash = "sha256:541d050a355bbbc27e55d906bc91cb6fe42f96c01413dd0f4ed5a5240513874f", size = 93003, upload-time = "2025-06-10T00:45:27.752Z" }, + { url = "https://files.pythonhosted.org/packages/b4/2d/2345fce04cfd4bee161bf1e7d9cdc702e3e16109021035dbb24db654a622/yarl-1.20.1-py3-none-any.whl", hash = "sha256:83b8eb083fe4683c6115795d9fc1cfaf2cbbefb19b3a1cb68f6527460f483a77", size = 46542, upload-time = "2025-06-10T00:46:07.521Z" }, +] + +[[package]] +name = "zipp" +version = "3.23.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, +] diff --git a/doc/source/ray-overview/examples/object-detection/README.md b/doc/source/ray-overview/examples/object-detection/README.md index bd4ae7a8b95f..4a37b2d5a8cb 100644 --- a/doc/source/ray-overview/examples/object-detection/README.md +++ b/doc/source/ray-overview/examples/object-detection/README.md @@ -1,39 +1,5 @@ # Scalable video processing - - -This tutorial builds an end-to-end face mask detection pipeline that leverages 
distributed fine-tuning, large-scale batch inference, video analytics, and scalable serving: - -[1.object_detection_train.ipynb](1.object_detection_train.ipynb) -Fine-tune a pre-trained Faster R-CNN model on a face mask dataset in Pascal Visual Object Classes (VOC) format using Ray Train. Parse XML annotations with Ray Data, retrieve images from S3, run a distributed training loop, checkpoint the model, and visualize inference results. -Object Detection Training Pipeline - -[2.object_detection_batch_inference_eval.ipynb](2.object_detection_batch_inference_eval.ipynb) -Load a fine-tuned model from S3 into Anyscale cluster storage, perform GPU-accelerated batch inference on a test set with Ray Data, and calculate object detection metrics (mAP, IoU, recall) using TorchMetrics for comprehensive model evaluation. -Metrics Calculation Pipeline - -[3.video_processing_batch_inference.ipynb](3.video_processing_batch_inference.ipynb) -Demonstrate a real-world video analytics workflow: read a video from S3, split it into frames, apply the detection model in parallel using Ray Data batch inference, draw bounding boxes and labels on each frame, and regenerate an annotated video for downstream consumption. -Video Processing Pipeline - -[4.object_detection_serve.ipynb](4.object_detection_serve.ipynb) -Deploy the trained Faster R-CNN mask detector as a production-ready microservice using Ray Serve and FastAPI. Set up ingress, configure autoscaling and fractional GPU allocation, test the HTTP endpoint, and manage the service lifecycle both locally and through Anyscale Services. 
- - -# Face mask detection pipeline - This tutorial builds an end-to-end face mask detection pipeline that leverages distributed fine-tuning, large-scale batch inference, video analytics, and scalable serving: [1.object_detection_train.ipynb](1.object_detection_train.ipynb) diff --git a/doc/source/ray-overview/getting-started.md b/doc/source/ray-overview/getting-started.md index 958134f20619..785c7976393a 100644 --- a/doc/source/ray-overview/getting-started.md +++ b/doc/source/ray-overview/getting-started.md @@ -34,7 +34,7 @@ Use individual libraries for ML workloads. Each library specializes in a specifi [Ray Data](data_quickstart) provides distributed data processing optimized for machine learning and AI workloads. It efficiently streams data through data pipelines. -Here's an example on how to scale offline inference and training ingest with Ray Data. +Here's an example of how to scale offline inference and training ingest with Ray Data. ````{note} To run this example, install Ray Data: diff --git a/doc/source/ray-overview/index.md b/doc/source/ray-overview/index.md index 99303fc819f7..0eb732ed02df 100644 --- a/doc/source/ray-overview/index.md +++ b/doc/source/ray-overview/index.md @@ -1,7 +1,7 @@ (overview-overview)= # Overview -Ray is an open-source unified framework for scaling AI and Python applications like machine learning. It provides the compute layer for parallel processing so that you don’t need to be a distributed systems expert. Ray minimizes the complexity of running your distributed individual and end-to-end machine learning workflows with these components: +Ray is an open-source unified framework for scaling AI and Python applications like machine learning. It provides the compute layer for parallel processing so that you don’t need to be a distributed systems expert. 
Ray minimizes the complexity of running your distributed individual workflows and end-to-end machine learning workflows with these components: * Scalable libraries for common machine learning tasks such as data preprocessing, distributed training, hyperparameter tuning, reinforcement learning, and model serving. * Pythonic distributed computing primitives for parallelizing and scaling Python applications. * Integrations and utilities for integrating and deploying a Ray cluster with existing tools and infrastructure such as Kubernetes, AWS, GCP, and Azure. @@ -16,10 +16,10 @@ For ML platform builders and ML engineers, Ray: * Reduces friction between development and production by enabling the same Python code to scale seamlessly from a laptop to a large cluster. For distributed systems engineers, Ray automatically handles key processes: -* Orchestration--Managing the various components of a distributed system. -* Scheduling--Coordinating when and where tasks are executed. -* Fault tolerance--Ensuring tasks complete regardless of inevitable points of failure. -* Auto-scaling--Adjusting the number of resources allocated to dynamic demand. +* Orchestration: Managing the various components of a distributed system. +* Scheduling: Coordinating when and where tasks are executed. +* Fault tolerance: Ensuring tasks complete regardless of inevitable points of failure. +* Auto-scaling: Adjusting the number of resources allocated to dynamic demand. ## What you can do with Ray @@ -110,7 +110,7 @@ Each of [Ray's](../ray-air/getting-started) five native libraries distributes a - [Serve](../serve/index): Scalable and programmable serving to deploy models for online inference, with optional microbatching to improve performance. - [RLlib](../rllib/index): Scalable distributed reinforcement learning workloads. -Ray's libraries are for both data scientists and ML engineers alike. 
For data scientists, these libraries can be used to scale individual workloads, and also end-to-end ML applications. For ML Engineers, these libraries provides scalable platform abstractions that can be used to easily onboard and integrate tooling from the broader ML ecosystem. +Ray's libraries are for both data scientists and ML engineers. For data scientists, these libraries can be used to scale individual workloads and end-to-end ML applications. For ML engineers, these libraries provide scalable platform abstractions that can be used to easily onboard and integrate tooling from the broader ML ecosystem. For custom applications, the [Ray Core](../ray-core/walkthrough) library enables Python developers to easily build scalable, distributed systems that can run on a laptop, cluster, cloud, or Kubernetes. It's the foundation that Ray AI libraries and third-party integrations (Ray ecosystem) are built on. diff --git a/doc/source/ray-overview/installation.rst b/doc/source/ray-overview/installation.rst index a6131d6ab8b2..7ba1db6424b3 100644 --- a/doc/source/ray-overview/installation.rst +++ b/doc/source/ray-overview/installation.rst @@ -196,7 +196,7 @@ Here's a summary of the variations: * For MacOS x86_64, commits predating August 7, 2021 will have ``macosx_10_13`` in the filename instead of ``macosx_10_15``. * For MacOS x86_64, commits predating June 1, 2025 will have ``macosx_10_15`` in the filename instead of ``macosx_12_0``. -.. _apple-silcon-supprt: +.. _apple-silicon-support: M1 Mac (Apple Silicon) Support ------------------------------ @@ -436,7 +436,7 @@ We publish the dependencies that are installed in our ``ray`` Docker images for .. tab-item:: ray (Python 3.9) :sync: ray (Python 3.9) - Ray version: nightly (`f99d0ea `_) + Ray version: nightly (`ec5d410 `_) .. 
literalinclude:: ./pip_freeze_ray-py39-cpu.txt diff --git a/doc/source/ray-overview/pip_freeze_ray-ml-py39-cpu.txt b/doc/source/ray-overview/pip_freeze_ray-ml-py39-cpu.txt index 74d74f03ee00..01830b5aa586 100644 --- a/doc/source/ray-overview/pip_freeze_ray-ml-py39-cpu.txt +++ b/doc/source/ray-overview/pip_freeze_ray-ml-py39-cpu.txt @@ -12,11 +12,12 @@ aiohappyeyeballs==2.6.1 aiohttp==3.11.16 aiohttp-cors==0.7.0 aioitertools==0.11.0 -aiorwlock==1.5.0 +aiorwlock==1.3.0 aiosignal==1.3.1 aiosqlite==0.19.0 ale-py==0.10.1 alembic==1.12.1 +amqp==5.3.1 annotated-types==0.6.0 antlr4-python3-runtime==4.11.1 anyio==3.7.1 @@ -32,11 +33,15 @@ astunparse==1.6.3 async-timeout==4.0.3 attrs==25.1.0 ax-platform==0.3.2 +azure-common==1.1.28 +azure-core==1.29.5 +azure-storage-blob==12.22.0 Babel==2.13.1 backcall==0.2.0 base58==2.0.1 bayesian-optimization==1.4.3 beautifulsoup4==4.11.1 +billiard==4.2.1 bleach==6.1.0 bokeh==2.4.3 boltons @ file:///home/conda/feedstock_root/build_artifacts/boltons_1733827268945/work @@ -45,12 +50,15 @@ boto3==1.26.76 botocore==1.29.76 botorch==0.8.5 Brotli @ file:///home/conda/feedstock_root/build_artifacts/brotli-split_1749229842835/work -build==1.2.2.post1 cachetools==5.5.2 +celery==5.5.3 certifi==2025.1.31 cffi==1.16.0 charset-normalizer==3.3.2 click==8.1.7 +click-didyoumean==0.3.1 +click-plugins==1.1.1.2 +click-repl==0.3.0 cloudpickle==2.2.0 cma==3.2.2 cmdstanpy==1.2.0 @@ -60,7 +68,7 @@ colorful==0.5.5 colorlog==6.7.0 comet-ml==3.44.1 comm==0.2.0 -conda @ file:///home/conda/feedstock_root/build_artifacts/conda_1749201703459/work/conda-src +conda @ file:///home/conda/feedstock_root/build_artifacts/conda_1754405245494/work/conda-src conda-libmamba-solver @ file:///home/conda/feedstock_root/build_artifacts/conda-libmamba-solver_1745834476052/work/src conda-package-handling @ file:///home/conda/feedstock_root/build_artifacts/conda-package-handling_1736345463896/work conda_package_streaming @ 
file:///home/conda/feedstock_root/build_artifacts/conda-package-streaming_1729004031731/work @@ -121,7 +129,7 @@ gast==0.6.0 gcs-oauth2-boto-plugin==3.0 getdaft==0.4.3 gitdb==4.0.11 -GitPython==3.1.40 +GitPython==3.1.44 glfw==2.6.3 google-api-core==2.24.2 google-api-python-client==2.111.0 @@ -129,9 +137,13 @@ google-apitools==0.5.32 google-auth==2.23.4 google-auth-httplib2==0.1.1 google-auth-oauthlib==1.0.0 +google-cloud-core==2.4.1 +google-cloud-storage==2.14.0 +google-crc32c==1.5.0 google-oauth==1.0.1 google-pasta==0.2.0 google-reauth==0.1.1 +google-resumable-media==2.6.0 googleapis-common-protos==1.61.0 GPy==1.13.1 gpytorch==1.10 @@ -139,10 +151,10 @@ graphene==3.4.3 graphql-core==3.2.3 graphql-relay==3.2.0 greenlet==3.0.1 -grpcio==1.66.2 +grpcio==1.74.0 gsutil==5.27 gunicorn==20.1.0 -gymnasium==1.0.0 +gymnasium==1.1.1 h11==0.16.0 h2 @ file:///home/conda/feedstock_root/build_artifacts/h2_1733298745555/work h5py==3.10.0 @@ -166,6 +178,7 @@ ipykernel==6.27.1 ipython==8.12.3 ipython-genutils==0.2.0 ipywidgets==8.1.3 +isodate==0.6.1 isoduration==20.11.0 itsdangerous==2.1.2 jedi==0.19.1 @@ -190,14 +203,14 @@ jupyterlab_server==2.24.0 jupyterlab_widgets==3.0.11 keras==2.15.0 kiwisolver==1.4.5 +kombu==5.5.4 labmaze==1.0.6 lazy_loader==0.4 libclang==18.1.1 -libmambapy @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_libmambapy_1750078835/work/libmambapy +libmambapy @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_libmambapy_1753776969/work/libmambapy lightgbm==4.6.0 lightning-utilities==0.11.2 linear-operator==0.4.0 -linkify-it-py==2.0.3 llvmlite==0.42.0 locket==1.0.0 lxml==4.9.4 @@ -208,10 +221,9 @@ markdown-it-py==2.2.0 MarkupSafe==2.1.3 matplotlib==3.7.4 matplotlib-inline==0.1.6 -mdit-py-plugins==0.4.2 mdurl==0.1.2 memray==1.10.0 -menuinst @ file:///home/conda/feedstock_root/build_artifacts/menuinst_1750792275478/work +menuinst @ file:///home/conda/feedstock_root/build_artifacts/menuinst_1753546271769/work minigrid==2.3.1 
mistune==0.8.4 ml-dtypes==0.3.2 @@ -240,6 +252,7 @@ netifaces==0.11.0 networkx==3.2.1 nevergrad==0.4.3.post7 ninja==1.11.1.1 +nixl==0.4.0 notebook==6.5.7 notebook_shim==0.2.3 numba==0.59.1 @@ -284,7 +297,6 @@ pettingzoo==1.24.3 pexpect==4.8.0 pickleshare==0.7.5 pillow==10.3.0 -pip-tools==7.4.1 platformdirs==3.11.0 plotly==5.23.0 pluggy==1.3.0 @@ -316,7 +328,6 @@ pynvml==11.5.0 PyOpenGL==3.1.7 pyOpenSSL==25.0.0 pyparsing==3.1.1 -pyproject_hooks==1.2.0 pyro-api==0.1.2 pyro-ppl==1.9.1 Pyro4==4.82 @@ -334,11 +345,10 @@ pyu2f==0.1.5 PyYAML==6.0.1 pyzmq==26.0.3 qpd==0.4.4 -ray @ file:///home/ray/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl#sha256=5a7ca35580e97891618705a7a7efcc556b30cc1d3e6155605d690b18d50c4383 -redis==4.4.2 +ray @ file:///home/ray/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl#sha256=6dd296fe192d3c9953867ef02ab2645e2f5e48f49e7d4e7c1fecc7689d139cf1 referencing==0.36.2 regex==2024.5.15 -requests==2.32.3 +requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1733217035951/work requests-oauthlib==2.0.0 requests-toolbelt==1.0.0 responses==0.13.4 @@ -396,9 +406,8 @@ tensorflow-metadata==1.13.1 tensorflow-probability==0.23.0 termcolor==2.4.0 terminado==0.18.1 -textual==4.0.0 threadpoolctl==3.1.0 -tifffile==2024.8.30 +tifffile==2024.7.21 timm==0.9.2 tinycss2==1.3.0 tinyscaler==1.2.8 @@ -423,18 +432,17 @@ transformers==4.36.2 triad==0.9.8 triton==2.3.0 typeguard==2.13.3 -typer==0.16.0 +typer==0.12.3 types-python-dateutil==2.9.0.20240316 -typing-inspection==0.4.1 typing_extensions==4.12.2 tzdata==2025.2 -uc-micro-py==1.0.3 uri-template==1.3.0 uritemplate==4.1.1 urllib3==1.26.19 utilsforecast==0.2.0 uvicorn==0.22.0 uvloop==0.21.0 +vine==5.1.0 virtualenv==20.29.1 wandb==0.17.0 watchfiles==0.19.0 diff --git a/doc/source/ray-overview/pip_freeze_ray-py39-cpu.txt b/doc/source/ray-overview/pip_freeze_ray-py39-cpu.txt index e2ca596c5740..da45ccee1874 100644 --- a/doc/source/ray-overview/pip_freeze_ray-py39-cpu.txt +++ 
b/doc/source/ray-overview/pip_freeze_ray-py39-cpu.txt @@ -2,24 +2,33 @@ aiohappyeyeballs==2.6.1 aiohttp==3.11.16 aiohttp-cors==0.7.0 aiosignal==1.3.1 +amqp==5.3.1 annotated-types==0.6.0 anyio==3.7.1 archspec @ file:///home/conda/feedstock_root/build_artifacts/archspec_1737352602016/work async-timeout==4.0.3 attrs==25.1.0 +azure-common==1.1.28 +azure-core==1.29.5 +azure-storage-blob==12.22.0 +billiard==4.2.1 boltons @ file:///home/conda/feedstock_root/build_artifacts/boltons_1733827268945/work boto3==1.26.76 botocore==1.29.76 Brotli @ file:///home/conda/feedstock_root/build_artifacts/brotli-split_1749229842835/work cachetools==5.5.2 +celery==5.5.3 certifi==2025.1.31 cffi==1.16.0 charset-normalizer==3.3.2 click==8.1.7 +click-didyoumean==0.3.1 +click-plugins==1.1.1.2 +click-repl==0.3.0 cloudpickle==2.2.0 colorama @ file:///home/conda/feedstock_root/build_artifacts/colorama_1733218098505/work colorful==0.5.5 -conda @ file:///home/conda/feedstock_root/build_artifacts/conda_1749201703459/work/conda-src +conda @ file:///home/conda/feedstock_root/build_artifacts/conda_1754405245494/work/conda-src conda-libmamba-solver @ file:///home/conda/feedstock_root/build_artifacts/conda-libmamba-solver_1745834476052/work/src conda-package-handling @ file:///home/conda/feedstock_root/build_artifacts/conda-package-handling_1736345463896/work conda_package_streaming @ file:///home/conda/feedstock_root/build_artifacts/conda-package-streaming_1729004031731/work @@ -42,10 +51,14 @@ google-api-core==2.24.2 google-api-python-client==2.111.0 google-auth==2.23.4 google-auth-httplib2==0.1.1 +google-cloud-core==2.4.1 +google-cloud-storage==2.14.0 +google-crc32c==1.5.0 google-oauth==1.0.1 +google-resumable-media==2.6.0 googleapis-common-protos==1.61.0 -grpcio==1.66.2 -gymnasium==1.0.0 +grpcio==1.74.0 +gymnasium==1.1.1 h11==0.16.0 h2 @ file:///home/conda/feedstock_root/build_artifacts/h2_1733298745555/work hpack @ file:///home/conda/feedstock_root/build_artifacts/hpack_1733299205993/work @@ -54,19 
+67,21 @@ httptools==0.6.4 hyperframe @ file:///home/conda/feedstock_root/build_artifacts/hyperframe_1733298771451/work idna==3.7 importlib-metadata==6.11.0 +isodate==0.6.1 Jinja2==3.1.6 jmespath==1.0.1 jsonpatch @ file:///home/conda/feedstock_root/build_artifacts/jsonpatch_1733814567314/work jsonpointer @ file:///home/conda/feedstock_root/build_artifacts/jsonpointer_1725302957584/work jsonschema==4.23.0 jsonschema-specifications==2024.10.1 -libmambapy @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_libmambapy_1750078835/work/libmambapy +kombu==5.5.4 +libmambapy @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_libmambapy_1753776969/work/libmambapy lz4==4.3.3 markdown-it-py==2.2.0 MarkupSafe==2.1.3 mdurl==0.1.2 memray==1.10.0 -menuinst @ file:///home/conda/feedstock_root/build_artifacts/menuinst_1750792275478/work +menuinst @ file:///home/conda/feedstock_root/build_artifacts/menuinst_1753546271769/work msgpack==1.0.7 multidict==6.0.5 numpy==1.26.4 @@ -83,6 +98,7 @@ pandas==1.5.3 platformdirs==3.11.0 pluggy @ file:///home/conda/feedstock_root/build_artifacts/pluggy_1733222765875/work prometheus-client==0.19.0 +prompt-toolkit==3.0.41 propcache==0.3.0 proto-plus==1.22.3 protobuf==4.25.8 @@ -103,14 +119,13 @@ python-dateutil==2.8.2 python-dotenv==1.1.1 pytz==2022.7.1 PyYAML==6.0.1 -ray @ file:///home/ray/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl#sha256=6d5b553cf00ee5e32ad8f0e02333a0c4ecafd471fa52f4ee1e71ed00944120ec -redis==4.4.2 +ray @ file:///home/ray/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl#sha256=f882349a99e35a6628b064cfcb031919f026933731262d982bbdf7664003fbd3 referencing==0.36.2 requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1733217035951/work rich==13.3.2 rpds-py==0.22.3 rsa==4.7.2 -ruamel.yaml @ file:///home/conda/feedstock_root/build_artifacts/ruamel.yaml_1749479929034/work +ruamel.yaml @ file:///home/conda/feedstock_root/build_artifacts/ruamel.yaml_1755625023823/work 
ruamel.yaml.clib @ file:///home/conda/feedstock_root/build_artifacts/ruamel.yaml.clib_1728724456970/work s3transfer==0.6.2 scipy==1.11.4 @@ -121,12 +136,15 @@ starlette==0.46.2 tensorboardX==2.6.2.2 tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1735661334605/work typing_extensions==4.12.2 +tzdata==2025.2 uritemplate==4.1.1 urllib3==1.26.19 uvicorn==0.22.0 uvloop==0.21.0 +vine==5.1.0 virtualenv==20.29.1 watchfiles==0.19.0 +wcwidth==0.2.13 websockets==11.0.3 yarl==1.18.3 zipp==3.19.2 diff --git a/doc/source/ray-overview/use-cases.rst b/doc/source/ray-overview/use-cases.rst index 9a05bda6a96b..b4d74d51f0d7 100644 --- a/doc/source/ray-overview/use-cases.rst +++ b/doc/source/ray-overview/use-cases.rst @@ -137,7 +137,7 @@ RLlib is an open-source library for reinforcement learning (RL), offering suppor .. figure:: /images/rllib_use_case.png - Decentralized distributed proximal polixy optimiation (DD-PPO) architecture. + Decentralized distributed proximal policy optimization (DD-PPO) architecture. Learn more about reinforcement learning with the following resources. diff --git a/doc/source/ray-references/faq.rst b/doc/source/ray-references/faq.rst index 9fbf54fc9c60..9b18ef07bded 100644 --- a/doc/source/ray-references/faq.rst +++ b/doc/source/ray-references/faq.rst @@ -11,6 +11,6 @@ FAQ Further Questions or Issues? ----------------------------- +----------------------------- .. include:: /_includes/_help.rst diff --git a/doc/source/ray-references/glossary.rst b/doc/source/ray-references/glossary.rst index 265efc260c9b..d43fdeef19af 100644 --- a/doc/source/ray-references/glossary.rst +++ b/doc/source/ray-references/glossary.rst @@ -23,7 +23,7 @@ documentation, sorted alphabetically. essentially a stateful service. :ref:`Learn more about Ray actors`. Actor task - An invocation of an Ray actor method. Sometimes we just call it a task. + An invocation of a Ray actor method. Sometimes we just call it a task. Ray Agent Daemon process running on each Ray node. 
It has several functionalities like @@ -38,7 +38,7 @@ documentation, sorted alphabetically. Algorithm A class that holds the who/when/where/how for training one or more RL agent(s). The user interacts with an Algorithm instance directly to train their agents - (it is the top-most user facing API or RLlib). + (it is the top-most user facing API of RLlib). Asynchronous execution An execution model where a later task can begin executing in parallel, @@ -66,7 +66,7 @@ documentation, sorted alphabetically. Backend A class containing the initialization and teardown logic for a specific deep - learning framework (eg. Torch, TensorFlow), used to set up distributed + learning framework (e.g., Torch, TensorFlow), used to set up distributed data-parallel training for :ref:`Ray Train’s built-in trainers`. Batch format @@ -116,7 +116,7 @@ documentation, sorted alphabetically. different Ray components and libraries. A Checkpoint can have its data represented as a directory on local (on-disk) storage, as a directory on an external storage (e.g., cloud storage), and as an in-memory dictionary. - :class:`Learn more `, + :class:`Learn more `. .. TODO: How does this relate to RLlib checkpoints etc.? Be clear here @@ -197,7 +197,7 @@ documentation, sorted alphabetically. Environment The world or simulation, in which one or more reinforcement learning agents - have to learn to behave optimally in wrt. a given reward function. An + have to learn to behave optimally with respect to a given reward function. An environment consists of an observation space, a reward function, an action space, a state transition function, and a distribution over initial states (after a reset). @@ -219,7 +219,7 @@ documentation, sorted alphabetically. Trial Executor An internal :ref:`Ray Tune component` that manages the resource management and execution of each trial’s corresponding remote - Trainable actor. The trial executor’s responsibilities include launching + Trainable actor. 
The trial executor’s responsibilities include launching training, checkpointing, and restoring remote tasks. Experiment @@ -266,7 +266,7 @@ documentation, sorted alphabetically. .. TODO: Inference Job - A ray job is a packaged ray application that can be executed on a + A Ray job is a packaged Ray application that can be executed on a (remote) Ray cluster. :ref:`Learn more`. Lineage @@ -375,7 +375,7 @@ documentation, sorted alphabetically. On-Policy A type of RL Algorithm. In an on-policy algorithm, the policy used to compute the actions inside an RL environment (to generate the training data) must be the - exact same (matching NN weights at all times) than the one that is being + exact same (matching NN weights at all times) as the one that's being optimized. Examples for on-policy Algorithms are PPO, APPO, and IMPALA. OOM (Out of Memory) diff --git a/doc/source/ray-security/index.md b/doc/source/ray-security/index.md index 8a2d87acedde..f7a4a707e1d4 100644 --- a/doc/source/ray-security/index.md +++ b/doc/source/ray-security/index.md @@ -1,6 +1,6 @@ (security)= -# Security +# Security Ray is an easy-to-use framework to run arbitrary code across one or more nodes in a Ray Cluster. Ray provides fault-tolerance, optimized scheduling, task orchestration, and auto-scaling to run a given workload. @@ -15,7 +15,7 @@ If you expose these services (Ray Dashboard, Ray Jobs, Ray Client), anybody who can access the associated ports can execute arbitrary code on your Ray Cluster. This can happen: * Explicitly: By submitting a Ray Job, or using the Ray Client * Indirectly: By calling the Dashboard REST APIs of these services -* Implicitly: Ray extensively uses cloudpickle for serialization of arbitrary python objects. See [the pickle documentation](https://docs.python.org/3/library/pickle.html) for more details on Pickle's security model. +* Implicitly: Ray extensively uses cloudpickle for serialization of arbitrary Python objects. 
See [the pickle documentation](https://docs.python.org/3/library/pickle.html) for more details on Pickle's security model. The Ray Dashboard, Ray Jobs and Ray Client are developer tools that you should only use with the necessary access controls in place to restrict access to trusted parties only. diff --git a/doc/source/rllib/algorithm-config.rst b/doc/source/rllib/algorithm-config.rst index 800ff8de3d0e..76f1b2a7afbd 100644 --- a/doc/source/rllib/algorithm-config.rst +++ b/doc/source/rllib/algorithm-config.rst @@ -12,7 +12,7 @@ the auto-validated and type-safe gateway into configuring and building an RLlib :py:class:`~ray.rllib.algorithms.algorithm.Algorithm`. In essence, you first create an instance of :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` -and then call some of its methods to set various configuration options. RLlib uses the following, `black `__ compliant format +and then call some of its methods to set various configuration options. RLlib uses the following `black `__-compliant format in all parts of its code. Note that you can chain together more than one method call, including the constructor: diff --git a/doc/source/rllib/external-envs.rst b/doc/source/rllib/external-envs.rst index 7730a17117c6..467e9fdfb4da 100644 --- a/doc/source/rllib/external-envs.rst +++ b/doc/source/rllib/external-envs.rst @@ -30,7 +30,7 @@ should step. .. scale: 75 % .. A Unity3D soccer game being learnt by RLlib via the ExternalEnv API. -RLlib provides an `external messaging protocol `__ +RLlib provides an `external messaging protocol `__ called :ref:`RLlink ` for this purpose as well as the option to customize your :py:class:`~ray.rllib.env.env_runner.EnvRunner` class toward communicating through :ref:`RLlink ` with one or more clients. An example, `tcp-based EnvRunner implementation with RLlink is available here `__. 
@@ -68,7 +68,7 @@ Message Structure RLlink messages consist of a header and a body: - - **Header**: 8-byte length field indicating the size of the body, for example `00000016` for a body of length 16 (thus, in total, the message size ). + - **Header**: 8-byte length field indicating the size of the body, for example `00000016` for a body of length 16 (thus, in total, the message size is 24 bytes). - **Body**: JSON-encoded content with a `type` field indicating the message type. Example Messages: PING and EPISODES_AND_GET_STATE @@ -153,7 +153,7 @@ Responses: Server → Client - **``SET_STATE``** - - Example: ``{"type": "PONG"}`` + - Example: ``{"type": "SET_STATE", "weights_seq_no": 123, "onnx_file": "... [base64 encoded ONNX file] ..."}`` - Purpose: Provide the client with the current state (for example, model weights). - Body: diff --git a/doc/source/rllib/getting-started.rst b/doc/source/rllib/getting-started.rst index 7cf14882a2fd..0958682c4765 100644 --- a/doc/source/rllib/getting-started.rst +++ b/doc/source/rllib/getting-started.rst @@ -77,7 +77,7 @@ method: ) -To scale your setup and define, how many :py:class:`~ray.rllib.env.env_runner.EnvRunner` actors you want to leverage, +To scale your setup and define how many :py:class:`~ray.rllib.env.env_runner.EnvRunner` actors you want to leverage, you can call the :py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.env_runners` method. ``EnvRunners`` are used to collect samples for training updates from your :ref:`environment `. diff --git a/doc/source/rllib/key-concepts.rst b/doc/source/rllib/key-concepts.rst index 7d71b1fc8353..6d89d5ea9154 100644 --- a/doc/source/rllib/key-concepts.rst +++ b/doc/source/rllib/key-concepts.rst @@ -17,12 +17,12 @@ key concepts and general architecture of RLlib. **RLlib overview:** The central component of RLlib is the :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` class, acting as a runtime for executing your RL experiments.
Your gateway into using an :ref:`Algorithm ` is the - :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` (cyan) class, allowing + :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` (cyan) class, allowing you to manage available configuration settings, for example learning rate or model architecture. Most :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` objects have - :py:class:`~ray.rllib.env.env_runner.EnvRunner` actors (blue) to collect training samples + :py:class:`~ray.rllib.env.env_runner.EnvRunner` actors (blue) to collect training samples from the :ref:`RL environment ` and - :py:class:`~ray.rllib.core.learner.learner.Learner` actors (yellow) + :py:class:`~ray.rllib.core.learner.learner.Learner` actors (yellow) to compute gradients and update your :ref:`models `. The algorithm synchronizes model weights after an update. @@ -142,7 +142,7 @@ and the rules that govern environment transitions when applying actions. A simple **RL environment** where an agent starts with an initial observation returned by the ``reset()`` method. The agent, possibly controlled by a neural network policy, sends actions, like ``right`` or ``jump``, - to the environmant's ``step()`` method, which returns a reward. Here, the reward values are +5 for reaching the goal + to the environment's ``step()`` method, which returns a reward. Here, the reward values are +5 for reaching the goal and 0 otherwise. The environment also returns a boolean flag indicating whether the episode is complete. Environments may vary in complexity, from simple tasks, like navigating a grid world, to highly intricate systems, like autonomous @@ -184,7 +184,7 @@ network models and defines how to use them during the three phases of its RL lif **Exploration**, for collecting training data, **inference** when computing actions for evaluation or in production, and **training** for computing the loss function inputs. 
-You can chose to use :ref:`RLlib's built-in default models and configure these ` as needed, +You can choose to use :ref:`RLlib's built-in default models and configure these ` as needed, for example for changing the number of layers or the activation functions, or :ref:`write your own custom models in PyTorch `, allowing you to implement any architecture and computation logic. diff --git a/doc/source/rllib/metrics-logger.rst b/doc/source/rllib/metrics-logger.rst index 382ce6ad8596..2ef0b0d4c0b0 100644 --- a/doc/source/rllib/metrics-logger.rst +++ b/doc/source/rllib/metrics-logger.rst @@ -46,7 +46,7 @@ Features of MetricsLogger The :py:class:`~ray.rllib.utils.metrics.metrics_logger.MetricsLogger` API offers the following functionalities: - Log scalar values over time, such as losses, individual rewards, or episode returns. -- Configure different reduction types, in particular ``mean``, ``min``, ``max``, or ``sum``. Also, users can chose to not +- Configure different reduction types, in particular ``mean``, ``min``, ``max``, or ``sum``. Also, users can choose to not reduce at all through the ``reduce=None`` setting, leaving the logged values untouched. A separate ``clear_on_reduce=True`` setting allows for automatically clearing all logged values on each ``reduce`` event. - Specify sliding windows, over which reductions take place, for example ``window=100`` to average over the @@ -169,7 +169,7 @@ whenever reduction takes place or you peek at the current value: logger.peek("max_value") # Expect: 1000.0, which is the lifetime max (infinite window) -You can also chose to not reduce at all, but to simply collect individual values, for example a set of images you receive +You can also choose to not reduce at all, but to simply collect individual values, for example a set of images you receive from your environment over time and for which it doesn't make sense to reduce them in any way. Use the ``reduce=None`` argument for achieving this. 
However, it's strongly advised that you should also @@ -192,7 +192,7 @@ to :py:class:`~ray.rllib.algorithms.algorithm.Algorithm`: You should pass additional arguments like ``reduce=None`` and ``clear_on_reduce=True`` to the :py:meth:`~ray.rllib.utils.metrics.metrics_logger.MetricsLogger.log_value` method on each call. -Otherwise, MetricsLogger will emit warnings to ensure that it's behaviour is always as expected. +Otherwise, MetricsLogger will emit warnings to ensure that its behavior is always as expected. Logging a set of nested scalar values @@ -228,7 +228,7 @@ Logging non-scalar data :py:class:`~ray.rllib.utils.metrics.metrics_logger.MetricsLogger` isn't limited to scalar values. You can also use it to log images, videos, or any other complex data. -Normally, you would chose the previously described ``reduce=None`` argument. For example, to +Normally, you would choose the previously described ``reduce=None`` argument. For example, to log three consecutive image frames from a ``CartPole`` environment, do the following: .. testcode:: diff --git a/doc/source/rllib/multi-agent-envs.rst b/doc/source/rllib/multi-agent-envs.rst index 52312b98fefc..232f9516c23e 100644 --- a/doc/source/rllib/multi-agent-envs.rst +++ b/doc/source/rllib/multi-agent-envs.rst @@ -34,20 +34,19 @@ RLlib's MultiAgentEnv API .. hint:: - This paragraph describes RLlib's own :py:class`~ray.rllib.env.multi_agent_env.MultiAgentEnv` API, which is the + This paragraph describes RLlib's own :py:class:`~ray.rllib.env.multi_agent_env.MultiAgentEnv` API, which is the recommended way of defining your own multi-agent environment logic. However, if you are already using a third-party multi-agent API, RLlib offers wrappers for :ref:`Farama's PettingZoo API ` as well as :ref:`DeepMind's OpenSpiel API `. 
-The :py:class`~ray.rllib.env.multi_agent_env.MultiAgentEnv` API of RLlib closely follows the +The :py:class:`~ray.rllib.env.multi_agent_env.MultiAgentEnv` API of RLlib closely follows the conventions and APIs of `Farama's gymnasium (single-agent) `__ envs and even subclasses from `gymnasium.Env`, however, instead of publishing individual observations, rewards, and termination/truncation flags -from `reset()` and `step()`, a custom :py:class`~ray.rllib.env.multi_agent_env.MultiAgentEnv` implementation -outputs dictionaries, one for observations, one for rewards, etc..in which agent IDs map -In each such multi-agent dictionary, agent IDs map to the respective individual agent's observation/reward/etc.. +from `reset()` and `step()`, a custom :py:class:`~ray.rllib.env.multi_agent_env.MultiAgentEnv` implementation +outputs separate dictionaries for observations, rewards, etc., where each dictionary maps agent IDs to the corresponding values for each agent. -Here is a first draft of an example :py:class`~ray.rllib.env.multi_agent_env.MultiAgentEnv` implementation: +Here is a first draft of an example :py:class:`~ray.rllib.env.multi_agent_env.MultiAgentEnv` implementation: .. code-block:: @@ -72,7 +71,7 @@ Here is a first draft of an example :py:class`~ray.rllib.env.multi_agent_env.Mul Agent Definitions ~~~~~~~~~~~~~~~~~ -The number of agents in your environment and their IDs are entirely controlled by your :py:class`~ray.rllib.env.multi_agent_env.MultiAgentEnv` +The number of agents in your environment and their IDs are entirely controlled by your :py:class:`~ray.rllib.env.multi_agent_env.MultiAgentEnv` code. Your env decides, which agents start after an episode reset, which agents enter the episode at a later point, which agents terminate the episode early, and which agents stay in the episode until the entire episode ends. @@ -301,7 +300,7 @@ receives +1 reward. The losing player receives a -1 reward. 
To make the implementation easier, the aberration from the original game is that trying to place a piece on an already occupied field results in the board not changing at all, but the moving player receiving a -5 reward as a penalty (in the original game, this move is -simply not allowed and therefor can never happen). +simply not allowed and therefore can never happen). Here is your initial class scaffold for the Tic-Tac-Toe game: @@ -371,7 +370,7 @@ you can use grouping in conjunction with the policy mapping API described in pri Third Party Multi-Agent Env APIs -------------------------------- -Besides RLlib's own :py:class`~ray.rllib.env.multi_agent_env.MultiAgentEnv` API, you can also use +Besides RLlib's own :py:class:`~ray.rllib.env.multi_agent_env.MultiAgentEnv` API, you can also use various third-party APIs and libraries to implement custom multi-agent envs. diff --git a/doc/source/rllib/new-api-stack-migration-guide.rst b/doc/source/rllib/new-api-stack-migration-guide.rst index 9eb426dcca93..9ab92d53be0c 100644 --- a/doc/source/rllib/new-api-stack-migration-guide.rst +++ b/doc/source/rllib/new-api-stack-migration-guide.rst @@ -330,7 +330,7 @@ Custom callbacks ---------------- If you're using custom callbacks on the old API stack, you're subclassing the ``DefaultCallbacks`` class, -which the Ray team renamed to :py:class`~ray.rllib.callbacks.callbacks.RLlibCallback`. +which the Ray team renamed to :py:class:`~ray.rllib.callbacks.callbacks.RLlibCallback`. You can continue this approach with the new API stack and pass your custom subclass to your config like the following: .. testcode:: @@ -340,7 +340,7 @@ You can continue this approach with the new API stack and pass your custom subcl However, if you're overriding those methods that triggered on the :py:class:`~ray.rllib.env.env_runner.EnvRunner` side, for example, ``on_episode_start/stop/step/etc...``, you may have to translate some call arguments. 
-The following is a one-to-one translation guide for these types of :py:class`~ray.rllib.callbacks.callbacks.RLlibCallback` +The following is a one-to-one translation guide for these types of :py:class:`~ray.rllib.callbacks.callbacks.RLlibCallback` methods: .. testcode:: diff --git a/doc/source/rllib/package_ref/env.rst b/doc/source/rllib/package_ref/env.rst index b8a49f196508..aa9bfcc483c0 100644 --- a/doc/source/rllib/package_ref/env.rst +++ b/doc/source/rllib/package_ref/env.rst @@ -21,12 +21,6 @@ gymnasium's own `vectorization feature = 1.x` custom vectorization feature. - External Envs ------------- @@ -55,4 +49,5 @@ Environment API Reference env/multi_agent_env.rst env/multi_agent_env_runner.rst env/multi_agent_episode.rst + env/external.rst env/utils.rst diff --git a/doc/source/rllib/package_ref/env/env_runner.rst b/doc/source/rllib/package_ref/env/env_runner.rst index b1f7fb8401ad..a47bd2256e25 100644 --- a/doc/source/rllib/package_ref/env/env_runner.rst +++ b/doc/source/rllib/package_ref/env/env_runner.rst @@ -45,6 +45,13 @@ Cleanup EnvRunner.stop +rllib.env.env_errors.StepFailedRecreateEnvError +------------------------------------------------ + +.. currentmodule:: ray.rllib.env.env_errors + +.. autoclass:: StepFailedRecreateEnvError + Single-agent and multi-agent EnvRunners --------------------------------------- diff --git a/doc/source/rllib/package_ref/env/external.rst b/doc/source/rllib/package_ref/env/external.rst new file mode 100644 index 000000000000..4dce2def1646 --- /dev/null +++ b/doc/source/rllib/package_ref/env/external.rst @@ -0,0 +1,22 @@ +.. include:: /_includes/rllib/we_are_hiring.rst + +.. _env-external-reference-docs: + +External Envs +============= + +.. include:: /_includes/rllib/new_api_stack.rst + +ray.rllib.env.external.rllink.RLlink +------------------------------------ + +.. currentmodule:: ray.rllib.env.external.rllink + +.. autoclass:: ray.rllib.env.external.rllink.RLlink + +.. 
autosummary:: + :nosignatures: + :toctree: doc/ + + ~get_rllink_message + ~send_rllink_message diff --git a/doc/source/rllib/package_ref/env/utils.rst b/doc/source/rllib/package_ref/env/utils.rst index 49a884bd6bc4..99717102ef34 100644 --- a/doc/source/rllib/package_ref/env/utils.rst +++ b/doc/source/rllib/package_ref/env/utils.rst @@ -16,6 +16,5 @@ rllib.env.utils :nosignatures: :toctree: env/ - ~external_env_protocol.RLlink ~try_import_open_spiel ~try_import_pyspiel diff --git a/doc/source/rllib/package_ref/index.rst b/doc/source/rllib/package_ref/index.rst index 5411f2c213b1..7b88fbd8c8e9 100644 --- a/doc/source/rllib/package_ref/index.rst +++ b/doc/source/rllib/package_ref/index.rst @@ -10,7 +10,7 @@ Ray RLlib API .. tip:: We'd love to hear your feedback on using RLlib - `sign up to our forum and start asking questions `_! This section contains an overview of RLlib's package- and API reference. -If you think there is anything missing, please open an issue on `Github`_. +If you think there is anything missing, please open an issue on `GitHub`_. .. _`GitHub`: https://github.com/ray-project/ray/issues diff --git a/doc/source/rllib/rl-modules.rst b/doc/source/rllib/rl-modules.rst index c8e27ee7c21f..32efc840f599 100644 --- a/doc/source/rllib/rl-modules.rst +++ b/doc/source/rllib/rl-modules.rst @@ -569,7 +569,7 @@ If you don't return the ``actions`` key from your forward method: def _forward_exploration(self, batch): ... return { - Columns.ACTIONS: ... # RLlib uses these actions as-is (no sampling step!) + Columns.ACTIONS: ..., # RLlib uses these actions as-is (no sampling step!) Columns.ACTION_DIST_INPUTS: ... # If provided, RLlib uses these dist inputs to compute probs and logp. 
} diff --git a/doc/source/rllib/rllib-algorithms.rst b/doc/source/rllib/rllib-algorithms.rst index 41c109d5962b..e5743f8e306a 100644 --- a/doc/source/rllib/rllib-algorithms.rst +++ b/doc/source/rllib/rllib-algorithms.rst @@ -39,6 +39,10 @@ as well as multi-GPU training on multi-node (GPU) clusters when using the `Anysc +-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+ | :ref:`BC (Behavior Cloning) ` | |single_agent| | |multi_gpu| |multi_node_multi_gpu| | |cont_actions| |discr_actions| | +-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+ +| :ref:`CQL (Conservative Q-Learning) ` | |single_agent| | |multi_gpu| |multi_node_multi_gpu| | |cont_actions| | ++-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+ +| :ref:`IQL (Implicit Q-Learning) ` | |single_agent| | |multi_gpu| |multi_node_multi_gpu| | |cont_actions| | ++-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+ | :ref:`MARWIL (Monotonic Advantage Re-Weighted Imitation Learning) ` | |single_agent| | |multi_gpu| |multi_node_multi_gpu| | |cont_actions| |discr_actions| | +-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+ | **Algorithm Extensions and -Plugins** | @@ -183,7 +187,7 @@ Asynchronous Proximal Policy Optimization (APPO) In a training iteration, APPO requests samples from all EnvRunners asynchronously and the collected episode samples are returned to the main algorithm 
process as Ray references rather than actual objects available on the local process. APPO then passes these episode references to the Learners for asynchronous updates of the model. - RLlib doesn't always synch back the weights to the EnvRunners right after a new model version is available. + RLlib doesn't always sync back the weights to the EnvRunners right after a new model version is available. To account for the EnvRunners being off-policy, APPO uses a procedure called v-trace, `described in the IMPALA paper `__. APPO scales out on both axes, supporting multiple EnvRunners for sample collection and multiple GPU- or CPU-based Learners @@ -214,7 +218,7 @@ Importance Weighted Actor-Learner Architecture (IMPALA) **IMPALA architecture:** In a training iteration, IMPALA requests samples from all EnvRunners asynchronously and the collected episodes are returned to the main algorithm process as Ray references rather than actual objects available on the local process. IMPALA then passes these episode references to the Learners for asynchronous updates of the model. - RLlib doesn't always synch back the weights to the EnvRunners right after a new model version is available. + RLlib doesn't always sync back the weights to the EnvRunners right after a new model version is available. To account for the EnvRunners being off-policy, IMPALA uses a procedure called v-trace, `described in the paper `__. IMPALA scales out on both axes, supporting multiple EnvRunners for sample collection and multiple GPU- or CPU-based Learners @@ -357,12 +361,36 @@ Conservative Q-Learning (CQL) **Tuned examples:** `Pendulum-v1 `__ -**CQL-specific configs** and :ref:`generic algorithm settings `): +**CQL-specific configs** (see also :ref:`generic algorithm settings `): .. autoclass:: ray.rllib.algorithms.cql.cql.CQLConfig :members: training +.. 
_iql: + +Implicit Q-Learning (IQL) +------------------------- +`[paper] `__ +`[implementation] `__ + + **IQL architecture:** IQL (Implicit Q-Learning) is an offline RL algorithm that never needs to evaluate actions outside of + the dataset, but still enables the learned policy to improve substantially over the best behavior in the data through + generalization. Instead of standard TD-error minimization, it introduces a value function trained through expectile regression, + which yields a conservative estimate of returns. This allows policy improvement through advantage-weighted behavior cloning, + ensuring safer generalization without explicit exploration. + + The `IQLLearner` replaces the usual TD-based value loss with an expectile regression loss, and trains the policy to imitate + high-advantage actions—enabling substantial performance gains over the behavior policy using only in-dataset actions. + +**Tuned examples:** +`Pendulum-v1 `__ + +**IQL-specific configs** (see also :ref:`generic algorithm settings `): + +.. autoclass:: ray.rllib.algorithms.iql.iql.IQLConfig + :members: training + .. _marwil: Monotonic Advantage Re-Weighted Imitation Learning (MARWIL) @@ -376,7 +404,7 @@ Monotonic Advantage Re-Weighted Imitation Learning (MARWIL) **MARWIL architecture:** MARWIL is a hybrid imitation learning and policy gradient algorithm suitable for training on batched historical data. When the ``beta`` hyperparameter is set to zero, the MARWIL objective reduces to plain - imitation learning (see `BC`_). MARWIL uses Ray.Data to tap into its parallel data + imitation learning (see `BC`_). MARWIL uses Ray Data to tap into its parallel data processing capabilities. In one training iteration, MARWIL reads episodes in parallel from offline files, for example `parquet `__, by the n DataWorkers. Connector pipelines preprocess these episodes into train batches and send these as data iterators directly to the n Learners for updating the model.
diff --git a/doc/source/rllib/rllib-env.rst b/doc/source/rllib/rllib-env.rst index cd97c0259ba6..321ce169f06f 100644 --- a/doc/source/rllib/rllib-env.rst +++ b/doc/source/rllib/rllib-env.rst @@ -289,7 +289,7 @@ in combination. controlled through your :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig`: ``config.env_runners(num_env_runners=..)``. -1. **Vectorization within a single process:** Many environments achieve high +#. **Vectorization within a single process:** Many environments achieve high frame rates per core but are limited by policy inference latency. To address this limitation, create multiple environments per process to batch the policy forward pass across these vectorized environments. Set ``config.env_runners(num_envs_per_env_runner=..)`` diff --git a/doc/source/rllib/rllib-examples.rst b/doc/source/rllib/rllib-examples.rst index f32307de0bfa..c2d15e49afa3 100644 --- a/doc/source/rllib/rllib-examples.rst +++ b/doc/source/rllib/rllib-examples.rst @@ -158,7 +158,7 @@ Curiosity Using curiosity is beneficial in sparse-reward environments where agents may struggle to find rewarding paths. However, count-based methods are only feasible for environments with small observation spaces. -- `Euclidian distance-based curiosity `__: +- `Euclidean distance-based curiosity `__: Uses Euclidean distance between states and the initial state to measure novelty, encouraging exploration by rewarding the agent for reaching "far away" regions of the environment. Suitable for sparse-reward tasks, where diverse exploration is key to success. @@ -177,6 +177,11 @@ Curriculum learning This approach enables gradual learning, allowing agents to master simpler tasks before progressing to more challenging ones, ideal for environments with hierarchical or staged difficulties. Also see the :doc:`curriculum learning how-to ` from the documentation. 
+- `Curriculum learning for Atari Pong `__: + Demonstrates curriculum learning for Atari Pong using the `frameskip` setting to increase the difficulty of the task. + This approach enables gradual learning, allowing agents to master slower reactions (lower `frameskip`) before progressing to faster ones (higher `frameskip`). + Also see the :doc:`curriculum learning how-to ` from the documentation. + Debugging +++++++++ @@ -193,7 +198,7 @@ Environments - `Async gym vectorization, parallelizing sub-environments `__: Shows how the `gym_env_vectorize_mode` config setting can significantly speed up your - :py:class`~ray.rllib.env.env_runner.EnvRunner` actors, if your RL environment is slow and you are + :py:class:`~ray.rllib.env.env_runner.EnvRunner` actors, if your RL environment is slow and you're using `num_envs_per_env_runner > 1`. The reason for the performance gain is that each sub-environment runs in its own process. - `Custom env rendering method `__: @@ -358,6 +363,11 @@ Multi-agent RL Uses OpenSpiel to demonstrate league-based self-play, where agents play against various versions of themselves, frozen or in-training, to improve through competitive interaction. +- `Self-play with Footsies and PPO algorithm `__: + Implements self-play with the Footsies environment (two-player zero-sum game). + This example highlights RLlib's capabilities in connecting to the external binaries running the game engine, as well as + setting up a multi-agent self-play training scenario. + - `Self-play with OpenSpiel `__: Similar to the league-based self-play, but simpler. This script leverages OpenSpiel for two-player games, allowing agents to improve through direct self-play without building a complex, structured league. diff --git a/doc/source/rllib/rllib-offline.rst b/doc/source/rllib/rllib-offline.rst index 3d08c287f9c2..2320e1cfcc43 100644 --- a/doc/source/rllib/rllib-offline.rst +++ b/doc/source/rllib/rllib-offline.rst @@ -22,7 +22,7 @@ format. You should use the episode format when #.
You need experiences grouped by their trajectory and ordered in time (for example, to train stateful modules). #. You want to use recorded experiences exclusively within RLlib (for example for offline RL or behavior cloning). -Contrary, you should prefer the table (columns) format, if +In contrast, you should prefer the table (columns) format if #. You need to read the data easily with other data tools or ML libraries. @@ -30,8 +30,8 @@ Contrary, you should prefer the table (columns) format, if :py:class:`~ray.rllib.env.single_agent_episode.SingleAgentEpisode` class is usable outside of an RLlib context. To enable faster access through external data tools (for example, for data transformations), it's recommended to use the table record format. -Most importantly, RLlib's offline RL API builds on top of :ref:`Ray Data ` and therefore features in general all read and -write methods supported by Ray Data (for example :py:class:`~ray.data.read_parquet`, :py:class:`~ray.data.read_json`, etc.) with +Most importantly, RLlib's offline RL API builds on top of :ref:`Ray Data ` and therefore supports all of its read and +write methods (for example :py:class:`~ray.data.read_parquet`, :py:class:`~ray.data.read_json`, etc.) with :py:class:`~ray.data.read_parquet` and :py:class:`~ray.data.Dataset.write_parquet` being the default read and write methods. A core design principle of the API is to apply as many data transformations as possible on-the-fly prior to engaging the learner, allowing the latter to focus exclusively on model updates.
diff --git a/doc/source/serve/advanced-guides/app-builder-guide.md b/doc/source/serve/advanced-guides/app-builder-guide.md index aecb1c761f75..a306ce3eb2db 100644 --- a/doc/source/serve/advanced-guides/app-builder-guide.md +++ b/doc/source/serve/advanced-guides/app-builder-guide.md @@ -8,7 +8,7 @@ This section describes how to pass arguments to your applications using an appli When writing an application, there are often parameters that you want to be able to easily change in development or production. For example, you might have a path to trained model weights and want to test out a newly trained model. In Ray Serve, these parameters are typically passed to the constructor of your deployments using `.bind()`. -This pattern allows you to be configure deployments using ordinary Python code but it requires modifying the code anytime one of the parameters needs to change. +This pattern allows you to configure deployments using ordinary Python code, but it requires modifying the code whenever one of the parameters needs to change. To pass arguments without changing the code, define an "application builder" function that takes an arguments dictionary (or [Pydantic object](typed-app-builders)) and returns the built application to be run. diff --git a/doc/source/serve/advanced-guides/dev-workflow.md b/doc/source/serve/advanced-guides/dev-workflow.md index 9785c7a68643..b16cc120de6b 100644 --- a/doc/source/serve/advanced-guides/dev-workflow.md +++ b/doc/source/serve/advanced-guides/dev-workflow.md @@ -97,7 +97,7 @@ This mode runs each deployment in a background thread and supports most of the s ## Testing on a remote cluster -To test on a remote cluster, use `serve run` again, but this time, pass in an `--address` argument to specify the address of the Ray cluster to connect to. For remote clusters, this address has the form `ray://:10001`; see [Ray Client](ray-client-ref) for more information. 
+To test on a remote cluster, use `serve run` again, but this time, pass in an `--address` argument to specify the address of the Ray cluster to connect to. For remote clusters, this address has the form `ray://:10001`; see [Ray Client](ray-client-ref) for more information. When making the transition from your local machine to a remote cluster, you'll need to make sure your cluster has a similar environment to your local machine--files, environment variables, and Python packages, for example. @@ -107,7 +107,7 @@ Let's see a simple example that just packages the code. Run the following comman serve run --address=ray://:10001 --working-dir="./project/src" local_dev:app ``` -This connects to the remote cluster with the Ray Client, uploads the `working_dir` directory, and runs your Serve application. Here, the local directory specified by `working_dir` must contain `local_dev.py` so that it can be uploaded to the cluster and imported by Ray Serve. +This connects to the remote cluster with the Ray Client, uploads the `working_dir` directory, and runs your Serve application. Here, the local directory specified by `working_dir` must contain `local_dev.py` so that it can be uploaded to the cluster and imported by Ray Serve. Once this is up and running, we can send requests to the application: diff --git a/doc/source/serve/advanced-guides/performance.md b/doc/source/serve/advanced-guides/performance.md index 8720d032a519..1cdff7ab7862 100644 --- a/doc/source/serve/advanced-guides/performance.md +++ b/doc/source/serve/advanced-guides/performance.md @@ -46,8 +46,8 @@ According to the [FastAPI documentation](https://fastapi.tiangolo.com/async/#ver Are you using `async def` in your callable? If you are using `asyncio` and hitting the same queuing issue mentioned above, you might want to increase -`max_ongoing_requests`. Serve sets a low number (100) by default so the client gets -proper backpressure. 
You can increase the value in the deployment decorator; e.g., +`max_ongoing_requests`. By default, Serve sets this to a low value (5) to ensure clients receive proper backpressure. +You can increase the value in the deployment decorator; for example, `@serve.deployment(max_ongoing_requests=1000)`. (serve-performance-e2e-timeout)= @@ -80,7 +80,7 @@ Ray Serve allows you to fine-tune the backoff behavior of the request router, wh The Serve Controller runs on the Ray head node and is responsible for a variety of tasks, including receiving autoscaling metrics from other Ray Serve components. If the Serve Controller becomes overloaded -(symptoms might include high CPU usage and a large number of pending `ServeController.record_handle_metrics` tasks), +(symptoms might include high CPU usage and a large number of pending `ServeController.record_autoscaling_metrics_from_handle` tasks), you can increase the interval between cycles of the control loop by setting the `RAY_SERVE_CONTROL_LOOP_INTERVAL_S` environment variable (defaults to `0.1` seconds). This setting gives the Controller more time to process requests and may help alleviate the overload. 
diff --git a/doc/source/serve/api/index.md b/doc/source/serve/api/index.md index 37ae947210cb..d8d251018c70 100644 --- a/doc/source/serve/api/index.md +++ b/doc/source/serve/api/index.md @@ -70,6 +70,7 @@ See the [model composition guide](serve-model-composition) for how to update cod serve.delete serve.status serve.shutdown + serve.shutdown_async ``` ### Configurations @@ -83,6 +84,7 @@ See the [model composition guide](serve-model-composition) for how to update cod serve.config.gRPCOptions serve.config.HTTPOptions serve.config.AutoscalingConfig + serve.config.AutoscalingPolicy serve.config.RequestRouterConfig ``` @@ -100,6 +102,10 @@ See the [model composition guide](serve-model-composition) for how to update cod serve.schema.ServeStatus serve.schema.DeploymentStatusOverview serve.schema.EncodingType + serve.schema.AutoscalingMetricsHealth + serve.schema.AutoscalingStatus + serve.schema.ScalingDecision + serve.schema.DeploymentAutoscalingDetail ``` ### Request Router @@ -383,6 +389,9 @@ Content-Type: application/json schema.ServeApplicationSchema schema.DeploymentSchema schema.RayActorOptionsSchema + schema.CeleryAdapterConfig + schema.TaskProcessorConfig + schema.TaskResult ``` (serve-rest-api-response-schema)= diff --git a/doc/source/serve/architecture.md b/doc/source/serve/architecture.md index 9aceefa74bda..fa838b0cafc4 100644 --- a/doc/source/serve/architecture.md +++ b/doc/source/serve/architecture.md @@ -29,8 +29,8 @@ There are three kinds of actors that are created to make up a Serve instance: responds once they are completed. For scalability and high availability, you can also run a proxy on each node in the cluster via the `proxy_location` field inside [`serve.start()`](core-apis) or [the config file](serve-in-production-config-file). - **gRPC Proxy**: If Serve is started with valid `port` and `grpc_servicer_functions`, - then the gRPC proxy is started alongside with the HTTP proxy. 
This Actor runs a - [grpcio](https://grpc.github.io/grpc/python/) server. The gRPC server that accepts + then the gRPC proxy is started alongside the HTTP proxy. This Actor runs a + [grpcio](https://grpc.github.io/grpc/python/) server. The gRPC server accepts incoming requests, forwards them to replicas, and responds once they are completed. - **Replicas**: Actors that actually execute the code in response to a request. For example, they may contain an instantiation of an ML model. Each @@ -51,7 +51,7 @@ When an HTTP or gRPC request is sent to the corresponding HTTP or gRPC proxy, th Each replica maintains a queue of requests and executes requests one at a time, possibly using `asyncio` to process them concurrently. If the handler (the deployment function or the `__call__` method of the deployment class) is declared with `async def`, the replica will not wait for the -handler to run. Otherwise, the replica blocks until the handler returns. +handler to run. Otherwise, the replica blocks until the handler returns. When making a request via a [DeploymentHandle](serve-key-concepts-deployment-handle) instead of HTTP or gRPC for [model composition](serve-model-composition), the request is placed on a queue in the `DeploymentHandle`, and we skip to step 3 above. @@ -88,7 +88,7 @@ Ray Serve's autoscaling feature automatically increases or decreases a deploymen - The Serve Autoscaler runs in the Serve Controller actor. - Each `DeploymentHandle` and each replica periodically pushes its metrics to the autoscaler. - For each deployment, the autoscaler periodically checks `DeploymentHandle` queues and in-flight queries on replicas to decide whether or not to scale the number of replicas. -- Each `DeploymentHandle` continuously polls the controller to check for new deployment replicas. Whenever new replicas are discovered, it sends any buffered or new queries to the replica until `max_ongoing_requests` is reached. 
Queries are sent to replicas in round-robin fashion, subject to the constraint that no replica is handling more than `max_ongoing_requests` requests at a time. +- Each `DeploymentHandle` continuously polls the controller to check for new deployment replicas. Whenever new replicas are discovered, it sends any buffered or new queries to the replica until `max_ongoing_requests` is reached. Queries are sent to replicas in round-robin fashion, subject to the constraint that no replica is handling more than `max_ongoing_requests` requests at a time. :::{note} When the controller dies, requests can still be sent via HTTP, gRPC and `DeploymentHandle`, but autoscaling is paused. When the controller recovers, the autoscaling resumes, but all previous metrics collected are lost. @@ -105,7 +105,7 @@ Each node in your Ray cluster provides a Serve REST API server that can connect You can configure Serve to start one proxy Actor per node with the `proxy_location` field inside [`serve.start()`](core-apis) or [the config file](serve-in-production-config-file). Each proxy binds to the same port. You should be able to reach Serve and send requests to any models with any of the -servers. You can use your own load balancer on top of Ray Serve. +servers. You can use your own load balancer on top of Ray Serve. This architecture ensures horizontal scalability for Serve. You can scale your HTTP and gRPC ingress by adding more nodes. You can also scale your model inference by increasing the number of replicas via the `num_replicas` option of your deployment. diff --git a/doc/source/serve/develop-and-deploy.md b/doc/source/serve/develop-and-deploy.md index 2e2bf9d2541d..616184deaedd 100644 --- a/doc/source/serve/develop-and-deploy.md +++ b/doc/source/serve/develop-and-deploy.md @@ -38,7 +38,7 @@ Bonjour Monde! ``` Converting this model into a Ray Serve application with FastAPI requires three changes: -1. Import Ray Serve and Fast API dependencies +1. 
Import Ray Serve and FastAPI dependencies 2. Add decorators for Serve deployment with FastAPI: `@serve.deployment` and `@serve.ingress(app)` 3. `bind` the `Translator` deployment to the arguments that are passed into its constructor @@ -60,7 +60,7 @@ To test locally, run the script with the `serve run` CLI command. This command t $ serve run model:translator_app ``` -This command runs the `translator_app` application and then blocks streaming logs to the console. You can kill it with `Ctrl-C`, which tears down the application. +This command runs the `translator_app` application and then blocks, streaming logs to the console. You can kill it with `Ctrl-C`, which tears down the application. Now test the model over HTTP. Reach it at the following default URL: diff --git a/doc/source/serve/doc_code/pd_dissagregation/lmcache/mooncake.yaml b/doc/source/serve/doc_code/pd_dissagregation/lmcache/mooncake.yaml new file mode 100644 index 000000000000..e6430eff7549 --- /dev/null +++ b/doc/source/serve/doc_code/pd_dissagregation/lmcache/mooncake.yaml @@ -0,0 +1,17 @@ +# LMCache configuration for Mooncake store backend +chunk_size: 256 +local_device: "cpu" +remote_url: "mooncakestore://storage-server:49999/" +remote_serde: "naive" +pipelined_backend: false +local_cpu: false +max_local_cpu_size: 5 +extra_config: + local_hostname: "compute-node-001" + metadata_server: "etcd://metadata-server:2379" + protocol: "rdma" + device_name: "rdma0" + master_server_address: "storage-server:49999" + global_segment_size: 3355443200 # 3.125 GB + local_buffer_size: 1073741824 # 1 GB + transfer_timeout: 1 diff --git a/doc/source/serve/doc_code/pd_dissagregation/lmcache/nixl/decoder.yaml b/doc/source/serve/doc_code/pd_dissagregation/lmcache/nixl/decoder.yaml new file mode 100644 index 000000000000..34e22d421997 --- /dev/null +++ b/doc/source/serve/doc_code/pd_dissagregation/lmcache/nixl/decoder.yaml @@ -0,0 +1,12 @@ +local_cpu: False +max_local_cpu_size: 0 +max_local_disk_size: 0 +remote_serde: 
NULL + +enable_nixl: True +nixl_role: "receiver" +nixl_receiver_host: "localhost" +nixl_receiver_port: 55555 +nixl_buffer_size: 1073741824 # 1GB +nixl_buffer_device: "cuda" +nixl_enable_gc: True diff --git a/doc/source/serve/doc_code/pd_dissagregation/lmcache/nixl/prefiller.yaml b/doc/source/serve/doc_code/pd_dissagregation/lmcache/nixl/prefiller.yaml new file mode 100644 index 000000000000..544551b78a78 --- /dev/null +++ b/doc/source/serve/doc_code/pd_dissagregation/lmcache/nixl/prefiller.yaml @@ -0,0 +1,12 @@ +local_cpu: False +max_local_cpu_size: 0 +max_local_disk_size: 0 +remote_serde: NULL + +enable_nixl: True +nixl_role: "sender" +nixl_receiver_host: "localhost" +nixl_receiver_port: 55555 +nixl_buffer_size: 1073741824 # 1GB +nixl_buffer_device: "cuda" +nixl_enable_gc: True diff --git a/doc/source/serve/doc_code/pd_dissagregation/lmcache_mooncake_example.yaml b/doc/source/serve/doc_code/pd_dissagregation/lmcache_mooncake_example.yaml new file mode 100644 index 000000000000..d7702cbf5d5b --- /dev/null +++ b/doc/source/serve/doc_code/pd_dissagregation/lmcache_mooncake_example.yaml @@ -0,0 +1,34 @@ +# Example: LMCacheConnectorV1 with Mooncake store configuration + +applications: + - args: + prefill_config: + model_loading_config: + model_id: meta-llama/Llama-3.1-8B-Instruct + engine_kwargs: + kv_transfer_config: &kv_transfer_config + kv_connector: LMCacheConnectorV1 + kv_role: kv_both + deployment_config: + autoscaling_config: + min_replicas: 2 + max_replicas: 2 + runtime_env: &runtime_env + env_vars: + LMCACHE_CONFIG_FILE: lmcache_mooncake.yaml + LMCACHE_USE_EXPERIMENTAL: "True" + + decode_config: + model_loading_config: + model_id: meta-llama/Llama-3.1-8B-Instruct + engine_kwargs: + kv_transfer_config: *kv_transfer_config + deployment_config: + autoscaling_config: + min_replicas: 1 + max_replicas: 1 + runtime_env: *runtime_env + + import_path: ray.serve.llm:build_pd_openai_app + name: pd-disaggregation-lmcache-mooncake + route_prefix: "/" diff --git 
a/doc/source/serve/doc_code/pd_dissagregation/lmcache_nixl_example.yaml b/doc/source/serve/doc_code/pd_dissagregation/lmcache_nixl_example.yaml new file mode 100644 index 000000000000..4284627055ae --- /dev/null +++ b/doc/source/serve/doc_code/pd_dissagregation/lmcache_nixl_example.yaml @@ -0,0 +1,45 @@ +# Example: LMCacheConnectorV1 with NIXL backend configuration + +applications: + - args: + prefill_config: + model_loading_config: + model_id: meta-llama/Llama-3.1-8B-Instruct + engine_kwargs: + kv_transfer_config: + kv_connector: LMCacheConnectorV1 + kv_role: kv_producer + kv_connector_extra_config: + discard_partial_chunks: false + lmcache_rpc_port: producer1 + deployment_config: + autoscaling_config: + min_replicas: 2 + max_replicas: 2 + runtime_env: + env_vars: + LMCACHE_CONFIG_FILE: lmcache_prefiller.yaml + LMCACHE_USE_EXPERIMENTAL: "True" + + decode_config: + model_loading_config: + model_id: meta-llama/Llama-3.1-8B-Instruct + engine_kwargs: + kv_transfer_config: + kv_connector: LMCacheConnectorV1 + kv_role: kv_consumer + kv_connector_extra_config: + discard_partial_chunks: false + lmcache_rpc_port: consumer1 + deployment_config: + autoscaling_config: + min_replicas: 6 + max_replicas: 6 + runtime_env: + env_vars: + LMCACHE_CONFIG_FILE: lmcache_decoder.yaml + LMCACHE_USE_EXPERIMENTAL: "True" + + import_path: ray.serve.llm:build_pd_openai_app + name: pd-disaggregation-lmcache-nixl + route_prefix: "/" diff --git a/doc/source/serve/doc_code/pd_dissagregation/nixl_example.yaml b/doc/source/serve/doc_code/pd_dissagregation/nixl_example.yaml new file mode 100644 index 000000000000..ac30e0e7f8ef --- /dev/null +++ b/doc/source/serve/doc_code/pd_dissagregation/nixl_example.yaml @@ -0,0 +1,33 @@ +# Example: Basic NIXLConnector configuration for prefill/decode disaggregation + +applications: + - args: + prefill_config: + model_loading_config: + model_id: meta-llama/Llama-3.1-8B-Instruct + engine_kwargs: + kv_transfer_config: + kv_connector: NixlConnector + kv_role: 
kv_producer + engine_id: engine1 + deployment_config: + autoscaling_config: + min_replicas: 2 + max_replicas: 4 + + decode_config: + model_loading_config: + model_id: meta-llama/Llama-3.1-8B-Instruct + engine_kwargs: + kv_transfer_config: + kv_connector: NixlConnector + kv_role: kv_consumer + engine_id: engine2 + deployment_config: + autoscaling_config: + min_replicas: 6 + max_replicas: 10 + + import_path: ray.serve.llm:build_pd_openai_app + name: pd-disaggregation-nixl + route_prefix: "/" diff --git a/doc/source/serve/examples.yml b/doc/source/serve/examples.yml index bd830b01e8d8..56416a1e0b4c 100644 --- a/doc/source/serve/examples.yml +++ b/doc/source/serve/examples.yml @@ -74,6 +74,54 @@ examples: - natural language processing link: tutorials/serve-deepseek related_technology: llm applications + - title: Deploy a small-sized LLM + skill_level: beginner + use_cases: + - generative ai + - large language models + - natural language processing + link: tutorials/deployment-serve-llm/small-size-llm/README + related_technology: llm applications + - title: Deploy a medium-sized LLM + skill_level: beginner + use_cases: + - generative ai + - large language models + - natural language processing + link: tutorials/deployment-serve-llm/medium-size-llm/README + related_technology: llm applications + - title: Deploy a large-sized LLM + skill_level: beginner + use_cases: + - generative ai + - large language models + - natural language processing + link: tutorials/deployment-serve-llm/large-size-llm/README + related_technology: llm applications + - title: Deploy a vision LLM + skill_level: beginner + use_cases: + - generative ai + - large language models + - natural language processing + link: tutorials/deployment-serve-llm/vision-llm/README + related_technology: llm applications + - title: Deploy a reasoning LLM + skill_level: beginner + use_cases: + - generative ai + - large language models + - natural language processing + link: 
tutorials/deployment-serve-llm/reasoning-llm/README + related_technology: llm applications + - title: Deploy a hybrid reasoning LLM + skill_level: beginner + use_cases: + - generative ai + - large language models + - natural language processing + link: tutorials/deployment-serve-llm/hybrid-reasoning-llm/README + related_technology: llm applications - title: Serve a Chatbot with Request and Response Streaming skill_level: intermediate use_cases: diff --git a/doc/source/serve/getting_started.md b/doc/source/serve/getting_started.md index 0bbe4084f3e5..23d5171523a5 100644 --- a/doc/source/serve/getting_started.md +++ b/doc/source/serve/getting_started.md @@ -6,7 +6,7 @@ This tutorial will walk you through the process of writing and testing a Ray Ser * convert a machine learning model to a Ray Serve deployment * test a Ray Serve application locally over HTTP -* compose multiple-model machine learning models together into a single application +* compose multi-model machine learning models together into a single application We'll use two models in this tutorial: @@ -101,7 +101,7 @@ parameters in the `@serve.deployment` decorator. The example configures a few co * `ray_actor_options`: a dictionary containing configuration options for each replica. * `num_cpus`: a float representing the logical number of CPUs each replica should reserve. You can make this a fraction to pack multiple replicas together on a machine with fewer CPUs than replicas. * `num_gpus`: a float representing the logical number of GPUs each replica should reserve. You can make this a fraction to pack multiple replicas together on a machine with fewer GPUs than replicas. - * `resources`: a dictionary containing other resource requirements for the replicate, such as non-GPU accelerators like HPUs or TPUs. + * `resources`: a dictionary containing other resource requirements for the replica, such as non-GPU accelerators like HPUs or TPUs. 
All these parameters are optional, so feel free to omit them: @@ -193,12 +193,12 @@ For example, let's deploy a machine learning pipeline with two steps: :language: python ``` -You can copy-paste this script and run it locally. It summarizes the snippet from _A Tale of Two Cities_ to `it was the best of times, it was worst of times .` +You can copy-paste this script and run it locally. It summarizes the snippet from _A Tale of Two Cities_ to `it was the best of times, it was the worst of times .` ```console $ python summary_model.py -it was the best of times, it was worst of times . +it was the best of times, it was the worst of times . ``` Here's an application that chains the two models together. The graph takes English text, summarizes it, and then translates it: diff --git a/doc/source/serve/http-guide.md b/doc/source/serve/http-guide.md index 054ac9ff2145..a4b14a07f68d 100644 --- a/doc/source/serve/http-guide.md +++ b/doc/source/serve/http-guide.md @@ -63,7 +63,7 @@ When the request is cancelled, a cancellation error is raised inside the `Snorin If you want to define more complex HTTP handling logic, Serve integrates with [FastAPI](https://fastapi.tiangolo.com/). This allows you to define a Serve deployment using the {mod}`@serve.ingress ` decorator that wraps a FastAPI app with its full range of features. The most basic example of this is shown below, but for more details on all that FastAPI has to offer such as variable routes, automatic type validation, dependency injection (e.g., for database connections), and more, please check out [their documentation](https://fastapi.tiangolo.com/). :::{note} -A Serve application that's integrated with FastAPI still respects the `route_prefix` set through Serve. The routes are that registered through the FastAPI `app` object are layered on top of the route prefix. 
For instance, if your Serve application has `route_prefix = /my_app` and you decorate a method with `@app.get("/fetch_data")`, then you can call that method by sending a GET request to the path `/my_app/fetch_data`. +A Serve application that's integrated with FastAPI still respects the `route_prefix` set through Serve. The routes that are registered through the FastAPI `app` object are layered on top of the route prefix. For instance, if your Serve application has `route_prefix = /my_app` and you decorate a method with `@app.get("/fetch_data")`, then you can call that method by sending a GET request to the path `/my_app/fetch_data`. ::: ```{literalinclude} doc_code/http_guide/http_guide.py :start-after: __begin_fastapi__ diff --git a/doc/source/serve/index.md b/doc/source/serve/index.md index 7c581ab947d4..498337140771 100644 --- a/doc/source/serve/index.md +++ b/doc/source/serve/index.md @@ -13,7 +13,7 @@ multi-app model-multiplexing configure-serve-deployment http-guide -Serving LLMs +Serving LLMs Production Guide monitoring resource-allocation @@ -35,7 +35,7 @@ api/index Ray Serve is a scalable model serving library for building online inference APIs. Serve is framework-agnostic, so you can use a single toolkit to serve everything from deep learning models built with frameworks like PyTorch, TensorFlow, and Keras, to Scikit-Learn models, to arbitrary Python business logic. It has several features and performance optimizations for serving Large Language Models such as response streaming, dynamic request batching, multi-node/multi-GPU serving, etc. -Ray Serve is particularly well suited for [model composition](serve-model-composition) and many model serving, enabling you to build a complex inference service consisting of multiple ML models and business logic all in Python code. 
+Ray Serve is particularly well suited for [model composition](serve-model-composition) and multi-model serving, enabling you to build a complex inference service consisting of multiple ML models and business logic all in Python code. Ray Serve is built on top of Ray, so it easily scales to many machines and offers flexible scheduling support such as fractional GPUs so you can share resources and serve many machine learning models at low cost. @@ -244,7 +244,7 @@ or head over to the {doc}`examples` to get started building your Ray Serve appli **Getting Started** ^^^ - Start with our quick start tutorials for :ref:`deploying a single model locally ` and how to :ref:`convert an existing model into a Ray Serve deployment ` . + Start with our quick start tutorials for :ref:`deploying a single model locally ` and how to :ref:`convert an existing model into a Ray Serve deployment `. +++ .. button-ref:: serve-getting-started diff --git a/doc/source/serve/llm/index.md b/doc/source/serve/llm/index.md new file mode 100644 index 000000000000..893b4a5d370a --- /dev/null +++ b/doc/source/serve/llm/index.md @@ -0,0 +1,61 @@ +(serving-llms)= + +# Serving LLMs + +Ray Serve LLM APIs allow users to deploy multiple LLM models together with a familiar Ray Serve API, while providing compatibility with the OpenAI API. + +## Features + +- ⚡️ Automatic scaling and load balancing +- 🌐 Unified multi-node multi-model deployment +- 🔌 OpenAI compatible +- 🔄 Multi-LoRA support with shared base models +- 🚀 Engine-agnostic architecture (e.g., vLLM, SGLang, etc.) + +## Requirements + +```bash +pip install "ray[serve,llm]>=2.43.0" "vllm>=0.7.2" + +# Suggested dependencies when using vllm 0.7.2: +pip install xgrammar==0.1.11 pynvml==12.0.0 +``` + +## Key Components + +The ray.serve.llm module provides two key deployment types for serving LLMs: + +### LLMServer + +The LLMServer sets up and manages the vLLM engine for model serving.
It can be used standalone or combined with your own custom Ray Serve deployments. + +### OpenAiIngress + +This deployment provides an OpenAI-compatible FastAPI ingress and routes traffic to the appropriate model for multi-model services. The following endpoints are supported: + +- `/v1/chat/completions`: Chat interface (ChatGPT-style) +- `/v1/completions`: Text completion +- `/v1/embeddings`: Text embeddings +- `/v1/score`: Text comparison +- `/v1/models`: List available models +- `/v1/models/{model}`: Model information + +## Configuration + +### LLMConfig + +The LLMConfig class specifies model details such as: + +- Model loading sources (HuggingFace or cloud storage) +- Hardware requirements (accelerator type) +- Engine arguments (e.g. vLLM engine kwargs) +- LoRA multiplexing configuration +- Serve auto-scaling parameters + +```{toctree} +:hidden: + +Quickstart +Prefill/Decode Disaggregation +Cache-aware request routing +``` diff --git a/doc/source/serve/llm/pd-dissagregation.md b/doc/source/serve/llm/pd-dissagregation.md new file mode 100644 index 000000000000..96f8859ef8e6 --- /dev/null +++ b/doc/source/serve/llm/pd-dissagregation.md @@ -0,0 +1,179 @@ +(serve-pd-dissagregation)= +# Prefill/Decode Disaggregation with KV Transfer Backends + +## Overview + +Prefill/decode disaggregation is a technique that separates the prefill phase (processing input prompts) from the decode phase (generating tokens). This separation allows for: + +- **Better resource utilization**: Prefill operations can use high-memory, high-compute nodes while decode operations can use optimized inference nodes +- **Improved scalability**: Each phase can be scaled independently based on demand +- **Cost optimization**: Different node types can be used for different workloads + +vLLM v1 supports two main KV transfer backends: +1. **NIXLConnector**: Network-based KV cache transfer using NIXL (NVIDIA Inference Xfer Library). Simple setup with automatic network configuration. +2.
**LMCacheConnectorV1**: Advanced caching solution with support for various storage backends. **Requires an etcd server** for metadata coordination between prefill and decode instances. + +## Prerequisites + +Make sure that you are using vLLM v1 by setting the `VLLM_USE_V1=1` environment variable. + +For NixlConnector, make sure nixl is installed. If you use [ray-project/ray-llm](https://hub.docker.com/r/rayproject/ray-llm/tags) images, you automatically get the dependency installed. + +For LMCacheConnectorV1, also install LMCache: + +```bash +pip install lmcache +``` + +## NIXLConnector Backend + +The NIXLConnector provides network-based KV cache transfer between prefill and decode servers using a side channel communication mechanism. + +### Basic Configuration + +```python +from ray.serve.llm import LLMConfig, build_pd_openai_app + +# Prefill configuration +prefill_config = LLMConfig( + model_loading_config={ + "model_id": "meta-llama/Llama-3.1-8B-Instruct" + }, + engine_kwargs={ + "kv_transfer_config": { + "kv_connector": "NixlConnector", + "kv_role": "kv_both", + "engine_id": "engine1" + } + } +) + +# Decode configuration +decode_config = LLMConfig( + model_loading_config={ + "model_id": "meta-llama/Llama-3.1-8B-Instruct" + }, + engine_kwargs={ + "kv_transfer_config": { + "kv_connector": "NixlConnector", + "kv_role": "kv_both", + "engine_id": "engine2" + } + } +) + +pd_config = dict( + prefill_config=prefill_config, + decode_config=decode_config, +) + +app = build_pd_openai_app(pd_config) +serve.run(app) +``` + +### Complete YAML Configuration Example + +Here's a complete configuration file for NIXLConnector: + +```{literalinclude} ../doc_code/pd_dissagregation/nixl_example.yaml +:language: yaml +``` + +## LMCacheConnectorV1 Backend + +LMCacheConnectorV1 provides a more advanced caching solution with support for multiple storage backends and enhanced performance features.
+ +### Scenario 1: LMCache with NIXL Backend + +This configuration uses LMCache with a NIXL-based storage backend for network communication. + +```{literalinclude} ../doc_code/pd_dissagregation/lmcache_nixl_example.yaml +:language: yaml +``` + +#### LMCache Configuration for NIXL Backend + +Create `lmcache_prefiller.yaml`: + +```{literalinclude} ../doc_code/pd_dissagregation/lmcache/nixl/prefiller.yaml +:language: yaml +``` + +Create `lmcache_decoder.yaml`: + +```{literalinclude} ../doc_code/pd_dissagregation/lmcache/nixl/decoder.yaml +:language: yaml +``` + +**Important**: The `LMCACHE_CONFIG_FILE` environment variable must point to an existing configuration file that is accessible within the Ray Serve container or worker environment. Ensure these configuration files are properly mounted or available in your deployment environment. + +### Scenario 2: LMCache with Mooncake Store Backend + +This configuration uses LMCache with Mooncake store, a high-performance distributed storage system. + +```{literalinclude} ../doc_code/pd_dissagregation/lmcache_mooncake_example.yaml +:language: yaml +``` + +#### LMCache Configuration for Mooncake Store + +Create `lmcache_mooncake.yaml`: + +```{literalinclude} ../doc_code/pd_dissagregation/lmcache/mooncake.yaml +:language: yaml +``` + +**Important Notes**: +- The `LMCACHE_CONFIG_FILE` environment variable must point to an existing configuration file that is accessible within the Ray Serve container or worker environment. +- For Mooncake store backend, ensure the etcd metadata server is running and accessible at the specified address. +- Verify that RDMA devices and storage servers are properly configured and accessible. +- In containerized deployments, mount configuration files with appropriate read permissions (e.g., `chmod 644`). +- Ensure all referenced hostnames and IP addresses in configuration files are resolvable from the deployment environment. + +## Deployment and Testing + +### Deploy the Application + +1. 
**Start required services** (for LMCacheConnectorV1): + + ```bash + # Start etcd server if not already running + docker run -d --name etcd-server \ + -p 2379:2379 -p 2380:2380 \ + quay.io/coreos/etcd:latest \ + etcd --listen-client-urls http://0.0.0.0:2379 \ + --advertise-client-urls http://localhost:2379 + + # See https://docs.lmcache.ai/kv_cache/mooncake.html for more details. + mooncake_master --port 49999 + ``` + +2. **Save your configuration** to a YAML file (e.g., `pd_config.yaml`) + +3. **Deploy using the Ray Serve CLI**: + ```bash + serve deploy pd_config.yaml + ``` + +### Test the Deployment + +Test with a simple request: + +```bash +curl -X POST "http://localhost:8000/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "messages": [ + {"role": "user", "content": "Explain the benefits of prefill/decode disaggregation"} + ], + "max_tokens": 100, + "temperature": 0.7 + }' +``` + +## Related Resources + +LMCache prefill/decode disaggregation official guide: + +- [disaggregated serving](https://docs.lmcache.ai/disaggregated_prefill/nixl/index.html) diff --git a/doc/source/serve/llm/prefix-aware-request-router.md b/doc/source/serve/llm/prefix-aware-request-router.md new file mode 100644 index 000000000000..d8a73e599fa7 --- /dev/null +++ b/doc/source/serve/llm/prefix-aware-request-router.md @@ -0,0 +1,157 @@ +(prefix-aware-request-router-guide)= +# `PrefixCacheAffinityRouter` for LLM inference optimization + +:::{warning} +This API is in alpha and may change before becoming stable. +::: + +LLM inference can benefit significantly from cache locality optimization. When one replica processes multiple prompts that share a prefix, the engine can reuse previously computed KV-cache entries, reducing computation overhead and improving response times. This technique is known as [Automatic Prefix Caching (APC)](https://docs.vllm.ai/en/stable/features/automatic_prefix_caching.html) in vLLM.
The `PrefixCacheAffinityRouter` is designed specifically for this use case. + +This guide covers: +- Understanding the prefix cache-aware routing algorithm +- Building the components of a prefix-aware router +- Configuration parameters and their impact + +(prefix-aware-algorithm)= +## How Ray Serve LLM prefix cache-aware routing works + +The `PrefixCacheAffinityRouter` implements a multi-tier routing strategy that balances cache locality with load distribution: + +### 1. Load balance check +First, it evaluates whether the current load is balanced across replicas by comparing queue lengths. If the difference between the highest and lowest queue lengths is below the `imbalanced_threshold`, it proceeds with prefix cache-aware routing. + +### 2. Prefix matching strategy +When load is balanced, the router uses a prefix tree to find replicas that have previously processed similar input text: + +- **High Match Rate (≥10%)**: Routes to replicas with the highest prefix match rate for better cache hit rates +- **Low Match Rate (<10%)**: Falls back to replicas with the lowest prefix cache utilization to increase utilization +- **No Prefix Data**: Uses the default Power of Two Choices selection + +### 3. Imbalanced load fallback +When load is imbalanced (queue length difference exceeds threshold), the router prioritizes load balancing over cache locality and falls back to the standard Power of Two Choices algorithm. + +### Prefix tree management +The router maintains a distributed prefix tree actor that: +- Tracks input text prefixes processed by each replica +- Supports automatic eviction of old entries to manage memory usage +- Persists across router instances using Ray's detached actor pattern + +(building-prefix-aware-components)= +## Building prefix-aware router components + +This section breaks down the key components of `PrefixCacheAffinityRouter` and shows how they work together. For a more basic example, see {ref}`custom-request-router-guide`. 
+ +### Base RequestRouter foundation + +Like all custom routers in Ray Serve, the `PrefixCacheAffinityRouter` extends the base [`RequestRouter`](../api/doc/ray.serve.request_router.RequestRouter.rst) class. The two core methods that define router behavior are: + +- **`choose_replicas()`**: The main routing logic that selects which replicas should handle a request +- **`on_request_routed()`**: A callback that updates router state after a request is successfully routed + +For a detailed explanation of these methods and their parameters, see the {ref}`simple-uniform-request-router` example in the custom request router guide. + +### 1. Load balance detection component + +The first component evaluates whether the current load is balanced across replicas: + +```{literalinclude} ../../../../python/ray/llm/_internal/serve/request_router/prefix_aware/prefix_aware_router.py +:start-after: __begin_load_balance_component__ +:end-before: __end_load_balance_component__ +:language: python +:caption: prefix_aware_router.py +``` + +This component prioritizes load balancing over cache locality when replicas become too imbalanced. + + +### 2. Prefix tree management component + +The prefix tree component is implemented as a detached Ray actor that manages prefix tracking across the Serve application. The actual tree structure uses a multi-tenant prefix tree (approximate radix tree). + +This distributed architecture allows the prefix information to persist across router restarts and be shared among multiple router instances. + +### 3. Prefix matching logic component + +The core prefix matching component implements the routing decision logic in the `_prefix_match_best_replicas` method. 
When load is balanced, it performs prefix matching to find the best replica: + +```{literalinclude} ../../../../python/ray/llm/_internal/serve/request_router/prefix_aware/prefix_aware_router.py +:start-after: __begin_prefix_match_component__ +:end-before: __end_prefix_match_component__ +:language: python +:caption: prefix_aware_router.py +``` + +This logic implements the three-tier strategy: +1. **High match rate**: Routes to replicas with the highest prefix match when `match_rate >= match_rate_threshold` +2. **Low match rate**: Falls back to replicas with the smallest KV-cache usage when the match rate is below the threshold +3. **No match**: Falls back to the default Power of Two Choices selection when `_prefix_match_best_replicas` returns to `choose_replicas`. + +### 4. Integration with Power of Two Choices + +The prefix-aware router extends the proven Power of Two Choices algorithm, falling back to it when prefix-based routing would degenerate. `PrefixCacheAffinityRouter` integrates this component in the `choose_replicas` method: + +```{literalinclude} ../../../../python/ray/llm/_internal/serve/request_router/prefix_aware/prefix_aware_router.py +:start-after: __begin_pow2_router_base__ +:end-before: __end_pow2_router_base__ +:language: python +:caption: prefix_aware_router.py +``` + + +### 5.
State management and callbacks + +The router uses the `on_request_routed()` callback to update the prefix tree with routing decisions: + +```{literalinclude} ../../../../python/ray/llm/_internal/serve/request_router/prefix_aware/prefix_aware_router.py +:start-after: __begin_on_request_routed__ +:end-before: __end_on_request_routed__ +:language: python +:caption: prefix_aware_router.py +``` + +When a replica dies, the router uses the `on_replica_actor_died` callback to remove the replica's entries from the shared prefix tree: +```{literalinclude} ../../../../python/ray/llm/_internal/serve/request_router/prefix_aware/prefix_aware_router.py +:start-after: __begin_on_replica_actor_died__ +:end-before: __end_on_replica_actor_died__ +:language: python +:caption: prefix_aware_router.py +``` + +(mixin-components)= +## Mixin components + +The `PrefixCacheAffinityRouter` inherits from two mixins. For more details about these and other available mixins, see {ref}`utility-mixin`. The router uses these mixins to optimize the list of candidate replicas against which it calculates prefix cache hit rate. + +The [`LocalityMixin`](../api/doc/ray.serve.request_router.LocalityMixin.rst) provides locality-aware routing to optimize network latency by preferring replicas on the same node. The [`MultiplexMixin`](../api/doc/ray.serve.request_router.MultiplexMixin.rst) enables model multiplexing support by tracking which models are loaded on each replica and routing requests to replicas that already have the requested model in memory. + +## Configuration parameters + +The `PrefixCacheAffinityRouter` provides several configuration parameters to tune its behavior: + +### Core routing parameters + +- **`imbalanced_threshold`** (default: 10): Queue length difference threshold for considering load balanced. Lower values prioritize load balancing over cache locality. + +- **`match_rate_threshold`** (default: 0.1): Minimum prefix match rate (0.0-1.0) required to use prefix cache-aware routing. 
Higher values require stronger prefix matches before routing for cache locality. + +### Memory management parameters + +- **`do_eviction`** (default: False): Enable automatic eviction of old prefix tree entries to approximate the LLM engine's eviction policy. + +- **`eviction_threshold_chars`** (default: 400,000): Maximum number of characters in the prefix tree before the LLM engine triggers an eviction. + +- **`eviction_target_chars`** (default: 360,000): Target number of characters to reduce the prefix tree to during eviction. + +- **`eviction_interval_secs`** (default: 10): Interval in seconds between eviction checks for when eviction is enabled. + +(deploy-llm-with-prefix-aware-router)= +## Deploying LLM applications with Prefix Cache-Aware Routing + +Deploy an LLM application using the prefix cache-aware request router as follows: + +```{literalinclude} ../../llm/doc_code/serve/prefix_aware_router/prefix_aware_example.py +:start-after: __prefix_aware_example_start__ +:end-before: __prefix_aware_example_end__ +:language: python +:caption: prefix_aware_example.py +``` diff --git a/doc/source/serve/llm/serving-llms.rst b/doc/source/serve/llm/quick-start.rst similarity index 88% rename from doc/source/serve/llm/serving-llms.rst rename to doc/source/serve/llm/quick-start.rst index 9e5048e423b1..438dc948c751 100644 --- a/doc/source/serve/llm/serving-llms.rst +++ b/doc/source/serve/llm/quick-start.rst @@ -1,61 +1,4 @@ -.. _serving_llms: - -Serving LLMs -============ - -Ray Serve LLM APIs allow users to deploy multiple LLM models together with a familiar Ray Serve API, while providing compatibility with the OpenAI API. - -Features --------- -- ⚡️ Automatic scaling and load balancing -- 🌐 Unified multi-node multi-model deployment -- 🔌 OpenAI compatible -- 🔄 Multi-LoRA support with shared base models -- 🚀 Engine agnostic architecture (i.e. vLLM, SGLang, etc) - -Requirements --------------- - -.. 
code-block:: bash - - pip install ray[serve,llm]>=2.43.0 vllm>=0.7.2 - - # Suggested dependencies when using vllm 0.7.2: - pip install xgrammar==0.1.11 pynvml==12.0.0 - - -Key Components --------------- - -The :ref:`ray.serve.llm ` module provides two key deployment types for serving LLMs: - -LLMServer -~~~~~~~~~~~~~~~~~~ - -The LLMServer sets up and manages the vLLM engine for model serving. It can be used standalone or combined with your own custom Ray Serve deployments. - -OpenAiIngress -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This deployment provides an OpenAI-compatible FastAPI ingress and routes traffic to the appropriate model for multi-model services. The following endpoints are supported: - -- ``/v1/chat/completions``: Chat interface (ChatGPT-style) -- ``/v1/completions``: Text completion -- ``/v1/embeddings``: Text embeddings -- ``/v1/models``: List available models -- ``/v1/models/{model}``: Model information - -Configuration -------------- - -LLMConfig -~~~~~~~~~ -The :class:`LLMConfig ` class specifies model details such as: - -- Model loading sources (HuggingFace or cloud storage) -- Hardware requirements (accelerator type) -- Engine arguments (e.g. vLLM engine kwargs) -- LoRA multiplexing configuration -- Serve auto-scaling parameters +.. _quick-start: Quickstart Examples ------------------- @@ -68,31 +11,10 @@ Deployment through :class:`OpenAiIngress ` .. tab-item:: Builder Pattern :sync: builder - .. code-block:: python - - from ray import serve - from ray.serve.llm import LLMConfig, build_openai_app - - llm_config = LLMConfig( - model_loading_config=dict( - model_id="qwen-0.5b", - model_source="Qwen/Qwen2.5-0.5B-Instruct", - ), - deployment_config=dict( - autoscaling_config=dict( - min_replicas=1, max_replicas=2, - ) - ), - # Pass the desired accelerator type (e.g. A10G, L4, etc.) - accelerator_type="A10G", - # You can customize the engine arguments (e.g. 
vLLM engine kwargs) - engine_kwargs=dict( - tensor_parallel_size=2, - ), - ) - - app = build_openai_app({"llm_configs": [llm_config]}) - serve.run(app, blocking=True) + .. literalinclude:: ../../llm/doc_code/serve/qwen/qwen_example.py + :language: python + :start-after: __qwen_example_start__ + :end-before: __qwen_example_end__ .. tab-item:: Bind Pattern :sync: bind @@ -263,31 +185,8 @@ For production deployments, Ray Serve LLM provides utilities for config-driven d .. tab-item:: Inline Config :sync: inline - .. code-block:: yaml - - # config.yaml - applications: - - args: - llm_configs: - - model_loading_config: - model_id: qwen-0.5b - model_source: Qwen/Qwen2.5-0.5B-Instruct - accelerator_type: A10G - deployment_config: - autoscaling_config: - min_replicas: 1 - max_replicas: 2 - - model_loading_config: - model_id: qwen-1.5b - model_source: Qwen/Qwen2.5-1.5B-Instruct - accelerator_type: A10G - deployment_config: - autoscaling_config: - min_replicas: 1 - max_replicas: 2 - import_path: ray.serve.llm:build_openai_app - name: llm_app - route_prefix: "/" + .. literalinclude:: ../../llm/doc_code/serve/qwen/llm_config_example.yaml + :language: yaml .. tab-item:: Standalone Config @@ -938,4 +837,4 @@ We collect data about the following features and attributes: - GPU type used and number of GPUs used If you would like to opt-out from usage data collection, you can follow :ref:`Ray usage stats ` -to disable it. +to disable it. \ No newline at end of file diff --git a/doc/source/serve/model-multiplexing.md b/doc/source/serve/model-multiplexing.md index b0bce6a68cb3..4400aac8a6d9 100644 --- a/doc/source/serve/model-multiplexing.md +++ b/doc/source/serve/model-multiplexing.md @@ -13,7 +13,7 @@ model multiplexing optimizes cost and load balances the traffic. This is useful To write a multiplexed deployment, use the `serve.multiplexed` and `serve.get_multiplexed_model_id` APIs. 
-Assuming you have multiple Torch models inside an aws s3 bucket with the following structure: +Assuming you have multiple PyTorch models inside an AWS S3 bucket with the following structure: ``` s3://my_bucket/1/model.pt s3://my_bucket/2/model.pt @@ -34,15 +34,14 @@ The `serve.multiplexed` API also has a `max_num_models_per_replica` parameter. U ::: :::{tip} -This code example uses the Pytorch Model object. You can also define your own model class and use it here. To release resources when the model is evicted, implement the `__del__` method. Ray Serve internally calls the `__del__` method to release resources when the model is evicted. +This code example uses the PyTorch Model object. You can also define your own model class and use it here. To release resources when the model is evicted, implement the `__del__` method. Ray Serve internally calls the `__del__` method to release resources when the model is evicted. ::: -`serve.get_multiplexed_model_id` is used to retrieve the model id from the request header, and the model_id is then passed into the `get_model` function. If the model id is not found in the replica, Serve will load the model from the s3 bucket and cache it in the replica. If the model id is found in the replica, Serve will return the cached model. +`serve.get_multiplexed_model_id` retrieves the model ID from the request header. This ID is then passed to the `get_model` function. If the model is not already cached in the replica, Serve loads it from the S3 bucket. Otherwise, the cached model is returned. :::{note} -Internally, serve router will route the traffic to the corresponding replica based on the model id in the request header. -If all replicas holding the model are over-subscribed, ray serve sends the request to a new replica that doesn't have the model loaded. The replica will load the model from the s3 bucket and cache it. +Internally, the Serve router uses the model ID in the request header to route traffic to a corresponding replica. 
If all replicas that have the model are over-subscribed, Ray Serve routes the request to a new replica, which then loads and caches the model from the S3 bucket. ::: To send a request to a specific model, include the `serve_multiplexed_model_id` field in the request header, and set the value to the model ID to which you want to send the request. diff --git a/doc/source/serve/monitoring.md b/doc/source/serve/monitoring.md index 8ed8c706f0f7..9437ba81c07c 100644 --- a/doc/source/serve/monitoring.md +++ b/doc/source/serve/monitoring.md @@ -54,7 +54,7 @@ For a detailed overview of the Ray dashboard, see the [dashboard documentation]( Two Serve CLI commands help you inspect a Serve application in production: `serve config` and `serve status`. If you have a remote cluster, `serve config` and `serve status` also has an `--address/-a` argument to access the cluster. See [VM deployment](serve-in-production-remote-cluster) for more information on this argument. -`serve config` gets the latest config file that the Ray Cluster received. This config file represents the Serve application's goal state. The Ray Cluster constantly strives to reach and maintain this state by deploying deployments, and recovering failed replicas, and performing other relevant actions. +`serve config` gets the latest config file that the Ray Cluster received. This config file represents the Serve application's goal state. The Ray Cluster constantly strives to reach and maintain this state by deploying deployments, recovering failed replicas, and performing other relevant actions. Using the `serve_config.yaml` example from [the production guide](production-config-yaml): diff --git a/doc/source/serve/multi-app.md b/doc/source/serve/multi-app.md index 027281566adc..abb36cd1946a 100644 --- a/doc/source/serve/multi-app.md +++ b/doc/source/serve/multi-app.md @@ -7,7 +7,7 @@ Serve supports deploying multiple independent Serve applications. 
This user guid ### Background With the introduction of multi-application Serve, we walk you through the new concept of applications and when you should choose to deploy a single application versus multiple applications per cluster. -An application consists of one or more deployments. The deployments in an application are tied into a direct acyclic graph through [model composition](serve-model-composition). An application can be called via HTTP at the specified route prefix, and the ingress deployment handles all such inbound traffic. Due to the dependence between deployments in an application, one application is a unit of upgrade. +An application consists of one or more deployments. The deployments in an application are tied into a directed acyclic graph through [model composition](serve-model-composition). An application can be called via HTTP at the specified route prefix, and the ingress deployment handles all such inbound traffic. Due to the dependence between deployments in an application, one application is a unit of upgrade. ### When to use multiple applications You can solve many use cases by using either model composition or multi-application. However, both have their own individual benefits and can be used together. diff --git a/doc/source/serve/production-guide/config.md b/doc/source/serve/production-guide/config.md index 8906d39894f0..fa5eff346fe5 100644 --- a/doc/source/serve/production-guide/config.md +++ b/doc/source/serve/production-guide/config.md @@ -51,11 +51,19 @@ applications: The file contains `proxy_location`, `http_options`, `grpc_options`, `logging_config` and `applications`. +(proxy-config)= + +## Proxy config + The `proxy_location` field configures where to run proxies to handle traffic to the cluster. You can set `proxy_location` to the following values: - EveryNode (default): Run a proxy on every node in the cluster that has at least one replica actor. - HeadOnly: Only run a single proxy on the head node. 
- Disabled: Don't run proxies at all. Set this value if you are only making calls to your applications using deployment handles. +(http-config)= + +## HTTP config + The `http_options` are as follows. Note that the HTTP config is global to your Ray cluster, and you can't update it during runtime. - **`host`**: The host IP address for Serve's HTTP proxies. This is optional and can be omitted. By default, the `host` is set to `0.0.0.0` to expose your deployments publicly. If you're using Kubernetes, you must set `host` to `0.0.0.0` to expose your deployments outside the cluster. @@ -63,15 +71,29 @@ The `http_options` are as follows. Note that the HTTP config is global to your R - **`request_timeout_s`**: Allows you to set the end-to-end timeout for a request before terminating and retrying at another replica. By default, there is no request timeout. - **`keep_alive_timeout_s`**: Allows you to set the keep alive timeout for the HTTP proxy. For more details, see [here](serve-http-guide-keep-alive-timeout) +(grpc-config)= + +## gRPC config + The `grpc_options` are as follows. Note that the gRPC config is global to your Ray cluster, and you can't update it during runtime. - **`port`**: The port that the gRPC proxies listen on. These are optional settings and can be omitted. By default, the port is set to `9000`. - **`grpc_servicer_functions`**: List of import paths for gRPC `add_servicer_to_server` functions to add to Serve's gRPC proxy. The servicer functions need to be importable from the context of where Serve is running. This defaults to an empty list, which means the gRPC server isn't started. - **`request_timeout_s`**: Allows you to set the end-to-end timeout for a request before terminating and retrying at another replica. By default, there is no request timeout. +(logging-config)= + +## Logging config + The `logging_config` is global config, you can configure controller & proxy & replica logs. 
Note that you can also set application and deployment level logging config, which will take precedence over the global config. See logging config API [here](../../serve/api/doc/ray.serve.schema.LoggingConfig.rst) for more details. -These are the fields per application: +(application-config)= + +## Application config + +You configure one or more deployments as part of your Serve application. See [deployment config](serve-configure-deployment). + +These are the fields per `application`: - **`name`**: The names for each application that are auto-generated by `serve build`. The name of each application must be unique. - **`route_prefix`**: An application can be called via HTTP at the specified route prefix. It defaults to `/`. The route prefix for each application must be unique. @@ -80,6 +102,8 @@ These are the fields per application: - **`deployments (optional)`**: A list of deployment options that allows you to override the `@serve.deployment` settings specified in the deployment graph code. Each entry in this list must include the deployment `name`, which must match one in the code. If this section is omitted, Serve launches all deployments in the graph with the parameters specified in the code. See how to [configure serve deployment options](serve-configure-deployment). - **`args`**: Arguments that are passed to the [application builder](serve-app-builder-guide). +## Example config + Below is a config for the [`Text ML Model` example](serve-in-production-example) that follows the format explained above: ```yaml diff --git a/doc/source/serve/resource-allocation.md b/doc/source/serve/resource-allocation.md index 18df5a8181a4..04dff0c9cc5c 100644 --- a/doc/source/serve/resource-allocation.md +++ b/doc/source/serve/resource-allocation.md @@ -39,8 +39,6 @@ def func(*args): ### Fractional CPUs and fractional GPUs -Suppose you have two models and each doesn't fully saturate a GPU. You might want to have them share a GPU by allocating 0.5 GPUs each. 
- To do this, the resources specified in `ray_actor_options` can be *fractional*. For example, if you have two models and each doesn't fully saturate a GPU, you might want to have them share a GPU by allocating 0.5 GPUs each. diff --git a/doc/source/serve/tutorials/BUILD.bazel b/doc/source/serve/tutorials/BUILD.bazel deleted file mode 100644 index beb03dfbbaa8..000000000000 --- a/doc/source/serve/tutorials/BUILD.bazel +++ /dev/null @@ -1,5 +0,0 @@ -filegroup( - name = "markdowns", - srcs = glob(["*.md"]), - visibility = ["//python/ray/serve:__subpackages__"], -) diff --git a/doc/source/serve/tutorials/deployment-serve-llm/README.ipynb b/doc/source/serve/tutorials/deployment-serve-llm/README.ipynb new file mode 100644 index 000000000000..9dfa5c6f1980 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/README.ipynb @@ -0,0 +1,58 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bc12c0d2", + "metadata": {}, + "source": [ + "# Quickstarts for LLM serving\n", + "\n", + "These guides provide a fast path to serving LLMs using Ray Serve on Anyscale, with focused tutorials for different deployment scales, from single-GPU setups to multi-node clusters.\n", + "\n", + "Each tutorial includes development and production setups, tips for configuring your cluster, and guidance on monitoring and scaling with Ray Serve.\n", + "\n", + "## Tutorial categories\n", + "\n", + "**[Deploy a small-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/small-size-llm/README.html)** \n", + "Deploy small-sized models on a single GPU, such as Llama 3 8 B, Mistral 7 B, or Phi-2. \n", + "\n", + "---\n", + "\n", + "**[Deploy a medium-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/medium-size-llm/README.html)** \n", + "Deploy medium-sized models using tensor parallelism across 4—8 GPUs on a single node, such as Llama 3 70 B, Qwen 14 B, Mixtral 8x7 B. 
\n", + "\n", + "---\n", + "\n", + "**[Deploy a large-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/large-size-llm/README.html)** \n", + "Deploy massive models using pipeline parallelism across a multi-node cluster, such as Deepseek-R1 or Llama-Nemotron-253 B. \n", + "\n", + "---\n", + "\n", + "**[Deploy a vision LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/vision-llm/README.html)** \n", + "Deploy models with image and text input such as Qwen 2.5-VL-7 B-Instruct, MiniGPT-4, or Pixtral-12 B. \n", + "\n", + "---\n", + "\n", + "**[Deploy a reasoning LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/reasoning-llm/README.html)** \n", + "Deploy models with reasoning capabilities designed for long-context tasks, coding, or tool use, such as QwQ-32 B. \n", + "\n", + "---\n", + "\n", + "**[Deploy a hybrid reasoning LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/README.html)** \n", + "Deploy models that can switch between reasoning and non-reasoning modes for flexible usage, such as Qwen-3." + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "myst": { + "front_matter": { + "orphan": true + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/source/serve/tutorials/deployment-serve-llm/README.md b/doc/source/serve/tutorials/deployment-serve-llm/README.md new file mode 100644 index 000000000000..666a99af10c5 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/README.md @@ -0,0 +1,41 @@ + + +# Quickstarts for LLM serving + +These guides provide a fast path to serving LLMs using Ray Serve on Anyscale, with focused tutorials for different deployment scales, from single-GPU setups to multi-node clusters. + +Each tutorial includes development and production setups, tips for configuring your cluster, and guidance on monitoring and scaling with Ray Serve. 
+ +## Tutorial categories + +**[Deploy a small-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/small-size-llm/README.html)** +Deploy small-sized models on a single GPU, such as Llama 3 8 B, Mistral 7 B, or Phi-2. + +--- + +**[Deploy a medium-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/medium-size-llm/README.html)** +Deploy medium-sized models using tensor parallelism across 4—8 GPUs on a single node, such as Llama 3 70 B, Qwen 14 B, Mixtral 8x7 B. + +--- + +**[Deploy a large-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/large-size-llm/README.html)** +Deploy massive models using pipeline parallelism across a multi-node cluster, such as Deepseek-R1 or Llama-Nemotron-253 B. + +--- + +**[Deploy a vision LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/vision-llm/README.html)** +Deploy models with image and text input such as Qwen 2.5-VL-7 B-Instruct, MiniGPT-4, or Pixtral-12 B. + +--- + +**[Deploy a reasoning LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/reasoning-llm/README.html)** +Deploy models with reasoning capabilities designed for long-context tasks, coding, or tool use, such as QwQ-32 B. + +--- + +**[Deploy a hybrid reasoning LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/README.html)** +Deploy models that can switch between reasoning and non-reasoning modes for flexible usage, such as Qwen-3. 
diff --git a/doc/source/serve/tutorials/deployment-serve-llm/ci/aws.yaml b/doc/source/serve/tutorials/deployment-serve-llm/ci/aws.yaml new file mode 100644 index 000000000000..beb4314156b7 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/ci/aws.yaml @@ -0,0 +1,14 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +# Head node +head_node_type: + name: head + instance_type: m5.2xlarge + resources: + cpu: 8 + +# Worker nodes +auto_select_worker_config: true +flags: + allow-cross-zone-autoscaling: true diff --git a/doc/source/serve/tutorials/deployment-serve-llm/ci/build.sh b/doc/source/serve/tutorials/deployment-serve-llm/ci/build.sh new file mode 100755 index 000000000000..ef7e19de90b6 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/ci/build.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +set -exo pipefail diff --git a/doc/source/serve/tutorials/deployment-serve-llm/ci/gce.yaml b/doc/source/serve/tutorials/deployment-serve-llm/ci/gce.yaml new file mode 100644 index 000000000000..9c3790622d03 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/ci/gce.yaml @@ -0,0 +1,14 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-central1 + +# Head node +head_node_type: + name: head + instance_type: n2-standard-8 + resources: + cpu: 8 + +# Worker nodes +auto_select_worker_config: true +flags: + allow-cross-zone-autoscaling: true diff --git a/doc/source/serve/tutorials/deployment-serve-llm/ci/nb2py.py b/doc/source/serve/tutorials/deployment-serve-llm/ci/nb2py.py new file mode 100644 index 000000000000..2c94f8270b9e --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/ci/nb2py.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +import argparse +import nbformat + + +def convert_notebook( + input_path: str, output_path: str, ignore_cmds: bool = False +) -> None: + """ + Read a Jupyter notebook and write a Python script, converting all %%bash + cells and IPython "!" 
commands into subprocess.run calls that raise on error. + Cells that load or autoreload extensions are ignored. + """ + nb = nbformat.read(input_path, as_version=4) + with open(output_path, "w") as out: + for cell in nb.cells: + # Only process code cells + if cell.cell_type != "code": + continue + + lines = cell.source.splitlines() + # Skip cells that load or autoreload extensions + if any( + l.strip().startswith("%load_ext autoreload") + or l.strip().startswith("%autoreload all") + for l in lines + ): + continue + + # Detect a %%bash cell + if lines and lines[0].strip().startswith("%%bash"): + if ignore_cmds: + continue + bash_script = "\n".join(lines[1:]).rstrip() + out.write("import subprocess\n") + out.write( + f"subprocess.run(r'''{bash_script}''',\n" + " shell=True,\n" + " check=True,\n" + " executable='/bin/bash')\n\n" + ) + else: + # Detect any IPython '!' shell commands in code lines + has_bang = any(line.lstrip().startswith("!") for line in lines) + if has_bang: + if ignore_cmds: + continue + out.write("import subprocess\n") + for line in lines: + stripped = line.lstrip() + if stripped.startswith("!"): + cmd = stripped[1:].lstrip() + out.write( + f"subprocess.run(r'''{cmd}''',\n" + " shell=True,\n" + " check=True,\n" + " executable='/bin/bash')\n" + ) + else: + out.write(line.rstrip() + "\n") + out.write("\n") + else: + # Regular Python cell: + code = cell.source.rstrip() + if "client.chat.completions.create" in code: + continue # Model isn't deployed in CI so skip cells calling the service + # else, dump as-is + out.write(cell.source.rstrip() + "\n\n") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Convert a Jupyter notebook to a Python script, preserving bash cells and '!' commands as subprocess calls unless ignored with --ignore-cmds." 
+ ) + parser.add_argument("input_nb", help="Path to the input .ipynb file") + parser.add_argument("output_py", help="Path for the output .py script") + parser.add_argument( + "--ignore-cmds", action="store_true", help="Ignore bash cells and '!' commands" + ) + args = parser.parse_args() + convert_notebook(args.input_nb, args.output_py, ignore_cmds=args.ignore_cmds) + + +if __name__ == "__main__": + main() diff --git a/doc/source/serve/tutorials/deployment-serve-llm/ci/tests.sh b/doc/source/serve/tutorials/deployment-serve-llm/ci/tests.sh new file mode 100755 index 000000000000..79e90ee0905e --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/ci/tests.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Don't use nbconvert or jupytext unless you're willing +# to check each subprocess unit and validate that errors +# aren't being consumed/hidden + +set -exo pipefail + +for nb in \ + "small-size-llm/notebook" \ + "medium-size-llm/notebook" \ + "large-size-llm/notebook" \ + "vision-llm/notebook" \ + "reasoning-llm/notebook" \ + "hybrid-reasoning-llm/notebook" +do + python ci/nb2py.py "${nb}.ipynb" "${nb}.py" --ignore-cmds + python "${nb}.py" + rm "${nb}.py" +done diff --git a/doc/source/serve/tutorials/deployment-serve-llm/configs/aws.yaml b/doc/source/serve/tutorials/deployment-serve-llm/configs/aws.yaml new file mode 100644 index 000000000000..823b7cf2d786 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/configs/aws.yaml @@ -0,0 +1,7 @@ +head_node_type: + name: head + instance_type: m5.2xlarge +worker_node_types: [] +auto_select_worker_config: true +flags: + allow-cross-zone-autoscaling: true diff --git a/doc/source/serve/tutorials/deployment-serve-llm/configs/gce.yaml b/doc/source/serve/tutorials/deployment-serve-llm/configs/gce.yaml new file mode 100644 index 000000000000..455977d495e0 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/configs/gce.yaml @@ -0,0 +1,7 @@ +head_node_type: + name: head + instance_type: n1-standard-8 
+worker_node_types: [] +auto_select_worker_config: true +flags: + allow-cross-zone-autoscaling: true diff --git a/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/README.md b/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/README.md new file mode 100644 index 000000000000..cdd0ecf74f03 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/README.md @@ -0,0 +1,344 @@ +--- +orphan: true +--- + + + +# Deploy a hybrid reasoning LLM + +A hybrid reasoning model provides flexibility by allowing you to enable or disable reasoning as needed. You can use structured, step-by-step thinking for complex queries while skipping it for simpler ones, balancing accuracy with efficiency depending on the task. + +This tutorial deploys a hybrid reasoning LLM using Ray Serve LLM. + +--- + +## Distinction with purely reasoning models + +*Hybrid reasoning models* are reasoning-capable models that allow you to toggle the thinking process on and off. You can enable structured, step-by-step reasoning when needed but skip it for simpler queries to reduce latency. Purely reasoning models always apply their reasoning behavior, while hybrid models give you fine-grained control over when to use reasoning. + +| **Mode** | **Core behavior** | **Use case examples** | **Limitation** | +| ---------------- | -------------------------------------------- | ------------------------------------------------------------------- | ------------------------------------------------- | +| **Thinking ON** | Explicit multi-step thinking process | Math, coding, logic puzzles, multi-hop QA, CoT prompting | Slower response time, more tokens used. | +| **Thinking OFF** | Direct answer generation | Casual queries, short instructions, single-step answers | May struggle with complex reasoning or interpretability. 
| + +**Note:** Reasoning often benefits from long context windows (32K up to 1M+ tokens), high token throughput, low-temperature decoding (greedy sampling), and strong instruction tuning or scratchpad-style reasoning. + +To see an example of deploying a purely reasoning model like *QwQ-32 B*, see [Deploy a reasoning LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/reasoning-llm/README.html). + +--- + +## Enable or disable thinking + +Some hybrid reasoning models let you toggle their "thinking" mode on or off. This section explains when to use thinking mode versus skipping it, and shows how to control the setting in practice. + +--- + +### When to enable or disable thinking mode + +**Enable thinking mode for:** +- Complex, multi-step tasks that require reasoning, such as math, physics, or logic problems. +- Ambiguous queries or situations with incomplete information. +- Planning, workflow orchestration, or when the model needs to act as an "agent" coordinating other tools or models. +- Analyzing intricate data, images, or charts. +- In-depth code reviews or evaluating outputs from other AI systems (LLM as Judge approach). + +**Disable thinking mode for:** +- Simple, well-defined, or routine tasks. +- Tasks where low latency and fast responses are the priority. +- Repetitive, straightforward steps within a larger automated workflow. + +--- + +### How to enable or disable thinking mode + +How you toggle thinking mode varies by model and framework. Consult the documentation for the model to see how it structures and controls thinking. + +For example, to [control reasoning in Qwen-3](https://huggingface.co/Qwen/Qwen3-32B#switching-between-thinking-and-non-thinking-mode), you can: +* Add `"/think"` or `"/no_think"` in the prompt. +* Set `enable_thinking` in the request: + `extra_body={"chat_template_kwargs": {"enable_thinking": ...}}`. 
+ +See [Send request with thinking enabled](#send-request-with-thinking-enabled) or [Send request with thinking disabled](#send-request-with-thinking-disabled) for practical examples. + +--- + +## Parse reasoning outputs + +In thinking mode, hybrid models often separate _reasoning_ from the _final answer_ using tags like `<think>...</think>`. Without a proper parser, this reasoning may end up in the `content` field instead of the dedicated `reasoning_content` field. + +To ensure that Ray Serve LLM correctly parses the reasoning output, configure a `reasoning_parser` in your Ray Serve LLM deployment. This tells vLLM how to isolate the model’s thought process from the rest of the output. +**Note:** For example, *Qwen-3* uses the `qwen3` parser. See the [vLLM docs](https://docs.vllm.ai/en/stable/features/reasoning_outputs.html#supported-models) or your model's documentation to find a supported parser, or [build your own](https://docs.vllm.ai/en/stable/features/reasoning_outputs.html#how-to-support-a-new-reasoning-model) if needed. + +```yaml +applications: +- ... + args: + llm_configs: + - model_loading_config: + model_id: my-qwen-3-32b + model_source: Qwen/Qwen3-32B + ... + engine_kwargs: + ... + reasoning_parser: qwen3 # <-- for Qwen-3 models +``` + +See [Configure Ray Serve LLM](#configure-ray-serve-llm) for a complete example. + +**Example response** +When using a reasoning parser, the response is typically structured like this: + +```python +ChatCompletionMessage( + content="The temperature is...", + ..., + reasoning_content="Okay, the user is asking for the temperature today and tomorrow..." +) +``` +And you can extract the content and reasoning like this: +```python +response = client.chat.completions.create( + ... +) + +print(f"Content: {response.choices[0].message.content}") +print(f"Reasoning: {response.choices[0].message.reasoning_content}") +``` + +--- + +## Configure Ray Serve LLM + +Set your Hugging Face token in the config file to access gated models. 
+ +Ray Serve LLM provides multiple [Python APIs](https://docs.ray.io/en/latest/serve/api/index.html#llm-api) for defining your application. Use [`build_openai_app`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.build_openai_app.html#ray.serve.llm.build_openai_app) to build a full application from your [`LLMConfig`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html#ray.serve.llm.LLMConfig) object. + +Set `tensor_parallel_size` to distribute the model's weights among 8 GPUs in the node. + + +```python +# serve_qwen_3_32b.py +from ray.serve.llm import LLMConfig, build_openai_app +import os + +llm_config = LLMConfig( + model_loading_config=dict( + model_id="my-qwen-3-32b", + model_source="Qwen/Qwen3-32B", + ), + accelerator_type="A100-40G", + deployment_config=dict( + autoscaling_config=dict( + min_replicas=1, + max_replicas=2, + ) + ), + ### Uncomment if your model is gated and needs your Hugging Face token to access it. + # runtime_env=dict(env_vars={"HF_TOKEN": os.environ.get("HF_TOKEN")}), + engine_kwargs=dict( + tensor_parallel_size=8, max_model_len=32768, reasoning_parser="qwen3" + ), +) +app = build_openai_app({"llm_configs": [llm_config]}) + +``` + +**Note:** Before moving to a production setup, migrate your settings to a [Serve config file](https://docs.ray.io/en/latest/serve/production-guide/config.html) to make your deployment version-controlled, reproducible, and easier to maintain for CI/CD pipelines. See [Serving LLMs - Quickstart Examples: Production Guide](https://docs.ray.io/en/latest/serve/llm/quick-start.html#production-deployment) for an example. + +--- + +## Deploy locally + +**Prerequisites** + +* Access to GPU compute. +* (Optional) A **Hugging Face token** if using gated models. Store it in `export HF_TOKEN=<YOUR-TOKEN-HERE>`. + +**Note:** Depending on the organization, you can usually request access on the model's Hugging Face page. 
For example, Meta’s Llama models approval can take anywhere from a few hours to several weeks. + +**Dependencies:** +```bash +pip install "ray[serve,llm]" +``` + +--- + +### Launch + +Follow the instructions at [Configure Ray Serve LLM](#configure-ray-serve-llm) to define your app in a Python module `serve_qwen_3_32b.py`. + +In a terminal, run: + + +```bash +%%bash +serve run serve_qwen_3_32b:app --non-blocking +``` + +Deployment typically takes a few minutes as the cluster is provisioned, the vLLM server starts, and the model is downloaded. + +Your endpoint is available locally at `http://localhost:8000` and you can use a placeholder authentication token for the OpenAI client, for example `"FAKE_KEY"` + +Use the `model_id` defined in your config (here, `my-qwen-3-32b`) to query your model. Below are some examples on how to send a request to a Qwen-3 deployment with thinking enabled or disabled. + +--- + +### Send request with thinking disabled + +You can disable thinking in Qwen-3 by either adding a `/no_think` tag in the prompt or by forwarding `enable_thinking: False` to the vLLM inference engine. + +Example curl with `/no_think`: + + +```bash +%%bash +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer FAKE_KEY" \ + -d '{ "model": "my-qwen-3-32b", "messages": [{"role": "user", "content": "What is greater between 7.8 and 7.11 ? 
/no_think"}] }' +``` + +Example Python with `enable_thinking: False`: + + +```python +#client_thinking_disabled.py +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +# Example: Simple query with thinking disabled +response = client.chat.completions.create( + model="my-qwen-3-32b", + messages=[ + {"role": "user", "content": "What's the capital of France ?"} + ], + extra_body={"chat_template_kwargs": {"enable_thinking": False}} +) + +print(f"Reasoning: \n{response.choices[0].message.reasoning_content}\n\n") +print(f"Answer: \n {response.choices[0].message.content}") +``` + +Notice the `reasoning_content` is empty here. +**Note:** Depending on your model's documentation, empty could mean `None`, an empty string or even empty tags `"<think></think>"`. + +--- + +### Send request with thinking enabled + +You can enable thinking in Qwen-3 by either adding a `/think` tag in the prompt or by forwarding `enable_thinking: True` to the vLLM inference engine. + +Example curl with `/think`: + + +```bash +%%bash +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer FAKE_KEY" \ + -d '{ "model": "my-qwen-3-32b", "messages": [{"role": "user", "content": "What is greater between 7.8 and 7.11 ? 
/think"}] }' +``` + + Example Python with `enable_thinking: True`: + + +```python +#client_thinking_enabled.py +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +# Example: Complex query with thinking process +response = client.chat.completions.create( + model="my-qwen-3-32b", + messages=[ + {"role": "user", "content": "What's the capital of France ?"} + ], + extra_body={"chat_template_kwargs": {"enable_thinking": True}} +) + +print(f"Reasoning: \n{response.choices[0].message.reasoning_content}\n\n") +print(f"Answer: \n {response.choices[0].message.content}") +``` + +If you configure a valid reasoning parser, the reasoning output should appear in the `reasoning_content` field of the response message. Otherwise, it may be included in the main `content` field, typically wrapped in `<think>...</think>` tags. See [Parse reasoning outputs](#parse-reasoning-outputs) for more information. + +--- + +### Shutdown + +Shut down your LLM service: + + +```bash +%%bash +serve shutdown -y +``` + + +--- + +## Deploy to production with Anyscale services + +For production, it's recommended to use Anyscale services to deploy your Ray Serve app on a dedicated cluster without any code changes. Anyscale provides scalability, fault tolerance, and load balancing, ensuring resilience against node failures, high traffic, and rolling updates. See [Deploy a medium-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/medium-size-llm/README.html#deploy-to-production-with-anyscale-services) for an example with a medium-sized model like the *Qwen-32b* from this tutorial. + +--- + +## Stream reasoning content + +In thinking mode, hybrid reasoning models may take longer to begin generating the main content. You can stream intermediate reasoning output in the same way as the main content. 
+ + +```python +#client_streaming.py +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +# Example: Complex query with thinking process +response = client.chat.completions.create( + model="my-qwen-3-32b", + messages=[ + {"role": "user", "content": "I need to plan a trip to Paris from Seattle. Can you help me research flight costs, create an itinerary for 3 days, and suggest restaurants based on my dietary restrictions (vegetarian)?"} + ], + extra_body={"chat_template_kwargs": {"enable_thinking": True}}, + stream=True +) + +# Stream +for chunk in response: + # Stream reasoning content + if hasattr(chunk.choices[0].delta, "reasoning_content"): + data_reasoning = chunk.choices[0].delta.reasoning_content + if data_reasoning: + print(data_reasoning, end="", flush=True) + # Later, stream the final answer + if hasattr(chunk.choices[0].delta, "content"): + data_content = chunk.choices[0].delta.content + if data_content: + print(data_content, end="", flush=True) +``` + + +--- + +## Summary + +In this tutorial, you deployed a hybrid reasoning LLM with Ray Serve LLM, from development to production. You learned how to configure Ray Serve LLM with the right reasoning parser, deploy your service on your Ray cluster, send requests, and parse reasoning outputs in the response. 
diff --git a/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/client_streaming.py b/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/client_streaming.py new file mode 100644 index 000000000000..ea383b2649f0 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/client_streaming.py @@ -0,0 +1,34 @@ +# client_streaming.py +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +# Example: Complex query with thinking process +response = client.chat.completions.create( + model="my-qwen-3-32b", + messages=[ + { + "role": "user", + "content": "I need to plan a trip to Paris from Seattle. Can you help me research flight costs, create an itinerary for 3 days, and suggest restaurants based on my dietary restrictions (vegetarian)?", + } + ], + extra_body={"chat_template_kwargs": {"enable_thinking": True}}, + stream=True, +) + +# Stream +for chunk in response: + # Stream reasoning content + if hasattr(chunk.choices[0].delta, "reasoning_content"): + data_reasoning = chunk.choices[0].delta.reasoning_content + if data_reasoning: + print(data_reasoning, end="", flush=True) + # Later, stream the final answer + if hasattr(chunk.choices[0].delta, "content"): + data_content = chunk.choices[0].delta.content + if data_content: + print(data_content, end="", flush=True) diff --git a/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/client_thinking_disabled.py b/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/client_thinking_disabled.py new file mode 100644 index 000000000000..fcacfe43166f --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/client_thinking_disabled.py @@ -0,0 +1,18 @@ +# client_thinking_disabled.py +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = 
"http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +# Example: Simple query with thinking disabled +response = client.chat.completions.create( + model="my-qwen-3-32b", + messages=[{"role": "user", "content": "What's the capital of France ?"}], + extra_body={"chat_template_kwargs": {"enable_thinking": False}}, +) + +print(f"Reasoning: \n{response.choices[0].message.reasoning_content}\n\n") +print(f"Answer: \n {response.choices[0].message.content}") diff --git a/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/client_thinking_enabled.py b/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/client_thinking_enabled.py new file mode 100644 index 000000000000..f1ea4070ec3f --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/client_thinking_enabled.py @@ -0,0 +1,18 @@ +# client_thinking_enabled.py +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +# Example: Complex query with thinking process +response = client.chat.completions.create( + model="my-qwen-3-32b", + messages=[{"role": "user", "content": "What's the capital of France ?"}], + extra_body={"chat_template_kwargs": {"enable_thinking": True}}, +) + +print(f"Reasoning: \n{response.choices[0].message.reasoning_content}\n\n") +print(f"Answer: \n {response.choices[0].message.content}") diff --git a/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/notebook.ipynb b/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/notebook.ipynb new file mode 100644 index 000000000000..97e7bb17834e --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/notebook.ipynb @@ -0,0 +1,445 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e926219a", + "metadata": {}, + "source": [ + "# Deploy a hybrid 
reasoning LLM\n", + "\n", + "A hybrid reasoning model provides flexibility by allowing you to enable or disable reasoning as needed. You can use structured, step-by-step thinking for complex queries while skipping it for simpler ones, balancing accuracy with efficiency depending on the task.\n", + "\n", + "This tutorial deploys a hybrid reasoning LLM using Ray Serve LLM. \n", + "\n", + "---\n", + "\n", + "## Distinction with purely reasoning models\n", + "\n", + "*Hybrid reasoning models* are reasoning-capable models that allow you to toggle the thinking process on and off. You can enable structured, step-by-step reasoning when needed but skip it for simpler queries to reduce latency. Purely reasoning models always apply their reasoning behavior, while hybrid models give you fine-grained control over when to use reasoning.\n", + "\n", + "| **Mode** | **Core behavior** | **Use case examples** | **Limitation** |\n", + "| ---------------- | -------------------------------------------- | ------------------------------------------------------------------- | ------------------------------------------------- |\n", + "| **Thinking ON** | Explicit multi-step thinking process | Math, coding, logic puzzles, multi-hop QA, CoT prompting | Slower response time, more tokens used. |\n", + "| **Thinking OFF** | Direct answer generation | Casual queries, short instructions, single-step answers | May struggle with complex reasoning or interpretability. 
|\n", + "\n", + "**Note:** Reasoning often benefits from long context windows (32K up to +1M tokens), high token throughput, low-temperature decoding (greedy sampling), and strong instruction tuning or scratchpad-style reasoning.\n", + "\n", + "To see an example of deploying a purely reasoning model like *QwQ-32 B*, see [Deploy a reasoning LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/reasoning-llm/README.html).\n", + "\n", + "---\n", + "\n", + "## Enable or disable thinking\n", + "\n", + "Some hybrid reasoning models let you toggle their \"thinking\" mode on or off. This section explains when to use thinking mode versus skipping it, and shows how to control the setting in practice.\n", + "\n", + "---\n", + "\n", + "### When to enable or disable thinking mode\n", + "\n", + "**Enable thinking mode for:**\n", + "- Complex, multi-step tasks that require reasoning, such as math, physics, or logic problems.\n", + "- Ambiguous queries or situations with incomplete information.\n", + "- Planning, workflow orchestration, or when the model needs to act as an \"agent\" coordinating other tools or models.\n", + "- Analyzing intricate data, images, or charts.\n", + "- In-depth code reviews or evaluating outputs from other AI systems (LLM as Judge approach).\n", + "\n", + "**Disable thinking mode for:**\n", + "- Simple, well-defined, or routine tasks.\n", + "- Workloads where low latency and fast responses are the priority.\n", + "- Repetitive, straightforward steps within a larger automated workflow.\n", + "\n", + "---\n", + "\n", + "### How to enable or disable thinking mode\n", + "\n", + "How you toggle thinking mode varies by model and framework. 
Consult the documentation for the model to see how it structures and controls thinking.\n", + "\n", + "For example, to [control reasoning in Qwen-3](https://huggingface.co/Qwen/Qwen3-32B#switching-between-thinking-and-non-thinking-mode), you can:\n", + "* Add `\"/think\"` or `\"/no_think\"` in the prompt.\n", + "* Set `enable_thinking` in the request:\n", + " `extra_body={\"chat_template_kwargs\": {\"enable_thinking\": ...}}`.\n", + "\n", + "See [Send request with thinking enabled](#send-request-with-thinking-enabled) or [Send request with thinking disabled](#send-request-with-thinking-disabled) for practical examples.\n", + "\n", + "---\n", + "\n", + "## Parse reasoning outputs\n", + "\n", + "In thinking mode, hybrid models often separate _reasoning_ from the _final answer_ using tags like `<think>...</think>`. Without a proper parser, this reasoning may end up in the `content` field instead of the dedicated `reasoning_content` field. \n", + "\n", + "To ensure that Ray Serve LLM correctly parses the reasoning output, configure a `reasoning_parser` in your Ray Serve LLM deployment. This tells vLLM how to isolate the model’s thought process from the rest of the output. \n", + "**Note:** For example, *Qwen-3* uses the `qwen3` parser. 
See the [vLLM docs](https://docs.vllm.ai/en/stable/features/reasoning_outputs.html#supported-models) or your model's documentation to find a supported parser, or [build your own](https://docs.vllm.ai/en/stable/features/reasoning_outputs.html#how-to-support-a-new-reasoning-model) if needed.\n", + "\n", + "```yaml\n", + "applications:\n", + "- ...\n", + " args:\n", + " llm_configs:\n", + " - model_loading_config:\n", + " model_id: my-qwen-3-32b\n", + " model_source: Qwen/Qwen3-32B\n", + " ...\n", + " engine_kwargs:\n", + " ...\n", + " reasoning_parser: qwen3 # <-- for Qwen-3 models\n", + "```\n", + "\n", + "See [Configure Ray Serve LLM](#configure-ray-serve-llm) for a complete example.\n", + "\n", + "**Example response** \n", + "When using a reasoning parser, the response is typically structured like this:\n", + "\n", + "```python\n", + "ChatCompletionMessage(\n", + " content=\"The temperature is...\",\n", + " ...,\n", + " reasoning_content=\"Okay, the user is asking for the temperature today and tomorrow...\"\n", + ")\n", + "```\n", + "And you can extract the content and reasoning like this\n", + "```python\n", + "response = client.chat.completions.create(\n", + " ...\n", + ")\n", + "\n", + "print(f\"Content: {response.choices[0].message.content}\")\n", + "print(f\"Reasoning: {response.choices[0].message.reasoning_content}\")\n", + "```\n", + "\n", + "---\n", + "\n", + "## Configure Ray Serve LLM\n", + "\n", + "Set your Hugging Face token in the config file to access gated models.\n", + "\n", + "Ray Serve LLM provides multiple [Python APIs](https://docs.ray.io/en/latest/serve/api/index.html#llm-api) for defining your application. 
Use [`build_openai_app`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.build_openai_app.html#ray.serve.llm.build_openai_app) to build a full application from your [`LLMConfig`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html#ray.serve.llm.LLMConfig) object.\n", + "\n", + "Set `tensor_parallel_size` to distribute the model's weights among 8 GPUs in the node. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1daf892", + "metadata": {}, + "outputs": [], + "source": [ + "# serve_qwen_3_32b.py\n", + "from ray.serve.llm import LLMConfig, build_openai_app\n", + "import os\n", + "\n", + "llm_config = LLMConfig(\n", + " model_loading_config=dict(\n", + " model_id=\"my-qwen-3-32b\",\n", + " model_source=\"Qwen/Qwen3-32B\",\n", + " ),\n", + " accelerator_type=\"A100-40G\",\n", + " deployment_config=dict(\n", + " autoscaling_config=dict(\n", + " min_replicas=1,\n", + " max_replicas=2,\n", + " )\n", + " ),\n", + " ### Uncomment if your model is gated and needs your Hugging Face token to access it.\n", + " # runtime_env=dict(env_vars={\"HF_TOKEN\": os.environ.get(\"HF_TOKEN\")}),\n", + " engine_kwargs=dict(\n", + " tensor_parallel_size=8, max_model_len=32768, reasoning_parser=\"qwen3\"\n", + " ),\n", + ")\n", + "app = build_openai_app({\"llm_configs\": [llm_config]})\n" + ] + }, + { + "cell_type": "markdown", + "id": "32272280", + "metadata": {}, + "source": [ + "**Note:** Before moving to a production setup, migrate your settings to a [Serve config file](https://docs.ray.io/en/latest/serve/production-guide/config.html) to make your deployment version-controlled, reproducible, and easier to maintain for CI/CD pipelines. 
See [Serving LLMs - Quickstart Examples: Production Guide](https://docs.ray.io/en/latest/serve/llm/quick-start.html#production-deployment) for an example.\n", + "\n", + "---\n", + "\n", + "## Deploy locally\n", + "\n", + "**Prerequisites**\n", + "\n", + "* Access to GPU compute.\n", + "* (Optional) A **Hugging Face token** if using gated models. Store it in `export HF_TOKEN=<YOUR-TOKEN-HERE>`.\n", + "\n", + "**Note:** Depending on the organization, you can usually request access on the model's Hugging Face page. For example, Meta’s Llama models approval can take anywhere from a few hours to several weeks.\n", + "\n", + "**Dependencies:** \n", + "```bash\n", + "pip install \"ray[serve,llm]\"\n", + "```\n", + "\n", + "---\n", + "\n", + "### Launch\n", + "\n", + "Follow the instructions at [Configure Ray Serve LLM](#configure-ray-serve-llm) to define your app in a Python module `serve_qwen_3_32b.py`. \n", + "\n", + "In a terminal, run: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a8f1b58", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "serve run serve_qwen_3_32b:app --non-blocking" + ] + }, + { + "cell_type": "markdown", + "id": "a24501f5", + "metadata": {}, + "source": [ + "Deployment typically takes a few minutes as the cluster is provisioned, the vLLM server starts, and the model is downloaded. \n", + "\n", + "Your endpoint is available locally at `http://localhost:8000` and you can use a placeholder authentication token for the OpenAI client, for example `\"FAKE_KEY\"`\n", + "\n", + "Use the `model_id` defined in your config (here, `my-qwen-3-32b`) to query your model. Below are some examples on how to send a request to a Qwen-3 deployment with thinking enabled or disabled. 
\n", + "\n", + "---\n", + "\n", + "### Send request with thinking disabled\n", + "\n", + "You can disable thinking in Qwen-3 by either adding a `/no_think` tag in the prompt or by forwarding `enable_thinking: False` to the vLLM inference engine. 
\n", + "\n", + "Example curl with `/no_think`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d77d2201", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "curl -X POST http://localhost:8000/v1/chat/completions \\\n", + " -H \"Content-Type: application/json\" \\\n", + " -H \"Authorization: Bearer FAKE_KEY\" \\\n", + " -d '{ \"model\": \"my-qwen-3-32b\", \"messages\": [{\"role\": \"user\", \"content\": \"What is greater between 7.8 and 7.11 ? /no_think\"}] }'" + ] + }, + { + "cell_type": "markdown", + "id": "a127ea5f", + "metadata": {}, + "source": [ + "Example Python with `enable_thinking: False`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e51e9d85", + "metadata": {}, + "outputs": [], + "source": [ + "#client_thinking_disabled.py\n", + "from urllib.parse import urljoin\n", + "from openai import OpenAI\n", + "\n", + "API_KEY = \"FAKE_KEY\"\n", + "BASE_URL = \"http://localhost:8000\"\n", + "\n", + "client = OpenAI(base_url=urljoin(BASE_URL, \"v1\"), api_key=API_KEY)\n", + "\n", + "# Example: Simple query with thinking disabled\n", + "response = client.chat.completions.create(\n", + " model=\"my-qwen-3-32b\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": \"What's the capital of France ?\"}\n", + " ],\n", + " extra_body={\"chat_template_kwargs\": {\"enable_thinking\": False}}\n", + ")\n", + "\n", + "print(f\"Reasoning: \\n{response.choices[0].message.reasoning_content}\\n\\n\")\n", + "print(f\"Answer: \\n {response.choices[0].message.content}\")" + ] + }, + { + "cell_type": "markdown", + "id": "9765b3f8", + "metadata": {}, + "source": [ + "Notice the `reasoning_content` is empty here. 
\n", + "**Note:** Depending on your model's documentation, empty could mean `None`, an empty string or even empty tags `\"<think></think>\"`.\n", + "\n", + "---\n", + "\n", + "### Send request with thinking enabled\n", + " \n", + "You can enable thinking in Qwen-3 by either adding a `/think` tag in the prompt or by forwarding `enable_thinking: True` to the vLLM inference engine. \n", + "\n", + "Example curl with `/think`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8702258c", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "curl -X POST http://localhost:8000/v1/chat/completions \\\n", + " -H \"Content-Type: application/json\" \\\n", + " -H \"Authorization: Bearer FAKE_KEY\" \\\n", + " -d '{ \"model\": \"my-qwen-3-32b\", \"messages\": [{\"role\": \"user\", \"content\": \"What is greater between 7.8 and 7.11 ? /think\"}] }'" + ] + }, + { + "cell_type": "markdown", + "id": "c0bad31b", + "metadata": {}, + "source": [ + " Example Python with `enable_thinking: True`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a52eb68", + "metadata": {}, + "outputs": [], + "source": [ + "#client_thinking_enabled.py\n", + "from urllib.parse import urljoin\n", + "from openai import OpenAI\n", + "\n", + "API_KEY = \"FAKE_KEY\"\n", + "BASE_URL = \"http://localhost:8000\"\n", + "\n", + "client = OpenAI(base_url=urljoin(BASE_URL, \"v1\"), api_key=API_KEY)\n", + "\n", + "# Example: Complex query with thinking process\n", + "response = client.chat.completions.create(\n", + " model=\"my-qwen-3-32b\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": \"What's the capital of France ?\"}\n", + " ],\n", + " extra_body={\"chat_template_kwargs\": {\"enable_thinking\": True}}\n", + ")\n", + "\n", + "print(f\"Reasoning: \\n{response.choices[0].message.reasoning_content}\\n\\n\")\n", + "print(f\"Answer: \\n {response.choices[0].message.content}\")" + ] + }, + { + "cell_type": "markdown", + "id": "1f36ba3d", + 
"metadata": {}, + 
"source": [ + "If you configure a valid reasoning parser, the reasoning output should appear in the `reasoning_content` field of the response message. Otherwise, it may be included in the main `content` field, typically wrapped in `<think>...</think>` tags. See [Parse reasoning outputs](#parse-reasoning-outputs) for more information.\n", + "\n", + "---\n", + "\n", + "### Shutdown \n", + "\n", + "Shut down your LLM service:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cc5cc23", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "serve shutdown -y" + ] + }, + { + "cell_type": "markdown", + "id": "8009515b", + "metadata": {}, + "source": [ + "\n", + "---\n", + "\n", + "## Deploy to production with Anyscale services\n", + "\n", + "For production, it's recommended to use Anyscale services to deploy your Ray Serve app on a dedicated cluster without any code changes. Anyscale provides scalability, fault tolerance, and load balancing, ensuring resilience against node failures, high traffic, and rolling updates. See [Deploy a medium-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/medium-size-llm/README.html#deploy-to-production-with-anyscale-services) for an example with a medium-sized model like the *Qwen-32b* from this tutorial.\n", + "\n", + "---\n", + "\n", + "## Stream reasoning content\n", + "\n", + "In thinking mode, hybrid reasoning models may take longer to begin generating the main content. You can stream intermediate reasoning output in the same way as the main content. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5f5a877", + "metadata": {}, + "outputs": [], + "source": [ + "#client_streaming.py\n", + "from urllib.parse import urljoin\n", + "from openai import OpenAI\n", + "\n", + "API_KEY = \"FAKE_KEY\"\n", + "BASE_URL = \"http://localhost:8000\"\n", + "\n", + "client = OpenAI(base_url=urljoin(BASE_URL, \"v1\"), api_key=API_KEY)\n", + "\n", + "# Example: Complex query with thinking process\n", + "response = client.chat.completions.create(\n", + " model=\"my-qwen-3-32b\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": \"I need to plan a trip to Paris from Seattle. Can you help me research flight costs, create an itinerary for 3 days, and suggest restaurants based on my dietary restrictions (vegetarian)?\"}\n", + " ],\n", + " extra_body={\"chat_template_kwargs\": {\"enable_thinking\": True}},\n", + " stream=True\n", + ")\n", + "\n", + "# Stream \n", + "for chunk in response:\n", + " # Stream reasoning content\n", + " if hasattr(chunk.choices[0].delta, \"reasoning_content\"):\n", + " data_reasoning = chunk.choices[0].delta.reasoning_content\n", + " if data_reasoning:\n", + " print(data_reasoning, end=\"\", flush=True)\n", + " # Later, stream the final answer\n", + " if hasattr(chunk.choices[0].delta, \"content\"):\n", + " data_content = chunk.choices[0].delta.content\n", + " if data_content:\n", + " print(data_content, end=\"\", flush=True)" + ] + }, + { + "cell_type": "markdown", + "id": "d6357c06", + "metadata": {}, + "source": [ + "\n", + "---\n", + "\n", + "## Summary\n", + "\n", + "In this tutorial, you deployed a hybrid reasoning LLM with Ray Serve LLM, from development to production. You learned how to configure Ray Serve LLM with the right reasoning parser, deploy your service on your Ray cluster, send requests, and parse reasoning outputs in the response." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "repo_ray_docs", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/serve_qwen_3_32b.py b/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/serve_qwen_3_32b.py new file mode 100644 index 000000000000..e53f28ac6a90 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/serve_qwen_3_32b.py @@ -0,0 +1,23 @@ +# serve_qwen_3_32b.py +from ray.serve.llm import LLMConfig, build_openai_app +import os + +llm_config = LLMConfig( + model_loading_config=dict( + model_id="my-qwen-3-32b", + model_source="Qwen/Qwen3-32B", + ), + accelerator_type="A100-40G", + deployment_config=dict( + autoscaling_config=dict( + min_replicas=1, + max_replicas=2, + ) + ), + ### Uncomment if your model is gated and needs your Hugging Face token to access it. 
+ # runtime_env=dict(env_vars={"HF_TOKEN": os.environ.get("HF_TOKEN")}), + engine_kwargs=dict( + tensor_parallel_size=8, max_model_len=32768, reasoning_parser="qwen3" + ), +) +app = build_openai_app({"llm_configs": [llm_config]}) diff --git a/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/Dockerfile b/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/Dockerfile new file mode 100644 index 000000000000..a2412390df61 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/Dockerfile @@ -0,0 +1,8 @@ +FROM anyscale/ray:2.49.0-slim-py312-cu128 + +# C compiler for Triton’s runtime build step (vLLM V1 engine) +# https://github.com/vllm-project/vllm/issues/2997 +RUN sudo apt-get update && \ + sudo apt-get install -y --no-install-recommends build-essential + +RUN pip install vllm==0.10.0 \ No newline at end of file diff --git a/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/README.md b/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/README.md new file mode 100644 index 000000000000..379e4c0d0ba4 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/README.md @@ -0,0 +1,355 @@ +--- +orphan: true +--- + + + +# Deploy a large-sized LLM + +A large LLM typically runs on multiple nodes with multiple GPUs, prioritizing peak quality and capability: stronger reasoning, broader knowledge, longer context windows, more robust generalization. Choose a large LLM when higher latency, complexity, and cost are acceptable trade-offs because you require state-of-the-art results. + +This tutorial deploys DeepSeek-R1, a large LLM with 685 B parameters, using Ray Serve LLM. For smaller models, see [Deploying a small-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/small-size-llm/README.html) or [Deploying a medium-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/medium-size-llm/README.html). 
+ +--- + +## Challenges of large-scale deployments + +Deploying a 685 B-parameter model like DeepSeek-R1 presents significant technical challenges. At this scale, the model can't fit on a single GPU or even a single node. You must distribute it across multiple GPUs and nodes using *tensor parallelism* (splitting tensors within each layer) and *pipeline parallelism* (spreading layers across devices). + +Deploying a model of this scale normally requires you to manually launch and coordinate multiple nodes, unless you use a managed platform like [Anyscale](https://www.anyscale.com/), which automates cluster scaling and node orchestration. See [Deploy to production with Anyscale Services](#deploy-to-production-with-anyscale-services) for more details. + +--- + +## Configure Ray Serve LLM + +A large-sized LLM is typically deployed across multiple nodes with multiple GPUs. To fully utilize the hardware, set `pipeline_parallel_size` to the number of nodes and `tensor_parallel_size` to the number of GPUs per node, which distributes the model’s weights evenly. + +Ray Serve LLM provides multiple [Python APIs](https://docs.ray.io/en/latest/serve/api/index.html#llm-api) for defining your application. Use [`build_openai_app`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.build_openai_app.html#ray.serve.llm.build_openai_app) to build a full application from your [`LLMConfig`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html#ray.serve.llm.LLMConfig) object. + +**Optional:** Because Deepseek-R1 is a reasoning model, this tutorial uses vLLM’s built-in reasoning parser to correctly separate its reasoning content from the final response. See [Deploying a reasoning LLM: Parse reasoning outputs](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/reasoning-llm/README.html#parse-reasoning-outputs). 
+ + +```python +# serve_deepseek_r1.py +from ray.serve.llm import LLMConfig, build_openai_app + +llm_config = LLMConfig( + model_loading_config=dict( + model_id="my-deepseek-r1", + model_source="deepseek-ai/DeepSeek-R1", + ), + accelerator_type="H100", + deployment_config=dict( + autoscaling_config=dict( + min_replicas=1, + max_replicas=1, + ) + ), + ### Uncomment if your model is gated and needs your Hugging Face token to access it. + # runtime_env=dict(env_vars={"HF_TOKEN": os.environ.get("HF_TOKEN")}), + engine_kwargs=dict( + max_model_len=16384, + # Split weights among 8 GPUs in the node + tensor_parallel_size=8, + pipeline_parallel_size=2, + reasoning_parser="deepseek_r1", # Optional: separate reasoning content from the final answer + ), +) + +app = build_openai_app({"llm_configs": [llm_config]}) + +``` + +**Note:** Before moving to a production setup, migrate to a [Serve config file](https://docs.ray.io/en/latest/serve/production-guide/config.html) to make your deployment version-controlled, reproducible, and easier to maintain for CI/CD pipelines. See [Serving LLMs - Quickstart Examples: Production Guide](https://docs.ray.io/en/latest/serve/llm/quick-start.html#production-deployment) for an example. + +--- + +## Deploy locally + +**Prerequisites** + +* Access to GPU compute. +* (Optional) A **Hugging Face token** if using gated models. Store it in `export HF_TOKEN=`. + +**Note:** Depending on the organization, you can usually request access on the model's Hugging Face page. For example, Meta’s Llama models approval can take anywhere from a few hours to several weeks. + +**Dependencies:** +```bash +pip install "ray[serve,llm]" +``` + +**Beware**: this is an expensive deployment. + +--- + +### Launch + +Follow the instructions at [Configure Ray Serve LLM](#configure-ray-serve-llm) to define your app in a Python module `serve_deepseek_r1.py`. 
+ +In a terminal, run: + + +```bash +%%bash +serve run serve_deepseek_r1:app --non-blocking +``` + +Deployment typically takes a few minutes as the cluster is provisioned, the vLLM server starts, and the model is downloaded. + +--- + +### Send requests + +Your endpoint is available locally at `http://localhost:8000` and you can use a placeholder authentication token for the OpenAI client, for example `"FAKE_KEY"`. + +Example curl: + + +```bash +%%bash +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Authorization: Bearer FAKE_KEY" \ + -H "Content-Type: application/json" \ + -d '{ "model": "my-deepseek-r1", "messages": [{"role": "user", "content": "What is 2 + 2?"}] }' +``` + +Example Python: + + +```python +#client.py +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +response = client.chat.completions.create( + model="my-deepseek-r1", + messages=[{"role": "user", "content": "Tell me a joke"}], + stream=True, +) + +# Stream and print JSON +for chunk in response: + # Stream reasoning content first + if hasattr(chunk.choices[0].delta, "reasoning_content"): + data_reasoning = chunk.choices[0].delta.reasoning_content + if data_reasoning: + print(data_reasoning, end="", flush=True) + # Later, stream the final answer + if hasattr(chunk.choices[0].delta, "content"): + data_content = chunk.choices[0].delta.content + if data_content: + print(data_content, end="", flush=True) +``` + + +--- + +### Shutdown + +Shutdown your LLM service: + + +```bash +%%bash +serve shutdown -y +``` + + +--- + +## Deploy to production with Anyscale services + +For production deployment, use Anyscale services to deploy the Ray Serve app to a dedicated cluster without modifying the code. 
Anyscale provides scalability, fault tolerance, and load balancing, keeping the service resilient against node failures, high traffic, and rolling updates, while also automating multi-node setup and autoscaling for large models like DeepSeek-R1. + +**Beware**: this is an expensive deployment. At the time of writing, the deployment cost is around \$110 USD per hour in the `us-west-2` AWS region using on-demand instances. Because this node has a high amount of inter-node traffic, and cross-zone traffic is expensive (around \$0.02 per GB), it's recommended to *disable cross-zone autoscaling*. This demo is pre-configured with cross-zone autoscaling disabled for your convenience. + +### Prerequisites + +The following template runs only on H100 GPUs in your self-hosted Anyscale cloud, as H100s aren't available in Anyscale’s public cloud. This example uses two nodes of type *8xH100-80 GB:208CPU-1830 GB* on an AWS cloud. + +To provision nodes with 1000 GB of disk capacity, see [Changing the default disk size for GCP clusters](https://docs.anyscale.com/configuration/compute/gcp#disk-size) for Google Cloud Platform (GCP) or [Changing the default disk size for AWS clusters](https://docs.anyscale.com/configuration/compute/aws#disk-size) for Amazon Web Services (AWS). + +--- + +### Launch the service + +Anyscale provides out-of-the-box images (`anyscale/ray-llm`), which come pre-loaded with Ray Serve LLM, vLLM, and all required GPU/runtime dependencies. This makes it easy to get started without building a custom image. + +Create your Anyscale service configuration in a new `service.yaml` file: +```yaml +#service.yaml +name: deploy-deepseek-r1 +image_uri: anyscale/ray-llm:2.49.0-py311-cu128 # Anyscale Ray Serve LLM image. Use `containerfile: ./Dockerfile` to use a custom Dockerfile. 
+compute_config: + auto_select_worker_config: true + # Change default disk size to 1000GB + advanced_instance_config: + ## AWS ## + BlockDeviceMappings: + - Ebs: + - VolumeSize: 1000 + VolumeType: gp3 + DeleteOnTermination: true + DeviceName: "/dev/sda1" + ######### + ## GCP ## + #instanceProperties: + # disks: + # - boot: true + # auto_delete: true + # initialize_params: + # - disk_size_gb: 1000 + ######### + +working_dir: . +cloud: +applications: +# Point to your app in your Python module +- import_path: serve_deepseek_r1:app +``` + +Deploy your service + + +```bash +%%bash +anyscale service deploy -f service.yaml +``` + +**Note:** If your model is gated, make sure to pass your Hugging Face token to the service with `--env HF_TOKEN=` + +**Custom Dockerfile** +You can customize the container by building your own Dockerfile. In your Anyscale Service config, reference the Dockerfile with `containerfile` (instead of `image_uri`): + +```yaml +# service.yaml +# Replace: +# image_uri: anyscale/ray-llm:2.49.0-py311-cu128 + +# with: +containerfile: ./Dockerfile +``` + +See the [Anyscale base images](https://docs.anyscale.com/reference/base-images) for details on what each image includes. + +--- + +### Send requests + +The `anyscale service deploy` command output shows both the endpoint and authentication token: +```console +(anyscale +3.9s) curl -H "Authorization: Bearer " +``` +You can also retrieve both from the service page in the Anyscale console. Click the **Query** button at the top. See [Send requests](#send-requests) for example requests, but make sure to use the correct endpoint and authentication token. + +--- + +### Access the Serve LLM dashboard + +See [Enable LLM monitoring](#enable-llm-monitoring) for instructions on enabling LLM-specific logging. To open the Ray Serve LLM dashboard from an Anyscale service: +1. In the Anyscale console, go to your **Service** or **Workspace** +2. Navigate to the **Metrics** tab +3. 
Click **View in Grafana** and click **Serve LLM Dashboard** + +--- + +### Shutdown + +Shutdown your Anyscale service: + + +```bash +%%bash +anyscale service terminate -n deploy-deepseek-r1 +``` + + +--- + +## Enable LLM monitoring + +The *Serve LLM dashboard* offers deep visibility into model performance, latency, and system behavior, including: + +* Token throughput (tokens/sec) +* Latency metrics: Time To First Token (TTFT), Time Per Output Token (TPOT) +* KV cache utilization + +To enable these metrics, go to your LLM config and set `log_engine_metrics: true`. Ensure vLLM V1 is active with `VLLM_USE_V1: "1"`. +**Note:** `VLLM_USE_V1: "1"` is the default value with `ray >= 2.48.0` and can be omitted. +```yaml +applications: +- ... + args: + llm_configs: + - ... + runtime_env: + env_vars: + VLLM_USE_V1: "1" + ... + log_engine_metrics: true +``` + +--- + +## Improve concurrency + +Ray Serve LLM uses [vLLM](https://docs.vllm.ai/en/stable/) as its backend engine, which logs the *maximum concurrency* it can support based on your configuration. + +Example log: +```console +INFO 07-30 11:56:04 [kv_cache_utils.py:637] Maximum concurrency for 32,768 tokens per request: 29.06x +``` + +The following are a few ways to improve concurrency depending on your model and hardware: + +**Reduce `max_model_len`** +Lowering `max_model_len` reduces the memory needed for KV cache. + +**Example**: Running DeepSeek-R1 on 2 nodes with 8xH100-80 GB GPUs each: +* `max_model_len = 32,768` → concurrency ≈ 29 +* `max_model_len = 16,384` → concurrency ≈ 58 + +**Use distilled or quantized models** +Quantizing or distilling your model reduces its memory footprint, freeing up space for more KV cache and enabling more concurrent requests. For example, see [`deepseek-ai/DeepSeek-R1-Distill-Llama-70B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) for a distilled version of DeepSeek-R1. 
+ + +**Upgrade to GPUs with more memory** +Some GPUs provide significantly more room for KV cache and allow for higher concurrency out of the box. + +**Scale with more replicas** +In addition to tuning per-GPU concurrency, you can scale *horizontally* by increasing the number of replicas in your config. +Each replica runs on its own set of GPUs (`tensor_parallel_size` × `pipeline_parallel_size`, which is 16 GPUs in this tutorial), so raising the replica count increases the total number of concurrent requests your service can handle, especially under sustained or bursty traffic. +```yaml +deployment_config: + autoscaling_config: + min_replicas: 1 + max_replicas: 4 +``` + +*For more details on tuning strategies, hardware guidance, and serving configurations, see [Choose a GPU for LLM serving](https://docs.anyscale.com/llm/serving/gpu-guidance) and [Tune parameters for LLMs on Anyscale services](https://docs.anyscale.com/llm/serving/parameter-tuning).* + +--- + +## Troubleshooting + +**Hugging Face auth errors** +Some models, such as Llama-3.1, are gated and require prior authorization from the organization. See your model’s documentation for instructions on obtaining access. + +**Out-Of-Memory errors** +Out‑of‑memory (OOM) errors are one of the most common failure modes when deploying LLMs, especially as model size and context length increase. +See [Troubleshooting Guide](https://docs.anyscale.com/overview) for common errors and how to fix them. + +--- + +## Summary + +In this tutorial, you deployed a large-sized LLM with Ray Serve LLM, from development to production. You learned how to configure Ray Serve LLM, deploy your service on your Ray cluster, and send requests. You also learned how to monitor your app and troubleshoot common issues. 
diff --git a/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/client.py b/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/client.py new file mode 100644 index 000000000000..839f17958d3b --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/client.py @@ -0,0 +1,27 @@ +# client.py +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +response = client.chat.completions.create( + model="my-deepseek-r1", + messages=[{"role": "user", "content": "Tell me a joke"}], + stream=True, +) + +# Stream and print JSON +for chunk in response: + # Stream reasoning content first + if hasattr(chunk.choices[0].delta, "reasoning_content"): + data_reasoning = chunk.choices[0].delta.reasoning_content + if data_reasoning: + print(data_reasoning, end="", flush=True) + # Later, stream the final answer + if hasattr(chunk.choices[0].delta, "content"): + data_content = chunk.choices[0].delta.content + if data_content: + print(data_content, end="", flush=True) diff --git a/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/notebook.ipynb b/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/notebook.ipynb new file mode 100644 index 000000000000..6764565d998e --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/notebook.ipynb @@ -0,0 +1,447 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f8f6fcbd", + "metadata": {}, + "source": [ + "# Deploy a large-sized LLM\n", + "\n", + "A large LLM typically runs on multiple nodes with multiple GPUs, prioritizing peak quality and capability: stronger reasoning, broader knowledge, longer context windows, more robust generalization. 
Choose a large LLM when higher latency, complexity, and cost are acceptable trade-offs because you require state-of-the-art results.\n", + "\n", + "This tutorial deploys DeepSeek-R1, a large LLM with 685 B parameters, using Ray Serve LLM. For smaller models, see [Deploying a small-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/small-size-llm/README.html) or [Deploying a medium-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/medium-size-llm/README.html).\n", + "\n", + "---\n", + "\n", + "## Challenges of large-scale deployments\n", + "\n", + "Deploying a 685 B-parameter model like DeepSeek-R1 presents significant technical challenges. At this scale, the model can't fit on a single GPU or even a single node. You must distribute it across multiple GPUs and nodes using *tensor parallelism* (splitting tensors within each layer) and *pipeline parallelism* (spreading layers across devices). \n", + "\n", + "Deploying a model of this scale normally requires you to manually launch and coordinate multiple nodes, unless you use a managed platform like [Anyscale](https://www.anyscale.com/), which automates cluster scaling and node orchestration. See [Deploy to production with Anyscale Services](#deploy-to-production-with-anyscale-services) for more details.\n", + "\n", + "---\n", + "\n", + "## Configure Ray Serve LLM\n", + "\n", + "A large-sized LLM is typically deployed across multiple nodes with multiple GPUs. To fully utilize the hardware, set `pipeline_parallel_size` to the number of nodes and `tensor_parallel_size` to the number of GPUs per node, which distributes the model’s weights evenly.\n", + "\n", + "Ray Serve LLM provides multiple [Python APIs](https://docs.ray.io/en/latest/serve/api/index.html#llm-api) for defining your application. 
Use [`build_openai_app`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.build_openai_app.html#ray.serve.llm.build_openai_app) to build a full application from your [`LLMConfig`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html#ray.serve.llm.LLMConfig) object.\n", + "\n", + "**Optional:** Because Deepseek-R1 is a reasoning model, this tutorial uses vLLM’s built-in reasoning parser to correctly separate its reasoning content from the final response. See [Deploying a reasoning LLM: Parse reasoning outputs](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/reasoning-llm/README.html#parse-reasoning-outputs)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d185d580", + "metadata": {}, + "outputs": [], + "source": [ + "# serve_deepseek_r1.py\n", + "from ray.serve.llm import LLMConfig, build_openai_app\n", + "\n", + "llm_config = LLMConfig(\n", + " model_loading_config=dict(\n", + " model_id=\"my-deepseek-r1\",\n", + " model_source=\"deepseek-ai/DeepSeek-R1\",\n", + " ),\n", + " accelerator_type=\"H100\",\n", + " deployment_config=dict(\n", + " autoscaling_config=dict(\n", + " min_replicas=1,\n", + " max_replicas=1,\n", + " )\n", + " ),\n", + " ### Uncomment if your model is gated and needs your Hugging Face token to access it.\n", + " # runtime_env=dict(env_vars={\"HF_TOKEN\": os.environ.get(\"HF_TOKEN\")}),\n", + " engine_kwargs=dict(\n", + " max_model_len=16384,\n", + " # Split weights among 8 GPUs in the node\n", + " tensor_parallel_size=8,\n", + " pipeline_parallel_size=2,\n", + " reasoning_parser=\"deepseek_r1\", # Optional: separate reasoning content from the final answer\n", + " ),\n", + ")\n", + "\n", + "app = build_openai_app({\"llm_configs\": [llm_config]})\n" + ] + }, + { + "cell_type": "markdown", + "id": "6b2231a5", + "metadata": {}, + "source": [ + "**Note:** Before moving to a production setup, migrate to a [Serve config 
file](https://docs.ray.io/en/latest/serve/production-guide/config.html) to make your deployment version-controlled, reproducible, and easier to maintain for CI/CD pipelines. See [Serving LLMs - Quickstart Examples: Production Guide](https://docs.ray.io/en/latest/serve/llm/quick-start.html#production-deployment) for an example.\n", + "\n", + "---\n", + "\n", + "## Deploy locally\n", + "\n", + "**Prerequisites**\n", + "\n", + "* Access to GPU compute.\n", + "* (Optional) A **Hugging Face token** if using gated models. Store it in `export HF_TOKEN=`.\n", + "\n", + "**Note:** Depending on the organization, you can usually request access on the model's Hugging Face page. For example, Meta’s Llama models approval can take anywhere from a few hours to several weeks.\n", + "\n", + "**Dependencies:** \n", + "```bash\n", + "pip install \"ray[serve,llm]\"\n", + "```\n", + "\n", + "**Beware**: this is an expensive deployment.\n", + "\n", + "---\n", + "\n", + "### Launch\n", + "\n", + "Follow the instructions at [Configure Ray Serve LLM](#configure-ray-serve-llm) to define your app in a Python module `serve_deepseek_r1.py`. \n", + "\n", + "In a terminal, run: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae9da12c", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "serve run serve_deepseek_r1:app --non-blocking" + ] + }, + { + "cell_type": "markdown", + "id": "96d18e22", + "metadata": {}, + "source": [ + "Deployment typically takes a few minutes as the cluster is provisioned, the vLLM server starts, and the model is downloaded. 
\n", + "\n", + "---\n", + "\n", + "### Send requests\n", + "\n", + "Your endpoint is available locally at `http://localhost:8000` and you can use a placeholder authentication token for the OpenAI client, for example `\"FAKE_KEY\"`.\n", + "\n", + "Example curl:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1dd345c", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "curl -X POST http://localhost:8000/v1/chat/completions \\\n", + " -H \"Authorization: Bearer FAKE_KEY\" \\\n", + " -H \"Content-Type: application/json\" \\\n", + " -d '{ \"model\": \"my-deepseek-r1\", \"messages\": [{\"role\": \"user\", \"content\": \"What is 2 + 2?\"}] }'" + ] + }, + { + "cell_type": "markdown", + "id": "dca5e4fd", + "metadata": {}, + "source": [ + "Example Python:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "584f01f7", + "metadata": {}, + "outputs": [], + "source": [ + "#client.py\n", + "from urllib.parse import urljoin\n", + "from openai import OpenAI\n", + "\n", + "API_KEY = \"FAKE_KEY\"\n", + "BASE_URL = \"http://localhost:8000\"\n", + "\n", + "client = OpenAI(base_url=urljoin(BASE_URL, \"v1\"), api_key=API_KEY)\n", + "\n", + "response = client.chat.completions.create(\n", + " model=\"my-deepseek-r1\",\n", + " messages=[{\"role\": \"user\", \"content\": \"Tell me a joke\"}],\n", + " stream=True,\n", + ")\n", + "\n", + "# Stream and print JSON\n", + "for chunk in response:\n", + " # Stream reasoning content first\n", + " if hasattr(chunk.choices[0].delta, \"reasoning_content\"):\n", + " data_reasoning = chunk.choices[0].delta.reasoning_content\n", + " if data_reasoning:\n", + " print(data_reasoning, end=\"\", flush=True)\n", + " # Later, stream the final answer\n", + " if hasattr(chunk.choices[0].delta, \"content\"):\n", + " data_content = chunk.choices[0].delta.content\n", + " if data_content:\n", + " print(data_content, end=\"\", flush=True)" + ] + }, + { + "cell_type": "markdown", + "id": "1a5fd1fb", + 
"metadata": {}, + "source": [ + "\n", + "---\n", + "\n", + "### Shutdown\n", + "\n", + "Shutdown your LLM service: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c03cdb9", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "serve shutdown -y" + ] + }, + { + "cell_type": "markdown", + "id": "dc223463", + "metadata": {}, + "source": [ + "\n", + "---\n", + "\n", + "## Deploy to production with Anyscale services\n", + "\n", + "For production deployment, use Anyscale services to deploy the Ray Serve app to a dedicated cluster without modifying the code. Anyscale provides scalability, fault tolerance, and load balancing, keeping the service resilient against node failures, high traffic, and rolling updates, while also automating multi-node setup and autoscaling for large models like DeepSeek-R1.\n", + "\n", + "**Beware**: this is an expensive deployment. At the time of writing, the deployment cost is around \\$110 USD per hour in the `us-west-2` AWS region using on-demand instances. Because this node has a high amount of inter-node traffic, and cross-zone traffic is expensive (around \\$0.02 per GB), it's recommended to *disable cross-zone autoscaling*. This demo is pre-configured with cross-zone autoscaling disabled for your convenience.\n", + "\n", + "### Prerequisites\n", + "\n", + "The following template runs only on H100 GPUs in your self-hosted Anyscale cloud, as H100s aren't available in Anyscale’s public cloud. This example uses two nodes of type *8xH100-80 GB:208CPU-1830 GB* on an AWS cloud.\n", + "\n", + "To provision nodes with 1000 GB of disk capacity, see [Changing the default disk size for GCP clusters](https://docs.anyscale.com/configuration/compute/gcp#disk-size) for Google Cloud Platform (GCP) or [Changing the default disk size for AWS clusters](https://docs.anyscale.com/configuration/compute/aws#disk-size) for Amazon Web Services (AWS). 
\n", + "\n", + "---\n", + "\n", + "### Launch the service\n", + "\n", + "Anyscale provides out-of-the-box images (`anyscale/ray-llm`), which come pre-loaded with Ray Serve LLM, vLLM, and all required GPU/runtime dependencies. This makes it easy to get started without building a custom image.\n", + "\n", + "Create your Anyscale service configuration in a new `service.yaml` file:\n", + "```yaml\n", + "#service.yaml\n", + "name: deploy-deepseek-r1\n", + "image_uri: anyscale/ray-llm:2.49.0-py311-cu128 # Anyscale Ray Serve LLM image. Use `containerfile: ./Dockerfile` to use a custom Dockerfile.\n", + "compute_config:\n", + " auto_select_worker_config: true \n", + " # Change default disk size to 1000GB\n", + " advanced_instance_config:\n", + " ## AWS ##\n", + " BlockDeviceMappings:\n", + " - Ebs:\n", + " - VolumeSize: 1000\n", + " VolumeType: gp3\n", + " DeleteOnTermination: true\n", + " DeviceName: \"/dev/sda1\"\n", + " #########\n", + " ## GCP ##\n", + " #instanceProperties:\n", + " # disks:\n", + " # - boot: true\n", + " # auto_delete: true\n", + " # initialize_params:\n", + " # - disk_size_gb: 1000\n", + " #########\n", + " \n", + "working_dir: .\n", + "cloud:\n", + "applications:\n", + "# Point to your app in your Python module\n", + "- import_path: serve_deepseek_r1:app\n", + "```\n", + "\n", + "Deploy your service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa1c6108", + "metadata": { + "pygments_lexer": "bash" + }, + "outputs": [], + "source": [ + "%%bash\n", + "anyscale service deploy -f service.yaml" + ] + }, + { + "cell_type": "markdown", + "id": "18226fd7", + "metadata": {}, + "source": [ + "**Note:** If your model is gated, make sure to pass your Hugging Face token to the service with `--env HF_TOKEN=`\n", + "\n", + "**Custom Dockerfile** \n", + "You can customize the container by building your own Dockerfile. 
In your Anyscale Service config, reference the Dockerfile with `containerfile` (instead of `image_uri`):\n", + "\n", + "```yaml\n", + "# service.yaml\n", + "# Replace:\n", + "# image_uri: anyscale/ray-llm:2.49.0-py311-cu128\n", + "\n", + "# with:\n", + "containerfile: ./Dockerfile\n", + "```\n", + "\n", + "See the [Anyscale base images](https://docs.anyscale.com/reference/base-images) for details on what each image includes.\n", + "\n", + "---\n", + "\n", + "### Send requests \n", + "\n", + "The `anyscale service deploy` command output shows both the endpoint and authentication token:\n", + "```console\n", + "(anyscale +3.9s) curl -H \"Authorization: Bearer \" \n", + "```\n", + "You can also retrieve both from the service page in the Anyscale console. Click the **Query** button at the top. See [Send requests](#send-requests) for example requests, but make sure to use the correct endpoint and authentication token. \n", + "\n", + "---\n", + "\n", + "### Access the Serve LLM dashboard\n", + "\n", + "See [Enable LLM monitoring](#enable-llm-monitoring) for instructions on enabling LLM-specific logging. To open the Ray Serve LLM dashboard from an Anyscale service:\n", + "1. In the Anyscale console, go to your **Service** or **Workspace**\n", + "2. Navigate to the **Metrics** tab\n", + "3. 
Click **View in Grafana** and click **Serve LLM Dashboard**\n", + "\n", + "---\n", + "\n", + "### Shutdown \n", + " \n", + "Shutdown your Anyscale service:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "211d5baf", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "anyscale service terminate -n deploy-deepseek-r1" + ] + }, + { + "cell_type": "markdown", + "id": "1d8fba49", + "metadata": {}, + "source": [ + "\n", + "---\n", + "\n", + "## Enable LLM monitoring\n", + "\n", + "The *Serve LLM dashboard* offers deep visibility into model performance, latency, and system behavior, including:\n", + "\n", + "* Token throughput (tokens/sec)\n", + "* Latency metrics: Time To First Token (TTFT), Time Per Output Token (TPOT)\n", + "* KV cache utilization\n", + "\n", + "To enable these metrics, go to your LLM config and set `log_engine_metrics: true`. Ensure vLLM V1 is active with `VLLM_USE_V1: \"1\"`. \n", + "**Note:** `VLLM_USE_V1: \"1\"` is the default value with `ray >= 2.48.0` and can be omitted.\n", + "```yaml\n", + "applications:\n", + "- ...\n", + " args:\n", + " llm_configs:\n", + " - ...\n", + " runtime_env:\n", + " env_vars:\n", + " VLLM_USE_V1: \"1\"\n", + " ...\n", + " log_engine_metrics: true\n", + "```\n", + "\n", + "---\n", + "\n", + "## Improve concurrency\n", + "\n", + "Ray Serve LLM uses [vLLM](https://docs.vllm.ai/en/stable/) as its backend engine, which logs the *maximum concurrency* it can support based on your configuration. 
\n", + "\n", + "Example log:\n", + "```console\n", + "INFO 07-30 11:56:04 [kv_cache_utils.py:637] Maximum concurrency for 32,768 tokens per request: 29.06x\n", + "```\n", + "\n", + "The following are a few ways to improve concurrency depending on your model and hardware: \n", + "\n", + "**Reduce `max_model_len`** \n", + "Lowering `max_model_len` reduces the memory needed for KV cache.\n", + "\n", + "**Example**: Running DeepSeek-R1 on 2 nodes with 8xH100-80 GB GPUs each:\n", + "* `max_model_len = 32,768` → concurrency ≈ 29\n", + "* `max_model_len = 16,384` → concurrency ≈ 58\n", + "\n", + "**Use distilled or quantized models** \n", + "Quantizing or distilling your model reduces its memory footprint, freeing up space for more KV cache and enabling more concurrent requests. For example, see [`deepseek-ai/DeepSeek-R1-Distill-Llama-70B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) for a distilled version of DeepSeek-R1.\n", + "\n", + "\n", + "**Upgrade to GPUs with more memory** \n", + "Some GPUs provide significantly more room for KV cache and allow for higher concurrency out of the box.\n", + "\n", + "**Scale with more replicas** \n", + "In addition to tuning per-GPU concurrency, you can scale *horizontally* by increasing the number of replicas in your config. 
\n", + "Each replica runs on its own set of GPUs (`tensor_parallel_size` × `pipeline_parallel_size`, which is 16 GPUs in this tutorial), so raising the replica count increases the total number of concurrent requests your service can handle, especially under sustained or bursty traffic.\n", + "```yaml\n", + "deployment_config:\n", + " autoscaling_config:\n", + " min_replicas: 1\n", + " max_replicas: 4\n", + "```\n", + "\n", + "*For more details on tuning strategies, hardware guidance, and serving configurations, see [Choose a GPU for LLM serving](https://docs.anyscale.com/llm/serving/gpu-guidance) and [Tune parameters for LLMs on Anyscale services](https://docs.anyscale.com/llm/serving/parameter-tuning).*\n", + "\n", + "---\n", + "\n", + "## Troubleshooting\n", + "\n", + "**Hugging Face auth errors** \n", + "Some models, such as Llama-3.1, are gated and require prior authorization from the organization. See your model’s documentation for instructions on obtaining access.\n", + "\n", + "**Out-Of-Memory errors** \n", + "Out‑of‑memory (OOM) errors are one of the most common failure modes when deploying LLMs, especially as model size and context length increase. \n", + "See [Troubleshooting Guide](https://docs.anyscale.com/overview) for common errors and how to fix them.\n", + "\n", + "---\n", + "\n", + "## Summary\n", + "\n", + "In this tutorial, you deployed a large-sized LLM with Ray Serve LLM, from development to production. You learned how to configure Ray Serve LLM, deploy your service on your Ray cluster, and send requests. You also learned how to monitor your app and troubleshoot common issues." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "repo_ray_docs", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/serve_deepseek_r1.py b/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/serve_deepseek_r1.py new file mode 100644 index 000000000000..4f95a2e6d8aa --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/serve_deepseek_r1.py @@ -0,0 +1,27 @@ +# serve_deepseek_r1.py +from ray.serve.llm import LLMConfig, build_openai_app + +llm_config = LLMConfig( + model_loading_config=dict( + model_id="my-deepseek-r1", + model_source="deepseek-ai/DeepSeek-R1", + ), + accelerator_type="H100", + deployment_config=dict( + autoscaling_config=dict( + min_replicas=1, + max_replicas=1, + ) + ), + ### Uncomment if your model is gated and needs your Hugging Face token to access it. + # runtime_env=dict(env_vars={"HF_TOKEN": os.environ.get("HF_TOKEN")}), + engine_kwargs=dict( + max_model_len=16384, + # Split weights among 8 GPUs in the node + tensor_parallel_size=8, + pipeline_parallel_size=2, + reasoning_parser="deepseek_r1", # Optional: separate reasoning content from the final answer + ), +) + +app = build_openai_app({"llm_configs": [llm_config]}) diff --git a/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/service.yaml b/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/service.yaml new file mode 100644 index 000000000000..9fb4e4e7130b --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/large-size-llm/service.yaml @@ -0,0 +1,29 @@ +#service.yaml +name: deploy-deepseek-r1 +image_uri: anyscale/ray-llm:2.49.0-py311-cu128 # Anyscale Ray Serve LLM image. Use `containerfile: ./Dockerfile` to use a custom Dockerfile. 
+compute_config: + auto_select_worker_config: true + # Change default disk size to 1000GB + advanced_instance_config: + ## AWS ## + BlockDeviceMappings: + - Ebs: + - VolumeSize: 1000 + VolumeType: gp3 + DeleteOnTermination: true + DeviceName: "/dev/sda1" + ######### + ## GCP ## + #instanceProperties: + # disks: + # - boot: true + # auto_delete: true + # initialize_params: + # - disk_size_gb: 1000 + ######### + +working_dir: . +cloud: +applications: + # Point to your app in your Python module + - import_path: serve_deepseek_r1:app \ No newline at end of file diff --git a/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/Dockerfile b/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/Dockerfile new file mode 100644 index 000000000000..a2412390df61 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/Dockerfile @@ -0,0 +1,8 @@ +FROM anyscale/ray:2.49.0-slim-py312-cu128 + +# C compiler for Triton’s runtime build step (vLLM V1 engine) +# https://github.com/vllm-project/vllm/issues/2997 +RUN sudo apt-get update && \ + sudo apt-get install -y --no-install-recommends build-essential + +RUN pip install vllm==0.10.0 \ No newline at end of file diff --git a/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/README.md b/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/README.md new file mode 100644 index 000000000000..271080a43e9b --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/README.md @@ -0,0 +1,313 @@ +--- +orphan: true +--- + + + +# Deploy a medium-sized LLM + +A medium LLM typically runs on a single node with 4-8 GPUs. It offers a balance between performance and efficiency. These models provide stronger accuracy and reasoning than small models while remaining more affordable and resource-friendly than very large ones. This makes them a solid choice for production workloads that need good quality at lower cost. 
They're also ideal for scaling applications where large models would be too slow or expensive. + +This tutorial deploys a medium-sized LLM using Ray Serve LLM. For smaller models, see [Deploy a small-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/small-size-llm/README.html), and for larger models, see [Deploy a large-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/large-size-llm/README.html). + +--- + +## Configure Ray Serve LLM + +You can deploy a medium-sized LLM on a single node with multiple GPUs. To leverage all available GPUs, set `tensor_parallel_size` to the number of GPUs on the node, which distributes the model’s weights evenly across them. + +Ray Serve LLM provides multiple [Python APIs](https://docs.ray.io/en/latest/serve/api/index.html#llm-api) for defining your application. Use [`build_openai_app`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.build_openai_app.html#ray.serve.llm.build_openai_app) to build a full application from your [`LLMConfig`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html#ray.serve.llm.LLMConfig) object. + +Set your Hugging Face token in the config file to access gated models like `Llama-3.1`. + + +```python +# serve_llama_3_1_70b.py +from ray.serve.llm import LLMConfig, build_openai_app +import os + +llm_config = LLMConfig( + model_loading_config=dict( + model_id="my-llama-3.1-70b", + # Or unsloth/Meta-Llama-3.1-70B-Instruct for an ungated model + model_source="meta-llama/Llama-3.1-70B-Instruct", + ), + accelerator_type="A100-40G", + deployment_config=dict( + autoscaling_config=dict( + min_replicas=1, + max_replicas=4, + ) + ), + ### If your model is not gated, you can skip `HF_TOKEN` + # Share your Hugging Face token with the vllm engine so it can access the gated Llama 3. 
+ # Type `export HF_TOKEN=<YOUR-HUGGINGFACE-TOKEN>` in a terminal + runtime_env=dict(env_vars={"HF_TOKEN": os.environ.get("HF_TOKEN")}), + engine_kwargs=dict( + max_model_len=32768, + # Split weights among 8 GPUs in the node + tensor_parallel_size=8, + ), +) + +app = build_openai_app({"llm_configs": [llm_config]}) + +``` + +**Note:** Before moving to a production setup, migrate to a [Serve config file](https://docs.ray.io/en/latest/serve/production-guide/config.html) to make your deployment version-controlled, reproducible, and easier to maintain for CI/CD pipelines. See [Serving LLMs - Quickstart Examples: Production Guide](https://docs.ray.io/en/latest/serve/llm/quick-start.html#production-deployment) for an example. + +--- + +## Deploy locally + +**Prerequisites** + +* Access to GPU compute. +* (Optional) A **Hugging Face token** if using gated models like Meta’s Llama. Store it in `export HF_TOKEN=<YOUR-HUGGINGFACE-TOKEN>`. + +**Note:** Depending on the organization, you can usually request access on the model's Hugging Face page. For example, approval for Meta’s Llama models can take anywhere from a few hours to several weeks. + +**Dependencies:** +```bash +pip install "ray[serve,llm]" +``` + +--- + +### Launch + +Follow the instructions at [Configure Ray Serve LLM](#configure-ray-serve-llm) to define your app in a Python module `serve_llama_3_1_70b.py`. + +In a terminal, run: + + +```bash +%%bash +export HF_TOKEN=<YOUR-HUGGINGFACE-TOKEN> +serve run serve_llama_3_1_70b:app --non-blocking +``` + +Deployment typically takes a few minutes as the cluster is provisioned, the vLLM server starts, and the model is downloaded. + +--- + +### Send requests + +Your endpoint is available locally at `http://localhost:8000` and you can use a placeholder authentication token for the OpenAI client, for example `"FAKE_KEY"`. 
+ +Example curl: + + +```bash +%%bash +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Authorization: Bearer FAKE_KEY" \ + -H "Content-Type: application/json" \ + -d '{ "model": "my-llama-3.1-70b", "messages": [{"role": "user", "content": "What is 2 + 2?"}] }' +``` + +Example Python: + + +```python +# client.py +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +response = client.chat.completions.create( + model="my-llama-3.1-70b", + messages=[{"role": "user", "content": "Tell me a joke"}], + stream=True +) + +for chunk in response: + content = chunk.choices[0].delta.content + if content: + print(content, end="", flush=True) +``` + + +--- + +### Shutdown + +Shut down your LLM service: + + +```bash +%%bash +serve shutdown -y +``` + + +--- + +## Deploy to production with Anyscale services + +For production deployment, use Anyscale services to deploy the Ray Serve app to a dedicated cluster without modifying the code. Anyscale ensures scalability, fault tolerance, and load balancing, keeping the service resilient against node failures, high traffic, and rolling updates. + +--- + +### Launch the service + +Anyscale provides out-of-the-box images (`anyscale/ray-llm`), which come pre-loaded with Ray Serve LLM, vLLM, and all required GPU/runtime dependencies. This makes it easy to get started without building a custom image. + +Create your Anyscale service configuration in a new `service.yaml` file: +```yaml +# service.yaml +name: deploy-llama-3-70b +image_uri: anyscale/ray-llm:2.49.0-py311-cu128 # Anyscale Ray Serve LLM image. Use `containerfile: ./Dockerfile` to use a custom Dockerfile. +compute_config: + auto_select_worker_config: true +working_dir: . +cloud: +applications: + # Point to your app in your Python module + - import_path: serve_llama_3_1_70b:app +``` + +Deploy your service. 
Make sure you forward your Hugging Face token to the command. + + +```bash +%%bash +anyscale service deploy -f service.yaml --env HF_TOKEN= +``` + +**Custom Dockerfile** +You can customize the container by building your own Dockerfile. In your Anyscale Service config, reference the Dockerfile with `containerfile` (instead of `image_uri`): + +```yaml +# service.yaml +# Replace: +# image_uri: anyscale/ray-llm:2.49.0-py311-cu128 + +# with: +containerfile: ./Dockerfile +``` + +See the [Anyscale base images](https://docs.anyscale.com/reference/base-images) for details on what each image includes. + +--- + +### Send requests + +The `anyscale service deploy` command output shows both the endpoint and authentication token: +```console +(anyscale +3.9s) curl -H "Authorization: Bearer " +``` +You can also retrieve both from the service page in the Anyscale console. Click the **Query** button at the top. See [Send requests](#send-requests) for example requests, but make sure to use the correct endpoint and authentication token. + +--- + +### Access the Serve LLM dashboard + +See [Enable LLM monitoring](#enable-llm-monitoring) for instructions on enabling LLM-specific logging. To open the Ray Serve LLM dashboard from an Anyscale service: +1. In the Anyscale console, go to your **Service** or **Workspace** +2. Navigate to the **Metrics** tab +3. Click **View in Grafana** and click **Serve LLM Dashboard** + +--- + +### Shutdown + +Shutdown your Anyscale service: + + +```bash +%%bash +anyscale service terminate -n deploy-llama-3-70b +``` + + +--- + +## Enable LLM monitoring + +The *Serve LLM Dashboard* offers deep visibility into model performance, latency, and system behavior, including: + +* Token throughput (tokens/sec). +* Latency metrics: Time To First Token (TTFT), Time Per Output Token (TPOT). +* KV cache utilization. + +To enable these metrics, go to your LLM config and set `log_engine_metrics: true`. Ensure vLLM V1 is active with `VLLM_USE_V1: "1"`. 
+**Note:** `VLLM_USE_V1: "1"` is the default value with `ray >= 2.48.0` and can be omitted. +```yaml +applications: +- ... + args: + llm_configs: + - ... + runtime_env: + env_vars: + VLLM_USE_V1: "1" + ... + log_engine_metrics: true +``` + +--- + +## Improve concurrency + +Ray Serve LLM uses [vLLM](https://docs.vllm.ai/en/latest/) as its backend engine, which logs the *maximum concurrency* it can support based on your configuration. + +Example log: +```console +INFO 08-19 20:57:37 [kv_cache_utils.py:837] Maximum concurrency for 32,768 tokens per request: 13.02x +``` + +The following are a few ways to improve concurrency depending on your model and hardware: + +**Reduce `max_model_len`** +Lowering `max_model_len` reduces the memory needed for KV cache. + +**Example:** Running Llama-3.1-70B on 8 A100-40G GPUs: +* `max_model_len = 32,768` → concurrency ≈ 13 +* `max_model_len = 16,384` → concurrency ≈ 26 + +**Use quantized models** +Quantizing your model (for example, to FP8) reduces the model's memory footprint, freeing up memory for more KV cache and enabling more concurrent requests. + +**Use pipeline parallelism** +If a single node isn't enough to handle your workload, consider distributing the model's layers across multiple nodes with `pipeline_parallel_size > 1`. + +**Upgrade to GPUs with more memory** +Some GPUs provide significantly more room for KV cache and allow for higher concurrency out of the box. + +**Scale with more replicas** +In addition to tuning per-GPU concurrency, you can scale *horizontally* by increasing the number of replicas in your config. +Each replica runs on its own set of GPUs, so raising the replica count increases the total number of concurrent requests your service can handle, especially under sustained or bursty traffic. 
+```yaml +deployment_config: + autoscaling_config: + min_replicas: 1 + max_replicas: 4 +``` + +*For more details on tuning strategies, hardware guidance, and serving configurations, see [Choose a GPU for LLM serving](https://docs.anyscale.com/llm/serving/gpu-guidance) and [Tune parameters for LLMs on Anyscale services](https://docs.anyscale.com/llm/serving/parameter-tuning).* + +--- + +## Troubleshooting + +**Hugging Face auth errors** +Some models, such as Llama-3.1, are gated and require prior authorization from the organization. See your model’s documentation for instructions on obtaining access. + +**Out-of-memory errors** +Out-of-memory (OOM) errors are one of the most common failure modes when deploying LLMs, especially as model sizes and context length increase. +See this [Troubleshooting Guide](https://docs.anyscale.com/overview) for common errors and how to fix them. + +--- + +## Summary + +In this tutorial, you deployed a medium-sized LLM with Ray Serve LLM, from development to production. You learned how to configure Ray Serve LLM, deploy your service on your Ray cluster, and send requests. You also learned how to monitor your app and troubleshoot common issues. 
diff --git a/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/client.py b/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/client.py new file mode 100644 index 000000000000..6715ef57b451 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/client.py @@ -0,0 +1,19 @@ +# client.py +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +response = client.chat.completions.create( + model="my-llama-3.1-70b", + messages=[{"role": "user", "content": "Tell me a joke"}], + stream=True, +) + +for chunk in response: + content = chunk.choices[0].delta.content + if content: + print(content, end="", flush=True) diff --git a/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/notebook.ipynb b/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/notebook.ipynb new file mode 100644 index 000000000000..b6dd436002b8 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/notebook.ipynb @@ -0,0 +1,405 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f8f6fcbd", + "metadata": {}, + "source": [ + "# Deploy a medium-sized LLM\n", + "\n", + "A medium LLM typically runs on a single node with 4-8 GPUs. It offers a balance between performance and efficiency. These models provide stronger accuracy and reasoning than small models while remaining more affordable and resource-friendly than very large ones. This makes them a solid choice for production workloads that need good quality at lower cost. They're also ideal for scaling applications where large models would be too slow or expensive.\n", + "\n", + "This tutorial deploys a medium-sized LLM using Ray Serve LLM. 
For smaller models, see [Deploy a small-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/small-size-llm/README.html), and for larger models, see [Deploy a large-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/large-size-llm/README.html).\n", + "\n", + "---\n", + "\n", + "## Configure Ray Serve LLM\n", + "\n", + "You can deploy a medium-sized LLM on a single node with multiple GPUs. To leverage all available GPUs, set `tensor_parallel_size` to the number of GPUs on the node, which distributes the model’s weights evenly across them.\n", + "\n", + "Ray Serve LLM provides multiple [Python APIs](https://docs.ray.io/en/latest/serve/api/index.html#llm-api) for defining your application. Use [`build_openai_app`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.build_openai_app.html#ray.serve.llm.build_openai_app) to build a full application from your [`LLMConfig`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html#ray.serve.llm.LLMConfig) object.\n", + "\n", + "Set your Hugging Face token in the config file to access gated models like `Llama-3.1`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d185d580", + "metadata": {}, + "outputs": [], + "source": [ + "# serve_llama_3_1_70b.py\n", + "from ray.serve.llm import LLMConfig, build_openai_app\n", + "import os\n", + "\n", + "llm_config = LLMConfig(\n", + " model_loading_config=dict(\n", + " model_id=\"my-llama-3.1-70b\",\n", + " # Or unsloth/Meta-Llama-3.1-70B-Instruct for an ungated model\n", + " model_source=\"meta-llama/Llama-3.1-70B-Instruct\",\n", + " ),\n", + " accelerator_type=\"A100-40G\",\n", + " deployment_config=dict(\n", + " autoscaling_config=dict(\n", + " min_replicas=1,\n", + " max_replicas=4,\n", + " )\n", + " ),\n", + " ### If your model is not gated, you can skip `HF_TOKEN`\n", + " # Share your Hugging Face token with the vllm engine so it can access the gated Llama 3.\n", + " # Type `export HF_TOKEN=` in a terminal\n", + " runtime_env=dict(env_vars={\"HF_TOKEN\": os.environ.get(\"HF_TOKEN\")}),\n", + " engine_kwargs=dict(\n", + " max_model_len=32768,\n", + " # Split weights among 8 GPUs in the node\n", + " tensor_parallel_size=8,\n", + " ),\n", + ")\n", + "\n", + "app = build_openai_app({\"llm_configs\": [llm_config]})\n" + ] + }, + { + "cell_type": "markdown", + "id": "6b2231a5", + "metadata": {}, + "source": [ + "**Note:** Before moving to a production setup, migrate to using a [Serve config file](https://docs.ray.io/en/latest/serve/production-guide/config.html) to make your deployment version-controlled, reproducible, and easier to maintain for CI/CD pipelines. See [Serving LLMs - Quickstart Examples: Production Guide](https://docs.ray.io/en/latest/serve/llm/quick-start.html#production-deployment) for an example.\n", + "\n", + "---\n", + "\n", + "## Deploy locally\n", + "\n", + "**Prerequisites**\n", + "\n", + "* Access to GPU compute.\n", + "* (Optional) A **Hugging Face token** if using gated models like Meta’s Llama. 
Store it in `export HF_TOKEN=<YOUR-HUGGINGFACE-TOKEN>`.\n", + "\n", + "**Note:** Depending on the organization, you can usually request access on the model's Hugging Face page. For example, approval for Meta’s Llama models can take anywhere from a few hours to several weeks.\n", + "\n", + "**Dependencies:** \n", + "```bash\n", + "pip install \"ray[serve,llm]\"\n", + "```\n", + "\n", + "---\n", + "\n", + "### Launch\n", + "\n", + "Follow the instructions at [Configure Ray Serve LLM](#configure-ray-serve-llm) to define your app in a Python module `serve_llama_3_1_70b.py`. \n", + "\n", + "In a terminal, run: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae9da12c", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "export HF_TOKEN=<YOUR-HUGGINGFACE-TOKEN>\n", + "serve run serve_llama_3_1_70b:app --non-blocking" + ] + }, + { + "cell_type": "markdown", + "id": "96d18e22", + "metadata": {}, + "source": [ + "Deployment typically takes a few minutes as the cluster is provisioned, the vLLM server starts, and the model is downloaded. 
\n", + "\n", + "---\n", + "\n", + "### Send requests\n", + "\n", + "Your endpoint is available locally at `http://localhost:8000` and you can use a placeholder authentication token for the OpenAI client, for example `\"FAKE_KEY\"`.\n", + "\n", + "Example curl:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1dd345c", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "curl -X POST http://localhost:8000/v1/chat/completions \\\n", + " -H \"Authorization: Bearer FAKE_KEY\" \\\n", + " -H \"Content-Type: application/json\" \\\n", + " -d '{ \"model\": \"my-llama-3.1-70b\", \"messages\": [{\"role\": \"user\", \"content\": \"What is 2 + 2?\"}] }'" + ] + }, + { + "cell_type": "markdown", + "id": "dca5e4fd", + "metadata": {}, + "source": [ + "Example Python:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "584f01f7", + "metadata": {}, + "outputs": [], + "source": [ + "# client.py\n", + "from urllib.parse import urljoin\n", + "from openai import OpenAI\n", + "\n", + "API_KEY = \"FAKE_KEY\"\n", + "BASE_URL = \"http://localhost:8000\"\n", + "\n", + "client = OpenAI(base_url=urljoin(BASE_URL, \"v1\"), api_key=API_KEY)\n", + "\n", + "response = client.chat.completions.create(\n", + " model=\"my-llama-3.1-70b\",\n", + " messages=[{\"role\": \"user\", \"content\": \"Tell me a joke\"}],\n", + " stream=True\n", + ")\n", + "\n", + "for chunk in response:\n", + " content = chunk.choices[0].delta.content\n", + " if content:\n", + " print(content, end=\"\", flush=True)" + ] + }, + { + "cell_type": "markdown", + "id": "1a5fd1fb", + "metadata": {}, + "source": [ + "\n", + "---\n", + "\n", + "### Shutdown\n", + "\n", + "Shut down your LLM service: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c03cdb9", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "serve shutdown -y" + ] + }, + { + "cell_type": "markdown", + "id": "dc223463", + "metadata": {}, + "source": [ + "\n", + "---\n", +
"\n", + "## Deploy to production with Anyscale services\n", + "\n", + "For production deployment, use Anyscale services to deploy the Ray Serve app to a dedicated cluster without modifying the code. Anyscale ensures scalability, fault tolerance, and load balancing, keeping the service resilient against node failures, high traffic, and rolling updates.\n", + "\n", + "---\n", + "\n", + "### Launch the service\n", + "\n", + "Anyscale provides out-of-the-box images (`anyscale/ray-llm`), which come pre-loaded with Ray Serve LLM, vLLM, and all required GPU/runtime dependencies. This makes it easy to get started without building a custom image.\n", + "\n", + "Create your Anyscale service configuration in a new `service.yaml` file:\n", + "```yaml\n", + "# service.yaml\n", + "name: deploy-llama-3-70b\n", + "image_uri: anyscale/ray-llm:2.49.0-py311-cu128 # Anyscale Ray Serve LLM image. Use `containerfile: ./Dockerfile` to use a custom Dockerfile.\n", + "compute_config:\n", + " auto_select_worker_config: true \n", + "working_dir: .\n", + "cloud:\n", + "applications:\n", + " # Point to your app in your Python module\n", + " - import_path: serve_llama_3_1_70b:app\n", + "```\n", + "\n", + "Deploy your service. Make sure you forward your Hugging Face token to the command." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa1c6108", + "metadata": { + "pygments_lexer": "bash" + }, + "outputs": [], + "source": [ + "%%bash\n", + "anyscale service deploy -f service.yaml --env HF_TOKEN=" + ] + }, + { + "cell_type": "markdown", + "id": "18226fd7", + "metadata": {}, + "source": [ + "**Custom Dockerfile** \n", + "You can customize the container by building your own Dockerfile. 
In your Anyscale Service config, reference the Dockerfile with `containerfile` (instead of `image_uri`):\n", + "\n", + "```yaml\n", + "# service.yaml\n", + "# Replace:\n", + "# image_uri: anyscale/ray-llm:2.49.0-py311-cu128\n", + "\n", + "# with:\n", + "containerfile: ./Dockerfile\n", + "```\n", + "\n", + "See the [Anyscale base images](https://docs.anyscale.com/reference/base-images) for details on what each image includes.\n", + "\n", + "---\n", + "\n", + "### Send requests \n", + "\n", + "The `anyscale service deploy` command output shows both the endpoint and authentication token:\n", + "```console\n", + "(anyscale +3.9s) curl -H \"Authorization: Bearer \" \n", + "```\n", + "You can also retrieve both from the service page in the Anyscale console. Click the **Query** button at the top. See [Send requests](#send-requests) for example requests, but make sure to use the correct endpoint and authentication token. \n", + "\n", + "---\n", + "\n", + "### Access the Serve LLM dashboard\n", + "\n", + "See [Enable LLM monitoring](#enable-llm-monitoring) for instructions on enabling LLM-specific logging. To open the Ray Serve LLM dashboard from an Anyscale service:\n", + "1. In the Anyscale console, go to your **Service** or **Workspace**\n", + "2. Navigate to the **Metrics** tab\n", + "3. 
Click **View in Grafana** and click **Serve LLM Dashboard**\n", + "\n", + "---\n", + "\n", + "### Shutdown \n", + " \n", + "Shutdown your Anyscale service:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "211d5baf", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "anyscale service terminate -n deploy-llama-3-70b" + ] + }, + { + "cell_type": "markdown", + "id": "1d8fba49", + "metadata": {}, + "source": [ + "\n", + "---\n", + "\n", + "## Enable LLM monitoring\n", + "\n", + "The *Serve LLM Dashboard* offers deep visibility into model performance, latency, and system behavior, including:\n", + "\n", + "* Token throughput (tokens/sec).\n", + "* Latency metrics: Time To First Token (TTFT), Time Per Output Token (TPOT).\n", + "* KV cache utilization.\n", + "\n", + "To enable these metrics, go to your LLM config and set `log_engine_metrics: true`. Ensure vLLM V1 is active with `VLLM_USE_V1: \"1\"`. \n", + "**Note:** `VLLM_USE_V1: \"1\"` is the default value with `ray >= 2.48.0` and can be omitted.\n", + "```yaml\n", + "applications:\n", + "- ...\n", + " args:\n", + " llm_configs:\n", + " - ...\n", + " runtime_env:\n", + " env_vars:\n", + " VLLM_USE_V1: \"1\"\n", + " ...\n", + " log_engine_metrics: true\n", + "```\n", + "\n", + "---\n", + "\n", + "## Improve concurrency\n", + "\n", + "Ray Serve LLM uses [vLLM](https://docs.vllm.ai/en/latest/) as its backend engine, which logs the *maximum concurrency* it can support based on your configuration. 
\n", + "\n", + "Example log:\n", + "```console\n", + "INFO 08-19 20:57:37 [kv_cache_utils.py:837] Maximum concurrency for 32,768 tokens per request: 13.02x\n", + "```\n", + "\n", + "The following are a few ways to improve concurrency depending on your model and hardware: \n", + "\n", + "**Reduce `max_model_len`** \n", + "Lowering `max_model_len` reduces the memory needed for KV cache.\n", + "\n", + "**Example:** Running Llama-3.1-70B on 8 A100-40G GPUs:\n", + "* `max_model_len = 32,768` → concurrency ≈ 13\n", + "* `max_model_len = 16,384` → concurrency ≈ 26\n", + "\n", + "**Use quantized models** \n", + "Quantizing your model (for example, to FP8) reduces the model's memory footprint, freeing up memory for more KV cache and enabling more concurrent requests.\n", + "\n", + "**Use pipeline parallelism** \n", + "If a single node isn't enough to handle your workload, consider distributing the model's layers across multiple nodes with `pipeline_parallel_size > 1`.\n", + "\n", + "**Upgrade to GPUs with more memory** \n", + "Some GPUs provide significantly more room for KV cache and allow for higher concurrency out of the box.\n", + "\n", + "**Scale with more replicas** \n", + "In addition to tuning per-GPU concurrency, you can scale *horizontally* by increasing the number of replicas in your config. 
\n", + "Each replica runs on its own GPU, so raising the replica count increases the total number of concurrent requests your service can handle, especially under sustained or bursty traffic.\n", + "```yaml\n", + "deployment_config:\n", + " autoscaling_config:\n", + " min_replicas: 1\n", + " max_replicas: 4\n", + "```\n", + "\n", + "*For more details on tuning strategies, hardware guidance, and serving configurations, see [Choose a GPU for LLM serving](https://docs.anyscale.com/llm/serving/gpu-guidance) and [Tune parameters for LLMs on Anyscale services](https://docs.anyscale.com/llm/serving/parameter-tuning).*\n", + "\n", + "---\n", + "\n", + "## Troubleshooting\n", + "\n", + "**Hugging Face auth errors** \n", + "Some models, such as Llama-3.1, are gated and require prior authorization from the organization. See your model’s documentation for instructions on obtaining access.\n", + "\n", + "**Out-of-memory errors** \n", + "Out-of-memory (OOM) errors are one of the most common failure modes when deploying LLMs, especially as model sizes and context length increase. \n", + "See this [Troubleshooting Guide](https://docs.anyscale.com/overview) for common errors and how to fix them.\n", + "\n", + "---\n", + "\n", + "## Summary\n", + "\n", + "In this tutorial, you deployed a medium-sized LLM with Ray Serve LLM, from development to production. You learned how to configure Ray Serve LLM, deploy your service on your Ray cluster, and send requests. You also learned how to monitor your app and troubleshoot common issues." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "repo_ray_docs", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/serve_llama_3_1_70b.py b/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/serve_llama_3_1_70b.py new file mode 100644 index 000000000000..9e62adffb19a --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/serve_llama_3_1_70b.py @@ -0,0 +1,29 @@ +# serve_llama_3_1_70b.py +from ray.serve.llm import LLMConfig, build_openai_app +import os + +llm_config = LLMConfig( + model_loading_config=dict( + model_id="my-llama-3.1-70b", + # Or unsloth/Meta-Llama-3.1-70B-Instruct for an ungated model + model_source="meta-llama/Llama-3.1-70B-Instruct", + ), + accelerator_type="A100-40G", + deployment_config=dict( + autoscaling_config=dict( + min_replicas=1, + max_replicas=4, + ) + ), + ### If your model is not gated, you can skip `HF_TOKEN` + # Share your Hugging Face token with the vllm engine so it can access the gated Llama 3. + # Type `export HF_TOKEN=` in a terminal + runtime_env=dict(env_vars={"HF_TOKEN": os.environ.get("HF_TOKEN")}), + engine_kwargs=dict( + max_model_len=32768, + # Split weights among 8 GPUs in the node + tensor_parallel_size=8, + ), +) + +app = build_openai_app({"llm_configs": [llm_config]}) diff --git a/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/service.yaml b/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/service.yaml new file mode 100644 index 000000000000..35388c72f961 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/medium-size-llm/service.yaml @@ -0,0 +1,10 @@ +# service.yaml +name: deploy-llama-3-70b +image_uri: anyscale/ray-llm:2.49.0-py311-cu128 # Anyscale Ray Serve LLM image. 
Use `containerfile: ./Dockerfile` to use a custom Dockerfile. +compute_config: + auto_select_worker_config: true +working_dir: . +cloud: +applications: + # Point to your app in your Python module + - import_path: serve_llama_3_1_70b:app \ No newline at end of file diff --git a/doc/source/serve/tutorials/deployment-serve-llm/reasoning-llm/README.md b/doc/source/serve/tutorials/deployment-serve-llm/reasoning-llm/README.md new file mode 100644 index 000000000000..ef7a943d6410 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/reasoning-llm/README.md @@ -0,0 +1,266 @@ +--- +orphan: true +--- + + + +# Deploy a reasoning LLM + +A reasoning LLM handles tasks that require deeper analysis or step-by-step thought. It generates intermediate reasoning before arriving at a final answer, making it better suited for situations where careful logic or structured problem-solving is more important than speed or efficiency. + +This tutorial deploys a reasoning LLM using Ray Serve LLM. + +--- + +## Compare reasoning and non-reasoning models + +Reasoning models simulate step-by-step, structured thought processes to solve complex tasks like math, multi-hop QA, or code generation. In contrast, non-reasoning models provide fast, direct responses and focus on fluency or instruction following without explicit intermediate reasoning. The key distinction lies in whether the model attempts to "think through" the problem before answering. + +| **Model type** | **Core behavior** | **Use case examples** | **Limitation** | +| ----------------------- | ------------------------------------ | -------------------------------------------------------- | ----------------------------------------------------- | +| **Reasoning model** | Explicit multi-step thinking process | Math, coding, logic puzzles, multi-hop QA, CoT prompting | Slower response time, more tokens used. 
| +| **Non-reasoning model** | Direct answer generation | Casual queries, short instructions, single-step answers | May struggle with complex reasoning or interpretability. | + +Many reasoning-capable models structure their outputs with special markers such as `<think>` tags, or expose reasoning traces inside dedicated fields like `reasoning_content` in the OpenAI API response. Always check the model's documentation for how to structure and control thinking. + +**Note:** Reasoning LLMs often benefit from long context windows (32K up to 1M+ tokens), high token throughput, low-temperature decoding (greedy sampling), and strong instruction tuning or scratchpad-style reasoning. + +--- + +### Choose when to use reasoning models + +Whether you should use a reasoning model depends on how much information your prompt already provides. + +If your input is clear and complete, a standard model is usually faster and more efficient. If your input is ambiguous or complex, a reasoning model works better because it can work through the problem step by step and fill in gaps through intermediate reasoning. + +--- + +## Parse reasoning outputs + +Reasoning models often separate *reasoning* from the *final answer* using tags like `<think>...</think>`. Without a proper parser, this reasoning may end up in the `content` field instead of the dedicated `reasoning_content` field. + +To extract reasoning correctly, configure a `reasoning_parser` in your Ray Serve deployment. This tells vLLM how to isolate the model’s thought process from the rest of the output. +**Note:** For example, *QwQ* uses the `deepseek_r1` parser. Other models may require different parsers. See the [vLLM docs](https://docs.vllm.ai/en/stable/features/reasoning_outputs.html#supported-models) or your model's documentation to find a supported parser, or [build your own](https://docs.vllm.ai/en/stable/features/reasoning_outputs.html#how-to-support-a-new-reasoning-model) if needed. + +```yaml +applications: +- name: reasoning-llm-app + ... 
+ args: + llm_configs: + - model_loading_config: + model_id: my-qwq-32B + model_source: Qwen/QwQ-32B + ... + engine_kwargs: + ... + reasoning_parser: deepseek_r1 # <-- for QwQ models +``` + +See [Configure Ray Serve LLM](#configure-ray-serve-llm) for a complete example. + +**Example response** +When using a reasoning parser, the response is typically structured like this: + +```python +ChatCompletionMessage( + content="The temperature is...", + ..., + reasoning_content="Okay, the user is asking for the temperature today and tomorrow..." +) +``` +And you can extract the content and reasoning as follows: +```python +response = client.chat.completions.create( + ... +) + +print(f"Content: {response.choices[0].message.content}") +print(f"Reasoning: {response.choices[0].message.reasoning_content}") +``` + +--- + +## Configure Ray Serve LLM + +Set your Hugging Face token in the config file to access gated models. + +Ray Serve LLM provides multiple [Python APIs](https://docs.ray.io/en/latest/serve/api/index.html#llm-api) for defining your application. Use [`build_openai_app`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.build_openai_app.html#ray.serve.llm.build_openai_app) to build a full application from your [`LLMConfig`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html#ray.serve.llm.LLMConfig) object. + +Set `tensor_parallel_size=8` to distribute the model's weights among 8 GPUs in the node. 
+ + +```python +# serve_qwq_32b.py +from ray.serve.llm import LLMConfig, build_openai_app +import os + +llm_config = LLMConfig( + model_loading_config=dict( + model_id="my-qwq-32B", + model_source="Qwen/QwQ-32B", + ), + accelerator_type="A100-40G", + deployment_config=dict( + autoscaling_config=dict( + min_replicas=1, + max_replicas=2, + ) + ), + ### Uncomment if your model is gated and needs your Hugging Face token to access it + # runtime_env=dict(env_vars={"HF_TOKEN": os.environ.get("HF_TOKEN")}), + engine_kwargs=dict( + tensor_parallel_size=8, max_model_len=32768, reasoning_parser="deepseek_r1" + ), +) + +app = build_openai_app({"llm_configs": [llm_config]}) + +``` + +**Note:** Before moving to a production setup, migrate to a [Serve config file](https://docs.ray.io/en/latest/serve/production-guide/config.html) to make your deployment version-controlled, reproducible, and easier to maintain for CI/CD pipelines. See [Serving LLMs - Quickstart Examples: Production Guide](https://docs.ray.io/en/latest/serve/llm/quick-start.html#production-deployment) for an example. + +--- + +## Deploy locally + +**Prerequisites** + +* Access to GPU compute. +* (Optional) A **Hugging Face token** if using gated models. Store it in `export HF_TOKEN=` + +**Note:** Depending on the organization, you can usually request access on the model's Hugging Face page. For example, Meta’s Llama models approval can take anywhere from a few hours to several weeks. + +**Dependencies:** +```bash +pip install "ray[serve,llm]" +``` + +--- + +### Launch the service + +Follow the instructions at [Configure Ray Serve LLM](#configure-ray-serve-llm) to define your app in a Python module `serve_qwq_32b.py`. + +In a terminal, run: + + +```bash +%%bash +serve run serve_qwq_32b:app --non-blocking +``` + +Deployment typically takes a few minutes as the cluster is provisioned, the vLLM server starts, and the model is downloaded. 
+ +--- + +### Send requests + +Your endpoint is available locally at `http://localhost:8000` and you can use a placeholder authentication token for the OpenAI client, for example `"FAKE_KEY"`. + +Example curl: + + +```bash +%%bash +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Authorization: Bearer FAKE_KEY" \ + -H "Content-Type: application/json" \ + -d '{ "model": "my-qwq-32B", "messages": [{"role": "user", "content": "Pick three random words with 3 syllables each and count the number of R'\''s in each of them"}] }' +``` + +Example Python: + + +```python +#client.py +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +response = client.chat.completions.create( + model="my-qwq-32B", + messages=[ + {"role": "user", "content": "What is the sum of all even numbers between 1 and 100?"} + ] +) + +print(f"Reasoning: \n{response.choices[0].message.reasoning_content}\n\n") +print(f"Answer: \n {response.choices[0].message.content}") +``` + +If you configure a valid reasoning parser, the reasoning output should appear in the `reasoning_content` field of the response message. Otherwise, it may be included in the main `content` field, typically wrapped in a pair of `think` tags. See [Parse reasoning outputs](#parse-reasoning-outputs) for more information. + +--- + +### Shutdown + +Shut down your LLM service: + + +```bash +%%bash +serve shutdown -y +``` + + +--- + +## Deploy to production with Anyscale services + +For production, use Anyscale services to deploy your Ray Serve app on a dedicated cluster without code changes. Anyscale provides scalability, fault tolerance, and load balancing, ensuring resilience against node failures, high traffic, and rolling updates.
See [Deploy a medium-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/medium-size-llm/README.html#deploy-to-production-with-anyscale-services) for an example with a medium-sized model like the *QwQ-32B* used here. + +--- + +## Stream reasoning content + +Reasoning models may take longer to begin generating the main content. You can stream their intermediate reasoning output in the same way as the main content. + + +```python +#client_streaming.py +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +# Example: Complex query with thinking process +response = client.chat.completions.create( + model="my-qwq-32B", + messages=[ + {"role": "user", "content": "I need to plan a trip to Paris from Seattle. Can you help me research flight costs, create an itinerary for 3 days, and suggest restaurants based on my dietary restrictions (vegetarian)?"} + ], + stream=True +) + +# Stream +for chunk in response: + # Stream reasoning content + if hasattr(chunk.choices[0].delta, "reasoning_content"): + data_reasoning = chunk.choices[0].delta.reasoning_content + if data_reasoning: + print(data_reasoning, end="", flush=True) + # Later, stream the final answer + if hasattr(chunk.choices[0].delta, "content"): + data_content = chunk.choices[0].delta.content + if data_content: + print(data_content, end="", flush=True) +``` + + +--- + +## Summary + +In this tutorial, you deployed a reasoning LLM with Ray Serve LLM, from development to production. You learned how to configure Ray Serve LLM with the right reasoning parser, deploy your service on your Ray cluster, send requests, and parse reasoning outputs in the response.
diff --git a/doc/source/serve/tutorials/deployment-serve-llm/reasoning-llm/client.py b/doc/source/serve/tutorials/deployment-serve-llm/reasoning-llm/client.py new file mode 100644 index 000000000000..e7ba801365a3 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/reasoning-llm/client.py @@ -0,0 +1,21 @@ +# client.py: send one chat request to the Ray Serve LLM endpoint at BASE_URL and print the reasoning and the answer +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +response = client.chat.completions.create( + model="my-qwq-32B", + messages=[ + { + "role": "user", + "content": "What is the sum of all even numbers between 1 and 100?", + } + ], +) + +print(f"Reasoning: \n{response.choices[0].message.reasoning_content}\n\n") +print(f"Answer: \n {response.choices[0].message.content}") diff --git a/doc/source/serve/tutorials/deployment-serve-llm/reasoning-llm/client_streaming.py b/doc/source/serve/tutorials/deployment-serve-llm/reasoning-llm/client_streaming.py new file mode 100644 index 000000000000..d522e0867603 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/reasoning-llm/client_streaming.py @@ -0,0 +1,33 @@ +# client_streaming.py: stream a chat completion from the Ray Serve LLM endpoint at BASE_URL +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +# Example: Complex query with thinking process +response = client.chat.completions.create( + model="my-qwq-32B", + messages=[ + { + "role": "user", + "content": "I need to plan a trip to Paris from Seattle.
Can you help me research flight costs, create an itinerary for 3 days, and suggest restaurants based on my dietary restrictions (vegetarian)?", + } + ], + stream=True, +) + +# Stream +for chunk in response: + # Stream reasoning content + if hasattr(chunk.choices[0].delta, "reasoning_content"): + data_reasoning = chunk.choices[0].delta.reasoning_content + if data_reasoning: + print(data_reasoning, end="", flush=True) + # Later, stream the final answer + if hasattr(chunk.choices[0].delta, "content"): + data_content = chunk.choices[0].delta.content + if data_content: + print(data_content, end="", flush=True) diff --git a/doc/source/serve/tutorials/deployment-serve-llm/reasoning-llm/notebook.ipynb b/doc/source/serve/tutorials/deployment-serve-llm/reasoning-llm/notebook.ipynb new file mode 100644 index 000000000000..5c29cfb6856f --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/reasoning-llm/notebook.ipynb @@ -0,0 +1,345 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c105c497", + "metadata": {}, + "source": [ + "# Deploy a reasoning LLM\n", + "\n", + "A reasoning LLM handles tasks that require deeper analysis or step-by-step thought. It generates intermediate reasoning before arriving at a final answer, making it better suited for situations where careful logic or structured problem-solving is more important than speed or efficiency.\n", + "\n", + "This tutorial deploys a reasoning LLM using Ray Serve LLM. \n", + "\n", + "---\n", + "\n", + "## Compare reasoning and non-reasoning models\n", + "\n", + "Reasoning models simulate step-by-step, structured thought processes to solve complex tasks like math, multi-hop QA, or code generation. In contrast, non-reasoning models provide fast, direct responses and focus on fluency or instruction following without explicit intermediate reasoning. 
The key distinction lies in whether the model attempts to \"think through\" the problem before answering.\n", + "\n", + "| **Model type** | **Core behavior** | **Use case examples** | **Limitation** |\n", + "| ----------------------- | ------------------------------------ | -------------------------------------------------------- | ----------------------------------------------------- |\n", + "| **Reasoning model** | Explicit multi-step thinking process | Math, coding, logic puzzles, multi-hop QA, CoT prompting | Slower response time, more tokens used. |\n", + "| **Non-reasoning model** | Direct answer generation | Casual queries, short instructions, single-step answers | May struggle with complex reasoning or interpretability. |\n", + "\n", + "Many reasoning-capable models structure their outputs with special markers such as `` tags, or expose reasoning traces inside dedicated fields like `reasoning_content` in the OpenAI API response. Always check the model's documentation for how to structure and control thinking.\n", + "\n", + "**Note:** Reasoning LLMs often benefit from long context windows (32K up to +1M tokens), high token throughput, low-temperature decoding (greedy sampling), and strong instruction tuning or scratchpad-style reasoning.\n", + "\n", + "---\n", + "\n", + "### Choose when to use reasoning models\n", + "\n", + "Whether you should use a reasoning model depends on how much information your prompt already provides.\n", + "\n", + "If your input is clear and complete, a standard model is usually faster and more efficient. If your input is ambiguous or complex, a reasoning model works better because it can work through the problem step by step and fill in gaps through intermediate reasoning.\n", + "\n", + "---\n", + "\n", + "## Parse reasoning outputs\n", + "\n", + "Reasoning models often separate *reasoning* from the *final answer* using tags like `...`. 
Without a proper parser, this reasoning may end up in the `content` field instead of the dedicated `reasoning_content` field.\n", + "\n", + "To extract reasoning correctly, configure a `reasoning_parser` in your Ray Serve deployment. This tells vLLM how to isolate the model’s thought process from the rest of the output. \n", + "**Note:** For example, *QwQ* uses the `deepseek-r1` parser. Other models may require different parsers. See the [vLLM docs](https://docs.vllm.ai/en/stable/features/reasoning_outputs.html#supported-models) or your model's documentation to find a supported parser, or [build your own](https://docs.vllm.ai/en/stable/features/reasoning_outputs.html#how-to-support-a-new-reasoning-model) if needed.\n", + "\n", + "```yaml\n", + "applications:\n", + "- name: reasoning-llm-app\n", + " ...\n", + " args:\n", + " llm_configs:\n", + " - model_loading_config:\n", + " model_id: my-qwq-32B\n", + " model_source: Qwen/QwQ-32B\n", + " ...\n", + " engine_kwargs:\n", + " ...\n", + " reasoning_parser: deepseek_r1 # <-- for QwQ models\n", + "```\n", + "\n", + "See [Configure Ray Serve LLM](#configure-ray-serve-llm) for a complete example.\n", + "\n", + "**Example response** \n", + "When using a reasoning parser, the response is typically structured like this:\n", + "\n", + "```python\n", + "ChatCompletionMessage(\n", + " content=\"The temperature is...\",\n", + " ...,\n", + " reasoning_content=\"Okay, the user is asking for the temperature today and tomorrow...\"\n", + ")\n", + "```\n", + "And you can extract the content and reasoning as follows:\n", + "```python\n", + "response = client.chat.completions.create(\n", + " ...\n", + ")\n", + "\n", + "print(f\"Content: {response.choices[0].message.content}\")\n", + "print(f\"Reasoning: {response.choices[0].message.reasoning_content}\")\n", + "```\n", + "\n", + "---\n", + "\n", + "## Configure Ray Serve LLM\n", + "\n", + "Set your Hugging Face token in the config file to access gated models.\n", + "\n", + "Ray Serve LLM 
provides multiple [Python APIs](https://docs.ray.io/en/latest/serve/api/index.html#llm-api) for defining your application. Use [`build_openai_app`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.build_openai_app.html#ray.serve.llm.build_openai_app) to build a full application from your [`LLMConfig`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html#ray.serve.llm.LLMConfig) object.\n", + "\n", + "Set `tensor_parallel_size=8` to distribute the model's weights among 8 GPUs in the node. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99ae0ed2", + "metadata": {}, + "outputs": [], + "source": [ + "# serve_qwq_32b.py\n", + "from ray.serve.llm import LLMConfig, build_openai_app\n", + "import os\n", + "\n", + "llm_config = LLMConfig(\n", + " model_loading_config=dict(\n", + " model_id=\"my-qwq-32B\",\n", + " model_source=\"Qwen/QwQ-32B\",\n", + " ),\n", + " accelerator_type=\"A100-40G\",\n", + " deployment_config=dict(\n", + " autoscaling_config=dict(\n", + " min_replicas=1,\n", + " max_replicas=2,\n", + " )\n", + " ),\n", + " ### Uncomment if your model is gated and needs your Hugging Face token to access it\n", + " # runtime_env=dict(env_vars={\"HF_TOKEN\": os.environ.get(\"HF_TOKEN\")}),\n", + " engine_kwargs=dict(\n", + " tensor_parallel_size=8, max_model_len=32768, reasoning_parser=\"deepseek_r1\"\n", + " ),\n", + ")\n", + "\n", + "app = build_openai_app({\"llm_configs\": [llm_config]})\n" + ] + }, + { + "cell_type": "markdown", + "id": "d515e268", + "metadata": {}, + "source": [ + "**Note:** Before moving to a production setup, migrate to a [Serve config file](https://docs.ray.io/en/latest/serve/production-guide/config.html) to make your deployment version-controlled, reproducible, and easier to maintain for CI/CD pipelines. 
See [Serving LLMs - Quickstart Examples: Production Guide](https://docs.ray.io/en/latest/serve/llm/quick-start.html#production-deployment) for an example.\n", + "\n", + "---\n", + "\n", + "## Deploy locally\n", + "\n", + "**Prerequisites**\n", + "\n", + "* Access to GPU compute.\n", + "* (Optional) A **Hugging Face token** if using gated models. Store it in `export HF_TOKEN=`\n", + "\n", + "**Note:** Depending on the organization, you can usually request access on the model's Hugging Face page. For example, Meta’s Llama models approval can take anywhere from a few hours to several weeks.\n", + "\n", + "**Dependencies:** \n", + "```bash\n", + "pip install \"ray[serve,llm]\"\n", + "```\n", + "\n", + "---\n", + "\n", + "### Launch the service\n", + "\n", + "Follow the instructions at [Configure Ray Serve LLM](#configure-ray-serve-llm) to define your app in a Python module `serve_qwq_32b.py`. \n", + "\n", + "In a terminal, run: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6d6a307", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "serve run serve_qwq_32b:app --non-blocking" + ] + }, + { + "cell_type": "markdown", + "id": "646f1272", + "metadata": {}, + "source": [ + "Deployment typically takes a few minutes as the cluster is provisioned, the vLLM server starts, and the model is downloaded. 
\n", + "\n", + "---\n", + "\n", + "### Send requests\n", + "\n", + "Your endpoint is available locally at `http://localhost:8000` and you can use a placeholder authentication token for the OpenAI client, for example `\"FAKE_KEY\"`.\n", + "\n", + "Example curl:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56a53387", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "curl -X POST http://localhost:8000/v1/chat/completions \\\n", + " -H \"Authorization: Bearer FAKE_KEY\" \\\n", + " -H \"Content-Type: application/json\" \\\n", + " -d '{ \"model\": \"my-qwq-32B\", \"messages\": [{\"role\": \"user\", \"content\": \"Pick three random words with 3 syllables each and count the number of R'\\''s in each of them\"}] }'" + ] + }, + { + "cell_type": "markdown", + "id": "942e675c", + "metadata": {}, + "source": [ + "Example Python:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5005dde7", + "metadata": {}, + "outputs": [], + "source": [ + "#client.py\n", + "from urllib.parse import urljoin\n", + "from openai import OpenAI\n", + "\n", + "API_KEY = \"FAKE_KEY\"\n", + "BASE_URL = \"http://localhost:8000\"\n", + "\n", + "client = OpenAI(base_url=urljoin(BASE_URL, \"v1\"), api_key=API_KEY)\n", + "\n", + "response = client.chat.completions.create(\n", + " model=\"my-qwq-32B\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": \"What is the sum of all even numbers between 1 and 100?\"}\n", + " ]\n", + ")\n", + "\n", + "print(f\"Reasoning: \\n{response.choices[0].message.reasoning_content}\\n\\n\")\n", + "print(f\"Answer: \\n {response.choices[0].message.content}\")" + ] + }, + { + "cell_type": "markdown", + "id": "5e04db4e", + "metadata": {}, + "source": [ + "If you configure a valid reasoning parser, the reasoning output should appear in the `reasoning_content` field of the response message. Otherwise, it may be included in the main `content` field, typically wrapped in a pair of `think` tags.
See [Parse reasoning outputs](#parse-reasoning-outputs) for more information.\n", + "\n", + "---\n", + "\n", + "### Shutdown\n", + "\n", + "Shut down your LLM service:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac1f3edd", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "serve shutdown -y" + ] + }, + { + "cell_type": "markdown", + "id": "fdc9e8eb", + "metadata": {}, + "source": [ + "\n", + "---\n", + "\n", + "## Deploy to production with Anyscale services\n", + "\n", + "For production, use Anyscale services to deploy your Ray Serve app on a dedicated cluster without code changes. Anyscale provides scalability, fault tolerance, and load balancing, ensuring resilience against node failures, high traffic, and rolling updates. See [Deploy a medium-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/medium-size-llm/README.html#deploy-to-production-with-anyscale-services) for an example with a medium-sized model like the *QwQ-32B* used here.\n", + "\n", + "---\n", + "\n", + "## Stream reasoning content\n", + "\n", + "Reasoning models may take longer to begin generating the main content. You can stream their intermediate reasoning output in the same way as the main content. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02472f7c", + "metadata": {}, + "outputs": [], + "source": [ + "#client_streaming.py\n", + "from urllib.parse import urljoin\n", + "from openai import OpenAI\n", + "\n", + "API_KEY = \"FAKE_KEY\"\n", + "BASE_URL = \"http://localhost:8000\"\n", + "\n", + "client = OpenAI(base_url=urljoin(BASE_URL, \"v1\"), api_key=API_KEY)\n", + "\n", + "# Example: Complex query with thinking process\n", + "response = client.chat.completions.create(\n", + " model=\"my-qwq-32B\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": \"I need to plan a trip to Paris from Seattle.
Can you help me research flight costs, create an itinerary for 3 days, and suggest restaurants based on my dietary restrictions (vegetarian)?\"}\n", + " ],\n", + " stream=True\n", + ")\n", + "\n", + "# Stream\n", + "for chunk in response:\n", + " # Stream reasoning content\n", + " if hasattr(chunk.choices[0].delta, \"reasoning_content\"):\n", + " data_reasoning = chunk.choices[0].delta.reasoning_content\n", + " if data_reasoning:\n", + " print(data_reasoning, end=\"\", flush=True)\n", + " # Later, stream the final answer\n", + " if hasattr(chunk.choices[0].delta, \"content\"):\n", + " data_content = chunk.choices[0].delta.content\n", + " if data_content:\n", + " print(data_content, end=\"\", flush=True)" + ] + }, + { + "cell_type": "markdown", + "id": "70455ea2", + "metadata": {}, + "source": [ + "\n", + "---\n", + "\n", + "## Summary\n", + "\n", + "In this tutorial, you deployed a reasoning LLM with Ray Serve LLM, from development to production. You learned how to configure Ray Serve LLM with the right reasoning parser, deploy your service on your Ray cluster, send requests, and parse reasoning outputs in the response." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "repo_ray_docs", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/source/serve/tutorials/deployment-serve-llm/reasoning-llm/serve_qwq_32b.py b/doc/source/serve/tutorials/deployment-serve-llm/reasoning-llm/serve_qwq_32b.py new file mode 100644 index 000000000000..8c8cda59f8a5 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/reasoning-llm/serve_qwq_32b.py @@ -0,0 +1,24 @@ +# serve_qwq_32b.py: builds the Ray Serve LLM app `app` that serves Qwen/QwQ-32B with the deepseek_r1 reasoning parser +from ray.serve.llm import LLMConfig, build_openai_app +import os + +llm_config = LLMConfig( + model_loading_config=dict( + model_id="my-qwq-32B", + model_source="Qwen/QwQ-32B", + ), + accelerator_type="A100-40G", + deployment_config=dict( + autoscaling_config=dict( + min_replicas=1, + max_replicas=2, + ) + ), + # Uncomment the next line if your model is gated; it passes the HF_TOKEN environment variable into the runtime environment + # runtime_env=dict(env_vars={"HF_TOKEN": os.environ.get("HF_TOKEN")}), + engine_kwargs=dict( + tensor_parallel_size=8, max_model_len=32768, reasoning_parser="deepseek_r1" + ), +) + +app = build_openai_app({"llm_configs": [llm_config]}) diff --git a/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/Dockerfile b/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/Dockerfile new file mode 100644 index 000000000000..a2412390df61 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/Dockerfile @@ -0,0 +1,8 @@ +FROM anyscale/ray:2.49.0-slim-py312-cu128 + +# C compiler for Triton’s runtime build step (vLLM V1 engine) +# https://github.com/vllm-project/vllm/issues/2997 +RUN sudo apt-get update && \ + sudo apt-get install -y --no-install-recommends build-essential + +RUN pip install vllm==0.10.0 \ No newline at end of file diff --git a/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/README.md
b/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/README.md new file mode 100644 index 000000000000..88c25c7c121f --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/README.md @@ -0,0 +1,313 @@ +--- +orphan: true +--- + + + +# Deploy a small-sized LLM + +A small LLM runs on a single node with 1–2 GPUs, making it fast, inexpensive, and simple to use. It’s ideal for prototyping, lightweight applications, latency-critical use cases, cost-sensitive deployments, and environments with limited resources where efficiency matters more than peak accuracy. + + +For larger models, see [Deploy a medium-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/medium-size-llm/README.html) or [Deploy a large-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/large-size-llm/README.html). + +--- + +## Configure Ray Serve LLM + +Ray Serve LLM provides multiple [Python APIs](https://docs.ray.io/en/latest/serve/api/index.html#llm-api) for defining your application. Use [`build_openai_app`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.build_openai_app.html#ray.serve.llm.build_openai_app) to build a full application from your [`LLMConfig`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html#ray.serve.llm.LLMConfig) object. + +Set your Hugging Face token in the config file to access gated models like `Llama-3.1`. 
+ + +```python +# serve_llama_3_1_8b.py +from ray.serve.llm import LLMConfig, build_openai_app +import os + +llm_config = LLMConfig( + model_loading_config=dict( + model_id="my-llama-3.1-8b", + # Or unsloth/Meta-Llama-3.1-8B-Instruct for an ungated model + model_source="meta-llama/Llama-3.1-8B-Instruct", + ), + accelerator_type="L4", + deployment_config=dict( + autoscaling_config=dict( + min_replicas=1, + max_replicas=2, + ) + ), + ### If your model isn't gated, you can skip `HF_TOKEN` + # Share your Hugging Face token with the vllm engine so it can access the gated Llama 3 + # Type `export HF_TOKEN=` in a terminal + runtime_env=dict(env_vars={"HF_TOKEN": os.environ.get("HF_TOKEN")}), + engine_kwargs=dict(max_model_len=8192), +) +app = build_openai_app({"llm_configs": [llm_config]}) + +``` + +**Note:** Before moving to a production setup, migrate to using a [Serve config file](https://docs.ray.io/en/latest/serve/production-guide/config.html) to make your deployment version-controlled, reproducible, and easier to maintain for CI/CD pipelines. See [Serving LLMs - Quickstart Examples: Production Guide](https://docs.ray.io/en/latest/serve/llm/quick-start.html#production-deployment) for an example. + +--- + +## Deploy locally + + +**Prerequisites** + +* Access to GPU compute. +* (Optional) A **Hugging Face token** if using gated models like Meta’s Llama. Store it in `export HF_TOKEN=`. + +**Note:** Depending on the organization, you can usually request access on the model's Hugging Face page. For example, Meta’s Llama models approval can take anywhere from a few hours to several weeks. + +**Dependencies:** +```bash +pip install "ray[serve,llm]" +``` + +--- + +### Launch + +Follow the instructions at [Configure Ray Serve LLM](#configure-ray-serve-llm) to define your app in a Python module `serve_llama_3_1_8b.py`. 
+ +In a terminal, run: + + +```bash +%%bash +export HF_TOKEN=<YOUR_HUGGINGFACE_TOKEN> +serve run serve_llama_3_1_8b:app --non-blocking +``` + +Deployment typically takes a few minutes as the cluster is provisioned, the vLLM server starts, and the model is downloaded. + +--- + +### Send requests + +Your endpoint is available locally at `http://localhost:8000`. You can use a placeholder authentication token for the OpenAI client, for example `"FAKE_KEY"`. + +Example curl: + + +```bash +%%bash +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Authorization: Bearer FAKE_KEY" \ + -H "Content-Type: application/json" \ + -d '{ "model": "my-llama-3.1-8b", "messages": [{"role": "user", "content": "What is 2 + 2?"}] }' +``` + +Example Python: + + +```python +#client.py +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +response = client.chat.completions.create( + model="my-llama-3.1-8b", + messages=[{"role": "user", "content": "Tell me a joke"}], + stream=True +) + +for chunk in response: + content = chunk.choices[0].delta.content + if content: + print(content, end="", flush=True) +``` + + +--- + +### Shutdown + +Shut down your LLM service: + + +```bash +%%bash +serve shutdown -y +``` + + +--- + +## Deploy to production with Anyscale Services + +For production deployment, use Anyscale Services to deploy the Ray Serve app to a dedicated cluster without modifying the code. Anyscale ensures scalability, fault tolerance, and load balancing, keeping the service resilient against node failures, high traffic, and rolling updates. + +--- + +### Launch the service + +Anyscale provides out-of-the-box images (`anyscale/ray-llm`) which come pre-loaded with Ray Serve LLM, vLLM, and all required GPU/runtime dependencies. This makes it easy to get started without building a custom image.
+ +Create your Anyscale Service configuration in a new `service.yaml` file: + +```yaml +# service.yaml +name: deploy-llama-3-8b +image_uri: anyscale/ray-llm:2.49.0-py311-cu128 # Anyscale Ray Serve LLM image. Use `containerfile: ./Dockerfile` to use a custom Dockerfile. +compute_config: + auto_select_worker_config: true +working_dir: . +cloud: +applications: + # Point to your app in your Python module + - import_path: serve_llama_3_1_8b:app +``` + + +Deploy your service with the following command. Make sure to forward your Hugging Face token: + + +```bash +%%bash +anyscale service deploy -f service.yaml --env HF_TOKEN=<YOUR_HUGGINGFACE_TOKEN> +``` + +**Custom Dockerfile** +You can customize the container by building your own Dockerfile. In your Anyscale Service config, reference the Dockerfile with `containerfile` (instead of `image_uri`): + +```yaml +# service.yaml +# Replace: +# image_uri: anyscale/ray-llm:2.49.0-py311-cu128 + +# with: +containerfile: ./Dockerfile +``` + +See the [Anyscale base images](https://docs.anyscale.com/reference/base-images) for details on what each image includes. + +--- + +### Send requests + +The `anyscale service deploy` command output shows both the endpoint and authentication token: +```console +(anyscale +3.9s) curl -H "Authorization: Bearer <YOUR-TOKEN>" <YOUR-ENDPOINT> +``` +You can also retrieve both from the service page in the Anyscale Console. Click the **Query** button at the top. See [Send requests](#send-requests) for example requests, but make sure to use the correct endpoint and authentication token. + +--- + +### Access the Serve LLM dashboard + +See [Enable LLM monitoring](#enable-llm-monitoring) for instructions on enabling LLM-specific logging. To open the Ray Serve LLM Dashboard from an Anyscale Service: +1. In the Anyscale console, go to your **Service** or **Workspace**. +2. Navigate to the **Metrics** tab. +3. Expand **View in Grafana** and click **Serve LLM Dashboard**.
+ +--- + +### Shutdown + +Shutdown your Anyscale Service: + + +```bash +%%bash +anyscale service terminate -n deploy-llama-3-8b +``` + + +--- + +## Enable LLM monitoring + +The *Serve LLM Dashboard* offers deep visibility into model performance, latency, and system behavior, including: + +- Token throughput (tokens/sec). +- Latency metrics: Time To First Token (TTFT), Time Per Output Token (TPOT). +- KV cache utilization. + +To enable these metrics, go to your LLM config and set `log_engine_metrics: true`. Ensure vLLM V1 is active with `VLLM_USE_V1: "1"`. + +**Note:** `VLLM_USE_V1: "1"` is the default value with `ray >= 2.48.0` and can be omitted. +```yaml +applications: +- ... + args: + llm_configs: + - ... + runtime_env: + env_vars: + VLLM_USE_V1: "1" + ... + log_engine_metrics: true +``` + +--- + +## Improve concurrency + +Ray Serve LLM uses [vLLM](https://docs.vllm.ai/en/stable/) as its backend engine, which logs the *maximum concurrency* it can support based on your configuration. + +Example log: +```console +INFO 08-06 20:15:53 [executor_base.py:118] Maximum concurrency for 8192 tokens per request: 3.53x +``` + +You can improve concurrency depending on your model and hardware in several ways: + +**Reduce `max_model_len`** +Lowering `max_model_len` reduces the memory needed for KV cache. + +**Example:** Running *llama-3.1-8 B* on an A10G or L4 GPU: +- `max_model_len = 8192` → concurrency ≈ 3.5 +- `max_model_len = 4096` → concurrency ≈ 7 + +**Use Quantized Models** +Quantizing your model (for example, to FP8) reduces the model's memory footprint, freeing up memory for more KV cache and enabling more concurrent requests. + +**Use Tensor Parallelism** +Distribute the model across multiple GPUs with `tensor_parallel_size > 1`. + +**Note:** Latency may rise if GPUs don’t have strong GPU interconnect like NVLink. + +**Upgrade to GPUs with more memory** +Some GPUs provide significantly more room for KV cache and allow for higher concurrency out of the box. 
+ +**Scale with more Replicas** +In addition to tuning per-GPU concurrency, you can scale *horizontally* by increasing the number of replicas in your config. +Each replica runs on its own GPU, so raising the replica count increases the total number of concurrent requests your service can handle, especially under sustained or bursty traffic. +```yaml +deployment_config: + autoscaling_config: + min_replicas: 1 + max_replicas: 4 +``` + +*For more details on tuning strategies, hardware guidance, and serving configurations, see [Choose a GPU for LLM serving](https://docs.anyscale.com/llm/serving/gpu-guidance) and [Tune parameters for LLMs on Anyscale services](https://docs.anyscale.com/llm/serving/parameter-tuning).* + +--- + +## Troubleshooting + +**Hugging Face authentication errors** +Some models, such as Llama-3.1, are gated and require prior authorization from the organization. See your model’s documentation for instructions on obtaining access. + +**Out-of-memory errors** +Out-of-memory (OOM) errors are one of the most common failure modes when deploying LLMs, especially as model sizes and context length increase. +See this [Troubleshooting Guide](https://docs.anyscale.com/overview) for common errors and how to fix them. + +--- + +## Summary + +In this tutorial, you deployed a small-sized LLM with Ray Serve LLM, from development to production. You learned how to configure Ray Serve LLM, deploy your service on your Ray cluster, and how to send requests. You also learned how to monitor your app and common troubleshooting issues. 
diff --git a/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/client.py b/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/client.py new file mode 100644 index 000000000000..397cda670371 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/client.py @@ -0,0 +1,18 @@ +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +response = client.chat.completions.create( + model="my-llama-3.1-8b", + messages=[{"role": "user", "content": "Tell me a joke"}], + stream=True, +) + +for chunk in response: + content = chunk.choices[0].delta.content + if content: + print(content, end="", flush=True) diff --git a/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/notebook.ipynb b/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/notebook.ipynb new file mode 100644 index 000000000000..b1e7796913f5 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/notebook.ipynb @@ -0,0 +1,405 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6a51548b", + "metadata": {}, + "source": [ + "# Deploy a small-sized LLM\n", + "\n", + "A small LLM runs on a single node with 1–2 GPUs, making it fast, inexpensive, and simple to use.
It’s ideal for prototyping, lightweight applications, latency-critical use cases, cost-sensitive deployments, and environments with limited resources where efficiency matters more than peak accuracy.\n", + "\n", + "\n", + "For larger models, see [Deploy a medium-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/medium-size-llm/README.html) or [Deploy a large-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/large-size-llm/README.html).\n", + "\n", + "---\n", + "\n", + "## Configure Ray Serve LLM\n", + "\n", + "Ray Serve LLM provides multiple [Python APIs](https://docs.ray.io/en/latest/serve/api/index.html#llm-api) for defining your application. Use [`build_openai_app`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.build_openai_app.html#ray.serve.llm.build_openai_app) to build a full application from your [`LLMConfig`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html#ray.serve.llm.LLMConfig) object.\n", + "\n", + "Set your Hugging Face token in the config file to access gated models like `Llama-3.1`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e555ca3f", + "metadata": {}, + "outputs": [], + "source": [ + "# serve_llama_3_1_8b.py\n", + "from ray.serve.llm import LLMConfig, build_openai_app\n", + "import os\n", + "\n", + "llm_config = LLMConfig(\n", + " model_loading_config=dict(\n", + " model_id=\"my-llama-3.1-8b\",\n", + " # Or unsloth/Meta-Llama-3.1-8B-Instruct for an ungated model\n", + " model_source=\"meta-llama/Llama-3.1-8B-Instruct\",\n", + " ),\n", + " accelerator_type=\"L4\",\n", + " deployment_config=dict(\n", + " autoscaling_config=dict(\n", + " min_replicas=1,\n", + " max_replicas=2,\n", + " )\n", + " ),\n", + " ### If your model isn't gated, you can skip `HF_TOKEN`\n", + " # Share your Hugging Face token with the vllm engine so it can access the gated Llama 3\n", + " # Type `export HF_TOKEN=` in a terminal\n", + " runtime_env=dict(env_vars={\"HF_TOKEN\": os.environ.get(\"HF_TOKEN\")}),\n", + " engine_kwargs=dict(max_model_len=8192),\n", + ")\n", + "app = build_openai_app({\"llm_configs\": [llm_config]})\n" + ] + }, + { + "cell_type": "markdown", + "id": "b17a7140", + "metadata": {}, + "source": [ + "**Note:** Before moving to a production setup, migrate to using a [Serve config file](https://docs.ray.io/en/latest/serve/production-guide/config.html) to make your deployment version-controlled, reproducible, and easier to maintain for CI/CD pipelines. See [Serving LLMs - Quickstart Examples: Production Guide](https://docs.ray.io/en/latest/serve/llm/quick-start.html#production-deployment) for an example.\n", + "\n", + "---\n", + "\n", + "## Deploy locally\n", + "\n", + "\n", + "**Prerequisites**\n", + "\n", + "* Access to GPU compute.\n", + "* (Optional) A **Hugging Face token** if using gated models like Meta’s Llama. Store it in `export HF_TOKEN=`.\n", + "\n", + "**Note:** Depending on the organization, you can usually request access on the model's Hugging Face page. 
For example, Meta’s Llama models approval can take anywhere from a few hours to several weeks.\n", + "\n", + "**Dependencies:** \n", + "```bash\n", + "pip install \"ray[serve,llm]\"\n", + "```\n", + "\n", + "---\n", + "\n", + "### Launch\n", + "\n", + "Follow the instructions at [Configure Ray Serve LLM](#configure-ray-serve-llm) to define your app in a Python module `serve_llama_3_1_8b.py`. \n", + "\n", + "In a terminal, run: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbdb0921", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "export HF_TOKEN=\n", + "serve run serve_llama_3_1_8b:app --non-blocking" + ] + }, + { + "cell_type": "markdown", + "id": "df944967", + "metadata": {}, + "source": [ + "Deployment typically takes a few minutes as the cluster is provisioned, the vLLM server starts, and the model is downloaded. \n", + "\n", + "---\n", + "\n", + "### Send requests\n", + "\n", + "Your endpoint is available locally at `http://localhost:8000`. 
You can use a placeholder authentication token for the OpenAI client, for example `\"FAKE_KEY\"`.\n", + "\n", + "Example curl:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5309437", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "curl -X POST http://localhost:8000/v1/chat/completions \\\n", + " -H \"Authorization: Bearer FAKE_KEY\" \\\n", + " -H \"Content-Type: application/json\" \\\n", + " -d '{ \"model\": \"my-llama-3.1-8b\", \"messages\": [{\"role\": \"user\", \"content\": \"What is 2 + 2?\"}] }'" + ] + }, + { + "cell_type": "markdown", + "id": "d623a30f", + "metadata": {}, + "source": [ + "Example Python:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75bedc22", + "metadata": {}, + "outputs": [], + "source": [ + "#client.py\n", + "from urllib.parse import urljoin\n", + "from openai import OpenAI\n", + "\n", + "API_KEY = \"FAKE_KEY\"\n", + "BASE_URL = \"http://localhost:8000\"\n", + "\n", + "client = OpenAI(base_url=urljoin(BASE_URL, \"v1\"), api_key=API_KEY)\n", + "\n", + "response = client.chat.completions.create(\n", + " model=\"my-llama-3.1-8b\",\n", + " messages=[{\"role\": \"user\", \"content\": \"Tell me a joke\"}],\n", + " stream=True\n", + ")\n", + "\n", + "for chunk in response:\n", + " content = chunk.choices[0].delta.content\n", + " if content:\n", + " print(content, end=\"\", flush=True)" + ] + }, + { + "cell_type": "markdown", + "id": "b095ebf3", + "metadata": {}, + "source": [ + "\n", + "---\n", + "\n", + "### Shutdown\n", + "\n", + "Shutdown your LLM service: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fd3dacf", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "serve shutdown -y" + ] + }, + { + "cell_type": "markdown", + "id": "fb81fa41", + "metadata": {}, + "source": [ + "\n", + "---\n", + "\n", + "## Deploy to production with Anyscale Services\n", + "\n", + "For production deployment, use Anyscale Services to deploy the Ray
Serve app to a dedicated cluster without modifying the code. Anyscale ensures scalability, fault tolerance, and load balancing, keeping the service resilient against node failures, high traffic, and rolling updates.\n", + "\n", + "---\n", + "\n", + "### Launch the service\n", + "\n", + "Anyscale provides out-of-the-box images (`anyscale/ray-llm`) which come pre-loaded with Ray Serve LLM, vLLM, and all required GPU/runtime dependencies. This makes it easy to get started without building a custom image.\n", + "\n", + "Create your Anyscale Service configuration in a new `service.yaml` file:\n", + "\n", + "```yaml\n", + "# service.yaml\n", + "name: deploy-llama-3-8b\n", + "image_uri: anyscale/ray-llm:2.49.0-py311-cu128 # Anyscale Ray Serve LLM image. Use `containerfile: ./Dockerfile` to use a custom Dockerfile.\n", + "compute_config:\n", + " auto_select_worker_config: true \n", + "working_dir: .\n", + "cloud:\n", + "applications:\n", + " # Point to your app in your Python module\n", + " - import_path: serve_llama_3_1_8b:app\n", + "```\n", + "\n", + "\n", + "Deploy your service with the following command. Make sure to forward your Hugging Face token:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66b3b53a", + "metadata": { + "pygments_lexer": "bash" + }, + "outputs": [], + "source": [ + "%%bash\n", + "anyscale service deploy -f service.yaml --env HF_TOKEN=" + ] + }, + { + "cell_type": "markdown", + "id": "7e6de36c", + "metadata": {}, + "source": [ + "**Custom Dockerfile** \n", + "You can customize the container by building your own Dockerfile. 
In your Anyscale Service config, reference the Dockerfile with `containerfile` (instead of `image_uri`):\n", + "\n", + "```yaml\n", + "# service.yaml\n", + "# Replace:\n", + "# image_uri: anyscale/ray-llm:2.49.0-py311-cu128\n", + "\n", + "# with:\n", + "containerfile: ./Dockerfile\n", + "```\n", + "\n", + "See the [Anyscale base images](https://docs.anyscale.com/reference/base-images) for details on what each image includes.\n", + "\n", + "---\n", + "\n", + "### Send requests \n", + "\n", + "The `anyscale service deploy` command output shows both the endpoint and authentication token:\n", + "```console\n", + "(anyscale +3.9s) curl -H \"Authorization: Bearer \" \n", + "```\n", + "You can also retrieve both from the service page in the Anyscale Console. Click the **Query** button at the top. See [Send requests](#send-requests) for example requests, but make sure to use the correct endpoint and authentication token. \n", + "\n", + "---\n", + "\n", + "### Access the Serve LLM dashboard\n", + "\n", + "See [Enable LLM monitoring](#enable-llm-monitoring) for instructions on enabling LLM-specific logging. To open the Ray Serve LLM Dashboard from an Anyscale Service:\n", + "1. In the Anyscale console, go to your **Service** or **Workspace**.\n", + "2. Navigate to the **Metrics** tab.\n", + "3. 
Expand **View in Grafana** and click **Serve LLM Dashboard**.\n", + "\n", + "---\n", + "\n", + "### Shutdown\n", + "\n", + "Shutdown your Anyscale Service:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "474b2764", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "anyscale service terminate -n deploy-llama-3-8b" + ] + }, + { + "cell_type": "markdown", + "id": "49f67c39", + "metadata": {}, + "source": [ + "\n", + "---\n", + "\n", + "## Enable LLM monitoring\n", + "\n", + "The *Serve LLM Dashboard* offers deep visibility into model performance, latency, and system behavior, including:\n", + "\n", + "- Token throughput (tokens/sec).\n", + "- Latency metrics: Time To First Token (TTFT), Time Per Output Token (TPOT).\n", + "- KV cache utilization.\n", + "\n", + "To enable these metrics, go to your LLM config and set `log_engine_metrics: true`. Ensure vLLM V1 is active with `VLLM_USE_V1: \"1\"`.\n", + "\n", + "**Note:** `VLLM_USE_V1: \"1\"` is the default value with `ray >= 2.48.0` and can be omitted.\n", + "```yaml\n", + "applications:\n", + "- ...\n", + " args:\n", + " llm_configs:\n", + " - ...\n", + " runtime_env:\n", + " env_vars:\n", + " VLLM_USE_V1: \"1\"\n", + " ...\n", + " log_engine_metrics: true\n", + "```\n", + "\n", + "---\n", + "\n", + "## Improve concurrency\n", + "\n", + "Ray Serve LLM uses [vLLM](https://docs.vllm.ai/en/stable/) as its backend engine, which logs the *maximum concurrency* it can support based on your configuration.\n", + "\n", + "Example log:\n", + "```console\n", + "INFO 08-06 20:15:53 [executor_base.py:118] Maximum concurrency for 8192 tokens per request: 3.53x\n", + "```\n", + "\n", + "You can improve concurrency depending on your model and hardware in several ways: \n", + "\n", + "**Reduce `max_model_len`** \n", + "Lowering `max_model_len` reduces the memory needed for KV cache.\n", + "\n", + "**Example:** Running *Llama-3.1-8B* on an A10G or L4 GPU:\n", + "- `max_model_len = 8192` →
concurrency ≈ 3.5\n", + "- `max_model_len = 4096` → concurrency ≈ 7\n", + "\n", + "**Use Quantized Models** \n", + "Quantizing your model (for example, to FP8) reduces the model's memory footprint, freeing up memory for more KV cache and enabling more concurrent requests.\n", + "\n", + "**Use Tensor Parallelism** \n", + "Distribute the model across multiple GPUs with `tensor_parallel_size > 1`.\n", + "\n", + "**Note:** Latency may rise if GPUs don’t have strong GPU interconnect like NVLink.\n", + "\n", + "**Upgrade to GPUs with more memory** \n", + "Some GPUs provide significantly more room for KV cache and allow for higher concurrency out of the box.\n", + "\n", + "**Scale with more Replicas** \n", + "In addition to tuning per-GPU concurrency, you can scale *horizontally* by increasing the number of replicas in your config. \n", + "Each replica runs on its own GPU, so raising the replica count increases the total number of concurrent requests your service can handle, especially under sustained or bursty traffic.\n", + "```yaml\n", + "deployment_config:\n", + " autoscaling_config:\n", + " min_replicas: 1\n", + " max_replicas: 4\n", + "```\n", + "\n", + "*For more details on tuning strategies, hardware guidance, and serving configurations, see [Choose a GPU for LLM serving](https://docs.anyscale.com/llm/serving/gpu-guidance) and [Tune parameters for LLMs on Anyscale services](https://docs.anyscale.com/llm/serving/parameter-tuning).*\n", + "\n", + "---\n", + "\n", + "## Troubleshooting\n", + "\n", + "**Hugging Face authentication errors** \n", + "Some models, such as Llama-3.1, are gated and require prior authorization from the organization. See your model’s documentation for instructions on obtaining access.\n", + "\n", + "**Out-of-memory errors** \n", + "Out-of-memory (OOM) errors are one of the most common failure modes when deploying LLMs, especially as model sizes and context length increase. 
\n", + "See this [Troubleshooting Guide](https://docs.anyscale.com/overview) for common errors and how to fix them.\n", + "\n", + "---\n", + "\n", + "## Summary\n", + "\n", + "In this tutorial, you deployed a small-sized LLM with Ray Serve LLM, from development to production. You learned how to configure Ray Serve LLM, deploy your service on your Ray cluster, and how to send requests. You also learned how to monitor your app and common troubleshooting issues." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "repo_ray_docs", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/serve_llama_3_1_8b.py b/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/serve_llama_3_1_8b.py new file mode 100644 index 000000000000..861a4f7aae7b --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/serve_llama_3_1_8b.py @@ -0,0 +1,24 @@ +# serve_llama_3_1_8b.py +from ray.serve.llm import LLMConfig, build_openai_app +import os + +llm_config = LLMConfig( + model_loading_config=dict( + model_id="my-llama-3.1-8b", + # Or unsloth/Meta-Llama-3.1-8B-Instruct for an ungated model + model_source="meta-llama/Llama-3.1-8B-Instruct", + ), + accelerator_type="L4", + deployment_config=dict( + autoscaling_config=dict( + min_replicas=1, + max_replicas=2, + ) + ), + ### If your model isn't gated, you can skip `HF_TOKEN` + # Share your Hugging Face token with the vllm engine so it can access the gated Llama 3 + # Type `export HF_TOKEN=` in a terminal + runtime_env=dict(env_vars={"HF_TOKEN": os.environ.get("HF_TOKEN")}), + engine_kwargs=dict(max_model_len=8192), +) +app = build_openai_app({"llm_configs": [llm_config]}) diff --git a/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/service.yaml 
b/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/service.yaml new file mode 100644 index 000000000000..4c12e613c0d0 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/small-size-llm/service.yaml @@ -0,0 +1,10 @@ +# service.yaml +name: deploy-llama-3-8b +image_uri: anyscale/ray-llm:2.49.0-py311-cu128 # Anyscale Ray Serve LLM image. Use `containerfile: ./Dockerfile` to use a custom Dockerfile. +compute_config: + auto_select_worker_config: true +working_dir: . +cloud: +applications: + # Point to your app in your Python module + - import_path: serve_llama_3_1_8b:app \ No newline at end of file diff --git a/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/README.md b/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/README.md new file mode 100644 index 000000000000..1a8c2654464f --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/README.md @@ -0,0 +1,219 @@ +--- +orphan: true +--- + + + +# Deploy a vision LLM + +A vision LLM can interpret images as well as text, enabling tasks like answering questions about charts, analyzing photos, or combining visuals with instructions. It extends LLMs beyond language to support multimodal reasoning and richer applications. + +This tutorial deploys a vision LLM using Ray Serve LLM. + +--- + +## Configure Ray Serve LLM + +Make sure to set your Hugging Face token in the config file to access gated models. + +Ray Serve LLM provides multiple [Python APIs](https://docs.ray.io/en/latest/serve/api/index.html#llm-api) for defining your application. Use [`build_openai_app`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.build_openai_app.html#ray.serve.llm.build_openai_app) to build a full application from your [`LLMConfig`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html#ray.serve.llm.LLMConfig) object. 
+ + +```python +# serve_qwen_VL.py +from ray.serve.llm import LLMConfig, build_openai_app +import os + +llm_config = LLMConfig( + model_loading_config=dict( + model_id="my-qwen-VL", + model_source="qwen/Qwen2.5-VL-7B-Instruct", + ), + accelerator_type="L40S", + deployment_config=dict( + autoscaling_config=dict( + min_replicas=1, + max_replicas=2, + ) + ), + ### Uncomment if your model is gated and needs your Hugging Face token to access it. + # runtime_env=dict(env_vars={"HF_TOKEN": os.environ.get("HF_TOKEN")}), + engine_kwargs=dict(max_model_len=8192), +) + +app = build_openai_app({"llm_configs": [llm_config]}) + +``` + +**Note:** Before moving to a production setup, migrate to a [Serve config file](https://docs.ray.io/en/latest/serve/production-guide/config.html) to make your deployment version-controlled, reproducible, and easier to maintain for CI/CD pipelines. See [Serving LLMs - Quickstart Examples: Production Guide](https://docs.ray.io/en/latest/serve/llm/quick-start.html#production-deployment) for an example. + +--- + +## Deploy locally + +**Prerequisites** + +* Access to GPU compute. +* (Optional) A **Hugging Face token** if using gated models. Store it in `export HF_TOKEN=` + +**Note:** Depending on the organization, you can usually request access on the model's Hugging Face page. For example, Meta’s Llama models approval can take anywhere from a few hours to several weeks. + +**Dependencies:** +```bash +pip install "ray[serve,llm]" +``` + +--- + +### Launch + +Follow the instructions at [Configure Ray Serve LLM](#configure-ray-serve-llm) to define your app in a Python module `serve_qwen_VL.py`. + +In a terminal, run: + + +```bash +%%bash +serve run serve_qwen_VL:app --non-blocking +``` + +Deployment typically takes a few minutes as the cluster is provisioned, the vLLM server starts, and the model is downloaded. 
+ +--- + +### Sending requests with images + +Your endpoint is available locally at `http://localhost:8000` and you can use a placeholder authentication token for the OpenAI client, for example `"FAKE_KEY"`. + +Example curl with image URL: + + +```bash +%%bash +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Authorization: Bearer FAKE_KEY" \ + -H "Content-Type: application/json" \ + -d '{ "model": "my-qwen-VL", "messages": [ { "role": "user", "content": [ {"type": "text", "text": "What do you see in this image?"}, {"type": "image_url", "image_url": { "url": "http://images.cocodataset.org/val2017/000000039769.jpg" }} ] } ] }' +``` + +Example Python with image URL: + + +```python +#client_url_image.py +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +response = client.chat.completions.create( + model="my-qwen-VL", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is in this image?"}, + {"type": "image_url", "image_url": {"url": "http://images.cocodataset.org/val2017/000000039769.jpg"}} + ] + } + ], + temperature=0.5, + stream=True +) + +for chunk in response: + content = chunk.choices[0].delta.content + if content: + print(content, end="", flush=True) +``` + +Example Python with local image: + + +```python +#client_local_image.py +from urllib.parse import urljoin +import base64 +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +### From an image locally saved as `example.jpg` +# Load and encode image as base64 +with open("example.jpg", "rb") as f: + img_base64 = base64.b64encode(f.read()).decode() + +response = client.chat.completions.create( + model="my-qwen-VL", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is in this image?"}, +
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}} + ] + } + ], + temperature=0.5, + stream=True +) + +for chunk in response: + content = chunk.choices[0].delta.content + if content: + print(content, end="", flush=True) +``` + + +--- + +### Shutdown + +Shutdown your LLM service: + + +```bash +%%bash +serve shutdown -y +``` + + +--- + +## Deploy to production with Anyscale services + +For production, it's recommended to use Anyscale services to deploy your Ray Serve app on a dedicated cluster without code changes. Anyscale provides scalability, fault tolerance, and load balancing, ensuring resilience against node failures, high traffic, and rolling updates. See [Deploy a small-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/small-size-llm/README.html#deploy-to-production-with-anyscale-services) for an example with a small-sized model like the *Qwen2.5-VL-7B-Instruct* used in this tutorial. + +--- + +## Limiting images per prompt + +Ray Serve LLM uses [vLLM](https://docs.vllm.ai/en/stable/) as its backend engine. You can configure vLLM by passing parameters through the `engine_kwargs` section of your Serve LLM configuration. For a full list of supported options, see the [vLLM documentation](https://docs.vllm.ai/en/stable/configuration/engine_args.html#multimodalconfig). + +In particular, you can limit the number of images per request by setting `limit_mm_per_prompt` in your configuration. +```yaml +applications: +- ... + args: + llm_configs: + ... + engine_kwargs: + ... + limit_mm_per_prompt: {"image": 3} +``` + +--- + +## Summary + +In this tutorial, you deployed a vision LLM with Ray Serve LLM, from development to production. You learned how to configure Ray Serve LLM, deploy your service on your Ray cluster, and send requests with images.
diff --git a/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/client_local_image.py b/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/client_local_image.py new file mode 100644 index 000000000000..ac6d86f18be9 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/client_local_image.py @@ -0,0 +1,37 @@ +# client_local_image.py +from urllib.parse import urljoin +import base64 +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +### From an image locally saved as `example.jpg` +# Load and encode image as base64 +with open("vision-llm/example.jpg", "rb") as f: + img_base64 = base64.b64encode(f.read()).decode() + +response = client.chat.completions.create( + model="my-qwen-VL", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is in this image?"}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, + }, + ], + } + ], + temperature=0.5, + stream=True, +) + +for chunk in response: + content = chunk.choices[0].delta.content + if content: + print(content, end="", flush=True) diff --git a/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/client_url_image.py b/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/client_url_image.py new file mode 100644 index 000000000000..0d093af6169b --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/client_url_image.py @@ -0,0 +1,33 @@ +# client_url_image.py +from urllib.parse import urljoin +from openai import OpenAI + +API_KEY = "FAKE_KEY" +BASE_URL = "http://localhost:8000" + +client = OpenAI(base_url=urljoin(BASE_URL, "v1"), api_key=API_KEY) + +response = client.chat.completions.create( + model="my-qwen-VL", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is in this image?"}, + { + "type": "image_url", + "image_url": { + "url":
"http://images.cocodataset.org/val2017/000000039769.jpg" + }, + }, + ], + } + ], + temperature=0.5, + stream=True, +) + +for chunk in response: + content = chunk.choices[0].delta.content + if content: + print(content, end="", flush=True) diff --git a/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/example.jpg b/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/example.jpg new file mode 100644 index 000000000000..4284d25ff336 Binary files /dev/null and b/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/example.jpg differ diff --git a/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/notebook.ipynb b/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/notebook.ipynb new file mode 100644 index 000000000000..fa340e74ab20 --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/notebook.ipynb @@ -0,0 +1,298 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "23243c2e", + "metadata": {}, + "source": [ + "# Deploy a vision LLM\n", + "\n", + "A vision LLM can interpret images as well as text, enabling tasks like answering questions about charts, analyzing photos, or combining visuals with instructions. It extends LLMs beyond language to support multimodal reasoning and richer applications. \n", + "\n", + "This tutorial deploys a vision LLM using Ray Serve LLM. \n", + "\n", + "---\n", + "\n", + "## Configure Ray Serve LLM\n", + "\n", + "Make sure to set your Hugging Face token in the config file to access gated models.\n", + "\n", + "Ray Serve LLM provides multiple [Python APIs](https://docs.ray.io/en/latest/serve/api/index.html#llm-api) for defining your application. Use [`build_openai_app`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.build_openai_app.html#ray.serve.llm.build_openai_app) to build a full application from your [`LLMConfig`](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html#ray.serve.llm.LLMConfig) object." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebc41d60", + "metadata": {}, + "outputs": [], + "source": [ + "# serve_qwen_VL.py\n", + "from ray.serve.llm import LLMConfig, build_openai_app\n", + "import os\n", + "\n", + "llm_config = LLMConfig(\n", + " model_loading_config=dict(\n", + " model_id=\"my-qwen-VL\",\n", + " model_source=\"qwen/Qwen2.5-VL-7B-Instruct\",\n", + " ),\n", + " accelerator_type=\"L40S\",\n", + " deployment_config=dict(\n", + " autoscaling_config=dict(\n", + " min_replicas=1,\n", + " max_replicas=2,\n", + " )\n", + " ),\n", + " ### Uncomment if your model is gated and needs your Hugging Face token to access it.\n", + " # runtime_env=dict(env_vars={\"HF_TOKEN\": os.environ.get(\"HF_TOKEN\")}),\n", + " engine_kwargs=dict(max_model_len=8192),\n", + ")\n", + "\n", + "app = build_openai_app({\"llm_configs\": [llm_config]})\n" + ] + }, + { + "cell_type": "markdown", + "id": "c76a6362", + "metadata": {}, + "source": [ + "**Note:** Before moving to a production setup, migrate to a [Serve config file](https://docs.ray.io/en/latest/serve/production-guide/config.html) to make your deployment version-controlled, reproducible, and easier to maintain for CI/CD pipelines. See [Serving LLMs - Quickstart Examples: Production Guide](https://docs.ray.io/en/latest/serve/llm/quick-start.html#production-deployment) for an example.\n", + "\n", + "---\n", + "\n", + "## Deploy locally\n", + "\n", + "**Prerequisites**\n", + "\n", + "* Access to GPU compute.\n", + "* (Optional) A **Hugging Face token** if using gated models. Store it in `export HF_TOKEN=`\n", + "\n", + "**Note:** Depending on the organization, you can usually request access on the model's Hugging Face page. 
For example, Meta’s Llama models approval can take anywhere from a few hours to several weeks.\n", + "\n", + "**Dependencies:** \n", + "```bash\n", + "pip install \"ray[serve,llm]\"\n", + "```\n", + "\n", + "---\n", + "\n", + "### Launch\n", + "\n", + "Follow the instructions at [Configure Ray Serve LLM](#configure-ray-serve-llm) to define your app in a Python module `serve_qwen_VL.py`. \n", + "\n", + "In a terminal, run: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7eb8734c", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "serve run serve_qwen_VL:app --non-blocking" + ] + }, + { + "cell_type": "markdown", + "id": "d36f41d1", + "metadata": {}, + "source": [ + "Deployment typically takes a few minutes as the cluster is provisioned, the vLLM server starts, and the model is downloaded. \n", + "\n", + "---\n", + "\n", + "### Sending requests with images\n", + "\n", + "Your endpoint is available locally at `http://localhost:8000` and you can use a placeholder authentication token for the OpenAI client, for example `\"FAKE_KEY\"`.\n", + "\n", + "Example curl with image URL:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "400e7790", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "curl -X POST http://localhost:8000/v1/chat/completions \\\n", + " -H \"Authorization: Bearer FAKE_KEY\" \\\n", + " -H \"Content-Type: application/json\" \\\n", + " -d '{ \"model\": \"my-qwen-VL\", \"messages\": [ { \"role\": \"user\", \"content\": [ {\"type\": \"text\", \"text\": \"What do you see in this image?\"}, {\"type\": \"image_url\", \"image_url\": { \"url\": \"http://images.cocodataset.org/val2017/000000039769.jpg\" }} ] } ] }'" + ] + }, + { + "cell_type": "markdown", + "id": "291743a5", + "metadata": {}, + "source": [ + "Example Python with image URL:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b447094", + "metadata": {}, + "outputs": [], + "source": [ +
"#client_url_image.py\n", + "from urllib.parse import urljoin\n", + "from openai import OpenAI\n", + "\n", + "API_KEY = \"FAKE_KEY\"\n", + "BASE_URL = \"http://localhost:8000\"\n", + "\n", + "client = OpenAI(base_url=urljoin(BASE_URL, \"v1\"), api_key=API_KEY)\n", + "\n", + "response = client.chat.completions.create(\n", + " model=\"my-qwen-VL\",\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"type\": \"text\", \"text\": \"What is in this image?\"},\n", + " {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/val2017/000000039769.jpg\"}}\n", + " ]\n", + " }\n", + " ],\n", + " temperature=0.5,\n", + " stream=True\n", + ")\n", + "\n", + "for chunk in response:\n", + " content = chunk.choices[0].delta.content\n", + " if content:\n", + " print(content, end=\"\", flush=True)" + ] + }, + { + "cell_type": "markdown", + "id": "811f1d41", + "metadata": {}, + "source": [ + "Example Python with local image:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8296023b", + "metadata": {}, + "outputs": [], + "source": [ + "#client_local_image.py\n", + "from urllib.parse import urljoin\n", + "import base64\n", + "from openai import OpenAI\n", + "\n", + "API_KEY = \"FAKE_KEY\"\n", + "BASE_URL = \"http://localhost:8000\"\n", + "\n", + "client = OpenAI(base_url=urljoin(BASE_URL, \"v1\"), api_key=API_KEY)\n", + "\n", + "### From an image locally saved as `example.jpg`\n", + "# Load and encode image as base64\n", + "with open(\"example.jpg\", \"rb\") as f:\n", + " img_base64 = base64.b64encode(f.read()).decode()\n", + "\n", + "response = client.chat.completions.create(\n", + " model=\"my-qwen-VL\",\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"type\": \"text\", \"text\": \"What is in this image?\"},\n", + " {\"type\": \"image_url\", \"image_url\": {\"url\": f\"data:image/jpeg;base64,{img_base64}\"}}\n", + " ]\n", + " }\n", + " ],\n", + "
temperature=0.5,\n", + " stream=True\n", + ")\n", + "\n", + "for chunk in response:\n", + " content = chunk.choices[0].delta.content\n", + " if content:\n", + " print(content, end=\"\", flush=True)" + ] + }, + { + "cell_type": "markdown", + "id": "ccc60c1f", + "metadata": {}, + "source": [ + "\n", + "---\n", + "\n", + "### Shutdown \n", + "\n", + "Shut down your LLM service:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ee4b879", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "serve shutdown -y" + ] + }, + { + "cell_type": "markdown", + "id": "a94c0307", + "metadata": {}, + "source": [ + "\n", + "---\n", + "\n", + "## Deploy to production with Anyscale services\n", + "\n", + "For production, it's recommended to use Anyscale services to deploy your Ray Serve app on a dedicated cluster without code changes. Anyscale provides scalability, fault tolerance, and load balancing, ensuring resilience against node failures, high traffic, and rolling updates. See [Deploy a small-sized LLM](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/small-size-llm/README.html#deploy-to-production-with-anyscale-services) for an example with a small-sized model like the *Qwen2.5-VL-7B-Instruct* used in this tutorial.\n", + "\n", + "---\n", + "\n", + "## Limiting images per prompt\n", + "\n", + "Ray Serve LLM uses [vLLM](https://docs.vllm.ai/en/stable/) as its backend engine. You can configure vLLM by passing parameters through the `engine_kwargs` section of your Serve LLM configuration. For a full list of supported options, see the [vLLM documentation](https://docs.vllm.ai/en/stable/configuration/engine_args.html#multimodalconfig). \n", + "\n", + "In particular, you can limit the number of images per request by setting `limit_mm_per_prompt` in your configuration.
\n", + "```yaml\n", + "applications:\n", + "- ...\n", + " args:\n", + " llm_configs:\n", + " ...\n", + " engine_kwargs:\n", + " ...\n", + " limit_mm_per_prompt: {\"image\": 3}\n", + "```\n", + "\n", + "---\n", + "\n", + "## Summary\n", + "\n", + "In this tutorial, you deployed a vision LLM with Ray Serve LLM, from development to production. You learned how to configure Ray Serve LLM, deploy your service on your Ray cluster, and send requests with images." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "repo_ray_docs", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/serve_qwen_VL.py b/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/serve_qwen_VL.py new file mode 100644 index 000000000000..d1239439700a --- /dev/null +++ b/doc/source/serve/tutorials/deployment-serve-llm/vision-llm/serve_qwen_VL.py @@ -0,0 +1,22 @@ +# serve_qwen_VL.py +from ray.serve.llm import LLMConfig, build_openai_app +import os + +llm_config = LLMConfig( + model_loading_config=dict( + model_id="my-qwen-VL", + model_source="qwen/Qwen2.5-VL-7B-Instruct", + ), + accelerator_type="L40S", + deployment_config=dict( + autoscaling_config=dict( + min_replicas=1, + max_replicas=2, + ) + ), + ### Uncomment if your model is gated and needs your Hugging Face token to access it. + # runtime_env=dict(env_vars={"HF_TOKEN": os.environ.get("HF_TOKEN")}), + engine_kwargs=dict(max_model_len=8192), +) + +app = build_openai_app({"llm_configs": [llm_config]}) diff --git a/doc/source/serve/tutorials/java.md b/doc/source/serve/tutorials/java.md index 6a57addc34aa..743fff19af24 100644 --- a/doc/source/serve/tutorials/java.md +++ b/doc/source/serve/tutorials/java.md @@ -20,7 +20,7 @@ To use Java Ray Serve, you need the following dependency in your pom.xml. 
## Example model -This example use case is a production workflow of a financial application. The application needs to compute the best strategy to interact with different banks for a single task. +This example use case is a production workflow for a financial application. The application needs to compute the best strategy to interact with different banks for a single task. ```{literalinclude} ../../../../java/serve/src/test/java/io/ray/serve/docdemo/Strategy.java :end-before: docs-strategy-end @@ -43,7 +43,7 @@ This code uses the `Strategy` class: :start-after: docs-strategy-calc-start ``` -When the scale of banks and indicators expands, the three-tier `for` loop slows down the calculation. Even if you use the thread pool to calculate each indicator in parallel, you may encounter a single machine performance bottleneck. Moreover, you can't use this `Strategy` object as a resident service. +When the scale of banks and indicators expands, the three-tier `for` loop slows down the calculation. Even if you use the thread pool to calculate each indicator in parallel, you may encounter a single machine performance bottleneck. Moreover, you can't use this `Strategy` object as a resident service. ## Converting to a Ray Serve Deployment diff --git a/doc/source/train/api/api.rst b/doc/source/train/api/api.rst index 41e10d7c4a65..9c0ed0af5d33 100644 --- a/doc/source/train/api/api.rst +++ b/doc/source/train/api/api.rst @@ -72,7 +72,7 @@ Hugging Face Transformers More Frameworks --------------- -Tensorflow/Keras +TensorFlow/Keras ~~~~~~~~~~~~~~~~ .. 
autosummary:: @@ -143,6 +143,7 @@ Ray Train Utilities :nosignatures: :toctree: doc/ + ~train.get_all_reported_checkpoints ~train.get_checkpoint ~train.get_context ~train.get_dataset_shard @@ -165,6 +166,7 @@ Ray Train Output :template: autosummary/class_without_autosummary.rst :toctree: doc/ + ~train.ReportedCheckpoint ~train.Result Ray Train Errors diff --git a/doc/source/train/api/deprecated.rst b/doc/source/train/api/deprecated.rst index 4c016577ee92..7a51cb4b48dd 100644 --- a/doc/source/train/api/deprecated.rst +++ b/doc/source/train/api/deprecated.rst @@ -66,7 +66,7 @@ Hugging Face Transformers More Frameworks --------------- -Tensorflow/Keras +TensorFlow/Keras ~~~~~~~~~~~~~~~~ .. autosummary:: diff --git a/doc/source/train/benchmarks.rst b/doc/source/train/benchmarks.rst index 050d56081029..94702b98916f 100644 --- a/doc/source/train/benchmarks.rst +++ b/doc/source/train/benchmarks.rst @@ -11,7 +11,7 @@ GPU image training ------------------ This task uses the TorchTrainer module to train different amounts of data -using an Pytorch ResNet model. +using a PyTorch ResNet model. We test out the performance across different cluster sizes and data sizes. @@ -22,7 +22,7 @@ We test out the performance across different cluster sizes and data sizes. .. note:: For multi-host distributed training, on AWS we need to ensure ec2 instances are in the same VPC and - all ports are open in the secure group. + all ports are open in the security group. .. list-table:: @@ -46,10 +46,10 @@ We test out the performance across different cluster sizes and data sizes. .. _pytorch-training-parity: -Pytorch Training Parity +PyTorch training parity ----------------------- -This task checks the performance parity between native Pytorch Distributed and +This task checks the performance parity between native PyTorch Distributed and Ray Train's distributed TorchTrainer. We demonstrate that the performance is similar (within 2.5\%) between the two frameworks. 
@@ -58,9 +58,9 @@ Performance may vary greatly across different model, hardware, and cluster confi The reported times are for the raw training times. There is an unreported constant setup overhead of a few seconds for both methods that is negligible for longer training runs. -- `Pytorch comparison training script`_ -- `Pytorch comparison CPU cluster configuration`_ -- `Pytorch comparison GPU cluster configuration`_ +- `PyTorch comparison training script`_ +- `PyTorch comparison CPU cluster configuration`_ +- `PyTorch comparison GPU cluster configuration`_ .. list-table:: @@ -70,24 +70,24 @@ overhead of a few seconds for both methods that is negligible for longer trainin - **Command** * - 4 m5.2xlarge nodes (4 workers) - FashionMNIST - - 196.64 s (vs 194.90 s Pytorch) + - 196.64 s (vs 194.90 s PyTorch) - `python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8` * - 4 m5.2xlarge nodes (16 workers) - FashionMNIST - - 430.88 s (vs 475.97 s Pytorch) + - 430.88 s (vs 475.97 s PyTorch) - `python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2` - * - 4 g4dn.12xlarge node (16 workers) + * - 4 g4dn.12xlarge nodes (16 workers) - FashionMNIST - - 149.80 s (vs 146.46 s Pytorch) + - 149.80 s (vs 146.46 s PyTorch) - `python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 4 --use-gpu` .. _tf-training-parity: -Tensorflow Training Parity +TensorFlow training parity -------------------------- -This task checks the performance parity between native Tensorflow Distributed and +This task checks the performance parity between native TensorFlow Distributed and Ray Train's distributed TensorflowTrainer. We demonstrate that the performance is similar (within 1\%) between the two frameworks. @@ -98,9 +98,9 @@ overhead of a few seconds for both methods that is negligible for longer trainin .. 
note:: The batch size and number of epochs is different for the GPU benchmark, resulting in a longer runtime. -- `Tensorflow comparison training script`_ -- `Tensorflow comparison CPU cluster configuration`_ -- `Tensorflow comparison GPU cluster configuration`_ +- `TensorFlow comparison training script`_ +- `TensorFlow comparison CPU cluster configuration`_ +- `TensorFlow comparison GPU cluster configuration`_ .. list-table:: @@ -110,15 +110,15 @@ overhead of a few seconds for both methods that is negligible for longer trainin - **Command** * - 4 m5.2xlarge nodes (4 workers) - FashionMNIST - - 78.81 s (vs 79.67 s Tensorflow) + - 78.81 s (versus 79.67 s TensorFlow) - `python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8` * - 4 m5.2xlarge nodes (16 workers) - FashionMNIST - - 64.57 s (vs 67.45 s Tensorflow) + - 64.57 s (versus 67.45 s TensorFlow) - `python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2` - * - 4 g4dn.12xlarge node (16 workers) + * - 4 g4dn.12xlarge nodes (16 workers) - FashionMNIST - - 465.16 s (vs 461.74 s Tensorflow) + - 465.16 s (versus 461.74 s TensorFlow) - `python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 200 --num-workers 16 --cpus-per-worker 4 --batch-size 64 --use-gpu` .. _xgboost-benchmark: @@ -157,11 +157,11 @@ XGBoost parameters were kept as defaults for ``xgboost==1.7.6`` this task. .. _`GPU image training script`: https://github.com/ray-project/ray/blob/cec82a1ced631525a4d115e4dc0c283fa4275a7f/release/air_tests/air_benchmarks/workloads/pytorch_training_e2e.py#L95-L106 .. _`GPU training small cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml#L6-L24 .. _`GPU training large cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml#L5-L25 -.. 
_`Pytorch comparison training script`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/workloads/torch_benchmark.py -.. _`Pytorch comparison CPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml -.. _`Pytorch comparison GPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml -.. _`Tensorflow comparison training script`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py -.. _`Tensorflow comparison CPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml -.. _`Tensorflow comparison GPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml +.. _`PyTorch comparison training script`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/workloads/torch_benchmark.py +.. _`PyTorch comparison CPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml +.. _`PyTorch comparison GPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml +.. _`TensorFlow comparison training script`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py +.. _`TensorFlow comparison CPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml +.. _`TensorFlow comparison GPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml .. 
_`XGBoost Training Script`: https://github.com/ray-project/ray/blob/9ac58f4efc83253fe63e280106f959fe317b1104/release/train_tests/xgboost_lightgbm/train_batch_inference_benchmark.py .. _`XGBoost Cluster Configuration`: https://github.com/ray-project/ray/tree/9ac58f4efc83253fe63e280106f959fe317b1104/release/train_tests/xgboost_lightgbm diff --git a/doc/source/train/deepspeed.rst b/doc/source/train/deepspeed.rst index 570892e5a594..44fdda3d1d13 100644 --- a/doc/source/train/deepspeed.rst +++ b/doc/source/train/deepspeed.rst @@ -71,7 +71,7 @@ Complete Examples ----------------- Below are complete examples of ZeRO-3 training with DeepSpeed. Each example shows a full implementation of fine-tuning - a Bidirectional Encoder Representations from Transformers (BERT) model on the Microsoft Research Paraphrase Corpus (MRPC) dataset. +a Bidirectional Encoder Representations from Transformers (BERT) model on the Microsoft Research Paraphrase Corpus (MRPC) dataset. Install the requirements: @@ -119,7 +119,7 @@ Check the below examples for more details: * - Framework - Example * - Accelerate (:ref:`User Guide `) - - `Fine-tune Llama-2 series models with Deepspeed, Accelerate, and Ray Train. `_ + - `Fine-tune Llama-2 series models with DeepSpeed, Accelerate, and Ray Train. `_ * - Transformers (:ref:`User Guide `) - :doc:`Fine-tune GPT-J-6b with DeepSpeed and Hugging Face Transformers ` * - Lightning (:ref:`User Guide `) diff --git a/doc/source/train/distributed-tensorflow-keras.rst b/doc/source/train/distributed-tensorflow-keras.rst index 5ab690bdbec7..34db90b6b37b 100644 --- a/doc/source/train/distributed-tensorflow-keras.rst +++ b/doc/source/train/distributed-tensorflow-keras.rst @@ -78,7 +78,7 @@ Create a TensorflowTrainer -------------------------- ``Trainer``\s are the primary Ray Train classes for managing state and -execute training. For distributed Tensorflow, +executing training.
For distributed TensorFlow, use a :class:`~ray.train.tensorflow.TensorflowTrainer` that you can setup like this: diff --git a/doc/source/train/doc_code/train_tune_interop.py b/doc/source/train/doc_code/train_tune_interop.py index 167528f33988..0b630cbe459d 100644 --- a/doc/source/train/doc_code/train_tune_interop.py +++ b/doc/source/train/doc_code/train_tune_interop.py @@ -65,7 +65,8 @@ def train_driver_fn(config: dict): # Launch a single Train run. -train_driver_fn({"num_workers": 4, "train_loop_config": {"lr": 1e-3}}) +# Note that you can only create a TuneReportCallback in a Ray Tune session. +# train_driver_fn({"num_workers": 4, "train_loop_config": {"lr": 1e-3}}) # Launch a sweep of hyperparameters with Ray Tune. diff --git a/doc/source/train/examples/accelerate/accelerate_example.rst b/doc/source/train/examples/accelerate/accelerate_example.rst index d9e84c48d267..e3b941444615 100644 --- a/doc/source/train/examples/accelerate/accelerate_example.rst +++ b/doc/source/train/examples/accelerate/accelerate_example.rst @@ -5,7 +5,7 @@ Distributed Training with Hugging Face Accelerate .. raw:: html - + Run on Anyscale

diff --git a/doc/source/train/examples/aws-trainium/llama3.rst b/doc/source/train/examples/aws-trainium/llama3.rst index 92af58efac8c..ff20e9b6e916 100644 --- a/doc/source/train/examples/aws-trainium/llama3.rst +++ b/doc/source/train/examples/aws-trainium/llama3.rst @@ -89,7 +89,7 @@ Run it in the background with the following command: Launching Ray Jobs ------------------ -The Ray cluster now ready to handle workloads. Initiate the data preparation and fine-tuning Ray jobs: +The Ray cluster is now ready to handle workloads. Initiate the data preparation and fine-tuning Ray jobs: 1. Launch the Ray job for downloading the dolly-15k dataset and the Llama3.1 8B model artifacts: diff --git a/doc/source/train/examples/intel_gaudi/bert.ipynb b/doc/source/train/examples/intel_gaudi/bert.ipynb index c45532960111..c48d34476af3 100644 --- a/doc/source/train/examples/intel_gaudi/bert.ipynb +++ b/doc/source/train/examples/intel_gaudi/bert.ipynb @@ -30,7 +30,7 @@ "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest\n", "```\n", "\n", - "Inside the container, install the following dependecies to run this notebook.\n", + "Inside the container, install the following dependencies to run this notebook.\n", "```bash\n", "pip install ray[train] notebook transformers datasets evaluate\n", "```" diff --git a/doc/source/train/examples/pytorch/convert_existing_pytorch_code_to_ray_train.ipynb b/doc/source/train/examples/pytorch/convert_existing_pytorch_code_to_ray_train.ipynb index 6a5666412cb8..66170aacdd00 100644 --- a/doc/source/train/examples/pytorch/convert_existing_pytorch_code_to_ray_train.ipynb +++ b/doc/source/train/examples/pytorch/convert_existing_pytorch_code_to_ray_train.ipynb @@ -77,7 +77,7 @@ "source": [ "Then we download the data: \n", "\n", - "This tutorial assumes that your existing 
code is using the `torch.utils.data.Dataset` native to PyTorch. It continues to use `torch.utils.data.Dataset` to allow you to make as few code changes as possible. **This tutorial also runs with Ray Data, which gives you the benefits of efficient parallel preprocessing.** For more details on using Ray Data for for images, see the {doc}`Working with Images
` Ray Data user guide." + "This tutorial assumes that your existing code is using the `torch.utils.data.Dataset` native to PyTorch. It continues to use `torch.utils.data.Dataset` to allow you to make as few code changes as possible. **This tutorial also runs with Ray Data, which gives you the benefits of efficient parallel preprocessing.** For more details on using Ray Data for images, see the {doc}`Working with Images
` Ray Data user guide." ] }, { diff --git a/doc/source/train/examples/pytorch/dreambooth_finetuning.rst b/doc/source/train/examples/pytorch/dreambooth_finetuning.rst index b8da33e517dc..6d88a556f8fa 100644 --- a/doc/source/train/examples/pytorch/dreambooth_finetuning.rst +++ b/doc/source/train/examples/pytorch/dreambooth_finetuning.rst @@ -1,7 +1,7 @@ :orphan: -Fine-tune of Stable Diffusion with DreamBooth and Ray Train -=========================================================== +Fine-tuning of Stable Diffusion with DreamBooth and Ray Train +============================================================= .. raw:: html diff --git a/doc/source/train/examples/pytorch/pytorch_resnet_finetune.ipynb b/doc/source/train/examples/pytorch/pytorch_resnet_finetune.ipynb index 83dec754ac0c..12a7fc47b088 100644 --- a/doc/source/train/examples/pytorch/pytorch_resnet_finetune.ipynb +++ b/doc/source/train/examples/pytorch/pytorch_resnet_finetune.ipynb @@ -11,7 +11,7 @@ "\n", "

\n", "\n", - "This example fine tunes a pre-trained ResNet model with Ray Train. \n", + "This example fine-tunes a pre-trained ResNet model with Ray Train. \n", "\n", "For this example, the network architecture consists of the intermediate layer output of a pre-trained ResNet model, which feeds into a randomly initialized linear layer that outputs classification logits for our new task.\n", "\n", @@ -211,7 +211,7 @@ "The `train_loop_per_worker` function defines the fine-tuning procedure for each worker.\n", "\n", "**1. Prepare dataloaders for each worker**:\n", - "- This tutorial assumes you are using PyTorch's native `torch.utils.data.Dataset` for data input. {meth}`train.torch.prepare_data_loader() ` prepares your dataLoader for distributed execution. You can also use Ray Data for more efficient preprocessing. For more details on using Ray Data for for images, see the {doc}`Working with Images ` Ray Data user guide.\n", + "- This tutorial assumes you are using PyTorch's native `torch.utils.data.Dataset` for data input. {meth}`train.torch.prepare_data_loader() ` prepares your dataLoader for distributed execution. You can also use Ray Data for more efficient preprocessing. For more details on using Ray Data for images, see the {doc}`Working with Images ` Ray Data user guide.\n", "\n", "**2. Prepare your model**:\n", "- {meth}`train.torch.prepare_model() ` prepares the model for distributed training. 
Under the hood, it converts your torch model to `DistributedDataParallel` model, which synchronize its weights across all workers.\n", diff --git a/doc/source/train/examples/transformers/huggingface_text_classification.ipynb b/doc/source/train/examples/transformers/huggingface_text_classification.ipynb index c193f2b41686..ace610bbd88b 100644 --- a/doc/source/train/examples/transformers/huggingface_text_classification.ipynb +++ b/doc/source/train/examples/transformers/huggingface_text_classification.ipynb @@ -87,7 +87,7 @@ "id": "oJiSdWy2hYbR" }, "source": [ - "Check the resources our cluster is composed of. If you are running this notebook on your local machine or Google Colab, you should see the number of CPU cores and GPUs available on the your machine." + "Check the resources our cluster is composed of. If you are running this notebook on your local machine or Google Colab, you should see the number of CPU cores and GPUs available on your machine." ] }, { diff --git a/doc/source/train/examples/transformers/transformers_torch_trainer_basic.rst b/doc/source/train/examples/transformers/transformers_torch_trainer_basic.rst index 795ca47d6664..4e0f4a0db892 100644 --- a/doc/source/train/examples/transformers/transformers_torch_trainer_basic.rst +++ b/doc/source/train/examples/transformers/transformers_torch_trainer_basic.rst @@ -1,6 +1,6 @@ :orphan: -.. _transformers_torch_trainer_basic_example : +.. _transformers_torch_trainer_basic_example: Fine-tune a Text Classifier with Hugging Face Transformers ========================================================== diff --git a/doc/source/train/getting-started-pytorch-lightning.rst b/doc/source/train/getting-started-pytorch-lightning.rst index 9a24b8d77b82..96768a1ad9d2 100644 --- a/doc/source/train/getting-started-pytorch-lightning.rst +++ b/doc/source/train/getting-started-pytorch-lightning.rst @@ -38,54 +38,6 @@ Compare a PyTorch Lightning training script with and without Ray Train. .. tab-set:: - .. 
tab-item:: PyTorch Lightning - - .. This snippet isn't tested because it doesn't use any Ray code. - - .. testcode:: - :skipif: True - - import torch - from torchvision.models import resnet18 - from torchvision.datasets import FashionMNIST - from torchvision.transforms import ToTensor, Normalize, Compose - from torch.utils.data import DataLoader - import lightning.pytorch as pl - - # Model, Loss, Optimizer - class ImageClassifier(pl.LightningModule): - def __init__(self): - super(ImageClassifier, self).__init__() - self.model = resnet18(num_classes=10) - self.model.conv1 = torch.nn.Conv2d( - 1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False - ) - self.criterion = torch.nn.CrossEntropyLoss() - - def forward(self, x): - return self.model(x) - - def training_step(self, batch, batch_idx): - x, y = batch - outputs = self.forward(x) - loss = self.criterion(outputs, y) - self.log("loss", loss, on_step=True, prog_bar=True) - return loss - - def configure_optimizers(self): - return torch.optim.Adam(self.model.parameters(), lr=0.001) - - # Data - transform = Compose([ToTensor(), Normalize((0.28604,), (0.32025,))]) - train_data = FashionMNIST(root='./data', train=True, download=True, transform=transform) - train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True) - - # Training - model = ImageClassifier() - trainer = pl.Trainer(max_epochs=10) - trainer.fit(model, train_dataloaders=train_dataloader) - - .. tab-item:: PyTorch Lightning + Ray Train .. code-block:: python @@ -175,6 +127,53 @@ Compare a PyTorch Lightning training script with and without Ray Train. ), ) + .. tab-item:: PyTorch Lightning + + .. This snippet isn't tested because it doesn't use any Ray code. + + .. 
testcode:: + :skipif: True + + import torch + from torchvision.models import resnet18 + from torchvision.datasets import FashionMNIST + from torchvision.transforms import ToTensor, Normalize, Compose + from torch.utils.data import DataLoader + import lightning.pytorch as pl + + # Model, Loss, Optimizer + class ImageClassifier(pl.LightningModule): + def __init__(self): + super(ImageClassifier, self).__init__() + self.model = resnet18(num_classes=10) + self.model.conv1 = torch.nn.Conv2d( + 1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False + ) + self.criterion = torch.nn.CrossEntropyLoss() + + def forward(self, x): + return self.model(x) + + def training_step(self, batch, batch_idx): + x, y = batch + outputs = self.forward(x) + loss = self.criterion(outputs, y) + self.log("loss", loss, on_step=True, prog_bar=True) + return loss + + def configure_optimizers(self): + return torch.optim.Adam(self.model.parameters(), lr=0.001) + + # Data + transform = Compose([ToTensor(), Normalize((0.28604,), (0.32025,))]) + train_data = FashionMNIST(root='./data', train=True, download=True, transform=transform) + train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True) + + # Training + model = ImageClassifier() + trainer = pl.Trainer(max_epochs=10) + trainer.fit(model, train_dataloaders=train_dataloader) + Set up a training function -------------------------- diff --git a/doc/source/train/getting-started-pytorch.rst b/doc/source/train/getting-started-pytorch.rst index 8a225d34f9d0..6d28c5df3309 100644 --- a/doc/source/train/getting-started-pytorch.rst +++ b/doc/source/train/getting-started-pytorch.rst @@ -40,60 +40,10 @@ Compare a PyTorch training script with and without Ray Train. .. tab-set:: - .. tab-item:: PyTorch - - .. This snippet isn't tested because it doesn't use any Ray code. - - .. 
testcode:: - :skipif: True - - import os - import tempfile - - import torch - from torch.nn import CrossEntropyLoss - from torch.optim import Adam - from torch.utils.data import DataLoader - from torchvision.models import resnet18 - from torchvision.datasets import FashionMNIST - from torchvision.transforms import ToTensor, Normalize, Compose - - # Model, Loss, Optimizer - model = resnet18(num_classes=10) - model.conv1 = torch.nn.Conv2d( - 1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False - ) - model.to("cuda") - criterion = CrossEntropyLoss() - optimizer = Adam(model.parameters(), lr=0.001) - - # Data - transform = Compose([ToTensor(), Normalize((0.28604,), (0.32025,))]) - train_data = FashionMNIST(root='./data', train=True, download=True, transform=transform) - train_loader = DataLoader(train_data, batch_size=128, shuffle=True) - - # Training - for epoch in range(10): - for images, labels in train_loader: - images, labels = images.to("cuda"), labels.to("cuda") - outputs = model(images) - loss = criterion(outputs, labels) - optimizer.zero_grad() - loss.backward() - optimizer.step() - - metrics = {"loss": loss.item(), "epoch": epoch} - checkpoint_dir = tempfile.mkdtemp() - checkpoint_path = os.path.join(checkpoint_dir, "model.pt") - torch.save(model.state_dict(), checkpoint_path) - print(metrics) - - - .. tab-item:: PyTorch + Ray Train .. code-block:: python - :emphasize-lines: 12, 14, 21, 55-58, 59, 63, 66-68, 72-73, 76 + :emphasize-lines: 12, 14, 21, 32, 36-37, 55-58, 59, 63, 66-73 import os import tempfile @@ -179,6 +129,54 @@ Compare a PyTorch training script with and without Ray Train. ) model.load_state_dict(model_state_dict) + .. tab-item:: PyTorch + + .. This snippet isn't tested because it doesn't use any Ray code. + + .. 
testcode:: + :skipif: True + + import os + import tempfile + + import torch + from torch.nn import CrossEntropyLoss + from torch.optim import Adam + from torch.utils.data import DataLoader + from torchvision.models import resnet18 + from torchvision.datasets import FashionMNIST + from torchvision.transforms import ToTensor, Normalize, Compose + + # Model, Loss, Optimizer + model = resnet18(num_classes=10) + model.conv1 = torch.nn.Conv2d( + 1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False + ) + model.to("cuda") + criterion = CrossEntropyLoss() + optimizer = Adam(model.parameters(), lr=0.001) + + # Data + transform = Compose([ToTensor(), Normalize((0.28604,), (0.32025,))]) + train_data = FashionMNIST(root='./data', train=True, download=True, transform=transform) + train_loader = DataLoader(train_data, batch_size=128, shuffle=True) + + # Training + for epoch in range(10): + for images, labels in train_loader: + images, labels = images.to("cuda"), labels.to("cuda") + outputs = model(images) + loss = criterion(outputs, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + metrics = {"loss": loss.item(), "epoch": epoch} + checkpoint_dir = tempfile.mkdtemp() + checkpoint_path = os.path.join(checkpoint_dir, "model.pt") + torch.save(model.state_dict(), checkpoint_path) + print(metrics) + Set up a training function -------------------------- diff --git a/doc/source/train/getting-started-transformers.rst b/doc/source/train/getting-started-transformers.rst index c07215e58ef8..a7beae254d13 100644 --- a/doc/source/train/getting-started-transformers.rst +++ b/doc/source/train/getting-started-transformers.rst @@ -54,66 +54,6 @@ Compare a standard Hugging Face Transformers script with its Ray Train equivalen .. tab-set:: - .. tab-item:: Hugging Face Transformers - - .. This snippet isn't tested because it doesn't use any Ray code. - - .. 
testcode:: - :skipif: True - - # Adapted from Hugging Face tutorial: https://huggingface.co/docs/transformers/training - - import numpy as np - import evaluate - from datasets import load_dataset - from transformers import ( - Trainer, - TrainingArguments, - AutoTokenizer, - AutoModelForSequenceClassification, - ) - - # Datasets - dataset = load_dataset("yelp_review_full") - tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - - def tokenize_function(examples): - return tokenizer(examples["text"], padding="max_length", truncation=True) - - small_train_dataset = dataset["train"].select(range(100)).map(tokenize_function, batched=True) - small_eval_dataset = dataset["test"].select(range(100)).map(tokenize_function, batched=True) - - # Model - model = AutoModelForSequenceClassification.from_pretrained( - "bert-base-cased", num_labels=5 - ) - - # Metrics - metric = evaluate.load("accuracy") - - def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - - # Hugging Face Trainer - training_args = TrainingArguments( - output_dir="test_trainer", evaluation_strategy="epoch", report_to="none" - ) - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=small_train_dataset, - eval_dataset=small_eval_dataset, - compute_metrics=compute_metrics, - ) - - # Start Training - trainer.train() - - - .. tab-item:: Hugging Face Transformers + Ray Train .. code-block:: python @@ -216,6 +156,65 @@ Compare a standard Hugging Face Transformers script with its Ray Train equivalen model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path) + .. tab-item:: Hugging Face Transformers + + .. This snippet isn't tested because it doesn't use any Ray code. + + .. 
testcode:: + :skipif: True + + # Adapted from Hugging Face tutorial: https://huggingface.co/docs/transformers/training + + import numpy as np + import evaluate + from datasets import load_dataset + from transformers import ( + Trainer, + TrainingArguments, + AutoTokenizer, + AutoModelForSequenceClassification, + ) + + # Datasets + dataset = load_dataset("yelp_review_full") + tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + + def tokenize_function(examples): + return tokenizer(examples["text"], padding="max_length", truncation=True) + + small_train_dataset = dataset["train"].select(range(100)).map(tokenize_function, batched=True) + small_eval_dataset = dataset["test"].select(range(100)).map(tokenize_function, batched=True) + + # Model + model = AutoModelForSequenceClassification.from_pretrained( + "bert-base-cased", num_labels=5 + ) + + # Metrics + metric = evaluate.load("accuracy") + + def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + # Hugging Face Trainer + training_args = TrainingArguments( + output_dir="test_trainer", evaluation_strategy="epoch", report_to="none" + ) + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=small_train_dataset, + eval_dataset=small_eval_dataset, + compute_metrics=compute_metrics, + ) + + # Start Training + trainer.train() + + Set up a training function -------------------------- diff --git a/doc/source/train/getting-started-xgboost.rst b/doc/source/train/getting-started-xgboost.rst index f4568f221ba6..983dc5138648 100644 --- a/doc/source/train/getting-started-xgboost.rst +++ b/doc/source/train/getting-started-xgboost.rst @@ -41,6 +41,7 @@ Compare a XGBoost training script with and without Ray Train. .. tab-item:: XGBoost + Ray Train .. 
literalinclude:: ./doc_code/xgboost_quickstart.py + :emphasize-lines: 3-4, 7-8, 11, 15-16, 19-20, 48, 53, 56-64 :language: python :start-after: __xgboost_ray_start__ :end-before: __xgboost_ray_end__ @@ -53,7 +54,6 @@ Compare a XGBoost training script with and without Ray Train. :end-before: __xgboost_end__ - Set up a training function -------------------------- diff --git a/doc/source/train/user-guides/checkpoints.rst b/doc/source/train/user-guides/checkpoints.rst index 595a3d7b2b7c..cd837cdccf13 100644 --- a/doc/source/train/user-guides/checkpoints.rst +++ b/doc/source/train/user-guides/checkpoints.rst @@ -120,7 +120,7 @@ Here are a few examples of saving checkpoints with different training frameworks .. tab-item:: Hugging Face Transformers - Ray Train leverages HuggingFace Transformers Trainer's ``Callback`` interface + Ray Train leverages Hugging Face Transformers Trainer's ``Callback`` interface to report metrics and checkpoints. **Option 1: Use Ray Train's default report callback** diff --git a/doc/source/train/user-guides/data-loading-preprocessing.rst b/doc/source/train/user-guides/data-loading-preprocessing.rst index e82ea87c9bb8..3835a697fb25 100644 --- a/doc/source/train/user-guides/data-loading-preprocessing.rst +++ b/doc/source/train/user-guides/data-loading-preprocessing.rst @@ -11,7 +11,7 @@ Key advantages include: - Automatic and fast failure recovery. - Automatic on-the-fly data splitting across distributed training workers. -For more details about Ray Data, check out the :ref:`Ray Data documentation`.` +For more details about Ray Data, check out the :ref:`Ray Data documentation`. .. note:: @@ -45,7 +45,7 @@ Data ingestion can be set up with four basic steps: .. tab-item:: PyTorch .. code-block:: python - :emphasize-lines: 14,21,29,31-33,53 + :emphasize-lines: 14,21,29,33-35,53 import torch import ray @@ -149,7 +149,7 @@ Data ingestion can be set up with four basic steps: .. tab-item:: HuggingFace Transformers .. 
code-block:: python - :emphasize-lines: 7-8,13-14,17-18,30-31,41 + :emphasize-lines: 7-8,13-14,17-18,24,30-31,41 import ray import ray.train @@ -322,7 +322,7 @@ For more details, see the following sections for each framework: .. tip:: When using Torch or Hugging Face Datasets directly without Ray Data, make sure to instantiate your Dataset *inside* the ``train_loop_per_worker``. - Instatiating the Dataset outside of the ``train_loop_per_worker`` and passing it in via global scope + Instantiating the Dataset outside of the ``train_loop_per_worker`` and passing it in via global scope can cause errors due to the Dataset not being serializable. .. note:: diff --git a/doc/source/train/user-guides/experiment-tracking.rst b/doc/source/train/user-guides/experiment-tracking.rst index 5feb082a43e9..424aff50f798 100644 --- a/doc/source/train/user-guides/experiment-tracking.rst +++ b/doc/source/train/user-guides/experiment-tracking.rst @@ -242,7 +242,7 @@ Refer to the tracking libraries' documentation for semantics. def train_func(): if ray.train.get_context().get_world_rank() == 0: - wandb.init(..., config={"ray_train_persistent_storage_path": "TODO: fill in when API stablizes"}) + wandb.init(..., config={"ray_train_persistent_storage_path": "TODO: fill in when API stabilizes"}) .. tip:: @@ -304,14 +304,14 @@ PyTorch .. dropdown:: Log to W&B .. literalinclude:: ../../../../python/ray/train/examples/experiment_tracking//torch_exp_tracking_wandb.py - :emphasize-lines: 15, 16, 17, 21, 22, 51, 52, 54, 55 + :emphasize-lines: 16, 19-21, 59-60, 62-63 :language: python :start-after: __start__ .. dropdown:: Log to file-based MLflow .. 
literalinclude:: ../../../../python/ray/train/examples/experiment_tracking/torch_exp_tracking_mlflow.py - :emphasize-lines: 22, 23, 24, 25, 54, 55, 57, 58, 64 + :emphasize-lines: 22-25, 58-59, 61-62, 68 :language: python :start-after: __start__ :end-before: __end__ diff --git a/doc/source/train/user-guides/fault-tolerance.rst b/doc/source/train/user-guides/fault-tolerance.rst index 81533ef29e94..ab25902ce54e 100644 --- a/doc/source/train/user-guides/fault-tolerance.rst +++ b/doc/source/train/user-guides/fault-tolerance.rst @@ -1,5 +1,3 @@ -.. _:: ../doc_code: - .. _train-fault-tolerance: Handling Failures and Node Preemption diff --git a/doc/source/tune/api/api.rst b/doc/source/tune/api/api.rst index 2a352e01d37d..3c446bca6fc3 100644 --- a/doc/source/tune/api/api.rst +++ b/doc/source/tune/api/api.rst @@ -6,7 +6,7 @@ Ray Tune API .. tip:: We'd love to hear your feedback on using Tune - `get in touch `_! This section contains a reference for the Tune API. If there is anything missing, please open an issue -on `Github`_. +on `GitHub`_. .. _`GitHub`: https://github.com/ray-project/ray/issues diff --git a/doc/source/tune/api/env.rst b/doc/source/tune/api/env.rst index 513044fc1371..a26e691df1f2 100644 --- a/doc/source/tune/api/env.rst +++ b/doc/source/tune/api/env.rst @@ -63,10 +63,10 @@ These are the environment variables Ray Tune currently considers: but never longer than this value. Defaults to 100 (seconds). * **TUNE_RESULT_BUFFER_MIN_TIME_S**: Additionally, you can specify a minimum time to buffer results. Defaults to 0. * **TUNE_WARN_THRESHOLD_S**: Threshold for logging if an Tune event loop operation takes too long. Defaults to 0.5 (seconds). -* **TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S**: Threshold for throwing a warning if no active trials are in ``RUNNING`` state +* **TUNE_WARN_INSUFFICIENT_RESOURCE_THRESHOLD_S**: Threshold for throwing a warning if no active trials are in ``RUNNING`` state for this amount of seconds. 
If the Ray Tune job is stuck in this state (most likely due to insufficient resources), the warning message is printed repeatedly every this amount of seconds. Defaults to 60 (seconds). -* **TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S_AUTOSCALER**: Threshold for throwing a warning when the autoscaler is enabled and +* **TUNE_WARN_INSUFFICIENT_RESOURCE_THRESHOLD_S_AUTOSCALER**: Threshold for throwing a warning when the autoscaler is enabled and if no active trials are in ``RUNNING`` state for this amount of seconds. If the Ray Tune job is stuck in this state (most likely due to insufficient resources), the warning message is printed repeatedly every this amount of seconds. Defaults to 60 (seconds). diff --git a/doc/source/tune/api/logging.rst b/doc/source/tune/api/logging.rst index f8692a19b2d0..2ef841929056 100644 --- a/doc/source/tune/api/logging.rst +++ b/doc/source/tune/api/logging.rst @@ -97,7 +97,7 @@ Aim Integration Tune also provides a logger for the `Aim `_ experiment tracker. You can install Aim via ``pip install aim``. -See the :doc:`tutorial here ` +See the :doc:`tutorial here `. .. autosummary:: :nosignatures: diff --git a/doc/source/tune/api/schedulers.rst b/doc/source/tune/api/schedulers.rst index 74a44fa826d8..d451d4591b0c 100644 --- a/doc/source/tune/api/schedulers.rst +++ b/doc/source/tune/api/schedulers.rst @@ -44,7 +44,7 @@ setting the ``scheduler`` parameter of ``tune.TuneConfig``, which is taken in by .. code-block:: python from ray import tune - from tune.schedulers import ASHAScheduler + from ray.tune.schedulers import ASHAScheduler asha_scheduler = ASHAScheduler( time_attr='training_iteration', diff --git a/doc/source/tune/examples/index.rst b/doc/source/tune/examples/index.rst index 38c334ab4717..0de91b153247 100644 --- a/doc/source/tune/examples/index.rst +++ b/doc/source/tune/examples/index.rst @@ -6,10 +6,10 @@ Ray Tune Examples ================= .. tip:: - See :ref:`overview` to learn more about Tune features. 
+ See :ref:`tune-main` to learn more about Tune features. -Below are examples for using Ray Tune for a variety use cases and sorted by categories: +Below are examples for using Ray Tune for a variety of use cases and sorted by categories: * `ML frameworks`_ * `Experiment tracking tools`_ diff --git a/doc/source/tune/faq.rst b/doc/source/tune/faq.rst index 7e3ec9b8bc44..0268caa2f6d8 100644 --- a/doc/source/tune/faq.rst +++ b/doc/source/tune/faq.rst @@ -116,7 +116,7 @@ For **layer sizes** we also suggest trying **powers of 2**. For small problems For **discount factors** in reinforcement learning we suggest sampling uniformly between 0.9 and 1.0. Depending on the problem, a much stricter range above 0.97 -or oeven above 0.99 can make sense (e.g. for Atari). +or even above 0.99 can make sense (e.g. for Atari). How can I use nested/conditional search spaces? @@ -295,7 +295,7 @@ Why is my training stuck and Ray reporting that pending actor or tasks cannot be This is usually caused by Ray actors or tasks being started by the trainable without the trainable resources accounting for them, leading to a deadlock. -This can also be "stealthly" caused by using other libraries in the trainable that are +This can also be "stealthily" caused by using other libraries in the trainable that are based on Ray, such as Modin. In order to fix the issue, request additional resources for the trial using :ref:`placement groups `, as outlined in the section above. @@ -490,7 +490,7 @@ on your machine first to avoid any obvious mistakes. How can I get started contributing to Tune? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -We use Github to track issues, feature requests, and bugs. Take a look at the +We use GitHub to track issues, feature requests, and bugs. Take a look at the ones labeled `"good first issue" `__ and `"help wanted" `__ for a place to start. Look for issues with "[tune]" in the title. @@ -674,7 +674,7 @@ running at a time. 
A symptom was when trials from job A used parameters specifie leading to unexpected results. Please refer to -[this github issue](https://github.com/ray-project/ray/issues/30091#issuecomment-1431676976) +`this GitHub issue `__ for more context and a workaround if you run into this issue. .. _tune-iterative-experimentation: diff --git a/doc/source/tune/getting-started.rst b/doc/source/tune/getting-started.rst index b5321ffbe3bc..9264662589d3 100644 --- a/doc/source/tune/getting-started.rst +++ b/doc/source/tune/getting-started.rst @@ -19,7 +19,7 @@ To run this example, you will need to install the following: $ pip install "ray[tune]" torch torchvision -Setting Up a Pytorch Model to Tune +Setting Up a PyTorch Model to Tune ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ To start off, let's first import some dependencies. @@ -44,7 +44,7 @@ connected layer, and a softmax function. :start-after: __model_def_begin__ :end-before: __model_def_end__ -Below, we have implemented functions for training and evaluating your Pytorch model. +Below, we have implemented functions for training and evaluating your PyTorch model. We define a ``train`` and a ``test`` function for that purpose. If you know how to do this, skip ahead to the next section. @@ -60,7 +60,7 @@ If you know how to do this, skip ahead to the next section. Setting up a ``Tuner`` for a Training Run with Tune ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Below, we define a function that trains the Pytorch model for multiple epochs. +Below, we define a function that trains the PyTorch model for multiple epochs. This function will be executed on a separate :ref:`Ray Actor (process) ` underneath the hood, so we need to communicate the performance of the model back to Tune (which is on the main Python process). @@ -150,7 +150,7 @@ Note that each library has a specific way of defining the search space. 
Evaluating Your Model after Tuning ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You can evaluate best trained model using the :ref:`ExperimentAnalysis object ` to retrieve the best model: +You can evaluate the best trained model using the :ref:`ExperimentAnalysis object ` to retrieve the best model: .. literalinclude:: /../../python/ray/tune/tests/tutorial.py :language: python @@ -163,5 +163,5 @@ Next Steps * Check out the :ref:`Tune tutorials ` for guides on using Tune with your preferred machine learning library. * Browse our :ref:`gallery of examples ` to see how to use Tune with PyTorch, XGBoost, Tensorflow, etc. -* `Let us know `__ if you ran into issues or have any questions by opening an issue on our Github. +* `Let us know `__ if you ran into issues or have any questions by opening an issue on our GitHub. * To check how your application is doing, you can use the :ref:`Ray dashboard `. diff --git a/doc/source/tune/index.rst b/doc/source/tune/index.rst index 56537bcf4725..264a402cb54e 100644 --- a/doc/source/tune/index.rst +++ b/doc/source/tune/index.rst @@ -31,7 +31,7 @@ Tune further integrates with a wide range of additional hyperparameter optimizat In this quick-start example you `minimize` a simple function of the form ``f(x) = a**2 + b``, our `objective` function. The closer ``a`` is to zero and the smaller ``b`` is, the smaller the total value of ``f(x)``. - We will define a so-called `search space` for ``a`` and ``b`` and let Ray Tune explore the space for good values. + We will define a so-called `search space` for ``a`` and ``b`` and let Ray Tune explore the space for good values. .. callout:: @@ -261,7 +261,7 @@ Feel free to submit a pull-request adding (or requesting a removal!) of a listed - `Softlearning `_: Softlearning is a reinforcement learning framework for training maximum entropy policies in continuous domains. Includes the official implementation of the Soft Actor-Critic algorithm. 
- `Flambe `_: An ML framework to accelerate research and its path to production. See `flambe.ai `_. -- `Population Based Augmentation `_: Population Based Augmentation (PBA) is a algorithm that quickly and efficiently learns data augmentation functions for neural network training. PBA matches state-of-the-art results on CIFAR with one thousand times less compute. +- `Population Based Augmentation `_: Population Based Augmentation (PBA) is an algorithm that quickly and efficiently learns data augmentation functions for neural network training. PBA matches state-of-the-art results on CIFAR with one thousand times less compute. - `Fast AutoAugment by Kakao `_: Fast AutoAugment (Accepted at NeurIPS 2019) learns augmentation policies using a more efficient search strategy based on density matching. - `Allentune `_: Hyperparameter Search for AllenNLP from AllenAI. - `machinable `_: A modular configuration system for machine learning research. See `machinable.org `_. diff --git a/doc/source/tune/key-concepts.rst b/doc/source/tune/key-concepts.rst index 18d02aba6318..76d5802d124b 100644 --- a/doc/source/tune/key-concepts.rst +++ b/doc/source/tune/key-concepts.rst @@ -33,7 +33,7 @@ and the :ref:`Class API `. Both are valid ways of defining a `trainable`, but the Function API is generally recommended and is used throughout the rest of this guide. -Consider an example of optimizing a simple objective function like ``a * (x ** 2) + b `` in which ``a`` and ``b`` are the +Consider an example of optimizing a simple objective function like ``a * (x ** 2) + b`` in which ``a`` and ``b`` are the hyperparameters we want to tune to `minimize` the objective. Since the objective also has a variable ``x``, we need to test for different values of ``x``. Given concrete choices for ``a``, ``b`` and ``x`` we can evaluate the objective function and get a `score` to minimize. @@ -42,7 +42,7 @@ Given concrete choices for ``a``, ``b`` and ``x`` we can evaluate the objective .. 
tab-item:: Function API - With the :ref:`the function-based API ` you create a function (here called ``trainable``) that + With the :ref:`function-based API ` you create a function (here called ``trainable``) that takes in a dictionary of hyperparameters. This function computes a ``score`` in a "training loop" and `reports` this score back to Tune: @@ -238,7 +238,7 @@ Tune also provides helpful utilities to use with Search Algorithms: * :ref:`limiter`: Limits the amount of concurrent trials when running optimization. * :ref:`shim`: Allows creation of the search algorithm object given a string. -Note that in the example above we tell Tune to ``stop`` after ``20`` training iterations. +Note that in the example above we tell Tune to ``stop`` after ``20`` training iterations. This way of stopping trials with explicit rules is useful, but in many cases we can do even better with `schedulers`. @@ -256,7 +256,7 @@ passes through the trials selected by your search algorithm in the order they we In short, schedulers can stop, pause, or tweak the hyperparameters of running trials, potentially making your hyperparameter tuning process much faster. -Unlike search algorithms, :ref:`Trial Scheduler ` do not select which hyperparameter +Unlike search algorithms, :ref:`Trial Schedulers ` do not select which hyperparameter configurations to evaluate. Here's a quick example of using the so-called ``HyperBand`` scheduler to tune an experiment. diff --git a/doc/source/tune/tutorials/tune-search-spaces.rst b/doc/source/tune/tutorials/tune-search-spaces.rst index 3a8eba780c0c..f3707fa80ac1 100644 --- a/doc/source/tune/tutorials/tune-search-spaces.rst +++ b/doc/source/tune/tutorials/tune-search-spaces.rst @@ -59,7 +59,7 @@ If ``grid_search`` is provided as an argument, the *same* grid will be repeated tuner.fit() # 3 different configs. 
- tuner = tune.Tuner(trainable, tune_config=tune.TuneConfig(num_samples=1), param_space={"x": grid_search([1, 2, 3])}) + tuner = tune.Tuner(trainable, tune_config=tune.TuneConfig(num_samples=1), param_space={"x": tune.grid_search([1, 2, 3])}) tuner.fit() # 6 different configs. diff --git a/doc/source/tune/tutorials/tune_get_data_in_and_out.md b/doc/source/tune/tutorials/tune_get_data_in_and_out.md index ef7fc2d15c38..b255c29c581d 100644 --- a/doc/source/tune/tutorials/tune_get_data_in_and_out.md +++ b/doc/source/tune/tutorials/tune_get_data_in_and_out.md @@ -318,7 +318,7 @@ def training_function(config, data): start_epoch = 0 if checkpoint: with checkpoint.as_directory() as checkpoint_dir: - with open(os.path.join(checkpoint_dir, "model.pkl"), "w") as f: + with open(os.path.join(checkpoint_dir, "model.pkl"), "rb") as f: checkpoint_dict = pickle.load(f) start_epoch = checkpoint_dict["epoch"] + 1 model = checkpoint_dict["state"] @@ -335,7 +335,7 @@ def training_function(config, data): # Create the checkpoint. with tempfile.TemporaryDirectory() as temp_checkpoint_dir: - with open(os.path.join(temp_checkpoint_dir, "model.pkl"), "w") as f: + with open(os.path.join(temp_checkpoint_dir, "model.pkl"), "wb") as f: pickle.dump(checkpoint_dict, f) tune.report( {"metric": metric}, diff --git a/docker/base-deps/Dockerfile b/docker/base-deps/Dockerfile index 94259767c134..f86e70e5fc66 100644 --- a/docker/base-deps/Dockerfile +++ b/docker/base-deps/Dockerfile @@ -5,17 +5,25 @@ # The GPU options are NVIDIA CUDA developer images. ARG BASE_IMAGE="ubuntu:22.04" FROM ${BASE_IMAGE} -# FROM directive resets ARG -ARG BASE_IMAGE # If this arg is not "autoscaler" then no autoscaler requirements will be included ENV TZ=America/Los_Angeles ENV LC_ALL=C.UTF-8 ENV LANG=C.UTF-8 + # TODO(ilr) $HOME seems to point to result in "" instead of "/home/ray" -ENV PATH "/home/ray/anaconda3/bin:$PATH" +# Q: Why add paths like /usr/local/nvidia/lib64 and /usr/local/nvidia/bin? 
+# A: The NVIDIA GPU operator version used by GKE injects these into the container +# after it's mounted to a pod. +# Issue is tracked here: +# https://github.com/GoogleCloudPlatform/compute-gpu-installation/issues/46 +# More context here: +# https://github.com/NVIDIA/nvidia-container-toolkit/issues/275 +# and here: +# https://gitlab.com/nvidia/container-images/cuda/-/issues/27 +ENV PATH "/home/ray/anaconda3/bin:$PATH:/usr/local/nvidia/bin" +ENV LD_LIBRARY_PATH "$LD_LIBRARY_PATH:/usr/local/nvidia/lib64" ARG DEBIAN_FRONTEND=noninteractive ARG PYTHON_VERSION=3.9 -ARG HOSTTYPE=${HOSTTYPE:-x86_64} ARG RAY_UID=1000 ARG RAY_GID=100 @@ -67,9 +75,19 @@ RUN </dev/stderr + exit 1 +fi + # Install miniforge wget --quiet \ - "https://github.com/conda-forge/miniforge/releases/download/24.11.3-0/Miniforge3-24.11.3-0-Linux-${HOSTTYPE}.sh" \ + "https://github.com/conda-forge/miniforge/releases/download/24.11.3-0/Miniforge3-24.11.3-0-Linux-${ARCH}.sh" \ -O /tmp/miniforge.sh /bin/bash /tmp/miniforge.sh -b -u -p $HOME/anaconda3 @@ -99,6 +117,7 @@ PIP_PKGS=( cryptography google-api-python-client google-oauth + "adlfs[abfs]" ) # Install uv @@ -114,15 +133,11 @@ uv pip install --system --no-cache-dir --index-strategy unsafe-best-match \ -c $HOME/requirements_compiled.txt \ "${PIP_PKGS[@]}" -# To avoid the following error on Jenkins: -# AttributeError: 'numpy.ufunc' object has no attribute '__module__' -uv pip uninstall --system dask - # We install cmake temporarily to get psutil sudo apt-get autoremove -y cmake zlib1g-dev # We keep g++ on GPU images, because uninstalling removes CUDA Devel tooling -if [[ "$BASE_IMAGE" == "ubuntu:22.04" && "$HOSTTYPE" == "x86_64" ]]; then +if [[ ! 
-d /usr/local/cuda ]]; then sudo apt-get autoremove -y g++ fi diff --git a/ci/docker/ray.cpu.base.wanda.yaml b/docker/base-deps/cpu.wanda.yaml similarity index 59% rename from ci/docker/ray.cpu.base.wanda.yaml rename to docker/base-deps/cpu.wanda.yaml index 895605ed8f71..ecb8f1c3f1e9 100644 --- a/ci/docker/ray.cpu.base.wanda.yaml +++ b/docker/base-deps/cpu.wanda.yaml @@ -1,4 +1,4 @@ -name: "ray-py$PYTHON_VERSION-cpu-base" +name: "ray-py$PYTHON_VERSION-cpu-base$ARCH_SUFFIX" froms: ["ubuntu:22.04"] dockerfile: docker/base-deps/Dockerfile srcs: @@ -7,4 +7,4 @@ build_args: - PYTHON_VERSION - BASE_IMAGE=ubuntu:22.04 tags: - - cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cpu-base + - cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cpu-base$ARCH_SUFFIX diff --git a/ci/docker/ray.cuda.base.wanda.yaml b/docker/base-deps/cuda.wanda.yaml similarity index 79% rename from ci/docker/ray.cuda.base.wanda.yaml rename to docker/base-deps/cuda.wanda.yaml index 0bcd7611c921..44b47fc0dde2 100644 --- a/ci/docker/ray.cuda.base.wanda.yaml +++ b/docker/base-deps/cuda.wanda.yaml @@ -1,4 +1,4 @@ -name: "ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base" +name: "ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base$ARCH_SUFFIX" froms: ["nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04"] dockerfile: docker/base-deps/Dockerfile srcs: @@ -7,4 +7,4 @@ build_args: - PYTHON_VERSION - BASE_IMAGE=nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04 tags: - - cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base + - cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base$ARCH_SUFFIX diff --git a/docker/base-extra-testdeps/cpu.wanda.yaml b/docker/base-extra-testdeps/cpu.wanda.yaml new file mode 100644 index 000000000000..dbfa11ce5b11 --- /dev/null +++ b/docker/base-extra-testdeps/cpu.wanda.yaml @@ -0,0 +1,10 @@ +name: "$IMAGE_TYPE-py$PYTHON_VERSION-cpu-base-extra-testdeps" +froms: ["cr.ray.io/rayproject/$IMAGE_TYPE-py$PYTHON_VERSION-cpu-base-extra"] +dockerfile: release/ray_release/byod/byod.Dockerfile +srcs: + - 
release/ray_release/byod/requirements_byod_$PYTHON_VERSION.txt +build_args: + - BASE_IMAGE=cr.ray.io/rayproject/$IMAGE_TYPE-py$PYTHON_VERSION-cpu-base-extra + - PIP_REQUIREMENTS=release/ray_release/byod/$REQUIREMENTS_FILE +tags: + - cr.ray.io/rayproject/$IMAGE_TYPE-py$PYTHON_VERSION-cpu-base-extra-testdeps diff --git a/docker/base-extra-testdeps/cuda.wanda.yaml b/docker/base-extra-testdeps/cuda.wanda.yaml new file mode 100644 index 000000000000..c27e49f812dd --- /dev/null +++ b/docker/base-extra-testdeps/cuda.wanda.yaml @@ -0,0 +1,10 @@ +name: "$IMAGE_TYPE-py$PYTHON_VERSION-cu$CUDA_VERSION-base-extra-testdeps" +froms: ["cr.ray.io/rayproject/$IMAGE_TYPE-py$PYTHON_VERSION-cu$CUDA_VERSION-base-extra"] +dockerfile: release/ray_release/byod/byod.Dockerfile +srcs: + - release/ray_release/byod/$REQUIREMENTS_FILE +build_args: + - BASE_IMAGE=cr.ray.io/rayproject/$IMAGE_TYPE-py$PYTHON_VERSION-cu$CUDA_VERSION-base-extra + - PIP_REQUIREMENTS=release/ray_release/byod/$REQUIREMENTS_FILE +tags: + - cr.ray.io/rayproject/$IMAGE_TYPE-py$PYTHON_VERSION-cu$CUDA_VERSION-base-extra-testdeps diff --git a/docker/base-extra/Dockerfile b/docker/base-extra/Dockerfile new file mode 100644 index 000000000000..02b851b2f594 --- /dev/null +++ b/docker/base-extra/Dockerfile @@ -0,0 +1,204 @@ +# syntax=docker/dockerfile:1.3-labs + +ARG BASE_IMAGE="rayproject/ray:latest" + +FROM "$BASE_IMAGE" + +ENV TERM=xterm + +ARG SSH_PORT=5020 + +RUN </dev/stderr + exit 1 +fi + +# Create boto config; makes gsutil happy. 
+echo "[GoogleCompute]" > "${HOME}/.boto" +echo "service_account = default" >> "${HOME}/.boto" +chmod 600 "${HOME}/.boto" + +if [[ "$ARCH" == "x86_64" ]]; then + sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub + sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub +else + sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/arm64/7fa2af80.pub + # Nvidia does not have machine-learning repo for arm64 +fi + +echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \ + | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list +wget -O - https://packages.cloud.google.com/apt/doc/apt-key.gpg \ + | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - + +# Add gdb since ray dashboard uses `memray attach`, which requires gdb. + +APT_PKGS=( + google-cloud-sdk + supervisor + vim + zsh + nfs-common + zip + unzip + build-essential + ssh + curl + gdb +) + +sudo apt-get update -y +sudo apt-get install -y "${APT_PKGS[@]}" +sudo apt-get autoclean + +# Install azcopy +AZCOPY_VERSION="10.30.0" +AZCOPY_TMP="$(mktemp -d)" +( + cd "${AZCOPY_TMP}" + if [[ "$ARCH" == "x86_64" ]]; then + curl -sSfL "https://github.com/Azure/azure-storage-azcopy/releases/download/v${AZCOPY_VERSION}/azcopy_linux_amd64_${AZCOPY_VERSION}.tar.gz" \ + -o- | tar -xz "azcopy_linux_amd64_${AZCOPY_VERSION}/azcopy" + sudo mv "azcopy_linux_amd64_${AZCOPY_VERSION}/azcopy" /usr/local/bin/azcopy + else + curl -sSfL "https://github.com/Azure/azure-storage-azcopy/releases/download/v${AZCOPY_VERSION}/azcopy_linux_arm64_${AZCOPY_VERSION}.tar.gz" \ + -o- | tar -xz "azcopy_linux_arm64_${AZCOPY_VERSION}/azcopy" + sudo mv "azcopy_linux_arm64_${AZCOPY_VERSION}/azcopy" /usr/local/bin/azcopy + fi +) +rm -rf "${AZCOPY_TMP}" + +# Install dynolog, only on x86_64 machines. 
+if [[ "$ARCH" == "x86_64" ]]; then + DYNOLOG_TMP="$(mktemp -d)" + ( + cd "${DYNOLOG_TMP}" + curl -sSL https://github.com/facebookincubator/dynolog/releases/download/v0.3.2/dynolog_0.3.2-0-amd64.deb -o dynolog_0.3.2-0-amd64.deb + sudo dpkg -i dynolog_0.3.2-0-amd64.deb + ) + rm -rf "${DYNOLOG_TMP}" +fi + +# Python dependencies to install. To specify a version, please make the change +# in OSS ray repository, but not here. +PYTHON_REQUIREMENTS=( + azure-identity + jupyterlab + ipywidgets + grpcio + grpcio-tools + + # Pinning jupyter_server_terminals==0.4.4 , the higher version will break the + # webterminal when using an older version of terminado. + jupyter_server_terminals==0.4.4 + + # [backend] is for installing anyscale CLI for use in the anyscale cloud. + "anyscale[backend]" +) + + +PYTHON_VERSION="$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')" + +uv pip install --system --no-cache-dir --index-strategy unsafe-best-match \ + -c /home/ray/requirements_compiled.txt \ + "${PYTHON_REQUIREMENTS[@]}" + +# Install awscli v2 +AWSCLI_TMP="$(mktemp -d)" +( + cd "${AWSCLI_TMP}" + curl -sfL "https://awscli.amazonaws.com/awscli-exe-linux-${ARCH}.zip" -o "awscliv2.zip" + unzip -q awscliv2.zip + sudo ./aws/install +) +rm -rf "${AWSCLI_TMP}" + +# Cleanup unused packages and caches. +$HOME/anaconda3/bin/conda clean -y -all + +# Work around for https://bugs.launchpad.net/ubuntu/+source/openssh/+bug/45234 +sudo mkdir -p /var/run/sshd +# Configure ssh port +echo Port $SSH_PORT | sudo tee -a /etc/ssh/sshd_config + +if [[ ! 
-d /usr/local/cuda ]]; then + EFA_VERSION="1.42.0" + GDRCOPY_VERSION="" + AWS_OFI_NCCL_VERSION="" +elif [[ -d "/usr/local/cuda-11" ]]; then + EFA_VERSION="1.28.0" + GDRCOPY_VERSION="2.4" + AWS_OFI_NCCL_VERSION="1.7.3-aws" +elif [[ -d "/usr/local/cuda-12" ]]; then + EFA_VERSION="1.42.0" + GDRCOPY_VERSION="2.5" + AWS_OFI_NCCL_VERSION="1.15.0" +else + echo "Unsupported CUDA major version" + exit 1 +fi + +# Install EFA +wget -q "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_VERSION}.tar.gz" -O "/tmp/aws-efa-installer-${EFA_VERSION}.tar.gz" +wget -q "https://efa-installer.amazonaws.com/aws-efa-installer.key" -O /tmp/aws-efa-installer.key && gpg --import /tmp/aws-efa-installer.key +gpg --fingerprint > /etc/sudoers + +# Install uv +curl -sSL -o- https://astral.sh/uv/install.sh | env UV_UNMANAGED_INSTALL="/usr/local/bin" sh + +# Determine the architecture of the host +if [[ "${HOSTTYPE}" =~ ^x86_64 ]]; then + ARCH="x86_64" +elif [[ "${HOSTTYPE}" =~ ^aarch64 ]]; then + ARCH="aarch64" +else + echo "Unsupported architecture ${HOSTTYPE}" >/dev/stderr + exit 1 +fi + +# Install dynolog +if [[ "$ARCH" == "x86_64" ]]; then + DYNOLOG_TMP="$(mktemp -d)" + ( + cd "${DYNOLOG_TMP}" + curl -sSL https://github.com/facebookincubator/dynolog/releases/download/v0.3.2/dynolog_0.3.2-0-amd64.deb -o dynolog_0.3.2-0-amd64.deb + sudo dpkg -i dynolog_0.3.2-0-amd64.deb + ) + rm -rf "${DYNOLOG_TMP}" +fi + +# Install azcopy +AZCOPY_VERSION="10.30.0" +AZCOPY_TMP="$(mktemp -d)" +( + cd "${AZCOPY_TMP}" + if [[ "$ARCH" == "x86_64" ]]; then + curl -sSfL "https://github.com/Azure/azure-storage-azcopy/releases/download/v${AZCOPY_VERSION}/azcopy_linux_amd64_${AZCOPY_VERSION}.tar.gz" \ + -o- | tar -xz "azcopy_linux_amd64_${AZCOPY_VERSION}/azcopy" + sudo mv "azcopy_linux_amd64_${AZCOPY_VERSION}/azcopy" /usr/local/bin/azcopy + else + curl -sSfL "https://github.com/Azure/azure-storage-azcopy/releases/download/v${AZCOPY_VERSION}/azcopy_linux_arm64_${AZCOPY_VERSION}.tar.gz" \ + -o- | tar -xz 
"azcopy_linux_arm64_${AZCOPY_VERSION}/azcopy" + sudo mv "azcopy_linux_arm64_${AZCOPY_VERSION}/azcopy" /usr/local/bin/azcopy + fi +) +rm -rf "${AZCOPY_TMP}" + +# Install awscli +AWSCLI_TMP="$(mktemp -d)" +( + cd "${AWSCLI_TMP}" + curl -sfL "https://awscli.amazonaws.com/awscli-exe-linux-${ARCH}.zip" -o "awscliv2.zip" + unzip -q awscliv2.zip + sudo ./aws/install +) +rm -rf "${AWSCLI_TMP}" +aws --version + +EOF + +# Switch to ray user +USER ray +ENV HOME=/home/ray +WORKDIR /home/ray + +COPY python/requirements_compiled.txt /home/ray/requirements_compiled.txt + +RUN < /home/ray/pip-freeze.txt @@ -76,7 +76,7 @@ sudo apt-get install -y kmod pkg-config librdmacm-dev cmake --no-questions ) -UCX_VERSION="1.18.1" +UCX_VERSION="1.19.0" ( echo "Installing UCX ${UCX_VERSION}" cd "${TEMP_DIR}" @@ -109,7 +109,7 @@ UCX_VERSION="1.18.1" ) # Keep in sync with llm-requirements.txt -NIXL_VERSION="0.3.1" +NIXL_VERSION="0.4.1" ( echo "Installing NIXL ${NIXL_VERSION}" # NIXL needs meson pybind11 ninja, but should have been included in requirements_*.txt @@ -190,16 +190,6 @@ sudo apt-get clean EOF -# Q: Why add paths that don't exist in the base image, like /usr/local/nvidia/lib64 -# and /usr/local/nvidia/bin? -# A: The NVIDIA GPU operator version used by GKE injects these into the container -# after it's mounted to a pod. 
-# Issue is tracked here: -# https://github.com/GoogleCloudPlatform/compute-gpu-installation/issues/46 -# More context here: -# https://github.com/NVIDIA/nvidia-container-toolkit/issues/275 -# and here: -# https://gitlab.com/nvidia/container-images/cuda/-/issues/27 -ENV PATH="${PATH}:${UCX_HOME}/bin:${NIXL_HOME}/bin:/usr/local/nvidia/bin" -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${UCX_HOME}/lib:${NIXL_HOME}/lib/x86_64-linux-gnu:/usr/local/nvidia/lib64" +ENV PATH="${PATH}:${UCX_HOME}/bin:${NIXL_HOME}/bin" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${UCX_HOME}/lib:${NIXL_HOME}/lib/x86_64-linux-gnu" ENV NIXL_PLUGIN_DIR="${NIXL_HOME}/lib/x86_64-linux-gnu/plugins/" diff --git a/ci/docker/ray-llm.base.wanda.yaml b/docker/ray-llm/cuda.wanda.yaml similarity index 86% rename from ci/docker/ray-llm.base.wanda.yaml rename to docker/ray-llm/cuda.wanda.yaml index ad7db3ea04ec..f1f91c738382 100644 --- a/ci/docker/ray-llm.base.wanda.yaml +++ b/docker/ray-llm/cuda.wanda.yaml @@ -3,7 +3,7 @@ froms: ["cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base"] dockerfile: docker/ray-llm/Dockerfile srcs: - python/requirements.txt - - python/requirements_compiled_rayllm_py311_cu128.txt + - python/deplocks/llm/rayllm_py311_cu128.lock build_args: - BASE_IMAGE=cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base tags: diff --git a/ci/docker/ray-ml.cpu.base.wanda.yaml b/docker/ray-ml/cpu.wanda.yaml similarity index 100% rename from ci/docker/ray-ml.cpu.base.wanda.yaml rename to docker/ray-ml/cpu.wanda.yaml diff --git a/ci/docker/ray-ml.cuda.base.wanda.yaml b/docker/ray-ml/cuda.wanda.yaml similarity index 100% rename from ci/docker/ray-ml.cuda.base.wanda.yaml rename to docker/ray-ml/cuda.wanda.yaml diff --git a/gen_py_proto.py b/gen_py_proto.py new file mode 100644 index 000000000000..cbc71e0a2542 --- /dev/null +++ b/gen_py_proto.py @@ -0,0 +1,12 @@ +from bazel.gen_extract import gen_extract + +if __name__ == "__main__": + gen_extract( + [ + "ray_py_proto.zip", + ], + 
clear_dir_first=[ + "ray/core/generated", + "ray/serve/generated", + ], + ) diff --git a/java/BUILD.bazel b/java/BUILD.bazel index 357673725614..7833c83d8fa3 100644 --- a/java/BUILD.bazel +++ b/java/BUILD.bazel @@ -6,7 +6,7 @@ load("@rules_java//java:java_binary.bzl", "java_binary") load("@rules_java//java:java_import.bzl", "java_import") load("@rules_java//java:java_library.bzl", "java_library") load("@rules_java//java:java_test.bzl", "java_test") -load("@rules_pkg//pkg:mappings.bzl", "pkg_files") +load("@rules_pkg//pkg:mappings.bzl", "pkg_attributes", "pkg_files") load("@rules_pkg//pkg:zip.bzl", "pkg_zip") load("@rules_proto_grpc//java:defs.bzl", "java_proto_compile") load("@rules_python//python:defs.bzl", "py_binary") @@ -249,22 +249,25 @@ java_test( # More detail please see https://github.com/ray-project/ray/pull/21641. java_proto_compile( name = "common_java_proto", - deps = ["@com_github_ray_project_ray//src/ray/protobuf:common_proto"], + deps = ["@io_ray//src/ray/protobuf:common_proto"], ) java_proto_compile( name = "runtime_env_common_java_proto", - deps = ["@com_github_ray_project_ray//src/ray/protobuf:runtime_env_common_proto"], + deps = [ + "@io_ray//src/ray/protobuf:runtime_env_common_proto", + "@io_ray//src/ray/protobuf/public:runtime_environment_proto", + ], ) java_proto_compile( name = "gcs_java_proto", - deps = ["@com_github_ray_project_ray//src/ray/protobuf:gcs_proto"], + deps = ["@io_ray//src/ray/protobuf:gcs_proto"], ) java_proto_compile( name = "serve_java_proto", - deps = ["@com_github_ray_project_ray//src/ray/protobuf:serve_proto"], + deps = ["@io_ray//src/ray/protobuf:serve_proto"], ) filegroup( @@ -398,39 +401,38 @@ py_binary( deps = ["//bazel:gen_extract"], ) -# Generates the dependencies needed by maven. 
-genrule( - name = "gen_maven_deps", +pkg_files( + name = "maven_deps_files", srcs = [ - ":pom_files.zip", - ":proto_files.zip", ":java_native_deps", ], - outs = ["gen_maven_deps.out"], - cmd = """ - WORK_DIR="$${PWD}" - # Copy native dependencies. - OS_NAME="" - case "$${OSTYPE}" in - linux*) OS_NAME="linux";; - darwin*) OS_NAME="darwin";; - *) echo "$${OSTYPE} is not supported currently"; exit 1;; - esac - NATIVE_DEPS_DIR="$$WORK_DIR/java/runtime/native_dependencies/native/$$OS_NAME" - rm -rf "$$NATIVE_DEPS_DIR" - mkdir -p "$$NATIVE_DEPS_DIR" - echo "# gen_maven_deps" > $@ - for f in $(locations //java:java_native_deps); do - chmod +w "$$f" - cp "$$f" "$$NATIVE_DEPS_DIR" - if [[ "$$OSTYPE" =~ ^darwin ]]; then shasum "$$f" >> $@ ; else sha1sum "$$f" >> $@ ; fi - done - """, - local = 1, - tags = ["no-cache"], + attributes = pkg_attributes(mode = "755"), + prefix = select( + { + "@platforms//os:linux": "runtime/native_dependencies/native/linux", + "@platforms//os:macos": "runtime/native_dependencies/native/darwin", + }, + no_match_error = "Unsupported platform", + ), visibility = ["//visibility:private"], ) +pkg_zip( + name = "maven_deps", + srcs = [ + ":maven_deps_files", + ], + visibility = ["//visibility:private"], +) + +py_binary( + name = "gen_maven_deps", + srcs = ["gen_maven_deps.py"], + data = [":maven_deps.zip"], + visibility = ["//visibility:private"], + deps = ["//bazel:gen_extract"], +) + java_binary( name = "ray_dist", # This rule is used to package all Ray Java code and the third-party dependencies into a diff --git a/java/build-jar-multiplatform.sh b/java/build-jar-multiplatform.sh index 38baefba6e8b..c54d20dc861a 100755 --- a/java/build-jar-multiplatform.sh +++ b/java/build-jar-multiplatform.sh @@ -38,7 +38,7 @@ build_jars() { bazel run ":gen_proto_files" if [[ $bazel_build == "true" ]]; then echo "Starting building java native dependencies for $p" - bazel build ":gen_maven_deps" + bazel run ":gen_maven_deps" echo "Finished building java native 
dependencies for $p" fi echo "Start building jars for $p" diff --git a/java/dependencies.bzl b/java/dependencies.bzl index 21c621af9b07..c19e82bb757f 100644 --- a/java/dependencies.bzl +++ b/java/dependencies.bzl @@ -18,7 +18,7 @@ def gen_java_deps(): "de.ruedigermoeller:fst:2.57", "javax.xml.bind:jaxb-api:2.3.0", "javax.activation:activation:1.1.1", - "org.apache.commons:commons-lang3:3.13.0", + "org.apache.commons:commons-lang3:3.18.0", "org.msgpack:msgpack-core:0.8.20", "org.ow2.asm:asm:6.0", "org.apache.logging.log4j:log4j-api:2.17.1", diff --git a/java/gen_maven_deps.py b/java/gen_maven_deps.py new file mode 100644 index 000000000000..be2e7238c22a --- /dev/null +++ b/java/gen_maven_deps.py @@ -0,0 +1,4 @@ +from bazel.gen_extract import gen_extract + +if __name__ == "__main__": + gen_extract(["java/maven_deps.zip"], sub_dir="java") diff --git a/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java b/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java index 5a8d11f84bcf..65c7c629f388 100644 --- a/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java +++ b/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java @@ -122,10 +122,10 @@ public List getAllActorInfo(JobId jobId, ActorState actorState) { try { Gcs.ActorTableData info = Gcs.ActorTableData.parseFrom(result); UniqueId nodeId = UniqueId.NIL; - if (!info.getAddress().getRayletId().isEmpty()) { + if (!info.getAddress().getNodeId().isEmpty()) { nodeId = UniqueId.fromByteBuffer( - ByteBuffer.wrap(info.getAddress().getRayletId().toByteArray())); + ByteBuffer.wrap(info.getAddress().getNodeId().toByteArray())); } actorInfos.add( new ActorInfo( diff --git a/java/runtime/src/main/java/io/ray/runtime/runtimeenv/RuntimeEnvImpl.java b/java/runtime/src/main/java/io/ray/runtime/runtimeenv/RuntimeEnvImpl.java index 81c1cfde5657..b25566964b9d 100644 --- a/java/runtime/src/main/java/io/ray/runtime/runtimeenv/RuntimeEnvImpl.java +++ 
b/java/runtime/src/main/java/io/ray/runtime/runtimeenv/RuntimeEnvImpl.java @@ -10,7 +10,7 @@ import io.ray.api.exception.RuntimeEnvException; import io.ray.api.runtimeenv.RuntimeEnv; import io.ray.api.runtimeenv.RuntimeEnvConfig; -import io.ray.runtime.generated.RuntimeEnvCommon; +import io.ray.runtime.generated.RuntimeEnvironment; import java.io.IOException; public class RuntimeEnvImpl implements RuntimeEnv { @@ -100,7 +100,7 @@ public boolean isEmpty() { @Override public String serializeToRuntimeEnvInfo() throws RuntimeEnvException { - RuntimeEnvCommon.RuntimeEnvInfo protoRuntimeEnvInfo = GenerateRuntimeEnvInfo(); + RuntimeEnvironment.RuntimeEnvInfo protoRuntimeEnvInfo = GenerateRuntimeEnvInfo(); JsonFormat.Printer printer = JsonFormat.printer(); try { @@ -123,15 +123,15 @@ public RuntimeEnvConfig getConfig() { return get(CONFIG_FIELD_NAME, RuntimeEnvConfig.class); } - public RuntimeEnvCommon.RuntimeEnvInfo GenerateRuntimeEnvInfo() throws RuntimeEnvException { + public RuntimeEnvironment.RuntimeEnvInfo GenerateRuntimeEnvInfo() throws RuntimeEnvException { String serializeRuntimeEnv = serialize(); - RuntimeEnvCommon.RuntimeEnvInfo.Builder protoRuntimeEnvInfoBuilder = - RuntimeEnvCommon.RuntimeEnvInfo.newBuilder(); + RuntimeEnvironment.RuntimeEnvInfo.Builder protoRuntimeEnvInfoBuilder = + RuntimeEnvironment.RuntimeEnvInfo.newBuilder(); protoRuntimeEnvInfoBuilder.setSerializedRuntimeEnv(serializeRuntimeEnv); RuntimeEnvConfig runtimeEnvConfig = getConfig(); if (runtimeEnvConfig != null) { - RuntimeEnvCommon.RuntimeEnvConfig.Builder protoRuntimeEnvConfigBuilder = - RuntimeEnvCommon.RuntimeEnvConfig.newBuilder(); + RuntimeEnvironment.RuntimeEnvConfig.Builder protoRuntimeEnvConfigBuilder = + RuntimeEnvironment.RuntimeEnvConfig.newBuilder(); protoRuntimeEnvConfigBuilder.setSetupTimeoutSeconds( runtimeEnvConfig.getSetupTimeoutSeconds()); protoRuntimeEnvConfigBuilder.setEagerInstall(runtimeEnvConfig.getEagerInstall()); diff --git a/java/test.sh b/java/test.sh index 
fa1864083554..ab8dd0bbd528 100755 --- a/java/test.sh +++ b/java/test.sh @@ -66,7 +66,7 @@ fi echo "Build java maven deps." bazel run //java:gen_pom_files bazel run //java:gen_proto_files -bazel build //java:gen_maven_deps +bazel run //java:gen_maven_deps echo "Build ray core." bazel run //:gen_ray_pkg diff --git a/pyproject.toml b/pyproject.toml index a8287a17047f..c8c2219e451f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,8 @@ extend-exclude = [ "python/build/", "python/ray/workflow/tests/mock_server.py", "python/ray/serve/tests/test_config_files/syntax_error.py", + "rllib/examples/envs/classes/multi_agent/footsies/game/proto/footsies_service_pb2.py", + "rllib/examples/envs/classes/multi_agent/footsies/game/proto/footsies_service_pb2_grpc.py", ] [tool.ruff.lint] @@ -56,26 +58,13 @@ afterray = ["psutil", "setproctitle"] # python/ray/cloudpickle/* # doc/* # python/ray/__init__.py -# python/ray/setup-dev.py # For the rest we will gradually remove them from the blacklist as we # reformat the code to follow the style guide. 
[tool.ruff.lint.per-file-ignores] "doc/*" = ["I"] "python/ray/__init__.py" = ["I"] -"python/ray/setup-dev.py" = ["I"] -"python/ray/cloudpickle/*" = ["I"] -"python/ray/dag/*.py" = ["I"] -"ci/*" = ["I"] -"python/ray/includes/*" = ["I"] -"python/ray/internal/*" = ["I"] -"python/ray/ray_operator/*" = ["I"] -"python/ray/scripts/*" = ["I"] -"python/ray/serve/generated/serve_pb2.py" = ["I"] -"python/ray/streaming/*" = ["I"] -"python/ray/tests/*" = ["I"] -"python/ray/util/*" = ["I"] -"python/ray/workers/*" = ["I"] -"python/ray/workflow/*" = ["I"] +"python/ray/dag/__init__.py" = ["I"] +"python/ray/air/__init__.py" = ["I"] "rllib/*" = ["I"] "release/*" = ["I"] diff --git a/python/build-wheel-macos.sh b/python/build-wheel-macos.sh index 9837a007f8eb..3b8b0103aefc 100755 --- a/python/build-wheel-macos.sh +++ b/python/build-wheel-macos.sh @@ -82,7 +82,7 @@ for ((i=0; i<${#PY_MMS[@]}; ++i)); do # Add the correct Python to the path and build the wheel. This is only # needed so that the installation finds the cython executable. # build ray wheel - $PIP_CMD wheel -q -w dist . --no-deps + $PIP_CMD wheel -v -w dist . --no-deps # build ray-cpp wheel RAY_INSTALL_CPP=1 $PIP_CMD wheel -q -w dist . --no-deps mv dist/*.whl ../.whl/ diff --git a/python/build-wheel-manylinux2014.sh b/python/build-wheel-manylinux2014.sh index 9444765cddbc..d065fb3b6255 100755 --- a/python/build-wheel-manylinux2014.sh +++ b/python/build-wheel-manylinux2014.sh @@ -37,7 +37,6 @@ PYTHON_VERSIONS=( # Setup runtime environment ./ci/build/build-manylinux-forge.sh -source "$HOME"/.nvm/nvm.sh # Compile ray ./ci/build/build-manylinux-ray.sh diff --git a/python/build-wheel-windows.sh b/python/build-wheel-windows.sh index 3a4b22e8b890..623d4db613a2 100755 --- a/python/build-wheel-windows.sh +++ b/python/build-wheel-windows.sh @@ -130,11 +130,11 @@ build_wheel_windows() { exit 1 fi # build ray wheel - python -m pip wheel -q -w dist . --no-deps + python -m pip wheel -v -w dist . 
--no-deps # Pack any needed system dlls like msvcp140.dll delvewheel repair dist/ray-*.whl # build ray-cpp wheel - RAY_INSTALL_CPP=1 python -m pip wheel -q -w dist . --no-deps + RAY_INSTALL_CPP=1 python -m pip wheel -v -w dist . --no-deps # No extra dlls are needed, do not call delvewheel uninstall_ray ) diff --git a/python/requirements_compiled_ray_py311_cpu.txt b/python/deplocks/llm/ray_py311_cpu.lock similarity index 85% rename from python/requirements_compiled_ray_py311_cpu.txt rename to python/deplocks/llm/ray_py311_cpu.lock index 1398bbae57df..9e76d9282fb9 100644 --- a/python/requirements_compiled_ray_py311_cpu.txt +++ b/python/deplocks/llm/ray_py311_cpu.lock @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile --generate-hashes --strip-extras --unsafe-package ray --unsafe-package grpcio-tools --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links -c python/requirements_compiled_ray_test_py311_cpu.txt python/requirements.txt -o python/requirements_compiled_ray_py311_cpu.txt +# uv pip compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --extra-index-url https://download.pytorch.org/whl/cpu --python-version=3.11 --unsafe-package ray --python-platform=linux -c python/deplocks/llm/ray_test_py311_cpu.lock python/requirements.txt -o python/deplocks/llm/ray_py311_cpu.lock --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cpu @@ -7,7 +7,7 @@ aiohappyeyeballs==2.6.1 \ --hash=sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558 \ --hash=sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c 
python/deplocks/llm/ray_test_py311_cpu.lock # aiohttp aiohttp==3.11.16 \ --hash=sha256:004511d3413737700835e949433536a2fe95a7d0297edd911a1e9705c5b5ea43 \ @@ -92,59 +92,77 @@ aiohttp==3.11.16 \ --hash=sha256:fb46bb0f24813e6cede6cc07b1961d4b04f331f7112a23b5e21f567da4ee50aa \ --hash=sha256:fd36c119c5d6551bce374fcb5c19269638f8d09862445f85a5a48596fd59f4bb # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # aiohttp-cors aiohttp-cors==0.7.0 \ --hash=sha256:0451ba59fdf6909d0e2cd21e4c0a43752bc0703d33fc78ae94d9d9321710193e \ --hash=sha256:4d39c6d7100fd9764ed1caf8cebf0eb01bf5e3f24e2e073fda6234bc48b19f5d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt aiorwlock==1.3.0 \ --hash=sha256:45baf8e4fa9a23e0bb325fbd67da80de1fd7ae1d4f59a6381754c60cec7b289b \ --hash=sha256:83f12d87df4b9728a0b8fda1756585ab0d652b107bab59c6084e1b1ad692ab45 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt aiosignal==1.3.1 \ --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ --hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # aiohttp +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # kombu annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c 
python/deplocks/llm/ray_test_py311_cpu.lock # pydantic anyio==3.7.1 \ --hash=sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780 \ --hash=sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # starlette # watchfiles attrs==25.1.0 \ --hash=sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e \ --hash=sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # aiohttp # jsonschema # referencing +billiard==4.2.1 \ + --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # celery cachetools==5.5.2 \ --hash=sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4 \ --hash=sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # google-auth +celery==5.5.3 \ + --hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # -r python/requirements.txt certifi==2025.1.31 \ --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # requests cffi==1.16.0 ; platform_python_implementation != 'PyPy' \ --hash=sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc \ @@ -200,7 +218,7 
@@ cffi==1.16.0 ; platform_python_implementation != 'PyPy' \ --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # cryptography charset-normalizer==3.3.2 \ --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ @@ -294,27 +312,49 @@ charset-normalizer==3.3.2 \ --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # requests click==8.1.7 \ --hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ --hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt + # celery + # click-didyoumean + # click-plugins + # click-repl # typer # uvicorn +click-didyoumean==0.3.1 \ + --hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # celery +click-plugins==1.1.1.2 \ + --hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # celery cloudpickle==2.2.0 \ 
--hash=sha256:3f4219469c55453cfe4737e564b67c2a149109dabf7f242478948b895f61106f \ --hash=sha256:7428798d5926d8fcbfd092d18d01a2a03daf8237d8fcdc8095d256b8490796f0 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # gymnasium colorful==0.5.5 \ --hash=sha256:62c187e27c1433db9463ff93b1451898d1e7e23a7e553583fd9daeb6325182e4 \ --hash=sha256:66f8c1264b2a26f7293b96a03bb7a76c4bc8b9634369a0bffdcd12d618056a1d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt cryptography==44.0.3 \ --hash=sha256:02f55fb4f8b79c1221b0961488eaae21015b69b210e18c386b69de182ebb1259 \ @@ -355,7 +395,7 @@ cryptography==44.0.3 \ --hash=sha256:fc3c9babc1e1faefd62704bb46a69f359a9819eb0292e40df3fb6e3574715cd4 \ --hash=sha256:fe19d8bc5536a91a24a8133328880a41831b6c5df54599a8417b62fe015d3053 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # pyopenssl cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ --hash=sha256:230f8a8e99c81a653baa0ed00819990c0ed1f0cf0298214786b5e323461dc61a \ @@ -371,13 +411,13 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ --hash=sha256:dac0284fecb90b5731f514e569a6fcf6674a730ae95b9490781a713b60a34423 \ --hash=sha256:e7a25ef1b44ae6276b5105affc2289edb34f1aa6676babd5bcd80907348c4cfa # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt distlib==0.3.7 \ --hash=sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057 \ --hash=sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # virtualenv dm-tree==0.1.8 \ --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ @@ -427,19 +467,19 @@ dm-tree==0.1.8 \ 
--hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt farama-notifications==0.0.4 \ --hash=sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18 \ --hash=sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # gymnasium fastapi==0.115.12 \ --hash=sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681 \ --hash=sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:067edb0a0805bf61e17a251d5046af59f6e9d2b8ad01222e0ef7a0b7937d5548 \ @@ -518,13 +558,13 @@ fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:fb5363cf0fddd9b50525ddbf64a1e1b28ec4c6dfb28670a940cb1cf988a6786b \ --hash=sha256:ff75c90663d6e8996610d435e71487daa853871ad1770dd83dc0f2fc4997241e # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # cupy-cuda12x filelock==3.17.0 \ --hash=sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338 \ --hash=sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # virtualenv frozenlist==1.4.1 \ @@ -606,109 +646,105 @@ frozenlist==1.4.1 \ --hash=sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced \ --hash=sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74 # 
via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # aiohttp # aiosignal -fsspec==2023.5.0 \ - --hash=sha256:51a4ad01a5bb66fcc58036e288c0d53d3975a0df2a5dc59a93b59bade0391f2a \ - --hash=sha256:b3b56e00fb93ea321bc9e5d9cf6f8522a0198b20eb24e02774d329e9c6fb84ce +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt google-api-core==2.24.2 \ --hash=sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9 \ --hash=sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # opencensus google-auth==2.23.4 \ --hash=sha256:79905d6b1652187def79d491d6e23d0cbb3a21d3c7ba0dbaa9c8a01906b13ff3 \ --hash=sha256:d4bbc92fe4b8bfd2f3e8d88e5ba7085935da208ee38a134fc280e7ce682a05f2 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # google-api-core googleapis-common-protos==1.61.0 \ --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # google-api-core -grpcio==1.66.2 \ - --hash=sha256:02697eb4a5cbe5a9639f57323b4c37bcb3ab2d48cec5da3dc2f13334d72790dd \ - --hash=sha256:03b0b307ba26fae695e067b94cbb014e27390f8bc5ac7a3a39b7723fed085604 \ - --hash=sha256:05bc2ceadc2529ab0b227b1310d249d95d9001cd106aa4d31e8871ad3c428d73 \ - --hash=sha256:06de8ec0bd71be123eec15b0e0d457474931c2c407869b6c349bd9bed4adbac3 \ - 
--hash=sha256:0be4e0490c28da5377283861bed2941d1d20ec017ca397a5df4394d1c31a9b50 \ - --hash=sha256:12fda97ffae55e6526825daf25ad0fa37483685952b5d0f910d6405c87e3adb6 \ - --hash=sha256:1caa38fb22a8578ab8393da99d4b8641e3a80abc8fd52646f1ecc92bcb8dee34 \ - --hash=sha256:2018b053aa15782db2541ca01a7edb56a0bf18c77efed975392583725974b249 \ - --hash=sha256:20657d6b8cfed7db5e11b62ff7dfe2e12064ea78e93f1434d61888834bc86d75 \ - --hash=sha256:2335c58560a9e92ac58ff2bc5649952f9b37d0735608242973c7a8b94a6437d8 \ - --hash=sha256:31fd163105464797a72d901a06472860845ac157389e10f12631025b3e4d0453 \ - --hash=sha256:38b68498ff579a3b1ee8f93a05eb48dc2595795f2f62716e797dc24774c1aaa8 \ - --hash=sha256:3b00efc473b20d8bf83e0e1ae661b98951ca56111feb9b9611df8efc4fe5d55d \ - --hash=sha256:3ed71e81782966ffead60268bbda31ea3f725ebf8aa73634d5dda44f2cf3fb9c \ - --hash=sha256:45a3d462826f4868b442a6b8fdbe8b87b45eb4f5b5308168c156b21eca43f61c \ - --hash=sha256:49f0ca7ae850f59f828a723a9064cadbed90f1ece179d375966546499b8a2c9c \ - --hash=sha256:4e504572433f4e72b12394977679161d495c4c9581ba34a88d843eaf0f2fbd39 \ - --hash=sha256:4ea1d062c9230278793820146c95d038dc0f468cbdd172eec3363e42ff1c7d01 \ - --hash=sha256:563588c587b75c34b928bc428548e5b00ea38c46972181a4d8b75ba7e3f24231 \ - --hash=sha256:6001e575b8bbd89eee11960bb640b6da6ae110cf08113a075f1e2051cc596cae \ - --hash=sha256:66a0cd8ba6512b401d7ed46bb03f4ee455839957f28b8d61e7708056a806ba6a \ - --hash=sha256:6851de821249340bdb100df5eacfecfc4e6075fa85c6df7ee0eb213170ec8e5d \ - --hash=sha256:728bdf36a186e7f51da73be7f8d09457a03061be848718d0edf000e709418987 \ - --hash=sha256:73e3b425c1e155730273f73e419de3074aa5c5e936771ee0e4af0814631fb30a \ - --hash=sha256:73fc8f8b9b5c4a03e802b3cd0c18b2b06b410d3c1dcbef989fdeb943bd44aff7 \ - --hash=sha256:78fa51ebc2d9242c0fc5db0feecc57a9943303b46664ad89921f5079e2e4ada7 \ - --hash=sha256:7b2c86457145ce14c38e5bf6bdc19ef88e66c5fee2c3d83285c5aef026ba93b3 \ - --hash=sha256:7d69ce1f324dc2d71e40c9261d3fdbe7d4c9d60f332069ff9b2a4d8a257c7b2b \ - 
--hash=sha256:802d84fd3d50614170649853d121baaaa305de7b65b3e01759247e768d691ddf \ - --hash=sha256:80fd702ba7e432994df208f27514280b4b5c6843e12a48759c9255679ad38db8 \ - --hash=sha256:8ac475e8da31484efa25abb774674d837b343afb78bb3bcdef10f81a93e3d6bf \ - --hash=sha256:950da58d7d80abd0ea68757769c9db0a95b31163e53e5bb60438d263f4bed7b7 \ - --hash=sha256:99a641995a6bc4287a6315989ee591ff58507aa1cbe4c2e70d88411c4dcc0839 \ - --hash=sha256:9c3a99c519f4638e700e9e3f83952e27e2ea10873eecd7935823dab0c1c9250e \ - --hash=sha256:9c509a4f78114cbc5f0740eb3d7a74985fd2eff022971bc9bc31f8bc93e66a3b \ - --hash=sha256:a18e20d8321c6400185b4263e27982488cb5cdd62da69147087a76a24ef4e7e3 \ - --hash=sha256:a917d26e0fe980b0ac7bfcc1a3c4ad6a9a4612c911d33efb55ed7833c749b0ee \ - --hash=sha256:a9539f01cb04950fd4b5ab458e64a15f84c2acc273670072abe49a3f29bbad54 \ - --hash=sha256:ad2efdbe90c73b0434cbe64ed372e12414ad03c06262279b104a029d1889d13e \ - --hash=sha256:b672abf90a964bfde2d0ecbce30f2329a47498ba75ce6f4da35a2f4532b7acbc \ - --hash=sha256:bbd27c24a4cc5e195a7f56cfd9312e366d5d61b86e36d46bbe538457ea6eb8dd \ - --hash=sha256:c400ba5675b67025c8a9f48aa846f12a39cf0c44df5cd060e23fda5b30e9359d \ - --hash=sha256:c408f5ef75cfffa113cacd8b0c0e3611cbfd47701ca3cdc090594109b9fcbaed \ - --hash=sha256:c806852deaedee9ce8280fe98955c9103f62912a5b2d5ee7e3eaa284a6d8d8e7 \ - --hash=sha256:ce89f5876662f146d4c1f695dda29d4433a5d01c8681fbd2539afff535da14d4 \ - --hash=sha256:d25a14af966438cddf498b2e338f88d1c9706f3493b1d73b93f695c99c5f0e2a \ - --hash=sha256:d8d4732cc5052e92cea2f78b233c2e2a52998ac40cd651f40e398893ad0d06ec \ - --hash=sha256:d9a9724a156c8ec6a379869b23ba3323b7ea3600851c91489b871e375f710bc8 \ - --hash=sha256:e636ce23273683b00410f1971d209bf3689238cf5538d960adc3cdfe80dd0dbd \ - --hash=sha256:e88264caad6d8d00e7913996030bac8ad5f26b7411495848cc218bd3a9040b6c \ - --hash=sha256:f145cc21836c332c67baa6fc81099d1d27e266401565bf481948010d6ea32d46 \ - --hash=sha256:fb57870449dfcfac428afbb5a877829fcb0d6db9d9baa1148705739e9083880e \ - 
--hash=sha256:fb70487c95786e345af5e854ffec8cb8cc781bcc5df7930c4fbb7feaa72e1cdf \ - --hash=sha256:fe96281713168a3270878255983d2cb1a97e034325c8c2c25169a69289d3ecfa \ - --hash=sha256:ff1f7882e56c40b0d33c4922c15dfa30612f05fb785074a012f7cda74d1c3679 - # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + --hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + 
--hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + --hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + 
--hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt -gymnasium==1.0.0 \ - --hash=sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403 \ - --hash=sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad +gymnasium==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt h11==0.16.0 \ --hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \ --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # uvicorn idna==3.7 \ --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # anyio # requests # yarl @@ -716,37 +752,43 @@ imageio==2.34.2 \ --hash=sha256:5c0c0ee8faa018a1c42f649b90395dd4d3bb6187c09053a0cd6f1fdd51bbff5e \ --hash=sha256:a0bb27ec9d5bab36a9f4835e51b21d2cb099e1f78451441f94687ff3404b79f8 # via - 
# -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # scikit-image importlib-metadata==6.11.0 \ --hash=sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443 \ --hash=sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # opentelemetry-api jinja2==3.1.6 ; sys_platform != 'win32' \ --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \ --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # memray jsonschema==4.23.0 \ --hash=sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4 \ --hash=sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt jsonschema-specifications==2024.10.1 \ --hash=sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272 \ --hash=sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jsonschema +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # celery lazy-loader==0.4 \ --hash=sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # scikit-image lz4==4.3.3 
\ --hash=sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e \ @@ -786,13 +828,13 @@ lz4==4.3.3 \ --hash=sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0 \ --hash=sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt markdown-it-py==2.2.0 \ --hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # rich markupsafe==2.1.3 ; sys_platform != 'win32' \ --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ @@ -856,13 +898,13 @@ markupsafe==2.1.3 ; sys_platform != 'win32' \ --hash=sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2 \ --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jinja2 mdurl==0.1.2 \ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # markdown-it-py memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:0a21745fb516b7a6efcd40aa7487c59e9313fcfc782d0193fcfcf00b48426874 \ @@ -901,7 +943,7 @@ memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:e356af93e3b031c83957e9ac1a653f5aaba5df1e357dd17142f5ed19bb3dc660 \ --hash=sha256:f16c5c8730b616613dc8bafe32649ca6bd7252606251eb00148582011758d0b5 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r 
python/requirements.txt msgpack==1.0.7 \ --hash=sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862 \ @@ -961,7 +1003,7 @@ msgpack==1.0.7 \ --hash=sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002 \ --hash=sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt multidict==6.0.5 \ --hash=sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556 \ @@ -1055,14 +1097,14 @@ multidict==6.0.5 \ --hash=sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423 \ --hash=sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # aiohttp # yarl networkx==3.2.1 \ --hash=sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6 \ --hash=sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # scikit-image numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ @@ -1102,7 +1144,7 @@ numpy==1.26.4 \ --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # cupy-cuda12x # gymnasium @@ -1116,19 +1158,19 @@ opencensus==0.11.4 \ --hash=sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 \ --hash=sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # 
-r python/requirements.txt opencensus-context==0.1.3 \ --hash=sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039 \ --hash=sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # opencensus opentelemetry-api==1.34.1 \ --hash=sha256:64f0bd06d42824843731d05beea88d4d4b6ae59f9fe347ff7dfa2cc14233bbb3 \ --hash=sha256:b7df4cb0830d5a6c29ad0c0691dbae874d8daefa934b8b1d642de48323d32a8c # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus # opentelemetry-sdk @@ -1137,33 +1179,34 @@ opentelemetry-exporter-prometheus==0.55b1 \ --hash=sha256:d13ec0b22bf394113ff1ada5da98133a4b051779b803dae183188e26c4bd9ee0 \ --hash=sha256:f364fbbff9e5de37a112ff104d1185fb1d7e2046c5ab5911e5afebc7ab3ddf0e # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt opentelemetry-proto==1.27.0 \ --hash=sha256:33c9345d91dafd8a74fc3d7576c5a38f18b7fdf8d02983ac67485386132aedd6 \ --hash=sha256:b133873de5581a50063e1e4b29cdcf0c5e253a8c2d8dc1229add20a4c3830ace # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt opentelemetry-sdk==1.34.1 \ --hash=sha256:308effad4059562f1d92163c61c8141df649da24ce361827812c40abb2a1e96e \ --hash=sha256:8091db0d763fcd6098d4781bbc80ff0971f94e260739aa6afe6fd379cdf3aa4d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus opentelemetry-semantic-conventions==0.55b1 \ --hash=sha256:5da81dfdf7d52e3d37f8fe88d5e771e191de924cfff5f550ab0b8f7b2409baed \ 
--hash=sha256:ef95b1f009159c28d7a7849f5cbc71c4c34c845bb514d66adfdf1b3fff3598b3 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # opentelemetry-sdk packaging==23.0 \ --hash=sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2 \ --hash=sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt + # kombu # lazy-loader # scikit-image # tensorboardx @@ -1196,7 +1239,7 @@ pandas==1.5.3 \ --hash=sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae \ --hash=sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt pillow==10.3.0 \ --hash=sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c \ @@ -1269,22 +1312,28 @@ pillow==10.3.0 \ --hash=sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27 \ --hash=sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # imageio # scikit-image platformdirs==3.11.0 \ --hash=sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # virtualenv prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r 
python/requirements.txt # opentelemetry-exporter-prometheus +prompt-toolkit==3.0.41 \ + --hash=sha256:941367d97fc815548822aa26c2a269fdc4eb21e9ec05fc5d447cf09bad5d75f0 \ + --hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # click-repl propcache==0.3.0 \ --hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ --hash=sha256:03acd9ff19021bd0567582ac88f821b66883e158274183b9e5586f678984f8fe \ @@ -1385,14 +1434,14 @@ propcache==0.3.0 \ --hash=sha256:fb91d20fa2d3b13deea98a690534697742029f4fb83673a3501ae6e3746508b5 \ --hash=sha256:fddb8870bdb83456a489ab67c6b3040a8d5a55069aa6f72f9d872235fbc52f54 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # aiohttp # yarl proto-plus==1.22.3 \ --hash=sha256:a49cd903bc0b6ab41f76bf65510439d56ca76f868adf0274e738bfdd096894df \ --hash=sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # google-api-core protobuf==4.25.8 \ --hash=sha256:077ff8badf2acf8bc474406706ad890466274191a48d0abd3bd6987107c9cde5 \ @@ -1407,7 +1456,7 @@ protobuf==4.25.8 \ --hash=sha256:d552c53d0415449c8d17ced5c341caba0d89dbf433698e1436c8fa0aae7808a3 \ --hash=sha256:f4510b93a3bec6eba8fd8f1093e9d7fb0d4a24d1a81377c10c0e5bbfe9e4ed24 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # google-api-core # googleapis-common-protos @@ -1424,7 +1473,7 @@ py-spy==0.4.0 ; python_full_version < '3.12' \ --hash=sha256:eee3d0bde85ca5cf4f01f012d461180ca76c24835a96f7b5c4ded64eb6a008ab \ --hash=sha256:f2cf3f7130e7d780471faa5957441d3b4e0ec39a79b2c00f4c33d494f7728428 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r 
python/requirements.txt pyarrow==19.0.1 \ --hash=sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466 \ @@ -1470,161 +1519,161 @@ pyarrow==19.0.1 \ --hash=sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec \ --hash=sha256:fd44d66093a239358d07c42a91eebf5015aa54fccba959db899f932218ac9cc8 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt pyasn1==0.5.1 \ --hash=sha256:4439847c58d40b1d0a573d07e3856e95333f1976294494c325775aeca506eb58 \ --hash=sha256:6d391a96e59b23130a5cfa74d6fd7f388dbbe26cc8f1edf39fdddf08d9d6676c # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # pyasn1-modules # rsa pyasn1-modules==0.3.0 \ --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # google-auth pycparser==2.21 ; platform_python_implementation != 'PyPy' \ --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # cffi -pydantic==2.10.0 \ - --hash=sha256:0aca0f045ff6e2f097f1fe89521115335f15049eeb8a7bef3dafe4b19a74e289 \ - --hash=sha256:5e7807ba9201bdf61b1b58aa6eb690916c40a47acfb114b1b4fef3e7fd5b30fc +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # fastapi -pydantic-core==2.27.0 \ - 
--hash=sha256:0aa4d1b2eba9a325897308b3124014a142cdccb9f3e016f31d3ebee6b5ea5e75 \ - --hash=sha256:0d06b667e53320332be2bf6f9461f4a9b78092a079b8ce8634c9afaa7e10cd9f \ - --hash=sha256:153017e3d6cd3ce979de06d84343ca424bb6092727375eba1968c8b4693c6ecb \ - --hash=sha256:15e350efb67b855cd014c218716feea4986a149ed1f42a539edd271ee074a196 \ - --hash=sha256:185ef205256cd8b38431205698531026979db89a79587725c1e55c59101d64e9 \ - --hash=sha256:1da0c98a85a6c6ed702d5556db3b09c91f9b0b78de37b7593e2de8d03238807a \ - --hash=sha256:225bfff5d425c34e1fd562cef52d673579d59b967d9de06178850c4802af9039 \ - --hash=sha256:24f984fc7762ed5f806d9e8c4c77ea69fdb2afd987b4fd319ef06c87595a8c55 \ - --hash=sha256:25a7fd4de38f7ff99a37e18fa0098c3140286451bc823d1746ba80cec5b433a1 \ - --hash=sha256:2883b260f7a93235488699d39cbbd94fa7b175d3a8063fbfddd3e81ad9988cb2 \ - --hash=sha256:2a51ce96224eadd1845150b204389623c8e129fde5a67a84b972bd83a85c6c40 \ - --hash=sha256:2be0ad541bb9f059954ccf8877a49ed73877f862529575ff3d54bf4223e4dd61 \ - --hash=sha256:31a2cae5f059329f9cfe3d8d266d3da1543b60b60130d186d9b6a3c20a346361 \ - --hash=sha256:333c840a1303d1474f491e7be0b718226c730a39ead0f7dab2c7e6a2f3855555 \ - --hash=sha256:33d14369739c5d07e2e7102cdb0081a1fa46ed03215e07f097b34e020b83b1ae \ - --hash=sha256:35380671c3c921fe8adf31ad349dc6f7588b7e928dbe44e1093789734f607399 \ - --hash=sha256:359e7951f04ad35111b5ddce184db3391442345d0ab073aa63a95eb8af25a5ef \ - --hash=sha256:36aa167f69d8807ba7e341d67ea93e50fcaaf6bc433bb04939430fa3dab06f31 \ - --hash=sha256:395e3e1148fa7809016231f8065f30bb0dc285a97b4dc4360cd86e17bab58af7 \ - --hash=sha256:3e8d89c276234579cd3d095d5fa2a44eb10db9a218664a17b56363cddf226ff3 \ - --hash=sha256:3eb8849445c26b41c5a474061032c53e14fe92a11a5db969f722a2716cd12206 \ - --hash=sha256:3fd8bc2690e7c39eecdf9071b6a889ce7b22b72073863940edc2a0a23750ca90 \ - --hash=sha256:400bf470e4327e920883b51e255617dfe4496d4e80c3fea0b5a5d0bf2c404dd4 \ - --hash=sha256:4148dc9184ab79e356dc00a4199dc0ee8647973332cb385fc29a7cced49b9f9c \ - 
--hash=sha256:433689845288f9a1ee5714444e65957be26d30915f7745091ede4a83cfb2d7bb \ - --hash=sha256:43b61989068de9ce62296cde02beffabcadb65672207fc51e7af76dca75e6636 \ - --hash=sha256:4523c4009c3f39d948e01962223c9f5538602e7087a628479b723c939fab262d \ - --hash=sha256:483c2213a609e7db2c592bbc015da58b6c75af7360ca3c981f178110d9787bcf \ - --hash=sha256:49633583eb7dc5cba61aaf7cdb2e9e662323ad394e543ee77af265736bcd3eaa \ - --hash=sha256:4b51f964fcbb02949fc546022e56cdb16cda457af485e9a3e8b78ac2ecf5d77e \ - --hash=sha256:4bf1340ae507f6da6360b24179c2083857c8ca7644aab65807023cf35404ea8d \ - --hash=sha256:4fb49cfdb53af5041aba909be00cccfb2c0d0a2e09281bf542371c5fd36ad04c \ - --hash=sha256:510b11e9c3b1a852876d1ccd8d5903684336d635214148637ceb27366c75a467 \ - --hash=sha256:513cb14c0cc31a4dfd849a4674b20c46d87b364f997bbcb02282306f5e187abf \ - --hash=sha256:58560828ee0951bb125c6f2862fbc37f039996d19ceb6d8ff1905abf7da0bf3d \ - --hash=sha256:58ab0d979c969983cdb97374698d847a4acffb217d543e172838864636ef10d9 \ - --hash=sha256:5982048129f40b082c2654de10c0f37c67a14f5ff9d37cf35be028ae982f26df \ - --hash=sha256:5ab325fc86fbc077284c8d7f996d904d30e97904a87d6fb303dce6b3de7ebba9 \ - --hash=sha256:5cc822ab90a70ea3a91e6aed3afac570b276b1278c6909b1d384f745bd09c714 \ - --hash=sha256:5f2b19b8d6fca432cb3acf48cf5243a7bf512988029b6e6fd27e9e8c0a204d85 \ - --hash=sha256:5fc72fbfebbf42c0856a824b8b0dc2b5cd2e4a896050281a21cfa6fed8879cb1 \ - --hash=sha256:6354e18a9be37bfa124d6b288a87fb30c673745806c92956f1a25e3ae6e76b96 \ - --hash=sha256:678f66462058dd978702db17eb6a3633d634f7aa0deaea61e0a674152766d3fc \ - --hash=sha256:68950bc08f9735306322bfc16a18391fcaac99ded2509e1cc41d03ccb6013cfe \ - --hash=sha256:68ef5377eb582fa4343c9d0b57a5b094046d447b4c73dd9fbd9ffb216f829e7d \ - --hash=sha256:6b4c19525c3538fbc0bbda6229f9682fb8199ce9ac37395880e6952798e00373 \ - --hash=sha256:6bb69bf3b6500f195c3deb69c1205ba8fc3cb21d1915f1f158a10d6b1ef29b6a \ - --hash=sha256:6e19401742ed7b69e51d8e4df3c03ad5ec65a83b36244479fd70edde2828a5d9 \ - 
--hash=sha256:6f4a53af9e81d757756508b57cae1cf28293f0f31b9fa2bfcb416cc7fb230f9d \ - --hash=sha256:6fda87808429c520a002a85d6e7cdadbf58231d60e96260976c5b8f9a12a8e13 \ - --hash=sha256:78f841523729e43e3928a364ec46e2e3f80e6625a4f62aca5c345f3f626c6e8a \ - --hash=sha256:7a6ebfac28fd51890a61df36ef202adbd77d00ee5aca4a3dadb3d9ed49cfb929 \ - --hash=sha256:7b0202ebf2268954090209a84f9897345719e46a57c5f2c9b7b250ca0a9d3e63 \ - --hash=sha256:8117839a9bdbba86e7f9df57018fe3b96cec934c3940b591b0fd3fbfb485864a \ - --hash=sha256:82e1ad4ca170e8af4c928b67cff731b6296e6a0a0981b97b2eb7c275cc4e15bd \ - --hash=sha256:836a4bfe0cc6d36dc9a9cc1a7b391265bf6ce9d1eb1eac62ac5139f5d8d9a6fa \ - --hash=sha256:84af1cf7bfdcbc6fcf5a5f70cc9896205e0350306e4dd73d54b6a18894f79386 \ - --hash=sha256:84e35afd9e10b2698e6f2f32256678cb23ca6c1568d02628033a837638b3ed12 \ - --hash=sha256:884f1806609c2c66564082540cffc96868c5571c7c3cf3a783f63f2fb49bd3cd \ - --hash=sha256:8a150392102c402c538190730fda06f3bce654fc498865579a9f2c1d2b425833 \ - --hash=sha256:8e21d927469d04b39386255bf00d0feedead16f6253dcc85e9e10ddebc334084 \ - --hash=sha256:8e96ca781e0c01e32115912ebdf7b3fb0780ce748b80d7d28a0802fa9fbaf44e \ - --hash=sha256:8ee4c2a75af9fe21269a4a0898c5425afb01af1f5d276063f57e2ae1bc64e191 \ - --hash=sha256:91bc66f878557313c2a6bcf396e7befcffe5ab4354cfe4427318968af31143c3 \ - --hash=sha256:951e71da6c89d354572098bada5ba5b5dc3a9390c933af8a614e37755d3d1840 \ - --hash=sha256:99b2863c1365f43f74199c980a3d40f18a218fbe683dd64e470199db426c4d6a \ - --hash=sha256:9a8fbf506fde1529a1e3698198fe64bfbe2e0c09557bc6a7dcf872e7c01fec40 \ - --hash=sha256:9ce048deb1e033e7a865ca384770bccc11d44179cf09e5193a535c4c2f497bdc \ - --hash=sha256:9fe94d9d2a2b4edd7a4b22adcd45814b1b59b03feb00e56deb2e89747aec7bfe \ - --hash=sha256:a291d0b4243a259c8ea7e2b84eb9ccb76370e569298875a7c5e3e71baf49057a \ - --hash=sha256:a5c022bb0d453192426221605efc865373dde43b17822a264671c53b068ac20c \ - --hash=sha256:abb4785894936d7682635726613c44578c420a096729f1978cd061a7e72d5275 \ - 
--hash=sha256:b872c86d8d71827235c7077461c502feb2db3f87d9d6d5a9daa64287d75e4fa0 \ - --hash=sha256:bf37b72834e7239cf84d4a0b2c050e7f9e48bced97bad9bdf98d26b8eb72e846 \ - --hash=sha256:c0c431e4be5c1a0c6654e0c31c661cd89e0ca956ef65305c3c3fd96f4e72ca39 \ - --hash=sha256:c5726eec789ee38f2c53b10b1821457b82274f81f4f746bb1e666d8741fcfadb \ - --hash=sha256:c6fcb3fa3855d583aa57b94cf146f7781d5d5bc06cb95cb3afece33d31aac39b \ - --hash=sha256:c86679f443e7085ea55a7376462553996c688395d18ef3f0d3dbad7838f857a2 \ - --hash=sha256:c91e3c04f5191fd3fb68764bddeaf02025492d5d9f23343b283870f6ace69708 \ - --hash=sha256:c921ad596ff1a82f9c692b0758c944355abc9f0de97a4c13ca60ffc6d8dc15d4 \ - --hash=sha256:c9ed88b398ba7e3bad7bd64d66cc01dcde9cfcb7ec629a6fd78a82fa0b559d78 \ - --hash=sha256:cd2ac6b919f7fed71b17fe0b4603c092a4c9b5bae414817c9c81d3c22d1e1bcc \ - --hash=sha256:d28ca7066d6cdd347a50d8b725dc10d9a1d6a1cce09836cf071ea6a2d4908be0 \ - --hash=sha256:d29e235ce13c91902ef3efc3d883a677655b3908b1cbc73dee816e5e1f8f7739 \ - --hash=sha256:d8b5ee4ae9170e2775d495b81f414cc20268041c42571530513496ba61e94ba3 \ - --hash=sha256:db72e40628967f6dc572020d04b5f800d71264e0531c6da35097e73bdf38b003 \ - --hash=sha256:df45c4073bed486ea2f18757057953afed8dd77add7276ff01bccb79982cf46c \ - --hash=sha256:dfa5f5c0a4c8fced1422dc2ca7eefd872d5d13eb33cf324361dbf1dbfba0a9fe \ - --hash=sha256:e015833384ca3e1a0565a79f5d953b0629d9138021c27ad37c92a9fa1af7623c \ - --hash=sha256:e15315691fe2253eb447503153acef4d7223dfe7e7702f9ed66539fcd0c43801 \ - --hash=sha256:e65466b31be1070b4a5b7dbfbd14b247884cb8e8b79c64fb0f36b472912dbaea \ - --hash=sha256:e7820bb0d65e3ce1e3e70b6708c2f66143f55912fa02f4b618d0f08b61575f12 \ - --hash=sha256:e851a051f7260e6d688267eb039c81f05f23a19431bd7dfa4bf5e3cb34c108cd \ - --hash=sha256:e9f9feee7f334b72ceae46313333d002b56f325b5f04271b4ae2aadd9e993ae4 \ - --hash=sha256:eb40f828bc2f73f777d1eb8fee2e86cd9692a4518b63b6b5aa8af915dfd3207b \ - --hash=sha256:eb704155e73b833801c247f39d562229c0303f54770ca14fb1c053acb376cf10 \ - 
--hash=sha256:edb1bfd45227dec8d50bc7c7d86463cd8728bcc574f9b07de7369880de4626a3 \ - --hash=sha256:ee7d9d5537daf6d5c74a83b38a638cc001b648096c1cae8ef695b0c919d9d379 \ - --hash=sha256:f57783fbaf648205ac50ae7d646f27582fc706be3977e87c3c124e7a92407b10 \ - --hash=sha256:ff63a92f6e249514ef35bc795de10745be0226eaea06eb48b4bbeaa0c8850a4a - # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt +pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ + 
--hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + --hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + 
--hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + --hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + --hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + 
--hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + --hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + --hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + 
--hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + --hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock # pydantic pygments==2.18.0 \ --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # rich pyopenssl==25.0.0 \ --hash=sha256:424c247065e46e76a37411b9ab1782541c23bb658bf003772c3405fbaa128e90 \ --hash=sha256:cd2cef799efa3936bb08e8ccb9433a575722b9dd986023f1cabc4ae64e9dac16 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt python-dateutil==2.8.2 \ --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # celery # pandas pytz==2022.7.1 \ --hash=sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0 \ --hash=sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # pandas pyyaml==6.0.1 \ 
--hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ @@ -1679,27 +1728,27 @@ pyyaml==6.0.1 \ --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \ --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt referencing==0.36.2 \ --hash=sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa \ --hash=sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jsonschema # jsonschema-specifications requests==2.32.3 \ --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # google-api-core rich==13.3.2 \ --hash=sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001 \ --hash=sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # memray # typer @@ -1808,14 +1857,14 @@ rpds-py==0.22.3 \ --hash=sha256:fda7cb070f442bf80b642cd56483b5548e43d366fe3f39b98e67cce780cded00 \ --hash=sha256:feea821ee2a9273771bae61194004ee2fc33f8ec7db08117ef9147d4bbcbca8e # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jsonschema # referencing rsa==4.7.2 \ --hash=sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2 \ --hash=sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9 # via - # -c 
python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # google-auth scikit-image==0.24.0 \ --hash=sha256:18836a18d3a7b6aca5376a2d805f0045826bc6c9fc85331659c33b4813e0b563 \ @@ -1840,7 +1889,7 @@ scikit-image==0.24.0 \ --hash=sha256:ef04360eda372ee5cd60aebe9be91258639c86ae2ea24093fb9182118008d009 \ --hash=sha256:fa27b3a0dbad807b966b8db2d78da734cb812ca4787f7fbb143764800ce2fa9c # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ @@ -1869,64 +1918,64 @@ scipy==1.11.4 \ --hash=sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be \ --hash=sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # scikit-image shellingham==1.5.4 \ --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \ --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # typer six==1.16.0 \ --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # opencensus # python-dateutil smart-open==6.2.0 \ --hash=sha256:088bf00f9327c71e549bc2f86567d3320df5d89667f009ce1c16568976068ef7 \ --hash=sha256:1b4df5c8365218f3852c507451920ccad606c80b0acb4e67508e50ba9b5d2632 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt sniffio==1.3.1 \ 
--hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # anyio starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # fastapi tensorboardx==2.6.2.2 \ --hash=sha256:160025acbf759ede23fd3526ae9d9bfbfd8b68eb16c38a010ebe326dc6395db8 \ --hash=sha256:c6476d7cd0d529b0b72f4acadb1269f9ed8b22f441e87a84f2a3b940bb87b666 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # scikit-image typer==0.12.3 \ --hash=sha256:070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914 \ --hash=sha256:49e73131481d804288ef62598d97a1ceef3058905aa536a1134f90891ba35482 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt typing-extensions==4.12.2 \ --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # fastapi # gymnasium # opentelemetry-api @@ -1937,23 +1986,44 @@ typing-extensions==4.12.2 \ # pyopenssl # 
referencing # typer + # typing-inspection +typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # pydantic +tzdata==2025.2 \ + --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # kombu urllib3==1.26.19 \ --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # requests uvicorn==0.22.0 \ --hash=sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8 \ --hash=sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt +vine==5.1.0 \ + --hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # amqp + # celery + # kombu virtualenv==20.29.1 \ --hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ --hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt watchfiles==0.19.0 \ --hash=sha256:0089c6dc24d436b373c3c57657bf4f9a453b13767150d17284fc6162b2791911 \ @@ -1979,8 +2049,14 @@ watchfiles==0.19.0 \ 
--hash=sha256:d9b073073e048081e502b6c6b0b88714c026a1a4c890569238d04aca5f9ca74b \ --hash=sha256:fac19dc9cbc34052394dbe81e149411a62e71999c0a19e1e09ce537867f95ae0 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt +wcwidth==0.2.13 \ + --hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \ + --hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # prompt-toolkit yarl==1.18.3 \ --hash=sha256:00e5a1fea0fd4f5bfa7440a47eff01d9822a65b4488f7cff83155a0f31a2ecba \ --hash=sha256:02ddb6756f8f4517a2d5e99d8b2f272488e18dd0bfbc802f31c16c6c20f22193 \ @@ -2065,11 +2141,11 @@ yarl==1.18.3 \ --hash=sha256:fe57328fbc1bfd0bd0514470ac692630f3901c0ee39052ae47acd1d90a436719 \ --hash=sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # aiohttp zipp==3.19.2 \ --hash=sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19 \ --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # importlib-metadata diff --git a/python/requirements_compiled_ray_py311_cu121.txt b/python/deplocks/llm/ray_py311_cu121.lock similarity index 85% rename from python/requirements_compiled_ray_py311_cu121.txt rename to python/deplocks/llm/ray_py311_cu121.lock index e3e854a33f4d..e9a88445ed84 100644 --- a/python/requirements_compiled_ray_py311_cu121.txt +++ b/python/deplocks/llm/ray_py311_cu121.lock @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile --generate-hashes --strip-extras --unsafe-package ray --unsafe-package grpcio-tools --unsafe-package setuptools --index-url 
https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cu121 --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links -c python/requirements_compiled_ray_test_py311_cu121.txt python/requirements.txt -o python/requirements_compiled_ray_py311_cu121.txt +# uv pip compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --extra-index-url https://download.pytorch.org/whl/cu121 --python-version=3.11 --unsafe-package ray --python-platform=linux -c python/deplocks/llm/ray_test_py311_cu121.lock python/requirements.txt -o python/deplocks/llm/ray_py311_cu121.lock --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cu121 @@ -7,7 +7,7 @@ aiohappyeyeballs==2.6.1 \ --hash=sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558 \ --hash=sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # aiohttp aiohttp==3.11.16 \ --hash=sha256:004511d3413737700835e949433536a2fe95a7d0297edd911a1e9705c5b5ea43 \ @@ -92,59 +92,77 @@ aiohttp==3.11.16 \ --hash=sha256:fb46bb0f24813e6cede6cc07b1961d4b04f331f7112a23b5e21f567da4ee50aa \ --hash=sha256:fd36c119c5d6551bce374fcb5c19269638f8d09862445f85a5a48596fd59f4bb # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # aiohttp-cors aiohttp-cors==0.7.0 \ --hash=sha256:0451ba59fdf6909d0e2cd21e4c0a43752bc0703d33fc78ae94d9d9321710193e \ --hash=sha256:4d39c6d7100fd9764ed1caf8cebf0eb01bf5e3f24e2e073fda6234bc48b19f5d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt aiorwlock==1.3.0 \ 
--hash=sha256:45baf8e4fa9a23e0bb325fbd67da80de1fd7ae1d4f59a6381754c60cec7b289b \ --hash=sha256:83f12d87df4b9728a0b8fda1756585ab0d652b107bab59c6084e1b1ad692ab45 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt aiosignal==1.3.1 \ --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ --hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # aiohttp +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # kombu annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # pydantic anyio==3.7.1 \ --hash=sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780 \ --hash=sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # starlette # watchfiles attrs==25.1.0 \ --hash=sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e \ --hash=sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # aiohttp # jsonschema # referencing +billiard==4.2.1 \ + --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + 
--hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # celery cachetools==5.5.2 \ --hash=sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4 \ --hash=sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # google-auth +celery==5.5.3 \ + --hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # -r python/requirements.txt certifi==2025.1.31 \ --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # requests cffi==1.16.0 ; platform_python_implementation != 'PyPy' \ --hash=sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc \ @@ -200,7 +218,7 @@ cffi==1.16.0 ; platform_python_implementation != 'PyPy' \ --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # cryptography charset-normalizer==3.3.2 \ --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ @@ -294,27 +312,49 @@ charset-normalizer==3.3.2 \ --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c 
python/deplocks/llm/ray_test_py311_cu121.lock # requests click==8.1.7 \ --hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ --hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt + # celery + # click-didyoumean + # click-plugins + # click-repl # typer # uvicorn +click-didyoumean==0.3.1 \ + --hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # celery +click-plugins==1.1.1.2 \ + --hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # celery cloudpickle==2.2.0 \ --hash=sha256:3f4219469c55453cfe4737e564b67c2a149109dabf7f242478948b895f61106f \ --hash=sha256:7428798d5926d8fcbfd092d18d01a2a03daf8237d8fcdc8095d256b8490796f0 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # gymnasium colorful==0.5.5 \ --hash=sha256:62c187e27c1433db9463ff93b1451898d1e7e23a7e553583fd9daeb6325182e4 \ --hash=sha256:66f8c1264b2a26f7293b96a03bb7a76c4bc8b9634369a0bffdcd12d618056a1d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt cryptography==44.0.3 \ --hash=sha256:02f55fb4f8b79c1221b0961488eaae21015b69b210e18c386b69de182ebb1259 \ @@ -355,7 
+395,7 @@ cryptography==44.0.3 \ --hash=sha256:fc3c9babc1e1faefd62704bb46a69f359a9819eb0292e40df3fb6e3574715cd4 \ --hash=sha256:fe19d8bc5536a91a24a8133328880a41831b6c5df54599a8417b62fe015d3053 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # pyopenssl cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ --hash=sha256:230f8a8e99c81a653baa0ed00819990c0ed1f0cf0298214786b5e323461dc61a \ @@ -371,13 +411,13 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ --hash=sha256:dac0284fecb90b5731f514e569a6fcf6674a730ae95b9490781a713b60a34423 \ --hash=sha256:e7a25ef1b44ae6276b5105affc2289edb34f1aa6676babd5bcd80907348c4cfa # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt distlib==0.3.7 \ --hash=sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057 \ --hash=sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # virtualenv dm-tree==0.1.8 \ --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ @@ -427,19 +467,19 @@ dm-tree==0.1.8 \ --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt farama-notifications==0.0.4 \ --hash=sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18 \ --hash=sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # gymnasium fastapi==0.115.12 \ 
--hash=sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681 \ --hash=sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:067edb0a0805bf61e17a251d5046af59f6e9d2b8ad01222e0ef7a0b7937d5548 \ @@ -518,13 +558,13 @@ fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:fb5363cf0fddd9b50525ddbf64a1e1b28ec4c6dfb28670a940cb1cf988a6786b \ --hash=sha256:ff75c90663d6e8996610d435e71487daa853871ad1770dd83dc0f2fc4997241e # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # cupy-cuda12x filelock==3.17.0 \ --hash=sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338 \ --hash=sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # virtualenv frozenlist==1.4.1 \ @@ -606,109 +646,105 @@ frozenlist==1.4.1 \ --hash=sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced \ --hash=sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # aiohttp # aiosignal -fsspec==2023.5.0 \ - --hash=sha256:51a4ad01a5bb66fcc58036e288c0d53d3975a0df2a5dc59a93b59bade0391f2a \ - --hash=sha256:b3b56e00fb93ea321bc9e5d9cf6f8522a0198b20eb24e02774d329e9c6fb84ce +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r 
python/requirements.txt google-api-core==2.24.2 \ --hash=sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9 \ --hash=sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # opencensus google-auth==2.23.4 \ --hash=sha256:79905d6b1652187def79d491d6e23d0cbb3a21d3c7ba0dbaa9c8a01906b13ff3 \ --hash=sha256:d4bbc92fe4b8bfd2f3e8d88e5ba7085935da208ee38a134fc280e7ce682a05f2 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # google-api-core googleapis-common-protos==1.61.0 \ --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # google-api-core -grpcio==1.66.2 \ - --hash=sha256:02697eb4a5cbe5a9639f57323b4c37bcb3ab2d48cec5da3dc2f13334d72790dd \ - --hash=sha256:03b0b307ba26fae695e067b94cbb014e27390f8bc5ac7a3a39b7723fed085604 \ - --hash=sha256:05bc2ceadc2529ab0b227b1310d249d95d9001cd106aa4d31e8871ad3c428d73 \ - --hash=sha256:06de8ec0bd71be123eec15b0e0d457474931c2c407869b6c349bd9bed4adbac3 \ - --hash=sha256:0be4e0490c28da5377283861bed2941d1d20ec017ca397a5df4394d1c31a9b50 \ - --hash=sha256:12fda97ffae55e6526825daf25ad0fa37483685952b5d0f910d6405c87e3adb6 \ - --hash=sha256:1caa38fb22a8578ab8393da99d4b8641e3a80abc8fd52646f1ecc92bcb8dee34 \ - --hash=sha256:2018b053aa15782db2541ca01a7edb56a0bf18c77efed975392583725974b249 \ - --hash=sha256:20657d6b8cfed7db5e11b62ff7dfe2e12064ea78e93f1434d61888834bc86d75 \ - --hash=sha256:2335c58560a9e92ac58ff2bc5649952f9b37d0735608242973c7a8b94a6437d8 \ - --hash=sha256:31fd163105464797a72d901a06472860845ac157389e10f12631025b3e4d0453 \ - 
--hash=sha256:38b68498ff579a3b1ee8f93a05eb48dc2595795f2f62716e797dc24774c1aaa8 \ - --hash=sha256:3b00efc473b20d8bf83e0e1ae661b98951ca56111feb9b9611df8efc4fe5d55d \ - --hash=sha256:3ed71e81782966ffead60268bbda31ea3f725ebf8aa73634d5dda44f2cf3fb9c \ - --hash=sha256:45a3d462826f4868b442a6b8fdbe8b87b45eb4f5b5308168c156b21eca43f61c \ - --hash=sha256:49f0ca7ae850f59f828a723a9064cadbed90f1ece179d375966546499b8a2c9c \ - --hash=sha256:4e504572433f4e72b12394977679161d495c4c9581ba34a88d843eaf0f2fbd39 \ - --hash=sha256:4ea1d062c9230278793820146c95d038dc0f468cbdd172eec3363e42ff1c7d01 \ - --hash=sha256:563588c587b75c34b928bc428548e5b00ea38c46972181a4d8b75ba7e3f24231 \ - --hash=sha256:6001e575b8bbd89eee11960bb640b6da6ae110cf08113a075f1e2051cc596cae \ - --hash=sha256:66a0cd8ba6512b401d7ed46bb03f4ee455839957f28b8d61e7708056a806ba6a \ - --hash=sha256:6851de821249340bdb100df5eacfecfc4e6075fa85c6df7ee0eb213170ec8e5d \ - --hash=sha256:728bdf36a186e7f51da73be7f8d09457a03061be848718d0edf000e709418987 \ - --hash=sha256:73e3b425c1e155730273f73e419de3074aa5c5e936771ee0e4af0814631fb30a \ - --hash=sha256:73fc8f8b9b5c4a03e802b3cd0c18b2b06b410d3c1dcbef989fdeb943bd44aff7 \ - --hash=sha256:78fa51ebc2d9242c0fc5db0feecc57a9943303b46664ad89921f5079e2e4ada7 \ - --hash=sha256:7b2c86457145ce14c38e5bf6bdc19ef88e66c5fee2c3d83285c5aef026ba93b3 \ - --hash=sha256:7d69ce1f324dc2d71e40c9261d3fdbe7d4c9d60f332069ff9b2a4d8a257c7b2b \ - --hash=sha256:802d84fd3d50614170649853d121baaaa305de7b65b3e01759247e768d691ddf \ - --hash=sha256:80fd702ba7e432994df208f27514280b4b5c6843e12a48759c9255679ad38db8 \ - --hash=sha256:8ac475e8da31484efa25abb774674d837b343afb78bb3bcdef10f81a93e3d6bf \ - --hash=sha256:950da58d7d80abd0ea68757769c9db0a95b31163e53e5bb60438d263f4bed7b7 \ - --hash=sha256:99a641995a6bc4287a6315989ee591ff58507aa1cbe4c2e70d88411c4dcc0839 \ - --hash=sha256:9c3a99c519f4638e700e9e3f83952e27e2ea10873eecd7935823dab0c1c9250e \ - --hash=sha256:9c509a4f78114cbc5f0740eb3d7a74985fd2eff022971bc9bc31f8bc93e66a3b \ - 
--hash=sha256:a18e20d8321c6400185b4263e27982488cb5cdd62da69147087a76a24ef4e7e3 \ - --hash=sha256:a917d26e0fe980b0ac7bfcc1a3c4ad6a9a4612c911d33efb55ed7833c749b0ee \ - --hash=sha256:a9539f01cb04950fd4b5ab458e64a15f84c2acc273670072abe49a3f29bbad54 \ - --hash=sha256:ad2efdbe90c73b0434cbe64ed372e12414ad03c06262279b104a029d1889d13e \ - --hash=sha256:b672abf90a964bfde2d0ecbce30f2329a47498ba75ce6f4da35a2f4532b7acbc \ - --hash=sha256:bbd27c24a4cc5e195a7f56cfd9312e366d5d61b86e36d46bbe538457ea6eb8dd \ - --hash=sha256:c400ba5675b67025c8a9f48aa846f12a39cf0c44df5cd060e23fda5b30e9359d \ - --hash=sha256:c408f5ef75cfffa113cacd8b0c0e3611cbfd47701ca3cdc090594109b9fcbaed \ - --hash=sha256:c806852deaedee9ce8280fe98955c9103f62912a5b2d5ee7e3eaa284a6d8d8e7 \ - --hash=sha256:ce89f5876662f146d4c1f695dda29d4433a5d01c8681fbd2539afff535da14d4 \ - --hash=sha256:d25a14af966438cddf498b2e338f88d1c9706f3493b1d73b93f695c99c5f0e2a \ - --hash=sha256:d8d4732cc5052e92cea2f78b233c2e2a52998ac40cd651f40e398893ad0d06ec \ - --hash=sha256:d9a9724a156c8ec6a379869b23ba3323b7ea3600851c91489b871e375f710bc8 \ - --hash=sha256:e636ce23273683b00410f1971d209bf3689238cf5538d960adc3cdfe80dd0dbd \ - --hash=sha256:e88264caad6d8d00e7913996030bac8ad5f26b7411495848cc218bd3a9040b6c \ - --hash=sha256:f145cc21836c332c67baa6fc81099d1d27e266401565bf481948010d6ea32d46 \ - --hash=sha256:fb57870449dfcfac428afbb5a877829fcb0d6db9d9baa1148705739e9083880e \ - --hash=sha256:fb70487c95786e345af5e854ffec8cb8cc781bcc5df7930c4fbb7feaa72e1cdf \ - --hash=sha256:fe96281713168a3270878255983d2cb1a97e034325c8c2c25169a69289d3ecfa \ - --hash=sha256:ff1f7882e56c40b0d33c4922c15dfa30612f05fb785074a012f7cda74d1c3679 - # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + 
--hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + 
--hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e + # via + # 
-c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt -gymnasium==1.0.0 \ - --hash=sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403 \ - --hash=sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad +gymnasium==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt h11==0.16.0 \ --hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \ --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # uvicorn idna==3.7 \ --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # anyio # requests # yarl @@ -716,37 +752,43 @@ imageio==2.34.2 \ --hash=sha256:5c0c0ee8faa018a1c42f649b90395dd4d3bb6187c09053a0cd6f1fdd51bbff5e \ --hash=sha256:a0bb27ec9d5bab36a9f4835e51b21d2cb099e1f78451441f94687ff3404b79f8 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # scikit-image importlib-metadata==6.11.0 \ --hash=sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443 \ --hash=sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # opentelemetry-api jinja2==3.1.6 ; sys_platform != 'win32' \ 
--hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \ --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # memray jsonschema==4.23.0 \ --hash=sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4 \ --hash=sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt jsonschema-specifications==2024.10.1 \ --hash=sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272 \ --hash=sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jsonschema +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # celery lazy-loader==0.4 \ --hash=sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # scikit-image lz4==4.3.3 \ --hash=sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e \ @@ -786,13 +828,13 @@ lz4==4.3.3 \ --hash=sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0 \ --hash=sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt markdown-it-py==2.2.0 \ 
--hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # rich markupsafe==2.1.3 ; sys_platform != 'win32' \ --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ @@ -856,13 +898,13 @@ markupsafe==2.1.3 ; sys_platform != 'win32' \ --hash=sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2 \ --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jinja2 mdurl==0.1.2 \ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # markdown-it-py memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:0a21745fb516b7a6efcd40aa7487c59e9313fcfc782d0193fcfcf00b48426874 \ @@ -901,7 +943,7 @@ memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:e356af93e3b031c83957e9ac1a653f5aaba5df1e357dd17142f5ed19bb3dc660 \ --hash=sha256:f16c5c8730b616613dc8bafe32649ca6bd7252606251eb00148582011758d0b5 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt msgpack==1.0.7 \ --hash=sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862 \ @@ -961,7 +1003,7 @@ msgpack==1.0.7 \ --hash=sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002 \ --hash=sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r 
python/requirements.txt multidict==6.0.5 \ --hash=sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556 \ @@ -1055,14 +1097,14 @@ multidict==6.0.5 \ --hash=sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423 \ --hash=sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # aiohttp # yarl networkx==3.2.1 \ --hash=sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6 \ --hash=sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # scikit-image numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ @@ -1102,7 +1144,7 @@ numpy==1.26.4 \ --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # cupy-cuda12x # gymnasium @@ -1116,19 +1158,19 @@ opencensus==0.11.4 \ --hash=sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 \ --hash=sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt opencensus-context==0.1.3 \ --hash=sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039 \ --hash=sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # opencensus opentelemetry-api==1.34.1 \ 
--hash=sha256:64f0bd06d42824843731d05beea88d4d4b6ae59f9fe347ff7dfa2cc14233bbb3 \ --hash=sha256:b7df4cb0830d5a6c29ad0c0691dbae874d8daefa934b8b1d642de48323d32a8c # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus # opentelemetry-sdk @@ -1137,33 +1179,34 @@ opentelemetry-exporter-prometheus==0.55b1 \ --hash=sha256:d13ec0b22bf394113ff1ada5da98133a4b051779b803dae183188e26c4bd9ee0 \ --hash=sha256:f364fbbff9e5de37a112ff104d1185fb1d7e2046c5ab5911e5afebc7ab3ddf0e # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt opentelemetry-proto==1.27.0 \ --hash=sha256:33c9345d91dafd8a74fc3d7576c5a38f18b7fdf8d02983ac67485386132aedd6 \ --hash=sha256:b133873de5581a50063e1e4b29cdcf0c5e253a8c2d8dc1229add20a4c3830ace # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt opentelemetry-sdk==1.34.1 \ --hash=sha256:308effad4059562f1d92163c61c8141df649da24ce361827812c40abb2a1e96e \ --hash=sha256:8091db0d763fcd6098d4781bbc80ff0971f94e260739aa6afe6fd379cdf3aa4d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus opentelemetry-semantic-conventions==0.55b1 \ --hash=sha256:5da81dfdf7d52e3d37f8fe88d5e771e191de924cfff5f550ab0b8f7b2409baed \ --hash=sha256:ef95b1f009159c28d7a7849f5cbc71c4c34c845bb514d66adfdf1b3fff3598b3 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # opentelemetry-sdk packaging==23.0 \ --hash=sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2 \ --hash=sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97 # via - # -c 
python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt + # kombu # lazy-loader # scikit-image # tensorboardx @@ -1196,7 +1239,7 @@ pandas==1.5.3 \ --hash=sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae \ --hash=sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt pillow==10.3.0 \ --hash=sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c \ @@ -1269,22 +1312,28 @@ pillow==10.3.0 \ --hash=sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27 \ --hash=sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # imageio # scikit-image platformdirs==3.11.0 \ --hash=sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # virtualenv prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus +prompt-toolkit==3.0.41 \ + --hash=sha256:941367d97fc815548822aa26c2a269fdc4eb21e9ec05fc5d447cf09bad5d75f0 \ + --hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # click-repl propcache==0.3.0 \ 
--hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ --hash=sha256:03acd9ff19021bd0567582ac88f821b66883e158274183b9e5586f678984f8fe \ @@ -1385,14 +1434,14 @@ propcache==0.3.0 \ --hash=sha256:fb91d20fa2d3b13deea98a690534697742029f4fb83673a3501ae6e3746508b5 \ --hash=sha256:fddb8870bdb83456a489ab67c6b3040a8d5a55069aa6f72f9d872235fbc52f54 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # aiohttp # yarl proto-plus==1.22.3 \ --hash=sha256:a49cd903bc0b6ab41f76bf65510439d56ca76f868adf0274e738bfdd096894df \ --hash=sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # google-api-core protobuf==4.25.8 \ --hash=sha256:077ff8badf2acf8bc474406706ad890466274191a48d0abd3bd6987107c9cde5 \ @@ -1407,7 +1456,7 @@ protobuf==4.25.8 \ --hash=sha256:d552c53d0415449c8d17ced5c341caba0d89dbf433698e1436c8fa0aae7808a3 \ --hash=sha256:f4510b93a3bec6eba8fd8f1093e9d7fb0d4a24d1a81377c10c0e5bbfe9e4ed24 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # google-api-core # googleapis-common-protos @@ -1424,7 +1473,7 @@ py-spy==0.4.0 ; python_full_version < '3.12' \ --hash=sha256:eee3d0bde85ca5cf4f01f012d461180ca76c24835a96f7b5c4ded64eb6a008ab \ --hash=sha256:f2cf3f7130e7d780471faa5957441d3b4e0ec39a79b2c00f4c33d494f7728428 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt pyarrow==19.0.1 \ --hash=sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466 \ @@ -1470,161 +1519,161 @@ pyarrow==19.0.1 \ --hash=sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec \ --hash=sha256:fd44d66093a239358d07c42a91eebf5015aa54fccba959db899f932218ac9cc8 # 
via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt pyasn1==0.5.1 \ --hash=sha256:4439847c58d40b1d0a573d07e3856e95333f1976294494c325775aeca506eb58 \ --hash=sha256:6d391a96e59b23130a5cfa74d6fd7f388dbbe26cc8f1edf39fdddf08d9d6676c # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # pyasn1-modules # rsa pyasn1-modules==0.3.0 \ --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # google-auth pycparser==2.21 ; platform_python_implementation != 'PyPy' \ --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # cffi -pydantic==2.10.0 \ - --hash=sha256:0aca0f045ff6e2f097f1fe89521115335f15049eeb8a7bef3dafe4b19a74e289 \ - --hash=sha256:5e7807ba9201bdf61b1b58aa6eb690916c40a47acfb114b1b4fef3e7fd5b30fc +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # fastapi -pydantic-core==2.27.0 \ - --hash=sha256:0aa4d1b2eba9a325897308b3124014a142cdccb9f3e016f31d3ebee6b5ea5e75 \ - --hash=sha256:0d06b667e53320332be2bf6f9461f4a9b78092a079b8ce8634c9afaa7e10cd9f \ - --hash=sha256:153017e3d6cd3ce979de06d84343ca424bb6092727375eba1968c8b4693c6ecb \ - 
--hash=sha256:15e350efb67b855cd014c218716feea4986a149ed1f42a539edd271ee074a196 \ - --hash=sha256:185ef205256cd8b38431205698531026979db89a79587725c1e55c59101d64e9 \ - --hash=sha256:1da0c98a85a6c6ed702d5556db3b09c91f9b0b78de37b7593e2de8d03238807a \ - --hash=sha256:225bfff5d425c34e1fd562cef52d673579d59b967d9de06178850c4802af9039 \ - --hash=sha256:24f984fc7762ed5f806d9e8c4c77ea69fdb2afd987b4fd319ef06c87595a8c55 \ - --hash=sha256:25a7fd4de38f7ff99a37e18fa0098c3140286451bc823d1746ba80cec5b433a1 \ - --hash=sha256:2883b260f7a93235488699d39cbbd94fa7b175d3a8063fbfddd3e81ad9988cb2 \ - --hash=sha256:2a51ce96224eadd1845150b204389623c8e129fde5a67a84b972bd83a85c6c40 \ - --hash=sha256:2be0ad541bb9f059954ccf8877a49ed73877f862529575ff3d54bf4223e4dd61 \ - --hash=sha256:31a2cae5f059329f9cfe3d8d266d3da1543b60b60130d186d9b6a3c20a346361 \ - --hash=sha256:333c840a1303d1474f491e7be0b718226c730a39ead0f7dab2c7e6a2f3855555 \ - --hash=sha256:33d14369739c5d07e2e7102cdb0081a1fa46ed03215e07f097b34e020b83b1ae \ - --hash=sha256:35380671c3c921fe8adf31ad349dc6f7588b7e928dbe44e1093789734f607399 \ - --hash=sha256:359e7951f04ad35111b5ddce184db3391442345d0ab073aa63a95eb8af25a5ef \ - --hash=sha256:36aa167f69d8807ba7e341d67ea93e50fcaaf6bc433bb04939430fa3dab06f31 \ - --hash=sha256:395e3e1148fa7809016231f8065f30bb0dc285a97b4dc4360cd86e17bab58af7 \ - --hash=sha256:3e8d89c276234579cd3d095d5fa2a44eb10db9a218664a17b56363cddf226ff3 \ - --hash=sha256:3eb8849445c26b41c5a474061032c53e14fe92a11a5db969f722a2716cd12206 \ - --hash=sha256:3fd8bc2690e7c39eecdf9071b6a889ce7b22b72073863940edc2a0a23750ca90 \ - --hash=sha256:400bf470e4327e920883b51e255617dfe4496d4e80c3fea0b5a5d0bf2c404dd4 \ - --hash=sha256:4148dc9184ab79e356dc00a4199dc0ee8647973332cb385fc29a7cced49b9f9c \ - --hash=sha256:433689845288f9a1ee5714444e65957be26d30915f7745091ede4a83cfb2d7bb \ - --hash=sha256:43b61989068de9ce62296cde02beffabcadb65672207fc51e7af76dca75e6636 \ - --hash=sha256:4523c4009c3f39d948e01962223c9f5538602e7087a628479b723c939fab262d \ - 
--hash=sha256:483c2213a609e7db2c592bbc015da58b6c75af7360ca3c981f178110d9787bcf \ - --hash=sha256:49633583eb7dc5cba61aaf7cdb2e9e662323ad394e543ee77af265736bcd3eaa \ - --hash=sha256:4b51f964fcbb02949fc546022e56cdb16cda457af485e9a3e8b78ac2ecf5d77e \ - --hash=sha256:4bf1340ae507f6da6360b24179c2083857c8ca7644aab65807023cf35404ea8d \ - --hash=sha256:4fb49cfdb53af5041aba909be00cccfb2c0d0a2e09281bf542371c5fd36ad04c \ - --hash=sha256:510b11e9c3b1a852876d1ccd8d5903684336d635214148637ceb27366c75a467 \ - --hash=sha256:513cb14c0cc31a4dfd849a4674b20c46d87b364f997bbcb02282306f5e187abf \ - --hash=sha256:58560828ee0951bb125c6f2862fbc37f039996d19ceb6d8ff1905abf7da0bf3d \ - --hash=sha256:58ab0d979c969983cdb97374698d847a4acffb217d543e172838864636ef10d9 \ - --hash=sha256:5982048129f40b082c2654de10c0f37c67a14f5ff9d37cf35be028ae982f26df \ - --hash=sha256:5ab325fc86fbc077284c8d7f996d904d30e97904a87d6fb303dce6b3de7ebba9 \ - --hash=sha256:5cc822ab90a70ea3a91e6aed3afac570b276b1278c6909b1d384f745bd09c714 \ - --hash=sha256:5f2b19b8d6fca432cb3acf48cf5243a7bf512988029b6e6fd27e9e8c0a204d85 \ - --hash=sha256:5fc72fbfebbf42c0856a824b8b0dc2b5cd2e4a896050281a21cfa6fed8879cb1 \ - --hash=sha256:6354e18a9be37bfa124d6b288a87fb30c673745806c92956f1a25e3ae6e76b96 \ - --hash=sha256:678f66462058dd978702db17eb6a3633d634f7aa0deaea61e0a674152766d3fc \ - --hash=sha256:68950bc08f9735306322bfc16a18391fcaac99ded2509e1cc41d03ccb6013cfe \ - --hash=sha256:68ef5377eb582fa4343c9d0b57a5b094046d447b4c73dd9fbd9ffb216f829e7d \ - --hash=sha256:6b4c19525c3538fbc0bbda6229f9682fb8199ce9ac37395880e6952798e00373 \ - --hash=sha256:6bb69bf3b6500f195c3deb69c1205ba8fc3cb21d1915f1f158a10d6b1ef29b6a \ - --hash=sha256:6e19401742ed7b69e51d8e4df3c03ad5ec65a83b36244479fd70edde2828a5d9 \ - --hash=sha256:6f4a53af9e81d757756508b57cae1cf28293f0f31b9fa2bfcb416cc7fb230f9d \ - --hash=sha256:6fda87808429c520a002a85d6e7cdadbf58231d60e96260976c5b8f9a12a8e13 \ - --hash=sha256:78f841523729e43e3928a364ec46e2e3f80e6625a4f62aca5c345f3f626c6e8a \ - 
--hash=sha256:7a6ebfac28fd51890a61df36ef202adbd77d00ee5aca4a3dadb3d9ed49cfb929 \ - --hash=sha256:7b0202ebf2268954090209a84f9897345719e46a57c5f2c9b7b250ca0a9d3e63 \ - --hash=sha256:8117839a9bdbba86e7f9df57018fe3b96cec934c3940b591b0fd3fbfb485864a \ - --hash=sha256:82e1ad4ca170e8af4c928b67cff731b6296e6a0a0981b97b2eb7c275cc4e15bd \ - --hash=sha256:836a4bfe0cc6d36dc9a9cc1a7b391265bf6ce9d1eb1eac62ac5139f5d8d9a6fa \ - --hash=sha256:84af1cf7bfdcbc6fcf5a5f70cc9896205e0350306e4dd73d54b6a18894f79386 \ - --hash=sha256:84e35afd9e10b2698e6f2f32256678cb23ca6c1568d02628033a837638b3ed12 \ - --hash=sha256:884f1806609c2c66564082540cffc96868c5571c7c3cf3a783f63f2fb49bd3cd \ - --hash=sha256:8a150392102c402c538190730fda06f3bce654fc498865579a9f2c1d2b425833 \ - --hash=sha256:8e21d927469d04b39386255bf00d0feedead16f6253dcc85e9e10ddebc334084 \ - --hash=sha256:8e96ca781e0c01e32115912ebdf7b3fb0780ce748b80d7d28a0802fa9fbaf44e \ - --hash=sha256:8ee4c2a75af9fe21269a4a0898c5425afb01af1f5d276063f57e2ae1bc64e191 \ - --hash=sha256:91bc66f878557313c2a6bcf396e7befcffe5ab4354cfe4427318968af31143c3 \ - --hash=sha256:951e71da6c89d354572098bada5ba5b5dc3a9390c933af8a614e37755d3d1840 \ - --hash=sha256:99b2863c1365f43f74199c980a3d40f18a218fbe683dd64e470199db426c4d6a \ - --hash=sha256:9a8fbf506fde1529a1e3698198fe64bfbe2e0c09557bc6a7dcf872e7c01fec40 \ - --hash=sha256:9ce048deb1e033e7a865ca384770bccc11d44179cf09e5193a535c4c2f497bdc \ - --hash=sha256:9fe94d9d2a2b4edd7a4b22adcd45814b1b59b03feb00e56deb2e89747aec7bfe \ - --hash=sha256:a291d0b4243a259c8ea7e2b84eb9ccb76370e569298875a7c5e3e71baf49057a \ - --hash=sha256:a5c022bb0d453192426221605efc865373dde43b17822a264671c53b068ac20c \ - --hash=sha256:abb4785894936d7682635726613c44578c420a096729f1978cd061a7e72d5275 \ - --hash=sha256:b872c86d8d71827235c7077461c502feb2db3f87d9d6d5a9daa64287d75e4fa0 \ - --hash=sha256:bf37b72834e7239cf84d4a0b2c050e7f9e48bced97bad9bdf98d26b8eb72e846 \ - --hash=sha256:c0c431e4be5c1a0c6654e0c31c661cd89e0ca956ef65305c3c3fd96f4e72ca39 \ - 
--hash=sha256:c5726eec789ee38f2c53b10b1821457b82274f81f4f746bb1e666d8741fcfadb \ - --hash=sha256:c6fcb3fa3855d583aa57b94cf146f7781d5d5bc06cb95cb3afece33d31aac39b \ - --hash=sha256:c86679f443e7085ea55a7376462553996c688395d18ef3f0d3dbad7838f857a2 \ - --hash=sha256:c91e3c04f5191fd3fb68764bddeaf02025492d5d9f23343b283870f6ace69708 \ - --hash=sha256:c921ad596ff1a82f9c692b0758c944355abc9f0de97a4c13ca60ffc6d8dc15d4 \ - --hash=sha256:c9ed88b398ba7e3bad7bd64d66cc01dcde9cfcb7ec629a6fd78a82fa0b559d78 \ - --hash=sha256:cd2ac6b919f7fed71b17fe0b4603c092a4c9b5bae414817c9c81d3c22d1e1bcc \ - --hash=sha256:d28ca7066d6cdd347a50d8b725dc10d9a1d6a1cce09836cf071ea6a2d4908be0 \ - --hash=sha256:d29e235ce13c91902ef3efc3d883a677655b3908b1cbc73dee816e5e1f8f7739 \ - --hash=sha256:d8b5ee4ae9170e2775d495b81f414cc20268041c42571530513496ba61e94ba3 \ - --hash=sha256:db72e40628967f6dc572020d04b5f800d71264e0531c6da35097e73bdf38b003 \ - --hash=sha256:df45c4073bed486ea2f18757057953afed8dd77add7276ff01bccb79982cf46c \ - --hash=sha256:dfa5f5c0a4c8fced1422dc2ca7eefd872d5d13eb33cf324361dbf1dbfba0a9fe \ - --hash=sha256:e015833384ca3e1a0565a79f5d953b0629d9138021c27ad37c92a9fa1af7623c \ - --hash=sha256:e15315691fe2253eb447503153acef4d7223dfe7e7702f9ed66539fcd0c43801 \ - --hash=sha256:e65466b31be1070b4a5b7dbfbd14b247884cb8e8b79c64fb0f36b472912dbaea \ - --hash=sha256:e7820bb0d65e3ce1e3e70b6708c2f66143f55912fa02f4b618d0f08b61575f12 \ - --hash=sha256:e851a051f7260e6d688267eb039c81f05f23a19431bd7dfa4bf5e3cb34c108cd \ - --hash=sha256:e9f9feee7f334b72ceae46313333d002b56f325b5f04271b4ae2aadd9e993ae4 \ - --hash=sha256:eb40f828bc2f73f777d1eb8fee2e86cd9692a4518b63b6b5aa8af915dfd3207b \ - --hash=sha256:eb704155e73b833801c247f39d562229c0303f54770ca14fb1c053acb376cf10 \ - --hash=sha256:edb1bfd45227dec8d50bc7c7d86463cd8728bcc574f9b07de7369880de4626a3 \ - --hash=sha256:ee7d9d5537daf6d5c74a83b38a638cc001b648096c1cae8ef695b0c919d9d379 \ - --hash=sha256:f57783fbaf648205ac50ae7d646f27582fc706be3977e87c3c124e7a92407b10 \ - 
--hash=sha256:ff63a92f6e249514ef35bc795de10745be0226eaea06eb48b4bbeaa0c8850a4a - # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt +pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ 
+ --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + --hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + --hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + --hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + 
--hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + --hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + --hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + --hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + 
--hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + --hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + --hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + --hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + 
--hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock # pydantic pygments==2.18.0 \ --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # rich pyopenssl==25.0.0 \ --hash=sha256:424c247065e46e76a37411b9ab1782541c23bb658bf003772c3405fbaa128e90 \ --hash=sha256:cd2cef799efa3936bb08e8ccb9433a575722b9dd986023f1cabc4ae64e9dac16 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt python-dateutil==2.8.2 \ --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # celery # pandas pytz==2022.7.1 \ --hash=sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0 \ --hash=sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # pandas pyyaml==6.0.1 \ --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ @@ -1679,27 +1728,27 @@ pyyaml==6.0.1 \ --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \ 
--hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt referencing==0.36.2 \ --hash=sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa \ --hash=sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jsonschema # jsonschema-specifications requests==2.32.3 \ --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # google-api-core rich==13.3.2 \ --hash=sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001 \ --hash=sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # memray # typer @@ -1808,14 +1857,14 @@ rpds-py==0.22.3 \ --hash=sha256:fda7cb070f442bf80b642cd56483b5548e43d366fe3f39b98e67cce780cded00 \ --hash=sha256:feea821ee2a9273771bae61194004ee2fc33f8ec7db08117ef9147d4bbcbca8e # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jsonschema # referencing rsa==4.7.2 \ --hash=sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2 \ --hash=sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # google-auth scikit-image==0.24.0 \ 
--hash=sha256:18836a18d3a7b6aca5376a2d805f0045826bc6c9fc85331659c33b4813e0b563 \ @@ -1840,7 +1889,7 @@ scikit-image==0.24.0 \ --hash=sha256:ef04360eda372ee5cd60aebe9be91258639c86ae2ea24093fb9182118008d009 \ --hash=sha256:fa27b3a0dbad807b966b8db2d78da734cb812ca4787f7fbb143764800ce2fa9c # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ @@ -1869,64 +1918,64 @@ scipy==1.11.4 \ --hash=sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be \ --hash=sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # scikit-image shellingham==1.5.4 \ --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \ --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # typer six==1.16.0 \ --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # opencensus # python-dateutil smart-open==6.2.0 \ --hash=sha256:088bf00f9327c71e549bc2f86567d3320df5d89667f009ce1c16568976068ef7 \ --hash=sha256:1b4df5c8365218f3852c507451920ccad606c80b0acb4e67508e50ba9b5d2632 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt sniffio==1.3.1 \ --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ 
--hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # anyio starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # fastapi tensorboardx==2.6.2.2 \ --hash=sha256:160025acbf759ede23fd3526ae9d9bfbfd8b68eb16c38a010ebe326dc6395db8 \ --hash=sha256:c6476d7cd0d529b0b72f4acadb1269f9ed8b22f441e87a84f2a3b940bb87b666 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # scikit-image typer==0.12.3 \ --hash=sha256:070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914 \ --hash=sha256:49e73131481d804288ef62598d97a1ceef3058905aa536a1134f90891ba35482 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt typing-extensions==4.12.2 \ --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # fastapi # gymnasium # opentelemetry-api @@ -1937,23 +1986,44 @@ typing-extensions==4.12.2 \ # pyopenssl # referencing # typer + # typing-inspection 
+typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # pydantic +tzdata==2025.2 \ + --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # kombu urllib3==1.26.19 \ --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # requests uvicorn==0.22.0 \ --hash=sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8 \ --hash=sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt +vine==5.1.0 \ + --hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # amqp + # celery + # kombu virtualenv==20.29.1 \ --hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ --hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt watchfiles==0.19.0 \ --hash=sha256:0089c6dc24d436b373c3c57657bf4f9a453b13767150d17284fc6162b2791911 \ @@ -1979,8 +2049,14 @@ watchfiles==0.19.0 \ --hash=sha256:d9b073073e048081e502b6c6b0b88714c026a1a4c890569238d04aca5f9ca74b \ 
--hash=sha256:fac19dc9cbc34052394dbe81e149411a62e71999c0a19e1e09ce537867f95ae0 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt +wcwidth==0.2.13 \ + --hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \ + --hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # prompt-toolkit yarl==1.18.3 \ --hash=sha256:00e5a1fea0fd4f5bfa7440a47eff01d9822a65b4488f7cff83155a0f31a2ecba \ --hash=sha256:02ddb6756f8f4517a2d5e99d8b2f272488e18dd0bfbc802f31c16c6c20f22193 \ @@ -2065,11 +2141,11 @@ yarl==1.18.3 \ --hash=sha256:fe57328fbc1bfd0bd0514470ac692630f3901c0ee39052ae47acd1d90a436719 \ --hash=sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # aiohttp zipp==3.19.2 \ --hash=sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19 \ --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # importlib-metadata diff --git a/python/requirements_compiled_ray_py311_cu128.txt b/python/deplocks/llm/ray_py311_cu128.lock similarity index 85% rename from python/requirements_compiled_ray_py311_cu128.txt rename to python/deplocks/llm/ray_py311_cu128.lock index f1b0a3107207..d65b8adf1934 100644 --- a/python/requirements_compiled_ray_py311_cu128.txt +++ b/python/deplocks/llm/ray_py311_cu128.lock @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile --generate-hashes --strip-extras --unsafe-package ray --unsafe-package grpcio-tools --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cu128 
--index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links -c python/requirements_compiled_ray_test_py311_cu128.txt python/requirements.txt -o python/requirements_compiled_ray_py311_cu128.txt +# uv pip compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --extra-index-url https://download.pytorch.org/whl/cu128 --python-version=3.11 --unsafe-package ray --python-platform=linux -c python/deplocks/llm/ray_test_py311_cu128.lock python/requirements.txt -o python/deplocks/llm/ray_py311_cu128.lock --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cu128 @@ -7,7 +7,7 @@ aiohappyeyeballs==2.6.1 \ --hash=sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558 \ --hash=sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # aiohttp aiohttp==3.11.16 \ --hash=sha256:004511d3413737700835e949433536a2fe95a7d0297edd911a1e9705c5b5ea43 \ @@ -92,59 +92,77 @@ aiohttp==3.11.16 \ --hash=sha256:fb46bb0f24813e6cede6cc07b1961d4b04f331f7112a23b5e21f567da4ee50aa \ --hash=sha256:fd36c119c5d6551bce374fcb5c19269638f8d09862445f85a5a48596fd59f4bb # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # aiohttp-cors aiohttp-cors==0.7.0 \ --hash=sha256:0451ba59fdf6909d0e2cd21e4c0a43752bc0703d33fc78ae94d9d9321710193e \ --hash=sha256:4d39c6d7100fd9764ed1caf8cebf0eb01bf5e3f24e2e073fda6234bc48b19f5d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt aiorwlock==1.3.0 \ --hash=sha256:45baf8e4fa9a23e0bb325fbd67da80de1fd7ae1d4f59a6381754c60cec7b289b \ 
--hash=sha256:83f12d87df4b9728a0b8fda1756585ab0d652b107bab59c6084e1b1ad692ab45 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt aiosignal==1.3.1 \ --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ --hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # aiohttp +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # kombu annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # pydantic anyio==3.7.1 \ --hash=sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780 \ --hash=sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # starlette # watchfiles attrs==25.1.0 \ --hash=sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e \ --hash=sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # aiohttp # jsonschema # referencing +billiard==4.2.1 \ + --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock 
+ # celery cachetools==5.5.2 \ --hash=sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4 \ --hash=sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # google-auth +celery==5.5.3 \ + --hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # -r python/requirements.txt certifi==2025.1.31 \ --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # requests cffi==1.16.0 ; platform_python_implementation != 'PyPy' \ --hash=sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc \ @@ -200,7 +218,7 @@ cffi==1.16.0 ; platform_python_implementation != 'PyPy' \ --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # cryptography charset-normalizer==3.3.2 \ --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ @@ -294,27 +312,49 @@ charset-normalizer==3.3.2 \ --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # requests click==8.1.7 \ --hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ 
--hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt + # celery + # click-didyoumean + # click-plugins + # click-repl # typer # uvicorn +click-didyoumean==0.3.1 \ + --hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # celery +click-plugins==1.1.1.2 \ + --hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # celery cloudpickle==2.2.0 \ --hash=sha256:3f4219469c55453cfe4737e564b67c2a149109dabf7f242478948b895f61106f \ --hash=sha256:7428798d5926d8fcbfd092d18d01a2a03daf8237d8fcdc8095d256b8490796f0 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # gymnasium colorful==0.5.5 \ --hash=sha256:62c187e27c1433db9463ff93b1451898d1e7e23a7e553583fd9daeb6325182e4 \ --hash=sha256:66f8c1264b2a26f7293b96a03bb7a76c4bc8b9634369a0bffdcd12d618056a1d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt cryptography==44.0.3 \ --hash=sha256:02f55fb4f8b79c1221b0961488eaae21015b69b210e18c386b69de182ebb1259 \ @@ -355,7 +395,7 @@ cryptography==44.0.3 \ --hash=sha256:fc3c9babc1e1faefd62704bb46a69f359a9819eb0292e40df3fb6e3574715cd4 \ 
--hash=sha256:fe19d8bc5536a91a24a8133328880a41831b6c5df54599a8417b62fe015d3053 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # pyopenssl cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ --hash=sha256:230f8a8e99c81a653baa0ed00819990c0ed1f0cf0298214786b5e323461dc61a \ @@ -371,13 +411,13 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ --hash=sha256:dac0284fecb90b5731f514e569a6fcf6674a730ae95b9490781a713b60a34423 \ --hash=sha256:e7a25ef1b44ae6276b5105affc2289edb34f1aa6676babd5bcd80907348c4cfa # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt distlib==0.3.7 \ --hash=sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057 \ --hash=sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # virtualenv dm-tree==0.1.8 \ --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ @@ -427,19 +467,19 @@ dm-tree==0.1.8 \ --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt farama-notifications==0.0.4 \ --hash=sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18 \ --hash=sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # gymnasium fastapi==0.115.12 \ --hash=sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681 \ --hash=sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d # via - # -c 
python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:067edb0a0805bf61e17a251d5046af59f6e9d2b8ad01222e0ef7a0b7937d5548 \ @@ -518,13 +558,13 @@ fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:fb5363cf0fddd9b50525ddbf64a1e1b28ec4c6dfb28670a940cb1cf988a6786b \ --hash=sha256:ff75c90663d6e8996610d435e71487daa853871ad1770dd83dc0f2fc4997241e # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # cupy-cuda12x filelock==3.17.0 \ --hash=sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338 \ --hash=sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # virtualenv frozenlist==1.4.1 \ @@ -606,109 +646,105 @@ frozenlist==1.4.1 \ --hash=sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced \ --hash=sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # aiohttp # aiosignal -fsspec==2023.5.0 \ - --hash=sha256:51a4ad01a5bb66fcc58036e288c0d53d3975a0df2a5dc59a93b59bade0391f2a \ - --hash=sha256:b3b56e00fb93ea321bc9e5d9cf6f8522a0198b20eb24e02774d329e9c6fb84ce +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt google-api-core==2.24.2 \ --hash=sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9 \ 
--hash=sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # opencensus google-auth==2.23.4 \ --hash=sha256:79905d6b1652187def79d491d6e23d0cbb3a21d3c7ba0dbaa9c8a01906b13ff3 \ --hash=sha256:d4bbc92fe4b8bfd2f3e8d88e5ba7085935da208ee38a134fc280e7ce682a05f2 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # google-api-core googleapis-common-protos==1.61.0 \ --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # google-api-core -grpcio==1.66.2 \ - --hash=sha256:02697eb4a5cbe5a9639f57323b4c37bcb3ab2d48cec5da3dc2f13334d72790dd \ - --hash=sha256:03b0b307ba26fae695e067b94cbb014e27390f8bc5ac7a3a39b7723fed085604 \ - --hash=sha256:05bc2ceadc2529ab0b227b1310d249d95d9001cd106aa4d31e8871ad3c428d73 \ - --hash=sha256:06de8ec0bd71be123eec15b0e0d457474931c2c407869b6c349bd9bed4adbac3 \ - --hash=sha256:0be4e0490c28da5377283861bed2941d1d20ec017ca397a5df4394d1c31a9b50 \ - --hash=sha256:12fda97ffae55e6526825daf25ad0fa37483685952b5d0f910d6405c87e3adb6 \ - --hash=sha256:1caa38fb22a8578ab8393da99d4b8641e3a80abc8fd52646f1ecc92bcb8dee34 \ - --hash=sha256:2018b053aa15782db2541ca01a7edb56a0bf18c77efed975392583725974b249 \ - --hash=sha256:20657d6b8cfed7db5e11b62ff7dfe2e12064ea78e93f1434d61888834bc86d75 \ - --hash=sha256:2335c58560a9e92ac58ff2bc5649952f9b37d0735608242973c7a8b94a6437d8 \ - --hash=sha256:31fd163105464797a72d901a06472860845ac157389e10f12631025b3e4d0453 \ - --hash=sha256:38b68498ff579a3b1ee8f93a05eb48dc2595795f2f62716e797dc24774c1aaa8 \ - --hash=sha256:3b00efc473b20d8bf83e0e1ae661b98951ca56111feb9b9611df8efc4fe5d55d \ - 
--hash=sha256:3ed71e81782966ffead60268bbda31ea3f725ebf8aa73634d5dda44f2cf3fb9c \ - --hash=sha256:45a3d462826f4868b442a6b8fdbe8b87b45eb4f5b5308168c156b21eca43f61c \ - --hash=sha256:49f0ca7ae850f59f828a723a9064cadbed90f1ece179d375966546499b8a2c9c \ - --hash=sha256:4e504572433f4e72b12394977679161d495c4c9581ba34a88d843eaf0f2fbd39 \ - --hash=sha256:4ea1d062c9230278793820146c95d038dc0f468cbdd172eec3363e42ff1c7d01 \ - --hash=sha256:563588c587b75c34b928bc428548e5b00ea38c46972181a4d8b75ba7e3f24231 \ - --hash=sha256:6001e575b8bbd89eee11960bb640b6da6ae110cf08113a075f1e2051cc596cae \ - --hash=sha256:66a0cd8ba6512b401d7ed46bb03f4ee455839957f28b8d61e7708056a806ba6a \ - --hash=sha256:6851de821249340bdb100df5eacfecfc4e6075fa85c6df7ee0eb213170ec8e5d \ - --hash=sha256:728bdf36a186e7f51da73be7f8d09457a03061be848718d0edf000e709418987 \ - --hash=sha256:73e3b425c1e155730273f73e419de3074aa5c5e936771ee0e4af0814631fb30a \ - --hash=sha256:73fc8f8b9b5c4a03e802b3cd0c18b2b06b410d3c1dcbef989fdeb943bd44aff7 \ - --hash=sha256:78fa51ebc2d9242c0fc5db0feecc57a9943303b46664ad89921f5079e2e4ada7 \ - --hash=sha256:7b2c86457145ce14c38e5bf6bdc19ef88e66c5fee2c3d83285c5aef026ba93b3 \ - --hash=sha256:7d69ce1f324dc2d71e40c9261d3fdbe7d4c9d60f332069ff9b2a4d8a257c7b2b \ - --hash=sha256:802d84fd3d50614170649853d121baaaa305de7b65b3e01759247e768d691ddf \ - --hash=sha256:80fd702ba7e432994df208f27514280b4b5c6843e12a48759c9255679ad38db8 \ - --hash=sha256:8ac475e8da31484efa25abb774674d837b343afb78bb3bcdef10f81a93e3d6bf \ - --hash=sha256:950da58d7d80abd0ea68757769c9db0a95b31163e53e5bb60438d263f4bed7b7 \ - --hash=sha256:99a641995a6bc4287a6315989ee591ff58507aa1cbe4c2e70d88411c4dcc0839 \ - --hash=sha256:9c3a99c519f4638e700e9e3f83952e27e2ea10873eecd7935823dab0c1c9250e \ - --hash=sha256:9c509a4f78114cbc5f0740eb3d7a74985fd2eff022971bc9bc31f8bc93e66a3b \ - --hash=sha256:a18e20d8321c6400185b4263e27982488cb5cdd62da69147087a76a24ef4e7e3 \ - --hash=sha256:a917d26e0fe980b0ac7bfcc1a3c4ad6a9a4612c911d33efb55ed7833c749b0ee \ - 
--hash=sha256:a9539f01cb04950fd4b5ab458e64a15f84c2acc273670072abe49a3f29bbad54 \ - --hash=sha256:ad2efdbe90c73b0434cbe64ed372e12414ad03c06262279b104a029d1889d13e \ - --hash=sha256:b672abf90a964bfde2d0ecbce30f2329a47498ba75ce6f4da35a2f4532b7acbc \ - --hash=sha256:bbd27c24a4cc5e195a7f56cfd9312e366d5d61b86e36d46bbe538457ea6eb8dd \ - --hash=sha256:c400ba5675b67025c8a9f48aa846f12a39cf0c44df5cd060e23fda5b30e9359d \ - --hash=sha256:c408f5ef75cfffa113cacd8b0c0e3611cbfd47701ca3cdc090594109b9fcbaed \ - --hash=sha256:c806852deaedee9ce8280fe98955c9103f62912a5b2d5ee7e3eaa284a6d8d8e7 \ - --hash=sha256:ce89f5876662f146d4c1f695dda29d4433a5d01c8681fbd2539afff535da14d4 \ - --hash=sha256:d25a14af966438cddf498b2e338f88d1c9706f3493b1d73b93f695c99c5f0e2a \ - --hash=sha256:d8d4732cc5052e92cea2f78b233c2e2a52998ac40cd651f40e398893ad0d06ec \ - --hash=sha256:d9a9724a156c8ec6a379869b23ba3323b7ea3600851c91489b871e375f710bc8 \ - --hash=sha256:e636ce23273683b00410f1971d209bf3689238cf5538d960adc3cdfe80dd0dbd \ - --hash=sha256:e88264caad6d8d00e7913996030bac8ad5f26b7411495848cc218bd3a9040b6c \ - --hash=sha256:f145cc21836c332c67baa6fc81099d1d27e266401565bf481948010d6ea32d46 \ - --hash=sha256:fb57870449dfcfac428afbb5a877829fcb0d6db9d9baa1148705739e9083880e \ - --hash=sha256:fb70487c95786e345af5e854ffec8cb8cc781bcc5df7930c4fbb7feaa72e1cdf \ - --hash=sha256:fe96281713168a3270878255983d2cb1a97e034325c8c2c25169a69289d3ecfa \ - --hash=sha256:ff1f7882e56c40b0d33c4922c15dfa30612f05fb785074a012f7cda74d1c3679 - # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + --hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + 
--hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + --hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + 
--hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt -gymnasium==1.0.0 \ - 
--hash=sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403 \ - --hash=sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad +gymnasium==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt h11==0.16.0 \ --hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \ --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # uvicorn idna==3.7 \ --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # anyio # requests # yarl @@ -716,37 +752,43 @@ imageio==2.34.2 \ --hash=sha256:5c0c0ee8faa018a1c42f649b90395dd4d3bb6187c09053a0cd6f1fdd51bbff5e \ --hash=sha256:a0bb27ec9d5bab36a9f4835e51b21d2cb099e1f78451441f94687ff3404b79f8 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # scikit-image importlib-metadata==6.11.0 \ --hash=sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443 \ --hash=sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # opentelemetry-api jinja2==3.1.6 ; sys_platform != 'win32' \ --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \ --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 # via - # -c 
python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # memray jsonschema==4.23.0 \ --hash=sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4 \ --hash=sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt jsonschema-specifications==2024.10.1 \ --hash=sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272 \ --hash=sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jsonschema +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # celery lazy-loader==0.4 \ --hash=sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # scikit-image lz4==4.3.3 \ --hash=sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e \ @@ -786,13 +828,13 @@ lz4==4.3.3 \ --hash=sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0 \ --hash=sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt markdown-it-py==2.2.0 \ --hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 # via - # -c 
python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # rich markupsafe==2.1.3 ; sys_platform != 'win32' \ --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ @@ -821,13 +863,13 @@ markupsafe==2.1.3 ; sys_platform != 'win32' \ --hash=sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc \ --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jinja2 mdurl==0.1.2 \ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # markdown-it-py memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:0a21745fb516b7a6efcd40aa7487c59e9313fcfc782d0193fcfcf00b48426874 \ @@ -866,7 +908,7 @@ memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:e356af93e3b031c83957e9ac1a653f5aaba5df1e357dd17142f5ed19bb3dc660 \ --hash=sha256:f16c5c8730b616613dc8bafe32649ca6bd7252606251eb00148582011758d0b5 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt msgpack==1.0.7 \ --hash=sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862 \ @@ -926,7 +968,7 @@ msgpack==1.0.7 \ --hash=sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002 \ --hash=sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt multidict==6.0.5 \ --hash=sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556 \ @@ -1020,13 +1062,13 @@ multidict==6.0.5 \ 
--hash=sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423 \ --hash=sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # aiohttp # yarl networkx==3.2.1 \ --hash=sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # scikit-image numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ @@ -1066,7 +1108,7 @@ numpy==1.26.4 \ --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # cupy-cuda12x # gymnasium @@ -1080,19 +1122,19 @@ opencensus==0.11.4 \ --hash=sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 \ --hash=sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt opencensus-context==0.1.3 \ --hash=sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039 \ --hash=sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # opencensus opentelemetry-api==1.34.1 \ --hash=sha256:64f0bd06d42824843731d05beea88d4d4b6ae59f9fe347ff7dfa2cc14233bbb3 \ --hash=sha256:b7df4cb0830d5a6c29ad0c0691dbae874d8daefa934b8b1d642de48323d32a8c # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r 
python/requirements.txt # opentelemetry-exporter-prometheus # opentelemetry-sdk @@ -1101,33 +1143,34 @@ opentelemetry-exporter-prometheus==0.55b1 \ --hash=sha256:d13ec0b22bf394113ff1ada5da98133a4b051779b803dae183188e26c4bd9ee0 \ --hash=sha256:f364fbbff9e5de37a112ff104d1185fb1d7e2046c5ab5911e5afebc7ab3ddf0e # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt opentelemetry-proto==1.27.0 \ --hash=sha256:33c9345d91dafd8a74fc3d7576c5a38f18b7fdf8d02983ac67485386132aedd6 \ --hash=sha256:b133873de5581a50063e1e4b29cdcf0c5e253a8c2d8dc1229add20a4c3830ace # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt opentelemetry-sdk==1.34.1 \ --hash=sha256:308effad4059562f1d92163c61c8141df649da24ce361827812c40abb2a1e96e \ --hash=sha256:8091db0d763fcd6098d4781bbc80ff0971f94e260739aa6afe6fd379cdf3aa4d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus opentelemetry-semantic-conventions==0.55b1 \ --hash=sha256:5da81dfdf7d52e3d37f8fe88d5e771e191de924cfff5f550ab0b8f7b2409baed \ --hash=sha256:ef95b1f009159c28d7a7849f5cbc71c4c34c845bb514d66adfdf1b3fff3598b3 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # opentelemetry-sdk packaging==23.0 \ --hash=sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2 \ --hash=sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt + # kombu # lazy-loader # scikit-image # tensorboardx @@ -1160,7 +1203,7 @@ pandas==1.5.3 \ 
--hash=sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae \ --hash=sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt pillow==10.3.0 \ --hash=sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c \ @@ -1233,22 +1276,28 @@ pillow==10.3.0 \ --hash=sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27 \ --hash=sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # imageio # scikit-image platformdirs==3.11.0 \ --hash=sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # virtualenv prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus +prompt-toolkit==3.0.41 \ + --hash=sha256:941367d97fc815548822aa26c2a269fdc4eb21e9ec05fc5d447cf09bad5d75f0 \ + --hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # click-repl propcache==0.3.0 \ --hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ --hash=sha256:03acd9ff19021bd0567582ac88f821b66883e158274183b9e5586f678984f8fe \ @@ -1349,14 +1398,14 @@ propcache==0.3.0 \ 
--hash=sha256:fb91d20fa2d3b13deea98a690534697742029f4fb83673a3501ae6e3746508b5 \ --hash=sha256:fddb8870bdb83456a489ab67c6b3040a8d5a55069aa6f72f9d872235fbc52f54 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # aiohttp # yarl proto-plus==1.22.3 \ --hash=sha256:a49cd903bc0b6ab41f76bf65510439d56ca76f868adf0274e738bfdd096894df \ --hash=sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # google-api-core protobuf==4.25.8 \ --hash=sha256:077ff8badf2acf8bc474406706ad890466274191a48d0abd3bd6987107c9cde5 \ @@ -1371,7 +1420,7 @@ protobuf==4.25.8 \ --hash=sha256:d552c53d0415449c8d17ced5c341caba0d89dbf433698e1436c8fa0aae7808a3 \ --hash=sha256:f4510b93a3bec6eba8fd8f1093e9d7fb0d4a24d1a81377c10c0e5bbfe9e4ed24 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # google-api-core # googleapis-common-protos @@ -1388,7 +1437,7 @@ py-spy==0.4.0 ; python_full_version < '3.12' \ --hash=sha256:eee3d0bde85ca5cf4f01f012d461180ca76c24835a96f7b5c4ded64eb6a008ab \ --hash=sha256:f2cf3f7130e7d780471faa5957441d3b4e0ec39a79b2c00f4c33d494f7728428 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt pyarrow==19.0.1 \ --hash=sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466 \ @@ -1434,161 +1483,161 @@ pyarrow==19.0.1 \ --hash=sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec \ --hash=sha256:fd44d66093a239358d07c42a91eebf5015aa54fccba959db899f932218ac9cc8 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt pyasn1==0.5.1 \ 
--hash=sha256:4439847c58d40b1d0a573d07e3856e95333f1976294494c325775aeca506eb58 \ --hash=sha256:6d391a96e59b23130a5cfa74d6fd7f388dbbe26cc8f1edf39fdddf08d9d6676c # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # pyasn1-modules # rsa pyasn1-modules==0.3.0 \ --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # google-auth pycparser==2.21 ; platform_python_implementation != 'PyPy' \ --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # cffi -pydantic==2.10.0 \ - --hash=sha256:0aca0f045ff6e2f097f1fe89521115335f15049eeb8a7bef3dafe4b19a74e289 \ - --hash=sha256:5e7807ba9201bdf61b1b58aa6eb690916c40a47acfb114b1b4fef3e7fd5b30fc +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # fastapi -pydantic-core==2.27.0 \ - --hash=sha256:0aa4d1b2eba9a325897308b3124014a142cdccb9f3e016f31d3ebee6b5ea5e75 \ - --hash=sha256:0d06b667e53320332be2bf6f9461f4a9b78092a079b8ce8634c9afaa7e10cd9f \ - --hash=sha256:153017e3d6cd3ce979de06d84343ca424bb6092727375eba1968c8b4693c6ecb \ - --hash=sha256:15e350efb67b855cd014c218716feea4986a149ed1f42a539edd271ee074a196 \ - --hash=sha256:185ef205256cd8b38431205698531026979db89a79587725c1e55c59101d64e9 \ - 
--hash=sha256:1da0c98a85a6c6ed702d5556db3b09c91f9b0b78de37b7593e2de8d03238807a \ - --hash=sha256:225bfff5d425c34e1fd562cef52d673579d59b967d9de06178850c4802af9039 \ - --hash=sha256:24f984fc7762ed5f806d9e8c4c77ea69fdb2afd987b4fd319ef06c87595a8c55 \ - --hash=sha256:25a7fd4de38f7ff99a37e18fa0098c3140286451bc823d1746ba80cec5b433a1 \ - --hash=sha256:2883b260f7a93235488699d39cbbd94fa7b175d3a8063fbfddd3e81ad9988cb2 \ - --hash=sha256:2a51ce96224eadd1845150b204389623c8e129fde5a67a84b972bd83a85c6c40 \ - --hash=sha256:2be0ad541bb9f059954ccf8877a49ed73877f862529575ff3d54bf4223e4dd61 \ - --hash=sha256:31a2cae5f059329f9cfe3d8d266d3da1543b60b60130d186d9b6a3c20a346361 \ - --hash=sha256:333c840a1303d1474f491e7be0b718226c730a39ead0f7dab2c7e6a2f3855555 \ - --hash=sha256:33d14369739c5d07e2e7102cdb0081a1fa46ed03215e07f097b34e020b83b1ae \ - --hash=sha256:35380671c3c921fe8adf31ad349dc6f7588b7e928dbe44e1093789734f607399 \ - --hash=sha256:359e7951f04ad35111b5ddce184db3391442345d0ab073aa63a95eb8af25a5ef \ - --hash=sha256:36aa167f69d8807ba7e341d67ea93e50fcaaf6bc433bb04939430fa3dab06f31 \ - --hash=sha256:395e3e1148fa7809016231f8065f30bb0dc285a97b4dc4360cd86e17bab58af7 \ - --hash=sha256:3e8d89c276234579cd3d095d5fa2a44eb10db9a218664a17b56363cddf226ff3 \ - --hash=sha256:3eb8849445c26b41c5a474061032c53e14fe92a11a5db969f722a2716cd12206 \ - --hash=sha256:3fd8bc2690e7c39eecdf9071b6a889ce7b22b72073863940edc2a0a23750ca90 \ - --hash=sha256:400bf470e4327e920883b51e255617dfe4496d4e80c3fea0b5a5d0bf2c404dd4 \ - --hash=sha256:4148dc9184ab79e356dc00a4199dc0ee8647973332cb385fc29a7cced49b9f9c \ - --hash=sha256:433689845288f9a1ee5714444e65957be26d30915f7745091ede4a83cfb2d7bb \ - --hash=sha256:43b61989068de9ce62296cde02beffabcadb65672207fc51e7af76dca75e6636 \ - --hash=sha256:4523c4009c3f39d948e01962223c9f5538602e7087a628479b723c939fab262d \ - --hash=sha256:483c2213a609e7db2c592bbc015da58b6c75af7360ca3c981f178110d9787bcf \ - --hash=sha256:49633583eb7dc5cba61aaf7cdb2e9e662323ad394e543ee77af265736bcd3eaa \ - 
--hash=sha256:4b51f964fcbb02949fc546022e56cdb16cda457af485e9a3e8b78ac2ecf5d77e \ - --hash=sha256:4bf1340ae507f6da6360b24179c2083857c8ca7644aab65807023cf35404ea8d \ - --hash=sha256:4fb49cfdb53af5041aba909be00cccfb2c0d0a2e09281bf542371c5fd36ad04c \ - --hash=sha256:510b11e9c3b1a852876d1ccd8d5903684336d635214148637ceb27366c75a467 \ - --hash=sha256:513cb14c0cc31a4dfd849a4674b20c46d87b364f997bbcb02282306f5e187abf \ - --hash=sha256:58560828ee0951bb125c6f2862fbc37f039996d19ceb6d8ff1905abf7da0bf3d \ - --hash=sha256:58ab0d979c969983cdb97374698d847a4acffb217d543e172838864636ef10d9 \ - --hash=sha256:5982048129f40b082c2654de10c0f37c67a14f5ff9d37cf35be028ae982f26df \ - --hash=sha256:5ab325fc86fbc077284c8d7f996d904d30e97904a87d6fb303dce6b3de7ebba9 \ - --hash=sha256:5cc822ab90a70ea3a91e6aed3afac570b276b1278c6909b1d384f745bd09c714 \ - --hash=sha256:5f2b19b8d6fca432cb3acf48cf5243a7bf512988029b6e6fd27e9e8c0a204d85 \ - --hash=sha256:5fc72fbfebbf42c0856a824b8b0dc2b5cd2e4a896050281a21cfa6fed8879cb1 \ - --hash=sha256:6354e18a9be37bfa124d6b288a87fb30c673745806c92956f1a25e3ae6e76b96 \ - --hash=sha256:678f66462058dd978702db17eb6a3633d634f7aa0deaea61e0a674152766d3fc \ - --hash=sha256:68950bc08f9735306322bfc16a18391fcaac99ded2509e1cc41d03ccb6013cfe \ - --hash=sha256:68ef5377eb582fa4343c9d0b57a5b094046d447b4c73dd9fbd9ffb216f829e7d \ - --hash=sha256:6b4c19525c3538fbc0bbda6229f9682fb8199ce9ac37395880e6952798e00373 \ - --hash=sha256:6bb69bf3b6500f195c3deb69c1205ba8fc3cb21d1915f1f158a10d6b1ef29b6a \ - --hash=sha256:6e19401742ed7b69e51d8e4df3c03ad5ec65a83b36244479fd70edde2828a5d9 \ - --hash=sha256:6f4a53af9e81d757756508b57cae1cf28293f0f31b9fa2bfcb416cc7fb230f9d \ - --hash=sha256:6fda87808429c520a002a85d6e7cdadbf58231d60e96260976c5b8f9a12a8e13 \ - --hash=sha256:78f841523729e43e3928a364ec46e2e3f80e6625a4f62aca5c345f3f626c6e8a \ - --hash=sha256:7a6ebfac28fd51890a61df36ef202adbd77d00ee5aca4a3dadb3d9ed49cfb929 \ - --hash=sha256:7b0202ebf2268954090209a84f9897345719e46a57c5f2c9b7b250ca0a9d3e63 \ - 
--hash=sha256:8117839a9bdbba86e7f9df57018fe3b96cec934c3940b591b0fd3fbfb485864a \ - --hash=sha256:82e1ad4ca170e8af4c928b67cff731b6296e6a0a0981b97b2eb7c275cc4e15bd \ - --hash=sha256:836a4bfe0cc6d36dc9a9cc1a7b391265bf6ce9d1eb1eac62ac5139f5d8d9a6fa \ - --hash=sha256:84af1cf7bfdcbc6fcf5a5f70cc9896205e0350306e4dd73d54b6a18894f79386 \ - --hash=sha256:84e35afd9e10b2698e6f2f32256678cb23ca6c1568d02628033a837638b3ed12 \ - --hash=sha256:884f1806609c2c66564082540cffc96868c5571c7c3cf3a783f63f2fb49bd3cd \ - --hash=sha256:8a150392102c402c538190730fda06f3bce654fc498865579a9f2c1d2b425833 \ - --hash=sha256:8e21d927469d04b39386255bf00d0feedead16f6253dcc85e9e10ddebc334084 \ - --hash=sha256:8e96ca781e0c01e32115912ebdf7b3fb0780ce748b80d7d28a0802fa9fbaf44e \ - --hash=sha256:8ee4c2a75af9fe21269a4a0898c5425afb01af1f5d276063f57e2ae1bc64e191 \ - --hash=sha256:91bc66f878557313c2a6bcf396e7befcffe5ab4354cfe4427318968af31143c3 \ - --hash=sha256:951e71da6c89d354572098bada5ba5b5dc3a9390c933af8a614e37755d3d1840 \ - --hash=sha256:99b2863c1365f43f74199c980a3d40f18a218fbe683dd64e470199db426c4d6a \ - --hash=sha256:9a8fbf506fde1529a1e3698198fe64bfbe2e0c09557bc6a7dcf872e7c01fec40 \ - --hash=sha256:9ce048deb1e033e7a865ca384770bccc11d44179cf09e5193a535c4c2f497bdc \ - --hash=sha256:9fe94d9d2a2b4edd7a4b22adcd45814b1b59b03feb00e56deb2e89747aec7bfe \ - --hash=sha256:a291d0b4243a259c8ea7e2b84eb9ccb76370e569298875a7c5e3e71baf49057a \ - --hash=sha256:a5c022bb0d453192426221605efc865373dde43b17822a264671c53b068ac20c \ - --hash=sha256:abb4785894936d7682635726613c44578c420a096729f1978cd061a7e72d5275 \ - --hash=sha256:b872c86d8d71827235c7077461c502feb2db3f87d9d6d5a9daa64287d75e4fa0 \ - --hash=sha256:bf37b72834e7239cf84d4a0b2c050e7f9e48bced97bad9bdf98d26b8eb72e846 \ - --hash=sha256:c0c431e4be5c1a0c6654e0c31c661cd89e0ca956ef65305c3c3fd96f4e72ca39 \ - --hash=sha256:c5726eec789ee38f2c53b10b1821457b82274f81f4f746bb1e666d8741fcfadb \ - --hash=sha256:c6fcb3fa3855d583aa57b94cf146f7781d5d5bc06cb95cb3afece33d31aac39b \ - 
--hash=sha256:c86679f443e7085ea55a7376462553996c688395d18ef3f0d3dbad7838f857a2 \ - --hash=sha256:c91e3c04f5191fd3fb68764bddeaf02025492d5d9f23343b283870f6ace69708 \ - --hash=sha256:c921ad596ff1a82f9c692b0758c944355abc9f0de97a4c13ca60ffc6d8dc15d4 \ - --hash=sha256:c9ed88b398ba7e3bad7bd64d66cc01dcde9cfcb7ec629a6fd78a82fa0b559d78 \ - --hash=sha256:cd2ac6b919f7fed71b17fe0b4603c092a4c9b5bae414817c9c81d3c22d1e1bcc \ - --hash=sha256:d28ca7066d6cdd347a50d8b725dc10d9a1d6a1cce09836cf071ea6a2d4908be0 \ - --hash=sha256:d29e235ce13c91902ef3efc3d883a677655b3908b1cbc73dee816e5e1f8f7739 \ - --hash=sha256:d8b5ee4ae9170e2775d495b81f414cc20268041c42571530513496ba61e94ba3 \ - --hash=sha256:db72e40628967f6dc572020d04b5f800d71264e0531c6da35097e73bdf38b003 \ - --hash=sha256:df45c4073bed486ea2f18757057953afed8dd77add7276ff01bccb79982cf46c \ - --hash=sha256:dfa5f5c0a4c8fced1422dc2ca7eefd872d5d13eb33cf324361dbf1dbfba0a9fe \ - --hash=sha256:e015833384ca3e1a0565a79f5d953b0629d9138021c27ad37c92a9fa1af7623c \ - --hash=sha256:e15315691fe2253eb447503153acef4d7223dfe7e7702f9ed66539fcd0c43801 \ - --hash=sha256:e65466b31be1070b4a5b7dbfbd14b247884cb8e8b79c64fb0f36b472912dbaea \ - --hash=sha256:e7820bb0d65e3ce1e3e70b6708c2f66143f55912fa02f4b618d0f08b61575f12 \ - --hash=sha256:e851a051f7260e6d688267eb039c81f05f23a19431bd7dfa4bf5e3cb34c108cd \ - --hash=sha256:e9f9feee7f334b72ceae46313333d002b56f325b5f04271b4ae2aadd9e993ae4 \ - --hash=sha256:eb40f828bc2f73f777d1eb8fee2e86cd9692a4518b63b6b5aa8af915dfd3207b \ - --hash=sha256:eb704155e73b833801c247f39d562229c0303f54770ca14fb1c053acb376cf10 \ - --hash=sha256:edb1bfd45227dec8d50bc7c7d86463cd8728bcc574f9b07de7369880de4626a3 \ - --hash=sha256:ee7d9d5537daf6d5c74a83b38a638cc001b648096c1cae8ef695b0c919d9d379 \ - --hash=sha256:f57783fbaf648205ac50ae7d646f27582fc706be3977e87c3c124e7a92407b10 \ - --hash=sha256:ff63a92f6e249514ef35bc795de10745be0226eaea06eb48b4bbeaa0c8850a4a - # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt +pydantic-core==2.33.2 \ 
+ --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + 
--hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + --hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + --hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + --hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + 
--hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + --hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + --hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + --hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + 
--hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + --hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + --hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + --hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + 
--hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock # pydantic pygments==2.18.0 \ --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # rich pyopenssl==25.0.0 \ --hash=sha256:424c247065e46e76a37411b9ab1782541c23bb658bf003772c3405fbaa128e90 \ --hash=sha256:cd2cef799efa3936bb08e8ccb9433a575722b9dd986023f1cabc4ae64e9dac16 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt python-dateutil==2.8.2 \ --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # celery # pandas pytz==2022.7.1 \ --hash=sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0 \ --hash=sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # pandas pyyaml==6.0.1 \ --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ @@ -1643,27 +1692,27 @@ pyyaml==6.0.1 \ --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \ --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock 
# -r python/requirements.txt referencing==0.36.2 \ --hash=sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa \ --hash=sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jsonschema # jsonschema-specifications requests==2.32.3 \ --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # google-api-core rich==13.3.2 \ --hash=sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001 \ --hash=sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # memray # typer @@ -1772,14 +1821,14 @@ rpds-py==0.22.3 \ --hash=sha256:fda7cb070f442bf80b642cd56483b5548e43d366fe3f39b98e67cce780cded00 \ --hash=sha256:feea821ee2a9273771bae61194004ee2fc33f8ec7db08117ef9147d4bbcbca8e # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jsonschema # referencing rsa==4.7.2 \ --hash=sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2 \ --hash=sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # google-auth scikit-image==0.24.0 \ --hash=sha256:18836a18d3a7b6aca5376a2d805f0045826bc6c9fc85331659c33b4813e0b563 \ @@ -1804,7 +1853,7 @@ scikit-image==0.24.0 \ --hash=sha256:ef04360eda372ee5cd60aebe9be91258639c86ae2ea24093fb9182118008d009 \ 
--hash=sha256:fa27b3a0dbad807b966b8db2d78da734cb812ca4787f7fbb143764800ce2fa9c # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ @@ -1833,63 +1882,63 @@ scipy==1.11.4 \ --hash=sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be \ --hash=sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # scikit-image shellingham==1.5.4 \ --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \ --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # typer six==1.16.0 \ --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # opencensus # python-dateutil smart-open==6.2.0 \ --hash=sha256:088bf00f9327c71e549bc2f86567d3320df5d89667f009ce1c16568976068ef7 \ --hash=sha256:1b4df5c8365218f3852c507451920ccad606c80b0acb4e67508e50ba9b5d2632 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt sniffio==1.3.1 \ --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # anyio starlette==0.46.2 \ 
--hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # fastapi tensorboardx==2.6.2.2 \ --hash=sha256:160025acbf759ede23fd3526ae9d9bfbfd8b68eb16c38a010ebe326dc6395db8 \ --hash=sha256:c6476d7cd0d529b0b72f4acadb1269f9ed8b22f441e87a84f2a3b940bb87b666 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # scikit-image typer==0.12.3 \ --hash=sha256:070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914 \ --hash=sha256:49e73131481d804288ef62598d97a1ceef3058905aa536a1134f90891ba35482 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt typing-extensions==4.12.2 \ --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # fastapi # gymnasium # opentelemetry-api @@ -1900,23 +1949,44 @@ typing-extensions==4.12.2 \ # pyopenssl # referencing # typer + # typing-inspection +typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # pydantic +tzdata==2025.2 \ + 
--hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # kombu urllib3==1.26.19 \ --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # requests uvicorn==0.22.0 \ --hash=sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8 \ --hash=sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt +vine==5.1.0 \ + --hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # amqp + # celery + # kombu virtualenv==20.29.1 \ --hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ --hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt watchfiles==0.19.0 \ --hash=sha256:0089c6dc24d436b373c3c57657bf4f9a453b13767150d17284fc6162b2791911 \ @@ -1942,8 +2012,14 @@ watchfiles==0.19.0 \ --hash=sha256:d9b073073e048081e502b6c6b0b88714c026a1a4c890569238d04aca5f9ca74b \ --hash=sha256:fac19dc9cbc34052394dbe81e149411a62e71999c0a19e1e09ce537867f95ae0 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt +wcwidth==0.2.13 \ + 
--hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \ + --hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # prompt-toolkit yarl==1.18.3 \ --hash=sha256:00e5a1fea0fd4f5bfa7440a47eff01d9822a65b4488f7cff83155a0f31a2ecba \ --hash=sha256:02ddb6756f8f4517a2d5e99d8b2f272488e18dd0bfbc802f31c16c6c20f22193 \ @@ -2028,11 +2104,11 @@ yarl==1.18.3 \ --hash=sha256:fe57328fbc1bfd0bd0514470ac692630f3901c0ee39052ae47acd1d90a436719 \ --hash=sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # aiohttp zipp==3.19.2 \ --hash=sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19 \ --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # importlib-metadata diff --git a/python/requirements_compiled_ray_test_py311_cpu.txt b/python/deplocks/llm/ray_test_py311_cpu.lock similarity index 90% rename from python/requirements_compiled_ray_test_py311_cpu.txt rename to python/deplocks/llm/ray_test_py311_cpu.lock index d8bae7252f61..2681827678b0 100644 --- a/python/requirements_compiled_ray_test_py311_cpu.txt +++ b/python/deplocks/llm/ray_test_py311_cpu.lock @@ -1,8 +1,14 @@ # This file was autogenerated by uv via the following command: -# uv pip compile --generate-hashes --strip-extras --unsafe-package ray --unsafe-package grpcio-tools --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links -c /tmp/ray-deps/requirements_compiled.txt python/requirements.txt python/requirements/cloud-requirements.txt 
python/requirements/base-test-requirements.txt -o python/requirements_compiled_ray_test_py311_cpu.txt +# uv pip compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --extra-index-url https://download.pytorch.org/whl/cpu --python-version=3.11 --unsafe-package ray --python-platform=linux -c /tmp/ray-deps/requirements_compiled.txt python/requirements.txt python/requirements/cloud-requirements.txt python/requirements/base-test-requirements.txt -o python/deplocks/llm/ray_test_py311_cpu.lock --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cpu +adlfs==2023.8.0 \ + --hash=sha256:07e804f6df4593acfcaf01025b162e30ac13e523d3570279c98b2d91a18026d9 \ + --hash=sha256:3eb248a3c2a30b419f1147bd7676d156b5219f96ef7f11d47166afd2a3bdb07e + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # -r python/requirements/cloud-requirements.txt aiofiles==22.1.0 \ --hash=sha256:1142fa8e80dbae46bb6339573ad4c8c0841358f79c6eb50a493dceca14621bad \ --hash=sha256:9107f1ca0b2a5553987a94a3c9959fe5b491fdf731389aa5b7b1bd0733e32de6 @@ -101,6 +107,7 @@ aiohttp==3.11.16 \ # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt + # adlfs # aiohttp-cors # pytest-aiohttp aiohttp-cors==0.7.0 \ @@ -127,6 +134,12 @@ aiosqlite==0.19.0 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # ypy-websocket +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # kombu annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d @@ -205,20 +218,29 @@ azure-core==1.29.5 \ 
--hash=sha256:52983c89d394c6f881a121e5101c5fa67278ca3b1f339c8fb2ef39230c70e9ac # via # -c /tmp/ray-deps/requirements_compiled.txt + # adlfs # azure-identity # azure-storage-blob # smart-open +azure-datalake-store==0.0.53 \ + --hash=sha256:05b6de62ee3f2a0a6e6941e6933b792b800c3e7f6ffce2fc324bc19875757393 \ + --hash=sha256:a30c902a6e360aa47d7f69f086b426729784e71c536f330b691647a51dc42b2b + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # adlfs azure-identity==1.17.1 \ --hash=sha256:32ecc67cc73f4bd0595e4f64b1ca65cd05186f4fe6f98ed2ae9f1aa32646efea \ --hash=sha256:db8d59c183b680e763722bfe8ebc45930e6c57df510620985939f7f3191e0382 # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt + # adlfs azure-storage-blob==12.22.0 \ --hash=sha256:b3804bb4fe8ab1c32771fa464053da772a682c2737b19da438a3f4e5e3b3736e \ --hash=sha256:bb7d2d824ce3f11f14a27ee7d9281289f7e072ac8311c52e3652672455b7d5e8 # via # -c /tmp/ray-deps/requirements_compiled.txt + # adlfs # smart-open babel==2.13.1 \ --hash=sha256:33e0952d7dd6374af8dbf6768cc4ddf3ccfefc244f9986d4074704f2fbd18900 \ @@ -238,22 +260,28 @@ beautifulsoup4==4.11.1 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # nbconvert +billiard==4.2.1 \ + --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # celery bleach==6.1.0 \ --hash=sha256:0a31f1837963c41d46bbf1331b8778e1308ea0791db03cc4e7357b97cf42a8fe \ --hash=sha256:3225f354cfc436b9789c66c4ee030194bee0568fbf9cbdad3bc8b5c26c5f12b6 # via # -c /tmp/ray-deps/requirements_compiled.txt # nbconvert -boto3==1.26.76 \ - --hash=sha256:30c7d967ed1c6b5a05643e42cae9d4d36c3f1cb6782637ddc7007a104cfd9027 \ - --hash=sha256:b4c2969b7677762914394b8273cc1905dfe5b71f250741c1a575487ae357e729 +boto3==1.29.7 \ + --hash=sha256:1eb4c548118b5fc5e018dee956fd33e6fb249cd1f2def85f1bba816aef4d9f3e \ + 
--hash=sha256:96e9890ebe7cd823b5f4976dd676e112c000c6528c28e20a2f274590589dd18b # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt # smart-open -botocore==1.29.76 \ - --hash=sha256:70735b00cd529f152992231ca6757e458e5ec25db43767b3526e9a35b2f143b7 \ - --hash=sha256:c2f67b6b3f8acf2968eafca06526f07b9fb0d27bac4c68a635d51abb675134a7 +botocore==1.32.7 \ + --hash=sha256:58b33d02cafa23461c8a9d211b30e8cded992380a84de409379fd02811fa3e11 \ + --hash=sha256:c6795c731b04c8e3635588c44cfd1a4462fc5987859195522c96812cf3eceff9 # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt @@ -265,6 +293,12 @@ cachetools==5.5.2 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # google-auth +celery==5.5.3 \ + --hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # -r python/requirements.txt certifi==2025.1.31 \ --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe @@ -328,6 +362,7 @@ cffi==1.16.0 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # argon2-cffi-bindings + # azure-datalake-store # cryptography charset-normalizer==3.3.2 \ --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ @@ -430,8 +465,30 @@ click==8.1.7 \ # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt + # celery + # click-didyoumean + # click-plugins + # click-repl # typer # uvicorn +click-didyoumean==0.3.1 \ + --hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # celery +click-plugins==1.1.1.2 \ + 
--hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # celery cloudpickle==2.2.0 \ --hash=sha256:3f4219469c55453cfe4737e564b67c2a149109dabf7f242478948b895f61106f \ --hash=sha256:7428798d5926d8fcbfd092d18d01a2a03daf8237d8fcdc8095d256b8490796f0 @@ -814,12 +871,13 @@ frozenlist==1.4.1 \ # -c /tmp/ray-deps/requirements_compiled.txt # aiohttp # aiosignal -fsspec==2023.5.0 \ - --hash=sha256:51a4ad01a5bb66fcc58036e288c0d53d3975a0df2a5dc59a93b59bade0391f2a \ - --hash=sha256:b3b56e00fb93ea321bc9e5d9cf6f8522a0198b20eb24e02774d329e9c6fb84ce +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements.txt + # adlfs gitdb==4.0.11 \ --hash=sha256:81a3407ddd2ee8df444cbacea00e2d038e40150acfa3001696fe0dcf1d3adfa4 \ --hash=sha256:bf5421126136d6d0af55bc1e7c1af1c397a34f5b7bd79e776cd3e89785c2b04b @@ -947,70 +1005,118 @@ googleapis-common-protos==1.61.0 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # google-api-core -grpcio==1.66.2 \ - --hash=sha256:02697eb4a5cbe5a9639f57323b4c37bcb3ab2d48cec5da3dc2f13334d72790dd \ - --hash=sha256:03b0b307ba26fae695e067b94cbb014e27390f8bc5ac7a3a39b7723fed085604 \ - --hash=sha256:05bc2ceadc2529ab0b227b1310d249d95d9001cd106aa4d31e8871ad3c428d73 \ - --hash=sha256:06de8ec0bd71be123eec15b0e0d457474931c2c407869b6c349bd9bed4adbac3 \ - --hash=sha256:0be4e0490c28da5377283861bed2941d1d20ec017ca397a5df4394d1c31a9b50 \ - 
--hash=sha256:12fda97ffae55e6526825daf25ad0fa37483685952b5d0f910d6405c87e3adb6 \ - --hash=sha256:1caa38fb22a8578ab8393da99d4b8641e3a80abc8fd52646f1ecc92bcb8dee34 \ - --hash=sha256:2018b053aa15782db2541ca01a7edb56a0bf18c77efed975392583725974b249 \ - --hash=sha256:20657d6b8cfed7db5e11b62ff7dfe2e12064ea78e93f1434d61888834bc86d75 \ - --hash=sha256:2335c58560a9e92ac58ff2bc5649952f9b37d0735608242973c7a8b94a6437d8 \ - --hash=sha256:31fd163105464797a72d901a06472860845ac157389e10f12631025b3e4d0453 \ - --hash=sha256:38b68498ff579a3b1ee8f93a05eb48dc2595795f2f62716e797dc24774c1aaa8 \ - --hash=sha256:3b00efc473b20d8bf83e0e1ae661b98951ca56111feb9b9611df8efc4fe5d55d \ - --hash=sha256:3ed71e81782966ffead60268bbda31ea3f725ebf8aa73634d5dda44f2cf3fb9c \ - --hash=sha256:45a3d462826f4868b442a6b8fdbe8b87b45eb4f5b5308168c156b21eca43f61c \ - --hash=sha256:49f0ca7ae850f59f828a723a9064cadbed90f1ece179d375966546499b8a2c9c \ - --hash=sha256:4e504572433f4e72b12394977679161d495c4c9581ba34a88d843eaf0f2fbd39 \ - --hash=sha256:4ea1d062c9230278793820146c95d038dc0f468cbdd172eec3363e42ff1c7d01 \ - --hash=sha256:563588c587b75c34b928bc428548e5b00ea38c46972181a4d8b75ba7e3f24231 \ - --hash=sha256:6001e575b8bbd89eee11960bb640b6da6ae110cf08113a075f1e2051cc596cae \ - --hash=sha256:66a0cd8ba6512b401d7ed46bb03f4ee455839957f28b8d61e7708056a806ba6a \ - --hash=sha256:6851de821249340bdb100df5eacfecfc4e6075fa85c6df7ee0eb213170ec8e5d \ - --hash=sha256:728bdf36a186e7f51da73be7f8d09457a03061be848718d0edf000e709418987 \ - --hash=sha256:73e3b425c1e155730273f73e419de3074aa5c5e936771ee0e4af0814631fb30a \ - --hash=sha256:73fc8f8b9b5c4a03e802b3cd0c18b2b06b410d3c1dcbef989fdeb943bd44aff7 \ - --hash=sha256:78fa51ebc2d9242c0fc5db0feecc57a9943303b46664ad89921f5079e2e4ada7 \ - --hash=sha256:7b2c86457145ce14c38e5bf6bdc19ef88e66c5fee2c3d83285c5aef026ba93b3 \ - --hash=sha256:7d69ce1f324dc2d71e40c9261d3fdbe7d4c9d60f332069ff9b2a4d8a257c7b2b \ - --hash=sha256:802d84fd3d50614170649853d121baaaa305de7b65b3e01759247e768d691ddf \ - 
--hash=sha256:80fd702ba7e432994df208f27514280b4b5c6843e12a48759c9255679ad38db8 \ - --hash=sha256:8ac475e8da31484efa25abb774674d837b343afb78bb3bcdef10f81a93e3d6bf \ - --hash=sha256:950da58d7d80abd0ea68757769c9db0a95b31163e53e5bb60438d263f4bed7b7 \ - --hash=sha256:99a641995a6bc4287a6315989ee591ff58507aa1cbe4c2e70d88411c4dcc0839 \ - --hash=sha256:9c3a99c519f4638e700e9e3f83952e27e2ea10873eecd7935823dab0c1c9250e \ - --hash=sha256:9c509a4f78114cbc5f0740eb3d7a74985fd2eff022971bc9bc31f8bc93e66a3b \ - --hash=sha256:a18e20d8321c6400185b4263e27982488cb5cdd62da69147087a76a24ef4e7e3 \ - --hash=sha256:a917d26e0fe980b0ac7bfcc1a3c4ad6a9a4612c911d33efb55ed7833c749b0ee \ - --hash=sha256:a9539f01cb04950fd4b5ab458e64a15f84c2acc273670072abe49a3f29bbad54 \ - --hash=sha256:ad2efdbe90c73b0434cbe64ed372e12414ad03c06262279b104a029d1889d13e \ - --hash=sha256:b672abf90a964bfde2d0ecbce30f2329a47498ba75ce6f4da35a2f4532b7acbc \ - --hash=sha256:bbd27c24a4cc5e195a7f56cfd9312e366d5d61b86e36d46bbe538457ea6eb8dd \ - --hash=sha256:c400ba5675b67025c8a9f48aa846f12a39cf0c44df5cd060e23fda5b30e9359d \ - --hash=sha256:c408f5ef75cfffa113cacd8b0c0e3611cbfd47701ca3cdc090594109b9fcbaed \ - --hash=sha256:c806852deaedee9ce8280fe98955c9103f62912a5b2d5ee7e3eaa284a6d8d8e7 \ - --hash=sha256:ce89f5876662f146d4c1f695dda29d4433a5d01c8681fbd2539afff535da14d4 \ - --hash=sha256:d25a14af966438cddf498b2e338f88d1c9706f3493b1d73b93f695c99c5f0e2a \ - --hash=sha256:d8d4732cc5052e92cea2f78b233c2e2a52998ac40cd651f40e398893ad0d06ec \ - --hash=sha256:d9a9724a156c8ec6a379869b23ba3323b7ea3600851c91489b871e375f710bc8 \ - --hash=sha256:e636ce23273683b00410f1971d209bf3689238cf5538d960adc3cdfe80dd0dbd \ - --hash=sha256:e88264caad6d8d00e7913996030bac8ad5f26b7411495848cc218bd3a9040b6c \ - --hash=sha256:f145cc21836c332c67baa6fc81099d1d27e266401565bf481948010d6ea32d46 \ - --hash=sha256:fb57870449dfcfac428afbb5a877829fcb0d6db9d9baa1148705739e9083880e \ - --hash=sha256:fb70487c95786e345af5e854ffec8cb8cc781bcc5df7930c4fbb7feaa72e1cdf \ - 
--hash=sha256:fe96281713168a3270878255983d2cb1a97e034325c8c2c25169a69289d3ecfa \ - --hash=sha256:ff1f7882e56c40b0d33c4922c15dfa30612f05fb785074a012f7cda74d1c3679 +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + --hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + 
--hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + --hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + 
--hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # grpcio-tools -gymnasium==1.0.0 \ - --hash=sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403 \ - --hash=sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad +grpcio-tools==1.62.3 \ + --hash=sha256:0a52cc9444df978438b8d2332c0ca99000521895229934a59f94f37ed896b133 \ + --hash=sha256:0a8c0c4724ae9c2181b7dbc9b186df46e4f62cb18dc184e46d06c0ebeccf569e \ + --hash=sha256:0cb3a3436ac119cbd37a7d3331d9bdf85dad21a6ac233a3411dff716dcbf401e \ + --hash=sha256:11c625eebefd1fd40a228fc8bae385e448c7e32a6ae134e43cf13bbc23f902b7 \ + --hash=sha256:11f363570dea661dde99e04a51bd108a5807b5df32a6f8bdf4860e34e94a4dbf \ + --hash=sha256:141d028bf5762d4a97f981c501da873589df3f7e02f4c1260e1921e565b376fa \ + --hash=sha256:1c989246c2aebc13253f08be32538a4039a64e12d9c18f6d662d7aee641dc8b5 \ + --hash=sha256:1da38070738da53556a4b35ab67c1b9884a5dd48fa2f243db35dc14079ea3d0c \ + --hash=sha256:27cd9ef5c5d68d5ed104b6dcb96fe9c66b82050e546c9e255716903c3d8f0373 \ + --hash=sha256:2e02d3b96f2d0e4bab9ceaa30f37d4f75571e40c6272e95364bff3125a64d184 \ + --hash=sha256:2f968b049c2849540751ec2100ab05e8086c24bead769ca734fdab58698408c1 \ + --hash=sha256:350a80485e302daaa95d335a931f97b693e170e02d43767ab06552c708808950 \ + --hash=sha256:3eae6ea76d62fcac091e1f15c2dcedf1dc3f114f8df1a972a8a0745e89f4cf61 \ + 
--hash=sha256:47a5c093ab256dec5714a7a345f8cc89315cb57c298b276fa244f37a0ba507f0 \ + --hash=sha256:5782883a27d3fae8c425b29a9d3dcf5f47d992848a1b76970da3b5a28d424b26 \ + --hash=sha256:6a56d344b0bab30bf342a67e33d386b0b3c4e65868ffe93c341c51e1a8853ca5 \ + --hash=sha256:6c3064610826f50bd69410c63101954676edc703e03f9e8f978a135f1aaf97c1 \ + --hash=sha256:703f46e0012af83a36082b5f30341113474ed0d91e36640da713355cd0ea5d23 \ + --hash=sha256:710fecf6a171dcbfa263a0a3e7070e0df65ba73158d4c539cec50978f11dad5d \ + --hash=sha256:7c7136015c3d62c3eef493efabaf9e3380e3e66d24ee8e94c01cb71377f57833 \ + --hash=sha256:7cc83023acd8bc72cf74c2edbe85b52098501d5b74d8377bfa06f3e929803492 \ + --hash=sha256:7f2483ea232bd72d98a6dc6d7aefd97e5bc80b15cd909b9e356d6f3e326b6e43 \ + --hash=sha256:7ff7d58a45b75df67d25f8f144936a3e44aabd91afec833ee06826bd02b7fbe7 \ + --hash=sha256:8ad0473af5544f89fc5a1ece8676dd03bdf160fb3230f967e05d0f4bf89620e3 \ + --hash=sha256:8c5d22b252dcef11dd1e0fbbe5bbfb9b4ae048e8880d33338215e8ccbdb03edc \ + --hash=sha256:8e62cc7164b0b7c5128e637e394eb2ef3db0e61fc798e80c301de3b2379203ed \ + --hash=sha256:962c84b4da0f3b14b3cdb10bc3837ebc5f136b67d919aea8d7bb3fd3df39528a \ + --hash=sha256:ace43b26d88a58dcff16c20d23ff72b04d0a415f64d2820f4ff06b1166f50557 \ + --hash=sha256:b47d0dda1bdb0a0ba7a9a6de88e5a1ed61f07fad613964879954961e36d49193 \ + --hash=sha256:b77f9f9cee87cd798f0fe26b7024344d1b03a7cd2d2cba7035f8433b13986325 \ + --hash=sha256:b881fd9505a84457e9f7e99362eeedd86497b659030cf57c6f0070df6d9c2b9b \ + --hash=sha256:bfda6ee8990997a9df95c5606f3096dae65f09af7ca03a1e9ca28f088caca5cf \ + --hash=sha256:c3a1ac9d394f8e229eb28eec2e04b9a6f5433fa19c9d32f1cb6066e3c5114a1d \ + --hash=sha256:c8ad5cce554e2fcaf8842dee5d9462583b601a3a78f8b76a153c38c963f58c10 \ + --hash=sha256:ca246dffeca0498be9b4e1ee169b62e64694b0f92e6d0be2573e65522f39eea9 \ + --hash=sha256:ca4f5eeadbb57cf03317d6a2857823239a63a59cc935f5bd6cf6e8b7af7a7ecc \ + --hash=sha256:d102b9b21c4e1e40af9a2ab3c6d41afba6bd29c0aa50ca013bf85c99cdc44ac5 \ + 
--hash=sha256:db3bc9fa39afc5e4e2767da4459df82b095ef0cab2f257707be06c44a1c2c3e5 \ + --hash=sha256:dc9ad9950119d8ae27634e68b7663cc8d340ae535a0f80d85a55e56a6973ab1f \ + --hash=sha256:e02d7c1a02e3814c94ba0cfe43d93e872c758bd8fd5c2797f894d0c49b4a1dfc \ + --hash=sha256:e0898d412a434e768a0c7e365acabe13ff1558b767e400936e26b5b6ed1ee51f \ + --hash=sha256:e18e15287c31baf574fcdf8251fb7f997d64e96c6ecf467906e576da0a079af6 \ + --hash=sha256:ec279dcf3518201fc592c65002754f58a6b542798cd7f3ecd4af086422f33f29 \ + --hash=sha256:ec6fbded0c61afe6f84e3c2a43e6d656791d95747d6d28b73eff1af64108c434 \ + --hash=sha256:eec73a005443061f4759b71a056f745e3b000dc0dc125c9f20560232dfbcbd14 \ + --hash=sha256:f3d812daffd0c2d2794756bd45a353f89e55dc8f91eb2fc840c51b9f6be62667 \ + --hash=sha256:f4b1615adf67bd8bb71f3464146a6f9949972d06d21a4f5e87e73f6464d97f57 \ + --hash=sha256:f6831fdec2b853c9daa3358535c55eed3694325889aa714070528cf8f92d7d6d + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # -r python/requirements/cloud-requirements.txt +gymnasium==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements.txt @@ -1247,6 +1353,12 @@ jupyterlab-widgets==3.0.11 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # ipywidgets +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # celery lazy-loader==0.4 \ --hash=sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 @@ -1495,6 +1607,7 @@ msal==1.28.1 \ --hash=sha256:d72bbfe2d5c2f2555f4bc6205be4450ddfd12976610dd9a16a9ab0f05c68b64d # via # -c /tmp/ray-deps/requirements_compiled.txt + # 
azure-datalake-store # azure-identity # msal-extensions msal-extensions==1.2.0b1 \ @@ -1827,6 +1940,7 @@ packaging==23.0 \ # jupyter-server # jupyterlab # jupyterlab-server + # kombu # lazy-loader # nbconvert # pytest @@ -2001,6 +2115,7 @@ prompt-toolkit==3.0.41 \ --hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 # via # -c /tmp/ray-deps/requirements_compiled.txt + # click-repl # ipython propcache==0.3.0 \ --hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ @@ -2284,114 +2399,113 @@ pycurl==7.45.3 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt -pydantic==2.10.0 \ - --hash=sha256:0aca0f045ff6e2f097f1fe89521115335f15049eeb8a7bef3dafe4b19a74e289 \ - --hash=sha256:5e7807ba9201bdf61b1b58aa6eb690916c40a47acfb114b1b4fef3e7fd5b30fc +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements.txt # fastapi -pydantic-core==2.27.0 \ - --hash=sha256:0aa4d1b2eba9a325897308b3124014a142cdccb9f3e016f31d3ebee6b5ea5e75 \ - --hash=sha256:0d06b667e53320332be2bf6f9461f4a9b78092a079b8ce8634c9afaa7e10cd9f \ - --hash=sha256:153017e3d6cd3ce979de06d84343ca424bb6092727375eba1968c8b4693c6ecb \ - --hash=sha256:15e350efb67b855cd014c218716feea4986a149ed1f42a539edd271ee074a196 \ - --hash=sha256:185ef205256cd8b38431205698531026979db89a79587725c1e55c59101d64e9 \ - --hash=sha256:1da0c98a85a6c6ed702d5556db3b09c91f9b0b78de37b7593e2de8d03238807a \ - --hash=sha256:225bfff5d425c34e1fd562cef52d673579d59b967d9de06178850c4802af9039 \ - --hash=sha256:24f984fc7762ed5f806d9e8c4c77ea69fdb2afd987b4fd319ef06c87595a8c55 \ - --hash=sha256:25a7fd4de38f7ff99a37e18fa0098c3140286451bc823d1746ba80cec5b433a1 \ - --hash=sha256:2883b260f7a93235488699d39cbbd94fa7b175d3a8063fbfddd3e81ad9988cb2 \ - 
--hash=sha256:2a51ce96224eadd1845150b204389623c8e129fde5a67a84b972bd83a85c6c40 \ - --hash=sha256:2be0ad541bb9f059954ccf8877a49ed73877f862529575ff3d54bf4223e4dd61 \ - --hash=sha256:31a2cae5f059329f9cfe3d8d266d3da1543b60b60130d186d9b6a3c20a346361 \ - --hash=sha256:333c840a1303d1474f491e7be0b718226c730a39ead0f7dab2c7e6a2f3855555 \ - --hash=sha256:33d14369739c5d07e2e7102cdb0081a1fa46ed03215e07f097b34e020b83b1ae \ - --hash=sha256:35380671c3c921fe8adf31ad349dc6f7588b7e928dbe44e1093789734f607399 \ - --hash=sha256:359e7951f04ad35111b5ddce184db3391442345d0ab073aa63a95eb8af25a5ef \ - --hash=sha256:36aa167f69d8807ba7e341d67ea93e50fcaaf6bc433bb04939430fa3dab06f31 \ - --hash=sha256:395e3e1148fa7809016231f8065f30bb0dc285a97b4dc4360cd86e17bab58af7 \ - --hash=sha256:3e8d89c276234579cd3d095d5fa2a44eb10db9a218664a17b56363cddf226ff3 \ - --hash=sha256:3eb8849445c26b41c5a474061032c53e14fe92a11a5db969f722a2716cd12206 \ - --hash=sha256:3fd8bc2690e7c39eecdf9071b6a889ce7b22b72073863940edc2a0a23750ca90 \ - --hash=sha256:400bf470e4327e920883b51e255617dfe4496d4e80c3fea0b5a5d0bf2c404dd4 \ - --hash=sha256:4148dc9184ab79e356dc00a4199dc0ee8647973332cb385fc29a7cced49b9f9c \ - --hash=sha256:433689845288f9a1ee5714444e65957be26d30915f7745091ede4a83cfb2d7bb \ - --hash=sha256:43b61989068de9ce62296cde02beffabcadb65672207fc51e7af76dca75e6636 \ - --hash=sha256:4523c4009c3f39d948e01962223c9f5538602e7087a628479b723c939fab262d \ - --hash=sha256:483c2213a609e7db2c592bbc015da58b6c75af7360ca3c981f178110d9787bcf \ - --hash=sha256:49633583eb7dc5cba61aaf7cdb2e9e662323ad394e543ee77af265736bcd3eaa \ - --hash=sha256:4b51f964fcbb02949fc546022e56cdb16cda457af485e9a3e8b78ac2ecf5d77e \ - --hash=sha256:4bf1340ae507f6da6360b24179c2083857c8ca7644aab65807023cf35404ea8d \ - --hash=sha256:4fb49cfdb53af5041aba909be00cccfb2c0d0a2e09281bf542371c5fd36ad04c \ - --hash=sha256:510b11e9c3b1a852876d1ccd8d5903684336d635214148637ceb27366c75a467 \ - --hash=sha256:513cb14c0cc31a4dfd849a4674b20c46d87b364f997bbcb02282306f5e187abf \ - 
--hash=sha256:58560828ee0951bb125c6f2862fbc37f039996d19ceb6d8ff1905abf7da0bf3d \ - --hash=sha256:58ab0d979c969983cdb97374698d847a4acffb217d543e172838864636ef10d9 \ - --hash=sha256:5982048129f40b082c2654de10c0f37c67a14f5ff9d37cf35be028ae982f26df \ - --hash=sha256:5ab325fc86fbc077284c8d7f996d904d30e97904a87d6fb303dce6b3de7ebba9 \ - --hash=sha256:5cc822ab90a70ea3a91e6aed3afac570b276b1278c6909b1d384f745bd09c714 \ - --hash=sha256:5f2b19b8d6fca432cb3acf48cf5243a7bf512988029b6e6fd27e9e8c0a204d85 \ - --hash=sha256:5fc72fbfebbf42c0856a824b8b0dc2b5cd2e4a896050281a21cfa6fed8879cb1 \ - --hash=sha256:6354e18a9be37bfa124d6b288a87fb30c673745806c92956f1a25e3ae6e76b96 \ - --hash=sha256:678f66462058dd978702db17eb6a3633d634f7aa0deaea61e0a674152766d3fc \ - --hash=sha256:68950bc08f9735306322bfc16a18391fcaac99ded2509e1cc41d03ccb6013cfe \ - --hash=sha256:68ef5377eb582fa4343c9d0b57a5b094046d447b4c73dd9fbd9ffb216f829e7d \ - --hash=sha256:6b4c19525c3538fbc0bbda6229f9682fb8199ce9ac37395880e6952798e00373 \ - --hash=sha256:6bb69bf3b6500f195c3deb69c1205ba8fc3cb21d1915f1f158a10d6b1ef29b6a \ - --hash=sha256:6e19401742ed7b69e51d8e4df3c03ad5ec65a83b36244479fd70edde2828a5d9 \ - --hash=sha256:6f4a53af9e81d757756508b57cae1cf28293f0f31b9fa2bfcb416cc7fb230f9d \ - --hash=sha256:6fda87808429c520a002a85d6e7cdadbf58231d60e96260976c5b8f9a12a8e13 \ - --hash=sha256:78f841523729e43e3928a364ec46e2e3f80e6625a4f62aca5c345f3f626c6e8a \ - --hash=sha256:7a6ebfac28fd51890a61df36ef202adbd77d00ee5aca4a3dadb3d9ed49cfb929 \ - --hash=sha256:7b0202ebf2268954090209a84f9897345719e46a57c5f2c9b7b250ca0a9d3e63 \ - --hash=sha256:8117839a9bdbba86e7f9df57018fe3b96cec934c3940b591b0fd3fbfb485864a \ - --hash=sha256:82e1ad4ca170e8af4c928b67cff731b6296e6a0a0981b97b2eb7c275cc4e15bd \ - --hash=sha256:836a4bfe0cc6d36dc9a9cc1a7b391265bf6ce9d1eb1eac62ac5139f5d8d9a6fa \ - --hash=sha256:84af1cf7bfdcbc6fcf5a5f70cc9896205e0350306e4dd73d54b6a18894f79386 \ - --hash=sha256:84e35afd9e10b2698e6f2f32256678cb23ca6c1568d02628033a837638b3ed12 \ - 
--hash=sha256:884f1806609c2c66564082540cffc96868c5571c7c3cf3a783f63f2fb49bd3cd \ - --hash=sha256:8a150392102c402c538190730fda06f3bce654fc498865579a9f2c1d2b425833 \ - --hash=sha256:8e21d927469d04b39386255bf00d0feedead16f6253dcc85e9e10ddebc334084 \ - --hash=sha256:8e96ca781e0c01e32115912ebdf7b3fb0780ce748b80d7d28a0802fa9fbaf44e \ - --hash=sha256:8ee4c2a75af9fe21269a4a0898c5425afb01af1f5d276063f57e2ae1bc64e191 \ - --hash=sha256:91bc66f878557313c2a6bcf396e7befcffe5ab4354cfe4427318968af31143c3 \ - --hash=sha256:951e71da6c89d354572098bada5ba5b5dc3a9390c933af8a614e37755d3d1840 \ - --hash=sha256:99b2863c1365f43f74199c980a3d40f18a218fbe683dd64e470199db426c4d6a \ - --hash=sha256:9a8fbf506fde1529a1e3698198fe64bfbe2e0c09557bc6a7dcf872e7c01fec40 \ - --hash=sha256:9ce048deb1e033e7a865ca384770bccc11d44179cf09e5193a535c4c2f497bdc \ - --hash=sha256:9fe94d9d2a2b4edd7a4b22adcd45814b1b59b03feb00e56deb2e89747aec7bfe \ - --hash=sha256:a291d0b4243a259c8ea7e2b84eb9ccb76370e569298875a7c5e3e71baf49057a \ - --hash=sha256:a5c022bb0d453192426221605efc865373dde43b17822a264671c53b068ac20c \ - --hash=sha256:abb4785894936d7682635726613c44578c420a096729f1978cd061a7e72d5275 \ - --hash=sha256:b872c86d8d71827235c7077461c502feb2db3f87d9d6d5a9daa64287d75e4fa0 \ - --hash=sha256:bf37b72834e7239cf84d4a0b2c050e7f9e48bced97bad9bdf98d26b8eb72e846 \ - --hash=sha256:c0c431e4be5c1a0c6654e0c31c661cd89e0ca956ef65305c3c3fd96f4e72ca39 \ - --hash=sha256:c5726eec789ee38f2c53b10b1821457b82274f81f4f746bb1e666d8741fcfadb \ - --hash=sha256:c6fcb3fa3855d583aa57b94cf146f7781d5d5bc06cb95cb3afece33d31aac39b \ - --hash=sha256:c86679f443e7085ea55a7376462553996c688395d18ef3f0d3dbad7838f857a2 \ - --hash=sha256:c91e3c04f5191fd3fb68764bddeaf02025492d5d9f23343b283870f6ace69708 \ - --hash=sha256:c921ad596ff1a82f9c692b0758c944355abc9f0de97a4c13ca60ffc6d8dc15d4 \ - --hash=sha256:c9ed88b398ba7e3bad7bd64d66cc01dcde9cfcb7ec629a6fd78a82fa0b559d78 \ - --hash=sha256:cd2ac6b919f7fed71b17fe0b4603c092a4c9b5bae414817c9c81d3c22d1e1bcc \ - 
--hash=sha256:d28ca7066d6cdd347a50d8b725dc10d9a1d6a1cce09836cf071ea6a2d4908be0 \ - --hash=sha256:d29e235ce13c91902ef3efc3d883a677655b3908b1cbc73dee816e5e1f8f7739 \ - --hash=sha256:d8b5ee4ae9170e2775d495b81f414cc20268041c42571530513496ba61e94ba3 \ - --hash=sha256:db72e40628967f6dc572020d04b5f800d71264e0531c6da35097e73bdf38b003 \ - --hash=sha256:df45c4073bed486ea2f18757057953afed8dd77add7276ff01bccb79982cf46c \ - --hash=sha256:dfa5f5c0a4c8fced1422dc2ca7eefd872d5d13eb33cf324361dbf1dbfba0a9fe \ - --hash=sha256:e015833384ca3e1a0565a79f5d953b0629d9138021c27ad37c92a9fa1af7623c \ - --hash=sha256:e15315691fe2253eb447503153acef4d7223dfe7e7702f9ed66539fcd0c43801 \ - --hash=sha256:e65466b31be1070b4a5b7dbfbd14b247884cb8e8b79c64fb0f36b472912dbaea \ - --hash=sha256:e7820bb0d65e3ce1e3e70b6708c2f66143f55912fa02f4b618d0f08b61575f12 \ - --hash=sha256:e851a051f7260e6d688267eb039c81f05f23a19431bd7dfa4bf5e3cb34c108cd \ - --hash=sha256:e9f9feee7f334b72ceae46313333d002b56f325b5f04271b4ae2aadd9e993ae4 \ - --hash=sha256:eb40f828bc2f73f777d1eb8fee2e86cd9692a4518b63b6b5aa8af915dfd3207b \ - --hash=sha256:eb704155e73b833801c247f39d562229c0303f54770ca14fb1c053acb376cf10 \ - --hash=sha256:edb1bfd45227dec8d50bc7c7d86463cd8728bcc574f9b07de7369880de4626a3 \ - --hash=sha256:ee7d9d5537daf6d5c74a83b38a638cc001b648096c1cae8ef695b0c919d9d379 \ - --hash=sha256:f57783fbaf648205ac50ae7d646f27582fc706be3977e87c3c124e7a92407b10 \ - --hash=sha256:ff63a92f6e249514ef35bc795de10745be0226eaea06eb48b4bbeaa0c8850a4a +pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + 
--hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + 
--hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + --hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + --hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + 
--hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + --hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + --hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + 
--hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + --hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + --hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d # via # -c /tmp/ray-deps/requirements_compiled.txt # pydantic @@ -2451,6 +2565,7 @@ python-dateutil==2.8.2 \ # -r python/requirements/cloud-requirements.txt # arrow # botocore + 
# celery # jupyter-client # pandas python-json-logger==2.0.7 \ @@ -2633,6 +2748,7 @@ requests==2.32.3 \ # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # azure-core + # azure-datalake-store # google-api-core # google-cloud-storage # jupyterlab-server @@ -2776,9 +2892,9 @@ rsa==4.7.2 \ # -c /tmp/ray-deps/requirements_compiled.txt # google-auth # oauth2client -s3transfer==0.6.2 \ - --hash=sha256:b014be3a8a2aab98cfe1abc7229cc5a9a0cf05eb9c1f2b86b230fd8df3f78084 \ - --hash=sha256:cab66d3380cca3e70939ef2255d01cd8aece6a4907a9528740f668c4b0611861 +s3transfer==0.8.0 \ + --hash=sha256:baa479dc2e63e5c2ed51611b4d46cdf0295e2070d8d0b86b22f335ee5b954986 \ + --hash=sha256:e8d6bd52ffd99841e3a57b34370a54841f12d3aab072af862cdcc50955288002 # via # -c /tmp/ray-deps/requirements_compiled.txt # boto3 @@ -3053,6 +3169,19 @@ typing-extensions==4.12.2 \ # pyopenssl # referencing # typer + # typing-inspection +typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # pydantic +tzdata==2025.2 \ + --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # kombu tzlocal==5.3 \ --hash=sha256:2fafbfc07e9d8b49ade18f898d6bcd37ae88ce3ad6486842a2e4f03af68323d2 \ --hash=sha256:3814135a1bb29763c6e4f08fd6e41dbb435c7a60bfbb03270211bcc537187d8c @@ -3079,6 +3208,14 @@ uvicorn==0.22.0 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements.txt +vine==5.1.0 \ + --hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # amqp + # celery + # kombu 
virtualenv==20.29.1 \ --hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ --hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 @@ -3399,5 +3536,4 @@ zipp==3.19.2 \ # importlib-metadata # The following packages were excluded from the output: -# grpcio-tools # setuptools diff --git a/python/requirements_compiled_ray_test_py311_cu121.txt b/python/deplocks/llm/ray_test_py311_cu121.lock similarity index 90% rename from python/requirements_compiled_ray_test_py311_cu121.txt rename to python/deplocks/llm/ray_test_py311_cu121.lock index ab15c20d3ec4..d61e27a0a6c9 100644 --- a/python/requirements_compiled_ray_test_py311_cu121.txt +++ b/python/deplocks/llm/ray_test_py311_cu121.lock @@ -1,8 +1,14 @@ # This file was autogenerated by uv via the following command: -# uv pip compile --generate-hashes --strip-extras --unsafe-package ray --unsafe-package grpcio-tools --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cu121 --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links -c /tmp/ray-deps/requirements_compiled.txt python/requirements.txt python/requirements/cloud-requirements.txt python/requirements/base-test-requirements.txt -o python/requirements_compiled_ray_test_py311_cu121.txt +# uv pip compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --extra-index-url https://download.pytorch.org/whl/cu121 --python-version=3.11 --unsafe-package ray --python-platform=linux -c /tmp/ray-deps/requirements_compiled.txt python/requirements.txt python/requirements/cloud-requirements.txt python/requirements/base-test-requirements.txt -o python/deplocks/llm/ray_test_py311_cu121.lock --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cu121 +adlfs==2023.8.0 \ + 
--hash=sha256:07e804f6df4593acfcaf01025b162e30ac13e523d3570279c98b2d91a18026d9 \ + --hash=sha256:3eb248a3c2a30b419f1147bd7676d156b5219f96ef7f11d47166afd2a3bdb07e + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # -r python/requirements/cloud-requirements.txt aiofiles==22.1.0 \ --hash=sha256:1142fa8e80dbae46bb6339573ad4c8c0841358f79c6eb50a493dceca14621bad \ --hash=sha256:9107f1ca0b2a5553987a94a3c9959fe5b491fdf731389aa5b7b1bd0733e32de6 @@ -101,6 +107,7 @@ aiohttp==3.11.16 \ # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt + # adlfs # aiohttp-cors # pytest-aiohttp aiohttp-cors==0.7.0 \ @@ -127,6 +134,12 @@ aiosqlite==0.19.0 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # ypy-websocket +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # kombu annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d @@ -205,20 +218,29 @@ azure-core==1.29.5 \ --hash=sha256:52983c89d394c6f881a121e5101c5fa67278ca3b1f339c8fb2ef39230c70e9ac # via # -c /tmp/ray-deps/requirements_compiled.txt + # adlfs # azure-identity # azure-storage-blob # smart-open +azure-datalake-store==0.0.53 \ + --hash=sha256:05b6de62ee3f2a0a6e6941e6933b792b800c3e7f6ffce2fc324bc19875757393 \ + --hash=sha256:a30c902a6e360aa47d7f69f086b426729784e71c536f330b691647a51dc42b2b + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # adlfs azure-identity==1.17.1 \ --hash=sha256:32ecc67cc73f4bd0595e4f64b1ca65cd05186f4fe6f98ed2ae9f1aa32646efea \ --hash=sha256:db8d59c183b680e763722bfe8ebc45930e6c57df510620985939f7f3191e0382 # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt + # adlfs 
azure-storage-blob==12.22.0 \ --hash=sha256:b3804bb4fe8ab1c32771fa464053da772a682c2737b19da438a3f4e5e3b3736e \ --hash=sha256:bb7d2d824ce3f11f14a27ee7d9281289f7e072ac8311c52e3652672455b7d5e8 # via # -c /tmp/ray-deps/requirements_compiled.txt + # adlfs # smart-open babel==2.13.1 \ --hash=sha256:33e0952d7dd6374af8dbf6768cc4ddf3ccfefc244f9986d4074704f2fbd18900 \ @@ -238,22 +260,28 @@ beautifulsoup4==4.11.1 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # nbconvert +billiard==4.2.1 \ + --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # celery bleach==6.1.0 \ --hash=sha256:0a31f1837963c41d46bbf1331b8778e1308ea0791db03cc4e7357b97cf42a8fe \ --hash=sha256:3225f354cfc436b9789c66c4ee030194bee0568fbf9cbdad3bc8b5c26c5f12b6 # via # -c /tmp/ray-deps/requirements_compiled.txt # nbconvert -boto3==1.26.76 \ - --hash=sha256:30c7d967ed1c6b5a05643e42cae9d4d36c3f1cb6782637ddc7007a104cfd9027 \ - --hash=sha256:b4c2969b7677762914394b8273cc1905dfe5b71f250741c1a575487ae357e729 +boto3==1.29.7 \ + --hash=sha256:1eb4c548118b5fc5e018dee956fd33e6fb249cd1f2def85f1bba816aef4d9f3e \ + --hash=sha256:96e9890ebe7cd823b5f4976dd676e112c000c6528c28e20a2f274590589dd18b # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt # smart-open -botocore==1.29.76 \ - --hash=sha256:70735b00cd529f152992231ca6757e458e5ec25db43767b3526e9a35b2f143b7 \ - --hash=sha256:c2f67b6b3f8acf2968eafca06526f07b9fb0d27bac4c68a635d51abb675134a7 +botocore==1.32.7 \ + --hash=sha256:58b33d02cafa23461c8a9d211b30e8cded992380a84de409379fd02811fa3e11 \ + --hash=sha256:c6795c731b04c8e3635588c44cfd1a4462fc5987859195522c96812cf3eceff9 # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt @@ -265,6 +293,12 @@ cachetools==5.5.2 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # 
google-auth +celery==5.5.3 \ + --hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # -r python/requirements.txt certifi==2025.1.31 \ --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe @@ -328,6 +362,7 @@ cffi==1.16.0 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # argon2-cffi-bindings + # azure-datalake-store # cryptography charset-normalizer==3.3.2 \ --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ @@ -430,8 +465,30 @@ click==8.1.7 \ # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt + # celery + # click-didyoumean + # click-plugins + # click-repl # typer # uvicorn +click-didyoumean==0.3.1 \ + --hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # celery +click-plugins==1.1.1.2 \ + --hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # celery cloudpickle==2.2.0 \ --hash=sha256:3f4219469c55453cfe4737e564b67c2a149109dabf7f242478948b895f61106f \ --hash=sha256:7428798d5926d8fcbfd092d18d01a2a03daf8237d8fcdc8095d256b8490796f0 @@ -814,12 +871,13 @@ frozenlist==1.4.1 \ # -c /tmp/ray-deps/requirements_compiled.txt # 
aiohttp # aiosignal -fsspec==2023.5.0 \ - --hash=sha256:51a4ad01a5bb66fcc58036e288c0d53d3975a0df2a5dc59a93b59bade0391f2a \ - --hash=sha256:b3b56e00fb93ea321bc9e5d9cf6f8522a0198b20eb24e02774d329e9c6fb84ce +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements.txt + # adlfs gitdb==4.0.11 \ --hash=sha256:81a3407ddd2ee8df444cbacea00e2d038e40150acfa3001696fe0dcf1d3adfa4 \ --hash=sha256:bf5421126136d6d0af55bc1e7c1af1c397a34f5b7bd79e776cd3e89785c2b04b @@ -947,70 +1005,118 @@ googleapis-common-protos==1.61.0 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # google-api-core -grpcio==1.66.2 \ - --hash=sha256:02697eb4a5cbe5a9639f57323b4c37bcb3ab2d48cec5da3dc2f13334d72790dd \ - --hash=sha256:03b0b307ba26fae695e067b94cbb014e27390f8bc5ac7a3a39b7723fed085604 \ - --hash=sha256:05bc2ceadc2529ab0b227b1310d249d95d9001cd106aa4d31e8871ad3c428d73 \ - --hash=sha256:06de8ec0bd71be123eec15b0e0d457474931c2c407869b6c349bd9bed4adbac3 \ - --hash=sha256:0be4e0490c28da5377283861bed2941d1d20ec017ca397a5df4394d1c31a9b50 \ - --hash=sha256:12fda97ffae55e6526825daf25ad0fa37483685952b5d0f910d6405c87e3adb6 \ - --hash=sha256:1caa38fb22a8578ab8393da99d4b8641e3a80abc8fd52646f1ecc92bcb8dee34 \ - --hash=sha256:2018b053aa15782db2541ca01a7edb56a0bf18c77efed975392583725974b249 \ - --hash=sha256:20657d6b8cfed7db5e11b62ff7dfe2e12064ea78e93f1434d61888834bc86d75 \ - --hash=sha256:2335c58560a9e92ac58ff2bc5649952f9b37d0735608242973c7a8b94a6437d8 \ - --hash=sha256:31fd163105464797a72d901a06472860845ac157389e10f12631025b3e4d0453 \ - --hash=sha256:38b68498ff579a3b1ee8f93a05eb48dc2595795f2f62716e797dc24774c1aaa8 \ - --hash=sha256:3b00efc473b20d8bf83e0e1ae661b98951ca56111feb9b9611df8efc4fe5d55d \ - --hash=sha256:3ed71e81782966ffead60268bbda31ea3f725ebf8aa73634d5dda44f2cf3fb9c \ - 
--hash=sha256:45a3d462826f4868b442a6b8fdbe8b87b45eb4f5b5308168c156b21eca43f61c \ - --hash=sha256:49f0ca7ae850f59f828a723a9064cadbed90f1ece179d375966546499b8a2c9c \ - --hash=sha256:4e504572433f4e72b12394977679161d495c4c9581ba34a88d843eaf0f2fbd39 \ - --hash=sha256:4ea1d062c9230278793820146c95d038dc0f468cbdd172eec3363e42ff1c7d01 \ - --hash=sha256:563588c587b75c34b928bc428548e5b00ea38c46972181a4d8b75ba7e3f24231 \ - --hash=sha256:6001e575b8bbd89eee11960bb640b6da6ae110cf08113a075f1e2051cc596cae \ - --hash=sha256:66a0cd8ba6512b401d7ed46bb03f4ee455839957f28b8d61e7708056a806ba6a \ - --hash=sha256:6851de821249340bdb100df5eacfecfc4e6075fa85c6df7ee0eb213170ec8e5d \ - --hash=sha256:728bdf36a186e7f51da73be7f8d09457a03061be848718d0edf000e709418987 \ - --hash=sha256:73e3b425c1e155730273f73e419de3074aa5c5e936771ee0e4af0814631fb30a \ - --hash=sha256:73fc8f8b9b5c4a03e802b3cd0c18b2b06b410d3c1dcbef989fdeb943bd44aff7 \ - --hash=sha256:78fa51ebc2d9242c0fc5db0feecc57a9943303b46664ad89921f5079e2e4ada7 \ - --hash=sha256:7b2c86457145ce14c38e5bf6bdc19ef88e66c5fee2c3d83285c5aef026ba93b3 \ - --hash=sha256:7d69ce1f324dc2d71e40c9261d3fdbe7d4c9d60f332069ff9b2a4d8a257c7b2b \ - --hash=sha256:802d84fd3d50614170649853d121baaaa305de7b65b3e01759247e768d691ddf \ - --hash=sha256:80fd702ba7e432994df208f27514280b4b5c6843e12a48759c9255679ad38db8 \ - --hash=sha256:8ac475e8da31484efa25abb774674d837b343afb78bb3bcdef10f81a93e3d6bf \ - --hash=sha256:950da58d7d80abd0ea68757769c9db0a95b31163e53e5bb60438d263f4bed7b7 \ - --hash=sha256:99a641995a6bc4287a6315989ee591ff58507aa1cbe4c2e70d88411c4dcc0839 \ - --hash=sha256:9c3a99c519f4638e700e9e3f83952e27e2ea10873eecd7935823dab0c1c9250e \ - --hash=sha256:9c509a4f78114cbc5f0740eb3d7a74985fd2eff022971bc9bc31f8bc93e66a3b \ - --hash=sha256:a18e20d8321c6400185b4263e27982488cb5cdd62da69147087a76a24ef4e7e3 \ - --hash=sha256:a917d26e0fe980b0ac7bfcc1a3c4ad6a9a4612c911d33efb55ed7833c749b0ee \ - --hash=sha256:a9539f01cb04950fd4b5ab458e64a15f84c2acc273670072abe49a3f29bbad54 \ - 
--hash=sha256:ad2efdbe90c73b0434cbe64ed372e12414ad03c06262279b104a029d1889d13e \ - --hash=sha256:b672abf90a964bfde2d0ecbce30f2329a47498ba75ce6f4da35a2f4532b7acbc \ - --hash=sha256:bbd27c24a4cc5e195a7f56cfd9312e366d5d61b86e36d46bbe538457ea6eb8dd \ - --hash=sha256:c400ba5675b67025c8a9f48aa846f12a39cf0c44df5cd060e23fda5b30e9359d \ - --hash=sha256:c408f5ef75cfffa113cacd8b0c0e3611cbfd47701ca3cdc090594109b9fcbaed \ - --hash=sha256:c806852deaedee9ce8280fe98955c9103f62912a5b2d5ee7e3eaa284a6d8d8e7 \ - --hash=sha256:ce89f5876662f146d4c1f695dda29d4433a5d01c8681fbd2539afff535da14d4 \ - --hash=sha256:d25a14af966438cddf498b2e338f88d1c9706f3493b1d73b93f695c99c5f0e2a \ - --hash=sha256:d8d4732cc5052e92cea2f78b233c2e2a52998ac40cd651f40e398893ad0d06ec \ - --hash=sha256:d9a9724a156c8ec6a379869b23ba3323b7ea3600851c91489b871e375f710bc8 \ - --hash=sha256:e636ce23273683b00410f1971d209bf3689238cf5538d960adc3cdfe80dd0dbd \ - --hash=sha256:e88264caad6d8d00e7913996030bac8ad5f26b7411495848cc218bd3a9040b6c \ - --hash=sha256:f145cc21836c332c67baa6fc81099d1d27e266401565bf481948010d6ea32d46 \ - --hash=sha256:fb57870449dfcfac428afbb5a877829fcb0d6db9d9baa1148705739e9083880e \ - --hash=sha256:fb70487c95786e345af5e854ffec8cb8cc781bcc5df7930c4fbb7feaa72e1cdf \ - --hash=sha256:fe96281713168a3270878255983d2cb1a97e034325c8c2c25169a69289d3ecfa \ - --hash=sha256:ff1f7882e56c40b0d33c4922c15dfa30612f05fb785074a012f7cda74d1c3679 +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + --hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + 
--hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + --hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + 
--hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # grpcio-tools -gymnasium==1.0.0 \ - --hash=sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403 \ - 
--hash=sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad +grpcio-tools==1.62.3 \ + --hash=sha256:0a52cc9444df978438b8d2332c0ca99000521895229934a59f94f37ed896b133 \ + --hash=sha256:0a8c0c4724ae9c2181b7dbc9b186df46e4f62cb18dc184e46d06c0ebeccf569e \ + --hash=sha256:0cb3a3436ac119cbd37a7d3331d9bdf85dad21a6ac233a3411dff716dcbf401e \ + --hash=sha256:11c625eebefd1fd40a228fc8bae385e448c7e32a6ae134e43cf13bbc23f902b7 \ + --hash=sha256:11f363570dea661dde99e04a51bd108a5807b5df32a6f8bdf4860e34e94a4dbf \ + --hash=sha256:141d028bf5762d4a97f981c501da873589df3f7e02f4c1260e1921e565b376fa \ + --hash=sha256:1c989246c2aebc13253f08be32538a4039a64e12d9c18f6d662d7aee641dc8b5 \ + --hash=sha256:1da38070738da53556a4b35ab67c1b9884a5dd48fa2f243db35dc14079ea3d0c \ + --hash=sha256:27cd9ef5c5d68d5ed104b6dcb96fe9c66b82050e546c9e255716903c3d8f0373 \ + --hash=sha256:2e02d3b96f2d0e4bab9ceaa30f37d4f75571e40c6272e95364bff3125a64d184 \ + --hash=sha256:2f968b049c2849540751ec2100ab05e8086c24bead769ca734fdab58698408c1 \ + --hash=sha256:350a80485e302daaa95d335a931f97b693e170e02d43767ab06552c708808950 \ + --hash=sha256:3eae6ea76d62fcac091e1f15c2dcedf1dc3f114f8df1a972a8a0745e89f4cf61 \ + --hash=sha256:47a5c093ab256dec5714a7a345f8cc89315cb57c298b276fa244f37a0ba507f0 \ + --hash=sha256:5782883a27d3fae8c425b29a9d3dcf5f47d992848a1b76970da3b5a28d424b26 \ + --hash=sha256:6a56d344b0bab30bf342a67e33d386b0b3c4e65868ffe93c341c51e1a8853ca5 \ + --hash=sha256:6c3064610826f50bd69410c63101954676edc703e03f9e8f978a135f1aaf97c1 \ + --hash=sha256:703f46e0012af83a36082b5f30341113474ed0d91e36640da713355cd0ea5d23 \ + --hash=sha256:710fecf6a171dcbfa263a0a3e7070e0df65ba73158d4c539cec50978f11dad5d \ + --hash=sha256:7c7136015c3d62c3eef493efabaf9e3380e3e66d24ee8e94c01cb71377f57833 \ + --hash=sha256:7cc83023acd8bc72cf74c2edbe85b52098501d5b74d8377bfa06f3e929803492 \ + --hash=sha256:7f2483ea232bd72d98a6dc6d7aefd97e5bc80b15cd909b9e356d6f3e326b6e43 \ + 
--hash=sha256:7ff7d58a45b75df67d25f8f144936a3e44aabd91afec833ee06826bd02b7fbe7 \ + --hash=sha256:8ad0473af5544f89fc5a1ece8676dd03bdf160fb3230f967e05d0f4bf89620e3 \ + --hash=sha256:8c5d22b252dcef11dd1e0fbbe5bbfb9b4ae048e8880d33338215e8ccbdb03edc \ + --hash=sha256:8e62cc7164b0b7c5128e637e394eb2ef3db0e61fc798e80c301de3b2379203ed \ + --hash=sha256:962c84b4da0f3b14b3cdb10bc3837ebc5f136b67d919aea8d7bb3fd3df39528a \ + --hash=sha256:ace43b26d88a58dcff16c20d23ff72b04d0a415f64d2820f4ff06b1166f50557 \ + --hash=sha256:b47d0dda1bdb0a0ba7a9a6de88e5a1ed61f07fad613964879954961e36d49193 \ + --hash=sha256:b77f9f9cee87cd798f0fe26b7024344d1b03a7cd2d2cba7035f8433b13986325 \ + --hash=sha256:b881fd9505a84457e9f7e99362eeedd86497b659030cf57c6f0070df6d9c2b9b \ + --hash=sha256:bfda6ee8990997a9df95c5606f3096dae65f09af7ca03a1e9ca28f088caca5cf \ + --hash=sha256:c3a1ac9d394f8e229eb28eec2e04b9a6f5433fa19c9d32f1cb6066e3c5114a1d \ + --hash=sha256:c8ad5cce554e2fcaf8842dee5d9462583b601a3a78f8b76a153c38c963f58c10 \ + --hash=sha256:ca246dffeca0498be9b4e1ee169b62e64694b0f92e6d0be2573e65522f39eea9 \ + --hash=sha256:ca4f5eeadbb57cf03317d6a2857823239a63a59cc935f5bd6cf6e8b7af7a7ecc \ + --hash=sha256:d102b9b21c4e1e40af9a2ab3c6d41afba6bd29c0aa50ca013bf85c99cdc44ac5 \ + --hash=sha256:db3bc9fa39afc5e4e2767da4459df82b095ef0cab2f257707be06c44a1c2c3e5 \ + --hash=sha256:dc9ad9950119d8ae27634e68b7663cc8d340ae535a0f80d85a55e56a6973ab1f \ + --hash=sha256:e02d7c1a02e3814c94ba0cfe43d93e872c758bd8fd5c2797f894d0c49b4a1dfc \ + --hash=sha256:e0898d412a434e768a0c7e365acabe13ff1558b767e400936e26b5b6ed1ee51f \ + --hash=sha256:e18e15287c31baf574fcdf8251fb7f997d64e96c6ecf467906e576da0a079af6 \ + --hash=sha256:ec279dcf3518201fc592c65002754f58a6b542798cd7f3ecd4af086422f33f29 \ + --hash=sha256:ec6fbded0c61afe6f84e3c2a43e6d656791d95747d6d28b73eff1af64108c434 \ + --hash=sha256:eec73a005443061f4759b71a056f745e3b000dc0dc125c9f20560232dfbcbd14 \ + --hash=sha256:f3d812daffd0c2d2794756bd45a353f89e55dc8f91eb2fc840c51b9f6be62667 \ + 
--hash=sha256:f4b1615adf67bd8bb71f3464146a6f9949972d06d21a4f5e87e73f6464d97f57 \ + --hash=sha256:f6831fdec2b853c9daa3358535c55eed3694325889aa714070528cf8f92d7d6d + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # -r python/requirements/cloud-requirements.txt +gymnasium==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements.txt @@ -1247,6 +1353,12 @@ jupyterlab-widgets==3.0.11 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # ipywidgets +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # celery lazy-loader==0.4 \ --hash=sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 @@ -1495,6 +1607,7 @@ msal==1.28.1 \ --hash=sha256:d72bbfe2d5c2f2555f4bc6205be4450ddfd12976610dd9a16a9ab0f05c68b64d # via # -c /tmp/ray-deps/requirements_compiled.txt + # azure-datalake-store # azure-identity # msal-extensions msal-extensions==1.2.0b1 \ @@ -1827,6 +1940,7 @@ packaging==23.0 \ # jupyter-server # jupyterlab # jupyterlab-server + # kombu # lazy-loader # nbconvert # pytest @@ -2001,6 +2115,7 @@ prompt-toolkit==3.0.41 \ --hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 # via # -c /tmp/ray-deps/requirements_compiled.txt + # click-repl # ipython propcache==0.3.0 \ --hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ @@ -2284,114 +2399,113 @@ pycurl==7.45.3 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt -pydantic==2.10.0 \ - 
--hash=sha256:0aca0f045ff6e2f097f1fe89521115335f15049eeb8a7bef3dafe4b19a74e289 \ - --hash=sha256:5e7807ba9201bdf61b1b58aa6eb690916c40a47acfb114b1b4fef3e7fd5b30fc +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements.txt # fastapi -pydantic-core==2.27.0 \ - --hash=sha256:0aa4d1b2eba9a325897308b3124014a142cdccb9f3e016f31d3ebee6b5ea5e75 \ - --hash=sha256:0d06b667e53320332be2bf6f9461f4a9b78092a079b8ce8634c9afaa7e10cd9f \ - --hash=sha256:153017e3d6cd3ce979de06d84343ca424bb6092727375eba1968c8b4693c6ecb \ - --hash=sha256:15e350efb67b855cd014c218716feea4986a149ed1f42a539edd271ee074a196 \ - --hash=sha256:185ef205256cd8b38431205698531026979db89a79587725c1e55c59101d64e9 \ - --hash=sha256:1da0c98a85a6c6ed702d5556db3b09c91f9b0b78de37b7593e2de8d03238807a \ - --hash=sha256:225bfff5d425c34e1fd562cef52d673579d59b967d9de06178850c4802af9039 \ - --hash=sha256:24f984fc7762ed5f806d9e8c4c77ea69fdb2afd987b4fd319ef06c87595a8c55 \ - --hash=sha256:25a7fd4de38f7ff99a37e18fa0098c3140286451bc823d1746ba80cec5b433a1 \ - --hash=sha256:2883b260f7a93235488699d39cbbd94fa7b175d3a8063fbfddd3e81ad9988cb2 \ - --hash=sha256:2a51ce96224eadd1845150b204389623c8e129fde5a67a84b972bd83a85c6c40 \ - --hash=sha256:2be0ad541bb9f059954ccf8877a49ed73877f862529575ff3d54bf4223e4dd61 \ - --hash=sha256:31a2cae5f059329f9cfe3d8d266d3da1543b60b60130d186d9b6a3c20a346361 \ - --hash=sha256:333c840a1303d1474f491e7be0b718226c730a39ead0f7dab2c7e6a2f3855555 \ - --hash=sha256:33d14369739c5d07e2e7102cdb0081a1fa46ed03215e07f097b34e020b83b1ae \ - --hash=sha256:35380671c3c921fe8adf31ad349dc6f7588b7e928dbe44e1093789734f607399 \ - --hash=sha256:359e7951f04ad35111b5ddce184db3391442345d0ab073aa63a95eb8af25a5ef \ - --hash=sha256:36aa167f69d8807ba7e341d67ea93e50fcaaf6bc433bb04939430fa3dab06f31 \ - 
--hash=sha256:395e3e1148fa7809016231f8065f30bb0dc285a97b4dc4360cd86e17bab58af7 \ - --hash=sha256:3e8d89c276234579cd3d095d5fa2a44eb10db9a218664a17b56363cddf226ff3 \ - --hash=sha256:3eb8849445c26b41c5a474061032c53e14fe92a11a5db969f722a2716cd12206 \ - --hash=sha256:3fd8bc2690e7c39eecdf9071b6a889ce7b22b72073863940edc2a0a23750ca90 \ - --hash=sha256:400bf470e4327e920883b51e255617dfe4496d4e80c3fea0b5a5d0bf2c404dd4 \ - --hash=sha256:4148dc9184ab79e356dc00a4199dc0ee8647973332cb385fc29a7cced49b9f9c \ - --hash=sha256:433689845288f9a1ee5714444e65957be26d30915f7745091ede4a83cfb2d7bb \ - --hash=sha256:43b61989068de9ce62296cde02beffabcadb65672207fc51e7af76dca75e6636 \ - --hash=sha256:4523c4009c3f39d948e01962223c9f5538602e7087a628479b723c939fab262d \ - --hash=sha256:483c2213a609e7db2c592bbc015da58b6c75af7360ca3c981f178110d9787bcf \ - --hash=sha256:49633583eb7dc5cba61aaf7cdb2e9e662323ad394e543ee77af265736bcd3eaa \ - --hash=sha256:4b51f964fcbb02949fc546022e56cdb16cda457af485e9a3e8b78ac2ecf5d77e \ - --hash=sha256:4bf1340ae507f6da6360b24179c2083857c8ca7644aab65807023cf35404ea8d \ - --hash=sha256:4fb49cfdb53af5041aba909be00cccfb2c0d0a2e09281bf542371c5fd36ad04c \ - --hash=sha256:510b11e9c3b1a852876d1ccd8d5903684336d635214148637ceb27366c75a467 \ - --hash=sha256:513cb14c0cc31a4dfd849a4674b20c46d87b364f997bbcb02282306f5e187abf \ - --hash=sha256:58560828ee0951bb125c6f2862fbc37f039996d19ceb6d8ff1905abf7da0bf3d \ - --hash=sha256:58ab0d979c969983cdb97374698d847a4acffb217d543e172838864636ef10d9 \ - --hash=sha256:5982048129f40b082c2654de10c0f37c67a14f5ff9d37cf35be028ae982f26df \ - --hash=sha256:5ab325fc86fbc077284c8d7f996d904d30e97904a87d6fb303dce6b3de7ebba9 \ - --hash=sha256:5cc822ab90a70ea3a91e6aed3afac570b276b1278c6909b1d384f745bd09c714 \ - --hash=sha256:5f2b19b8d6fca432cb3acf48cf5243a7bf512988029b6e6fd27e9e8c0a204d85 \ - --hash=sha256:5fc72fbfebbf42c0856a824b8b0dc2b5cd2e4a896050281a21cfa6fed8879cb1 \ - --hash=sha256:6354e18a9be37bfa124d6b288a87fb30c673745806c92956f1a25e3ae6e76b96 \ - 
--hash=sha256:678f66462058dd978702db17eb6a3633d634f7aa0deaea61e0a674152766d3fc \ - --hash=sha256:68950bc08f9735306322bfc16a18391fcaac99ded2509e1cc41d03ccb6013cfe \ - --hash=sha256:68ef5377eb582fa4343c9d0b57a5b094046d447b4c73dd9fbd9ffb216f829e7d \ - --hash=sha256:6b4c19525c3538fbc0bbda6229f9682fb8199ce9ac37395880e6952798e00373 \ - --hash=sha256:6bb69bf3b6500f195c3deb69c1205ba8fc3cb21d1915f1f158a10d6b1ef29b6a \ - --hash=sha256:6e19401742ed7b69e51d8e4df3c03ad5ec65a83b36244479fd70edde2828a5d9 \ - --hash=sha256:6f4a53af9e81d757756508b57cae1cf28293f0f31b9fa2bfcb416cc7fb230f9d \ - --hash=sha256:6fda87808429c520a002a85d6e7cdadbf58231d60e96260976c5b8f9a12a8e13 \ - --hash=sha256:78f841523729e43e3928a364ec46e2e3f80e6625a4f62aca5c345f3f626c6e8a \ - --hash=sha256:7a6ebfac28fd51890a61df36ef202adbd77d00ee5aca4a3dadb3d9ed49cfb929 \ - --hash=sha256:7b0202ebf2268954090209a84f9897345719e46a57c5f2c9b7b250ca0a9d3e63 \ - --hash=sha256:8117839a9bdbba86e7f9df57018fe3b96cec934c3940b591b0fd3fbfb485864a \ - --hash=sha256:82e1ad4ca170e8af4c928b67cff731b6296e6a0a0981b97b2eb7c275cc4e15bd \ - --hash=sha256:836a4bfe0cc6d36dc9a9cc1a7b391265bf6ce9d1eb1eac62ac5139f5d8d9a6fa \ - --hash=sha256:84af1cf7bfdcbc6fcf5a5f70cc9896205e0350306e4dd73d54b6a18894f79386 \ - --hash=sha256:84e35afd9e10b2698e6f2f32256678cb23ca6c1568d02628033a837638b3ed12 \ - --hash=sha256:884f1806609c2c66564082540cffc96868c5571c7c3cf3a783f63f2fb49bd3cd \ - --hash=sha256:8a150392102c402c538190730fda06f3bce654fc498865579a9f2c1d2b425833 \ - --hash=sha256:8e21d927469d04b39386255bf00d0feedead16f6253dcc85e9e10ddebc334084 \ - --hash=sha256:8e96ca781e0c01e32115912ebdf7b3fb0780ce748b80d7d28a0802fa9fbaf44e \ - --hash=sha256:8ee4c2a75af9fe21269a4a0898c5425afb01af1f5d276063f57e2ae1bc64e191 \ - --hash=sha256:91bc66f878557313c2a6bcf396e7befcffe5ab4354cfe4427318968af31143c3 \ - --hash=sha256:951e71da6c89d354572098bada5ba5b5dc3a9390c933af8a614e37755d3d1840 \ - --hash=sha256:99b2863c1365f43f74199c980a3d40f18a218fbe683dd64e470199db426c4d6a \ - 
--hash=sha256:9a8fbf506fde1529a1e3698198fe64bfbe2e0c09557bc6a7dcf872e7c01fec40 \ - --hash=sha256:9ce048deb1e033e7a865ca384770bccc11d44179cf09e5193a535c4c2f497bdc \ - --hash=sha256:9fe94d9d2a2b4edd7a4b22adcd45814b1b59b03feb00e56deb2e89747aec7bfe \ - --hash=sha256:a291d0b4243a259c8ea7e2b84eb9ccb76370e569298875a7c5e3e71baf49057a \ - --hash=sha256:a5c022bb0d453192426221605efc865373dde43b17822a264671c53b068ac20c \ - --hash=sha256:abb4785894936d7682635726613c44578c420a096729f1978cd061a7e72d5275 \ - --hash=sha256:b872c86d8d71827235c7077461c502feb2db3f87d9d6d5a9daa64287d75e4fa0 \ - --hash=sha256:bf37b72834e7239cf84d4a0b2c050e7f9e48bced97bad9bdf98d26b8eb72e846 \ - --hash=sha256:c0c431e4be5c1a0c6654e0c31c661cd89e0ca956ef65305c3c3fd96f4e72ca39 \ - --hash=sha256:c5726eec789ee38f2c53b10b1821457b82274f81f4f746bb1e666d8741fcfadb \ - --hash=sha256:c6fcb3fa3855d583aa57b94cf146f7781d5d5bc06cb95cb3afece33d31aac39b \ - --hash=sha256:c86679f443e7085ea55a7376462553996c688395d18ef3f0d3dbad7838f857a2 \ - --hash=sha256:c91e3c04f5191fd3fb68764bddeaf02025492d5d9f23343b283870f6ace69708 \ - --hash=sha256:c921ad596ff1a82f9c692b0758c944355abc9f0de97a4c13ca60ffc6d8dc15d4 \ - --hash=sha256:c9ed88b398ba7e3bad7bd64d66cc01dcde9cfcb7ec629a6fd78a82fa0b559d78 \ - --hash=sha256:cd2ac6b919f7fed71b17fe0b4603c092a4c9b5bae414817c9c81d3c22d1e1bcc \ - --hash=sha256:d28ca7066d6cdd347a50d8b725dc10d9a1d6a1cce09836cf071ea6a2d4908be0 \ - --hash=sha256:d29e235ce13c91902ef3efc3d883a677655b3908b1cbc73dee816e5e1f8f7739 \ - --hash=sha256:d8b5ee4ae9170e2775d495b81f414cc20268041c42571530513496ba61e94ba3 \ - --hash=sha256:db72e40628967f6dc572020d04b5f800d71264e0531c6da35097e73bdf38b003 \ - --hash=sha256:df45c4073bed486ea2f18757057953afed8dd77add7276ff01bccb79982cf46c \ - --hash=sha256:dfa5f5c0a4c8fced1422dc2ca7eefd872d5d13eb33cf324361dbf1dbfba0a9fe \ - --hash=sha256:e015833384ca3e1a0565a79f5d953b0629d9138021c27ad37c92a9fa1af7623c \ - --hash=sha256:e15315691fe2253eb447503153acef4d7223dfe7e7702f9ed66539fcd0c43801 \ - 
--hash=sha256:e65466b31be1070b4a5b7dbfbd14b247884cb8e8b79c64fb0f36b472912dbaea \ - --hash=sha256:e7820bb0d65e3ce1e3e70b6708c2f66143f55912fa02f4b618d0f08b61575f12 \ - --hash=sha256:e851a051f7260e6d688267eb039c81f05f23a19431bd7dfa4bf5e3cb34c108cd \ - --hash=sha256:e9f9feee7f334b72ceae46313333d002b56f325b5f04271b4ae2aadd9e993ae4 \ - --hash=sha256:eb40f828bc2f73f777d1eb8fee2e86cd9692a4518b63b6b5aa8af915dfd3207b \ - --hash=sha256:eb704155e73b833801c247f39d562229c0303f54770ca14fb1c053acb376cf10 \ - --hash=sha256:edb1bfd45227dec8d50bc7c7d86463cd8728bcc574f9b07de7369880de4626a3 \ - --hash=sha256:ee7d9d5537daf6d5c74a83b38a638cc001b648096c1cae8ef695b0c919d9d379 \ - --hash=sha256:f57783fbaf648205ac50ae7d646f27582fc706be3977e87c3c124e7a92407b10 \ - --hash=sha256:ff63a92f6e249514ef35bc795de10745be0226eaea06eb48b4bbeaa0c8850a4a +pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + 
--hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + --hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + 
--hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + --hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + --hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + --hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + 
--hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + --hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + --hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + --hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + 
--hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + --hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + --hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d # via # -c /tmp/ray-deps/requirements_compiled.txt # pydantic @@ -2451,6 +2565,7 @@ python-dateutil==2.8.2 \ # -r python/requirements/cloud-requirements.txt # arrow # botocore + # celery # jupyter-client # pandas python-json-logger==2.0.7 \ @@ -2633,6 +2748,7 @@ requests==2.32.3 \ # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # azure-core + # azure-datalake-store # google-api-core # google-cloud-storage # jupyterlab-server @@ -2776,9 +2892,9 @@ rsa==4.7.2 \ # -c /tmp/ray-deps/requirements_compiled.txt # google-auth # oauth2client -s3transfer==0.6.2 \ - --hash=sha256:b014be3a8a2aab98cfe1abc7229cc5a9a0cf05eb9c1f2b86b230fd8df3f78084 \ - --hash=sha256:cab66d3380cca3e70939ef2255d01cd8aece6a4907a9528740f668c4b0611861 +s3transfer==0.8.0 \ + 
--hash=sha256:baa479dc2e63e5c2ed51611b4d46cdf0295e2070d8d0b86b22f335ee5b954986 \ + --hash=sha256:e8d6bd52ffd99841e3a57b34370a54841f12d3aab072af862cdcc50955288002 # via # -c /tmp/ray-deps/requirements_compiled.txt # boto3 @@ -3053,6 +3169,19 @@ typing-extensions==4.12.2 \ # pyopenssl # referencing # typer + # typing-inspection +typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # pydantic +tzdata==2025.2 \ + --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # kombu tzlocal==5.3 \ --hash=sha256:2fafbfc07e9d8b49ade18f898d6bcd37ae88ce3ad6486842a2e4f03af68323d2 \ --hash=sha256:3814135a1bb29763c6e4f08fd6e41dbb435c7a60bfbb03270211bcc537187d8c @@ -3079,6 +3208,14 @@ uvicorn==0.22.0 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements.txt +vine==5.1.0 \ + --hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # amqp + # celery + # kombu virtualenv==20.29.1 \ --hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ --hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 @@ -3399,5 +3536,4 @@ zipp==3.19.2 \ # importlib-metadata # The following packages were excluded from the output: -# grpcio-tools # setuptools diff --git a/python/requirements_compiled_ray_test_py311_cu128.txt b/python/deplocks/llm/ray_test_py311_cu128.lock similarity index 90% rename from python/requirements_compiled_ray_test_py311_cu128.txt rename to python/deplocks/llm/ray_test_py311_cu128.lock index 
ff57573edfbe..058fe256012f 100644 --- a/python/requirements_compiled_ray_test_py311_cu128.txt +++ b/python/deplocks/llm/ray_test_py311_cu128.lock @@ -1,8 +1,14 @@ # This file was autogenerated by uv via the following command: -# uv pip compile --generate-hashes --strip-extras --unsafe-package ray --unsafe-package grpcio-tools --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cu128 --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links -c /tmp/ray-deps/requirements_compiled.txt python/requirements.txt python/requirements/cloud-requirements.txt python/requirements/base-test-requirements.txt -o python/requirements_compiled_ray_test_py311_cu128.txt +# uv pip compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --extra-index-url https://download.pytorch.org/whl/cu128 --python-version=3.11 --unsafe-package ray --python-platform=linux -c /tmp/ray-deps/requirements_compiled.txt python/requirements.txt python/requirements/cloud-requirements.txt python/requirements/base-test-requirements.txt -o python/deplocks/llm/ray_test_py311_cu128.lock --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cu128 +adlfs==2023.8.0 \ + --hash=sha256:07e804f6df4593acfcaf01025b162e30ac13e523d3570279c98b2d91a18026d9 \ + --hash=sha256:3eb248a3c2a30b419f1147bd7676d156b5219f96ef7f11d47166afd2a3bdb07e + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # -r python/requirements/cloud-requirements.txt aiofiles==22.1.0 \ --hash=sha256:1142fa8e80dbae46bb6339573ad4c8c0841358f79c6eb50a493dceca14621bad \ --hash=sha256:9107f1ca0b2a5553987a94a3c9959fe5b491fdf731389aa5b7b1bd0733e32de6 @@ -101,6 +107,7 @@ aiohttp==3.11.16 \ # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt # -r 
python/requirements.txt + # adlfs # aiohttp-cors # pytest-aiohttp aiohttp-cors==0.7.0 \ @@ -127,6 +134,12 @@ aiosqlite==0.19.0 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # ypy-websocket +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # kombu annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d @@ -205,20 +218,29 @@ azure-core==1.29.5 \ --hash=sha256:52983c89d394c6f881a121e5101c5fa67278ca3b1f339c8fb2ef39230c70e9ac # via # -c /tmp/ray-deps/requirements_compiled.txt + # adlfs # azure-identity # azure-storage-blob # smart-open +azure-datalake-store==0.0.53 \ + --hash=sha256:05b6de62ee3f2a0a6e6941e6933b792b800c3e7f6ffce2fc324bc19875757393 \ + --hash=sha256:a30c902a6e360aa47d7f69f086b426729784e71c536f330b691647a51dc42b2b + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # adlfs azure-identity==1.17.1 \ --hash=sha256:32ecc67cc73f4bd0595e4f64b1ca65cd05186f4fe6f98ed2ae9f1aa32646efea \ --hash=sha256:db8d59c183b680e763722bfe8ebc45930e6c57df510620985939f7f3191e0382 # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt + # adlfs azure-storage-blob==12.22.0 \ --hash=sha256:b3804bb4fe8ab1c32771fa464053da772a682c2737b19da438a3f4e5e3b3736e \ --hash=sha256:bb7d2d824ce3f11f14a27ee7d9281289f7e072ac8311c52e3652672455b7d5e8 # via # -c /tmp/ray-deps/requirements_compiled.txt + # adlfs # smart-open babel==2.13.1 \ --hash=sha256:33e0952d7dd6374af8dbf6768cc4ddf3ccfefc244f9986d4074704f2fbd18900 \ @@ -238,22 +260,28 @@ beautifulsoup4==4.11.1 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # nbconvert +billiard==4.2.1 \ + --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + 
--hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # celery bleach==6.1.0 \ --hash=sha256:0a31f1837963c41d46bbf1331b8778e1308ea0791db03cc4e7357b97cf42a8fe \ --hash=sha256:3225f354cfc436b9789c66c4ee030194bee0568fbf9cbdad3bc8b5c26c5f12b6 # via # -c /tmp/ray-deps/requirements_compiled.txt # nbconvert -boto3==1.26.76 \ - --hash=sha256:30c7d967ed1c6b5a05643e42cae9d4d36c3f1cb6782637ddc7007a104cfd9027 \ - --hash=sha256:b4c2969b7677762914394b8273cc1905dfe5b71f250741c1a575487ae357e729 +boto3==1.29.7 \ + --hash=sha256:1eb4c548118b5fc5e018dee956fd33e6fb249cd1f2def85f1bba816aef4d9f3e \ + --hash=sha256:96e9890ebe7cd823b5f4976dd676e112c000c6528c28e20a2f274590589dd18b # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt # smart-open -botocore==1.29.76 \ - --hash=sha256:70735b00cd529f152992231ca6757e458e5ec25db43767b3526e9a35b2f143b7 \ - --hash=sha256:c2f67b6b3f8acf2968eafca06526f07b9fb0d27bac4c68a635d51abb675134a7 +botocore==1.32.7 \ + --hash=sha256:58b33d02cafa23461c8a9d211b30e8cded992380a84de409379fd02811fa3e11 \ + --hash=sha256:c6795c731b04c8e3635588c44cfd1a4462fc5987859195522c96812cf3eceff9 # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt @@ -265,6 +293,12 @@ cachetools==5.5.2 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # google-auth +celery==5.5.3 \ + --hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # -r python/requirements.txt certifi==2025.1.31 \ --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe @@ -328,6 +362,7 @@ cffi==1.16.0 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # argon2-cffi-bindings + # 
azure-datalake-store # cryptography charset-normalizer==3.3.2 \ --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ @@ -430,8 +465,30 @@ click==8.1.7 \ # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt + # celery + # click-didyoumean + # click-plugins + # click-repl # typer # uvicorn +click-didyoumean==0.3.1 \ + --hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # celery +click-plugins==1.1.1.2 \ + --hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # celery cloudpickle==2.2.0 \ --hash=sha256:3f4219469c55453cfe4737e564b67c2a149109dabf7f242478948b895f61106f \ --hash=sha256:7428798d5926d8fcbfd092d18d01a2a03daf8237d8fcdc8095d256b8490796f0 @@ -814,12 +871,13 @@ frozenlist==1.4.1 \ # -c /tmp/ray-deps/requirements_compiled.txt # aiohttp # aiosignal -fsspec==2023.5.0 \ - --hash=sha256:51a4ad01a5bb66fcc58036e288c0d53d3975a0df2a5dc59a93b59bade0391f2a \ - --hash=sha256:b3b56e00fb93ea321bc9e5d9cf6f8522a0198b20eb24e02774d329e9c6fb84ce +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements.txt + # adlfs gitdb==4.0.11 \ --hash=sha256:81a3407ddd2ee8df444cbacea00e2d038e40150acfa3001696fe0dcf1d3adfa4 \ 
--hash=sha256:bf5421126136d6d0af55bc1e7c1af1c397a34f5b7bd79e776cd3e89785c2b04b @@ -947,70 +1005,118 @@ googleapis-common-protos==1.61.0 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # google-api-core -grpcio==1.66.2 \ - --hash=sha256:02697eb4a5cbe5a9639f57323b4c37bcb3ab2d48cec5da3dc2f13334d72790dd \ - --hash=sha256:03b0b307ba26fae695e067b94cbb014e27390f8bc5ac7a3a39b7723fed085604 \ - --hash=sha256:05bc2ceadc2529ab0b227b1310d249d95d9001cd106aa4d31e8871ad3c428d73 \ - --hash=sha256:06de8ec0bd71be123eec15b0e0d457474931c2c407869b6c349bd9bed4adbac3 \ - --hash=sha256:0be4e0490c28da5377283861bed2941d1d20ec017ca397a5df4394d1c31a9b50 \ - --hash=sha256:12fda97ffae55e6526825daf25ad0fa37483685952b5d0f910d6405c87e3adb6 \ - --hash=sha256:1caa38fb22a8578ab8393da99d4b8641e3a80abc8fd52646f1ecc92bcb8dee34 \ - --hash=sha256:2018b053aa15782db2541ca01a7edb56a0bf18c77efed975392583725974b249 \ - --hash=sha256:20657d6b8cfed7db5e11b62ff7dfe2e12064ea78e93f1434d61888834bc86d75 \ - --hash=sha256:2335c58560a9e92ac58ff2bc5649952f9b37d0735608242973c7a8b94a6437d8 \ - --hash=sha256:31fd163105464797a72d901a06472860845ac157389e10f12631025b3e4d0453 \ - --hash=sha256:38b68498ff579a3b1ee8f93a05eb48dc2595795f2f62716e797dc24774c1aaa8 \ - --hash=sha256:3b00efc473b20d8bf83e0e1ae661b98951ca56111feb9b9611df8efc4fe5d55d \ - --hash=sha256:3ed71e81782966ffead60268bbda31ea3f725ebf8aa73634d5dda44f2cf3fb9c \ - --hash=sha256:45a3d462826f4868b442a6b8fdbe8b87b45eb4f5b5308168c156b21eca43f61c \ - --hash=sha256:49f0ca7ae850f59f828a723a9064cadbed90f1ece179d375966546499b8a2c9c \ - --hash=sha256:4e504572433f4e72b12394977679161d495c4c9581ba34a88d843eaf0f2fbd39 \ - --hash=sha256:4ea1d062c9230278793820146c95d038dc0f468cbdd172eec3363e42ff1c7d01 \ - --hash=sha256:563588c587b75c34b928bc428548e5b00ea38c46972181a4d8b75ba7e3f24231 \ - --hash=sha256:6001e575b8bbd89eee11960bb640b6da6ae110cf08113a075f1e2051cc596cae \ - --hash=sha256:66a0cd8ba6512b401d7ed46bb03f4ee455839957f28b8d61e7708056a806ba6a \ - 
--hash=sha256:6851de821249340bdb100df5eacfecfc4e6075fa85c6df7ee0eb213170ec8e5d \ - --hash=sha256:728bdf36a186e7f51da73be7f8d09457a03061be848718d0edf000e709418987 \ - --hash=sha256:73e3b425c1e155730273f73e419de3074aa5c5e936771ee0e4af0814631fb30a \ - --hash=sha256:73fc8f8b9b5c4a03e802b3cd0c18b2b06b410d3c1dcbef989fdeb943bd44aff7 \ - --hash=sha256:78fa51ebc2d9242c0fc5db0feecc57a9943303b46664ad89921f5079e2e4ada7 \ - --hash=sha256:7b2c86457145ce14c38e5bf6bdc19ef88e66c5fee2c3d83285c5aef026ba93b3 \ - --hash=sha256:7d69ce1f324dc2d71e40c9261d3fdbe7d4c9d60f332069ff9b2a4d8a257c7b2b \ - --hash=sha256:802d84fd3d50614170649853d121baaaa305de7b65b3e01759247e768d691ddf \ - --hash=sha256:80fd702ba7e432994df208f27514280b4b5c6843e12a48759c9255679ad38db8 \ - --hash=sha256:8ac475e8da31484efa25abb774674d837b343afb78bb3bcdef10f81a93e3d6bf \ - --hash=sha256:950da58d7d80abd0ea68757769c9db0a95b31163e53e5bb60438d263f4bed7b7 \ - --hash=sha256:99a641995a6bc4287a6315989ee591ff58507aa1cbe4c2e70d88411c4dcc0839 \ - --hash=sha256:9c3a99c519f4638e700e9e3f83952e27e2ea10873eecd7935823dab0c1c9250e \ - --hash=sha256:9c509a4f78114cbc5f0740eb3d7a74985fd2eff022971bc9bc31f8bc93e66a3b \ - --hash=sha256:a18e20d8321c6400185b4263e27982488cb5cdd62da69147087a76a24ef4e7e3 \ - --hash=sha256:a917d26e0fe980b0ac7bfcc1a3c4ad6a9a4612c911d33efb55ed7833c749b0ee \ - --hash=sha256:a9539f01cb04950fd4b5ab458e64a15f84c2acc273670072abe49a3f29bbad54 \ - --hash=sha256:ad2efdbe90c73b0434cbe64ed372e12414ad03c06262279b104a029d1889d13e \ - --hash=sha256:b672abf90a964bfde2d0ecbce30f2329a47498ba75ce6f4da35a2f4532b7acbc \ - --hash=sha256:bbd27c24a4cc5e195a7f56cfd9312e366d5d61b86e36d46bbe538457ea6eb8dd \ - --hash=sha256:c400ba5675b67025c8a9f48aa846f12a39cf0c44df5cd060e23fda5b30e9359d \ - --hash=sha256:c408f5ef75cfffa113cacd8b0c0e3611cbfd47701ca3cdc090594109b9fcbaed \ - --hash=sha256:c806852deaedee9ce8280fe98955c9103f62912a5b2d5ee7e3eaa284a6d8d8e7 \ - --hash=sha256:ce89f5876662f146d4c1f695dda29d4433a5d01c8681fbd2539afff535da14d4 \ - 
--hash=sha256:d25a14af966438cddf498b2e338f88d1c9706f3493b1d73b93f695c99c5f0e2a \ - --hash=sha256:d8d4732cc5052e92cea2f78b233c2e2a52998ac40cd651f40e398893ad0d06ec \ - --hash=sha256:d9a9724a156c8ec6a379869b23ba3323b7ea3600851c91489b871e375f710bc8 \ - --hash=sha256:e636ce23273683b00410f1971d209bf3689238cf5538d960adc3cdfe80dd0dbd \ - --hash=sha256:e88264caad6d8d00e7913996030bac8ad5f26b7411495848cc218bd3a9040b6c \ - --hash=sha256:f145cc21836c332c67baa6fc81099d1d27e266401565bf481948010d6ea32d46 \ - --hash=sha256:fb57870449dfcfac428afbb5a877829fcb0d6db9d9baa1148705739e9083880e \ - --hash=sha256:fb70487c95786e345af5e854ffec8cb8cc781bcc5df7930c4fbb7feaa72e1cdf \ - --hash=sha256:fe96281713168a3270878255983d2cb1a97e034325c8c2c25169a69289d3ecfa \ - --hash=sha256:ff1f7882e56c40b0d33c4922c15dfa30612f05fb785074a012f7cda74d1c3679 +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + --hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + 
--hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + --hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + 
--hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # grpcio-tools -gymnasium==1.0.0 \ - --hash=sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403 \ - --hash=sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad +grpcio-tools==1.62.3 \ + --hash=sha256:0a52cc9444df978438b8d2332c0ca99000521895229934a59f94f37ed896b133 \ + --hash=sha256:0a8c0c4724ae9c2181b7dbc9b186df46e4f62cb18dc184e46d06c0ebeccf569e \ + --hash=sha256:0cb3a3436ac119cbd37a7d3331d9bdf85dad21a6ac233a3411dff716dcbf401e \ + --hash=sha256:11c625eebefd1fd40a228fc8bae385e448c7e32a6ae134e43cf13bbc23f902b7 \ + --hash=sha256:11f363570dea661dde99e04a51bd108a5807b5df32a6f8bdf4860e34e94a4dbf \ + 
--hash=sha256:141d028bf5762d4a97f981c501da873589df3f7e02f4c1260e1921e565b376fa \ + --hash=sha256:1c989246c2aebc13253f08be32538a4039a64e12d9c18f6d662d7aee641dc8b5 \ + --hash=sha256:1da38070738da53556a4b35ab67c1b9884a5dd48fa2f243db35dc14079ea3d0c \ + --hash=sha256:27cd9ef5c5d68d5ed104b6dcb96fe9c66b82050e546c9e255716903c3d8f0373 \ + --hash=sha256:2e02d3b96f2d0e4bab9ceaa30f37d4f75571e40c6272e95364bff3125a64d184 \ + --hash=sha256:2f968b049c2849540751ec2100ab05e8086c24bead769ca734fdab58698408c1 \ + --hash=sha256:350a80485e302daaa95d335a931f97b693e170e02d43767ab06552c708808950 \ + --hash=sha256:3eae6ea76d62fcac091e1f15c2dcedf1dc3f114f8df1a972a8a0745e89f4cf61 \ + --hash=sha256:47a5c093ab256dec5714a7a345f8cc89315cb57c298b276fa244f37a0ba507f0 \ + --hash=sha256:5782883a27d3fae8c425b29a9d3dcf5f47d992848a1b76970da3b5a28d424b26 \ + --hash=sha256:6a56d344b0bab30bf342a67e33d386b0b3c4e65868ffe93c341c51e1a8853ca5 \ + --hash=sha256:6c3064610826f50bd69410c63101954676edc703e03f9e8f978a135f1aaf97c1 \ + --hash=sha256:703f46e0012af83a36082b5f30341113474ed0d91e36640da713355cd0ea5d23 \ + --hash=sha256:710fecf6a171dcbfa263a0a3e7070e0df65ba73158d4c539cec50978f11dad5d \ + --hash=sha256:7c7136015c3d62c3eef493efabaf9e3380e3e66d24ee8e94c01cb71377f57833 \ + --hash=sha256:7cc83023acd8bc72cf74c2edbe85b52098501d5b74d8377bfa06f3e929803492 \ + --hash=sha256:7f2483ea232bd72d98a6dc6d7aefd97e5bc80b15cd909b9e356d6f3e326b6e43 \ + --hash=sha256:7ff7d58a45b75df67d25f8f144936a3e44aabd91afec833ee06826bd02b7fbe7 \ + --hash=sha256:8ad0473af5544f89fc5a1ece8676dd03bdf160fb3230f967e05d0f4bf89620e3 \ + --hash=sha256:8c5d22b252dcef11dd1e0fbbe5bbfb9b4ae048e8880d33338215e8ccbdb03edc \ + --hash=sha256:8e62cc7164b0b7c5128e637e394eb2ef3db0e61fc798e80c301de3b2379203ed \ + --hash=sha256:962c84b4da0f3b14b3cdb10bc3837ebc5f136b67d919aea8d7bb3fd3df39528a \ + --hash=sha256:ace43b26d88a58dcff16c20d23ff72b04d0a415f64d2820f4ff06b1166f50557 \ + --hash=sha256:b47d0dda1bdb0a0ba7a9a6de88e5a1ed61f07fad613964879954961e36d49193 \ + 
--hash=sha256:b77f9f9cee87cd798f0fe26b7024344d1b03a7cd2d2cba7035f8433b13986325 \ + --hash=sha256:b881fd9505a84457e9f7e99362eeedd86497b659030cf57c6f0070df6d9c2b9b \ + --hash=sha256:bfda6ee8990997a9df95c5606f3096dae65f09af7ca03a1e9ca28f088caca5cf \ + --hash=sha256:c3a1ac9d394f8e229eb28eec2e04b9a6f5433fa19c9d32f1cb6066e3c5114a1d \ + --hash=sha256:c8ad5cce554e2fcaf8842dee5d9462583b601a3a78f8b76a153c38c963f58c10 \ + --hash=sha256:ca246dffeca0498be9b4e1ee169b62e64694b0f92e6d0be2573e65522f39eea9 \ + --hash=sha256:ca4f5eeadbb57cf03317d6a2857823239a63a59cc935f5bd6cf6e8b7af7a7ecc \ + --hash=sha256:d102b9b21c4e1e40af9a2ab3c6d41afba6bd29c0aa50ca013bf85c99cdc44ac5 \ + --hash=sha256:db3bc9fa39afc5e4e2767da4459df82b095ef0cab2f257707be06c44a1c2c3e5 \ + --hash=sha256:dc9ad9950119d8ae27634e68b7663cc8d340ae535a0f80d85a55e56a6973ab1f \ + --hash=sha256:e02d7c1a02e3814c94ba0cfe43d93e872c758bd8fd5c2797f894d0c49b4a1dfc \ + --hash=sha256:e0898d412a434e768a0c7e365acabe13ff1558b767e400936e26b5b6ed1ee51f \ + --hash=sha256:e18e15287c31baf574fcdf8251fb7f997d64e96c6ecf467906e576da0a079af6 \ + --hash=sha256:ec279dcf3518201fc592c65002754f58a6b542798cd7f3ecd4af086422f33f29 \ + --hash=sha256:ec6fbded0c61afe6f84e3c2a43e6d656791d95747d6d28b73eff1af64108c434 \ + --hash=sha256:eec73a005443061f4759b71a056f745e3b000dc0dc125c9f20560232dfbcbd14 \ + --hash=sha256:f3d812daffd0c2d2794756bd45a353f89e55dc8f91eb2fc840c51b9f6be62667 \ + --hash=sha256:f4b1615adf67bd8bb71f3464146a6f9949972d06d21a4f5e87e73f6464d97f57 \ + --hash=sha256:f6831fdec2b853c9daa3358535c55eed3694325889aa714070528cf8f92d7d6d + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # -r python/requirements/cloud-requirements.txt +gymnasium==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements.txt @@ -1247,6 +1353,12 @@ jupyterlab-widgets==3.0.11 \ # via 
# -c /tmp/ray-deps/requirements_compiled.txt # ipywidgets +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # celery lazy-loader==0.4 \ --hash=sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 @@ -1495,6 +1607,7 @@ msal==1.28.1 \ --hash=sha256:d72bbfe2d5c2f2555f4bc6205be4450ddfd12976610dd9a16a9ab0f05c68b64d # via # -c /tmp/ray-deps/requirements_compiled.txt + # azure-datalake-store # azure-identity # msal-extensions msal-extensions==1.2.0b1 \ @@ -1827,6 +1940,7 @@ packaging==23.0 \ # jupyter-server # jupyterlab # jupyterlab-server + # kombu # lazy-loader # nbconvert # pytest @@ -2001,6 +2115,7 @@ prompt-toolkit==3.0.41 \ --hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 # via # -c /tmp/ray-deps/requirements_compiled.txt + # click-repl # ipython propcache==0.3.0 \ --hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ @@ -2284,114 +2399,113 @@ pycurl==7.45.3 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements/cloud-requirements.txt -pydantic==2.10.0 \ - --hash=sha256:0aca0f045ff6e2f097f1fe89521115335f15049eeb8a7bef3dafe4b19a74e289 \ - --hash=sha256:5e7807ba9201bdf61b1b58aa6eb690916c40a47acfb114b1b4fef3e7fd5b30fc +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements.txt # fastapi -pydantic-core==2.27.0 \ - --hash=sha256:0aa4d1b2eba9a325897308b3124014a142cdccb9f3e016f31d3ebee6b5ea5e75 \ - --hash=sha256:0d06b667e53320332be2bf6f9461f4a9b78092a079b8ce8634c9afaa7e10cd9f \ - 
--hash=sha256:153017e3d6cd3ce979de06d84343ca424bb6092727375eba1968c8b4693c6ecb \ - --hash=sha256:15e350efb67b855cd014c218716feea4986a149ed1f42a539edd271ee074a196 \ - --hash=sha256:185ef205256cd8b38431205698531026979db89a79587725c1e55c59101d64e9 \ - --hash=sha256:1da0c98a85a6c6ed702d5556db3b09c91f9b0b78de37b7593e2de8d03238807a \ - --hash=sha256:225bfff5d425c34e1fd562cef52d673579d59b967d9de06178850c4802af9039 \ - --hash=sha256:24f984fc7762ed5f806d9e8c4c77ea69fdb2afd987b4fd319ef06c87595a8c55 \ - --hash=sha256:25a7fd4de38f7ff99a37e18fa0098c3140286451bc823d1746ba80cec5b433a1 \ - --hash=sha256:2883b260f7a93235488699d39cbbd94fa7b175d3a8063fbfddd3e81ad9988cb2 \ - --hash=sha256:2a51ce96224eadd1845150b204389623c8e129fde5a67a84b972bd83a85c6c40 \ - --hash=sha256:2be0ad541bb9f059954ccf8877a49ed73877f862529575ff3d54bf4223e4dd61 \ - --hash=sha256:31a2cae5f059329f9cfe3d8d266d3da1543b60b60130d186d9b6a3c20a346361 \ - --hash=sha256:333c840a1303d1474f491e7be0b718226c730a39ead0f7dab2c7e6a2f3855555 \ - --hash=sha256:33d14369739c5d07e2e7102cdb0081a1fa46ed03215e07f097b34e020b83b1ae \ - --hash=sha256:35380671c3c921fe8adf31ad349dc6f7588b7e928dbe44e1093789734f607399 \ - --hash=sha256:359e7951f04ad35111b5ddce184db3391442345d0ab073aa63a95eb8af25a5ef \ - --hash=sha256:36aa167f69d8807ba7e341d67ea93e50fcaaf6bc433bb04939430fa3dab06f31 \ - --hash=sha256:395e3e1148fa7809016231f8065f30bb0dc285a97b4dc4360cd86e17bab58af7 \ - --hash=sha256:3e8d89c276234579cd3d095d5fa2a44eb10db9a218664a17b56363cddf226ff3 \ - --hash=sha256:3eb8849445c26b41c5a474061032c53e14fe92a11a5db969f722a2716cd12206 \ - --hash=sha256:3fd8bc2690e7c39eecdf9071b6a889ce7b22b72073863940edc2a0a23750ca90 \ - --hash=sha256:400bf470e4327e920883b51e255617dfe4496d4e80c3fea0b5a5d0bf2c404dd4 \ - --hash=sha256:4148dc9184ab79e356dc00a4199dc0ee8647973332cb385fc29a7cced49b9f9c \ - --hash=sha256:433689845288f9a1ee5714444e65957be26d30915f7745091ede4a83cfb2d7bb \ - --hash=sha256:43b61989068de9ce62296cde02beffabcadb65672207fc51e7af76dca75e6636 \ - 
--hash=sha256:4523c4009c3f39d948e01962223c9f5538602e7087a628479b723c939fab262d \ - --hash=sha256:483c2213a609e7db2c592bbc015da58b6c75af7360ca3c981f178110d9787bcf \ - --hash=sha256:49633583eb7dc5cba61aaf7cdb2e9e662323ad394e543ee77af265736bcd3eaa \ - --hash=sha256:4b51f964fcbb02949fc546022e56cdb16cda457af485e9a3e8b78ac2ecf5d77e \ - --hash=sha256:4bf1340ae507f6da6360b24179c2083857c8ca7644aab65807023cf35404ea8d \ - --hash=sha256:4fb49cfdb53af5041aba909be00cccfb2c0d0a2e09281bf542371c5fd36ad04c \ - --hash=sha256:510b11e9c3b1a852876d1ccd8d5903684336d635214148637ceb27366c75a467 \ - --hash=sha256:513cb14c0cc31a4dfd849a4674b20c46d87b364f997bbcb02282306f5e187abf \ - --hash=sha256:58560828ee0951bb125c6f2862fbc37f039996d19ceb6d8ff1905abf7da0bf3d \ - --hash=sha256:58ab0d979c969983cdb97374698d847a4acffb217d543e172838864636ef10d9 \ - --hash=sha256:5982048129f40b082c2654de10c0f37c67a14f5ff9d37cf35be028ae982f26df \ - --hash=sha256:5ab325fc86fbc077284c8d7f996d904d30e97904a87d6fb303dce6b3de7ebba9 \ - --hash=sha256:5cc822ab90a70ea3a91e6aed3afac570b276b1278c6909b1d384f745bd09c714 \ - --hash=sha256:5f2b19b8d6fca432cb3acf48cf5243a7bf512988029b6e6fd27e9e8c0a204d85 \ - --hash=sha256:5fc72fbfebbf42c0856a824b8b0dc2b5cd2e4a896050281a21cfa6fed8879cb1 \ - --hash=sha256:6354e18a9be37bfa124d6b288a87fb30c673745806c92956f1a25e3ae6e76b96 \ - --hash=sha256:678f66462058dd978702db17eb6a3633d634f7aa0deaea61e0a674152766d3fc \ - --hash=sha256:68950bc08f9735306322bfc16a18391fcaac99ded2509e1cc41d03ccb6013cfe \ - --hash=sha256:68ef5377eb582fa4343c9d0b57a5b094046d447b4c73dd9fbd9ffb216f829e7d \ - --hash=sha256:6b4c19525c3538fbc0bbda6229f9682fb8199ce9ac37395880e6952798e00373 \ - --hash=sha256:6bb69bf3b6500f195c3deb69c1205ba8fc3cb21d1915f1f158a10d6b1ef29b6a \ - --hash=sha256:6e19401742ed7b69e51d8e4df3c03ad5ec65a83b36244479fd70edde2828a5d9 \ - --hash=sha256:6f4a53af9e81d757756508b57cae1cf28293f0f31b9fa2bfcb416cc7fb230f9d \ - --hash=sha256:6fda87808429c520a002a85d6e7cdadbf58231d60e96260976c5b8f9a12a8e13 \ - 
--hash=sha256:78f841523729e43e3928a364ec46e2e3f80e6625a4f62aca5c345f3f626c6e8a \ - --hash=sha256:7a6ebfac28fd51890a61df36ef202adbd77d00ee5aca4a3dadb3d9ed49cfb929 \ - --hash=sha256:7b0202ebf2268954090209a84f9897345719e46a57c5f2c9b7b250ca0a9d3e63 \ - --hash=sha256:8117839a9bdbba86e7f9df57018fe3b96cec934c3940b591b0fd3fbfb485864a \ - --hash=sha256:82e1ad4ca170e8af4c928b67cff731b6296e6a0a0981b97b2eb7c275cc4e15bd \ - --hash=sha256:836a4bfe0cc6d36dc9a9cc1a7b391265bf6ce9d1eb1eac62ac5139f5d8d9a6fa \ - --hash=sha256:84af1cf7bfdcbc6fcf5a5f70cc9896205e0350306e4dd73d54b6a18894f79386 \ - --hash=sha256:84e35afd9e10b2698e6f2f32256678cb23ca6c1568d02628033a837638b3ed12 \ - --hash=sha256:884f1806609c2c66564082540cffc96868c5571c7c3cf3a783f63f2fb49bd3cd \ - --hash=sha256:8a150392102c402c538190730fda06f3bce654fc498865579a9f2c1d2b425833 \ - --hash=sha256:8e21d927469d04b39386255bf00d0feedead16f6253dcc85e9e10ddebc334084 \ - --hash=sha256:8e96ca781e0c01e32115912ebdf7b3fb0780ce748b80d7d28a0802fa9fbaf44e \ - --hash=sha256:8ee4c2a75af9fe21269a4a0898c5425afb01af1f5d276063f57e2ae1bc64e191 \ - --hash=sha256:91bc66f878557313c2a6bcf396e7befcffe5ab4354cfe4427318968af31143c3 \ - --hash=sha256:951e71da6c89d354572098bada5ba5b5dc3a9390c933af8a614e37755d3d1840 \ - --hash=sha256:99b2863c1365f43f74199c980a3d40f18a218fbe683dd64e470199db426c4d6a \ - --hash=sha256:9a8fbf506fde1529a1e3698198fe64bfbe2e0c09557bc6a7dcf872e7c01fec40 \ - --hash=sha256:9ce048deb1e033e7a865ca384770bccc11d44179cf09e5193a535c4c2f497bdc \ - --hash=sha256:9fe94d9d2a2b4edd7a4b22adcd45814b1b59b03feb00e56deb2e89747aec7bfe \ - --hash=sha256:a291d0b4243a259c8ea7e2b84eb9ccb76370e569298875a7c5e3e71baf49057a \ - --hash=sha256:a5c022bb0d453192426221605efc865373dde43b17822a264671c53b068ac20c \ - --hash=sha256:abb4785894936d7682635726613c44578c420a096729f1978cd061a7e72d5275 \ - --hash=sha256:b872c86d8d71827235c7077461c502feb2db3f87d9d6d5a9daa64287d75e4fa0 \ - --hash=sha256:bf37b72834e7239cf84d4a0b2c050e7f9e48bced97bad9bdf98d26b8eb72e846 \ - 
--hash=sha256:c0c431e4be5c1a0c6654e0c31c661cd89e0ca956ef65305c3c3fd96f4e72ca39 \ - --hash=sha256:c5726eec789ee38f2c53b10b1821457b82274f81f4f746bb1e666d8741fcfadb \ - --hash=sha256:c6fcb3fa3855d583aa57b94cf146f7781d5d5bc06cb95cb3afece33d31aac39b \ - --hash=sha256:c86679f443e7085ea55a7376462553996c688395d18ef3f0d3dbad7838f857a2 \ - --hash=sha256:c91e3c04f5191fd3fb68764bddeaf02025492d5d9f23343b283870f6ace69708 \ - --hash=sha256:c921ad596ff1a82f9c692b0758c944355abc9f0de97a4c13ca60ffc6d8dc15d4 \ - --hash=sha256:c9ed88b398ba7e3bad7bd64d66cc01dcde9cfcb7ec629a6fd78a82fa0b559d78 \ - --hash=sha256:cd2ac6b919f7fed71b17fe0b4603c092a4c9b5bae414817c9c81d3c22d1e1bcc \ - --hash=sha256:d28ca7066d6cdd347a50d8b725dc10d9a1d6a1cce09836cf071ea6a2d4908be0 \ - --hash=sha256:d29e235ce13c91902ef3efc3d883a677655b3908b1cbc73dee816e5e1f8f7739 \ - --hash=sha256:d8b5ee4ae9170e2775d495b81f414cc20268041c42571530513496ba61e94ba3 \ - --hash=sha256:db72e40628967f6dc572020d04b5f800d71264e0531c6da35097e73bdf38b003 \ - --hash=sha256:df45c4073bed486ea2f18757057953afed8dd77add7276ff01bccb79982cf46c \ - --hash=sha256:dfa5f5c0a4c8fced1422dc2ca7eefd872d5d13eb33cf324361dbf1dbfba0a9fe \ - --hash=sha256:e015833384ca3e1a0565a79f5d953b0629d9138021c27ad37c92a9fa1af7623c \ - --hash=sha256:e15315691fe2253eb447503153acef4d7223dfe7e7702f9ed66539fcd0c43801 \ - --hash=sha256:e65466b31be1070b4a5b7dbfbd14b247884cb8e8b79c64fb0f36b472912dbaea \ - --hash=sha256:e7820bb0d65e3ce1e3e70b6708c2f66143f55912fa02f4b618d0f08b61575f12 \ - --hash=sha256:e851a051f7260e6d688267eb039c81f05f23a19431bd7dfa4bf5e3cb34c108cd \ - --hash=sha256:e9f9feee7f334b72ceae46313333d002b56f325b5f04271b4ae2aadd9e993ae4 \ - --hash=sha256:eb40f828bc2f73f777d1eb8fee2e86cd9692a4518b63b6b5aa8af915dfd3207b \ - --hash=sha256:eb704155e73b833801c247f39d562229c0303f54770ca14fb1c053acb376cf10 \ - --hash=sha256:edb1bfd45227dec8d50bc7c7d86463cd8728bcc574f9b07de7369880de4626a3 \ - --hash=sha256:ee7d9d5537daf6d5c74a83b38a638cc001b648096c1cae8ef695b0c919d9d379 \ - 
--hash=sha256:f57783fbaf648205ac50ae7d646f27582fc706be3977e87c3c124e7a92407b10 \ - --hash=sha256:ff63a92f6e249514ef35bc795de10745be0226eaea06eb48b4bbeaa0c8850a4a +pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + 
--hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + --hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + --hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + 
--hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + --hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + --hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + 
--hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + --hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + --hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + 
--hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d # via # -c /tmp/ray-deps/requirements_compiled.txt # pydantic @@ -2451,6 +2565,7 @@ python-dateutil==2.8.2 \ # -r python/requirements/cloud-requirements.txt # arrow # botocore + # celery # jupyter-client # pandas python-json-logger==2.0.7 \ @@ -2633,6 +2748,7 @@ requests==2.32.3 \ # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # azure-core + # azure-datalake-store # google-api-core # google-cloud-storage # jupyterlab-server @@ -2776,9 +2892,9 @@ rsa==4.7.2 \ # -c /tmp/ray-deps/requirements_compiled.txt # google-auth # oauth2client -s3transfer==0.6.2 \ - --hash=sha256:b014be3a8a2aab98cfe1abc7229cc5a9a0cf05eb9c1f2b86b230fd8df3f78084 \ - --hash=sha256:cab66d3380cca3e70939ef2255d01cd8aece6a4907a9528740f668c4b0611861 +s3transfer==0.8.0 \ + --hash=sha256:baa479dc2e63e5c2ed51611b4d46cdf0295e2070d8d0b86b22f335ee5b954986 \ + --hash=sha256:e8d6bd52ffd99841e3a57b34370a54841f12d3aab072af862cdcc50955288002 # via # -c /tmp/ray-deps/requirements_compiled.txt # boto3 @@ -3053,6 +3169,19 @@ typing-extensions==4.12.2 \ # pyopenssl # referencing # typer + # typing-inspection +typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # pydantic +tzdata==2025.2 \ + --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + 
--hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # kombu tzlocal==5.3 \ --hash=sha256:2fafbfc07e9d8b49ade18f898d6bcd37ae88ce3ad6486842a2e4f03af68323d2 \ --hash=sha256:3814135a1bb29763c6e4f08fd6e41dbb435c7a60bfbb03270211bcc537187d8c @@ -3079,6 +3208,14 @@ uvicorn==0.22.0 \ # via # -c /tmp/ray-deps/requirements_compiled.txt # -r python/requirements.txt +vine==5.1.0 \ + --hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c /tmp/ray-deps/requirements_compiled.txt + # amqp + # celery + # kombu virtualenv==20.29.1 \ --hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ --hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 @@ -3399,5 +3536,4 @@ zipp==3.19.2 \ # importlib-metadata # The following packages were excluded from the output: -# grpcio-tools # setuptools diff --git a/python/requirements_compiled_rayllm_py311_cpu.txt b/python/deplocks/llm/rayllm_py311_cpu.lock similarity index 86% rename from python/requirements_compiled_rayllm_py311_cpu.txt rename to python/deplocks/llm/rayllm_py311_cpu.lock index 96179efeaef6..de5b155c04c0 100644 --- a/python/requirements_compiled_rayllm_py311_cpu.txt +++ b/python/deplocks/llm/rayllm_py311_cpu.lock @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile --generate-hashes --strip-extras --unsafe-package ray --unsafe-package grpcio-tools --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links -c python/requirements_compiled_rayllm_test_py311_cpu.txt python/requirements.txt python/requirements/llm/llm-requirements.txt -o python/requirements_compiled_rayllm_py311_cpu.txt +# uv pip 
compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --extra-index-url https://download.pytorch.org/whl/cpu --python-version=3.11 --unsafe-package ray --python-platform=linux -c python/deplocks/llm/rayllm_test_py311_cpu.lock python/requirements.txt python/requirements/llm/llm-requirements.txt -o python/deplocks/llm/rayllm_py311_cpu.lock --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cpu @@ -7,7 +7,7 @@ aiohappyeyeballs==2.6.1 \ --hash=sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558 \ --hash=sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # aiohttp aiohttp==3.11.16 \ --hash=sha256:004511d3413737700835e949433536a2fe95a7d0297edd911a1e9705c5b5ea43 \ @@ -92,7 +92,7 @@ aiohttp==3.11.16 \ --hash=sha256:fb46bb0f24813e6cede6cc07b1961d4b04f331f7112a23b5e21f567da4ee50aa \ --hash=sha256:fd36c119c5d6551bce374fcb5c19269638f8d09862445f85a5a48596fd59f4bb # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # aiohttp-cors # vllm @@ -100,31 +100,37 @@ aiohttp-cors==0.7.0 \ --hash=sha256:0451ba59fdf6909d0e2cd21e4c0a43752bc0703d33fc78ae94d9d9321710193e \ --hash=sha256:4d39c6d7100fd9764ed1caf8cebf0eb01bf5e3f24e2e073fda6234bc48b19f5d # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt aiorwlock==1.3.0 \ --hash=sha256:45baf8e4fa9a23e0bb325fbd67da80de1fd7ae1d4f59a6381754c60cec7b289b \ --hash=sha256:83f12d87df4b9728a0b8fda1756585ab0d652b107bab59c6084e1b1ad692ab45 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt aiosignal==1.3.1 \ --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ --hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # aiohttp +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # kombu annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # pydantic anyio==3.7.1 \ --hash=sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780 \ --hash=sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # httpx # openai # starlette @@ -133,16 +139,22 @@ astor==0.8.1 \ --hash=sha256:070a54e890cefb5b3739d19f30f5a5ec840ffc9c50ffa7d23cc9fc1a38ebbfc5 \ --hash=sha256:6a6effda93f4e1ce9f618779b2dd1d9d84f1e32812c23a29b3fff6fd7f63fa5e # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # depyf attrs==25.1.0 \ --hash=sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e \ --hash=sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # aiohttp # jsonschema # referencing +billiard==4.2.1 \ + 
--hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # celery blake3==1.0.4 \ --hash=sha256:00605aa59923205c6a4f21131840840eb2d9a754c59b163357d890566755b97a \ --hash=sha256:08f46c2f1c5f369f07409e3e4ff248bcb22617cd741f2224873d85982dd6034e \ @@ -230,13 +242,13 @@ blake3==1.0.4 \ --hash=sha256:fedc326cac4476d2eab88413a4bf56e491040ae11ea98ddadaa5487cecda9b93 \ --hash=sha256:ff0e96f61b16b365ad5bb7c6272754f83d8a59c95d3b2f70c3bb6324ddf5bc0c # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm cachetools==5.5.2 \ --hash=sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4 \ --hash=sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # google-auth # vllm cbor2==5.6.5 \ @@ -285,13 +297,19 @@ cbor2==5.6.5 \ --hash=sha256:fde21ac1cf29336a31615a2c469a9cb03cf0add3ae480672d4d38cda467d07fc \ --hash=sha256:fe11c2eb518c882cfbeed456e7a552e544893c17db66fe5d3230dbeaca6b615c # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm +celery==5.5.3 \ + --hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # -r python/requirements.txt certifi==2025.1.31 \ --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # httpcore # httpx 
# requests @@ -349,7 +367,7 @@ cffi==1.16.0 \ --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # cryptography # soundfile charset-normalizer==3.3.2 \ @@ -444,35 +462,57 @@ charset-normalizer==3.3.2 \ --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # requests click==8.1.7 \ --hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ --hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt + # celery + # click-didyoumean + # click-plugins + # click-repl # ray # typer # uvicorn +click-didyoumean==0.3.1 \ + --hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # celery +click-plugins==1.1.1.2 \ + --hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # celery cloudpickle==2.2.0 \ 
--hash=sha256:3f4219469c55453cfe4737e564b67c2a149109dabf7f242478948b895f61106f \ --hash=sha256:7428798d5926d8fcbfd092d18d01a2a03daf8237d8fcdc8095d256b8490796f0 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # gymnasium # vllm colorful==0.5.5 \ --hash=sha256:62c187e27c1433db9463ff93b1451898d1e7e23a7e553583fd9daeb6325182e4 \ --hash=sha256:66f8c1264b2a26f7293b96a03bb7a76c4bc8b9634369a0bffdcd12d618056a1d # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt compressed-tensors==0.10.2 \ --hash=sha256:6de13ac535d7ffdd8890fad3d229444c33076170acaa8fab6bab8ecfa96c1d8f \ --hash=sha256:e1b4d9bc2006e3fd3a938e59085f318fdb280c5af64688a4792bf1bc263e579d # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm cryptography==44.0.3 \ --hash=sha256:02f55fb4f8b79c1221b0961488eaae21015b69b210e18c386b69de182ebb1259 \ @@ -513,7 +553,7 @@ cryptography==44.0.3 \ --hash=sha256:fc3c9babc1e1faefd62704bb46a69f359a9819eb0292e40df3fb6e3574715cd4 \ --hash=sha256:fe19d8bc5536a91a24a8133328880a41831b6c5df54599a8417b62fe015d3053 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # pyopenssl cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ --hash=sha256:230f8a8e99c81a653baa0ed00819990c0ed1f0cf0298214786b5e323461dc61a \ @@ -529,38 +569,38 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ --hash=sha256:dac0284fecb90b5731f514e569a6fcf6674a730ae95b9490781a713b60a34423 \ --hash=sha256:e7a25ef1b44ae6276b5105affc2289edb34f1aa6676babd5bcd80907348c4cfa # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # ray depyf==0.19.0 \ 
--hash=sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5 \ --hash=sha256:afed0916b32d141cc90fa6220df01885eda442ca43b297d5050eeb90b4a5cb44 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm dill==0.3.9 \ --hash=sha256:468dff3b89520b474c0397703366b7b95eebe6303f108adf9b19da1f702be87a \ --hash=sha256:81aa267dddf68cbfe8029c42ca9ec6a4ab3b22371d1c450abc54422577b4512c # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # depyf diskcache==5.6.3 \ --hash=sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc \ --hash=sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm distlib==0.3.7 \ --hash=sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057 \ --hash=sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # virtualenv distro==1.9.0 \ --hash=sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed \ --hash=sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # openai dm-tree==0.1.8 \ --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ @@ -610,44 +650,44 @@ dm-tree==0.1.8 \ --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt dnspython==2.7.0 \ 
--hash=sha256:b4c34b7d10b51bcc3a5071e7b8dee77939f1e878477eeecc965e9835f63c6c86 \ --hash=sha256:ce9c432eda0dc91cf618a5cedf1a4e142651196bbcd2c80e89ed5a907e5cfaf1 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # email-validator einops==0.8.1 \ --hash=sha256:919387eb55330f5757c6bea9165c5ff5cfe63a642682ea788a6d472576d81737 \ --hash=sha256:de5d960a7a761225532e0f1959e5315ebeafc0cd43394732f103ca44b9837e84 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm email-validator==2.2.0 \ --hash=sha256:561977c2d73ce3611850a06fa56b414621e0c8faa9d66f2611407d87465da631 \ --hash=sha256:cb690f344c617a714f22e66ae771445a1ceb46821152df8e165c5f9a364582b7 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # fastapi farama-notifications==0.0.4 \ --hash=sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18 \ --hash=sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # gymnasium fastapi==0.115.12 \ --hash=sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681 \ --hash=sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # vllm fastapi-cli==0.0.5 \ --hash=sha256:d30e1239c6f46fcb95e606f02cdda59a1e2fa778a54b64686b3ff27f6211ff9f \ --hash=sha256:e94d847524648c748a5350673546bbf9bcaeb086b33c24f2e82e021436866a46 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # fastapi fastrlock==0.8.2 ; sys_platform != 'darwin' \ 
--hash=sha256:067edb0a0805bf61e17a251d5046af59f6e9d2b8ad01222e0ef7a0b7937d5548 \ @@ -726,13 +766,13 @@ fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:fb5363cf0fddd9b50525ddbf64a1e1b28ec4c6dfb28670a940cb1cf988a6786b \ --hash=sha256:ff75c90663d6e8996610d435e71487daa853871ad1770dd83dc0f2fc4997241e # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # cupy-cuda12x filelock==3.17.0 \ --hash=sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338 \ --hash=sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # huggingface-hub # ray @@ -819,14 +859,14 @@ frozenlist==1.4.1 \ --hash=sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced \ --hash=sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # aiohttp # aiosignal -fsspec==2023.5.0 \ - --hash=sha256:51a4ad01a5bb66fcc58036e288c0d53d3975a0df2a5dc59a93b59bade0391f2a \ - --hash=sha256:b3b56e00fb93ea321bc9e5d9cf6f8522a0198b20eb24e02774d329e9c6fb84ce +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # huggingface-hub # torch @@ -834,96 +874,92 @@ gguf==0.16.2 \ --hash=sha256:0fc956289a30d0f1f3afd75ec0d493f73ae2629a3f21f3846dd1687d8791c7c1 \ --hash=sha256:e73eb19b30fcc7c7f32894345024dda8b1a0c959b94a12b7c40ded8dd3f96810 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm google-api-core==2.24.2 \ --hash=sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9 \ --hash=sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # opencensus google-auth==2.23.4 \ --hash=sha256:79905d6b1652187def79d491d6e23d0cbb3a21d3c7ba0dbaa9c8a01906b13ff3 \ --hash=sha256:d4bbc92fe4b8bfd2f3e8d88e5ba7085935da208ee38a134fc280e7ce682a05f2 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # google-api-core googleapis-common-protos==1.61.0 \ --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # google-api-core -grpcio==1.66.2 \ - --hash=sha256:02697eb4a5cbe5a9639f57323b4c37bcb3ab2d48cec5da3dc2f13334d72790dd \ - --hash=sha256:03b0b307ba26fae695e067b94cbb014e27390f8bc5ac7a3a39b7723fed085604 \ - --hash=sha256:05bc2ceadc2529ab0b227b1310d249d95d9001cd106aa4d31e8871ad3c428d73 \ - --hash=sha256:06de8ec0bd71be123eec15b0e0d457474931c2c407869b6c349bd9bed4adbac3 \ - --hash=sha256:0be4e0490c28da5377283861bed2941d1d20ec017ca397a5df4394d1c31a9b50 \ - --hash=sha256:12fda97ffae55e6526825daf25ad0fa37483685952b5d0f910d6405c87e3adb6 \ - --hash=sha256:1caa38fb22a8578ab8393da99d4b8641e3a80abc8fd52646f1ecc92bcb8dee34 \ - --hash=sha256:2018b053aa15782db2541ca01a7edb56a0bf18c77efed975392583725974b249 \ - --hash=sha256:20657d6b8cfed7db5e11b62ff7dfe2e12064ea78e93f1434d61888834bc86d75 \ - --hash=sha256:2335c58560a9e92ac58ff2bc5649952f9b37d0735608242973c7a8b94a6437d8 \ - --hash=sha256:31fd163105464797a72d901a06472860845ac157389e10f12631025b3e4d0453 \ - 
--hash=sha256:38b68498ff579a3b1ee8f93a05eb48dc2595795f2f62716e797dc24774c1aaa8 \ - --hash=sha256:3b00efc473b20d8bf83e0e1ae661b98951ca56111feb9b9611df8efc4fe5d55d \ - --hash=sha256:3ed71e81782966ffead60268bbda31ea3f725ebf8aa73634d5dda44f2cf3fb9c \ - --hash=sha256:45a3d462826f4868b442a6b8fdbe8b87b45eb4f5b5308168c156b21eca43f61c \ - --hash=sha256:49f0ca7ae850f59f828a723a9064cadbed90f1ece179d375966546499b8a2c9c \ - --hash=sha256:4e504572433f4e72b12394977679161d495c4c9581ba34a88d843eaf0f2fbd39 \ - --hash=sha256:4ea1d062c9230278793820146c95d038dc0f468cbdd172eec3363e42ff1c7d01 \ - --hash=sha256:563588c587b75c34b928bc428548e5b00ea38c46972181a4d8b75ba7e3f24231 \ - --hash=sha256:6001e575b8bbd89eee11960bb640b6da6ae110cf08113a075f1e2051cc596cae \ - --hash=sha256:66a0cd8ba6512b401d7ed46bb03f4ee455839957f28b8d61e7708056a806ba6a \ - --hash=sha256:6851de821249340bdb100df5eacfecfc4e6075fa85c6df7ee0eb213170ec8e5d \ - --hash=sha256:728bdf36a186e7f51da73be7f8d09457a03061be848718d0edf000e709418987 \ - --hash=sha256:73e3b425c1e155730273f73e419de3074aa5c5e936771ee0e4af0814631fb30a \ - --hash=sha256:73fc8f8b9b5c4a03e802b3cd0c18b2b06b410d3c1dcbef989fdeb943bd44aff7 \ - --hash=sha256:78fa51ebc2d9242c0fc5db0feecc57a9943303b46664ad89921f5079e2e4ada7 \ - --hash=sha256:7b2c86457145ce14c38e5bf6bdc19ef88e66c5fee2c3d83285c5aef026ba93b3 \ - --hash=sha256:7d69ce1f324dc2d71e40c9261d3fdbe7d4c9d60f332069ff9b2a4d8a257c7b2b \ - --hash=sha256:802d84fd3d50614170649853d121baaaa305de7b65b3e01759247e768d691ddf \ - --hash=sha256:80fd702ba7e432994df208f27514280b4b5c6843e12a48759c9255679ad38db8 \ - --hash=sha256:8ac475e8da31484efa25abb774674d837b343afb78bb3bcdef10f81a93e3d6bf \ - --hash=sha256:950da58d7d80abd0ea68757769c9db0a95b31163e53e5bb60438d263f4bed7b7 \ - --hash=sha256:99a641995a6bc4287a6315989ee591ff58507aa1cbe4c2e70d88411c4dcc0839 \ - --hash=sha256:9c3a99c519f4638e700e9e3f83952e27e2ea10873eecd7935823dab0c1c9250e \ - --hash=sha256:9c509a4f78114cbc5f0740eb3d7a74985fd2eff022971bc9bc31f8bc93e66a3b \ - 
--hash=sha256:a18e20d8321c6400185b4263e27982488cb5cdd62da69147087a76a24ef4e7e3 \ - --hash=sha256:a917d26e0fe980b0ac7bfcc1a3c4ad6a9a4612c911d33efb55ed7833c749b0ee \ - --hash=sha256:a9539f01cb04950fd4b5ab458e64a15f84c2acc273670072abe49a3f29bbad54 \ - --hash=sha256:ad2efdbe90c73b0434cbe64ed372e12414ad03c06262279b104a029d1889d13e \ - --hash=sha256:b672abf90a964bfde2d0ecbce30f2329a47498ba75ce6f4da35a2f4532b7acbc \ - --hash=sha256:bbd27c24a4cc5e195a7f56cfd9312e366d5d61b86e36d46bbe538457ea6eb8dd \ - --hash=sha256:c400ba5675b67025c8a9f48aa846f12a39cf0c44df5cd060e23fda5b30e9359d \ - --hash=sha256:c408f5ef75cfffa113cacd8b0c0e3611cbfd47701ca3cdc090594109b9fcbaed \ - --hash=sha256:c806852deaedee9ce8280fe98955c9103f62912a5b2d5ee7e3eaa284a6d8d8e7 \ - --hash=sha256:ce89f5876662f146d4c1f695dda29d4433a5d01c8681fbd2539afff535da14d4 \ - --hash=sha256:d25a14af966438cddf498b2e338f88d1c9706f3493b1d73b93f695c99c5f0e2a \ - --hash=sha256:d8d4732cc5052e92cea2f78b233c2e2a52998ac40cd651f40e398893ad0d06ec \ - --hash=sha256:d9a9724a156c8ec6a379869b23ba3323b7ea3600851c91489b871e375f710bc8 \ - --hash=sha256:e636ce23273683b00410f1971d209bf3689238cf5538d960adc3cdfe80dd0dbd \ - --hash=sha256:e88264caad6d8d00e7913996030bac8ad5f26b7411495848cc218bd3a9040b6c \ - --hash=sha256:f145cc21836c332c67baa6fc81099d1d27e266401565bf481948010d6ea32d46 \ - --hash=sha256:fb57870449dfcfac428afbb5a877829fcb0d6db9d9baa1148705739e9083880e \ - --hash=sha256:fb70487c95786e345af5e854ffec8cb8cc781bcc5df7930c4fbb7feaa72e1cdf \ - --hash=sha256:fe96281713168a3270878255983d2cb1a97e034325c8c2c25169a69289d3ecfa \ - --hash=sha256:ff1f7882e56c40b0d33c4922c15dfa30612f05fb785074a012f7cda74d1c3679 - # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + 
--hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + 
--hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e + # via + # 
-c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt -gymnasium==1.0.0 \ - --hash=sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403 \ - --hash=sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad +gymnasium==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt h11==0.16.0 \ --hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \ --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # httpcore # uvicorn hf-transfer==0.1.9 \ @@ -953,9 +989,9 @@ hf-transfer==0.1.9 \ --hash=sha256:ee8b10afedcb75f71091bcc197c526a6ebf5c58bbbadb34fdeee6160f55f619f \ --hash=sha256:fc6bd19e1cc177c66bdef15ef8636ad3bde79d5a4f608c158021153b4573509d # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements/llm/llm-requirements.txt -hf-xet==1.1.5 \ +hf-xet==1.1.5 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:69ebbcfd9ec44fdc2af73441619eeb06b94ee34511bbcf57cd423820090f5694 \ --hash=sha256:73e167d9807d166596b4b2f0b585c6d5bd84a26dea32843665a8b58f6edba245 \ --hash=sha256:83088ecea236d5113de478acb2339f92c95b4fb0462acaa30621fac02f5a534a \ @@ -965,13 +1001,13 @@ hf-xet==1.1.5 \ --hash=sha256:f52c2fa3635b8c37c7764d8796dfa72706cc4eded19d638331161e82b0792e23 \ --hash=sha256:fc874b5c843e642f45fd85cda1ce599e123308ad2901ead23d3510a47ff506d1 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cpu.lock # huggingface-hub httpcore==1.0.9 \ --hash=sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55 \ --hash=sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # httpx httptools==0.6.4 \ --hash=sha256:0614154d5454c21b6410fdf5262b4a3ddb0f53f1e1721cfd59d55f32138c578a \ @@ -1018,28 +1054,27 @@ httptools==0.6.4 \ --hash=sha256:f9eb89ecf8b290f2e293325c646a211ff1c2493222798bb80a530c5e7502494f \ --hash=sha256:fc411e1c0a7dcd2f902c7c48cf079947a7e65b5485dea9decb82b9105ca71a43 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # uvicorn httpx==0.28.1 \ --hash=sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc \ --hash=sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # fastapi # openai huggingface-hub==0.34.3 \ --hash=sha256:5444550099e2d86e68b2898b09e85878fbd788fc2957b506c6a79ce060e39492 \ --hash=sha256:d58130fd5aa7408480681475491c0abd7e835442082fbc3ef4d45b6c39f83853 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # tokenizers # transformers - # vllm idna==3.7 \ --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # anyio # email-validator # httpx @@ -1049,25 +1084,25 @@ imageio==2.34.2 \ --hash=sha256:5c0c0ee8faa018a1c42f649b90395dd4d3bb6187c09053a0cd6f1fdd51bbff5e \ --hash=sha256:a0bb27ec9d5bab36a9f4835e51b21d2cb099e1f78451441f94687ff3404b79f8 
# via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # scikit-image importlib-metadata==6.11.0 \ --hash=sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443 \ --hash=sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # opentelemetry-api interegular==0.3.3 \ --hash=sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c \ --hash=sha256:d9b697b21b34884711399ba0f0376914b81899ce670032486d0d048344a76600 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # lm-format-enforcer jinja2==3.1.6 \ --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \ --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # fastapi # memray # torch @@ -1149,19 +1184,19 @@ jiter==0.8.2 \ --hash=sha256:fc9043259ee430ecd71d178fccabd8c332a3bf1e81e50cae43cc2b28d19e4cb7 \ --hash=sha256:ffd9fee7d0775ebaba131f7ca2e2d83839a62ad65e8e02fe2bd8fc975cedeb9e # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # openai jsonref==1.1.0 \ --hash=sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552 \ --hash=sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements/llm/llm-requirements.txt jsonschema==4.23.0 \ --hash=sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4 \ --hash=sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566 # via - # -c 
python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements/llm/llm-requirements.txt # -r python/requirements.txt # mistral-common @@ -1170,19 +1205,25 @@ jsonschema-specifications==2024.10.1 \ --hash=sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272 \ --hash=sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # jsonschema +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # celery lark==1.2.2 \ --hash=sha256:c2276486b02f0f1b90be155f2c8ba4a8e194d42775786db622faccd652d8e80c \ --hash=sha256:ca807d0162cd16cef15a8feecb862d7319e7a09bdb13aef927968e45040fed80 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm lazy-loader==0.4 \ --hash=sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # scikit-image llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ @@ -1194,7 +1235,7 @@ llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64 --hash=sha256:e4e552eb3193b56ca3347f96c1382779e438b7dfc1d234323e202fd7c7a98d28 \ --hash=sha256:fa8ca0660df03934027b87d7e574edf1f8651493f77c0932f3f66d6effbed2b1 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm llvmlite==0.44.0 \ --hash=sha256:07667d66a5d150abed9157ab6c0b9393c9356f229784a4385c02f99e94fc94d4 \ @@ -1219,13 +1260,13 @@ llvmlite==0.44.0 \ --hash=sha256:eed7d5f29136bda63b6d7804c279e2b72e08c952b7c5df61f45db408e0ee52f3 \ --hash=sha256:f01a394e9c9b7b1d4e63c327b096d10f6f0ed149ef53d38a09b3749dcf8c9610 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # numba lm-format-enforcer==0.10.11 \ --hash=sha256:563e0dbc930a6d50fb687951506c5de098c6e962601be0ce723f3b7d0b916a1b \ --hash=sha256:8ab371924e166a1df68f243aca73a8a647bea5909f37edd6a53a694e7e7c3274 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm lz4==4.3.3 \ --hash=sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e \ @@ -1265,13 +1306,13 @@ lz4==4.3.3 \ --hash=sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0 \ --hash=sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt markdown-it-py==2.2.0 \ --hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # rich markupsafe==2.1.3 \ --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ @@ -1335,13 +1376,13 @@ markupsafe==2.1.3 \ --hash=sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2 \ --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # jinja2 
mdurl==0.1.2 \ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # markdown-it-py memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:0a21745fb516b7a6efcd40aa7487c59e9313fcfc782d0193fcfcf00b48426874 \ @@ -1380,25 +1421,25 @@ memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:e356af93e3b031c83957e9ac1a653f5aaba5df1e357dd17142f5ed19bb3dc660 \ --hash=sha256:f16c5c8730b616613dc8bafe32649ca6bd7252606251eb00148582011758d0b5 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt meson==1.8.3 \ --hash=sha256:ef02b806ce0c5b6becd5bb5dc9fa67662320b29b337e7ace73e4354500590233 \ --hash=sha256:f118aa910fc0a137cc2dd0122232dbf82153d9a12fb5b0f5bb64896f6a157abf # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements/llm/llm-requirements.txt mistral-common==1.8.3 \ --hash=sha256:0d1979d82227b625f6d71b3c828176f059da8d0f5a3307cdf53b48409a3970a4 \ --hash=sha256:846b6e4bbe016dc2e64fd3169fa704a548f6c74467e0cb18dc165b7a7669abd6 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm mpmath==1.3.0 \ --hash=sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f \ --hash=sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # sympy msgpack==1.0.7 \ --hash=sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862 \ @@ -1458,7 +1499,7 @@ msgpack==1.0.7 \ --hash=sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002 \ 
--hash=sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # ray msgspec==0.19.0 \ @@ -1499,7 +1540,7 @@ msgspec==0.19.0 \ --hash=sha256:f98bd8962ad549c27d63845b50af3f53ec468b6318400c9f1adfe8b092d7b62f \ --hash=sha256:fe2c4bf29bf4e89790b3117470dea2c20b59932772483082c468b990d45fb947 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm multidict==6.0.5 \ --hash=sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556 \ @@ -1593,14 +1634,14 @@ multidict==6.0.5 \ --hash=sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423 \ --hash=sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # aiohttp # yarl networkx==3.2.1 \ --hash=sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6 \ --hash=sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # scikit-image # torch ninja==1.11.1.3 \ @@ -1622,17 +1663,21 @@ ninja==1.11.1.3 \ --hash=sha256:bc3ebc8b2e47716149f3541742b5cd8e0b08f51013b825c05baca3e34854370d \ --hash=sha256:edfa0d2e9d7ead1635b03e40a32ad56cc8f56798b6e2e9848d8300b174897076 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements/llm/llm-requirements.txt # vllm # xgrammar -nixl==0.3.1 \ - --hash=sha256:20428ad2668062a79045fae83cc5cba1f4019d4a2c7053cc8549c3a1533f8a75 \ - --hash=sha256:70b8932b50ccf1a13ac8fa2e10a4b78290baae9f963bfecfa67684104331a94b \ - 
--hash=sha256:8c144839484b3076f0b34ad8ceaeaff05c23399cf57ca85f2a94b44e1475a39b \ - --hash=sha256:ff59996ad05a7e4ba6c8beba0f1d8ac2f9e53df696a15af0d3340028e2f16081 - # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt +nixl==0.4.1 \ + --hash=sha256:10c7b4a44f89c3fbff3e20cb84973be95f8df36ee336fb108275ed1839fec1f1 \ + --hash=sha256:510cc9e824ad53cac71ce55ff41160f2a9e1507ceb52eb871b775fe1e42beb87 \ + --hash=sha256:8a3d83b28c16b795bdc281f1489b9d247f6e6088ad96ca96406072a36d6354b7 \ + --hash=sha256:9381fd3986d227c7ccb2607c03bbea559ec80f951e2ea47c1fbf381e4cd97164 \ + --hash=sha256:9ab7e580e9962ebdcda8c17f8548858d3fdb648621367d8e717ca317b534b778 \ + --hash=sha256:db144821de7912cb2502052b3070a1ac276b8b019470e6efdfce9c237ffe130d \ + --hash=sha256:e33102b85b3f95a8c95e59b59b29aabd03d47b5bce619de506b9bb83739cf60d \ + --hash=sha256:f16092dd445542e82e3db3553f6c7697ec5a2e837f19d416401283ae245826f9 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements/llm/llm-requirements.txt numba==0.61.2 \ --hash=sha256:34fba9406078bac7ab052efbf0d13939426c753ad72946baaa5bf9ae0ebb8dd2 \ @@ -1657,7 +1702,7 @@ numba==0.61.2 \ --hash=sha256:ea0247617edcb5dd61f6106a56255baab031acc4257bddaeddb3a1003b4ca3fd \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ @@ -1697,7 +1742,7 @@ numpy==1.26.4 \ --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # cupy-cuda12x # gguf @@ -1718,23 +1763,41 @@ numpy==1.26.4 \ # transformers # vllm # xformers 
-openai==1.90.0 \ - --hash=sha256:9771982cdd5b6631af68c6a603da72ed44cd2caf73b49f717a72b71374bc565b \ - --hash=sha256:e5dcb5498ea6b42fec47546d10f1bcc05fb854219a7d953a5ba766718b212a02 +openai==1.100.2 \ + --hash=sha256:54d3457b2c8d7303a1bc002a058de46bdd8f37a8117751c7cf4ed4438051f151 \ + --hash=sha256:787b4c3c8a65895182c58c424f790c25c790cc9a0330e34f73d55b6ee5a00e32 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # vllm +openai-harmony==0.0.4 \ + --hash=sha256:038f1d6772d1be5213b36ae76e5d042022395ec35c428a73ccb8b839b2cecf6a \ + --hash=sha256:15e6d53a66502491a3675a536df30e271f976e6c5efe68250a65191efcb85c4f \ + --hash=sha256:2d8d16d84702059833fb03b841b28c25600c54e83cadccef79af44e1c81166b1 \ + --hash=sha256:31e9bcac0902a309e2fc688e52f247eec7fffcd00d17e958b9a83a8fea6519c2 \ + --hash=sha256:3586d90c899cd41f8624e7b82a48c289f6e4be56c66304ecaf3a0ba88963a73f \ + --hash=sha256:3cf2344366f10981bbc0f6d9949a0b2bb87151d209ed295943ed6ad8eda37932 \ + --hash=sha256:567cc568b6bf7b4d041b0c9aa7d6b2c9394f8af6065bc87fa6d23f207b5af9a7 \ + --hash=sha256:5c67ac6df349236fb7b64f57c3dbb0273efcdca24314daa108f2a482c427106c \ + --hash=sha256:746f751de5033b3dbcfcd4a726a4c56ce452c593ad3d54472d8597ce8d8b6d44 \ + --hash=sha256:96a63199c0d81095b5d5d1ae8ca82b64c1c13d18d4e30323ae9e8ab31bc80a3d \ + --hash=sha256:97f1fe3909733212cc6b36f0f199b1421a9c57b79ec665f0322bd604cec47340 \ + --hash=sha256:b9ee9e9ab6a237cebbe16563c787a6e83f3fcc034075c3d321dab94448426282 \ + --hash=sha256:d38f2639f6bf7c3c34a5dfd79e29075811ae2fa9b895a63e76767f74a47a971e \ + --hash=sha256:ef21a1e2384a65c62d5ec5e1cded9fe026f1d032d5c5d725110d1a8d330d8f54 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm opencensus==0.11.4 \ --hash=sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 \ --hash=sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt 
+ # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt opencensus-context==0.1.3 \ --hash=sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039 \ --hash=sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # opencensus opencv-python-headless==4.11.0.86 \ --hash=sha256:0e0a27c19dd1f40ddff94976cfe43066fbbe9dfbb2ec1907d66c19caef42a57b \ @@ -1745,14 +1808,14 @@ opencv-python-headless==4.11.0.86 \ --hash=sha256:a66c1b286a9de872c343ee7c3553b084244299714ebb50fbdcd76f07ebbe6c81 \ --hash=sha256:f447d8acbb0b6f2808da71fddd29c1cdd448d2bc98f72d9bb78a7a898fc9621b # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # mistral-common # vllm opentelemetry-api==1.34.1 \ --hash=sha256:64f0bd06d42824843731d05beea88d4d4b6ae59f9fe347ff7dfa2cc14233bbb3 \ --hash=sha256:b7df4cb0830d5a6c29ad0c0691dbae874d8daefa934b8b1d642de48323d32a8c # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus # opentelemetry-sdk @@ -1761,26 +1824,26 @@ opentelemetry-exporter-prometheus==0.55b1 \ --hash=sha256:d13ec0b22bf394113ff1ada5da98133a4b051779b803dae183188e26c4bd9ee0 \ --hash=sha256:f364fbbff9e5de37a112ff104d1185fb1d7e2046c5ab5911e5afebc7ab3ddf0e # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt opentelemetry-proto==1.27.0 \ --hash=sha256:33c9345d91dafd8a74fc3d7576c5a38f18b7fdf8d02983ac67485386132aedd6 \ --hash=sha256:b133873de5581a50063e1e4b29cdcf0c5e253a8c2d8dc1229add20a4c3830ace # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r 
python/requirements.txt opentelemetry-sdk==1.34.1 \ --hash=sha256:308effad4059562f1d92163c61c8141df649da24ce361827812c40abb2a1e96e \ --hash=sha256:8091db0d763fcd6098d4781bbc80ff0971f94e260739aa6afe6fd379cdf3aa4d # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus opentelemetry-semantic-conventions==0.55b1 \ --hash=sha256:5da81dfdf7d52e3d37f8fe88d5e771e191de924cfff5f550ab0b8f7b2409baed \ --hash=sha256:ef95b1f009159c28d7a7849f5cbc71c4c34c845bb514d66adfdf1b3fff3598b3 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # opentelemetry-sdk outlines-core==0.2.10 \ --hash=sha256:0a9e4b192ca837a472a1bb1428397509f543db08e1aeeee30252525cec34093a \ @@ -1825,15 +1888,16 @@ outlines-core==0.2.10 \ --hash=sha256:f895834da0a577120dcb8d979c12c0690fe912095413bf0070a73e9ff363b7bf \ --hash=sha256:faf5b43181b1d033871364e74e9d348362c6a77b1d054d7af35e09fdfcff5b16 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm packaging==23.0 \ --hash=sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2 \ --hash=sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # huggingface-hub + # kombu # lazy-loader # lm-format-enforcer # ray @@ -1869,13 +1933,13 @@ pandas==1.5.3 \ --hash=sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae \ --hash=sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt partial-json-parser==0.2.1.1.post5 \ 
--hash=sha256:627715aaa3cb3fb60a65b0d62223243acaa6c70846520a90326fef3a2f0b61ca \ --hash=sha256:992710ac67e90b367921d52727698928040f7713ba7ecb33b96371ea7aec82ca # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm pillow==10.3.0 \ --hash=sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c \ @@ -1948,7 +2012,7 @@ pillow==10.3.0 \ --hash=sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27 \ --hash=sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # imageio # mistral-common # scikit-image @@ -1958,13 +2022,13 @@ platformdirs==3.11.0 \ --hash=sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # virtualenv prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus # prometheus-fastapi-instrumentator @@ -1973,8 +2037,14 @@ prometheus-fastapi-instrumentator==7.0.2 \ --hash=sha256:8a4d8fb13dbe19d2882ac6af9ce236e4e1f98dc48e3fa44fe88d8e23ac3c953f \ --hash=sha256:975e39992acb7a112758ff13ba95317e6c54d1bbf605f9156f31ac9f2800c32d # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm +prompt-toolkit==3.0.41 \ + --hash=sha256:941367d97fc815548822aa26c2a269fdc4eb21e9ec05fc5d447cf09bad5d75f0 \ + 
--hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # click-repl propcache==0.3.0 \ --hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ --hash=sha256:03acd9ff19021bd0567582ac88f821b66883e158274183b9e5586f678984f8fe \ @@ -2075,14 +2145,14 @@ propcache==0.3.0 \ --hash=sha256:fb91d20fa2d3b13deea98a690534697742029f4fb83673a3501ae6e3746508b5 \ --hash=sha256:fddb8870bdb83456a489ab67c6b3040a8d5a55069aa6f72f9d872235fbc52f54 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # aiohttp # yarl proto-plus==1.22.3 \ --hash=sha256:a49cd903bc0b6ab41f76bf65510439d56ca76f868adf0274e738bfdd096894df \ --hash=sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # google-api-core protobuf==4.25.8 \ --hash=sha256:077ff8badf2acf8bc474406706ad890466274191a48d0abd3bd6987107c9cde5 \ @@ -2097,7 +2167,7 @@ protobuf==4.25.8 \ --hash=sha256:d552c53d0415449c8d17ced5c341caba0d89dbf433698e1436c8fa0aae7808a3 \ --hash=sha256:f4510b93a3bec6eba8fd8f1093e9d7fb0d4a24d1a81377c10c0e5bbfe9e4ed24 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # google-api-core # googleapis-common-protos @@ -2124,13 +2194,13 @@ psutil==5.9.6 \ --hash=sha256:fb8a697f11b0f5994550555fcfe3e69799e5b060c8ecf9e2f75c69302cc35c0d \ --hash=sha256:ff18b8d1a784b810df0b0fff3bcb50ab941c3b8e2c8de5726f9c71c601c611aa # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm py-cpuinfo==9.0.0 \ --hash=sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690 \ 
--hash=sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm py-spy==0.4.0 ; python_full_version < '3.12' \ --hash=sha256:47cdda4c34d9b6cb01f3aaeceb2e88faf57da880207fe72ff6ff97e9bb6cc8a9 \ @@ -2142,7 +2212,7 @@ py-spy==0.4.0 ; python_full_version < '3.12' \ --hash=sha256:eee3d0bde85ca5cf4f01f012d461180ca76c24835a96f7b5c4ded64eb6a008ab \ --hash=sha256:f2cf3f7130e7d780471faa5957441d3b4e0ec39a79b2c00f4c33d494f7728428 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt pyarrow==19.0.1 \ --hash=sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466 \ @@ -2188,20 +2258,20 @@ pyarrow==19.0.1 \ --hash=sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec \ --hash=sha256:fd44d66093a239358d07c42a91eebf5015aa54fccba959db899f932218ac9cc8 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt pyasn1==0.5.1 \ --hash=sha256:4439847c58d40b1d0a573d07e3856e95333f1976294494c325775aeca506eb58 \ --hash=sha256:6d391a96e59b23130a5cfa74d6fd7f388dbbe26cc8f1edf39fdddf08d9d6676c # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # pyasn1-modules # rsa pyasn1-modules==0.3.0 \ --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # google-auth pybase64==1.4.1 \ --hash=sha256:011a54ff6ca44c5d03746aec3f1f492fce3155bd3f943fb2ceaea92416d40eeb \ @@ -2348,191 +2418,192 @@ pybase64==1.4.1 \ 
--hash=sha256:fc9504c4c2e893e0a6c1cc80bce51907e3461288289f630eab22b5735eba1104 \ --hash=sha256:ff172a4dacbd964e5edcf1c2152dae157aabf856508aed15276f46d04a22128e # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm pybind11==2.13.6 \ --hash=sha256:237c41e29157b962835d356b370ededd57594a26d5894a795960f0047cb5caf5 \ --hash=sha256:ba6af10348c12b24e92fa086b39cfba0eff619b61ac77c406167d813b096d39a # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements/llm/llm-requirements.txt pycountry==24.6.1 \ --hash=sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221 \ --hash=sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # pydantic-extra-types pycparser==2.21 \ --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # cffi -pydantic==2.10.0 \ - --hash=sha256:0aca0f045ff6e2f097f1fe89521115335f15049eeb8a7bef3dafe4b19a74e289 \ - --hash=sha256:5e7807ba9201bdf61b1b58aa6eb690916c40a47acfb114b1b4fef3e7fd5b30fc +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # compressed-tensors # fastapi # lm-format-enforcer # mistral-common # openai + # openai-harmony # pydantic-extra-types # vllm # xgrammar -pydantic-core==2.27.0 \ - 
--hash=sha256:0aa4d1b2eba9a325897308b3124014a142cdccb9f3e016f31d3ebee6b5ea5e75 \ - --hash=sha256:0d06b667e53320332be2bf6f9461f4a9b78092a079b8ce8634c9afaa7e10cd9f \ - --hash=sha256:153017e3d6cd3ce979de06d84343ca424bb6092727375eba1968c8b4693c6ecb \ - --hash=sha256:15e350efb67b855cd014c218716feea4986a149ed1f42a539edd271ee074a196 \ - --hash=sha256:185ef205256cd8b38431205698531026979db89a79587725c1e55c59101d64e9 \ - --hash=sha256:1da0c98a85a6c6ed702d5556db3b09c91f9b0b78de37b7593e2de8d03238807a \ - --hash=sha256:225bfff5d425c34e1fd562cef52d673579d59b967d9de06178850c4802af9039 \ - --hash=sha256:24f984fc7762ed5f806d9e8c4c77ea69fdb2afd987b4fd319ef06c87595a8c55 \ - --hash=sha256:25a7fd4de38f7ff99a37e18fa0098c3140286451bc823d1746ba80cec5b433a1 \ - --hash=sha256:2883b260f7a93235488699d39cbbd94fa7b175d3a8063fbfddd3e81ad9988cb2 \ - --hash=sha256:2a51ce96224eadd1845150b204389623c8e129fde5a67a84b972bd83a85c6c40 \ - --hash=sha256:2be0ad541bb9f059954ccf8877a49ed73877f862529575ff3d54bf4223e4dd61 \ - --hash=sha256:31a2cae5f059329f9cfe3d8d266d3da1543b60b60130d186d9b6a3c20a346361 \ - --hash=sha256:333c840a1303d1474f491e7be0b718226c730a39ead0f7dab2c7e6a2f3855555 \ - --hash=sha256:33d14369739c5d07e2e7102cdb0081a1fa46ed03215e07f097b34e020b83b1ae \ - --hash=sha256:35380671c3c921fe8adf31ad349dc6f7588b7e928dbe44e1093789734f607399 \ - --hash=sha256:359e7951f04ad35111b5ddce184db3391442345d0ab073aa63a95eb8af25a5ef \ - --hash=sha256:36aa167f69d8807ba7e341d67ea93e50fcaaf6bc433bb04939430fa3dab06f31 \ - --hash=sha256:395e3e1148fa7809016231f8065f30bb0dc285a97b4dc4360cd86e17bab58af7 \ - --hash=sha256:3e8d89c276234579cd3d095d5fa2a44eb10db9a218664a17b56363cddf226ff3 \ - --hash=sha256:3eb8849445c26b41c5a474061032c53e14fe92a11a5db969f722a2716cd12206 \ - --hash=sha256:3fd8bc2690e7c39eecdf9071b6a889ce7b22b72073863940edc2a0a23750ca90 \ - --hash=sha256:400bf470e4327e920883b51e255617dfe4496d4e80c3fea0b5a5d0bf2c404dd4 \ - --hash=sha256:4148dc9184ab79e356dc00a4199dc0ee8647973332cb385fc29a7cced49b9f9c \ - 
--hash=sha256:433689845288f9a1ee5714444e65957be26d30915f7745091ede4a83cfb2d7bb \ - --hash=sha256:43b61989068de9ce62296cde02beffabcadb65672207fc51e7af76dca75e6636 \ - --hash=sha256:4523c4009c3f39d948e01962223c9f5538602e7087a628479b723c939fab262d \ - --hash=sha256:483c2213a609e7db2c592bbc015da58b6c75af7360ca3c981f178110d9787bcf \ - --hash=sha256:49633583eb7dc5cba61aaf7cdb2e9e662323ad394e543ee77af265736bcd3eaa \ - --hash=sha256:4b51f964fcbb02949fc546022e56cdb16cda457af485e9a3e8b78ac2ecf5d77e \ - --hash=sha256:4bf1340ae507f6da6360b24179c2083857c8ca7644aab65807023cf35404ea8d \ - --hash=sha256:4fb49cfdb53af5041aba909be00cccfb2c0d0a2e09281bf542371c5fd36ad04c \ - --hash=sha256:510b11e9c3b1a852876d1ccd8d5903684336d635214148637ceb27366c75a467 \ - --hash=sha256:513cb14c0cc31a4dfd849a4674b20c46d87b364f997bbcb02282306f5e187abf \ - --hash=sha256:58560828ee0951bb125c6f2862fbc37f039996d19ceb6d8ff1905abf7da0bf3d \ - --hash=sha256:58ab0d979c969983cdb97374698d847a4acffb217d543e172838864636ef10d9 \ - --hash=sha256:5982048129f40b082c2654de10c0f37c67a14f5ff9d37cf35be028ae982f26df \ - --hash=sha256:5ab325fc86fbc077284c8d7f996d904d30e97904a87d6fb303dce6b3de7ebba9 \ - --hash=sha256:5cc822ab90a70ea3a91e6aed3afac570b276b1278c6909b1d384f745bd09c714 \ - --hash=sha256:5f2b19b8d6fca432cb3acf48cf5243a7bf512988029b6e6fd27e9e8c0a204d85 \ - --hash=sha256:5fc72fbfebbf42c0856a824b8b0dc2b5cd2e4a896050281a21cfa6fed8879cb1 \ - --hash=sha256:6354e18a9be37bfa124d6b288a87fb30c673745806c92956f1a25e3ae6e76b96 \ - --hash=sha256:678f66462058dd978702db17eb6a3633d634f7aa0deaea61e0a674152766d3fc \ - --hash=sha256:68950bc08f9735306322bfc16a18391fcaac99ded2509e1cc41d03ccb6013cfe \ - --hash=sha256:68ef5377eb582fa4343c9d0b57a5b094046d447b4c73dd9fbd9ffb216f829e7d \ - --hash=sha256:6b4c19525c3538fbc0bbda6229f9682fb8199ce9ac37395880e6952798e00373 \ - --hash=sha256:6bb69bf3b6500f195c3deb69c1205ba8fc3cb21d1915f1f158a10d6b1ef29b6a \ - --hash=sha256:6e19401742ed7b69e51d8e4df3c03ad5ec65a83b36244479fd70edde2828a5d9 \ - 
--hash=sha256:6f4a53af9e81d757756508b57cae1cf28293f0f31b9fa2bfcb416cc7fb230f9d \ - --hash=sha256:6fda87808429c520a002a85d6e7cdadbf58231d60e96260976c5b8f9a12a8e13 \ - --hash=sha256:78f841523729e43e3928a364ec46e2e3f80e6625a4f62aca5c345f3f626c6e8a \ - --hash=sha256:7a6ebfac28fd51890a61df36ef202adbd77d00ee5aca4a3dadb3d9ed49cfb929 \ - --hash=sha256:7b0202ebf2268954090209a84f9897345719e46a57c5f2c9b7b250ca0a9d3e63 \ - --hash=sha256:8117839a9bdbba86e7f9df57018fe3b96cec934c3940b591b0fd3fbfb485864a \ - --hash=sha256:82e1ad4ca170e8af4c928b67cff731b6296e6a0a0981b97b2eb7c275cc4e15bd \ - --hash=sha256:836a4bfe0cc6d36dc9a9cc1a7b391265bf6ce9d1eb1eac62ac5139f5d8d9a6fa \ - --hash=sha256:84af1cf7bfdcbc6fcf5a5f70cc9896205e0350306e4dd73d54b6a18894f79386 \ - --hash=sha256:84e35afd9e10b2698e6f2f32256678cb23ca6c1568d02628033a837638b3ed12 \ - --hash=sha256:884f1806609c2c66564082540cffc96868c5571c7c3cf3a783f63f2fb49bd3cd \ - --hash=sha256:8a150392102c402c538190730fda06f3bce654fc498865579a9f2c1d2b425833 \ - --hash=sha256:8e21d927469d04b39386255bf00d0feedead16f6253dcc85e9e10ddebc334084 \ - --hash=sha256:8e96ca781e0c01e32115912ebdf7b3fb0780ce748b80d7d28a0802fa9fbaf44e \ - --hash=sha256:8ee4c2a75af9fe21269a4a0898c5425afb01af1f5d276063f57e2ae1bc64e191 \ - --hash=sha256:91bc66f878557313c2a6bcf396e7befcffe5ab4354cfe4427318968af31143c3 \ - --hash=sha256:951e71da6c89d354572098bada5ba5b5dc3a9390c933af8a614e37755d3d1840 \ - --hash=sha256:99b2863c1365f43f74199c980a3d40f18a218fbe683dd64e470199db426c4d6a \ - --hash=sha256:9a8fbf506fde1529a1e3698198fe64bfbe2e0c09557bc6a7dcf872e7c01fec40 \ - --hash=sha256:9ce048deb1e033e7a865ca384770bccc11d44179cf09e5193a535c4c2f497bdc \ - --hash=sha256:9fe94d9d2a2b4edd7a4b22adcd45814b1b59b03feb00e56deb2e89747aec7bfe \ - --hash=sha256:a291d0b4243a259c8ea7e2b84eb9ccb76370e569298875a7c5e3e71baf49057a \ - --hash=sha256:a5c022bb0d453192426221605efc865373dde43b17822a264671c53b068ac20c \ - --hash=sha256:abb4785894936d7682635726613c44578c420a096729f1978cd061a7e72d5275 \ - 
--hash=sha256:b872c86d8d71827235c7077461c502feb2db3f87d9d6d5a9daa64287d75e4fa0 \ - --hash=sha256:bf37b72834e7239cf84d4a0b2c050e7f9e48bced97bad9bdf98d26b8eb72e846 \ - --hash=sha256:c0c431e4be5c1a0c6654e0c31c661cd89e0ca956ef65305c3c3fd96f4e72ca39 \ - --hash=sha256:c5726eec789ee38f2c53b10b1821457b82274f81f4f746bb1e666d8741fcfadb \ - --hash=sha256:c6fcb3fa3855d583aa57b94cf146f7781d5d5bc06cb95cb3afece33d31aac39b \ - --hash=sha256:c86679f443e7085ea55a7376462553996c688395d18ef3f0d3dbad7838f857a2 \ - --hash=sha256:c91e3c04f5191fd3fb68764bddeaf02025492d5d9f23343b283870f6ace69708 \ - --hash=sha256:c921ad596ff1a82f9c692b0758c944355abc9f0de97a4c13ca60ffc6d8dc15d4 \ - --hash=sha256:c9ed88b398ba7e3bad7bd64d66cc01dcde9cfcb7ec629a6fd78a82fa0b559d78 \ - --hash=sha256:cd2ac6b919f7fed71b17fe0b4603c092a4c9b5bae414817c9c81d3c22d1e1bcc \ - --hash=sha256:d28ca7066d6cdd347a50d8b725dc10d9a1d6a1cce09836cf071ea6a2d4908be0 \ - --hash=sha256:d29e235ce13c91902ef3efc3d883a677655b3908b1cbc73dee816e5e1f8f7739 \ - --hash=sha256:d8b5ee4ae9170e2775d495b81f414cc20268041c42571530513496ba61e94ba3 \ - --hash=sha256:db72e40628967f6dc572020d04b5f800d71264e0531c6da35097e73bdf38b003 \ - --hash=sha256:df45c4073bed486ea2f18757057953afed8dd77add7276ff01bccb79982cf46c \ - --hash=sha256:dfa5f5c0a4c8fced1422dc2ca7eefd872d5d13eb33cf324361dbf1dbfba0a9fe \ - --hash=sha256:e015833384ca3e1a0565a79f5d953b0629d9138021c27ad37c92a9fa1af7623c \ - --hash=sha256:e15315691fe2253eb447503153acef4d7223dfe7e7702f9ed66539fcd0c43801 \ - --hash=sha256:e65466b31be1070b4a5b7dbfbd14b247884cb8e8b79c64fb0f36b472912dbaea \ - --hash=sha256:e7820bb0d65e3ce1e3e70b6708c2f66143f55912fa02f4b618d0f08b61575f12 \ - --hash=sha256:e851a051f7260e6d688267eb039c81f05f23a19431bd7dfa4bf5e3cb34c108cd \ - --hash=sha256:e9f9feee7f334b72ceae46313333d002b56f325b5f04271b4ae2aadd9e993ae4 \ - --hash=sha256:eb40f828bc2f73f777d1eb8fee2e86cd9692a4518b63b6b5aa8af915dfd3207b \ - --hash=sha256:eb704155e73b833801c247f39d562229c0303f54770ca14fb1c053acb376cf10 \ - 
--hash=sha256:edb1bfd45227dec8d50bc7c7d86463cd8728bcc574f9b07de7369880de4626a3 \ - --hash=sha256:ee7d9d5537daf6d5c74a83b38a638cc001b648096c1cae8ef695b0c919d9d379 \ - --hash=sha256:f57783fbaf648205ac50ae7d646f27582fc706be3977e87c3c124e7a92407b10 \ - --hash=sha256:ff63a92f6e249514ef35bc795de10745be0226eaea06eb48b4bbeaa0c8850a4a - # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt +pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ 
+ --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + --hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + 
--hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + --hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + --hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + 
--hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + --hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + --hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + 
--hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + --hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # pydantic pydantic-extra-types==2.10.5 \ --hash=sha256:1dcfa2c0cf741a422f088e0dbb4690e7bfadaaf050da3d6f80d6c3cf58a2bad8 \ --hash=sha256:b60c4e23d573a69a4f1a16dd92888ecc0ef34fb0e655b4f305530377fa70e7a8 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # mistral-common pygments==2.18.0 \ --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # rich pyopenssl==25.0.0 \ --hash=sha256:424c247065e46e76a37411b9ab1782541c23bb658bf003772c3405fbaa128e90 \ --hash=sha256:cd2cef799efa3936bb08e8ccb9433a575722b9dd986023f1cabc4ae64e9dac16 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt python-dateutil==2.8.2 \ --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cpu.lock + # celery # pandas python-dotenv==1.0.1 \ --hash=sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca \ --hash=sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # uvicorn python-json-logger==2.0.7 \ --hash=sha256:23e7ec02d34237c5aa1e29a070193a4ea87583bb4e7f8fd06d3de8264c4b2e1c \ --hash=sha256:f380b826a991ebbe3de4d897aeec42760035ac760345e57b812938dc8b35e2bd # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm python-multipart==0.0.20 \ --hash=sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104 \ --hash=sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # fastapi pytz==2022.7.1 \ --hash=sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0 \ --hash=sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # pandas pyyaml==6.0.1 \ --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ @@ -2587,7 +2658,7 @@ pyyaml==6.0.1 \ --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \ --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # gguf # huggingface-hub @@ -2686,13 +2757,13 @@ pyzmq==26.0.3 \ --hash=sha256:f6b1d1c631e5940cac5a0b22c5379c86e8df6a4ec277c7a856b714021ab6cfad \ --hash=sha256:f6c21c00478a7bea93caaaef9e7629145d4153b15a8653e8bb4609d4bc70dbfc # 
via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm referencing==0.36.2 \ --hash=sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa \ --hash=sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # jsonschema # jsonschema-specifications regex==2024.11.6 \ @@ -2791,7 +2862,7 @@ regex==2024.11.6 \ --hash=sha256:fdd6028445d2460f33136c55eeb1f601ab06d74cb3347132e1c24250187500d9 \ --hash=sha256:ff590880083d60acc0433f9c3f713c51f7ac6ebb9adf889c79a261ecf541aa91 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # tiktoken # transformers # vllm @@ -2799,7 +2870,7 @@ requests==2.32.3 \ --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # google-api-core # huggingface-hub @@ -2812,7 +2883,7 @@ rich==13.3.2 \ --hash=sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001 \ --hash=sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # memray # typer @@ -2921,14 +2992,14 @@ rpds-py==0.22.3 \ --hash=sha256:fda7cb070f442bf80b642cd56483b5548e43d366fe3f39b98e67cce780cded00 \ --hash=sha256:feea821ee2a9273771bae61194004ee2fc33f8ec7db08117ef9147d4bbcbca8e # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # jsonschema # referencing rsa==4.7.2 \ 
--hash=sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2 \ --hash=sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # google-auth safetensors==0.5.2 \ --hash=sha256:03c937100f38c9ff4c1507abea9928a6a9b02c9c1c9c3609ed4fb2bf413d4975 \ @@ -2947,7 +3018,7 @@ safetensors==0.5.2 \ --hash=sha256:d3a06fae62418ec8e5c635b61a8086032c9e281f16c63c3af46a6efbab33156f \ --hash=sha256:fe55c039d97090d1f85277d402954dd6ad27f63034fa81985a9cc59655ac3ee2 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # transformers scikit-image==0.24.0 \ --hash=sha256:18836a18d3a7b6aca5376a2d805f0045826bc6c9fc85331659c33b4813e0b563 \ @@ -2972,7 +3043,7 @@ scikit-image==0.24.0 \ --hash=sha256:ef04360eda372ee5cd60aebe9be91258639c86ae2ea24093fb9182118008d009 \ --hash=sha256:fa27b3a0dbad807b966b8db2d78da734cb812ca4787f7fbb143764800ce2fa9c # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ @@ -3001,7 +3072,7 @@ scipy==1.11.4 \ --hash=sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be \ --hash=sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # scikit-image # vllm @@ -3060,34 +3131,135 @@ sentencepiece==0.2.0 \ --hash=sha256:fb89f811e5efd18bab141afc3fea3de141c3f69f3fe9e898f710ae7fe3aab251 \ --hash=sha256:ff88712338b01031910e8e61e7239aff3ce8869ee31a47df63cb38aadd591bea # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # 
gguf # mistral-common # vllm +setproctitle==1.3.6 \ + --hash=sha256:082413db8a96b1f021088e8ec23f0a61fec352e649aba20881895815388b66d3 \ + --hash=sha256:0dba8faee2e4a96e934797c9f0f2d093f8239bf210406a99060b3eabe549628e \ + --hash=sha256:0e6b5633c94c5111f7137f875e8f1ff48f53b991d5d5b90932f27dc8c1fa9ae4 \ + --hash=sha256:1065ed36bd03a3fd4186d6c6de5f19846650b015789f72e2dea2d77be99bdca1 \ + --hash=sha256:109fc07b1cd6cef9c245b2028e3e98e038283342b220def311d0239179810dbe \ + --hash=sha256:13624d9925bb481bc0ccfbc7f533da38bfbfe6e80652314f789abc78c2e513bd \ + --hash=sha256:156795b3db976611d09252fc80761fcdb65bb7c9b9581148da900851af25ecf4 \ + --hash=sha256:163dba68f979c61e4e2e779c4d643e968973bdae7c33c3ec4d1869f7a9ba8390 \ + --hash=sha256:17d7c833ed6545ada5ac4bb606b86a28f13a04431953d4beac29d3773aa00b1d \ + --hash=sha256:18d0667bafaaae4c1dee831e2e59841c411ff399b9b4766822ba2685d419c3be \ + --hash=sha256:1aa1935aa2195b76f377e5cb018290376b7bf085f0b53f5a95c0c21011b74367 \ + --hash=sha256:2156d55308431ac3b3ec4e5e05b1726d11a5215352d6a22bb933171dee292f8c \ + --hash=sha256:23a57d3b8f1549515c2dbe4a2880ebc1f27780dc126c5e064167563e015817f5 \ + --hash=sha256:2407955dc359d735a20ac6e797ad160feb33d529a2ac50695c11a1ec680eafab \ + --hash=sha256:2940cf13f4fc11ce69ad2ed37a9f22386bfed314b98d8aebfd4f55459aa59108 \ + --hash=sha256:2e51ec673513465663008ce402171192a053564865c2fc6dc840620871a9bd7c \ + --hash=sha256:3393859eb8f19f5804049a685bf286cb08d447e28ba5c6d8543c7bf5500d5970 \ + --hash=sha256:3884002b3a9086f3018a32ab5d4e1e8214dd70695004e27b1a45c25a6243ad0b \ + --hash=sha256:38ca045626af693da042ac35d7332e7b9dbd52e6351d6973b310612e3acee6d6 \ + --hash=sha256:391bb6a29c4fe7ccc9c30812e3744060802d89b39264cfa77f3d280d7f387ea5 \ + --hash=sha256:3cca16fd055316a48f0debfcbfb6af7cea715429fc31515ab3fcac05abd527d8 \ + --hash=sha256:3cde5b83ec4915cd5e6ae271937fd60d14113c8f7769b4a20d51769fe70d8717 \ + --hash=sha256:3f8194b4d631b003a1176a75d1acd545e04b1f54b821638e098a93e6e62830ef \ + 
--hash=sha256:3fc97805f9d74444b027babff710bf39df1541437a6a585a983d090ae00cedde \ + --hash=sha256:4431629c178193f23c538cb1de3da285a99ccc86b20ee91d81eb5f1a80e0d2ba \ + --hash=sha256:49498ebf68ca3e75321ffe634fcea5cc720502bfaa79bd6b03ded92ce0dc3c24 \ + --hash=sha256:4ac3eb04bcf0119aadc6235a2c162bae5ed5f740e3d42273a7228b915722de20 \ + --hash=sha256:4adf6a0013fe4e0844e3ba7583ec203ca518b9394c6cc0d3354df2bf31d1c034 \ + --hash=sha256:4efc91b437f6ff2578e89e3f17d010c0a0ff01736606473d082913ecaf7859ba \ + --hash=sha256:50706b9c0eda55f7de18695bfeead5f28b58aa42fd5219b3b1692d554ecbc9ec \ + --hash=sha256:5313a4e9380e46ca0e2c681ba739296f9e7c899e6f4d12a6702b2dc9fb846a31 \ + --hash=sha256:543f59601a4e32daf44741b52f9a23e0ee374f9f13b39c41d917302d98fdd7b0 \ + --hash=sha256:57bc54763bf741813a99fbde91f6be138c8706148b7b42d3752deec46545d470 \ + --hash=sha256:63cc10352dc6cf35a33951656aa660d99f25f574eb78132ce41a85001a638aa7 \ + --hash=sha256:6a1d3aa13acfe81f355b0ce4968facc7a19b0d17223a0f80c011a1dba8388f37 \ + --hash=sha256:6af330ddc2ec05a99c3933ab3cba9365357c0b8470a7f2fa054ee4b0984f57d1 \ + --hash=sha256:6d50bfcc1d1692dc55165b3dd2f0b9f8fb5b1f7b571a93e08d660ad54b9ca1a5 \ + --hash=sha256:70100e2087fe05359f249a0b5f393127b3a1819bf34dec3a3e0d4941138650c9 \ + --hash=sha256:74973aebea3543ad033b9103db30579ec2b950a466e09f9c2180089e8346e0ec \ + --hash=sha256:751ba352ed922e0af60458e961167fa7b732ac31c0ddd1476a2dfd30ab5958c5 \ + --hash=sha256:785cd210c0311d9be28a70e281a914486d62bfd44ac926fcd70cf0b4d65dff1c \ + --hash=sha256:7890e291bf4708e3b61db9069ea39b3ab0651e42923a5e1f4d78a7b9e4b18301 \ + --hash=sha256:793a23e8d9cb6c231aa3023d700008224c6ec5b8fd622d50f3c51665e3d0a190 \ + --hash=sha256:797f2846b546a8741413c57d9fb930ad5aa939d925c9c0fa6186d77580035af7 \ + --hash=sha256:7df5fcc48588f82b6cc8073db069609ddd48a49b1e9734a20d0efb32464753c4 \ + --hash=sha256:8050c01331135f77ec99d99307bfbc6519ea24d2f92964b06f3222a804a3ff1f \ + --hash=sha256:805bb33e92fc3d8aa05674db3068d14d36718e3f2c5c79b09807203f229bf4b5 \ + 
--hash=sha256:807796fe301b7ed76cf100113cc008c119daf4fea2f9f43c578002aef70c3ebf \ + --hash=sha256:81c443310831e29fabbd07b75ebbfa29d0740b56f5907c6af218482d51260431 \ + --hash=sha256:83066ffbf77a5f82b7e96e59bdccbdda203c8dccbfc3f9f0fdad3a08d0001d9c \ + --hash=sha256:8834ab7be6539f1bfadec7c8d12249bbbe6c2413b1d40ffc0ec408692232a0c6 \ + --hash=sha256:92df0e70b884f5da35f2e01489dca3c06a79962fb75636985f1e3a17aec66833 \ + --hash=sha256:9483aa336687463f5497dd37a070094f3dff55e2c888994f8440fcf426a1a844 \ + --hash=sha256:97a138fa875c6f281df7720dac742259e85518135cd0e3551aba1c628103d853 \ + --hash=sha256:9b50700785eccac0819bea794d968ed8f6055c88f29364776b7ea076ac105c5d \ + --hash=sha256:9b73cf0fe28009a04a35bb2522e4c5b5176cc148919431dcb73fdbdfaab15781 \ + --hash=sha256:9d5a369eb7ec5b2fdfa9927530b5259dd21893fa75d4e04a223332f61b84b586 \ + --hash=sha256:a094b7ce455ca341b59a0f6ce6be2e11411ba6e2860b9aa3dbb37468f23338f4 \ + --hash=sha256:a0d6252098e98129a1decb59b46920d4eca17b0395f3d71b0d327d086fefe77d \ + --hash=sha256:a1d856b0f4e4a33e31cdab5f50d0a14998f3a2d726a3fd5cb7c4d45a57b28d1b \ + --hash=sha256:a4ae2ea9afcfdd2b931ddcebf1cf82532162677e00326637b31ed5dff7d985ca \ + --hash=sha256:a5963b663da69ad25fa1559ee064584935570def665917918938c1f1289f5ebc \ + --hash=sha256:ad1c2c2baaba62823a7f348f469a967ece0062140ca39e7a48e4bbb1f20d54c4 \ + --hash=sha256:ae82507fe458f7c0c8227017f2158111a4c9e7ce94de05178894a7ea9fefc8a1 \ + --hash=sha256:af188f3305f0a65c3217c30c6d4c06891e79144076a91e8b454f14256acc7279 \ + --hash=sha256:af44bb7a1af163806bbb679eb8432fa7b4fb6d83a5d403b541b675dcd3798638 \ + --hash=sha256:b0174ca6f3018ddeaa49847f29b69612e590534c1d2186d54ab25161ecc42975 \ + --hash=sha256:b2b17855ed7f994f3f259cf2dfbfad78814538536fa1a91b50253d84d87fd88d \ + --hash=sha256:b2e54f4a2dc6edf0f5ea5b1d0a608d2af3dcb5aa8c8eeab9c8841b23e1b054fe \ + --hash=sha256:b6f4abde9a2946f57e8daaf1160b2351bcf64274ef539e6675c1d945dbd75e2a \ + --hash=sha256:b70c07409d465f3a8b34d52f863871fb8a00755370791d2bd1d4f82b3cdaf3d5 \ + 
--hash=sha256:bb465dd5825356c1191a038a86ee1b8166e3562d6e8add95eec04ab484cfb8a2 \ + --hash=sha256:c051f46ed1e13ba8214b334cbf21902102807582fbfaf0fef341b9e52f0fafbf \ + --hash=sha256:c1b20a5f4164cec7007be55c9cf18d2cd08ed7c3bf6769b3cd6d044ad888d74b \ + --hash=sha256:c86e9e82bfab579327dbe9b82c71475165fbc8b2134d24f9a3b2edaf200a5c3d \ + --hash=sha256:c9f32b96c700bb384f33f7cf07954bb609d35dd82752cef57fb2ee0968409169 \ + --hash=sha256:cce0ed8b3f64c71c140f0ec244e5fdf8ecf78ddf8d2e591d4a8b6aa1c1214235 \ + --hash=sha256:cdd7315314b0744a7dd506f3bd0f2cf90734181529cdcf75542ee35ad885cab7 \ + --hash=sha256:cf355fbf0d4275d86f9f57be705d8e5eaa7f8ddb12b24ced2ea6cbd68fdb14dc \ + --hash=sha256:d136fbf8ad4321716e44d6d6b3d8dffb4872626010884e07a1db54b7450836cf \ + --hash=sha256:d2c8e20487b3b73c1fa72c56f5c89430617296cd380373e7af3a538a82d4cd6d \ + --hash=sha256:d483cc23cc56ab32911ea0baa0d2d9ea7aa065987f47de847a0a93a58bf57905 \ + --hash=sha256:d5a6c4864bb6fa9fcf7b57a830d21aed69fd71742a5ebcdbafda476be673d212 \ + --hash=sha256:d714e002dd3638170fe7376dc1b686dbac9cb712cde3f7224440af722cc9866a \ + --hash=sha256:d73f14b86d0e2858ece6bf5807c9889670e392c001d414b4293d0d9b291942c3 \ + --hash=sha256:d88c63bd395c787b0aa81d8bbc22c1809f311032ce3e823a6517b711129818e4 \ + --hash=sha256:db608db98ccc21248370d30044a60843b3f0f3d34781ceeea67067c508cd5a28 \ + --hash=sha256:de004939fc3fd0c1200d26ea9264350bfe501ffbf46c8cf5dc7f345f2d87a7f1 \ + --hash=sha256:ded9e86397267732a0641d4776c7c663ea16b64d7dbc4d9cc6ad8536363a2d29 \ + --hash=sha256:e288f8a162d663916060beb5e8165a8551312b08efee9cf68302687471a6545d \ + --hash=sha256:e2a9e62647dc040a76d55563580bf3bb8fe1f5b6ead08447c2ed0d7786e5e794 \ + --hash=sha256:e3e44d08b61de0dd6f205528498f834a51a5c06689f8fb182fe26f3a3ce7dca9 \ + --hash=sha256:ea002088d5554fd75e619742cefc78b84a212ba21632e59931b3501f0cfc8f67 \ + --hash=sha256:eb7452849f6615871eabed6560ffedfe56bc8af31a823b6be4ce1e6ff0ab72c5 \ + --hash=sha256:ebcf34b69df4ca0eabaaaf4a3d890f637f355fed00ba806f7ebdd2d040658c26 \ + 
--hash=sha256:f24d5b9383318cbd1a5cd969377937d66cf0542f24aa728a4f49d9f98f9c0da8 \ + --hash=sha256:f33fbf96b52d51c23b6cff61f57816539c1c147db270cfc1cc3bc012f4a560a9 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # vllm shellingham==1.5.4 \ --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \ --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # typer six==1.16.0 \ --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # opencensus # python-dateutil smart-open==6.2.0 \ --hash=sha256:088bf00f9327c71e549bc2f86567d3320df5d89667f009ce1c16568976068ef7 \ --hash=sha256:1b4df5c8365218f3852c507451920ccad606c80b0acb4e67508e50ba9b5d2632 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt sniffio==1.3.1 \ --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # anyio # openai soundfile==0.13.1 \ @@ -3100,7 +3272,7 @@ soundfile==0.13.1 \ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # mistral-common soxr==0.5.0.post1 \ --hash=sha256:39e0f791ba178d69cd676485dbee37e75a34f20daa478d90341ecb7f6d9d690f \ @@ -3125,13 
+3297,13 @@ soxr==0.5.0.post1 \ --hash=sha256:fcc049b0a151a65aa75b92f0ac64bb2dba785d16b78c31c2b94e68c141751d6d \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # mistral-common starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # fastapi # prometheus-fastapi-instrumentator @@ -3139,19 +3311,19 @@ sympy==1.14.0 \ --hash=sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517 \ --hash=sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # torch tensorboardx==2.6.2.2 \ --hash=sha256:160025acbf759ede23fd3526ae9d9bfbfd8b68eb16c38a010ebe326dc6395db8 \ --hash=sha256:c6476d7cd0d529b0b72f4acadb1269f9ed8b22f441e87a84f2a3b940bb87b666 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # scikit-image tiktoken==0.9.0 \ --hash=sha256:03935988a91d6d3216e2ec7c645afbb3d870b37bcb67ada1943ec48678e7ee33 \ @@ -3186,7 +3358,7 @@ tiktoken==0.9.0 \ --hash=sha256:f0968d5beeafbca2a72c595e8385a1a1f8af58feaebb02b227229b69ca5357fd \ --hash=sha256:f32cc56168eac4851109e9b5d327637f15fd662aa30dd79f964b7c39fbadd26e # via - 
# -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # mistral-common # vllm tokenizers==0.21.1 \ @@ -3206,7 +3378,7 @@ tokenizers==0.21.1 \ --hash=sha256:e78e413e9e668ad790a29456e677d9d3aa50a9ad311a40905d6861ba7692cf41 \ --hash=sha256:ed248ab5279e601a30a4d67bdb897ecbe955a50f1e7bb62bd99f07dd11c2f5b6 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # transformers # vllm torch==2.7.1+cpu \ @@ -3230,7 +3402,7 @@ torch==2.7.1+cpu \ --hash=sha256:d25435bdc4780d3cb512aad55142aca9584ae1fe8f8691cda6d32f19faf5d58e \ --hash=sha256:eb17646792ac4374ffc87e42369f45d21eff17c790868963b90483ef0b6db4ef # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # compressed-tensors # nixl # torchaudio @@ -3252,7 +3424,7 @@ torchaudio==2.7.1+cpu \ --hash=sha256:deb19d2a1cbbe49f9d14a9fe3dce65fef8dd98570aa8b6a65d7f5d1e0d16d0f3 \ --hash=sha256:e169a2b62e55342f2f30e17640054707c8e339045a1ccc2db33517e9debb2767 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm torchvision==0.22.1+cpu \ --hash=sha256:34c914ad4728b81848ac802c5fc5eeb8de8ff4058cc59c1463a74ce4f4fbf0d8 \ @@ -3268,24 +3440,23 @@ torchvision==0.22.1+cpu \ --hash=sha256:c852e61bc903351169017e2e96389f28f6cfb52ca7c3945acceb31e7fe1b21e6 \ --hash=sha256:e31f1273a8dd9760906288036ac3c8f5fef25eed393da0491db150d7be78910d # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm tqdm==4.67.1 \ --hash=sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2 \ --hash=sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # gguf # huggingface-hub # 
openai # transformers # vllm -transformers==4.53.2 \ - --hash=sha256:6c3ed95edfb1cba71c4245758f1b4878c93bf8cde77d076307dacb2cbbd72be2 \ - --hash=sha256:db8f4819bb34f000029c73c3c557e7d06fc1b8e612ec142eecdae3947a9c78bf +transformers==4.55.2 \ + --hash=sha256:097e3c2e2c0c9681db3da9d748d8f9d6a724c644514673d0030e8c5a1109f1f1 \ + --hash=sha256:a45ec60c03474fd67adbce5c434685051b7608b3f4f167c25aa6aeb1cad16d4f # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt - # -r python/requirements/llm/llm-requirements.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # compressed-tensors # vllm # xgrammar @@ -3303,13 +3474,13 @@ triton==3.2.0 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:f1679fde231fb04c96cb5a01b160c8d0294ce6f7c122565d8b33ad8a910422d7 \ --hash=sha256:f24212d12744266f6229f90f820f34c43a538a69d6511b8e92ee392d2dc0d38b # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # xgrammar typer==0.12.3 \ --hash=sha256:070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914 \ --hash=sha256:49e73131481d804288ef62598d97a1ceef3058905aa536a1134f90891ba35482 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements/llm/llm-requirements.txt # -r python/requirements.txt # fastapi-cli @@ -3317,7 +3488,7 @@ typing-extensions==4.12.2 \ --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # fastapi # gymnasium # huggingface-hub @@ -3333,18 +3504,31 @@ typing-extensions==4.12.2 \ # referencing # torch # typer + # typing-inspection # vllm +typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + 
--hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # pydantic +tzdata==2025.2 \ + --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # kombu urllib3==1.26.19 \ --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # requests uvicorn==0.22.0 \ --hash=sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8 \ --hash=sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # fastapi # fastapi-cli @@ -3387,19 +3571,27 @@ uvloop==0.21.0 ; platform_python_implementation != 'PyPy' and sys_platform != 'c --hash=sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816 \ --hash=sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # uvicorn +vine==5.1.0 \ + --hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # amqp + # celery + # kombu virtualenv==20.29.1 \ --hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ --hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt -vllm==0.10.0 \ - --hash=sha256:8ca37559d82b43b5e8c8248d2e4a1ecb51d6d4e5d517491d656df6491ed93dab \ - --hash=sha256:a44e9013db26082a82c3931ed8772ac884d6d60566d36ecdb0e8dc01c65b241a +vllm==0.10.1.1 \ + --hash=sha256:3099824ee4bdaa14c4c4f7178a092101a0ec206d4c9371edf295849b2b730a39 \ + --hash=sha256:8ca0dd985e1ceac8540e7719c654f1553b3ba8a43c685ac8d3fa1366ffb6443a # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements/llm/llm-requirements.txt watchfiles==0.19.0 \ --hash=sha256:0089c6dc24d436b373c3c57657bf4f9a453b13767150d17284fc6162b2791911 \ @@ -3425,10 +3617,16 @@ watchfiles==0.19.0 \ --hash=sha256:d9b073073e048081e502b6c6b0b88714c026a1a4c890569238d04aca5f9ca74b \ --hash=sha256:fac19dc9cbc34052394dbe81e149411a62e71999c0a19e1e09ce537867f95ae0 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # uvicorn # vllm +wcwidth==0.2.13 \ + --hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \ + --hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # prompt-toolkit websockets==15.0 \ --hash=sha256:0e389efe46ccb25a1f93d08c7a74e8123a2517f7b7458f043bd7529d1a63ffeb \ --hash=sha256:0f2205cdb444a42a7919690238fb5979a05439b9dbb73dd47c863d39640d85ab \ @@ -3500,14 +3698,14 @@ websockets==15.0 \ --hash=sha256:ffc02b159b65c05f2ed9ec176b715b66918a674bd4daed48a9a7a590dd4be1aa \ --hash=sha256:ffc5ae23ada6515f31604f700009e2df90b091b67d463a8401c1d8a37f76c1d7 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # uvicorn xformers==0.0.31 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ 
--hash=sha256:23331bdb9831ba0df96f55258537ca0df7ad888efc75cea97a0de79b5e2291c4 \ --hash=sha256:3fccb159c6327c13fc1b08f8b963c2779ca526e2e50755dee9bcc1bac67d20c6 \ --hash=sha256:50aedaea82a38d7d28631f77617d1ed1f6f37c60bdc4bf167a69cbc0e39cee76 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm xgrammar==0.1.21 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:140628376fc701a535600dc64752603ddaed619461dc50669e90626e9f61b8aa \ @@ -3535,7 +3733,7 @@ xgrammar==0.1.21 ; platform_machine == 'aarch64' or platform_machine == 'arm64' --hash=sha256:f89d9ddb4d00fadcffa4bcabd0c3ae75d47c844c728bbb6be695056df3767524 \ --hash=sha256:f9247641c73eec6e972cec15156a8844957334204ba79ad1abdb0d7b03def8a1 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # vllm yarl==1.18.3 \ --hash=sha256:00e5a1fea0fd4f5bfa7440a47eff01d9822a65b4488f7cff83155a0f31a2ecba \ @@ -3621,13 +3819,13 @@ yarl==1.18.3 \ --hash=sha256:fe57328fbc1bfd0bd0514470ac692630f3901c0ee39052ae47acd1d90a436719 \ --hash=sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62 # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # aiohttp zipp==3.19.2 \ --hash=sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19 \ --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c # via - # -c python/requirements_compiled_rayllm_test_py311_cpu.txt + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # importlib-metadata # The following packages were excluded from the output: diff --git a/python/requirements_compiled_rayllm_py311_cu121.txt b/python/deplocks/llm/rayllm_py311_cu121.lock similarity index 86% rename from python/requirements_compiled_rayllm_py311_cu121.txt rename to 
python/deplocks/llm/rayllm_py311_cu121.lock index 61dfd0f354e5..777b41bc8ccd 100644 --- a/python/requirements_compiled_rayllm_py311_cu121.txt +++ b/python/deplocks/llm/rayllm_py311_cu121.lock @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile --generate-hashes --strip-extras --unsafe-package ray --unsafe-package grpcio-tools --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cu121 --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links -c python/requirements_compiled_rayllm_test_py311_cu121.txt python/requirements.txt python/requirements/llm/llm-requirements.txt -o python/requirements_compiled_rayllm_py311_cu121.txt +# uv pip compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --extra-index-url https://download.pytorch.org/whl/cu121 --python-version=3.11 --unsafe-package ray --python-platform=linux -c python/deplocks/llm/rayllm_test_py311_cu121.lock python/requirements.txt python/requirements/llm/llm-requirements.txt -o python/deplocks/llm/rayllm_py311_cu121.lock --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cu121 @@ -7,7 +7,7 @@ aiohappyeyeballs==2.6.1 \ --hash=sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558 \ --hash=sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # aiohttp aiohttp==3.11.16 \ --hash=sha256:004511d3413737700835e949433536a2fe95a7d0297edd911a1e9705c5b5ea43 \ @@ -92,7 +92,7 @@ aiohttp==3.11.16 \ --hash=sha256:fb46bb0f24813e6cede6cc07b1961d4b04f331f7112a23b5e21f567da4ee50aa \ --hash=sha256:fd36c119c5d6551bce374fcb5c19269638f8d09862445f85a5a48596fd59f4bb # via - 
# -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # aiohttp-cors # vllm @@ -100,31 +100,37 @@ aiohttp-cors==0.7.0 \ --hash=sha256:0451ba59fdf6909d0e2cd21e4c0a43752bc0703d33fc78ae94d9d9321710193e \ --hash=sha256:4d39c6d7100fd9764ed1caf8cebf0eb01bf5e3f24e2e073fda6234bc48b19f5d # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt aiorwlock==1.3.0 \ --hash=sha256:45baf8e4fa9a23e0bb325fbd67da80de1fd7ae1d4f59a6381754c60cec7b289b \ --hash=sha256:83f12d87df4b9728a0b8fda1756585ab0d652b107bab59c6084e1b1ad692ab45 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt aiosignal==1.3.1 \ --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ --hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # aiohttp +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock + # kombu annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # pydantic anyio==3.7.1 \ --hash=sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780 \ --hash=sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + 
# -c python/deplocks/llm/rayllm_test_py311_cu121.lock # httpx # openai # starlette @@ -133,16 +139,22 @@ astor==0.8.1 \ --hash=sha256:070a54e890cefb5b3739d19f30f5a5ec840ffc9c50ffa7d23cc9fc1a38ebbfc5 \ --hash=sha256:6a6effda93f4e1ce9f618779b2dd1d9d84f1e32812c23a29b3fff6fd7f63fa5e # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # depyf attrs==25.1.0 \ --hash=sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e \ --hash=sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # aiohttp # jsonschema # referencing +billiard==4.2.1 \ + --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock + # celery blake3==1.0.4 \ --hash=sha256:00605aa59923205c6a4f21131840840eb2d9a754c59b163357d890566755b97a \ --hash=sha256:08f46c2f1c5f369f07409e3e4ff248bcb22617cd741f2224873d85982dd6034e \ @@ -230,13 +242,13 @@ blake3==1.0.4 \ --hash=sha256:fedc326cac4476d2eab88413a4bf56e491040ae11ea98ddadaa5487cecda9b93 \ --hash=sha256:ff0e96f61b16b365ad5bb7c6272754f83d8a59c95d3b2f70c3bb6324ddf5bc0c # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm cachetools==5.5.2 \ --hash=sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4 \ --hash=sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # google-auth # vllm cbor2==5.6.5 \ @@ -285,13 +297,19 @@ cbor2==5.6.5 \ --hash=sha256:fde21ac1cf29336a31615a2c469a9cb03cf0add3ae480672d4d38cda467d07fc \ 
--hash=sha256:fe11c2eb518c882cfbeed456e7a552e544893c17db66fe5d3230dbeaca6b615c # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm +celery==5.5.3 \ + --hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock + # -r python/requirements.txt certifi==2025.1.31 \ --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # httpcore # httpx # requests @@ -349,7 +367,7 @@ cffi==1.16.0 \ --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # cryptography # soundfile charset-normalizer==3.3.2 \ @@ -444,35 +462,57 @@ charset-normalizer==3.3.2 \ --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # requests click==8.1.7 \ --hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ --hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt + # celery + # click-didyoumean + # click-plugins + # click-repl # ray # typer # uvicorn +click-didyoumean==0.3.1 \ + 
--hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock + # celery +click-plugins==1.1.1.2 \ + --hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock + # celery cloudpickle==2.2.0 \ --hash=sha256:3f4219469c55453cfe4737e564b67c2a149109dabf7f242478948b895f61106f \ --hash=sha256:7428798d5926d8fcbfd092d18d01a2a03daf8237d8fcdc8095d256b8490796f0 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # gymnasium # vllm colorful==0.5.5 \ --hash=sha256:62c187e27c1433db9463ff93b1451898d1e7e23a7e553583fd9daeb6325182e4 \ --hash=sha256:66f8c1264b2a26f7293b96a03bb7a76c4bc8b9634369a0bffdcd12d618056a1d # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt compressed-tensors==0.10.2 \ --hash=sha256:6de13ac535d7ffdd8890fad3d229444c33076170acaa8fab6bab8ecfa96c1d8f \ --hash=sha256:e1b4d9bc2006e3fd3a938e59085f318fdb280c5af64688a4792bf1bc263e579d # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm cryptography==44.0.3 \ --hash=sha256:02f55fb4f8b79c1221b0961488eaae21015b69b210e18c386b69de182ebb1259 \ @@ -513,7 +553,7 @@ cryptography==44.0.3 \ --hash=sha256:fc3c9babc1e1faefd62704bb46a69f359a9819eb0292e40df3fb6e3574715cd4 \ 
--hash=sha256:fe19d8bc5536a91a24a8133328880a41831b6c5df54599a8417b62fe015d3053 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # pyopenssl cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ --hash=sha256:230f8a8e99c81a653baa0ed00819990c0ed1f0cf0298214786b5e323461dc61a \ @@ -529,38 +569,38 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ --hash=sha256:dac0284fecb90b5731f514e569a6fcf6674a730ae95b9490781a713b60a34423 \ --hash=sha256:e7a25ef1b44ae6276b5105affc2289edb34f1aa6676babd5bcd80907348c4cfa # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # ray depyf==0.19.0 \ --hash=sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5 \ --hash=sha256:afed0916b32d141cc90fa6220df01885eda442ca43b297d5050eeb90b4a5cb44 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm dill==0.3.9 \ --hash=sha256:468dff3b89520b474c0397703366b7b95eebe6303f108adf9b19da1f702be87a \ --hash=sha256:81aa267dddf68cbfe8029c42ca9ec6a4ab3b22371d1c450abc54422577b4512c # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # depyf diskcache==5.6.3 \ --hash=sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc \ --hash=sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm distlib==0.3.7 \ --hash=sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057 \ --hash=sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # virtualenv distro==1.9.0 
\ --hash=sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed \ --hash=sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # openai dm-tree==0.1.8 \ --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ @@ -610,44 +650,44 @@ dm-tree==0.1.8 \ --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt dnspython==2.7.0 \ --hash=sha256:b4c34b7d10b51bcc3a5071e7b8dee77939f1e878477eeecc965e9835f63c6c86 \ --hash=sha256:ce9c432eda0dc91cf618a5cedf1a4e142651196bbcd2c80e89ed5a907e5cfaf1 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # email-validator einops==0.8.1 \ --hash=sha256:919387eb55330f5757c6bea9165c5ff5cfe63a642682ea788a6d472576d81737 \ --hash=sha256:de5d960a7a761225532e0f1959e5315ebeafc0cd43394732f103ca44b9837e84 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm email-validator==2.2.0 \ --hash=sha256:561977c2d73ce3611850a06fa56b414621e0c8faa9d66f2611407d87465da631 \ --hash=sha256:cb690f344c617a714f22e66ae771445a1ceb46821152df8e165c5f9a364582b7 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # fastapi farama-notifications==0.0.4 \ --hash=sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18 \ --hash=sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cu121.lock # gymnasium fastapi==0.115.12 \ --hash=sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681 \ --hash=sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # vllm fastapi-cli==0.0.5 \ --hash=sha256:d30e1239c6f46fcb95e606f02cdda59a1e2fa778a54b64686b3ff27f6211ff9f \ --hash=sha256:e94d847524648c748a5350673546bbf9bcaeb086b33c24f2e82e021436866a46 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # fastapi fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:067edb0a0805bf61e17a251d5046af59f6e9d2b8ad01222e0ef7a0b7937d5548 \ @@ -726,13 +766,13 @@ fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:fb5363cf0fddd9b50525ddbf64a1e1b28ec4c6dfb28670a940cb1cf988a6786b \ --hash=sha256:ff75c90663d6e8996610d435e71487daa853871ad1770dd83dc0f2fc4997241e # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # cupy-cuda12x filelock==3.17.0 \ --hash=sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338 \ --hash=sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # huggingface-hub # ray @@ -819,14 +859,14 @@ frozenlist==1.4.1 \ --hash=sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced \ --hash=sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # aiohttp # aiosignal -fsspec==2023.5.0 \ - 
--hash=sha256:51a4ad01a5bb66fcc58036e288c0d53d3975a0df2a5dc59a93b59bade0391f2a \ - --hash=sha256:b3b56e00fb93ea321bc9e5d9cf6f8522a0198b20eb24e02774d329e9c6fb84ce +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # huggingface-hub # torch @@ -834,96 +874,92 @@ gguf==0.16.2 \ --hash=sha256:0fc956289a30d0f1f3afd75ec0d493f73ae2629a3f21f3846dd1687d8791c7c1 \ --hash=sha256:e73eb19b30fcc7c7f32894345024dda8b1a0c959b94a12b7c40ded8dd3f96810 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm google-api-core==2.24.2 \ --hash=sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9 \ --hash=sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # opencensus google-auth==2.23.4 \ --hash=sha256:79905d6b1652187def79d491d6e23d0cbb3a21d3c7ba0dbaa9c8a01906b13ff3 \ --hash=sha256:d4bbc92fe4b8bfd2f3e8d88e5ba7085935da208ee38a134fc280e7ce682a05f2 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # google-api-core googleapis-common-protos==1.61.0 \ --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # google-api-core -grpcio==1.66.2 \ - --hash=sha256:02697eb4a5cbe5a9639f57323b4c37bcb3ab2d48cec5da3dc2f13334d72790dd \ - 
--hash=sha256:03b0b307ba26fae695e067b94cbb014e27390f8bc5ac7a3a39b7723fed085604 \ - --hash=sha256:05bc2ceadc2529ab0b227b1310d249d95d9001cd106aa4d31e8871ad3c428d73 \ - --hash=sha256:06de8ec0bd71be123eec15b0e0d457474931c2c407869b6c349bd9bed4adbac3 \ - --hash=sha256:0be4e0490c28da5377283861bed2941d1d20ec017ca397a5df4394d1c31a9b50 \ - --hash=sha256:12fda97ffae55e6526825daf25ad0fa37483685952b5d0f910d6405c87e3adb6 \ - --hash=sha256:1caa38fb22a8578ab8393da99d4b8641e3a80abc8fd52646f1ecc92bcb8dee34 \ - --hash=sha256:2018b053aa15782db2541ca01a7edb56a0bf18c77efed975392583725974b249 \ - --hash=sha256:20657d6b8cfed7db5e11b62ff7dfe2e12064ea78e93f1434d61888834bc86d75 \ - --hash=sha256:2335c58560a9e92ac58ff2bc5649952f9b37d0735608242973c7a8b94a6437d8 \ - --hash=sha256:31fd163105464797a72d901a06472860845ac157389e10f12631025b3e4d0453 \ - --hash=sha256:38b68498ff579a3b1ee8f93a05eb48dc2595795f2f62716e797dc24774c1aaa8 \ - --hash=sha256:3b00efc473b20d8bf83e0e1ae661b98951ca56111feb9b9611df8efc4fe5d55d \ - --hash=sha256:3ed71e81782966ffead60268bbda31ea3f725ebf8aa73634d5dda44f2cf3fb9c \ - --hash=sha256:45a3d462826f4868b442a6b8fdbe8b87b45eb4f5b5308168c156b21eca43f61c \ - --hash=sha256:49f0ca7ae850f59f828a723a9064cadbed90f1ece179d375966546499b8a2c9c \ - --hash=sha256:4e504572433f4e72b12394977679161d495c4c9581ba34a88d843eaf0f2fbd39 \ - --hash=sha256:4ea1d062c9230278793820146c95d038dc0f468cbdd172eec3363e42ff1c7d01 \ - --hash=sha256:563588c587b75c34b928bc428548e5b00ea38c46972181a4d8b75ba7e3f24231 \ - --hash=sha256:6001e575b8bbd89eee11960bb640b6da6ae110cf08113a075f1e2051cc596cae \ - --hash=sha256:66a0cd8ba6512b401d7ed46bb03f4ee455839957f28b8d61e7708056a806ba6a \ - --hash=sha256:6851de821249340bdb100df5eacfecfc4e6075fa85c6df7ee0eb213170ec8e5d \ - --hash=sha256:728bdf36a186e7f51da73be7f8d09457a03061be848718d0edf000e709418987 \ - --hash=sha256:73e3b425c1e155730273f73e419de3074aa5c5e936771ee0e4af0814631fb30a \ - --hash=sha256:73fc8f8b9b5c4a03e802b3cd0c18b2b06b410d3c1dcbef989fdeb943bd44aff7 \ - 
--hash=sha256:78fa51ebc2d9242c0fc5db0feecc57a9943303b46664ad89921f5079e2e4ada7 \ - --hash=sha256:7b2c86457145ce14c38e5bf6bdc19ef88e66c5fee2c3d83285c5aef026ba93b3 \ - --hash=sha256:7d69ce1f324dc2d71e40c9261d3fdbe7d4c9d60f332069ff9b2a4d8a257c7b2b \ - --hash=sha256:802d84fd3d50614170649853d121baaaa305de7b65b3e01759247e768d691ddf \ - --hash=sha256:80fd702ba7e432994df208f27514280b4b5c6843e12a48759c9255679ad38db8 \ - --hash=sha256:8ac475e8da31484efa25abb774674d837b343afb78bb3bcdef10f81a93e3d6bf \ - --hash=sha256:950da58d7d80abd0ea68757769c9db0a95b31163e53e5bb60438d263f4bed7b7 \ - --hash=sha256:99a641995a6bc4287a6315989ee591ff58507aa1cbe4c2e70d88411c4dcc0839 \ - --hash=sha256:9c3a99c519f4638e700e9e3f83952e27e2ea10873eecd7935823dab0c1c9250e \ - --hash=sha256:9c509a4f78114cbc5f0740eb3d7a74985fd2eff022971bc9bc31f8bc93e66a3b \ - --hash=sha256:a18e20d8321c6400185b4263e27982488cb5cdd62da69147087a76a24ef4e7e3 \ - --hash=sha256:a917d26e0fe980b0ac7bfcc1a3c4ad6a9a4612c911d33efb55ed7833c749b0ee \ - --hash=sha256:a9539f01cb04950fd4b5ab458e64a15f84c2acc273670072abe49a3f29bbad54 \ - --hash=sha256:ad2efdbe90c73b0434cbe64ed372e12414ad03c06262279b104a029d1889d13e \ - --hash=sha256:b672abf90a964bfde2d0ecbce30f2329a47498ba75ce6f4da35a2f4532b7acbc \ - --hash=sha256:bbd27c24a4cc5e195a7f56cfd9312e366d5d61b86e36d46bbe538457ea6eb8dd \ - --hash=sha256:c400ba5675b67025c8a9f48aa846f12a39cf0c44df5cd060e23fda5b30e9359d \ - --hash=sha256:c408f5ef75cfffa113cacd8b0c0e3611cbfd47701ca3cdc090594109b9fcbaed \ - --hash=sha256:c806852deaedee9ce8280fe98955c9103f62912a5b2d5ee7e3eaa284a6d8d8e7 \ - --hash=sha256:ce89f5876662f146d4c1f695dda29d4433a5d01c8681fbd2539afff535da14d4 \ - --hash=sha256:d25a14af966438cddf498b2e338f88d1c9706f3493b1d73b93f695c99c5f0e2a \ - --hash=sha256:d8d4732cc5052e92cea2f78b233c2e2a52998ac40cd651f40e398893ad0d06ec \ - --hash=sha256:d9a9724a156c8ec6a379869b23ba3323b7ea3600851c91489b871e375f710bc8 \ - --hash=sha256:e636ce23273683b00410f1971d209bf3689238cf5538d960adc3cdfe80dd0dbd \ - 
--hash=sha256:e88264caad6d8d00e7913996030bac8ad5f26b7411495848cc218bd3a9040b6c \ - --hash=sha256:f145cc21836c332c67baa6fc81099d1d27e266401565bf481948010d6ea32d46 \ - --hash=sha256:fb57870449dfcfac428afbb5a877829fcb0d6db9d9baa1148705739e9083880e \ - --hash=sha256:fb70487c95786e345af5e854ffec8cb8cc781bcc5df7930c4fbb7feaa72e1cdf \ - --hash=sha256:fe96281713168a3270878255983d2cb1a97e034325c8c2c25169a69289d3ecfa \ - --hash=sha256:ff1f7882e56c40b0d33c4922c15dfa30612f05fb785074a012f7cda74d1c3679 - # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + --hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + 
--hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + --hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + 
--hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt -gymnasium==1.0.0 \ - --hash=sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403 \ - --hash=sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad +gymnasium==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt h11==0.16.0 \ --hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \ --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # httpcore # uvicorn hf-transfer==0.1.9 \ @@ -953,9 +989,9 @@ hf-transfer==0.1.9 \ --hash=sha256:ee8b10afedcb75f71091bcc197c526a6ebf5c58bbbadb34fdeee6160f55f619f \ --hash=sha256:fc6bd19e1cc177c66bdef15ef8636ad3bde79d5a4f608c158021153b4573509d # via - # -c 
python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements/llm/llm-requirements.txt -hf-xet==1.1.5 \ +hf-xet==1.1.5 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:69ebbcfd9ec44fdc2af73441619eeb06b94ee34511bbcf57cd423820090f5694 \ --hash=sha256:73e167d9807d166596b4b2f0b585c6d5bd84a26dea32843665a8b58f6edba245 \ --hash=sha256:83088ecea236d5113de478acb2339f92c95b4fb0462acaa30621fac02f5a534a \ @@ -965,13 +1001,13 @@ hf-xet==1.1.5 \ --hash=sha256:f52c2fa3635b8c37c7764d8796dfa72706cc4eded19d638331161e82b0792e23 \ --hash=sha256:fc874b5c843e642f45fd85cda1ce599e123308ad2901ead23d3510a47ff506d1 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # huggingface-hub httpcore==1.0.9 \ --hash=sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55 \ --hash=sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # httpx httptools==0.6.4 \ --hash=sha256:0614154d5454c21b6410fdf5262b4a3ddb0f53f1e1721cfd59d55f32138c578a \ @@ -1018,28 +1054,27 @@ httptools==0.6.4 \ --hash=sha256:f9eb89ecf8b290f2e293325c646a211ff1c2493222798bb80a530c5e7502494f \ --hash=sha256:fc411e1c0a7dcd2f902c7c48cf079947a7e65b5485dea9decb82b9105ca71a43 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # uvicorn httpx==0.28.1 \ --hash=sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc \ --hash=sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # fastapi # openai 
huggingface-hub==0.34.3 \ --hash=sha256:5444550099e2d86e68b2898b09e85878fbd788fc2957b506c6a79ce060e39492 \ --hash=sha256:d58130fd5aa7408480681475491c0abd7e835442082fbc3ef4d45b6c39f83853 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # tokenizers # transformers - # vllm idna==3.7 \ --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # anyio # email-validator # httpx @@ -1049,25 +1084,25 @@ imageio==2.34.2 \ --hash=sha256:5c0c0ee8faa018a1c42f649b90395dd4d3bb6187c09053a0cd6f1fdd51bbff5e \ --hash=sha256:a0bb27ec9d5bab36a9f4835e51b21d2cb099e1f78451441f94687ff3404b79f8 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # scikit-image importlib-metadata==6.11.0 \ --hash=sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443 \ --hash=sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # opentelemetry-api interegular==0.3.3 \ --hash=sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c \ --hash=sha256:d9b697b21b34884711399ba0f0376914b81899ce670032486d0d048344a76600 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # lm-format-enforcer jinja2==3.1.6 \ --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \ --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # fastapi 
# memray # torch @@ -1149,19 +1184,19 @@ jiter==0.8.2 \ --hash=sha256:fc9043259ee430ecd71d178fccabd8c332a3bf1e81e50cae43cc2b28d19e4cb7 \ --hash=sha256:ffd9fee7d0775ebaba131f7ca2e2d83839a62ad65e8e02fe2bd8fc975cedeb9e # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # openai jsonref==1.1.0 \ --hash=sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552 \ --hash=sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements/llm/llm-requirements.txt jsonschema==4.23.0 \ --hash=sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4 \ --hash=sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements/llm/llm-requirements.txt # -r python/requirements.txt # mistral-common @@ -1170,19 +1205,25 @@ jsonschema-specifications==2024.10.1 \ --hash=sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272 \ --hash=sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # jsonschema +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock + # celery lark==1.2.2 \ --hash=sha256:c2276486b02f0f1b90be155f2c8ba4a8e194d42775786db622faccd652d8e80c \ --hash=sha256:ca807d0162cd16cef15a8feecb862d7319e7a09bdb13aef927968e45040fed80 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm lazy-loader==0.4 \ --hash=sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # scikit-image llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ @@ -1194,7 +1235,7 @@ llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64 --hash=sha256:e4e552eb3193b56ca3347f96c1382779e438b7dfc1d234323e202fd7c7a98d28 \ --hash=sha256:fa8ca0660df03934027b87d7e574edf1f8651493f77c0932f3f66d6effbed2b1 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm llvmlite==0.44.0 \ --hash=sha256:07667d66a5d150abed9157ab6c0b9393c9356f229784a4385c02f99e94fc94d4 \ @@ -1219,13 +1260,13 @@ llvmlite==0.44.0 \ --hash=sha256:eed7d5f29136bda63b6d7804c279e2b72e08c952b7c5df61f45db408e0ee52f3 \ --hash=sha256:f01a394e9c9b7b1d4e63c327b096d10f6f0ed149ef53d38a09b3749dcf8c9610 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # numba lm-format-enforcer==0.10.11 \ --hash=sha256:563e0dbc930a6d50fb687951506c5de098c6e962601be0ce723f3b7d0b916a1b \ --hash=sha256:8ab371924e166a1df68f243aca73a8a647bea5909f37edd6a53a694e7e7c3274 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm lz4==4.3.3 \ --hash=sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e \ @@ -1265,13 +1306,13 @@ lz4==4.3.3 \ --hash=sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0 \ 
--hash=sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt markdown-it-py==2.2.0 \ --hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # rich markupsafe==2.1.3 \ --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ @@ -1335,13 +1376,13 @@ markupsafe==2.1.3 \ --hash=sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2 \ --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # jinja2 mdurl==0.1.2 \ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # markdown-it-py memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:0a21745fb516b7a6efcd40aa7487c59e9313fcfc782d0193fcfcf00b48426874 \ @@ -1380,25 +1421,25 @@ memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:e356af93e3b031c83957e9ac1a653f5aaba5df1e357dd17142f5ed19bb3dc660 \ --hash=sha256:f16c5c8730b616613dc8bafe32649ca6bd7252606251eb00148582011758d0b5 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt meson==1.8.3 \ --hash=sha256:ef02b806ce0c5b6becd5bb5dc9fa67662320b29b337e7ace73e4354500590233 \ --hash=sha256:f118aa910fc0a137cc2dd0122232dbf82153d9a12fb5b0f5bb64896f6a157abf # via - # 
-c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements/llm/llm-requirements.txt mistral-common==1.8.3 \ --hash=sha256:0d1979d82227b625f6d71b3c828176f059da8d0f5a3307cdf53b48409a3970a4 \ --hash=sha256:846b6e4bbe016dc2e64fd3169fa704a548f6c74467e0cb18dc165b7a7669abd6 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm mpmath==1.3.0 \ --hash=sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f \ --hash=sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # sympy msgpack==1.0.7 \ --hash=sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862 \ @@ -1458,7 +1499,7 @@ msgpack==1.0.7 \ --hash=sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002 \ --hash=sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # ray msgspec==0.19.0 \ @@ -1499,7 +1540,7 @@ msgspec==0.19.0 \ --hash=sha256:f98bd8962ad549c27d63845b50af3f53ec468b6318400c9f1adfe8b092d7b62f \ --hash=sha256:fe2c4bf29bf4e89790b3117470dea2c20b59932772483082c468b990d45fb947 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm multidict==6.0.5 \ --hash=sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556 \ @@ -1593,14 +1634,14 @@ multidict==6.0.5 \ --hash=sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423 \ --hash=sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cu121.lock # aiohttp # yarl networkx==3.2.1 \ --hash=sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6 \ --hash=sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # scikit-image # torch ninja==1.11.1.3 \ @@ -1622,17 +1663,21 @@ ninja==1.11.1.3 \ --hash=sha256:bc3ebc8b2e47716149f3541742b5cd8e0b08f51013b825c05baca3e34854370d \ --hash=sha256:edfa0d2e9d7ead1635b03e40a32ad56cc8f56798b6e2e9848d8300b174897076 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements/llm/llm-requirements.txt # vllm # xgrammar -nixl==0.3.1 \ - --hash=sha256:20428ad2668062a79045fae83cc5cba1f4019d4a2c7053cc8549c3a1533f8a75 \ - --hash=sha256:70b8932b50ccf1a13ac8fa2e10a4b78290baae9f963bfecfa67684104331a94b \ - --hash=sha256:8c144839484b3076f0b34ad8ceaeaff05c23399cf57ca85f2a94b44e1475a39b \ - --hash=sha256:ff59996ad05a7e4ba6c8beba0f1d8ac2f9e53df696a15af0d3340028e2f16081 - # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt +nixl==0.4.1 \ + --hash=sha256:10c7b4a44f89c3fbff3e20cb84973be95f8df36ee336fb108275ed1839fec1f1 \ + --hash=sha256:510cc9e824ad53cac71ce55ff41160f2a9e1507ceb52eb871b775fe1e42beb87 \ + --hash=sha256:8a3d83b28c16b795bdc281f1489b9d247f6e6088ad96ca96406072a36d6354b7 \ + --hash=sha256:9381fd3986d227c7ccb2607c03bbea559ec80f951e2ea47c1fbf381e4cd97164 \ + --hash=sha256:9ab7e580e9962ebdcda8c17f8548858d3fdb648621367d8e717ca317b534b778 \ + --hash=sha256:db144821de7912cb2502052b3070a1ac276b8b019470e6efdfce9c237ffe130d \ + --hash=sha256:e33102b85b3f95a8c95e59b59b29aabd03d47b5bce619de506b9bb83739cf60d \ + --hash=sha256:f16092dd445542e82e3db3553f6c7697ec5a2e837f19d416401283ae245826f9 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r 
python/requirements/llm/llm-requirements.txt numba==0.61.2 \ --hash=sha256:34fba9406078bac7ab052efbf0d13939426c753ad72946baaa5bf9ae0ebb8dd2 \ @@ -1657,7 +1702,7 @@ numba==0.61.2 \ --hash=sha256:ea0247617edcb5dd61f6106a56255baab031acc4257bddaeddb3a1003b4ca3fd \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ @@ -1697,7 +1742,7 @@ numpy==1.26.4 \ --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # cupy-cuda12x # gguf @@ -1723,7 +1768,7 @@ nvidia-cublas-cu12==12.6.4.1 ; platform_machine == 'x86_64' and sys_platform == --hash=sha256:235f728d6e2a409eddf1df58d5b0921cf80cfa9e72b9f2775ccb7b4a87984668 \ --hash=sha256:9e4fa264f4d8a4eb0cdbd34beadc029f453b3bafae02401e999cf3d5a5af75f8 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch @@ -1734,14 +1779,14 @@ nvidia-cuda-cupti-cu12==12.6.80 ; platform_machine == 'x86_64' and sys_platform --hash=sha256:a3eff6cdfcc6a4c35db968a06fcadb061cbc7d6dde548609a941ff8701b98b73 \ --hash=sha256:bbe6ae76e83ce5251b56e8c8e61a964f757175682bbad058b170b136266ab00a # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # torch nvidia-cuda-nvrtc-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:35b0cc6ee3a9636d5409133e79273ce1f3fd087abb0532d2d2e8fff1fe9efc53 \ 
--hash=sha256:5847f1d6e5b757f1d2b3991a01082a44aad6f10ab3c5c0213fa3e25bddc25a13 \ --hash=sha256:f7007dbd914c56bd80ea31bc43e8e149da38f68158f423ba845fc3292684e45a # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # torch nvidia-cuda-runtime-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:6116fad3e049e04791c0256a9778c16237837c08b27ed8c8401e2e45de8d60cd \ @@ -1750,14 +1795,14 @@ nvidia-cuda-runtime-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platfor --hash=sha256:ba3b56a4f896141e25e19ab287cd71e52a6a0f4b29d0d31609f60e3b4d5219b7 \ --hash=sha256:d461264ecb429c84c8879a7153499ddc7b19b5f8d84c204307491989a365588e # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # torch nvidia-cudnn-cu12==9.5.1.17 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:30ac3869f6db17d170e0e556dd6cc5eee02647abc31ca856634d5a40f82c15b2 \ --hash=sha256:9fd4584468533c61873e5fda8ca41bac3a38bcb2d12350830c69b0a96a7e4def \ --hash=sha256:d7af0f8a4f3b4b9dbb3122f2ef553b45694ed9c384d5a75bab197b8eefb79ab8 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # torch nvidia-cufft-cu12==11.3.0.4 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:6048ebddfb90d09d2707efb1fd78d4e3a77cb3ae4dc60e19aab6be0ece2ae464 \ @@ -1766,13 +1811,13 @@ nvidia-cufft-cu12==11.3.0.4 ; platform_machine == 'x86_64' and sys_platform == ' --hash=sha256:ccba62eb9cef5559abd5e0d54ceed2d9934030f51163df018532142a8ec533e5 \ --hash=sha256:d16079550df460376455cba121db6564089176d9bac9e4f360493ca4741b22a6 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # torch nvidia-cufile-cu12==1.11.1.6 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ 
--hash=sha256:8f57a0051dcf2543f6dc2b98a98cb2719c37d3cee1baba8965d57f3bbc90d4db \ --hash=sha256:cc23469d1c7e52ce6c1d55253273d32c565dd22068647f3aa59b3c6b005bf159 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # torch nvidia-curand-cu12==10.3.7.77 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:6d6d935ffba0f3d439b7cd968192ff068fafd9018dbf1b85b37261b13cfc9905 \ @@ -1781,7 +1826,7 @@ nvidia-curand-cu12==10.3.7.77 ; platform_machine == 'x86_64' and sys_platform == --hash=sha256:99f1a32f1ac2bd134897fc7a203f779303261268a65762a623bf30cc9fe79117 \ --hash=sha256:a42cd1344297f70b9e39a1e4f467a4e1c10f1da54ff7a85c12197f6c652c8bdf # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # torch nvidia-cusolver-cu12==11.7.1.2 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:0ce237ef60acde1efc457335a2ddadfd7610b892d94efee7b776c64bb1cac9e0 \ @@ -1790,7 +1835,7 @@ nvidia-cusolver-cu12==11.7.1.2 ; platform_machine == 'x86_64' and sys_platform = --hash=sha256:dbbe4fc38ec1289c7e5230e16248365e375c3673c9c8bac5796e2e20db07f56e \ --hash=sha256:e9e49843a7707e42022babb9bcfa33c29857a93b88020c4e4434656a655b698c # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # torch nvidia-cusparse-cu12==12.5.4.2 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:23749a6571191a215cb74d1cdbff4a86e7b19f1200c071b3fcf844a5bea23a2f \ @@ -1799,7 +1844,7 @@ nvidia-cusparse-cu12==12.5.4.2 ; platform_machine == 'x86_64' and sys_platform = --hash=sha256:7aa32fa5470cf754f72d1116c7cbc300b4e638d3ae5304cfa4a638a5b87161b1 \ --hash=sha256:d25b62fb18751758fe3c93a4a08eff08effedfe4edf1c6bb5afd0890fe88f887 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # 
nvidia-cusolver-cu12 # torch nvidia-cusparselt-cu12==0.6.3 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ @@ -1807,20 +1852,20 @@ nvidia-cusparselt-cu12==0.6.3 ; platform_machine == 'x86_64' and sys_platform == --hash=sha256:8371549623ba601a06322af2133c4a44350575f5a3108fb75f3ef20b822ad5f1 \ --hash=sha256:e5c8a26c36445dd2e6812f1177978a24e2d37cacce7e090f297a688d1ec44f46 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # torch nvidia-nccl-cu12==2.26.2 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:5c196e95e832ad30fbbb50381eb3cbd1fadd5675e587a548563993609af19522 \ --hash=sha256:694cf3879a206553cc9d7dbda76b13efaf610fdb70a50cba303de1b0d1530ac6 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # torch nvidia-nvjitlink-cu12==12.6.85 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:cf4eaa7d4b6b543ffd69d6abfb11efdeb2db48270d94dfd3a452c24150829e41 \ --hash=sha256:e61120e52ed675747825cdd16febc6a0730537451d867ee58bee3853b1b13d1c \ --hash=sha256:eedc36df9e88b682efe4309aa16b5b4e78c2407eac59e8c10a6a47535164369a # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # nvidia-cufft-cu12 # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 @@ -1832,25 +1877,43 @@ nvidia-nvtx-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'li --hash=sha256:b90bed3df379fa79afbd21be8e04a0314336b8ae16768b58f2d34cb1d04cd7d2 \ --hash=sha256:f44f8d86bb7d5629988d61c8d3ae61dddb2015dee142740536bc7481b022fe4b # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # torch -openai==1.90.0 \ - --hash=sha256:9771982cdd5b6631af68c6a603da72ed44cd2caf73b49f717a72b71374bc565b \ - 
--hash=sha256:e5dcb5498ea6b42fec47546d10f1bcc05fb854219a7d953a5ba766718b212a02 +openai==1.100.2 \ + --hash=sha256:54d3457b2c8d7303a1bc002a058de46bdd8f37a8117751c7cf4ed4438051f151 \ + --hash=sha256:787b4c3c8a65895182c58c424f790c25c790cc9a0330e34f73d55b6ee5a00e32 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock + # vllm +openai-harmony==0.0.4 \ + --hash=sha256:038f1d6772d1be5213b36ae76e5d042022395ec35c428a73ccb8b839b2cecf6a \ + --hash=sha256:15e6d53a66502491a3675a536df30e271f976e6c5efe68250a65191efcb85c4f \ + --hash=sha256:2d8d16d84702059833fb03b841b28c25600c54e83cadccef79af44e1c81166b1 \ + --hash=sha256:31e9bcac0902a309e2fc688e52f247eec7fffcd00d17e958b9a83a8fea6519c2 \ + --hash=sha256:3586d90c899cd41f8624e7b82a48c289f6e4be56c66304ecaf3a0ba88963a73f \ + --hash=sha256:3cf2344366f10981bbc0f6d9949a0b2bb87151d209ed295943ed6ad8eda37932 \ + --hash=sha256:567cc568b6bf7b4d041b0c9aa7d6b2c9394f8af6065bc87fa6d23f207b5af9a7 \ + --hash=sha256:5c67ac6df349236fb7b64f57c3dbb0273efcdca24314daa108f2a482c427106c \ + --hash=sha256:746f751de5033b3dbcfcd4a726a4c56ce452c593ad3d54472d8597ce8d8b6d44 \ + --hash=sha256:96a63199c0d81095b5d5d1ae8ca82b64c1c13d18d4e30323ae9e8ab31bc80a3d \ + --hash=sha256:97f1fe3909733212cc6b36f0f199b1421a9c57b79ec665f0322bd604cec47340 \ + --hash=sha256:b9ee9e9ab6a237cebbe16563c787a6e83f3fcc034075c3d321dab94448426282 \ + --hash=sha256:d38f2639f6bf7c3c34a5dfd79e29075811ae2fa9b895a63e76767f74a47a971e \ + --hash=sha256:ef21a1e2384a65c62d5ec5e1cded9fe026f1d032d5c5d725110d1a8d330d8f54 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm opencensus==0.11.4 \ --hash=sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 \ --hash=sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt 
opencensus-context==0.1.3 \ --hash=sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039 \ --hash=sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # opencensus opencv-python-headless==4.11.0.86 \ --hash=sha256:0e0a27c19dd1f40ddff94976cfe43066fbbe9dfbb2ec1907d66c19caef42a57b \ @@ -1861,14 +1924,14 @@ opencv-python-headless==4.11.0.86 \ --hash=sha256:a66c1b286a9de872c343ee7c3553b084244299714ebb50fbdcd76f07ebbe6c81 \ --hash=sha256:f447d8acbb0b6f2808da71fddd29c1cdd448d2bc98f72d9bb78a7a898fc9621b # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # mistral-common # vllm opentelemetry-api==1.34.1 \ --hash=sha256:64f0bd06d42824843731d05beea88d4d4b6ae59f9fe347ff7dfa2cc14233bbb3 \ --hash=sha256:b7df4cb0830d5a6c29ad0c0691dbae874d8daefa934b8b1d642de48323d32a8c # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus # opentelemetry-sdk @@ -1877,26 +1940,26 @@ opentelemetry-exporter-prometheus==0.55b1 \ --hash=sha256:d13ec0b22bf394113ff1ada5da98133a4b051779b803dae183188e26c4bd9ee0 \ --hash=sha256:f364fbbff9e5de37a112ff104d1185fb1d7e2046c5ab5911e5afebc7ab3ddf0e # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt opentelemetry-proto==1.27.0 \ --hash=sha256:33c9345d91dafd8a74fc3d7576c5a38f18b7fdf8d02983ac67485386132aedd6 \ --hash=sha256:b133873de5581a50063e1e4b29cdcf0c5e253a8c2d8dc1229add20a4c3830ace # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt opentelemetry-sdk==1.34.1 \ 
--hash=sha256:308effad4059562f1d92163c61c8141df649da24ce361827812c40abb2a1e96e \ --hash=sha256:8091db0d763fcd6098d4781bbc80ff0971f94e260739aa6afe6fd379cdf3aa4d # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus opentelemetry-semantic-conventions==0.55b1 \ --hash=sha256:5da81dfdf7d52e3d37f8fe88d5e771e191de924cfff5f550ab0b8f7b2409baed \ --hash=sha256:ef95b1f009159c28d7a7849f5cbc71c4c34c845bb514d66adfdf1b3fff3598b3 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # opentelemetry-sdk outlines-core==0.2.10 \ --hash=sha256:0a9e4b192ca837a472a1bb1428397509f543db08e1aeeee30252525cec34093a \ @@ -1941,15 +2004,16 @@ outlines-core==0.2.10 \ --hash=sha256:f895834da0a577120dcb8d979c12c0690fe912095413bf0070a73e9ff363b7bf \ --hash=sha256:faf5b43181b1d033871364e74e9d348362c6a77b1d054d7af35e09fdfcff5b16 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm packaging==23.0 \ --hash=sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2 \ --hash=sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # huggingface-hub + # kombu # lazy-loader # lm-format-enforcer # ray @@ -1985,13 +2049,13 @@ pandas==1.5.3 \ --hash=sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae \ --hash=sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt partial-json-parser==0.2.1.1.post5 \ 
--hash=sha256:627715aaa3cb3fb60a65b0d62223243acaa6c70846520a90326fef3a2f0b61ca \ --hash=sha256:992710ac67e90b367921d52727698928040f7713ba7ecb33b96371ea7aec82ca # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm pillow==10.3.0 \ --hash=sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c \ @@ -2064,7 +2128,7 @@ pillow==10.3.0 \ --hash=sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27 \ --hash=sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # imageio # mistral-common # scikit-image @@ -2074,13 +2138,13 @@ platformdirs==3.11.0 \ --hash=sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # virtualenv prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus # prometheus-fastapi-instrumentator @@ -2089,8 +2153,14 @@ prometheus-fastapi-instrumentator==7.0.2 \ --hash=sha256:8a4d8fb13dbe19d2882ac6af9ce236e4e1f98dc48e3fa44fe88d8e23ac3c953f \ --hash=sha256:975e39992acb7a112758ff13ba95317e6c54d1bbf605f9156f31ac9f2800c32d # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm +prompt-toolkit==3.0.41 \ + --hash=sha256:941367d97fc815548822aa26c2a269fdc4eb21e9ec05fc5d447cf09bad5d75f0 \ + 
--hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock + # click-repl propcache==0.3.0 \ --hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ --hash=sha256:03acd9ff19021bd0567582ac88f821b66883e158274183b9e5586f678984f8fe \ @@ -2191,14 +2261,14 @@ propcache==0.3.0 \ --hash=sha256:fb91d20fa2d3b13deea98a690534697742029f4fb83673a3501ae6e3746508b5 \ --hash=sha256:fddb8870bdb83456a489ab67c6b3040a8d5a55069aa6f72f9d872235fbc52f54 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # aiohttp # yarl proto-plus==1.22.3 \ --hash=sha256:a49cd903bc0b6ab41f76bf65510439d56ca76f868adf0274e738bfdd096894df \ --hash=sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # google-api-core protobuf==4.25.8 \ --hash=sha256:077ff8badf2acf8bc474406706ad890466274191a48d0abd3bd6987107c9cde5 \ @@ -2213,7 +2283,7 @@ protobuf==4.25.8 \ --hash=sha256:d552c53d0415449c8d17ced5c341caba0d89dbf433698e1436c8fa0aae7808a3 \ --hash=sha256:f4510b93a3bec6eba8fd8f1093e9d7fb0d4a24d1a81377c10c0e5bbfe9e4ed24 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # google-api-core # googleapis-common-protos @@ -2240,13 +2310,13 @@ psutil==5.9.6 \ --hash=sha256:fb8a697f11b0f5994550555fcfe3e69799e5b060c8ecf9e2f75c69302cc35c0d \ --hash=sha256:ff18b8d1a784b810df0b0fff3bcb50ab941c3b8e2c8de5726f9c71c601c611aa # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm py-cpuinfo==9.0.0 \ --hash=sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690 \ 
--hash=sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm py-spy==0.4.0 ; python_full_version < '3.12' \ --hash=sha256:47cdda4c34d9b6cb01f3aaeceb2e88faf57da880207fe72ff6ff97e9bb6cc8a9 \ @@ -2258,7 +2328,7 @@ py-spy==0.4.0 ; python_full_version < '3.12' \ --hash=sha256:eee3d0bde85ca5cf4f01f012d461180ca76c24835a96f7b5c4ded64eb6a008ab \ --hash=sha256:f2cf3f7130e7d780471faa5957441d3b4e0ec39a79b2c00f4c33d494f7728428 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt pyarrow==19.0.1 \ --hash=sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466 \ @@ -2304,20 +2374,20 @@ pyarrow==19.0.1 \ --hash=sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec \ --hash=sha256:fd44d66093a239358d07c42a91eebf5015aa54fccba959db899f932218ac9cc8 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt pyasn1==0.5.1 \ --hash=sha256:4439847c58d40b1d0a573d07e3856e95333f1976294494c325775aeca506eb58 \ --hash=sha256:6d391a96e59b23130a5cfa74d6fd7f388dbbe26cc8f1edf39fdddf08d9d6676c # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # pyasn1-modules # rsa pyasn1-modules==0.3.0 \ --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # google-auth pybase64==1.4.1 \ --hash=sha256:011a54ff6ca44c5d03746aec3f1f492fce3155bd3f943fb2ceaea92416d40eeb \ @@ -2464,191 +2534,192 @@ pybase64==1.4.1 \ 
--hash=sha256:fc9504c4c2e893e0a6c1cc80bce51907e3461288289f630eab22b5735eba1104 \ --hash=sha256:ff172a4dacbd964e5edcf1c2152dae157aabf856508aed15276f46d04a22128e # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm pybind11==2.13.6 \ --hash=sha256:237c41e29157b962835d356b370ededd57594a26d5894a795960f0047cb5caf5 \ --hash=sha256:ba6af10348c12b24e92fa086b39cfba0eff619b61ac77c406167d813b096d39a # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements/llm/llm-requirements.txt pycountry==24.6.1 \ --hash=sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221 \ --hash=sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # pydantic-extra-types pycparser==2.21 \ --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # cffi -pydantic==2.10.0 \ - --hash=sha256:0aca0f045ff6e2f097f1fe89521115335f15049eeb8a7bef3dafe4b19a74e289 \ - --hash=sha256:5e7807ba9201bdf61b1b58aa6eb690916c40a47acfb114b1b4fef3e7fd5b30fc +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # compressed-tensors # fastapi # lm-format-enforcer # mistral-common # openai + # openai-harmony # pydantic-extra-types # vllm # xgrammar -pydantic-core==2.27.0 \ - 
--hash=sha256:0aa4d1b2eba9a325897308b3124014a142cdccb9f3e016f31d3ebee6b5ea5e75 \ - --hash=sha256:0d06b667e53320332be2bf6f9461f4a9b78092a079b8ce8634c9afaa7e10cd9f \ - --hash=sha256:153017e3d6cd3ce979de06d84343ca424bb6092727375eba1968c8b4693c6ecb \ - --hash=sha256:15e350efb67b855cd014c218716feea4986a149ed1f42a539edd271ee074a196 \ - --hash=sha256:185ef205256cd8b38431205698531026979db89a79587725c1e55c59101d64e9 \ - --hash=sha256:1da0c98a85a6c6ed702d5556db3b09c91f9b0b78de37b7593e2de8d03238807a \ - --hash=sha256:225bfff5d425c34e1fd562cef52d673579d59b967d9de06178850c4802af9039 \ - --hash=sha256:24f984fc7762ed5f806d9e8c4c77ea69fdb2afd987b4fd319ef06c87595a8c55 \ - --hash=sha256:25a7fd4de38f7ff99a37e18fa0098c3140286451bc823d1746ba80cec5b433a1 \ - --hash=sha256:2883b260f7a93235488699d39cbbd94fa7b175d3a8063fbfddd3e81ad9988cb2 \ - --hash=sha256:2a51ce96224eadd1845150b204389623c8e129fde5a67a84b972bd83a85c6c40 \ - --hash=sha256:2be0ad541bb9f059954ccf8877a49ed73877f862529575ff3d54bf4223e4dd61 \ - --hash=sha256:31a2cae5f059329f9cfe3d8d266d3da1543b60b60130d186d9b6a3c20a346361 \ - --hash=sha256:333c840a1303d1474f491e7be0b718226c730a39ead0f7dab2c7e6a2f3855555 \ - --hash=sha256:33d14369739c5d07e2e7102cdb0081a1fa46ed03215e07f097b34e020b83b1ae \ - --hash=sha256:35380671c3c921fe8adf31ad349dc6f7588b7e928dbe44e1093789734f607399 \ - --hash=sha256:359e7951f04ad35111b5ddce184db3391442345d0ab073aa63a95eb8af25a5ef \ - --hash=sha256:36aa167f69d8807ba7e341d67ea93e50fcaaf6bc433bb04939430fa3dab06f31 \ - --hash=sha256:395e3e1148fa7809016231f8065f30bb0dc285a97b4dc4360cd86e17bab58af7 \ - --hash=sha256:3e8d89c276234579cd3d095d5fa2a44eb10db9a218664a17b56363cddf226ff3 \ - --hash=sha256:3eb8849445c26b41c5a474061032c53e14fe92a11a5db969f722a2716cd12206 \ - --hash=sha256:3fd8bc2690e7c39eecdf9071b6a889ce7b22b72073863940edc2a0a23750ca90 \ - --hash=sha256:400bf470e4327e920883b51e255617dfe4496d4e80c3fea0b5a5d0bf2c404dd4 \ - --hash=sha256:4148dc9184ab79e356dc00a4199dc0ee8647973332cb385fc29a7cced49b9f9c \ - 
--hash=sha256:433689845288f9a1ee5714444e65957be26d30915f7745091ede4a83cfb2d7bb \ - --hash=sha256:43b61989068de9ce62296cde02beffabcadb65672207fc51e7af76dca75e6636 \ - --hash=sha256:4523c4009c3f39d948e01962223c9f5538602e7087a628479b723c939fab262d \ - --hash=sha256:483c2213a609e7db2c592bbc015da58b6c75af7360ca3c981f178110d9787bcf \ - --hash=sha256:49633583eb7dc5cba61aaf7cdb2e9e662323ad394e543ee77af265736bcd3eaa \ - --hash=sha256:4b51f964fcbb02949fc546022e56cdb16cda457af485e9a3e8b78ac2ecf5d77e \ - --hash=sha256:4bf1340ae507f6da6360b24179c2083857c8ca7644aab65807023cf35404ea8d \ - --hash=sha256:4fb49cfdb53af5041aba909be00cccfb2c0d0a2e09281bf542371c5fd36ad04c \ - --hash=sha256:510b11e9c3b1a852876d1ccd8d5903684336d635214148637ceb27366c75a467 \ - --hash=sha256:513cb14c0cc31a4dfd849a4674b20c46d87b364f997bbcb02282306f5e187abf \ - --hash=sha256:58560828ee0951bb125c6f2862fbc37f039996d19ceb6d8ff1905abf7da0bf3d \ - --hash=sha256:58ab0d979c969983cdb97374698d847a4acffb217d543e172838864636ef10d9 \ - --hash=sha256:5982048129f40b082c2654de10c0f37c67a14f5ff9d37cf35be028ae982f26df \ - --hash=sha256:5ab325fc86fbc077284c8d7f996d904d30e97904a87d6fb303dce6b3de7ebba9 \ - --hash=sha256:5cc822ab90a70ea3a91e6aed3afac570b276b1278c6909b1d384f745bd09c714 \ - --hash=sha256:5f2b19b8d6fca432cb3acf48cf5243a7bf512988029b6e6fd27e9e8c0a204d85 \ - --hash=sha256:5fc72fbfebbf42c0856a824b8b0dc2b5cd2e4a896050281a21cfa6fed8879cb1 \ - --hash=sha256:6354e18a9be37bfa124d6b288a87fb30c673745806c92956f1a25e3ae6e76b96 \ - --hash=sha256:678f66462058dd978702db17eb6a3633d634f7aa0deaea61e0a674152766d3fc \ - --hash=sha256:68950bc08f9735306322bfc16a18391fcaac99ded2509e1cc41d03ccb6013cfe \ - --hash=sha256:68ef5377eb582fa4343c9d0b57a5b094046d447b4c73dd9fbd9ffb216f829e7d \ - --hash=sha256:6b4c19525c3538fbc0bbda6229f9682fb8199ce9ac37395880e6952798e00373 \ - --hash=sha256:6bb69bf3b6500f195c3deb69c1205ba8fc3cb21d1915f1f158a10d6b1ef29b6a \ - --hash=sha256:6e19401742ed7b69e51d8e4df3c03ad5ec65a83b36244479fd70edde2828a5d9 \ - 
--hash=sha256:6f4a53af9e81d757756508b57cae1cf28293f0f31b9fa2bfcb416cc7fb230f9d \ - --hash=sha256:6fda87808429c520a002a85d6e7cdadbf58231d60e96260976c5b8f9a12a8e13 \ - --hash=sha256:78f841523729e43e3928a364ec46e2e3f80e6625a4f62aca5c345f3f626c6e8a \ - --hash=sha256:7a6ebfac28fd51890a61df36ef202adbd77d00ee5aca4a3dadb3d9ed49cfb929 \ - --hash=sha256:7b0202ebf2268954090209a84f9897345719e46a57c5f2c9b7b250ca0a9d3e63 \ - --hash=sha256:8117839a9bdbba86e7f9df57018fe3b96cec934c3940b591b0fd3fbfb485864a \ - --hash=sha256:82e1ad4ca170e8af4c928b67cff731b6296e6a0a0981b97b2eb7c275cc4e15bd \ - --hash=sha256:836a4bfe0cc6d36dc9a9cc1a7b391265bf6ce9d1eb1eac62ac5139f5d8d9a6fa \ - --hash=sha256:84af1cf7bfdcbc6fcf5a5f70cc9896205e0350306e4dd73d54b6a18894f79386 \ - --hash=sha256:84e35afd9e10b2698e6f2f32256678cb23ca6c1568d02628033a837638b3ed12 \ - --hash=sha256:884f1806609c2c66564082540cffc96868c5571c7c3cf3a783f63f2fb49bd3cd \ - --hash=sha256:8a150392102c402c538190730fda06f3bce654fc498865579a9f2c1d2b425833 \ - --hash=sha256:8e21d927469d04b39386255bf00d0feedead16f6253dcc85e9e10ddebc334084 \ - --hash=sha256:8e96ca781e0c01e32115912ebdf7b3fb0780ce748b80d7d28a0802fa9fbaf44e \ - --hash=sha256:8ee4c2a75af9fe21269a4a0898c5425afb01af1f5d276063f57e2ae1bc64e191 \ - --hash=sha256:91bc66f878557313c2a6bcf396e7befcffe5ab4354cfe4427318968af31143c3 \ - --hash=sha256:951e71da6c89d354572098bada5ba5b5dc3a9390c933af8a614e37755d3d1840 \ - --hash=sha256:99b2863c1365f43f74199c980a3d40f18a218fbe683dd64e470199db426c4d6a \ - --hash=sha256:9a8fbf506fde1529a1e3698198fe64bfbe2e0c09557bc6a7dcf872e7c01fec40 \ - --hash=sha256:9ce048deb1e033e7a865ca384770bccc11d44179cf09e5193a535c4c2f497bdc \ - --hash=sha256:9fe94d9d2a2b4edd7a4b22adcd45814b1b59b03feb00e56deb2e89747aec7bfe \ - --hash=sha256:a291d0b4243a259c8ea7e2b84eb9ccb76370e569298875a7c5e3e71baf49057a \ - --hash=sha256:a5c022bb0d453192426221605efc865373dde43b17822a264671c53b068ac20c \ - --hash=sha256:abb4785894936d7682635726613c44578c420a096729f1978cd061a7e72d5275 \ - 
--hash=sha256:b872c86d8d71827235c7077461c502feb2db3f87d9d6d5a9daa64287d75e4fa0 \ - --hash=sha256:bf37b72834e7239cf84d4a0b2c050e7f9e48bced97bad9bdf98d26b8eb72e846 \ - --hash=sha256:c0c431e4be5c1a0c6654e0c31c661cd89e0ca956ef65305c3c3fd96f4e72ca39 \ - --hash=sha256:c5726eec789ee38f2c53b10b1821457b82274f81f4f746bb1e666d8741fcfadb \ - --hash=sha256:c6fcb3fa3855d583aa57b94cf146f7781d5d5bc06cb95cb3afece33d31aac39b \ - --hash=sha256:c86679f443e7085ea55a7376462553996c688395d18ef3f0d3dbad7838f857a2 \ - --hash=sha256:c91e3c04f5191fd3fb68764bddeaf02025492d5d9f23343b283870f6ace69708 \ - --hash=sha256:c921ad596ff1a82f9c692b0758c944355abc9f0de97a4c13ca60ffc6d8dc15d4 \ - --hash=sha256:c9ed88b398ba7e3bad7bd64d66cc01dcde9cfcb7ec629a6fd78a82fa0b559d78 \ - --hash=sha256:cd2ac6b919f7fed71b17fe0b4603c092a4c9b5bae414817c9c81d3c22d1e1bcc \ - --hash=sha256:d28ca7066d6cdd347a50d8b725dc10d9a1d6a1cce09836cf071ea6a2d4908be0 \ - --hash=sha256:d29e235ce13c91902ef3efc3d883a677655b3908b1cbc73dee816e5e1f8f7739 \ - --hash=sha256:d8b5ee4ae9170e2775d495b81f414cc20268041c42571530513496ba61e94ba3 \ - --hash=sha256:db72e40628967f6dc572020d04b5f800d71264e0531c6da35097e73bdf38b003 \ - --hash=sha256:df45c4073bed486ea2f18757057953afed8dd77add7276ff01bccb79982cf46c \ - --hash=sha256:dfa5f5c0a4c8fced1422dc2ca7eefd872d5d13eb33cf324361dbf1dbfba0a9fe \ - --hash=sha256:e015833384ca3e1a0565a79f5d953b0629d9138021c27ad37c92a9fa1af7623c \ - --hash=sha256:e15315691fe2253eb447503153acef4d7223dfe7e7702f9ed66539fcd0c43801 \ - --hash=sha256:e65466b31be1070b4a5b7dbfbd14b247884cb8e8b79c64fb0f36b472912dbaea \ - --hash=sha256:e7820bb0d65e3ce1e3e70b6708c2f66143f55912fa02f4b618d0f08b61575f12 \ - --hash=sha256:e851a051f7260e6d688267eb039c81f05f23a19431bd7dfa4bf5e3cb34c108cd \ - --hash=sha256:e9f9feee7f334b72ceae46313333d002b56f325b5f04271b4ae2aadd9e993ae4 \ - --hash=sha256:eb40f828bc2f73f777d1eb8fee2e86cd9692a4518b63b6b5aa8af915dfd3207b \ - --hash=sha256:eb704155e73b833801c247f39d562229c0303f54770ca14fb1c053acb376cf10 \ - 
--hash=sha256:edb1bfd45227dec8d50bc7c7d86463cd8728bcc574f9b07de7369880de4626a3 \ - --hash=sha256:ee7d9d5537daf6d5c74a83b38a638cc001b648096c1cae8ef695b0c919d9d379 \ - --hash=sha256:f57783fbaf648205ac50ae7d646f27582fc706be3977e87c3c124e7a92407b10 \ - --hash=sha256:ff63a92f6e249514ef35bc795de10745be0226eaea06eb48b4bbeaa0c8850a4a - # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt +pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a 
\ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + --hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + 
--hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + --hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + --hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + 
--hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + --hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + --hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + 
--hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + --hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # pydantic pydantic-extra-types==2.10.5 \ --hash=sha256:1dcfa2c0cf741a422f088e0dbb4690e7bfadaaf050da3d6f80d6c3cf58a2bad8 \ --hash=sha256:b60c4e23d573a69a4f1a16dd92888ecc0ef34fb0e655b4f305530377fa70e7a8 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # mistral-common pygments==2.18.0 \ --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # rich pyopenssl==25.0.0 \ --hash=sha256:424c247065e46e76a37411b9ab1782541c23bb658bf003772c3405fbaa128e90 \ --hash=sha256:cd2cef799efa3936bb08e8ccb9433a575722b9dd986023f1cabc4ae64e9dac16 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt python-dateutil==2.8.2 \ --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cu121.lock + # celery # pandas python-dotenv==1.0.1 \ --hash=sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca \ --hash=sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # uvicorn python-json-logger==2.0.7 \ --hash=sha256:23e7ec02d34237c5aa1e29a070193a4ea87583bb4e7f8fd06d3de8264c4b2e1c \ --hash=sha256:f380b826a991ebbe3de4d897aeec42760035ac760345e57b812938dc8b35e2bd # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm python-multipart==0.0.20 \ --hash=sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104 \ --hash=sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # fastapi pytz==2022.7.1 \ --hash=sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0 \ --hash=sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # pandas pyyaml==6.0.1 \ --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ @@ -2703,7 +2774,7 @@ pyyaml==6.0.1 \ --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \ --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # gguf # huggingface-hub @@ -2802,13 +2873,13 @@ pyzmq==26.0.3 \ --hash=sha256:f6b1d1c631e5940cac5a0b22c5379c86e8df6a4ec277c7a856b714021ab6cfad \ 
--hash=sha256:f6c21c00478a7bea93caaaef9e7629145d4153b15a8653e8bb4609d4bc70dbfc # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm referencing==0.36.2 \ --hash=sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa \ --hash=sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # jsonschema # jsonschema-specifications regex==2024.11.6 \ @@ -2907,7 +2978,7 @@ regex==2024.11.6 \ --hash=sha256:fdd6028445d2460f33136c55eeb1f601ab06d74cb3347132e1c24250187500d9 \ --hash=sha256:ff590880083d60acc0433f9c3f713c51f7ac6ebb9adf889c79a261ecf541aa91 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # tiktoken # transformers # vllm @@ -2915,7 +2986,7 @@ requests==2.32.3 \ --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # google-api-core # huggingface-hub @@ -2928,7 +2999,7 @@ rich==13.3.2 \ --hash=sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001 \ --hash=sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # memray # typer @@ -3037,14 +3108,14 @@ rpds-py==0.22.3 \ --hash=sha256:fda7cb070f442bf80b642cd56483b5548e43d366fe3f39b98e67cce780cded00 \ --hash=sha256:feea821ee2a9273771bae61194004ee2fc33f8ec7db08117ef9147d4bbcbca8e # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cu121.lock # jsonschema # referencing rsa==4.7.2 \ --hash=sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2 \ --hash=sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # google-auth safetensors==0.5.2 \ --hash=sha256:03c937100f38c9ff4c1507abea9928a6a9b02c9c1c9c3609ed4fb2bf413d4975 \ @@ -3063,7 +3134,7 @@ safetensors==0.5.2 \ --hash=sha256:d3a06fae62418ec8e5c635b61a8086032c9e281f16c63c3af46a6efbab33156f \ --hash=sha256:fe55c039d97090d1f85277d402954dd6ad27f63034fa81985a9cc59655ac3ee2 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # transformers scikit-image==0.24.0 \ --hash=sha256:18836a18d3a7b6aca5376a2d805f0045826bc6c9fc85331659c33b4813e0b563 \ @@ -3088,7 +3159,7 @@ scikit-image==0.24.0 \ --hash=sha256:ef04360eda372ee5cd60aebe9be91258639c86ae2ea24093fb9182118008d009 \ --hash=sha256:fa27b3a0dbad807b966b8db2d78da734cb812ca4787f7fbb143764800ce2fa9c # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ @@ -3117,7 +3188,7 @@ scipy==1.11.4 \ --hash=sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be \ --hash=sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # scikit-image # vllm @@ -3176,34 +3247,135 @@ sentencepiece==0.2.0 \ --hash=sha256:fb89f811e5efd18bab141afc3fea3de141c3f69f3fe9e898f710ae7fe3aab251 \ --hash=sha256:ff88712338b01031910e8e61e7239aff3ce8869ee31a47df63cb38aadd591bea # via - # -c 
python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # gguf # mistral-common # vllm +setproctitle==1.3.6 \ + --hash=sha256:082413db8a96b1f021088e8ec23f0a61fec352e649aba20881895815388b66d3 \ + --hash=sha256:0dba8faee2e4a96e934797c9f0f2d093f8239bf210406a99060b3eabe549628e \ + --hash=sha256:0e6b5633c94c5111f7137f875e8f1ff48f53b991d5d5b90932f27dc8c1fa9ae4 \ + --hash=sha256:1065ed36bd03a3fd4186d6c6de5f19846650b015789f72e2dea2d77be99bdca1 \ + --hash=sha256:109fc07b1cd6cef9c245b2028e3e98e038283342b220def311d0239179810dbe \ + --hash=sha256:13624d9925bb481bc0ccfbc7f533da38bfbfe6e80652314f789abc78c2e513bd \ + --hash=sha256:156795b3db976611d09252fc80761fcdb65bb7c9b9581148da900851af25ecf4 \ + --hash=sha256:163dba68f979c61e4e2e779c4d643e968973bdae7c33c3ec4d1869f7a9ba8390 \ + --hash=sha256:17d7c833ed6545ada5ac4bb606b86a28f13a04431953d4beac29d3773aa00b1d \ + --hash=sha256:18d0667bafaaae4c1dee831e2e59841c411ff399b9b4766822ba2685d419c3be \ + --hash=sha256:1aa1935aa2195b76f377e5cb018290376b7bf085f0b53f5a95c0c21011b74367 \ + --hash=sha256:2156d55308431ac3b3ec4e5e05b1726d11a5215352d6a22bb933171dee292f8c \ + --hash=sha256:23a57d3b8f1549515c2dbe4a2880ebc1f27780dc126c5e064167563e015817f5 \ + --hash=sha256:2407955dc359d735a20ac6e797ad160feb33d529a2ac50695c11a1ec680eafab \ + --hash=sha256:2940cf13f4fc11ce69ad2ed37a9f22386bfed314b98d8aebfd4f55459aa59108 \ + --hash=sha256:2e51ec673513465663008ce402171192a053564865c2fc6dc840620871a9bd7c \ + --hash=sha256:3393859eb8f19f5804049a685bf286cb08d447e28ba5c6d8543c7bf5500d5970 \ + --hash=sha256:3884002b3a9086f3018a32ab5d4e1e8214dd70695004e27b1a45c25a6243ad0b \ + --hash=sha256:38ca045626af693da042ac35d7332e7b9dbd52e6351d6973b310612e3acee6d6 \ + --hash=sha256:391bb6a29c4fe7ccc9c30812e3744060802d89b39264cfa77f3d280d7f387ea5 \ + --hash=sha256:3cca16fd055316a48f0debfcbfb6af7cea715429fc31515ab3fcac05abd527d8 \ + --hash=sha256:3cde5b83ec4915cd5e6ae271937fd60d14113c8f7769b4a20d51769fe70d8717 \ + 
--hash=sha256:3f8194b4d631b003a1176a75d1acd545e04b1f54b821638e098a93e6e62830ef \ + --hash=sha256:3fc97805f9d74444b027babff710bf39df1541437a6a585a983d090ae00cedde \ + --hash=sha256:4431629c178193f23c538cb1de3da285a99ccc86b20ee91d81eb5f1a80e0d2ba \ + --hash=sha256:49498ebf68ca3e75321ffe634fcea5cc720502bfaa79bd6b03ded92ce0dc3c24 \ + --hash=sha256:4ac3eb04bcf0119aadc6235a2c162bae5ed5f740e3d42273a7228b915722de20 \ + --hash=sha256:4adf6a0013fe4e0844e3ba7583ec203ca518b9394c6cc0d3354df2bf31d1c034 \ + --hash=sha256:4efc91b437f6ff2578e89e3f17d010c0a0ff01736606473d082913ecaf7859ba \ + --hash=sha256:50706b9c0eda55f7de18695bfeead5f28b58aa42fd5219b3b1692d554ecbc9ec \ + --hash=sha256:5313a4e9380e46ca0e2c681ba739296f9e7c899e6f4d12a6702b2dc9fb846a31 \ + --hash=sha256:543f59601a4e32daf44741b52f9a23e0ee374f9f13b39c41d917302d98fdd7b0 \ + --hash=sha256:57bc54763bf741813a99fbde91f6be138c8706148b7b42d3752deec46545d470 \ + --hash=sha256:63cc10352dc6cf35a33951656aa660d99f25f574eb78132ce41a85001a638aa7 \ + --hash=sha256:6a1d3aa13acfe81f355b0ce4968facc7a19b0d17223a0f80c011a1dba8388f37 \ + --hash=sha256:6af330ddc2ec05a99c3933ab3cba9365357c0b8470a7f2fa054ee4b0984f57d1 \ + --hash=sha256:6d50bfcc1d1692dc55165b3dd2f0b9f8fb5b1f7b571a93e08d660ad54b9ca1a5 \ + --hash=sha256:70100e2087fe05359f249a0b5f393127b3a1819bf34dec3a3e0d4941138650c9 \ + --hash=sha256:74973aebea3543ad033b9103db30579ec2b950a466e09f9c2180089e8346e0ec \ + --hash=sha256:751ba352ed922e0af60458e961167fa7b732ac31c0ddd1476a2dfd30ab5958c5 \ + --hash=sha256:785cd210c0311d9be28a70e281a914486d62bfd44ac926fcd70cf0b4d65dff1c \ + --hash=sha256:7890e291bf4708e3b61db9069ea39b3ab0651e42923a5e1f4d78a7b9e4b18301 \ + --hash=sha256:793a23e8d9cb6c231aa3023d700008224c6ec5b8fd622d50f3c51665e3d0a190 \ + --hash=sha256:797f2846b546a8741413c57d9fb930ad5aa939d925c9c0fa6186d77580035af7 \ + --hash=sha256:7df5fcc48588f82b6cc8073db069609ddd48a49b1e9734a20d0efb32464753c4 \ + --hash=sha256:8050c01331135f77ec99d99307bfbc6519ea24d2f92964b06f3222a804a3ff1f \ + 
--hash=sha256:805bb33e92fc3d8aa05674db3068d14d36718e3f2c5c79b09807203f229bf4b5 \ + --hash=sha256:807796fe301b7ed76cf100113cc008c119daf4fea2f9f43c578002aef70c3ebf \ + --hash=sha256:81c443310831e29fabbd07b75ebbfa29d0740b56f5907c6af218482d51260431 \ + --hash=sha256:83066ffbf77a5f82b7e96e59bdccbdda203c8dccbfc3f9f0fdad3a08d0001d9c \ + --hash=sha256:8834ab7be6539f1bfadec7c8d12249bbbe6c2413b1d40ffc0ec408692232a0c6 \ + --hash=sha256:92df0e70b884f5da35f2e01489dca3c06a79962fb75636985f1e3a17aec66833 \ + --hash=sha256:9483aa336687463f5497dd37a070094f3dff55e2c888994f8440fcf426a1a844 \ + --hash=sha256:97a138fa875c6f281df7720dac742259e85518135cd0e3551aba1c628103d853 \ + --hash=sha256:9b50700785eccac0819bea794d968ed8f6055c88f29364776b7ea076ac105c5d \ + --hash=sha256:9b73cf0fe28009a04a35bb2522e4c5b5176cc148919431dcb73fdbdfaab15781 \ + --hash=sha256:9d5a369eb7ec5b2fdfa9927530b5259dd21893fa75d4e04a223332f61b84b586 \ + --hash=sha256:a094b7ce455ca341b59a0f6ce6be2e11411ba6e2860b9aa3dbb37468f23338f4 \ + --hash=sha256:a0d6252098e98129a1decb59b46920d4eca17b0395f3d71b0d327d086fefe77d \ + --hash=sha256:a1d856b0f4e4a33e31cdab5f50d0a14998f3a2d726a3fd5cb7c4d45a57b28d1b \ + --hash=sha256:a4ae2ea9afcfdd2b931ddcebf1cf82532162677e00326637b31ed5dff7d985ca \ + --hash=sha256:a5963b663da69ad25fa1559ee064584935570def665917918938c1f1289f5ebc \ + --hash=sha256:ad1c2c2baaba62823a7f348f469a967ece0062140ca39e7a48e4bbb1f20d54c4 \ + --hash=sha256:ae82507fe458f7c0c8227017f2158111a4c9e7ce94de05178894a7ea9fefc8a1 \ + --hash=sha256:af188f3305f0a65c3217c30c6d4c06891e79144076a91e8b454f14256acc7279 \ + --hash=sha256:af44bb7a1af163806bbb679eb8432fa7b4fb6d83a5d403b541b675dcd3798638 \ + --hash=sha256:b0174ca6f3018ddeaa49847f29b69612e590534c1d2186d54ab25161ecc42975 \ + --hash=sha256:b2b17855ed7f994f3f259cf2dfbfad78814538536fa1a91b50253d84d87fd88d \ + --hash=sha256:b2e54f4a2dc6edf0f5ea5b1d0a608d2af3dcb5aa8c8eeab9c8841b23e1b054fe \ + --hash=sha256:b6f4abde9a2946f57e8daaf1160b2351bcf64274ef539e6675c1d945dbd75e2a \ + 
--hash=sha256:b70c07409d465f3a8b34d52f863871fb8a00755370791d2bd1d4f82b3cdaf3d5 \ + --hash=sha256:bb465dd5825356c1191a038a86ee1b8166e3562d6e8add95eec04ab484cfb8a2 \ + --hash=sha256:c051f46ed1e13ba8214b334cbf21902102807582fbfaf0fef341b9e52f0fafbf \ + --hash=sha256:c1b20a5f4164cec7007be55c9cf18d2cd08ed7c3bf6769b3cd6d044ad888d74b \ + --hash=sha256:c86e9e82bfab579327dbe9b82c71475165fbc8b2134d24f9a3b2edaf200a5c3d \ + --hash=sha256:c9f32b96c700bb384f33f7cf07954bb609d35dd82752cef57fb2ee0968409169 \ + --hash=sha256:cce0ed8b3f64c71c140f0ec244e5fdf8ecf78ddf8d2e591d4a8b6aa1c1214235 \ + --hash=sha256:cdd7315314b0744a7dd506f3bd0f2cf90734181529cdcf75542ee35ad885cab7 \ + --hash=sha256:cf355fbf0d4275d86f9f57be705d8e5eaa7f8ddb12b24ced2ea6cbd68fdb14dc \ + --hash=sha256:d136fbf8ad4321716e44d6d6b3d8dffb4872626010884e07a1db54b7450836cf \ + --hash=sha256:d2c8e20487b3b73c1fa72c56f5c89430617296cd380373e7af3a538a82d4cd6d \ + --hash=sha256:d483cc23cc56ab32911ea0baa0d2d9ea7aa065987f47de847a0a93a58bf57905 \ + --hash=sha256:d5a6c4864bb6fa9fcf7b57a830d21aed69fd71742a5ebcdbafda476be673d212 \ + --hash=sha256:d714e002dd3638170fe7376dc1b686dbac9cb712cde3f7224440af722cc9866a \ + --hash=sha256:d73f14b86d0e2858ece6bf5807c9889670e392c001d414b4293d0d9b291942c3 \ + --hash=sha256:d88c63bd395c787b0aa81d8bbc22c1809f311032ce3e823a6517b711129818e4 \ + --hash=sha256:db608db98ccc21248370d30044a60843b3f0f3d34781ceeea67067c508cd5a28 \ + --hash=sha256:de004939fc3fd0c1200d26ea9264350bfe501ffbf46c8cf5dc7f345f2d87a7f1 \ + --hash=sha256:ded9e86397267732a0641d4776c7c663ea16b64d7dbc4d9cc6ad8536363a2d29 \ + --hash=sha256:e288f8a162d663916060beb5e8165a8551312b08efee9cf68302687471a6545d \ + --hash=sha256:e2a9e62647dc040a76d55563580bf3bb8fe1f5b6ead08447c2ed0d7786e5e794 \ + --hash=sha256:e3e44d08b61de0dd6f205528498f834a51a5c06689f8fb182fe26f3a3ce7dca9 \ + --hash=sha256:ea002088d5554fd75e619742cefc78b84a212ba21632e59931b3501f0cfc8f67 \ + --hash=sha256:eb7452849f6615871eabed6560ffedfe56bc8af31a823b6be4ce1e6ff0ab72c5 \ + 
--hash=sha256:ebcf34b69df4ca0eabaaaf4a3d890f637f355fed00ba806f7ebdd2d040658c26 \ + --hash=sha256:f24d5b9383318cbd1a5cd969377937d66cf0542f24aa728a4f49d9f98f9c0da8 \ + --hash=sha256:f33fbf96b52d51c23b6cff61f57816539c1c147db270cfc1cc3bc012f4a560a9 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock + # vllm shellingham==1.5.4 \ --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \ --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # typer six==1.16.0 \ --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # opencensus # python-dateutil smart-open==6.2.0 \ --hash=sha256:088bf00f9327c71e549bc2f86567d3320df5d89667f009ce1c16568976068ef7 \ --hash=sha256:1b4df5c8365218f3852c507451920ccad606c80b0acb4e67508e50ba9b5d2632 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt sniffio==1.3.1 \ --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # anyio # openai soundfile==0.13.1 \ @@ -3216,7 +3388,7 @@ soundfile==0.13.1 \ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # mistral-common 
soxr==0.5.0.post1 \ --hash=sha256:39e0f791ba178d69cd676485dbee37e75a34f20daa478d90341ecb7f6d9d690f \ @@ -3241,13 +3413,13 @@ soxr==0.5.0.post1 \ --hash=sha256:fcc049b0a151a65aa75b92f0ac64bb2dba785d16b78c31c2b94e68c141751d6d \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # mistral-common starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # fastapi # prometheus-fastapi-instrumentator @@ -3255,19 +3427,19 @@ sympy==1.14.0 \ --hash=sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517 \ --hash=sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # torch tensorboardx==2.6.2.2 \ --hash=sha256:160025acbf759ede23fd3526ae9d9bfbfd8b68eb16c38a010ebe326dc6395db8 \ --hash=sha256:c6476d7cd0d529b0b72f4acadb1269f9ed8b22f441e87a84f2a3b940bb87b666 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # scikit-image tiktoken==0.9.0 \ --hash=sha256:03935988a91d6d3216e2ec7c645afbb3d870b37bcb67ada1943ec48678e7ee33 \ @@ -3302,7 +3474,7 @@ tiktoken==0.9.0 \ 
--hash=sha256:f0968d5beeafbca2a72c595e8385a1a1f8af58feaebb02b227229b69ca5357fd \ --hash=sha256:f32cc56168eac4851109e9b5d327637f15fd662aa30dd79f964b7c39fbadd26e # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # mistral-common # vllm tokenizers==0.21.1 \ @@ -3322,7 +3494,7 @@ tokenizers==0.21.1 \ --hash=sha256:e78e413e9e668ad790a29456e677d9d3aa50a9ad311a40905d6861ba7692cf41 \ --hash=sha256:ed248ab5279e601a30a4d67bdb897ecbe955a50f1e7bb62bd99f07dd11c2f5b6 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # transformers # vllm torch==2.7.1 \ @@ -3351,7 +3523,7 @@ torch==2.7.1 \ --hash=sha256:e0d81e9a12764b6f3879a866607c8ae93113cbcad57ce01ebde63eb48a576369 \ --hash=sha256:fe955951bdf32d182ee8ead6c3186ad54781492bf03d547d31771a01b3d6fb7d # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # compressed-tensors # nixl # torchaudio @@ -3385,7 +3557,7 @@ torchaudio==2.7.1 \ --hash=sha256:edb4deaa6f95acd5522912ed643303d0b86d79a6f15914362f5a5d49baaf5d13 \ --hash=sha256:f8bd69354a397753b9dea9699d9e1251f8496fbbdf3028c7086a57a615bf33c3 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm torchvision==0.22.1 \ --hash=sha256:043d9e35ed69c2e586aff6eb9e2887382e7863707115668ac9d140da58f42cba \ @@ -3413,38 +3585,37 @@ torchvision==0.22.1 \ --hash=sha256:ef46e065502f7300ad6abc98554131c35dc4c837b978d91306658f1a65c00baa \ --hash=sha256:ef7dee376f42900c0e7b0e34624f391d9ece70ab90ee74b42de0c1fffe371284 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm tqdm==4.67.1 \ --hash=sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2 \ 
--hash=sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # gguf # huggingface-hub # openai # transformers # vllm -transformers==4.53.2 \ - --hash=sha256:6c3ed95edfb1cba71c4245758f1b4878c93bf8cde77d076307dacb2cbbd72be2 \ - --hash=sha256:db8f4819bb34f000029c73c3c557e7d06fc1b8e612ec142eecdae3947a9c78bf +transformers==4.55.2 \ + --hash=sha256:097e3c2e2c0c9681db3da9d748d8f9d6a724c644514673d0030e8c5a1109f1f1 \ + --hash=sha256:a45ec60c03474fd67adbce5c434685051b7608b3f4f167c25aa6aeb1cad16d4f # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt - # -r python/requirements/llm/llm-requirements.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # compressed-tensors # vllm # xgrammar triton==3.3.1 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:b31e3aa26f8cb3cc5bf4e187bf737cbacf17311e1112b781d4a059353dfd731b # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # torch # xgrammar typer==0.12.3 \ --hash=sha256:070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914 \ --hash=sha256:49e73131481d804288ef62598d97a1ceef3058905aa536a1134f90891ba35482 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements/llm/llm-requirements.txt # -r python/requirements.txt # fastapi-cli @@ -3452,7 +3623,7 @@ typing-extensions==4.12.2 \ --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # fastapi # gymnasium # huggingface-hub @@ -3468,18 +3639,31 @@ typing-extensions==4.12.2 \ # referencing # torch # 
typer + # typing-inspection # vllm +typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock + # pydantic +tzdata==2025.2 \ + --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock + # kombu urllib3==1.26.19 \ --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # requests uvicorn==0.22.0 \ --hash=sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8 \ --hash=sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # fastapi # fastapi-cli @@ -3522,19 +3706,27 @@ uvloop==0.21.0 ; platform_python_implementation != 'PyPy' and sys_platform != 'c --hash=sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816 \ --hash=sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # uvicorn +vine==5.1.0 \ + --hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock + # amqp + # celery + # kombu virtualenv==20.29.1 \ 
--hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ --hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt -vllm==0.10.0 \ - --hash=sha256:8ca37559d82b43b5e8c8248d2e4a1ecb51d6d4e5d517491d656df6491ed93dab \ - --hash=sha256:a44e9013db26082a82c3931ed8772ac884d6d60566d36ecdb0e8dc01c65b241a +vllm==0.10.1.1 \ + --hash=sha256:3099824ee4bdaa14c4c4f7178a092101a0ec206d4c9371edf295849b2b730a39 \ + --hash=sha256:8ca0dd985e1ceac8540e7719c654f1553b3ba8a43c685ac8d3fa1366ffb6443a # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements/llm/llm-requirements.txt watchfiles==0.19.0 \ --hash=sha256:0089c6dc24d436b373c3c57657bf4f9a453b13767150d17284fc6162b2791911 \ @@ -3560,10 +3752,16 @@ watchfiles==0.19.0 \ --hash=sha256:d9b073073e048081e502b6c6b0b88714c026a1a4c890569238d04aca5f9ca74b \ --hash=sha256:fac19dc9cbc34052394dbe81e149411a62e71999c0a19e1e09ce537867f95ae0 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # -r python/requirements.txt # uvicorn # vllm +wcwidth==0.2.13 \ + --hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \ + --hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock + # prompt-toolkit websockets==15.0 \ --hash=sha256:0e389efe46ccb25a1f93d08c7a74e8123a2517f7b7458f043bd7529d1a63ffeb \ --hash=sha256:0f2205cdb444a42a7919690238fb5979a05439b9dbb73dd47c863d39640d85ab \ @@ -3635,14 +3833,14 @@ websockets==15.0 \ --hash=sha256:ffc02b159b65c05f2ed9ec176b715b66918a674bd4daed48a9a7a590dd4be1aa \ --hash=sha256:ffc5ae23ada6515f31604f700009e2df90b091b67d463a8401c1d8a37f76c1d7 # via - # -c 
python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # uvicorn xformers==0.0.31 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:23331bdb9831ba0df96f55258537ca0df7ad888efc75cea97a0de79b5e2291c4 \ --hash=sha256:3fccb159c6327c13fc1b08f8b963c2779ca526e2e50755dee9bcc1bac67d20c6 \ --hash=sha256:50aedaea82a38d7d28631f77617d1ed1f6f37c60bdc4bf167a69cbc0e39cee76 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm xgrammar==0.1.21 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:140628376fc701a535600dc64752603ddaed619461dc50669e90626e9f61b8aa \ @@ -3670,7 +3868,7 @@ xgrammar==0.1.21 ; platform_machine == 'aarch64' or platform_machine == 'arm64' --hash=sha256:f89d9ddb4d00fadcffa4bcabd0c3ae75d47c844c728bbb6be695056df3767524 \ --hash=sha256:f9247641c73eec6e972cec15156a8844957334204ba79ad1abdb0d7b03def8a1 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # vllm yarl==1.18.3 \ --hash=sha256:00e5a1fea0fd4f5bfa7440a47eff01d9822a65b4488f7cff83155a0f31a2ecba \ @@ -3756,15 +3954,15 @@ yarl==1.18.3 \ --hash=sha256:fe57328fbc1bfd0bd0514470ac692630f3901c0ee39052ae47acd1d90a436719 \ --hash=sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62 # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # aiohttp zipp==3.19.2 \ --hash=sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19 \ --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c # via - # -c python/requirements_compiled_rayllm_test_py311_cu121.txt + # -c python/deplocks/llm/rayllm_test_py311_cu121.lock # importlib-metadata # The following packages were excluded from the output: -# ray # setuptools +# 
ray diff --git a/python/requirements_compiled_rayllm_py311_cu128.txt b/python/deplocks/llm/rayllm_py311_cu128.lock similarity index 85% rename from python/requirements_compiled_rayllm_py311_cu128.txt rename to python/deplocks/llm/rayllm_py311_cu128.lock index 0435a7e31115..1af647c3d386 100644 --- a/python/requirements_compiled_rayllm_py311_cu128.txt +++ b/python/deplocks/llm/rayllm_py311_cu128.lock @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile --generate-hashes --strip-extras --unsafe-package ray --unsafe-package grpcio-tools --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cu128 --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links -c python/requirements_compiled_rayllm_test_py311_cu128.txt python/requirements.txt python/requirements/llm/llm-requirements.txt -o python/requirements_compiled_rayllm_py311_cu128.txt +# uv pip compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --extra-index-url https://download.pytorch.org/whl/cu128 --python-version=3.11 --unsafe-package ray --python-platform=linux -c python/deplocks/llm/rayllm_test_py311_cu128.lock python/requirements.txt python/requirements/llm/llm-requirements.txt -o python/deplocks/llm/rayllm_py311_cu128.lock --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cu128 @@ -7,7 +7,7 @@ aiohappyeyeballs==2.6.1 \ --hash=sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558 \ --hash=sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # aiohttp aiohttp==3.11.16 \ 
--hash=sha256:004511d3413737700835e949433536a2fe95a7d0297edd911a1e9705c5b5ea43 \ @@ -92,7 +92,7 @@ aiohttp==3.11.16 \ --hash=sha256:fb46bb0f24813e6cede6cc07b1961d4b04f331f7112a23b5e21f567da4ee50aa \ --hash=sha256:fd36c119c5d6551bce374fcb5c19269638f8d09862445f85a5a48596fd59f4bb # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # aiohttp-cors # vllm @@ -100,31 +100,37 @@ aiohttp-cors==0.7.0 \ --hash=sha256:0451ba59fdf6909d0e2cd21e4c0a43752bc0703d33fc78ae94d9d9321710193e \ --hash=sha256:4d39c6d7100fd9764ed1caf8cebf0eb01bf5e3f24e2e073fda6234bc48b19f5d # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt aiorwlock==1.3.0 \ --hash=sha256:45baf8e4fa9a23e0bb325fbd67da80de1fd7ae1d4f59a6381754c60cec7b289b \ --hash=sha256:83f12d87df4b9728a0b8fda1756585ab0d652b107bab59c6084e1b1ad692ab45 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt aiosignal==1.3.1 \ --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ --hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # aiohttp +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # kombu annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cu128.lock # pydantic anyio==3.7.1 \ --hash=sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780 \ --hash=sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # httpx # openai # starlette @@ -133,16 +139,22 @@ astor==0.8.1 \ --hash=sha256:070a54e890cefb5b3739d19f30f5a5ec840ffc9c50ffa7d23cc9fc1a38ebbfc5 \ --hash=sha256:6a6effda93f4e1ce9f618779b2dd1d9d84f1e32812c23a29b3fff6fd7f63fa5e # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # depyf attrs==25.1.0 \ --hash=sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e \ --hash=sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # aiohttp # jsonschema # referencing +billiard==4.2.1 \ + --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # celery blake3==1.0.5 \ --hash=sha256:03638a6dc8546365c3576fdb293fb2c53b898ac80525b5742d9cf00b4f44dea5 \ --hash=sha256:043a226cebfedff7b51ab9c87d4476c06d2cd10776855eaa9c619f2272b3c32e \ @@ -230,13 +242,13 @@ blake3==1.0.5 \ --hash=sha256:fe333852c5bbafd7735d36da2d60d44a022247bd180f2c43facb2585134c1792 \ --hash=sha256:feb0d1558d720a476f888566ddf2faf91d9147ada9261f3ccf11400ca3798661 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm cachetools==5.5.2 \ --hash=sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4 \ 
--hash=sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # google-auth # vllm cbor2==5.6.5 \ @@ -285,13 +297,19 @@ cbor2==5.6.5 \ --hash=sha256:fde21ac1cf29336a31615a2c469a9cb03cf0add3ae480672d4d38cda467d07fc \ --hash=sha256:fe11c2eb518c882cfbeed456e7a552e544893c17db66fe5d3230dbeaca6b615c # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm +celery==5.5.3 \ + --hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # -r python/requirements.txt certifi==2025.1.31 \ --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # httpcore # httpx # requests @@ -349,7 +367,7 @@ cffi==1.16.0 \ --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # cryptography # soundfile charset-normalizer==3.3.2 \ @@ -444,35 +462,57 @@ charset-normalizer==3.3.2 \ --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # requests click==8.1.7 \ 
--hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ --hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt + # celery + # click-didyoumean + # click-plugins + # click-repl # ray # typer # uvicorn +click-didyoumean==0.3.1 \ + --hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # celery +click-plugins==1.1.1.2 \ + --hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # celery cloudpickle==2.2.0 \ --hash=sha256:3f4219469c55453cfe4737e564b67c2a149109dabf7f242478948b895f61106f \ --hash=sha256:7428798d5926d8fcbfd092d18d01a2a03daf8237d8fcdc8095d256b8490796f0 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # gymnasium # vllm colorful==0.5.5 \ --hash=sha256:62c187e27c1433db9463ff93b1451898d1e7e23a7e553583fd9daeb6325182e4 \ --hash=sha256:66f8c1264b2a26f7293b96a03bb7a76c4bc8b9634369a0bffdcd12d618056a1d # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt compressed-tensors==0.10.2 \ --hash=sha256:6de13ac535d7ffdd8890fad3d229444c33076170acaa8fab6bab8ecfa96c1d8f \ 
--hash=sha256:e1b4d9bc2006e3fd3a938e59085f318fdb280c5af64688a4792bf1bc263e579d # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm cryptography==44.0.3 \ --hash=sha256:02f55fb4f8b79c1221b0961488eaae21015b69b210e18c386b69de182ebb1259 \ @@ -513,7 +553,7 @@ cryptography==44.0.3 \ --hash=sha256:fc3c9babc1e1faefd62704bb46a69f359a9819eb0292e40df3fb6e3574715cd4 \ --hash=sha256:fe19d8bc5536a91a24a8133328880a41831b6c5df54599a8417b62fe015d3053 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # pyopenssl cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ --hash=sha256:230f8a8e99c81a653baa0ed00819990c0ed1f0cf0298214786b5e323461dc61a \ @@ -529,38 +569,38 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ --hash=sha256:dac0284fecb90b5731f514e569a6fcf6674a730ae95b9490781a713b60a34423 \ --hash=sha256:e7a25ef1b44ae6276b5105affc2289edb34f1aa6676babd5bcd80907348c4cfa # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # ray depyf==0.19.0 \ --hash=sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5 \ --hash=sha256:afed0916b32d141cc90fa6220df01885eda442ca43b297d5050eeb90b4a5cb44 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm dill==0.4.0 \ --hash=sha256:0633f1d2df477324f53a895b02c901fb961bdbf65a17122586ea7019292cbcf0 \ --hash=sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # depyf diskcache==5.6.3 \ --hash=sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc \ --hash=sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19 # via - # -c 
python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm distlib==0.3.7 \ --hash=sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057 \ --hash=sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # virtualenv distro==1.9.0 \ --hash=sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed \ --hash=sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # openai dm-tree==0.1.8 \ --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ @@ -610,44 +650,44 @@ dm-tree==0.1.8 \ --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt dnspython==2.7.0 \ --hash=sha256:b4c34b7d10b51bcc3a5071e7b8dee77939f1e878477eeecc965e9835f63c6c86 \ --hash=sha256:ce9c432eda0dc91cf618a5cedf1a4e142651196bbcd2c80e89ed5a907e5cfaf1 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # email-validator einops==0.8.1 \ --hash=sha256:919387eb55330f5757c6bea9165c5ff5cfe63a642682ea788a6d472576d81737 \ --hash=sha256:de5d960a7a761225532e0f1959e5315ebeafc0cd43394732f103ca44b9837e84 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm email-validator==2.2.0 \ --hash=sha256:561977c2d73ce3611850a06fa56b414621e0c8faa9d66f2611407d87465da631 \ 
--hash=sha256:cb690f344c617a714f22e66ae771445a1ceb46821152df8e165c5f9a364582b7 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # fastapi farama-notifications==0.0.4 \ --hash=sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18 \ --hash=sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # gymnasium fastapi==0.115.12 \ --hash=sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681 \ --hash=sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # vllm fastapi-cli==0.0.5 \ --hash=sha256:d30e1239c6f46fcb95e606f02cdda59a1e2fa778a54b64686b3ff27f6211ff9f \ --hash=sha256:e94d847524648c748a5350673546bbf9bcaeb086b33c24f2e82e021436866a46 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # fastapi fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:067edb0a0805bf61e17a251d5046af59f6e9d2b8ad01222e0ef7a0b7937d5548 \ @@ -726,13 +766,13 @@ fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:fb5363cf0fddd9b50525ddbf64a1e1b28ec4c6dfb28670a940cb1cf988a6786b \ --hash=sha256:ff75c90663d6e8996610d435e71487daa853871ad1770dd83dc0f2fc4997241e # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # cupy-cuda12x filelock==3.17.0 \ --hash=sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338 \ --hash=sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # huggingface-hub # ray @@ -819,14 +859,14 @@ frozenlist==1.4.1 \ --hash=sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced \ --hash=sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # aiohttp # aiosignal -fsspec==2023.5.0 \ - --hash=sha256:51a4ad01a5bb66fcc58036e288c0d53d3975a0df2a5dc59a93b59bade0391f2a \ - --hash=sha256:b3b56e00fb93ea321bc9e5d9cf6f8522a0198b20eb24e02774d329e9c6fb84ce +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # huggingface-hub # torch @@ -834,96 +874,92 @@ gguf==0.17.0 \ --hash=sha256:52f2759c6e0ab3d228d4d44f871e3eb140004712c31aed72e2ae82f61aa5aa05 \ --hash=sha256:e3f88278e6f6778e0348fbc97313a4a2f8af63b08fe25dc381251d9c611dae03 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm google-api-core==2.24.2 \ --hash=sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9 \ --hash=sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # opencensus google-auth==2.23.4 \ --hash=sha256:79905d6b1652187def79d491d6e23d0cbb3a21d3c7ba0dbaa9c8a01906b13ff3 \ --hash=sha256:d4bbc92fe4b8bfd2f3e8d88e5ba7085935da208ee38a134fc280e7ce682a05f2 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # google-api-core 
googleapis-common-protos==1.61.0 \ --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # google-api-core -grpcio==1.66.2 \ - --hash=sha256:02697eb4a5cbe5a9639f57323b4c37bcb3ab2d48cec5da3dc2f13334d72790dd \ - --hash=sha256:03b0b307ba26fae695e067b94cbb014e27390f8bc5ac7a3a39b7723fed085604 \ - --hash=sha256:05bc2ceadc2529ab0b227b1310d249d95d9001cd106aa4d31e8871ad3c428d73 \ - --hash=sha256:06de8ec0bd71be123eec15b0e0d457474931c2c407869b6c349bd9bed4adbac3 \ - --hash=sha256:0be4e0490c28da5377283861bed2941d1d20ec017ca397a5df4394d1c31a9b50 \ - --hash=sha256:12fda97ffae55e6526825daf25ad0fa37483685952b5d0f910d6405c87e3adb6 \ - --hash=sha256:1caa38fb22a8578ab8393da99d4b8641e3a80abc8fd52646f1ecc92bcb8dee34 \ - --hash=sha256:2018b053aa15782db2541ca01a7edb56a0bf18c77efed975392583725974b249 \ - --hash=sha256:20657d6b8cfed7db5e11b62ff7dfe2e12064ea78e93f1434d61888834bc86d75 \ - --hash=sha256:2335c58560a9e92ac58ff2bc5649952f9b37d0735608242973c7a8b94a6437d8 \ - --hash=sha256:31fd163105464797a72d901a06472860845ac157389e10f12631025b3e4d0453 \ - --hash=sha256:38b68498ff579a3b1ee8f93a05eb48dc2595795f2f62716e797dc24774c1aaa8 \ - --hash=sha256:3b00efc473b20d8bf83e0e1ae661b98951ca56111feb9b9611df8efc4fe5d55d \ - --hash=sha256:3ed71e81782966ffead60268bbda31ea3f725ebf8aa73634d5dda44f2cf3fb9c \ - --hash=sha256:45a3d462826f4868b442a6b8fdbe8b87b45eb4f5b5308168c156b21eca43f61c \ - --hash=sha256:49f0ca7ae850f59f828a723a9064cadbed90f1ece179d375966546499b8a2c9c \ - --hash=sha256:4e504572433f4e72b12394977679161d495c4c9581ba34a88d843eaf0f2fbd39 \ - --hash=sha256:4ea1d062c9230278793820146c95d038dc0f468cbdd172eec3363e42ff1c7d01 \ - --hash=sha256:563588c587b75c34b928bc428548e5b00ea38c46972181a4d8b75ba7e3f24231 \ - 
--hash=sha256:6001e575b8bbd89eee11960bb640b6da6ae110cf08113a075f1e2051cc596cae \ - --hash=sha256:66a0cd8ba6512b401d7ed46bb03f4ee455839957f28b8d61e7708056a806ba6a \ - --hash=sha256:6851de821249340bdb100df5eacfecfc4e6075fa85c6df7ee0eb213170ec8e5d \ - --hash=sha256:728bdf36a186e7f51da73be7f8d09457a03061be848718d0edf000e709418987 \ - --hash=sha256:73e3b425c1e155730273f73e419de3074aa5c5e936771ee0e4af0814631fb30a \ - --hash=sha256:73fc8f8b9b5c4a03e802b3cd0c18b2b06b410d3c1dcbef989fdeb943bd44aff7 \ - --hash=sha256:78fa51ebc2d9242c0fc5db0feecc57a9943303b46664ad89921f5079e2e4ada7 \ - --hash=sha256:7b2c86457145ce14c38e5bf6bdc19ef88e66c5fee2c3d83285c5aef026ba93b3 \ - --hash=sha256:7d69ce1f324dc2d71e40c9261d3fdbe7d4c9d60f332069ff9b2a4d8a257c7b2b \ - --hash=sha256:802d84fd3d50614170649853d121baaaa305de7b65b3e01759247e768d691ddf \ - --hash=sha256:80fd702ba7e432994df208f27514280b4b5c6843e12a48759c9255679ad38db8 \ - --hash=sha256:8ac475e8da31484efa25abb774674d837b343afb78bb3bcdef10f81a93e3d6bf \ - --hash=sha256:950da58d7d80abd0ea68757769c9db0a95b31163e53e5bb60438d263f4bed7b7 \ - --hash=sha256:99a641995a6bc4287a6315989ee591ff58507aa1cbe4c2e70d88411c4dcc0839 \ - --hash=sha256:9c3a99c519f4638e700e9e3f83952e27e2ea10873eecd7935823dab0c1c9250e \ - --hash=sha256:9c509a4f78114cbc5f0740eb3d7a74985fd2eff022971bc9bc31f8bc93e66a3b \ - --hash=sha256:a18e20d8321c6400185b4263e27982488cb5cdd62da69147087a76a24ef4e7e3 \ - --hash=sha256:a917d26e0fe980b0ac7bfcc1a3c4ad6a9a4612c911d33efb55ed7833c749b0ee \ - --hash=sha256:a9539f01cb04950fd4b5ab458e64a15f84c2acc273670072abe49a3f29bbad54 \ - --hash=sha256:ad2efdbe90c73b0434cbe64ed372e12414ad03c06262279b104a029d1889d13e \ - --hash=sha256:b672abf90a964bfde2d0ecbce30f2329a47498ba75ce6f4da35a2f4532b7acbc \ - --hash=sha256:bbd27c24a4cc5e195a7f56cfd9312e366d5d61b86e36d46bbe538457ea6eb8dd \ - --hash=sha256:c400ba5675b67025c8a9f48aa846f12a39cf0c44df5cd060e23fda5b30e9359d \ - --hash=sha256:c408f5ef75cfffa113cacd8b0c0e3611cbfd47701ca3cdc090594109b9fcbaed \ - 
--hash=sha256:c806852deaedee9ce8280fe98955c9103f62912a5b2d5ee7e3eaa284a6d8d8e7 \ - --hash=sha256:ce89f5876662f146d4c1f695dda29d4433a5d01c8681fbd2539afff535da14d4 \ - --hash=sha256:d25a14af966438cddf498b2e338f88d1c9706f3493b1d73b93f695c99c5f0e2a \ - --hash=sha256:d8d4732cc5052e92cea2f78b233c2e2a52998ac40cd651f40e398893ad0d06ec \ - --hash=sha256:d9a9724a156c8ec6a379869b23ba3323b7ea3600851c91489b871e375f710bc8 \ - --hash=sha256:e636ce23273683b00410f1971d209bf3689238cf5538d960adc3cdfe80dd0dbd \ - --hash=sha256:e88264caad6d8d00e7913996030bac8ad5f26b7411495848cc218bd3a9040b6c \ - --hash=sha256:f145cc21836c332c67baa6fc81099d1d27e266401565bf481948010d6ea32d46 \ - --hash=sha256:fb57870449dfcfac428afbb5a877829fcb0d6db9d9baa1148705739e9083880e \ - --hash=sha256:fb70487c95786e345af5e854ffec8cb8cc781bcc5df7930c4fbb7feaa72e1cdf \ - --hash=sha256:fe96281713168a3270878255983d2cb1a97e034325c8c2c25169a69289d3ecfa \ - --hash=sha256:ff1f7882e56c40b0d33c4922c15dfa30612f05fb785074a012f7cda74d1c3679 - # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + --hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + 
--hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + --hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + 
--hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt -gymnasium==1.0.0 \ - --hash=sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403 \ - --hash=sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad +gymnasium==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt h11==0.16.0 \ 
--hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \ --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # httpcore # uvicorn hf-transfer==0.1.9 \ @@ -953,9 +989,9 @@ hf-transfer==0.1.9 \ --hash=sha256:ee8b10afedcb75f71091bcc197c526a6ebf5c58bbbadb34fdeee6160f55f619f \ --hash=sha256:fc6bd19e1cc177c66bdef15ef8636ad3bde79d5a4f608c158021153b4573509d # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements/llm/llm-requirements.txt -hf-xet==1.1.3 \ +hf-xet==1.1.3 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:30c575a5306f8e6fda37edb866762140a435037365eba7a17ce7bd0bc0216a8b \ --hash=sha256:7c1a6aa6abed1f696f8099aa9796ca04c9ee778a58728a115607de9cc4638ff1 \ --hash=sha256:8203f52827e3df65981984936654a5b390566336956f65765a8aa58c362bb841 \ @@ -965,13 +1001,13 @@ hf-xet==1.1.3 \ --hash=sha256:c3b508b5f583a75641aebf732853deb058953370ce8184f5dabc49f803b0819b \ --hash=sha256:fd2da210856444a34aad8ada2fc12f70dabed7cc20f37e90754d1d9b43bc0534 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # huggingface-hub httpcore==1.0.9 \ --hash=sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55 \ --hash=sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # httpx httptools==0.6.4 \ --hash=sha256:0614154d5454c21b6410fdf5262b4a3ddb0f53f1e1721cfd59d55f32138c578a \ @@ -1018,28 +1054,27 @@ httptools==0.6.4 \ --hash=sha256:f9eb89ecf8b290f2e293325c646a211ff1c2493222798bb80a530c5e7502494f \ 
--hash=sha256:fc411e1c0a7dcd2f902c7c48cf079947a7e65b5485dea9decb82b9105ca71a43 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # uvicorn httpx==0.28.1 \ --hash=sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc \ --hash=sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # fastapi # openai huggingface-hub==0.34.3 \ --hash=sha256:5444550099e2d86e68b2898b09e85878fbd788fc2957b506c6a79ce060e39492 \ --hash=sha256:d58130fd5aa7408480681475491c0abd7e835442082fbc3ef4d45b6c39f83853 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # tokenizers # transformers - # vllm idna==3.7 \ --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # anyio # email-validator # httpx @@ -1049,25 +1084,25 @@ imageio==2.34.2 \ --hash=sha256:5c0c0ee8faa018a1c42f649b90395dd4d3bb6187c09053a0cd6f1fdd51bbff5e \ --hash=sha256:a0bb27ec9d5bab36a9f4835e51b21d2cb099e1f78451441f94687ff3404b79f8 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # scikit-image importlib-metadata==6.11.0 \ --hash=sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443 \ --hash=sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # opentelemetry-api interegular==0.3.3 \ --hash=sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c \ 
--hash=sha256:d9b697b21b34884711399ba0f0376914b81899ce670032486d0d048344a76600 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # lm-format-enforcer jinja2==3.1.6 \ --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \ --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # fastapi # memray # torch @@ -1150,19 +1185,19 @@ jiter==0.10.0 \ --hash=sha256:fcedb049bdfc555e261d6f65a6abe1d5ad68825b7202ccb9692636c70fcced86 \ --hash=sha256:ff76d8887c8c8ee1e772274fcf8cc1071c2c58590d13e33bd12d02dc9a560397 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # openai jsonref==1.1.0 \ --hash=sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552 \ --hash=sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements/llm/llm-requirements.txt jsonschema==4.23.0 \ --hash=sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4 \ --hash=sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements/llm/llm-requirements.txt # -r python/requirements.txt # mistral-common @@ -1171,19 +1206,25 @@ jsonschema-specifications==2024.10.1 \ --hash=sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272 \ --hash=sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # 
jsonschema +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # celery lark==1.2.2 \ --hash=sha256:c2276486b02f0f1b90be155f2c8ba4a8e194d42775786db622faccd652d8e80c \ --hash=sha256:ca807d0162cd16cef15a8feecb862d7319e7a09bdb13aef927968e45040fed80 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm lazy-loader==0.4 \ --hash=sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # scikit-image llguidance==0.7.29 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:17fd439957d6ca5f459d0dec755a2d040c2dc946ed7e3c332b469ef6861292f8 \ @@ -1195,7 +1236,7 @@ llguidance==0.7.29 ; platform_machine == 'aarch64' or platform_machine == 'arm64 --hash=sha256:c97f16ddd6be28f4d176eaaa493102b981ba5470299253903de9a764e2501ef3 \ --hash=sha256:d1aa68a54f9496d36750018e7edad3bf624ee2fbcf671a7483883790d798c4fe # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm llvmlite==0.44.0 \ --hash=sha256:07667d66a5d150abed9157ab6c0b9393c9356f229784a4385c02f99e94fc94d4 \ @@ -1220,13 +1261,13 @@ llvmlite==0.44.0 \ --hash=sha256:eed7d5f29136bda63b6d7804c279e2b72e08c952b7c5df61f45db408e0ee52f3 \ --hash=sha256:f01a394e9c9b7b1d4e63c327b096d10f6f0ed149ef53d38a09b3749dcf8c9610 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # numba lm-format-enforcer==0.10.11 \ 
--hash=sha256:563e0dbc930a6d50fb687951506c5de098c6e962601be0ce723f3b7d0b916a1b \ --hash=sha256:8ab371924e166a1df68f243aca73a8a647bea5909f37edd6a53a694e7e7c3274 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm lz4==4.3.3 \ --hash=sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e \ @@ -1266,13 +1307,13 @@ lz4==4.3.3 \ --hash=sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0 \ --hash=sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt markdown-it-py==2.2.0 \ --hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # rich markupsafe==2.1.3 \ --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ @@ -1301,13 +1342,13 @@ markupsafe==2.1.3 \ --hash=sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc \ --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # jinja2 mdurl==0.1.2 \ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # markdown-it-py memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:0a21745fb516b7a6efcd40aa7487c59e9313fcfc782d0193fcfcf00b48426874 \ @@ -1346,24 +1387,24 @@ memray==1.10.0 ; sys_platform != 
'win32' \ --hash=sha256:e356af93e3b031c83957e9ac1a653f5aaba5df1e357dd17142f5ed19bb3dc660 \ --hash=sha256:f16c5c8730b616613dc8bafe32649ca6bd7252606251eb00148582011758d0b5 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt meson==1.8.3 \ --hash=sha256:ef02b806ce0c5b6becd5bb5dc9fa67662320b29b337e7ace73e4354500590233 \ --hash=sha256:f118aa910fc0a137cc2dd0122232dbf82153d9a12fb5b0f5bb64896f6a157abf # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements/llm/llm-requirements.txt mistral-common==1.8.3 \ --hash=sha256:0d1979d82227b625f6d71b3c828176f059da8d0f5a3307cdf53b48409a3970a4 \ --hash=sha256:846b6e4bbe016dc2e64fd3169fa704a548f6c74467e0cb18dc165b7a7669abd6 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm mpmath==1.3.0 \ --hash=sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # sympy msgpack==1.0.7 \ --hash=sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862 \ @@ -1423,7 +1464,7 @@ msgpack==1.0.7 \ --hash=sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002 \ --hash=sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # ray msgspec==0.19.0 \ @@ -1464,7 +1505,7 @@ msgspec==0.19.0 \ --hash=sha256:f98bd8962ad549c27d63845b50af3f53ec468b6318400c9f1adfe8b092d7b62f \ --hash=sha256:fe2c4bf29bf4e89790b3117470dea2c20b59932772483082c468b990d45fb947 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm multidict==6.0.5 \ --hash=sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556 \ @@ -1558,13 +1599,13 @@ multidict==6.0.5 \ --hash=sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423 \ --hash=sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # aiohttp # yarl networkx==3.2.1 \ --hash=sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # scikit-image # torch ninja==1.11.1.4 \ @@ -1586,17 +1627,21 @@ ninja==1.11.1.4 \ --hash=sha256:ecce44a00325a93631792974659cf253a815cc6da4ec96f89742925dfc295a0d \ --hash=sha256:f6186d7607bb090c3be1e10c8a56b690be238f953616626f5032238c66e56867 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements/llm/llm-requirements.txt # vllm # xgrammar -nixl==0.3.1 \ - --hash=sha256:20428ad2668062a79045fae83cc5cba1f4019d4a2c7053cc8549c3a1533f8a75 \ - --hash=sha256:70b8932b50ccf1a13ac8fa2e10a4b78290baae9f963bfecfa67684104331a94b \ - --hash=sha256:8c144839484b3076f0b34ad8ceaeaff05c23399cf57ca85f2a94b44e1475a39b \ - --hash=sha256:ff59996ad05a7e4ba6c8beba0f1d8ac2f9e53df696a15af0d3340028e2f16081 - # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt +nixl==0.4.1 \ + --hash=sha256:10c7b4a44f89c3fbff3e20cb84973be95f8df36ee336fb108275ed1839fec1f1 \ + --hash=sha256:510cc9e824ad53cac71ce55ff41160f2a9e1507ceb52eb871b775fe1e42beb87 \ + --hash=sha256:8a3d83b28c16b795bdc281f1489b9d247f6e6088ad96ca96406072a36d6354b7 \ + --hash=sha256:9381fd3986d227c7ccb2607c03bbea559ec80f951e2ea47c1fbf381e4cd97164 \ + 
--hash=sha256:9ab7e580e9962ebdcda8c17f8548858d3fdb648621367d8e717ca317b534b778 \ + --hash=sha256:db144821de7912cb2502052b3070a1ac276b8b019470e6efdfce9c237ffe130d \ + --hash=sha256:e33102b85b3f95a8c95e59b59b29aabd03d47b5bce619de506b9bb83739cf60d \ + --hash=sha256:f16092dd445542e82e3db3553f6c7697ec5a2e837f19d416401283ae245826f9 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements/llm/llm-requirements.txt numba==0.61.2 \ --hash=sha256:34fba9406078bac7ab052efbf0d13939426c753ad72946baaa5bf9ae0ebb8dd2 \ @@ -1621,7 +1666,7 @@ numba==0.61.2 \ --hash=sha256:ea0247617edcb5dd61f6106a56255baab031acc4257bddaeddb3a1003b4ca3fd \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ @@ -1661,7 +1706,7 @@ numpy==1.26.4 \ --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # cupy-cuda12x # gguf @@ -1685,70 +1730,70 @@ numpy==1.26.4 \ nvidia-cublas-cu12==12.8.3.14 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:3f0e05e7293598cf61933258b73e66a160c27d59c4422670bf0b79348c04be44 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch nvidia-cuda-cupti-cu12==12.8.57 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:8e0b2eb847de260739bee4a3f66fac31378f4ff49538ff527a38a01a9a39f950 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cu128.lock # torch nvidia-cuda-nvrtc-cu12==12.8.61 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:a0fa9c2a21583105550ebd871bd76e2037205d56f33f128e69f6d2a55e0af9ed # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # torch nvidia-cuda-runtime-cu12==12.8.57 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:75342e28567340b7428ce79a5d6bb6ca5ff9d07b69e7ce00d2c7b4dc23eff0be # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # torch nvidia-cudnn-cu12==9.7.1.26 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:6d011159a158f3cfc47bf851aea79e31bcff60d530b70ef70474c84cac484d07 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # torch nvidia-cufft-cu12==11.3.3.41 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:da650080ab79fcdf7a4b06aa1b460e99860646b176a43f6208099bdc17836b6a # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # torch nvidia-cufile-cu12==1.13.0.11 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:483f434c541806936b98366f6d33caef5440572de8ddf38d453213729da3e7d4 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # torch nvidia-curand-cu12==10.3.9.55 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:8387d974240c91f6a60b761b83d4b2f9b938b7e0b9617bae0f0dafe4f5c36b86 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # torch nvidia-cusolver-cu12==11.7.2.55 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ 
--hash=sha256:4d1354102f1e922cee9db51920dba9e2559877cf6ff5ad03a00d853adafb191b # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # torch nvidia-cusparse-cu12==12.5.7.53 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:3c1b61eb8c85257ea07e9354606b26397612627fdcd327bfd91ccf6155e7c86d # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # nvidia-cusolver-cu12 # torch nvidia-cusparselt-cu12==0.6.3 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:e5c8a26c36445dd2e6812f1177978a24e2d37cacce7e090f297a688d1ec44f46 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # torch nvidia-nccl-cu12==2.26.2 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:694cf3879a206553cc9d7dbda76b13efaf610fdb70a50cba303de1b0d1530ac6 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # torch nvidia-nvjitlink-cu12==12.8.61 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:45fd79f2ae20bd67e8bc411055939049873bfd8fac70ff13bd4865e0b9bdab17 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # nvidia-cufft-cu12 # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 @@ -1756,25 +1801,43 @@ nvidia-nvjitlink-cu12==12.8.61 ; platform_machine == 'x86_64' and sys_platform = nvidia-nvtx-cu12==12.8.55 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:2dd0780f1a55c21d8e06a743de5bd95653de630decfff40621dbde78cc307102 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # torch -openai==1.90.0 \ - 
--hash=sha256:9771982cdd5b6631af68c6a603da72ed44cd2caf73b49f717a72b71374bc565b \ - --hash=sha256:e5dcb5498ea6b42fec47546d10f1bcc05fb854219a7d953a5ba766718b212a02 +openai==1.100.2 \ + --hash=sha256:54d3457b2c8d7303a1bc002a058de46bdd8f37a8117751c7cf4ed4438051f151 \ + --hash=sha256:787b4c3c8a65895182c58c424f790c25c790cc9a0330e34f73d55b6ee5a00e32 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # vllm +openai-harmony==0.0.4 \ + --hash=sha256:038f1d6772d1be5213b36ae76e5d042022395ec35c428a73ccb8b839b2cecf6a \ + --hash=sha256:15e6d53a66502491a3675a536df30e271f976e6c5efe68250a65191efcb85c4f \ + --hash=sha256:2d8d16d84702059833fb03b841b28c25600c54e83cadccef79af44e1c81166b1 \ + --hash=sha256:31e9bcac0902a309e2fc688e52f247eec7fffcd00d17e958b9a83a8fea6519c2 \ + --hash=sha256:3586d90c899cd41f8624e7b82a48c289f6e4be56c66304ecaf3a0ba88963a73f \ + --hash=sha256:3cf2344366f10981bbc0f6d9949a0b2bb87151d209ed295943ed6ad8eda37932 \ + --hash=sha256:567cc568b6bf7b4d041b0c9aa7d6b2c9394f8af6065bc87fa6d23f207b5af9a7 \ + --hash=sha256:5c67ac6df349236fb7b64f57c3dbb0273efcdca24314daa108f2a482c427106c \ + --hash=sha256:746f751de5033b3dbcfcd4a726a4c56ce452c593ad3d54472d8597ce8d8b6d44 \ + --hash=sha256:96a63199c0d81095b5d5d1ae8ca82b64c1c13d18d4e30323ae9e8ab31bc80a3d \ + --hash=sha256:97f1fe3909733212cc6b36f0f199b1421a9c57b79ec665f0322bd604cec47340 \ + --hash=sha256:b9ee9e9ab6a237cebbe16563c787a6e83f3fcc034075c3d321dab94448426282 \ + --hash=sha256:d38f2639f6bf7c3c34a5dfd79e29075811ae2fa9b895a63e76767f74a47a971e \ + --hash=sha256:ef21a1e2384a65c62d5ec5e1cded9fe026f1d032d5c5d725110d1a8d330d8f54 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm opencensus==0.11.4 \ --hash=sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 \ --hash=sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt opencensus-context==0.1.3 \ --hash=sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039 \ --hash=sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # opencensus opencv-python-headless==4.11.0.86 \ --hash=sha256:0e0a27c19dd1f40ddff94976cfe43066fbbe9dfbb2ec1907d66c19caef42a57b \ @@ -1785,14 +1848,14 @@ opencv-python-headless==4.11.0.86 \ --hash=sha256:a66c1b286a9de872c343ee7c3553b084244299714ebb50fbdcd76f07ebbe6c81 \ --hash=sha256:f447d8acbb0b6f2808da71fddd29c1cdd448d2bc98f72d9bb78a7a898fc9621b # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # mistral-common # vllm opentelemetry-api==1.34.1 \ --hash=sha256:64f0bd06d42824843731d05beea88d4d4b6ae59f9fe347ff7dfa2cc14233bbb3 \ --hash=sha256:b7df4cb0830d5a6c29ad0c0691dbae874d8daefa934b8b1d642de48323d32a8c # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus # opentelemetry-sdk @@ -1801,26 +1864,26 @@ opentelemetry-exporter-prometheus==0.55b1 \ --hash=sha256:d13ec0b22bf394113ff1ada5da98133a4b051779b803dae183188e26c4bd9ee0 \ --hash=sha256:f364fbbff9e5de37a112ff104d1185fb1d7e2046c5ab5911e5afebc7ab3ddf0e # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt opentelemetry-proto==1.27.0 \ --hash=sha256:33c9345d91dafd8a74fc3d7576c5a38f18b7fdf8d02983ac67485386132aedd6 \ --hash=sha256:b133873de5581a50063e1e4b29cdcf0c5e253a8c2d8dc1229add20a4c3830ace # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r 
python/requirements.txt opentelemetry-sdk==1.34.1 \ --hash=sha256:308effad4059562f1d92163c61c8141df649da24ce361827812c40abb2a1e96e \ --hash=sha256:8091db0d763fcd6098d4781bbc80ff0971f94e260739aa6afe6fd379cdf3aa4d # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus opentelemetry-semantic-conventions==0.55b1 \ --hash=sha256:5da81dfdf7d52e3d37f8fe88d5e771e191de924cfff5f550ab0b8f7b2409baed \ --hash=sha256:ef95b1f009159c28d7a7849f5cbc71c4c34c845bb514d66adfdf1b3fff3598b3 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # opentelemetry-sdk outlines-core==0.2.10 \ --hash=sha256:0a9e4b192ca837a472a1bb1428397509f543db08e1aeeee30252525cec34093a \ @@ -1865,15 +1928,16 @@ outlines-core==0.2.10 \ --hash=sha256:f895834da0a577120dcb8d979c12c0690fe912095413bf0070a73e9ff363b7bf \ --hash=sha256:faf5b43181b1d033871364e74e9d348362c6a77b1d054d7af35e09fdfcff5b16 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm packaging==23.0 \ --hash=sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2 \ --hash=sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # huggingface-hub + # kombu # lazy-loader # lm-format-enforcer # ray @@ -1909,13 +1973,13 @@ pandas==1.5.3 \ --hash=sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae \ --hash=sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt partial-json-parser==0.2.1.1.post5 \ 
--hash=sha256:627715aaa3cb3fb60a65b0d62223243acaa6c70846520a90326fef3a2f0b61ca \ --hash=sha256:992710ac67e90b367921d52727698928040f7713ba7ecb33b96371ea7aec82ca # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm pillow==10.3.0 \ --hash=sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c \ @@ -1988,7 +2052,7 @@ pillow==10.3.0 \ --hash=sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27 \ --hash=sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # imageio # mistral-common # scikit-image @@ -1998,13 +2062,13 @@ platformdirs==3.11.0 \ --hash=sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # virtualenv prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus # prometheus-fastapi-instrumentator @@ -2013,8 +2077,14 @@ prometheus-fastapi-instrumentator==7.1.0 \ --hash=sha256:978130f3c0bb7b8ebcc90d35516a6fe13e02d2eb358c8f83887cdef7020c31e9 \ --hash=sha256:be7cd61eeea4e5912aeccb4261c6631b3f227d8924542d79eaf5af3f439cbe5e # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm +prompt-toolkit==3.0.41 \ + --hash=sha256:941367d97fc815548822aa26c2a269fdc4eb21e9ec05fc5d447cf09bad5d75f0 \ + 
--hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # click-repl propcache==0.3.0 \ --hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ --hash=sha256:03acd9ff19021bd0567582ac88f821b66883e158274183b9e5586f678984f8fe \ @@ -2115,14 +2185,14 @@ propcache==0.3.0 \ --hash=sha256:fb91d20fa2d3b13deea98a690534697742029f4fb83673a3501ae6e3746508b5 \ --hash=sha256:fddb8870bdb83456a489ab67c6b3040a8d5a55069aa6f72f9d872235fbc52f54 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # aiohttp # yarl proto-plus==1.22.3 \ --hash=sha256:a49cd903bc0b6ab41f76bf65510439d56ca76f868adf0274e738bfdd096894df \ --hash=sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # google-api-core protobuf==4.25.8 \ --hash=sha256:077ff8badf2acf8bc474406706ad890466274191a48d0abd3bd6987107c9cde5 \ @@ -2137,7 +2207,7 @@ protobuf==4.25.8 \ --hash=sha256:d552c53d0415449c8d17ced5c341caba0d89dbf433698e1436c8fa0aae7808a3 \ --hash=sha256:f4510b93a3bec6eba8fd8f1093e9d7fb0d4a24d1a81377c10c0e5bbfe9e4ed24 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # google-api-core # googleapis-common-protos @@ -2164,13 +2234,13 @@ psutil==5.9.6 \ --hash=sha256:fb8a697f11b0f5994550555fcfe3e69799e5b060c8ecf9e2f75c69302cc35c0d \ --hash=sha256:ff18b8d1a784b810df0b0fff3bcb50ab941c3b8e2c8de5726f9c71c601c611aa # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm py-cpuinfo==9.0.0 \ --hash=sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690 \ 
--hash=sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm py-spy==0.4.0 ; python_full_version < '3.12' \ --hash=sha256:47cdda4c34d9b6cb01f3aaeceb2e88faf57da880207fe72ff6ff97e9bb6cc8a9 \ @@ -2182,7 +2252,7 @@ py-spy==0.4.0 ; python_full_version < '3.12' \ --hash=sha256:eee3d0bde85ca5cf4f01f012d461180ca76c24835a96f7b5c4ded64eb6a008ab \ --hash=sha256:f2cf3f7130e7d780471faa5957441d3b4e0ec39a79b2c00f4c33d494f7728428 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt pyarrow==19.0.1 \ --hash=sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466 \ @@ -2228,20 +2298,20 @@ pyarrow==19.0.1 \ --hash=sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec \ --hash=sha256:fd44d66093a239358d07c42a91eebf5015aa54fccba959db899f932218ac9cc8 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt pyasn1==0.5.1 \ --hash=sha256:4439847c58d40b1d0a573d07e3856e95333f1976294494c325775aeca506eb58 \ --hash=sha256:6d391a96e59b23130a5cfa74d6fd7f388dbbe26cc8f1edf39fdddf08d9d6676c # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # pyasn1-modules # rsa pyasn1-modules==0.3.0 \ --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # google-auth pybase64==1.4.1 \ --hash=sha256:011a54ff6ca44c5d03746aec3f1f492fce3155bd3f943fb2ceaea92416d40eeb \ @@ -2388,191 +2458,192 @@ pybase64==1.4.1 \ 
--hash=sha256:fc9504c4c2e893e0a6c1cc80bce51907e3461288289f630eab22b5735eba1104 \ --hash=sha256:ff172a4dacbd964e5edcf1c2152dae157aabf856508aed15276f46d04a22128e # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm pybind11==2.13.6 \ --hash=sha256:237c41e29157b962835d356b370ededd57594a26d5894a795960f0047cb5caf5 \ --hash=sha256:ba6af10348c12b24e92fa086b39cfba0eff619b61ac77c406167d813b096d39a # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements/llm/llm-requirements.txt pycountry==24.6.1 \ --hash=sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221 \ --hash=sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # pydantic-extra-types pycparser==2.21 \ --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # cffi -pydantic==2.10.0 \ - --hash=sha256:0aca0f045ff6e2f097f1fe89521115335f15049eeb8a7bef3dafe4b19a74e289 \ - --hash=sha256:5e7807ba9201bdf61b1b58aa6eb690916c40a47acfb114b1b4fef3e7fd5b30fc +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # compressed-tensors # fastapi # lm-format-enforcer # mistral-common # openai + # openai-harmony # pydantic-extra-types # vllm # xgrammar -pydantic-core==2.27.0 \ - 
--hash=sha256:0aa4d1b2eba9a325897308b3124014a142cdccb9f3e016f31d3ebee6b5ea5e75 \ - --hash=sha256:0d06b667e53320332be2bf6f9461f4a9b78092a079b8ce8634c9afaa7e10cd9f \ - --hash=sha256:153017e3d6cd3ce979de06d84343ca424bb6092727375eba1968c8b4693c6ecb \ - --hash=sha256:15e350efb67b855cd014c218716feea4986a149ed1f42a539edd271ee074a196 \ - --hash=sha256:185ef205256cd8b38431205698531026979db89a79587725c1e55c59101d64e9 \ - --hash=sha256:1da0c98a85a6c6ed702d5556db3b09c91f9b0b78de37b7593e2de8d03238807a \ - --hash=sha256:225bfff5d425c34e1fd562cef52d673579d59b967d9de06178850c4802af9039 \ - --hash=sha256:24f984fc7762ed5f806d9e8c4c77ea69fdb2afd987b4fd319ef06c87595a8c55 \ - --hash=sha256:25a7fd4de38f7ff99a37e18fa0098c3140286451bc823d1746ba80cec5b433a1 \ - --hash=sha256:2883b260f7a93235488699d39cbbd94fa7b175d3a8063fbfddd3e81ad9988cb2 \ - --hash=sha256:2a51ce96224eadd1845150b204389623c8e129fde5a67a84b972bd83a85c6c40 \ - --hash=sha256:2be0ad541bb9f059954ccf8877a49ed73877f862529575ff3d54bf4223e4dd61 \ - --hash=sha256:31a2cae5f059329f9cfe3d8d266d3da1543b60b60130d186d9b6a3c20a346361 \ - --hash=sha256:333c840a1303d1474f491e7be0b718226c730a39ead0f7dab2c7e6a2f3855555 \ - --hash=sha256:33d14369739c5d07e2e7102cdb0081a1fa46ed03215e07f097b34e020b83b1ae \ - --hash=sha256:35380671c3c921fe8adf31ad349dc6f7588b7e928dbe44e1093789734f607399 \ - --hash=sha256:359e7951f04ad35111b5ddce184db3391442345d0ab073aa63a95eb8af25a5ef \ - --hash=sha256:36aa167f69d8807ba7e341d67ea93e50fcaaf6bc433bb04939430fa3dab06f31 \ - --hash=sha256:395e3e1148fa7809016231f8065f30bb0dc285a97b4dc4360cd86e17bab58af7 \ - --hash=sha256:3e8d89c276234579cd3d095d5fa2a44eb10db9a218664a17b56363cddf226ff3 \ - --hash=sha256:3eb8849445c26b41c5a474061032c53e14fe92a11a5db969f722a2716cd12206 \ - --hash=sha256:3fd8bc2690e7c39eecdf9071b6a889ce7b22b72073863940edc2a0a23750ca90 \ - --hash=sha256:400bf470e4327e920883b51e255617dfe4496d4e80c3fea0b5a5d0bf2c404dd4 \ - --hash=sha256:4148dc9184ab79e356dc00a4199dc0ee8647973332cb385fc29a7cced49b9f9c \ - 
--hash=sha256:433689845288f9a1ee5714444e65957be26d30915f7745091ede4a83cfb2d7bb \ - --hash=sha256:43b61989068de9ce62296cde02beffabcadb65672207fc51e7af76dca75e6636 \ - --hash=sha256:4523c4009c3f39d948e01962223c9f5538602e7087a628479b723c939fab262d \ - --hash=sha256:483c2213a609e7db2c592bbc015da58b6c75af7360ca3c981f178110d9787bcf \ - --hash=sha256:49633583eb7dc5cba61aaf7cdb2e9e662323ad394e543ee77af265736bcd3eaa \ - --hash=sha256:4b51f964fcbb02949fc546022e56cdb16cda457af485e9a3e8b78ac2ecf5d77e \ - --hash=sha256:4bf1340ae507f6da6360b24179c2083857c8ca7644aab65807023cf35404ea8d \ - --hash=sha256:4fb49cfdb53af5041aba909be00cccfb2c0d0a2e09281bf542371c5fd36ad04c \ - --hash=sha256:510b11e9c3b1a852876d1ccd8d5903684336d635214148637ceb27366c75a467 \ - --hash=sha256:513cb14c0cc31a4dfd849a4674b20c46d87b364f997bbcb02282306f5e187abf \ - --hash=sha256:58560828ee0951bb125c6f2862fbc37f039996d19ceb6d8ff1905abf7da0bf3d \ - --hash=sha256:58ab0d979c969983cdb97374698d847a4acffb217d543e172838864636ef10d9 \ - --hash=sha256:5982048129f40b082c2654de10c0f37c67a14f5ff9d37cf35be028ae982f26df \ - --hash=sha256:5ab325fc86fbc077284c8d7f996d904d30e97904a87d6fb303dce6b3de7ebba9 \ - --hash=sha256:5cc822ab90a70ea3a91e6aed3afac570b276b1278c6909b1d384f745bd09c714 \ - --hash=sha256:5f2b19b8d6fca432cb3acf48cf5243a7bf512988029b6e6fd27e9e8c0a204d85 \ - --hash=sha256:5fc72fbfebbf42c0856a824b8b0dc2b5cd2e4a896050281a21cfa6fed8879cb1 \ - --hash=sha256:6354e18a9be37bfa124d6b288a87fb30c673745806c92956f1a25e3ae6e76b96 \ - --hash=sha256:678f66462058dd978702db17eb6a3633d634f7aa0deaea61e0a674152766d3fc \ - --hash=sha256:68950bc08f9735306322bfc16a18391fcaac99ded2509e1cc41d03ccb6013cfe \ - --hash=sha256:68ef5377eb582fa4343c9d0b57a5b094046d447b4c73dd9fbd9ffb216f829e7d \ - --hash=sha256:6b4c19525c3538fbc0bbda6229f9682fb8199ce9ac37395880e6952798e00373 \ - --hash=sha256:6bb69bf3b6500f195c3deb69c1205ba8fc3cb21d1915f1f158a10d6b1ef29b6a \ - --hash=sha256:6e19401742ed7b69e51d8e4df3c03ad5ec65a83b36244479fd70edde2828a5d9 \ - 
--hash=sha256:6f4a53af9e81d757756508b57cae1cf28293f0f31b9fa2bfcb416cc7fb230f9d \ - --hash=sha256:6fda87808429c520a002a85d6e7cdadbf58231d60e96260976c5b8f9a12a8e13 \ - --hash=sha256:78f841523729e43e3928a364ec46e2e3f80e6625a4f62aca5c345f3f626c6e8a \ - --hash=sha256:7a6ebfac28fd51890a61df36ef202adbd77d00ee5aca4a3dadb3d9ed49cfb929 \ - --hash=sha256:7b0202ebf2268954090209a84f9897345719e46a57c5f2c9b7b250ca0a9d3e63 \ - --hash=sha256:8117839a9bdbba86e7f9df57018fe3b96cec934c3940b591b0fd3fbfb485864a \ - --hash=sha256:82e1ad4ca170e8af4c928b67cff731b6296e6a0a0981b97b2eb7c275cc4e15bd \ - --hash=sha256:836a4bfe0cc6d36dc9a9cc1a7b391265bf6ce9d1eb1eac62ac5139f5d8d9a6fa \ - --hash=sha256:84af1cf7bfdcbc6fcf5a5f70cc9896205e0350306e4dd73d54b6a18894f79386 \ - --hash=sha256:84e35afd9e10b2698e6f2f32256678cb23ca6c1568d02628033a837638b3ed12 \ - --hash=sha256:884f1806609c2c66564082540cffc96868c5571c7c3cf3a783f63f2fb49bd3cd \ - --hash=sha256:8a150392102c402c538190730fda06f3bce654fc498865579a9f2c1d2b425833 \ - --hash=sha256:8e21d927469d04b39386255bf00d0feedead16f6253dcc85e9e10ddebc334084 \ - --hash=sha256:8e96ca781e0c01e32115912ebdf7b3fb0780ce748b80d7d28a0802fa9fbaf44e \ - --hash=sha256:8ee4c2a75af9fe21269a4a0898c5425afb01af1f5d276063f57e2ae1bc64e191 \ - --hash=sha256:91bc66f878557313c2a6bcf396e7befcffe5ab4354cfe4427318968af31143c3 \ - --hash=sha256:951e71da6c89d354572098bada5ba5b5dc3a9390c933af8a614e37755d3d1840 \ - --hash=sha256:99b2863c1365f43f74199c980a3d40f18a218fbe683dd64e470199db426c4d6a \ - --hash=sha256:9a8fbf506fde1529a1e3698198fe64bfbe2e0c09557bc6a7dcf872e7c01fec40 \ - --hash=sha256:9ce048deb1e033e7a865ca384770bccc11d44179cf09e5193a535c4c2f497bdc \ - --hash=sha256:9fe94d9d2a2b4edd7a4b22adcd45814b1b59b03feb00e56deb2e89747aec7bfe \ - --hash=sha256:a291d0b4243a259c8ea7e2b84eb9ccb76370e569298875a7c5e3e71baf49057a \ - --hash=sha256:a5c022bb0d453192426221605efc865373dde43b17822a264671c53b068ac20c \ - --hash=sha256:abb4785894936d7682635726613c44578c420a096729f1978cd061a7e72d5275 \ - 
--hash=sha256:b872c86d8d71827235c7077461c502feb2db3f87d9d6d5a9daa64287d75e4fa0 \ - --hash=sha256:bf37b72834e7239cf84d4a0b2c050e7f9e48bced97bad9bdf98d26b8eb72e846 \ - --hash=sha256:c0c431e4be5c1a0c6654e0c31c661cd89e0ca956ef65305c3c3fd96f4e72ca39 \ - --hash=sha256:c5726eec789ee38f2c53b10b1821457b82274f81f4f746bb1e666d8741fcfadb \ - --hash=sha256:c6fcb3fa3855d583aa57b94cf146f7781d5d5bc06cb95cb3afece33d31aac39b \ - --hash=sha256:c86679f443e7085ea55a7376462553996c688395d18ef3f0d3dbad7838f857a2 \ - --hash=sha256:c91e3c04f5191fd3fb68764bddeaf02025492d5d9f23343b283870f6ace69708 \ - --hash=sha256:c921ad596ff1a82f9c692b0758c944355abc9f0de97a4c13ca60ffc6d8dc15d4 \ - --hash=sha256:c9ed88b398ba7e3bad7bd64d66cc01dcde9cfcb7ec629a6fd78a82fa0b559d78 \ - --hash=sha256:cd2ac6b919f7fed71b17fe0b4603c092a4c9b5bae414817c9c81d3c22d1e1bcc \ - --hash=sha256:d28ca7066d6cdd347a50d8b725dc10d9a1d6a1cce09836cf071ea6a2d4908be0 \ - --hash=sha256:d29e235ce13c91902ef3efc3d883a677655b3908b1cbc73dee816e5e1f8f7739 \ - --hash=sha256:d8b5ee4ae9170e2775d495b81f414cc20268041c42571530513496ba61e94ba3 \ - --hash=sha256:db72e40628967f6dc572020d04b5f800d71264e0531c6da35097e73bdf38b003 \ - --hash=sha256:df45c4073bed486ea2f18757057953afed8dd77add7276ff01bccb79982cf46c \ - --hash=sha256:dfa5f5c0a4c8fced1422dc2ca7eefd872d5d13eb33cf324361dbf1dbfba0a9fe \ - --hash=sha256:e015833384ca3e1a0565a79f5d953b0629d9138021c27ad37c92a9fa1af7623c \ - --hash=sha256:e15315691fe2253eb447503153acef4d7223dfe7e7702f9ed66539fcd0c43801 \ - --hash=sha256:e65466b31be1070b4a5b7dbfbd14b247884cb8e8b79c64fb0f36b472912dbaea \ - --hash=sha256:e7820bb0d65e3ce1e3e70b6708c2f66143f55912fa02f4b618d0f08b61575f12 \ - --hash=sha256:e851a051f7260e6d688267eb039c81f05f23a19431bd7dfa4bf5e3cb34c108cd \ - --hash=sha256:e9f9feee7f334b72ceae46313333d002b56f325b5f04271b4ae2aadd9e993ae4 \ - --hash=sha256:eb40f828bc2f73f777d1eb8fee2e86cd9692a4518b63b6b5aa8af915dfd3207b \ - --hash=sha256:eb704155e73b833801c247f39d562229c0303f54770ca14fb1c053acb376cf10 \ - 
--hash=sha256:edb1bfd45227dec8d50bc7c7d86463cd8728bcc574f9b07de7369880de4626a3 \ - --hash=sha256:ee7d9d5537daf6d5c74a83b38a638cc001b648096c1cae8ef695b0c919d9d379 \ - --hash=sha256:f57783fbaf648205ac50ae7d646f27582fc706be3977e87c3c124e7a92407b10 \ - --hash=sha256:ff63a92f6e249514ef35bc795de10745be0226eaea06eb48b4bbeaa0c8850a4a - # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt +pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a 
\ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + --hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + 
--hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + --hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + --hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + 
--hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + --hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + --hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + 
--hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + --hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # pydantic pydantic-extra-types==2.10.5 \ --hash=sha256:1dcfa2c0cf741a422f088e0dbb4690e7bfadaaf050da3d6f80d6c3cf58a2bad8 \ --hash=sha256:b60c4e23d573a69a4f1a16dd92888ecc0ef34fb0e655b4f305530377fa70e7a8 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # mistral-common pygments==2.18.0 \ --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # rich pyopenssl==25.0.0 \ --hash=sha256:424c247065e46e76a37411b9ab1782541c23bb658bf003772c3405fbaa128e90 \ --hash=sha256:cd2cef799efa3936bb08e8ccb9433a575722b9dd986023f1cabc4ae64e9dac16 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt python-dateutil==2.8.2 \ --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cu128.lock + # celery # pandas python-dotenv==1.1.0 \ --hash=sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5 \ --hash=sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # uvicorn python-json-logger==2.0.7 \ --hash=sha256:23e7ec02d34237c5aa1e29a070193a4ea87583bb4e7f8fd06d3de8264c4b2e1c \ --hash=sha256:f380b826a991ebbe3de4d897aeec42760035ac760345e57b812938dc8b35e2bd # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm python-multipart==0.0.20 \ --hash=sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104 \ --hash=sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # fastapi pytz==2022.7.1 \ --hash=sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0 \ --hash=sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # pandas pyyaml==6.0.1 \ --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ @@ -2627,7 +2698,7 @@ pyyaml==6.0.1 \ --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \ --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # gguf # huggingface-hub @@ -2726,13 +2797,13 @@ pyzmq==26.0.3 \ --hash=sha256:f6b1d1c631e5940cac5a0b22c5379c86e8df6a4ec277c7a856b714021ab6cfad \ 
--hash=sha256:f6c21c00478a7bea93caaaef9e7629145d4153b15a8653e8bb4609d4bc70dbfc # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm referencing==0.36.2 \ --hash=sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa \ --hash=sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # jsonschema # jsonschema-specifications regex==2024.11.6 \ @@ -2831,7 +2902,7 @@ regex==2024.11.6 \ --hash=sha256:fdd6028445d2460f33136c55eeb1f601ab06d74cb3347132e1c24250187500d9 \ --hash=sha256:ff590880083d60acc0433f9c3f713c51f7ac6ebb9adf889c79a261ecf541aa91 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # tiktoken # transformers # vllm @@ -2839,7 +2910,7 @@ requests==2.32.3 \ --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # google-api-core # huggingface-hub @@ -2852,7 +2923,7 @@ rich==13.3.2 \ --hash=sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001 \ --hash=sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # memray # typer @@ -2961,14 +3032,14 @@ rpds-py==0.22.3 \ --hash=sha256:fda7cb070f442bf80b642cd56483b5548e43d366fe3f39b98e67cce780cded00 \ --hash=sha256:feea821ee2a9273771bae61194004ee2fc33f8ec7db08117ef9147d4bbcbca8e # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cu128.lock # jsonschema # referencing rsa==4.7.2 \ --hash=sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2 \ --hash=sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # google-auth safetensors==0.5.3 \ --hash=sha256:1077f3e94182d72618357b04b5ced540ceb71c8a813d3319f1aba448e68a770d \ @@ -2987,7 +3058,7 @@ safetensors==0.5.3 \ --hash=sha256:cfc0ec0846dcf6763b0ed3d1846ff36008c6e7290683b61616c4b040f6a54ace \ --hash=sha256:df26da01aaac504334644e1b7642fa000bfec820e7cef83aeac4e355e03195ff # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # transformers scikit-image==0.24.0 \ --hash=sha256:18836a18d3a7b6aca5376a2d805f0045826bc6c9fc85331659c33b4813e0b563 \ @@ -3012,7 +3083,7 @@ scikit-image==0.24.0 \ --hash=sha256:ef04360eda372ee5cd60aebe9be91258639c86ae2ea24093fb9182118008d009 \ --hash=sha256:fa27b3a0dbad807b966b8db2d78da734cb812ca4787f7fbb143764800ce2fa9c # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ @@ -3041,7 +3112,7 @@ scipy==1.11.4 \ --hash=sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be \ --hash=sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # scikit-image # vllm @@ -3100,34 +3171,135 @@ sentencepiece==0.2.0 \ --hash=sha256:fb89f811e5efd18bab141afc3fea3de141c3f69f3fe9e898f710ae7fe3aab251 \ --hash=sha256:ff88712338b01031910e8e61e7239aff3ce8869ee31a47df63cb38aadd591bea # via - # -c 
python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # gguf # mistral-common # vllm +setproctitle==1.3.6 \ + --hash=sha256:082413db8a96b1f021088e8ec23f0a61fec352e649aba20881895815388b66d3 \ + --hash=sha256:0dba8faee2e4a96e934797c9f0f2d093f8239bf210406a99060b3eabe549628e \ + --hash=sha256:0e6b5633c94c5111f7137f875e8f1ff48f53b991d5d5b90932f27dc8c1fa9ae4 \ + --hash=sha256:1065ed36bd03a3fd4186d6c6de5f19846650b015789f72e2dea2d77be99bdca1 \ + --hash=sha256:109fc07b1cd6cef9c245b2028e3e98e038283342b220def311d0239179810dbe \ + --hash=sha256:13624d9925bb481bc0ccfbc7f533da38bfbfe6e80652314f789abc78c2e513bd \ + --hash=sha256:156795b3db976611d09252fc80761fcdb65bb7c9b9581148da900851af25ecf4 \ + --hash=sha256:163dba68f979c61e4e2e779c4d643e968973bdae7c33c3ec4d1869f7a9ba8390 \ + --hash=sha256:17d7c833ed6545ada5ac4bb606b86a28f13a04431953d4beac29d3773aa00b1d \ + --hash=sha256:18d0667bafaaae4c1dee831e2e59841c411ff399b9b4766822ba2685d419c3be \ + --hash=sha256:1aa1935aa2195b76f377e5cb018290376b7bf085f0b53f5a95c0c21011b74367 \ + --hash=sha256:2156d55308431ac3b3ec4e5e05b1726d11a5215352d6a22bb933171dee292f8c \ + --hash=sha256:23a57d3b8f1549515c2dbe4a2880ebc1f27780dc126c5e064167563e015817f5 \ + --hash=sha256:2407955dc359d735a20ac6e797ad160feb33d529a2ac50695c11a1ec680eafab \ + --hash=sha256:2940cf13f4fc11ce69ad2ed37a9f22386bfed314b98d8aebfd4f55459aa59108 \ + --hash=sha256:2e51ec673513465663008ce402171192a053564865c2fc6dc840620871a9bd7c \ + --hash=sha256:3393859eb8f19f5804049a685bf286cb08d447e28ba5c6d8543c7bf5500d5970 \ + --hash=sha256:3884002b3a9086f3018a32ab5d4e1e8214dd70695004e27b1a45c25a6243ad0b \ + --hash=sha256:38ca045626af693da042ac35d7332e7b9dbd52e6351d6973b310612e3acee6d6 \ + --hash=sha256:391bb6a29c4fe7ccc9c30812e3744060802d89b39264cfa77f3d280d7f387ea5 \ + --hash=sha256:3cca16fd055316a48f0debfcbfb6af7cea715429fc31515ab3fcac05abd527d8 \ + --hash=sha256:3cde5b83ec4915cd5e6ae271937fd60d14113c8f7769b4a20d51769fe70d8717 \ + 
--hash=sha256:3f8194b4d631b003a1176a75d1acd545e04b1f54b821638e098a93e6e62830ef \ + --hash=sha256:3fc97805f9d74444b027babff710bf39df1541437a6a585a983d090ae00cedde \ + --hash=sha256:4431629c178193f23c538cb1de3da285a99ccc86b20ee91d81eb5f1a80e0d2ba \ + --hash=sha256:49498ebf68ca3e75321ffe634fcea5cc720502bfaa79bd6b03ded92ce0dc3c24 \ + --hash=sha256:4ac3eb04bcf0119aadc6235a2c162bae5ed5f740e3d42273a7228b915722de20 \ + --hash=sha256:4adf6a0013fe4e0844e3ba7583ec203ca518b9394c6cc0d3354df2bf31d1c034 \ + --hash=sha256:4efc91b437f6ff2578e89e3f17d010c0a0ff01736606473d082913ecaf7859ba \ + --hash=sha256:50706b9c0eda55f7de18695bfeead5f28b58aa42fd5219b3b1692d554ecbc9ec \ + --hash=sha256:5313a4e9380e46ca0e2c681ba739296f9e7c899e6f4d12a6702b2dc9fb846a31 \ + --hash=sha256:543f59601a4e32daf44741b52f9a23e0ee374f9f13b39c41d917302d98fdd7b0 \ + --hash=sha256:57bc54763bf741813a99fbde91f6be138c8706148b7b42d3752deec46545d470 \ + --hash=sha256:63cc10352dc6cf35a33951656aa660d99f25f574eb78132ce41a85001a638aa7 \ + --hash=sha256:6a1d3aa13acfe81f355b0ce4968facc7a19b0d17223a0f80c011a1dba8388f37 \ + --hash=sha256:6af330ddc2ec05a99c3933ab3cba9365357c0b8470a7f2fa054ee4b0984f57d1 \ + --hash=sha256:6d50bfcc1d1692dc55165b3dd2f0b9f8fb5b1f7b571a93e08d660ad54b9ca1a5 \ + --hash=sha256:70100e2087fe05359f249a0b5f393127b3a1819bf34dec3a3e0d4941138650c9 \ + --hash=sha256:74973aebea3543ad033b9103db30579ec2b950a466e09f9c2180089e8346e0ec \ + --hash=sha256:751ba352ed922e0af60458e961167fa7b732ac31c0ddd1476a2dfd30ab5958c5 \ + --hash=sha256:785cd210c0311d9be28a70e281a914486d62bfd44ac926fcd70cf0b4d65dff1c \ + --hash=sha256:7890e291bf4708e3b61db9069ea39b3ab0651e42923a5e1f4d78a7b9e4b18301 \ + --hash=sha256:793a23e8d9cb6c231aa3023d700008224c6ec5b8fd622d50f3c51665e3d0a190 \ + --hash=sha256:797f2846b546a8741413c57d9fb930ad5aa939d925c9c0fa6186d77580035af7 \ + --hash=sha256:7df5fcc48588f82b6cc8073db069609ddd48a49b1e9734a20d0efb32464753c4 \ + --hash=sha256:8050c01331135f77ec99d99307bfbc6519ea24d2f92964b06f3222a804a3ff1f \ + 
--hash=sha256:805bb33e92fc3d8aa05674db3068d14d36718e3f2c5c79b09807203f229bf4b5 \ + --hash=sha256:807796fe301b7ed76cf100113cc008c119daf4fea2f9f43c578002aef70c3ebf \ + --hash=sha256:81c443310831e29fabbd07b75ebbfa29d0740b56f5907c6af218482d51260431 \ + --hash=sha256:83066ffbf77a5f82b7e96e59bdccbdda203c8dccbfc3f9f0fdad3a08d0001d9c \ + --hash=sha256:8834ab7be6539f1bfadec7c8d12249bbbe6c2413b1d40ffc0ec408692232a0c6 \ + --hash=sha256:92df0e70b884f5da35f2e01489dca3c06a79962fb75636985f1e3a17aec66833 \ + --hash=sha256:9483aa336687463f5497dd37a070094f3dff55e2c888994f8440fcf426a1a844 \ + --hash=sha256:97a138fa875c6f281df7720dac742259e85518135cd0e3551aba1c628103d853 \ + --hash=sha256:9b50700785eccac0819bea794d968ed8f6055c88f29364776b7ea076ac105c5d \ + --hash=sha256:9b73cf0fe28009a04a35bb2522e4c5b5176cc148919431dcb73fdbdfaab15781 \ + --hash=sha256:9d5a369eb7ec5b2fdfa9927530b5259dd21893fa75d4e04a223332f61b84b586 \ + --hash=sha256:a094b7ce455ca341b59a0f6ce6be2e11411ba6e2860b9aa3dbb37468f23338f4 \ + --hash=sha256:a0d6252098e98129a1decb59b46920d4eca17b0395f3d71b0d327d086fefe77d \ + --hash=sha256:a1d856b0f4e4a33e31cdab5f50d0a14998f3a2d726a3fd5cb7c4d45a57b28d1b \ + --hash=sha256:a4ae2ea9afcfdd2b931ddcebf1cf82532162677e00326637b31ed5dff7d985ca \ + --hash=sha256:a5963b663da69ad25fa1559ee064584935570def665917918938c1f1289f5ebc \ + --hash=sha256:ad1c2c2baaba62823a7f348f469a967ece0062140ca39e7a48e4bbb1f20d54c4 \ + --hash=sha256:ae82507fe458f7c0c8227017f2158111a4c9e7ce94de05178894a7ea9fefc8a1 \ + --hash=sha256:af188f3305f0a65c3217c30c6d4c06891e79144076a91e8b454f14256acc7279 \ + --hash=sha256:af44bb7a1af163806bbb679eb8432fa7b4fb6d83a5d403b541b675dcd3798638 \ + --hash=sha256:b0174ca6f3018ddeaa49847f29b69612e590534c1d2186d54ab25161ecc42975 \ + --hash=sha256:b2b17855ed7f994f3f259cf2dfbfad78814538536fa1a91b50253d84d87fd88d \ + --hash=sha256:b2e54f4a2dc6edf0f5ea5b1d0a608d2af3dcb5aa8c8eeab9c8841b23e1b054fe \ + --hash=sha256:b6f4abde9a2946f57e8daaf1160b2351bcf64274ef539e6675c1d945dbd75e2a \ + 
--hash=sha256:b70c07409d465f3a8b34d52f863871fb8a00755370791d2bd1d4f82b3cdaf3d5 \ + --hash=sha256:bb465dd5825356c1191a038a86ee1b8166e3562d6e8add95eec04ab484cfb8a2 \ + --hash=sha256:c051f46ed1e13ba8214b334cbf21902102807582fbfaf0fef341b9e52f0fafbf \ + --hash=sha256:c1b20a5f4164cec7007be55c9cf18d2cd08ed7c3bf6769b3cd6d044ad888d74b \ + --hash=sha256:c86e9e82bfab579327dbe9b82c71475165fbc8b2134d24f9a3b2edaf200a5c3d \ + --hash=sha256:c9f32b96c700bb384f33f7cf07954bb609d35dd82752cef57fb2ee0968409169 \ + --hash=sha256:cce0ed8b3f64c71c140f0ec244e5fdf8ecf78ddf8d2e591d4a8b6aa1c1214235 \ + --hash=sha256:cdd7315314b0744a7dd506f3bd0f2cf90734181529cdcf75542ee35ad885cab7 \ + --hash=sha256:cf355fbf0d4275d86f9f57be705d8e5eaa7f8ddb12b24ced2ea6cbd68fdb14dc \ + --hash=sha256:d136fbf8ad4321716e44d6d6b3d8dffb4872626010884e07a1db54b7450836cf \ + --hash=sha256:d2c8e20487b3b73c1fa72c56f5c89430617296cd380373e7af3a538a82d4cd6d \ + --hash=sha256:d483cc23cc56ab32911ea0baa0d2d9ea7aa065987f47de847a0a93a58bf57905 \ + --hash=sha256:d5a6c4864bb6fa9fcf7b57a830d21aed69fd71742a5ebcdbafda476be673d212 \ + --hash=sha256:d714e002dd3638170fe7376dc1b686dbac9cb712cde3f7224440af722cc9866a \ + --hash=sha256:d73f14b86d0e2858ece6bf5807c9889670e392c001d414b4293d0d9b291942c3 \ + --hash=sha256:d88c63bd395c787b0aa81d8bbc22c1809f311032ce3e823a6517b711129818e4 \ + --hash=sha256:db608db98ccc21248370d30044a60843b3f0f3d34781ceeea67067c508cd5a28 \ + --hash=sha256:de004939fc3fd0c1200d26ea9264350bfe501ffbf46c8cf5dc7f345f2d87a7f1 \ + --hash=sha256:ded9e86397267732a0641d4776c7c663ea16b64d7dbc4d9cc6ad8536363a2d29 \ + --hash=sha256:e288f8a162d663916060beb5e8165a8551312b08efee9cf68302687471a6545d \ + --hash=sha256:e2a9e62647dc040a76d55563580bf3bb8fe1f5b6ead08447c2ed0d7786e5e794 \ + --hash=sha256:e3e44d08b61de0dd6f205528498f834a51a5c06689f8fb182fe26f3a3ce7dca9 \ + --hash=sha256:ea002088d5554fd75e619742cefc78b84a212ba21632e59931b3501f0cfc8f67 \ + --hash=sha256:eb7452849f6615871eabed6560ffedfe56bc8af31a823b6be4ce1e6ff0ab72c5 \ + 
--hash=sha256:ebcf34b69df4ca0eabaaaf4a3d890f637f355fed00ba806f7ebdd2d040658c26 \ + --hash=sha256:f24d5b9383318cbd1a5cd969377937d66cf0542f24aa728a4f49d9f98f9c0da8 \ + --hash=sha256:f33fbf96b52d51c23b6cff61f57816539c1c147db270cfc1cc3bc012f4a560a9 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # vllm shellingham==1.5.4 \ --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \ --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # typer six==1.16.0 \ --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # opencensus # python-dateutil smart-open==6.2.0 \ --hash=sha256:088bf00f9327c71e549bc2f86567d3320df5d89667f009ce1c16568976068ef7 \ --hash=sha256:1b4df5c8365218f3852c507451920ccad606c80b0acb4e67508e50ba9b5d2632 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt sniffio==1.3.1 \ --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # anyio # openai soundfile==0.13.1 \ @@ -3140,7 +3312,7 @@ soundfile==0.13.1 \ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # mistral-common 
soxr==0.5.0.post1 \ --hash=sha256:39e0f791ba178d69cd676485dbee37e75a34f20daa478d90341ecb7f6d9d690f \ @@ -3165,13 +3337,13 @@ soxr==0.5.0.post1 \ --hash=sha256:fcc049b0a151a65aa75b92f0ac64bb2dba785d16b78c31c2b94e68c141751d6d \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # mistral-common starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # fastapi # prometheus-fastapi-instrumentator @@ -3179,19 +3351,19 @@ sympy==1.14.0 \ --hash=sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517 \ --hash=sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # torch tensorboardx==2.6.2.2 \ --hash=sha256:160025acbf759ede23fd3526ae9d9bfbfd8b68eb16c38a010ebe326dc6395db8 \ --hash=sha256:c6476d7cd0d529b0b72f4acadb1269f9ed8b22f441e87a84f2a3b940bb87b666 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # scikit-image tiktoken==0.9.0 \ --hash=sha256:03935988a91d6d3216e2ec7c645afbb3d870b37bcb67ada1943ec48678e7ee33 \ @@ -3226,7 +3398,7 @@ tiktoken==0.9.0 \ 
--hash=sha256:f0968d5beeafbca2a72c595e8385a1a1f8af58feaebb02b227229b69ca5357fd \ --hash=sha256:f32cc56168eac4851109e9b5d327637f15fd662aa30dd79f964b7c39fbadd26e # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # mistral-common # vllm tokenizers==0.21.1 \ @@ -3246,7 +3418,7 @@ tokenizers==0.21.1 \ --hash=sha256:e78e413e9e668ad790a29456e677d9d3aa50a9ad311a40905d6861ba7692cf41 \ --hash=sha256:ed248ab5279e601a30a4d67bdb897ecbe955a50f1e7bb62bd99f07dd11c2f5b6 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # transformers # vllm torch==2.7.1+cu128 \ @@ -3269,7 +3441,7 @@ torch==2.7.1+cu128 \ --hash=sha256:e27e5f7e74179fb5d814a0412e5026e4b50c9e0081e9050bc4c28c992a276eb1 \ --hash=sha256:f112465fdf42eb1297c6dddda1a8b7f411914428b704e1b8a47870c52e290909 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # compressed-tensors # nixl # torchaudio @@ -3291,7 +3463,7 @@ torchaudio==2.7.1+cu128 \ --hash=sha256:b1e56a999a06a5deaebfb991dc676aaa60d98139907d99badbc6dca6456637ee \ --hash=sha256:cb435329019d441d8177db2d84e8d397881896d100efb4f4c15f0d3732f92a81 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm torchvision==0.22.1+cu128 \ --hash=sha256:02faf51fbf5070592768fa935327d13a484b745faef38b0fee01d85cfb35f5bc \ @@ -3307,45 +3479,44 @@ torchvision==0.22.1+cu128 \ --hash=sha256:eb784cc75a66f3336a04ff3a992bf74160842132db69e8bdbb58b5ab9422c345 \ --hash=sha256:f64ef9bb91d71ab35d8384912a19f7419e35928685bc67544d58f45148334373 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm tqdm==4.67.1 \ --hash=sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2 \ 
--hash=sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # gguf # huggingface-hub # openai # transformers # vllm -transformers==4.53.2 \ - --hash=sha256:6c3ed95edfb1cba71c4245758f1b4878c93bf8cde77d076307dacb2cbbd72be2 \ - --hash=sha256:db8f4819bb34f000029c73c3c557e7d06fc1b8e612ec142eecdae3947a9c78bf +transformers==4.55.2 \ + --hash=sha256:097e3c2e2c0c9681db3da9d748d8f9d6a724c644514673d0030e8c5a1109f1f1 \ + --hash=sha256:a45ec60c03474fd67adbce5c434685051b7608b3f4f167c25aa6aeb1cad16d4f # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt - # -r python/requirements/llm/llm-requirements.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # compressed-tensors # vllm # xgrammar triton==3.3.1 ; sys_platform == 'linux' \ --hash=sha256:b31e3aa26f8cb3cc5bf4e187bf737cbacf17311e1112b781d4a059353dfd731b # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # torch # xgrammar typer==0.12.3 \ --hash=sha256:070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914 \ --hash=sha256:49e73131481d804288ef62598d97a1ceef3058905aa536a1134f90891ba35482 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements/llm/llm-requirements.txt # -r python/requirements.txt # fastapi-cli typing-extensions==4.12.2 \ --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # fastapi # gymnasium # huggingface-hub @@ -3361,18 +3532,31 @@ typing-extensions==4.12.2 \ # referencing # torch # typer + # typing-inspection # vllm +typing-inspection==0.4.1 \ + 
--hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # pydantic +tzdata==2025.2 \ + --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # kombu urllib3==1.26.19 \ --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # requests uvicorn==0.22.0 \ --hash=sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8 \ --hash=sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # fastapi # fastapi-cli @@ -3415,19 +3599,27 @@ uvloop==0.21.0 ; platform_python_implementation != 'PyPy' and sys_platform != 'c --hash=sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816 \ --hash=sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # uvicorn +vine==5.1.0 \ + --hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # amqp + # celery + # kombu virtualenv==20.29.1 \ --hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ 
--hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt -vllm==0.10.0 \ - --hash=sha256:8ca37559d82b43b5e8c8248d2e4a1ecb51d6d4e5d517491d656df6491ed93dab \ - --hash=sha256:a44e9013db26082a82c3931ed8772ac884d6d60566d36ecdb0e8dc01c65b241a +vllm==0.10.1.1 \ + --hash=sha256:3099824ee4bdaa14c4c4f7178a092101a0ec206d4c9371edf295849b2b730a39 \ + --hash=sha256:8ca0dd985e1ceac8540e7719c654f1553b3ba8a43c685ac8d3fa1366ffb6443a # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements/llm/llm-requirements.txt watchfiles==0.19.0 \ --hash=sha256:0089c6dc24d436b373c3c57657bf4f9a453b13767150d17284fc6162b2791911 \ @@ -3453,10 +3645,16 @@ watchfiles==0.19.0 \ --hash=sha256:d9b073073e048081e502b6c6b0b88714c026a1a4c890569238d04aca5f9ca74b \ --hash=sha256:fac19dc9cbc34052394dbe81e149411a62e71999c0a19e1e09ce537867f95ae0 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # uvicorn # vllm +wcwidth==0.2.13 \ + --hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \ + --hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # prompt-toolkit websockets==15.0.1 \ --hash=sha256:0701bc3cfcb9164d04a14b149fd74be7347a530ad3bbf15ab2c678a2cd3dd9a2 \ --hash=sha256:0a34631031a8f05657e8e90903e656959234f3a04552259458aac0b0f9ae6fd9 \ @@ -3528,12 +3726,12 @@ websockets==15.0.1 \ --hash=sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f \ --hash=sha256:fcd5cf9e305d7b8338754470cf69cf81f420459dbae8a3b40cee57417f4614a7 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c 
python/deplocks/llm/rayllm_test_py311_cu128.lock # uvicorn xformers==0.0.31 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:b2ea87e0651f46164cb3cd74face021bd1654229ca4f8c0baa03b8c477515c7a # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm xgrammar==0.1.21 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:140628376fc701a535600dc64752603ddaed619461dc50669e90626e9f61b8aa \ @@ -3561,7 +3759,7 @@ xgrammar==0.1.21 ; platform_machine == 'aarch64' or platform_machine == 'arm64' --hash=sha256:f89d9ddb4d00fadcffa4bcabd0c3ae75d47c844c728bbb6be695056df3767524 \ --hash=sha256:f9247641c73eec6e972cec15156a8844957334204ba79ad1abdb0d7b03def8a1 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # vllm yarl==1.18.3 \ --hash=sha256:00e5a1fea0fd4f5bfa7440a47eff01d9822a65b4488f7cff83155a0f31a2ecba \ @@ -3647,15 +3845,15 @@ yarl==1.18.3 \ --hash=sha256:fe57328fbc1bfd0bd0514470ac692630f3901c0ee39052ae47acd1d90a436719 \ --hash=sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62 # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # aiohttp zipp==3.19.2 \ --hash=sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19 \ --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c # via - # -c python/requirements_compiled_rayllm_test_py311_cu128.txt + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # importlib-metadata # The following packages were excluded from the output: -# ray # setuptools +# ray diff --git a/python/requirements_compiled_rayllm_test_py311_cpu.txt b/python/deplocks/llm/rayllm_test_py311_cpu.lock similarity index 87% rename from python/requirements_compiled_rayllm_test_py311_cpu.txt rename to 
python/deplocks/llm/rayllm_test_py311_cpu.lock index 5c59d6a2d967..acbc353b49cb 100644 --- a/python/requirements_compiled_rayllm_test_py311_cpu.txt +++ b/python/deplocks/llm/rayllm_test_py311_cpu.lock @@ -1,19 +1,25 @@ # This file was autogenerated by uv via the following command: -# uv pip compile --generate-hashes --strip-extras --unsafe-package ray --unsafe-package grpcio-tools --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links -c python/requirements_compiled_ray_test_py311_cpu.txt python/requirements.txt python/requirements/cloud-requirements.txt python/requirements/base-test-requirements.txt python/requirements/llm/llm-requirements.txt python/requirements/llm/llm-test-requirements.txt -o python/requirements_compiled_rayllm_test_py311_cpu.txt +# uv pip compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --extra-index-url https://download.pytorch.org/whl/cpu --python-version=3.11 --unsafe-package ray --python-platform=linux -c python/deplocks/llm/ray_test_py311_cpu.lock python/requirements.txt python/requirements/cloud-requirements.txt python/requirements/base-test-requirements.txt python/requirements/llm/llm-requirements.txt python/requirements/llm/llm-test-requirements.txt -o python/deplocks/llm/rayllm_test_py311_cpu.lock --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cpu +adlfs==2023.8.0 \ + --hash=sha256:07e804f6df4593acfcaf01025b162e30ac13e523d3570279c98b2d91a18026d9 \ + --hash=sha256:3eb248a3c2a30b419f1147bd7676d156b5219f96ef7f11d47166afd2a3bdb07e + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # -r python/requirements/cloud-requirements.txt aiofiles==22.1.0 \ 
--hash=sha256:1142fa8e80dbae46bb6339573ad4c8c0841358f79c6eb50a493dceca14621bad \ --hash=sha256:9107f1ca0b2a5553987a94a3c9959fe5b491fdf731389aa5b7b1bd0733e32de6 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ypy-websocket aiohappyeyeballs==2.6.1 \ --hash=sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558 \ --hash=sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # aiohttp aiohttp==3.11.16 \ --hash=sha256:004511d3413737700835e949433536a2fe95a7d0297edd911a1e9705c5b5ea43 \ @@ -98,10 +104,11 @@ aiohttp==3.11.16 \ --hash=sha256:fb46bb0f24813e6cede6cc07b1961d4b04f331f7112a23b5e21f567da4ee50aa \ --hash=sha256:fd36c119c5d6551bce374fcb5c19269638f8d09862445f85a5a48596fd59f4bb # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements/llm/llm-test-requirements.txt # -r python/requirements.txt + # adlfs # aiohttp-cors # pytest-aiohttp # vllm @@ -109,41 +116,47 @@ aiohttp-cors==0.7.0 \ --hash=sha256:0451ba59fdf6909d0e2cd21e4c0a43752bc0703d33fc78ae94d9d9321710193e \ --hash=sha256:4d39c6d7100fd9764ed1caf8cebf0eb01bf5e3f24e2e073fda6234bc48b19f5d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt aiorwlock==1.3.0 \ --hash=sha256:45baf8e4fa9a23e0bb325fbd67da80de1fd7ae1d4f59a6381754c60cec7b289b \ --hash=sha256:83f12d87df4b9728a0b8fda1756585ab0d652b107bab59c6084e1b1ad692ab45 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt aiosignal==1.3.1 \ --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ 
--hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # aiohttp aiosqlite==0.19.0 \ --hash=sha256:95ee77b91c8d2808bd08a59fbebf66270e9090c3d92ffbf260dc0db0b979577d \ --hash=sha256:edba222e03453e094a3ce605db1b970c4b3376264e56f32e2a4959f948d66a96 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ypy-websocket alabaster==0.7.16 \ --hash=sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65 \ --hash=sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92 # via sphinx +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # kombu annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # pydantic anyio==3.7.1 \ --hash=sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780 \ --hash=sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # httpx # jupyter-server # openai @@ -153,7 +166,7 @@ argon2-cffi==23.1.0 \ --hash=sha256:879c3e79a2729ce768ebb7d36d4609e3a78a4ca2ec3a9f12286ca057e3d0db08 \ --hash=sha256:c670642b78ba29641818ab2e68bd4e6a78ba53b7eff7b4c3815ae16abf91c7ea # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-server # nbclassic # notebook @@ -180,13 +193,13 @@ 
argon2-cffi-bindings==21.2.0 \ --hash=sha256:f1152ac548bd5b8bcecfb0b0371f082037e47128653df2e8ba6e914d384f3c3e \ --hash=sha256:f9f8b450ed0547e3d473fdc8612083fd08dd2120d6ac8f73828df9b7d45bb351 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # argon2-cffi arrow==1.3.0 \ --hash=sha256:c728b120ebc00eb84e01882a6f5e7927a53960aa990ce7dd2b10f39005a67f80 \ --hash=sha256:d4540617648cb5f895730f1ad8c82a65f2dad0166f57b75f3ca54759c4d67a85 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # isoduration astor==0.8.1 \ --hash=sha256:070a54e890cefb5b3739d19f30f5a5ec840ffc9c50ffa7d23cc9fc1a38ebbfc5 \ @@ -196,13 +209,13 @@ asttokens==2.4.1 \ --hash=sha256:051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24 \ --hash=sha256:b03869718ba9a6eb027e134bfdf69f38a236d681c83c160d510768af11254ba0 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # stack-data attrs==25.1.0 \ --hash=sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e \ --hash=sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # aiohttp # jsonschema # referencing @@ -210,40 +223,49 @@ azure-common==1.1.28 \ --hash=sha256:4ac0cd3214e36b6a1b6a442686722a5d8cc449603aa833f3f0f40bda836704a3 \ --hash=sha256:5c12d3dcf4ec20599ca6b0d3e09e86e146353d443e7fcc050c9a19c1f9df20ad # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # smart-open azure-core==1.29.5 \ --hash=sha256:0fa04b7b1f7d44a4fb8468c4093deb2ea01fdf4faddbf802ed9205615f99d68c \ --hash=sha256:52983c89d394c6f881a121e5101c5fa67278ca3b1f339c8fb2ef39230c70e9ac # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c 
python/deplocks/llm/ray_test_py311_cpu.lock + # adlfs # azure-identity # azure-storage-blob # smart-open +azure-datalake-store==0.0.53 \ + --hash=sha256:05b6de62ee3f2a0a6e6941e6933b792b800c3e7f6ffce2fc324bc19875757393 \ + --hash=sha256:a30c902a6e360aa47d7f69f086b426729784e71c536f330b691647a51dc42b2b + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # adlfs azure-identity==1.17.1 \ --hash=sha256:32ecc67cc73f4bd0595e4f64b1ca65cd05186f4fe6f98ed2ae9f1aa32646efea \ --hash=sha256:db8d59c183b680e763722bfe8ebc45930e6c57df510620985939f7f3191e0382 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt + # adlfs azure-storage-blob==12.22.0 \ --hash=sha256:b3804bb4fe8ab1c32771fa464053da772a682c2737b19da438a3f4e5e3b3736e \ --hash=sha256:bb7d2d824ce3f11f14a27ee7d9281289f7e072ac8311c52e3652672455b7d5e8 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # adlfs # smart-open babel==2.13.1 \ --hash=sha256:33e0952d7dd6374af8dbf6768cc4ddf3ccfefc244f9986d4074704f2fbd18900 \ --hash=sha256:7077a4984b02b6727ac10f1f7294484f737443d7e2e66c5e4380e41a3ae0b4ed # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyterlab-server # sphinx backcall==0.2.0 \ --hash=sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e \ --hash=sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipython backoff==2.2.1 \ --hash=sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba \ @@ -253,8 +275,14 @@ beautifulsoup4==4.11.1 \ --hash=sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30 \ --hash=sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693 # via - # -c 
python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # nbconvert +billiard==4.2.1 \ + --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # celery blake3==1.0.4 \ --hash=sha256:00605aa59923205c6a4f21131840840eb2d9a754c59b163357d890566755b97a \ --hash=sha256:08f46c2f1c5f369f07409e3e4ff248bcb22617cd741f2224873d85982dd6034e \ @@ -346,20 +374,20 @@ bleach==6.1.0 \ --hash=sha256:0a31f1837963c41d46bbf1331b8778e1308ea0791db03cc4e7357b97cf42a8fe \ --hash=sha256:3225f354cfc436b9789c66c4ee030194bee0568fbf9cbdad3bc8b5c26c5f12b6 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # nbconvert -boto3==1.26.76 \ - --hash=sha256:30c7d967ed1c6b5a05643e42cae9d4d36c3f1cb6782637ddc7007a104cfd9027 \ - --hash=sha256:b4c2969b7677762914394b8273cc1905dfe5b71f250741c1a575487ae357e729 +boto3==1.29.7 \ + --hash=sha256:1eb4c548118b5fc5e018dee956fd33e6fb249cd1f2def85f1bba816aef4d9f3e \ + --hash=sha256:96e9890ebe7cd823b5f4976dd676e112c000c6528c28e20a2f274590589dd18b # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # smart-open -botocore==1.29.76 \ - --hash=sha256:70735b00cd529f152992231ca6757e458e5ec25db43767b3526e9a35b2f143b7 \ - --hash=sha256:c2f67b6b3f8acf2968eafca06526f07b9fb0d27bac4c68a635d51abb675134a7 +botocore==1.32.7 \ + --hash=sha256:58b33d02cafa23461c8a9d211b30e8cded992380a84de409379fd02811fa3e11 \ + --hash=sha256:c6795c731b04c8e3635588c44cfd1a4462fc5987859195522c96812cf3eceff9 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # boto3 # s3transfer @@ -367,7 +395,7 @@ 
cachetools==5.5.2 \ --hash=sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4 \ --hash=sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # google-auth # vllm cbor2==5.6.5 \ @@ -416,11 +444,17 @@ cbor2==5.6.5 \ --hash=sha256:fde21ac1cf29336a31615a2c469a9cb03cf0add3ae480672d4d38cda467d07fc \ --hash=sha256:fe11c2eb518c882cfbeed456e7a552e544893c17db66fe5d3230dbeaca6b615c # via vllm +celery==5.5.3 \ + --hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # -r python/requirements.txt certifi==2025.1.31 \ --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # httpcore # httpx @@ -479,8 +513,9 @@ cffi==1.16.0 \ --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # argon2-cffi-bindings + # azure-datalake-store # cryptography # soundfile charset-normalizer==3.3.2 \ @@ -575,30 +610,52 @@ charset-normalizer==3.3.2 \ --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # requests click==8.1.7 \ 
--hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ --hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt + # celery + # click-didyoumean + # click-plugins + # click-repl # ray # typer # uvicorn +click-didyoumean==0.3.1 \ + --hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # celery +click-plugins==1.1.1.2 \ + --hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # celery cloudpickle==2.2.0 \ --hash=sha256:3f4219469c55453cfe4737e564b67c2a149109dabf7f242478948b895f61106f \ --hash=sha256:7428798d5926d8fcbfd092d18d01a2a03daf8237d8fcdc8095d256b8490796f0 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # gymnasium # vllm colorama==0.4.6 \ --hash=sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44 \ --hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # halo # log-symbols @@ -606,13 +663,13 @@ colorful==0.5.5 \ 
--hash=sha256:62c187e27c1433db9463ff93b1451898d1e7e23a7e553583fd9daeb6325182e4 \ --hash=sha256:66f8c1264b2a26f7293b96a03bb7a76c4bc8b9634369a0bffdcd12d618056a1d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt comm==0.2.0 \ --hash=sha256:2da8d9ebb8dd7bfc247adaff99f24dce705638a8042b85cb995066793e391001 \ --hash=sha256:a517ea2ca28931c7007a7a99c562a0fa5883cfb48963140cf642c41c948498be # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipykernel # ipywidgets compressed-tensors==0.10.2 \ @@ -658,7 +715,7 @@ cryptography==44.0.3 \ --hash=sha256:fc3c9babc1e1faefd62704bb46a69f359a9819eb0292e40df3fb6e3574715cd4 \ --hash=sha256:fe19d8bc5536a91a24a8133328880a41831b6c5df54599a8417b62fe015d3053 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # azure-identity # azure-storage-blob # msal @@ -678,7 +735,7 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ --hash=sha256:dac0284fecb90b5731f514e569a6fcf6674a730ae95b9490781a713b60a34423 \ --hash=sha256:e7a25ef1b44ae6276b5105affc2289edb34f1aa6676babd5bcd80907348c4cfa # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # ray debugpy==1.8.0 \ @@ -701,19 +758,19 @@ debugpy==1.8.0 \ --hash=sha256:ef54404365fae8d45cf450d0544ee40cefbcb9cb85ea7afe89a963c27028261e \ --hash=sha256:ef9ab7df0b9a42ed9c878afd3eaaff471fce3fa73df96022e1f5c9f8f8c87ada # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipykernel decorator==5.1.1 \ --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c 
python/deplocks/llm/ray_test_py311_cpu.lock # ipython defusedxml==0.7.1 \ --hash=sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69 \ --hash=sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # nbconvert depyf==0.19.0 \ --hash=sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5 \ @@ -731,7 +788,7 @@ distlib==0.3.7 \ --hash=sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057 \ --hash=sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # virtualenv distro==1.9.0 \ --hash=sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed \ @@ -785,7 +842,7 @@ dm-tree==0.1.8 \ --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt dnspython==2.7.0 \ --hash=sha256:b4c34b7d10b51bcc3a5071e7b8dee77939f1e878477eeecc965e9835f63c6c86 \ @@ -807,26 +864,26 @@ entrypoints==0.4 \ --hash=sha256:b706eddaa9218a19ebcd67b56818f05bb27589b1ca9e8d797b74affad4ccacd4 \ --hash=sha256:f174b5ff827504fd3cd97cc3f8649f3693f51538c7e4bdf3ef002c8429d42f9f # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-client # nbconvert executing==2.0.1 \ --hash=sha256:35afe2ce3affba8ee97f2d69927fa823b08b472b7b994e36a52a964b93d16147 \ --hash=sha256:eac49ca94516ccc753f9fb5ce82603156e590b27525a8bc32cce8ae302eb61bc # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # stack-data 
farama-notifications==0.0.4 \ --hash=sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18 \ --hash=sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # gymnasium fastapi==0.115.12 \ --hash=sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681 \ --hash=sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # vllm fastapi-cli==0.0.5 \ @@ -837,7 +894,7 @@ fastjsonschema==2.19.0 \ --hash=sha256:b9fd1a2dd6971dbc7fee280a95bd199ae0dd9ce22beb91cc75e9c1c528a5170e \ --hash=sha256:e25df6647e1bc4a26070b700897b07b542ec898dd4f1f6ea013e7f6a88417225 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # nbformat fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:067edb0a0805bf61e17a251d5046af59f6e9d2b8ad01222e0ef7a0b7937d5548 \ @@ -916,13 +973,13 @@ fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:fb5363cf0fddd9b50525ddbf64a1e1b28ec4c6dfb28670a940cb1cf988a6786b \ --hash=sha256:ff75c90663d6e8996610d435e71487daa853871ad1770dd83dc0f2fc4997241e # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # cupy-cuda12x filelock==3.17.0 \ --hash=sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338 \ --hash=sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # huggingface-hub # ray @@ -934,7 +991,7 @@ fqdn==1.5.1 \ --hash=sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f \ 
--hash=sha256:3a179af3761e4df6eb2e026ff9e1a3033d3587bf980a0b1b2e1e5d08d7358014 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jsonschema frozenlist==1.4.1 \ --hash=sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7 \ @@ -1015,15 +1072,16 @@ frozenlist==1.4.1 \ --hash=sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced \ --hash=sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # aiohttp # aiosignal -fsspec==2023.5.0 \ - --hash=sha256:51a4ad01a5bb66fcc58036e288c0d53d3975a0df2a5dc59a93b59bade0391f2a \ - --hash=sha256:b3b56e00fb93ea321bc9e5d9cf6f8522a0198b20eb24e02774d329e9c6fb84ce +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt + # adlfs # huggingface-hub # torch gguf==0.16.2 \ @@ -1034,19 +1092,19 @@ gitdb==4.0.11 \ --hash=sha256:81a3407ddd2ee8df444cbacea00e2d038e40150acfa3001696fe0dcf1d3adfa4 \ --hash=sha256:bf5421126136d6d0af55bc1e7c1af1c397a34f5b7bd79e776cd3e89785c2b04b # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # gitpython gitpython==3.1.44 \ --hash=sha256:9e0e10cda9bed1ee64bc9a6de50e7e38a9c9943241cd7f585f6df3ed28011110 \ --hash=sha256:c87e30b26253bf5418b01b0660f818967f3c503193838337fe5e573331249269 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt google-api-core==2.24.2 \ --hash=sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9 \ 
--hash=sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # google-cloud-core # google-cloud-storage # opencensus @@ -1054,7 +1112,7 @@ google-auth==2.23.4 \ --hash=sha256:79905d6b1652187def79d491d6e23d0cbb3a21d3c7ba0dbaa9c8a01906b13ff3 \ --hash=sha256:d4bbc92fe4b8bfd2f3e8d88e5ba7085935da208ee38a134fc280e7ce682a05f2 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # google-api-core # google-cloud-core @@ -1063,13 +1121,13 @@ google-cloud-core==2.4.1 \ --hash=sha256:9b7749272a812bde58fff28868d0c5e2f585b82f37e09a1f6ed2d4d10f134073 \ --hash=sha256:a9e6a4422b9ac5c29f79a0ede9485473338e2ce78d91f2370c01e730eab22e61 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # google-cloud-storage google-cloud-storage==2.14.0 \ --hash=sha256:2d23fcf59b55e7b45336729c148bb1c464468c69d5efbaee30f7201dd90eb97e \ --hash=sha256:8641243bbf2a2042c16a6399551fbb13f062cbc9a2de38d6c0bb5426962e9dbd # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # smart-open google-crc32c==1.5.0 \ @@ -1142,100 +1200,148 @@ google-crc32c==1.5.0 \ --hash=sha256:fd8536e902db7e365f49e7d9029283403974ccf29b13fc7028b97e2295b33556 \ --hash=sha256:fe70e325aa68fa4b5edf7d1a4b6f691eb04bbccac0ace68e34820d283b5f80d4 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # google-cloud-storage # google-resumable-media google-resumable-media==2.6.0 \ --hash=sha256:972852f6c65f933e15a4a210c2b96930763b47197cdf4aa5f5bea435efb626e7 \ --hash=sha256:fc03d344381970f79eebb632a3c18bb1828593a2dc5572b5f90115ef7d11e81b # via - # -c 
python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # google-cloud-storage googleapis-common-protos==1.61.0 \ --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # google-api-core -grpcio==1.66.2 \ - --hash=sha256:02697eb4a5cbe5a9639f57323b4c37bcb3ab2d48cec5da3dc2f13334d72790dd \ - --hash=sha256:03b0b307ba26fae695e067b94cbb014e27390f8bc5ac7a3a39b7723fed085604 \ - --hash=sha256:05bc2ceadc2529ab0b227b1310d249d95d9001cd106aa4d31e8871ad3c428d73 \ - --hash=sha256:06de8ec0bd71be123eec15b0e0d457474931c2c407869b6c349bd9bed4adbac3 \ - --hash=sha256:0be4e0490c28da5377283861bed2941d1d20ec017ca397a5df4394d1c31a9b50 \ - --hash=sha256:12fda97ffae55e6526825daf25ad0fa37483685952b5d0f910d6405c87e3adb6 \ - --hash=sha256:1caa38fb22a8578ab8393da99d4b8641e3a80abc8fd52646f1ecc92bcb8dee34 \ - --hash=sha256:2018b053aa15782db2541ca01a7edb56a0bf18c77efed975392583725974b249 \ - --hash=sha256:20657d6b8cfed7db5e11b62ff7dfe2e12064ea78e93f1434d61888834bc86d75 \ - --hash=sha256:2335c58560a9e92ac58ff2bc5649952f9b37d0735608242973c7a8b94a6437d8 \ - --hash=sha256:31fd163105464797a72d901a06472860845ac157389e10f12631025b3e4d0453 \ - --hash=sha256:38b68498ff579a3b1ee8f93a05eb48dc2595795f2f62716e797dc24774c1aaa8 \ - --hash=sha256:3b00efc473b20d8bf83e0e1ae661b98951ca56111feb9b9611df8efc4fe5d55d \ - --hash=sha256:3ed71e81782966ffead60268bbda31ea3f725ebf8aa73634d5dda44f2cf3fb9c \ - --hash=sha256:45a3d462826f4868b442a6b8fdbe8b87b45eb4f5b5308168c156b21eca43f61c \ - --hash=sha256:49f0ca7ae850f59f828a723a9064cadbed90f1ece179d375966546499b8a2c9c \ - --hash=sha256:4e504572433f4e72b12394977679161d495c4c9581ba34a88d843eaf0f2fbd39 \ - --hash=sha256:4ea1d062c9230278793820146c95d038dc0f468cbdd172eec3363e42ff1c7d01 \ - 
--hash=sha256:563588c587b75c34b928bc428548e5b00ea38c46972181a4d8b75ba7e3f24231 \ - --hash=sha256:6001e575b8bbd89eee11960bb640b6da6ae110cf08113a075f1e2051cc596cae \ - --hash=sha256:66a0cd8ba6512b401d7ed46bb03f4ee455839957f28b8d61e7708056a806ba6a \ - --hash=sha256:6851de821249340bdb100df5eacfecfc4e6075fa85c6df7ee0eb213170ec8e5d \ - --hash=sha256:728bdf36a186e7f51da73be7f8d09457a03061be848718d0edf000e709418987 \ - --hash=sha256:73e3b425c1e155730273f73e419de3074aa5c5e936771ee0e4af0814631fb30a \ - --hash=sha256:73fc8f8b9b5c4a03e802b3cd0c18b2b06b410d3c1dcbef989fdeb943bd44aff7 \ - --hash=sha256:78fa51ebc2d9242c0fc5db0feecc57a9943303b46664ad89921f5079e2e4ada7 \ - --hash=sha256:7b2c86457145ce14c38e5bf6bdc19ef88e66c5fee2c3d83285c5aef026ba93b3 \ - --hash=sha256:7d69ce1f324dc2d71e40c9261d3fdbe7d4c9d60f332069ff9b2a4d8a257c7b2b \ - --hash=sha256:802d84fd3d50614170649853d121baaaa305de7b65b3e01759247e768d691ddf \ - --hash=sha256:80fd702ba7e432994df208f27514280b4b5c6843e12a48759c9255679ad38db8 \ - --hash=sha256:8ac475e8da31484efa25abb774674d837b343afb78bb3bcdef10f81a93e3d6bf \ - --hash=sha256:950da58d7d80abd0ea68757769c9db0a95b31163e53e5bb60438d263f4bed7b7 \ - --hash=sha256:99a641995a6bc4287a6315989ee591ff58507aa1cbe4c2e70d88411c4dcc0839 \ - --hash=sha256:9c3a99c519f4638e700e9e3f83952e27e2ea10873eecd7935823dab0c1c9250e \ - --hash=sha256:9c509a4f78114cbc5f0740eb3d7a74985fd2eff022971bc9bc31f8bc93e66a3b \ - --hash=sha256:a18e20d8321c6400185b4263e27982488cb5cdd62da69147087a76a24ef4e7e3 \ - --hash=sha256:a917d26e0fe980b0ac7bfcc1a3c4ad6a9a4612c911d33efb55ed7833c749b0ee \ - --hash=sha256:a9539f01cb04950fd4b5ab458e64a15f84c2acc273670072abe49a3f29bbad54 \ - --hash=sha256:ad2efdbe90c73b0434cbe64ed372e12414ad03c06262279b104a029d1889d13e \ - --hash=sha256:b672abf90a964bfde2d0ecbce30f2329a47498ba75ce6f4da35a2f4532b7acbc \ - --hash=sha256:bbd27c24a4cc5e195a7f56cfd9312e366d5d61b86e36d46bbe538457ea6eb8dd \ - --hash=sha256:c400ba5675b67025c8a9f48aa846f12a39cf0c44df5cd060e23fda5b30e9359d \ - 
--hash=sha256:c408f5ef75cfffa113cacd8b0c0e3611cbfd47701ca3cdc090594109b9fcbaed \ - --hash=sha256:c806852deaedee9ce8280fe98955c9103f62912a5b2d5ee7e3eaa284a6d8d8e7 \ - --hash=sha256:ce89f5876662f146d4c1f695dda29d4433a5d01c8681fbd2539afff535da14d4 \ - --hash=sha256:d25a14af966438cddf498b2e338f88d1c9706f3493b1d73b93f695c99c5f0e2a \ - --hash=sha256:d8d4732cc5052e92cea2f78b233c2e2a52998ac40cd651f40e398893ad0d06ec \ - --hash=sha256:d9a9724a156c8ec6a379869b23ba3323b7ea3600851c91489b871e375f710bc8 \ - --hash=sha256:e636ce23273683b00410f1971d209bf3689238cf5538d960adc3cdfe80dd0dbd \ - --hash=sha256:e88264caad6d8d00e7913996030bac8ad5f26b7411495848cc218bd3a9040b6c \ - --hash=sha256:f145cc21836c332c67baa6fc81099d1d27e266401565bf481948010d6ea32d46 \ - --hash=sha256:fb57870449dfcfac428afbb5a877829fcb0d6db9d9baa1148705739e9083880e \ - --hash=sha256:fb70487c95786e345af5e854ffec8cb8cc781bcc5df7930c4fbb7feaa72e1cdf \ - --hash=sha256:fe96281713168a3270878255983d2cb1a97e034325c8c2c25169a69289d3ecfa \ - --hash=sha256:ff1f7882e56c40b0d33c4922c15dfa30612f05fb785074a012f7cda74d1c3679 - # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + --hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + 
--hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + --hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + 
--hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # grpcio-tools -gymnasium==1.0.0 \ - --hash=sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403 \ - --hash=sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad +grpcio-tools==1.62.3 \ + --hash=sha256:0a52cc9444df978438b8d2332c0ca99000521895229934a59f94f37ed896b133 \ + --hash=sha256:0a8c0c4724ae9c2181b7dbc9b186df46e4f62cb18dc184e46d06c0ebeccf569e \ + 
--hash=sha256:0cb3a3436ac119cbd37a7d3331d9bdf85dad21a6ac233a3411dff716dcbf401e \ + --hash=sha256:11c625eebefd1fd40a228fc8bae385e448c7e32a6ae134e43cf13bbc23f902b7 \ + --hash=sha256:11f363570dea661dde99e04a51bd108a5807b5df32a6f8bdf4860e34e94a4dbf \ + --hash=sha256:141d028bf5762d4a97f981c501da873589df3f7e02f4c1260e1921e565b376fa \ + --hash=sha256:1c989246c2aebc13253f08be32538a4039a64e12d9c18f6d662d7aee641dc8b5 \ + --hash=sha256:1da38070738da53556a4b35ab67c1b9884a5dd48fa2f243db35dc14079ea3d0c \ + --hash=sha256:27cd9ef5c5d68d5ed104b6dcb96fe9c66b82050e546c9e255716903c3d8f0373 \ + --hash=sha256:2e02d3b96f2d0e4bab9ceaa30f37d4f75571e40c6272e95364bff3125a64d184 \ + --hash=sha256:2f968b049c2849540751ec2100ab05e8086c24bead769ca734fdab58698408c1 \ + --hash=sha256:350a80485e302daaa95d335a931f97b693e170e02d43767ab06552c708808950 \ + --hash=sha256:3eae6ea76d62fcac091e1f15c2dcedf1dc3f114f8df1a972a8a0745e89f4cf61 \ + --hash=sha256:47a5c093ab256dec5714a7a345f8cc89315cb57c298b276fa244f37a0ba507f0 \ + --hash=sha256:5782883a27d3fae8c425b29a9d3dcf5f47d992848a1b76970da3b5a28d424b26 \ + --hash=sha256:6a56d344b0bab30bf342a67e33d386b0b3c4e65868ffe93c341c51e1a8853ca5 \ + --hash=sha256:6c3064610826f50bd69410c63101954676edc703e03f9e8f978a135f1aaf97c1 \ + --hash=sha256:703f46e0012af83a36082b5f30341113474ed0d91e36640da713355cd0ea5d23 \ + --hash=sha256:710fecf6a171dcbfa263a0a3e7070e0df65ba73158d4c539cec50978f11dad5d \ + --hash=sha256:7c7136015c3d62c3eef493efabaf9e3380e3e66d24ee8e94c01cb71377f57833 \ + --hash=sha256:7cc83023acd8bc72cf74c2edbe85b52098501d5b74d8377bfa06f3e929803492 \ + --hash=sha256:7f2483ea232bd72d98a6dc6d7aefd97e5bc80b15cd909b9e356d6f3e326b6e43 \ + --hash=sha256:7ff7d58a45b75df67d25f8f144936a3e44aabd91afec833ee06826bd02b7fbe7 \ + --hash=sha256:8ad0473af5544f89fc5a1ece8676dd03bdf160fb3230f967e05d0f4bf89620e3 \ + --hash=sha256:8c5d22b252dcef11dd1e0fbbe5bbfb9b4ae048e8880d33338215e8ccbdb03edc \ + --hash=sha256:8e62cc7164b0b7c5128e637e394eb2ef3db0e61fc798e80c301de3b2379203ed \ + 
--hash=sha256:962c84b4da0f3b14b3cdb10bc3837ebc5f136b67d919aea8d7bb3fd3df39528a \ + --hash=sha256:ace43b26d88a58dcff16c20d23ff72b04d0a415f64d2820f4ff06b1166f50557 \ + --hash=sha256:b47d0dda1bdb0a0ba7a9a6de88e5a1ed61f07fad613964879954961e36d49193 \ + --hash=sha256:b77f9f9cee87cd798f0fe26b7024344d1b03a7cd2d2cba7035f8433b13986325 \ + --hash=sha256:b881fd9505a84457e9f7e99362eeedd86497b659030cf57c6f0070df6d9c2b9b \ + --hash=sha256:bfda6ee8990997a9df95c5606f3096dae65f09af7ca03a1e9ca28f088caca5cf \ + --hash=sha256:c3a1ac9d394f8e229eb28eec2e04b9a6f5433fa19c9d32f1cb6066e3c5114a1d \ + --hash=sha256:c8ad5cce554e2fcaf8842dee5d9462583b601a3a78f8b76a153c38c963f58c10 \ + --hash=sha256:ca246dffeca0498be9b4e1ee169b62e64694b0f92e6d0be2573e65522f39eea9 \ + --hash=sha256:ca4f5eeadbb57cf03317d6a2857823239a63a59cc935f5bd6cf6e8b7af7a7ecc \ + --hash=sha256:d102b9b21c4e1e40af9a2ab3c6d41afba6bd29c0aa50ca013bf85c99cdc44ac5 \ + --hash=sha256:db3bc9fa39afc5e4e2767da4459df82b095ef0cab2f257707be06c44a1c2c3e5 \ + --hash=sha256:dc9ad9950119d8ae27634e68b7663cc8d340ae535a0f80d85a55e56a6973ab1f \ + --hash=sha256:e02d7c1a02e3814c94ba0cfe43d93e872c758bd8fd5c2797f894d0c49b4a1dfc \ + --hash=sha256:e0898d412a434e768a0c7e365acabe13ff1558b767e400936e26b5b6ed1ee51f \ + --hash=sha256:e18e15287c31baf574fcdf8251fb7f997d64e96c6ecf467906e576da0a079af6 \ + --hash=sha256:ec279dcf3518201fc592c65002754f58a6b542798cd7f3ecd4af086422f33f29 \ + --hash=sha256:ec6fbded0c61afe6f84e3c2a43e6d656791d95747d6d28b73eff1af64108c434 \ + --hash=sha256:eec73a005443061f4759b71a056f745e3b000dc0dc125c9f20560232dfbcbd14 \ + --hash=sha256:f3d812daffd0c2d2794756bd45a353f89e55dc8f91eb2fc840c51b9f6be62667 \ + --hash=sha256:f4b1615adf67bd8bb71f3464146a6f9949972d06d21a4f5e87e73f6464d97f57 \ + --hash=sha256:f6831fdec2b853c9daa3358535c55eed3694325889aa714070528cf8f92d7d6d + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # -r python/requirements/cloud-requirements.txt +gymnasium==1.1.1 \ + 
--hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt h11==0.16.0 \ --hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \ --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # httpcore # uvicorn halo==0.0.31 \ --hash=sha256:5350488fb7d2aa7c31a1344120cee67a872901ce8858f60da7946cef96c208ab \ --hash=sha256:7b67a3521ee91d53b7152d4ee3452811e1d2a6321975137762eb3d70063cc9d6 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt hf-transfer==0.1.9 \ --hash=sha256:035572865dab29d17e783fbf1e84cf1cb24f3fcf8f1b17db1cfc7fdf139f02bf \ @@ -1264,7 +1370,7 @@ hf-transfer==0.1.9 \ --hash=sha256:ee8b10afedcb75f71091bcc197c526a6ebf5c58bbbadb34fdeee6160f55f619f \ --hash=sha256:fc6bd19e1cc177c66bdef15ef8636ad3bde79d5a4f608c158021153b4573509d # via -r python/requirements/llm/llm-requirements.txt -hf-xet==1.1.5 \ +hf-xet==1.1.5 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:69ebbcfd9ec44fdc2af73441619eeb06b94ee34511bbcf57cd423820090f5694 \ --hash=sha256:73e167d9807d166596b4b2f0b585c6d5bd84a26dea32843665a8b58f6edba245 \ --hash=sha256:83088ecea236d5113de478acb2339f92c95b4fb0462acaa30621fac02f5a534a \ @@ -1282,7 +1388,7 @@ httplib2==0.20.4 \ --hash=sha256:58a98e45b4b1a48273073f905d2961666ecf0fbac4250ea5b47aef259eb5c585 \ --hash=sha256:8b6a905cb1c79eefd03f8669fd993c36dc341f7c558f056cb5a33b5c2f458543 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c 
python/deplocks/llm/ray_test_py311_cpu.lock # oauth2client httptools==0.6.4 \ --hash=sha256:0614154d5454c21b6410fdf5262b4a3ddb0f53f1e1721cfd59d55f32138c578a \ @@ -1342,18 +1448,17 @@ huggingface-hub==0.34.3 \ # via # tokenizers # transformers - # vllm humanize==4.12.1 \ --hash=sha256:1338ba97415c96556758a6e2f65977ed406dddf4620d4c6db9bbdfd07f0f1232 \ --hash=sha256:86014ca5c52675dffa1d404491952f1f5bf03b07c175a51891a343daebf01fea # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt idna==3.7 \ --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # anyio # email-validator # httpx @@ -1364,7 +1469,7 @@ imageio==2.34.2 \ --hash=sha256:5c0c0ee8faa018a1c42f649b90395dd4d3bb6187c09053a0cd6f1fdd51bbff5e \ --hash=sha256:a0bb27ec9d5bab36a9f4835e51b21d2cb099e1f78451441f94687ff3404b79f8 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # scikit-image imagesize==1.4.1 \ --hash=sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b \ @@ -1374,13 +1479,13 @@ importlib-metadata==6.11.0 \ --hash=sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443 \ --hash=sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # opentelemetry-api iniconfig==2.0.0 \ --hash=sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3 \ --hash=sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # 
pytest interegular==0.3.3 \ --hash=sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c \ @@ -1390,14 +1495,14 @@ ipykernel==6.27.1 \ --hash=sha256:7d5d594b6690654b4d299edba5e872dc17bb7396a8d0609c97cb7b8a1c605de6 \ --hash=sha256:dab88b47f112f9f7df62236511023c9bdeef67abc73af7c652e4ce4441601686 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # nbclassic # notebook ipython==8.12.3 \ --hash=sha256:3910c4b54543c2ad73d06579aa771041b7d5707b033bd488669b4cf544e3b363 \ --hash=sha256:b0340d46a933d27c657b211a329d0be23793c36595acf9e6ef4164bc01a1804c # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipykernel # ipywidgets # jupyterlab @@ -1405,38 +1510,38 @@ ipython-genutils==0.2.0 \ --hash=sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8 \ --hash=sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # nbclassic # notebook ipywidgets==8.1.3 \ --hash=sha256:efafd18f7a142248f7cb0ba890a68b96abd4d6e88ddbda483c9130d12667eaf2 \ --hash=sha256:f5f9eeaae082b1823ce9eac2575272952f40d748893972956dc09700a6392d9c # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt isodate==0.6.1 \ --hash=sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96 \ --hash=sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # azure-storage-blob isoduration==20.11.0 \ --hash=sha256:ac2f9015137935279eac671f94f89eb00584f940f5dc49462a0c4ee692ba1bd9 \ --hash=sha256:b2904c2a4228c3d44f409c8ae8e2370eb21a26f7ac2ec5446df141dde3452042 # via - # -c 
python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jsonschema jedi==0.19.1 \ --hash=sha256:cf0496f3651bc65d7174ac1b7d043eff454892c708a87d1b683e57b569927ffd \ --hash=sha256:e983c654fe5c02867aef4cdfce5a2fbb4a50adc0af145f70504238f18ef5e7e0 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipython jinja2==3.1.6 \ --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \ --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # fastapi # jupyter-server # jupyterlab @@ -1529,26 +1634,26 @@ jmespath==1.0.1 \ --hash=sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980 \ --hash=sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # boto3 # botocore json5==0.9.14 \ --hash=sha256:740c7f1b9e584a468dbb2939d8d458db3427f2c93ae2139d05f47e453eae964f \ --hash=sha256:9ed66c3a6ca3510a976a9ef9b8c0787de24802724ab1860bc0153c7fdd589b02 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyterlab-server jsonpatch==1.32 \ --hash=sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397 \ --hash=sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt jsonpointer==2.4 \ --hash=sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a \ --hash=sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c 
python/deplocks/llm/ray_test_py311_cpu.lock # jsonpatch # jsonschema jsonref==1.1.0 \ @@ -1559,7 +1664,7 @@ jsonschema==4.23.0 \ --hash=sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4 \ --hash=sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements/llm/llm-requirements.txt # -r python/requirements.txt @@ -1572,13 +1677,13 @@ jsonschema-specifications==2024.10.1 \ --hash=sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272 \ --hash=sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jsonschema jupyter-client==7.3.4 \ --hash=sha256:17d74b0d0a7b24f1c8c527b24fcf4607c56bee542ffe8e3418e50b21e514b621 \ --hash=sha256:aa9a6c32054b290374f95f73bb0cae91455c58dfb84f65c8591912b8f65e6d56 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipykernel # jupyter-server # nbclassic @@ -1588,7 +1693,7 @@ jupyter-core==5.5.0 \ --hash=sha256:880b86053bf298a8724994f95e99b99130659022a4f7f45f563084b6223861d3 \ --hash=sha256:e11e02cd8ae0a9de5c6c44abf5727df9f2581055afe00b22183f621ba3585805 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipykernel # jupyter-client # jupyter-server @@ -1601,13 +1706,13 @@ jupyter-events==0.6.3 \ --hash=sha256:57a2749f87ba387cd1bfd9b22a0875b889237dbf2edc2121ebb22bde47036c17 \ --hash=sha256:9a6e9995f75d1b7146b436ea24d696ce3a35bfa8bfe45e0c33c334c79464d0b3 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-server-fileid jupyter-server==1.24.0 \ 
--hash=sha256:23368e8e214baf82b313d4c5a0d828ca73015e1a192ce3829bd74e62fab8d046 \ --hash=sha256:c88ddbe862966ea1aea8c3ccb89a5903abd8fbcfe5cd14090ef549d403332c37 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-server-fileid # jupyterlab # jupyterlab-server @@ -1617,49 +1722,55 @@ jupyter-server-fileid==0.9.0 \ --hash=sha256:171538b7c7d08d11dbc57d4e6da196e0c258e4c2cd29249ef1e032bb423677f8 \ --hash=sha256:5b489c6fe6783c41174a728c7b81099608518387e53c3d53451a67f46a0cb7b0 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-server-ydoc jupyter-server-ydoc==0.6.1 \ --hash=sha256:18275ff1ce7e93bbda2301ca066273b3951fc50b0d9c8fc33788374134ad7920 \ --hash=sha256:ab10864708c81fa41ab9f2ed3626b54ff6926eaf14545d1d439714978dad6e9f # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyterlab jupyter-ydoc==0.2.5 \ --hash=sha256:5759170f112c70320a84217dd98d287699076ae65a7f88d458d57940a9f2b882 \ --hash=sha256:5a02ca7449f0d875f73e8cb8efdf695dddef15a8e71378b1f4eda6b7c90f5382 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-server-ydoc # jupyterlab jupyterlab==3.6.1 \ --hash=sha256:ad6707dd0149b629d0ed5b56916cfcdb816b376c6af3190337faba09e27ea29e \ --hash=sha256:aee98c174180e98a30470297d10b959e8e64f2288970c0de65f0a6d2b4807034 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt jupyterlab-pygments==0.3.0 \ --hash=sha256:721aca4d9029252b11cfa9d185e5b5af4d54772bb8072f9b7036f4170054d35d \ --hash=sha256:841a89020971da1d8693f1a99997aefc5dc424bb1b251fd6322462a1b8842780 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # nbconvert 
jupyterlab-server==2.24.0 \ --hash=sha256:4e6f99e0a5579bbbc32e449c4dbb039561d4f1a7827d5733273ed56738f21f07 \ --hash=sha256:5f077e142bb8dc9b843d960f940c513581bceca3793a0d80f9c67d9522c4e876 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyterlab jupyterlab-widgets==3.0.11 \ --hash=sha256:78287fd86d20744ace330a61625024cf5521e1c012a352ddc0a3cdc2348becd0 \ --hash=sha256:dd5ac679593c969af29c9bed054c24f26842baa51352114736756bc035deee27 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipywidgets jupytext==1.16.7 \ --hash=sha256:912f9d9af7bd3f15470105e5c5dddf1669b2d8c17f0c55772687fc5a4a73fe69 \ --hash=sha256:fc4e97f0890e22062c4ef10313c7ca960b07b3767246a1fef7585888cc2afe5d # via -r python/requirements/llm/llm-test-requirements.txt +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # celery lark==1.2.2 \ --hash=sha256:c2276486b02f0f1b90be155f2c8ba4a8e194d42775786db622faccd652d8e80c \ --hash=sha256:ca807d0162cd16cef15a8feecb862d7319e7a09bdb13aef927968e45040fed80 @@ -1668,7 +1779,7 @@ lazy-loader==0.4 \ --hash=sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # scikit-image llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ @@ -1711,7 +1822,7 @@ log-symbols==0.0.14 \ --hash=sha256:4952106ff8b605ab7d5081dd2c7e6ca7374584eff7086f499c06edd1ce56dcca \ 
--hash=sha256:cf0bbc6fe1a8e53f0d174a716bc625c4f87043cc21eb55dd8a740cfe22680556 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # halo lxml==4.9.4 \ --hash=sha256:00e91573183ad273e242db5585b52670eddf92bacad095ce25c1e682da14ed91 \ @@ -1808,7 +1919,7 @@ lxml==4.9.4 \ --hash=sha256:fd814847901df6e8de13ce69b84c31fc9b3fb591224d6762d0b256d510cbf382 \ --hash=sha256:fdb325b7fba1e2c40b9b1db407f85642e32404131c08480dd652110fc908561b # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # nbconvert lz4==4.3.3 \ --hash=sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e \ @@ -1848,13 +1959,13 @@ lz4==4.3.3 \ --hash=sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0 \ --hash=sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt markdown-it-py==2.2.0 \ --hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupytext # mdit-py-plugins # rich @@ -1920,14 +2031,14 @@ markupsafe==2.1.3 \ --hash=sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2 \ --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jinja2 # nbconvert matplotlib-inline==0.1.6 \ --hash=sha256:f1f41aab5328aa5aaea9b16d083b128102f8712542f819fe7e6a420ff581b311 \ --hash=sha256:f887e5f10ba98e8d2b150ddcf4702c1e5f8b3a20005eb0f74bfdbd360ee6f304 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c 
python/deplocks/llm/ray_test_py311_cpu.lock # ipykernel # ipython mdit-py-plugins==0.4.2 \ @@ -1938,7 +2049,7 @@ mdurl==0.1.2 \ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # markdown-it-py memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:0a21745fb516b7a6efcd40aa7487c59e9313fcfc782d0193fcfcf00b48426874 \ @@ -1977,7 +2088,7 @@ memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:e356af93e3b031c83957e9ac1a653f5aaba5df1e357dd17142f5ed19bb3dc660 \ --hash=sha256:f16c5c8730b616613dc8bafe32649ca6bd7252606251eb00148582011758d0b5 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt meson==1.8.3 \ --hash=sha256:ef02b806ce0c5b6becd5bb5dc9fa67662320b29b337e7ace73e4354500590233 \ @@ -1991,7 +2102,7 @@ mistune==0.8.4 \ --hash=sha256:59a3429db53c50b5c6bcc8a07f8848cb00d7dc8bdb431a4ab41920d201d4756e \ --hash=sha256:88a1051873018da288eee8538d476dffe1262495144b33ecb586c4ab266bb8d4 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # nbconvert mpmath==1.3.0 \ --hash=sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f \ @@ -2001,14 +2112,15 @@ msal==1.28.1 \ --hash=sha256:563c2d70de77a2ca9786aab84cb4e133a38a6897e6676774edc23d610bfc9e7b \ --hash=sha256:d72bbfe2d5c2f2555f4bc6205be4450ddfd12976610dd9a16a9ab0f05c68b64d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # azure-datalake-store # azure-identity # msal-extensions msal-extensions==1.2.0b1 \ --hash=sha256:217f391bb549de11b19abe8029a8375fe3ca0556aa8cce004b2083f00a569b71 \ --hash=sha256:3658b3814cd6a7759e83cb0ec145f30330ee249a92444adaf9aa4eb4f5bbcbbc # 
via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # azure-identity msgpack==1.0.7 \ --hash=sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862 \ @@ -2068,7 +2180,7 @@ msgpack==1.0.7 \ --hash=sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002 \ --hash=sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # ray msgspec==0.19.0 \ @@ -2201,27 +2313,27 @@ multidict==6.0.5 \ --hash=sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423 \ --hash=sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # aiohttp # yarl nbclassic==1.0.0 \ --hash=sha256:0ae11eb2319455d805596bf320336cda9554b41d99ab9a3c31bf8180bffa30e3 \ --hash=sha256:f99e4769b4750076cd4235c044b61232110733322384a94a63791d2e7beacc66 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyterlab # notebook nbclient==0.5.13 \ --hash=sha256:40c52c9b5e3c31faecaee69f202b3f53e38d7c1c563de0fadde9d7eda0fdafe8 \ --hash=sha256:47ac905af59379913c1f8f541098d2550153cf8dc58553cbe18c702b181518b0 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # nbconvert nbconvert==6.5.4 \ --hash=sha256:9e3c7c6d491374cbdd5f35d268c05809357716d346f4573186bbeab32ee50bc1 \ --hash=sha256:d679a947f849a966cbbd0bf6e7fedcfdb64be3b20ce7cef11ad55c13f5820e19 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-server # nbclassic # notebook @@ -2229,7 +2341,7 @@ nbformat==5.9.2 \ 
--hash=sha256:1c5172d786a41b82bcfd0c23f9e6b6f072e8fb49c39250219e4acfff1efe89e9 \ --hash=sha256:5f98b5ba1997dff175e77e0c17d5c10a96eaed2cbd1de3533d1fc35d5e111192 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-server # jupytext # nbclassic @@ -2240,7 +2352,7 @@ nest-asyncio==1.5.8 \ --hash=sha256:25aa2ca0d2a5b5531956b9e273b45cf664cae2b145101d73b86b199978d48fdb \ --hash=sha256:accda7a339a70599cb08f9dd09a67e0c2ef8d8d6f4c07f96ab203f2ae254e48d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipykernel # jupyter-client # nbclassic @@ -2250,7 +2362,7 @@ networkx==3.2.1 \ --hash=sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6 \ --hash=sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # scikit-image # torch ninja==1.11.1.3 \ @@ -2275,23 +2387,27 @@ ninja==1.11.1.3 \ # -r python/requirements/llm/llm-requirements.txt # vllm # xgrammar -nixl==0.3.1 \ - --hash=sha256:20428ad2668062a79045fae83cc5cba1f4019d4a2c7053cc8549c3a1533f8a75 \ - --hash=sha256:70b8932b50ccf1a13ac8fa2e10a4b78290baae9f963bfecfa67684104331a94b \ - --hash=sha256:8c144839484b3076f0b34ad8ceaeaff05c23399cf57ca85f2a94b44e1475a39b \ - --hash=sha256:ff59996ad05a7e4ba6c8beba0f1d8ac2f9e53df696a15af0d3340028e2f16081 +nixl==0.4.1 \ + --hash=sha256:10c7b4a44f89c3fbff3e20cb84973be95f8df36ee336fb108275ed1839fec1f1 \ + --hash=sha256:510cc9e824ad53cac71ce55ff41160f2a9e1507ceb52eb871b775fe1e42beb87 \ + --hash=sha256:8a3d83b28c16b795bdc281f1489b9d247f6e6088ad96ca96406072a36d6354b7 \ + --hash=sha256:9381fd3986d227c7ccb2607c03bbea559ec80f951e2ea47c1fbf381e4cd97164 \ + --hash=sha256:9ab7e580e9962ebdcda8c17f8548858d3fdb648621367d8e717ca317b534b778 \ + --hash=sha256:db144821de7912cb2502052b3070a1ac276b8b019470e6efdfce9c237ffe130d 
\ + --hash=sha256:e33102b85b3f95a8c95e59b59b29aabd03d47b5bce619de506b9bb83739cf60d \ + --hash=sha256:f16092dd445542e82e3db3553f6c7697ec5a2e837f19d416401283ae245826f9 # via -r python/requirements/llm/llm-requirements.txt notebook==6.5.7 \ --hash=sha256:04eb9011dfac634fbd4442adaf0a8c27cd26beef831fe1d19faf930c327768e4 \ --hash=sha256:a6afa9a4ff4d149a0771ff8b8c881a7a73b3835f9add0606696d6e9d98ac1cd0 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyterlab notebook-shim==0.2.3 \ --hash=sha256:a83496a43341c1674b093bfcebf0fe8e74cbe7eda5fd2bbc56f8e39e1486c0c7 \ --hash=sha256:f69388ac283ae008cd506dda10d0288b09a017d822d5e8c7129a152cbd3ce7e9 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # nbclassic numba==0.61.2 \ --hash=sha256:34fba9406078bac7ab052efbf0d13939426c753ad72946baaa5bf9ae0ebb8dd2 \ @@ -2354,7 +2470,7 @@ numpy==1.26.4 \ --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # cupy-cuda12x # gguf @@ -2383,23 +2499,39 @@ oauth2client==4.1.3 \ --hash=sha256:b8a81cc5d60e2d364f0b1b98f958dbd472887acaf1a5b05e21c28c31a2d6d3ac \ --hash=sha256:d486741e451287f69568a4d26d70d9acd73a2bbfa275746c535b4209891cccc6 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt -openai==1.90.0 \ - --hash=sha256:9771982cdd5b6631af68c6a603da72ed44cd2caf73b49f717a72b71374bc565b \ - --hash=sha256:e5dcb5498ea6b42fec47546d10f1bcc05fb854219a7d953a5ba766718b212a02 +openai==1.100.2 \ + --hash=sha256:54d3457b2c8d7303a1bc002a058de46bdd8f37a8117751c7cf4ed4438051f151 \ + 
--hash=sha256:787b4c3c8a65895182c58c424f790c25c790cc9a0330e34f73d55b6ee5a00e32 + # via vllm +openai-harmony==0.0.4 \ + --hash=sha256:038f1d6772d1be5213b36ae76e5d042022395ec35c428a73ccb8b839b2cecf6a \ + --hash=sha256:15e6d53a66502491a3675a536df30e271f976e6c5efe68250a65191efcb85c4f \ + --hash=sha256:2d8d16d84702059833fb03b841b28c25600c54e83cadccef79af44e1c81166b1 \ + --hash=sha256:31e9bcac0902a309e2fc688e52f247eec7fffcd00d17e958b9a83a8fea6519c2 \ + --hash=sha256:3586d90c899cd41f8624e7b82a48c289f6e4be56c66304ecaf3a0ba88963a73f \ + --hash=sha256:3cf2344366f10981bbc0f6d9949a0b2bb87151d209ed295943ed6ad8eda37932 \ + --hash=sha256:567cc568b6bf7b4d041b0c9aa7d6b2c9394f8af6065bc87fa6d23f207b5af9a7 \ + --hash=sha256:5c67ac6df349236fb7b64f57c3dbb0273efcdca24314daa108f2a482c427106c \ + --hash=sha256:746f751de5033b3dbcfcd4a726a4c56ce452c593ad3d54472d8597ce8d8b6d44 \ + --hash=sha256:96a63199c0d81095b5d5d1ae8ca82b64c1c13d18d4e30323ae9e8ab31bc80a3d \ + --hash=sha256:97f1fe3909733212cc6b36f0f199b1421a9c57b79ec665f0322bd604cec47340 \ + --hash=sha256:b9ee9e9ab6a237cebbe16563c787a6e83f3fcc034075c3d321dab94448426282 \ + --hash=sha256:d38f2639f6bf7c3c34a5dfd79e29075811ae2fa9b895a63e76767f74a47a971e \ + --hash=sha256:ef21a1e2384a65c62d5ec5e1cded9fe026f1d032d5c5d725110d1a8d330d8f54 # via vllm opencensus==0.11.4 \ --hash=sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 \ --hash=sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt opencensus-context==0.1.3 \ --hash=sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039 \ --hash=sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # opencensus opencv-python-headless==4.11.0.86 \ 
--hash=sha256:0e0a27c19dd1f40ddff94976cfe43066fbbe9dfbb2ec1907d66c19caef42a57b \ @@ -2416,7 +2548,7 @@ opentelemetry-api==1.34.1 \ --hash=sha256:64f0bd06d42824843731d05beea88d4d4b6ae59f9fe347ff7dfa2cc14233bbb3 \ --hash=sha256:b7df4cb0830d5a6c29ad0c0691dbae874d8daefa934b8b1d642de48323d32a8c # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus # opentelemetry-sdk @@ -2425,26 +2557,26 @@ opentelemetry-exporter-prometheus==0.55b1 \ --hash=sha256:d13ec0b22bf394113ff1ada5da98133a4b051779b803dae183188e26c4bd9ee0 \ --hash=sha256:f364fbbff9e5de37a112ff104d1185fb1d7e2046c5ab5911e5afebc7ab3ddf0e # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt opentelemetry-proto==1.27.0 \ --hash=sha256:33c9345d91dafd8a74fc3d7576c5a38f18b7fdf8d02983ac67485386132aedd6 \ --hash=sha256:b133873de5581a50063e1e4b29cdcf0c5e253a8c2d8dc1229add20a4c3830ace # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt opentelemetry-sdk==1.34.1 \ --hash=sha256:308effad4059562f1d92163c61c8141df649da24ce361827812c40abb2a1e96e \ --hash=sha256:8091db0d763fcd6098d4781bbc80ff0971f94e260739aa6afe6fd379cdf3aa4d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus opentelemetry-semantic-conventions==0.55b1 \ --hash=sha256:5da81dfdf7d52e3d37f8fe88d5e771e191de924cfff5f550ab0b8f7b2409baed \ --hash=sha256:ef95b1f009159c28d7a7849f5cbc71c4c34c845bb514d66adfdf1b3fff3598b3 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # opentelemetry-sdk outlines-core==0.2.10 \ 
--hash=sha256:0a9e4b192ca837a472a1bb1428397509f543db08e1aeeee30252525cec34093a \ @@ -2493,7 +2625,7 @@ packaging==23.0 \ --hash=sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2 \ --hash=sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # huggingface-hub @@ -2502,6 +2634,7 @@ packaging==23.0 \ # jupyterlab # jupyterlab-server # jupytext + # kombu # lazy-loader # lm-format-enforcer # nbconvert @@ -2540,19 +2673,19 @@ pandas==1.5.3 \ --hash=sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae \ --hash=sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt pandocfilters==1.5.0 \ --hash=sha256:0b679503337d233b4339a817bfc8c50064e2eff681314376a47cb582305a7a38 \ --hash=sha256:33aae3f25fd1a026079f5d27bdd52496f0e0803b3469282162bafdcbdf6ef14f # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # nbconvert parso==0.8.3 \ --hash=sha256:8c07be290bb59f03588915921e29e8a50002acaf2cdc5fa0e0114f91709fafa0 \ --hash=sha256:c001d4636cd3aecdaf33cbb40aebb59b094be2a74c556778ef5576c175e19e75 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jedi partial-json-parser==0.2.1.1.post5 \ --hash=sha256:627715aaa3cb3fb60a65b0d62223243acaa6c70846520a90326fef3a2f0b61ca \ @@ -2562,19 +2695,19 @@ pathspec==0.11.2 \ --hash=sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20 \ --hash=sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c 
python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt pexpect==4.8.0 ; sys_platform != 'win32' \ --hash=sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937 \ --hash=sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipython pickleshare==0.7.5 \ --hash=sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca \ --hash=sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipython pillow==10.3.0 \ --hash=sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c \ @@ -2647,7 +2780,7 @@ pillow==10.3.0 \ --hash=sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27 \ --hash=sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/llm/llm-test-requirements.txt # imageio # mistral-common @@ -2658,26 +2791,26 @@ platformdirs==3.11.0 \ --hash=sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-core # virtualenv pluggy==1.3.0 \ --hash=sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12 \ --hash=sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # pytest portalocker==2.8.2 \ --hash=sha256:2b035aa7828e46c58e9b31390ee1f169b98e1066ab10b9a6a861fe7e25ee4f33 \ 
--hash=sha256:cfb86acc09b9aa7c3b43594e19be1345b9d16af3feb08bf92f23d4dce513a28e # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # msal-extensions prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # jupyter-server # nbclassic @@ -2693,7 +2826,8 @@ prompt-toolkit==3.0.41 \ --hash=sha256:941367d97fc815548822aa26c2a269fdc4eb21e9ec05fc5d447cf09bad5d75f0 \ --hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # click-repl # ipython propcache==0.3.0 \ --hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ @@ -2795,14 +2929,14 @@ propcache==0.3.0 \ --hash=sha256:fb91d20fa2d3b13deea98a690534697742029f4fb83673a3501ae6e3746508b5 \ --hash=sha256:fddb8870bdb83456a489ab67c6b3040a8d5a55069aa6f72f9d872235fbc52f54 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # aiohttp # yarl proto-plus==1.22.3 \ --hash=sha256:a49cd903bc0b6ab41f76bf65510439d56ca76f868adf0274e738bfdd096894df \ --hash=sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # google-api-core protobuf==4.25.8 \ --hash=sha256:077ff8badf2acf8bc474406706ad890466274191a48d0abd3bd6987107c9cde5 \ @@ -2817,7 +2951,7 @@ protobuf==4.25.8 \ --hash=sha256:d552c53d0415449c8d17ced5c341caba0d89dbf433698e1436c8fa0aae7808a3 \ --hash=sha256:f4510b93a3bec6eba8fd8f1093e9d7fb0d4a24d1a81377c10c0e5bbfe9e4ed24 # via - # -c 
python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # google-api-core # googleapis-common-protos @@ -2845,21 +2979,21 @@ psutil==5.9.6 \ --hash=sha256:fb8a697f11b0f5994550555fcfe3e69799e5b060c8ecf9e2f75c69302cc35c0d \ --hash=sha256:ff18b8d1a784b810df0b0fff3bcb50ab941c3b8e2c8de5726f9c71c601c611aa # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipykernel # vllm ptyprocess==0.7.0 ; os_name != 'nt' or sys_platform != 'win32' \ --hash=sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 \ --hash=sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # pexpect # terminado pure-eval==0.2.2 \ --hash=sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350 \ --hash=sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # stack-data py-cpuinfo==9.0.0 \ --hash=sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690 \ @@ -2875,7 +3009,7 @@ py-spy==0.4.0 ; python_full_version < '3.12' \ --hash=sha256:eee3d0bde85ca5cf4f01f012d461180ca76c24835a96f7b5c4ded64eb6a008ab \ --hash=sha256:f2cf3f7130e7d780471faa5957441d3b4e0ec39a79b2c00f4c33d494f7728428 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt pyarrow==19.0.1 \ --hash=sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466 \ @@ -2921,13 +3055,13 @@ pyarrow==19.0.1 \ --hash=sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec \ --hash=sha256:fd44d66093a239358d07c42a91eebf5015aa54fccba959db899f932218ac9cc8 # via - # -c 
python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt pyasn1==0.5.1 \ --hash=sha256:4439847c58d40b1d0a573d07e3856e95333f1976294494c325775aeca506eb58 \ --hash=sha256:6d391a96e59b23130a5cfa74d6fd7f388dbbe26cc8f1edf39fdddf08d9d6676c # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # oauth2client # pyasn1-modules # rsa @@ -2935,7 +3069,7 @@ pyasn1-modules==0.3.0 \ --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # google-auth # oauth2client pybase64==1.4.1 \ @@ -3095,7 +3229,7 @@ pycparser==2.21 \ --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # cffi pycurl==7.45.3 \ --hash=sha256:0c41a172d5e8a5cdd8328cc8134f47b2a57960ac677f7cda8520eaa9fbe7d990 \ @@ -3135,125 +3269,125 @@ pycurl==7.45.3 \ --hash=sha256:fa7751b614d9aa82d7a0f49ca90924c29c6cedf85a2f8687fb6a772dbfe48711 \ --hash=sha256:fbd4a6b8654b779089c5a44af1c65c1419c2cd60718780df6d8f354eb35d6d55 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt -pydantic==2.10.0 \ - --hash=sha256:0aca0f045ff6e2f097f1fe89521115335f15049eeb8a7bef3dafe4b19a74e289 \ - --hash=sha256:5e7807ba9201bdf61b1b58aa6eb690916c40a47acfb114b1b4fef3e7fd5b30fc +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b # via - # -c 
python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # compressed-tensors # fastapi # lm-format-enforcer # mistral-common # openai + # openai-harmony # pydantic-extra-types # vllm # xgrammar -pydantic-core==2.27.0 \ - --hash=sha256:0aa4d1b2eba9a325897308b3124014a142cdccb9f3e016f31d3ebee6b5ea5e75 \ - --hash=sha256:0d06b667e53320332be2bf6f9461f4a9b78092a079b8ce8634c9afaa7e10cd9f \ - --hash=sha256:153017e3d6cd3ce979de06d84343ca424bb6092727375eba1968c8b4693c6ecb \ - --hash=sha256:15e350efb67b855cd014c218716feea4986a149ed1f42a539edd271ee074a196 \ - --hash=sha256:185ef205256cd8b38431205698531026979db89a79587725c1e55c59101d64e9 \ - --hash=sha256:1da0c98a85a6c6ed702d5556db3b09c91f9b0b78de37b7593e2de8d03238807a \ - --hash=sha256:225bfff5d425c34e1fd562cef52d673579d59b967d9de06178850c4802af9039 \ - --hash=sha256:24f984fc7762ed5f806d9e8c4c77ea69fdb2afd987b4fd319ef06c87595a8c55 \ - --hash=sha256:25a7fd4de38f7ff99a37e18fa0098c3140286451bc823d1746ba80cec5b433a1 \ - --hash=sha256:2883b260f7a93235488699d39cbbd94fa7b175d3a8063fbfddd3e81ad9988cb2 \ - --hash=sha256:2a51ce96224eadd1845150b204389623c8e129fde5a67a84b972bd83a85c6c40 \ - --hash=sha256:2be0ad541bb9f059954ccf8877a49ed73877f862529575ff3d54bf4223e4dd61 \ - --hash=sha256:31a2cae5f059329f9cfe3d8d266d3da1543b60b60130d186d9b6a3c20a346361 \ - --hash=sha256:333c840a1303d1474f491e7be0b718226c730a39ead0f7dab2c7e6a2f3855555 \ - --hash=sha256:33d14369739c5d07e2e7102cdb0081a1fa46ed03215e07f097b34e020b83b1ae \ - --hash=sha256:35380671c3c921fe8adf31ad349dc6f7588b7e928dbe44e1093789734f607399 \ - --hash=sha256:359e7951f04ad35111b5ddce184db3391442345d0ab073aa63a95eb8af25a5ef \ - --hash=sha256:36aa167f69d8807ba7e341d67ea93e50fcaaf6bc433bb04939430fa3dab06f31 \ - --hash=sha256:395e3e1148fa7809016231f8065f30bb0dc285a97b4dc4360cd86e17bab58af7 \ - --hash=sha256:3e8d89c276234579cd3d095d5fa2a44eb10db9a218664a17b56363cddf226ff3 \ - 
--hash=sha256:3eb8849445c26b41c5a474061032c53e14fe92a11a5db969f722a2716cd12206 \ - --hash=sha256:3fd8bc2690e7c39eecdf9071b6a889ce7b22b72073863940edc2a0a23750ca90 \ - --hash=sha256:400bf470e4327e920883b51e255617dfe4496d4e80c3fea0b5a5d0bf2c404dd4 \ - --hash=sha256:4148dc9184ab79e356dc00a4199dc0ee8647973332cb385fc29a7cced49b9f9c \ - --hash=sha256:433689845288f9a1ee5714444e65957be26d30915f7745091ede4a83cfb2d7bb \ - --hash=sha256:43b61989068de9ce62296cde02beffabcadb65672207fc51e7af76dca75e6636 \ - --hash=sha256:4523c4009c3f39d948e01962223c9f5538602e7087a628479b723c939fab262d \ - --hash=sha256:483c2213a609e7db2c592bbc015da58b6c75af7360ca3c981f178110d9787bcf \ - --hash=sha256:49633583eb7dc5cba61aaf7cdb2e9e662323ad394e543ee77af265736bcd3eaa \ - --hash=sha256:4b51f964fcbb02949fc546022e56cdb16cda457af485e9a3e8b78ac2ecf5d77e \ - --hash=sha256:4bf1340ae507f6da6360b24179c2083857c8ca7644aab65807023cf35404ea8d \ - --hash=sha256:4fb49cfdb53af5041aba909be00cccfb2c0d0a2e09281bf542371c5fd36ad04c \ - --hash=sha256:510b11e9c3b1a852876d1ccd8d5903684336d635214148637ceb27366c75a467 \ - --hash=sha256:513cb14c0cc31a4dfd849a4674b20c46d87b364f997bbcb02282306f5e187abf \ - --hash=sha256:58560828ee0951bb125c6f2862fbc37f039996d19ceb6d8ff1905abf7da0bf3d \ - --hash=sha256:58ab0d979c969983cdb97374698d847a4acffb217d543e172838864636ef10d9 \ - --hash=sha256:5982048129f40b082c2654de10c0f37c67a14f5ff9d37cf35be028ae982f26df \ - --hash=sha256:5ab325fc86fbc077284c8d7f996d904d30e97904a87d6fb303dce6b3de7ebba9 \ - --hash=sha256:5cc822ab90a70ea3a91e6aed3afac570b276b1278c6909b1d384f745bd09c714 \ - --hash=sha256:5f2b19b8d6fca432cb3acf48cf5243a7bf512988029b6e6fd27e9e8c0a204d85 \ - --hash=sha256:5fc72fbfebbf42c0856a824b8b0dc2b5cd2e4a896050281a21cfa6fed8879cb1 \ - --hash=sha256:6354e18a9be37bfa124d6b288a87fb30c673745806c92956f1a25e3ae6e76b96 \ - --hash=sha256:678f66462058dd978702db17eb6a3633d634f7aa0deaea61e0a674152766d3fc \ - --hash=sha256:68950bc08f9735306322bfc16a18391fcaac99ded2509e1cc41d03ccb6013cfe \ - 
--hash=sha256:68ef5377eb582fa4343c9d0b57a5b094046d447b4c73dd9fbd9ffb216f829e7d \ - --hash=sha256:6b4c19525c3538fbc0bbda6229f9682fb8199ce9ac37395880e6952798e00373 \ - --hash=sha256:6bb69bf3b6500f195c3deb69c1205ba8fc3cb21d1915f1f158a10d6b1ef29b6a \ - --hash=sha256:6e19401742ed7b69e51d8e4df3c03ad5ec65a83b36244479fd70edde2828a5d9 \ - --hash=sha256:6f4a53af9e81d757756508b57cae1cf28293f0f31b9fa2bfcb416cc7fb230f9d \ - --hash=sha256:6fda87808429c520a002a85d6e7cdadbf58231d60e96260976c5b8f9a12a8e13 \ - --hash=sha256:78f841523729e43e3928a364ec46e2e3f80e6625a4f62aca5c345f3f626c6e8a \ - --hash=sha256:7a6ebfac28fd51890a61df36ef202adbd77d00ee5aca4a3dadb3d9ed49cfb929 \ - --hash=sha256:7b0202ebf2268954090209a84f9897345719e46a57c5f2c9b7b250ca0a9d3e63 \ - --hash=sha256:8117839a9bdbba86e7f9df57018fe3b96cec934c3940b591b0fd3fbfb485864a \ - --hash=sha256:82e1ad4ca170e8af4c928b67cff731b6296e6a0a0981b97b2eb7c275cc4e15bd \ - --hash=sha256:836a4bfe0cc6d36dc9a9cc1a7b391265bf6ce9d1eb1eac62ac5139f5d8d9a6fa \ - --hash=sha256:84af1cf7bfdcbc6fcf5a5f70cc9896205e0350306e4dd73d54b6a18894f79386 \ - --hash=sha256:84e35afd9e10b2698e6f2f32256678cb23ca6c1568d02628033a837638b3ed12 \ - --hash=sha256:884f1806609c2c66564082540cffc96868c5571c7c3cf3a783f63f2fb49bd3cd \ - --hash=sha256:8a150392102c402c538190730fda06f3bce654fc498865579a9f2c1d2b425833 \ - --hash=sha256:8e21d927469d04b39386255bf00d0feedead16f6253dcc85e9e10ddebc334084 \ - --hash=sha256:8e96ca781e0c01e32115912ebdf7b3fb0780ce748b80d7d28a0802fa9fbaf44e \ - --hash=sha256:8ee4c2a75af9fe21269a4a0898c5425afb01af1f5d276063f57e2ae1bc64e191 \ - --hash=sha256:91bc66f878557313c2a6bcf396e7befcffe5ab4354cfe4427318968af31143c3 \ - --hash=sha256:951e71da6c89d354572098bada5ba5b5dc3a9390c933af8a614e37755d3d1840 \ - --hash=sha256:99b2863c1365f43f74199c980a3d40f18a218fbe683dd64e470199db426c4d6a \ - --hash=sha256:9a8fbf506fde1529a1e3698198fe64bfbe2e0c09557bc6a7dcf872e7c01fec40 \ - --hash=sha256:9ce048deb1e033e7a865ca384770bccc11d44179cf09e5193a535c4c2f497bdc \ - 
--hash=sha256:9fe94d9d2a2b4edd7a4b22adcd45814b1b59b03feb00e56deb2e89747aec7bfe \ - --hash=sha256:a291d0b4243a259c8ea7e2b84eb9ccb76370e569298875a7c5e3e71baf49057a \ - --hash=sha256:a5c022bb0d453192426221605efc865373dde43b17822a264671c53b068ac20c \ - --hash=sha256:abb4785894936d7682635726613c44578c420a096729f1978cd061a7e72d5275 \ - --hash=sha256:b872c86d8d71827235c7077461c502feb2db3f87d9d6d5a9daa64287d75e4fa0 \ - --hash=sha256:bf37b72834e7239cf84d4a0b2c050e7f9e48bced97bad9bdf98d26b8eb72e846 \ - --hash=sha256:c0c431e4be5c1a0c6654e0c31c661cd89e0ca956ef65305c3c3fd96f4e72ca39 \ - --hash=sha256:c5726eec789ee38f2c53b10b1821457b82274f81f4f746bb1e666d8741fcfadb \ - --hash=sha256:c6fcb3fa3855d583aa57b94cf146f7781d5d5bc06cb95cb3afece33d31aac39b \ - --hash=sha256:c86679f443e7085ea55a7376462553996c688395d18ef3f0d3dbad7838f857a2 \ - --hash=sha256:c91e3c04f5191fd3fb68764bddeaf02025492d5d9f23343b283870f6ace69708 \ - --hash=sha256:c921ad596ff1a82f9c692b0758c944355abc9f0de97a4c13ca60ffc6d8dc15d4 \ - --hash=sha256:c9ed88b398ba7e3bad7bd64d66cc01dcde9cfcb7ec629a6fd78a82fa0b559d78 \ - --hash=sha256:cd2ac6b919f7fed71b17fe0b4603c092a4c9b5bae414817c9c81d3c22d1e1bcc \ - --hash=sha256:d28ca7066d6cdd347a50d8b725dc10d9a1d6a1cce09836cf071ea6a2d4908be0 \ - --hash=sha256:d29e235ce13c91902ef3efc3d883a677655b3908b1cbc73dee816e5e1f8f7739 \ - --hash=sha256:d8b5ee4ae9170e2775d495b81f414cc20268041c42571530513496ba61e94ba3 \ - --hash=sha256:db72e40628967f6dc572020d04b5f800d71264e0531c6da35097e73bdf38b003 \ - --hash=sha256:df45c4073bed486ea2f18757057953afed8dd77add7276ff01bccb79982cf46c \ - --hash=sha256:dfa5f5c0a4c8fced1422dc2ca7eefd872d5d13eb33cf324361dbf1dbfba0a9fe \ - --hash=sha256:e015833384ca3e1a0565a79f5d953b0629d9138021c27ad37c92a9fa1af7623c \ - --hash=sha256:e15315691fe2253eb447503153acef4d7223dfe7e7702f9ed66539fcd0c43801 \ - --hash=sha256:e65466b31be1070b4a5b7dbfbd14b247884cb8e8b79c64fb0f36b472912dbaea \ - --hash=sha256:e7820bb0d65e3ce1e3e70b6708c2f66143f55912fa02f4b618d0f08b61575f12 \ - 
--hash=sha256:e851a051f7260e6d688267eb039c81f05f23a19431bd7dfa4bf5e3cb34c108cd \ - --hash=sha256:e9f9feee7f334b72ceae46313333d002b56f325b5f04271b4ae2aadd9e993ae4 \ - --hash=sha256:eb40f828bc2f73f777d1eb8fee2e86cd9692a4518b63b6b5aa8af915dfd3207b \ - --hash=sha256:eb704155e73b833801c247f39d562229c0303f54770ca14fb1c053acb376cf10 \ - --hash=sha256:edb1bfd45227dec8d50bc7c7d86463cd8728bcc574f9b07de7369880de4626a3 \ - --hash=sha256:ee7d9d5537daf6d5c74a83b38a638cc001b648096c1cae8ef695b0c919d9d379 \ - --hash=sha256:f57783fbaf648205ac50ae7d646f27582fc706be3977e87c3c124e7a92407b10 \ - --hash=sha256:ff63a92f6e249514ef35bc795de10745be0226eaea06eb48b4bbeaa0c8850a4a - # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt +pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + 
--hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + --hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + 
--hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + --hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + --hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + --hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + 
--hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + --hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + --hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + --hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + 
--hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + --hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + --hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock # pydantic pydantic-extra-types==2.10.5 \ --hash=sha256:1dcfa2c0cf741a422f088e0dbb4690e7bfadaaf050da3d6f80d6c3cf58a2bad8 \ @@ -3263,7 +3397,7 @@ pygments==2.18.0 \ --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipython # nbconvert # rich @@ -3272,7 +3406,7 @@ pyjwt==2.8.0 \ --hash=sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de \ --hash=sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # msal pynvml==12.0.0 \ --hash=sha256:299ce2451a6a17e6822d6faee750103e25b415f06f59abb8db65d30f794166f5 \ @@ -3282,20 +3416,20 @@ pyopenssl==25.0.0 \ 
--hash=sha256:424c247065e46e76a37411b9ab1782541c23bb658bf003772c3405fbaa128e90 \ --hash=sha256:cd2cef799efa3936bb08e8ccb9433a575722b9dd986023f1cabc4ae64e9dac16 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt pyparsing==3.1.1 \ --hash=sha256:32c7c0b711493c72ff18a981d24f28aaf9c1fb7ed5e9667c9e84e3db623bdbfb \ --hash=sha256:ede28a1a32462f5a9705e07aea48001a08f7cf81a021585011deba701581a0db # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # httplib2 pytest==7.4.4 \ --hash=sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280 \ --hash=sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/base-test-requirements.txt # -r python/requirements/llm/llm-test-requirements.txt # pytest-aiohttp @@ -3304,23 +3438,24 @@ pytest-aiohttp==1.1.0 \ --hash=sha256:147de8cb164f3fc9d7196967f109ab3c0b93ea3463ab50631e56438eab7b5adc \ --hash=sha256:f39a11693a0dce08dd6c542d241e199dd8047a6e6596b2bcfa60d373f143456d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/base-test-requirements.txt pytest-asyncio==0.17.2 \ --hash=sha256:6d895b02432c028e6957d25fc936494e78c6305736e785d9fee408b1efbc7ff4 \ --hash=sha256:e0fe5dbea40516b661ef1bcfe0bd9461c2847c4ef4bb40012324f2454fb7d56d # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/base-test-requirements.txt # pytest-aiohttp python-dateutil==2.8.2 \ --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ 
--hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # arrow # botocore + # celery # jupyter-client # pandas python-dotenv==1.0.1 \ @@ -3331,7 +3466,7 @@ python-json-logger==2.0.7 \ --hash=sha256:23e7ec02d34237c5aa1e29a070193a4ea87583bb4e7f8fd06d3de8264c4b2e1c \ --hash=sha256:f380b826a991ebbe3de4d897aeec42760035ac760345e57b812938dc8b35e2bd # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-events # vllm python-multipart==0.0.20 \ @@ -3342,7 +3477,7 @@ pytz==2022.7.1 \ --hash=sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0 \ --hash=sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # pandas pyyaml==6.0.1 \ --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ @@ -3397,7 +3532,7 @@ pyyaml==6.0.1 \ --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \ --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # gguf @@ -3499,7 +3634,7 @@ pyzmq==26.0.3 \ --hash=sha256:f6b1d1c631e5940cac5a0b22c5379c86e8df6a4ec277c7a856b714021ab6cfad \ --hash=sha256:f6c21c00478a7bea93caaaef9e7629145d4153b15a8653e8bb4609d4bc70dbfc # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipykernel # jupyter-client # jupyter-server @@ -3510,7 +3645,7 @@ referencing==0.36.2 \ 
--hash=sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa \ --hash=sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jsonschema # jsonschema-specifications regex==2024.11.6 \ @@ -3616,10 +3751,11 @@ requests==2.32.3 \ --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # azure-core + # azure-datalake-store # google-api-core # google-cloud-storage # huggingface-hub @@ -3636,21 +3772,21 @@ rfc3339-validator==0.1.4 \ --hash=sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b \ --hash=sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jsonschema # jupyter-events rfc3986-validator==0.1.1 \ --hash=sha256:2f235c432ef459970b4306369336b9d5dbdda31b510ca1e327636e01f528bfa9 \ --hash=sha256:3d44bde7921b3b9ec3ae4e3adca370438eccebc676456449b145d533b240d055 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jsonschema # jupyter-events rich==13.3.2 \ --hash=sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001 \ --hash=sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # memray @@ -3760,21 +3896,21 @@ rpds-py==0.22.3 \ 
--hash=sha256:fda7cb070f442bf80b642cd56483b5548e43d366fe3f39b98e67cce780cded00 \ --hash=sha256:feea821ee2a9273771bae61194004ee2fc33f8ec7db08117ef9147d4bbcbca8e # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jsonschema # referencing rsa==4.7.2 \ --hash=sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2 \ --hash=sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # google-auth # oauth2client -s3transfer==0.6.2 \ - --hash=sha256:b014be3a8a2aab98cfe1abc7229cc5a9a0cf05eb9c1f2b86b230fd8df3f78084 \ - --hash=sha256:cab66d3380cca3e70939ef2255d01cd8aece6a4907a9528740f668c4b0611861 +s3transfer==0.8.0 \ + --hash=sha256:baa479dc2e63e5c2ed51611b4d46cdf0295e2070d8d0b86b22f335ee5b954986 \ + --hash=sha256:e8d6bd52ffd99841e3a57b34370a54841f12d3aab072af862cdcc50955288002 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # boto3 safetensors==0.5.2 \ --hash=sha256:03c937100f38c9ff4c1507abea9928a6a9b02c9c1c9c3609ed4fb2bf413d4975 \ @@ -3816,7 +3952,7 @@ scikit-image==0.24.0 \ --hash=sha256:ef04360eda372ee5cd60aebe9be91258639c86ae2ea24093fb9182118008d009 \ --hash=sha256:fa27b3a0dbad807b966b8db2d78da734cb812ca4787f7fbb143764800ce2fa9c # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ @@ -3845,7 +3981,7 @@ scipy==1.11.4 \ --hash=sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be \ --hash=sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r 
python/requirements.txt # scikit-image # vllm @@ -3853,7 +3989,7 @@ send2trash==1.8.3 \ --hash=sha256:0c31227e0bd08961c7665474a3d1ef7193929fedda4233843689baa056be46c9 \ --hash=sha256:b18e7a3966d99871aefeb00cfbcfdced55ce4871194810fc71f4aa484b953abf # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-server # nbclassic # notebook @@ -3915,17 +4051,116 @@ sentencepiece==0.2.0 \ # gguf # mistral-common # vllm +setproctitle==1.3.6 \ + --hash=sha256:082413db8a96b1f021088e8ec23f0a61fec352e649aba20881895815388b66d3 \ + --hash=sha256:0dba8faee2e4a96e934797c9f0f2d093f8239bf210406a99060b3eabe549628e \ + --hash=sha256:0e6b5633c94c5111f7137f875e8f1ff48f53b991d5d5b90932f27dc8c1fa9ae4 \ + --hash=sha256:1065ed36bd03a3fd4186d6c6de5f19846650b015789f72e2dea2d77be99bdca1 \ + --hash=sha256:109fc07b1cd6cef9c245b2028e3e98e038283342b220def311d0239179810dbe \ + --hash=sha256:13624d9925bb481bc0ccfbc7f533da38bfbfe6e80652314f789abc78c2e513bd \ + --hash=sha256:156795b3db976611d09252fc80761fcdb65bb7c9b9581148da900851af25ecf4 \ + --hash=sha256:163dba68f979c61e4e2e779c4d643e968973bdae7c33c3ec4d1869f7a9ba8390 \ + --hash=sha256:17d7c833ed6545ada5ac4bb606b86a28f13a04431953d4beac29d3773aa00b1d \ + --hash=sha256:18d0667bafaaae4c1dee831e2e59841c411ff399b9b4766822ba2685d419c3be \ + --hash=sha256:1aa1935aa2195b76f377e5cb018290376b7bf085f0b53f5a95c0c21011b74367 \ + --hash=sha256:2156d55308431ac3b3ec4e5e05b1726d11a5215352d6a22bb933171dee292f8c \ + --hash=sha256:23a57d3b8f1549515c2dbe4a2880ebc1f27780dc126c5e064167563e015817f5 \ + --hash=sha256:2407955dc359d735a20ac6e797ad160feb33d529a2ac50695c11a1ec680eafab \ + --hash=sha256:2940cf13f4fc11ce69ad2ed37a9f22386bfed314b98d8aebfd4f55459aa59108 \ + --hash=sha256:2e51ec673513465663008ce402171192a053564865c2fc6dc840620871a9bd7c \ + --hash=sha256:3393859eb8f19f5804049a685bf286cb08d447e28ba5c6d8543c7bf5500d5970 \ + --hash=sha256:3884002b3a9086f3018a32ab5d4e1e8214dd70695004e27b1a45c25a6243ad0b \ 
+ --hash=sha256:38ca045626af693da042ac35d7332e7b9dbd52e6351d6973b310612e3acee6d6 \ + --hash=sha256:391bb6a29c4fe7ccc9c30812e3744060802d89b39264cfa77f3d280d7f387ea5 \ + --hash=sha256:3cca16fd055316a48f0debfcbfb6af7cea715429fc31515ab3fcac05abd527d8 \ + --hash=sha256:3cde5b83ec4915cd5e6ae271937fd60d14113c8f7769b4a20d51769fe70d8717 \ + --hash=sha256:3f8194b4d631b003a1176a75d1acd545e04b1f54b821638e098a93e6e62830ef \ + --hash=sha256:3fc97805f9d74444b027babff710bf39df1541437a6a585a983d090ae00cedde \ + --hash=sha256:4431629c178193f23c538cb1de3da285a99ccc86b20ee91d81eb5f1a80e0d2ba \ + --hash=sha256:49498ebf68ca3e75321ffe634fcea5cc720502bfaa79bd6b03ded92ce0dc3c24 \ + --hash=sha256:4ac3eb04bcf0119aadc6235a2c162bae5ed5f740e3d42273a7228b915722de20 \ + --hash=sha256:4adf6a0013fe4e0844e3ba7583ec203ca518b9394c6cc0d3354df2bf31d1c034 \ + --hash=sha256:4efc91b437f6ff2578e89e3f17d010c0a0ff01736606473d082913ecaf7859ba \ + --hash=sha256:50706b9c0eda55f7de18695bfeead5f28b58aa42fd5219b3b1692d554ecbc9ec \ + --hash=sha256:5313a4e9380e46ca0e2c681ba739296f9e7c899e6f4d12a6702b2dc9fb846a31 \ + --hash=sha256:543f59601a4e32daf44741b52f9a23e0ee374f9f13b39c41d917302d98fdd7b0 \ + --hash=sha256:57bc54763bf741813a99fbde91f6be138c8706148b7b42d3752deec46545d470 \ + --hash=sha256:63cc10352dc6cf35a33951656aa660d99f25f574eb78132ce41a85001a638aa7 \ + --hash=sha256:6a1d3aa13acfe81f355b0ce4968facc7a19b0d17223a0f80c011a1dba8388f37 \ + --hash=sha256:6af330ddc2ec05a99c3933ab3cba9365357c0b8470a7f2fa054ee4b0984f57d1 \ + --hash=sha256:6d50bfcc1d1692dc55165b3dd2f0b9f8fb5b1f7b571a93e08d660ad54b9ca1a5 \ + --hash=sha256:70100e2087fe05359f249a0b5f393127b3a1819bf34dec3a3e0d4941138650c9 \ + --hash=sha256:74973aebea3543ad033b9103db30579ec2b950a466e09f9c2180089e8346e0ec \ + --hash=sha256:751ba352ed922e0af60458e961167fa7b732ac31c0ddd1476a2dfd30ab5958c5 \ + --hash=sha256:785cd210c0311d9be28a70e281a914486d62bfd44ac926fcd70cf0b4d65dff1c \ + --hash=sha256:7890e291bf4708e3b61db9069ea39b3ab0651e42923a5e1f4d78a7b9e4b18301 \ + 
--hash=sha256:793a23e8d9cb6c231aa3023d700008224c6ec5b8fd622d50f3c51665e3d0a190 \ + --hash=sha256:797f2846b546a8741413c57d9fb930ad5aa939d925c9c0fa6186d77580035af7 \ + --hash=sha256:7df5fcc48588f82b6cc8073db069609ddd48a49b1e9734a20d0efb32464753c4 \ + --hash=sha256:8050c01331135f77ec99d99307bfbc6519ea24d2f92964b06f3222a804a3ff1f \ + --hash=sha256:805bb33e92fc3d8aa05674db3068d14d36718e3f2c5c79b09807203f229bf4b5 \ + --hash=sha256:807796fe301b7ed76cf100113cc008c119daf4fea2f9f43c578002aef70c3ebf \ + --hash=sha256:81c443310831e29fabbd07b75ebbfa29d0740b56f5907c6af218482d51260431 \ + --hash=sha256:83066ffbf77a5f82b7e96e59bdccbdda203c8dccbfc3f9f0fdad3a08d0001d9c \ + --hash=sha256:8834ab7be6539f1bfadec7c8d12249bbbe6c2413b1d40ffc0ec408692232a0c6 \ + --hash=sha256:92df0e70b884f5da35f2e01489dca3c06a79962fb75636985f1e3a17aec66833 \ + --hash=sha256:9483aa336687463f5497dd37a070094f3dff55e2c888994f8440fcf426a1a844 \ + --hash=sha256:97a138fa875c6f281df7720dac742259e85518135cd0e3551aba1c628103d853 \ + --hash=sha256:9b50700785eccac0819bea794d968ed8f6055c88f29364776b7ea076ac105c5d \ + --hash=sha256:9b73cf0fe28009a04a35bb2522e4c5b5176cc148919431dcb73fdbdfaab15781 \ + --hash=sha256:9d5a369eb7ec5b2fdfa9927530b5259dd21893fa75d4e04a223332f61b84b586 \ + --hash=sha256:a094b7ce455ca341b59a0f6ce6be2e11411ba6e2860b9aa3dbb37468f23338f4 \ + --hash=sha256:a0d6252098e98129a1decb59b46920d4eca17b0395f3d71b0d327d086fefe77d \ + --hash=sha256:a1d856b0f4e4a33e31cdab5f50d0a14998f3a2d726a3fd5cb7c4d45a57b28d1b \ + --hash=sha256:a4ae2ea9afcfdd2b931ddcebf1cf82532162677e00326637b31ed5dff7d985ca \ + --hash=sha256:a5963b663da69ad25fa1559ee064584935570def665917918938c1f1289f5ebc \ + --hash=sha256:ad1c2c2baaba62823a7f348f469a967ece0062140ca39e7a48e4bbb1f20d54c4 \ + --hash=sha256:ae82507fe458f7c0c8227017f2158111a4c9e7ce94de05178894a7ea9fefc8a1 \ + --hash=sha256:af188f3305f0a65c3217c30c6d4c06891e79144076a91e8b454f14256acc7279 \ + --hash=sha256:af44bb7a1af163806bbb679eb8432fa7b4fb6d83a5d403b541b675dcd3798638 \ + 
--hash=sha256:b0174ca6f3018ddeaa49847f29b69612e590534c1d2186d54ab25161ecc42975 \ + --hash=sha256:b2b17855ed7f994f3f259cf2dfbfad78814538536fa1a91b50253d84d87fd88d \ + --hash=sha256:b2e54f4a2dc6edf0f5ea5b1d0a608d2af3dcb5aa8c8eeab9c8841b23e1b054fe \ + --hash=sha256:b6f4abde9a2946f57e8daaf1160b2351bcf64274ef539e6675c1d945dbd75e2a \ + --hash=sha256:b70c07409d465f3a8b34d52f863871fb8a00755370791d2bd1d4f82b3cdaf3d5 \ + --hash=sha256:bb465dd5825356c1191a038a86ee1b8166e3562d6e8add95eec04ab484cfb8a2 \ + --hash=sha256:c051f46ed1e13ba8214b334cbf21902102807582fbfaf0fef341b9e52f0fafbf \ + --hash=sha256:c1b20a5f4164cec7007be55c9cf18d2cd08ed7c3bf6769b3cd6d044ad888d74b \ + --hash=sha256:c86e9e82bfab579327dbe9b82c71475165fbc8b2134d24f9a3b2edaf200a5c3d \ + --hash=sha256:c9f32b96c700bb384f33f7cf07954bb609d35dd82752cef57fb2ee0968409169 \ + --hash=sha256:cce0ed8b3f64c71c140f0ec244e5fdf8ecf78ddf8d2e591d4a8b6aa1c1214235 \ + --hash=sha256:cdd7315314b0744a7dd506f3bd0f2cf90734181529cdcf75542ee35ad885cab7 \ + --hash=sha256:cf355fbf0d4275d86f9f57be705d8e5eaa7f8ddb12b24ced2ea6cbd68fdb14dc \ + --hash=sha256:d136fbf8ad4321716e44d6d6b3d8dffb4872626010884e07a1db54b7450836cf \ + --hash=sha256:d2c8e20487b3b73c1fa72c56f5c89430617296cd380373e7af3a538a82d4cd6d \ + --hash=sha256:d483cc23cc56ab32911ea0baa0d2d9ea7aa065987f47de847a0a93a58bf57905 \ + --hash=sha256:d5a6c4864bb6fa9fcf7b57a830d21aed69fd71742a5ebcdbafda476be673d212 \ + --hash=sha256:d714e002dd3638170fe7376dc1b686dbac9cb712cde3f7224440af722cc9866a \ + --hash=sha256:d73f14b86d0e2858ece6bf5807c9889670e392c001d414b4293d0d9b291942c3 \ + --hash=sha256:d88c63bd395c787b0aa81d8bbc22c1809f311032ce3e823a6517b711129818e4 \ + --hash=sha256:db608db98ccc21248370d30044a60843b3f0f3d34781ceeea67067c508cd5a28 \ + --hash=sha256:de004939fc3fd0c1200d26ea9264350bfe501ffbf46c8cf5dc7f345f2d87a7f1 \ + --hash=sha256:ded9e86397267732a0641d4776c7c663ea16b64d7dbc4d9cc6ad8536363a2d29 \ + --hash=sha256:e288f8a162d663916060beb5e8165a8551312b08efee9cf68302687471a6545d \ + 
--hash=sha256:e2a9e62647dc040a76d55563580bf3bb8fe1f5b6ead08447c2ed0d7786e5e794 \ + --hash=sha256:e3e44d08b61de0dd6f205528498f834a51a5c06689f8fb182fe26f3a3ce7dca9 \ + --hash=sha256:ea002088d5554fd75e619742cefc78b84a212ba21632e59931b3501f0cfc8f67 \ + --hash=sha256:eb7452849f6615871eabed6560ffedfe56bc8af31a823b6be4ce1e6ff0ab72c5 \ + --hash=sha256:ebcf34b69df4ca0eabaaaf4a3d890f637f355fed00ba806f7ebdd2d040658c26 \ + --hash=sha256:f24d5b9383318cbd1a5cd969377937d66cf0542f24aa728a4f49d9f98f9c0da8 \ + --hash=sha256:f33fbf96b52d51c23b6cff61f57816539c1c147db270cfc1cc3bc012f4a560a9 + # via vllm shellingham==1.5.4 \ --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \ --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # typer six==1.16.0 \ --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # asttokens # azure-core @@ -3940,20 +4175,20 @@ smart-open==6.2.0 \ --hash=sha256:088bf00f9327c71e549bc2f86567d3320df5d89667f009ce1c16568976068ef7 \ --hash=sha256:1b4df5c8365218f3852c507451920ccad606c80b0acb4e67508e50ba9b5d2632 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt smmap==5.0.1 \ --hash=sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62 \ --hash=sha256:e6d8668fa5f93e706934a62d7b4db19c8d9eb8cf2adbb75ef1b675aa332b69da # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # gitdb sniffio==1.3.1 \ 
--hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # anyio # openai snowballstemmer==2.2.0 \ @@ -3974,7 +4209,7 @@ soupsieve==2.5 \ --hash=sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690 \ --hash=sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # beautifulsoup4 soxr==0.5.0.post1 \ --hash=sha256:39e0f791ba178d69cd676485dbee37e75a34f20daa478d90341ecb7f6d9d690f \ @@ -4031,19 +4266,19 @@ spinners==0.0.24 \ --hash=sha256:1eb6aeb4781d72ab42ed8a01dcf20f3002bf50740d7154d12fb8c9769bf9e27f \ --hash=sha256:2fa30d0b72c9650ad12bbe031c9943b8d441e41b4f5602b0ec977a19f3290e98 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # halo stack-data==0.6.3 \ --hash=sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9 \ --hash=sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipython starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # fastapi # prometheus-fastapi-instrumentator @@ -4055,25 +4290,25 @@ tabulate==0.9.0 \ --hash=sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c \ --hash=sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f # via - # -c 
python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt tensorboardx==2.6.2.2 \ --hash=sha256:160025acbf759ede23fd3526ae9d9bfbfd8b68eb16c38a010ebe326dc6395db8 \ --hash=sha256:c6476d7cd0d529b0b72f4acadb1269f9ed8b22f441e87a84f2a3b940bb87b666 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt termcolor==2.4.0 \ --hash=sha256:9297c0df9c99445c2412e832e882a7884038a25617c60cea2ad69488d4040d63 \ --hash=sha256:aab9e56047c8ac41ed798fa36d892a37aca6b3e9159f3e0c24bc64a9b3ac7b7a # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # halo terminado==0.18.1 \ --hash=sha256:a4468e1b37bb318f8a86514f65814e1afc977cf29b3992a4500d9dd305dcceb0 \ --hash=sha256:de09f2c4b85de4765f7714688fff57d3e75bad1f909b589fde880460c753fd2e # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-server # nbclassic # notebook @@ -4081,7 +4316,7 @@ tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # scikit-image tiktoken==0.9.0 \ --hash=sha256:03935988a91d6d3216e2ec7c645afbb3d870b37bcb67ada1943ec48678e7ee33 \ @@ -4122,7 +4357,7 @@ tinycss2==1.3.0 \ --hash=sha256:152f9acabd296a8375fbca5b84c961ff95971fcfc32e79550c8df8e29118c54d \ --hash=sha256:54a8dbdffb334d536851be0226030e9505965bb2f30f21a4a82c55fb2a80fae7 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # nbconvert tokenizers==0.21.1 \ --hash=sha256:0f0dcbcc9f6e13e675a66d7a5f2f225a736745ce484c1a4e07476a89ccdad382 \ @@ -4242,7 
+4477,7 @@ tornado==6.1 \ --hash=sha256:fa2ba70284fa42c2a5ecb35e322e68823288a4251f9ba9cc77be04ae15eada68 \ --hash=sha256:fba85b6cd9c39be262fcd23865652920832b61583de2a2ca907dbd8e8a8c81e5 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipykernel # jupyter-client # jupyter-server @@ -4254,7 +4489,7 @@ tqdm==4.67.1 \ --hash=sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2 \ --hash=sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # gguf # huggingface-hub @@ -4265,7 +4500,7 @@ traitlets==5.14.3 \ --hash=sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7 \ --hash=sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # comm # ipykernel # ipython @@ -4280,11 +4515,10 @@ traitlets==5.14.3 \ # nbconvert # nbformat # notebook -transformers==4.53.2 \ - --hash=sha256:6c3ed95edfb1cba71c4245758f1b4878c93bf8cde77d076307dacb2cbbd72be2 \ - --hash=sha256:db8f4819bb34f000029c73c3c557e7d06fc1b8e612ec142eecdae3947a9c78bf +transformers==4.55.2 \ + --hash=sha256:097e3c2e2c0c9681db3da9d748d8f9d6a724c644514673d0030e8c5a1109f1f1 \ + --hash=sha256:a45ec60c03474fd67adbce5c434685051b7608b3f4f167c25aa6aeb1cad16d4f # via - # -r python/requirements/llm/llm-requirements.txt # compressed-tensors # vllm # xgrammar @@ -4306,7 +4540,7 @@ typer==0.12.3 \ --hash=sha256:070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914 \ --hash=sha256:49e73131481d804288ef62598d97a1ceef3058905aa536a1134f90891ba35482 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/llm/llm-requirements.txt # 
-r python/requirements.txt # fastapi-cli @@ -4314,13 +4548,13 @@ types-python-dateutil==2.9.0.20240316 \ --hash=sha256:5d2f2e240b86905e40944dd787db6da9263f0deabef1076ddaed797351ec0202 \ --hash=sha256:6b8cb66d960771ce5ff974e9dd45e38facb81718cc1e208b10b1baccbfdbee3b # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # arrow typing-extensions==4.12.2 \ --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # azure-core # azure-identity # azure-storage-blob @@ -4339,24 +4573,37 @@ typing-extensions==4.12.2 \ # referencing # torch # typer + # typing-inspection # vllm +typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # pydantic +tzdata==2025.2 \ + --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # kombu tzlocal==5.3 \ --hash=sha256:2fafbfc07e9d8b49ade18f898d6bcd37ae88ce3ad6486842a2e4f03af68323d2 \ --hash=sha256:3814135a1bb29763c6e4f08fd6e41dbb435c7a60bfbb03270211bcc537187d8c # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt uri-template==1.3.0 \ --hash=sha256:0e00f8eb65e18c7de20d595a14336e9f337ead580c70934141624b6d1ffdacc7 \ --hash=sha256:a44a133ea12d44a0c0f06d7d42a52d71282e77e2f937d8abd5655b8d56fc1363 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c 
python/deplocks/llm/ray_test_py311_cpu.lock # jsonschema urllib3==1.26.19 \ --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt # botocore # requests @@ -4364,7 +4611,7 @@ uvicorn==0.22.0 \ --hash=sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8 \ --hash=sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # fastapi # fastapi-cli @@ -4407,15 +4654,23 @@ uvloop==0.21.0 ; platform_python_implementation != 'PyPy' and sys_platform != 'c --hash=sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816 \ --hash=sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2 # via uvicorn +vine==5.1.0 \ + --hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c python/deplocks/llm/ray_test_py311_cpu.lock + # amqp + # celery + # kombu virtualenv==20.29.1 \ --hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ --hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt -vllm==0.10.0 \ - --hash=sha256:8ca37559d82b43b5e8c8248d2e4a1ecb51d6d4e5d517491d656df6491ed93dab \ - --hash=sha256:a44e9013db26082a82c3931ed8772ac884d6d60566d36ecdb0e8dc01c65b241a +vllm==0.10.1.1 \ + --hash=sha256:3099824ee4bdaa14c4c4f7178a092101a0ec206d4c9371edf295849b2b730a39 \ + 
--hash=sha256:8ca0dd985e1ceac8540e7719c654f1553b3ba8a43c685ac8d3fa1366ffb6443a # via -r python/requirements/llm/llm-requirements.txt watchfiles==0.19.0 \ --hash=sha256:0089c6dc24d436b373c3c57657bf4f9a453b13767150d17284fc6162b2791911 \ @@ -4441,7 +4696,7 @@ watchfiles==0.19.0 \ --hash=sha256:d9b073073e048081e502b6c6b0b88714c026a1a4c890569238d04aca5f9ca74b \ --hash=sha256:fac19dc9cbc34052394dbe81e149411a62e71999c0a19e1e09ce537867f95ae0 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt # uvicorn # vllm @@ -4449,26 +4704,26 @@ wcwidth==0.2.13 \ --hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \ --hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # prompt-toolkit webcolors==24.6.0 \ --hash=sha256:1d160d1de46b3e81e58d0a280d0c78b467dc80f47294b91b1ad8029d2cedb55b \ --hash=sha256:8cf5bc7e28defd1d48b9e83d5fc30741328305a8195c29a8e668fa45586568a1 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jsonschema webencodings==0.5.1 \ --hash=sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78 \ --hash=sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # bleach # tinycss2 websocket-client==1.8.0 \ --hash=sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526 \ --hash=sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-server websockets==15.0 \ --hash=sha256:0e389efe46ccb25a1f93d08c7a74e8123a2517f7b7458f043bd7529d1a63ffeb \ @@ -4545,7 +4800,7 
@@ widgetsnbextension==4.0.11 \ --hash=sha256:55d4d6949d100e0d08b94948a42efc3ed6dfdc0e9468b2c4b128c9a2ce3a7a36 \ --hash=sha256:8b22a8f1910bfd188e596fe7fc05dcbd87e810c8a4ba010bdb3da86637398474 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipywidgets wrapt==1.14.1 \ --hash=sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3 \ @@ -4623,7 +4878,7 @@ wrapt==1.14.1 \ --hash=sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015 \ --hash=sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements/cloud-requirements.txt xformers==0.0.31 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:23331bdb9831ba0df96f55258537ca0df7ad888efc75cea97a0de79b5e2291c4 \ @@ -4734,7 +4989,7 @@ y-py==0.6.2 \ --hash=sha256:e92878cc05e844c8da937204bc34c2e6caf66709ce5936802fbfb35f04132892 \ --hash=sha256:ff32548e45e45bf3280ac1d28b3148337a5c6714c28db23aeb0693e33eba257e # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-ydoc # ypy-websocket yarl==1.18.3 \ @@ -4821,22 +5076,21 @@ yarl==1.18.3 \ --hash=sha256:fe57328fbc1bfd0bd0514470ac692630f3901c0ee39052ae47acd1d90a436719 \ --hash=sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # aiohttp ypy-websocket==0.8.4 \ --hash=sha256:43a001473f5c8abcf182f603049cf305cbc855ad8deaa9dfa0f3b5a7cea9d0ff \ --hash=sha256:b1ba0dfcc9762f0ca168d2378062d3ca1299d39076b0f145d961359121042be5 # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-server-ydoc zipp==3.19.2 \ 
--hash=sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19 \ --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c # via - # -c python/requirements_compiled_ray_test_py311_cpu.txt + # -c python/deplocks/llm/ray_test_py311_cpu.lock # importlib-metadata # The following packages were excluded from the output: -# ray -# grpcio-tools # setuptools +# ray diff --git a/python/requirements_compiled_rayllm_test_py311_cu121.txt b/python/deplocks/llm/rayllm_test_py311_cu121.lock similarity index 87% rename from python/requirements_compiled_rayllm_test_py311_cu121.txt rename to python/deplocks/llm/rayllm_test_py311_cu121.lock index b33cc1e6eb75..17c70071f3fb 100644 --- a/python/requirements_compiled_rayllm_test_py311_cu121.txt +++ b/python/deplocks/llm/rayllm_test_py311_cu121.lock @@ -1,19 +1,25 @@ # This file was autogenerated by uv via the following command: -# uv pip compile --generate-hashes --strip-extras --unsafe-package ray --unsafe-package grpcio-tools --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cu121 --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links -c python/requirements_compiled_ray_test_py311_cu121.txt python/requirements.txt python/requirements/cloud-requirements.txt python/requirements/base-test-requirements.txt python/requirements/llm/llm-requirements.txt python/requirements/llm/llm-test-requirements.txt -o python/requirements_compiled_rayllm_test_py311_cu121.txt +# uv pip compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --extra-index-url https://download.pytorch.org/whl/cu121 --python-version=3.11 --unsafe-package ray --python-platform=linux -c python/deplocks/llm/ray_test_py311_cu121.lock python/requirements.txt python/requirements/cloud-requirements.txt 
python/requirements/base-test-requirements.txt python/requirements/llm/llm-requirements.txt python/requirements/llm/llm-test-requirements.txt -o python/deplocks/llm/rayllm_test_py311_cu121.lock --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cu121 +adlfs==2023.8.0 \ + --hash=sha256:07e804f6df4593acfcaf01025b162e30ac13e523d3570279c98b2d91a18026d9 \ + --hash=sha256:3eb248a3c2a30b419f1147bd7676d156b5219f96ef7f11d47166afd2a3bdb07e + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # -r python/requirements/cloud-requirements.txt aiofiles==22.1.0 \ --hash=sha256:1142fa8e80dbae46bb6339573ad4c8c0841358f79c6eb50a493dceca14621bad \ --hash=sha256:9107f1ca0b2a5553987a94a3c9959fe5b491fdf731389aa5b7b1bd0733e32de6 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ypy-websocket aiohappyeyeballs==2.6.1 \ --hash=sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558 \ --hash=sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # aiohttp aiohttp==3.11.16 \ --hash=sha256:004511d3413737700835e949433536a2fe95a7d0297edd911a1e9705c5b5ea43 \ @@ -98,10 +104,11 @@ aiohttp==3.11.16 \ --hash=sha256:fb46bb0f24813e6cede6cc07b1961d4b04f331f7112a23b5e21f567da4ee50aa \ --hash=sha256:fd36c119c5d6551bce374fcb5c19269638f8d09862445f85a5a48596fd59f4bb # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements/llm/llm-test-requirements.txt # -r python/requirements.txt + # adlfs # aiohttp-cors # pytest-aiohttp # vllm @@ -109,41 +116,47 @@ aiohttp-cors==0.7.0 \ --hash=sha256:0451ba59fdf6909d0e2cd21e4c0a43752bc0703d33fc78ae94d9d9321710193e \ 
--hash=sha256:4d39c6d7100fd9764ed1caf8cebf0eb01bf5e3f24e2e073fda6234bc48b19f5d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt aiorwlock==1.3.0 \ --hash=sha256:45baf8e4fa9a23e0bb325fbd67da80de1fd7ae1d4f59a6381754c60cec7b289b \ --hash=sha256:83f12d87df4b9728a0b8fda1756585ab0d652b107bab59c6084e1b1ad692ab45 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt aiosignal==1.3.1 \ --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ --hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # aiohttp aiosqlite==0.19.0 \ --hash=sha256:95ee77b91c8d2808bd08a59fbebf66270e9090c3d92ffbf260dc0db0b979577d \ --hash=sha256:edba222e03453e094a3ce605db1b970c4b3376264e56f32e2a4959f948d66a96 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ypy-websocket alabaster==0.7.16 \ --hash=sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65 \ --hash=sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92 # via sphinx +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # kombu annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # pydantic anyio==3.7.1 \ 
--hash=sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780 \ --hash=sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # httpx # jupyter-server # openai @@ -153,7 +166,7 @@ argon2-cffi==23.1.0 \ --hash=sha256:879c3e79a2729ce768ebb7d36d4609e3a78a4ca2ec3a9f12286ca057e3d0db08 \ --hash=sha256:c670642b78ba29641818ab2e68bd4e6a78ba53b7eff7b4c3815ae16abf91c7ea # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyter-server # nbclassic # notebook @@ -180,13 +193,13 @@ argon2-cffi-bindings==21.2.0 \ --hash=sha256:f1152ac548bd5b8bcecfb0b0371f082037e47128653df2e8ba6e914d384f3c3e \ --hash=sha256:f9f8b450ed0547e3d473fdc8612083fd08dd2120d6ac8f73828df9b7d45bb351 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # argon2-cffi arrow==1.3.0 \ --hash=sha256:c728b120ebc00eb84e01882a6f5e7927a53960aa990ce7dd2b10f39005a67f80 \ --hash=sha256:d4540617648cb5f895730f1ad8c82a65f2dad0166f57b75f3ca54759c4d67a85 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # isoduration astor==0.8.1 \ --hash=sha256:070a54e890cefb5b3739d19f30f5a5ec840ffc9c50ffa7d23cc9fc1a38ebbfc5 \ @@ -196,13 +209,13 @@ asttokens==2.4.1 \ --hash=sha256:051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24 \ --hash=sha256:b03869718ba9a6eb027e134bfdf69f38a236d681c83c160d510768af11254ba0 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # stack-data attrs==25.1.0 \ --hash=sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e \ --hash=sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a # via - # -c 
python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # aiohttp # jsonschema # referencing @@ -210,40 +223,49 @@ azure-common==1.1.28 \ --hash=sha256:4ac0cd3214e36b6a1b6a442686722a5d8cc449603aa833f3f0f40bda836704a3 \ --hash=sha256:5c12d3dcf4ec20599ca6b0d3e09e86e146353d443e7fcc050c9a19c1f9df20ad # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # smart-open azure-core==1.29.5 \ --hash=sha256:0fa04b7b1f7d44a4fb8468c4093deb2ea01fdf4faddbf802ed9205615f99d68c \ --hash=sha256:52983c89d394c6f881a121e5101c5fa67278ca3b1f339c8fb2ef39230c70e9ac # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # adlfs # azure-identity # azure-storage-blob # smart-open +azure-datalake-store==0.0.53 \ + --hash=sha256:05b6de62ee3f2a0a6e6941e6933b792b800c3e7f6ffce2fc324bc19875757393 \ + --hash=sha256:a30c902a6e360aa47d7f69f086b426729784e71c536f330b691647a51dc42b2b + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # adlfs azure-identity==1.17.1 \ --hash=sha256:32ecc67cc73f4bd0595e4f64b1ca65cd05186f4fe6f98ed2ae9f1aa32646efea \ --hash=sha256:db8d59c183b680e763722bfe8ebc45930e6c57df510620985939f7f3191e0382 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt + # adlfs azure-storage-blob==12.22.0 \ --hash=sha256:b3804bb4fe8ab1c32771fa464053da772a682c2737b19da438a3f4e5e3b3736e \ --hash=sha256:bb7d2d824ce3f11f14a27ee7d9281289f7e072ac8311c52e3652672455b7d5e8 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # adlfs # smart-open babel==2.13.1 \ --hash=sha256:33e0952d7dd6374af8dbf6768cc4ddf3ccfefc244f9986d4074704f2fbd18900 \ --hash=sha256:7077a4984b02b6727ac10f1f7294484f737443d7e2e66c5e4380e41a3ae0b4ed # via - # -c 
python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyterlab-server # sphinx backcall==0.2.0 \ --hash=sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e \ --hash=sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipython backoff==2.2.1 \ --hash=sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba \ @@ -253,8 +275,14 @@ beautifulsoup4==4.11.1 \ --hash=sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30 \ --hash=sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # nbconvert +billiard==4.2.1 \ + --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # celery blake3==1.0.4 \ --hash=sha256:00605aa59923205c6a4f21131840840eb2d9a754c59b163357d890566755b97a \ --hash=sha256:08f46c2f1c5f369f07409e3e4ff248bcb22617cd741f2224873d85982dd6034e \ @@ -346,20 +374,20 @@ bleach==6.1.0 \ --hash=sha256:0a31f1837963c41d46bbf1331b8778e1308ea0791db03cc4e7357b97cf42a8fe \ --hash=sha256:3225f354cfc436b9789c66c4ee030194bee0568fbf9cbdad3bc8b5c26c5f12b6 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # nbconvert -boto3==1.26.76 \ - --hash=sha256:30c7d967ed1c6b5a05643e42cae9d4d36c3f1cb6782637ddc7007a104cfd9027 \ - --hash=sha256:b4c2969b7677762914394b8273cc1905dfe5b71f250741c1a575487ae357e729 +boto3==1.29.7 \ + --hash=sha256:1eb4c548118b5fc5e018dee956fd33e6fb249cd1f2def85f1bba816aef4d9f3e \ + 
--hash=sha256:96e9890ebe7cd823b5f4976dd676e112c000c6528c28e20a2f274590589dd18b # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # smart-open -botocore==1.29.76 \ - --hash=sha256:70735b00cd529f152992231ca6757e458e5ec25db43767b3526e9a35b2f143b7 \ - --hash=sha256:c2f67b6b3f8acf2968eafca06526f07b9fb0d27bac4c68a635d51abb675134a7 +botocore==1.32.7 \ + --hash=sha256:58b33d02cafa23461c8a9d211b30e8cded992380a84de409379fd02811fa3e11 \ + --hash=sha256:c6795c731b04c8e3635588c44cfd1a4462fc5987859195522c96812cf3eceff9 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # boto3 # s3transfer @@ -367,7 +395,7 @@ cachetools==5.5.2 \ --hash=sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4 \ --hash=sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # google-auth # vllm cbor2==5.6.5 \ @@ -416,11 +444,17 @@ cbor2==5.6.5 \ --hash=sha256:fde21ac1cf29336a31615a2c469a9cb03cf0add3ae480672d4d38cda467d07fc \ --hash=sha256:fe11c2eb518c882cfbeed456e7a552e544893c17db66fe5d3230dbeaca6b615c # via vllm +celery==5.5.3 \ + --hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # -r python/requirements.txt certifi==2025.1.31 \ --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r 
python/requirements/cloud-requirements.txt # httpcore # httpx @@ -479,8 +513,9 @@ cffi==1.16.0 \ --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # argon2-cffi-bindings + # azure-datalake-store # cryptography # soundfile charset-normalizer==3.3.2 \ @@ -575,30 +610,52 @@ charset-normalizer==3.3.2 \ --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # requests click==8.1.7 \ --hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ --hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt + # celery + # click-didyoumean + # click-plugins + # click-repl # ray # typer # uvicorn +click-didyoumean==0.3.1 \ + --hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # celery +click-plugins==1.1.1.2 \ + --hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via 
+ # -c python/deplocks/llm/ray_test_py311_cu121.lock + # celery cloudpickle==2.2.0 \ --hash=sha256:3f4219469c55453cfe4737e564b67c2a149109dabf7f242478948b895f61106f \ --hash=sha256:7428798d5926d8fcbfd092d18d01a2a03daf8237d8fcdc8095d256b8490796f0 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # gymnasium # vllm colorama==0.4.6 \ --hash=sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44 \ --hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # halo # log-symbols @@ -606,13 +663,13 @@ colorful==0.5.5 \ --hash=sha256:62c187e27c1433db9463ff93b1451898d1e7e23a7e553583fd9daeb6325182e4 \ --hash=sha256:66f8c1264b2a26f7293b96a03bb7a76c4bc8b9634369a0bffdcd12d618056a1d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt comm==0.2.0 \ --hash=sha256:2da8d9ebb8dd7bfc247adaff99f24dce705638a8042b85cb995066793e391001 \ --hash=sha256:a517ea2ca28931c7007a7a99c562a0fa5883cfb48963140cf642c41c948498be # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipykernel # ipywidgets compressed-tensors==0.10.2 \ @@ -658,7 +715,7 @@ cryptography==44.0.3 \ --hash=sha256:fc3c9babc1e1faefd62704bb46a69f359a9819eb0292e40df3fb6e3574715cd4 \ --hash=sha256:fe19d8bc5536a91a24a8133328880a41831b6c5df54599a8417b62fe015d3053 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # azure-identity # azure-storage-blob # msal @@ -678,7 +735,7 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ --hash=sha256:dac0284fecb90b5731f514e569a6fcf6674a730ae95b9490781a713b60a34423 \ 
--hash=sha256:e7a25ef1b44ae6276b5105affc2289edb34f1aa6676babd5bcd80907348c4cfa # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # ray debugpy==1.8.0 \ @@ -701,19 +758,19 @@ debugpy==1.8.0 \ --hash=sha256:ef54404365fae8d45cf450d0544ee40cefbcb9cb85ea7afe89a963c27028261e \ --hash=sha256:ef9ab7df0b9a42ed9c878afd3eaaff471fce3fa73df96022e1f5c9f8f8c87ada # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipykernel decorator==5.1.1 \ --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipython defusedxml==0.7.1 \ --hash=sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69 \ --hash=sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # nbconvert depyf==0.19.0 \ --hash=sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5 \ @@ -731,7 +788,7 @@ distlib==0.3.7 \ --hash=sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057 \ --hash=sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # virtualenv distro==1.9.0 \ --hash=sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed \ @@ -785,7 +842,7 @@ dm-tree==0.1.8 \ --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c 
python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt dnspython==2.7.0 \ --hash=sha256:b4c34b7d10b51bcc3a5071e7b8dee77939f1e878477eeecc965e9835f63c6c86 \ @@ -807,26 +864,26 @@ entrypoints==0.4 \ --hash=sha256:b706eddaa9218a19ebcd67b56818f05bb27589b1ca9e8d797b74affad4ccacd4 \ --hash=sha256:f174b5ff827504fd3cd97cc3f8649f3693f51538c7e4bdf3ef002c8429d42f9f # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyter-client # nbconvert executing==2.0.1 \ --hash=sha256:35afe2ce3affba8ee97f2d69927fa823b08b472b7b994e36a52a964b93d16147 \ --hash=sha256:eac49ca94516ccc753f9fb5ce82603156e590b27525a8bc32cce8ae302eb61bc # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # stack-data farama-notifications==0.0.4 \ --hash=sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18 \ --hash=sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # gymnasium fastapi==0.115.12 \ --hash=sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681 \ --hash=sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # vllm fastapi-cli==0.0.5 \ @@ -837,7 +894,7 @@ fastjsonschema==2.19.0 \ --hash=sha256:b9fd1a2dd6971dbc7fee280a95bd199ae0dd9ce22beb91cc75e9c1c528a5170e \ --hash=sha256:e25df6647e1bc4a26070b700897b07b542ec898dd4f1f6ea013e7f6a88417225 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # nbformat fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:067edb0a0805bf61e17a251d5046af59f6e9d2b8ad01222e0ef7a0b7937d5548 \ @@ -916,13 +973,13 
@@ fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:fb5363cf0fddd9b50525ddbf64a1e1b28ec4c6dfb28670a940cb1cf988a6786b \ --hash=sha256:ff75c90663d6e8996610d435e71487daa853871ad1770dd83dc0f2fc4997241e # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # cupy-cuda12x filelock==3.17.0 \ --hash=sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338 \ --hash=sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # huggingface-hub # ray @@ -934,7 +991,7 @@ fqdn==1.5.1 \ --hash=sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f \ --hash=sha256:3a179af3761e4df6eb2e026ff9e1a3033d3587bf980a0b1b2e1e5d08d7358014 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jsonschema frozenlist==1.4.1 \ --hash=sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7 \ @@ -1015,15 +1072,16 @@ frozenlist==1.4.1 \ --hash=sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced \ --hash=sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # aiohttp # aiosignal -fsspec==2023.5.0 \ - --hash=sha256:51a4ad01a5bb66fcc58036e288c0d53d3975a0df2a5dc59a93b59bade0391f2a \ - --hash=sha256:b3b56e00fb93ea321bc9e5d9cf6f8522a0198b20eb24e02774d329e9c6fb84ce +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt + # adlfs # 
huggingface-hub # torch gguf==0.16.2 \ @@ -1034,19 +1092,19 @@ gitdb==4.0.11 \ --hash=sha256:81a3407ddd2ee8df444cbacea00e2d038e40150acfa3001696fe0dcf1d3adfa4 \ --hash=sha256:bf5421126136d6d0af55bc1e7c1af1c397a34f5b7bd79e776cd3e89785c2b04b # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # gitpython gitpython==3.1.44 \ --hash=sha256:9e0e10cda9bed1ee64bc9a6de50e7e38a9c9943241cd7f585f6df3ed28011110 \ --hash=sha256:c87e30b26253bf5418b01b0660f818967f3c503193838337fe5e573331249269 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt google-api-core==2.24.2 \ --hash=sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9 \ --hash=sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # google-cloud-core # google-cloud-storage # opencensus @@ -1054,7 +1112,7 @@ google-auth==2.23.4 \ --hash=sha256:79905d6b1652187def79d491d6e23d0cbb3a21d3c7ba0dbaa9c8a01906b13ff3 \ --hash=sha256:d4bbc92fe4b8bfd2f3e8d88e5ba7085935da208ee38a134fc280e7ce682a05f2 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # google-api-core # google-cloud-core @@ -1063,13 +1121,13 @@ google-cloud-core==2.4.1 \ --hash=sha256:9b7749272a812bde58fff28868d0c5e2f585b82f37e09a1f6ed2d4d10f134073 \ --hash=sha256:a9e6a4422b9ac5c29f79a0ede9485473338e2ce78d91f2370c01e730eab22e61 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # google-cloud-storage google-cloud-storage==2.14.0 \ --hash=sha256:2d23fcf59b55e7b45336729c148bb1c464468c69d5efbaee30f7201dd90eb97e \ 
--hash=sha256:8641243bbf2a2042c16a6399551fbb13f062cbc9a2de38d6c0bb5426962e9dbd # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # smart-open google-crc32c==1.5.0 \ @@ -1142,100 +1200,148 @@ google-crc32c==1.5.0 \ --hash=sha256:fd8536e902db7e365f49e7d9029283403974ccf29b13fc7028b97e2295b33556 \ --hash=sha256:fe70e325aa68fa4b5edf7d1a4b6f691eb04bbccac0ace68e34820d283b5f80d4 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # google-cloud-storage # google-resumable-media google-resumable-media==2.6.0 \ --hash=sha256:972852f6c65f933e15a4a210c2b96930763b47197cdf4aa5f5bea435efb626e7 \ --hash=sha256:fc03d344381970f79eebb632a3c18bb1828593a2dc5572b5f90115ef7d11e81b # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # google-cloud-storage googleapis-common-protos==1.61.0 \ --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # google-api-core -grpcio==1.66.2 \ - --hash=sha256:02697eb4a5cbe5a9639f57323b4c37bcb3ab2d48cec5da3dc2f13334d72790dd \ - --hash=sha256:03b0b307ba26fae695e067b94cbb014e27390f8bc5ac7a3a39b7723fed085604 \ - --hash=sha256:05bc2ceadc2529ab0b227b1310d249d95d9001cd106aa4d31e8871ad3c428d73 \ - --hash=sha256:06de8ec0bd71be123eec15b0e0d457474931c2c407869b6c349bd9bed4adbac3 \ - --hash=sha256:0be4e0490c28da5377283861bed2941d1d20ec017ca397a5df4394d1c31a9b50 \ - --hash=sha256:12fda97ffae55e6526825daf25ad0fa37483685952b5d0f910d6405c87e3adb6 \ - --hash=sha256:1caa38fb22a8578ab8393da99d4b8641e3a80abc8fd52646f1ecc92bcb8dee34 \ - 
--hash=sha256:2018b053aa15782db2541ca01a7edb56a0bf18c77efed975392583725974b249 \ - --hash=sha256:20657d6b8cfed7db5e11b62ff7dfe2e12064ea78e93f1434d61888834bc86d75 \ - --hash=sha256:2335c58560a9e92ac58ff2bc5649952f9b37d0735608242973c7a8b94a6437d8 \ - --hash=sha256:31fd163105464797a72d901a06472860845ac157389e10f12631025b3e4d0453 \ - --hash=sha256:38b68498ff579a3b1ee8f93a05eb48dc2595795f2f62716e797dc24774c1aaa8 \ - --hash=sha256:3b00efc473b20d8bf83e0e1ae661b98951ca56111feb9b9611df8efc4fe5d55d \ - --hash=sha256:3ed71e81782966ffead60268bbda31ea3f725ebf8aa73634d5dda44f2cf3fb9c \ - --hash=sha256:45a3d462826f4868b442a6b8fdbe8b87b45eb4f5b5308168c156b21eca43f61c \ - --hash=sha256:49f0ca7ae850f59f828a723a9064cadbed90f1ece179d375966546499b8a2c9c \ - --hash=sha256:4e504572433f4e72b12394977679161d495c4c9581ba34a88d843eaf0f2fbd39 \ - --hash=sha256:4ea1d062c9230278793820146c95d038dc0f468cbdd172eec3363e42ff1c7d01 \ - --hash=sha256:563588c587b75c34b928bc428548e5b00ea38c46972181a4d8b75ba7e3f24231 \ - --hash=sha256:6001e575b8bbd89eee11960bb640b6da6ae110cf08113a075f1e2051cc596cae \ - --hash=sha256:66a0cd8ba6512b401d7ed46bb03f4ee455839957f28b8d61e7708056a806ba6a \ - --hash=sha256:6851de821249340bdb100df5eacfecfc4e6075fa85c6df7ee0eb213170ec8e5d \ - --hash=sha256:728bdf36a186e7f51da73be7f8d09457a03061be848718d0edf000e709418987 \ - --hash=sha256:73e3b425c1e155730273f73e419de3074aa5c5e936771ee0e4af0814631fb30a \ - --hash=sha256:73fc8f8b9b5c4a03e802b3cd0c18b2b06b410d3c1dcbef989fdeb943bd44aff7 \ - --hash=sha256:78fa51ebc2d9242c0fc5db0feecc57a9943303b46664ad89921f5079e2e4ada7 \ - --hash=sha256:7b2c86457145ce14c38e5bf6bdc19ef88e66c5fee2c3d83285c5aef026ba93b3 \ - --hash=sha256:7d69ce1f324dc2d71e40c9261d3fdbe7d4c9d60f332069ff9b2a4d8a257c7b2b \ - --hash=sha256:802d84fd3d50614170649853d121baaaa305de7b65b3e01759247e768d691ddf \ - --hash=sha256:80fd702ba7e432994df208f27514280b4b5c6843e12a48759c9255679ad38db8 \ - --hash=sha256:8ac475e8da31484efa25abb774674d837b343afb78bb3bcdef10f81a93e3d6bf \ - 
--hash=sha256:950da58d7d80abd0ea68757769c9db0a95b31163e53e5bb60438d263f4bed7b7 \ - --hash=sha256:99a641995a6bc4287a6315989ee591ff58507aa1cbe4c2e70d88411c4dcc0839 \ - --hash=sha256:9c3a99c519f4638e700e9e3f83952e27e2ea10873eecd7935823dab0c1c9250e \ - --hash=sha256:9c509a4f78114cbc5f0740eb3d7a74985fd2eff022971bc9bc31f8bc93e66a3b \ - --hash=sha256:a18e20d8321c6400185b4263e27982488cb5cdd62da69147087a76a24ef4e7e3 \ - --hash=sha256:a917d26e0fe980b0ac7bfcc1a3c4ad6a9a4612c911d33efb55ed7833c749b0ee \ - --hash=sha256:a9539f01cb04950fd4b5ab458e64a15f84c2acc273670072abe49a3f29bbad54 \ - --hash=sha256:ad2efdbe90c73b0434cbe64ed372e12414ad03c06262279b104a029d1889d13e \ - --hash=sha256:b672abf90a964bfde2d0ecbce30f2329a47498ba75ce6f4da35a2f4532b7acbc \ - --hash=sha256:bbd27c24a4cc5e195a7f56cfd9312e366d5d61b86e36d46bbe538457ea6eb8dd \ - --hash=sha256:c400ba5675b67025c8a9f48aa846f12a39cf0c44df5cd060e23fda5b30e9359d \ - --hash=sha256:c408f5ef75cfffa113cacd8b0c0e3611cbfd47701ca3cdc090594109b9fcbaed \ - --hash=sha256:c806852deaedee9ce8280fe98955c9103f62912a5b2d5ee7e3eaa284a6d8d8e7 \ - --hash=sha256:ce89f5876662f146d4c1f695dda29d4433a5d01c8681fbd2539afff535da14d4 \ - --hash=sha256:d25a14af966438cddf498b2e338f88d1c9706f3493b1d73b93f695c99c5f0e2a \ - --hash=sha256:d8d4732cc5052e92cea2f78b233c2e2a52998ac40cd651f40e398893ad0d06ec \ - --hash=sha256:d9a9724a156c8ec6a379869b23ba3323b7ea3600851c91489b871e375f710bc8 \ - --hash=sha256:e636ce23273683b00410f1971d209bf3689238cf5538d960adc3cdfe80dd0dbd \ - --hash=sha256:e88264caad6d8d00e7913996030bac8ad5f26b7411495848cc218bd3a9040b6c \ - --hash=sha256:f145cc21836c332c67baa6fc81099d1d27e266401565bf481948010d6ea32d46 \ - --hash=sha256:fb57870449dfcfac428afbb5a877829fcb0d6db9d9baa1148705739e9083880e \ - --hash=sha256:fb70487c95786e345af5e854ffec8cb8cc781bcc5df7930c4fbb7feaa72e1cdf \ - --hash=sha256:fe96281713168a3270878255983d2cb1a97e034325c8c2c25169a69289d3ecfa \ - --hash=sha256:ff1f7882e56c40b0d33c4922c15dfa30612f05fb785074a012f7cda74d1c3679 - # via - # 
-c python/requirements_compiled_ray_test_py311_cu121.txt +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + --hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + 
--hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + --hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + 
--hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # grpcio-tools -gymnasium==1.0.0 \ - --hash=sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403 \ - --hash=sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad +grpcio-tools==1.62.3 \ + --hash=sha256:0a52cc9444df978438b8d2332c0ca99000521895229934a59f94f37ed896b133 \ + --hash=sha256:0a8c0c4724ae9c2181b7dbc9b186df46e4f62cb18dc184e46d06c0ebeccf569e \ + --hash=sha256:0cb3a3436ac119cbd37a7d3331d9bdf85dad21a6ac233a3411dff716dcbf401e \ + --hash=sha256:11c625eebefd1fd40a228fc8bae385e448c7e32a6ae134e43cf13bbc23f902b7 \ + --hash=sha256:11f363570dea661dde99e04a51bd108a5807b5df32a6f8bdf4860e34e94a4dbf \ + --hash=sha256:141d028bf5762d4a97f981c501da873589df3f7e02f4c1260e1921e565b376fa \ + --hash=sha256:1c989246c2aebc13253f08be32538a4039a64e12d9c18f6d662d7aee641dc8b5 \ + --hash=sha256:1da38070738da53556a4b35ab67c1b9884a5dd48fa2f243db35dc14079ea3d0c \ + --hash=sha256:27cd9ef5c5d68d5ed104b6dcb96fe9c66b82050e546c9e255716903c3d8f0373 \ + --hash=sha256:2e02d3b96f2d0e4bab9ceaa30f37d4f75571e40c6272e95364bff3125a64d184 \ + --hash=sha256:2f968b049c2849540751ec2100ab05e8086c24bead769ca734fdab58698408c1 \ + --hash=sha256:350a80485e302daaa95d335a931f97b693e170e02d43767ab06552c708808950 \ + --hash=sha256:3eae6ea76d62fcac091e1f15c2dcedf1dc3f114f8df1a972a8a0745e89f4cf61 \ + --hash=sha256:47a5c093ab256dec5714a7a345f8cc89315cb57c298b276fa244f37a0ba507f0 \ + --hash=sha256:5782883a27d3fae8c425b29a9d3dcf5f47d992848a1b76970da3b5a28d424b26 \ + 
--hash=sha256:6a56d344b0bab30bf342a67e33d386b0b3c4e65868ffe93c341c51e1a8853ca5 \ + --hash=sha256:6c3064610826f50bd69410c63101954676edc703e03f9e8f978a135f1aaf97c1 \ + --hash=sha256:703f46e0012af83a36082b5f30341113474ed0d91e36640da713355cd0ea5d23 \ + --hash=sha256:710fecf6a171dcbfa263a0a3e7070e0df65ba73158d4c539cec50978f11dad5d \ + --hash=sha256:7c7136015c3d62c3eef493efabaf9e3380e3e66d24ee8e94c01cb71377f57833 \ + --hash=sha256:7cc83023acd8bc72cf74c2edbe85b52098501d5b74d8377bfa06f3e929803492 \ + --hash=sha256:7f2483ea232bd72d98a6dc6d7aefd97e5bc80b15cd909b9e356d6f3e326b6e43 \ + --hash=sha256:7ff7d58a45b75df67d25f8f144936a3e44aabd91afec833ee06826bd02b7fbe7 \ + --hash=sha256:8ad0473af5544f89fc5a1ece8676dd03bdf160fb3230f967e05d0f4bf89620e3 \ + --hash=sha256:8c5d22b252dcef11dd1e0fbbe5bbfb9b4ae048e8880d33338215e8ccbdb03edc \ + --hash=sha256:8e62cc7164b0b7c5128e637e394eb2ef3db0e61fc798e80c301de3b2379203ed \ + --hash=sha256:962c84b4da0f3b14b3cdb10bc3837ebc5f136b67d919aea8d7bb3fd3df39528a \ + --hash=sha256:ace43b26d88a58dcff16c20d23ff72b04d0a415f64d2820f4ff06b1166f50557 \ + --hash=sha256:b47d0dda1bdb0a0ba7a9a6de88e5a1ed61f07fad613964879954961e36d49193 \ + --hash=sha256:b77f9f9cee87cd798f0fe26b7024344d1b03a7cd2d2cba7035f8433b13986325 \ + --hash=sha256:b881fd9505a84457e9f7e99362eeedd86497b659030cf57c6f0070df6d9c2b9b \ + --hash=sha256:bfda6ee8990997a9df95c5606f3096dae65f09af7ca03a1e9ca28f088caca5cf \ + --hash=sha256:c3a1ac9d394f8e229eb28eec2e04b9a6f5433fa19c9d32f1cb6066e3c5114a1d \ + --hash=sha256:c8ad5cce554e2fcaf8842dee5d9462583b601a3a78f8b76a153c38c963f58c10 \ + --hash=sha256:ca246dffeca0498be9b4e1ee169b62e64694b0f92e6d0be2573e65522f39eea9 \ + --hash=sha256:ca4f5eeadbb57cf03317d6a2857823239a63a59cc935f5bd6cf6e8b7af7a7ecc \ + --hash=sha256:d102b9b21c4e1e40af9a2ab3c6d41afba6bd29c0aa50ca013bf85c99cdc44ac5 \ + --hash=sha256:db3bc9fa39afc5e4e2767da4459df82b095ef0cab2f257707be06c44a1c2c3e5 \ + --hash=sha256:dc9ad9950119d8ae27634e68b7663cc8d340ae535a0f80d85a55e56a6973ab1f \ + 
--hash=sha256:e02d7c1a02e3814c94ba0cfe43d93e872c758bd8fd5c2797f894d0c49b4a1dfc \ + --hash=sha256:e0898d412a434e768a0c7e365acabe13ff1558b767e400936e26b5b6ed1ee51f \ + --hash=sha256:e18e15287c31baf574fcdf8251fb7f997d64e96c6ecf467906e576da0a079af6 \ + --hash=sha256:ec279dcf3518201fc592c65002754f58a6b542798cd7f3ecd4af086422f33f29 \ + --hash=sha256:ec6fbded0c61afe6f84e3c2a43e6d656791d95747d6d28b73eff1af64108c434 \ + --hash=sha256:eec73a005443061f4759b71a056f745e3b000dc0dc125c9f20560232dfbcbd14 \ + --hash=sha256:f3d812daffd0c2d2794756bd45a353f89e55dc8f91eb2fc840c51b9f6be62667 \ + --hash=sha256:f4b1615adf67bd8bb71f3464146a6f9949972d06d21a4f5e87e73f6464d97f57 \ + --hash=sha256:f6831fdec2b853c9daa3358535c55eed3694325889aa714070528cf8f92d7d6d + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # -r python/requirements/cloud-requirements.txt +gymnasium==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt h11==0.16.0 \ --hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \ --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # httpcore # uvicorn halo==0.0.31 \ --hash=sha256:5350488fb7d2aa7c31a1344120cee67a872901ce8858f60da7946cef96c208ab \ --hash=sha256:7b67a3521ee91d53b7152d4ee3452811e1d2a6321975137762eb3d70063cc9d6 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt hf-transfer==0.1.9 \ --hash=sha256:035572865dab29d17e783fbf1e84cf1cb24f3fcf8f1b17db1cfc7fdf139f02bf \ @@ -1264,7 +1370,7 @@ hf-transfer==0.1.9 \ 
--hash=sha256:ee8b10afedcb75f71091bcc197c526a6ebf5c58bbbadb34fdeee6160f55f619f \ --hash=sha256:fc6bd19e1cc177c66bdef15ef8636ad3bde79d5a4f608c158021153b4573509d # via -r python/requirements/llm/llm-requirements.txt -hf-xet==1.1.5 \ +hf-xet==1.1.5 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:69ebbcfd9ec44fdc2af73441619eeb06b94ee34511bbcf57cd423820090f5694 \ --hash=sha256:73e167d9807d166596b4b2f0b585c6d5bd84a26dea32843665a8b58f6edba245 \ --hash=sha256:83088ecea236d5113de478acb2339f92c95b4fb0462acaa30621fac02f5a534a \ @@ -1282,7 +1388,7 @@ httplib2==0.20.4 \ --hash=sha256:58a98e45b4b1a48273073f905d2961666ecf0fbac4250ea5b47aef259eb5c585 \ --hash=sha256:8b6a905cb1c79eefd03f8669fd993c36dc341f7c558f056cb5a33b5c2f458543 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # oauth2client httptools==0.6.4 \ --hash=sha256:0614154d5454c21b6410fdf5262b4a3ddb0f53f1e1721cfd59d55f32138c578a \ @@ -1342,18 +1448,17 @@ huggingface-hub==0.34.3 \ # via # tokenizers # transformers - # vllm humanize==4.12.1 \ --hash=sha256:1338ba97415c96556758a6e2f65977ed406dddf4620d4c6db9bbdfd07f0f1232 \ --hash=sha256:86014ca5c52675dffa1d404491952f1f5bf03b07c175a51891a343daebf01fea # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt idna==3.7 \ --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # anyio # email-validator # httpx @@ -1364,7 +1469,7 @@ imageio==2.34.2 \ --hash=sha256:5c0c0ee8faa018a1c42f649b90395dd4d3bb6187c09053a0cd6f1fdd51bbff5e \ 
--hash=sha256:a0bb27ec9d5bab36a9f4835e51b21d2cb099e1f78451441f94687ff3404b79f8 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # scikit-image imagesize==1.4.1 \ --hash=sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b \ @@ -1374,13 +1479,13 @@ importlib-metadata==6.11.0 \ --hash=sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443 \ --hash=sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # opentelemetry-api iniconfig==2.0.0 \ --hash=sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3 \ --hash=sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # pytest interegular==0.3.3 \ --hash=sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c \ @@ -1390,14 +1495,14 @@ ipykernel==6.27.1 \ --hash=sha256:7d5d594b6690654b4d299edba5e872dc17bb7396a8d0609c97cb7b8a1c605de6 \ --hash=sha256:dab88b47f112f9f7df62236511023c9bdeef67abc73af7c652e4ce4441601686 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # nbclassic # notebook ipython==8.12.3 \ --hash=sha256:3910c4b54543c2ad73d06579aa771041b7d5707b033bd488669b4cf544e3b363 \ --hash=sha256:b0340d46a933d27c657b211a329d0be23793c36595acf9e6ef4164bc01a1804c # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipykernel # ipywidgets # jupyterlab @@ -1405,38 +1510,38 @@ ipython-genutils==0.2.0 \ --hash=sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8 \ --hash=sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8 # via - # -c 
python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # nbclassic # notebook ipywidgets==8.1.3 \ --hash=sha256:efafd18f7a142248f7cb0ba890a68b96abd4d6e88ddbda483c9130d12667eaf2 \ --hash=sha256:f5f9eeaae082b1823ce9eac2575272952f40d748893972956dc09700a6392d9c # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt isodate==0.6.1 \ --hash=sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96 \ --hash=sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # azure-storage-blob isoduration==20.11.0 \ --hash=sha256:ac2f9015137935279eac671f94f89eb00584f940f5dc49462a0c4ee692ba1bd9 \ --hash=sha256:b2904c2a4228c3d44f409c8ae8e2370eb21a26f7ac2ec5446df141dde3452042 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jsonschema jedi==0.19.1 \ --hash=sha256:cf0496f3651bc65d7174ac1b7d043eff454892c708a87d1b683e57b569927ffd \ --hash=sha256:e983c654fe5c02867aef4cdfce5a2fbb4a50adc0af145f70504238f18ef5e7e0 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipython jinja2==3.1.6 \ --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \ --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # fastapi # jupyter-server # jupyterlab @@ -1529,26 +1634,26 @@ jmespath==1.0.1 \ --hash=sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980 \ --hash=sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe # via - # -c 
python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # boto3 # botocore json5==0.9.14 \ --hash=sha256:740c7f1b9e584a468dbb2939d8d458db3427f2c93ae2139d05f47e453eae964f \ --hash=sha256:9ed66c3a6ca3510a976a9ef9b8c0787de24802724ab1860bc0153c7fdd589b02 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyterlab-server jsonpatch==1.32 \ --hash=sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397 \ --hash=sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt jsonpointer==2.4 \ --hash=sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a \ --hash=sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jsonpatch # jsonschema jsonref==1.1.0 \ @@ -1559,7 +1664,7 @@ jsonschema==4.23.0 \ --hash=sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4 \ --hash=sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements/llm/llm-requirements.txt # -r python/requirements.txt @@ -1572,13 +1677,13 @@ jsonschema-specifications==2024.10.1 \ --hash=sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272 \ --hash=sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jsonschema jupyter-client==7.3.4 \ 
--hash=sha256:17d74b0d0a7b24f1c8c527b24fcf4607c56bee542ffe8e3418e50b21e514b621 \ --hash=sha256:aa9a6c32054b290374f95f73bb0cae91455c58dfb84f65c8591912b8f65e6d56 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipykernel # jupyter-server # nbclassic @@ -1588,7 +1693,7 @@ jupyter-core==5.5.0 \ --hash=sha256:880b86053bf298a8724994f95e99b99130659022a4f7f45f563084b6223861d3 \ --hash=sha256:e11e02cd8ae0a9de5c6c44abf5727df9f2581055afe00b22183f621ba3585805 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipykernel # jupyter-client # jupyter-server @@ -1601,13 +1706,13 @@ jupyter-events==0.6.3 \ --hash=sha256:57a2749f87ba387cd1bfd9b22a0875b889237dbf2edc2121ebb22bde47036c17 \ --hash=sha256:9a6e9995f75d1b7146b436ea24d696ce3a35bfa8bfe45e0c33c334c79464d0b3 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyter-server-fileid jupyter-server==1.24.0 \ --hash=sha256:23368e8e214baf82b313d4c5a0d828ca73015e1a192ce3829bd74e62fab8d046 \ --hash=sha256:c88ddbe862966ea1aea8c3ccb89a5903abd8fbcfe5cd14090ef549d403332c37 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyter-server-fileid # jupyterlab # jupyterlab-server @@ -1617,49 +1722,55 @@ jupyter-server-fileid==0.9.0 \ --hash=sha256:171538b7c7d08d11dbc57d4e6da196e0c258e4c2cd29249ef1e032bb423677f8 \ --hash=sha256:5b489c6fe6783c41174a728c7b81099608518387e53c3d53451a67f46a0cb7b0 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyter-server-ydoc jupyter-server-ydoc==0.6.1 \ --hash=sha256:18275ff1ce7e93bbda2301ca066273b3951fc50b0d9c8fc33788374134ad7920 \ --hash=sha256:ab10864708c81fa41ab9f2ed3626b54ff6926eaf14545d1d439714978dad6e9f # via - # -c 
python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyterlab jupyter-ydoc==0.2.5 \ --hash=sha256:5759170f112c70320a84217dd98d287699076ae65a7f88d458d57940a9f2b882 \ --hash=sha256:5a02ca7449f0d875f73e8cb8efdf695dddef15a8e71378b1f4eda6b7c90f5382 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyter-server-ydoc # jupyterlab jupyterlab==3.6.1 \ --hash=sha256:ad6707dd0149b629d0ed5b56916cfcdb816b376c6af3190337faba09e27ea29e \ --hash=sha256:aee98c174180e98a30470297d10b959e8e64f2288970c0de65f0a6d2b4807034 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt jupyterlab-pygments==0.3.0 \ --hash=sha256:721aca4d9029252b11cfa9d185e5b5af4d54772bb8072f9b7036f4170054d35d \ --hash=sha256:841a89020971da1d8693f1a99997aefc5dc424bb1b251fd6322462a1b8842780 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # nbconvert jupyterlab-server==2.24.0 \ --hash=sha256:4e6f99e0a5579bbbc32e449c4dbb039561d4f1a7827d5733273ed56738f21f07 \ --hash=sha256:5f077e142bb8dc9b843d960f940c513581bceca3793a0d80f9c67d9522c4e876 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyterlab jupyterlab-widgets==3.0.11 \ --hash=sha256:78287fd86d20744ace330a61625024cf5521e1c012a352ddc0a3cdc2348becd0 \ --hash=sha256:dd5ac679593c969af29c9bed054c24f26842baa51352114736756bc035deee27 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipywidgets jupytext==1.16.7 \ --hash=sha256:912f9d9af7bd3f15470105e5c5dddf1669b2d8c17f0c55772687fc5a4a73fe69 \ --hash=sha256:fc4e97f0890e22062c4ef10313c7ca960b07b3767246a1fef7585888cc2afe5d # via -r 
python/requirements/llm/llm-test-requirements.txt +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # celery lark==1.2.2 \ --hash=sha256:c2276486b02f0f1b90be155f2c8ba4a8e194d42775786db622faccd652d8e80c \ --hash=sha256:ca807d0162cd16cef15a8feecb862d7319e7a09bdb13aef927968e45040fed80 @@ -1668,7 +1779,7 @@ lazy-loader==0.4 \ --hash=sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # scikit-image llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ @@ -1711,7 +1822,7 @@ log-symbols==0.0.14 \ --hash=sha256:4952106ff8b605ab7d5081dd2c7e6ca7374584eff7086f499c06edd1ce56dcca \ --hash=sha256:cf0bbc6fe1a8e53f0d174a716bc625c4f87043cc21eb55dd8a740cfe22680556 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # halo lxml==4.9.4 \ --hash=sha256:00e91573183ad273e242db5585b52670eddf92bacad095ce25c1e682da14ed91 \ @@ -1808,7 +1919,7 @@ lxml==4.9.4 \ --hash=sha256:fd814847901df6e8de13ce69b84c31fc9b3fb591224d6762d0b256d510cbf382 \ --hash=sha256:fdb325b7fba1e2c40b9b1db407f85642e32404131c08480dd652110fc908561b # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # nbconvert lz4==4.3.3 \ --hash=sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e \ @@ -1848,13 +1959,13 @@ lz4==4.3.3 \ --hash=sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0 \ 
--hash=sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt markdown-it-py==2.2.0 \ --hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupytext # mdit-py-plugins # rich @@ -1920,14 +2031,14 @@ markupsafe==2.1.3 \ --hash=sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2 \ --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jinja2 # nbconvert matplotlib-inline==0.1.6 \ --hash=sha256:f1f41aab5328aa5aaea9b16d083b128102f8712542f819fe7e6a420ff581b311 \ --hash=sha256:f887e5f10ba98e8d2b150ddcf4702c1e5f8b3a20005eb0f74bfdbd360ee6f304 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipykernel # ipython mdit-py-plugins==0.4.2 \ @@ -1938,7 +2049,7 @@ mdurl==0.1.2 \ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # markdown-it-py memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:0a21745fb516b7a6efcd40aa7487c59e9313fcfc782d0193fcfcf00b48426874 \ @@ -1977,7 +2088,7 @@ memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:e356af93e3b031c83957e9ac1a653f5aaba5df1e357dd17142f5ed19bb3dc660 \ --hash=sha256:f16c5c8730b616613dc8bafe32649ca6bd7252606251eb00148582011758d0b5 # via - # -c 
python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt meson==1.8.3 \ --hash=sha256:ef02b806ce0c5b6becd5bb5dc9fa67662320b29b337e7ace73e4354500590233 \ @@ -1991,7 +2102,7 @@ mistune==0.8.4 \ --hash=sha256:59a3429db53c50b5c6bcc8a07f8848cb00d7dc8bdb431a4ab41920d201d4756e \ --hash=sha256:88a1051873018da288eee8538d476dffe1262495144b33ecb586c4ab266bb8d4 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # nbconvert mpmath==1.3.0 \ --hash=sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f \ @@ -2001,14 +2112,15 @@ msal==1.28.1 \ --hash=sha256:563c2d70de77a2ca9786aab84cb4e133a38a6897e6676774edc23d610bfc9e7b \ --hash=sha256:d72bbfe2d5c2f2555f4bc6205be4450ddfd12976610dd9a16a9ab0f05c68b64d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # azure-datalake-store # azure-identity # msal-extensions msal-extensions==1.2.0b1 \ --hash=sha256:217f391bb549de11b19abe8029a8375fe3ca0556aa8cce004b2083f00a569b71 \ --hash=sha256:3658b3814cd6a7759e83cb0ec145f30330ee249a92444adaf9aa4eb4f5bbcbbc # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # azure-identity msgpack==1.0.7 \ --hash=sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862 \ @@ -2068,7 +2180,7 @@ msgpack==1.0.7 \ --hash=sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002 \ --hash=sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # ray msgspec==0.19.0 \ @@ -2201,27 +2313,27 @@ multidict==6.0.5 \ --hash=sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423 \ 
--hash=sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # aiohttp # yarl nbclassic==1.0.0 \ --hash=sha256:0ae11eb2319455d805596bf320336cda9554b41d99ab9a3c31bf8180bffa30e3 \ --hash=sha256:f99e4769b4750076cd4235c044b61232110733322384a94a63791d2e7beacc66 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyterlab # notebook nbclient==0.5.13 \ --hash=sha256:40c52c9b5e3c31faecaee69f202b3f53e38d7c1c563de0fadde9d7eda0fdafe8 \ --hash=sha256:47ac905af59379913c1f8f541098d2550153cf8dc58553cbe18c702b181518b0 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # nbconvert nbconvert==6.5.4 \ --hash=sha256:9e3c7c6d491374cbdd5f35d268c05809357716d346f4573186bbeab32ee50bc1 \ --hash=sha256:d679a947f849a966cbbd0bf6e7fedcfdb64be3b20ce7cef11ad55c13f5820e19 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyter-server # nbclassic # notebook @@ -2229,7 +2341,7 @@ nbformat==5.9.2 \ --hash=sha256:1c5172d786a41b82bcfd0c23f9e6b6f072e8fb49c39250219e4acfff1efe89e9 \ --hash=sha256:5f98b5ba1997dff175e77e0c17d5c10a96eaed2cbd1de3533d1fc35d5e111192 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyter-server # jupytext # nbclassic @@ -2240,7 +2352,7 @@ nest-asyncio==1.5.8 \ --hash=sha256:25aa2ca0d2a5b5531956b9e273b45cf664cae2b145101d73b86b199978d48fdb \ --hash=sha256:accda7a339a70599cb08f9dd09a67e0c2ef8d8d6f4c07f96ab203f2ae254e48d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipykernel # jupyter-client # nbclassic @@ -2250,7 +2362,7 @@ networkx==3.2.1 \ 
--hash=sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6 \ --hash=sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # scikit-image # torch ninja==1.11.1.3 \ @@ -2275,23 +2387,27 @@ ninja==1.11.1.3 \ # -r python/requirements/llm/llm-requirements.txt # vllm # xgrammar -nixl==0.3.1 \ - --hash=sha256:20428ad2668062a79045fae83cc5cba1f4019d4a2c7053cc8549c3a1533f8a75 \ - --hash=sha256:70b8932b50ccf1a13ac8fa2e10a4b78290baae9f963bfecfa67684104331a94b \ - --hash=sha256:8c144839484b3076f0b34ad8ceaeaff05c23399cf57ca85f2a94b44e1475a39b \ - --hash=sha256:ff59996ad05a7e4ba6c8beba0f1d8ac2f9e53df696a15af0d3340028e2f16081 +nixl==0.4.1 \ + --hash=sha256:10c7b4a44f89c3fbff3e20cb84973be95f8df36ee336fb108275ed1839fec1f1 \ + --hash=sha256:510cc9e824ad53cac71ce55ff41160f2a9e1507ceb52eb871b775fe1e42beb87 \ + --hash=sha256:8a3d83b28c16b795bdc281f1489b9d247f6e6088ad96ca96406072a36d6354b7 \ + --hash=sha256:9381fd3986d227c7ccb2607c03bbea559ec80f951e2ea47c1fbf381e4cd97164 \ + --hash=sha256:9ab7e580e9962ebdcda8c17f8548858d3fdb648621367d8e717ca317b534b778 \ + --hash=sha256:db144821de7912cb2502052b3070a1ac276b8b019470e6efdfce9c237ffe130d \ + --hash=sha256:e33102b85b3f95a8c95e59b59b29aabd03d47b5bce619de506b9bb83739cf60d \ + --hash=sha256:f16092dd445542e82e3db3553f6c7697ec5a2e837f19d416401283ae245826f9 # via -r python/requirements/llm/llm-requirements.txt notebook==6.5.7 \ --hash=sha256:04eb9011dfac634fbd4442adaf0a8c27cd26beef831fe1d19faf930c327768e4 \ --hash=sha256:a6afa9a4ff4d149a0771ff8b8c881a7a73b3835f9add0606696d6e9d98ac1cd0 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyterlab notebook-shim==0.2.3 \ --hash=sha256:a83496a43341c1674b093bfcebf0fe8e74cbe7eda5fd2bbc56f8e39e1486c0c7 \ 
--hash=sha256:f69388ac283ae008cd506dda10d0288b09a017d822d5e8c7129a152cbd3ce7e9 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # nbclassic numba==0.61.2 \ --hash=sha256:34fba9406078bac7ab052efbf0d13939426c753ad72946baaa5bf9ae0ebb8dd2 \ @@ -2354,7 +2470,7 @@ numpy==1.26.4 \ --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # cupy-cuda12x # gguf @@ -2474,23 +2590,39 @@ oauth2client==4.1.3 \ --hash=sha256:b8a81cc5d60e2d364f0b1b98f958dbd472887acaf1a5b05e21c28c31a2d6d3ac \ --hash=sha256:d486741e451287f69568a4d26d70d9acd73a2bbfa275746c535b4209891cccc6 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt -openai==1.90.0 \ - --hash=sha256:9771982cdd5b6631af68c6a603da72ed44cd2caf73b49f717a72b71374bc565b \ - --hash=sha256:e5dcb5498ea6b42fec47546d10f1bcc05fb854219a7d953a5ba766718b212a02 +openai==1.100.2 \ + --hash=sha256:54d3457b2c8d7303a1bc002a058de46bdd8f37a8117751c7cf4ed4438051f151 \ + --hash=sha256:787b4c3c8a65895182c58c424f790c25c790cc9a0330e34f73d55b6ee5a00e32 + # via vllm +openai-harmony==0.0.4 \ + --hash=sha256:038f1d6772d1be5213b36ae76e5d042022395ec35c428a73ccb8b839b2cecf6a \ + --hash=sha256:15e6d53a66502491a3675a536df30e271f976e6c5efe68250a65191efcb85c4f \ + --hash=sha256:2d8d16d84702059833fb03b841b28c25600c54e83cadccef79af44e1c81166b1 \ + --hash=sha256:31e9bcac0902a309e2fc688e52f247eec7fffcd00d17e958b9a83a8fea6519c2 \ + --hash=sha256:3586d90c899cd41f8624e7b82a48c289f6e4be56c66304ecaf3a0ba88963a73f \ + --hash=sha256:3cf2344366f10981bbc0f6d9949a0b2bb87151d209ed295943ed6ad8eda37932 \ + 
--hash=sha256:567cc568b6bf7b4d041b0c9aa7d6b2c9394f8af6065bc87fa6d23f207b5af9a7 \ + --hash=sha256:5c67ac6df349236fb7b64f57c3dbb0273efcdca24314daa108f2a482c427106c \ + --hash=sha256:746f751de5033b3dbcfcd4a726a4c56ce452c593ad3d54472d8597ce8d8b6d44 \ + --hash=sha256:96a63199c0d81095b5d5d1ae8ca82b64c1c13d18d4e30323ae9e8ab31bc80a3d \ + --hash=sha256:97f1fe3909733212cc6b36f0f199b1421a9c57b79ec665f0322bd604cec47340 \ + --hash=sha256:b9ee9e9ab6a237cebbe16563c787a6e83f3fcc034075c3d321dab94448426282 \ + --hash=sha256:d38f2639f6bf7c3c34a5dfd79e29075811ae2fa9b895a63e76767f74a47a971e \ + --hash=sha256:ef21a1e2384a65c62d5ec5e1cded9fe026f1d032d5c5d725110d1a8d330d8f54 # via vllm opencensus==0.11.4 \ --hash=sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 \ --hash=sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt opencensus-context==0.1.3 \ --hash=sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039 \ --hash=sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # opencensus opencv-python-headless==4.11.0.86 \ --hash=sha256:0e0a27c19dd1f40ddff94976cfe43066fbbe9dfbb2ec1907d66c19caef42a57b \ @@ -2507,7 +2639,7 @@ opentelemetry-api==1.34.1 \ --hash=sha256:64f0bd06d42824843731d05beea88d4d4b6ae59f9fe347ff7dfa2cc14233bbb3 \ --hash=sha256:b7df4cb0830d5a6c29ad0c0691dbae874d8daefa934b8b1d642de48323d32a8c # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus # opentelemetry-sdk @@ -2516,26 +2648,26 @@ opentelemetry-exporter-prometheus==0.55b1 \ 
--hash=sha256:d13ec0b22bf394113ff1ada5da98133a4b051779b803dae183188e26c4bd9ee0 \ --hash=sha256:f364fbbff9e5de37a112ff104d1185fb1d7e2046c5ab5911e5afebc7ab3ddf0e # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt opentelemetry-proto==1.27.0 \ --hash=sha256:33c9345d91dafd8a74fc3d7576c5a38f18b7fdf8d02983ac67485386132aedd6 \ --hash=sha256:b133873de5581a50063e1e4b29cdcf0c5e253a8c2d8dc1229add20a4c3830ace # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt opentelemetry-sdk==1.34.1 \ --hash=sha256:308effad4059562f1d92163c61c8141df649da24ce361827812c40abb2a1e96e \ --hash=sha256:8091db0d763fcd6098d4781bbc80ff0971f94e260739aa6afe6fd379cdf3aa4d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus opentelemetry-semantic-conventions==0.55b1 \ --hash=sha256:5da81dfdf7d52e3d37f8fe88d5e771e191de924cfff5f550ab0b8f7b2409baed \ --hash=sha256:ef95b1f009159c28d7a7849f5cbc71c4c34c845bb514d66adfdf1b3fff3598b3 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # opentelemetry-sdk outlines-core==0.2.10 \ --hash=sha256:0a9e4b192ca837a472a1bb1428397509f543db08e1aeeee30252525cec34093a \ @@ -2584,7 +2716,7 @@ packaging==23.0 \ --hash=sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2 \ --hash=sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # huggingface-hub @@ -2593,6 +2725,7 @@ packaging==23.0 \ # jupyterlab # jupyterlab-server # jupytext + # kombu # lazy-loader # 
lm-format-enforcer # nbconvert @@ -2631,19 +2764,19 @@ pandas==1.5.3 \ --hash=sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae \ --hash=sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt pandocfilters==1.5.0 \ --hash=sha256:0b679503337d233b4339a817bfc8c50064e2eff681314376a47cb582305a7a38 \ --hash=sha256:33aae3f25fd1a026079f5d27bdd52496f0e0803b3469282162bafdcbdf6ef14f # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # nbconvert parso==0.8.3 \ --hash=sha256:8c07be290bb59f03588915921e29e8a50002acaf2cdc5fa0e0114f91709fafa0 \ --hash=sha256:c001d4636cd3aecdaf33cbb40aebb59b094be2a74c556778ef5576c175e19e75 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jedi partial-json-parser==0.2.1.1.post5 \ --hash=sha256:627715aaa3cb3fb60a65b0d62223243acaa6c70846520a90326fef3a2f0b61ca \ @@ -2653,19 +2786,19 @@ pathspec==0.11.2 \ --hash=sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20 \ --hash=sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt pexpect==4.8.0 ; sys_platform != 'win32' \ --hash=sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937 \ --hash=sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipython pickleshare==0.7.5 \ --hash=sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca \ --hash=sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56 
# via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipython pillow==10.3.0 \ --hash=sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c \ @@ -2738,7 +2871,7 @@ pillow==10.3.0 \ --hash=sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27 \ --hash=sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/llm/llm-test-requirements.txt # imageio # mistral-common @@ -2749,26 +2882,26 @@ platformdirs==3.11.0 \ --hash=sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyter-core # virtualenv pluggy==1.3.0 \ --hash=sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12 \ --hash=sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # pytest portalocker==2.8.2 \ --hash=sha256:2b035aa7828e46c58e9b31390ee1f169b98e1066ab10b9a6a861fe7e25ee4f33 \ --hash=sha256:cfb86acc09b9aa7c3b43594e19be1345b9d16af3feb08bf92f23d4dce513a28e # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # msal-extensions prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # jupyter-server # nbclassic @@ -2784,7 +2917,8 @@ 
prompt-toolkit==3.0.41 \ --hash=sha256:941367d97fc815548822aa26c2a269fdc4eb21e9ec05fc5d447cf09bad5d75f0 \ --hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # click-repl # ipython propcache==0.3.0 \ --hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ @@ -2886,14 +3020,14 @@ propcache==0.3.0 \ --hash=sha256:fb91d20fa2d3b13deea98a690534697742029f4fb83673a3501ae6e3746508b5 \ --hash=sha256:fddb8870bdb83456a489ab67c6b3040a8d5a55069aa6f72f9d872235fbc52f54 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # aiohttp # yarl proto-plus==1.22.3 \ --hash=sha256:a49cd903bc0b6ab41f76bf65510439d56ca76f868adf0274e738bfdd096894df \ --hash=sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # google-api-core protobuf==4.25.8 \ --hash=sha256:077ff8badf2acf8bc474406706ad890466274191a48d0abd3bd6987107c9cde5 \ @@ -2908,7 +3042,7 @@ protobuf==4.25.8 \ --hash=sha256:d552c53d0415449c8d17ced5c341caba0d89dbf433698e1436c8fa0aae7808a3 \ --hash=sha256:f4510b93a3bec6eba8fd8f1093e9d7fb0d4a24d1a81377c10c0e5bbfe9e4ed24 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # google-api-core # googleapis-common-protos @@ -2936,21 +3070,21 @@ psutil==5.9.6 \ --hash=sha256:fb8a697f11b0f5994550555fcfe3e69799e5b060c8ecf9e2f75c69302cc35c0d \ --hash=sha256:ff18b8d1a784b810df0b0fff3bcb50ab941c3b8e2c8de5726f9c71c601c611aa # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipykernel # vllm ptyprocess==0.7.0 ; os_name != 'nt' or sys_platform != 'win32' \ 
--hash=sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 \ --hash=sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # pexpect # terminado pure-eval==0.2.2 \ --hash=sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350 \ --hash=sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # stack-data py-cpuinfo==9.0.0 \ --hash=sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690 \ @@ -2966,7 +3100,7 @@ py-spy==0.4.0 ; python_full_version < '3.12' \ --hash=sha256:eee3d0bde85ca5cf4f01f012d461180ca76c24835a96f7b5c4ded64eb6a008ab \ --hash=sha256:f2cf3f7130e7d780471faa5957441d3b4e0ec39a79b2c00f4c33d494f7728428 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt pyarrow==19.0.1 \ --hash=sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466 \ @@ -3012,13 +3146,13 @@ pyarrow==19.0.1 \ --hash=sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec \ --hash=sha256:fd44d66093a239358d07c42a91eebf5015aa54fccba959db899f932218ac9cc8 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt pyasn1==0.5.1 \ --hash=sha256:4439847c58d40b1d0a573d07e3856e95333f1976294494c325775aeca506eb58 \ --hash=sha256:6d391a96e59b23130a5cfa74d6fd7f388dbbe26cc8f1edf39fdddf08d9d6676c # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # oauth2client # pyasn1-modules # rsa @@ -3026,7 +3160,7 @@ pyasn1-modules==0.3.0 \ 
--hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # google-auth # oauth2client pybase64==1.4.1 \ @@ -3186,7 +3320,7 @@ pycparser==2.21 \ --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # cffi pycurl==7.45.3 \ --hash=sha256:0c41a172d5e8a5cdd8328cc8134f47b2a57960ac677f7cda8520eaa9fbe7d990 \ @@ -3226,125 +3360,125 @@ pycurl==7.45.3 \ --hash=sha256:fa7751b614d9aa82d7a0f49ca90924c29c6cedf85a2f8687fb6a772dbfe48711 \ --hash=sha256:fbd4a6b8654b779089c5a44af1c65c1419c2cd60718780df6d8f354eb35d6d55 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt -pydantic==2.10.0 \ - --hash=sha256:0aca0f045ff6e2f097f1fe89521115335f15049eeb8a7bef3dafe4b19a74e289 \ - --hash=sha256:5e7807ba9201bdf61b1b58aa6eb690916c40a47acfb114b1b4fef3e7fd5b30fc +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # compressed-tensors # fastapi # lm-format-enforcer # mistral-common # openai + # openai-harmony # pydantic-extra-types # vllm # xgrammar -pydantic-core==2.27.0 \ - --hash=sha256:0aa4d1b2eba9a325897308b3124014a142cdccb9f3e016f31d3ebee6b5ea5e75 \ - --hash=sha256:0d06b667e53320332be2bf6f9461f4a9b78092a079b8ce8634c9afaa7e10cd9f \ - 
--hash=sha256:153017e3d6cd3ce979de06d84343ca424bb6092727375eba1968c8b4693c6ecb \ - --hash=sha256:15e350efb67b855cd014c218716feea4986a149ed1f42a539edd271ee074a196 \ - --hash=sha256:185ef205256cd8b38431205698531026979db89a79587725c1e55c59101d64e9 \ - --hash=sha256:1da0c98a85a6c6ed702d5556db3b09c91f9b0b78de37b7593e2de8d03238807a \ - --hash=sha256:225bfff5d425c34e1fd562cef52d673579d59b967d9de06178850c4802af9039 \ - --hash=sha256:24f984fc7762ed5f806d9e8c4c77ea69fdb2afd987b4fd319ef06c87595a8c55 \ - --hash=sha256:25a7fd4de38f7ff99a37e18fa0098c3140286451bc823d1746ba80cec5b433a1 \ - --hash=sha256:2883b260f7a93235488699d39cbbd94fa7b175d3a8063fbfddd3e81ad9988cb2 \ - --hash=sha256:2a51ce96224eadd1845150b204389623c8e129fde5a67a84b972bd83a85c6c40 \ - --hash=sha256:2be0ad541bb9f059954ccf8877a49ed73877f862529575ff3d54bf4223e4dd61 \ - --hash=sha256:31a2cae5f059329f9cfe3d8d266d3da1543b60b60130d186d9b6a3c20a346361 \ - --hash=sha256:333c840a1303d1474f491e7be0b718226c730a39ead0f7dab2c7e6a2f3855555 \ - --hash=sha256:33d14369739c5d07e2e7102cdb0081a1fa46ed03215e07f097b34e020b83b1ae \ - --hash=sha256:35380671c3c921fe8adf31ad349dc6f7588b7e928dbe44e1093789734f607399 \ - --hash=sha256:359e7951f04ad35111b5ddce184db3391442345d0ab073aa63a95eb8af25a5ef \ - --hash=sha256:36aa167f69d8807ba7e341d67ea93e50fcaaf6bc433bb04939430fa3dab06f31 \ - --hash=sha256:395e3e1148fa7809016231f8065f30bb0dc285a97b4dc4360cd86e17bab58af7 \ - --hash=sha256:3e8d89c276234579cd3d095d5fa2a44eb10db9a218664a17b56363cddf226ff3 \ - --hash=sha256:3eb8849445c26b41c5a474061032c53e14fe92a11a5db969f722a2716cd12206 \ - --hash=sha256:3fd8bc2690e7c39eecdf9071b6a889ce7b22b72073863940edc2a0a23750ca90 \ - --hash=sha256:400bf470e4327e920883b51e255617dfe4496d4e80c3fea0b5a5d0bf2c404dd4 \ - --hash=sha256:4148dc9184ab79e356dc00a4199dc0ee8647973332cb385fc29a7cced49b9f9c \ - --hash=sha256:433689845288f9a1ee5714444e65957be26d30915f7745091ede4a83cfb2d7bb \ - --hash=sha256:43b61989068de9ce62296cde02beffabcadb65672207fc51e7af76dca75e6636 \ - 
--hash=sha256:4523c4009c3f39d948e01962223c9f5538602e7087a628479b723c939fab262d \ - --hash=sha256:483c2213a609e7db2c592bbc015da58b6c75af7360ca3c981f178110d9787bcf \ - --hash=sha256:49633583eb7dc5cba61aaf7cdb2e9e662323ad394e543ee77af265736bcd3eaa \ - --hash=sha256:4b51f964fcbb02949fc546022e56cdb16cda457af485e9a3e8b78ac2ecf5d77e \ - --hash=sha256:4bf1340ae507f6da6360b24179c2083857c8ca7644aab65807023cf35404ea8d \ - --hash=sha256:4fb49cfdb53af5041aba909be00cccfb2c0d0a2e09281bf542371c5fd36ad04c \ - --hash=sha256:510b11e9c3b1a852876d1ccd8d5903684336d635214148637ceb27366c75a467 \ - --hash=sha256:513cb14c0cc31a4dfd849a4674b20c46d87b364f997bbcb02282306f5e187abf \ - --hash=sha256:58560828ee0951bb125c6f2862fbc37f039996d19ceb6d8ff1905abf7da0bf3d \ - --hash=sha256:58ab0d979c969983cdb97374698d847a4acffb217d543e172838864636ef10d9 \ - --hash=sha256:5982048129f40b082c2654de10c0f37c67a14f5ff9d37cf35be028ae982f26df \ - --hash=sha256:5ab325fc86fbc077284c8d7f996d904d30e97904a87d6fb303dce6b3de7ebba9 \ - --hash=sha256:5cc822ab90a70ea3a91e6aed3afac570b276b1278c6909b1d384f745bd09c714 \ - --hash=sha256:5f2b19b8d6fca432cb3acf48cf5243a7bf512988029b6e6fd27e9e8c0a204d85 \ - --hash=sha256:5fc72fbfebbf42c0856a824b8b0dc2b5cd2e4a896050281a21cfa6fed8879cb1 \ - --hash=sha256:6354e18a9be37bfa124d6b288a87fb30c673745806c92956f1a25e3ae6e76b96 \ - --hash=sha256:678f66462058dd978702db17eb6a3633d634f7aa0deaea61e0a674152766d3fc \ - --hash=sha256:68950bc08f9735306322bfc16a18391fcaac99ded2509e1cc41d03ccb6013cfe \ - --hash=sha256:68ef5377eb582fa4343c9d0b57a5b094046d447b4c73dd9fbd9ffb216f829e7d \ - --hash=sha256:6b4c19525c3538fbc0bbda6229f9682fb8199ce9ac37395880e6952798e00373 \ - --hash=sha256:6bb69bf3b6500f195c3deb69c1205ba8fc3cb21d1915f1f158a10d6b1ef29b6a \ - --hash=sha256:6e19401742ed7b69e51d8e4df3c03ad5ec65a83b36244479fd70edde2828a5d9 \ - --hash=sha256:6f4a53af9e81d757756508b57cae1cf28293f0f31b9fa2bfcb416cc7fb230f9d \ - --hash=sha256:6fda87808429c520a002a85d6e7cdadbf58231d60e96260976c5b8f9a12a8e13 \ - 
--hash=sha256:78f841523729e43e3928a364ec46e2e3f80e6625a4f62aca5c345f3f626c6e8a \ - --hash=sha256:7a6ebfac28fd51890a61df36ef202adbd77d00ee5aca4a3dadb3d9ed49cfb929 \ - --hash=sha256:7b0202ebf2268954090209a84f9897345719e46a57c5f2c9b7b250ca0a9d3e63 \ - --hash=sha256:8117839a9bdbba86e7f9df57018fe3b96cec934c3940b591b0fd3fbfb485864a \ - --hash=sha256:82e1ad4ca170e8af4c928b67cff731b6296e6a0a0981b97b2eb7c275cc4e15bd \ - --hash=sha256:836a4bfe0cc6d36dc9a9cc1a7b391265bf6ce9d1eb1eac62ac5139f5d8d9a6fa \ - --hash=sha256:84af1cf7bfdcbc6fcf5a5f70cc9896205e0350306e4dd73d54b6a18894f79386 \ - --hash=sha256:84e35afd9e10b2698e6f2f32256678cb23ca6c1568d02628033a837638b3ed12 \ - --hash=sha256:884f1806609c2c66564082540cffc96868c5571c7c3cf3a783f63f2fb49bd3cd \ - --hash=sha256:8a150392102c402c538190730fda06f3bce654fc498865579a9f2c1d2b425833 \ - --hash=sha256:8e21d927469d04b39386255bf00d0feedead16f6253dcc85e9e10ddebc334084 \ - --hash=sha256:8e96ca781e0c01e32115912ebdf7b3fb0780ce748b80d7d28a0802fa9fbaf44e \ - --hash=sha256:8ee4c2a75af9fe21269a4a0898c5425afb01af1f5d276063f57e2ae1bc64e191 \ - --hash=sha256:91bc66f878557313c2a6bcf396e7befcffe5ab4354cfe4427318968af31143c3 \ - --hash=sha256:951e71da6c89d354572098bada5ba5b5dc3a9390c933af8a614e37755d3d1840 \ - --hash=sha256:99b2863c1365f43f74199c980a3d40f18a218fbe683dd64e470199db426c4d6a \ - --hash=sha256:9a8fbf506fde1529a1e3698198fe64bfbe2e0c09557bc6a7dcf872e7c01fec40 \ - --hash=sha256:9ce048deb1e033e7a865ca384770bccc11d44179cf09e5193a535c4c2f497bdc \ - --hash=sha256:9fe94d9d2a2b4edd7a4b22adcd45814b1b59b03feb00e56deb2e89747aec7bfe \ - --hash=sha256:a291d0b4243a259c8ea7e2b84eb9ccb76370e569298875a7c5e3e71baf49057a \ - --hash=sha256:a5c022bb0d453192426221605efc865373dde43b17822a264671c53b068ac20c \ - --hash=sha256:abb4785894936d7682635726613c44578c420a096729f1978cd061a7e72d5275 \ - --hash=sha256:b872c86d8d71827235c7077461c502feb2db3f87d9d6d5a9daa64287d75e4fa0 \ - --hash=sha256:bf37b72834e7239cf84d4a0b2c050e7f9e48bced97bad9bdf98d26b8eb72e846 \ - 
--hash=sha256:c0c431e4be5c1a0c6654e0c31c661cd89e0ca956ef65305c3c3fd96f4e72ca39 \ - --hash=sha256:c5726eec789ee38f2c53b10b1821457b82274f81f4f746bb1e666d8741fcfadb \ - --hash=sha256:c6fcb3fa3855d583aa57b94cf146f7781d5d5bc06cb95cb3afece33d31aac39b \ - --hash=sha256:c86679f443e7085ea55a7376462553996c688395d18ef3f0d3dbad7838f857a2 \ - --hash=sha256:c91e3c04f5191fd3fb68764bddeaf02025492d5d9f23343b283870f6ace69708 \ - --hash=sha256:c921ad596ff1a82f9c692b0758c944355abc9f0de97a4c13ca60ffc6d8dc15d4 \ - --hash=sha256:c9ed88b398ba7e3bad7bd64d66cc01dcde9cfcb7ec629a6fd78a82fa0b559d78 \ - --hash=sha256:cd2ac6b919f7fed71b17fe0b4603c092a4c9b5bae414817c9c81d3c22d1e1bcc \ - --hash=sha256:d28ca7066d6cdd347a50d8b725dc10d9a1d6a1cce09836cf071ea6a2d4908be0 \ - --hash=sha256:d29e235ce13c91902ef3efc3d883a677655b3908b1cbc73dee816e5e1f8f7739 \ - --hash=sha256:d8b5ee4ae9170e2775d495b81f414cc20268041c42571530513496ba61e94ba3 \ - --hash=sha256:db72e40628967f6dc572020d04b5f800d71264e0531c6da35097e73bdf38b003 \ - --hash=sha256:df45c4073bed486ea2f18757057953afed8dd77add7276ff01bccb79982cf46c \ - --hash=sha256:dfa5f5c0a4c8fced1422dc2ca7eefd872d5d13eb33cf324361dbf1dbfba0a9fe \ - --hash=sha256:e015833384ca3e1a0565a79f5d953b0629d9138021c27ad37c92a9fa1af7623c \ - --hash=sha256:e15315691fe2253eb447503153acef4d7223dfe7e7702f9ed66539fcd0c43801 \ - --hash=sha256:e65466b31be1070b4a5b7dbfbd14b247884cb8e8b79c64fb0f36b472912dbaea \ - --hash=sha256:e7820bb0d65e3ce1e3e70b6708c2f66143f55912fa02f4b618d0f08b61575f12 \ - --hash=sha256:e851a051f7260e6d688267eb039c81f05f23a19431bd7dfa4bf5e3cb34c108cd \ - --hash=sha256:e9f9feee7f334b72ceae46313333d002b56f325b5f04271b4ae2aadd9e993ae4 \ - --hash=sha256:eb40f828bc2f73f777d1eb8fee2e86cd9692a4518b63b6b5aa8af915dfd3207b \ - --hash=sha256:eb704155e73b833801c247f39d562229c0303f54770ca14fb1c053acb376cf10 \ - --hash=sha256:edb1bfd45227dec8d50bc7c7d86463cd8728bcc574f9b07de7369880de4626a3 \ - --hash=sha256:ee7d9d5537daf6d5c74a83b38a638cc001b648096c1cae8ef695b0c919d9d379 \ - 
--hash=sha256:f57783fbaf648205ac50ae7d646f27582fc706be3977e87c3c124e7a92407b10 \ - --hash=sha256:ff63a92f6e249514ef35bc795de10745be0226eaea06eb48b4bbeaa0c8850a4a - # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt +pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ 
+ --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + --hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + --hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + 
--hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + --hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + --hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + 
--hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + --hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + --hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + 
--hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock # pydantic pydantic-extra-types==2.10.5 \ --hash=sha256:1dcfa2c0cf741a422f088e0dbb4690e7bfadaaf050da3d6f80d6c3cf58a2bad8 \ @@ -3354,7 +3488,7 @@ pygments==2.18.0 \ --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipython # nbconvert # rich @@ -3363,7 +3497,7 @@ pyjwt==2.8.0 \ --hash=sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de \ --hash=sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # msal pynvml==12.0.0 \ --hash=sha256:299ce2451a6a17e6822d6faee750103e25b415f06f59abb8db65d30f794166f5 \ @@ -3373,20 +3507,20 @@ pyopenssl==25.0.0 \ --hash=sha256:424c247065e46e76a37411b9ab1782541c23bb658bf003772c3405fbaa128e90 \ --hash=sha256:cd2cef799efa3936bb08e8ccb9433a575722b9dd986023f1cabc4ae64e9dac16 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt pyparsing==3.1.1 \ --hash=sha256:32c7c0b711493c72ff18a981d24f28aaf9c1fb7ed5e9667c9e84e3db623bdbfb \ 
--hash=sha256:ede28a1a32462f5a9705e07aea48001a08f7cf81a021585011deba701581a0db # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # httplib2 pytest==7.4.4 \ --hash=sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280 \ --hash=sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/base-test-requirements.txt # -r python/requirements/llm/llm-test-requirements.txt # pytest-aiohttp @@ -3395,23 +3529,24 @@ pytest-aiohttp==1.1.0 \ --hash=sha256:147de8cb164f3fc9d7196967f109ab3c0b93ea3463ab50631e56438eab7b5adc \ --hash=sha256:f39a11693a0dce08dd6c542d241e199dd8047a6e6596b2bcfa60d373f143456d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/base-test-requirements.txt pytest-asyncio==0.17.2 \ --hash=sha256:6d895b02432c028e6957d25fc936494e78c6305736e785d9fee408b1efbc7ff4 \ --hash=sha256:e0fe5dbea40516b661ef1bcfe0bd9461c2847c4ef4bb40012324f2454fb7d56d # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/base-test-requirements.txt # pytest-aiohttp python-dateutil==2.8.2 \ --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # arrow # botocore + # celery # jupyter-client # pandas python-dotenv==1.0.1 \ @@ -3422,7 +3557,7 @@ python-json-logger==2.0.7 \ --hash=sha256:23e7ec02d34237c5aa1e29a070193a4ea87583bb4e7f8fd06d3de8264c4b2e1c \ 
--hash=sha256:f380b826a991ebbe3de4d897aeec42760035ac760345e57b812938dc8b35e2bd # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyter-events # vllm python-multipart==0.0.20 \ @@ -3433,7 +3568,7 @@ pytz==2022.7.1 \ --hash=sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0 \ --hash=sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # pandas pyyaml==6.0.1 \ --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ @@ -3488,7 +3623,7 @@ pyyaml==6.0.1 \ --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \ --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # gguf @@ -3590,7 +3725,7 @@ pyzmq==26.0.3 \ --hash=sha256:f6b1d1c631e5940cac5a0b22c5379c86e8df6a4ec277c7a856b714021ab6cfad \ --hash=sha256:f6c21c00478a7bea93caaaef9e7629145d4153b15a8653e8bb4609d4bc70dbfc # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipykernel # jupyter-client # jupyter-server @@ -3601,7 +3736,7 @@ referencing==0.36.2 \ --hash=sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa \ --hash=sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jsonschema # jsonschema-specifications regex==2024.11.6 \ @@ -3707,10 +3842,11 @@ requests==2.32.3 \ --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ 
--hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # azure-core + # azure-datalake-store # google-api-core # google-cloud-storage # huggingface-hub @@ -3727,21 +3863,21 @@ rfc3339-validator==0.1.4 \ --hash=sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b \ --hash=sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jsonschema # jupyter-events rfc3986-validator==0.1.1 \ --hash=sha256:2f235c432ef459970b4306369336b9d5dbdda31b510ca1e327636e01f528bfa9 \ --hash=sha256:3d44bde7921b3b9ec3ae4e3adca370438eccebc676456449b145d533b240d055 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jsonschema # jupyter-events rich==13.3.2 \ --hash=sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001 \ --hash=sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # memray @@ -3851,21 +3987,21 @@ rpds-py==0.22.3 \ --hash=sha256:fda7cb070f442bf80b642cd56483b5548e43d366fe3f39b98e67cce780cded00 \ --hash=sha256:feea821ee2a9273771bae61194004ee2fc33f8ec7db08117ef9147d4bbcbca8e # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jsonschema # referencing rsa==4.7.2 \ --hash=sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2 \ --hash=sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9 # via - # -c 
python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # google-auth # oauth2client -s3transfer==0.6.2 \ - --hash=sha256:b014be3a8a2aab98cfe1abc7229cc5a9a0cf05eb9c1f2b86b230fd8df3f78084 \ - --hash=sha256:cab66d3380cca3e70939ef2255d01cd8aece6a4907a9528740f668c4b0611861 +s3transfer==0.8.0 \ + --hash=sha256:baa479dc2e63e5c2ed51611b4d46cdf0295e2070d8d0b86b22f335ee5b954986 \ + --hash=sha256:e8d6bd52ffd99841e3a57b34370a54841f12d3aab072af862cdcc50955288002 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # boto3 safetensors==0.5.2 \ --hash=sha256:03c937100f38c9ff4c1507abea9928a6a9b02c9c1c9c3609ed4fb2bf413d4975 \ @@ -3907,7 +4043,7 @@ scikit-image==0.24.0 \ --hash=sha256:ef04360eda372ee5cd60aebe9be91258639c86ae2ea24093fb9182118008d009 \ --hash=sha256:fa27b3a0dbad807b966b8db2d78da734cb812ca4787f7fbb143764800ce2fa9c # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ @@ -3936,7 +4072,7 @@ scipy==1.11.4 \ --hash=sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be \ --hash=sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # scikit-image # vllm @@ -3944,7 +4080,7 @@ send2trash==1.8.3 \ --hash=sha256:0c31227e0bd08961c7665474a3d1ef7193929fedda4233843689baa056be46c9 \ --hash=sha256:b18e7a3966d99871aefeb00cfbcfdced55ce4871194810fc71f4aa484b953abf # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyter-server # nbclassic # notebook @@ -4006,17 +4142,116 @@ sentencepiece==0.2.0 \ # gguf # mistral-common # vllm 
+setproctitle==1.3.6 \ + --hash=sha256:082413db8a96b1f021088e8ec23f0a61fec352e649aba20881895815388b66d3 \ + --hash=sha256:0dba8faee2e4a96e934797c9f0f2d093f8239bf210406a99060b3eabe549628e \ + --hash=sha256:0e6b5633c94c5111f7137f875e8f1ff48f53b991d5d5b90932f27dc8c1fa9ae4 \ + --hash=sha256:1065ed36bd03a3fd4186d6c6de5f19846650b015789f72e2dea2d77be99bdca1 \ + --hash=sha256:109fc07b1cd6cef9c245b2028e3e98e038283342b220def311d0239179810dbe \ + --hash=sha256:13624d9925bb481bc0ccfbc7f533da38bfbfe6e80652314f789abc78c2e513bd \ + --hash=sha256:156795b3db976611d09252fc80761fcdb65bb7c9b9581148da900851af25ecf4 \ + --hash=sha256:163dba68f979c61e4e2e779c4d643e968973bdae7c33c3ec4d1869f7a9ba8390 \ + --hash=sha256:17d7c833ed6545ada5ac4bb606b86a28f13a04431953d4beac29d3773aa00b1d \ + --hash=sha256:18d0667bafaaae4c1dee831e2e59841c411ff399b9b4766822ba2685d419c3be \ + --hash=sha256:1aa1935aa2195b76f377e5cb018290376b7bf085f0b53f5a95c0c21011b74367 \ + --hash=sha256:2156d55308431ac3b3ec4e5e05b1726d11a5215352d6a22bb933171dee292f8c \ + --hash=sha256:23a57d3b8f1549515c2dbe4a2880ebc1f27780dc126c5e064167563e015817f5 \ + --hash=sha256:2407955dc359d735a20ac6e797ad160feb33d529a2ac50695c11a1ec680eafab \ + --hash=sha256:2940cf13f4fc11ce69ad2ed37a9f22386bfed314b98d8aebfd4f55459aa59108 \ + --hash=sha256:2e51ec673513465663008ce402171192a053564865c2fc6dc840620871a9bd7c \ + --hash=sha256:3393859eb8f19f5804049a685bf286cb08d447e28ba5c6d8543c7bf5500d5970 \ + --hash=sha256:3884002b3a9086f3018a32ab5d4e1e8214dd70695004e27b1a45c25a6243ad0b \ + --hash=sha256:38ca045626af693da042ac35d7332e7b9dbd52e6351d6973b310612e3acee6d6 \ + --hash=sha256:391bb6a29c4fe7ccc9c30812e3744060802d89b39264cfa77f3d280d7f387ea5 \ + --hash=sha256:3cca16fd055316a48f0debfcbfb6af7cea715429fc31515ab3fcac05abd527d8 \ + --hash=sha256:3cde5b83ec4915cd5e6ae271937fd60d14113c8f7769b4a20d51769fe70d8717 \ + --hash=sha256:3f8194b4d631b003a1176a75d1acd545e04b1f54b821638e098a93e6e62830ef \ + 
--hash=sha256:3fc97805f9d74444b027babff710bf39df1541437a6a585a983d090ae00cedde \ + --hash=sha256:4431629c178193f23c538cb1de3da285a99ccc86b20ee91d81eb5f1a80e0d2ba \ + --hash=sha256:49498ebf68ca3e75321ffe634fcea5cc720502bfaa79bd6b03ded92ce0dc3c24 \ + --hash=sha256:4ac3eb04bcf0119aadc6235a2c162bae5ed5f740e3d42273a7228b915722de20 \ + --hash=sha256:4adf6a0013fe4e0844e3ba7583ec203ca518b9394c6cc0d3354df2bf31d1c034 \ + --hash=sha256:4efc91b437f6ff2578e89e3f17d010c0a0ff01736606473d082913ecaf7859ba \ + --hash=sha256:50706b9c0eda55f7de18695bfeead5f28b58aa42fd5219b3b1692d554ecbc9ec \ + --hash=sha256:5313a4e9380e46ca0e2c681ba739296f9e7c899e6f4d12a6702b2dc9fb846a31 \ + --hash=sha256:543f59601a4e32daf44741b52f9a23e0ee374f9f13b39c41d917302d98fdd7b0 \ + --hash=sha256:57bc54763bf741813a99fbde91f6be138c8706148b7b42d3752deec46545d470 \ + --hash=sha256:63cc10352dc6cf35a33951656aa660d99f25f574eb78132ce41a85001a638aa7 \ + --hash=sha256:6a1d3aa13acfe81f355b0ce4968facc7a19b0d17223a0f80c011a1dba8388f37 \ + --hash=sha256:6af330ddc2ec05a99c3933ab3cba9365357c0b8470a7f2fa054ee4b0984f57d1 \ + --hash=sha256:6d50bfcc1d1692dc55165b3dd2f0b9f8fb5b1f7b571a93e08d660ad54b9ca1a5 \ + --hash=sha256:70100e2087fe05359f249a0b5f393127b3a1819bf34dec3a3e0d4941138650c9 \ + --hash=sha256:74973aebea3543ad033b9103db30579ec2b950a466e09f9c2180089e8346e0ec \ + --hash=sha256:751ba352ed922e0af60458e961167fa7b732ac31c0ddd1476a2dfd30ab5958c5 \ + --hash=sha256:785cd210c0311d9be28a70e281a914486d62bfd44ac926fcd70cf0b4d65dff1c \ + --hash=sha256:7890e291bf4708e3b61db9069ea39b3ab0651e42923a5e1f4d78a7b9e4b18301 \ + --hash=sha256:793a23e8d9cb6c231aa3023d700008224c6ec5b8fd622d50f3c51665e3d0a190 \ + --hash=sha256:797f2846b546a8741413c57d9fb930ad5aa939d925c9c0fa6186d77580035af7 \ + --hash=sha256:7df5fcc48588f82b6cc8073db069609ddd48a49b1e9734a20d0efb32464753c4 \ + --hash=sha256:8050c01331135f77ec99d99307bfbc6519ea24d2f92964b06f3222a804a3ff1f \ + --hash=sha256:805bb33e92fc3d8aa05674db3068d14d36718e3f2c5c79b09807203f229bf4b5 \ + 
--hash=sha256:807796fe301b7ed76cf100113cc008c119daf4fea2f9f43c578002aef70c3ebf \ + --hash=sha256:81c443310831e29fabbd07b75ebbfa29d0740b56f5907c6af218482d51260431 \ + --hash=sha256:83066ffbf77a5f82b7e96e59bdccbdda203c8dccbfc3f9f0fdad3a08d0001d9c \ + --hash=sha256:8834ab7be6539f1bfadec7c8d12249bbbe6c2413b1d40ffc0ec408692232a0c6 \ + --hash=sha256:92df0e70b884f5da35f2e01489dca3c06a79962fb75636985f1e3a17aec66833 \ + --hash=sha256:9483aa336687463f5497dd37a070094f3dff55e2c888994f8440fcf426a1a844 \ + --hash=sha256:97a138fa875c6f281df7720dac742259e85518135cd0e3551aba1c628103d853 \ + --hash=sha256:9b50700785eccac0819bea794d968ed8f6055c88f29364776b7ea076ac105c5d \ + --hash=sha256:9b73cf0fe28009a04a35bb2522e4c5b5176cc148919431dcb73fdbdfaab15781 \ + --hash=sha256:9d5a369eb7ec5b2fdfa9927530b5259dd21893fa75d4e04a223332f61b84b586 \ + --hash=sha256:a094b7ce455ca341b59a0f6ce6be2e11411ba6e2860b9aa3dbb37468f23338f4 \ + --hash=sha256:a0d6252098e98129a1decb59b46920d4eca17b0395f3d71b0d327d086fefe77d \ + --hash=sha256:a1d856b0f4e4a33e31cdab5f50d0a14998f3a2d726a3fd5cb7c4d45a57b28d1b \ + --hash=sha256:a4ae2ea9afcfdd2b931ddcebf1cf82532162677e00326637b31ed5dff7d985ca \ + --hash=sha256:a5963b663da69ad25fa1559ee064584935570def665917918938c1f1289f5ebc \ + --hash=sha256:ad1c2c2baaba62823a7f348f469a967ece0062140ca39e7a48e4bbb1f20d54c4 \ + --hash=sha256:ae82507fe458f7c0c8227017f2158111a4c9e7ce94de05178894a7ea9fefc8a1 \ + --hash=sha256:af188f3305f0a65c3217c30c6d4c06891e79144076a91e8b454f14256acc7279 \ + --hash=sha256:af44bb7a1af163806bbb679eb8432fa7b4fb6d83a5d403b541b675dcd3798638 \ + --hash=sha256:b0174ca6f3018ddeaa49847f29b69612e590534c1d2186d54ab25161ecc42975 \ + --hash=sha256:b2b17855ed7f994f3f259cf2dfbfad78814538536fa1a91b50253d84d87fd88d \ + --hash=sha256:b2e54f4a2dc6edf0f5ea5b1d0a608d2af3dcb5aa8c8eeab9c8841b23e1b054fe \ + --hash=sha256:b6f4abde9a2946f57e8daaf1160b2351bcf64274ef539e6675c1d945dbd75e2a \ + --hash=sha256:b70c07409d465f3a8b34d52f863871fb8a00755370791d2bd1d4f82b3cdaf3d5 \ + 
--hash=sha256:bb465dd5825356c1191a038a86ee1b8166e3562d6e8add95eec04ab484cfb8a2 \ + --hash=sha256:c051f46ed1e13ba8214b334cbf21902102807582fbfaf0fef341b9e52f0fafbf \ + --hash=sha256:c1b20a5f4164cec7007be55c9cf18d2cd08ed7c3bf6769b3cd6d044ad888d74b \ + --hash=sha256:c86e9e82bfab579327dbe9b82c71475165fbc8b2134d24f9a3b2edaf200a5c3d \ + --hash=sha256:c9f32b96c700bb384f33f7cf07954bb609d35dd82752cef57fb2ee0968409169 \ + --hash=sha256:cce0ed8b3f64c71c140f0ec244e5fdf8ecf78ddf8d2e591d4a8b6aa1c1214235 \ + --hash=sha256:cdd7315314b0744a7dd506f3bd0f2cf90734181529cdcf75542ee35ad885cab7 \ + --hash=sha256:cf355fbf0d4275d86f9f57be705d8e5eaa7f8ddb12b24ced2ea6cbd68fdb14dc \ + --hash=sha256:d136fbf8ad4321716e44d6d6b3d8dffb4872626010884e07a1db54b7450836cf \ + --hash=sha256:d2c8e20487b3b73c1fa72c56f5c89430617296cd380373e7af3a538a82d4cd6d \ + --hash=sha256:d483cc23cc56ab32911ea0baa0d2d9ea7aa065987f47de847a0a93a58bf57905 \ + --hash=sha256:d5a6c4864bb6fa9fcf7b57a830d21aed69fd71742a5ebcdbafda476be673d212 \ + --hash=sha256:d714e002dd3638170fe7376dc1b686dbac9cb712cde3f7224440af722cc9866a \ + --hash=sha256:d73f14b86d0e2858ece6bf5807c9889670e392c001d414b4293d0d9b291942c3 \ + --hash=sha256:d88c63bd395c787b0aa81d8bbc22c1809f311032ce3e823a6517b711129818e4 \ + --hash=sha256:db608db98ccc21248370d30044a60843b3f0f3d34781ceeea67067c508cd5a28 \ + --hash=sha256:de004939fc3fd0c1200d26ea9264350bfe501ffbf46c8cf5dc7f345f2d87a7f1 \ + --hash=sha256:ded9e86397267732a0641d4776c7c663ea16b64d7dbc4d9cc6ad8536363a2d29 \ + --hash=sha256:e288f8a162d663916060beb5e8165a8551312b08efee9cf68302687471a6545d \ + --hash=sha256:e2a9e62647dc040a76d55563580bf3bb8fe1f5b6ead08447c2ed0d7786e5e794 \ + --hash=sha256:e3e44d08b61de0dd6f205528498f834a51a5c06689f8fb182fe26f3a3ce7dca9 \ + --hash=sha256:ea002088d5554fd75e619742cefc78b84a212ba21632e59931b3501f0cfc8f67 \ + --hash=sha256:eb7452849f6615871eabed6560ffedfe56bc8af31a823b6be4ce1e6ff0ab72c5 \ + --hash=sha256:ebcf34b69df4ca0eabaaaf4a3d890f637f355fed00ba806f7ebdd2d040658c26 \ + 
--hash=sha256:f24d5b9383318cbd1a5cd969377937d66cf0542f24aa728a4f49d9f98f9c0da8 \ + --hash=sha256:f33fbf96b52d51c23b6cff61f57816539c1c147db270cfc1cc3bc012f4a560a9 + # via vllm shellingham==1.5.4 \ --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \ --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # typer six==1.16.0 \ --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # asttokens # azure-core @@ -4031,20 +4266,20 @@ smart-open==6.2.0 \ --hash=sha256:088bf00f9327c71e549bc2f86567d3320df5d89667f009ce1c16568976068ef7 \ --hash=sha256:1b4df5c8365218f3852c507451920ccad606c80b0acb4e67508e50ba9b5d2632 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt smmap==5.0.1 \ --hash=sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62 \ --hash=sha256:e6d8668fa5f93e706934a62d7b4db19c8d9eb8cf2adbb75ef1b675aa332b69da # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # gitdb sniffio==1.3.1 \ --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # anyio # openai snowballstemmer==2.2.0 \ @@ -4065,7 +4300,7 @@ soupsieve==2.5 \ 
--hash=sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690 \ --hash=sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # beautifulsoup4 soxr==0.5.0.post1 \ --hash=sha256:39e0f791ba178d69cd676485dbee37e75a34f20daa478d90341ecb7f6d9d690f \ @@ -4122,19 +4357,19 @@ spinners==0.0.24 \ --hash=sha256:1eb6aeb4781d72ab42ed8a01dcf20f3002bf50740d7154d12fb8c9769bf9e27f \ --hash=sha256:2fa30d0b72c9650ad12bbe031c9943b8d441e41b4f5602b0ec977a19f3290e98 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # halo stack-data==0.6.3 \ --hash=sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9 \ --hash=sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipython starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # fastapi # prometheus-fastapi-instrumentator @@ -4146,25 +4381,25 @@ tabulate==0.9.0 \ --hash=sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c \ --hash=sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt tensorboardx==2.6.2.2 \ --hash=sha256:160025acbf759ede23fd3526ae9d9bfbfd8b68eb16c38a010ebe326dc6395db8 \ --hash=sha256:c6476d7cd0d529b0b72f4acadb1269f9ed8b22f441e87a84f2a3b940bb87b666 # via - # -c 
python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt termcolor==2.4.0 \ --hash=sha256:9297c0df9c99445c2412e832e882a7884038a25617c60cea2ad69488d4040d63 \ --hash=sha256:aab9e56047c8ac41ed798fa36d892a37aca6b3e9159f3e0c24bc64a9b3ac7b7a # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # halo terminado==0.18.1 \ --hash=sha256:a4468e1b37bb318f8a86514f65814e1afc977cf29b3992a4500d9dd305dcceb0 \ --hash=sha256:de09f2c4b85de4765f7714688fff57d3e75bad1f909b589fde880460c753fd2e # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyter-server # nbclassic # notebook @@ -4172,7 +4407,7 @@ tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # scikit-image tiktoken==0.9.0 \ --hash=sha256:03935988a91d6d3216e2ec7c645afbb3d870b37bcb67ada1943ec48678e7ee33 \ @@ -4213,7 +4448,7 @@ tinycss2==1.3.0 \ --hash=sha256:152f9acabd296a8375fbca5b84c961ff95971fcfc32e79550c8df8e29118c54d \ --hash=sha256:54a8dbdffb334d536851be0226030e9505965bb2f30f21a4a82c55fb2a80fae7 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # nbconvert tokenizers==0.21.1 \ --hash=sha256:0f0dcbcc9f6e13e675a66d7a5f2f225a736745ce484c1a4e07476a89ccdad382 \ @@ -4362,7 +4597,7 @@ tornado==6.1 \ --hash=sha256:fa2ba70284fa42c2a5ecb35e322e68823288a4251f9ba9cc77be04ae15eada68 \ --hash=sha256:fba85b6cd9c39be262fcd23865652920832b61583de2a2ca907dbd8e8a8c81e5 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipykernel # jupyter-client 
# jupyter-server @@ -4374,7 +4609,7 @@ tqdm==4.67.1 \ --hash=sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2 \ --hash=sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # gguf # huggingface-hub @@ -4385,7 +4620,7 @@ traitlets==5.14.3 \ --hash=sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7 \ --hash=sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # comm # ipykernel # ipython @@ -4400,11 +4635,10 @@ traitlets==5.14.3 \ # nbconvert # nbformat # notebook -transformers==4.53.2 \ - --hash=sha256:6c3ed95edfb1cba71c4245758f1b4878c93bf8cde77d076307dacb2cbbd72be2 \ - --hash=sha256:db8f4819bb34f000029c73c3c557e7d06fc1b8e612ec142eecdae3947a9c78bf +transformers==4.55.2 \ + --hash=sha256:097e3c2e2c0c9681db3da9d748d8f9d6a724c644514673d0030e8c5a1109f1f1 \ + --hash=sha256:a45ec60c03474fd67adbce5c434685051b7608b3f4f167c25aa6aeb1cad16d4f # via - # -r python/requirements/llm/llm-requirements.txt # compressed-tensors # vllm # xgrammar @@ -4417,7 +4651,7 @@ typer==0.12.3 \ --hash=sha256:070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914 \ --hash=sha256:49e73131481d804288ef62598d97a1ceef3058905aa536a1134f90891ba35482 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/llm/llm-requirements.txt # -r python/requirements.txt # fastapi-cli @@ -4425,13 +4659,13 @@ types-python-dateutil==2.9.0.20240316 \ --hash=sha256:5d2f2e240b86905e40944dd787db6da9263f0deabef1076ddaed797351ec0202 \ --hash=sha256:6b8cb66d960771ce5ff974e9dd45e38facb81718cc1e208b10b1baccbfdbee3b # via - # -c 
python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # arrow typing-extensions==4.12.2 \ --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # azure-core # azure-identity # azure-storage-blob @@ -4450,24 +4684,37 @@ typing-extensions==4.12.2 \ # referencing # torch # typer + # typing-inspection # vllm +typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # pydantic +tzdata==2025.2 \ + --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # kombu tzlocal==5.3 \ --hash=sha256:2fafbfc07e9d8b49ade18f898d6bcd37ae88ce3ad6486842a2e4f03af68323d2 \ --hash=sha256:3814135a1bb29763c6e4f08fd6e41dbb435c7a60bfbb03270211bcc537187d8c # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt uri-template==1.3.0 \ --hash=sha256:0e00f8eb65e18c7de20d595a14336e9f337ead580c70934141624b6d1ffdacc7 \ --hash=sha256:a44a133ea12d44a0c0f06d7d42a52d71282e77e2f937d8abd5655b8d56fc1363 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jsonschema urllib3==1.26.19 \ --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 # via - # -c 
python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt # botocore # requests @@ -4475,7 +4722,7 @@ uvicorn==0.22.0 \ --hash=sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8 \ --hash=sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # fastapi # fastapi-cli @@ -4518,15 +4765,23 @@ uvloop==0.21.0 ; platform_python_implementation != 'PyPy' and sys_platform != 'c --hash=sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816 \ --hash=sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2 # via uvicorn +vine==5.1.0 \ + --hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c python/deplocks/llm/ray_test_py311_cu121.lock + # amqp + # celery + # kombu virtualenv==20.29.1 \ --hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ --hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt -vllm==0.10.0 \ - --hash=sha256:8ca37559d82b43b5e8c8248d2e4a1ecb51d6d4e5d517491d656df6491ed93dab \ - --hash=sha256:a44e9013db26082a82c3931ed8772ac884d6d60566d36ecdb0e8dc01c65b241a +vllm==0.10.1.1 \ + --hash=sha256:3099824ee4bdaa14c4c4f7178a092101a0ec206d4c9371edf295849b2b730a39 \ + --hash=sha256:8ca0dd985e1ceac8540e7719c654f1553b3ba8a43c685ac8d3fa1366ffb6443a # via -r python/requirements/llm/llm-requirements.txt watchfiles==0.19.0 \ --hash=sha256:0089c6dc24d436b373c3c57657bf4f9a453b13767150d17284fc6162b2791911 \ @@ -4552,7 +4807,7 @@ watchfiles==0.19.0 \ 
--hash=sha256:d9b073073e048081e502b6c6b0b88714c026a1a4c890569238d04aca5f9ca74b \ --hash=sha256:fac19dc9cbc34052394dbe81e149411a62e71999c0a19e1e09ce537867f95ae0 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements.txt # uvicorn # vllm @@ -4560,26 +4815,26 @@ wcwidth==0.2.13 \ --hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \ --hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # prompt-toolkit webcolors==24.6.0 \ --hash=sha256:1d160d1de46b3e81e58d0a280d0c78b467dc80f47294b91b1ad8029d2cedb55b \ --hash=sha256:8cf5bc7e28defd1d48b9e83d5fc30741328305a8195c29a8e668fa45586568a1 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jsonschema webencodings==0.5.1 \ --hash=sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78 \ --hash=sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # bleach # tinycss2 websocket-client==1.8.0 \ --hash=sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526 \ --hash=sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyter-server websockets==15.0 \ --hash=sha256:0e389efe46ccb25a1f93d08c7a74e8123a2517f7b7458f043bd7529d1a63ffeb \ @@ -4656,7 +4911,7 @@ widgetsnbextension==4.0.11 \ --hash=sha256:55d4d6949d100e0d08b94948a42efc3ed6dfdc0e9468b2c4b128c9a2ce3a7a36 \ --hash=sha256:8b22a8f1910bfd188e596fe7fc05dcbd87e810c8a4ba010bdb3da86637398474 # via - # -c 
python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # ipywidgets wrapt==1.14.1 \ --hash=sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3 \ @@ -4734,7 +4989,7 @@ wrapt==1.14.1 \ --hash=sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015 \ --hash=sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # -r python/requirements/cloud-requirements.txt xformers==0.0.31 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:23331bdb9831ba0df96f55258537ca0df7ad888efc75cea97a0de79b5e2291c4 \ @@ -4845,7 +5100,7 @@ y-py==0.6.2 \ --hash=sha256:e92878cc05e844c8da937204bc34c2e6caf66709ce5936802fbfb35f04132892 \ --hash=sha256:ff32548e45e45bf3280ac1d28b3148337a5c6714c28db23aeb0693e33eba257e # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyter-ydoc # ypy-websocket yarl==1.18.3 \ @@ -4932,22 +5187,21 @@ yarl==1.18.3 \ --hash=sha256:fe57328fbc1bfd0bd0514470ac692630f3901c0ee39052ae47acd1d90a436719 \ --hash=sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # aiohttp ypy-websocket==0.8.4 \ --hash=sha256:43a001473f5c8abcf182f603049cf305cbc855ad8deaa9dfa0f3b5a7cea9d0ff \ --hash=sha256:b1ba0dfcc9762f0ca168d2378062d3ca1299d39076b0f145d961359121042be5 # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt + # -c python/deplocks/llm/ray_test_py311_cu121.lock # jupyter-server-ydoc zipp==3.19.2 \ --hash=sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19 \ --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c # via - # -c python/requirements_compiled_ray_test_py311_cu121.txt 
+ # -c python/deplocks/llm/ray_test_py311_cu121.lock # importlib-metadata # The following packages were excluded from the output: -# ray -# grpcio-tools # setuptools +# ray diff --git a/python/requirements_compiled_rayllm_test_py311_cu128.txt b/python/deplocks/llm/rayllm_test_py311_cu128.lock similarity index 86% rename from python/requirements_compiled_rayllm_test_py311_cu128.txt rename to python/deplocks/llm/rayllm_test_py311_cu128.lock index 56592053ec4f..51ce2f2ab445 100644 --- a/python/requirements_compiled_rayllm_test_py311_cu128.txt +++ b/python/deplocks/llm/rayllm_test_py311_cu128.lock @@ -1,19 +1,25 @@ # This file was autogenerated by uv via the following command: -# uv pip compile --generate-hashes --strip-extras --unsafe-package ray --unsafe-package grpcio-tools --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cu128 --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links -c python/requirements_compiled_ray_test_py311_cu128.txt python/requirements.txt python/requirements/cloud-requirements.txt python/requirements/base-test-requirements.txt python/requirements/llm/llm-requirements.txt python/requirements/llm/llm-test-requirements.txt -o python/requirements_compiled_rayllm_test_py311_cu128.txt +# uv pip compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --extra-index-url https://download.pytorch.org/whl/cu128 --python-version=3.11 --unsafe-package ray --python-platform=linux -c python/deplocks/llm/ray_test_py311_cu128.lock python/requirements.txt python/requirements/cloud-requirements.txt python/requirements/base-test-requirements.txt python/requirements/llm/llm-requirements.txt python/requirements/llm/llm-test-requirements.txt -o python/deplocks/llm/rayllm_test_py311_cu128.lock --index-url https://pypi.org/simple 
--extra-index-url https://download.pytorch.org/whl/cu128 +adlfs==2023.8.0 \ + --hash=sha256:07e804f6df4593acfcaf01025b162e30ac13e523d3570279c98b2d91a18026d9 \ + --hash=sha256:3eb248a3c2a30b419f1147bd7676d156b5219f96ef7f11d47166afd2a3bdb07e + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # -r python/requirements/cloud-requirements.txt aiofiles==22.1.0 \ --hash=sha256:1142fa8e80dbae46bb6339573ad4c8c0841358f79c6eb50a493dceca14621bad \ --hash=sha256:9107f1ca0b2a5553987a94a3c9959fe5b491fdf731389aa5b7b1bd0733e32de6 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ypy-websocket aiohappyeyeballs==2.6.1 \ --hash=sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558 \ --hash=sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # aiohttp aiohttp==3.11.16 \ --hash=sha256:004511d3413737700835e949433536a2fe95a7d0297edd911a1e9705c5b5ea43 \ @@ -98,10 +104,11 @@ aiohttp==3.11.16 \ --hash=sha256:fb46bb0f24813e6cede6cc07b1961d4b04f331f7112a23b5e21f567da4ee50aa \ --hash=sha256:fd36c119c5d6551bce374fcb5c19269638f8d09862445f85a5a48596fd59f4bb # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements/llm/llm-test-requirements.txt # -r python/requirements.txt + # adlfs # aiohttp-cors # pytest-aiohttp # vllm @@ -109,41 +116,47 @@ aiohttp-cors==0.7.0 \ --hash=sha256:0451ba59fdf6909d0e2cd21e4c0a43752bc0703d33fc78ae94d9d9321710193e \ --hash=sha256:4d39c6d7100fd9764ed1caf8cebf0eb01bf5e3f24e2e073fda6234bc48b19f5d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt aiorwlock==1.3.0 \ 
--hash=sha256:45baf8e4fa9a23e0bb325fbd67da80de1fd7ae1d4f59a6381754c60cec7b289b \ --hash=sha256:83f12d87df4b9728a0b8fda1756585ab0d652b107bab59c6084e1b1ad692ab45 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt aiosignal==1.3.1 \ --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ --hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # aiohttp aiosqlite==0.19.0 \ --hash=sha256:95ee77b91c8d2808bd08a59fbebf66270e9090c3d92ffbf260dc0db0b979577d \ --hash=sha256:edba222e03453e094a3ce605db1b970c4b3376264e56f32e2a4959f948d66a96 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ypy-websocket alabaster==0.7.16 \ --hash=sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65 \ --hash=sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92 # via sphinx +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # kombu annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # pydantic anyio==3.7.1 \ --hash=sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780 \ --hash=sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # httpx # 
jupyter-server # openai @@ -153,7 +166,7 @@ argon2-cffi==23.1.0 \ --hash=sha256:879c3e79a2729ce768ebb7d36d4609e3a78a4ca2ec3a9f12286ca057e3d0db08 \ --hash=sha256:c670642b78ba29641818ab2e68bd4e6a78ba53b7eff7b4c3815ae16abf91c7ea # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-server # nbclassic # notebook @@ -180,13 +193,13 @@ argon2-cffi-bindings==21.2.0 \ --hash=sha256:f1152ac548bd5b8bcecfb0b0371f082037e47128653df2e8ba6e914d384f3c3e \ --hash=sha256:f9f8b450ed0547e3d473fdc8612083fd08dd2120d6ac8f73828df9b7d45bb351 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # argon2-cffi arrow==1.3.0 \ --hash=sha256:c728b120ebc00eb84e01882a6f5e7927a53960aa990ce7dd2b10f39005a67f80 \ --hash=sha256:d4540617648cb5f895730f1ad8c82a65f2dad0166f57b75f3ca54759c4d67a85 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # isoduration astor==0.8.1 \ --hash=sha256:070a54e890cefb5b3739d19f30f5a5ec840ffc9c50ffa7d23cc9fc1a38ebbfc5 \ @@ -196,13 +209,13 @@ asttokens==2.4.1 \ --hash=sha256:051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24 \ --hash=sha256:b03869718ba9a6eb027e134bfdf69f38a236d681c83c160d510768af11254ba0 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # stack-data attrs==25.1.0 \ --hash=sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e \ --hash=sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # aiohttp # jsonschema # referencing @@ -210,40 +223,49 @@ azure-common==1.1.28 \ --hash=sha256:4ac0cd3214e36b6a1b6a442686722a5d8cc449603aa833f3f0f40bda836704a3 \ 
--hash=sha256:5c12d3dcf4ec20599ca6b0d3e09e86e146353d443e7fcc050c9a19c1f9df20ad # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # smart-open azure-core==1.29.5 \ --hash=sha256:0fa04b7b1f7d44a4fb8468c4093deb2ea01fdf4faddbf802ed9205615f99d68c \ --hash=sha256:52983c89d394c6f881a121e5101c5fa67278ca3b1f339c8fb2ef39230c70e9ac # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # adlfs # azure-identity # azure-storage-blob # smart-open +azure-datalake-store==0.0.53 \ + --hash=sha256:05b6de62ee3f2a0a6e6941e6933b792b800c3e7f6ffce2fc324bc19875757393 \ + --hash=sha256:a30c902a6e360aa47d7f69f086b426729784e71c536f330b691647a51dc42b2b + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # adlfs azure-identity==1.17.1 \ --hash=sha256:32ecc67cc73f4bd0595e4f64b1ca65cd05186f4fe6f98ed2ae9f1aa32646efea \ --hash=sha256:db8d59c183b680e763722bfe8ebc45930e6c57df510620985939f7f3191e0382 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt + # adlfs azure-storage-blob==12.22.0 \ --hash=sha256:b3804bb4fe8ab1c32771fa464053da772a682c2737b19da438a3f4e5e3b3736e \ --hash=sha256:bb7d2d824ce3f11f14a27ee7d9281289f7e072ac8311c52e3652672455b7d5e8 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # adlfs # smart-open babel==2.13.1 \ --hash=sha256:33e0952d7dd6374af8dbf6768cc4ddf3ccfefc244f9986d4074704f2fbd18900 \ --hash=sha256:7077a4984b02b6727ac10f1f7294484f737443d7e2e66c5e4380e41a3ae0b4ed # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyterlab-server # sphinx backcall==0.2.0 \ --hash=sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e \ 
--hash=sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipython backoff==2.2.1 \ --hash=sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba \ @@ -253,8 +275,14 @@ beautifulsoup4==4.11.1 \ --hash=sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30 \ --hash=sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # nbconvert +billiard==4.2.1 \ + --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # celery blake3==1.0.5 \ --hash=sha256:03638a6dc8546365c3576fdb293fb2c53b898ac80525b5742d9cf00b4f44dea5 \ --hash=sha256:043a226cebfedff7b51ab9c87d4476c06d2cd10776855eaa9c619f2272b3c32e \ @@ -346,20 +374,20 @@ bleach==6.1.0 \ --hash=sha256:0a31f1837963c41d46bbf1331b8778e1308ea0791db03cc4e7357b97cf42a8fe \ --hash=sha256:3225f354cfc436b9789c66c4ee030194bee0568fbf9cbdad3bc8b5c26c5f12b6 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # nbconvert -boto3==1.26.76 \ - --hash=sha256:30c7d967ed1c6b5a05643e42cae9d4d36c3f1cb6782637ddc7007a104cfd9027 \ - --hash=sha256:b4c2969b7677762914394b8273cc1905dfe5b71f250741c1a575487ae357e729 +boto3==1.29.7 \ + --hash=sha256:1eb4c548118b5fc5e018dee956fd33e6fb249cd1f2def85f1bba816aef4d9f3e \ + --hash=sha256:96e9890ebe7cd823b5f4976dd676e112c000c6528c28e20a2f274590589dd18b # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # smart-open -botocore==1.29.76 \ - 
--hash=sha256:70735b00cd529f152992231ca6757e458e5ec25db43767b3526e9a35b2f143b7 \ - --hash=sha256:c2f67b6b3f8acf2968eafca06526f07b9fb0d27bac4c68a635d51abb675134a7 +botocore==1.32.7 \ + --hash=sha256:58b33d02cafa23461c8a9d211b30e8cded992380a84de409379fd02811fa3e11 \ + --hash=sha256:c6795c731b04c8e3635588c44cfd1a4462fc5987859195522c96812cf3eceff9 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # boto3 # s3transfer @@ -367,7 +395,7 @@ cachetools==5.5.2 \ --hash=sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4 \ --hash=sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # google-auth # vllm cbor2==5.6.5 \ @@ -416,11 +444,17 @@ cbor2==5.6.5 \ --hash=sha256:fde21ac1cf29336a31615a2c469a9cb03cf0add3ae480672d4d38cda467d07fc \ --hash=sha256:fe11c2eb518c882cfbeed456e7a552e544893c17db66fe5d3230dbeaca6b615c # via vllm +celery==5.5.3 \ + --hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # -r python/requirements.txt certifi==2025.1.31 \ --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # httpcore # httpx @@ -479,8 +513,9 @@ cffi==1.16.0 \ --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 # via - # -c 
python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # argon2-cffi-bindings + # azure-datalake-store # cryptography # soundfile charset-normalizer==3.3.2 \ @@ -575,29 +610,51 @@ charset-normalizer==3.3.2 \ --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # requests click==8.1.7 \ --hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ --hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt + # celery + # click-didyoumean + # click-plugins + # click-repl # ray # typer # uvicorn +click-didyoumean==0.3.1 \ + --hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # celery +click-plugins==1.1.1.2 \ + --hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # celery cloudpickle==2.2.0 \ --hash=sha256:3f4219469c55453cfe4737e564b67c2a149109dabf7f242478948b895f61106f \ --hash=sha256:7428798d5926d8fcbfd092d18d01a2a03daf8237d8fcdc8095d256b8490796f0 # via - # -c 
python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # gymnasium # vllm colorama==0.4.6 \ --hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # halo # log-symbols @@ -605,13 +662,13 @@ colorful==0.5.5 \ --hash=sha256:62c187e27c1433db9463ff93b1451898d1e7e23a7e553583fd9daeb6325182e4 \ --hash=sha256:66f8c1264b2a26f7293b96a03bb7a76c4bc8b9634369a0bffdcd12d618056a1d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt comm==0.2.0 \ --hash=sha256:2da8d9ebb8dd7bfc247adaff99f24dce705638a8042b85cb995066793e391001 \ --hash=sha256:a517ea2ca28931c7007a7a99c562a0fa5883cfb48963140cf642c41c948498be # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipykernel # ipywidgets compressed-tensors==0.10.2 \ @@ -657,7 +714,7 @@ cryptography==44.0.3 \ --hash=sha256:fc3c9babc1e1faefd62704bb46a69f359a9819eb0292e40df3fb6e3574715cd4 \ --hash=sha256:fe19d8bc5536a91a24a8133328880a41831b6c5df54599a8417b62fe015d3053 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # azure-identity # azure-storage-blob # msal @@ -677,7 +734,7 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ --hash=sha256:dac0284fecb90b5731f514e569a6fcf6674a730ae95b9490781a713b60a34423 \ --hash=sha256:e7a25ef1b44ae6276b5105affc2289edb34f1aa6676babd5bcd80907348c4cfa # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # ray debugpy==1.8.0 \ @@ -700,19 +757,19 @@ debugpy==1.8.0 \ --hash=sha256:ef54404365fae8d45cf450d0544ee40cefbcb9cb85ea7afe89a963c27028261e \ 
--hash=sha256:ef9ab7df0b9a42ed9c878afd3eaaff471fce3fa73df96022e1f5c9f8f8c87ada # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipykernel decorator==5.1.1 \ --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipython defusedxml==0.7.1 \ --hash=sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69 \ --hash=sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # nbconvert depyf==0.19.0 \ --hash=sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5 \ @@ -730,7 +787,7 @@ distlib==0.3.7 \ --hash=sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057 \ --hash=sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # virtualenv distro==1.9.0 \ --hash=sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed \ @@ -784,7 +841,7 @@ dm-tree==0.1.8 \ --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt dnspython==2.7.0 \ --hash=sha256:b4c34b7d10b51bcc3a5071e7b8dee77939f1e878477eeecc965e9835f63c6c86 \ @@ -806,26 +863,26 @@ entrypoints==0.4 \ --hash=sha256:b706eddaa9218a19ebcd67b56818f05bb27589b1ca9e8d797b74affad4ccacd4 \ 
--hash=sha256:f174b5ff827504fd3cd97cc3f8649f3693f51538c7e4bdf3ef002c8429d42f9f # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-client # nbconvert executing==2.0.1 \ --hash=sha256:35afe2ce3affba8ee97f2d69927fa823b08b472b7b994e36a52a964b93d16147 \ --hash=sha256:eac49ca94516ccc753f9fb5ce82603156e590b27525a8bc32cce8ae302eb61bc # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # stack-data farama-notifications==0.0.4 \ --hash=sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18 \ --hash=sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # gymnasium fastapi==0.115.12 \ --hash=sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681 \ --hash=sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # vllm fastapi-cli==0.0.5 \ @@ -836,7 +893,7 @@ fastjsonschema==2.19.0 \ --hash=sha256:b9fd1a2dd6971dbc7fee280a95bd199ae0dd9ce22beb91cc75e9c1c528a5170e \ --hash=sha256:e25df6647e1bc4a26070b700897b07b542ec898dd4f1f6ea013e7f6a88417225 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # nbformat fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:067edb0a0805bf61e17a251d5046af59f6e9d2b8ad01222e0ef7a0b7937d5548 \ @@ -915,13 +972,13 @@ fastrlock==0.8.2 ; sys_platform != 'darwin' \ --hash=sha256:fb5363cf0fddd9b50525ddbf64a1e1b28ec4c6dfb28670a940cb1cf988a6786b \ --hash=sha256:ff75c90663d6e8996610d435e71487daa853871ad1770dd83dc0f2fc4997241e # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c 
python/deplocks/llm/ray_test_py311_cu128.lock # cupy-cuda12x filelock==3.17.0 \ --hash=sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338 \ --hash=sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # huggingface-hub # ray @@ -933,7 +990,7 @@ fqdn==1.5.1 \ --hash=sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f \ --hash=sha256:3a179af3761e4df6eb2e026ff9e1a3033d3587bf980a0b1b2e1e5d08d7358014 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jsonschema frozenlist==1.4.1 \ --hash=sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7 \ @@ -1014,15 +1071,16 @@ frozenlist==1.4.1 \ --hash=sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced \ --hash=sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # aiohttp # aiosignal -fsspec==2023.5.0 \ - --hash=sha256:51a4ad01a5bb66fcc58036e288c0d53d3975a0df2a5dc59a93b59bade0391f2a \ - --hash=sha256:b3b56e00fb93ea321bc9e5d9cf6f8522a0198b20eb24e02774d329e9c6fb84ce +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt + # adlfs # huggingface-hub # torch gguf==0.17.0 \ @@ -1033,19 +1091,19 @@ gitdb==4.0.11 \ --hash=sha256:81a3407ddd2ee8df444cbacea00e2d038e40150acfa3001696fe0dcf1d3adfa4 \ --hash=sha256:bf5421126136d6d0af55bc1e7c1af1c397a34f5b7bd79e776cd3e89785c2b04b # via - # -c 
python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # gitpython gitpython==3.1.44 \ --hash=sha256:9e0e10cda9bed1ee64bc9a6de50e7e38a9c9943241cd7f585f6df3ed28011110 \ --hash=sha256:c87e30b26253bf5418b01b0660f818967f3c503193838337fe5e573331249269 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt google-api-core==2.24.2 \ --hash=sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9 \ --hash=sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # google-cloud-core # google-cloud-storage # opencensus @@ -1053,7 +1111,7 @@ google-auth==2.23.4 \ --hash=sha256:79905d6b1652187def79d491d6e23d0cbb3a21d3c7ba0dbaa9c8a01906b13ff3 \ --hash=sha256:d4bbc92fe4b8bfd2f3e8d88e5ba7085935da208ee38a134fc280e7ce682a05f2 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # google-api-core # google-cloud-core @@ -1062,13 +1120,13 @@ google-cloud-core==2.4.1 \ --hash=sha256:9b7749272a812bde58fff28868d0c5e2f585b82f37e09a1f6ed2d4d10f134073 \ --hash=sha256:a9e6a4422b9ac5c29f79a0ede9485473338e2ce78d91f2370c01e730eab22e61 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # google-cloud-storage google-cloud-storage==2.14.0 \ --hash=sha256:2d23fcf59b55e7b45336729c148bb1c464468c69d5efbaee30f7201dd90eb97e \ --hash=sha256:8641243bbf2a2042c16a6399551fbb13f062cbc9a2de38d6c0bb5426962e9dbd # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # smart-open google-crc32c==1.5.0 \ @@ 
-1141,100 +1199,148 @@ google-crc32c==1.5.0 \ --hash=sha256:fd8536e902db7e365f49e7d9029283403974ccf29b13fc7028b97e2295b33556 \ --hash=sha256:fe70e325aa68fa4b5edf7d1a4b6f691eb04bbccac0ace68e34820d283b5f80d4 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # google-cloud-storage # google-resumable-media google-resumable-media==2.6.0 \ --hash=sha256:972852f6c65f933e15a4a210c2b96930763b47197cdf4aa5f5bea435efb626e7 \ --hash=sha256:fc03d344381970f79eebb632a3c18bb1828593a2dc5572b5f90115ef7d11e81b # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # google-cloud-storage googleapis-common-protos==1.61.0 \ --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # google-api-core -grpcio==1.66.2 \ - --hash=sha256:02697eb4a5cbe5a9639f57323b4c37bcb3ab2d48cec5da3dc2f13334d72790dd \ - --hash=sha256:03b0b307ba26fae695e067b94cbb014e27390f8bc5ac7a3a39b7723fed085604 \ - --hash=sha256:05bc2ceadc2529ab0b227b1310d249d95d9001cd106aa4d31e8871ad3c428d73 \ - --hash=sha256:06de8ec0bd71be123eec15b0e0d457474931c2c407869b6c349bd9bed4adbac3 \ - --hash=sha256:0be4e0490c28da5377283861bed2941d1d20ec017ca397a5df4394d1c31a9b50 \ - --hash=sha256:12fda97ffae55e6526825daf25ad0fa37483685952b5d0f910d6405c87e3adb6 \ - --hash=sha256:1caa38fb22a8578ab8393da99d4b8641e3a80abc8fd52646f1ecc92bcb8dee34 \ - --hash=sha256:2018b053aa15782db2541ca01a7edb56a0bf18c77efed975392583725974b249 \ - --hash=sha256:20657d6b8cfed7db5e11b62ff7dfe2e12064ea78e93f1434d61888834bc86d75 \ - --hash=sha256:2335c58560a9e92ac58ff2bc5649952f9b37d0735608242973c7a8b94a6437d8 \ - --hash=sha256:31fd163105464797a72d901a06472860845ac157389e10f12631025b3e4d0453 \ - 
--hash=sha256:38b68498ff579a3b1ee8f93a05eb48dc2595795f2f62716e797dc24774c1aaa8 \ - --hash=sha256:3b00efc473b20d8bf83e0e1ae661b98951ca56111feb9b9611df8efc4fe5d55d \ - --hash=sha256:3ed71e81782966ffead60268bbda31ea3f725ebf8aa73634d5dda44f2cf3fb9c \ - --hash=sha256:45a3d462826f4868b442a6b8fdbe8b87b45eb4f5b5308168c156b21eca43f61c \ - --hash=sha256:49f0ca7ae850f59f828a723a9064cadbed90f1ece179d375966546499b8a2c9c \ - --hash=sha256:4e504572433f4e72b12394977679161d495c4c9581ba34a88d843eaf0f2fbd39 \ - --hash=sha256:4ea1d062c9230278793820146c95d038dc0f468cbdd172eec3363e42ff1c7d01 \ - --hash=sha256:563588c587b75c34b928bc428548e5b00ea38c46972181a4d8b75ba7e3f24231 \ - --hash=sha256:6001e575b8bbd89eee11960bb640b6da6ae110cf08113a075f1e2051cc596cae \ - --hash=sha256:66a0cd8ba6512b401d7ed46bb03f4ee455839957f28b8d61e7708056a806ba6a \ - --hash=sha256:6851de821249340bdb100df5eacfecfc4e6075fa85c6df7ee0eb213170ec8e5d \ - --hash=sha256:728bdf36a186e7f51da73be7f8d09457a03061be848718d0edf000e709418987 \ - --hash=sha256:73e3b425c1e155730273f73e419de3074aa5c5e936771ee0e4af0814631fb30a \ - --hash=sha256:73fc8f8b9b5c4a03e802b3cd0c18b2b06b410d3c1dcbef989fdeb943bd44aff7 \ - --hash=sha256:78fa51ebc2d9242c0fc5db0feecc57a9943303b46664ad89921f5079e2e4ada7 \ - --hash=sha256:7b2c86457145ce14c38e5bf6bdc19ef88e66c5fee2c3d83285c5aef026ba93b3 \ - --hash=sha256:7d69ce1f324dc2d71e40c9261d3fdbe7d4c9d60f332069ff9b2a4d8a257c7b2b \ - --hash=sha256:802d84fd3d50614170649853d121baaaa305de7b65b3e01759247e768d691ddf \ - --hash=sha256:80fd702ba7e432994df208f27514280b4b5c6843e12a48759c9255679ad38db8 \ - --hash=sha256:8ac475e8da31484efa25abb774674d837b343afb78bb3bcdef10f81a93e3d6bf \ - --hash=sha256:950da58d7d80abd0ea68757769c9db0a95b31163e53e5bb60438d263f4bed7b7 \ - --hash=sha256:99a641995a6bc4287a6315989ee591ff58507aa1cbe4c2e70d88411c4dcc0839 \ - --hash=sha256:9c3a99c519f4638e700e9e3f83952e27e2ea10873eecd7935823dab0c1c9250e \ - --hash=sha256:9c509a4f78114cbc5f0740eb3d7a74985fd2eff022971bc9bc31f8bc93e66a3b \ - 
--hash=sha256:a18e20d8321c6400185b4263e27982488cb5cdd62da69147087a76a24ef4e7e3 \ - --hash=sha256:a917d26e0fe980b0ac7bfcc1a3c4ad6a9a4612c911d33efb55ed7833c749b0ee \ - --hash=sha256:a9539f01cb04950fd4b5ab458e64a15f84c2acc273670072abe49a3f29bbad54 \ - --hash=sha256:ad2efdbe90c73b0434cbe64ed372e12414ad03c06262279b104a029d1889d13e \ - --hash=sha256:b672abf90a964bfde2d0ecbce30f2329a47498ba75ce6f4da35a2f4532b7acbc \ - --hash=sha256:bbd27c24a4cc5e195a7f56cfd9312e366d5d61b86e36d46bbe538457ea6eb8dd \ - --hash=sha256:c400ba5675b67025c8a9f48aa846f12a39cf0c44df5cd060e23fda5b30e9359d \ - --hash=sha256:c408f5ef75cfffa113cacd8b0c0e3611cbfd47701ca3cdc090594109b9fcbaed \ - --hash=sha256:c806852deaedee9ce8280fe98955c9103f62912a5b2d5ee7e3eaa284a6d8d8e7 \ - --hash=sha256:ce89f5876662f146d4c1f695dda29d4433a5d01c8681fbd2539afff535da14d4 \ - --hash=sha256:d25a14af966438cddf498b2e338f88d1c9706f3493b1d73b93f695c99c5f0e2a \ - --hash=sha256:d8d4732cc5052e92cea2f78b233c2e2a52998ac40cd651f40e398893ad0d06ec \ - --hash=sha256:d9a9724a156c8ec6a379869b23ba3323b7ea3600851c91489b871e375f710bc8 \ - --hash=sha256:e636ce23273683b00410f1971d209bf3689238cf5538d960adc3cdfe80dd0dbd \ - --hash=sha256:e88264caad6d8d00e7913996030bac8ad5f26b7411495848cc218bd3a9040b6c \ - --hash=sha256:f145cc21836c332c67baa6fc81099d1d27e266401565bf481948010d6ea32d46 \ - --hash=sha256:fb57870449dfcfac428afbb5a877829fcb0d6db9d9baa1148705739e9083880e \ - --hash=sha256:fb70487c95786e345af5e854ffec8cb8cc781bcc5df7930c4fbb7feaa72e1cdf \ - --hash=sha256:fe96281713168a3270878255983d2cb1a97e034325c8c2c25169a69289d3ecfa \ - --hash=sha256:ff1f7882e56c40b0d33c4922c15dfa30612f05fb785074a012f7cda74d1c3679 - # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + 
--hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + 
--hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e + # via + # 
-c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # grpcio-tools -gymnasium==1.0.0 \ - --hash=sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403 \ - --hash=sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad +grpcio-tools==1.62.3 \ + --hash=sha256:0a52cc9444df978438b8d2332c0ca99000521895229934a59f94f37ed896b133 \ + --hash=sha256:0a8c0c4724ae9c2181b7dbc9b186df46e4f62cb18dc184e46d06c0ebeccf569e \ + --hash=sha256:0cb3a3436ac119cbd37a7d3331d9bdf85dad21a6ac233a3411dff716dcbf401e \ + --hash=sha256:11c625eebefd1fd40a228fc8bae385e448c7e32a6ae134e43cf13bbc23f902b7 \ + --hash=sha256:11f363570dea661dde99e04a51bd108a5807b5df32a6f8bdf4860e34e94a4dbf \ + --hash=sha256:141d028bf5762d4a97f981c501da873589df3f7e02f4c1260e1921e565b376fa \ + --hash=sha256:1c989246c2aebc13253f08be32538a4039a64e12d9c18f6d662d7aee641dc8b5 \ + --hash=sha256:1da38070738da53556a4b35ab67c1b9884a5dd48fa2f243db35dc14079ea3d0c \ + --hash=sha256:27cd9ef5c5d68d5ed104b6dcb96fe9c66b82050e546c9e255716903c3d8f0373 \ + --hash=sha256:2e02d3b96f2d0e4bab9ceaa30f37d4f75571e40c6272e95364bff3125a64d184 \ + --hash=sha256:2f968b049c2849540751ec2100ab05e8086c24bead769ca734fdab58698408c1 \ + --hash=sha256:350a80485e302daaa95d335a931f97b693e170e02d43767ab06552c708808950 \ + --hash=sha256:3eae6ea76d62fcac091e1f15c2dcedf1dc3f114f8df1a972a8a0745e89f4cf61 \ + --hash=sha256:47a5c093ab256dec5714a7a345f8cc89315cb57c298b276fa244f37a0ba507f0 \ + --hash=sha256:5782883a27d3fae8c425b29a9d3dcf5f47d992848a1b76970da3b5a28d424b26 \ + --hash=sha256:6a56d344b0bab30bf342a67e33d386b0b3c4e65868ffe93c341c51e1a8853ca5 \ + --hash=sha256:6c3064610826f50bd69410c63101954676edc703e03f9e8f978a135f1aaf97c1 \ + --hash=sha256:703f46e0012af83a36082b5f30341113474ed0d91e36640da713355cd0ea5d23 \ + --hash=sha256:710fecf6a171dcbfa263a0a3e7070e0df65ba73158d4c539cec50978f11dad5d \ + 
--hash=sha256:7c7136015c3d62c3eef493efabaf9e3380e3e66d24ee8e94c01cb71377f57833 \ + --hash=sha256:7cc83023acd8bc72cf74c2edbe85b52098501d5b74d8377bfa06f3e929803492 \ + --hash=sha256:7f2483ea232bd72d98a6dc6d7aefd97e5bc80b15cd909b9e356d6f3e326b6e43 \ + --hash=sha256:7ff7d58a45b75df67d25f8f144936a3e44aabd91afec833ee06826bd02b7fbe7 \ + --hash=sha256:8ad0473af5544f89fc5a1ece8676dd03bdf160fb3230f967e05d0f4bf89620e3 \ + --hash=sha256:8c5d22b252dcef11dd1e0fbbe5bbfb9b4ae048e8880d33338215e8ccbdb03edc \ + --hash=sha256:8e62cc7164b0b7c5128e637e394eb2ef3db0e61fc798e80c301de3b2379203ed \ + --hash=sha256:962c84b4da0f3b14b3cdb10bc3837ebc5f136b67d919aea8d7bb3fd3df39528a \ + --hash=sha256:ace43b26d88a58dcff16c20d23ff72b04d0a415f64d2820f4ff06b1166f50557 \ + --hash=sha256:b47d0dda1bdb0a0ba7a9a6de88e5a1ed61f07fad613964879954961e36d49193 \ + --hash=sha256:b77f9f9cee87cd798f0fe26b7024344d1b03a7cd2d2cba7035f8433b13986325 \ + --hash=sha256:b881fd9505a84457e9f7e99362eeedd86497b659030cf57c6f0070df6d9c2b9b \ + --hash=sha256:bfda6ee8990997a9df95c5606f3096dae65f09af7ca03a1e9ca28f088caca5cf \ + --hash=sha256:c3a1ac9d394f8e229eb28eec2e04b9a6f5433fa19c9d32f1cb6066e3c5114a1d \ + --hash=sha256:c8ad5cce554e2fcaf8842dee5d9462583b601a3a78f8b76a153c38c963f58c10 \ + --hash=sha256:ca246dffeca0498be9b4e1ee169b62e64694b0f92e6d0be2573e65522f39eea9 \ + --hash=sha256:ca4f5eeadbb57cf03317d6a2857823239a63a59cc935f5bd6cf6e8b7af7a7ecc \ + --hash=sha256:d102b9b21c4e1e40af9a2ab3c6d41afba6bd29c0aa50ca013bf85c99cdc44ac5 \ + --hash=sha256:db3bc9fa39afc5e4e2767da4459df82b095ef0cab2f257707be06c44a1c2c3e5 \ + --hash=sha256:dc9ad9950119d8ae27634e68b7663cc8d340ae535a0f80d85a55e56a6973ab1f \ + --hash=sha256:e02d7c1a02e3814c94ba0cfe43d93e872c758bd8fd5c2797f894d0c49b4a1dfc \ + --hash=sha256:e0898d412a434e768a0c7e365acabe13ff1558b767e400936e26b5b6ed1ee51f \ + --hash=sha256:e18e15287c31baf574fcdf8251fb7f997d64e96c6ecf467906e576da0a079af6 \ + --hash=sha256:ec279dcf3518201fc592c65002754f58a6b542798cd7f3ecd4af086422f33f29 \ + 
--hash=sha256:ec6fbded0c61afe6f84e3c2a43e6d656791d95747d6d28b73eff1af64108c434 \ + --hash=sha256:eec73a005443061f4759b71a056f745e3b000dc0dc125c9f20560232dfbcbd14 \ + --hash=sha256:f3d812daffd0c2d2794756bd45a353f89e55dc8f91eb2fc840c51b9f6be62667 \ + --hash=sha256:f4b1615adf67bd8bb71f3464146a6f9949972d06d21a4f5e87e73f6464d97f57 \ + --hash=sha256:f6831fdec2b853c9daa3358535c55eed3694325889aa714070528cf8f92d7d6d + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # -r python/requirements/cloud-requirements.txt +gymnasium==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt h11==0.16.0 \ --hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \ --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # httpcore # uvicorn halo==0.0.31 \ --hash=sha256:5350488fb7d2aa7c31a1344120cee67a872901ce8858f60da7946cef96c208ab \ --hash=sha256:7b67a3521ee91d53b7152d4ee3452811e1d2a6321975137762eb3d70063cc9d6 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt hf-transfer==0.1.9 \ --hash=sha256:035572865dab29d17e783fbf1e84cf1cb24f3fcf8f1b17db1cfc7fdf139f02bf \ @@ -1263,7 +1369,7 @@ hf-transfer==0.1.9 \ --hash=sha256:ee8b10afedcb75f71091bcc197c526a6ebf5c58bbbadb34fdeee6160f55f619f \ --hash=sha256:fc6bd19e1cc177c66bdef15ef8636ad3bde79d5a4f608c158021153b4573509d # via -r python/requirements/llm/llm-requirements.txt -hf-xet==1.1.3 \ +hf-xet==1.1.3 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' 
or platform_machine == 'x86_64' \ --hash=sha256:30c575a5306f8e6fda37edb866762140a435037365eba7a17ce7bd0bc0216a8b \ --hash=sha256:7c1a6aa6abed1f696f8099aa9796ca04c9ee778a58728a115607de9cc4638ff1 \ --hash=sha256:8203f52827e3df65981984936654a5b390566336956f65765a8aa58c362bb841 \ @@ -1281,7 +1387,7 @@ httplib2==0.20.4 \ --hash=sha256:58a98e45b4b1a48273073f905d2961666ecf0fbac4250ea5b47aef259eb5c585 \ --hash=sha256:8b6a905cb1c79eefd03f8669fd993c36dc341f7c558f056cb5a33b5c2f458543 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # oauth2client httptools==0.6.4 \ --hash=sha256:0614154d5454c21b6410fdf5262b4a3ddb0f53f1e1721cfd59d55f32138c578a \ @@ -1341,18 +1447,17 @@ huggingface-hub==0.34.3 \ # via # tokenizers # transformers - # vllm humanize==4.12.1 \ --hash=sha256:1338ba97415c96556758a6e2f65977ed406dddf4620d4c6db9bbdfd07f0f1232 \ --hash=sha256:86014ca5c52675dffa1d404491952f1f5bf03b07c175a51891a343daebf01fea # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt idna==3.7 \ --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # anyio # email-validator # httpx @@ -1363,7 +1468,7 @@ imageio==2.34.2 \ --hash=sha256:5c0c0ee8faa018a1c42f649b90395dd4d3bb6187c09053a0cd6f1fdd51bbff5e \ --hash=sha256:a0bb27ec9d5bab36a9f4835e51b21d2cb099e1f78451441f94687ff3404b79f8 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # scikit-image imagesize==1.4.1 \ --hash=sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b \ @@ -1373,13 +1478,13 @@ importlib-metadata==6.11.0 \ 
--hash=sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443 \ --hash=sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # opentelemetry-api iniconfig==2.0.0 \ --hash=sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3 \ --hash=sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # pytest interegular==0.3.3 \ --hash=sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c \ @@ -1389,14 +1494,14 @@ ipykernel==6.27.1 \ --hash=sha256:7d5d594b6690654b4d299edba5e872dc17bb7396a8d0609c97cb7b8a1c605de6 \ --hash=sha256:dab88b47f112f9f7df62236511023c9bdeef67abc73af7c652e4ce4441601686 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # nbclassic # notebook ipython==8.12.3 \ --hash=sha256:3910c4b54543c2ad73d06579aa771041b7d5707b033bd488669b4cf544e3b363 \ --hash=sha256:b0340d46a933d27c657b211a329d0be23793c36595acf9e6ef4164bc01a1804c # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipykernel # ipywidgets # jupyterlab @@ -1404,38 +1509,38 @@ ipython-genutils==0.2.0 \ --hash=sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8 \ --hash=sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # nbclassic # notebook ipywidgets==8.1.3 \ --hash=sha256:efafd18f7a142248f7cb0ba890a68b96abd4d6e88ddbda483c9130d12667eaf2 \ --hash=sha256:f5f9eeaae082b1823ce9eac2575272952f40d748893972956dc09700a6392d9c # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c 
python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt isodate==0.6.1 \ --hash=sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96 \ --hash=sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # azure-storage-blob isoduration==20.11.0 \ --hash=sha256:ac2f9015137935279eac671f94f89eb00584f940f5dc49462a0c4ee692ba1bd9 \ --hash=sha256:b2904c2a4228c3d44f409c8ae8e2370eb21a26f7ac2ec5446df141dde3452042 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jsonschema jedi==0.19.1 \ --hash=sha256:cf0496f3651bc65d7174ac1b7d043eff454892c708a87d1b683e57b569927ffd \ --hash=sha256:e983c654fe5c02867aef4cdfce5a2fbb4a50adc0af145f70504238f18ef5e7e0 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipython jinja2==3.1.6 \ --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \ --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # fastapi # jupyter-server # jupyterlab @@ -1529,26 +1634,26 @@ jmespath==1.0.1 \ --hash=sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980 \ --hash=sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # boto3 # botocore json5==0.9.14 \ --hash=sha256:740c7f1b9e584a468dbb2939d8d458db3427f2c93ae2139d05f47e453eae964f \ --hash=sha256:9ed66c3a6ca3510a976a9ef9b8c0787de24802724ab1860bc0153c7fdd589b02 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c 
python/deplocks/llm/ray_test_py311_cu128.lock # jupyterlab-server jsonpatch==1.32 \ --hash=sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397 \ --hash=sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt jsonpointer==2.4 \ --hash=sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a \ --hash=sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jsonpatch # jsonschema jsonref==1.1.0 \ @@ -1559,7 +1664,7 @@ jsonschema==4.23.0 \ --hash=sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4 \ --hash=sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements/llm/llm-requirements.txt # -r python/requirements.txt @@ -1572,13 +1677,13 @@ jsonschema-specifications==2024.10.1 \ --hash=sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272 \ --hash=sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jsonschema jupyter-client==7.3.4 \ --hash=sha256:17d74b0d0a7b24f1c8c527b24fcf4607c56bee542ffe8e3418e50b21e514b621 \ --hash=sha256:aa9a6c32054b290374f95f73bb0cae91455c58dfb84f65c8591912b8f65e6d56 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipykernel # jupyter-server # nbclassic @@ -1588,7 +1693,7 @@ jupyter-core==5.5.0 \ 
--hash=sha256:880b86053bf298a8724994f95e99b99130659022a4f7f45f563084b6223861d3 \ --hash=sha256:e11e02cd8ae0a9de5c6c44abf5727df9f2581055afe00b22183f621ba3585805 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipykernel # jupyter-client # jupyter-server @@ -1601,13 +1706,13 @@ jupyter-events==0.6.3 \ --hash=sha256:57a2749f87ba387cd1bfd9b22a0875b889237dbf2edc2121ebb22bde47036c17 \ --hash=sha256:9a6e9995f75d1b7146b436ea24d696ce3a35bfa8bfe45e0c33c334c79464d0b3 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-server-fileid jupyter-server==1.24.0 \ --hash=sha256:23368e8e214baf82b313d4c5a0d828ca73015e1a192ce3829bd74e62fab8d046 \ --hash=sha256:c88ddbe862966ea1aea8c3ccb89a5903abd8fbcfe5cd14090ef549d403332c37 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-server-fileid # jupyterlab # jupyterlab-server @@ -1617,49 +1722,55 @@ jupyter-server-fileid==0.9.0 \ --hash=sha256:171538b7c7d08d11dbc57d4e6da196e0c258e4c2cd29249ef1e032bb423677f8 \ --hash=sha256:5b489c6fe6783c41174a728c7b81099608518387e53c3d53451a67f46a0cb7b0 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-server-ydoc jupyter-server-ydoc==0.6.1 \ --hash=sha256:18275ff1ce7e93bbda2301ca066273b3951fc50b0d9c8fc33788374134ad7920 \ --hash=sha256:ab10864708c81fa41ab9f2ed3626b54ff6926eaf14545d1d439714978dad6e9f # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyterlab jupyter-ydoc==0.2.5 \ --hash=sha256:5759170f112c70320a84217dd98d287699076ae65a7f88d458d57940a9f2b882 \ --hash=sha256:5a02ca7449f0d875f73e8cb8efdf695dddef15a8e71378b1f4eda6b7c90f5382 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c 
python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-server-ydoc # jupyterlab jupyterlab==3.6.1 \ --hash=sha256:ad6707dd0149b629d0ed5b56916cfcdb816b376c6af3190337faba09e27ea29e \ --hash=sha256:aee98c174180e98a30470297d10b959e8e64f2288970c0de65f0a6d2b4807034 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt jupyterlab-pygments==0.3.0 \ --hash=sha256:721aca4d9029252b11cfa9d185e5b5af4d54772bb8072f9b7036f4170054d35d \ --hash=sha256:841a89020971da1d8693f1a99997aefc5dc424bb1b251fd6322462a1b8842780 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # nbconvert jupyterlab-server==2.24.0 \ --hash=sha256:4e6f99e0a5579bbbc32e449c4dbb039561d4f1a7827d5733273ed56738f21f07 \ --hash=sha256:5f077e142bb8dc9b843d960f940c513581bceca3793a0d80f9c67d9522c4e876 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyterlab jupyterlab-widgets==3.0.11 \ --hash=sha256:78287fd86d20744ace330a61625024cf5521e1c012a352ddc0a3cdc2348becd0 \ --hash=sha256:dd5ac679593c969af29c9bed054c24f26842baa51352114736756bc035deee27 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipywidgets jupytext==1.17.2 \ --hash=sha256:4f85dc43bb6a24b75491c5c434001ad5ef563932f68f15dd3e1c8ce12a4a426b \ --hash=sha256:772d92898ac1f2ded69106f897b34af48ce4a85c985fa043a378ff5a65455f02 # via -r python/requirements/llm/llm-test-requirements.txt +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # celery lark==1.2.2 \ --hash=sha256:c2276486b02f0f1b90be155f2c8ba4a8e194d42775786db622faccd652d8e80c \ 
--hash=sha256:ca807d0162cd16cef15a8feecb862d7319e7a09bdb13aef927968e45040fed80 @@ -1668,7 +1779,7 @@ lazy-loader==0.4 \ --hash=sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # scikit-image llguidance==0.7.29 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:17fd439957d6ca5f459d0dec755a2d040c2dc946ed7e3c332b469ef6861292f8 \ @@ -1711,7 +1822,7 @@ log-symbols==0.0.14 \ --hash=sha256:4952106ff8b605ab7d5081dd2c7e6ca7374584eff7086f499c06edd1ce56dcca \ --hash=sha256:cf0bbc6fe1a8e53f0d174a716bc625c4f87043cc21eb55dd8a740cfe22680556 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # halo lxml==4.9.4 \ --hash=sha256:00e91573183ad273e242db5585b52670eddf92bacad095ce25c1e682da14ed91 \ @@ -1808,7 +1919,7 @@ lxml==4.9.4 \ --hash=sha256:fd814847901df6e8de13ce69b84c31fc9b3fb591224d6762d0b256d510cbf382 \ --hash=sha256:fdb325b7fba1e2c40b9b1db407f85642e32404131c08480dd652110fc908561b # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # nbconvert lz4==4.3.3 \ --hash=sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e \ @@ -1848,13 +1959,13 @@ lz4==4.3.3 \ --hash=sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0 \ --hash=sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt markdown-it-py==2.2.0 \ --hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 # via - # 
-c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupytext # mdit-py-plugins # rich @@ -1885,14 +1996,14 @@ markupsafe==2.1.3 \ --hash=sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc \ --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jinja2 # nbconvert matplotlib-inline==0.1.6 \ --hash=sha256:f1f41aab5328aa5aaea9b16d083b128102f8712542f819fe7e6a420ff581b311 \ --hash=sha256:f887e5f10ba98e8d2b150ddcf4702c1e5f8b3a20005eb0f74bfdbd360ee6f304 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipykernel # ipython mdit-py-plugins==0.4.2 \ @@ -1903,7 +2014,7 @@ mdurl==0.1.2 \ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # markdown-it-py memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:0a21745fb516b7a6efcd40aa7487c59e9313fcfc782d0193fcfcf00b48426874 \ @@ -1942,7 +2053,7 @@ memray==1.10.0 ; sys_platform != 'win32' \ --hash=sha256:e356af93e3b031c83957e9ac1a653f5aaba5df1e357dd17142f5ed19bb3dc660 \ --hash=sha256:f16c5c8730b616613dc8bafe32649ca6bd7252606251eb00148582011758d0b5 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt meson==1.8.3 \ --hash=sha256:ef02b806ce0c5b6becd5bb5dc9fa67662320b29b337e7ace73e4354500590233 \ @@ -1956,7 +2067,7 @@ mistune==0.8.4 \ --hash=sha256:59a3429db53c50b5c6bcc8a07f8848cb00d7dc8bdb431a4ab41920d201d4756e \ --hash=sha256:88a1051873018da288eee8538d476dffe1262495144b33ecb586c4ab266bb8d4 # via - # -c 
python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # nbconvert mpmath==1.3.0 \ --hash=sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c @@ -1965,14 +2076,15 @@ msal==1.28.1 \ --hash=sha256:563c2d70de77a2ca9786aab84cb4e133a38a6897e6676774edc23d610bfc9e7b \ --hash=sha256:d72bbfe2d5c2f2555f4bc6205be4450ddfd12976610dd9a16a9ab0f05c68b64d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # azure-datalake-store # azure-identity # msal-extensions msal-extensions==1.2.0b1 \ --hash=sha256:217f391bb549de11b19abe8029a8375fe3ca0556aa8cce004b2083f00a569b71 \ --hash=sha256:3658b3814cd6a7759e83cb0ec145f30330ee249a92444adaf9aa4eb4f5bbcbbc # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # azure-identity msgpack==1.0.7 \ --hash=sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862 \ @@ -2032,7 +2144,7 @@ msgpack==1.0.7 \ --hash=sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002 \ --hash=sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # ray msgspec==0.19.0 \ @@ -2165,27 +2277,27 @@ multidict==6.0.5 \ --hash=sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423 \ --hash=sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # aiohttp # yarl nbclassic==1.0.0 \ --hash=sha256:0ae11eb2319455d805596bf320336cda9554b41d99ab9a3c31bf8180bffa30e3 \ --hash=sha256:f99e4769b4750076cd4235c044b61232110733322384a94a63791d2e7beacc66 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c 
python/deplocks/llm/ray_test_py311_cu128.lock # jupyterlab # notebook nbclient==0.5.13 \ --hash=sha256:40c52c9b5e3c31faecaee69f202b3f53e38d7c1c563de0fadde9d7eda0fdafe8 \ --hash=sha256:47ac905af59379913c1f8f541098d2550153cf8dc58553cbe18c702b181518b0 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # nbconvert nbconvert==6.5.4 \ --hash=sha256:9e3c7c6d491374cbdd5f35d268c05809357716d346f4573186bbeab32ee50bc1 \ --hash=sha256:d679a947f849a966cbbd0bf6e7fedcfdb64be3b20ce7cef11ad55c13f5820e19 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-server # nbclassic # notebook @@ -2193,7 +2305,7 @@ nbformat==5.9.2 \ --hash=sha256:1c5172d786a41b82bcfd0c23f9e6b6f072e8fb49c39250219e4acfff1efe89e9 \ --hash=sha256:5f98b5ba1997dff175e77e0c17d5c10a96eaed2cbd1de3533d1fc35d5e111192 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-server # jupytext # nbclassic @@ -2204,7 +2316,7 @@ nest-asyncio==1.5.8 \ --hash=sha256:25aa2ca0d2a5b5531956b9e273b45cf664cae2b145101d73b86b199978d48fdb \ --hash=sha256:accda7a339a70599cb08f9dd09a67e0c2ef8d8d6f4c07f96ab203f2ae254e48d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipykernel # jupyter-client # nbclassic @@ -2213,7 +2325,7 @@ nest-asyncio==1.5.8 \ networkx==3.2.1 \ --hash=sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # scikit-image # torch ninja==1.11.1.4 \ @@ -2238,23 +2350,27 @@ ninja==1.11.1.4 \ # -r python/requirements/llm/llm-requirements.txt # vllm # xgrammar -nixl==0.3.1 \ - --hash=sha256:20428ad2668062a79045fae83cc5cba1f4019d4a2c7053cc8549c3a1533f8a75 \ - 
--hash=sha256:70b8932b50ccf1a13ac8fa2e10a4b78290baae9f963bfecfa67684104331a94b \ - --hash=sha256:8c144839484b3076f0b34ad8ceaeaff05c23399cf57ca85f2a94b44e1475a39b \ - --hash=sha256:ff59996ad05a7e4ba6c8beba0f1d8ac2f9e53df696a15af0d3340028e2f16081 +nixl==0.4.1 \ + --hash=sha256:10c7b4a44f89c3fbff3e20cb84973be95f8df36ee336fb108275ed1839fec1f1 \ + --hash=sha256:510cc9e824ad53cac71ce55ff41160f2a9e1507ceb52eb871b775fe1e42beb87 \ + --hash=sha256:8a3d83b28c16b795bdc281f1489b9d247f6e6088ad96ca96406072a36d6354b7 \ + --hash=sha256:9381fd3986d227c7ccb2607c03bbea559ec80f951e2ea47c1fbf381e4cd97164 \ + --hash=sha256:9ab7e580e9962ebdcda8c17f8548858d3fdb648621367d8e717ca317b534b778 \ + --hash=sha256:db144821de7912cb2502052b3070a1ac276b8b019470e6efdfce9c237ffe130d \ + --hash=sha256:e33102b85b3f95a8c95e59b59b29aabd03d47b5bce619de506b9bb83739cf60d \ + --hash=sha256:f16092dd445542e82e3db3553f6c7697ec5a2e837f19d416401283ae245826f9 # via -r python/requirements/llm/llm-requirements.txt notebook==6.5.7 \ --hash=sha256:04eb9011dfac634fbd4442adaf0a8c27cd26beef831fe1d19faf930c327768e4 \ --hash=sha256:a6afa9a4ff4d149a0771ff8b8c881a7a73b3835f9add0606696d6e9d98ac1cd0 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyterlab notebook-shim==0.2.3 \ --hash=sha256:a83496a43341c1674b093bfcebf0fe8e74cbe7eda5fd2bbc56f8e39e1486c0c7 \ --hash=sha256:f69388ac283ae008cd506dda10d0288b09a017d822d5e8c7129a152cbd3ce7e9 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # nbclassic numba==0.61.2 \ --hash=sha256:34fba9406078bac7ab052efbf0d13939426c753ad72946baaa5bf9ae0ebb8dd2 \ @@ -2317,7 +2433,7 @@ numpy==1.26.4 \ --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c 
python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # cupy-cuda12x # gguf @@ -2397,23 +2513,39 @@ oauth2client==4.1.3 \ --hash=sha256:b8a81cc5d60e2d364f0b1b98f958dbd472887acaf1a5b05e21c28c31a2d6d3ac \ --hash=sha256:d486741e451287f69568a4d26d70d9acd73a2bbfa275746c535b4209891cccc6 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt -openai==1.90.0 \ - --hash=sha256:9771982cdd5b6631af68c6a603da72ed44cd2caf73b49f717a72b71374bc565b \ - --hash=sha256:e5dcb5498ea6b42fec47546d10f1bcc05fb854219a7d953a5ba766718b212a02 +openai==1.100.2 \ + --hash=sha256:54d3457b2c8d7303a1bc002a058de46bdd8f37a8117751c7cf4ed4438051f151 \ + --hash=sha256:787b4c3c8a65895182c58c424f790c25c790cc9a0330e34f73d55b6ee5a00e32 + # via vllm +openai-harmony==0.0.4 \ + --hash=sha256:038f1d6772d1be5213b36ae76e5d042022395ec35c428a73ccb8b839b2cecf6a \ + --hash=sha256:15e6d53a66502491a3675a536df30e271f976e6c5efe68250a65191efcb85c4f \ + --hash=sha256:2d8d16d84702059833fb03b841b28c25600c54e83cadccef79af44e1c81166b1 \ + --hash=sha256:31e9bcac0902a309e2fc688e52f247eec7fffcd00d17e958b9a83a8fea6519c2 \ + --hash=sha256:3586d90c899cd41f8624e7b82a48c289f6e4be56c66304ecaf3a0ba88963a73f \ + --hash=sha256:3cf2344366f10981bbc0f6d9949a0b2bb87151d209ed295943ed6ad8eda37932 \ + --hash=sha256:567cc568b6bf7b4d041b0c9aa7d6b2c9394f8af6065bc87fa6d23f207b5af9a7 \ + --hash=sha256:5c67ac6df349236fb7b64f57c3dbb0273efcdca24314daa108f2a482c427106c \ + --hash=sha256:746f751de5033b3dbcfcd4a726a4c56ce452c593ad3d54472d8597ce8d8b6d44 \ + --hash=sha256:96a63199c0d81095b5d5d1ae8ca82b64c1c13d18d4e30323ae9e8ab31bc80a3d \ + --hash=sha256:97f1fe3909733212cc6b36f0f199b1421a9c57b79ec665f0322bd604cec47340 \ + --hash=sha256:b9ee9e9ab6a237cebbe16563c787a6e83f3fcc034075c3d321dab94448426282 \ + --hash=sha256:d38f2639f6bf7c3c34a5dfd79e29075811ae2fa9b895a63e76767f74a47a971e \ + 
--hash=sha256:ef21a1e2384a65c62d5ec5e1cded9fe026f1d032d5c5d725110d1a8d330d8f54 # via vllm opencensus==0.11.4 \ --hash=sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 \ --hash=sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt opencensus-context==0.1.3 \ --hash=sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039 \ --hash=sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # opencensus opencv-python-headless==4.11.0.86 \ --hash=sha256:0e0a27c19dd1f40ddff94976cfe43066fbbe9dfbb2ec1907d66c19caef42a57b \ @@ -2430,7 +2562,7 @@ opentelemetry-api==1.34.1 \ --hash=sha256:64f0bd06d42824843731d05beea88d4d4b6ae59f9fe347ff7dfa2cc14233bbb3 \ --hash=sha256:b7df4cb0830d5a6c29ad0c0691dbae874d8daefa934b8b1d642de48323d32a8c # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus # opentelemetry-sdk @@ -2439,26 +2571,26 @@ opentelemetry-exporter-prometheus==0.55b1 \ --hash=sha256:d13ec0b22bf394113ff1ada5da98133a4b051779b803dae183188e26c4bd9ee0 \ --hash=sha256:f364fbbff9e5de37a112ff104d1185fb1d7e2046c5ab5911e5afebc7ab3ddf0e # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt opentelemetry-proto==1.27.0 \ --hash=sha256:33c9345d91dafd8a74fc3d7576c5a38f18b7fdf8d02983ac67485386132aedd6 \ --hash=sha256:b133873de5581a50063e1e4b29cdcf0c5e253a8c2d8dc1229add20a4c3830ace # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt 
opentelemetry-sdk==1.34.1 \ --hash=sha256:308effad4059562f1d92163c61c8141df649da24ce361827812c40abb2a1e96e \ --hash=sha256:8091db0d763fcd6098d4781bbc80ff0971f94e260739aa6afe6fd379cdf3aa4d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # opentelemetry-exporter-prometheus opentelemetry-semantic-conventions==0.55b1 \ --hash=sha256:5da81dfdf7d52e3d37f8fe88d5e771e191de924cfff5f550ab0b8f7b2409baed \ --hash=sha256:ef95b1f009159c28d7a7849f5cbc71c4c34c845bb514d66adfdf1b3fff3598b3 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # opentelemetry-sdk outlines-core==0.2.10 \ --hash=sha256:0a9e4b192ca837a472a1bb1428397509f543db08e1aeeee30252525cec34093a \ @@ -2507,7 +2639,7 @@ packaging==23.0 \ --hash=sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2 \ --hash=sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # huggingface-hub @@ -2516,6 +2648,7 @@ packaging==23.0 \ # jupyterlab # jupyterlab-server # jupytext + # kombu # lazy-loader # lm-format-enforcer # nbconvert @@ -2554,19 +2687,19 @@ pandas==1.5.3 \ --hash=sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae \ --hash=sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt pandocfilters==1.5.0 \ --hash=sha256:0b679503337d233b4339a817bfc8c50064e2eff681314376a47cb582305a7a38 \ --hash=sha256:33aae3f25fd1a026079f5d27bdd52496f0e0803b3469282162bafdcbdf6ef14f # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c 
python/deplocks/llm/ray_test_py311_cu128.lock # nbconvert parso==0.8.3 \ --hash=sha256:8c07be290bb59f03588915921e29e8a50002acaf2cdc5fa0e0114f91709fafa0 \ --hash=sha256:c001d4636cd3aecdaf33cbb40aebb59b094be2a74c556778ef5576c175e19e75 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jedi partial-json-parser==0.2.1.1.post5 \ --hash=sha256:627715aaa3cb3fb60a65b0d62223243acaa6c70846520a90326fef3a2f0b61ca \ @@ -2576,19 +2709,19 @@ pathspec==0.11.2 \ --hash=sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20 \ --hash=sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt pexpect==4.8.0 ; sys_platform != 'win32' \ --hash=sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937 \ --hash=sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipython pickleshare==0.7.5 \ --hash=sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca \ --hash=sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipython pillow==10.3.0 \ --hash=sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c \ @@ -2661,7 +2794,7 @@ pillow==10.3.0 \ --hash=sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27 \ --hash=sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/llm/llm-test-requirements.txt # imageio # mistral-common @@ 
-2672,26 +2805,26 @@ platformdirs==3.11.0 \ --hash=sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-core # virtualenv pluggy==1.3.0 \ --hash=sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12 \ --hash=sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # pytest portalocker==2.8.2 \ --hash=sha256:2b035aa7828e46c58e9b31390ee1f169b98e1066ab10b9a6a861fe7e25ee4f33 \ --hash=sha256:cfb86acc09b9aa7c3b43594e19be1345b9d16af3feb08bf92f23d4dce513a28e # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # msal-extensions prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # jupyter-server # nbclassic @@ -2707,7 +2840,8 @@ prompt-toolkit==3.0.41 \ --hash=sha256:941367d97fc815548822aa26c2a269fdc4eb21e9ec05fc5d447cf09bad5d75f0 \ --hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # click-repl # ipython propcache==0.3.0 \ --hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ @@ -2809,14 +2943,14 @@ propcache==0.3.0 \ --hash=sha256:fb91d20fa2d3b13deea98a690534697742029f4fb83673a3501ae6e3746508b5 \ --hash=sha256:fddb8870bdb83456a489ab67c6b3040a8d5a55069aa6f72f9d872235fbc52f54 # 
via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # aiohttp # yarl proto-plus==1.22.3 \ --hash=sha256:a49cd903bc0b6ab41f76bf65510439d56ca76f868adf0274e738bfdd096894df \ --hash=sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # google-api-core protobuf==4.25.8 \ --hash=sha256:077ff8badf2acf8bc474406706ad890466274191a48d0abd3bd6987107c9cde5 \ @@ -2831,7 +2965,7 @@ protobuf==4.25.8 \ --hash=sha256:d552c53d0415449c8d17ced5c341caba0d89dbf433698e1436c8fa0aae7808a3 \ --hash=sha256:f4510b93a3bec6eba8fd8f1093e9d7fb0d4a24d1a81377c10c0e5bbfe9e4ed24 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # google-api-core # googleapis-common-protos @@ -2859,21 +2993,21 @@ psutil==5.9.6 \ --hash=sha256:fb8a697f11b0f5994550555fcfe3e69799e5b060c8ecf9e2f75c69302cc35c0d \ --hash=sha256:ff18b8d1a784b810df0b0fff3bcb50ab941c3b8e2c8de5726f9c71c601c611aa # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipykernel # vllm ptyprocess==0.7.0 ; os_name != 'nt' or sys_platform != 'win32' \ --hash=sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 \ --hash=sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # pexpect # terminado pure-eval==0.2.2 \ --hash=sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350 \ --hash=sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # stack-data py-cpuinfo==9.0.0 \ 
--hash=sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690 \ @@ -2889,7 +3023,7 @@ py-spy==0.4.0 ; python_full_version < '3.12' \ --hash=sha256:eee3d0bde85ca5cf4f01f012d461180ca76c24835a96f7b5c4ded64eb6a008ab \ --hash=sha256:f2cf3f7130e7d780471faa5957441d3b4e0ec39a79b2c00f4c33d494f7728428 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt pyarrow==19.0.1 \ --hash=sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466 \ @@ -2935,13 +3069,13 @@ pyarrow==19.0.1 \ --hash=sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec \ --hash=sha256:fd44d66093a239358d07c42a91eebf5015aa54fccba959db899f932218ac9cc8 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt pyasn1==0.5.1 \ --hash=sha256:4439847c58d40b1d0a573d07e3856e95333f1976294494c325775aeca506eb58 \ --hash=sha256:6d391a96e59b23130a5cfa74d6fd7f388dbbe26cc8f1edf39fdddf08d9d6676c # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # oauth2client # pyasn1-modules # rsa @@ -2949,7 +3083,7 @@ pyasn1-modules==0.3.0 \ --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # google-auth # oauth2client pybase64==1.4.1 \ @@ -3109,7 +3243,7 @@ pycparser==2.21 \ --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # cffi pycurl==7.45.3 \ 
--hash=sha256:0c41a172d5e8a5cdd8328cc8134f47b2a57960ac677f7cda8520eaa9fbe7d990 \ @@ -3149,125 +3283,125 @@ pycurl==7.45.3 \ --hash=sha256:fa7751b614d9aa82d7a0f49ca90924c29c6cedf85a2f8687fb6a772dbfe48711 \ --hash=sha256:fbd4a6b8654b779089c5a44af1c65c1419c2cd60718780df6d8f354eb35d6d55 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt -pydantic==2.10.0 \ - --hash=sha256:0aca0f045ff6e2f097f1fe89521115335f15049eeb8a7bef3dafe4b19a74e289 \ - --hash=sha256:5e7807ba9201bdf61b1b58aa6eb690916c40a47acfb114b1b4fef3e7fd5b30fc +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # compressed-tensors # fastapi # lm-format-enforcer # mistral-common # openai + # openai-harmony # pydantic-extra-types # vllm # xgrammar -pydantic-core==2.27.0 \ - --hash=sha256:0aa4d1b2eba9a325897308b3124014a142cdccb9f3e016f31d3ebee6b5ea5e75 \ - --hash=sha256:0d06b667e53320332be2bf6f9461f4a9b78092a079b8ce8634c9afaa7e10cd9f \ - --hash=sha256:153017e3d6cd3ce979de06d84343ca424bb6092727375eba1968c8b4693c6ecb \ - --hash=sha256:15e350efb67b855cd014c218716feea4986a149ed1f42a539edd271ee074a196 \ - --hash=sha256:185ef205256cd8b38431205698531026979db89a79587725c1e55c59101d64e9 \ - --hash=sha256:1da0c98a85a6c6ed702d5556db3b09c91f9b0b78de37b7593e2de8d03238807a \ - --hash=sha256:225bfff5d425c34e1fd562cef52d673579d59b967d9de06178850c4802af9039 \ - --hash=sha256:24f984fc7762ed5f806d9e8c4c77ea69fdb2afd987b4fd319ef06c87595a8c55 \ - --hash=sha256:25a7fd4de38f7ff99a37e18fa0098c3140286451bc823d1746ba80cec5b433a1 \ - --hash=sha256:2883b260f7a93235488699d39cbbd94fa7b175d3a8063fbfddd3e81ad9988cb2 \ - 
--hash=sha256:2a51ce96224eadd1845150b204389623c8e129fde5a67a84b972bd83a85c6c40 \ - --hash=sha256:2be0ad541bb9f059954ccf8877a49ed73877f862529575ff3d54bf4223e4dd61 \ - --hash=sha256:31a2cae5f059329f9cfe3d8d266d3da1543b60b60130d186d9b6a3c20a346361 \ - --hash=sha256:333c840a1303d1474f491e7be0b718226c730a39ead0f7dab2c7e6a2f3855555 \ - --hash=sha256:33d14369739c5d07e2e7102cdb0081a1fa46ed03215e07f097b34e020b83b1ae \ - --hash=sha256:35380671c3c921fe8adf31ad349dc6f7588b7e928dbe44e1093789734f607399 \ - --hash=sha256:359e7951f04ad35111b5ddce184db3391442345d0ab073aa63a95eb8af25a5ef \ - --hash=sha256:36aa167f69d8807ba7e341d67ea93e50fcaaf6bc433bb04939430fa3dab06f31 \ - --hash=sha256:395e3e1148fa7809016231f8065f30bb0dc285a97b4dc4360cd86e17bab58af7 \ - --hash=sha256:3e8d89c276234579cd3d095d5fa2a44eb10db9a218664a17b56363cddf226ff3 \ - --hash=sha256:3eb8849445c26b41c5a474061032c53e14fe92a11a5db969f722a2716cd12206 \ - --hash=sha256:3fd8bc2690e7c39eecdf9071b6a889ce7b22b72073863940edc2a0a23750ca90 \ - --hash=sha256:400bf470e4327e920883b51e255617dfe4496d4e80c3fea0b5a5d0bf2c404dd4 \ - --hash=sha256:4148dc9184ab79e356dc00a4199dc0ee8647973332cb385fc29a7cced49b9f9c \ - --hash=sha256:433689845288f9a1ee5714444e65957be26d30915f7745091ede4a83cfb2d7bb \ - --hash=sha256:43b61989068de9ce62296cde02beffabcadb65672207fc51e7af76dca75e6636 \ - --hash=sha256:4523c4009c3f39d948e01962223c9f5538602e7087a628479b723c939fab262d \ - --hash=sha256:483c2213a609e7db2c592bbc015da58b6c75af7360ca3c981f178110d9787bcf \ - --hash=sha256:49633583eb7dc5cba61aaf7cdb2e9e662323ad394e543ee77af265736bcd3eaa \ - --hash=sha256:4b51f964fcbb02949fc546022e56cdb16cda457af485e9a3e8b78ac2ecf5d77e \ - --hash=sha256:4bf1340ae507f6da6360b24179c2083857c8ca7644aab65807023cf35404ea8d \ - --hash=sha256:4fb49cfdb53af5041aba909be00cccfb2c0d0a2e09281bf542371c5fd36ad04c \ - --hash=sha256:510b11e9c3b1a852876d1ccd8d5903684336d635214148637ceb27366c75a467 \ - --hash=sha256:513cb14c0cc31a4dfd849a4674b20c46d87b364f997bbcb02282306f5e187abf \ - 
--hash=sha256:58560828ee0951bb125c6f2862fbc37f039996d19ceb6d8ff1905abf7da0bf3d \ - --hash=sha256:58ab0d979c969983cdb97374698d847a4acffb217d543e172838864636ef10d9 \ - --hash=sha256:5982048129f40b082c2654de10c0f37c67a14f5ff9d37cf35be028ae982f26df \ - --hash=sha256:5ab325fc86fbc077284c8d7f996d904d30e97904a87d6fb303dce6b3de7ebba9 \ - --hash=sha256:5cc822ab90a70ea3a91e6aed3afac570b276b1278c6909b1d384f745bd09c714 \ - --hash=sha256:5f2b19b8d6fca432cb3acf48cf5243a7bf512988029b6e6fd27e9e8c0a204d85 \ - --hash=sha256:5fc72fbfebbf42c0856a824b8b0dc2b5cd2e4a896050281a21cfa6fed8879cb1 \ - --hash=sha256:6354e18a9be37bfa124d6b288a87fb30c673745806c92956f1a25e3ae6e76b96 \ - --hash=sha256:678f66462058dd978702db17eb6a3633d634f7aa0deaea61e0a674152766d3fc \ - --hash=sha256:68950bc08f9735306322bfc16a18391fcaac99ded2509e1cc41d03ccb6013cfe \ - --hash=sha256:68ef5377eb582fa4343c9d0b57a5b094046d447b4c73dd9fbd9ffb216f829e7d \ - --hash=sha256:6b4c19525c3538fbc0bbda6229f9682fb8199ce9ac37395880e6952798e00373 \ - --hash=sha256:6bb69bf3b6500f195c3deb69c1205ba8fc3cb21d1915f1f158a10d6b1ef29b6a \ - --hash=sha256:6e19401742ed7b69e51d8e4df3c03ad5ec65a83b36244479fd70edde2828a5d9 \ - --hash=sha256:6f4a53af9e81d757756508b57cae1cf28293f0f31b9fa2bfcb416cc7fb230f9d \ - --hash=sha256:6fda87808429c520a002a85d6e7cdadbf58231d60e96260976c5b8f9a12a8e13 \ - --hash=sha256:78f841523729e43e3928a364ec46e2e3f80e6625a4f62aca5c345f3f626c6e8a \ - --hash=sha256:7a6ebfac28fd51890a61df36ef202adbd77d00ee5aca4a3dadb3d9ed49cfb929 \ - --hash=sha256:7b0202ebf2268954090209a84f9897345719e46a57c5f2c9b7b250ca0a9d3e63 \ - --hash=sha256:8117839a9bdbba86e7f9df57018fe3b96cec934c3940b591b0fd3fbfb485864a \ - --hash=sha256:82e1ad4ca170e8af4c928b67cff731b6296e6a0a0981b97b2eb7c275cc4e15bd \ - --hash=sha256:836a4bfe0cc6d36dc9a9cc1a7b391265bf6ce9d1eb1eac62ac5139f5d8d9a6fa \ - --hash=sha256:84af1cf7bfdcbc6fcf5a5f70cc9896205e0350306e4dd73d54b6a18894f79386 \ - --hash=sha256:84e35afd9e10b2698e6f2f32256678cb23ca6c1568d02628033a837638b3ed12 \ - 
--hash=sha256:884f1806609c2c66564082540cffc96868c5571c7c3cf3a783f63f2fb49bd3cd \ - --hash=sha256:8a150392102c402c538190730fda06f3bce654fc498865579a9f2c1d2b425833 \ - --hash=sha256:8e21d927469d04b39386255bf00d0feedead16f6253dcc85e9e10ddebc334084 \ - --hash=sha256:8e96ca781e0c01e32115912ebdf7b3fb0780ce748b80d7d28a0802fa9fbaf44e \ - --hash=sha256:8ee4c2a75af9fe21269a4a0898c5425afb01af1f5d276063f57e2ae1bc64e191 \ - --hash=sha256:91bc66f878557313c2a6bcf396e7befcffe5ab4354cfe4427318968af31143c3 \ - --hash=sha256:951e71da6c89d354572098bada5ba5b5dc3a9390c933af8a614e37755d3d1840 \ - --hash=sha256:99b2863c1365f43f74199c980a3d40f18a218fbe683dd64e470199db426c4d6a \ - --hash=sha256:9a8fbf506fde1529a1e3698198fe64bfbe2e0c09557bc6a7dcf872e7c01fec40 \ - --hash=sha256:9ce048deb1e033e7a865ca384770bccc11d44179cf09e5193a535c4c2f497bdc \ - --hash=sha256:9fe94d9d2a2b4edd7a4b22adcd45814b1b59b03feb00e56deb2e89747aec7bfe \ - --hash=sha256:a291d0b4243a259c8ea7e2b84eb9ccb76370e569298875a7c5e3e71baf49057a \ - --hash=sha256:a5c022bb0d453192426221605efc865373dde43b17822a264671c53b068ac20c \ - --hash=sha256:abb4785894936d7682635726613c44578c420a096729f1978cd061a7e72d5275 \ - --hash=sha256:b872c86d8d71827235c7077461c502feb2db3f87d9d6d5a9daa64287d75e4fa0 \ - --hash=sha256:bf37b72834e7239cf84d4a0b2c050e7f9e48bced97bad9bdf98d26b8eb72e846 \ - --hash=sha256:c0c431e4be5c1a0c6654e0c31c661cd89e0ca956ef65305c3c3fd96f4e72ca39 \ - --hash=sha256:c5726eec789ee38f2c53b10b1821457b82274f81f4f746bb1e666d8741fcfadb \ - --hash=sha256:c6fcb3fa3855d583aa57b94cf146f7781d5d5bc06cb95cb3afece33d31aac39b \ - --hash=sha256:c86679f443e7085ea55a7376462553996c688395d18ef3f0d3dbad7838f857a2 \ - --hash=sha256:c91e3c04f5191fd3fb68764bddeaf02025492d5d9f23343b283870f6ace69708 \ - --hash=sha256:c921ad596ff1a82f9c692b0758c944355abc9f0de97a4c13ca60ffc6d8dc15d4 \ - --hash=sha256:c9ed88b398ba7e3bad7bd64d66cc01dcde9cfcb7ec629a6fd78a82fa0b559d78 \ - --hash=sha256:cd2ac6b919f7fed71b17fe0b4603c092a4c9b5bae414817c9c81d3c22d1e1bcc \ - 
--hash=sha256:d28ca7066d6cdd347a50d8b725dc10d9a1d6a1cce09836cf071ea6a2d4908be0 \ - --hash=sha256:d29e235ce13c91902ef3efc3d883a677655b3908b1cbc73dee816e5e1f8f7739 \ - --hash=sha256:d8b5ee4ae9170e2775d495b81f414cc20268041c42571530513496ba61e94ba3 \ - --hash=sha256:db72e40628967f6dc572020d04b5f800d71264e0531c6da35097e73bdf38b003 \ - --hash=sha256:df45c4073bed486ea2f18757057953afed8dd77add7276ff01bccb79982cf46c \ - --hash=sha256:dfa5f5c0a4c8fced1422dc2ca7eefd872d5d13eb33cf324361dbf1dbfba0a9fe \ - --hash=sha256:e015833384ca3e1a0565a79f5d953b0629d9138021c27ad37c92a9fa1af7623c \ - --hash=sha256:e15315691fe2253eb447503153acef4d7223dfe7e7702f9ed66539fcd0c43801 \ - --hash=sha256:e65466b31be1070b4a5b7dbfbd14b247884cb8e8b79c64fb0f36b472912dbaea \ - --hash=sha256:e7820bb0d65e3ce1e3e70b6708c2f66143f55912fa02f4b618d0f08b61575f12 \ - --hash=sha256:e851a051f7260e6d688267eb039c81f05f23a19431bd7dfa4bf5e3cb34c108cd \ - --hash=sha256:e9f9feee7f334b72ceae46313333d002b56f325b5f04271b4ae2aadd9e993ae4 \ - --hash=sha256:eb40f828bc2f73f777d1eb8fee2e86cd9692a4518b63b6b5aa8af915dfd3207b \ - --hash=sha256:eb704155e73b833801c247f39d562229c0303f54770ca14fb1c053acb376cf10 \ - --hash=sha256:edb1bfd45227dec8d50bc7c7d86463cd8728bcc574f9b07de7369880de4626a3 \ - --hash=sha256:ee7d9d5537daf6d5c74a83b38a638cc001b648096c1cae8ef695b0c919d9d379 \ - --hash=sha256:f57783fbaf648205ac50ae7d646f27582fc706be3977e87c3c124e7a92407b10 \ - --hash=sha256:ff63a92f6e249514ef35bc795de10745be0226eaea06eb48b4bbeaa0c8850a4a - # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt +pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ 
+ --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + 
--hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + --hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + --hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + 
--hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + --hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + --hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + 
--hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + --hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + --hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock # pydantic pydantic-extra-types==2.10.5 \ 
--hash=sha256:1dcfa2c0cf741a422f088e0dbb4690e7bfadaaf050da3d6f80d6c3cf58a2bad8 \ @@ -3277,7 +3411,7 @@ pygments==2.18.0 \ --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipython # nbconvert # rich @@ -3286,7 +3420,7 @@ pyjwt==2.8.0 \ --hash=sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de \ --hash=sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # msal pynvml==12.0.0 \ --hash=sha256:299ce2451a6a17e6822d6faee750103e25b415f06f59abb8db65d30f794166f5 \ @@ -3296,20 +3430,20 @@ pyopenssl==25.0.0 \ --hash=sha256:424c247065e46e76a37411b9ab1782541c23bb658bf003772c3405fbaa128e90 \ --hash=sha256:cd2cef799efa3936bb08e8ccb9433a575722b9dd986023f1cabc4ae64e9dac16 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt pyparsing==3.1.1 \ --hash=sha256:32c7c0b711493c72ff18a981d24f28aaf9c1fb7ed5e9667c9e84e3db623bdbfb \ --hash=sha256:ede28a1a32462f5a9705e07aea48001a08f7cf81a021585011deba701581a0db # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # httplib2 pytest==7.4.4 \ --hash=sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280 \ --hash=sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/base-test-requirements.txt # -r python/requirements/llm/llm-test-requirements.txt # pytest-aiohttp @@ -3318,23 
+3452,24 @@ pytest-aiohttp==1.1.0 \ --hash=sha256:147de8cb164f3fc9d7196967f109ab3c0b93ea3463ab50631e56438eab7b5adc \ --hash=sha256:f39a11693a0dce08dd6c542d241e199dd8047a6e6596b2bcfa60d373f143456d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/base-test-requirements.txt pytest-asyncio==0.17.2 \ --hash=sha256:6d895b02432c028e6957d25fc936494e78c6305736e785d9fee408b1efbc7ff4 \ --hash=sha256:e0fe5dbea40516b661ef1bcfe0bd9461c2847c4ef4bb40012324f2454fb7d56d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/base-test-requirements.txt # pytest-aiohttp python-dateutil==2.8.2 \ --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # arrow # botocore + # celery # jupyter-client # pandas python-dotenv==1.1.0 \ @@ -3345,7 +3480,7 @@ python-json-logger==2.0.7 \ --hash=sha256:23e7ec02d34237c5aa1e29a070193a4ea87583bb4e7f8fd06d3de8264c4b2e1c \ --hash=sha256:f380b826a991ebbe3de4d897aeec42760035ac760345e57b812938dc8b35e2bd # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-events # vllm python-multipart==0.0.20 \ @@ -3356,7 +3491,7 @@ pytz==2022.7.1 \ --hash=sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0 \ --hash=sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # pandas pyyaml==6.0.1 \ --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ @@ -3411,7 +3546,7 
@@ pyyaml==6.0.1 \ --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \ --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # gguf @@ -3513,7 +3648,7 @@ pyzmq==26.0.3 \ --hash=sha256:f6b1d1c631e5940cac5a0b22c5379c86e8df6a4ec277c7a856b714021ab6cfad \ --hash=sha256:f6c21c00478a7bea93caaaef9e7629145d4153b15a8653e8bb4609d4bc70dbfc # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipykernel # jupyter-client # jupyter-server @@ -3524,7 +3659,7 @@ referencing==0.36.2 \ --hash=sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa \ --hash=sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jsonschema # jsonschema-specifications regex==2024.11.6 \ @@ -3630,10 +3765,11 @@ requests==2.32.3 \ --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # azure-core + # azure-datalake-store # google-api-core # google-cloud-storage # huggingface-hub @@ -3650,21 +3786,21 @@ rfc3339-validator==0.1.4 \ --hash=sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b \ --hash=sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jsonschema # jupyter-events 
rfc3986-validator==0.1.1 \ --hash=sha256:2f235c432ef459970b4306369336b9d5dbdda31b510ca1e327636e01f528bfa9 \ --hash=sha256:3d44bde7921b3b9ec3ae4e3adca370438eccebc676456449b145d533b240d055 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jsonschema # jupyter-events rich==13.3.2 \ --hash=sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001 \ --hash=sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt # memray @@ -3774,21 +3910,21 @@ rpds-py==0.22.3 \ --hash=sha256:fda7cb070f442bf80b642cd56483b5548e43d366fe3f39b98e67cce780cded00 \ --hash=sha256:feea821ee2a9273771bae61194004ee2fc33f8ec7db08117ef9147d4bbcbca8e # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jsonschema # referencing rsa==4.7.2 \ --hash=sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2 \ --hash=sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # google-auth # oauth2client -s3transfer==0.6.2 \ - --hash=sha256:b014be3a8a2aab98cfe1abc7229cc5a9a0cf05eb9c1f2b86b230fd8df3f78084 \ - --hash=sha256:cab66d3380cca3e70939ef2255d01cd8aece6a4907a9528740f668c4b0611861 +s3transfer==0.8.0 \ + --hash=sha256:baa479dc2e63e5c2ed51611b4d46cdf0295e2070d8d0b86b22f335ee5b954986 \ + --hash=sha256:e8d6bd52ffd99841e3a57b34370a54841f12d3aab072af862cdcc50955288002 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # boto3 safetensors==0.5.3 \ --hash=sha256:1077f3e94182d72618357b04b5ced540ceb71c8a813d3319f1aba448e68a770d \ @@ -3830,7 
+3966,7 @@ scikit-image==0.24.0 \ --hash=sha256:ef04360eda372ee5cd60aebe9be91258639c86ae2ea24093fb9182118008d009 \ --hash=sha256:fa27b3a0dbad807b966b8db2d78da734cb812ca4787f7fbb143764800ce2fa9c # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ @@ -3859,7 +3995,7 @@ scipy==1.11.4 \ --hash=sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be \ --hash=sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # scikit-image # vllm @@ -3867,7 +4003,7 @@ send2trash==1.8.3 \ --hash=sha256:0c31227e0bd08961c7665474a3d1ef7193929fedda4233843689baa056be46c9 \ --hash=sha256:b18e7a3966d99871aefeb00cfbcfdced55ce4871194810fc71f4aa484b953abf # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-server # nbclassic # notebook @@ -3929,17 +4065,116 @@ sentencepiece==0.2.0 \ # gguf # mistral-common # vllm +setproctitle==1.3.6 \ + --hash=sha256:082413db8a96b1f021088e8ec23f0a61fec352e649aba20881895815388b66d3 \ + --hash=sha256:0dba8faee2e4a96e934797c9f0f2d093f8239bf210406a99060b3eabe549628e \ + --hash=sha256:0e6b5633c94c5111f7137f875e8f1ff48f53b991d5d5b90932f27dc8c1fa9ae4 \ + --hash=sha256:1065ed36bd03a3fd4186d6c6de5f19846650b015789f72e2dea2d77be99bdca1 \ + --hash=sha256:109fc07b1cd6cef9c245b2028e3e98e038283342b220def311d0239179810dbe \ + --hash=sha256:13624d9925bb481bc0ccfbc7f533da38bfbfe6e80652314f789abc78c2e513bd \ + --hash=sha256:156795b3db976611d09252fc80761fcdb65bb7c9b9581148da900851af25ecf4 \ + --hash=sha256:163dba68f979c61e4e2e779c4d643e968973bdae7c33c3ec4d1869f7a9ba8390 \ + 
--hash=sha256:17d7c833ed6545ada5ac4bb606b86a28f13a04431953d4beac29d3773aa00b1d \ + --hash=sha256:18d0667bafaaae4c1dee831e2e59841c411ff399b9b4766822ba2685d419c3be \ + --hash=sha256:1aa1935aa2195b76f377e5cb018290376b7bf085f0b53f5a95c0c21011b74367 \ + --hash=sha256:2156d55308431ac3b3ec4e5e05b1726d11a5215352d6a22bb933171dee292f8c \ + --hash=sha256:23a57d3b8f1549515c2dbe4a2880ebc1f27780dc126c5e064167563e015817f5 \ + --hash=sha256:2407955dc359d735a20ac6e797ad160feb33d529a2ac50695c11a1ec680eafab \ + --hash=sha256:2940cf13f4fc11ce69ad2ed37a9f22386bfed314b98d8aebfd4f55459aa59108 \ + --hash=sha256:2e51ec673513465663008ce402171192a053564865c2fc6dc840620871a9bd7c \ + --hash=sha256:3393859eb8f19f5804049a685bf286cb08d447e28ba5c6d8543c7bf5500d5970 \ + --hash=sha256:3884002b3a9086f3018a32ab5d4e1e8214dd70695004e27b1a45c25a6243ad0b \ + --hash=sha256:38ca045626af693da042ac35d7332e7b9dbd52e6351d6973b310612e3acee6d6 \ + --hash=sha256:391bb6a29c4fe7ccc9c30812e3744060802d89b39264cfa77f3d280d7f387ea5 \ + --hash=sha256:3cca16fd055316a48f0debfcbfb6af7cea715429fc31515ab3fcac05abd527d8 \ + --hash=sha256:3cde5b83ec4915cd5e6ae271937fd60d14113c8f7769b4a20d51769fe70d8717 \ + --hash=sha256:3f8194b4d631b003a1176a75d1acd545e04b1f54b821638e098a93e6e62830ef \ + --hash=sha256:3fc97805f9d74444b027babff710bf39df1541437a6a585a983d090ae00cedde \ + --hash=sha256:4431629c178193f23c538cb1de3da285a99ccc86b20ee91d81eb5f1a80e0d2ba \ + --hash=sha256:49498ebf68ca3e75321ffe634fcea5cc720502bfaa79bd6b03ded92ce0dc3c24 \ + --hash=sha256:4ac3eb04bcf0119aadc6235a2c162bae5ed5f740e3d42273a7228b915722de20 \ + --hash=sha256:4adf6a0013fe4e0844e3ba7583ec203ca518b9394c6cc0d3354df2bf31d1c034 \ + --hash=sha256:4efc91b437f6ff2578e89e3f17d010c0a0ff01736606473d082913ecaf7859ba \ + --hash=sha256:50706b9c0eda55f7de18695bfeead5f28b58aa42fd5219b3b1692d554ecbc9ec \ + --hash=sha256:5313a4e9380e46ca0e2c681ba739296f9e7c899e6f4d12a6702b2dc9fb846a31 \ + --hash=sha256:543f59601a4e32daf44741b52f9a23e0ee374f9f13b39c41d917302d98fdd7b0 \ + 
--hash=sha256:57bc54763bf741813a99fbde91f6be138c8706148b7b42d3752deec46545d470 \ + --hash=sha256:63cc10352dc6cf35a33951656aa660d99f25f574eb78132ce41a85001a638aa7 \ + --hash=sha256:6a1d3aa13acfe81f355b0ce4968facc7a19b0d17223a0f80c011a1dba8388f37 \ + --hash=sha256:6af330ddc2ec05a99c3933ab3cba9365357c0b8470a7f2fa054ee4b0984f57d1 \ + --hash=sha256:6d50bfcc1d1692dc55165b3dd2f0b9f8fb5b1f7b571a93e08d660ad54b9ca1a5 \ + --hash=sha256:70100e2087fe05359f249a0b5f393127b3a1819bf34dec3a3e0d4941138650c9 \ + --hash=sha256:74973aebea3543ad033b9103db30579ec2b950a466e09f9c2180089e8346e0ec \ + --hash=sha256:751ba352ed922e0af60458e961167fa7b732ac31c0ddd1476a2dfd30ab5958c5 \ + --hash=sha256:785cd210c0311d9be28a70e281a914486d62bfd44ac926fcd70cf0b4d65dff1c \ + --hash=sha256:7890e291bf4708e3b61db9069ea39b3ab0651e42923a5e1f4d78a7b9e4b18301 \ + --hash=sha256:793a23e8d9cb6c231aa3023d700008224c6ec5b8fd622d50f3c51665e3d0a190 \ + --hash=sha256:797f2846b546a8741413c57d9fb930ad5aa939d925c9c0fa6186d77580035af7 \ + --hash=sha256:7df5fcc48588f82b6cc8073db069609ddd48a49b1e9734a20d0efb32464753c4 \ + --hash=sha256:8050c01331135f77ec99d99307bfbc6519ea24d2f92964b06f3222a804a3ff1f \ + --hash=sha256:805bb33e92fc3d8aa05674db3068d14d36718e3f2c5c79b09807203f229bf4b5 \ + --hash=sha256:807796fe301b7ed76cf100113cc008c119daf4fea2f9f43c578002aef70c3ebf \ + --hash=sha256:81c443310831e29fabbd07b75ebbfa29d0740b56f5907c6af218482d51260431 \ + --hash=sha256:83066ffbf77a5f82b7e96e59bdccbdda203c8dccbfc3f9f0fdad3a08d0001d9c \ + --hash=sha256:8834ab7be6539f1bfadec7c8d12249bbbe6c2413b1d40ffc0ec408692232a0c6 \ + --hash=sha256:92df0e70b884f5da35f2e01489dca3c06a79962fb75636985f1e3a17aec66833 \ + --hash=sha256:9483aa336687463f5497dd37a070094f3dff55e2c888994f8440fcf426a1a844 \ + --hash=sha256:97a138fa875c6f281df7720dac742259e85518135cd0e3551aba1c628103d853 \ + --hash=sha256:9b50700785eccac0819bea794d968ed8f6055c88f29364776b7ea076ac105c5d \ + --hash=sha256:9b73cf0fe28009a04a35bb2522e4c5b5176cc148919431dcb73fdbdfaab15781 \ + 
--hash=sha256:9d5a369eb7ec5b2fdfa9927530b5259dd21893fa75d4e04a223332f61b84b586 \ + --hash=sha256:a094b7ce455ca341b59a0f6ce6be2e11411ba6e2860b9aa3dbb37468f23338f4 \ + --hash=sha256:a0d6252098e98129a1decb59b46920d4eca17b0395f3d71b0d327d086fefe77d \ + --hash=sha256:a1d856b0f4e4a33e31cdab5f50d0a14998f3a2d726a3fd5cb7c4d45a57b28d1b \ + --hash=sha256:a4ae2ea9afcfdd2b931ddcebf1cf82532162677e00326637b31ed5dff7d985ca \ + --hash=sha256:a5963b663da69ad25fa1559ee064584935570def665917918938c1f1289f5ebc \ + --hash=sha256:ad1c2c2baaba62823a7f348f469a967ece0062140ca39e7a48e4bbb1f20d54c4 \ + --hash=sha256:ae82507fe458f7c0c8227017f2158111a4c9e7ce94de05178894a7ea9fefc8a1 \ + --hash=sha256:af188f3305f0a65c3217c30c6d4c06891e79144076a91e8b454f14256acc7279 \ + --hash=sha256:af44bb7a1af163806bbb679eb8432fa7b4fb6d83a5d403b541b675dcd3798638 \ + --hash=sha256:b0174ca6f3018ddeaa49847f29b69612e590534c1d2186d54ab25161ecc42975 \ + --hash=sha256:b2b17855ed7f994f3f259cf2dfbfad78814538536fa1a91b50253d84d87fd88d \ + --hash=sha256:b2e54f4a2dc6edf0f5ea5b1d0a608d2af3dcb5aa8c8eeab9c8841b23e1b054fe \ + --hash=sha256:b6f4abde9a2946f57e8daaf1160b2351bcf64274ef539e6675c1d945dbd75e2a \ + --hash=sha256:b70c07409d465f3a8b34d52f863871fb8a00755370791d2bd1d4f82b3cdaf3d5 \ + --hash=sha256:bb465dd5825356c1191a038a86ee1b8166e3562d6e8add95eec04ab484cfb8a2 \ + --hash=sha256:c051f46ed1e13ba8214b334cbf21902102807582fbfaf0fef341b9e52f0fafbf \ + --hash=sha256:c1b20a5f4164cec7007be55c9cf18d2cd08ed7c3bf6769b3cd6d044ad888d74b \ + --hash=sha256:c86e9e82bfab579327dbe9b82c71475165fbc8b2134d24f9a3b2edaf200a5c3d \ + --hash=sha256:c9f32b96c700bb384f33f7cf07954bb609d35dd82752cef57fb2ee0968409169 \ + --hash=sha256:cce0ed8b3f64c71c140f0ec244e5fdf8ecf78ddf8d2e591d4a8b6aa1c1214235 \ + --hash=sha256:cdd7315314b0744a7dd506f3bd0f2cf90734181529cdcf75542ee35ad885cab7 \ + --hash=sha256:cf355fbf0d4275d86f9f57be705d8e5eaa7f8ddb12b24ced2ea6cbd68fdb14dc \ + --hash=sha256:d136fbf8ad4321716e44d6d6b3d8dffb4872626010884e07a1db54b7450836cf \ + 
--hash=sha256:d2c8e20487b3b73c1fa72c56f5c89430617296cd380373e7af3a538a82d4cd6d \ + --hash=sha256:d483cc23cc56ab32911ea0baa0d2d9ea7aa065987f47de847a0a93a58bf57905 \ + --hash=sha256:d5a6c4864bb6fa9fcf7b57a830d21aed69fd71742a5ebcdbafda476be673d212 \ + --hash=sha256:d714e002dd3638170fe7376dc1b686dbac9cb712cde3f7224440af722cc9866a \ + --hash=sha256:d73f14b86d0e2858ece6bf5807c9889670e392c001d414b4293d0d9b291942c3 \ + --hash=sha256:d88c63bd395c787b0aa81d8bbc22c1809f311032ce3e823a6517b711129818e4 \ + --hash=sha256:db608db98ccc21248370d30044a60843b3f0f3d34781ceeea67067c508cd5a28 \ + --hash=sha256:de004939fc3fd0c1200d26ea9264350bfe501ffbf46c8cf5dc7f345f2d87a7f1 \ + --hash=sha256:ded9e86397267732a0641d4776c7c663ea16b64d7dbc4d9cc6ad8536363a2d29 \ + --hash=sha256:e288f8a162d663916060beb5e8165a8551312b08efee9cf68302687471a6545d \ + --hash=sha256:e2a9e62647dc040a76d55563580bf3bb8fe1f5b6ead08447c2ed0d7786e5e794 \ + --hash=sha256:e3e44d08b61de0dd6f205528498f834a51a5c06689f8fb182fe26f3a3ce7dca9 \ + --hash=sha256:ea002088d5554fd75e619742cefc78b84a212ba21632e59931b3501f0cfc8f67 \ + --hash=sha256:eb7452849f6615871eabed6560ffedfe56bc8af31a823b6be4ce1e6ff0ab72c5 \ + --hash=sha256:ebcf34b69df4ca0eabaaaf4a3d890f637f355fed00ba806f7ebdd2d040658c26 \ + --hash=sha256:f24d5b9383318cbd1a5cd969377937d66cf0542f24aa728a4f49d9f98f9c0da8 \ + --hash=sha256:f33fbf96b52d51c23b6cff61f57816539c1c147db270cfc1cc3bc012f4a560a9 + # via vllm shellingham==1.5.4 \ --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \ --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # typer six==1.16.0 \ --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c 
python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # asttokens # azure-core @@ -3954,20 +4189,20 @@ smart-open==6.2.0 \ --hash=sha256:088bf00f9327c71e549bc2f86567d3320df5d89667f009ce1c16568976068ef7 \ --hash=sha256:1b4df5c8365218f3852c507451920ccad606c80b0acb4e67508e50ba9b5d2632 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # -r python/requirements.txt smmap==5.0.1 \ --hash=sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62 \ --hash=sha256:e6d8668fa5f93e706934a62d7b4db19c8d9eb8cf2adbb75ef1b675aa332b69da # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # gitdb sniffio==1.3.1 \ --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # anyio # openai snowballstemmer==3.0.1 \ @@ -3988,7 +4223,7 @@ soupsieve==2.5 \ --hash=sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690 \ --hash=sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # beautifulsoup4 soxr==0.5.0.post1 \ --hash=sha256:39e0f791ba178d69cd676485dbee37e75a34f20daa478d90341ecb7f6d9d690f \ @@ -4045,19 +4280,19 @@ spinners==0.0.24 \ --hash=sha256:1eb6aeb4781d72ab42ed8a01dcf20f3002bf50740d7154d12fb8c9769bf9e27f \ --hash=sha256:2fa30d0b72c9650ad12bbe031c9943b8d441e41b4f5602b0ec977a19f3290e98 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # halo stack-data==0.6.3 \ 
--hash=sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9 \ --hash=sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipython starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # fastapi # prometheus-fastapi-instrumentator @@ -4069,25 +4304,25 @@ tabulate==0.9.0 \ --hash=sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c \ --hash=sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt tensorboardx==2.6.2.2 \ --hash=sha256:160025acbf759ede23fd3526ae9d9bfbfd8b68eb16c38a010ebe326dc6395db8 \ --hash=sha256:c6476d7cd0d529b0b72f4acadb1269f9ed8b22f441e87a84f2a3b940bb87b666 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt termcolor==2.4.0 \ --hash=sha256:9297c0df9c99445c2412e832e882a7884038a25617c60cea2ad69488d4040d63 \ --hash=sha256:aab9e56047c8ac41ed798fa36d892a37aca6b3e9159f3e0c24bc64a9b3ac7b7a # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # halo terminado==0.18.1 \ --hash=sha256:a4468e1b37bb318f8a86514f65814e1afc977cf29b3992a4500d9dd305dcceb0 \ --hash=sha256:de09f2c4b85de4765f7714688fff57d3e75bad1f909b589fde880460c753fd2e # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-server # 
nbclassic # notebook @@ -4095,7 +4330,7 @@ tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # scikit-image tiktoken==0.9.0 \ --hash=sha256:03935988a91d6d3216e2ec7c645afbb3d870b37bcb67ada1943ec48678e7ee33 \ @@ -4136,7 +4371,7 @@ tinycss2==1.3.0 \ --hash=sha256:152f9acabd296a8375fbca5b84c961ff95971fcfc32e79550c8df8e29118c54d \ --hash=sha256:54a8dbdffb334d536851be0226030e9505965bb2f30f21a4a82c55fb2a80fae7 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # nbconvert tokenizers==0.21.1 \ --hash=sha256:0f0dcbcc9f6e13e675a66d7a5f2f225a736745ce484c1a4e07476a89ccdad382 \ @@ -4255,7 +4490,7 @@ tornado==6.1 \ --hash=sha256:fa2ba70284fa42c2a5ecb35e322e68823288a4251f9ba9cc77be04ae15eada68 \ --hash=sha256:fba85b6cd9c39be262fcd23865652920832b61583de2a2ca907dbd8e8a8c81e5 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipykernel # jupyter-client # jupyter-server @@ -4267,7 +4502,7 @@ tqdm==4.67.1 \ --hash=sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2 \ --hash=sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # gguf # huggingface-hub @@ -4278,7 +4513,7 @@ traitlets==5.14.3 \ --hash=sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7 \ --hash=sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # comm # ipykernel # ipython @@ 
-4293,11 +4528,10 @@ traitlets==5.14.3 \ # nbconvert # nbformat # notebook -transformers==4.53.2 \ - --hash=sha256:6c3ed95edfb1cba71c4245758f1b4878c93bf8cde77d076307dacb2cbbd72be2 \ - --hash=sha256:db8f4819bb34f000029c73c3c557e7d06fc1b8e612ec142eecdae3947a9c78bf +transformers==4.55.2 \ + --hash=sha256:097e3c2e2c0c9681db3da9d748d8f9d6a724c644514673d0030e8c5a1109f1f1 \ + --hash=sha256:a45ec60c03474fd67adbce5c434685051b7608b3f4f167c25aa6aeb1cad16d4f # via - # -r python/requirements/llm/llm-requirements.txt # compressed-tensors # vllm # xgrammar @@ -4310,7 +4544,7 @@ typer==0.12.3 \ --hash=sha256:070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914 \ --hash=sha256:49e73131481d804288ef62598d97a1ceef3058905aa536a1134f90891ba35482 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/llm/llm-requirements.txt # -r python/requirements.txt # fastapi-cli @@ -4318,12 +4552,12 @@ types-python-dateutil==2.9.0.20240316 \ --hash=sha256:5d2f2e240b86905e40944dd787db6da9263f0deabef1076ddaed797351ec0202 \ --hash=sha256:6b8cb66d960771ce5ff974e9dd45e38facb81718cc1e208b10b1baccbfdbee3b # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # arrow typing-extensions==4.12.2 \ --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # azure-core # azure-identity # azure-storage-blob @@ -4342,24 +4576,37 @@ typing-extensions==4.12.2 \ # referencing # torch # typer + # typing-inspection # vllm +typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # pydantic +tzdata==2025.2 \ + 
--hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # kombu tzlocal==5.3 \ --hash=sha256:2fafbfc07e9d8b49ade18f898d6bcd37ae88ce3ad6486842a2e4f03af68323d2 \ --hash=sha256:3814135a1bb29763c6e4f08fd6e41dbb435c7a60bfbb03270211bcc537187d8c # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt uri-template==1.3.0 \ --hash=sha256:0e00f8eb65e18c7de20d595a14336e9f337ead580c70934141624b6d1ffdacc7 \ --hash=sha256:a44a133ea12d44a0c0f06d7d42a52d71282e77e2f937d8abd5655b8d56fc1363 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jsonschema urllib3==1.26.19 \ --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt # botocore # requests @@ -4367,7 +4614,7 @@ uvicorn==0.22.0 \ --hash=sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8 \ --hash=sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # fastapi # fastapi-cli @@ -4410,15 +4657,23 @@ uvloop==0.21.0 ; platform_python_implementation != 'PyPy' and sys_platform != 'c --hash=sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816 \ --hash=sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2 # via uvicorn +vine==5.1.0 \ + 
--hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c python/deplocks/llm/ray_test_py311_cu128.lock + # amqp + # celery + # kombu virtualenv==20.29.1 \ --hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ --hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt -vllm==0.10.0 \ - --hash=sha256:8ca37559d82b43b5e8c8248d2e4a1ecb51d6d4e5d517491d656df6491ed93dab \ - --hash=sha256:a44e9013db26082a82c3931ed8772ac884d6d60566d36ecdb0e8dc01c65b241a +vllm==0.10.1.1 \ + --hash=sha256:3099824ee4bdaa14c4c4f7178a092101a0ec206d4c9371edf295849b2b730a39 \ + --hash=sha256:8ca0dd985e1ceac8540e7719c654f1553b3ba8a43c685ac8d3fa1366ffb6443a # via -r python/requirements/llm/llm-requirements.txt watchfiles==0.19.0 \ --hash=sha256:0089c6dc24d436b373c3c57657bf4f9a453b13767150d17284fc6162b2791911 \ @@ -4444,7 +4699,7 @@ watchfiles==0.19.0 \ --hash=sha256:d9b073073e048081e502b6c6b0b88714c026a1a4c890569238d04aca5f9ca74b \ --hash=sha256:fac19dc9cbc34052394dbe81e149411a62e71999c0a19e1e09ce537867f95ae0 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt # uvicorn # vllm @@ -4452,26 +4707,26 @@ wcwidth==0.2.13 \ --hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \ --hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # prompt-toolkit webcolors==24.6.0 \ --hash=sha256:1d160d1de46b3e81e58d0a280d0c78b467dc80f47294b91b1ad8029d2cedb55b \ --hash=sha256:8cf5bc7e28defd1d48b9e83d5fc30741328305a8195c29a8e668fa45586568a1 # via - # 
-c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jsonschema webencodings==0.5.1 \ --hash=sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78 \ --hash=sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # bleach # tinycss2 websocket-client==1.8.0 \ --hash=sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526 \ --hash=sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-server websockets==15.0.1 \ --hash=sha256:0701bc3cfcb9164d04a14b149fd74be7347a530ad3bbf15ab2c678a2cd3dd9a2 \ @@ -4548,7 +4803,7 @@ widgetsnbextension==4.0.11 \ --hash=sha256:55d4d6949d100e0d08b94948a42efc3ed6dfdc0e9468b2c4b128c9a2ce3a7a36 \ --hash=sha256:8b22a8f1910bfd188e596fe7fc05dcbd87e810c8a4ba010bdb3da86637398474 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipywidgets wrapt==1.14.1 \ --hash=sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3 \ @@ -4626,7 +4881,7 @@ wrapt==1.14.1 \ --hash=sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015 \ --hash=sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements/cloud-requirements.txt xformers==0.0.31 ; platform_machine == 'x86_64' and sys_platform == 'linux' \ --hash=sha256:b2ea87e0651f46164cb3cd74face021bd1654229ca4f8c0baa03b8c477515c7a @@ -4735,7 +4990,7 @@ y-py==0.6.2 \ --hash=sha256:e92878cc05e844c8da937204bc34c2e6caf66709ce5936802fbfb35f04132892 \ 
--hash=sha256:ff32548e45e45bf3280ac1d28b3148337a5c6714c28db23aeb0693e33eba257e # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-ydoc # ypy-websocket yarl==1.18.3 \ @@ -4822,22 +5077,21 @@ yarl==1.18.3 \ --hash=sha256:fe57328fbc1bfd0bd0514470ac692630f3901c0ee39052ae47acd1d90a436719 \ --hash=sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # aiohttp ypy-websocket==0.8.4 \ --hash=sha256:43a001473f5c8abcf182f603049cf305cbc855ad8deaa9dfa0f3b5a7cea9d0ff \ --hash=sha256:b1ba0dfcc9762f0ca168d2378062d3ca1299d39076b0f145d961359121042be5 # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-server-ydoc zipp==3.19.2 \ --hash=sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19 \ --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c # via - # -c python/requirements_compiled_ray_test_py311_cu128.txt + # -c python/deplocks/llm/ray_test_py311_cu128.lock # importlib-metadata # The following packages were excluded from the output: -# ray -# grpcio-tools # setuptools +# ray diff --git a/python/deplocks/ray_img/ray_img_py310.lock b/python/deplocks/ray_img/ray_img_py310.lock new file mode 100644 index 000000000000..c6d9fab2624b --- /dev/null +++ b/python/deplocks/ray_img/ray_img_py310.lock @@ -0,0 +1,2173 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --python-version=3.10 --find-links=.whl/ -c python/requirements_compiled.txt - -o python/deplocks/ray_img/ray_img_py310.lock 
+--index-url https://pypi.org/simple +--extra-index-url https://download.pytorch.org/whl/cpu +--find-links .whl/ +--find-links https://data.pyg.org/whl/torch-2.3.0+cpu.html + +aiohappyeyeballs==2.6.1 \ + --hash=sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558 \ + --hash=sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8 + # via + # -c python/requirements_compiled.txt + # aiohttp +aiohttp==3.11.16 \ + --hash=sha256:004511d3413737700835e949433536a2fe95a7d0297edd911a1e9705c5b5ea43 \ + --hash=sha256:0902e887b0e1d50424112f200eb9ae3dfed6c0d0a19fc60f633ae5a57c809656 \ + --hash=sha256:09b00dd520d88eac9d1768439a59ab3d145065c91a8fab97f900d1b5f802895e \ + --hash=sha256:0a2f451849e6b39e5c226803dcacfa9c7133e9825dcefd2f4e837a2ec5a3bb98 \ + --hash=sha256:0a950c2eb8ff17361abd8c85987fd6076d9f47d040ebffce67dce4993285e973 \ + --hash=sha256:0ad1fb47da60ae1ddfb316f0ff16d1f3b8e844d1a1e154641928ea0583d486ed \ + --hash=sha256:13ceac2c5cdcc3f64b9015710221ddf81c900c5febc505dbd8f810e770011540 \ + --hash=sha256:14461157d8426bcb40bd94deb0450a6fa16f05129f7da546090cebf8f3123b0f \ + --hash=sha256:16f8a2c9538c14a557b4d309ed4d0a7c60f0253e8ed7b6c9a2859a7582f8b1b8 \ + --hash=sha256:17ae4664031aadfbcb34fd40ffd90976671fa0c0286e6c4113989f78bebab37a \ + --hash=sha256:1ce63ae04719513dd2651202352a2beb9f67f55cb8490c40f056cea3c5c355ce \ + --hash=sha256:23a15727fbfccab973343b6d1b7181bfb0b4aa7ae280f36fd2f90f5476805682 \ + --hash=sha256:2540ddc83cc724b13d1838026f6a5ad178510953302a49e6d647f6e1de82bc34 \ + --hash=sha256:37dcee4906454ae377be5937ab2a66a9a88377b11dd7c072df7a7c142b63c37c \ + --hash=sha256:38bea84ee4fe24ebcc8edeb7b54bf20f06fd53ce4d2cc8b74344c5b9620597fd \ + --hash=sha256:3ab3367bb7f61ad18793fea2ef71f2d181c528c87948638366bf1de26e239183 \ + --hash=sha256:3ad1d59fd7114e6a08c4814983bb498f391c699f3c78712770077518cae63ff7 \ + --hash=sha256:3b4e6db8dc4879015b9955778cfb9881897339c8fab7b3676f8433f849425913 \ + 
--hash=sha256:3e061b09f6fa42997cf627307f220315e313ece74907d35776ec4373ed718b86 \ + --hash=sha256:42864e70a248f5f6a49fdaf417d9bc62d6e4d8ee9695b24c5916cb4bb666c802 \ + --hash=sha256:493910ceb2764f792db4dc6e8e4b375dae1b08f72e18e8f10f18b34ca17d0979 \ + --hash=sha256:4d0c970c0d602b1017e2067ff3b7dac41c98fef4f7472ec2ea26fd8a4e8c2149 \ + --hash=sha256:54eb3aead72a5c19fad07219acd882c1643a1027fbcdefac9b502c267242f955 \ + --hash=sha256:56a3443aca82abda0e07be2e1ecb76a050714faf2be84256dae291182ba59049 \ + --hash=sha256:576f5ca28d1b3276026f7df3ec841ae460e0fc3aac2a47cbf72eabcfc0f102e1 \ + --hash=sha256:58ede86453a6cf2d6ce40ef0ca15481677a66950e73b0a788917916f7e35a0bb \ + --hash=sha256:61c721764e41af907c9d16b6daa05a458f066015abd35923051be8705108ed17 \ + --hash=sha256:634d96869be6c4dc232fc503e03e40c42d32cfaa51712aee181e922e61d74814 \ + --hash=sha256:696ef00e8a1f0cec5e30640e64eca75d8e777933d1438f4facc9c0cdf288a810 \ + --hash=sha256:69a2cbd61788d26f8f1e626e188044834f37f6ae3f937bd9f08b65fc9d7e514e \ + --hash=sha256:6a792ce34b999fbe04a7a71a90c74f10c57ae4c51f65461a411faa70e154154e \ + --hash=sha256:6ac13b71761e49d5f9e4d05d33683bbafef753e876e8e5a7ef26e937dd766713 \ + --hash=sha256:6fdec0213244c39973674ca2a7f5435bf74369e7d4e104d6c7473c81c9bcc8c4 \ + --hash=sha256:72b1b03fb4655c1960403c131740755ec19c5898c82abd3961c364c2afd59fe7 \ + --hash=sha256:745f1ed5e2c687baefc3c5e7b4304e91bf3e2f32834d07baaee243e349624b24 \ + --hash=sha256:776c8e959a01e5e8321f1dec77964cb6101020a69d5a94cd3d34db6d555e01f7 \ + --hash=sha256:780df0d837276276226a1ff803f8d0fa5f8996c479aeef52eb040179f3156cbd \ + --hash=sha256:78e6e23b954644737e385befa0deb20233e2dfddf95dd11e9db752bdd2a294d3 \ + --hash=sha256:7951decace76a9271a1ef181b04aa77d3cc309a02a51d73826039003210bdc86 \ + --hash=sha256:7ba92a2d9ace559a0a14b03d87f47e021e4fa7681dc6970ebbc7b447c7d4b7cd \ + --hash=sha256:7f6428fee52d2bcf96a8aa7b62095b190ee341ab0e6b1bcf50c615d7966fd45b \ + --hash=sha256:87944bd16b7fe6160607f6a17808abd25f17f61ae1e26c47a491b970fb66d8cb \ + 
--hash=sha256:87a6e922b2b2401e0b0cf6b976b97f11ec7f136bfed445e16384fbf6fd5e8602 \ + --hash=sha256:8cb0688a8d81c63d716e867d59a9ccc389e97ac7037ebef904c2b89334407180 \ + --hash=sha256:8df6612df74409080575dca38a5237282865408016e65636a76a2eb9348c2567 \ + --hash=sha256:911a6e91d08bb2c72938bc17f0a2d97864c531536b7832abee6429d5296e5b27 \ + --hash=sha256:92b7ee222e2b903e0a4b329a9943d432b3767f2d5029dbe4ca59fb75223bbe2e \ + --hash=sha256:938f756c2b9374bbcc262a37eea521d8a0e6458162f2a9c26329cc87fdf06534 \ + --hash=sha256:9756d9b9d4547e091f99d554fbba0d2a920aab98caa82a8fb3d3d9bee3c9ae85 \ + --hash=sha256:98b88a2bf26965f2015a771381624dd4b0839034b70d406dc74fd8be4cc053e3 \ + --hash=sha256:9b751a6306f330801665ae69270a8a3993654a85569b3469662efaad6cf5cc50 \ + --hash=sha256:a2a450bcce4931b295fc0848f384834c3f9b00edfc2150baafb4488c27953de6 \ + --hash=sha256:a3814760a1a700f3cfd2f977249f1032301d0a12c92aba74605cfa6ce9f78489 \ + --hash=sha256:a5abcbba9f4b463a45c8ca8b7720891200658f6f46894f79517e6cd11f3405ca \ + --hash=sha256:a6db7458ab89c7d80bc1f4e930cc9df6edee2200127cfa6f6e080cf619eddfbd \ + --hash=sha256:ad497f38a0d6c329cb621774788583ee12321863cd4bd9feee1effd60f2ad133 \ + --hash=sha256:ad9509ffb2396483ceacb1eee9134724443ee45b92141105a4645857244aecc8 \ + --hash=sha256:bbcba75fe879ad6fd2e0d6a8d937f34a571f116a0e4db37df8079e738ea95c71 \ + --hash=sha256:c10d85e81d0b9ef87970ecbdbfaeec14a361a7fa947118817fcea8e45335fa46 \ + --hash=sha256:c15b2271c44da77ee9d822552201180779e5e942f3a71fb74e026bf6172ff287 \ + --hash=sha256:ca37057625693d097543bd88076ceebeb248291df9d6ca8481349efc0b05dcd0 \ + --hash=sha256:cc3a145479a76ad0ed646434d09216d33d08eef0d8c9a11f5ae5cdc37caa3540 \ + --hash=sha256:ccf10f16ab498d20e28bc2b5c1306e9c1512f2840f7b6a67000a517a4b37d5ee \ + --hash=sha256:cd464ba806e27ee24a91362ba3621bfc39dbbb8b79f2e1340201615197370f7c \ + --hash=sha256:d007aa39a52d62373bd23428ba4a2546eed0e7643d7bf2e41ddcefd54519842c \ + --hash=sha256:d0666afbe984f6933fe72cd1f1c3560d8c55880a0bdd728ad774006eb4241ecd \ + 
--hash=sha256:d07502cc14ecd64f52b2a74ebbc106893d9a9717120057ea9ea1fd6568a747e7 \ + --hash=sha256:d489d9778522fbd0f8d6a5c6e48e3514f11be81cb0a5954bdda06f7e1594b321 \ + --hash=sha256:df7db76400bf46ec6a0a73192b14c8295bdb9812053f4fe53f4e789f3ea66bbb \ + --hash=sha256:e3538bc9fe1b902bef51372462e3d7c96fce2b566642512138a480b7adc9d508 \ + --hash=sha256:e87fd812899aa78252866ae03a048e77bd11b80fb4878ce27c23cade239b42b2 \ + --hash=sha256:ecdb8173e6c7aa09eee342ac62e193e6904923bd232e76b4157ac0bfa670609f \ + --hash=sha256:f244b8e541f414664889e2c87cac11a07b918cb4b540c36f7ada7bfa76571ea2 \ + --hash=sha256:f4065145bf69de124accdd17ea5f4dc770da0a6a6e440c53f6e0a8c27b3e635c \ + --hash=sha256:f420bfe862fb357a6d76f2065447ef6f484bc489292ac91e29bc65d2d7a2c84d \ + --hash=sha256:f6ddd90d9fb4b501c97a4458f1c1720e42432c26cb76d28177c5b5ad4e332601 \ + --hash=sha256:fa73e8c2656a3653ae6c307b3f4e878a21f87859a9afab228280ddccd7369d71 \ + --hash=sha256:fadbb8f1d4140825069db3fedbbb843290fd5f5bc0a5dbd7eaf81d91bf1b003b \ + --hash=sha256:fb3d0cc5cdb926090748ea60172fa8a213cec728bd6c54eae18b96040fcd6227 \ + --hash=sha256:fb46bb0f24813e6cede6cc07b1961d4b04f331f7112a23b5e21f567da4ee50aa \ + --hash=sha256:fd36c119c5d6551bce374fcb5c19269638f8d09862445f85a5a48596fd59f4bb + # via + # -c python/requirements_compiled.txt + # aiohttp-cors + # ray +aiohttp-cors==0.7.0 \ + --hash=sha256:0451ba59fdf6909d0e2cd21e4c0a43752bc0703d33fc78ae94d9d9321710193e \ + --hash=sha256:4d39c6d7100fd9764ed1caf8cebf0eb01bf5e3f24e2e073fda6234bc48b19f5d + # via + # -c python/requirements_compiled.txt + # ray +aiosignal==1.3.1 \ + --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ + --hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 + # via + # -c python/requirements_compiled.txt + # aiohttp +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c 
python/requirements_compiled.txt + # kombu +annotated-types==0.6.0 \ + --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ + --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d + # via + # -c python/requirements_compiled.txt + # pydantic +anyio==3.7.1 \ + --hash=sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780 \ + --hash=sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5 + # via + # -c python/requirements_compiled.txt + # starlette + # watchfiles +async-timeout==4.0.3 ; python_full_version < '3.11' \ + --hash=sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f \ + --hash=sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028 + # via + # -c python/requirements_compiled.txt + # aiohttp +attrs==25.1.0 \ + --hash=sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e \ + --hash=sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a + # via + # -c python/requirements_compiled.txt + # aiohttp + # jsonschema + # referencing +billiard==4.2.1 \ + --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c python/requirements_compiled.txt + # celery +cachetools==5.5.2 \ + --hash=sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4 \ + --hash=sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a + # via + # -c python/requirements_compiled.txt + # google-auth +celery==5.5.3 \ + --hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c python/requirements_compiled.txt + # ray +certifi==2025.1.31 \ + --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ + 
--hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe + # via + # -c python/requirements_compiled.txt + # requests +cffi==1.16.0 ; platform_python_implementation != 'PyPy' \ + --hash=sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc \ + --hash=sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a \ + --hash=sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417 \ + --hash=sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab \ + --hash=sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520 \ + --hash=sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36 \ + --hash=sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743 \ + --hash=sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8 \ + --hash=sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed \ + --hash=sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684 \ + --hash=sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56 \ + --hash=sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324 \ + --hash=sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d \ + --hash=sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235 \ + --hash=sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e \ + --hash=sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088 \ + --hash=sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000 \ + --hash=sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7 \ + --hash=sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e \ + --hash=sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673 \ + --hash=sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c \ + 
--hash=sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe \ + --hash=sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2 \ + --hash=sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098 \ + --hash=sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8 \ + --hash=sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a \ + --hash=sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0 \ + --hash=sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b \ + --hash=sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896 \ + --hash=sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e \ + --hash=sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9 \ + --hash=sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2 \ + --hash=sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b \ + --hash=sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6 \ + --hash=sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404 \ + --hash=sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f \ + --hash=sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0 \ + --hash=sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4 \ + --hash=sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc \ + --hash=sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936 \ + --hash=sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba \ + --hash=sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872 \ + --hash=sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb \ + --hash=sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614 \ + --hash=sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1 \ + 
--hash=sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d \ + --hash=sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969 \ + --hash=sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b \ + --hash=sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4 \ + --hash=sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627 \ + --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ + --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 + # via + # -c python/requirements_compiled.txt + # cryptography +charset-normalizer==3.3.2 \ + --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ + --hash=sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087 \ + --hash=sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786 \ + --hash=sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8 \ + --hash=sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09 \ + --hash=sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185 \ + --hash=sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574 \ + --hash=sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e \ + --hash=sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519 \ + --hash=sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898 \ + --hash=sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269 \ + --hash=sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3 \ + --hash=sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f \ + --hash=sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6 \ + --hash=sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8 \ + --hash=sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a \ 
+ --hash=sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73 \ + --hash=sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc \ + --hash=sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714 \ + --hash=sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2 \ + --hash=sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc \ + --hash=sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce \ + --hash=sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d \ + --hash=sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e \ + --hash=sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6 \ + --hash=sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269 \ + --hash=sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96 \ + --hash=sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d \ + --hash=sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a \ + --hash=sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4 \ + --hash=sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77 \ + --hash=sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d \ + --hash=sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0 \ + --hash=sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed \ + --hash=sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068 \ + --hash=sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac \ + --hash=sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25 \ + --hash=sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8 \ + --hash=sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab \ + --hash=sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26 \ + 
--hash=sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2 \ + --hash=sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db \ + --hash=sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f \ + --hash=sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5 \ + --hash=sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99 \ + --hash=sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c \ + --hash=sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d \ + --hash=sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811 \ + --hash=sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa \ + --hash=sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a \ + --hash=sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03 \ + --hash=sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b \ + --hash=sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04 \ + --hash=sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c \ + --hash=sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001 \ + --hash=sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458 \ + --hash=sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389 \ + --hash=sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99 \ + --hash=sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985 \ + --hash=sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537 \ + --hash=sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238 \ + --hash=sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f \ + --hash=sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d \ + --hash=sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796 \ + 
--hash=sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a \ + --hash=sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143 \ + --hash=sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8 \ + --hash=sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c \ + --hash=sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5 \ + --hash=sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5 \ + --hash=sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711 \ + --hash=sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4 \ + --hash=sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6 \ + --hash=sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c \ + --hash=sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7 \ + --hash=sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4 \ + --hash=sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b \ + --hash=sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae \ + --hash=sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12 \ + --hash=sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c \ + --hash=sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae \ + --hash=sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8 \ + --hash=sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887 \ + --hash=sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b \ + --hash=sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4 \ + --hash=sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f \ + --hash=sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5 \ + --hash=sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33 \ + 
--hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ + --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 + # via + # -c python/requirements_compiled.txt + # requests +click==8.1.7 \ + --hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ + --hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de + # via + # -c python/requirements_compiled.txt + # celery + # click-didyoumean + # click-plugins + # click-repl + # ray + # uvicorn +click-didyoumean==0.3.1 \ + --hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c python/requirements_compiled.txt + # celery +click-plugins==1.1.1.2 \ + --hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c python/requirements_compiled.txt + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via + # -c python/requirements_compiled.txt + # celery +cloudpickle==2.2.0 \ + --hash=sha256:3f4219469c55453cfe4737e564b67c2a149109dabf7f242478948b895f61106f \ + --hash=sha256:7428798d5926d8fcbfd092d18d01a2a03daf8237d8fcdc8095d256b8490796f0 + # via + # -c python/requirements_compiled.txt + # gymnasium +colorful==0.5.5 \ + --hash=sha256:62c187e27c1433db9463ff93b1451898d1e7e23a7e553583fd9daeb6325182e4 \ + --hash=sha256:66f8c1264b2a26f7293b96a03bb7a76c4bc8b9634369a0bffdcd12d618056a1d + # via + # -c python/requirements_compiled.txt + # ray +cryptography==44.0.3 \ + --hash=sha256:02f55fb4f8b79c1221b0961488eaae21015b69b210e18c386b69de182ebb1259 \ + --hash=sha256:157f1f3b8d941c2bd8f3ffee0af9b049c9665c39d3da9db2dc338feca5e98a43 \ + 
--hash=sha256:192ed30fac1728f7587c6f4613c29c584abdc565d7417c13904708db10206645 \ + --hash=sha256:21a83f6f35b9cc656d71b5de8d519f566df01e660ac2578805ab245ffd8523f8 \ + --hash=sha256:25cd194c39fa5a0aa4169125ee27d1172097857b27109a45fadc59653ec06f44 \ + --hash=sha256:3883076d5c4cc56dbef0b898a74eb6992fdac29a7b9013870b34efe4ddb39a0d \ + --hash=sha256:3bb0847e6363c037df8f6ede57d88eaf3410ca2267fb12275370a76f85786a6f \ + --hash=sha256:3be3f649d91cb182c3a6bd336de8b61a0a71965bd13d1a04a0e15b39c3d5809d \ + --hash=sha256:3f07943aa4d7dad689e3bb1638ddc4944cc5e0921e3c227486daae0e31a05e54 \ + --hash=sha256:479d92908277bed6e1a1c69b277734a7771c2b78633c224445b5c60a9f4bc1d9 \ + --hash=sha256:4ffc61e8f3bf5b60346d89cd3d37231019c17a081208dfbbd6e1605ba03fa137 \ + --hash=sha256:5639c2b16764c6f76eedf722dbad9a0914960d3489c0cc38694ddf9464f1bb2f \ + --hash=sha256:58968d331425a6f9eedcee087f77fd3c927c88f55368f43ff7e0a19891f2642c \ + --hash=sha256:5d186f32e52e66994dce4f766884bcb9c68b8da62d61d9d215bfe5fb56d21334 \ + --hash=sha256:5d20cc348cca3a8aa7312f42ab953a56e15323800ca3ab0706b8cd452a3a056c \ + --hash=sha256:6866df152b581f9429020320e5eb9794c8780e90f7ccb021940d7f50ee00ae0b \ + --hash=sha256:7d5fe7195c27c32a64955740b949070f21cba664604291c298518d2e255931d2 \ + --hash=sha256:896530bc9107b226f265effa7ef3f21270f18a2026bc09fed1ebd7b66ddf6375 \ + --hash=sha256:962bc30480a08d133e631e8dfd4783ab71cc9e33d5d7c1e192f0b7c06397bb88 \ + --hash=sha256:978631ec51a6bbc0b7e58f23b68a8ce9e5f09721940933e9c217068388789fe5 \ + --hash=sha256:9b4d4a5dbee05a2c390bf212e78b99434efec37b17a4bff42f50285c5c8c9647 \ + --hash=sha256:ab0b005721cc0039e885ac3503825661bd9810b15d4f374e473f8c89b7d5460c \ + --hash=sha256:af653022a0c25ef2e3ffb2c673a50e5a0d02fecc41608f4954176f1933b12359 \ + --hash=sha256:b0cc66c74c797e1db750aaa842ad5b8b78e14805a9b5d1348dc603612d3e3ff5 \ + --hash=sha256:b424563394c369a804ecbee9b06dfb34997f19d00b3518e39f83a5642618397d \ + --hash=sha256:c138abae3a12a94c75c10499f1cbae81294a6f983b3af066390adee73f433028 \ + 
--hash=sha256:c6cd67722619e4d55fdb42ead64ed8843d64638e9c07f4011163e46bc512cf01 \ + --hash=sha256:c91fc8e8fd78af553f98bc7f2a1d8db977334e4eea302a4bfd75b9461c2d8904 \ + --hash=sha256:cad399780053fb383dc067475135e41c9fe7d901a97dd5d9c5dfb5611afc0d7d \ + --hash=sha256:cb90f60e03d563ca2445099edf605c16ed1d5b15182d21831f58460c48bffb93 \ + --hash=sha256:dad80b45c22e05b259e33ddd458e9e2ba099c86ccf4e88db7bbab4b747b18d06 \ + --hash=sha256:dd3db61b8fe5be220eee484a17233287d0be6932d056cf5738225b9c05ef4fff \ + --hash=sha256:e28d62e59a4dbd1d22e747f57d4f00c459af22181f0b2f787ea83f5a876d7c76 \ + --hash=sha256:e909df4053064a97f1e6565153ff8bb389af12c5c8d29c343308760890560aff \ + --hash=sha256:f3ffef566ac88f75967d7abd852ed5f182da252d23fac11b4766da3957766759 \ + --hash=sha256:fc3c9babc1e1faefd62704bb46a69f359a9819eb0292e40df3fb6e3574715cd4 \ + --hash=sha256:fe19d8bc5536a91a24a8133328880a41831b6c5df54599a8417b62fe015d3053 + # via + # -c python/requirements_compiled.txt + # pyopenssl +cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ + --hash=sha256:230f8a8e99c81a653baa0ed00819990c0ed1f0cf0298214786b5e323461dc61a \ + --hash=sha256:2d16eaa2d086e416ac13467d4ff3184b9a081fe76b761ce51d4a46ec1c4bd28a \ + --hash=sha256:432273fd4b61a284f7d705d08b8291403548fd422bcbd945635cc155bc6a923d \ + --hash=sha256:4c51a1062a3c5a826b0425952d229ffe73b1791656a31de95b318117e67a9576 \ + --hash=sha256:4c8e9fdb1f3ffc3151808f8bb8c871518d2783e1be8b53792b698a840543d60c \ + --hash=sha256:51b1d6cb83d82dfa306c9efaeb4d57f24bad3041ebd8716d61072676abbcf67b \ + --hash=sha256:52185a2cf95d3bac2c3fda95c9c8e06a985b5a00cd2e587d3caace337db33899 \ + --hash=sha256:5afb6658faa22f21479ae2c0a07254df31c0aebc36907a64a1f6be4ecc9e96da \ + --hash=sha256:d3dc91ef9c4104652195eea4b282d343ecad653021efe20d1c8dd8dfe8ccfd86 \ + --hash=sha256:d60d1e124592cb82a5f3f45b3e7bee7bda7b72a743029f275e9d6b125f338c60 \ + --hash=sha256:dac0284fecb90b5731f514e569a6fcf6674a730ae95b9490781a713b60a34423 \ + 
--hash=sha256:e7a25ef1b44ae6276b5105affc2289edb34f1aa6676babd5bcd80907348c4cfa + # via + # -c python/requirements_compiled.txt + # ray +distlib==0.3.7 \ + --hash=sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057 \ + --hash=sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8 + # via + # -c python/requirements_compiled.txt + # virtualenv +dm-tree==0.1.8 \ + --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ + --hash=sha256:09964470f76a5201aff2e8f9b26842976de7889300676f927930f6285e256760 \ + --hash=sha256:0d3172394079a86c3a759179c65f64c48d1a42b89495fcf38976d11cc3bb952c \ + --hash=sha256:0e9620ccf06393eb6b613b5e366469304622d4ea96ae6540b28a33840e6c89cf \ + --hash=sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430 \ + --hash=sha256:1607ce49aa42f010d1e5e616d92ce899d66835d4d8bea49679582435285515de \ + --hash=sha256:181c35521d480d0365f39300542cb6cd7fd2b77351bb43d7acfda15aef63b317 \ + --hash=sha256:1d7c26e431fc93cc7e0cba867eb000db6a05f6f2b25af11ac4e9dada88fc5bca \ + --hash=sha256:1fe962015b2fe1282892b28ebe962faed53c7f98d942da9a4625cbf27baef913 \ + --hash=sha256:250b692fb75f45f02e2f58fbef9ab338904ef334b90557565621fa251df267cf \ + --hash=sha256:2869228d9c619074de501a3c10dc7f07c75422f8fab36ecdcb859b6f1b1ec3ef \ + --hash=sha256:28c52cbf4f8b3dbd0beaedf44f69fa85eec5e9dede612e08035e06ada6ec9426 \ + --hash=sha256:2f7915660f59c09068e428613c480150180df1060561fd0d1470684ae7007bd1 \ + --hash=sha256:343a4a4ebaa127451ff971254a4be4084eb4bdc0b2513c32b46f6f728fd03f9e \ + --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ + --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ + --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ + --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ + 
--hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ + --hash=sha256:75c5d528bb992981c20793b6b453e91560784215dffb8a5440ba999753c14ceb \ + --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ + --hash=sha256:81fce77f22a302d7a5968aebdf4efafef4def7ce96528719a354e6990dcd49c7 \ + --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ + --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ + --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ + --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ + --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ + --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ + --hash=sha256:b095ba4f8ca1ba19350fd53cf1f8f3eb0bd406aa28af64a6dfc86707b32a810a \ + --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ + --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ + --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ + --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ + --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ + --hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ + --hash=sha256:d20f2faa3672b52e5013f4077117bfb99c4cfc0b445d3bde1584c34032b57436 \ + --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ + --hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ + --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + 
--hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ + --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ + --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d + # via + # -c python/requirements_compiled.txt + # ray +exceptiongroup==1.3.0 ; python_full_version < '3.11' \ + --hash=sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10 \ + --hash=sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88 + # via anyio +farama-notifications==0.0.4 \ + --hash=sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18 \ + --hash=sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae + # via + # -c python/requirements_compiled.txt + # gymnasium +fastapi==0.115.12 \ + --hash=sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681 \ + --hash=sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d + # via + # -c python/requirements_compiled.txt + # ray +fastrlock==0.8.2 ; sys_platform != 'darwin' \ + --hash=sha256:067edb0a0805bf61e17a251d5046af59f6e9d2b8ad01222e0ef7a0b7937d5548 \ + --hash=sha256:07ed3c7b3867c05a3d6be4ced200c7767000f3431b9be6da66972822dd86e8be \ + --hash=sha256:08315bde19d0c2e6b06593d5a418be3dc8f9b1ee721afa96867b9853fceb45cf \ + --hash=sha256:11bbbbc526363955aeddb9eec4cee2a0012322b7b2f15b54f44454fcf4fd398a \ + --hash=sha256:17734e2e5af4c07ddb0fb10bd484e062c22de3be6b67940b9cc6ec2f18fa61ba \ + --hash=sha256:1b15430b93d7eb3d56f6ff690d2ebecb79ed0e58248427717eba150a508d1cd7 \ + --hash=sha256:1fed2f4797ad68e9982038423018cf08bec5f4ce9fed63a94a790773ed6a795c \ + --hash=sha256:2074548a335fcf7d19ebb18d9208da9e33b06f745754466a7e001d2b1c58dd19 \ + --hash=sha256:2587cedbb36c7988e707d83f0f1175c1f882f362b5ebbee25d70218ea33d220d \ + --hash=sha256:25945f962c7bd808415cfde3da624d4399d4ea71ed8918538375f16bceb79e1c \ + --hash=sha256:27786c62a400e282756ae1b090bcd7cfa35f28270cff65a9e7b27a5327a32561 \ + 
--hash=sha256:2c1719ddc8218b01e82fb2e82e8451bd65076cb96d7bef4477194bbb4305a968 \ + --hash=sha256:2d5595903444c854b99c42122b87edfe8a37cd698a4eae32f4fd1d2a7b6c115d \ + --hash=sha256:30bdbe4662992348132d03996700e1cf910d141d629179b967b146a22942264e \ + --hash=sha256:31a27a2edf482df72b91fe6c6438314d2c65290aa7becc55589d156c9b91f0da \ + --hash=sha256:320fd55bafee3eb069cfb5d6491f811a912758387ef2193840e2663e80e16f48 \ + --hash=sha256:33145acbad8317584cd64588131c7e1e286beef6280c0009b4544c91fce171d2 \ + --hash=sha256:43a241655e83e4603a152192cf022d5ca348c2f4e56dfb02e5c9c4c1a32f9cdb \ + --hash=sha256:4d63b6596368dab9e0cc66bf047e7182a56f33b34db141816a4f21f5bf958228 \ + --hash=sha256:4fb04442b6d1e2b36c774919c6bcbe3339c61b337261d4bd57e27932589095af \ + --hash=sha256:4fb2e77ff04bc4beb71d63c8e064f052ce5a6ea1e001d528d4d7f4b37d736f2e \ + --hash=sha256:5460c5ee6ced6d61ec8cd2324ebbe793a4960c4ffa2131ffff480e3b61c99ec5 \ + --hash=sha256:59344c1d46b7dec97d3f22f1cc930fafe8980b3c5bc9c9765c56738a5f1559e4 \ + --hash=sha256:5dfb78dd600a12f23fc0c3ec58f81336229fdc74501ecf378d1ce5b3f2f313ea \ + --hash=sha256:643e1e65b4f5b284427e61a894d876d10459820e93aa1e724dfb415117be24e0 \ + --hash=sha256:644ec9215cf9c4df8028d8511379a15d9c1af3e16d80e47f1b6fdc6ba118356a \ + --hash=sha256:66f2662c640bb71a1016a031eea6eef9d25c2bcdf7ffd1d1ddc5a58f9a1ced04 \ + --hash=sha256:685e656048b59d8dfde8c601f188ad53a4d719eb97080cafc8696cda6d75865e \ + --hash=sha256:7269bb3fc15587b0c191eecd95831d771a7d80f0c48929e560806b038ff3066c \ + --hash=sha256:73426f5eb2ecc10626c67cf86bd0af9e00d53e80e5c67d5ce8e18376d6abfa09 \ + --hash=sha256:75c07726c8b1a52147fd7987d6baaa318c5dced1416c3f25593e40f56e10755b \ + --hash=sha256:790fc19bccbd39426060047e53629f171a44745613bf360a045e9f9c8c4a2cea \ + --hash=sha256:7a2ccaf88ac0db153e84305d1ef0aa138cea82c6a88309066f6eaa3bc98636cd \ + --hash=sha256:87f4e01b042c84e6090dbc4fbe3415ddd69f6bc0130382323f9d3f1b8dd71b46 \ + --hash=sha256:88f079335e9da631efa64486c8207564a7bcd0c00526bb9e842e9d5b7e50a6cc \ + 
--hash=sha256:8c1c91a68926421f5ccbc82c85f83bd3ba593b121a46a1b9a554b3f0dd67a4bf \ + --hash=sha256:9121a894d74e65557e47e777060a495ab85f4b903e80dd73a3c940ba042920d7 \ + --hash=sha256:94e348c72a1fd1f8191f25ea056448e4f5a87b8fbf005b39d290dcb0581a48cd \ + --hash=sha256:98195866d3a9949915935d40a88e4f1c166e82e378f622c88025f2938624a90a \ + --hash=sha256:99dd6652bd6f730beadf74ef769d38c6bbd8ee6d1c15c8d138ea680b0594387f \ + --hash=sha256:9af691a9861027181d4de07ed74f0aee12a9650ac60d0a07f4320bff84b5d95f \ + --hash=sha256:a3b8b5d2935403f1b4b25ae324560e94b59593a38c0d2e7b6c9872126a9622ed \ + --hash=sha256:a3dcc876050b8f5cbc0ee84ef1e7f0c1dfe7c148f10098828bc4403683c33f10 \ + --hash=sha256:a74f5a92fa6e51c4f3c69b29c4662088b97be12f40652a21109605a175c81824 \ + --hash=sha256:ab91b0c36e95d42e1041a4907e3eefd06c482d53af3c7a77be7e214cc7cd4a63 \ + --hash=sha256:ad1bc61c7f6b0e58106aaab034916b6cb041757f708b07fbcdd9d6e1ac629225 \ + --hash=sha256:adcb9e77aa132cc6c9de2ffe7cf880a20aa8cdba21d367d1da1a412f57bddd5d \ + --hash=sha256:b22ea9bf5f9fad2b0077e944a7813f91593a4f61adf8faf734a70aed3f2b3a40 \ + --hash=sha256:b2a1c354f13f22b737621d914f3b4a8434ae69d3027a775e94b3e671756112f9 \ + --hash=sha256:b32fdf874868326351a75b1e4c02f97e802147119ae44c52d3d9da193ec34f5b \ + --hash=sha256:b3853ed4ce522598dc886160a7bab432a093051af85891fa2f5577c1dcac8ed6 \ + --hash=sha256:b443e73a4dfc7b6e0800ea4c13567b9694358e86f53bb2612a51c9e727cac67b \ + --hash=sha256:b4c9083ea89ab236b06e9ef2263971db3b4b507195fc7d5eecab95828dcae325 \ + --hash=sha256:b8ca0fe21458457077e4cb2d81e1ebdb146a00b3e9e2db6180a773f7ea905032 \ + --hash=sha256:c393af77c659a38bffbca215c0bcc8629ba4299568308dd7e4ff65d62cabed39 \ + --hash=sha256:c6bffa978793bea5e1b00e677062e53a62255439339591b70e209fa1552d5ee0 \ + --hash=sha256:ccf39ad5702e33e4d335b48ef9d56e21619b529b7f7471b5211419f380329b62 \ + --hash=sha256:cf81e0278b645004388873e0a1f9e3bc4c9ab8c18e377b14ed1a544be4b18c9a \ + --hash=sha256:d34546ad2e4a480b94b6797bcc5a322b3c705c4c74c3e4e545c4a3841c1b2d59 \ + 
--hash=sha256:d47713ffe6d4a627fbf078be9836a95ac106b4a0543e3841572c91e292a5d885 \ + --hash=sha256:d918dfe473291e8bfd8e13223ea5cb9b317bd9f50c280923776c377f7c64b428 \ + --hash=sha256:dbdce852e6bb66e1b8c36679d482971d69d93acf1785657522e51b7de30c3356 \ + --hash=sha256:dcc1bf0ac8a194313cf6e645e300a8a379674ceed8e0b1e910a2de3e3c28989e \ + --hash=sha256:dd961a32a7182c3891cdebca417fda67496d5d5de6ae636962254d22723bdf52 \ + --hash=sha256:ddf5d247f686aec853ddcc9a1234bfcc6f57b0a0670d2ad82fc25d8ae7e6a15f \ + --hash=sha256:e27c3cd27fbd25e5223c5c992b300cd4ee8f0a75c6f222ce65838138d853712c \ + --hash=sha256:e380ec4e6d8b26e389713995a43cb7fe56baea2d25fe073d4998c4821a026211 \ + --hash=sha256:e4bbde174a0aff5f6eeba75cf8c4c5d2a316316bc21f03a0bddca0fc3659a6f3 \ + --hash=sha256:e8b49b5743ede51e0bcf6805741f39f5e0e0fd6a172ba460cb39e3097ba803bb \ + --hash=sha256:e9904b5b37c3e5bb4a245c56bc4b7e497da57ffb8528f4fc39af9dcb168ee2e1 \ + --hash=sha256:ea96503b918fceaf40443182742b8964d47b65c5ebdea532893cb9479620000c \ + --hash=sha256:eb31fe390f03f7ae886dcc374f1099ec88526631a4cb891d399b68181f154ff0 \ + --hash=sha256:ebb32d776b61acd49f859a1d16b9e3d84e7b46d0d92aebd58acd54dc38e96664 \ + --hash=sha256:fb5363cf0fddd9b50525ddbf64a1e1b28ec4c6dfb28670a940cb1cf988a6786b \ + --hash=sha256:ff75c90663d6e8996610d435e71487daa853871ad1770dd83dc0f2fc4997241e + # via + # -c python/requirements_compiled.txt + # cupy-cuda12x +filelock==3.17.0 \ + --hash=sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338 \ + --hash=sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e + # via + # -c python/requirements_compiled.txt + # ray + # virtualenv +frozenlist==1.4.1 \ + --hash=sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7 \ + --hash=sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98 \ + --hash=sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad \ + --hash=sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5 \ + 
--hash=sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae \ + --hash=sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e \ + --hash=sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a \ + --hash=sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701 \ + --hash=sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d \ + --hash=sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6 \ + --hash=sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6 \ + --hash=sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106 \ + --hash=sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75 \ + --hash=sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868 \ + --hash=sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a \ + --hash=sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0 \ + --hash=sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1 \ + --hash=sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826 \ + --hash=sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec \ + --hash=sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6 \ + --hash=sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950 \ + --hash=sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19 \ + --hash=sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0 \ + --hash=sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8 \ + --hash=sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a \ + --hash=sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09 \ + --hash=sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86 \ + --hash=sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c \ + 
--hash=sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5 \ + --hash=sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b \ + --hash=sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b \ + --hash=sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d \ + --hash=sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0 \ + --hash=sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea \ + --hash=sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776 \ + --hash=sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a \ + --hash=sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897 \ + --hash=sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7 \ + --hash=sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09 \ + --hash=sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9 \ + --hash=sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe \ + --hash=sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd \ + --hash=sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742 \ + --hash=sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09 \ + --hash=sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0 \ + --hash=sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932 \ + --hash=sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1 \ + --hash=sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a \ + --hash=sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49 \ + --hash=sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d \ + --hash=sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7 \ + --hash=sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480 \ + 
--hash=sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89 \ + --hash=sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e \ + --hash=sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b \ + --hash=sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82 \ + --hash=sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb \ + --hash=sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068 \ + --hash=sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8 \ + --hash=sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b \ + --hash=sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb \ + --hash=sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2 \ + --hash=sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11 \ + --hash=sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b \ + --hash=sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc \ + --hash=sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0 \ + --hash=sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497 \ + --hash=sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17 \ + --hash=sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0 \ + --hash=sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2 \ + --hash=sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439 \ + --hash=sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5 \ + --hash=sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac \ + --hash=sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825 \ + --hash=sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887 \ + --hash=sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced \ + 
--hash=sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74 + # via + # -c python/requirements_compiled.txt + # aiohttp + # aiosignal +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 + # via + # -c python/requirements_compiled.txt + # ray +google-api-core==2.24.2 \ + --hash=sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9 \ + --hash=sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696 + # via + # -c python/requirements_compiled.txt + # opencensus +google-auth==2.23.4 \ + --hash=sha256:79905d6b1652187def79d491d6e23d0cbb3a21d3c7ba0dbaa9c8a01906b13ff3 \ + --hash=sha256:d4bbc92fe4b8bfd2f3e8d88e5ba7085935da208ee38a134fc280e7ce682a05f2 + # via + # -c python/requirements_compiled.txt + # google-api-core +googleapis-common-protos==1.61.0 \ + --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ + --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b + # via + # -c python/requirements_compiled.txt + # google-api-core +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + --hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + 
--hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + --hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + 
--hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e + # via + # -c python/requirements_compiled.txt + # ray +gymnasium==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a + # via + # -c python/requirements_compiled.txt + # ray +h11==0.16.0 \ + --hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \ + --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86 + # via + # -c 
python/requirements_compiled.txt + # uvicorn +httptools==0.6.4 \ + --hash=sha256:0614154d5454c21b6410fdf5262b4a3ddb0f53f1e1721cfd59d55f32138c578a \ + --hash=sha256:0e563e54979e97b6d13f1bbc05a96109923e76b901f786a5eae36e99c01237bd \ + --hash=sha256:16e603a3bff50db08cd578d54f07032ca1631450ceb972c2f834c2b860c28ea2 \ + --hash=sha256:288cd628406cc53f9a541cfaf06041b4c71d751856bab45e3702191f931ccd17 \ + --hash=sha256:28908df1b9bb8187393d5b5db91435ccc9c8e891657f9cbb42a2541b44c82fc8 \ + --hash=sha256:322d20ea9cdd1fa98bd6a74b77e2ec5b818abdc3d36695ab402a0de8ef2865a3 \ + --hash=sha256:342dd6946aa6bda4b8f18c734576106b8a31f2fe31492881a9a160ec84ff4bd5 \ + --hash=sha256:345c288418f0944a6fe67be8e6afa9262b18c7626c3ef3c28adc5eabc06a68da \ + --hash=sha256:3c73ce323711a6ffb0d247dcd5a550b8babf0f757e86a52558fe5b86d6fefcc0 \ + --hash=sha256:40a5ec98d3f49904b9fe36827dcf1aadfef3b89e2bd05b0e35e94f97c2b14721 \ + --hash=sha256:40b0f7fe4fd38e6a507bdb751db0379df1e99120c65fbdc8ee6c1d044897a636 \ + --hash=sha256:40dc6a8e399e15ea525305a2ddba998b0af5caa2566bcd79dcbe8948181eeaff \ + --hash=sha256:4b36913ba52008249223042dca46e69967985fb4051951f94357ea681e1f5dc0 \ + --hash=sha256:4d87b29bd4486c0093fc64dea80231f7c7f7eb4dc70ae394d70a495ab8436071 \ + --hash=sha256:4e93eee4add6493b59a5c514da98c939b244fce4a0d8879cd3f466562f4b7d5c \ + --hash=sha256:59e724f8b332319e2875efd360e61ac07f33b492889284a3e05e6d13746876f4 \ + --hash=sha256:69422b7f458c5af875922cdb5bd586cc1f1033295aa9ff63ee196a87519ac8e1 \ + --hash=sha256:703c346571fa50d2e9856a37d7cd9435a25e7fd15e236c397bf224afaa355fe9 \ + --hash=sha256:85071a1e8c2d051b507161f6c3e26155b5c790e4e28d7f236422dbacc2a9cc44 \ + --hash=sha256:856f4bc0478ae143bad54a4242fccb1f3f86a6e1be5548fecfd4102061b3a083 \ + --hash=sha256:85797e37e8eeaa5439d33e556662cc370e474445d5fab24dcadc65a8ffb04003 \ + --hash=sha256:90d96a385fa941283ebd231464045187a31ad932ebfa541be8edf5b3c2328959 \ + --hash=sha256:94978a49b8f4569ad607cd4946b759d90b285e39c0d4640c6b36ca7a3ddf2efc \ + 
--hash=sha256:aafe0f1918ed07b67c1e838f950b1c1fabc683030477e60b335649b8020e1076 \ + --hash=sha256:ab9ba8dcf59de5181f6be44a77458e45a578fc99c31510b8c65b7d5acc3cf490 \ + --hash=sha256:ade273d7e767d5fae13fa637f4d53b6e961fb7fd93c7797562663f0171c26660 \ + --hash=sha256:b799de31416ecc589ad79dd85a0b2657a8fe39327944998dea368c1d4c9e55e6 \ + --hash=sha256:c26f313951f6e26147833fc923f78f95604bbec812a43e5ee37f26dc9e5a686c \ + --hash=sha256:ca80b7485c76f768a3bc83ea58373f8db7b015551117375e4918e2aa77ea9b50 \ + --hash=sha256:d1ffd262a73d7c28424252381a5b854c19d9de5f56f075445d33919a637e3547 \ + --hash=sha256:d3f0d369e7ffbe59c4b6116a44d6a8eb4783aae027f2c0b366cf0aa964185dba \ + --hash=sha256:d54efd20338ac52ba31e7da78e4a72570cf729fac82bc31ff9199bedf1dc7440 \ + --hash=sha256:dacdd3d10ea1b4ca9df97a0a303cbacafc04b5cd375fa98732678151643d4988 \ + --hash=sha256:db353d22843cf1028f43c3651581e4bb49374d85692a85f95f7b9a130e1b2cab \ + --hash=sha256:db78cb9ca56b59b016e64b6031eda5653be0589dba2b1b43453f6e8b405a0970 \ + --hash=sha256:deee0e3343f98ee8047e9f4c5bc7cedbf69f5734454a94c38ee829fb2d5fa3c1 \ + --hash=sha256:df017d6c780287d5c80601dafa31f17bddb170232d85c066604d8558683711a2 \ + --hash=sha256:df959752a0c2748a65ab5387d08287abf6779ae9165916fe053e68ae1fbdc47f \ + --hash=sha256:ec4f178901fa1834d4a060320d2f3abc5c9e39766953d038f1458cb885f47e81 \ + --hash=sha256:f47f8ed67cc0ff862b84a1189831d1d33c963fb3ce1ee0c65d3b0cbe7b711069 \ + --hash=sha256:f8787367fbdfccae38e35abf7641dafc5310310a5987b689f4c32cc8cc3ee975 \ + --hash=sha256:f9eb89ecf8b290f2e293325c646a211ff1c2493222798bb80a530c5e7502494f \ + --hash=sha256:fc411e1c0a7dcd2f902c7c48cf079947a7e65b5485dea9decb82b9105ca71a43 + # via uvicorn +idna==3.7 \ + --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ + --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 + # via + # -c python/requirements_compiled.txt + # anyio + # requests + # yarl +importlib-metadata==6.11.0 \ + 
--hash=sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443 \ + --hash=sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b + # via + # -c python/requirements_compiled.txt + # opentelemetry-api +jinja2==3.1.6 ; sys_platform != 'win32' \ + --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \ + --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 + # via + # -c python/requirements_compiled.txt + # memray +jsonschema==4.23.0 \ + --hash=sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4 \ + --hash=sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566 + # via + # -c python/requirements_compiled.txt + # ray +jsonschema-specifications==2024.10.1 \ + --hash=sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272 \ + --hash=sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf + # via + # -c python/requirements_compiled.txt + # jsonschema +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c python/requirements_compiled.txt + # celery +lz4==4.3.3 \ + --hash=sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e \ + --hash=sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2 \ + --hash=sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0 \ + --hash=sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563 \ + --hash=sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f \ + --hash=sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa \ + --hash=sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d \ + --hash=sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61 \ + 
--hash=sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6 \ + --hash=sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2 \ + --hash=sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1 \ + --hash=sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809 \ + --hash=sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394 \ + --hash=sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2 \ + --hash=sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775 \ + --hash=sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f \ + --hash=sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba \ + --hash=sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc \ + --hash=sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd \ + --hash=sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c \ + --hash=sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24 \ + --hash=sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071 \ + --hash=sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201 \ + --hash=sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf \ + --hash=sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6 \ + --hash=sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21 \ + --hash=sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d \ + --hash=sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e \ + --hash=sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807 \ + --hash=sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7 \ + --hash=sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205 \ + --hash=sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604 \ + 
--hash=sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d \ + --hash=sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05 \ + --hash=sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0 \ + --hash=sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7 + # via + # -c python/requirements_compiled.txt + # ray +markdown-it-py==2.2.0 ; sys_platform != 'win32' \ + --hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ + --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 + # via + # -c python/requirements_compiled.txt + # rich +markupsafe==2.1.3 ; sys_platform != 'win32' \ + --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ + --hash=sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686 \ + --hash=sha256:1577735524cdad32f9f694208aa75e422adba74f1baee7551620e43a3141f559 \ + --hash=sha256:1b40069d487e7edb2676d3fbdb2b0829ffa2cd63a2ec26c4938b2d34391b4ecc \ + --hash=sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb \ + --hash=sha256:2c1b19b3aaacc6e57b7e25710ff571c24d6c3613a45e905b1fde04d691b98ee0 \ + --hash=sha256:2ef12179d3a291be237280175b542c07a36e7f60718296278d8593d21ca937d4 \ + --hash=sha256:3c0fae6c3be832a0a0473ac912810b2877c8cb9d76ca48de1ed31e1c68386575 \ + --hash=sha256:3fd4abcb888d15a94f32b75d8fd18ee162ca0c064f35b11134be77050296d6ba \ + --hash=sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd \ + --hash=sha256:65c1a9bcdadc6c28eecee2c119465aebff8f7a584dd719facdd9e825ec61ab52 \ + --hash=sha256:68e78619a61ecf91e76aa3e6e8e33fc4894a2bebe93410754bd28fce0a8a4f9f \ + --hash=sha256:6b2b56950d93e41f33b4223ead100ea0fe11f8e6ee5f641eb753ce4b77a7042b \ + --hash=sha256:8023faf4e01efadfa183e863fefde0046de576c6f14659e8782065bcece22198 \ + --hash=sha256:8afafd99945ead6e075b973fefa56379c5b5c53fd8937dad92c662da5d8fd5ee \ + 
--hash=sha256:8c41976a29d078bb235fea9b2ecd3da465df42a562910f9022f1a03107bd02be \ + --hash=sha256:9dcdfd0eaf283af041973bff14a2e143b8bd64e069f4c383416ecd79a81aab58 \ + --hash=sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823 \ + --hash=sha256:ad9e82fb8f09ade1c3e1b996a6337afac2b8b9e365f926f5a61aacc71adc5b3c \ + --hash=sha256:b076b6226fb84157e3f7c971a47ff3a679d837cf338547532ab866c57930dbee \ + --hash=sha256:bfce63a9e7834b12b87c64d6b155fdd9b3b96191b6bd334bf37db7ff1fe457f2 \ + --hash=sha256:cd0f502fe016460680cd20aaa5a76d241d6f35a1c3350c474bac1273803893fa \ + --hash=sha256:e09031c87a1e51556fdcb46e5bd4f59dfb743061cf93c4d6831bf894f125eb57 \ + --hash=sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc \ + --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 + # via + # -c python/requirements_compiled.txt + # jinja2 +mdurl==0.1.2 ; sys_platform != 'win32' \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via + # -c python/requirements_compiled.txt + # markdown-it-py +memray==1.10.0 ; sys_platform != 'win32' \ + --hash=sha256:0a21745fb516b7a6efcd40aa7487c59e9313fcfc782d0193fcfcf00b48426874 \ + --hash=sha256:22f2a47871c172a0539bd72737bb6b294fc10c510464066b825d90fcd3bb4916 \ + --hash=sha256:23e8c402625cfb32d0e9edb5ec0945f3e5e54bc6b0c5699f6284302082b80bd4 \ + --hash=sha256:2ce59ef485db3634de98b3a026d2450fc0a875e3a58a9ea85f7a89098841defe \ + --hash=sha256:322ed0b69014a0969b777768d461a785203f81f9864386b666b5b26645d9c294 \ + --hash=sha256:38322e052b882790993412f1840517a51818aa55c47037f69915b2007f2c4cee \ + --hash=sha256:38393c86ce6d0a08e6ec0eb1401d49803b7c0c950c2565386751cdc81568cba8 \ + --hash=sha256:391aac6c9f744528d3186bc82d708a1acc83525778f804045d7c96f860f8ec98 \ + --hash=sha256:3a8bb7fbd8303c4f0017ba7faef6b88f904cda2931ed667cbf3b98f024b3bc44 \ + 
--hash=sha256:3c401c57f49c4c5f1fecaee1e746f537cdc6680da05fb963dc143bd08ee109bf \ + --hash=sha256:4eba29179772b4a2e440a065b320b03bc2e73fe2648bdf7936aa3b9a086fab4a \ + --hash=sha256:53a8f66af18b1f3bcf5c9f3c95ae4134dd675903a38f9d0e6341b7bca01b63d0 \ + --hash=sha256:566602b2143e06b3d592901d98c52ce4599e71aa2555146eeb5cec03506f9498 \ + --hash=sha256:663d463e89a64bae4a6b2f8c837d11a3d094834442d536a4165e1d31899a3500 \ + --hash=sha256:68bd8df023c8a32f44c11d997e5c536837e27c0955daf557d3a377edd55a1dd3 \ + --hash=sha256:6937d7ef67d18ccc01c3250cdf3b4ef1445b859ee8756f09e3d11bd3ff0c7d67 \ + --hash=sha256:6b311e91203be71e1a0ce5e4f978137765bcb1045f3bf5646129c83c5b96ab3c \ + --hash=sha256:6fd13ef666c7fced9768d1cfabf71dc6dfa6724935a8dff463495ac2dc5e13a4 \ + --hash=sha256:8196c684f1be8fe423e5cdd2356d4255a2cb482a1f3e89612b70d2a2862cf5bb \ + --hash=sha256:843a688877691746f9d1835cfa8a65139948471bdd78720435808d20bc30a1cc \ + --hash=sha256:85c32d6613d81b075f740e398c4d653e0803cd48e82c33dcd584c109d6782666 \ + --hash=sha256:898acd60f57a10dc5aaf1fd64aa2f821f0420114f3f60c3058083788603f173a \ + --hash=sha256:8d56f37a34125684746c13d24bd7a3fb17549b0bb355eb50969eb11e05e3ba62 \ + --hash=sha256:92c372cb262eddd23049f945ca9527f0e4cc7c40a070aade1802d066f680885b \ + --hash=sha256:95e563d9c976e429ad597ad2720d95cebbe8bac891a3082465439143e2740772 \ + --hash=sha256:9627184c926252c8f719c301f1fefe970f0d033c643a6448b93fed2889d1ea94 \ + --hash=sha256:a9e985fb7646b0475c303919d19211d2aa54e5a9e2cd2a102472299be5dbebd3 \ + --hash=sha256:b681519357d94f5f0857fbc6029e7c44d3f41436109e955a14fd312d8317bc35 \ + --hash=sha256:b75040f28e8678d0e9c4907d55c95cf26db8ef5adc9941a228f1b280a9efd9c0 \ + --hash=sha256:c3a14960838d89a91747885897d34134afb65883cc3b0ed7ff30fe1af00f9fe6 \ + --hash=sha256:c7aeb47174c42e99740a8e2b3b6fe0932c95d987258d48a746974ead19176c26 \ + --hash=sha256:ce22a887a585ef5020896de89ffc793e531b65ccc81fbafcc7886010c2c562b3 \ + --hash=sha256:cf6d683c4f8d25c6ad06ae18715f218983c5eb86803953615e902d632fdf6ec1 \ + 
--hash=sha256:e356af93e3b031c83957e9ac1a653f5aaba5df1e357dd17142f5ed19bb3dc660 \ + --hash=sha256:f16c5c8730b616613dc8bafe32649ca6bd7252606251eb00148582011758d0b5 + # via + # -c python/requirements_compiled.txt + # ray +msgpack==1.0.7 \ + --hash=sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862 \ + --hash=sha256:0bfdd914e55e0d2c9e1526de210f6fe8ffe9705f2b1dfcc4aecc92a4cb4b533d \ + --hash=sha256:1dc93e8e4653bdb5910aed79f11e165c85732067614f180f70534f056da97db3 \ + --hash=sha256:1e2d69948e4132813b8d1131f29f9101bc2c915f26089a6d632001a5c1349672 \ + --hash=sha256:235a31ec7db685f5c82233bddf9858748b89b8119bf4538d514536c485c15fe0 \ + --hash=sha256:27dcd6f46a21c18fa5e5deed92a43d4554e3df8d8ca5a47bf0615d6a5f39dbc9 \ + --hash=sha256:28efb066cde83c479dfe5a48141a53bc7e5f13f785b92ddde336c716663039ee \ + --hash=sha256:3476fae43db72bd11f29a5147ae2f3cb22e2f1a91d575ef130d2bf49afd21c46 \ + --hash=sha256:36e17c4592231a7dbd2ed09027823ab295d2791b3b1efb2aee874b10548b7524 \ + --hash=sha256:384d779f0d6f1b110eae74cb0659d9aa6ff35aaf547b3955abf2ab4c901c4819 \ + --hash=sha256:38949d30b11ae5f95c3c91917ee7a6b239f5ec276f271f28638dec9156f82cfc \ + --hash=sha256:3967e4ad1aa9da62fd53e346ed17d7b2e922cba5ab93bdd46febcac39be636fc \ + --hash=sha256:3e7bf4442b310ff154b7bb9d81eb2c016b7d597e364f97d72b1acc3817a0fdc1 \ + --hash=sha256:3f0c8c6dfa6605ab8ff0611995ee30d4f9fcff89966cf562733b4008a3d60d82 \ + --hash=sha256:484ae3240666ad34cfa31eea7b8c6cd2f1fdaae21d73ce2974211df099a95d81 \ + --hash=sha256:4a7b4f35de6a304b5533c238bee86b670b75b03d31b7797929caa7a624b5dda6 \ + --hash=sha256:4cb14ce54d9b857be9591ac364cb08dc2d6a5c4318c1182cb1d02274029d590d \ + --hash=sha256:4e71bc4416de195d6e9b4ee93ad3f2f6b2ce11d042b4d7a7ee00bbe0358bd0c2 \ + --hash=sha256:52700dc63a4676669b341ba33520f4d6e43d3ca58d422e22ba66d1736b0a6e4c \ + --hash=sha256:572efc93db7a4d27e404501975ca6d2d9775705c2d922390d878fcf768d92c87 \ + --hash=sha256:576eb384292b139821c41995523654ad82d1916da6a60cff129c715a6223ea84 \ + 
--hash=sha256:5b0bf0effb196ed76b7ad883848143427a73c355ae8e569fa538365064188b8e \ + --hash=sha256:5b6ccc0c85916998d788b295765ea0e9cb9aac7e4a8ed71d12e7d8ac31c23c95 \ + --hash=sha256:5ed82f5a7af3697b1c4786053736f24a0efd0a1b8a130d4c7bfee4b9ded0f08f \ + --hash=sha256:6d4c80667de2e36970ebf74f42d1088cc9ee7ef5f4e8c35eee1b40eafd33ca5b \ + --hash=sha256:730076207cb816138cf1af7f7237b208340a2c5e749707457d70705715c93b93 \ + --hash=sha256:7687e22a31e976a0e7fc99c2f4d11ca45eff652a81eb8c8085e9609298916dcf \ + --hash=sha256:822ea70dc4018c7e6223f13affd1c5c30c0f5c12ac1f96cd8e9949acddb48a61 \ + --hash=sha256:84b0daf226913133f899ea9b30618722d45feffa67e4fe867b0b5ae83a34060c \ + --hash=sha256:85765fdf4b27eb5086f05ac0491090fc76f4f2b28e09d9350c31aac25a5aaff8 \ + --hash=sha256:8dd178c4c80706546702c59529ffc005681bd6dc2ea234c450661b205445a34d \ + --hash=sha256:8f5b234f567cf76ee489502ceb7165c2a5cecec081db2b37e35332b537f8157c \ + --hash=sha256:98bbd754a422a0b123c66a4c341de0474cad4a5c10c164ceed6ea090f3563db4 \ + --hash=sha256:993584fc821c58d5993521bfdcd31a4adf025c7d745bbd4d12ccfecf695af5ba \ + --hash=sha256:a40821a89dc373d6427e2b44b572efc36a2778d3f543299e2f24eb1a5de65415 \ + --hash=sha256:b291f0ee7961a597cbbcc77709374087fa2a9afe7bdb6a40dbbd9b127e79afee \ + --hash=sha256:b573a43ef7c368ba4ea06050a957c2a7550f729c31f11dd616d2ac4aba99888d \ + --hash=sha256:b610ff0f24e9f11c9ae653c67ff8cc03c075131401b3e5ef4b82570d1728f8a9 \ + --hash=sha256:bdf38ba2d393c7911ae989c3bbba510ebbcdf4ecbdbfec36272abe350c454075 \ + --hash=sha256:bfef2bb6ef068827bbd021017a107194956918ab43ce4d6dc945ffa13efbc25f \ + --hash=sha256:cab3db8bab4b7e635c1c97270d7a4b2a90c070b33cbc00c99ef3f9be03d3e1f7 \ + --hash=sha256:cb70766519500281815dfd7a87d3a178acf7ce95390544b8c90587d76b227681 \ + --hash=sha256:cca1b62fe70d761a282496b96a5e51c44c213e410a964bdffe0928e611368329 \ + --hash=sha256:ccf9a39706b604d884d2cb1e27fe973bc55f2890c52f38df742bc1d79ab9f5e1 \ + --hash=sha256:dc43f1ec66eb8440567186ae2f8c447d91e0372d793dfe8c222aec857b81a8cf \ + 
--hash=sha256:dd632777ff3beaaf629f1ab4396caf7ba0bdd075d948a69460d13d44357aca4c \ + --hash=sha256:e45ae4927759289c30ccba8d9fdce62bb414977ba158286b5ddaf8df2cddb5c5 \ + --hash=sha256:e50ebce52f41370707f1e21a59514e3375e3edd6e1832f5e5235237db933c98b \ + --hash=sha256:ebbbba226f0a108a7366bf4b59bf0f30a12fd5e75100c630267d94d7f0ad20e5 \ + --hash=sha256:ec79ff6159dffcc30853b2ad612ed572af86c92b5168aa3fc01a67b0fa40665e \ + --hash=sha256:f0936e08e0003f66bfd97e74ee530427707297b0d0361247e9b4f59ab78ddc8b \ + --hash=sha256:f26a07a6e877c76a88e3cecac8531908d980d3d5067ff69213653649ec0f60ad \ + --hash=sha256:f64e376cd20d3f030190e8c32e1c64582eba56ac6dc7d5b0b49a9d44021b52fd \ + --hash=sha256:f6ffbc252eb0d229aeb2f9ad051200668fc3a9aaa8994e49f0cb2ffe2b7867e7 \ + --hash=sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002 \ + --hash=sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc + # via + # -c python/requirements_compiled.txt + # ray +multidict==6.0.5 \ + --hash=sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556 \ + --hash=sha256:0275e35209c27a3f7951e1ce7aaf93ce0d163b28948444bec61dd7badc6d3f8c \ + --hash=sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29 \ + --hash=sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b \ + --hash=sha256:09a892e4a9fb47331da06948690ae38eaa2426de97b4ccbfafbdcbe5c8f37ff8 \ + --hash=sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7 \ + --hash=sha256:107c0cdefe028703fb5dafe640a409cb146d44a6ae201e55b35a4af8e95457dd \ + --hash=sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40 \ + --hash=sha256:14c2976aa9038c2629efa2c148022ed5eb4cb939e15ec7aace7ca932f48f9ba6 \ + --hash=sha256:19fe01cea168585ba0f678cad6f58133db2aa14eccaf22f88e4a6dccadfad8b3 \ + --hash=sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c \ + --hash=sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9 \ + 
--hash=sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5 \ + --hash=sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae \ + --hash=sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442 \ + --hash=sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9 \ + --hash=sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc \ + --hash=sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c \ + --hash=sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea \ + --hash=sha256:2ffc42c922dbfddb4a4c3b438eb056828719f07608af27d163191cb3e3aa6cc5 \ + --hash=sha256:37b15024f864916b4951adb95d3a80c9431299080341ab9544ed148091b53f50 \ + --hash=sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182 \ + --hash=sha256:3d25f19500588cbc47dc19081d78131c32637c25804df8414463ec908631e453 \ + --hash=sha256:403c0911cd5d5791605808b942c88a8155c2592e05332d2bf78f18697a5fa15e \ + --hash=sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600 \ + --hash=sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733 \ + --hash=sha256:435a0984199d81ca178b9ae2c26ec3d49692d20ee29bc4c11a2a8d4514c67eda \ + --hash=sha256:4a6a4f196f08c58c59e0b8ef8ec441d12aee4125a7d4f4fef000ccb22f8d7241 \ + --hash=sha256:4cc0ef8b962ac7a5e62b9e826bd0cd5040e7d401bc45a6835910ed699037a461 \ + --hash=sha256:51d035609b86722963404f711db441cf7134f1889107fb171a970c9701f92e1e \ + --hash=sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e \ + --hash=sha256:55205d03e8a598cfc688c71ca8ea5f66447164efff8869517f175ea632c7cb7b \ + --hash=sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e \ + --hash=sha256:5cb241881eefd96b46f89b1a056187ea8e9ba14ab88ba632e68d7a2ecb7aadf7 \ + --hash=sha256:60d698e8179a42ec85172d12f50b1668254628425a6bd611aba022257cac1386 \ + --hash=sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd \ + 
--hash=sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9 \ + --hash=sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf \ + --hash=sha256:69db76c09796b313331bb7048229e3bee7928eb62bab5e071e9f7fcc4879caee \ + --hash=sha256:6bf7a982604375a8d49b6cc1b781c1747f243d91b81035a9b43a2126c04766f5 \ + --hash=sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a \ + --hash=sha256:76c0de87358b192de7ea9649beb392f107dcad9ad27276324c24c91774ca5271 \ + --hash=sha256:76f067f5121dcecf0d63a67f29080b26c43c71a98b10c701b0677e4a065fbd54 \ + --hash=sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4 \ + --hash=sha256:79660376075cfd4b2c80f295528aa6beb2058fd289f4c9252f986751a4cd0496 \ + --hash=sha256:79a6d2ba910adb2cbafc95dad936f8b9386e77c84c35bc0add315b856d7c3abb \ + --hash=sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319 \ + --hash=sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3 \ + --hash=sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f \ + --hash=sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527 \ + --hash=sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed \ + --hash=sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604 \ + --hash=sha256:92d16a3e275e38293623ebf639c471d3e03bb20b8ebb845237e0d3664914caef \ + --hash=sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8 \ + --hash=sha256:9fe7b0653ba3d9d65cbe7698cca585bf0f8c83dbbcc710db9c90f478e175f2d5 \ + --hash=sha256:a3145cb08d8625b2d3fee1b2d596a8766352979c9bffe5d7833e0503d0f0b5e5 \ + --hash=sha256:aeaf541ddbad8311a87dd695ed9642401131ea39ad7bc8cf3ef3967fd093b626 \ + --hash=sha256:b55358304d7a73d7bdf5de62494aaf70bd33015831ffd98bc498b433dfe5b10c \ + --hash=sha256:b82cc8ace10ab5bd93235dfaab2021c70637005e1ac787031f4d1da63d493c1d \ + --hash=sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c \ + 
--hash=sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc \ + --hash=sha256:c9bf56195c6bbd293340ea82eafd0071cb3d450c703d2c93afb89f93b8386ccc \ + --hash=sha256:cbebcd5bcaf1eaf302617c114aa67569dd3f090dd0ce8ba9e35e9985b41ac35b \ + --hash=sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38 \ + --hash=sha256:ceb3b7e6a0135e092de86110c5a74e46bda4bd4fbfeeb3a3bcec79c0f861e450 \ + --hash=sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1 \ + --hash=sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f \ + --hash=sha256:d65f25da8e248202bd47445cec78e0025c0fe7582b23ec69c3b27a640dd7a8e3 \ + --hash=sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755 \ + --hash=sha256:d84a5c3a5f7ce6db1f999fb9438f686bc2e09d38143f2d93d8406ed2dd6b9226 \ + --hash=sha256:d946b0a9eb8aaa590df1fe082cee553ceab173e6cb5b03239716338629c50c7a \ + --hash=sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046 \ + --hash=sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf \ + --hash=sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479 \ + --hash=sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e \ + --hash=sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1 \ + --hash=sha256:e4428b29611e989719874670fd152b6625500ad6c686d464e99f5aaeeaca175a \ + --hash=sha256:e4972624066095e52b569e02b5ca97dbd7a7ddd4294bf4e7247d52635630dd83 \ + --hash=sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929 \ + --hash=sha256:e8e94e6912639a02ce173341ff62cc1201232ab86b8a8fcc05572741a5dc7d93 \ + --hash=sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a \ + --hash=sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c \ + --hash=sha256:eca2e9d0cc5a889850e9bbd68e98314ada174ff6ccd1129500103df7a94a7a44 \ + --hash=sha256:edd08e6f2f1a390bf137080507e44ccc086353c8e98c657e666c017718561b89 \ + 
--hash=sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba \ + --hash=sha256:f2a1dee728b52b33eebff5072817176c172050d44d67befd681609b4746e1c2e \ + --hash=sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da \ + --hash=sha256:fb616be3538599e797a2017cccca78e354c767165e8858ab5116813146041a24 \ + --hash=sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423 \ + --hash=sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef + # via + # -c python/requirements_compiled.txt + # aiohttp + # yarl +numpy==1.26.4 \ + --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ + --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ + --hash=sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20 \ + --hash=sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0 \ + --hash=sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010 \ + --hash=sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a \ + --hash=sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea \ + --hash=sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c \ + --hash=sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71 \ + --hash=sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110 \ + --hash=sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be \ + --hash=sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a \ + --hash=sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a \ + --hash=sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5 \ + --hash=sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed \ + --hash=sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd \ + --hash=sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c \ + 
--hash=sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e \ + --hash=sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0 \ + --hash=sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c \ + --hash=sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a \ + --hash=sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b \ + --hash=sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0 \ + --hash=sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6 \ + --hash=sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2 \ + --hash=sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a \ + --hash=sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30 \ + --hash=sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218 \ + --hash=sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5 \ + --hash=sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07 \ + --hash=sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2 \ + --hash=sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4 \ + --hash=sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764 \ + --hash=sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef \ + --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ + --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f + # via + # -c python/requirements_compiled.txt + # cupy-cuda12x + # gymnasium + # pandas + # ray + # scipy + # tensorboardx +opencensus==0.11.4 \ + --hash=sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 \ + --hash=sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2 + # via + # -c python/requirements_compiled.txt + # ray +opencensus-context==0.1.3 \ + 
--hash=sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039 \ + --hash=sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c + # via + # -c python/requirements_compiled.txt + # opencensus +opentelemetry-api==1.34.1 \ + --hash=sha256:64f0bd06d42824843731d05beea88d4d4b6ae59f9fe347ff7dfa2cc14233bbb3 \ + --hash=sha256:b7df4cb0830d5a6c29ad0c0691dbae874d8daefa934b8b1d642de48323d32a8c + # via + # -c python/requirements_compiled.txt + # opentelemetry-exporter-prometheus + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-prometheus==0.55b1 \ + --hash=sha256:d13ec0b22bf394113ff1ada5da98133a4b051779b803dae183188e26c4bd9ee0 \ + --hash=sha256:f364fbbff9e5de37a112ff104d1185fb1d7e2046c5ab5911e5afebc7ab3ddf0e + # via + # -c python/requirements_compiled.txt + # ray +opentelemetry-proto==1.27.0 \ + --hash=sha256:33c9345d91dafd8a74fc3d7576c5a38f18b7fdf8d02983ac67485386132aedd6 \ + --hash=sha256:b133873de5581a50063e1e4b29cdcf0c5e253a8c2d8dc1229add20a4c3830ace + # via + # -c python/requirements_compiled.txt + # ray +opentelemetry-sdk==1.34.1 \ + --hash=sha256:308effad4059562f1d92163c61c8141df649da24ce361827812c40abb2a1e96e \ + --hash=sha256:8091db0d763fcd6098d4781bbc80ff0971f94e260739aa6afe6fd379cdf3aa4d + # via + # -c python/requirements_compiled.txt + # opentelemetry-exporter-prometheus + # ray +opentelemetry-semantic-conventions==0.55b1 \ + --hash=sha256:5da81dfdf7d52e3d37f8fe88d5e771e191de924cfff5f550ab0b8f7b2409baed \ + --hash=sha256:ef95b1f009159c28d7a7849f5cbc71c4c34c845bb514d66adfdf1b3fff3598b3 + # via + # -c python/requirements_compiled.txt + # opentelemetry-sdk +ormsgpack==1.7.0 \ + --hash=sha256:0d88307ab45d95416ce4071b1b99326ca31362af01c3d206f15a0551a7a874bd \ + --hash=sha256:22418a4d399027a72fb2e6b873559b1886cf2e63323ca7afc17b222c454413b7 \ + --hash=sha256:2c22c62a6bc93bcb194b7f91864ca0b39455b2cbbfc1538a3da0f9ec3c11d184 \ + --hash=sha256:3a6a97937d2cf21496d7689b90a43df83c5062bbe846aaa39197cc9ad73eaa7b \ 
+ --hash=sha256:462089a419dbde654915ccb0b859c0dbe3c178b0ac580018e82befea6ccd73f4 \ + --hash=sha256:4b353204e99b56c1d33f1cf4767bd1fe1195596181a1cc789f25aa26c0b50f3d \ + --hash=sha256:5ec763096d978d35eedcef0af13991a10741717c2e236b26f4c2047b0740ea7b \ + --hash=sha256:5fefa1ca842dbba258401ea958113fe62c6b70a7a4d46edac440113f68dc431e \ + --hash=sha256:65525438b4a8b3b64ccfcda25e758ea3db392d1c206b5e09ef70efbbafa6dbf9 \ + --hash=sha256:6b4c98839cb7fc2a212037d2258f3a22857155249eb293d45c45cb974cfba834 \ + --hash=sha256:6d114652dadd81802b8a35a49e07a3e9ef2a47aed6123fb5031f2220d1c8e434 \ + --hash=sha256:77bc2ea387d85cfad045b9bcb8040bae43ad32dafe9363360f732cc19d489bbe \ + --hash=sha256:7e6ada21f5c7a20ff7cf9b061c44e3814352f819947a12022ad8cb52a9f2a809 \ + --hash=sha256:8d301e47565fe0e52a60052e730a9bb7669dfbd2a94643b8be925e3928c64c15 \ + --hash=sha256:90aabfd816db60dadab1100d583d061e0238209015bf684f8170c0fca4eb445a \ + --hash=sha256:91ebb7d3609db249cdff629ffef83ec3d025b1384749a297cf3b6a8240cf22ac \ + --hash=sha256:97723786755a7df85fcf6e68d7b5359dacea98d5c26b1d9af219a3cc05df4734 \ + --hash=sha256:9b0945523ccc75aa6907f38f2240d36818618baccb8633923bd7740a5a929e67 \ + --hash=sha256:a0ca6a64d47073f22ecc1dd96b384e44f98796d3f88ee383e92dfbcdf18c2efd \ + --hash=sha256:a5e12b51a590be47ccef67907905653e679fc2f920854b456edc216690ecc09c \ + --hash=sha256:a8fbe7bb50ee8381df030823d9366984fac718447947c2327969405d1d799b95 \ + --hash=sha256:c683071bf4527ffa7b6cfcf28f750d1a82eb77846d106743c09261ab1b79b193 \ + --hash=sha256:ca4d35b694f32112eb33ac0b733cb903dbbc59f019d05ca3d74f6ad2f587b0bf \ + --hash=sha256:e8385181bf195af80fc270e64fd477f1c414ffb05837320382e2ec9ca34be0ec \ + --hash=sha256:e86124cdbc8ed249806347c2fba96843e8941122b161b429139a0c973d270de4 \ + --hash=sha256:f9967a7f3647ad118751abf090f8397fda3e4bca6833340cab95a3f2bec598cd + # via + # -c python/requirements_compiled.txt + # ray +packaging==23.0 \ + --hash=sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2 \ + 
--hash=sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97 + # via + # -c python/requirements_compiled.txt + # kombu + # ray + # tensorboardx +pandas==1.5.3 \ + --hash=sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813 \ + --hash=sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792 \ + --hash=sha256:3749077d86e3a2f0ed51367f30bf5b82e131cc0f14260c4d3e499186fccc4406 \ + --hash=sha256:41179ce559943d83a9b4bbacb736b04c928b095b5f25dd2b7389eda08f46f373 \ + --hash=sha256:478ff646ca42b20376e4ed3fa2e8d7341e8a63105586efe54fa2508ee087f328 \ + --hash=sha256:50869a35cbb0f2e0cd5ec04b191e7b12ed688874bd05dd777c19b28cbea90996 \ + --hash=sha256:565fa34a5434d38e9d250af3c12ff931abaf88050551d9fbcdfafca50d62babf \ + --hash=sha256:5f2b952406a1588ad4cad5b3f55f520e82e902388a6d5a4a91baa8d38d23c7f6 \ + --hash=sha256:5fbcb19d6fceb9e946b3e23258757c7b225ba450990d9ed63ccceeb8cae609f7 \ + --hash=sha256:6973549c01ca91ec96199e940495219c887ea815b2083722821f1d7abfa2b4dc \ + --hash=sha256:74a3fd7e5a7ec052f183273dc7b0acd3a863edf7520f5d3a1765c04ffdb3b0b1 \ + --hash=sha256:7a0a56cef15fd1586726dace5616db75ebcfec9179a3a55e78f72c5639fa2a23 \ + --hash=sha256:7cec0bee9f294e5de5bbfc14d0573f65526071029d036b753ee6507d2a21480a \ + --hash=sha256:87bd9c03da1ac870a6d2c8902a0e1fd4267ca00f13bc494c9e5a9020920e1d51 \ + --hash=sha256:972d8a45395f2a2d26733eb8d0f629b2f90bebe8e8eddbb8829b180c09639572 \ + --hash=sha256:9842b6f4b8479e41968eced654487258ed81df7d1c9b7b870ceea24ed9459b31 \ + --hash=sha256:9f69c4029613de47816b1bb30ff5ac778686688751a5e9c99ad8c7031f6508e5 \ + --hash=sha256:a50d9a4336a9621cab7b8eb3fb11adb82de58f9b91d84c2cd526576b881a0c5a \ + --hash=sha256:bc4c368f42b551bf72fac35c5128963a171b40dce866fb066540eeaf46faa003 \ + --hash=sha256:c39a8da13cede5adcd3be1182883aea1c925476f4e84b2807a46e2775306305d \ + --hash=sha256:c3ac844a0fe00bfaeb2c9b51ab1424e5c8744f89860b138434a363b1f620f354 \ + 
--hash=sha256:c4c00e0b0597c8e4f59e8d461f797e5d70b4d025880516a8261b2817c47759ee \ + --hash=sha256:c74a62747864ed568f5a82a49a23a8d7fe171d0c69038b38cedf0976831296fa \ + --hash=sha256:dd05f7783b3274aa206a1af06f0ceed3f9b412cf665b7247eacd83be41cf7bf0 \ + --hash=sha256:dfd681c5dc216037e0b0a2c821f5ed99ba9f03ebcf119c7dac0e9a7b960b9ec9 \ + --hash=sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae \ + --hash=sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc + # via + # -c python/requirements_compiled.txt + # ray +platformdirs==3.11.0 \ + --hash=sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3 \ + --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e + # via + # -c python/requirements_compiled.txt + # virtualenv +prometheus-client==0.19.0 \ + --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ + --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 + # via + # -c python/requirements_compiled.txt + # opentelemetry-exporter-prometheus + # ray +prompt-toolkit==3.0.41 \ + --hash=sha256:941367d97fc815548822aa26c2a269fdc4eb21e9ec05fc5d447cf09bad5d75f0 \ + --hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 + # via + # -c python/requirements_compiled.txt + # click-repl +propcache==0.3.0 \ + --hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ + --hash=sha256:03acd9ff19021bd0567582ac88f821b66883e158274183b9e5586f678984f8fe \ + --hash=sha256:03c091bb752349402f23ee43bb2bff6bd80ccab7c9df6b88ad4322258d6960fc \ + --hash=sha256:07700939b2cbd67bfb3b76a12e1412405d71019df00ca5697ce75e5ef789d829 \ + --hash=sha256:0c3e893c4464ebd751b44ae76c12c5f5c1e4f6cbd6fbf67e3783cd93ad221863 \ + --hash=sha256:119e244ab40f70a98c91906d4c1f4c5f2e68bd0b14e7ab0a06922038fae8a20f \ + --hash=sha256:11ae6a8a01b8a4dc79093b5d3ca2c8a4436f5ee251a9840d7790dccbd96cb649 \ + 
--hash=sha256:15010f29fbed80e711db272909a074dc79858c6d28e2915704cfc487a8ac89c6 \ + --hash=sha256:19d36bb351ad5554ff20f2ae75f88ce205b0748c38b146c75628577020351e3c \ + --hash=sha256:1c8f7d896a16da9455f882870a507567d4f58c53504dc2d4b1e1d386dfe4588a \ + --hash=sha256:2383a17385d9800b6eb5855c2f05ee550f803878f344f58b6e194de08b96352c \ + --hash=sha256:24c04f8fbf60094c531667b8207acbae54146661657a1b1be6d3ca7773b7a545 \ + --hash=sha256:2578541776769b500bada3f8a4eeaf944530516b6e90c089aa368266ed70c49e \ + --hash=sha256:26a67e5c04e3119594d8cfae517f4b9330c395df07ea65eab16f3d559b7068fe \ + --hash=sha256:2b975528998de037dfbc10144b8aed9b8dd5a99ec547f14d1cb7c5665a43f075 \ + --hash=sha256:2d15bc27163cd4df433e75f546b9ac31c1ba7b0b128bfb1b90df19082466ff57 \ + --hash=sha256:2d913d36bdaf368637b4f88d554fb9cb9d53d6920b9c5563846555938d5450bf \ + --hash=sha256:3302c5287e504d23bb0e64d2a921d1eb4a03fb93a0a0aa3b53de059f5a5d737d \ + --hash=sha256:36ca5e9a21822cc1746023e88f5c0af6fce3af3b85d4520efb1ce4221bed75cc \ + --hash=sha256:3b812b3cb6caacd072276ac0492d249f210006c57726b6484a1e1805b3cfeea0 \ + --hash=sha256:3c6ec957025bf32b15cbc6b67afe233c65b30005e4c55fe5768e4bb518d712f1 \ + --hash=sha256:41de3da5458edd5678b0f6ff66691507f9885f5fe6a0fb99a5d10d10c0fd2d64 \ + --hash=sha256:42924dc0c9d73e49908e35bbdec87adedd651ea24c53c29cac103ede0ea1d340 \ + --hash=sha256:4544699674faf66fb6b4473a1518ae4999c1b614f0b8297b1cef96bac25381db \ + --hash=sha256:46ed02532cb66612d42ae5c3929b5e98ae330ea0f3900bc66ec5f4862069519b \ + --hash=sha256:49ea05212a529c2caffe411e25a59308b07d6e10bf2505d77da72891f9a05641 \ + --hash=sha256:4fa0e7c9c3cf7c276d4f6ab9af8adddc127d04e0fcabede315904d2ff76db626 \ + --hash=sha256:507c5357a8d8b4593b97fb669c50598f4e6cccbbf77e22fa9598aba78292b4d7 \ + --hash=sha256:549722908de62aa0b47a78b90531c022fa6e139f9166be634f667ff45632cc92 \ + --hash=sha256:58e6d2a5a7cb3e5f166fd58e71e9a4ff504be9dc61b88167e75f835da5764d07 \ + --hash=sha256:5a16167118677d94bb48bfcd91e420088854eb0737b76ec374b91498fb77a70e \ + 
--hash=sha256:5d62c4f6706bff5d8a52fd51fec6069bef69e7202ed481486c0bc3874912c787 \ + --hash=sha256:5fa159dcee5dba00c1def3231c249cf261185189205073bde13797e57dd7540a \ + --hash=sha256:6032231d4a5abd67c7f71168fd64a47b6b451fbcb91c8397c2f7610e67683810 \ + --hash=sha256:63f26258a163c34542c24808f03d734b338da66ba91f410a703e505c8485791d \ + --hash=sha256:65a37714b8ad9aba5780325228598a5b16c47ba0f8aeb3dc0514701e4413d7c0 \ + --hash=sha256:67054e47c01b7b349b94ed0840ccae075449503cf1fdd0a1fdd98ab5ddc2667b \ + --hash=sha256:67dda3c7325691c2081510e92c561f465ba61b975f481735aefdfc845d2cd043 \ + --hash=sha256:6985a593417cdbc94c7f9c3403747335e450c1599da1647a5af76539672464d3 \ + --hash=sha256:6a1948df1bb1d56b5e7b0553c0fa04fd0e320997ae99689488201f19fa90d2e7 \ + --hash=sha256:6b5b7fd6ee7b54e01759f2044f936dcf7dea6e7585f35490f7ca0420fe723c0d \ + --hash=sha256:6c929916cbdb540d3407c66f19f73387f43e7c12fa318a66f64ac99da601bcdf \ + --hash=sha256:6f4d7a7c0aff92e8354cceca6fe223973ddf08401047920df0fcb24be2bd5138 \ + --hash=sha256:728af36011bb5d344c4fe4af79cfe186729efb649d2f8b395d1572fb088a996c \ + --hash=sha256:742840d1d0438eb7ea4280f3347598f507a199a35a08294afdcc560c3739989d \ + --hash=sha256:75e872573220d1ee2305b35c9813626e620768248425f58798413e9c39741f46 \ + --hash=sha256:794c3dd744fad478b6232289c866c25406ecdfc47e294618bdf1697e69bd64a6 \ + --hash=sha256:7c0fdbdf6983526e269e5a8d53b7ae3622dd6998468821d660d0daf72779aefa \ + --hash=sha256:7c5f5290799a3f6539cc5e6f474c3e5c5fbeba74a5e1e5be75587746a940d51e \ + --hash=sha256:7c6e7e4f9167fddc438cd653d826f2222222564daed4116a02a184b464d3ef05 \ + --hash=sha256:7cedd25e5f678f7738da38037435b340694ab34d424938041aa630d8bac42663 \ + --hash=sha256:7e2e068a83552ddf7a39a99488bcba05ac13454fb205c847674da0352602082f \ + --hash=sha256:8319293e85feadbbfe2150a5659dbc2ebc4afdeaf7d98936fb9a2f2ba0d4c35c \ + --hash=sha256:8526b0941ec5a40220fc4dfde76aed58808e2b309c03e9fa8e2260083ef7157f \ + --hash=sha256:8884ba1a0fe7210b775106b25850f5e5a9dc3c840d1ae9924ee6ea2eb3acbfe7 \ + 
--hash=sha256:8cb625bcb5add899cb8ba7bf716ec1d3e8f7cdea9b0713fa99eadf73b6d4986f \ + --hash=sha256:8d663fd71491dde7dfdfc899d13a067a94198e90695b4321084c6e450743b8c7 \ + --hash=sha256:8ee1983728964d6070ab443399c476de93d5d741f71e8f6e7880a065f878e0b9 \ + --hash=sha256:997e7b8f173a391987df40f3b52c423e5850be6f6df0dcfb5376365440b56667 \ + --hash=sha256:9be90eebc9842a93ef8335291f57b3b7488ac24f70df96a6034a13cb58e6ff86 \ + --hash=sha256:9ddd49258610499aab83b4f5b61b32e11fce873586282a0e972e5ab3bcadee51 \ + --hash=sha256:9ecde3671e62eeb99e977f5221abcf40c208f69b5eb986b061ccec317c82ebd0 \ + --hash=sha256:9ff4e9ecb6e4b363430edf2c6e50173a63e0820e549918adef70515f87ced19a \ + --hash=sha256:a254537b9b696ede293bfdbc0a65200e8e4507bc9f37831e2a0318a9b333c85c \ + --hash=sha256:a2b9bf8c79b660d0ca1ad95e587818c30ccdb11f787657458d6f26a1ea18c568 \ + --hash=sha256:a61a68d630e812b67b5bf097ab84e2cd79b48c792857dc10ba8a223f5b06a2af \ + --hash=sha256:a7080b0159ce05f179cfac592cda1a82898ca9cd097dacf8ea20ae33474fbb25 \ + --hash=sha256:a8fd93de4e1d278046345f49e2238cdb298589325849b2645d4a94c53faeffc5 \ + --hash=sha256:a94ffc66738da99232ddffcf7910e0f69e2bbe3a0802e54426dbf0714e1c2ffe \ + --hash=sha256:aa806bbc13eac1ab6291ed21ecd2dd426063ca5417dd507e6be58de20e58dfcf \ + --hash=sha256:b0c1a133d42c6fc1f5fbcf5c91331657a1ff822e87989bf4a6e2e39b818d0ee9 \ + --hash=sha256:b58229a844931bca61b3a20efd2be2a2acb4ad1622fc026504309a6883686fbf \ + --hash=sha256:bb2f144c6d98bb5cbc94adeb0447cfd4c0f991341baa68eee3f3b0c9c0e83767 \ + --hash=sha256:be90c94570840939fecedf99fa72839aed70b0ced449b415c85e01ae67422c90 \ + --hash=sha256:bf0d9a171908f32d54f651648c7290397b8792f4303821c42a74e7805bfb813c \ + --hash=sha256:bf15fc0b45914d9d1b706f7c9c4f66f2b7b053e9517e40123e137e8ca8958b3d \ + --hash=sha256:bf4298f366ca7e1ad1d21bbb58300a6985015909964077afd37559084590c929 \ + --hash=sha256:c441c841e82c5ba7a85ad25986014be8d7849c3cfbdb6004541873505929a74e \ + --hash=sha256:cacea77ef7a2195f04f9279297684955e3d1ae4241092ff0cfcef532bb7a1c32 \ + 
--hash=sha256:cd54895e4ae7d32f1e3dd91261df46ee7483a735017dc6f987904f194aa5fd14 \ + --hash=sha256:d1323cd04d6e92150bcc79d0174ce347ed4b349d748b9358fd2e497b121e03c8 \ + --hash=sha256:d383bf5e045d7f9d239b38e6acadd7b7fdf6c0087259a84ae3475d18e9a2ae8b \ + --hash=sha256:d3e7420211f5a65a54675fd860ea04173cde60a7cc20ccfbafcccd155225f8bc \ + --hash=sha256:d8074c5dd61c8a3e915fa8fc04754fa55cfa5978200d2daa1e2d4294c1f136aa \ + --hash=sha256:df03cd88f95b1b99052b52b1bb92173229d7a674df0ab06d2b25765ee8404bce \ + --hash=sha256:e45377d5d6fefe1677da2a2c07b024a6dac782088e37c0b1efea4cfe2b1be19b \ + --hash=sha256:e53d19c2bf7d0d1e6998a7e693c7e87300dd971808e6618964621ccd0e01fe4e \ + --hash=sha256:e560fd75aaf3e5693b91bcaddd8b314f4d57e99aef8a6c6dc692f935cc1e6bbf \ + --hash=sha256:ec5060592d83454e8063e487696ac3783cc48c9a329498bafae0d972bc7816c9 \ + --hash=sha256:ecc2920630283e0783c22e2ac94427f8cca29a04cfdf331467d4f661f4072dac \ + --hash=sha256:ed7161bccab7696a473fe7ddb619c1d75963732b37da4618ba12e60899fefe4f \ + --hash=sha256:ee0bd3a7b2e184e88d25c9baa6a9dc609ba25b76daae942edfb14499ac7ec374 \ + --hash=sha256:ee25f1ac091def37c4b59d192bbe3a206298feeb89132a470325bf76ad122a1e \ + --hash=sha256:efa44f64c37cc30c9f05932c740a8b40ce359f51882c70883cc95feac842da4d \ + --hash=sha256:f47d52fd9b2ac418c4890aad2f6d21a6b96183c98021f0a48497a904199f006e \ + --hash=sha256:f857034dc68d5ceb30fb60afb6ff2103087aea10a01b613985610e007053a121 \ + --hash=sha256:fb91d20fa2d3b13deea98a690534697742029f4fb83673a3501ae6e3746508b5 \ + --hash=sha256:fddb8870bdb83456a489ab67c6b3040a8d5a55069aa6f72f9d872235fbc52f54 + # via + # -c python/requirements_compiled.txt + # aiohttp + # yarl +proto-plus==1.22.3 \ + --hash=sha256:a49cd903bc0b6ab41f76bf65510439d56ca76f868adf0274e738bfdd096894df \ + --hash=sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b + # via + # -c python/requirements_compiled.txt + # google-api-core +protobuf==4.25.8 \ + --hash=sha256:077ff8badf2acf8bc474406706ad890466274191a48d0abd3bd6987107c9cde5 \ 
+ --hash=sha256:15a0af558aa3b13efef102ae6e4f3efac06f1eea11afb3a57db2901447d9fb59 \ + --hash=sha256:27d498ffd1f21fb81d987a041c32d07857d1d107909f5134ba3350e1ce80a4af \ + --hash=sha256:504435d831565f7cfac9f0714440028907f1975e4bed228e58e72ecfff58a1e0 \ + --hash=sha256:6135cf8affe1fc6f76cced2641e4ea8d3e59518d1f24ae41ba97bcad82d397cd \ + --hash=sha256:83e6e54e93d2b696a92cad6e6efc924f3850f82b52e1563778dfab8b355101b0 \ + --hash=sha256:9ad7ef62d92baf5a8654fbb88dac7fa5594cfa70fd3440488a5ca3bfc6d795a7 \ + --hash=sha256:bd551eb1fe1d7e92c1af1d75bdfa572eff1ab0e5bf1736716814cdccdb2360f9 \ + --hash=sha256:ca809b42f4444f144f2115c4c1a747b9a404d590f18f37e9402422033e464e0f \ + --hash=sha256:d552c53d0415449c8d17ced5c341caba0d89dbf433698e1436c8fa0aae7808a3 \ + --hash=sha256:f4510b93a3bec6eba8fd8f1093e9d7fb0d4a24d1a81377c10c0e5bbfe9e4ed24 + # via + # -c python/requirements_compiled.txt + # google-api-core + # googleapis-common-protos + # opentelemetry-proto + # proto-plus + # ray + # tensorboardx +py-spy==0.4.0 ; python_full_version < '3.12' \ + --hash=sha256:47cdda4c34d9b6cb01f3aaeceb2e88faf57da880207fe72ff6ff97e9bb6cc8a9 \ + --hash=sha256:77d8f637ade38367d944874776f45b703b7ac5938b1f7be8891f3a5876ddbb96 \ + --hash=sha256:806602ce7972782cc9c1e383f339bfc27bfb822d42485e6a3e0530ae5040e1f0 \ + --hash=sha256:87573e64dbfdfc89ba2e0f5e2f525aa84e0299c7eb6454b47ea335fde583a7a0 \ + --hash=sha256:8bf2f3702cef367a489faa45177b41a6c31b2a3e5bd78c978d44e29340152f5a \ + --hash=sha256:c5f06ffce4c9c98b7fc9f5e67e5e7db591173f1351837633f3f23d9378b1d18a \ + --hash=sha256:eee3d0bde85ca5cf4f01f012d461180ca76c24835a96f7b5c4ded64eb6a008ab \ + --hash=sha256:f2cf3f7130e7d780471faa5957441d3b4e0ec39a79b2c00f4c33d494f7728428 + # via + # -c python/requirements_compiled.txt + # ray +pyarrow==19.0.1 \ + --hash=sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466 \ + --hash=sha256:0148bb4fc158bfbc3d6dfe5001d93ebeed253793fff4435167f6ce1dc4bddeae \ + 
--hash=sha256:1b93ef2c93e77c442c979b0d596af45e4665d8b96da598db145b0fec014b9136 \ + --hash=sha256:1c7556165bd38cf0cd992df2636f8bcdd2d4b26916c6b7e646101aff3c16f76f \ + --hash=sha256:335d170e050bcc7da867a1ed8ffb8b44c57aaa6e0843b156a501298657b1e972 \ + --hash=sha256:3bf266b485df66a400f282ac0b6d1b500b9d2ae73314a153dbe97d6d5cc8a99e \ + --hash=sha256:41f9706fbe505e0abc10e84bf3a906a1338905cbbcf1177b71486b03e6ea6608 \ + --hash=sha256:4982f8e2b7afd6dae8608d70ba5bd91699077323f812a0448d8b7abdff6cb5d3 \ + --hash=sha256:49a3aecb62c1be1d822f8bf629226d4a96418228a42f5b40835c1f10d42e4db6 \ + --hash=sha256:4d5d1ec7ec5324b98887bdc006f4d2ce534e10e60f7ad995e7875ffa0ff9cb14 \ + --hash=sha256:58d9397b2e273ef76264b45531e9d552d8ec8a6688b7390b5be44c02a37aade8 \ + --hash=sha256:5a9137cf7e1640dce4c190551ee69d478f7121b5c6f323553b319cac936395f6 \ + --hash=sha256:5bd1618ae5e5476b7654c7b55a6364ae87686d4724538c24185bbb2952679960 \ + --hash=sha256:65cf9feebab489b19cdfcfe4aa82f62147218558d8d3f0fc1e9dea0ab8e7905a \ + --hash=sha256:699799f9c80bebcf1da0983ba86d7f289c5a2a5c04b945e2f2bcf7e874a91911 \ + --hash=sha256:6c5941c1aac89a6c2f2b16cd64fe76bcdb94b2b1e99ca6459de4e6f07638d755 \ + --hash=sha256:6ebfb5171bb5f4a52319344ebbbecc731af3f021e49318c74f33d520d31ae0c4 \ + --hash=sha256:7a544ec12de66769612b2d6988c36adc96fb9767ecc8ee0a4d270b10b1c51e00 \ + --hash=sha256:7c1bca1897c28013db5e4c83944a2ab53231f541b9e0c3f4791206d0c0de389a \ + --hash=sha256:80b2ad2b193e7d19e81008a96e313fbd53157945c7be9ac65f44f8937a55427b \ + --hash=sha256:8464c9fbe6d94a7fe1599e7e8965f350fd233532868232ab2596a71586c5a429 \ + --hash=sha256:8f04d49a6b64cf24719c080b3c2029a3a5b16417fd5fd7c4041f94233af732f3 \ + --hash=sha256:96606c3ba57944d128e8a8399da4812f56c7f61de8c647e3470b417f795d0ef9 \ + --hash=sha256:99bc1bec6d234359743b01e70d4310d0ab240c3d6b0da7e2a93663b0158616f6 \ + --hash=sha256:ad76aef7f5f7e4a757fddcdcf010a8290958f09e3470ea458c80d26f4316ae89 \ + --hash=sha256:b4c4156a625f1e35d6c0b2132635a237708944eb41df5fbe7d50f20d20c17832 \ + 
--hash=sha256:b9766a47a9cb56fefe95cb27f535038b5a195707a08bf61b180e642324963b46 \ + --hash=sha256:c0fe3dbbf054a00d1f162fda94ce236a899ca01123a798c561ba307ca38af5f0 \ + --hash=sha256:c6cb2335a411b713fdf1e82a752162f72d4a7b5dbc588e32aa18383318b05866 \ + --hash=sha256:cc55d71898ea30dc95900297d191377caba257612f384207fe9f8293b5850f90 \ + --hash=sha256:d03c9d6f2a3dffbd62671ca070f13fc527bb1867b4ec2b98c7eeed381d4f389a \ + --hash=sha256:d383591f3dcbe545f6cc62daaef9c7cdfe0dff0fb9e1c8121101cabe9098cfa6 \ + --hash=sha256:d9d46e06846a41ba906ab25302cf0fd522f81aa2a85a71021826f34639ad31ef \ + --hash=sha256:d9dedeaf19097a143ed6da37f04f4051aba353c95ef507764d344229b2b740ae \ + --hash=sha256:e45274b20e524ae5c39d7fc1ca2aa923aab494776d2d4b316b49ec7572ca324c \ + --hash=sha256:ee8dec072569f43835932a3b10c55973593abc00936c202707a4ad06af7cb294 \ + --hash=sha256:f24faab6ed18f216a37870d8c5623f9c044566d75ec586ef884e13a02a9d62c5 \ + --hash=sha256:f2a21d39fbdb948857f67eacb5bbaaf36802de044ec36fbef7a1c8f0dd3a4ab2 \ + --hash=sha256:f3ad4c0eb4e2a9aeb990af6c09e6fa0b195c8c0e7b272ecc8d4d2b6574809d34 \ + --hash=sha256:fc28912a2dc924dddc2087679cc8b7263accc71b9ff025a1362b004711661a69 \ + --hash=sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec \ + --hash=sha256:fd44d66093a239358d07c42a91eebf5015aa54fccba959db899f932218ac9cc8 + # via + # -c python/requirements_compiled.txt + # ray +pyasn1==0.5.1 \ + --hash=sha256:4439847c58d40b1d0a573d07e3856e95333f1976294494c325775aeca506eb58 \ + --hash=sha256:6d391a96e59b23130a5cfa74d6fd7f388dbbe26cc8f1edf39fdddf08d9d6676c + # via + # -c python/requirements_compiled.txt + # pyasn1-modules + # rsa +pyasn1-modules==0.3.0 \ + --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ + --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d + # via + # -c python/requirements_compiled.txt + # google-auth +pycparser==2.21 ; platform_python_implementation != 'PyPy' \ + 
--hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ + --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 + # via + # -c python/requirements_compiled.txt + # cffi +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b + # via + # -c python/requirements_compiled.txt + # fastapi + # ray +pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + 
--hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + --hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + 
--hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + --hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + --hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + --hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + 
--hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + --hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + --hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + --hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + 
--hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + --hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + --hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d + # via + # -c python/requirements_compiled.txt + # pydantic +pygments==2.18.0 ; sys_platform != 'win32' \ + --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ + --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a + # via + # -c python/requirements_compiled.txt + # rich +pyopenssl==25.0.0 \ + --hash=sha256:424c247065e46e76a37411b9ab1782541c23bb658bf003772c3405fbaa128e90 \ + --hash=sha256:cd2cef799efa3936bb08e8ccb9433a575722b9dd986023f1cabc4ae64e9dac16 + # via + # -c python/requirements_compiled.txt + # ray +python-dateutil==2.8.2 \ + --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ + --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 + # via + # -c python/requirements_compiled.txt + # celery + # pandas +python-dotenv==1.1.1 \ + --hash=sha256:31f23644fe2602f88ff55e1f5c79ba497e01224ee7737937930c448e4d0e24dc \ + --hash=sha256:a8a6399716257f45be6a007360200409fce5cda2661e3dec71d23dc15f6189ab + # via uvicorn +pytz==2022.7.1 \ + --hash=sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0 \ + --hash=sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a + # via + # -c 
python/requirements_compiled.txt + # pandas +pyyaml==6.0.1 \ + --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ + --hash=sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc \ + --hash=sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df \ + --hash=sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741 \ + --hash=sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206 \ + --hash=sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27 \ + --hash=sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595 \ + --hash=sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62 \ + --hash=sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98 \ + --hash=sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696 \ + --hash=sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290 \ + --hash=sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9 \ + --hash=sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d \ + --hash=sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6 \ + --hash=sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867 \ + --hash=sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47 \ + --hash=sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486 \ + --hash=sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6 \ + --hash=sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3 \ + --hash=sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007 \ + --hash=sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938 \ + --hash=sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0 \ + --hash=sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c \ + 
--hash=sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735 \ + --hash=sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d \ + --hash=sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28 \ + --hash=sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4 \ + --hash=sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba \ + --hash=sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8 \ + --hash=sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef \ + --hash=sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5 \ + --hash=sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd \ + --hash=sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3 \ + --hash=sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0 \ + --hash=sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515 \ + --hash=sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c \ + --hash=sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c \ + --hash=sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924 \ + --hash=sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34 \ + --hash=sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43 \ + --hash=sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859 \ + --hash=sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673 \ + --hash=sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54 \ + --hash=sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a \ + --hash=sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b \ + --hash=sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab \ + --hash=sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa \ + 
--hash=sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c \ + --hash=sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585 \ + --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \ + --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f + # via + # -c python/requirements_compiled.txt + # ray + # uvicorn +ray==100.0.0.dev0 \ + --hash=sha256:9739ca053529f0ec60c6248748773470765550f34bb78502c55b913d65bb32eb +referencing==0.36.2 \ + --hash=sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa \ + --hash=sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 + # via + # -c python/requirements_compiled.txt + # jsonschema + # jsonschema-specifications +requests==2.32.3 \ + --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ + --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 + # via + # -c python/requirements_compiled.txt + # google-api-core + # ray +rich==13.3.2 ; sys_platform != 'win32' \ + --hash=sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001 \ + --hash=sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f + # via + # -c python/requirements_compiled.txt + # memray +rpds-py==0.22.3 \ + --hash=sha256:009de23c9c9ee54bf11303a966edf4d9087cd43a6003672e6aa7def643d06518 \ + --hash=sha256:02fbb9c288ae08bcb34fb41d516d5eeb0455ac35b5512d03181d755d80810059 \ + --hash=sha256:0a0461200769ab3b9ab7e513f6013b7a97fdeee41c29b9db343f3c5a8e2b9e61 \ + --hash=sha256:0b09865a9abc0ddff4e50b5ef65467cd94176bf1e0004184eb915cbc10fc05c5 \ + --hash=sha256:0b8db6b5b2d4491ad5b6bdc2bc7c017eec108acbf4e6785f42a9eb0ba234f4c9 \ + --hash=sha256:0c150c7a61ed4a4f4955a96626574e9baf1adf772c2fb61ef6a5027e52803543 \ + --hash=sha256:0f3cec041684de9a4684b1572fe28c7267410e02450f4561700ca5a3bc6695a2 \ + --hash=sha256:1352ae4f7c717ae8cba93421a63373e582d19d55d2ee2cbb184344c82d2ae55a \ + 
--hash=sha256:177c7c0fce2855833819c98e43c262007f42ce86651ffbb84f37883308cb0e7d \ + --hash=sha256:1978d0021e943aae58b9b0b196fb4895a25cc53d3956b8e35e0b7682eefb6d56 \ + --hash=sha256:1a60bce91f81ddaac922a40bbb571a12c1070cb20ebd6d49c48e0b101d87300d \ + --hash=sha256:1aef18820ef3e4587ebe8b3bc9ba6e55892a6d7b93bac6d29d9f631a3b4befbd \ + --hash=sha256:1e9663daaf7a63ceccbbb8e3808fe90415b0757e2abddbfc2e06c857bf8c5e2b \ + --hash=sha256:20070c65396f7373f5df4005862fa162db5d25d56150bddd0b3e8214e8ef45b4 \ + --hash=sha256:214b7a953d73b5e87f0ebece4a32a5bd83c60a3ecc9d4ec8f1dca968a2d91e99 \ + --hash=sha256:22bebe05a9ffc70ebfa127efbc429bc26ec9e9b4ee4d15a740033efda515cf3d \ + --hash=sha256:24e8abb5878e250f2eb0d7859a8e561846f98910326d06c0d51381fed59357bd \ + --hash=sha256:26fd7cac7dd51011a245f29a2cc6489c4608b5a8ce8d75661bb4a1066c52dfbe \ + --hash=sha256:27b1d3b3915a99208fee9ab092b8184c420f2905b7d7feb4aeb5e4a9c509b8a1 \ + --hash=sha256:27e98004595899949bd7a7b34e91fa7c44d7a97c40fcaf1d874168bb652ec67e \ + --hash=sha256:2b8f60e1b739a74bab7e01fcbe3dddd4657ec685caa04681df9d562ef15b625f \ + --hash=sha256:2de29005e11637e7a2361fa151f780ff8eb2543a0da1413bb951e9f14b699ef3 \ + --hash=sha256:2e8b55d8517a2fda8d95cb45d62a5a8bbf9dd0ad39c5b25c8833efea07b880ca \ + --hash=sha256:2fa4331c200c2521512595253f5bb70858b90f750d39b8cbfd67465f8d1b596d \ + --hash=sha256:3445e07bf2e8ecfeef6ef67ac83de670358abf2996916039b16a218e3d95e97e \ + --hash=sha256:3453e8d41fe5f17d1f8e9c383a7473cd46a63661628ec58e07777c2fff7196dc \ + --hash=sha256:378753b4a4de2a7b34063d6f95ae81bfa7b15f2c1a04a9518e8644e81807ebea \ + --hash=sha256:3af6e48651c4e0d2d166dc1b033b7042ea3f871504b6805ba5f4fe31581d8d38 \ + --hash=sha256:3dfcbc95bd7992b16f3f7ba05af8a64ca694331bd24f9157b49dadeeb287493b \ + --hash=sha256:3f21f0495edea7fdbaaa87e633a8689cd285f8f4af5c869f27bc8074638ad69c \ + --hash=sha256:4041711832360a9b75cfb11b25a6a97c8fb49c07b8bd43d0d02b45d0b499a4ff \ + --hash=sha256:44d61b4b7d0c2c9ac019c314e52d7cbda0ae31078aabd0f22e583af3e0d79723 \ + 
--hash=sha256:4617e1915a539a0d9a9567795023de41a87106522ff83fbfaf1f6baf8e85437e \ + --hash=sha256:4b232061ca880db21fa14defe219840ad9b74b6158adb52ddf0e87bead9e8493 \ + --hash=sha256:5246b14ca64a8675e0a7161f7af68fe3e910e6b90542b4bfb5439ba752191df6 \ + --hash=sha256:5725dd9cc02068996d4438d397e255dcb1df776b7ceea3b9cb972bdb11260a83 \ + --hash=sha256:583f6a1993ca3369e0f80ba99d796d8e6b1a3a2a442dd4e1a79e652116413091 \ + --hash=sha256:59259dc58e57b10e7e18ce02c311804c10c5a793e6568f8af4dead03264584d1 \ + --hash=sha256:593eba61ba0c3baae5bc9be2f5232430453fb4432048de28399ca7376de9c627 \ + --hash=sha256:59f4a79c19232a5774aee369a0c296712ad0e77f24e62cad53160312b1c1eaa1 \ + --hash=sha256:5f0e260eaf54380380ac3808aa4ebe2d8ca28b9087cf411649f96bad6900c728 \ + --hash=sha256:62d9cfcf4948683a18a9aff0ab7e1474d407b7bab2ca03116109f8464698ab16 \ + --hash=sha256:64607d4cbf1b7e3c3c8a14948b99345eda0e161b852e122c6bb71aab6d1d798c \ + --hash=sha256:655ca44a831ecb238d124e0402d98f6212ac527a0ba6c55ca26f616604e60a45 \ + --hash=sha256:666ecce376999bf619756a24ce15bb14c5bfaf04bf00abc7e663ce17c3f34fe7 \ + --hash=sha256:68049202f67380ff9aa52f12e92b1c30115f32e6895cd7198fa2a7961621fc5a \ + --hash=sha256:69803198097467ee7282750acb507fba35ca22cc3b85f16cf45fb01cb9097730 \ + --hash=sha256:6c7b99ca52c2c1752b544e310101b98a659b720b21db00e65edca34483259967 \ + --hash=sha256:6dd9412824c4ce1aca56c47b0991e65bebb7ac3f4edccfd3f156150c96a7bf25 \ + --hash=sha256:70eb60b3ae9245ddea20f8a4190bd79c705a22f8028aaf8bbdebe4716c3fab24 \ + --hash=sha256:70fb28128acbfd264eda9bf47015537ba3fe86e40d046eb2963d75024be4d055 \ + --hash=sha256:7b2513ba235829860b13faa931f3b6846548021846ac808455301c23a101689d \ + --hash=sha256:7ef9d9da710be50ff6809fed8f1963fecdfecc8b86656cadfca3bc24289414b0 \ + --hash=sha256:81e69b0a0e2537f26d73b4e43ad7bc8c8efb39621639b4434b76a3de50c6966e \ + --hash=sha256:8633e471c6207a039eff6aa116e35f69f3156b3989ea3e2d755f7bc41754a4a7 \ + --hash=sha256:8bd7c8cfc0b8247c8799080fbff54e0b9619e17cdfeb0478ba7295d43f635d7c \ + 
--hash=sha256:9253fc214112405f0afa7db88739294295f0e08466987f1d70e29930262b4c8f \ + --hash=sha256:99b37292234e61325e7a5bb9689e55e48c3f5f603af88b1642666277a81f1fbd \ + --hash=sha256:9bd7228827ec7bb817089e2eb301d907c0d9827a9e558f22f762bb690b131652 \ + --hash=sha256:9beeb01d8c190d7581a4d59522cd3d4b6887040dcfc744af99aa59fef3e041a8 \ + --hash=sha256:a63cbdd98acef6570c62b92a1e43266f9e8b21e699c363c0fef13bd530799c11 \ + --hash=sha256:a76e42402542b1fae59798fab64432b2d015ab9d0c8c47ba7addddbaf7952333 \ + --hash=sha256:ac0a03221cdb5058ce0167ecc92a8c89e8d0decdc9e99a2ec23380793c4dcb96 \ + --hash=sha256:b0b4136a252cadfa1adb705bb81524eee47d9f6aab4f2ee4fa1e9d3cd4581f64 \ + --hash=sha256:b25bc607423935079e05619d7de556c91fb6adeae9d5f80868dde3468657994b \ + --hash=sha256:b3d504047aba448d70cf6fa22e06cb09f7cbd761939fdd47604f5e007675c24e \ + --hash=sha256:bb47271f60660803ad11f4c61b42242b8c1312a31c98c578f79ef9387bbde21c \ + --hash=sha256:bbb232860e3d03d544bc03ac57855cd82ddf19c7a07651a7c0fdb95e9efea8b9 \ + --hash=sha256:bc27863442d388870c1809a87507727b799c8460573cfbb6dc0eeaef5a11b5ec \ + --hash=sha256:bc51abd01f08117283c5ebf64844a35144a0843ff7b2983e0648e4d3d9f10dbb \ + --hash=sha256:be2eb3f2495ba669d2a985f9b426c1797b7d48d6963899276d22f23e33d47e37 \ + --hash=sha256:bf9db5488121b596dbfc6718c76092fda77b703c1f7533a226a5a9f65248f8ad \ + --hash=sha256:c58e2339def52ef6b71b8f36d13c3688ea23fa093353f3a4fee2556e62086ec9 \ + --hash=sha256:cfbc454a2880389dbb9b5b398e50d439e2e58669160f27b60e5eca11f68ae17c \ + --hash=sha256:cff63a0272fcd259dcc3be1657b07c929c466b067ceb1c20060e8d10af56f5bf \ + --hash=sha256:d115bffdd417c6d806ea9069237a4ae02f513b778e3789a359bc5856e0404cc4 \ + --hash=sha256:d20cfb4e099748ea39e6f7b16c91ab057989712d31761d3300d43134e26e165f \ + --hash=sha256:d48424e39c2611ee1b84ad0f44fb3b2b53d473e65de061e3f460fc0be5f1939d \ + --hash=sha256:e0fa2d4ec53dc51cf7d3bb22e0aa0143966119f42a0c3e4998293a3dd2856b09 \ + --hash=sha256:e32fee8ab45d3c2db6da19a5323bc3362237c8b653c70194414b892fd06a080d \ + 
--hash=sha256:e35ba67d65d49080e8e5a1dd40101fccdd9798adb9b050ff670b7d74fa41c566 \ + --hash=sha256:e3fb866d9932a3d7d0c82da76d816996d1667c44891bd861a0f97ba27e84fc74 \ + --hash=sha256:e61b02c3f7a1e0b75e20c3978f7135fd13cb6cf551bf4a6d29b999a88830a338 \ + --hash=sha256:e67ba3c290821343c192f7eae1d8fd5999ca2dc99994114643e2f2d3e6138b15 \ + --hash=sha256:e79dd39f1e8c3504be0607e5fc6e86bb60fe3584bec8b782578c3b0fde8d932c \ + --hash=sha256:e89391e6d60251560f0a8f4bd32137b077a80d9b7dbe6d5cab1cd80d2746f648 \ + --hash=sha256:ea7433ce7e4bfc3a85654aeb6747babe3f66eaf9a1d0c1e7a4435bbdf27fea84 \ + --hash=sha256:eaf16ae9ae519a0e237a0f528fd9f0197b9bb70f40263ee57ae53c2b8d48aeb3 \ + --hash=sha256:eb0c341fa71df5a4595f9501df4ac5abfb5a09580081dffbd1ddd4654e6e9123 \ + --hash=sha256:f276b245347e6e36526cbd4a266a417796fc531ddf391e43574cf6466c492520 \ + --hash=sha256:f47ad3d5f3258bd7058d2d506852217865afefe6153a36eb4b6928758041d831 \ + --hash=sha256:f56a6b404f74ab372da986d240e2e002769a7d7102cc73eb238a4f72eec5284e \ + --hash=sha256:f5cf2a0c2bdadf3791b5c205d55a37a54025c6e18a71c71f82bb536cf9a454bf \ + --hash=sha256:f5d36399a1b96e1a5fdc91e0522544580dbebeb1f77f27b2b0ab25559e103b8b \ + --hash=sha256:f60bd8423be1d9d833f230fdbccf8f57af322d96bcad6599e5a771b151398eb2 \ + --hash=sha256:f612463ac081803f243ff13cccc648578e2279295048f2a8d5eb430af2bae6e3 \ + --hash=sha256:f73d3fef726b3243a811121de45193c0ca75f6407fe66f3f4e183c983573e130 \ + --hash=sha256:f82a116a1d03628a8ace4859556fb39fd1424c933341a08ea3ed6de1edb0283b \ + --hash=sha256:fb0ba113b4983beac1a2eb16faffd76cb41e176bf58c4afe3e14b9c681f702de \ + --hash=sha256:fb4f868f712b2dd4bcc538b0a0c1f63a2b1d584c925e69a224d759e7070a12d5 \ + --hash=sha256:fb6116dfb8d1925cbdb52595560584db42a7f664617a1f7d7f6e32f138cdf37d \ + --hash=sha256:fda7cb070f442bf80b642cd56483b5548e43d366fe3f39b98e67cce780cded00 \ + --hash=sha256:feea821ee2a9273771bae61194004ee2fc33f8ec7db08117ef9147d4bbcbca8e + # via + # -c python/requirements_compiled.txt + # jsonschema + # referencing +rsa==4.7.2 \ + 
--hash=sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2 \ + --hash=sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9 + # via + # -c python/requirements_compiled.txt + # google-auth +scipy==1.11.4 \ + --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ + --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ + --hash=sha256:1b7c3dca977f30a739e0409fb001056484661cb2541a01aba0bb0029f7b68db8 \ + --hash=sha256:2c6ff6ef9cc27f9b3db93a6f8b38f97387e6e0591600369a297a50a8e96e835d \ + --hash=sha256:36750b7733d960d7994888f0d148d31ea3017ac15eef664194b4ef68d36a4a97 \ + --hash=sha256:530f9ad26440e85766509dbf78edcfe13ffd0ab7fec2560ee5c36ff74d6269ff \ + --hash=sha256:5e347b14fe01003d3b78e196e84bd3f48ffe4c8a7b8a1afbcb8f5505cb710993 \ + --hash=sha256:6550466fbeec7453d7465e74d4f4b19f905642c89a7525571ee91dd7adabb5a3 \ + --hash=sha256:6df1468153a31cf55ed5ed39647279beb9cfb5d3f84369453b49e4b8502394fd \ + --hash=sha256:6e619aba2df228a9b34718efb023966da781e89dd3d21637b27f2e54db0410d7 \ + --hash=sha256:8fce70f39076a5aa62e92e69a7f62349f9574d8405c0a5de6ed3ef72de07f446 \ + --hash=sha256:90a2b78e7f5733b9de748f589f09225013685f9b218275257f8a8168ededaeaa \ + --hash=sha256:91af76a68eeae0064887a48e25c4e616fa519fa0d38602eda7e0f97d65d57937 \ + --hash=sha256:933baf588daa8dc9a92c20a0be32f56d43faf3d1a60ab11b3f08c356430f6e56 \ + --hash=sha256:acf8ed278cc03f5aff035e69cb511741e0418681d25fbbb86ca65429c4f4d9cd \ + --hash=sha256:ad669df80528aeca5f557712102538f4f37e503f0c5b9541655016dd0932ca79 \ + --hash=sha256:b030c6674b9230d37c5c60ab456e2cf12f6784596d15ce8da9365e70896effc4 \ + --hash=sha256:b9999c008ccf00e8fbcce1236f85ade5c569d13144f77a1946bef8863e8f6eb4 \ + --hash=sha256:bc9a714581f561af0848e6b69947fda0614915f072dfd14142ed1bfe1b806710 \ + --hash=sha256:ce7fff2e23ab2cc81ff452a9444c215c28e6305f396b2ba88343a567feec9660 \ + --hash=sha256:cf00bd2b1b0211888d4dc75656c0412213a8b25e80d73898083f402b50f47e41 \ + 
--hash=sha256:d10e45a6c50211fe256da61a11c34927c68f277e03138777bdebedd933712fea \ + --hash=sha256:ee410e6de8f88fd5cf6eadd73c135020bfbbbdfcd0f6162c36a7638a1ea8cc65 \ + --hash=sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be \ + --hash=sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec + # via + # -c python/requirements_compiled.txt + # ray +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via + # -c python/requirements_compiled.txt + # opencensus + # python-dateutil +smart-open==6.2.0 \ + --hash=sha256:088bf00f9327c71e549bc2f86567d3320df5d89667f009ce1c16568976068ef7 \ + --hash=sha256:1b4df5c8365218f3852c507451920ccad606c80b0acb4e67508e50ba9b5d2632 + # via + # -c python/requirements_compiled.txt + # ray +sniffio==1.3.1 \ + --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ + --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc + # via + # -c python/requirements_compiled.txt + # anyio +starlette==0.46.2 \ + --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ + --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5 + # via + # -c python/requirements_compiled.txt + # fastapi + # ray +tensorboardx==2.6.2.2 \ + --hash=sha256:160025acbf759ede23fd3526ae9d9bfbfd8b68eb16c38a010ebe326dc6395db8 \ + --hash=sha256:c6476d7cd0d529b0b72f4acadb1269f9ed8b22f441e87a84f2a3b940bb87b666 + # via + # -c python/requirements_compiled.txt + # ray +typing-extensions==4.12.2 \ + --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d + # via + # -c python/requirements_compiled.txt + # exceptiongroup + # fastapi + # gymnasium + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # pydantic + # pydantic-core + # pyopenssl + # referencing + # typing-inspection 
+typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c python/requirements_compiled.txt + # pydantic +tzdata==2025.2 \ + --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c python/requirements_compiled.txt + # kombu +urllib3==1.26.19 \ + --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ + --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 + # via + # -c python/requirements_compiled.txt + # requests +uvicorn==0.22.0 \ + --hash=sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8 \ + --hash=sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996 + # via + # -c python/requirements_compiled.txt + # ray +uvloop==0.21.0 ; platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32' \ + --hash=sha256:0878c2640cf341b269b7e128b1a5fed890adc4455513ca710d77d5e93aa6d6a0 \ + --hash=sha256:10d66943def5fcb6e7b37310eb6b5639fd2ccbc38df1177262b0640c3ca68c1f \ + --hash=sha256:10da8046cc4a8f12c91a1c39d1dd1585c41162a15caaef165c2174db9ef18bdc \ + --hash=sha256:17df489689befc72c39a08359efac29bbee8eee5209650d4b9f34df73d22e414 \ + --hash=sha256:183aef7c8730e54c9a3ee3227464daed66e37ba13040bb3f350bc2ddc040f22f \ + --hash=sha256:196274f2adb9689a289ad7d65700d37df0c0930fd8e4e743fa4834e850d7719d \ + --hash=sha256:221f4f2a1f46032b403bf3be628011caf75428ee3cc204a22addf96f586b19fd \ + --hash=sha256:2d1f581393673ce119355d56da84fe1dd9d2bb8b3d13ce792524e1607139feff \ + --hash=sha256:359ec2c888397b9e592a889c4d72ba3d6befba8b2bb01743f72fffbde663b59c \ + --hash=sha256:3bf12b0fda68447806a7ad847bfa591613177275d35b6724b1ee573faa3704e3 \ + 
--hash=sha256:4509360fcc4c3bd2c70d87573ad472de40c13387f5fda8cb58350a1d7475e58d \ + --hash=sha256:460def4412e473896ef179a1671b40c039c7012184b627898eea5072ef6f017a \ + --hash=sha256:461d9ae6660fbbafedd07559c6a2e57cd553b34b0065b6550685f6653a98c1cb \ + --hash=sha256:46923b0b5ee7fc0020bef24afe7836cb068f5050ca04caf6b487c513dc1a20b2 \ + --hash=sha256:53e420a3afe22cdcf2a0f4846e377d16e718bc70103d7088a4f7623567ba5fb0 \ + --hash=sha256:5ee4d4ef48036ff6e5cfffb09dd192c7a5027153948d85b8da7ff705065bacc6 \ + --hash=sha256:67dd654b8ca23aed0a8e99010b4c34aca62f4b7fce88f39d452ed7622c94845c \ + --hash=sha256:787ae31ad8a2856fc4e7c095341cccc7209bd657d0e71ad0dc2ea83c4a6fa8af \ + --hash=sha256:86975dca1c773a2c9864f4c52c5a55631038e387b47eaf56210f873887b6c8dc \ + --hash=sha256:87c43e0f13022b998eb9b973b5e97200c8b90823454d4bc06ab33829e09fb9bb \ + --hash=sha256:88cb67cdbc0e483da00af0b2c3cdad4b7c61ceb1ee0f33fe00e09c81e3a6cb75 \ + --hash=sha256:8a375441696e2eda1c43c44ccb66e04d61ceeffcd76e4929e527b7fa401b90fb \ + --hash=sha256:a5c39f217ab3c663dc699c04cbd50c13813e31d917642d459fdcec07555cc553 \ + --hash=sha256:b9fb766bb57b7388745d8bcc53a359b116b8a04c83a2288069809d2b3466c37e \ + --hash=sha256:baa0e6291d91649c6ba4ed4b2f982f9fa165b5bbd50a9e203c416a2797bab3c6 \ + --hash=sha256:baa4dcdbd9ae0a372f2167a207cd98c9f9a1ea1188a8a526431eef2f8116cc8d \ + --hash=sha256:bc09f0ff191e61c2d592a752423c767b4ebb2986daa9ed62908e2b1b9a9ae206 \ + --hash=sha256:bd53ecc9a0f3d87ab847503c2e1552b690362e005ab54e8a48ba97da3924c0dc \ + --hash=sha256:bfd55dfcc2a512316e65f16e503e9e450cab148ef11df4e4e679b5e8253a5281 \ + --hash=sha256:c097078b8031190c934ed0ebfee8cc5f9ba9642e6eb88322b9958b649750f72b \ + --hash=sha256:c0f3fa6200b3108919f8bdabb9a7f87f20e7097ea3c543754cabc7d717d95cf8 \ + --hash=sha256:e678ad6fe52af2c58d2ae3c73dc85524ba8abe637f134bf3564ed07f555c5e79 \ + --hash=sha256:ec7e6b09a6fdded42403182ab6b832b71f4edaf7f37a9a0e371a01db5f0cb45f \ + --hash=sha256:f0ce1b49560b1d2d8a2977e3ba4afb2414fb46b86a1b64056bc4ab929efdafbe \ + 
--hash=sha256:f38b2e090258d051d68a5b14d1da7203a3c3677321cf32a95a6f4db4dd8b6f26 \ + --hash=sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816 \ + --hash=sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2 + # via + # -c python/requirements_compiled.txt + # uvicorn +vine==5.1.0 \ + --hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c python/requirements_compiled.txt + # amqp + # celery + # kombu +virtualenv==20.29.1 \ + --hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ + --hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 + # via + # -c python/requirements_compiled.txt + # ray +watchfiles==0.19.0 \ + --hash=sha256:0089c6dc24d436b373c3c57657bf4f9a453b13767150d17284fc6162b2791911 \ + --hash=sha256:09ea3397aecbc81c19ed7f025e051a7387feefdb789cf768ff994c1228182fda \ + --hash=sha256:176a9a7641ec2c97b24455135d58012a5be5c6217fc4d5fef0b2b9f75dbf5154 \ + --hash=sha256:18b28f6ad871b82df9542ff958d0c86bb0d8310bb09eb8e87d97318a3b5273af \ + --hash=sha256:20b44221764955b1e703f012c74015306fb7e79a00c15370785f309b1ed9aa8d \ + --hash=sha256:3d7d267d27aceeeaa3de0dd161a0d64f0a282264d592e335fff7958cc0cbae7c \ + --hash=sha256:5471582658ea56fca122c0f0d0116a36807c63fefd6fdc92c71ca9a4491b6b48 \ + --hash=sha256:5569fc7f967429d4bc87e355cdfdcee6aabe4b620801e2cf5805ea245c06097c \ + --hash=sha256:68dce92b29575dda0f8d30c11742a8e2b9b8ec768ae414b54f7453f27bdf9545 \ + --hash=sha256:79c533ff593db861ae23436541f481ec896ee3da4e5db8962429b441bbaae16e \ + --hash=sha256:7f3920b1285a7d3ce898e303d84791b7bf40d57b7695ad549dc04e6a44c9f120 \ + --hash=sha256:91633e64712df3051ca454ca7d1b976baf842d7a3640b87622b323c55f3345e7 \ + --hash=sha256:945be0baa3e2440151eb3718fd8846751e8b51d8de7b884c90b17d271d34cae8 \ + --hash=sha256:9afd0d69429172c796164fd7fe8e821ade9be983f51c659a38da3faaaaac44dc \ + 
--hash=sha256:9c75eff897786ee262c9f17a48886f4e98e6cfd335e011c591c305e5d083c056 \ + --hash=sha256:b538014a87f94d92f98f34d3e6d2635478e6be6423a9ea53e4dd96210065e193 \ + --hash=sha256:b6577b8c6c8701ba8642ea9335a129836347894b666dd1ec2226830e263909d3 \ + --hash=sha256:c0376deac92377817e4fb8f347bf559b7d44ff556d9bc6f6208dd3f79f104aaf \ + --hash=sha256:cae3dde0b4b2078f31527acff6f486e23abed307ba4d3932466ba7cdd5ecec79 \ + --hash=sha256:cb5d45c4143c1dd60f98a16187fd123eda7248f84ef22244818c18d531a249d1 \ + --hash=sha256:d9b073073e048081e502b6c6b0b88714c026a1a4c890569238d04aca5f9ca74b \ + --hash=sha256:fac19dc9cbc34052394dbe81e149411a62e71999c0a19e1e09ce537867f95ae0 + # via + # -c python/requirements_compiled.txt + # ray + # uvicorn +wcwidth==0.2.13 \ + --hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \ + --hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5 + # via + # -c python/requirements_compiled.txt + # prompt-toolkit +websockets==11.0.3 \ + --hash=sha256:01f5567d9cf6f502d655151645d4e8b72b453413d3819d2b6f1185abc23e82dd \ + --hash=sha256:03aae4edc0b1c68498f41a6772d80ac7c1e33c06c6ffa2ac1c27a07653e79d6f \ + --hash=sha256:0ac56b661e60edd453585f4bd68eb6a29ae25b5184fd5ba51e97652580458998 \ + --hash=sha256:0ee68fe502f9031f19d495dae2c268830df2760c0524cbac5d759921ba8c8e82 \ + --hash=sha256:1553cb82942b2a74dd9b15a018dce645d4e68674de2ca31ff13ebc2d9f283788 \ + --hash=sha256:1a073fc9ab1c8aff37c99f11f1641e16da517770e31a37265d2755282a5d28aa \ + --hash=sha256:1d2256283fa4b7f4c7d7d3e84dc2ece74d341bce57d5b9bf385df109c2a1a82f \ + --hash=sha256:1d5023a4b6a5b183dc838808087033ec5df77580485fc533e7dab2567851b0a4 \ + --hash=sha256:1fdf26fa8a6a592f8f9235285b8affa72748dc12e964a5518c6c5e8f916716f7 \ + --hash=sha256:2529338a6ff0eb0b50c7be33dc3d0e456381157a31eefc561771ee431134a97f \ + --hash=sha256:279e5de4671e79a9ac877427f4ac4ce93751b8823f276b681d04b2156713b9dd \ + --hash=sha256:2d903ad4419f5b472de90cd2d40384573b25da71e33519a67797de17ef849b69 \ + 
--hash=sha256:332d126167ddddec94597c2365537baf9ff62dfcc9db4266f263d455f2f031cb \ + --hash=sha256:34fd59a4ac42dff6d4681d8843217137f6bc85ed29722f2f7222bd619d15e95b \ + --hash=sha256:3580dd9c1ad0701169e4d6fc41e878ffe05e6bdcaf3c412f9d559389d0c9e016 \ + --hash=sha256:3ccc8a0c387629aec40f2fc9fdcb4b9d5431954f934da3eaf16cdc94f67dbfac \ + --hash=sha256:41f696ba95cd92dc047e46b41b26dd24518384749ed0d99bea0a941ca87404c4 \ + --hash=sha256:42cc5452a54a8e46a032521d7365da775823e21bfba2895fb7b77633cce031bb \ + --hash=sha256:4841ed00f1026dfbced6fca7d963c4e7043aa832648671b5138008dc5a8f6d99 \ + --hash=sha256:4b253869ea05a5a073ebfdcb5cb3b0266a57c3764cf6fe114e4cd90f4bfa5f5e \ + --hash=sha256:54c6e5b3d3a8936a4ab6870d46bdd6ec500ad62bde9e44462c32d18f1e9a8e54 \ + --hash=sha256:619d9f06372b3a42bc29d0cd0354c9bb9fb39c2cbc1a9c5025b4538738dbffaf \ + --hash=sha256:6505c1b31274723ccaf5f515c1824a4ad2f0d191cec942666b3d0f3aa4cb4007 \ + --hash=sha256:660e2d9068d2bedc0912af508f30bbeb505bbbf9774d98def45f68278cea20d3 \ + --hash=sha256:6681ba9e7f8f3b19440921e99efbb40fc89f26cd71bf539e45d8c8a25c976dc6 \ + --hash=sha256:68b977f21ce443d6d378dbd5ca38621755f2063d6fdb3335bda981d552cfff86 \ + --hash=sha256:69269f3a0b472e91125b503d3c0b3566bda26da0a3261c49f0027eb6075086d1 \ + --hash=sha256:6f1a3f10f836fab6ca6efa97bb952300b20ae56b409414ca85bff2ad241d2a61 \ + --hash=sha256:7622a89d696fc87af8e8d280d9b421db5133ef5b29d3f7a1ce9f1a7bf7fcfa11 \ + --hash=sha256:777354ee16f02f643a4c7f2b3eff8027a33c9861edc691a2003531f5da4f6bc8 \ + --hash=sha256:84d27a4832cc1a0ee07cdcf2b0629a8a72db73f4cf6de6f0904f6661227f256f \ + --hash=sha256:8531fdcad636d82c517b26a448dcfe62f720e1922b33c81ce695d0edb91eb931 \ + --hash=sha256:86d2a77fd490ae3ff6fae1c6ceaecad063d3cc2320b44377efdde79880e11526 \ + --hash=sha256:88fc51d9a26b10fc331be344f1781224a375b78488fc343620184e95a4b27016 \ + --hash=sha256:8a34e13a62a59c871064dfd8ffb150867e54291e46d4a7cf11d02c94a5275bae \ + --hash=sha256:8c82f11964f010053e13daafdc7154ce7385ecc538989a354ccc7067fd7028fd \ + 
--hash=sha256:92b2065d642bf8c0a82d59e59053dd2fdde64d4ed44efe4870fa816c1232647b \ + --hash=sha256:97b52894d948d2f6ea480171a27122d77af14ced35f62e5c892ca2fae9344311 \ + --hash=sha256:9d9acd80072abcc98bd2c86c3c9cd4ac2347b5a5a0cae7ed5c0ee5675f86d9af \ + --hash=sha256:9f59a3c656fef341a99e3d63189852be7084c0e54b75734cde571182c087b152 \ + --hash=sha256:aa5003845cdd21ac0dc6c9bf661c5beddd01116f6eb9eb3c8e272353d45b3288 \ + --hash=sha256:b16fff62b45eccb9c7abb18e60e7e446998093cdcb50fed33134b9b6878836de \ + --hash=sha256:b30c6590146e53149f04e85a6e4fcae068df4289e31e4aee1fdf56a0dead8f97 \ + --hash=sha256:b58cbf0697721120866820b89f93659abc31c1e876bf20d0b3d03cef14faf84d \ + --hash=sha256:b67c6f5e5a401fc56394f191f00f9b3811fe843ee93f4a70df3c389d1adf857d \ + --hash=sha256:bceab846bac555aff6427d060f2fcfff71042dba6f5fca7dc4f75cac815e57ca \ + --hash=sha256:bee9fcb41db2a23bed96c6b6ead6489702c12334ea20a297aa095ce6d31370d0 \ + --hash=sha256:c114e8da9b475739dde229fd3bc6b05a6537a88a578358bc8eb29b4030fac9c9 \ + --hash=sha256:c1f0524f203e3bd35149f12157438f406eff2e4fb30f71221c8a5eceb3617b6b \ + --hash=sha256:c792ea4eabc0159535608fc5658a74d1a81020eb35195dd63214dcf07556f67e \ + --hash=sha256:c7f3cb904cce8e1be667c7e6fef4516b98d1a6a0635a58a57528d577ac18a128 \ + --hash=sha256:d67ac60a307f760c6e65dad586f556dde58e683fab03323221a4e530ead6f74d \ + --hash=sha256:dcacf2c7a6c3a84e720d1bb2b543c675bf6c40e460300b628bab1b1efc7c034c \ + --hash=sha256:de36fe9c02995c7e6ae6efe2e205816f5f00c22fd1fbf343d4d18c3d5ceac2f5 \ + --hash=sha256:def07915168ac8f7853812cc593c71185a16216e9e4fa886358a17ed0fd9fcf6 \ + --hash=sha256:df41b9bc27c2c25b486bae7cf42fccdc52ff181c8c387bfd026624a491c2671b \ + --hash=sha256:e052b8467dd07d4943936009f46ae5ce7b908ddcac3fda581656b1b19c083d9b \ + --hash=sha256:e063b1865974611313a3849d43f2c3f5368093691349cf3c7c8f8f75ad7cb280 \ + --hash=sha256:e1459677e5d12be8bbc7584c35b992eea142911a6236a3278b9b5ce3326f282c \ + --hash=sha256:e1a99a7a71631f0efe727c10edfba09ea6bee4166a6f9c19aafb6c0b5917d09c \ + 
--hash=sha256:e590228200fcfc7e9109509e4d9125eace2042fd52b595dd22bbc34bb282307f \ + --hash=sha256:e6316827e3e79b7b8e7d8e3b08f4e331af91a48e794d5d8b099928b6f0b85f20 \ + --hash=sha256:e7837cb169eca3b3ae94cc5787c4fed99eef74c0ab9506756eea335e0d6f3ed8 \ + --hash=sha256:e848f46a58b9fcf3d06061d17be388caf70ea5b8cc3466251963c8345e13f7eb \ + --hash=sha256:ed058398f55163a79bb9f06a90ef9ccc063b204bb346c4de78efc5d15abfe602 \ + --hash=sha256:f2e58f2c36cc52d41f2659e4c0cbf7353e28c8c9e63e30d8c6d3494dc9fdedcf \ + --hash=sha256:f467ba0050b7de85016b43f5a22b46383ef004c4f672148a8abf32bc999a87f0 \ + --hash=sha256:f61bdb1df43dc9c131791fbc2355535f9024b9a04398d3bd0684fc16ab07df74 \ + --hash=sha256:fb06eea71a00a7af0ae6aefbb932fb8a7df3cb390cc217d51a9ad7343de1b8d0 \ + --hash=sha256:ffd7dcaf744f25f82190856bc26ed81721508fc5cbf2a330751e135ff1283564 + # via + # -c python/requirements_compiled.txt + # uvicorn +yarl==1.18.3 \ + --hash=sha256:00e5a1fea0fd4f5bfa7440a47eff01d9822a65b4488f7cff83155a0f31a2ecba \ + --hash=sha256:02ddb6756f8f4517a2d5e99d8b2f272488e18dd0bfbc802f31c16c6c20f22193 \ + --hash=sha256:045b8482ce9483ada4f3f23b3774f4e1bf4f23a2d5c912ed5170f68efb053318 \ + --hash=sha256:09c7907c8548bcd6ab860e5f513e727c53b4a714f459b084f6580b49fa1b9cee \ + --hash=sha256:0b0cad37311123211dc91eadcb322ef4d4a66008d3e1bdc404808992260e1a0e \ + --hash=sha256:0b3c92fa08759dbf12b3a59579a4096ba9af8dd344d9a813fc7f5070d86bbab1 \ + --hash=sha256:0fb2171a4486bb075316ee754c6d8382ea6eb8b399d4ec62fde2b591f879778a \ + --hash=sha256:1a74a13a4c857a84a845505fd2d68e54826a2cd01935a96efb1e9d86c728e186 \ + --hash=sha256:1d407181cfa6e70077df3377938c08012d18893f9f20e92f7d2f314a437c30b1 \ + --hash=sha256:1dd4bdd05407ced96fed3d7f25dbbf88d2ffb045a0db60dbc247f5b3c5c25d50 \ + --hash=sha256:25b411eddcfd56a2f0cd6a384e9f4f7aa3efee14b188de13048c25b5e91f1640 \ + --hash=sha256:2d06d3005e668744e11ed80812e61efd77d70bb7f03e33c1598c301eea20efbb \ + --hash=sha256:2ec9bbba33b2d00999af4631a3397d1fd78290c48e2a3e52d8dd72db3a067ac8 \ + 
--hash=sha256:3236da9272872443f81fedc389bace88408f64f89f75d1bdb2256069a8730ccc \ + --hash=sha256:35098b24e0327fc4ebdc8ffe336cee0a87a700c24ffed13161af80124b7dc8e5 \ + --hash=sha256:41f7ce59d6ee7741af71d82020346af364949314ed3d87553763a2df1829cc58 \ + --hash=sha256:436c4fc0a4d66b2badc6c5fc5ef4e47bb10e4fd9bf0c79524ac719a01f3607c2 \ + --hash=sha256:4891ed92157e5430874dad17b15eb1fda57627710756c27422200c52d8a4e393 \ + --hash=sha256:4ac515b860c36becb81bb84b667466885096b5fc85596948548b667da3bf9f24 \ + --hash=sha256:5094d9206c64181d0f6e76ebd8fb2f8fe274950a63890ee9e0ebfd58bf9d787b \ + --hash=sha256:54d6921f07555713b9300bee9c50fb46e57e2e639027089b1d795ecd9f7fa910 \ + --hash=sha256:578e281c393af575879990861823ef19d66e2b1d0098414855dd367e234f5b3c \ + --hash=sha256:5a3f356548e34a70b0172d8890006c37be92995f62d95a07b4a42e90fba54272 \ + --hash=sha256:602d98f2c2d929f8e697ed274fbadc09902c4025c5a9963bf4e9edfc3ab6f7ed \ + --hash=sha256:61b1a825a13bef4a5f10b1885245377d3cd0bf87cba068e1d9a88c2ae36880e1 \ + --hash=sha256:61e5e68cb65ac8f547f6b5ef933f510134a6bf31bb178be428994b0cb46c2a04 \ + --hash=sha256:61ee62ead9b68b9123ec24bc866cbef297dd266175d53296e2db5e7f797f902d \ + --hash=sha256:6333c5a377c8e2f5fae35e7b8f145c617b02c939d04110c76f29ee3676b5f9a5 \ + --hash=sha256:6748dbf9bfa5ba1afcc7556b71cda0d7ce5f24768043a02a58846e4a443d808d \ + --hash=sha256:67a283dd2882ac98cc6318384f565bffc751ab564605959df4752d42483ad889 \ + --hash=sha256:75674776d96d7b851b6498f17824ba17849d790a44d282929c42dbb77d4f17ae \ + --hash=sha256:757e81cae69244257d125ff31663249b3013b5dc0a8520d73694aed497fb195b \ + --hash=sha256:77a6e85b90a7641d2e07184df5557132a337f136250caafc9ccaa4a2a998ca2c \ + --hash=sha256:7c33dd1931a95e5d9a772d0ac5e44cac8957eaf58e3c8da8c1414de7dd27c576 \ + --hash=sha256:7df647e8edd71f000a5208fe6ff8c382a1de8edfbccdbbfe649d263de07d8c34 \ + --hash=sha256:7e2ee16578af3b52ac2f334c3b1f92262f47e02cc6193c598502bd46f5cd1477 \ + --hash=sha256:80316a8bd5109320d38eef8833ccf5f89608c9107d02d2a7f985f98ed6876990 \ + 
--hash=sha256:82123d0c954dc58db301f5021a01854a85bf1f3bb7d12ae0c01afc414a882ca2 \ + --hash=sha256:84b2deecba4a3f1a398df819151eb72d29bfeb3b69abb145a00ddc8d30094512 \ + --hash=sha256:8503ad47387b8ebd39cbbbdf0bf113e17330ffd339ba1144074da24c545f0069 \ + --hash=sha256:877d209b6aebeb5b16c42cbb377f5f94d9e556626b1bfff66d7b0d115be88d0a \ + --hash=sha256:8874027a53e3aea659a6d62751800cf6e63314c160fd607489ba5c2edd753cf6 \ + --hash=sha256:88a19f62ff30117e706ebc9090b8ecc79aeb77d0b1f5ec10d2d27a12bc9f66d0 \ + --hash=sha256:8d39d351e7faf01483cc7ff7c0213c412e38e5a340238826be7e0e4da450fdc8 \ + --hash=sha256:90adb47ad432332d4f0bc28f83a5963f426ce9a1a8809f5e584e704b82685dcb \ + --hash=sha256:913829534200eb0f789d45349e55203a091f45c37a2674678744ae52fae23efa \ + --hash=sha256:93b2e109287f93db79210f86deb6b9bbb81ac32fc97236b16f7433db7fc437d8 \ + --hash=sha256:9d41beda9dc97ca9ab0b9888cb71f7539124bc05df02c0cff6e5acc5a19dcc6e \ + --hash=sha256:a440a2a624683108a1b454705ecd7afc1c3438a08e890a1513d468671d90a04e \ + --hash=sha256:a4bb030cf46a434ec0225bddbebd4b89e6471814ca851abb8696170adb163985 \ + --hash=sha256:a9ca04806f3be0ac6d558fffc2fdf8fcef767e0489d2684a21912cc4ed0cd1b8 \ + --hash=sha256:ac1801c45cbf77b6c99242eeff4fffb5e4e73a800b5c4ad4fc0be5def634d2e1 \ + --hash=sha256:ac36703a585e0929b032fbaab0707b75dc12703766d0b53486eabd5139ebadd5 \ + --hash=sha256:b1771de9944d875f1b98a745bc547e684b863abf8f8287da8466cf470ef52690 \ + --hash=sha256:b464c4ab4bfcb41e3bfd3f1c26600d038376c2de3297760dfe064d2cb7ea8e10 \ + --hash=sha256:b4f6450109834af88cb4cc5ecddfc5380ebb9c228695afc11915a0bf82116789 \ + --hash=sha256:b57f4f58099328dfb26c6a771d09fb20dbbae81d20cfb66141251ea063bd101b \ + --hash=sha256:b643562c12680b01e17239be267bc306bbc6aac1f34f6444d1bded0c5ce438ca \ + --hash=sha256:b958ddd075ddba5b09bb0be8a6d9906d2ce933aee81100db289badbeb966f54e \ + --hash=sha256:b9d60031cf568c627d028239693fd718025719c02c9f55df0a53e587aab951b5 \ + --hash=sha256:ba23302c0c61a9999784e73809427c9dbedd79f66a13d84ad1b1943802eaaf59 \ + 
--hash=sha256:ba87babd629f8af77f557b61e49e7c7cac36f22f871156b91e10a6e9d4f829e9 \ + --hash=sha256:c017a3b6df3a1bd45b9fa49a0f54005e53fbcad16633870104b66fa1a30a29d8 \ + --hash=sha256:c1e1cc06da1491e6734f0ea1e6294ce00792193c463350626571c287c9a704db \ + --hash=sha256:c654d5207c78e0bd6d749f6dae1dcbbfde3403ad3a4b11f3c5544d9906969dde \ + --hash=sha256:c69697d3adff5aa4f874b19c0e4ed65180ceed6318ec856ebc423aa5850d84f7 \ + --hash=sha256:c7d79f7d9aabd6011004e33b22bc13056a3e3fb54794d138af57f5ee9d9032cb \ + --hash=sha256:ccaa3a4b521b780a7e771cc336a2dba389a0861592bbce09a476190bb0c8b4b3 \ + --hash=sha256:ccd17349166b1bee6e529b4add61727d3f55edb7babbe4069b5764c9587a8cc6 \ + --hash=sha256:ce1af883b94304f493698b00d0f006d56aea98aeb49d75ec7d98cd4a777e9285 \ + --hash=sha256:d0e883008013c0e4aef84dcfe2a0b172c4d23c2669412cf5b3371003941f72bb \ + --hash=sha256:d980e0325b6eddc81331d3f4551e2a333999fb176fd153e075c6d1c2530aa8a8 \ + --hash=sha256:e17c9361d46a4d5addf777c6dd5eab0715a7684c2f11b88c67ac37edfba6c482 \ + --hash=sha256:e2c08cc9b16f4f4bc522771d96734c7901e7ebef70c6c5c35dd0f10845270bcd \ + --hash=sha256:e35ef8683211db69ffe129a25d5634319a677570ab6b2eba4afa860f54eeaf75 \ + --hash=sha256:e3b9fd71836999aad54084906f8663dffcd2a7fb5cdafd6c37713b2e72be1760 \ + --hash=sha256:ef9f7768395923c3039055c14334ba4d926f3baf7b776c923c93d80195624782 \ + --hash=sha256:f52a265001d830bc425f82ca9eabda94a64a4d753b07d623a9f2863fde532b53 \ + --hash=sha256:f91c4803173928a25e1a55b943c81f55b8872f0018be83e3ad4938adffb77dd2 \ + --hash=sha256:fbd6748e8ab9b41171bb95c6142faf068f5ef1511935a0aa07025438dd9a9bc1 \ + --hash=sha256:fe57328fbc1bfd0bd0514470ac692630f3901c0ee39052ae47acd1d90a436719 \ + --hash=sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62 + # via + # -c python/requirements_compiled.txt + # aiohttp +zipp==3.19.2 \ + --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c + # via + # -c python/requirements_compiled.txt + # importlib-metadata diff --git 
a/python/deplocks/ray_img/ray_img_py311.lock b/python/deplocks/ray_img/ray_img_py311.lock new file mode 100644 index 000000000000..c01332ec3f9a --- /dev/null +++ b/python/deplocks/ray_img/ray_img_py311.lock @@ -0,0 +1,2162 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --python-version=3.11 --find-links=.whl/ -c python/requirements_compiled.txt - -o python/deplocks/ray_img/ray_img_py311.lock +--index-url https://pypi.org/simple +--extra-index-url https://download.pytorch.org/whl/cpu +--find-links .whl/ +--find-links https://data.pyg.org/whl/torch-2.3.0+cpu.html + +aiohappyeyeballs==2.6.1 \ + --hash=sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558 \ + --hash=sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8 + # via + # -c python/requirements_compiled.txt + # aiohttp +aiohttp==3.11.16 \ + --hash=sha256:004511d3413737700835e949433536a2fe95a7d0297edd911a1e9705c5b5ea43 \ + --hash=sha256:0902e887b0e1d50424112f200eb9ae3dfed6c0d0a19fc60f633ae5a57c809656 \ + --hash=sha256:09b00dd520d88eac9d1768439a59ab3d145065c91a8fab97f900d1b5f802895e \ + --hash=sha256:0a2f451849e6b39e5c226803dcacfa9c7133e9825dcefd2f4e837a2ec5a3bb98 \ + --hash=sha256:0a950c2eb8ff17361abd8c85987fd6076d9f47d040ebffce67dce4993285e973 \ + --hash=sha256:0ad1fb47da60ae1ddfb316f0ff16d1f3b8e844d1a1e154641928ea0583d486ed \ + --hash=sha256:13ceac2c5cdcc3f64b9015710221ddf81c900c5febc505dbd8f810e770011540 \ + --hash=sha256:14461157d8426bcb40bd94deb0450a6fa16f05129f7da546090cebf8f3123b0f \ + --hash=sha256:16f8a2c9538c14a557b4d309ed4d0a7c60f0253e8ed7b6c9a2859a7582f8b1b8 \ + --hash=sha256:17ae4664031aadfbcb34fd40ffd90976671fa0c0286e6c4113989f78bebab37a \ + 
--hash=sha256:1ce63ae04719513dd2651202352a2beb9f67f55cb8490c40f056cea3c5c355ce \ + --hash=sha256:23a15727fbfccab973343b6d1b7181bfb0b4aa7ae280f36fd2f90f5476805682 \ + --hash=sha256:2540ddc83cc724b13d1838026f6a5ad178510953302a49e6d647f6e1de82bc34 \ + --hash=sha256:37dcee4906454ae377be5937ab2a66a9a88377b11dd7c072df7a7c142b63c37c \ + --hash=sha256:38bea84ee4fe24ebcc8edeb7b54bf20f06fd53ce4d2cc8b74344c5b9620597fd \ + --hash=sha256:3ab3367bb7f61ad18793fea2ef71f2d181c528c87948638366bf1de26e239183 \ + --hash=sha256:3ad1d59fd7114e6a08c4814983bb498f391c699f3c78712770077518cae63ff7 \ + --hash=sha256:3b4e6db8dc4879015b9955778cfb9881897339c8fab7b3676f8433f849425913 \ + --hash=sha256:3e061b09f6fa42997cf627307f220315e313ece74907d35776ec4373ed718b86 \ + --hash=sha256:42864e70a248f5f6a49fdaf417d9bc62d6e4d8ee9695b24c5916cb4bb666c802 \ + --hash=sha256:493910ceb2764f792db4dc6e8e4b375dae1b08f72e18e8f10f18b34ca17d0979 \ + --hash=sha256:4d0c970c0d602b1017e2067ff3b7dac41c98fef4f7472ec2ea26fd8a4e8c2149 \ + --hash=sha256:54eb3aead72a5c19fad07219acd882c1643a1027fbcdefac9b502c267242f955 \ + --hash=sha256:56a3443aca82abda0e07be2e1ecb76a050714faf2be84256dae291182ba59049 \ + --hash=sha256:576f5ca28d1b3276026f7df3ec841ae460e0fc3aac2a47cbf72eabcfc0f102e1 \ + --hash=sha256:58ede86453a6cf2d6ce40ef0ca15481677a66950e73b0a788917916f7e35a0bb \ + --hash=sha256:61c721764e41af907c9d16b6daa05a458f066015abd35923051be8705108ed17 \ + --hash=sha256:634d96869be6c4dc232fc503e03e40c42d32cfaa51712aee181e922e61d74814 \ + --hash=sha256:696ef00e8a1f0cec5e30640e64eca75d8e777933d1438f4facc9c0cdf288a810 \ + --hash=sha256:69a2cbd61788d26f8f1e626e188044834f37f6ae3f937bd9f08b65fc9d7e514e \ + --hash=sha256:6a792ce34b999fbe04a7a71a90c74f10c57ae4c51f65461a411faa70e154154e \ + --hash=sha256:6ac13b71761e49d5f9e4d05d33683bbafef753e876e8e5a7ef26e937dd766713 \ + --hash=sha256:6fdec0213244c39973674ca2a7f5435bf74369e7d4e104d6c7473c81c9bcc8c4 \ + --hash=sha256:72b1b03fb4655c1960403c131740755ec19c5898c82abd3961c364c2afd59fe7 \ + 
--hash=sha256:745f1ed5e2c687baefc3c5e7b4304e91bf3e2f32834d07baaee243e349624b24 \ + --hash=sha256:776c8e959a01e5e8321f1dec77964cb6101020a69d5a94cd3d34db6d555e01f7 \ + --hash=sha256:780df0d837276276226a1ff803f8d0fa5f8996c479aeef52eb040179f3156cbd \ + --hash=sha256:78e6e23b954644737e385befa0deb20233e2dfddf95dd11e9db752bdd2a294d3 \ + --hash=sha256:7951decace76a9271a1ef181b04aa77d3cc309a02a51d73826039003210bdc86 \ + --hash=sha256:7ba92a2d9ace559a0a14b03d87f47e021e4fa7681dc6970ebbc7b447c7d4b7cd \ + --hash=sha256:7f6428fee52d2bcf96a8aa7b62095b190ee341ab0e6b1bcf50c615d7966fd45b \ + --hash=sha256:87944bd16b7fe6160607f6a17808abd25f17f61ae1e26c47a491b970fb66d8cb \ + --hash=sha256:87a6e922b2b2401e0b0cf6b976b97f11ec7f136bfed445e16384fbf6fd5e8602 \ + --hash=sha256:8cb0688a8d81c63d716e867d59a9ccc389e97ac7037ebef904c2b89334407180 \ + --hash=sha256:8df6612df74409080575dca38a5237282865408016e65636a76a2eb9348c2567 \ + --hash=sha256:911a6e91d08bb2c72938bc17f0a2d97864c531536b7832abee6429d5296e5b27 \ + --hash=sha256:92b7ee222e2b903e0a4b329a9943d432b3767f2d5029dbe4ca59fb75223bbe2e \ + --hash=sha256:938f756c2b9374bbcc262a37eea521d8a0e6458162f2a9c26329cc87fdf06534 \ + --hash=sha256:9756d9b9d4547e091f99d554fbba0d2a920aab98caa82a8fb3d3d9bee3c9ae85 \ + --hash=sha256:98b88a2bf26965f2015a771381624dd4b0839034b70d406dc74fd8be4cc053e3 \ + --hash=sha256:9b751a6306f330801665ae69270a8a3993654a85569b3469662efaad6cf5cc50 \ + --hash=sha256:a2a450bcce4931b295fc0848f384834c3f9b00edfc2150baafb4488c27953de6 \ + --hash=sha256:a3814760a1a700f3cfd2f977249f1032301d0a12c92aba74605cfa6ce9f78489 \ + --hash=sha256:a5abcbba9f4b463a45c8ca8b7720891200658f6f46894f79517e6cd11f3405ca \ + --hash=sha256:a6db7458ab89c7d80bc1f4e930cc9df6edee2200127cfa6f6e080cf619eddfbd \ + --hash=sha256:ad497f38a0d6c329cb621774788583ee12321863cd4bd9feee1effd60f2ad133 \ + --hash=sha256:ad9509ffb2396483ceacb1eee9134724443ee45b92141105a4645857244aecc8 \ + --hash=sha256:bbcba75fe879ad6fd2e0d6a8d937f34a571f116a0e4db37df8079e738ea95c71 \ + 
--hash=sha256:c10d85e81d0b9ef87970ecbdbfaeec14a361a7fa947118817fcea8e45335fa46 \ + --hash=sha256:c15b2271c44da77ee9d822552201180779e5e942f3a71fb74e026bf6172ff287 \ + --hash=sha256:ca37057625693d097543bd88076ceebeb248291df9d6ca8481349efc0b05dcd0 \ + --hash=sha256:cc3a145479a76ad0ed646434d09216d33d08eef0d8c9a11f5ae5cdc37caa3540 \ + --hash=sha256:ccf10f16ab498d20e28bc2b5c1306e9c1512f2840f7b6a67000a517a4b37d5ee \ + --hash=sha256:cd464ba806e27ee24a91362ba3621bfc39dbbb8b79f2e1340201615197370f7c \ + --hash=sha256:d007aa39a52d62373bd23428ba4a2546eed0e7643d7bf2e41ddcefd54519842c \ + --hash=sha256:d0666afbe984f6933fe72cd1f1c3560d8c55880a0bdd728ad774006eb4241ecd \ + --hash=sha256:d07502cc14ecd64f52b2a74ebbc106893d9a9717120057ea9ea1fd6568a747e7 \ + --hash=sha256:d489d9778522fbd0f8d6a5c6e48e3514f11be81cb0a5954bdda06f7e1594b321 \ + --hash=sha256:df7db76400bf46ec6a0a73192b14c8295bdb9812053f4fe53f4e789f3ea66bbb \ + --hash=sha256:e3538bc9fe1b902bef51372462e3d7c96fce2b566642512138a480b7adc9d508 \ + --hash=sha256:e87fd812899aa78252866ae03a048e77bd11b80fb4878ce27c23cade239b42b2 \ + --hash=sha256:ecdb8173e6c7aa09eee342ac62e193e6904923bd232e76b4157ac0bfa670609f \ + --hash=sha256:f244b8e541f414664889e2c87cac11a07b918cb4b540c36f7ada7bfa76571ea2 \ + --hash=sha256:f4065145bf69de124accdd17ea5f4dc770da0a6a6e440c53f6e0a8c27b3e635c \ + --hash=sha256:f420bfe862fb357a6d76f2065447ef6f484bc489292ac91e29bc65d2d7a2c84d \ + --hash=sha256:f6ddd90d9fb4b501c97a4458f1c1720e42432c26cb76d28177c5b5ad4e332601 \ + --hash=sha256:fa73e8c2656a3653ae6c307b3f4e878a21f87859a9afab228280ddccd7369d71 \ + --hash=sha256:fadbb8f1d4140825069db3fedbbb843290fd5f5bc0a5dbd7eaf81d91bf1b003b \ + --hash=sha256:fb3d0cc5cdb926090748ea60172fa8a213cec728bd6c54eae18b96040fcd6227 \ + --hash=sha256:fb46bb0f24813e6cede6cc07b1961d4b04f331f7112a23b5e21f567da4ee50aa \ + --hash=sha256:fd36c119c5d6551bce374fcb5c19269638f8d09862445f85a5a48596fd59f4bb + # via + # -c python/requirements_compiled.txt + # aiohttp-cors + # ray +aiohttp-cors==0.7.0 
\ + --hash=sha256:0451ba59fdf6909d0e2cd21e4c0a43752bc0703d33fc78ae94d9d9321710193e \ + --hash=sha256:4d39c6d7100fd9764ed1caf8cebf0eb01bf5e3f24e2e073fda6234bc48b19f5d + # via + # -c python/requirements_compiled.txt + # ray +aiosignal==1.3.1 \ + --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ + --hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 + # via + # -c python/requirements_compiled.txt + # aiohttp +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c python/requirements_compiled.txt + # kombu +annotated-types==0.6.0 \ + --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ + --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d + # via + # -c python/requirements_compiled.txt + # pydantic +anyio==3.7.1 \ + --hash=sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780 \ + --hash=sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5 + # via + # -c python/requirements_compiled.txt + # starlette + # watchfiles +attrs==25.1.0 \ + --hash=sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e \ + --hash=sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a + # via + # -c python/requirements_compiled.txt + # aiohttp + # jsonschema + # referencing +billiard==4.2.1 \ + --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c python/requirements_compiled.txt + # celery +cachetools==5.5.2 \ + --hash=sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4 \ + --hash=sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a + # via + # -c python/requirements_compiled.txt + # google-auth +celery==5.5.3 \ + 
--hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c python/requirements_compiled.txt + # ray +certifi==2025.1.31 \ + --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ + --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe + # via + # -c python/requirements_compiled.txt + # requests +cffi==1.16.0 ; platform_python_implementation != 'PyPy' \ + --hash=sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc \ + --hash=sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a \ + --hash=sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417 \ + --hash=sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab \ + --hash=sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520 \ + --hash=sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36 \ + --hash=sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743 \ + --hash=sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8 \ + --hash=sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed \ + --hash=sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684 \ + --hash=sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56 \ + --hash=sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324 \ + --hash=sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d \ + --hash=sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235 \ + --hash=sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e \ + --hash=sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088 \ + --hash=sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000 \ + 
--hash=sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7 \ + --hash=sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e \ + --hash=sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673 \ + --hash=sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c \ + --hash=sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe \ + --hash=sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2 \ + --hash=sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098 \ + --hash=sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8 \ + --hash=sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a \ + --hash=sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0 \ + --hash=sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b \ + --hash=sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896 \ + --hash=sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e \ + --hash=sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9 \ + --hash=sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2 \ + --hash=sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b \ + --hash=sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6 \ + --hash=sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404 \ + --hash=sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f \ + --hash=sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0 \ + --hash=sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4 \ + --hash=sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc \ + --hash=sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936 \ + --hash=sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba \ + 
--hash=sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872 \ + --hash=sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb \ + --hash=sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614 \ + --hash=sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1 \ + --hash=sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d \ + --hash=sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969 \ + --hash=sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b \ + --hash=sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4 \ + --hash=sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627 \ + --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ + --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 + # via + # -c python/requirements_compiled.txt + # cryptography +charset-normalizer==3.3.2 \ + --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ + --hash=sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087 \ + --hash=sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786 \ + --hash=sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8 \ + --hash=sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09 \ + --hash=sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185 \ + --hash=sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574 \ + --hash=sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e \ + --hash=sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519 \ + --hash=sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898 \ + --hash=sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269 \ + --hash=sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3 \ 
+ --hash=sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f \ + --hash=sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6 \ + --hash=sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8 \ + --hash=sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a \ + --hash=sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73 \ + --hash=sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc \ + --hash=sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714 \ + --hash=sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2 \ + --hash=sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc \ + --hash=sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce \ + --hash=sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d \ + --hash=sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e \ + --hash=sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6 \ + --hash=sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269 \ + --hash=sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96 \ + --hash=sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d \ + --hash=sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a \ + --hash=sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4 \ + --hash=sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77 \ + --hash=sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d \ + --hash=sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0 \ + --hash=sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed \ + --hash=sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068 \ + --hash=sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac \ + 
--hash=sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25 \ + --hash=sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8 \ + --hash=sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab \ + --hash=sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26 \ + --hash=sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2 \ + --hash=sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db \ + --hash=sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f \ + --hash=sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5 \ + --hash=sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99 \ + --hash=sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c \ + --hash=sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d \ + --hash=sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811 \ + --hash=sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa \ + --hash=sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a \ + --hash=sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03 \ + --hash=sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b \ + --hash=sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04 \ + --hash=sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c \ + --hash=sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001 \ + --hash=sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458 \ + --hash=sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389 \ + --hash=sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99 \ + --hash=sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985 \ + --hash=sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537 \ + 
--hash=sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238 \ + --hash=sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f \ + --hash=sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d \ + --hash=sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796 \ + --hash=sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a \ + --hash=sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143 \ + --hash=sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8 \ + --hash=sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c \ + --hash=sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5 \ + --hash=sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5 \ + --hash=sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711 \ + --hash=sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4 \ + --hash=sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6 \ + --hash=sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c \ + --hash=sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7 \ + --hash=sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4 \ + --hash=sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b \ + --hash=sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae \ + --hash=sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12 \ + --hash=sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c \ + --hash=sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae \ + --hash=sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8 \ + --hash=sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887 \ + --hash=sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b \ + 
--hash=sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4 \ + --hash=sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f \ + --hash=sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5 \ + --hash=sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33 \ + --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ + --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 + # via + # -c python/requirements_compiled.txt + # requests +click==8.1.7 \ + --hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ + --hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de + # via + # -c python/requirements_compiled.txt + # celery + # click-didyoumean + # click-plugins + # click-repl + # ray + # uvicorn +click-didyoumean==0.3.1 \ + --hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c python/requirements_compiled.txt + # celery +click-plugins==1.1.1.2 \ + --hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c python/requirements_compiled.txt + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via + # -c python/requirements_compiled.txt + # celery +cloudpickle==2.2.0 \ + --hash=sha256:3f4219469c55453cfe4737e564b67c2a149109dabf7f242478948b895f61106f \ + --hash=sha256:7428798d5926d8fcbfd092d18d01a2a03daf8237d8fcdc8095d256b8490796f0 + # via + # -c python/requirements_compiled.txt + # gymnasium +colorful==0.5.5 \ + --hash=sha256:62c187e27c1433db9463ff93b1451898d1e7e23a7e553583fd9daeb6325182e4 \ + 
--hash=sha256:66f8c1264b2a26f7293b96a03bb7a76c4bc8b9634369a0bffdcd12d618056a1d + # via + # -c python/requirements_compiled.txt + # ray +cryptography==44.0.3 \ + --hash=sha256:02f55fb4f8b79c1221b0961488eaae21015b69b210e18c386b69de182ebb1259 \ + --hash=sha256:157f1f3b8d941c2bd8f3ffee0af9b049c9665c39d3da9db2dc338feca5e98a43 \ + --hash=sha256:192ed30fac1728f7587c6f4613c29c584abdc565d7417c13904708db10206645 \ + --hash=sha256:21a83f6f35b9cc656d71b5de8d519f566df01e660ac2578805ab245ffd8523f8 \ + --hash=sha256:25cd194c39fa5a0aa4169125ee27d1172097857b27109a45fadc59653ec06f44 \ + --hash=sha256:3883076d5c4cc56dbef0b898a74eb6992fdac29a7b9013870b34efe4ddb39a0d \ + --hash=sha256:3bb0847e6363c037df8f6ede57d88eaf3410ca2267fb12275370a76f85786a6f \ + --hash=sha256:3be3f649d91cb182c3a6bd336de8b61a0a71965bd13d1a04a0e15b39c3d5809d \ + --hash=sha256:3f07943aa4d7dad689e3bb1638ddc4944cc5e0921e3c227486daae0e31a05e54 \ + --hash=sha256:479d92908277bed6e1a1c69b277734a7771c2b78633c224445b5c60a9f4bc1d9 \ + --hash=sha256:4ffc61e8f3bf5b60346d89cd3d37231019c17a081208dfbbd6e1605ba03fa137 \ + --hash=sha256:5639c2b16764c6f76eedf722dbad9a0914960d3489c0cc38694ddf9464f1bb2f \ + --hash=sha256:58968d331425a6f9eedcee087f77fd3c927c88f55368f43ff7e0a19891f2642c \ + --hash=sha256:5d186f32e52e66994dce4f766884bcb9c68b8da62d61d9d215bfe5fb56d21334 \ + --hash=sha256:5d20cc348cca3a8aa7312f42ab953a56e15323800ca3ab0706b8cd452a3a056c \ + --hash=sha256:6866df152b581f9429020320e5eb9794c8780e90f7ccb021940d7f50ee00ae0b \ + --hash=sha256:7d5fe7195c27c32a64955740b949070f21cba664604291c298518d2e255931d2 \ + --hash=sha256:896530bc9107b226f265effa7ef3f21270f18a2026bc09fed1ebd7b66ddf6375 \ + --hash=sha256:962bc30480a08d133e631e8dfd4783ab71cc9e33d5d7c1e192f0b7c06397bb88 \ + --hash=sha256:978631ec51a6bbc0b7e58f23b68a8ce9e5f09721940933e9c217068388789fe5 \ + --hash=sha256:9b4d4a5dbee05a2c390bf212e78b99434efec37b17a4bff42f50285c5c8c9647 \ + --hash=sha256:ab0b005721cc0039e885ac3503825661bd9810b15d4f374e473f8c89b7d5460c \ + 
--hash=sha256:af653022a0c25ef2e3ffb2c673a50e5a0d02fecc41608f4954176f1933b12359 \ + --hash=sha256:b0cc66c74c797e1db750aaa842ad5b8b78e14805a9b5d1348dc603612d3e3ff5 \ + --hash=sha256:b424563394c369a804ecbee9b06dfb34997f19d00b3518e39f83a5642618397d \ + --hash=sha256:c138abae3a12a94c75c10499f1cbae81294a6f983b3af066390adee73f433028 \ + --hash=sha256:c6cd67722619e4d55fdb42ead64ed8843d64638e9c07f4011163e46bc512cf01 \ + --hash=sha256:c91fc8e8fd78af553f98bc7f2a1d8db977334e4eea302a4bfd75b9461c2d8904 \ + --hash=sha256:cad399780053fb383dc067475135e41c9fe7d901a97dd5d9c5dfb5611afc0d7d \ + --hash=sha256:cb90f60e03d563ca2445099edf605c16ed1d5b15182d21831f58460c48bffb93 \ + --hash=sha256:dad80b45c22e05b259e33ddd458e9e2ba099c86ccf4e88db7bbab4b747b18d06 \ + --hash=sha256:dd3db61b8fe5be220eee484a17233287d0be6932d056cf5738225b9c05ef4fff \ + --hash=sha256:e28d62e59a4dbd1d22e747f57d4f00c459af22181f0b2f787ea83f5a876d7c76 \ + --hash=sha256:e909df4053064a97f1e6565153ff8bb389af12c5c8d29c343308760890560aff \ + --hash=sha256:f3ffef566ac88f75967d7abd852ed5f182da252d23fac11b4766da3957766759 \ + --hash=sha256:fc3c9babc1e1faefd62704bb46a69f359a9819eb0292e40df3fb6e3574715cd4 \ + --hash=sha256:fe19d8bc5536a91a24a8133328880a41831b6c5df54599a8417b62fe015d3053 + # via + # -c python/requirements_compiled.txt + # pyopenssl +cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ + --hash=sha256:230f8a8e99c81a653baa0ed00819990c0ed1f0cf0298214786b5e323461dc61a \ + --hash=sha256:2d16eaa2d086e416ac13467d4ff3184b9a081fe76b761ce51d4a46ec1c4bd28a \ + --hash=sha256:432273fd4b61a284f7d705d08b8291403548fd422bcbd945635cc155bc6a923d \ + --hash=sha256:4c51a1062a3c5a826b0425952d229ffe73b1791656a31de95b318117e67a9576 \ + --hash=sha256:4c8e9fdb1f3ffc3151808f8bb8c871518d2783e1be8b53792b698a840543d60c \ + --hash=sha256:51b1d6cb83d82dfa306c9efaeb4d57f24bad3041ebd8716d61072676abbcf67b \ + --hash=sha256:52185a2cf95d3bac2c3fda95c9c8e06a985b5a00cd2e587d3caace337db33899 \ + 
--hash=sha256:5afb6658faa22f21479ae2c0a07254df31c0aebc36907a64a1f6be4ecc9e96da \ + --hash=sha256:d3dc91ef9c4104652195eea4b282d343ecad653021efe20d1c8dd8dfe8ccfd86 \ + --hash=sha256:d60d1e124592cb82a5f3f45b3e7bee7bda7b72a743029f275e9d6b125f338c60 \ + --hash=sha256:dac0284fecb90b5731f514e569a6fcf6674a730ae95b9490781a713b60a34423 \ + --hash=sha256:e7a25ef1b44ae6276b5105affc2289edb34f1aa6676babd5bcd80907348c4cfa + # via + # -c python/requirements_compiled.txt + # ray +distlib==0.3.7 \ + --hash=sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057 \ + --hash=sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8 + # via + # -c python/requirements_compiled.txt + # virtualenv +dm-tree==0.1.8 \ + --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ + --hash=sha256:09964470f76a5201aff2e8f9b26842976de7889300676f927930f6285e256760 \ + --hash=sha256:0d3172394079a86c3a759179c65f64c48d1a42b89495fcf38976d11cc3bb952c \ + --hash=sha256:0e9620ccf06393eb6b613b5e366469304622d4ea96ae6540b28a33840e6c89cf \ + --hash=sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430 \ + --hash=sha256:1607ce49aa42f010d1e5e616d92ce899d66835d4d8bea49679582435285515de \ + --hash=sha256:181c35521d480d0365f39300542cb6cd7fd2b77351bb43d7acfda15aef63b317 \ + --hash=sha256:1d7c26e431fc93cc7e0cba867eb000db6a05f6f2b25af11ac4e9dada88fc5bca \ + --hash=sha256:1fe962015b2fe1282892b28ebe962faed53c7f98d942da9a4625cbf27baef913 \ + --hash=sha256:250b692fb75f45f02e2f58fbef9ab338904ef334b90557565621fa251df267cf \ + --hash=sha256:2869228d9c619074de501a3c10dc7f07c75422f8fab36ecdcb859b6f1b1ec3ef \ + --hash=sha256:28c52cbf4f8b3dbd0beaedf44f69fa85eec5e9dede612e08035e06ada6ec9426 \ + --hash=sha256:2f7915660f59c09068e428613c480150180df1060561fd0d1470684ae7007bd1 \ + --hash=sha256:343a4a4ebaa127451ff971254a4be4084eb4bdc0b2513c32b46f6f728fd03f9e \ + --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ + 
--hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ + --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ + --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ + --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ + --hash=sha256:75c5d528bb992981c20793b6b453e91560784215dffb8a5440ba999753c14ceb \ + --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ + --hash=sha256:81fce77f22a302d7a5968aebdf4efafef4def7ce96528719a354e6990dcd49c7 \ + --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ + --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ + --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ + --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ + --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ + --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ + --hash=sha256:b095ba4f8ca1ba19350fd53cf1f8f3eb0bd406aa28af64a6dfc86707b32a810a \ + --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ + --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ + --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ + --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ + --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ + --hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ + 
--hash=sha256:d20f2faa3672b52e5013f4077117bfb99c4cfc0b445d3bde1584c34032b57436 \ + --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ + --hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ + --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ + --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ + --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d + # via + # -c python/requirements_compiled.txt + # ray +farama-notifications==0.0.4 \ + --hash=sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18 \ + --hash=sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae + # via + # -c python/requirements_compiled.txt + # gymnasium +fastapi==0.115.12 \ + --hash=sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681 \ + --hash=sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d + # via + # -c python/requirements_compiled.txt + # ray +fastrlock==0.8.2 ; sys_platform != 'darwin' \ + --hash=sha256:067edb0a0805bf61e17a251d5046af59f6e9d2b8ad01222e0ef7a0b7937d5548 \ + --hash=sha256:07ed3c7b3867c05a3d6be4ced200c7767000f3431b9be6da66972822dd86e8be \ + --hash=sha256:08315bde19d0c2e6b06593d5a418be3dc8f9b1ee721afa96867b9853fceb45cf \ + --hash=sha256:11bbbbc526363955aeddb9eec4cee2a0012322b7b2f15b54f44454fcf4fd398a \ + --hash=sha256:17734e2e5af4c07ddb0fb10bd484e062c22de3be6b67940b9cc6ec2f18fa61ba \ + --hash=sha256:1b15430b93d7eb3d56f6ff690d2ebecb79ed0e58248427717eba150a508d1cd7 \ + --hash=sha256:1fed2f4797ad68e9982038423018cf08bec5f4ce9fed63a94a790773ed6a795c \ + --hash=sha256:2074548a335fcf7d19ebb18d9208da9e33b06f745754466a7e001d2b1c58dd19 \ + --hash=sha256:2587cedbb36c7988e707d83f0f1175c1f882f362b5ebbee25d70218ea33d220d \ + 
--hash=sha256:25945f962c7bd808415cfde3da624d4399d4ea71ed8918538375f16bceb79e1c \ + --hash=sha256:27786c62a400e282756ae1b090bcd7cfa35f28270cff65a9e7b27a5327a32561 \ + --hash=sha256:2c1719ddc8218b01e82fb2e82e8451bd65076cb96d7bef4477194bbb4305a968 \ + --hash=sha256:2d5595903444c854b99c42122b87edfe8a37cd698a4eae32f4fd1d2a7b6c115d \ + --hash=sha256:30bdbe4662992348132d03996700e1cf910d141d629179b967b146a22942264e \ + --hash=sha256:31a27a2edf482df72b91fe6c6438314d2c65290aa7becc55589d156c9b91f0da \ + --hash=sha256:320fd55bafee3eb069cfb5d6491f811a912758387ef2193840e2663e80e16f48 \ + --hash=sha256:33145acbad8317584cd64588131c7e1e286beef6280c0009b4544c91fce171d2 \ + --hash=sha256:43a241655e83e4603a152192cf022d5ca348c2f4e56dfb02e5c9c4c1a32f9cdb \ + --hash=sha256:4d63b6596368dab9e0cc66bf047e7182a56f33b34db141816a4f21f5bf958228 \ + --hash=sha256:4fb04442b6d1e2b36c774919c6bcbe3339c61b337261d4bd57e27932589095af \ + --hash=sha256:4fb2e77ff04bc4beb71d63c8e064f052ce5a6ea1e001d528d4d7f4b37d736f2e \ + --hash=sha256:5460c5ee6ced6d61ec8cd2324ebbe793a4960c4ffa2131ffff480e3b61c99ec5 \ + --hash=sha256:59344c1d46b7dec97d3f22f1cc930fafe8980b3c5bc9c9765c56738a5f1559e4 \ + --hash=sha256:5dfb78dd600a12f23fc0c3ec58f81336229fdc74501ecf378d1ce5b3f2f313ea \ + --hash=sha256:643e1e65b4f5b284427e61a894d876d10459820e93aa1e724dfb415117be24e0 \ + --hash=sha256:644ec9215cf9c4df8028d8511379a15d9c1af3e16d80e47f1b6fdc6ba118356a \ + --hash=sha256:66f2662c640bb71a1016a031eea6eef9d25c2bcdf7ffd1d1ddc5a58f9a1ced04 \ + --hash=sha256:685e656048b59d8dfde8c601f188ad53a4d719eb97080cafc8696cda6d75865e \ + --hash=sha256:7269bb3fc15587b0c191eecd95831d771a7d80f0c48929e560806b038ff3066c \ + --hash=sha256:73426f5eb2ecc10626c67cf86bd0af9e00d53e80e5c67d5ce8e18376d6abfa09 \ + --hash=sha256:75c07726c8b1a52147fd7987d6baaa318c5dced1416c3f25593e40f56e10755b \ + --hash=sha256:790fc19bccbd39426060047e53629f171a44745613bf360a045e9f9c8c4a2cea \ + --hash=sha256:7a2ccaf88ac0db153e84305d1ef0aa138cea82c6a88309066f6eaa3bc98636cd \ + 
--hash=sha256:87f4e01b042c84e6090dbc4fbe3415ddd69f6bc0130382323f9d3f1b8dd71b46 \ + --hash=sha256:88f079335e9da631efa64486c8207564a7bcd0c00526bb9e842e9d5b7e50a6cc \ + --hash=sha256:8c1c91a68926421f5ccbc82c85f83bd3ba593b121a46a1b9a554b3f0dd67a4bf \ + --hash=sha256:9121a894d74e65557e47e777060a495ab85f4b903e80dd73a3c940ba042920d7 \ + --hash=sha256:94e348c72a1fd1f8191f25ea056448e4f5a87b8fbf005b39d290dcb0581a48cd \ + --hash=sha256:98195866d3a9949915935d40a88e4f1c166e82e378f622c88025f2938624a90a \ + --hash=sha256:99dd6652bd6f730beadf74ef769d38c6bbd8ee6d1c15c8d138ea680b0594387f \ + --hash=sha256:9af691a9861027181d4de07ed74f0aee12a9650ac60d0a07f4320bff84b5d95f \ + --hash=sha256:a3b8b5d2935403f1b4b25ae324560e94b59593a38c0d2e7b6c9872126a9622ed \ + --hash=sha256:a3dcc876050b8f5cbc0ee84ef1e7f0c1dfe7c148f10098828bc4403683c33f10 \ + --hash=sha256:a74f5a92fa6e51c4f3c69b29c4662088b97be12f40652a21109605a175c81824 \ + --hash=sha256:ab91b0c36e95d42e1041a4907e3eefd06c482d53af3c7a77be7e214cc7cd4a63 \ + --hash=sha256:ad1bc61c7f6b0e58106aaab034916b6cb041757f708b07fbcdd9d6e1ac629225 \ + --hash=sha256:adcb9e77aa132cc6c9de2ffe7cf880a20aa8cdba21d367d1da1a412f57bddd5d \ + --hash=sha256:b22ea9bf5f9fad2b0077e944a7813f91593a4f61adf8faf734a70aed3f2b3a40 \ + --hash=sha256:b2a1c354f13f22b737621d914f3b4a8434ae69d3027a775e94b3e671756112f9 \ + --hash=sha256:b32fdf874868326351a75b1e4c02f97e802147119ae44c52d3d9da193ec34f5b \ + --hash=sha256:b3853ed4ce522598dc886160a7bab432a093051af85891fa2f5577c1dcac8ed6 \ + --hash=sha256:b443e73a4dfc7b6e0800ea4c13567b9694358e86f53bb2612a51c9e727cac67b \ + --hash=sha256:b4c9083ea89ab236b06e9ef2263971db3b4b507195fc7d5eecab95828dcae325 \ + --hash=sha256:b8ca0fe21458457077e4cb2d81e1ebdb146a00b3e9e2db6180a773f7ea905032 \ + --hash=sha256:c393af77c659a38bffbca215c0bcc8629ba4299568308dd7e4ff65d62cabed39 \ + --hash=sha256:c6bffa978793bea5e1b00e677062e53a62255439339591b70e209fa1552d5ee0 \ + --hash=sha256:ccf39ad5702e33e4d335b48ef9d56e21619b529b7f7471b5211419f380329b62 \ + 
--hash=sha256:cf81e0278b645004388873e0a1f9e3bc4c9ab8c18e377b14ed1a544be4b18c9a \ + --hash=sha256:d34546ad2e4a480b94b6797bcc5a322b3c705c4c74c3e4e545c4a3841c1b2d59 \ + --hash=sha256:d47713ffe6d4a627fbf078be9836a95ac106b4a0543e3841572c91e292a5d885 \ + --hash=sha256:d918dfe473291e8bfd8e13223ea5cb9b317bd9f50c280923776c377f7c64b428 \ + --hash=sha256:dbdce852e6bb66e1b8c36679d482971d69d93acf1785657522e51b7de30c3356 \ + --hash=sha256:dcc1bf0ac8a194313cf6e645e300a8a379674ceed8e0b1e910a2de3e3c28989e \ + --hash=sha256:dd961a32a7182c3891cdebca417fda67496d5d5de6ae636962254d22723bdf52 \ + --hash=sha256:ddf5d247f686aec853ddcc9a1234bfcc6f57b0a0670d2ad82fc25d8ae7e6a15f \ + --hash=sha256:e27c3cd27fbd25e5223c5c992b300cd4ee8f0a75c6f222ce65838138d853712c \ + --hash=sha256:e380ec4e6d8b26e389713995a43cb7fe56baea2d25fe073d4998c4821a026211 \ + --hash=sha256:e4bbde174a0aff5f6eeba75cf8c4c5d2a316316bc21f03a0bddca0fc3659a6f3 \ + --hash=sha256:e8b49b5743ede51e0bcf6805741f39f5e0e0fd6a172ba460cb39e3097ba803bb \ + --hash=sha256:e9904b5b37c3e5bb4a245c56bc4b7e497da57ffb8528f4fc39af9dcb168ee2e1 \ + --hash=sha256:ea96503b918fceaf40443182742b8964d47b65c5ebdea532893cb9479620000c \ + --hash=sha256:eb31fe390f03f7ae886dcc374f1099ec88526631a4cb891d399b68181f154ff0 \ + --hash=sha256:ebb32d776b61acd49f859a1d16b9e3d84e7b46d0d92aebd58acd54dc38e96664 \ + --hash=sha256:fb5363cf0fddd9b50525ddbf64a1e1b28ec4c6dfb28670a940cb1cf988a6786b \ + --hash=sha256:ff75c90663d6e8996610d435e71487daa853871ad1770dd83dc0f2fc4997241e + # via + # -c python/requirements_compiled.txt + # cupy-cuda12x +filelock==3.17.0 \ + --hash=sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338 \ + --hash=sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e + # via + # -c python/requirements_compiled.txt + # ray + # virtualenv +frozenlist==1.4.1 \ + --hash=sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7 \ + --hash=sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98 \ + 
--hash=sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad \ + --hash=sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5 \ + --hash=sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae \ + --hash=sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e \ + --hash=sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a \ + --hash=sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701 \ + --hash=sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d \ + --hash=sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6 \ + --hash=sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6 \ + --hash=sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106 \ + --hash=sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75 \ + --hash=sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868 \ + --hash=sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a \ + --hash=sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0 \ + --hash=sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1 \ + --hash=sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826 \ + --hash=sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec \ + --hash=sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6 \ + --hash=sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950 \ + --hash=sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19 \ + --hash=sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0 \ + --hash=sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8 \ + --hash=sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a \ + --hash=sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09 \ + 
--hash=sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86 \ + --hash=sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c \ + --hash=sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5 \ + --hash=sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b \ + --hash=sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b \ + --hash=sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d \ + --hash=sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0 \ + --hash=sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea \ + --hash=sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776 \ + --hash=sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a \ + --hash=sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897 \ + --hash=sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7 \ + --hash=sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09 \ + --hash=sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9 \ + --hash=sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe \ + --hash=sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd \ + --hash=sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742 \ + --hash=sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09 \ + --hash=sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0 \ + --hash=sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932 \ + --hash=sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1 \ + --hash=sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a \ + --hash=sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49 \ + --hash=sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d \ + 
--hash=sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7 \ + --hash=sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480 \ + --hash=sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89 \ + --hash=sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e \ + --hash=sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b \ + --hash=sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82 \ + --hash=sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb \ + --hash=sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068 \ + --hash=sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8 \ + --hash=sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b \ + --hash=sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb \ + --hash=sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2 \ + --hash=sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11 \ + --hash=sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b \ + --hash=sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc \ + --hash=sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0 \ + --hash=sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497 \ + --hash=sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17 \ + --hash=sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0 \ + --hash=sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2 \ + --hash=sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439 \ + --hash=sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5 \ + --hash=sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac \ + --hash=sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825 \ + 
--hash=sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887 \ + --hash=sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced \ + --hash=sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74 + # via + # -c python/requirements_compiled.txt + # aiohttp + # aiosignal +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 + # via + # -c python/requirements_compiled.txt + # ray +google-api-core==2.24.2 \ + --hash=sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9 \ + --hash=sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696 + # via + # -c python/requirements_compiled.txt + # opencensus +google-auth==2.23.4 \ + --hash=sha256:79905d6b1652187def79d491d6e23d0cbb3a21d3c7ba0dbaa9c8a01906b13ff3 \ + --hash=sha256:d4bbc92fe4b8bfd2f3e8d88e5ba7085935da208ee38a134fc280e7ce682a05f2 + # via + # -c python/requirements_compiled.txt + # google-api-core +googleapis-common-protos==1.61.0 \ + --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ + --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b + # via + # -c python/requirements_compiled.txt + # google-api-core +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + --hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + 
--hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + --hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + 
--hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e + # via + # -c python/requirements_compiled.txt + # ray +gymnasium==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a + # via + # -c python/requirements_compiled.txt + # ray +h11==0.16.0 \ + 
--hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \ + --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86 + # via + # -c python/requirements_compiled.txt + # uvicorn +httptools==0.6.4 \ + --hash=sha256:0614154d5454c21b6410fdf5262b4a3ddb0f53f1e1721cfd59d55f32138c578a \ + --hash=sha256:0e563e54979e97b6d13f1bbc05a96109923e76b901f786a5eae36e99c01237bd \ + --hash=sha256:16e603a3bff50db08cd578d54f07032ca1631450ceb972c2f834c2b860c28ea2 \ + --hash=sha256:288cd628406cc53f9a541cfaf06041b4c71d751856bab45e3702191f931ccd17 \ + --hash=sha256:28908df1b9bb8187393d5b5db91435ccc9c8e891657f9cbb42a2541b44c82fc8 \ + --hash=sha256:322d20ea9cdd1fa98bd6a74b77e2ec5b818abdc3d36695ab402a0de8ef2865a3 \ + --hash=sha256:342dd6946aa6bda4b8f18c734576106b8a31f2fe31492881a9a160ec84ff4bd5 \ + --hash=sha256:345c288418f0944a6fe67be8e6afa9262b18c7626c3ef3c28adc5eabc06a68da \ + --hash=sha256:3c73ce323711a6ffb0d247dcd5a550b8babf0f757e86a52558fe5b86d6fefcc0 \ + --hash=sha256:40a5ec98d3f49904b9fe36827dcf1aadfef3b89e2bd05b0e35e94f97c2b14721 \ + --hash=sha256:40b0f7fe4fd38e6a507bdb751db0379df1e99120c65fbdc8ee6c1d044897a636 \ + --hash=sha256:40dc6a8e399e15ea525305a2ddba998b0af5caa2566bcd79dcbe8948181eeaff \ + --hash=sha256:4b36913ba52008249223042dca46e69967985fb4051951f94357ea681e1f5dc0 \ + --hash=sha256:4d87b29bd4486c0093fc64dea80231f7c7f7eb4dc70ae394d70a495ab8436071 \ + --hash=sha256:4e93eee4add6493b59a5c514da98c939b244fce4a0d8879cd3f466562f4b7d5c \ + --hash=sha256:59e724f8b332319e2875efd360e61ac07f33b492889284a3e05e6d13746876f4 \ + --hash=sha256:69422b7f458c5af875922cdb5bd586cc1f1033295aa9ff63ee196a87519ac8e1 \ + --hash=sha256:703c346571fa50d2e9856a37d7cd9435a25e7fd15e236c397bf224afaa355fe9 \ + --hash=sha256:85071a1e8c2d051b507161f6c3e26155b5c790e4e28d7f236422dbacc2a9cc44 \ + --hash=sha256:856f4bc0478ae143bad54a4242fccb1f3f86a6e1be5548fecfd4102061b3a083 \ + --hash=sha256:85797e37e8eeaa5439d33e556662cc370e474445d5fab24dcadc65a8ffb04003 \ + 
--hash=sha256:90d96a385fa941283ebd231464045187a31ad932ebfa541be8edf5b3c2328959 \ + --hash=sha256:94978a49b8f4569ad607cd4946b759d90b285e39c0d4640c6b36ca7a3ddf2efc \ + --hash=sha256:aafe0f1918ed07b67c1e838f950b1c1fabc683030477e60b335649b8020e1076 \ + --hash=sha256:ab9ba8dcf59de5181f6be44a77458e45a578fc99c31510b8c65b7d5acc3cf490 \ + --hash=sha256:ade273d7e767d5fae13fa637f4d53b6e961fb7fd93c7797562663f0171c26660 \ + --hash=sha256:b799de31416ecc589ad79dd85a0b2657a8fe39327944998dea368c1d4c9e55e6 \ + --hash=sha256:c26f313951f6e26147833fc923f78f95604bbec812a43e5ee37f26dc9e5a686c \ + --hash=sha256:ca80b7485c76f768a3bc83ea58373f8db7b015551117375e4918e2aa77ea9b50 \ + --hash=sha256:d1ffd262a73d7c28424252381a5b854c19d9de5f56f075445d33919a637e3547 \ + --hash=sha256:d3f0d369e7ffbe59c4b6116a44d6a8eb4783aae027f2c0b366cf0aa964185dba \ + --hash=sha256:d54efd20338ac52ba31e7da78e4a72570cf729fac82bc31ff9199bedf1dc7440 \ + --hash=sha256:dacdd3d10ea1b4ca9df97a0a303cbacafc04b5cd375fa98732678151643d4988 \ + --hash=sha256:db353d22843cf1028f43c3651581e4bb49374d85692a85f95f7b9a130e1b2cab \ + --hash=sha256:db78cb9ca56b59b016e64b6031eda5653be0589dba2b1b43453f6e8b405a0970 \ + --hash=sha256:deee0e3343f98ee8047e9f4c5bc7cedbf69f5734454a94c38ee829fb2d5fa3c1 \ + --hash=sha256:df017d6c780287d5c80601dafa31f17bddb170232d85c066604d8558683711a2 \ + --hash=sha256:df959752a0c2748a65ab5387d08287abf6779ae9165916fe053e68ae1fbdc47f \ + --hash=sha256:ec4f178901fa1834d4a060320d2f3abc5c9e39766953d038f1458cb885f47e81 \ + --hash=sha256:f47f8ed67cc0ff862b84a1189831d1d33c963fb3ce1ee0c65d3b0cbe7b711069 \ + --hash=sha256:f8787367fbdfccae38e35abf7641dafc5310310a5987b689f4c32cc8cc3ee975 \ + --hash=sha256:f9eb89ecf8b290f2e293325c646a211ff1c2493222798bb80a530c5e7502494f \ + --hash=sha256:fc411e1c0a7dcd2f902c7c48cf079947a7e65b5485dea9decb82b9105ca71a43 + # via uvicorn +idna==3.7 \ + --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ + 
--hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 + # via + # -c python/requirements_compiled.txt + # anyio + # requests + # yarl +importlib-metadata==6.11.0 \ + --hash=sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443 \ + --hash=sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b + # via + # -c python/requirements_compiled.txt + # opentelemetry-api +jinja2==3.1.6 ; sys_platform != 'win32' \ + --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \ + --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 + # via + # -c python/requirements_compiled.txt + # memray +jsonschema==4.23.0 \ + --hash=sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4 \ + --hash=sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566 + # via + # -c python/requirements_compiled.txt + # ray +jsonschema-specifications==2024.10.1 \ + --hash=sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272 \ + --hash=sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf + # via + # -c python/requirements_compiled.txt + # jsonschema +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c python/requirements_compiled.txt + # celery +lz4==4.3.3 \ + --hash=sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e \ + --hash=sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2 \ + --hash=sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0 \ + --hash=sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563 \ + --hash=sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f \ + --hash=sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa \ + 
--hash=sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d \ + --hash=sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61 \ + --hash=sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6 \ + --hash=sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2 \ + --hash=sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1 \ + --hash=sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809 \ + --hash=sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394 \ + --hash=sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2 \ + --hash=sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775 \ + --hash=sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f \ + --hash=sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba \ + --hash=sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc \ + --hash=sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd \ + --hash=sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c \ + --hash=sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24 \ + --hash=sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071 \ + --hash=sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201 \ + --hash=sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf \ + --hash=sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6 \ + --hash=sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21 \ + --hash=sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d \ + --hash=sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e \ + --hash=sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807 \ + --hash=sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7 \ + 
--hash=sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205 \ + --hash=sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604 \ + --hash=sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d \ + --hash=sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05 \ + --hash=sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0 \ + --hash=sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7 + # via + # -c python/requirements_compiled.txt + # ray +markdown-it-py==2.2.0 ; sys_platform != 'win32' \ + --hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ + --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 + # via + # -c python/requirements_compiled.txt + # rich +markupsafe==2.1.3 ; sys_platform != 'win32' \ + --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ + --hash=sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686 \ + --hash=sha256:1577735524cdad32f9f694208aa75e422adba74f1baee7551620e43a3141f559 \ + --hash=sha256:1b40069d487e7edb2676d3fbdb2b0829ffa2cd63a2ec26c4938b2d34391b4ecc \ + --hash=sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb \ + --hash=sha256:2c1b19b3aaacc6e57b7e25710ff571c24d6c3613a45e905b1fde04d691b98ee0 \ + --hash=sha256:2ef12179d3a291be237280175b542c07a36e7f60718296278d8593d21ca937d4 \ + --hash=sha256:3c0fae6c3be832a0a0473ac912810b2877c8cb9d76ca48de1ed31e1c68386575 \ + --hash=sha256:3fd4abcb888d15a94f32b75d8fd18ee162ca0c064f35b11134be77050296d6ba \ + --hash=sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd \ + --hash=sha256:65c1a9bcdadc6c28eecee2c119465aebff8f7a584dd719facdd9e825ec61ab52 \ + --hash=sha256:68e78619a61ecf91e76aa3e6e8e33fc4894a2bebe93410754bd28fce0a8a4f9f \ + --hash=sha256:6b2b56950d93e41f33b4223ead100ea0fe11f8e6ee5f641eb753ce4b77a7042b \ + 
--hash=sha256:8023faf4e01efadfa183e863fefde0046de576c6f14659e8782065bcece22198 \ + --hash=sha256:8afafd99945ead6e075b973fefa56379c5b5c53fd8937dad92c662da5d8fd5ee \ + --hash=sha256:8c41976a29d078bb235fea9b2ecd3da465df42a562910f9022f1a03107bd02be \ + --hash=sha256:9dcdfd0eaf283af041973bff14a2e143b8bd64e069f4c383416ecd79a81aab58 \ + --hash=sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823 \ + --hash=sha256:ad9e82fb8f09ade1c3e1b996a6337afac2b8b9e365f926f5a61aacc71adc5b3c \ + --hash=sha256:b076b6226fb84157e3f7c971a47ff3a679d837cf338547532ab866c57930dbee \ + --hash=sha256:bfce63a9e7834b12b87c64d6b155fdd9b3b96191b6bd334bf37db7ff1fe457f2 \ + --hash=sha256:cd0f502fe016460680cd20aaa5a76d241d6f35a1c3350c474bac1273803893fa \ + --hash=sha256:e09031c87a1e51556fdcb46e5bd4f59dfb743061cf93c4d6831bf894f125eb57 \ + --hash=sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc \ + --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 + # via + # -c python/requirements_compiled.txt + # jinja2 +mdurl==0.1.2 ; sys_platform != 'win32' \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via + # -c python/requirements_compiled.txt + # markdown-it-py +memray==1.10.0 ; sys_platform != 'win32' \ + --hash=sha256:0a21745fb516b7a6efcd40aa7487c59e9313fcfc782d0193fcfcf00b48426874 \ + --hash=sha256:22f2a47871c172a0539bd72737bb6b294fc10c510464066b825d90fcd3bb4916 \ + --hash=sha256:23e8c402625cfb32d0e9edb5ec0945f3e5e54bc6b0c5699f6284302082b80bd4 \ + --hash=sha256:2ce59ef485db3634de98b3a026d2450fc0a875e3a58a9ea85f7a89098841defe \ + --hash=sha256:322ed0b69014a0969b777768d461a785203f81f9864386b666b5b26645d9c294 \ + --hash=sha256:38322e052b882790993412f1840517a51818aa55c47037f69915b2007f2c4cee \ + --hash=sha256:38393c86ce6d0a08e6ec0eb1401d49803b7c0c950c2565386751cdc81568cba8 \ + 
--hash=sha256:391aac6c9f744528d3186bc82d708a1acc83525778f804045d7c96f860f8ec98 \ + --hash=sha256:3a8bb7fbd8303c4f0017ba7faef6b88f904cda2931ed667cbf3b98f024b3bc44 \ + --hash=sha256:3c401c57f49c4c5f1fecaee1e746f537cdc6680da05fb963dc143bd08ee109bf \ + --hash=sha256:4eba29179772b4a2e440a065b320b03bc2e73fe2648bdf7936aa3b9a086fab4a \ + --hash=sha256:53a8f66af18b1f3bcf5c9f3c95ae4134dd675903a38f9d0e6341b7bca01b63d0 \ + --hash=sha256:566602b2143e06b3d592901d98c52ce4599e71aa2555146eeb5cec03506f9498 \ + --hash=sha256:663d463e89a64bae4a6b2f8c837d11a3d094834442d536a4165e1d31899a3500 \ + --hash=sha256:68bd8df023c8a32f44c11d997e5c536837e27c0955daf557d3a377edd55a1dd3 \ + --hash=sha256:6937d7ef67d18ccc01c3250cdf3b4ef1445b859ee8756f09e3d11bd3ff0c7d67 \ + --hash=sha256:6b311e91203be71e1a0ce5e4f978137765bcb1045f3bf5646129c83c5b96ab3c \ + --hash=sha256:6fd13ef666c7fced9768d1cfabf71dc6dfa6724935a8dff463495ac2dc5e13a4 \ + --hash=sha256:8196c684f1be8fe423e5cdd2356d4255a2cb482a1f3e89612b70d2a2862cf5bb \ + --hash=sha256:843a688877691746f9d1835cfa8a65139948471bdd78720435808d20bc30a1cc \ + --hash=sha256:85c32d6613d81b075f740e398c4d653e0803cd48e82c33dcd584c109d6782666 \ + --hash=sha256:898acd60f57a10dc5aaf1fd64aa2f821f0420114f3f60c3058083788603f173a \ + --hash=sha256:8d56f37a34125684746c13d24bd7a3fb17549b0bb355eb50969eb11e05e3ba62 \ + --hash=sha256:92c372cb262eddd23049f945ca9527f0e4cc7c40a070aade1802d066f680885b \ + --hash=sha256:95e563d9c976e429ad597ad2720d95cebbe8bac891a3082465439143e2740772 \ + --hash=sha256:9627184c926252c8f719c301f1fefe970f0d033c643a6448b93fed2889d1ea94 \ + --hash=sha256:a9e985fb7646b0475c303919d19211d2aa54e5a9e2cd2a102472299be5dbebd3 \ + --hash=sha256:b681519357d94f5f0857fbc6029e7c44d3f41436109e955a14fd312d8317bc35 \ + --hash=sha256:b75040f28e8678d0e9c4907d55c95cf26db8ef5adc9941a228f1b280a9efd9c0 \ + --hash=sha256:c3a14960838d89a91747885897d34134afb65883cc3b0ed7ff30fe1af00f9fe6 \ + --hash=sha256:c7aeb47174c42e99740a8e2b3b6fe0932c95d987258d48a746974ead19176c26 \ + 
--hash=sha256:ce22a887a585ef5020896de89ffc793e531b65ccc81fbafcc7886010c2c562b3 \ + --hash=sha256:cf6d683c4f8d25c6ad06ae18715f218983c5eb86803953615e902d632fdf6ec1 \ + --hash=sha256:e356af93e3b031c83957e9ac1a653f5aaba5df1e357dd17142f5ed19bb3dc660 \ + --hash=sha256:f16c5c8730b616613dc8bafe32649ca6bd7252606251eb00148582011758d0b5 + # via + # -c python/requirements_compiled.txt + # ray +msgpack==1.0.7 \ + --hash=sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862 \ + --hash=sha256:0bfdd914e55e0d2c9e1526de210f6fe8ffe9705f2b1dfcc4aecc92a4cb4b533d \ + --hash=sha256:1dc93e8e4653bdb5910aed79f11e165c85732067614f180f70534f056da97db3 \ + --hash=sha256:1e2d69948e4132813b8d1131f29f9101bc2c915f26089a6d632001a5c1349672 \ + --hash=sha256:235a31ec7db685f5c82233bddf9858748b89b8119bf4538d514536c485c15fe0 \ + --hash=sha256:27dcd6f46a21c18fa5e5deed92a43d4554e3df8d8ca5a47bf0615d6a5f39dbc9 \ + --hash=sha256:28efb066cde83c479dfe5a48141a53bc7e5f13f785b92ddde336c716663039ee \ + --hash=sha256:3476fae43db72bd11f29a5147ae2f3cb22e2f1a91d575ef130d2bf49afd21c46 \ + --hash=sha256:36e17c4592231a7dbd2ed09027823ab295d2791b3b1efb2aee874b10548b7524 \ + --hash=sha256:384d779f0d6f1b110eae74cb0659d9aa6ff35aaf547b3955abf2ab4c901c4819 \ + --hash=sha256:38949d30b11ae5f95c3c91917ee7a6b239f5ec276f271f28638dec9156f82cfc \ + --hash=sha256:3967e4ad1aa9da62fd53e346ed17d7b2e922cba5ab93bdd46febcac39be636fc \ + --hash=sha256:3e7bf4442b310ff154b7bb9d81eb2c016b7d597e364f97d72b1acc3817a0fdc1 \ + --hash=sha256:3f0c8c6dfa6605ab8ff0611995ee30d4f9fcff89966cf562733b4008a3d60d82 \ + --hash=sha256:484ae3240666ad34cfa31eea7b8c6cd2f1fdaae21d73ce2974211df099a95d81 \ + --hash=sha256:4a7b4f35de6a304b5533c238bee86b670b75b03d31b7797929caa7a624b5dda6 \ + --hash=sha256:4cb14ce54d9b857be9591ac364cb08dc2d6a5c4318c1182cb1d02274029d590d \ + --hash=sha256:4e71bc4416de195d6e9b4ee93ad3f2f6b2ce11d042b4d7a7ee00bbe0358bd0c2 \ + --hash=sha256:52700dc63a4676669b341ba33520f4d6e43d3ca58d422e22ba66d1736b0a6e4c \ + 
--hash=sha256:572efc93db7a4d27e404501975ca6d2d9775705c2d922390d878fcf768d92c87 \ + --hash=sha256:576eb384292b139821c41995523654ad82d1916da6a60cff129c715a6223ea84 \ + --hash=sha256:5b0bf0effb196ed76b7ad883848143427a73c355ae8e569fa538365064188b8e \ + --hash=sha256:5b6ccc0c85916998d788b295765ea0e9cb9aac7e4a8ed71d12e7d8ac31c23c95 \ + --hash=sha256:5ed82f5a7af3697b1c4786053736f24a0efd0a1b8a130d4c7bfee4b9ded0f08f \ + --hash=sha256:6d4c80667de2e36970ebf74f42d1088cc9ee7ef5f4e8c35eee1b40eafd33ca5b \ + --hash=sha256:730076207cb816138cf1af7f7237b208340a2c5e749707457d70705715c93b93 \ + --hash=sha256:7687e22a31e976a0e7fc99c2f4d11ca45eff652a81eb8c8085e9609298916dcf \ + --hash=sha256:822ea70dc4018c7e6223f13affd1c5c30c0f5c12ac1f96cd8e9949acddb48a61 \ + --hash=sha256:84b0daf226913133f899ea9b30618722d45feffa67e4fe867b0b5ae83a34060c \ + --hash=sha256:85765fdf4b27eb5086f05ac0491090fc76f4f2b28e09d9350c31aac25a5aaff8 \ + --hash=sha256:8dd178c4c80706546702c59529ffc005681bd6dc2ea234c450661b205445a34d \ + --hash=sha256:8f5b234f567cf76ee489502ceb7165c2a5cecec081db2b37e35332b537f8157c \ + --hash=sha256:98bbd754a422a0b123c66a4c341de0474cad4a5c10c164ceed6ea090f3563db4 \ + --hash=sha256:993584fc821c58d5993521bfdcd31a4adf025c7d745bbd4d12ccfecf695af5ba \ + --hash=sha256:a40821a89dc373d6427e2b44b572efc36a2778d3f543299e2f24eb1a5de65415 \ + --hash=sha256:b291f0ee7961a597cbbcc77709374087fa2a9afe7bdb6a40dbbd9b127e79afee \ + --hash=sha256:b573a43ef7c368ba4ea06050a957c2a7550f729c31f11dd616d2ac4aba99888d \ + --hash=sha256:b610ff0f24e9f11c9ae653c67ff8cc03c075131401b3e5ef4b82570d1728f8a9 \ + --hash=sha256:bdf38ba2d393c7911ae989c3bbba510ebbcdf4ecbdbfec36272abe350c454075 \ + --hash=sha256:bfef2bb6ef068827bbd021017a107194956918ab43ce4d6dc945ffa13efbc25f \ + --hash=sha256:cab3db8bab4b7e635c1c97270d7a4b2a90c070b33cbc00c99ef3f9be03d3e1f7 \ + --hash=sha256:cb70766519500281815dfd7a87d3a178acf7ce95390544b8c90587d76b227681 \ + --hash=sha256:cca1b62fe70d761a282496b96a5e51c44c213e410a964bdffe0928e611368329 \ + 
--hash=sha256:ccf9a39706b604d884d2cb1e27fe973bc55f2890c52f38df742bc1d79ab9f5e1 \ + --hash=sha256:dc43f1ec66eb8440567186ae2f8c447d91e0372d793dfe8c222aec857b81a8cf \ + --hash=sha256:dd632777ff3beaaf629f1ab4396caf7ba0bdd075d948a69460d13d44357aca4c \ + --hash=sha256:e45ae4927759289c30ccba8d9fdce62bb414977ba158286b5ddaf8df2cddb5c5 \ + --hash=sha256:e50ebce52f41370707f1e21a59514e3375e3edd6e1832f5e5235237db933c98b \ + --hash=sha256:ebbbba226f0a108a7366bf4b59bf0f30a12fd5e75100c630267d94d7f0ad20e5 \ + --hash=sha256:ec79ff6159dffcc30853b2ad612ed572af86c92b5168aa3fc01a67b0fa40665e \ + --hash=sha256:f0936e08e0003f66bfd97e74ee530427707297b0d0361247e9b4f59ab78ddc8b \ + --hash=sha256:f26a07a6e877c76a88e3cecac8531908d980d3d5067ff69213653649ec0f60ad \ + --hash=sha256:f64e376cd20d3f030190e8c32e1c64582eba56ac6dc7d5b0b49a9d44021b52fd \ + --hash=sha256:f6ffbc252eb0d229aeb2f9ad051200668fc3a9aaa8994e49f0cb2ffe2b7867e7 \ + --hash=sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002 \ + --hash=sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc + # via + # -c python/requirements_compiled.txt + # ray +multidict==6.0.5 \ + --hash=sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556 \ + --hash=sha256:0275e35209c27a3f7951e1ce7aaf93ce0d163b28948444bec61dd7badc6d3f8c \ + --hash=sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29 \ + --hash=sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b \ + --hash=sha256:09a892e4a9fb47331da06948690ae38eaa2426de97b4ccbfafbdcbe5c8f37ff8 \ + --hash=sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7 \ + --hash=sha256:107c0cdefe028703fb5dafe640a409cb146d44a6ae201e55b35a4af8e95457dd \ + --hash=sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40 \ + --hash=sha256:14c2976aa9038c2629efa2c148022ed5eb4cb939e15ec7aace7ca932f48f9ba6 \ + --hash=sha256:19fe01cea168585ba0f678cad6f58133db2aa14eccaf22f88e4a6dccadfad8b3 \ + 
--hash=sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c \ + --hash=sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9 \ + --hash=sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5 \ + --hash=sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae \ + --hash=sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442 \ + --hash=sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9 \ + --hash=sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc \ + --hash=sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c \ + --hash=sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea \ + --hash=sha256:2ffc42c922dbfddb4a4c3b438eb056828719f07608af27d163191cb3e3aa6cc5 \ + --hash=sha256:37b15024f864916b4951adb95d3a80c9431299080341ab9544ed148091b53f50 \ + --hash=sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182 \ + --hash=sha256:3d25f19500588cbc47dc19081d78131c32637c25804df8414463ec908631e453 \ + --hash=sha256:403c0911cd5d5791605808b942c88a8155c2592e05332d2bf78f18697a5fa15e \ + --hash=sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600 \ + --hash=sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733 \ + --hash=sha256:435a0984199d81ca178b9ae2c26ec3d49692d20ee29bc4c11a2a8d4514c67eda \ + --hash=sha256:4a6a4f196f08c58c59e0b8ef8ec441d12aee4125a7d4f4fef000ccb22f8d7241 \ + --hash=sha256:4cc0ef8b962ac7a5e62b9e826bd0cd5040e7d401bc45a6835910ed699037a461 \ + --hash=sha256:51d035609b86722963404f711db441cf7134f1889107fb171a970c9701f92e1e \ + --hash=sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e \ + --hash=sha256:55205d03e8a598cfc688c71ca8ea5f66447164efff8869517f175ea632c7cb7b \ + --hash=sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e \ + --hash=sha256:5cb241881eefd96b46f89b1a056187ea8e9ba14ab88ba632e68d7a2ecb7aadf7 \ + 
--hash=sha256:60d698e8179a42ec85172d12f50b1668254628425a6bd611aba022257cac1386 \ + --hash=sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd \ + --hash=sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9 \ + --hash=sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf \ + --hash=sha256:69db76c09796b313331bb7048229e3bee7928eb62bab5e071e9f7fcc4879caee \ + --hash=sha256:6bf7a982604375a8d49b6cc1b781c1747f243d91b81035a9b43a2126c04766f5 \ + --hash=sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a \ + --hash=sha256:76c0de87358b192de7ea9649beb392f107dcad9ad27276324c24c91774ca5271 \ + --hash=sha256:76f067f5121dcecf0d63a67f29080b26c43c71a98b10c701b0677e4a065fbd54 \ + --hash=sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4 \ + --hash=sha256:79660376075cfd4b2c80f295528aa6beb2058fd289f4c9252f986751a4cd0496 \ + --hash=sha256:79a6d2ba910adb2cbafc95dad936f8b9386e77c84c35bc0add315b856d7c3abb \ + --hash=sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319 \ + --hash=sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3 \ + --hash=sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f \ + --hash=sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527 \ + --hash=sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed \ + --hash=sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604 \ + --hash=sha256:92d16a3e275e38293623ebf639c471d3e03bb20b8ebb845237e0d3664914caef \ + --hash=sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8 \ + --hash=sha256:9fe7b0653ba3d9d65cbe7698cca585bf0f8c83dbbcc710db9c90f478e175f2d5 \ + --hash=sha256:a3145cb08d8625b2d3fee1b2d596a8766352979c9bffe5d7833e0503d0f0b5e5 \ + --hash=sha256:aeaf541ddbad8311a87dd695ed9642401131ea39ad7bc8cf3ef3967fd093b626 \ + --hash=sha256:b55358304d7a73d7bdf5de62494aaf70bd33015831ffd98bc498b433dfe5b10c \ + 
--hash=sha256:b82cc8ace10ab5bd93235dfaab2021c70637005e1ac787031f4d1da63d493c1d \ + --hash=sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c \ + --hash=sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc \ + --hash=sha256:c9bf56195c6bbd293340ea82eafd0071cb3d450c703d2c93afb89f93b8386ccc \ + --hash=sha256:cbebcd5bcaf1eaf302617c114aa67569dd3f090dd0ce8ba9e35e9985b41ac35b \ + --hash=sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38 \ + --hash=sha256:ceb3b7e6a0135e092de86110c5a74e46bda4bd4fbfeeb3a3bcec79c0f861e450 \ + --hash=sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1 \ + --hash=sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f \ + --hash=sha256:d65f25da8e248202bd47445cec78e0025c0fe7582b23ec69c3b27a640dd7a8e3 \ + --hash=sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755 \ + --hash=sha256:d84a5c3a5f7ce6db1f999fb9438f686bc2e09d38143f2d93d8406ed2dd6b9226 \ + --hash=sha256:d946b0a9eb8aaa590df1fe082cee553ceab173e6cb5b03239716338629c50c7a \ + --hash=sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046 \ + --hash=sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf \ + --hash=sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479 \ + --hash=sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e \ + --hash=sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1 \ + --hash=sha256:e4428b29611e989719874670fd152b6625500ad6c686d464e99f5aaeeaca175a \ + --hash=sha256:e4972624066095e52b569e02b5ca97dbd7a7ddd4294bf4e7247d52635630dd83 \ + --hash=sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929 \ + --hash=sha256:e8e94e6912639a02ce173341ff62cc1201232ab86b8a8fcc05572741a5dc7d93 \ + --hash=sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a \ + --hash=sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c \ + 
--hash=sha256:eca2e9d0cc5a889850e9bbd68e98314ada174ff6ccd1129500103df7a94a7a44 \ + --hash=sha256:edd08e6f2f1a390bf137080507e44ccc086353c8e98c657e666c017718561b89 \ + --hash=sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba \ + --hash=sha256:f2a1dee728b52b33eebff5072817176c172050d44d67befd681609b4746e1c2e \ + --hash=sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da \ + --hash=sha256:fb616be3538599e797a2017cccca78e354c767165e8858ab5116813146041a24 \ + --hash=sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423 \ + --hash=sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef + # via + # -c python/requirements_compiled.txt + # aiohttp + # yarl +numpy==1.26.4 \ + --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ + --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ + --hash=sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20 \ + --hash=sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0 \ + --hash=sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010 \ + --hash=sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a \ + --hash=sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea \ + --hash=sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c \ + --hash=sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71 \ + --hash=sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110 \ + --hash=sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be \ + --hash=sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a \ + --hash=sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a \ + --hash=sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5 \ + --hash=sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed \ + 
--hash=sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd \ + --hash=sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c \ + --hash=sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e \ + --hash=sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0 \ + --hash=sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c \ + --hash=sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a \ + --hash=sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b \ + --hash=sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0 \ + --hash=sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6 \ + --hash=sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2 \ + --hash=sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a \ + --hash=sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30 \ + --hash=sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218 \ + --hash=sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5 \ + --hash=sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07 \ + --hash=sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2 \ + --hash=sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4 \ + --hash=sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764 \ + --hash=sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef \ + --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ + --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f + # via + # -c python/requirements_compiled.txt + # cupy-cuda12x + # gymnasium + # pandas + # ray + # scipy + # tensorboardx +opencensus==0.11.4 \ + --hash=sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 \ + 
--hash=sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2 + # via + # -c python/requirements_compiled.txt + # ray +opencensus-context==0.1.3 \ + --hash=sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039 \ + --hash=sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c + # via + # -c python/requirements_compiled.txt + # opencensus +opentelemetry-api==1.34.1 \ + --hash=sha256:64f0bd06d42824843731d05beea88d4d4b6ae59f9fe347ff7dfa2cc14233bbb3 \ + --hash=sha256:b7df4cb0830d5a6c29ad0c0691dbae874d8daefa934b8b1d642de48323d32a8c + # via + # -c python/requirements_compiled.txt + # opentelemetry-exporter-prometheus + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-prometheus==0.55b1 \ + --hash=sha256:d13ec0b22bf394113ff1ada5da98133a4b051779b803dae183188e26c4bd9ee0 \ + --hash=sha256:f364fbbff9e5de37a112ff104d1185fb1d7e2046c5ab5911e5afebc7ab3ddf0e + # via + # -c python/requirements_compiled.txt + # ray +opentelemetry-proto==1.27.0 \ + --hash=sha256:33c9345d91dafd8a74fc3d7576c5a38f18b7fdf8d02983ac67485386132aedd6 \ + --hash=sha256:b133873de5581a50063e1e4b29cdcf0c5e253a8c2d8dc1229add20a4c3830ace + # via + # -c python/requirements_compiled.txt + # ray +opentelemetry-sdk==1.34.1 \ + --hash=sha256:308effad4059562f1d92163c61c8141df649da24ce361827812c40abb2a1e96e \ + --hash=sha256:8091db0d763fcd6098d4781bbc80ff0971f94e260739aa6afe6fd379cdf3aa4d + # via + # -c python/requirements_compiled.txt + # opentelemetry-exporter-prometheus + # ray +opentelemetry-semantic-conventions==0.55b1 \ + --hash=sha256:5da81dfdf7d52e3d37f8fe88d5e771e191de924cfff5f550ab0b8f7b2409baed \ + --hash=sha256:ef95b1f009159c28d7a7849f5cbc71c4c34c845bb514d66adfdf1b3fff3598b3 + # via + # -c python/requirements_compiled.txt + # opentelemetry-sdk +ormsgpack==1.7.0 \ + --hash=sha256:0d88307ab45d95416ce4071b1b99326ca31362af01c3d206f15a0551a7a874bd \ + --hash=sha256:22418a4d399027a72fb2e6b873559b1886cf2e63323ca7afc17b222c454413b7 \ 
+ --hash=sha256:2c22c62a6bc93bcb194b7f91864ca0b39455b2cbbfc1538a3da0f9ec3c11d184 \ + --hash=sha256:3a6a97937d2cf21496d7689b90a43df83c5062bbe846aaa39197cc9ad73eaa7b \ + --hash=sha256:462089a419dbde654915ccb0b859c0dbe3c178b0ac580018e82befea6ccd73f4 \ + --hash=sha256:4b353204e99b56c1d33f1cf4767bd1fe1195596181a1cc789f25aa26c0b50f3d \ + --hash=sha256:5ec763096d978d35eedcef0af13991a10741717c2e236b26f4c2047b0740ea7b \ + --hash=sha256:5fefa1ca842dbba258401ea958113fe62c6b70a7a4d46edac440113f68dc431e \ + --hash=sha256:65525438b4a8b3b64ccfcda25e758ea3db392d1c206b5e09ef70efbbafa6dbf9 \ + --hash=sha256:6b4c98839cb7fc2a212037d2258f3a22857155249eb293d45c45cb974cfba834 \ + --hash=sha256:6d114652dadd81802b8a35a49e07a3e9ef2a47aed6123fb5031f2220d1c8e434 \ + --hash=sha256:77bc2ea387d85cfad045b9bcb8040bae43ad32dafe9363360f732cc19d489bbe \ + --hash=sha256:7e6ada21f5c7a20ff7cf9b061c44e3814352f819947a12022ad8cb52a9f2a809 \ + --hash=sha256:8d301e47565fe0e52a60052e730a9bb7669dfbd2a94643b8be925e3928c64c15 \ + --hash=sha256:90aabfd816db60dadab1100d583d061e0238209015bf684f8170c0fca4eb445a \ + --hash=sha256:91ebb7d3609db249cdff629ffef83ec3d025b1384749a297cf3b6a8240cf22ac \ + --hash=sha256:97723786755a7df85fcf6e68d7b5359dacea98d5c26b1d9af219a3cc05df4734 \ + --hash=sha256:9b0945523ccc75aa6907f38f2240d36818618baccb8633923bd7740a5a929e67 \ + --hash=sha256:a0ca6a64d47073f22ecc1dd96b384e44f98796d3f88ee383e92dfbcdf18c2efd \ + --hash=sha256:a5e12b51a590be47ccef67907905653e679fc2f920854b456edc216690ecc09c \ + --hash=sha256:a8fbe7bb50ee8381df030823d9366984fac718447947c2327969405d1d799b95 \ + --hash=sha256:c683071bf4527ffa7b6cfcf28f750d1a82eb77846d106743c09261ab1b79b193 \ + --hash=sha256:ca4d35b694f32112eb33ac0b733cb903dbbc59f019d05ca3d74f6ad2f587b0bf \ + --hash=sha256:e8385181bf195af80fc270e64fd477f1c414ffb05837320382e2ec9ca34be0ec \ + --hash=sha256:e86124cdbc8ed249806347c2fba96843e8941122b161b429139a0c973d270de4 \ + --hash=sha256:f9967a7f3647ad118751abf090f8397fda3e4bca6833340cab95a3f2bec598cd + # via + 
# -c python/requirements_compiled.txt + # ray +packaging==23.0 \ + --hash=sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2 \ + --hash=sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97 + # via + # -c python/requirements_compiled.txt + # kombu + # ray + # tensorboardx +pandas==1.5.3 \ + --hash=sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813 \ + --hash=sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792 \ + --hash=sha256:3749077d86e3a2f0ed51367f30bf5b82e131cc0f14260c4d3e499186fccc4406 \ + --hash=sha256:41179ce559943d83a9b4bbacb736b04c928b095b5f25dd2b7389eda08f46f373 \ + --hash=sha256:478ff646ca42b20376e4ed3fa2e8d7341e8a63105586efe54fa2508ee087f328 \ + --hash=sha256:50869a35cbb0f2e0cd5ec04b191e7b12ed688874bd05dd777c19b28cbea90996 \ + --hash=sha256:565fa34a5434d38e9d250af3c12ff931abaf88050551d9fbcdfafca50d62babf \ + --hash=sha256:5f2b952406a1588ad4cad5b3f55f520e82e902388a6d5a4a91baa8d38d23c7f6 \ + --hash=sha256:5fbcb19d6fceb9e946b3e23258757c7b225ba450990d9ed63ccceeb8cae609f7 \ + --hash=sha256:6973549c01ca91ec96199e940495219c887ea815b2083722821f1d7abfa2b4dc \ + --hash=sha256:74a3fd7e5a7ec052f183273dc7b0acd3a863edf7520f5d3a1765c04ffdb3b0b1 \ + --hash=sha256:7a0a56cef15fd1586726dace5616db75ebcfec9179a3a55e78f72c5639fa2a23 \ + --hash=sha256:7cec0bee9f294e5de5bbfc14d0573f65526071029d036b753ee6507d2a21480a \ + --hash=sha256:87bd9c03da1ac870a6d2c8902a0e1fd4267ca00f13bc494c9e5a9020920e1d51 \ + --hash=sha256:972d8a45395f2a2d26733eb8d0f629b2f90bebe8e8eddbb8829b180c09639572 \ + --hash=sha256:9842b6f4b8479e41968eced654487258ed81df7d1c9b7b870ceea24ed9459b31 \ + --hash=sha256:9f69c4029613de47816b1bb30ff5ac778686688751a5e9c99ad8c7031f6508e5 \ + --hash=sha256:a50d9a4336a9621cab7b8eb3fb11adb82de58f9b91d84c2cd526576b881a0c5a \ + --hash=sha256:bc4c368f42b551bf72fac35c5128963a171b40dce866fb066540eeaf46faa003 \ + --hash=sha256:c39a8da13cede5adcd3be1182883aea1c925476f4e84b2807a46e2775306305d \ + 
--hash=sha256:c3ac844a0fe00bfaeb2c9b51ab1424e5c8744f89860b138434a363b1f620f354 \ + --hash=sha256:c4c00e0b0597c8e4f59e8d461f797e5d70b4d025880516a8261b2817c47759ee \ + --hash=sha256:c74a62747864ed568f5a82a49a23a8d7fe171d0c69038b38cedf0976831296fa \ + --hash=sha256:dd05f7783b3274aa206a1af06f0ceed3f9b412cf665b7247eacd83be41cf7bf0 \ + --hash=sha256:dfd681c5dc216037e0b0a2c821f5ed99ba9f03ebcf119c7dac0e9a7b960b9ec9 \ + --hash=sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae \ + --hash=sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc + # via + # -c python/requirements_compiled.txt + # ray +platformdirs==3.11.0 \ + --hash=sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3 \ + --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e + # via + # -c python/requirements_compiled.txt + # virtualenv +prometheus-client==0.19.0 \ + --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ + --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 + # via + # -c python/requirements_compiled.txt + # opentelemetry-exporter-prometheus + # ray +prompt-toolkit==3.0.41 \ + --hash=sha256:941367d97fc815548822aa26c2a269fdc4eb21e9ec05fc5d447cf09bad5d75f0 \ + --hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 + # via + # -c python/requirements_compiled.txt + # click-repl +propcache==0.3.0 \ + --hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ + --hash=sha256:03acd9ff19021bd0567582ac88f821b66883e158274183b9e5586f678984f8fe \ + --hash=sha256:03c091bb752349402f23ee43bb2bff6bd80ccab7c9df6b88ad4322258d6960fc \ + --hash=sha256:07700939b2cbd67bfb3b76a12e1412405d71019df00ca5697ce75e5ef789d829 \ + --hash=sha256:0c3e893c4464ebd751b44ae76c12c5f5c1e4f6cbd6fbf67e3783cd93ad221863 \ + --hash=sha256:119e244ab40f70a98c91906d4c1f4c5f2e68bd0b14e7ab0a06922038fae8a20f \ + 
--hash=sha256:11ae6a8a01b8a4dc79093b5d3ca2c8a4436f5ee251a9840d7790dccbd96cb649 \ + --hash=sha256:15010f29fbed80e711db272909a074dc79858c6d28e2915704cfc487a8ac89c6 \ + --hash=sha256:19d36bb351ad5554ff20f2ae75f88ce205b0748c38b146c75628577020351e3c \ + --hash=sha256:1c8f7d896a16da9455f882870a507567d4f58c53504dc2d4b1e1d386dfe4588a \ + --hash=sha256:2383a17385d9800b6eb5855c2f05ee550f803878f344f58b6e194de08b96352c \ + --hash=sha256:24c04f8fbf60094c531667b8207acbae54146661657a1b1be6d3ca7773b7a545 \ + --hash=sha256:2578541776769b500bada3f8a4eeaf944530516b6e90c089aa368266ed70c49e \ + --hash=sha256:26a67e5c04e3119594d8cfae517f4b9330c395df07ea65eab16f3d559b7068fe \ + --hash=sha256:2b975528998de037dfbc10144b8aed9b8dd5a99ec547f14d1cb7c5665a43f075 \ + --hash=sha256:2d15bc27163cd4df433e75f546b9ac31c1ba7b0b128bfb1b90df19082466ff57 \ + --hash=sha256:2d913d36bdaf368637b4f88d554fb9cb9d53d6920b9c5563846555938d5450bf \ + --hash=sha256:3302c5287e504d23bb0e64d2a921d1eb4a03fb93a0a0aa3b53de059f5a5d737d \ + --hash=sha256:36ca5e9a21822cc1746023e88f5c0af6fce3af3b85d4520efb1ce4221bed75cc \ + --hash=sha256:3b812b3cb6caacd072276ac0492d249f210006c57726b6484a1e1805b3cfeea0 \ + --hash=sha256:3c6ec957025bf32b15cbc6b67afe233c65b30005e4c55fe5768e4bb518d712f1 \ + --hash=sha256:41de3da5458edd5678b0f6ff66691507f9885f5fe6a0fb99a5d10d10c0fd2d64 \ + --hash=sha256:42924dc0c9d73e49908e35bbdec87adedd651ea24c53c29cac103ede0ea1d340 \ + --hash=sha256:4544699674faf66fb6b4473a1518ae4999c1b614f0b8297b1cef96bac25381db \ + --hash=sha256:46ed02532cb66612d42ae5c3929b5e98ae330ea0f3900bc66ec5f4862069519b \ + --hash=sha256:49ea05212a529c2caffe411e25a59308b07d6e10bf2505d77da72891f9a05641 \ + --hash=sha256:4fa0e7c9c3cf7c276d4f6ab9af8adddc127d04e0fcabede315904d2ff76db626 \ + --hash=sha256:507c5357a8d8b4593b97fb669c50598f4e6cccbbf77e22fa9598aba78292b4d7 \ + --hash=sha256:549722908de62aa0b47a78b90531c022fa6e139f9166be634f667ff45632cc92 \ + --hash=sha256:58e6d2a5a7cb3e5f166fd58e71e9a4ff504be9dc61b88167e75f835da5764d07 \ + 
--hash=sha256:5a16167118677d94bb48bfcd91e420088854eb0737b76ec374b91498fb77a70e \ + --hash=sha256:5d62c4f6706bff5d8a52fd51fec6069bef69e7202ed481486c0bc3874912c787 \ + --hash=sha256:5fa159dcee5dba00c1def3231c249cf261185189205073bde13797e57dd7540a \ + --hash=sha256:6032231d4a5abd67c7f71168fd64a47b6b451fbcb91c8397c2f7610e67683810 \ + --hash=sha256:63f26258a163c34542c24808f03d734b338da66ba91f410a703e505c8485791d \ + --hash=sha256:65a37714b8ad9aba5780325228598a5b16c47ba0f8aeb3dc0514701e4413d7c0 \ + --hash=sha256:67054e47c01b7b349b94ed0840ccae075449503cf1fdd0a1fdd98ab5ddc2667b \ + --hash=sha256:67dda3c7325691c2081510e92c561f465ba61b975f481735aefdfc845d2cd043 \ + --hash=sha256:6985a593417cdbc94c7f9c3403747335e450c1599da1647a5af76539672464d3 \ + --hash=sha256:6a1948df1bb1d56b5e7b0553c0fa04fd0e320997ae99689488201f19fa90d2e7 \ + --hash=sha256:6b5b7fd6ee7b54e01759f2044f936dcf7dea6e7585f35490f7ca0420fe723c0d \ + --hash=sha256:6c929916cbdb540d3407c66f19f73387f43e7c12fa318a66f64ac99da601bcdf \ + --hash=sha256:6f4d7a7c0aff92e8354cceca6fe223973ddf08401047920df0fcb24be2bd5138 \ + --hash=sha256:728af36011bb5d344c4fe4af79cfe186729efb649d2f8b395d1572fb088a996c \ + --hash=sha256:742840d1d0438eb7ea4280f3347598f507a199a35a08294afdcc560c3739989d \ + --hash=sha256:75e872573220d1ee2305b35c9813626e620768248425f58798413e9c39741f46 \ + --hash=sha256:794c3dd744fad478b6232289c866c25406ecdfc47e294618bdf1697e69bd64a6 \ + --hash=sha256:7c0fdbdf6983526e269e5a8d53b7ae3622dd6998468821d660d0daf72779aefa \ + --hash=sha256:7c5f5290799a3f6539cc5e6f474c3e5c5fbeba74a5e1e5be75587746a940d51e \ + --hash=sha256:7c6e7e4f9167fddc438cd653d826f2222222564daed4116a02a184b464d3ef05 \ + --hash=sha256:7cedd25e5f678f7738da38037435b340694ab34d424938041aa630d8bac42663 \ + --hash=sha256:7e2e068a83552ddf7a39a99488bcba05ac13454fb205c847674da0352602082f \ + --hash=sha256:8319293e85feadbbfe2150a5659dbc2ebc4afdeaf7d98936fb9a2f2ba0d4c35c \ + --hash=sha256:8526b0941ec5a40220fc4dfde76aed58808e2b309c03e9fa8e2260083ef7157f \ + 
--hash=sha256:8884ba1a0fe7210b775106b25850f5e5a9dc3c840d1ae9924ee6ea2eb3acbfe7 \ + --hash=sha256:8cb625bcb5add899cb8ba7bf716ec1d3e8f7cdea9b0713fa99eadf73b6d4986f \ + --hash=sha256:8d663fd71491dde7dfdfc899d13a067a94198e90695b4321084c6e450743b8c7 \ + --hash=sha256:8ee1983728964d6070ab443399c476de93d5d741f71e8f6e7880a065f878e0b9 \ + --hash=sha256:997e7b8f173a391987df40f3b52c423e5850be6f6df0dcfb5376365440b56667 \ + --hash=sha256:9be90eebc9842a93ef8335291f57b3b7488ac24f70df96a6034a13cb58e6ff86 \ + --hash=sha256:9ddd49258610499aab83b4f5b61b32e11fce873586282a0e972e5ab3bcadee51 \ + --hash=sha256:9ecde3671e62eeb99e977f5221abcf40c208f69b5eb986b061ccec317c82ebd0 \ + --hash=sha256:9ff4e9ecb6e4b363430edf2c6e50173a63e0820e549918adef70515f87ced19a \ + --hash=sha256:a254537b9b696ede293bfdbc0a65200e8e4507bc9f37831e2a0318a9b333c85c \ + --hash=sha256:a2b9bf8c79b660d0ca1ad95e587818c30ccdb11f787657458d6f26a1ea18c568 \ + --hash=sha256:a61a68d630e812b67b5bf097ab84e2cd79b48c792857dc10ba8a223f5b06a2af \ + --hash=sha256:a7080b0159ce05f179cfac592cda1a82898ca9cd097dacf8ea20ae33474fbb25 \ + --hash=sha256:a8fd93de4e1d278046345f49e2238cdb298589325849b2645d4a94c53faeffc5 \ + --hash=sha256:a94ffc66738da99232ddffcf7910e0f69e2bbe3a0802e54426dbf0714e1c2ffe \ + --hash=sha256:aa806bbc13eac1ab6291ed21ecd2dd426063ca5417dd507e6be58de20e58dfcf \ + --hash=sha256:b0c1a133d42c6fc1f5fbcf5c91331657a1ff822e87989bf4a6e2e39b818d0ee9 \ + --hash=sha256:b58229a844931bca61b3a20efd2be2a2acb4ad1622fc026504309a6883686fbf \ + --hash=sha256:bb2f144c6d98bb5cbc94adeb0447cfd4c0f991341baa68eee3f3b0c9c0e83767 \ + --hash=sha256:be90c94570840939fecedf99fa72839aed70b0ced449b415c85e01ae67422c90 \ + --hash=sha256:bf0d9a171908f32d54f651648c7290397b8792f4303821c42a74e7805bfb813c \ + --hash=sha256:bf15fc0b45914d9d1b706f7c9c4f66f2b7b053e9517e40123e137e8ca8958b3d \ + --hash=sha256:bf4298f366ca7e1ad1d21bbb58300a6985015909964077afd37559084590c929 \ + --hash=sha256:c441c841e82c5ba7a85ad25986014be8d7849c3cfbdb6004541873505929a74e \ + 
--hash=sha256:cacea77ef7a2195f04f9279297684955e3d1ae4241092ff0cfcef532bb7a1c32 \ + --hash=sha256:cd54895e4ae7d32f1e3dd91261df46ee7483a735017dc6f987904f194aa5fd14 \ + --hash=sha256:d1323cd04d6e92150bcc79d0174ce347ed4b349d748b9358fd2e497b121e03c8 \ + --hash=sha256:d383bf5e045d7f9d239b38e6acadd7b7fdf6c0087259a84ae3475d18e9a2ae8b \ + --hash=sha256:d3e7420211f5a65a54675fd860ea04173cde60a7cc20ccfbafcccd155225f8bc \ + --hash=sha256:d8074c5dd61c8a3e915fa8fc04754fa55cfa5978200d2daa1e2d4294c1f136aa \ + --hash=sha256:df03cd88f95b1b99052b52b1bb92173229d7a674df0ab06d2b25765ee8404bce \ + --hash=sha256:e45377d5d6fefe1677da2a2c07b024a6dac782088e37c0b1efea4cfe2b1be19b \ + --hash=sha256:e53d19c2bf7d0d1e6998a7e693c7e87300dd971808e6618964621ccd0e01fe4e \ + --hash=sha256:e560fd75aaf3e5693b91bcaddd8b314f4d57e99aef8a6c6dc692f935cc1e6bbf \ + --hash=sha256:ec5060592d83454e8063e487696ac3783cc48c9a329498bafae0d972bc7816c9 \ + --hash=sha256:ecc2920630283e0783c22e2ac94427f8cca29a04cfdf331467d4f661f4072dac \ + --hash=sha256:ed7161bccab7696a473fe7ddb619c1d75963732b37da4618ba12e60899fefe4f \ + --hash=sha256:ee0bd3a7b2e184e88d25c9baa6a9dc609ba25b76daae942edfb14499ac7ec374 \ + --hash=sha256:ee25f1ac091def37c4b59d192bbe3a206298feeb89132a470325bf76ad122a1e \ + --hash=sha256:efa44f64c37cc30c9f05932c740a8b40ce359f51882c70883cc95feac842da4d \ + --hash=sha256:f47d52fd9b2ac418c4890aad2f6d21a6b96183c98021f0a48497a904199f006e \ + --hash=sha256:f857034dc68d5ceb30fb60afb6ff2103087aea10a01b613985610e007053a121 \ + --hash=sha256:fb91d20fa2d3b13deea98a690534697742029f4fb83673a3501ae6e3746508b5 \ + --hash=sha256:fddb8870bdb83456a489ab67c6b3040a8d5a55069aa6f72f9d872235fbc52f54 + # via + # -c python/requirements_compiled.txt + # aiohttp + # yarl +proto-plus==1.22.3 \ + --hash=sha256:a49cd903bc0b6ab41f76bf65510439d56ca76f868adf0274e738bfdd096894df \ + --hash=sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b + # via + # -c python/requirements_compiled.txt + # google-api-core +protobuf==4.25.8 \ 
+ --hash=sha256:077ff8badf2acf8bc474406706ad890466274191a48d0abd3bd6987107c9cde5 \ + --hash=sha256:15a0af558aa3b13efef102ae6e4f3efac06f1eea11afb3a57db2901447d9fb59 \ + --hash=sha256:27d498ffd1f21fb81d987a041c32d07857d1d107909f5134ba3350e1ce80a4af \ + --hash=sha256:504435d831565f7cfac9f0714440028907f1975e4bed228e58e72ecfff58a1e0 \ + --hash=sha256:6135cf8affe1fc6f76cced2641e4ea8d3e59518d1f24ae41ba97bcad82d397cd \ + --hash=sha256:83e6e54e93d2b696a92cad6e6efc924f3850f82b52e1563778dfab8b355101b0 \ + --hash=sha256:9ad7ef62d92baf5a8654fbb88dac7fa5594cfa70fd3440488a5ca3bfc6d795a7 \ + --hash=sha256:bd551eb1fe1d7e92c1af1d75bdfa572eff1ab0e5bf1736716814cdccdb2360f9 \ + --hash=sha256:ca809b42f4444f144f2115c4c1a747b9a404d590f18f37e9402422033e464e0f \ + --hash=sha256:d552c53d0415449c8d17ced5c341caba0d89dbf433698e1436c8fa0aae7808a3 \ + --hash=sha256:f4510b93a3bec6eba8fd8f1093e9d7fb0d4a24d1a81377c10c0e5bbfe9e4ed24 + # via + # -c python/requirements_compiled.txt + # google-api-core + # googleapis-common-protos + # opentelemetry-proto + # proto-plus + # ray + # tensorboardx +py-spy==0.4.0 ; python_full_version < '3.12' \ + --hash=sha256:47cdda4c34d9b6cb01f3aaeceb2e88faf57da880207fe72ff6ff97e9bb6cc8a9 \ + --hash=sha256:77d8f637ade38367d944874776f45b703b7ac5938b1f7be8891f3a5876ddbb96 \ + --hash=sha256:806602ce7972782cc9c1e383f339bfc27bfb822d42485e6a3e0530ae5040e1f0 \ + --hash=sha256:87573e64dbfdfc89ba2e0f5e2f525aa84e0299c7eb6454b47ea335fde583a7a0 \ + --hash=sha256:8bf2f3702cef367a489faa45177b41a6c31b2a3e5bd78c978d44e29340152f5a \ + --hash=sha256:c5f06ffce4c9c98b7fc9f5e67e5e7db591173f1351837633f3f23d9378b1d18a \ + --hash=sha256:eee3d0bde85ca5cf4f01f012d461180ca76c24835a96f7b5c4ded64eb6a008ab \ + --hash=sha256:f2cf3f7130e7d780471faa5957441d3b4e0ec39a79b2c00f4c33d494f7728428 + # via + # -c python/requirements_compiled.txt + # ray +pyarrow==19.0.1 \ + --hash=sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466 \ + 
--hash=sha256:0148bb4fc158bfbc3d6dfe5001d93ebeed253793fff4435167f6ce1dc4bddeae \ + --hash=sha256:1b93ef2c93e77c442c979b0d596af45e4665d8b96da598db145b0fec014b9136 \ + --hash=sha256:1c7556165bd38cf0cd992df2636f8bcdd2d4b26916c6b7e646101aff3c16f76f \ + --hash=sha256:335d170e050bcc7da867a1ed8ffb8b44c57aaa6e0843b156a501298657b1e972 \ + --hash=sha256:3bf266b485df66a400f282ac0b6d1b500b9d2ae73314a153dbe97d6d5cc8a99e \ + --hash=sha256:41f9706fbe505e0abc10e84bf3a906a1338905cbbcf1177b71486b03e6ea6608 \ + --hash=sha256:4982f8e2b7afd6dae8608d70ba5bd91699077323f812a0448d8b7abdff6cb5d3 \ + --hash=sha256:49a3aecb62c1be1d822f8bf629226d4a96418228a42f5b40835c1f10d42e4db6 \ + --hash=sha256:4d5d1ec7ec5324b98887bdc006f4d2ce534e10e60f7ad995e7875ffa0ff9cb14 \ + --hash=sha256:58d9397b2e273ef76264b45531e9d552d8ec8a6688b7390b5be44c02a37aade8 \ + --hash=sha256:5a9137cf7e1640dce4c190551ee69d478f7121b5c6f323553b319cac936395f6 \ + --hash=sha256:5bd1618ae5e5476b7654c7b55a6364ae87686d4724538c24185bbb2952679960 \ + --hash=sha256:65cf9feebab489b19cdfcfe4aa82f62147218558d8d3f0fc1e9dea0ab8e7905a \ + --hash=sha256:699799f9c80bebcf1da0983ba86d7f289c5a2a5c04b945e2f2bcf7e874a91911 \ + --hash=sha256:6c5941c1aac89a6c2f2b16cd64fe76bcdb94b2b1e99ca6459de4e6f07638d755 \ + --hash=sha256:6ebfb5171bb5f4a52319344ebbbecc731af3f021e49318c74f33d520d31ae0c4 \ + --hash=sha256:7a544ec12de66769612b2d6988c36adc96fb9767ecc8ee0a4d270b10b1c51e00 \ + --hash=sha256:7c1bca1897c28013db5e4c83944a2ab53231f541b9e0c3f4791206d0c0de389a \ + --hash=sha256:80b2ad2b193e7d19e81008a96e313fbd53157945c7be9ac65f44f8937a55427b \ + --hash=sha256:8464c9fbe6d94a7fe1599e7e8965f350fd233532868232ab2596a71586c5a429 \ + --hash=sha256:8f04d49a6b64cf24719c080b3c2029a3a5b16417fd5fd7c4041f94233af732f3 \ + --hash=sha256:96606c3ba57944d128e8a8399da4812f56c7f61de8c647e3470b417f795d0ef9 \ + --hash=sha256:99bc1bec6d234359743b01e70d4310d0ab240c3d6b0da7e2a93663b0158616f6 \ + --hash=sha256:ad76aef7f5f7e4a757fddcdcf010a8290958f09e3470ea458c80d26f4316ae89 \ + 
--hash=sha256:b4c4156a625f1e35d6c0b2132635a237708944eb41df5fbe7d50f20d20c17832 \ + --hash=sha256:b9766a47a9cb56fefe95cb27f535038b5a195707a08bf61b180e642324963b46 \ + --hash=sha256:c0fe3dbbf054a00d1f162fda94ce236a899ca01123a798c561ba307ca38af5f0 \ + --hash=sha256:c6cb2335a411b713fdf1e82a752162f72d4a7b5dbc588e32aa18383318b05866 \ + --hash=sha256:cc55d71898ea30dc95900297d191377caba257612f384207fe9f8293b5850f90 \ + --hash=sha256:d03c9d6f2a3dffbd62671ca070f13fc527bb1867b4ec2b98c7eeed381d4f389a \ + --hash=sha256:d383591f3dcbe545f6cc62daaef9c7cdfe0dff0fb9e1c8121101cabe9098cfa6 \ + --hash=sha256:d9d46e06846a41ba906ab25302cf0fd522f81aa2a85a71021826f34639ad31ef \ + --hash=sha256:d9dedeaf19097a143ed6da37f04f4051aba353c95ef507764d344229b2b740ae \ + --hash=sha256:e45274b20e524ae5c39d7fc1ca2aa923aab494776d2d4b316b49ec7572ca324c \ + --hash=sha256:ee8dec072569f43835932a3b10c55973593abc00936c202707a4ad06af7cb294 \ + --hash=sha256:f24faab6ed18f216a37870d8c5623f9c044566d75ec586ef884e13a02a9d62c5 \ + --hash=sha256:f2a21d39fbdb948857f67eacb5bbaaf36802de044ec36fbef7a1c8f0dd3a4ab2 \ + --hash=sha256:f3ad4c0eb4e2a9aeb990af6c09e6fa0b195c8c0e7b272ecc8d4d2b6574809d34 \ + --hash=sha256:fc28912a2dc924dddc2087679cc8b7263accc71b9ff025a1362b004711661a69 \ + --hash=sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec \ + --hash=sha256:fd44d66093a239358d07c42a91eebf5015aa54fccba959db899f932218ac9cc8 + # via + # -c python/requirements_compiled.txt + # ray +pyasn1==0.5.1 \ + --hash=sha256:4439847c58d40b1d0a573d07e3856e95333f1976294494c325775aeca506eb58 \ + --hash=sha256:6d391a96e59b23130a5cfa74d6fd7f388dbbe26cc8f1edf39fdddf08d9d6676c + # via + # -c python/requirements_compiled.txt + # pyasn1-modules + # rsa +pyasn1-modules==0.3.0 \ + --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ + --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d + # via + # -c python/requirements_compiled.txt + # google-auth +pycparser==2.21 ; 
platform_python_implementation != 'PyPy' \ + --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ + --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 + # via + # -c python/requirements_compiled.txt + # cffi +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b + # via + # -c python/requirements_compiled.txt + # fastapi + # ray +pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + 
--hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + --hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + 
--hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + --hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + --hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + --hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + 
--hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + --hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + --hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + --hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + 
--hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + --hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + --hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d + # via + # -c python/requirements_compiled.txt + # pydantic +pygments==2.18.0 ; sys_platform != 'win32' \ + --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ + --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a + # via + # -c python/requirements_compiled.txt + # rich +pyopenssl==25.0.0 \ + --hash=sha256:424c247065e46e76a37411b9ab1782541c23bb658bf003772c3405fbaa128e90 \ + --hash=sha256:cd2cef799efa3936bb08e8ccb9433a575722b9dd986023f1cabc4ae64e9dac16 + # via + # -c python/requirements_compiled.txt + # ray +python-dateutil==2.8.2 \ + --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ + --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 + # via + # -c python/requirements_compiled.txt + # celery + # pandas +python-dotenv==1.1.1 \ + --hash=sha256:31f23644fe2602f88ff55e1f5c79ba497e01224ee7737937930c448e4d0e24dc \ + --hash=sha256:a8a6399716257f45be6a007360200409fce5cda2661e3dec71d23dc15f6189ab + # via uvicorn +pytz==2022.7.1 \ + --hash=sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0 \ + 
--hash=sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a + # via + # -c python/requirements_compiled.txt + # pandas +pyyaml==6.0.1 \ + --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ + --hash=sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc \ + --hash=sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df \ + --hash=sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741 \ + --hash=sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206 \ + --hash=sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27 \ + --hash=sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595 \ + --hash=sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62 \ + --hash=sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98 \ + --hash=sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696 \ + --hash=sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290 \ + --hash=sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9 \ + --hash=sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d \ + --hash=sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6 \ + --hash=sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867 \ + --hash=sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47 \ + --hash=sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486 \ + --hash=sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6 \ + --hash=sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3 \ + --hash=sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007 \ + --hash=sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938 \ + --hash=sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0 \ + 
--hash=sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c \ + --hash=sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735 \ + --hash=sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d \ + --hash=sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28 \ + --hash=sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4 \ + --hash=sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba \ + --hash=sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8 \ + --hash=sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef \ + --hash=sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5 \ + --hash=sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd \ + --hash=sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3 \ + --hash=sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0 \ + --hash=sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515 \ + --hash=sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c \ + --hash=sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c \ + --hash=sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924 \ + --hash=sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34 \ + --hash=sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43 \ + --hash=sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859 \ + --hash=sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673 \ + --hash=sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54 \ + --hash=sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a \ + --hash=sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b \ + --hash=sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab \ + 
--hash=sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa \ + --hash=sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c \ + --hash=sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585 \ + --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \ + --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f + # via + # -c python/requirements_compiled.txt + # ray + # uvicorn +ray==100.0.0.dev0 \ + --hash=sha256:287f652801352646b3d4ab6cf71d91763555ca2a714364d1a187fbdead96a122 +referencing==0.36.2 \ + --hash=sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa \ + --hash=sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 + # via + # -c python/requirements_compiled.txt + # jsonschema + # jsonschema-specifications +requests==2.32.3 \ + --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ + --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 + # via + # -c python/requirements_compiled.txt + # google-api-core + # ray +rich==13.3.2 ; sys_platform != 'win32' \ + --hash=sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001 \ + --hash=sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f + # via + # -c python/requirements_compiled.txt + # memray +rpds-py==0.22.3 \ + --hash=sha256:009de23c9c9ee54bf11303a966edf4d9087cd43a6003672e6aa7def643d06518 \ + --hash=sha256:02fbb9c288ae08bcb34fb41d516d5eeb0455ac35b5512d03181d755d80810059 \ + --hash=sha256:0a0461200769ab3b9ab7e513f6013b7a97fdeee41c29b9db343f3c5a8e2b9e61 \ + --hash=sha256:0b09865a9abc0ddff4e50b5ef65467cd94176bf1e0004184eb915cbc10fc05c5 \ + --hash=sha256:0b8db6b5b2d4491ad5b6bdc2bc7c017eec108acbf4e6785f42a9eb0ba234f4c9 \ + --hash=sha256:0c150c7a61ed4a4f4955a96626574e9baf1adf772c2fb61ef6a5027e52803543 \ + --hash=sha256:0f3cec041684de9a4684b1572fe28c7267410e02450f4561700ca5a3bc6695a2 \ + 
--hash=sha256:1352ae4f7c717ae8cba93421a63373e582d19d55d2ee2cbb184344c82d2ae55a \ + --hash=sha256:177c7c0fce2855833819c98e43c262007f42ce86651ffbb84f37883308cb0e7d \ + --hash=sha256:1978d0021e943aae58b9b0b196fb4895a25cc53d3956b8e35e0b7682eefb6d56 \ + --hash=sha256:1a60bce91f81ddaac922a40bbb571a12c1070cb20ebd6d49c48e0b101d87300d \ + --hash=sha256:1aef18820ef3e4587ebe8b3bc9ba6e55892a6d7b93bac6d29d9f631a3b4befbd \ + --hash=sha256:1e9663daaf7a63ceccbbb8e3808fe90415b0757e2abddbfc2e06c857bf8c5e2b \ + --hash=sha256:20070c65396f7373f5df4005862fa162db5d25d56150bddd0b3e8214e8ef45b4 \ + --hash=sha256:214b7a953d73b5e87f0ebece4a32a5bd83c60a3ecc9d4ec8f1dca968a2d91e99 \ + --hash=sha256:22bebe05a9ffc70ebfa127efbc429bc26ec9e9b4ee4d15a740033efda515cf3d \ + --hash=sha256:24e8abb5878e250f2eb0d7859a8e561846f98910326d06c0d51381fed59357bd \ + --hash=sha256:26fd7cac7dd51011a245f29a2cc6489c4608b5a8ce8d75661bb4a1066c52dfbe \ + --hash=sha256:27b1d3b3915a99208fee9ab092b8184c420f2905b7d7feb4aeb5e4a9c509b8a1 \ + --hash=sha256:27e98004595899949bd7a7b34e91fa7c44d7a97c40fcaf1d874168bb652ec67e \ + --hash=sha256:2b8f60e1b739a74bab7e01fcbe3dddd4657ec685caa04681df9d562ef15b625f \ + --hash=sha256:2de29005e11637e7a2361fa151f780ff8eb2543a0da1413bb951e9f14b699ef3 \ + --hash=sha256:2e8b55d8517a2fda8d95cb45d62a5a8bbf9dd0ad39c5b25c8833efea07b880ca \ + --hash=sha256:2fa4331c200c2521512595253f5bb70858b90f750d39b8cbfd67465f8d1b596d \ + --hash=sha256:3445e07bf2e8ecfeef6ef67ac83de670358abf2996916039b16a218e3d95e97e \ + --hash=sha256:3453e8d41fe5f17d1f8e9c383a7473cd46a63661628ec58e07777c2fff7196dc \ + --hash=sha256:378753b4a4de2a7b34063d6f95ae81bfa7b15f2c1a04a9518e8644e81807ebea \ + --hash=sha256:3af6e48651c4e0d2d166dc1b033b7042ea3f871504b6805ba5f4fe31581d8d38 \ + --hash=sha256:3dfcbc95bd7992b16f3f7ba05af8a64ca694331bd24f9157b49dadeeb287493b \ + --hash=sha256:3f21f0495edea7fdbaaa87e633a8689cd285f8f4af5c869f27bc8074638ad69c \ + --hash=sha256:4041711832360a9b75cfb11b25a6a97c8fb49c07b8bd43d0d02b45d0b499a4ff \ + 
--hash=sha256:44d61b4b7d0c2c9ac019c314e52d7cbda0ae31078aabd0f22e583af3e0d79723 \ + --hash=sha256:4617e1915a539a0d9a9567795023de41a87106522ff83fbfaf1f6baf8e85437e \ + --hash=sha256:4b232061ca880db21fa14defe219840ad9b74b6158adb52ddf0e87bead9e8493 \ + --hash=sha256:5246b14ca64a8675e0a7161f7af68fe3e910e6b90542b4bfb5439ba752191df6 \ + --hash=sha256:5725dd9cc02068996d4438d397e255dcb1df776b7ceea3b9cb972bdb11260a83 \ + --hash=sha256:583f6a1993ca3369e0f80ba99d796d8e6b1a3a2a442dd4e1a79e652116413091 \ + --hash=sha256:59259dc58e57b10e7e18ce02c311804c10c5a793e6568f8af4dead03264584d1 \ + --hash=sha256:593eba61ba0c3baae5bc9be2f5232430453fb4432048de28399ca7376de9c627 \ + --hash=sha256:59f4a79c19232a5774aee369a0c296712ad0e77f24e62cad53160312b1c1eaa1 \ + --hash=sha256:5f0e260eaf54380380ac3808aa4ebe2d8ca28b9087cf411649f96bad6900c728 \ + --hash=sha256:62d9cfcf4948683a18a9aff0ab7e1474d407b7bab2ca03116109f8464698ab16 \ + --hash=sha256:64607d4cbf1b7e3c3c8a14948b99345eda0e161b852e122c6bb71aab6d1d798c \ + --hash=sha256:655ca44a831ecb238d124e0402d98f6212ac527a0ba6c55ca26f616604e60a45 \ + --hash=sha256:666ecce376999bf619756a24ce15bb14c5bfaf04bf00abc7e663ce17c3f34fe7 \ + --hash=sha256:68049202f67380ff9aa52f12e92b1c30115f32e6895cd7198fa2a7961621fc5a \ + --hash=sha256:69803198097467ee7282750acb507fba35ca22cc3b85f16cf45fb01cb9097730 \ + --hash=sha256:6c7b99ca52c2c1752b544e310101b98a659b720b21db00e65edca34483259967 \ + --hash=sha256:6dd9412824c4ce1aca56c47b0991e65bebb7ac3f4edccfd3f156150c96a7bf25 \ + --hash=sha256:70eb60b3ae9245ddea20f8a4190bd79c705a22f8028aaf8bbdebe4716c3fab24 \ + --hash=sha256:70fb28128acbfd264eda9bf47015537ba3fe86e40d046eb2963d75024be4d055 \ + --hash=sha256:7b2513ba235829860b13faa931f3b6846548021846ac808455301c23a101689d \ + --hash=sha256:7ef9d9da710be50ff6809fed8f1963fecdfecc8b86656cadfca3bc24289414b0 \ + --hash=sha256:81e69b0a0e2537f26d73b4e43ad7bc8c8efb39621639b4434b76a3de50c6966e \ + --hash=sha256:8633e471c6207a039eff6aa116e35f69f3156b3989ea3e2d755f7bc41754a4a7 \ + 
--hash=sha256:8bd7c8cfc0b8247c8799080fbff54e0b9619e17cdfeb0478ba7295d43f635d7c \ + --hash=sha256:9253fc214112405f0afa7db88739294295f0e08466987f1d70e29930262b4c8f \ + --hash=sha256:99b37292234e61325e7a5bb9689e55e48c3f5f603af88b1642666277a81f1fbd \ + --hash=sha256:9bd7228827ec7bb817089e2eb301d907c0d9827a9e558f22f762bb690b131652 \ + --hash=sha256:9beeb01d8c190d7581a4d59522cd3d4b6887040dcfc744af99aa59fef3e041a8 \ + --hash=sha256:a63cbdd98acef6570c62b92a1e43266f9e8b21e699c363c0fef13bd530799c11 \ + --hash=sha256:a76e42402542b1fae59798fab64432b2d015ab9d0c8c47ba7addddbaf7952333 \ + --hash=sha256:ac0a03221cdb5058ce0167ecc92a8c89e8d0decdc9e99a2ec23380793c4dcb96 \ + --hash=sha256:b0b4136a252cadfa1adb705bb81524eee47d9f6aab4f2ee4fa1e9d3cd4581f64 \ + --hash=sha256:b25bc607423935079e05619d7de556c91fb6adeae9d5f80868dde3468657994b \ + --hash=sha256:b3d504047aba448d70cf6fa22e06cb09f7cbd761939fdd47604f5e007675c24e \ + --hash=sha256:bb47271f60660803ad11f4c61b42242b8c1312a31c98c578f79ef9387bbde21c \ + --hash=sha256:bbb232860e3d03d544bc03ac57855cd82ddf19c7a07651a7c0fdb95e9efea8b9 \ + --hash=sha256:bc27863442d388870c1809a87507727b799c8460573cfbb6dc0eeaef5a11b5ec \ + --hash=sha256:bc51abd01f08117283c5ebf64844a35144a0843ff7b2983e0648e4d3d9f10dbb \ + --hash=sha256:be2eb3f2495ba669d2a985f9b426c1797b7d48d6963899276d22f23e33d47e37 \ + --hash=sha256:bf9db5488121b596dbfc6718c76092fda77b703c1f7533a226a5a9f65248f8ad \ + --hash=sha256:c58e2339def52ef6b71b8f36d13c3688ea23fa093353f3a4fee2556e62086ec9 \ + --hash=sha256:cfbc454a2880389dbb9b5b398e50d439e2e58669160f27b60e5eca11f68ae17c \ + --hash=sha256:cff63a0272fcd259dcc3be1657b07c929c466b067ceb1c20060e8d10af56f5bf \ + --hash=sha256:d115bffdd417c6d806ea9069237a4ae02f513b778e3789a359bc5856e0404cc4 \ + --hash=sha256:d20cfb4e099748ea39e6f7b16c91ab057989712d31761d3300d43134e26e165f \ + --hash=sha256:d48424e39c2611ee1b84ad0f44fb3b2b53d473e65de061e3f460fc0be5f1939d \ + --hash=sha256:e0fa2d4ec53dc51cf7d3bb22e0aa0143966119f42a0c3e4998293a3dd2856b09 \ + 
--hash=sha256:e32fee8ab45d3c2db6da19a5323bc3362237c8b653c70194414b892fd06a080d \ + --hash=sha256:e35ba67d65d49080e8e5a1dd40101fccdd9798adb9b050ff670b7d74fa41c566 \ + --hash=sha256:e3fb866d9932a3d7d0c82da76d816996d1667c44891bd861a0f97ba27e84fc74 \ + --hash=sha256:e61b02c3f7a1e0b75e20c3978f7135fd13cb6cf551bf4a6d29b999a88830a338 \ + --hash=sha256:e67ba3c290821343c192f7eae1d8fd5999ca2dc99994114643e2f2d3e6138b15 \ + --hash=sha256:e79dd39f1e8c3504be0607e5fc6e86bb60fe3584bec8b782578c3b0fde8d932c \ + --hash=sha256:e89391e6d60251560f0a8f4bd32137b077a80d9b7dbe6d5cab1cd80d2746f648 \ + --hash=sha256:ea7433ce7e4bfc3a85654aeb6747babe3f66eaf9a1d0c1e7a4435bbdf27fea84 \ + --hash=sha256:eaf16ae9ae519a0e237a0f528fd9f0197b9bb70f40263ee57ae53c2b8d48aeb3 \ + --hash=sha256:eb0c341fa71df5a4595f9501df4ac5abfb5a09580081dffbd1ddd4654e6e9123 \ + --hash=sha256:f276b245347e6e36526cbd4a266a417796fc531ddf391e43574cf6466c492520 \ + --hash=sha256:f47ad3d5f3258bd7058d2d506852217865afefe6153a36eb4b6928758041d831 \ + --hash=sha256:f56a6b404f74ab372da986d240e2e002769a7d7102cc73eb238a4f72eec5284e \ + --hash=sha256:f5cf2a0c2bdadf3791b5c205d55a37a54025c6e18a71c71f82bb536cf9a454bf \ + --hash=sha256:f5d36399a1b96e1a5fdc91e0522544580dbebeb1f77f27b2b0ab25559e103b8b \ + --hash=sha256:f60bd8423be1d9d833f230fdbccf8f57af322d96bcad6599e5a771b151398eb2 \ + --hash=sha256:f612463ac081803f243ff13cccc648578e2279295048f2a8d5eb430af2bae6e3 \ + --hash=sha256:f73d3fef726b3243a811121de45193c0ca75f6407fe66f3f4e183c983573e130 \ + --hash=sha256:f82a116a1d03628a8ace4859556fb39fd1424c933341a08ea3ed6de1edb0283b \ + --hash=sha256:fb0ba113b4983beac1a2eb16faffd76cb41e176bf58c4afe3e14b9c681f702de \ + --hash=sha256:fb4f868f712b2dd4bcc538b0a0c1f63a2b1d584c925e69a224d759e7070a12d5 \ + --hash=sha256:fb6116dfb8d1925cbdb52595560584db42a7f664617a1f7d7f6e32f138cdf37d \ + --hash=sha256:fda7cb070f442bf80b642cd56483b5548e43d366fe3f39b98e67cce780cded00 \ + --hash=sha256:feea821ee2a9273771bae61194004ee2fc33f8ec7db08117ef9147d4bbcbca8e + # via + # 
-c python/requirements_compiled.txt + # jsonschema + # referencing +rsa==4.7.2 \ + --hash=sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2 \ + --hash=sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9 + # via + # -c python/requirements_compiled.txt + # google-auth +scipy==1.11.4 \ + --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ + --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ + --hash=sha256:1b7c3dca977f30a739e0409fb001056484661cb2541a01aba0bb0029f7b68db8 \ + --hash=sha256:2c6ff6ef9cc27f9b3db93a6f8b38f97387e6e0591600369a297a50a8e96e835d \ + --hash=sha256:36750b7733d960d7994888f0d148d31ea3017ac15eef664194b4ef68d36a4a97 \ + --hash=sha256:530f9ad26440e85766509dbf78edcfe13ffd0ab7fec2560ee5c36ff74d6269ff \ + --hash=sha256:5e347b14fe01003d3b78e196e84bd3f48ffe4c8a7b8a1afbcb8f5505cb710993 \ + --hash=sha256:6550466fbeec7453d7465e74d4f4b19f905642c89a7525571ee91dd7adabb5a3 \ + --hash=sha256:6df1468153a31cf55ed5ed39647279beb9cfb5d3f84369453b49e4b8502394fd \ + --hash=sha256:6e619aba2df228a9b34718efb023966da781e89dd3d21637b27f2e54db0410d7 \ + --hash=sha256:8fce70f39076a5aa62e92e69a7f62349f9574d8405c0a5de6ed3ef72de07f446 \ + --hash=sha256:90a2b78e7f5733b9de748f589f09225013685f9b218275257f8a8168ededaeaa \ + --hash=sha256:91af76a68eeae0064887a48e25c4e616fa519fa0d38602eda7e0f97d65d57937 \ + --hash=sha256:933baf588daa8dc9a92c20a0be32f56d43faf3d1a60ab11b3f08c356430f6e56 \ + --hash=sha256:acf8ed278cc03f5aff035e69cb511741e0418681d25fbbb86ca65429c4f4d9cd \ + --hash=sha256:ad669df80528aeca5f557712102538f4f37e503f0c5b9541655016dd0932ca79 \ + --hash=sha256:b030c6674b9230d37c5c60ab456e2cf12f6784596d15ce8da9365e70896effc4 \ + --hash=sha256:b9999c008ccf00e8fbcce1236f85ade5c569d13144f77a1946bef8863e8f6eb4 \ + --hash=sha256:bc9a714581f561af0848e6b69947fda0614915f072dfd14142ed1bfe1b806710 \ + --hash=sha256:ce7fff2e23ab2cc81ff452a9444c215c28e6305f396b2ba88343a567feec9660 \ + 
--hash=sha256:cf00bd2b1b0211888d4dc75656c0412213a8b25e80d73898083f402b50f47e41 \ + --hash=sha256:d10e45a6c50211fe256da61a11c34927c68f277e03138777bdebedd933712fea \ + --hash=sha256:ee410e6de8f88fd5cf6eadd73c135020bfbbbdfcd0f6162c36a7638a1ea8cc65 \ + --hash=sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be \ + --hash=sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec + # via + # -c python/requirements_compiled.txt + # ray +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via + # -c python/requirements_compiled.txt + # opencensus + # python-dateutil +smart-open==6.2.0 \ + --hash=sha256:088bf00f9327c71e549bc2f86567d3320df5d89667f009ce1c16568976068ef7 \ + --hash=sha256:1b4df5c8365218f3852c507451920ccad606c80b0acb4e67508e50ba9b5d2632 + # via + # -c python/requirements_compiled.txt + # ray +sniffio==1.3.1 \ + --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ + --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc + # via + # -c python/requirements_compiled.txt + # anyio +starlette==0.46.2 \ + --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ + --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5 + # via + # -c python/requirements_compiled.txt + # fastapi + # ray +tensorboardx==2.6.2.2 \ + --hash=sha256:160025acbf759ede23fd3526ae9d9bfbfd8b68eb16c38a010ebe326dc6395db8 \ + --hash=sha256:c6476d7cd0d529b0b72f4acadb1269f9ed8b22f441e87a84f2a3b940bb87b666 + # via + # -c python/requirements_compiled.txt + # ray +typing-extensions==4.12.2 \ + --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d + # via + # -c python/requirements_compiled.txt + # fastapi + # gymnasium + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # pydantic + # 
pydantic-core + # pyopenssl + # referencing + # typing-inspection +typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c python/requirements_compiled.txt + # pydantic +tzdata==2025.2 \ + --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c python/requirements_compiled.txt + # kombu +urllib3==1.26.19 \ + --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ + --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 + # via + # -c python/requirements_compiled.txt + # requests +uvicorn==0.22.0 \ + --hash=sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8 \ + --hash=sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996 + # via + # -c python/requirements_compiled.txt + # ray +uvloop==0.21.0 ; platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32' \ + --hash=sha256:0878c2640cf341b269b7e128b1a5fed890adc4455513ca710d77d5e93aa6d6a0 \ + --hash=sha256:10d66943def5fcb6e7b37310eb6b5639fd2ccbc38df1177262b0640c3ca68c1f \ + --hash=sha256:10da8046cc4a8f12c91a1c39d1dd1585c41162a15caaef165c2174db9ef18bdc \ + --hash=sha256:17df489689befc72c39a08359efac29bbee8eee5209650d4b9f34df73d22e414 \ + --hash=sha256:183aef7c8730e54c9a3ee3227464daed66e37ba13040bb3f350bc2ddc040f22f \ + --hash=sha256:196274f2adb9689a289ad7d65700d37df0c0930fd8e4e743fa4834e850d7719d \ + --hash=sha256:221f4f2a1f46032b403bf3be628011caf75428ee3cc204a22addf96f586b19fd \ + --hash=sha256:2d1f581393673ce119355d56da84fe1dd9d2bb8b3d13ce792524e1607139feff \ + --hash=sha256:359ec2c888397b9e592a889c4d72ba3d6befba8b2bb01743f72fffbde663b59c \ + --hash=sha256:3bf12b0fda68447806a7ad847bfa591613177275d35b6724b1ee573faa3704e3 \ + 
--hash=sha256:4509360fcc4c3bd2c70d87573ad472de40c13387f5fda8cb58350a1d7475e58d \ + --hash=sha256:460def4412e473896ef179a1671b40c039c7012184b627898eea5072ef6f017a \ + --hash=sha256:461d9ae6660fbbafedd07559c6a2e57cd553b34b0065b6550685f6653a98c1cb \ + --hash=sha256:46923b0b5ee7fc0020bef24afe7836cb068f5050ca04caf6b487c513dc1a20b2 \ + --hash=sha256:53e420a3afe22cdcf2a0f4846e377d16e718bc70103d7088a4f7623567ba5fb0 \ + --hash=sha256:5ee4d4ef48036ff6e5cfffb09dd192c7a5027153948d85b8da7ff705065bacc6 \ + --hash=sha256:67dd654b8ca23aed0a8e99010b4c34aca62f4b7fce88f39d452ed7622c94845c \ + --hash=sha256:787ae31ad8a2856fc4e7c095341cccc7209bd657d0e71ad0dc2ea83c4a6fa8af \ + --hash=sha256:86975dca1c773a2c9864f4c52c5a55631038e387b47eaf56210f873887b6c8dc \ + --hash=sha256:87c43e0f13022b998eb9b973b5e97200c8b90823454d4bc06ab33829e09fb9bb \ + --hash=sha256:88cb67cdbc0e483da00af0b2c3cdad4b7c61ceb1ee0f33fe00e09c81e3a6cb75 \ + --hash=sha256:8a375441696e2eda1c43c44ccb66e04d61ceeffcd76e4929e527b7fa401b90fb \ + --hash=sha256:a5c39f217ab3c663dc699c04cbd50c13813e31d917642d459fdcec07555cc553 \ + --hash=sha256:b9fb766bb57b7388745d8bcc53a359b116b8a04c83a2288069809d2b3466c37e \ + --hash=sha256:baa0e6291d91649c6ba4ed4b2f982f9fa165b5bbd50a9e203c416a2797bab3c6 \ + --hash=sha256:baa4dcdbd9ae0a372f2167a207cd98c9f9a1ea1188a8a526431eef2f8116cc8d \ + --hash=sha256:bc09f0ff191e61c2d592a752423c767b4ebb2986daa9ed62908e2b1b9a9ae206 \ + --hash=sha256:bd53ecc9a0f3d87ab847503c2e1552b690362e005ab54e8a48ba97da3924c0dc \ + --hash=sha256:bfd55dfcc2a512316e65f16e503e9e450cab148ef11df4e4e679b5e8253a5281 \ + --hash=sha256:c097078b8031190c934ed0ebfee8cc5f9ba9642e6eb88322b9958b649750f72b \ + --hash=sha256:c0f3fa6200b3108919f8bdabb9a7f87f20e7097ea3c543754cabc7d717d95cf8 \ + --hash=sha256:e678ad6fe52af2c58d2ae3c73dc85524ba8abe637f134bf3564ed07f555c5e79 \ + --hash=sha256:ec7e6b09a6fdded42403182ab6b832b71f4edaf7f37a9a0e371a01db5f0cb45f \ + --hash=sha256:f0ce1b49560b1d2d8a2977e3ba4afb2414fb46b86a1b64056bc4ab929efdafbe \ + 
--hash=sha256:f38b2e090258d051d68a5b14d1da7203a3c3677321cf32a95a6f4db4dd8b6f26 \ + --hash=sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816 \ + --hash=sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2 + # via + # -c python/requirements_compiled.txt + # uvicorn +vine==5.1.0 \ + --hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c python/requirements_compiled.txt + # amqp + # celery + # kombu +virtualenv==20.29.1 \ + --hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ + --hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 + # via + # -c python/requirements_compiled.txt + # ray +watchfiles==0.19.0 \ + --hash=sha256:0089c6dc24d436b373c3c57657bf4f9a453b13767150d17284fc6162b2791911 \ + --hash=sha256:09ea3397aecbc81c19ed7f025e051a7387feefdb789cf768ff994c1228182fda \ + --hash=sha256:176a9a7641ec2c97b24455135d58012a5be5c6217fc4d5fef0b2b9f75dbf5154 \ + --hash=sha256:18b28f6ad871b82df9542ff958d0c86bb0d8310bb09eb8e87d97318a3b5273af \ + --hash=sha256:20b44221764955b1e703f012c74015306fb7e79a00c15370785f309b1ed9aa8d \ + --hash=sha256:3d7d267d27aceeeaa3de0dd161a0d64f0a282264d592e335fff7958cc0cbae7c \ + --hash=sha256:5471582658ea56fca122c0f0d0116a36807c63fefd6fdc92c71ca9a4491b6b48 \ + --hash=sha256:5569fc7f967429d4bc87e355cdfdcee6aabe4b620801e2cf5805ea245c06097c \ + --hash=sha256:68dce92b29575dda0f8d30c11742a8e2b9b8ec768ae414b54f7453f27bdf9545 \ + --hash=sha256:79c533ff593db861ae23436541f481ec896ee3da4e5db8962429b441bbaae16e \ + --hash=sha256:7f3920b1285a7d3ce898e303d84791b7bf40d57b7695ad549dc04e6a44c9f120 \ + --hash=sha256:91633e64712df3051ca454ca7d1b976baf842d7a3640b87622b323c55f3345e7 \ + --hash=sha256:945be0baa3e2440151eb3718fd8846751e8b51d8de7b884c90b17d271d34cae8 \ + --hash=sha256:9afd0d69429172c796164fd7fe8e821ade9be983f51c659a38da3faaaaac44dc \ + 
--hash=sha256:9c75eff897786ee262c9f17a48886f4e98e6cfd335e011c591c305e5d083c056 \ + --hash=sha256:b538014a87f94d92f98f34d3e6d2635478e6be6423a9ea53e4dd96210065e193 \ + --hash=sha256:b6577b8c6c8701ba8642ea9335a129836347894b666dd1ec2226830e263909d3 \ + --hash=sha256:c0376deac92377817e4fb8f347bf559b7d44ff556d9bc6f6208dd3f79f104aaf \ + --hash=sha256:cae3dde0b4b2078f31527acff6f486e23abed307ba4d3932466ba7cdd5ecec79 \ + --hash=sha256:cb5d45c4143c1dd60f98a16187fd123eda7248f84ef22244818c18d531a249d1 \ + --hash=sha256:d9b073073e048081e502b6c6b0b88714c026a1a4c890569238d04aca5f9ca74b \ + --hash=sha256:fac19dc9cbc34052394dbe81e149411a62e71999c0a19e1e09ce537867f95ae0 + # via + # -c python/requirements_compiled.txt + # ray + # uvicorn +wcwidth==0.2.13 \ + --hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \ + --hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5 + # via + # -c python/requirements_compiled.txt + # prompt-toolkit +websockets==11.0.3 \ + --hash=sha256:01f5567d9cf6f502d655151645d4e8b72b453413d3819d2b6f1185abc23e82dd \ + --hash=sha256:03aae4edc0b1c68498f41a6772d80ac7c1e33c06c6ffa2ac1c27a07653e79d6f \ + --hash=sha256:0ac56b661e60edd453585f4bd68eb6a29ae25b5184fd5ba51e97652580458998 \ + --hash=sha256:0ee68fe502f9031f19d495dae2c268830df2760c0524cbac5d759921ba8c8e82 \ + --hash=sha256:1553cb82942b2a74dd9b15a018dce645d4e68674de2ca31ff13ebc2d9f283788 \ + --hash=sha256:1a073fc9ab1c8aff37c99f11f1641e16da517770e31a37265d2755282a5d28aa \ + --hash=sha256:1d2256283fa4b7f4c7d7d3e84dc2ece74d341bce57d5b9bf385df109c2a1a82f \ + --hash=sha256:1d5023a4b6a5b183dc838808087033ec5df77580485fc533e7dab2567851b0a4 \ + --hash=sha256:1fdf26fa8a6a592f8f9235285b8affa72748dc12e964a5518c6c5e8f916716f7 \ + --hash=sha256:2529338a6ff0eb0b50c7be33dc3d0e456381157a31eefc561771ee431134a97f \ + --hash=sha256:279e5de4671e79a9ac877427f4ac4ce93751b8823f276b681d04b2156713b9dd \ + --hash=sha256:2d903ad4419f5b472de90cd2d40384573b25da71e33519a67797de17ef849b69 \ + 
--hash=sha256:332d126167ddddec94597c2365537baf9ff62dfcc9db4266f263d455f2f031cb \ + --hash=sha256:34fd59a4ac42dff6d4681d8843217137f6bc85ed29722f2f7222bd619d15e95b \ + --hash=sha256:3580dd9c1ad0701169e4d6fc41e878ffe05e6bdcaf3c412f9d559389d0c9e016 \ + --hash=sha256:3ccc8a0c387629aec40f2fc9fdcb4b9d5431954f934da3eaf16cdc94f67dbfac \ + --hash=sha256:41f696ba95cd92dc047e46b41b26dd24518384749ed0d99bea0a941ca87404c4 \ + --hash=sha256:42cc5452a54a8e46a032521d7365da775823e21bfba2895fb7b77633cce031bb \ + --hash=sha256:4841ed00f1026dfbced6fca7d963c4e7043aa832648671b5138008dc5a8f6d99 \ + --hash=sha256:4b253869ea05a5a073ebfdcb5cb3b0266a57c3764cf6fe114e4cd90f4bfa5f5e \ + --hash=sha256:54c6e5b3d3a8936a4ab6870d46bdd6ec500ad62bde9e44462c32d18f1e9a8e54 \ + --hash=sha256:619d9f06372b3a42bc29d0cd0354c9bb9fb39c2cbc1a9c5025b4538738dbffaf \ + --hash=sha256:6505c1b31274723ccaf5f515c1824a4ad2f0d191cec942666b3d0f3aa4cb4007 \ + --hash=sha256:660e2d9068d2bedc0912af508f30bbeb505bbbf9774d98def45f68278cea20d3 \ + --hash=sha256:6681ba9e7f8f3b19440921e99efbb40fc89f26cd71bf539e45d8c8a25c976dc6 \ + --hash=sha256:68b977f21ce443d6d378dbd5ca38621755f2063d6fdb3335bda981d552cfff86 \ + --hash=sha256:69269f3a0b472e91125b503d3c0b3566bda26da0a3261c49f0027eb6075086d1 \ + --hash=sha256:6f1a3f10f836fab6ca6efa97bb952300b20ae56b409414ca85bff2ad241d2a61 \ + --hash=sha256:7622a89d696fc87af8e8d280d9b421db5133ef5b29d3f7a1ce9f1a7bf7fcfa11 \ + --hash=sha256:777354ee16f02f643a4c7f2b3eff8027a33c9861edc691a2003531f5da4f6bc8 \ + --hash=sha256:84d27a4832cc1a0ee07cdcf2b0629a8a72db73f4cf6de6f0904f6661227f256f \ + --hash=sha256:8531fdcad636d82c517b26a448dcfe62f720e1922b33c81ce695d0edb91eb931 \ + --hash=sha256:86d2a77fd490ae3ff6fae1c6ceaecad063d3cc2320b44377efdde79880e11526 \ + --hash=sha256:88fc51d9a26b10fc331be344f1781224a375b78488fc343620184e95a4b27016 \ + --hash=sha256:8a34e13a62a59c871064dfd8ffb150867e54291e46d4a7cf11d02c94a5275bae \ + --hash=sha256:8c82f11964f010053e13daafdc7154ce7385ecc538989a354ccc7067fd7028fd \ + 
--hash=sha256:92b2065d642bf8c0a82d59e59053dd2fdde64d4ed44efe4870fa816c1232647b \ + --hash=sha256:97b52894d948d2f6ea480171a27122d77af14ced35f62e5c892ca2fae9344311 \ + --hash=sha256:9d9acd80072abcc98bd2c86c3c9cd4ac2347b5a5a0cae7ed5c0ee5675f86d9af \ + --hash=sha256:9f59a3c656fef341a99e3d63189852be7084c0e54b75734cde571182c087b152 \ + --hash=sha256:aa5003845cdd21ac0dc6c9bf661c5beddd01116f6eb9eb3c8e272353d45b3288 \ + --hash=sha256:b16fff62b45eccb9c7abb18e60e7e446998093cdcb50fed33134b9b6878836de \ + --hash=sha256:b30c6590146e53149f04e85a6e4fcae068df4289e31e4aee1fdf56a0dead8f97 \ + --hash=sha256:b58cbf0697721120866820b89f93659abc31c1e876bf20d0b3d03cef14faf84d \ + --hash=sha256:b67c6f5e5a401fc56394f191f00f9b3811fe843ee93f4a70df3c389d1adf857d \ + --hash=sha256:bceab846bac555aff6427d060f2fcfff71042dba6f5fca7dc4f75cac815e57ca \ + --hash=sha256:bee9fcb41db2a23bed96c6b6ead6489702c12334ea20a297aa095ce6d31370d0 \ + --hash=sha256:c114e8da9b475739dde229fd3bc6b05a6537a88a578358bc8eb29b4030fac9c9 \ + --hash=sha256:c1f0524f203e3bd35149f12157438f406eff2e4fb30f71221c8a5eceb3617b6b \ + --hash=sha256:c792ea4eabc0159535608fc5658a74d1a81020eb35195dd63214dcf07556f67e \ + --hash=sha256:c7f3cb904cce8e1be667c7e6fef4516b98d1a6a0635a58a57528d577ac18a128 \ + --hash=sha256:d67ac60a307f760c6e65dad586f556dde58e683fab03323221a4e530ead6f74d \ + --hash=sha256:dcacf2c7a6c3a84e720d1bb2b543c675bf6c40e460300b628bab1b1efc7c034c \ + --hash=sha256:de36fe9c02995c7e6ae6efe2e205816f5f00c22fd1fbf343d4d18c3d5ceac2f5 \ + --hash=sha256:def07915168ac8f7853812cc593c71185a16216e9e4fa886358a17ed0fd9fcf6 \ + --hash=sha256:df41b9bc27c2c25b486bae7cf42fccdc52ff181c8c387bfd026624a491c2671b \ + --hash=sha256:e052b8467dd07d4943936009f46ae5ce7b908ddcac3fda581656b1b19c083d9b \ + --hash=sha256:e063b1865974611313a3849d43f2c3f5368093691349cf3c7c8f8f75ad7cb280 \ + --hash=sha256:e1459677e5d12be8bbc7584c35b992eea142911a6236a3278b9b5ce3326f282c \ + --hash=sha256:e1a99a7a71631f0efe727c10edfba09ea6bee4166a6f9c19aafb6c0b5917d09c \ + 
--hash=sha256:e590228200fcfc7e9109509e4d9125eace2042fd52b595dd22bbc34bb282307f \ + --hash=sha256:e6316827e3e79b7b8e7d8e3b08f4e331af91a48e794d5d8b099928b6f0b85f20 \ + --hash=sha256:e7837cb169eca3b3ae94cc5787c4fed99eef74c0ab9506756eea335e0d6f3ed8 \ + --hash=sha256:e848f46a58b9fcf3d06061d17be388caf70ea5b8cc3466251963c8345e13f7eb \ + --hash=sha256:ed058398f55163a79bb9f06a90ef9ccc063b204bb346c4de78efc5d15abfe602 \ + --hash=sha256:f2e58f2c36cc52d41f2659e4c0cbf7353e28c8c9e63e30d8c6d3494dc9fdedcf \ + --hash=sha256:f467ba0050b7de85016b43f5a22b46383ef004c4f672148a8abf32bc999a87f0 \ + --hash=sha256:f61bdb1df43dc9c131791fbc2355535f9024b9a04398d3bd0684fc16ab07df74 \ + --hash=sha256:fb06eea71a00a7af0ae6aefbb932fb8a7df3cb390cc217d51a9ad7343de1b8d0 \ + --hash=sha256:ffd7dcaf744f25f82190856bc26ed81721508fc5cbf2a330751e135ff1283564 + # via + # -c python/requirements_compiled.txt + # uvicorn +yarl==1.18.3 \ + --hash=sha256:00e5a1fea0fd4f5bfa7440a47eff01d9822a65b4488f7cff83155a0f31a2ecba \ + --hash=sha256:02ddb6756f8f4517a2d5e99d8b2f272488e18dd0bfbc802f31c16c6c20f22193 \ + --hash=sha256:045b8482ce9483ada4f3f23b3774f4e1bf4f23a2d5c912ed5170f68efb053318 \ + --hash=sha256:09c7907c8548bcd6ab860e5f513e727c53b4a714f459b084f6580b49fa1b9cee \ + --hash=sha256:0b0cad37311123211dc91eadcb322ef4d4a66008d3e1bdc404808992260e1a0e \ + --hash=sha256:0b3c92fa08759dbf12b3a59579a4096ba9af8dd344d9a813fc7f5070d86bbab1 \ + --hash=sha256:0fb2171a4486bb075316ee754c6d8382ea6eb8b399d4ec62fde2b591f879778a \ + --hash=sha256:1a74a13a4c857a84a845505fd2d68e54826a2cd01935a96efb1e9d86c728e186 \ + --hash=sha256:1d407181cfa6e70077df3377938c08012d18893f9f20e92f7d2f314a437c30b1 \ + --hash=sha256:1dd4bdd05407ced96fed3d7f25dbbf88d2ffb045a0db60dbc247f5b3c5c25d50 \ + --hash=sha256:25b411eddcfd56a2f0cd6a384e9f4f7aa3efee14b188de13048c25b5e91f1640 \ + --hash=sha256:2d06d3005e668744e11ed80812e61efd77d70bb7f03e33c1598c301eea20efbb \ + --hash=sha256:2ec9bbba33b2d00999af4631a3397d1fd78290c48e2a3e52d8dd72db3a067ac8 \ + 
--hash=sha256:3236da9272872443f81fedc389bace88408f64f89f75d1bdb2256069a8730ccc \ + --hash=sha256:35098b24e0327fc4ebdc8ffe336cee0a87a700c24ffed13161af80124b7dc8e5 \ + --hash=sha256:41f7ce59d6ee7741af71d82020346af364949314ed3d87553763a2df1829cc58 \ + --hash=sha256:436c4fc0a4d66b2badc6c5fc5ef4e47bb10e4fd9bf0c79524ac719a01f3607c2 \ + --hash=sha256:4891ed92157e5430874dad17b15eb1fda57627710756c27422200c52d8a4e393 \ + --hash=sha256:4ac515b860c36becb81bb84b667466885096b5fc85596948548b667da3bf9f24 \ + --hash=sha256:5094d9206c64181d0f6e76ebd8fb2f8fe274950a63890ee9e0ebfd58bf9d787b \ + --hash=sha256:54d6921f07555713b9300bee9c50fb46e57e2e639027089b1d795ecd9f7fa910 \ + --hash=sha256:578e281c393af575879990861823ef19d66e2b1d0098414855dd367e234f5b3c \ + --hash=sha256:5a3f356548e34a70b0172d8890006c37be92995f62d95a07b4a42e90fba54272 \ + --hash=sha256:602d98f2c2d929f8e697ed274fbadc09902c4025c5a9963bf4e9edfc3ab6f7ed \ + --hash=sha256:61b1a825a13bef4a5f10b1885245377d3cd0bf87cba068e1d9a88c2ae36880e1 \ + --hash=sha256:61e5e68cb65ac8f547f6b5ef933f510134a6bf31bb178be428994b0cb46c2a04 \ + --hash=sha256:61ee62ead9b68b9123ec24bc866cbef297dd266175d53296e2db5e7f797f902d \ + --hash=sha256:6333c5a377c8e2f5fae35e7b8f145c617b02c939d04110c76f29ee3676b5f9a5 \ + --hash=sha256:6748dbf9bfa5ba1afcc7556b71cda0d7ce5f24768043a02a58846e4a443d808d \ + --hash=sha256:67a283dd2882ac98cc6318384f565bffc751ab564605959df4752d42483ad889 \ + --hash=sha256:75674776d96d7b851b6498f17824ba17849d790a44d282929c42dbb77d4f17ae \ + --hash=sha256:757e81cae69244257d125ff31663249b3013b5dc0a8520d73694aed497fb195b \ + --hash=sha256:77a6e85b90a7641d2e07184df5557132a337f136250caafc9ccaa4a2a998ca2c \ + --hash=sha256:7c33dd1931a95e5d9a772d0ac5e44cac8957eaf58e3c8da8c1414de7dd27c576 \ + --hash=sha256:7df647e8edd71f000a5208fe6ff8c382a1de8edfbccdbbfe649d263de07d8c34 \ + --hash=sha256:7e2ee16578af3b52ac2f334c3b1f92262f47e02cc6193c598502bd46f5cd1477 \ + --hash=sha256:80316a8bd5109320d38eef8833ccf5f89608c9107d02d2a7f985f98ed6876990 \ + 
--hash=sha256:82123d0c954dc58db301f5021a01854a85bf1f3bb7d12ae0c01afc414a882ca2 \ + --hash=sha256:84b2deecba4a3f1a398df819151eb72d29bfeb3b69abb145a00ddc8d30094512 \ + --hash=sha256:8503ad47387b8ebd39cbbbdf0bf113e17330ffd339ba1144074da24c545f0069 \ + --hash=sha256:877d209b6aebeb5b16c42cbb377f5f94d9e556626b1bfff66d7b0d115be88d0a \ + --hash=sha256:8874027a53e3aea659a6d62751800cf6e63314c160fd607489ba5c2edd753cf6 \ + --hash=sha256:88a19f62ff30117e706ebc9090b8ecc79aeb77d0b1f5ec10d2d27a12bc9f66d0 \ + --hash=sha256:8d39d351e7faf01483cc7ff7c0213c412e38e5a340238826be7e0e4da450fdc8 \ + --hash=sha256:90adb47ad432332d4f0bc28f83a5963f426ce9a1a8809f5e584e704b82685dcb \ + --hash=sha256:913829534200eb0f789d45349e55203a091f45c37a2674678744ae52fae23efa \ + --hash=sha256:93b2e109287f93db79210f86deb6b9bbb81ac32fc97236b16f7433db7fc437d8 \ + --hash=sha256:9d41beda9dc97ca9ab0b9888cb71f7539124bc05df02c0cff6e5acc5a19dcc6e \ + --hash=sha256:a440a2a624683108a1b454705ecd7afc1c3438a08e890a1513d468671d90a04e \ + --hash=sha256:a4bb030cf46a434ec0225bddbebd4b89e6471814ca851abb8696170adb163985 \ + --hash=sha256:a9ca04806f3be0ac6d558fffc2fdf8fcef767e0489d2684a21912cc4ed0cd1b8 \ + --hash=sha256:ac1801c45cbf77b6c99242eeff4fffb5e4e73a800b5c4ad4fc0be5def634d2e1 \ + --hash=sha256:ac36703a585e0929b032fbaab0707b75dc12703766d0b53486eabd5139ebadd5 \ + --hash=sha256:b1771de9944d875f1b98a745bc547e684b863abf8f8287da8466cf470ef52690 \ + --hash=sha256:b464c4ab4bfcb41e3bfd3f1c26600d038376c2de3297760dfe064d2cb7ea8e10 \ + --hash=sha256:b4f6450109834af88cb4cc5ecddfc5380ebb9c228695afc11915a0bf82116789 \ + --hash=sha256:b57f4f58099328dfb26c6a771d09fb20dbbae81d20cfb66141251ea063bd101b \ + --hash=sha256:b643562c12680b01e17239be267bc306bbc6aac1f34f6444d1bded0c5ce438ca \ + --hash=sha256:b958ddd075ddba5b09bb0be8a6d9906d2ce933aee81100db289badbeb966f54e \ + --hash=sha256:b9d60031cf568c627d028239693fd718025719c02c9f55df0a53e587aab951b5 \ + --hash=sha256:ba23302c0c61a9999784e73809427c9dbedd79f66a13d84ad1b1943802eaaf59 \ + 
--hash=sha256:ba87babd629f8af77f557b61e49e7c7cac36f22f871156b91e10a6e9d4f829e9 \ + --hash=sha256:c017a3b6df3a1bd45b9fa49a0f54005e53fbcad16633870104b66fa1a30a29d8 \ + --hash=sha256:c1e1cc06da1491e6734f0ea1e6294ce00792193c463350626571c287c9a704db \ + --hash=sha256:c654d5207c78e0bd6d749f6dae1dcbbfde3403ad3a4b11f3c5544d9906969dde \ + --hash=sha256:c69697d3adff5aa4f874b19c0e4ed65180ceed6318ec856ebc423aa5850d84f7 \ + --hash=sha256:c7d79f7d9aabd6011004e33b22bc13056a3e3fb54794d138af57f5ee9d9032cb \ + --hash=sha256:ccaa3a4b521b780a7e771cc336a2dba389a0861592bbce09a476190bb0c8b4b3 \ + --hash=sha256:ccd17349166b1bee6e529b4add61727d3f55edb7babbe4069b5764c9587a8cc6 \ + --hash=sha256:ce1af883b94304f493698b00d0f006d56aea98aeb49d75ec7d98cd4a777e9285 \ + --hash=sha256:d0e883008013c0e4aef84dcfe2a0b172c4d23c2669412cf5b3371003941f72bb \ + --hash=sha256:d980e0325b6eddc81331d3f4551e2a333999fb176fd153e075c6d1c2530aa8a8 \ + --hash=sha256:e17c9361d46a4d5addf777c6dd5eab0715a7684c2f11b88c67ac37edfba6c482 \ + --hash=sha256:e2c08cc9b16f4f4bc522771d96734c7901e7ebef70c6c5c35dd0f10845270bcd \ + --hash=sha256:e35ef8683211db69ffe129a25d5634319a677570ab6b2eba4afa860f54eeaf75 \ + --hash=sha256:e3b9fd71836999aad54084906f8663dffcd2a7fb5cdafd6c37713b2e72be1760 \ + --hash=sha256:ef9f7768395923c3039055c14334ba4d926f3baf7b776c923c93d80195624782 \ + --hash=sha256:f52a265001d830bc425f82ca9eabda94a64a4d753b07d623a9f2863fde532b53 \ + --hash=sha256:f91c4803173928a25e1a55b943c81f55b8872f0018be83e3ad4938adffb77dd2 \ + --hash=sha256:fbd6748e8ab9b41171bb95c6142faf068f5ef1511935a0aa07025438dd9a9bc1 \ + --hash=sha256:fe57328fbc1bfd0bd0514470ac692630f3901c0ee39052ae47acd1d90a436719 \ + --hash=sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62 + # via + # -c python/requirements_compiled.txt + # aiohttp +zipp==3.19.2 \ + --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c + # via + # -c python/requirements_compiled.txt + # importlib-metadata diff --git 
a/python/deplocks/ray_img/ray_img_py312.lock b/python/deplocks/ray_img/ray_img_py312.lock new file mode 100644 index 000000000000..4cc81338771e --- /dev/null +++ b/python/deplocks/ray_img/ray_img_py312.lock @@ -0,0 +1,2172 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --python-version=3.12 --find-links=.whl/ -c python/requirements_compiled.txt - -o python/deplocks/ray_img/ray_img_py312.lock +--index-url https://pypi.org/simple +--extra-index-url https://download.pytorch.org/whl/cpu +--find-links .whl/ +--find-links https://data.pyg.org/whl/torch-2.3.0+cpu.html + +aiohappyeyeballs==2.6.1 \ + --hash=sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558 \ + --hash=sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8 + # via + # -c python/requirements_compiled.txt + # aiohttp +aiohttp==3.11.16 \ + --hash=sha256:004511d3413737700835e949433536a2fe95a7d0297edd911a1e9705c5b5ea43 \ + --hash=sha256:0902e887b0e1d50424112f200eb9ae3dfed6c0d0a19fc60f633ae5a57c809656 \ + --hash=sha256:09b00dd520d88eac9d1768439a59ab3d145065c91a8fab97f900d1b5f802895e \ + --hash=sha256:0a2f451849e6b39e5c226803dcacfa9c7133e9825dcefd2f4e837a2ec5a3bb98 \ + --hash=sha256:0a950c2eb8ff17361abd8c85987fd6076d9f47d040ebffce67dce4993285e973 \ + --hash=sha256:0ad1fb47da60ae1ddfb316f0ff16d1f3b8e844d1a1e154641928ea0583d486ed \ + --hash=sha256:13ceac2c5cdcc3f64b9015710221ddf81c900c5febc505dbd8f810e770011540 \ + --hash=sha256:14461157d8426bcb40bd94deb0450a6fa16f05129f7da546090cebf8f3123b0f \ + --hash=sha256:16f8a2c9538c14a557b4d309ed4d0a7c60f0253e8ed7b6c9a2859a7582f8b1b8 \ + --hash=sha256:17ae4664031aadfbcb34fd40ffd90976671fa0c0286e6c4113989f78bebab37a \ + 
--hash=sha256:1ce63ae04719513dd2651202352a2beb9f67f55cb8490c40f056cea3c5c355ce \ + --hash=sha256:23a15727fbfccab973343b6d1b7181bfb0b4aa7ae280f36fd2f90f5476805682 \ + --hash=sha256:2540ddc83cc724b13d1838026f6a5ad178510953302a49e6d647f6e1de82bc34 \ + --hash=sha256:37dcee4906454ae377be5937ab2a66a9a88377b11dd7c072df7a7c142b63c37c \ + --hash=sha256:38bea84ee4fe24ebcc8edeb7b54bf20f06fd53ce4d2cc8b74344c5b9620597fd \ + --hash=sha256:3ab3367bb7f61ad18793fea2ef71f2d181c528c87948638366bf1de26e239183 \ + --hash=sha256:3ad1d59fd7114e6a08c4814983bb498f391c699f3c78712770077518cae63ff7 \ + --hash=sha256:3b4e6db8dc4879015b9955778cfb9881897339c8fab7b3676f8433f849425913 \ + --hash=sha256:3e061b09f6fa42997cf627307f220315e313ece74907d35776ec4373ed718b86 \ + --hash=sha256:42864e70a248f5f6a49fdaf417d9bc62d6e4d8ee9695b24c5916cb4bb666c802 \ + --hash=sha256:493910ceb2764f792db4dc6e8e4b375dae1b08f72e18e8f10f18b34ca17d0979 \ + --hash=sha256:4d0c970c0d602b1017e2067ff3b7dac41c98fef4f7472ec2ea26fd8a4e8c2149 \ + --hash=sha256:54eb3aead72a5c19fad07219acd882c1643a1027fbcdefac9b502c267242f955 \ + --hash=sha256:56a3443aca82abda0e07be2e1ecb76a050714faf2be84256dae291182ba59049 \ + --hash=sha256:576f5ca28d1b3276026f7df3ec841ae460e0fc3aac2a47cbf72eabcfc0f102e1 \ + --hash=sha256:58ede86453a6cf2d6ce40ef0ca15481677a66950e73b0a788917916f7e35a0bb \ + --hash=sha256:61c721764e41af907c9d16b6daa05a458f066015abd35923051be8705108ed17 \ + --hash=sha256:634d96869be6c4dc232fc503e03e40c42d32cfaa51712aee181e922e61d74814 \ + --hash=sha256:696ef00e8a1f0cec5e30640e64eca75d8e777933d1438f4facc9c0cdf288a810 \ + --hash=sha256:69a2cbd61788d26f8f1e626e188044834f37f6ae3f937bd9f08b65fc9d7e514e \ + --hash=sha256:6a792ce34b999fbe04a7a71a90c74f10c57ae4c51f65461a411faa70e154154e \ + --hash=sha256:6ac13b71761e49d5f9e4d05d33683bbafef753e876e8e5a7ef26e937dd766713 \ + --hash=sha256:6fdec0213244c39973674ca2a7f5435bf74369e7d4e104d6c7473c81c9bcc8c4 \ + --hash=sha256:72b1b03fb4655c1960403c131740755ec19c5898c82abd3961c364c2afd59fe7 \ + 
--hash=sha256:745f1ed5e2c687baefc3c5e7b4304e91bf3e2f32834d07baaee243e349624b24 \ + --hash=sha256:776c8e959a01e5e8321f1dec77964cb6101020a69d5a94cd3d34db6d555e01f7 \ + --hash=sha256:780df0d837276276226a1ff803f8d0fa5f8996c479aeef52eb040179f3156cbd \ + --hash=sha256:78e6e23b954644737e385befa0deb20233e2dfddf95dd11e9db752bdd2a294d3 \ + --hash=sha256:7951decace76a9271a1ef181b04aa77d3cc309a02a51d73826039003210bdc86 \ + --hash=sha256:7ba92a2d9ace559a0a14b03d87f47e021e4fa7681dc6970ebbc7b447c7d4b7cd \ + --hash=sha256:7f6428fee52d2bcf96a8aa7b62095b190ee341ab0e6b1bcf50c615d7966fd45b \ + --hash=sha256:87944bd16b7fe6160607f6a17808abd25f17f61ae1e26c47a491b970fb66d8cb \ + --hash=sha256:87a6e922b2b2401e0b0cf6b976b97f11ec7f136bfed445e16384fbf6fd5e8602 \ + --hash=sha256:8cb0688a8d81c63d716e867d59a9ccc389e97ac7037ebef904c2b89334407180 \ + --hash=sha256:8df6612df74409080575dca38a5237282865408016e65636a76a2eb9348c2567 \ + --hash=sha256:911a6e91d08bb2c72938bc17f0a2d97864c531536b7832abee6429d5296e5b27 \ + --hash=sha256:92b7ee222e2b903e0a4b329a9943d432b3767f2d5029dbe4ca59fb75223bbe2e \ + --hash=sha256:938f756c2b9374bbcc262a37eea521d8a0e6458162f2a9c26329cc87fdf06534 \ + --hash=sha256:9756d9b9d4547e091f99d554fbba0d2a920aab98caa82a8fb3d3d9bee3c9ae85 \ + --hash=sha256:98b88a2bf26965f2015a771381624dd4b0839034b70d406dc74fd8be4cc053e3 \ + --hash=sha256:9b751a6306f330801665ae69270a8a3993654a85569b3469662efaad6cf5cc50 \ + --hash=sha256:a2a450bcce4931b295fc0848f384834c3f9b00edfc2150baafb4488c27953de6 \ + --hash=sha256:a3814760a1a700f3cfd2f977249f1032301d0a12c92aba74605cfa6ce9f78489 \ + --hash=sha256:a5abcbba9f4b463a45c8ca8b7720891200658f6f46894f79517e6cd11f3405ca \ + --hash=sha256:a6db7458ab89c7d80bc1f4e930cc9df6edee2200127cfa6f6e080cf619eddfbd \ + --hash=sha256:ad497f38a0d6c329cb621774788583ee12321863cd4bd9feee1effd60f2ad133 \ + --hash=sha256:ad9509ffb2396483ceacb1eee9134724443ee45b92141105a4645857244aecc8 \ + --hash=sha256:bbcba75fe879ad6fd2e0d6a8d937f34a571f116a0e4db37df8079e738ea95c71 \ + 
--hash=sha256:c10d85e81d0b9ef87970ecbdbfaeec14a361a7fa947118817fcea8e45335fa46 \ + --hash=sha256:c15b2271c44da77ee9d822552201180779e5e942f3a71fb74e026bf6172ff287 \ + --hash=sha256:ca37057625693d097543bd88076ceebeb248291df9d6ca8481349efc0b05dcd0 \ + --hash=sha256:cc3a145479a76ad0ed646434d09216d33d08eef0d8c9a11f5ae5cdc37caa3540 \ + --hash=sha256:ccf10f16ab498d20e28bc2b5c1306e9c1512f2840f7b6a67000a517a4b37d5ee \ + --hash=sha256:cd464ba806e27ee24a91362ba3621bfc39dbbb8b79f2e1340201615197370f7c \ + --hash=sha256:d007aa39a52d62373bd23428ba4a2546eed0e7643d7bf2e41ddcefd54519842c \ + --hash=sha256:d0666afbe984f6933fe72cd1f1c3560d8c55880a0bdd728ad774006eb4241ecd \ + --hash=sha256:d07502cc14ecd64f52b2a74ebbc106893d9a9717120057ea9ea1fd6568a747e7 \ + --hash=sha256:d489d9778522fbd0f8d6a5c6e48e3514f11be81cb0a5954bdda06f7e1594b321 \ + --hash=sha256:df7db76400bf46ec6a0a73192b14c8295bdb9812053f4fe53f4e789f3ea66bbb \ + --hash=sha256:e3538bc9fe1b902bef51372462e3d7c96fce2b566642512138a480b7adc9d508 \ + --hash=sha256:e87fd812899aa78252866ae03a048e77bd11b80fb4878ce27c23cade239b42b2 \ + --hash=sha256:ecdb8173e6c7aa09eee342ac62e193e6904923bd232e76b4157ac0bfa670609f \ + --hash=sha256:f244b8e541f414664889e2c87cac11a07b918cb4b540c36f7ada7bfa76571ea2 \ + --hash=sha256:f4065145bf69de124accdd17ea5f4dc770da0a6a6e440c53f6e0a8c27b3e635c \ + --hash=sha256:f420bfe862fb357a6d76f2065447ef6f484bc489292ac91e29bc65d2d7a2c84d \ + --hash=sha256:f6ddd90d9fb4b501c97a4458f1c1720e42432c26cb76d28177c5b5ad4e332601 \ + --hash=sha256:fa73e8c2656a3653ae6c307b3f4e878a21f87859a9afab228280ddccd7369d71 \ + --hash=sha256:fadbb8f1d4140825069db3fedbbb843290fd5f5bc0a5dbd7eaf81d91bf1b003b \ + --hash=sha256:fb3d0cc5cdb926090748ea60172fa8a213cec728bd6c54eae18b96040fcd6227 \ + --hash=sha256:fb46bb0f24813e6cede6cc07b1961d4b04f331f7112a23b5e21f567da4ee50aa \ + --hash=sha256:fd36c119c5d6551bce374fcb5c19269638f8d09862445f85a5a48596fd59f4bb + # via + # -c python/requirements_compiled.txt + # aiohttp-cors + # ray +aiohttp-cors==0.7.0 
\ + --hash=sha256:0451ba59fdf6909d0e2cd21e4c0a43752bc0703d33fc78ae94d9d9321710193e \ + --hash=sha256:4d39c6d7100fd9764ed1caf8cebf0eb01bf5e3f24e2e073fda6234bc48b19f5d + # via + # -c python/requirements_compiled.txt + # ray +aiosignal==1.3.1 \ + --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ + --hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 + # via + # -c python/requirements_compiled.txt + # aiohttp +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c python/requirements_compiled.txt + # kombu +annotated-types==0.6.0 \ + --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ + --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d + # via + # -c python/requirements_compiled.txt + # pydantic +anyio==3.7.1 \ + --hash=sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780 \ + --hash=sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5 + # via + # -c python/requirements_compiled.txt + # starlette + # watchfiles +attrs==25.1.0 \ + --hash=sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e \ + --hash=sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a + # via + # -c python/requirements_compiled.txt + # aiohttp + # jsonschema + # referencing +billiard==4.2.1 \ + --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c python/requirements_compiled.txt + # celery +cachetools==5.5.2 \ + --hash=sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4 \ + --hash=sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a + # via + # -c python/requirements_compiled.txt + # google-auth +celery==5.5.3 \ + 
--hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c python/requirements_compiled.txt + # ray +certifi==2025.1.31 \ + --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ + --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe + # via + # -c python/requirements_compiled.txt + # requests +cffi==1.16.0 ; platform_python_implementation != 'PyPy' \ + --hash=sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc \ + --hash=sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a \ + --hash=sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417 \ + --hash=sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab \ + --hash=sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520 \ + --hash=sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36 \ + --hash=sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743 \ + --hash=sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8 \ + --hash=sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed \ + --hash=sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684 \ + --hash=sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56 \ + --hash=sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324 \ + --hash=sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d \ + --hash=sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235 \ + --hash=sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e \ + --hash=sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088 \ + --hash=sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000 \ + 
--hash=sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7 \ + --hash=sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e \ + --hash=sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673 \ + --hash=sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c \ + --hash=sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe \ + --hash=sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2 \ + --hash=sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098 \ + --hash=sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8 \ + --hash=sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a \ + --hash=sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0 \ + --hash=sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b \ + --hash=sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896 \ + --hash=sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e \ + --hash=sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9 \ + --hash=sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2 \ + --hash=sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b \ + --hash=sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6 \ + --hash=sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404 \ + --hash=sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f \ + --hash=sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0 \ + --hash=sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4 \ + --hash=sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc \ + --hash=sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936 \ + --hash=sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba \ + 
--hash=sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872 \ + --hash=sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb \ + --hash=sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614 \ + --hash=sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1 \ + --hash=sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d \ + --hash=sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969 \ + --hash=sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b \ + --hash=sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4 \ + --hash=sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627 \ + --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ + --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 + # via + # -c python/requirements_compiled.txt + # cryptography +charset-normalizer==3.3.2 \ + --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ + --hash=sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087 \ + --hash=sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786 \ + --hash=sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8 \ + --hash=sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09 \ + --hash=sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185 \ + --hash=sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574 \ + --hash=sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e \ + --hash=sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519 \ + --hash=sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898 \ + --hash=sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269 \ + --hash=sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3 \ 
+ --hash=sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f \ + --hash=sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6 \ + --hash=sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8 \ + --hash=sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a \ + --hash=sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73 \ + --hash=sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc \ + --hash=sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714 \ + --hash=sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2 \ + --hash=sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc \ + --hash=sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce \ + --hash=sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d \ + --hash=sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e \ + --hash=sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6 \ + --hash=sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269 \ + --hash=sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96 \ + --hash=sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d \ + --hash=sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a \ + --hash=sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4 \ + --hash=sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77 \ + --hash=sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d \ + --hash=sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0 \ + --hash=sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed \ + --hash=sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068 \ + --hash=sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac \ + 
--hash=sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25 \ + --hash=sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8 \ + --hash=sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab \ + --hash=sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26 \ + --hash=sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2 \ + --hash=sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db \ + --hash=sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f \ + --hash=sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5 \ + --hash=sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99 \ + --hash=sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c \ + --hash=sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d \ + --hash=sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811 \ + --hash=sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa \ + --hash=sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a \ + --hash=sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03 \ + --hash=sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b \ + --hash=sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04 \ + --hash=sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c \ + --hash=sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001 \ + --hash=sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458 \ + --hash=sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389 \ + --hash=sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99 \ + --hash=sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985 \ + --hash=sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537 \ + 
--hash=sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238 \ + --hash=sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f \ + --hash=sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d \ + --hash=sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796 \ + --hash=sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a \ + --hash=sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143 \ + --hash=sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8 \ + --hash=sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c \ + --hash=sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5 \ + --hash=sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5 \ + --hash=sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711 \ + --hash=sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4 \ + --hash=sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6 \ + --hash=sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c \ + --hash=sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7 \ + --hash=sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4 \ + --hash=sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b \ + --hash=sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae \ + --hash=sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12 \ + --hash=sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c \ + --hash=sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae \ + --hash=sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8 \ + --hash=sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887 \ + --hash=sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b \ + 
--hash=sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4 \ + --hash=sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f \ + --hash=sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5 \ + --hash=sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33 \ + --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ + --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 + # via + # -c python/requirements_compiled.txt + # requests +click==8.1.7 \ + --hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ + --hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de + # via + # -c python/requirements_compiled.txt + # celery + # click-didyoumean + # click-plugins + # click-repl + # ray + # uvicorn +click-didyoumean==0.3.1 \ + --hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c python/requirements_compiled.txt + # celery +click-plugins==1.1.1.2 \ + --hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c python/requirements_compiled.txt + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via + # -c python/requirements_compiled.txt + # celery +cloudpickle==3.1.1 \ + --hash=sha256:b216fa8ae4019d5482a8ac3c95d8f6346115d8835911fd4aefd1a445e4242c64 \ + --hash=sha256:c8c5a44295039331ee9dad40ba100a9c7297b6f988e50e87ccdf3765a668350e + # via gymnasium +colorful==0.5.5 \ + --hash=sha256:62c187e27c1433db9463ff93b1451898d1e7e23a7e553583fd9daeb6325182e4 \ + --hash=sha256:66f8c1264b2a26f7293b96a03bb7a76c4bc8b9634369a0bffdcd12d618056a1d + 
# via + # -c python/requirements_compiled.txt + # ray +cryptography==44.0.3 \ + --hash=sha256:02f55fb4f8b79c1221b0961488eaae21015b69b210e18c386b69de182ebb1259 \ + --hash=sha256:157f1f3b8d941c2bd8f3ffee0af9b049c9665c39d3da9db2dc338feca5e98a43 \ + --hash=sha256:192ed30fac1728f7587c6f4613c29c584abdc565d7417c13904708db10206645 \ + --hash=sha256:21a83f6f35b9cc656d71b5de8d519f566df01e660ac2578805ab245ffd8523f8 \ + --hash=sha256:25cd194c39fa5a0aa4169125ee27d1172097857b27109a45fadc59653ec06f44 \ + --hash=sha256:3883076d5c4cc56dbef0b898a74eb6992fdac29a7b9013870b34efe4ddb39a0d \ + --hash=sha256:3bb0847e6363c037df8f6ede57d88eaf3410ca2267fb12275370a76f85786a6f \ + --hash=sha256:3be3f649d91cb182c3a6bd336de8b61a0a71965bd13d1a04a0e15b39c3d5809d \ + --hash=sha256:3f07943aa4d7dad689e3bb1638ddc4944cc5e0921e3c227486daae0e31a05e54 \ + --hash=sha256:479d92908277bed6e1a1c69b277734a7771c2b78633c224445b5c60a9f4bc1d9 \ + --hash=sha256:4ffc61e8f3bf5b60346d89cd3d37231019c17a081208dfbbd6e1605ba03fa137 \ + --hash=sha256:5639c2b16764c6f76eedf722dbad9a0914960d3489c0cc38694ddf9464f1bb2f \ + --hash=sha256:58968d331425a6f9eedcee087f77fd3c927c88f55368f43ff7e0a19891f2642c \ + --hash=sha256:5d186f32e52e66994dce4f766884bcb9c68b8da62d61d9d215bfe5fb56d21334 \ + --hash=sha256:5d20cc348cca3a8aa7312f42ab953a56e15323800ca3ab0706b8cd452a3a056c \ + --hash=sha256:6866df152b581f9429020320e5eb9794c8780e90f7ccb021940d7f50ee00ae0b \ + --hash=sha256:7d5fe7195c27c32a64955740b949070f21cba664604291c298518d2e255931d2 \ + --hash=sha256:896530bc9107b226f265effa7ef3f21270f18a2026bc09fed1ebd7b66ddf6375 \ + --hash=sha256:962bc30480a08d133e631e8dfd4783ab71cc9e33d5d7c1e192f0b7c06397bb88 \ + --hash=sha256:978631ec51a6bbc0b7e58f23b68a8ce9e5f09721940933e9c217068388789fe5 \ + --hash=sha256:9b4d4a5dbee05a2c390bf212e78b99434efec37b17a4bff42f50285c5c8c9647 \ + --hash=sha256:ab0b005721cc0039e885ac3503825661bd9810b15d4f374e473f8c89b7d5460c \ + --hash=sha256:af653022a0c25ef2e3ffb2c673a50e5a0d02fecc41608f4954176f1933b12359 \ + 
--hash=sha256:b0cc66c74c797e1db750aaa842ad5b8b78e14805a9b5d1348dc603612d3e3ff5 \ + --hash=sha256:b424563394c369a804ecbee9b06dfb34997f19d00b3518e39f83a5642618397d \ + --hash=sha256:c138abae3a12a94c75c10499f1cbae81294a6f983b3af066390adee73f433028 \ + --hash=sha256:c6cd67722619e4d55fdb42ead64ed8843d64638e9c07f4011163e46bc512cf01 \ + --hash=sha256:c91fc8e8fd78af553f98bc7f2a1d8db977334e4eea302a4bfd75b9461c2d8904 \ + --hash=sha256:cad399780053fb383dc067475135e41c9fe7d901a97dd5d9c5dfb5611afc0d7d \ + --hash=sha256:cb90f60e03d563ca2445099edf605c16ed1d5b15182d21831f58460c48bffb93 \ + --hash=sha256:dad80b45c22e05b259e33ddd458e9e2ba099c86ccf4e88db7bbab4b747b18d06 \ + --hash=sha256:dd3db61b8fe5be220eee484a17233287d0be6932d056cf5738225b9c05ef4fff \ + --hash=sha256:e28d62e59a4dbd1d22e747f57d4f00c459af22181f0b2f787ea83f5a876d7c76 \ + --hash=sha256:e909df4053064a97f1e6565153ff8bb389af12c5c8d29c343308760890560aff \ + --hash=sha256:f3ffef566ac88f75967d7abd852ed5f182da252d23fac11b4766da3957766759 \ + --hash=sha256:fc3c9babc1e1faefd62704bb46a69f359a9819eb0292e40df3fb6e3574715cd4 \ + --hash=sha256:fe19d8bc5536a91a24a8133328880a41831b6c5df54599a8417b62fe015d3053 + # via + # -c python/requirements_compiled.txt + # pyopenssl +cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ + --hash=sha256:230f8a8e99c81a653baa0ed00819990c0ed1f0cf0298214786b5e323461dc61a \ + --hash=sha256:2d16eaa2d086e416ac13467d4ff3184b9a081fe76b761ce51d4a46ec1c4bd28a \ + --hash=sha256:432273fd4b61a284f7d705d08b8291403548fd422bcbd945635cc155bc6a923d \ + --hash=sha256:4c51a1062a3c5a826b0425952d229ffe73b1791656a31de95b318117e67a9576 \ + --hash=sha256:4c8e9fdb1f3ffc3151808f8bb8c871518d2783e1be8b53792b698a840543d60c \ + --hash=sha256:51b1d6cb83d82dfa306c9efaeb4d57f24bad3041ebd8716d61072676abbcf67b \ + --hash=sha256:52185a2cf95d3bac2c3fda95c9c8e06a985b5a00cd2e587d3caace337db33899 \ + --hash=sha256:5afb6658faa22f21479ae2c0a07254df31c0aebc36907a64a1f6be4ecc9e96da \ + 
--hash=sha256:d3dc91ef9c4104652195eea4b282d343ecad653021efe20d1c8dd8dfe8ccfd86 \ + --hash=sha256:d60d1e124592cb82a5f3f45b3e7bee7bda7b72a743029f275e9d6b125f338c60 \ + --hash=sha256:dac0284fecb90b5731f514e569a6fcf6674a730ae95b9490781a713b60a34423 \ + --hash=sha256:e7a25ef1b44ae6276b5105affc2289edb34f1aa6676babd5bcd80907348c4cfa + # via + # -c python/requirements_compiled.txt + # ray +distlib==0.3.7 \ + --hash=sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057 \ + --hash=sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8 + # via + # -c python/requirements_compiled.txt + # virtualenv +dm-tree==0.1.8 \ + --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ + --hash=sha256:09964470f76a5201aff2e8f9b26842976de7889300676f927930f6285e256760 \ + --hash=sha256:0d3172394079a86c3a759179c65f64c48d1a42b89495fcf38976d11cc3bb952c \ + --hash=sha256:0e9620ccf06393eb6b613b5e366469304622d4ea96ae6540b28a33840e6c89cf \ + --hash=sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430 \ + --hash=sha256:1607ce49aa42f010d1e5e616d92ce899d66835d4d8bea49679582435285515de \ + --hash=sha256:181c35521d480d0365f39300542cb6cd7fd2b77351bb43d7acfda15aef63b317 \ + --hash=sha256:1d7c26e431fc93cc7e0cba867eb000db6a05f6f2b25af11ac4e9dada88fc5bca \ + --hash=sha256:1fe962015b2fe1282892b28ebe962faed53c7f98d942da9a4625cbf27baef913 \ + --hash=sha256:250b692fb75f45f02e2f58fbef9ab338904ef334b90557565621fa251df267cf \ + --hash=sha256:2869228d9c619074de501a3c10dc7f07c75422f8fab36ecdcb859b6f1b1ec3ef \ + --hash=sha256:28c52cbf4f8b3dbd0beaedf44f69fa85eec5e9dede612e08035e06ada6ec9426 \ + --hash=sha256:2f7915660f59c09068e428613c480150180df1060561fd0d1470684ae7007bd1 \ + --hash=sha256:343a4a4ebaa127451ff971254a4be4084eb4bdc0b2513c32b46f6f728fd03f9e \ + --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ + --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ + 
--hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ + --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ + --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ + --hash=sha256:75c5d528bb992981c20793b6b453e91560784215dffb8a5440ba999753c14ceb \ + --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ + --hash=sha256:81fce77f22a302d7a5968aebdf4efafef4def7ce96528719a354e6990dcd49c7 \ + --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ + --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ + --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ + --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ + --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ + --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ + --hash=sha256:b095ba4f8ca1ba19350fd53cf1f8f3eb0bd406aa28af64a6dfc86707b32a810a \ + --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ + --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ + --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ + --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ + --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ + --hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ + --hash=sha256:d20f2faa3672b52e5013f4077117bfb99c4cfc0b445d3bde1584c34032b57436 \ + 
--hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ + --hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ + --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ + --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ + --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d + # via + # -c python/requirements_compiled.txt + # ray +farama-notifications==0.0.4 \ + --hash=sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18 \ + --hash=sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae + # via + # -c python/requirements_compiled.txt + # gymnasium +fastapi==0.115.12 \ + --hash=sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681 \ + --hash=sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d + # via + # -c python/requirements_compiled.txt + # ray +fastrlock==0.8.2 ; sys_platform != 'darwin' \ + --hash=sha256:067edb0a0805bf61e17a251d5046af59f6e9d2b8ad01222e0ef7a0b7937d5548 \ + --hash=sha256:07ed3c7b3867c05a3d6be4ced200c7767000f3431b9be6da66972822dd86e8be \ + --hash=sha256:08315bde19d0c2e6b06593d5a418be3dc8f9b1ee721afa96867b9853fceb45cf \ + --hash=sha256:11bbbbc526363955aeddb9eec4cee2a0012322b7b2f15b54f44454fcf4fd398a \ + --hash=sha256:17734e2e5af4c07ddb0fb10bd484e062c22de3be6b67940b9cc6ec2f18fa61ba \ + --hash=sha256:1b15430b93d7eb3d56f6ff690d2ebecb79ed0e58248427717eba150a508d1cd7 \ + --hash=sha256:1fed2f4797ad68e9982038423018cf08bec5f4ce9fed63a94a790773ed6a795c \ + --hash=sha256:2074548a335fcf7d19ebb18d9208da9e33b06f745754466a7e001d2b1c58dd19 \ + --hash=sha256:2587cedbb36c7988e707d83f0f1175c1f882f362b5ebbee25d70218ea33d220d \ + --hash=sha256:25945f962c7bd808415cfde3da624d4399d4ea71ed8918538375f16bceb79e1c \ + 
--hash=sha256:27786c62a400e282756ae1b090bcd7cfa35f28270cff65a9e7b27a5327a32561 \ + --hash=sha256:2c1719ddc8218b01e82fb2e82e8451bd65076cb96d7bef4477194bbb4305a968 \ + --hash=sha256:2d5595903444c854b99c42122b87edfe8a37cd698a4eae32f4fd1d2a7b6c115d \ + --hash=sha256:30bdbe4662992348132d03996700e1cf910d141d629179b967b146a22942264e \ + --hash=sha256:31a27a2edf482df72b91fe6c6438314d2c65290aa7becc55589d156c9b91f0da \ + --hash=sha256:320fd55bafee3eb069cfb5d6491f811a912758387ef2193840e2663e80e16f48 \ + --hash=sha256:33145acbad8317584cd64588131c7e1e286beef6280c0009b4544c91fce171d2 \ + --hash=sha256:43a241655e83e4603a152192cf022d5ca348c2f4e56dfb02e5c9c4c1a32f9cdb \ + --hash=sha256:4d63b6596368dab9e0cc66bf047e7182a56f33b34db141816a4f21f5bf958228 \ + --hash=sha256:4fb04442b6d1e2b36c774919c6bcbe3339c61b337261d4bd57e27932589095af \ + --hash=sha256:4fb2e77ff04bc4beb71d63c8e064f052ce5a6ea1e001d528d4d7f4b37d736f2e \ + --hash=sha256:5460c5ee6ced6d61ec8cd2324ebbe793a4960c4ffa2131ffff480e3b61c99ec5 \ + --hash=sha256:59344c1d46b7dec97d3f22f1cc930fafe8980b3c5bc9c9765c56738a5f1559e4 \ + --hash=sha256:5dfb78dd600a12f23fc0c3ec58f81336229fdc74501ecf378d1ce5b3f2f313ea \ + --hash=sha256:643e1e65b4f5b284427e61a894d876d10459820e93aa1e724dfb415117be24e0 \ + --hash=sha256:644ec9215cf9c4df8028d8511379a15d9c1af3e16d80e47f1b6fdc6ba118356a \ + --hash=sha256:66f2662c640bb71a1016a031eea6eef9d25c2bcdf7ffd1d1ddc5a58f9a1ced04 \ + --hash=sha256:685e656048b59d8dfde8c601f188ad53a4d719eb97080cafc8696cda6d75865e \ + --hash=sha256:7269bb3fc15587b0c191eecd95831d771a7d80f0c48929e560806b038ff3066c \ + --hash=sha256:73426f5eb2ecc10626c67cf86bd0af9e00d53e80e5c67d5ce8e18376d6abfa09 \ + --hash=sha256:75c07726c8b1a52147fd7987d6baaa318c5dced1416c3f25593e40f56e10755b \ + --hash=sha256:790fc19bccbd39426060047e53629f171a44745613bf360a045e9f9c8c4a2cea \ + --hash=sha256:7a2ccaf88ac0db153e84305d1ef0aa138cea82c6a88309066f6eaa3bc98636cd \ + --hash=sha256:87f4e01b042c84e6090dbc4fbe3415ddd69f6bc0130382323f9d3f1b8dd71b46 \ + 
--hash=sha256:88f079335e9da631efa64486c8207564a7bcd0c00526bb9e842e9d5b7e50a6cc \ + --hash=sha256:8c1c91a68926421f5ccbc82c85f83bd3ba593b121a46a1b9a554b3f0dd67a4bf \ + --hash=sha256:9121a894d74e65557e47e777060a495ab85f4b903e80dd73a3c940ba042920d7 \ + --hash=sha256:94e348c72a1fd1f8191f25ea056448e4f5a87b8fbf005b39d290dcb0581a48cd \ + --hash=sha256:98195866d3a9949915935d40a88e4f1c166e82e378f622c88025f2938624a90a \ + --hash=sha256:99dd6652bd6f730beadf74ef769d38c6bbd8ee6d1c15c8d138ea680b0594387f \ + --hash=sha256:9af691a9861027181d4de07ed74f0aee12a9650ac60d0a07f4320bff84b5d95f \ + --hash=sha256:a3b8b5d2935403f1b4b25ae324560e94b59593a38c0d2e7b6c9872126a9622ed \ + --hash=sha256:a3dcc876050b8f5cbc0ee84ef1e7f0c1dfe7c148f10098828bc4403683c33f10 \ + --hash=sha256:a74f5a92fa6e51c4f3c69b29c4662088b97be12f40652a21109605a175c81824 \ + --hash=sha256:ab91b0c36e95d42e1041a4907e3eefd06c482d53af3c7a77be7e214cc7cd4a63 \ + --hash=sha256:ad1bc61c7f6b0e58106aaab034916b6cb041757f708b07fbcdd9d6e1ac629225 \ + --hash=sha256:adcb9e77aa132cc6c9de2ffe7cf880a20aa8cdba21d367d1da1a412f57bddd5d \ + --hash=sha256:b22ea9bf5f9fad2b0077e944a7813f91593a4f61adf8faf734a70aed3f2b3a40 \ + --hash=sha256:b2a1c354f13f22b737621d914f3b4a8434ae69d3027a775e94b3e671756112f9 \ + --hash=sha256:b32fdf874868326351a75b1e4c02f97e802147119ae44c52d3d9da193ec34f5b \ + --hash=sha256:b3853ed4ce522598dc886160a7bab432a093051af85891fa2f5577c1dcac8ed6 \ + --hash=sha256:b443e73a4dfc7b6e0800ea4c13567b9694358e86f53bb2612a51c9e727cac67b \ + --hash=sha256:b4c9083ea89ab236b06e9ef2263971db3b4b507195fc7d5eecab95828dcae325 \ + --hash=sha256:b8ca0fe21458457077e4cb2d81e1ebdb146a00b3e9e2db6180a773f7ea905032 \ + --hash=sha256:c393af77c659a38bffbca215c0bcc8629ba4299568308dd7e4ff65d62cabed39 \ + --hash=sha256:c6bffa978793bea5e1b00e677062e53a62255439339591b70e209fa1552d5ee0 \ + --hash=sha256:ccf39ad5702e33e4d335b48ef9d56e21619b529b7f7471b5211419f380329b62 \ + --hash=sha256:cf81e0278b645004388873e0a1f9e3bc4c9ab8c18e377b14ed1a544be4b18c9a \ + 
--hash=sha256:d34546ad2e4a480b94b6797bcc5a322b3c705c4c74c3e4e545c4a3841c1b2d59 \ + --hash=sha256:d47713ffe6d4a627fbf078be9836a95ac106b4a0543e3841572c91e292a5d885 \ + --hash=sha256:d918dfe473291e8bfd8e13223ea5cb9b317bd9f50c280923776c377f7c64b428 \ + --hash=sha256:dbdce852e6bb66e1b8c36679d482971d69d93acf1785657522e51b7de30c3356 \ + --hash=sha256:dcc1bf0ac8a194313cf6e645e300a8a379674ceed8e0b1e910a2de3e3c28989e \ + --hash=sha256:dd961a32a7182c3891cdebca417fda67496d5d5de6ae636962254d22723bdf52 \ + --hash=sha256:ddf5d247f686aec853ddcc9a1234bfcc6f57b0a0670d2ad82fc25d8ae7e6a15f \ + --hash=sha256:e27c3cd27fbd25e5223c5c992b300cd4ee8f0a75c6f222ce65838138d853712c \ + --hash=sha256:e380ec4e6d8b26e389713995a43cb7fe56baea2d25fe073d4998c4821a026211 \ + --hash=sha256:e4bbde174a0aff5f6eeba75cf8c4c5d2a316316bc21f03a0bddca0fc3659a6f3 \ + --hash=sha256:e8b49b5743ede51e0bcf6805741f39f5e0e0fd6a172ba460cb39e3097ba803bb \ + --hash=sha256:e9904b5b37c3e5bb4a245c56bc4b7e497da57ffb8528f4fc39af9dcb168ee2e1 \ + --hash=sha256:ea96503b918fceaf40443182742b8964d47b65c5ebdea532893cb9479620000c \ + --hash=sha256:eb31fe390f03f7ae886dcc374f1099ec88526631a4cb891d399b68181f154ff0 \ + --hash=sha256:ebb32d776b61acd49f859a1d16b9e3d84e7b46d0d92aebd58acd54dc38e96664 \ + --hash=sha256:fb5363cf0fddd9b50525ddbf64a1e1b28ec4c6dfb28670a940cb1cf988a6786b \ + --hash=sha256:ff75c90663d6e8996610d435e71487daa853871ad1770dd83dc0f2fc4997241e + # via + # -c python/requirements_compiled.txt + # cupy-cuda12x +filelock==3.17.0 \ + --hash=sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338 \ + --hash=sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e + # via + # -c python/requirements_compiled.txt + # ray + # virtualenv +frozenlist==1.4.1 \ + --hash=sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7 \ + --hash=sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98 \ + --hash=sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad \ + 
--hash=sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5 \ + --hash=sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae \ + --hash=sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e \ + --hash=sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a \ + --hash=sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701 \ + --hash=sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d \ + --hash=sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6 \ + --hash=sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6 \ + --hash=sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106 \ + --hash=sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75 \ + --hash=sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868 \ + --hash=sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a \ + --hash=sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0 \ + --hash=sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1 \ + --hash=sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826 \ + --hash=sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec \ + --hash=sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6 \ + --hash=sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950 \ + --hash=sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19 \ + --hash=sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0 \ + --hash=sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8 \ + --hash=sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a \ + --hash=sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09 \ + --hash=sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86 \ + 
--hash=sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c \ + --hash=sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5 \ + --hash=sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b \ + --hash=sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b \ + --hash=sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d \ + --hash=sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0 \ + --hash=sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea \ + --hash=sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776 \ + --hash=sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a \ + --hash=sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897 \ + --hash=sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7 \ + --hash=sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09 \ + --hash=sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9 \ + --hash=sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe \ + --hash=sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd \ + --hash=sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742 \ + --hash=sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09 \ + --hash=sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0 \ + --hash=sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932 \ + --hash=sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1 \ + --hash=sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a \ + --hash=sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49 \ + --hash=sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d \ + --hash=sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7 \ + 
--hash=sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480 \ + --hash=sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89 \ + --hash=sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e \ + --hash=sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b \ + --hash=sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82 \ + --hash=sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb \ + --hash=sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068 \ + --hash=sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8 \ + --hash=sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b \ + --hash=sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb \ + --hash=sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2 \ + --hash=sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11 \ + --hash=sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b \ + --hash=sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc \ + --hash=sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0 \ + --hash=sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497 \ + --hash=sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17 \ + --hash=sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0 \ + --hash=sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2 \ + --hash=sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439 \ + --hash=sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5 \ + --hash=sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac \ + --hash=sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825 \ + --hash=sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887 \ + 
--hash=sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced \ + --hash=sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74 + # via + # -c python/requirements_compiled.txt + # aiohttp + # aiosignal +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 + # via + # -c python/requirements_compiled.txt + # ray +google-api-core==2.24.2 \ + --hash=sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9 \ + --hash=sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696 + # via + # -c python/requirements_compiled.txt + # opencensus +google-auth==2.23.4 \ + --hash=sha256:79905d6b1652187def79d491d6e23d0cbb3a21d3c7ba0dbaa9c8a01906b13ff3 \ + --hash=sha256:d4bbc92fe4b8bfd2f3e8d88e5ba7085935da208ee38a134fc280e7ce682a05f2 + # via + # -c python/requirements_compiled.txt + # google-api-core +googleapis-common-protos==1.61.0 \ + --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ + --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b + # via + # -c python/requirements_compiled.txt + # google-api-core +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + --hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + 
--hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + --hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + 
--hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e + # via + # -c python/requirements_compiled.txt + # ray +gymnasium==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a + # via + # -c python/requirements_compiled.txt + # ray +h11==0.16.0 \ + --hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \ + 
--hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86 + # via + # -c python/requirements_compiled.txt + # uvicorn +httptools==0.6.4 \ + --hash=sha256:0614154d5454c21b6410fdf5262b4a3ddb0f53f1e1721cfd59d55f32138c578a \ + --hash=sha256:0e563e54979e97b6d13f1bbc05a96109923e76b901f786a5eae36e99c01237bd \ + --hash=sha256:16e603a3bff50db08cd578d54f07032ca1631450ceb972c2f834c2b860c28ea2 \ + --hash=sha256:288cd628406cc53f9a541cfaf06041b4c71d751856bab45e3702191f931ccd17 \ + --hash=sha256:28908df1b9bb8187393d5b5db91435ccc9c8e891657f9cbb42a2541b44c82fc8 \ + --hash=sha256:322d20ea9cdd1fa98bd6a74b77e2ec5b818abdc3d36695ab402a0de8ef2865a3 \ + --hash=sha256:342dd6946aa6bda4b8f18c734576106b8a31f2fe31492881a9a160ec84ff4bd5 \ + --hash=sha256:345c288418f0944a6fe67be8e6afa9262b18c7626c3ef3c28adc5eabc06a68da \ + --hash=sha256:3c73ce323711a6ffb0d247dcd5a550b8babf0f757e86a52558fe5b86d6fefcc0 \ + --hash=sha256:40a5ec98d3f49904b9fe36827dcf1aadfef3b89e2bd05b0e35e94f97c2b14721 \ + --hash=sha256:40b0f7fe4fd38e6a507bdb751db0379df1e99120c65fbdc8ee6c1d044897a636 \ + --hash=sha256:40dc6a8e399e15ea525305a2ddba998b0af5caa2566bcd79dcbe8948181eeaff \ + --hash=sha256:4b36913ba52008249223042dca46e69967985fb4051951f94357ea681e1f5dc0 \ + --hash=sha256:4d87b29bd4486c0093fc64dea80231f7c7f7eb4dc70ae394d70a495ab8436071 \ + --hash=sha256:4e93eee4add6493b59a5c514da98c939b244fce4a0d8879cd3f466562f4b7d5c \ + --hash=sha256:59e724f8b332319e2875efd360e61ac07f33b492889284a3e05e6d13746876f4 \ + --hash=sha256:69422b7f458c5af875922cdb5bd586cc1f1033295aa9ff63ee196a87519ac8e1 \ + --hash=sha256:703c346571fa50d2e9856a37d7cd9435a25e7fd15e236c397bf224afaa355fe9 \ + --hash=sha256:85071a1e8c2d051b507161f6c3e26155b5c790e4e28d7f236422dbacc2a9cc44 \ + --hash=sha256:856f4bc0478ae143bad54a4242fccb1f3f86a6e1be5548fecfd4102061b3a083 \ + --hash=sha256:85797e37e8eeaa5439d33e556662cc370e474445d5fab24dcadc65a8ffb04003 \ + --hash=sha256:90d96a385fa941283ebd231464045187a31ad932ebfa541be8edf5b3c2328959 \ + 
--hash=sha256:94978a49b8f4569ad607cd4946b759d90b285e39c0d4640c6b36ca7a3ddf2efc \ + --hash=sha256:aafe0f1918ed07b67c1e838f950b1c1fabc683030477e60b335649b8020e1076 \ + --hash=sha256:ab9ba8dcf59de5181f6be44a77458e45a578fc99c31510b8c65b7d5acc3cf490 \ + --hash=sha256:ade273d7e767d5fae13fa637f4d53b6e961fb7fd93c7797562663f0171c26660 \ + --hash=sha256:b799de31416ecc589ad79dd85a0b2657a8fe39327944998dea368c1d4c9e55e6 \ + --hash=sha256:c26f313951f6e26147833fc923f78f95604bbec812a43e5ee37f26dc9e5a686c \ + --hash=sha256:ca80b7485c76f768a3bc83ea58373f8db7b015551117375e4918e2aa77ea9b50 \ + --hash=sha256:d1ffd262a73d7c28424252381a5b854c19d9de5f56f075445d33919a637e3547 \ + --hash=sha256:d3f0d369e7ffbe59c4b6116a44d6a8eb4783aae027f2c0b366cf0aa964185dba \ + --hash=sha256:d54efd20338ac52ba31e7da78e4a72570cf729fac82bc31ff9199bedf1dc7440 \ + --hash=sha256:dacdd3d10ea1b4ca9df97a0a303cbacafc04b5cd375fa98732678151643d4988 \ + --hash=sha256:db353d22843cf1028f43c3651581e4bb49374d85692a85f95f7b9a130e1b2cab \ + --hash=sha256:db78cb9ca56b59b016e64b6031eda5653be0589dba2b1b43453f6e8b405a0970 \ + --hash=sha256:deee0e3343f98ee8047e9f4c5bc7cedbf69f5734454a94c38ee829fb2d5fa3c1 \ + --hash=sha256:df017d6c780287d5c80601dafa31f17bddb170232d85c066604d8558683711a2 \ + --hash=sha256:df959752a0c2748a65ab5387d08287abf6779ae9165916fe053e68ae1fbdc47f \ + --hash=sha256:ec4f178901fa1834d4a060320d2f3abc5c9e39766953d038f1458cb885f47e81 \ + --hash=sha256:f47f8ed67cc0ff862b84a1189831d1d33c963fb3ce1ee0c65d3b0cbe7b711069 \ + --hash=sha256:f8787367fbdfccae38e35abf7641dafc5310310a5987b689f4c32cc8cc3ee975 \ + --hash=sha256:f9eb89ecf8b290f2e293325c646a211ff1c2493222798bb80a530c5e7502494f \ + --hash=sha256:fc411e1c0a7dcd2f902c7c48cf079947a7e65b5485dea9decb82b9105ca71a43 + # via uvicorn +idna==3.7 \ + --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ + --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 + # via + # -c python/requirements_compiled.txt + # anyio + # 
requests + # yarl +importlib-metadata==6.11.0 \ + --hash=sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443 \ + --hash=sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b + # via + # -c python/requirements_compiled.txt + # opentelemetry-api +jinja2==3.1.6 ; sys_platform != 'win32' \ + --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \ + --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 + # via + # -c python/requirements_compiled.txt + # memray +jsonschema==4.23.0 \ + --hash=sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4 \ + --hash=sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566 + # via + # -c python/requirements_compiled.txt + # ray +jsonschema-specifications==2024.10.1 \ + --hash=sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272 \ + --hash=sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf + # via + # -c python/requirements_compiled.txt + # jsonschema +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c python/requirements_compiled.txt + # celery +lz4==4.3.3 \ + --hash=sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e \ + --hash=sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2 \ + --hash=sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0 \ + --hash=sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563 \ + --hash=sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f \ + --hash=sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa \ + --hash=sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d \ + --hash=sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61 \ + 
--hash=sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6 \ + --hash=sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2 \ + --hash=sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1 \ + --hash=sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809 \ + --hash=sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394 \ + --hash=sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2 \ + --hash=sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775 \ + --hash=sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f \ + --hash=sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba \ + --hash=sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc \ + --hash=sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd \ + --hash=sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c \ + --hash=sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24 \ + --hash=sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071 \ + --hash=sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201 \ + --hash=sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf \ + --hash=sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6 \ + --hash=sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21 \ + --hash=sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d \ + --hash=sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e \ + --hash=sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807 \ + --hash=sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7 \ + --hash=sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205 \ + --hash=sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604 \ + 
--hash=sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d \ + --hash=sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05 \ + --hash=sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0 \ + --hash=sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7 + # via + # -c python/requirements_compiled.txt + # ray +markdown-it-py==2.2.0 ; sys_platform != 'win32' \ + --hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ + --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 + # via + # -c python/requirements_compiled.txt + # rich +markupsafe==2.1.3 ; sys_platform != 'win32' \ + --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ + --hash=sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686 \ + --hash=sha256:1577735524cdad32f9f694208aa75e422adba74f1baee7551620e43a3141f559 \ + --hash=sha256:1b40069d487e7edb2676d3fbdb2b0829ffa2cd63a2ec26c4938b2d34391b4ecc \ + --hash=sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb \ + --hash=sha256:2c1b19b3aaacc6e57b7e25710ff571c24d6c3613a45e905b1fde04d691b98ee0 \ + --hash=sha256:2ef12179d3a291be237280175b542c07a36e7f60718296278d8593d21ca937d4 \ + --hash=sha256:3c0fae6c3be832a0a0473ac912810b2877c8cb9d76ca48de1ed31e1c68386575 \ + --hash=sha256:3fd4abcb888d15a94f32b75d8fd18ee162ca0c064f35b11134be77050296d6ba \ + --hash=sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd \ + --hash=sha256:65c1a9bcdadc6c28eecee2c119465aebff8f7a584dd719facdd9e825ec61ab52 \ + --hash=sha256:68e78619a61ecf91e76aa3e6e8e33fc4894a2bebe93410754bd28fce0a8a4f9f \ + --hash=sha256:6b2b56950d93e41f33b4223ead100ea0fe11f8e6ee5f641eb753ce4b77a7042b \ + --hash=sha256:8023faf4e01efadfa183e863fefde0046de576c6f14659e8782065bcece22198 \ + --hash=sha256:8afafd99945ead6e075b973fefa56379c5b5c53fd8937dad92c662da5d8fd5ee \ + 
--hash=sha256:8c41976a29d078bb235fea9b2ecd3da465df42a562910f9022f1a03107bd02be \ + --hash=sha256:9dcdfd0eaf283af041973bff14a2e143b8bd64e069f4c383416ecd79a81aab58 \ + --hash=sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823 \ + --hash=sha256:ad9e82fb8f09ade1c3e1b996a6337afac2b8b9e365f926f5a61aacc71adc5b3c \ + --hash=sha256:b076b6226fb84157e3f7c971a47ff3a679d837cf338547532ab866c57930dbee \ + --hash=sha256:bfce63a9e7834b12b87c64d6b155fdd9b3b96191b6bd334bf37db7ff1fe457f2 \ + --hash=sha256:cd0f502fe016460680cd20aaa5a76d241d6f35a1c3350c474bac1273803893fa \ + --hash=sha256:e09031c87a1e51556fdcb46e5bd4f59dfb743061cf93c4d6831bf894f125eb57 \ + --hash=sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc \ + --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 + # via + # -c python/requirements_compiled.txt + # jinja2 +mdurl==0.1.2 ; sys_platform != 'win32' \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via + # -c python/requirements_compiled.txt + # markdown-it-py +memray==1.10.0 ; sys_platform != 'win32' \ + --hash=sha256:0a21745fb516b7a6efcd40aa7487c59e9313fcfc782d0193fcfcf00b48426874 \ + --hash=sha256:22f2a47871c172a0539bd72737bb6b294fc10c510464066b825d90fcd3bb4916 \ + --hash=sha256:23e8c402625cfb32d0e9edb5ec0945f3e5e54bc6b0c5699f6284302082b80bd4 \ + --hash=sha256:2ce59ef485db3634de98b3a026d2450fc0a875e3a58a9ea85f7a89098841defe \ + --hash=sha256:322ed0b69014a0969b777768d461a785203f81f9864386b666b5b26645d9c294 \ + --hash=sha256:38322e052b882790993412f1840517a51818aa55c47037f69915b2007f2c4cee \ + --hash=sha256:38393c86ce6d0a08e6ec0eb1401d49803b7c0c950c2565386751cdc81568cba8 \ + --hash=sha256:391aac6c9f744528d3186bc82d708a1acc83525778f804045d7c96f860f8ec98 \ + --hash=sha256:3a8bb7fbd8303c4f0017ba7faef6b88f904cda2931ed667cbf3b98f024b3bc44 \ + 
--hash=sha256:3c401c57f49c4c5f1fecaee1e746f537cdc6680da05fb963dc143bd08ee109bf \ + --hash=sha256:4eba29179772b4a2e440a065b320b03bc2e73fe2648bdf7936aa3b9a086fab4a \ + --hash=sha256:53a8f66af18b1f3bcf5c9f3c95ae4134dd675903a38f9d0e6341b7bca01b63d0 \ + --hash=sha256:566602b2143e06b3d592901d98c52ce4599e71aa2555146eeb5cec03506f9498 \ + --hash=sha256:663d463e89a64bae4a6b2f8c837d11a3d094834442d536a4165e1d31899a3500 \ + --hash=sha256:68bd8df023c8a32f44c11d997e5c536837e27c0955daf557d3a377edd55a1dd3 \ + --hash=sha256:6937d7ef67d18ccc01c3250cdf3b4ef1445b859ee8756f09e3d11bd3ff0c7d67 \ + --hash=sha256:6b311e91203be71e1a0ce5e4f978137765bcb1045f3bf5646129c83c5b96ab3c \ + --hash=sha256:6fd13ef666c7fced9768d1cfabf71dc6dfa6724935a8dff463495ac2dc5e13a4 \ + --hash=sha256:8196c684f1be8fe423e5cdd2356d4255a2cb482a1f3e89612b70d2a2862cf5bb \ + --hash=sha256:843a688877691746f9d1835cfa8a65139948471bdd78720435808d20bc30a1cc \ + --hash=sha256:85c32d6613d81b075f740e398c4d653e0803cd48e82c33dcd584c109d6782666 \ + --hash=sha256:898acd60f57a10dc5aaf1fd64aa2f821f0420114f3f60c3058083788603f173a \ + --hash=sha256:8d56f37a34125684746c13d24bd7a3fb17549b0bb355eb50969eb11e05e3ba62 \ + --hash=sha256:92c372cb262eddd23049f945ca9527f0e4cc7c40a070aade1802d066f680885b \ + --hash=sha256:95e563d9c976e429ad597ad2720d95cebbe8bac891a3082465439143e2740772 \ + --hash=sha256:9627184c926252c8f719c301f1fefe970f0d033c643a6448b93fed2889d1ea94 \ + --hash=sha256:a9e985fb7646b0475c303919d19211d2aa54e5a9e2cd2a102472299be5dbebd3 \ + --hash=sha256:b681519357d94f5f0857fbc6029e7c44d3f41436109e955a14fd312d8317bc35 \ + --hash=sha256:b75040f28e8678d0e9c4907d55c95cf26db8ef5adc9941a228f1b280a9efd9c0 \ + --hash=sha256:c3a14960838d89a91747885897d34134afb65883cc3b0ed7ff30fe1af00f9fe6 \ + --hash=sha256:c7aeb47174c42e99740a8e2b3b6fe0932c95d987258d48a746974ead19176c26 \ + --hash=sha256:ce22a887a585ef5020896de89ffc793e531b65ccc81fbafcc7886010c2c562b3 \ + --hash=sha256:cf6d683c4f8d25c6ad06ae18715f218983c5eb86803953615e902d632fdf6ec1 \ + 
--hash=sha256:e356af93e3b031c83957e9ac1a653f5aaba5df1e357dd17142f5ed19bb3dc660 \ + --hash=sha256:f16c5c8730b616613dc8bafe32649ca6bd7252606251eb00148582011758d0b5 + # via + # -c python/requirements_compiled.txt + # ray +msgpack==1.0.7 \ + --hash=sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862 \ + --hash=sha256:0bfdd914e55e0d2c9e1526de210f6fe8ffe9705f2b1dfcc4aecc92a4cb4b533d \ + --hash=sha256:1dc93e8e4653bdb5910aed79f11e165c85732067614f180f70534f056da97db3 \ + --hash=sha256:1e2d69948e4132813b8d1131f29f9101bc2c915f26089a6d632001a5c1349672 \ + --hash=sha256:235a31ec7db685f5c82233bddf9858748b89b8119bf4538d514536c485c15fe0 \ + --hash=sha256:27dcd6f46a21c18fa5e5deed92a43d4554e3df8d8ca5a47bf0615d6a5f39dbc9 \ + --hash=sha256:28efb066cde83c479dfe5a48141a53bc7e5f13f785b92ddde336c716663039ee \ + --hash=sha256:3476fae43db72bd11f29a5147ae2f3cb22e2f1a91d575ef130d2bf49afd21c46 \ + --hash=sha256:36e17c4592231a7dbd2ed09027823ab295d2791b3b1efb2aee874b10548b7524 \ + --hash=sha256:384d779f0d6f1b110eae74cb0659d9aa6ff35aaf547b3955abf2ab4c901c4819 \ + --hash=sha256:38949d30b11ae5f95c3c91917ee7a6b239f5ec276f271f28638dec9156f82cfc \ + --hash=sha256:3967e4ad1aa9da62fd53e346ed17d7b2e922cba5ab93bdd46febcac39be636fc \ + --hash=sha256:3e7bf4442b310ff154b7bb9d81eb2c016b7d597e364f97d72b1acc3817a0fdc1 \ + --hash=sha256:3f0c8c6dfa6605ab8ff0611995ee30d4f9fcff89966cf562733b4008a3d60d82 \ + --hash=sha256:484ae3240666ad34cfa31eea7b8c6cd2f1fdaae21d73ce2974211df099a95d81 \ + --hash=sha256:4a7b4f35de6a304b5533c238bee86b670b75b03d31b7797929caa7a624b5dda6 \ + --hash=sha256:4cb14ce54d9b857be9591ac364cb08dc2d6a5c4318c1182cb1d02274029d590d \ + --hash=sha256:4e71bc4416de195d6e9b4ee93ad3f2f6b2ce11d042b4d7a7ee00bbe0358bd0c2 \ + --hash=sha256:52700dc63a4676669b341ba33520f4d6e43d3ca58d422e22ba66d1736b0a6e4c \ + --hash=sha256:572efc93db7a4d27e404501975ca6d2d9775705c2d922390d878fcf768d92c87 \ + --hash=sha256:576eb384292b139821c41995523654ad82d1916da6a60cff129c715a6223ea84 \ + 
--hash=sha256:5b0bf0effb196ed76b7ad883848143427a73c355ae8e569fa538365064188b8e \ + --hash=sha256:5b6ccc0c85916998d788b295765ea0e9cb9aac7e4a8ed71d12e7d8ac31c23c95 \ + --hash=sha256:5ed82f5a7af3697b1c4786053736f24a0efd0a1b8a130d4c7bfee4b9ded0f08f \ + --hash=sha256:6d4c80667de2e36970ebf74f42d1088cc9ee7ef5f4e8c35eee1b40eafd33ca5b \ + --hash=sha256:730076207cb816138cf1af7f7237b208340a2c5e749707457d70705715c93b93 \ + --hash=sha256:7687e22a31e976a0e7fc99c2f4d11ca45eff652a81eb8c8085e9609298916dcf \ + --hash=sha256:822ea70dc4018c7e6223f13affd1c5c30c0f5c12ac1f96cd8e9949acddb48a61 \ + --hash=sha256:84b0daf226913133f899ea9b30618722d45feffa67e4fe867b0b5ae83a34060c \ + --hash=sha256:85765fdf4b27eb5086f05ac0491090fc76f4f2b28e09d9350c31aac25a5aaff8 \ + --hash=sha256:8dd178c4c80706546702c59529ffc005681bd6dc2ea234c450661b205445a34d \ + --hash=sha256:8f5b234f567cf76ee489502ceb7165c2a5cecec081db2b37e35332b537f8157c \ + --hash=sha256:98bbd754a422a0b123c66a4c341de0474cad4a5c10c164ceed6ea090f3563db4 \ + --hash=sha256:993584fc821c58d5993521bfdcd31a4adf025c7d745bbd4d12ccfecf695af5ba \ + --hash=sha256:a40821a89dc373d6427e2b44b572efc36a2778d3f543299e2f24eb1a5de65415 \ + --hash=sha256:b291f0ee7961a597cbbcc77709374087fa2a9afe7bdb6a40dbbd9b127e79afee \ + --hash=sha256:b573a43ef7c368ba4ea06050a957c2a7550f729c31f11dd616d2ac4aba99888d \ + --hash=sha256:b610ff0f24e9f11c9ae653c67ff8cc03c075131401b3e5ef4b82570d1728f8a9 \ + --hash=sha256:bdf38ba2d393c7911ae989c3bbba510ebbcdf4ecbdbfec36272abe350c454075 \ + --hash=sha256:bfef2bb6ef068827bbd021017a107194956918ab43ce4d6dc945ffa13efbc25f \ + --hash=sha256:cab3db8bab4b7e635c1c97270d7a4b2a90c070b33cbc00c99ef3f9be03d3e1f7 \ + --hash=sha256:cb70766519500281815dfd7a87d3a178acf7ce95390544b8c90587d76b227681 \ + --hash=sha256:cca1b62fe70d761a282496b96a5e51c44c213e410a964bdffe0928e611368329 \ + --hash=sha256:ccf9a39706b604d884d2cb1e27fe973bc55f2890c52f38df742bc1d79ab9f5e1 \ + --hash=sha256:dc43f1ec66eb8440567186ae2f8c447d91e0372d793dfe8c222aec857b81a8cf \ + 
--hash=sha256:dd632777ff3beaaf629f1ab4396caf7ba0bdd075d948a69460d13d44357aca4c \ + --hash=sha256:e45ae4927759289c30ccba8d9fdce62bb414977ba158286b5ddaf8df2cddb5c5 \ + --hash=sha256:e50ebce52f41370707f1e21a59514e3375e3edd6e1832f5e5235237db933c98b \ + --hash=sha256:ebbbba226f0a108a7366bf4b59bf0f30a12fd5e75100c630267d94d7f0ad20e5 \ + --hash=sha256:ec79ff6159dffcc30853b2ad612ed572af86c92b5168aa3fc01a67b0fa40665e \ + --hash=sha256:f0936e08e0003f66bfd97e74ee530427707297b0d0361247e9b4f59ab78ddc8b \ + --hash=sha256:f26a07a6e877c76a88e3cecac8531908d980d3d5067ff69213653649ec0f60ad \ + --hash=sha256:f64e376cd20d3f030190e8c32e1c64582eba56ac6dc7d5b0b49a9d44021b52fd \ + --hash=sha256:f6ffbc252eb0d229aeb2f9ad051200668fc3a9aaa8994e49f0cb2ffe2b7867e7 \ + --hash=sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002 \ + --hash=sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc + # via + # -c python/requirements_compiled.txt + # ray +multidict==6.0.5 \ + --hash=sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556 \ + --hash=sha256:0275e35209c27a3f7951e1ce7aaf93ce0d163b28948444bec61dd7badc6d3f8c \ + --hash=sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29 \ + --hash=sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b \ + --hash=sha256:09a892e4a9fb47331da06948690ae38eaa2426de97b4ccbfafbdcbe5c8f37ff8 \ + --hash=sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7 \ + --hash=sha256:107c0cdefe028703fb5dafe640a409cb146d44a6ae201e55b35a4af8e95457dd \ + --hash=sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40 \ + --hash=sha256:14c2976aa9038c2629efa2c148022ed5eb4cb939e15ec7aace7ca932f48f9ba6 \ + --hash=sha256:19fe01cea168585ba0f678cad6f58133db2aa14eccaf22f88e4a6dccadfad8b3 \ + --hash=sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c \ + --hash=sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9 \ + 
--hash=sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5 \ + --hash=sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae \ + --hash=sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442 \ + --hash=sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9 \ + --hash=sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc \ + --hash=sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c \ + --hash=sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea \ + --hash=sha256:2ffc42c922dbfddb4a4c3b438eb056828719f07608af27d163191cb3e3aa6cc5 \ + --hash=sha256:37b15024f864916b4951adb95d3a80c9431299080341ab9544ed148091b53f50 \ + --hash=sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182 \ + --hash=sha256:3d25f19500588cbc47dc19081d78131c32637c25804df8414463ec908631e453 \ + --hash=sha256:403c0911cd5d5791605808b942c88a8155c2592e05332d2bf78f18697a5fa15e \ + --hash=sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600 \ + --hash=sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733 \ + --hash=sha256:435a0984199d81ca178b9ae2c26ec3d49692d20ee29bc4c11a2a8d4514c67eda \ + --hash=sha256:4a6a4f196f08c58c59e0b8ef8ec441d12aee4125a7d4f4fef000ccb22f8d7241 \ + --hash=sha256:4cc0ef8b962ac7a5e62b9e826bd0cd5040e7d401bc45a6835910ed699037a461 \ + --hash=sha256:51d035609b86722963404f711db441cf7134f1889107fb171a970c9701f92e1e \ + --hash=sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e \ + --hash=sha256:55205d03e8a598cfc688c71ca8ea5f66447164efff8869517f175ea632c7cb7b \ + --hash=sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e \ + --hash=sha256:5cb241881eefd96b46f89b1a056187ea8e9ba14ab88ba632e68d7a2ecb7aadf7 \ + --hash=sha256:60d698e8179a42ec85172d12f50b1668254628425a6bd611aba022257cac1386 \ + --hash=sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd \ + 
--hash=sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9 \ + --hash=sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf \ + --hash=sha256:69db76c09796b313331bb7048229e3bee7928eb62bab5e071e9f7fcc4879caee \ + --hash=sha256:6bf7a982604375a8d49b6cc1b781c1747f243d91b81035a9b43a2126c04766f5 \ + --hash=sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a \ + --hash=sha256:76c0de87358b192de7ea9649beb392f107dcad9ad27276324c24c91774ca5271 \ + --hash=sha256:76f067f5121dcecf0d63a67f29080b26c43c71a98b10c701b0677e4a065fbd54 \ + --hash=sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4 \ + --hash=sha256:79660376075cfd4b2c80f295528aa6beb2058fd289f4c9252f986751a4cd0496 \ + --hash=sha256:79a6d2ba910adb2cbafc95dad936f8b9386e77c84c35bc0add315b856d7c3abb \ + --hash=sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319 \ + --hash=sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3 \ + --hash=sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f \ + --hash=sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527 \ + --hash=sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed \ + --hash=sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604 \ + --hash=sha256:92d16a3e275e38293623ebf639c471d3e03bb20b8ebb845237e0d3664914caef \ + --hash=sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8 \ + --hash=sha256:9fe7b0653ba3d9d65cbe7698cca585bf0f8c83dbbcc710db9c90f478e175f2d5 \ + --hash=sha256:a3145cb08d8625b2d3fee1b2d596a8766352979c9bffe5d7833e0503d0f0b5e5 \ + --hash=sha256:aeaf541ddbad8311a87dd695ed9642401131ea39ad7bc8cf3ef3967fd093b626 \ + --hash=sha256:b55358304d7a73d7bdf5de62494aaf70bd33015831ffd98bc498b433dfe5b10c \ + --hash=sha256:b82cc8ace10ab5bd93235dfaab2021c70637005e1ac787031f4d1da63d493c1d \ + --hash=sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c \ + 
--hash=sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc \ + --hash=sha256:c9bf56195c6bbd293340ea82eafd0071cb3d450c703d2c93afb89f93b8386ccc \ + --hash=sha256:cbebcd5bcaf1eaf302617c114aa67569dd3f090dd0ce8ba9e35e9985b41ac35b \ + --hash=sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38 \ + --hash=sha256:ceb3b7e6a0135e092de86110c5a74e46bda4bd4fbfeeb3a3bcec79c0f861e450 \ + --hash=sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1 \ + --hash=sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f \ + --hash=sha256:d65f25da8e248202bd47445cec78e0025c0fe7582b23ec69c3b27a640dd7a8e3 \ + --hash=sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755 \ + --hash=sha256:d84a5c3a5f7ce6db1f999fb9438f686bc2e09d38143f2d93d8406ed2dd6b9226 \ + --hash=sha256:d946b0a9eb8aaa590df1fe082cee553ceab173e6cb5b03239716338629c50c7a \ + --hash=sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046 \ + --hash=sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf \ + --hash=sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479 \ + --hash=sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e \ + --hash=sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1 \ + --hash=sha256:e4428b29611e989719874670fd152b6625500ad6c686d464e99f5aaeeaca175a \ + --hash=sha256:e4972624066095e52b569e02b5ca97dbd7a7ddd4294bf4e7247d52635630dd83 \ + --hash=sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929 \ + --hash=sha256:e8e94e6912639a02ce173341ff62cc1201232ab86b8a8fcc05572741a5dc7d93 \ + --hash=sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a \ + --hash=sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c \ + --hash=sha256:eca2e9d0cc5a889850e9bbd68e98314ada174ff6ccd1129500103df7a94a7a44 \ + --hash=sha256:edd08e6f2f1a390bf137080507e44ccc086353c8e98c657e666c017718561b89 \ + 
--hash=sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba \ + --hash=sha256:f2a1dee728b52b33eebff5072817176c172050d44d67befd681609b4746e1c2e \ + --hash=sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da \ + --hash=sha256:fb616be3538599e797a2017cccca78e354c767165e8858ab5116813146041a24 \ + --hash=sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423 \ + --hash=sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef + # via + # -c python/requirements_compiled.txt + # aiohttp + # yarl +numpy==1.26.4 \ + --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ + --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ + --hash=sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20 \ + --hash=sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0 \ + --hash=sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010 \ + --hash=sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a \ + --hash=sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea \ + --hash=sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c \ + --hash=sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71 \ + --hash=sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110 \ + --hash=sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be \ + --hash=sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a \ + --hash=sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a \ + --hash=sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5 \ + --hash=sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed \ + --hash=sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd \ + --hash=sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c \ + 
--hash=sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e \ + --hash=sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0 \ + --hash=sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c \ + --hash=sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a \ + --hash=sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b \ + --hash=sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0 \ + --hash=sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6 \ + --hash=sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2 \ + --hash=sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a \ + --hash=sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30 \ + --hash=sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218 \ + --hash=sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5 \ + --hash=sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07 \ + --hash=sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2 \ + --hash=sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4 \ + --hash=sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764 \ + --hash=sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef \ + --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ + --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f + # via + # -c python/requirements_compiled.txt + # cupy-cuda12x + # gymnasium + # pandas + # ray + # scipy + # tensorboardx +opencensus==0.11.4 \ + --hash=sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 \ + --hash=sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2 + # via + # -c python/requirements_compiled.txt + # ray +opencensus-context==0.1.3 \ + 
--hash=sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039 \ + --hash=sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c + # via + # -c python/requirements_compiled.txt + # opencensus +opentelemetry-api==1.34.1 \ + --hash=sha256:64f0bd06d42824843731d05beea88d4d4b6ae59f9fe347ff7dfa2cc14233bbb3 \ + --hash=sha256:b7df4cb0830d5a6c29ad0c0691dbae874d8daefa934b8b1d642de48323d32a8c + # via + # -c python/requirements_compiled.txt + # opentelemetry-exporter-prometheus + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-prometheus==0.55b1 \ + --hash=sha256:d13ec0b22bf394113ff1ada5da98133a4b051779b803dae183188e26c4bd9ee0 \ + --hash=sha256:f364fbbff9e5de37a112ff104d1185fb1d7e2046c5ab5911e5afebc7ab3ddf0e + # via + # -c python/requirements_compiled.txt + # ray +opentelemetry-proto==1.27.0 \ + --hash=sha256:33c9345d91dafd8a74fc3d7576c5a38f18b7fdf8d02983ac67485386132aedd6 \ + --hash=sha256:b133873de5581a50063e1e4b29cdcf0c5e253a8c2d8dc1229add20a4c3830ace + # via + # -c python/requirements_compiled.txt + # ray +opentelemetry-sdk==1.34.1 \ + --hash=sha256:308effad4059562f1d92163c61c8141df649da24ce361827812c40abb2a1e96e \ + --hash=sha256:8091db0d763fcd6098d4781bbc80ff0971f94e260739aa6afe6fd379cdf3aa4d + # via + # -c python/requirements_compiled.txt + # opentelemetry-exporter-prometheus + # ray +opentelemetry-semantic-conventions==0.55b1 \ + --hash=sha256:5da81dfdf7d52e3d37f8fe88d5e771e191de924cfff5f550ab0b8f7b2409baed \ + --hash=sha256:ef95b1f009159c28d7a7849f5cbc71c4c34c845bb514d66adfdf1b3fff3598b3 + # via + # -c python/requirements_compiled.txt + # opentelemetry-sdk +ormsgpack==1.7.0 \ + --hash=sha256:0d88307ab45d95416ce4071b1b99326ca31362af01c3d206f15a0551a7a874bd \ + --hash=sha256:22418a4d399027a72fb2e6b873559b1886cf2e63323ca7afc17b222c454413b7 \ + --hash=sha256:2c22c62a6bc93bcb194b7f91864ca0b39455b2cbbfc1538a3da0f9ec3c11d184 \ + --hash=sha256:3a6a97937d2cf21496d7689b90a43df83c5062bbe846aaa39197cc9ad73eaa7b \ 
+ --hash=sha256:462089a419dbde654915ccb0b859c0dbe3c178b0ac580018e82befea6ccd73f4 \ + --hash=sha256:4b353204e99b56c1d33f1cf4767bd1fe1195596181a1cc789f25aa26c0b50f3d \ + --hash=sha256:5ec763096d978d35eedcef0af13991a10741717c2e236b26f4c2047b0740ea7b \ + --hash=sha256:5fefa1ca842dbba258401ea958113fe62c6b70a7a4d46edac440113f68dc431e \ + --hash=sha256:65525438b4a8b3b64ccfcda25e758ea3db392d1c206b5e09ef70efbbafa6dbf9 \ + --hash=sha256:6b4c98839cb7fc2a212037d2258f3a22857155249eb293d45c45cb974cfba834 \ + --hash=sha256:6d114652dadd81802b8a35a49e07a3e9ef2a47aed6123fb5031f2220d1c8e434 \ + --hash=sha256:77bc2ea387d85cfad045b9bcb8040bae43ad32dafe9363360f732cc19d489bbe \ + --hash=sha256:7e6ada21f5c7a20ff7cf9b061c44e3814352f819947a12022ad8cb52a9f2a809 \ + --hash=sha256:8d301e47565fe0e52a60052e730a9bb7669dfbd2a94643b8be925e3928c64c15 \ + --hash=sha256:90aabfd816db60dadab1100d583d061e0238209015bf684f8170c0fca4eb445a \ + --hash=sha256:91ebb7d3609db249cdff629ffef83ec3d025b1384749a297cf3b6a8240cf22ac \ + --hash=sha256:97723786755a7df85fcf6e68d7b5359dacea98d5c26b1d9af219a3cc05df4734 \ + --hash=sha256:9b0945523ccc75aa6907f38f2240d36818618baccb8633923bd7740a5a929e67 \ + --hash=sha256:a0ca6a64d47073f22ecc1dd96b384e44f98796d3f88ee383e92dfbcdf18c2efd \ + --hash=sha256:a5e12b51a590be47ccef67907905653e679fc2f920854b456edc216690ecc09c \ + --hash=sha256:a8fbe7bb50ee8381df030823d9366984fac718447947c2327969405d1d799b95 \ + --hash=sha256:c683071bf4527ffa7b6cfcf28f750d1a82eb77846d106743c09261ab1b79b193 \ + --hash=sha256:ca4d35b694f32112eb33ac0b733cb903dbbc59f019d05ca3d74f6ad2f587b0bf \ + --hash=sha256:e8385181bf195af80fc270e64fd477f1c414ffb05837320382e2ec9ca34be0ec \ + --hash=sha256:e86124cdbc8ed249806347c2fba96843e8941122b161b429139a0c973d270de4 \ + --hash=sha256:f9967a7f3647ad118751abf090f8397fda3e4bca6833340cab95a3f2bec598cd + # via + # -c python/requirements_compiled.txt + # ray +packaging==23.0 \ + --hash=sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2 \ + 
--hash=sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97 + # via + # -c python/requirements_compiled.txt + # kombu + # ray + # tensorboardx +pandas==2.3.2 \ + --hash=sha256:0064187b80a5be6f2f9c9d6bdde29372468751dfa89f4211a3c5871854cfbf7a \ + --hash=sha256:0bd281310d4f412733f319a5bc552f86d62cddc5f51d2e392c8787335c994175 \ + --hash=sha256:0c6ecbac99a354a051ef21c5307601093cb9e0f4b1855984a084bfec9302699e \ + --hash=sha256:0cee69d583b9b128823d9514171cabb6861e09409af805b54459bd0c821a35c2 \ + --hash=sha256:114c2fe4f4328cf98ce5716d1532f3ab79c5919f95a9cfee81d9140064a2e4d6 \ + --hash=sha256:12d039facec710f7ba305786837d0225a3444af7bbd9c15c32ca2d40d157ed8b \ + --hash=sha256:1333e9c299adcbb68ee89a9bb568fc3f20f9cbb419f1dd5225071e6cddb2a743 \ + --hash=sha256:13bd629c653856f00c53dc495191baa59bcafbbf54860a46ecc50d3a88421a96 \ + --hash=sha256:1b9b52693123dd234b7c985c68b709b0b009f4521000d0525f2b95c22f15944b \ + --hash=sha256:1d81573b3f7db40d020983f78721e9bfc425f411e616ef019a10ebf597aedb2e \ + --hash=sha256:213a5adf93d020b74327cb2c1b842884dbdd37f895f42dcc2f09d451d949f811 \ + --hash=sha256:21bb612d148bb5860b7eb2c10faacf1a810799245afd342cf297d7551513fbb6 \ + --hash=sha256:220cc5c35ffaa764dd5bb17cf42df283b5cb7fdf49e10a7b053a06c9cb48ee2b \ + --hash=sha256:2319656ed81124982900b4c37f0e0c58c015af9a7bbc62342ba5ad07ace82ba9 \ + --hash=sha256:36d627906fd44b5fd63c943264e11e96e923f8de77d6016dc2f667b9ad193438 \ + --hash=sha256:3fbb977f802156e7a3f829e9d1d5398f6192375a3e2d1a9ee0803e35fe70a2b9 \ + --hash=sha256:42c05e15111221384019897df20c6fe893b2f697d03c811ee67ec9e0bb5a3424 \ + --hash=sha256:45178cf09d1858a1509dc73ec261bf5b25a625a389b65be2e47b559905f0ab6a \ + --hash=sha256:48fa91c4dfb3b2b9bfdb5c24cd3567575f4e13f9636810462ffed8925352be5a \ + --hash=sha256:4ac8c320bded4718b298281339c1a50fb00a6ba78cb2a63521c39bec95b0209b \ + --hash=sha256:52bc29a946304c360561974c6542d1dd628ddafa69134a7131fdfd6a5d7a1a35 \ + 
--hash=sha256:76972bcbd7de8e91ad5f0ca884a9f2c477a2125354af624e022c49e5bd0dfff4 \ + --hash=sha256:77cefe00e1b210f9c76c697fedd8fdb8d3dd86563e9c8adc9fa72b90f5e9e4c2 \ + --hash=sha256:837248b4fc3a9b83b9c6214699a13f069dc13510a6a6d7f9ba33145d2841a012 \ + --hash=sha256:88080a0ff8a55eac9c84e3ff3c7665b3b5476c6fbc484775ca1910ce1c3e0b87 \ + --hash=sha256:8c13b81a9347eb8c7548f53fd9a4f08d4dfe996836543f805c987bafa03317ae \ + --hash=sha256:9467697b8083f9667b212633ad6aa4ab32436dcbaf4cd57325debb0ddef2012f \ + --hash=sha256:96d31a6b4354e3b9b8a2c848af75d31da390657e3ac6f30c05c82068b9ed79b9 \ + --hash=sha256:a9d7ec92d71a420185dec44909c32e9a362248c4ae2238234b76d5be37f208cc \ + --hash=sha256:ab7b58f8f82706890924ccdfb5f48002b83d2b5a3845976a9fb705d36c34dcdb \ + --hash=sha256:b37205ad6f00d52f16b6d09f406434ba928c1a1966e2771006a9033c736d30d2 \ + --hash=sha256:b62d586eb25cb8cb70a5746a378fc3194cb7f11ea77170d59f889f5dfe3cec7a \ + --hash=sha256:b98bdd7c456a05eef7cd21fd6b29e3ca243591fe531c62be94a2cc987efb5ac2 \ + --hash=sha256:c253828cb08f47488d60f43c5fc95114c771bbfff085da54bfc79cb4f9e3a372 \ + --hash=sha256:c624b615ce97864eb588779ed4046186f967374185c047070545253a52ab2d57 \ + --hash=sha256:c6f048aa0fd080d6a06cc7e7537c09b53be6642d330ac6f54a600c3ace857ee9 \ + --hash=sha256:cc03acc273c5515ab69f898df99d9d4f12c4d70dbfc24c3acc6203751d0804cf \ + --hash=sha256:d25c20a03e8870f6339bcf67281b946bd20b86f1a544ebbebb87e66a8d642cba \ + --hash=sha256:d2c3554bd31b731cd6490d94a28f3abb8dd770634a9e06eb6d2911b9827db370 \ + --hash=sha256:d4a558c7620340a0931828d8065688b3cc5b4c8eb674bcaf33d18ff4a6870b4a \ + --hash=sha256:df4df0b9d02bb873a106971bb85d448378ef14b86ba96f035f50bbd3688456b4 \ + --hash=sha256:e190b738675a73b581736cc8ec71ae113d6c3768d0bd18bffa5b9a0927b0b6ea + # via ray +platformdirs==3.11.0 \ + --hash=sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3 \ + --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e + # via + # -c python/requirements_compiled.txt + # 
virtualenv +prometheus-client==0.19.0 \ + --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ + --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 + # via + # -c python/requirements_compiled.txt + # opentelemetry-exporter-prometheus + # ray +prompt-toolkit==3.0.41 \ + --hash=sha256:941367d97fc815548822aa26c2a269fdc4eb21e9ec05fc5d447cf09bad5d75f0 \ + --hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 + # via + # -c python/requirements_compiled.txt + # click-repl +propcache==0.3.0 \ + --hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ + --hash=sha256:03acd9ff19021bd0567582ac88f821b66883e158274183b9e5586f678984f8fe \ + --hash=sha256:03c091bb752349402f23ee43bb2bff6bd80ccab7c9df6b88ad4322258d6960fc \ + --hash=sha256:07700939b2cbd67bfb3b76a12e1412405d71019df00ca5697ce75e5ef789d829 \ + --hash=sha256:0c3e893c4464ebd751b44ae76c12c5f5c1e4f6cbd6fbf67e3783cd93ad221863 \ + --hash=sha256:119e244ab40f70a98c91906d4c1f4c5f2e68bd0b14e7ab0a06922038fae8a20f \ + --hash=sha256:11ae6a8a01b8a4dc79093b5d3ca2c8a4436f5ee251a9840d7790dccbd96cb649 \ + --hash=sha256:15010f29fbed80e711db272909a074dc79858c6d28e2915704cfc487a8ac89c6 \ + --hash=sha256:19d36bb351ad5554ff20f2ae75f88ce205b0748c38b146c75628577020351e3c \ + --hash=sha256:1c8f7d896a16da9455f882870a507567d4f58c53504dc2d4b1e1d386dfe4588a \ + --hash=sha256:2383a17385d9800b6eb5855c2f05ee550f803878f344f58b6e194de08b96352c \ + --hash=sha256:24c04f8fbf60094c531667b8207acbae54146661657a1b1be6d3ca7773b7a545 \ + --hash=sha256:2578541776769b500bada3f8a4eeaf944530516b6e90c089aa368266ed70c49e \ + --hash=sha256:26a67e5c04e3119594d8cfae517f4b9330c395df07ea65eab16f3d559b7068fe \ + --hash=sha256:2b975528998de037dfbc10144b8aed9b8dd5a99ec547f14d1cb7c5665a43f075 \ + --hash=sha256:2d15bc27163cd4df433e75f546b9ac31c1ba7b0b128bfb1b90df19082466ff57 \ + --hash=sha256:2d913d36bdaf368637b4f88d554fb9cb9d53d6920b9c5563846555938d5450bf \ + 
--hash=sha256:3302c5287e504d23bb0e64d2a921d1eb4a03fb93a0a0aa3b53de059f5a5d737d \ + --hash=sha256:36ca5e9a21822cc1746023e88f5c0af6fce3af3b85d4520efb1ce4221bed75cc \ + --hash=sha256:3b812b3cb6caacd072276ac0492d249f210006c57726b6484a1e1805b3cfeea0 \ + --hash=sha256:3c6ec957025bf32b15cbc6b67afe233c65b30005e4c55fe5768e4bb518d712f1 \ + --hash=sha256:41de3da5458edd5678b0f6ff66691507f9885f5fe6a0fb99a5d10d10c0fd2d64 \ + --hash=sha256:42924dc0c9d73e49908e35bbdec87adedd651ea24c53c29cac103ede0ea1d340 \ + --hash=sha256:4544699674faf66fb6b4473a1518ae4999c1b614f0b8297b1cef96bac25381db \ + --hash=sha256:46ed02532cb66612d42ae5c3929b5e98ae330ea0f3900bc66ec5f4862069519b \ + --hash=sha256:49ea05212a529c2caffe411e25a59308b07d6e10bf2505d77da72891f9a05641 \ + --hash=sha256:4fa0e7c9c3cf7c276d4f6ab9af8adddc127d04e0fcabede315904d2ff76db626 \ + --hash=sha256:507c5357a8d8b4593b97fb669c50598f4e6cccbbf77e22fa9598aba78292b4d7 \ + --hash=sha256:549722908de62aa0b47a78b90531c022fa6e139f9166be634f667ff45632cc92 \ + --hash=sha256:58e6d2a5a7cb3e5f166fd58e71e9a4ff504be9dc61b88167e75f835da5764d07 \ + --hash=sha256:5a16167118677d94bb48bfcd91e420088854eb0737b76ec374b91498fb77a70e \ + --hash=sha256:5d62c4f6706bff5d8a52fd51fec6069bef69e7202ed481486c0bc3874912c787 \ + --hash=sha256:5fa159dcee5dba00c1def3231c249cf261185189205073bde13797e57dd7540a \ + --hash=sha256:6032231d4a5abd67c7f71168fd64a47b6b451fbcb91c8397c2f7610e67683810 \ + --hash=sha256:63f26258a163c34542c24808f03d734b338da66ba91f410a703e505c8485791d \ + --hash=sha256:65a37714b8ad9aba5780325228598a5b16c47ba0f8aeb3dc0514701e4413d7c0 \ + --hash=sha256:67054e47c01b7b349b94ed0840ccae075449503cf1fdd0a1fdd98ab5ddc2667b \ + --hash=sha256:67dda3c7325691c2081510e92c561f465ba61b975f481735aefdfc845d2cd043 \ + --hash=sha256:6985a593417cdbc94c7f9c3403747335e450c1599da1647a5af76539672464d3 \ + --hash=sha256:6a1948df1bb1d56b5e7b0553c0fa04fd0e320997ae99689488201f19fa90d2e7 \ + --hash=sha256:6b5b7fd6ee7b54e01759f2044f936dcf7dea6e7585f35490f7ca0420fe723c0d \ + 
--hash=sha256:6c929916cbdb540d3407c66f19f73387f43e7c12fa318a66f64ac99da601bcdf \ + --hash=sha256:6f4d7a7c0aff92e8354cceca6fe223973ddf08401047920df0fcb24be2bd5138 \ + --hash=sha256:728af36011bb5d344c4fe4af79cfe186729efb649d2f8b395d1572fb088a996c \ + --hash=sha256:742840d1d0438eb7ea4280f3347598f507a199a35a08294afdcc560c3739989d \ + --hash=sha256:75e872573220d1ee2305b35c9813626e620768248425f58798413e9c39741f46 \ + --hash=sha256:794c3dd744fad478b6232289c866c25406ecdfc47e294618bdf1697e69bd64a6 \ + --hash=sha256:7c0fdbdf6983526e269e5a8d53b7ae3622dd6998468821d660d0daf72779aefa \ + --hash=sha256:7c5f5290799a3f6539cc5e6f474c3e5c5fbeba74a5e1e5be75587746a940d51e \ + --hash=sha256:7c6e7e4f9167fddc438cd653d826f2222222564daed4116a02a184b464d3ef05 \ + --hash=sha256:7cedd25e5f678f7738da38037435b340694ab34d424938041aa630d8bac42663 \ + --hash=sha256:7e2e068a83552ddf7a39a99488bcba05ac13454fb205c847674da0352602082f \ + --hash=sha256:8319293e85feadbbfe2150a5659dbc2ebc4afdeaf7d98936fb9a2f2ba0d4c35c \ + --hash=sha256:8526b0941ec5a40220fc4dfde76aed58808e2b309c03e9fa8e2260083ef7157f \ + --hash=sha256:8884ba1a0fe7210b775106b25850f5e5a9dc3c840d1ae9924ee6ea2eb3acbfe7 \ + --hash=sha256:8cb625bcb5add899cb8ba7bf716ec1d3e8f7cdea9b0713fa99eadf73b6d4986f \ + --hash=sha256:8d663fd71491dde7dfdfc899d13a067a94198e90695b4321084c6e450743b8c7 \ + --hash=sha256:8ee1983728964d6070ab443399c476de93d5d741f71e8f6e7880a065f878e0b9 \ + --hash=sha256:997e7b8f173a391987df40f3b52c423e5850be6f6df0dcfb5376365440b56667 \ + --hash=sha256:9be90eebc9842a93ef8335291f57b3b7488ac24f70df96a6034a13cb58e6ff86 \ + --hash=sha256:9ddd49258610499aab83b4f5b61b32e11fce873586282a0e972e5ab3bcadee51 \ + --hash=sha256:9ecde3671e62eeb99e977f5221abcf40c208f69b5eb986b061ccec317c82ebd0 \ + --hash=sha256:9ff4e9ecb6e4b363430edf2c6e50173a63e0820e549918adef70515f87ced19a \ + --hash=sha256:a254537b9b696ede293bfdbc0a65200e8e4507bc9f37831e2a0318a9b333c85c \ + --hash=sha256:a2b9bf8c79b660d0ca1ad95e587818c30ccdb11f787657458d6f26a1ea18c568 \ + 
--hash=sha256:a61a68d630e812b67b5bf097ab84e2cd79b48c792857dc10ba8a223f5b06a2af \ + --hash=sha256:a7080b0159ce05f179cfac592cda1a82898ca9cd097dacf8ea20ae33474fbb25 \ + --hash=sha256:a8fd93de4e1d278046345f49e2238cdb298589325849b2645d4a94c53faeffc5 \ + --hash=sha256:a94ffc66738da99232ddffcf7910e0f69e2bbe3a0802e54426dbf0714e1c2ffe \ + --hash=sha256:aa806bbc13eac1ab6291ed21ecd2dd426063ca5417dd507e6be58de20e58dfcf \ + --hash=sha256:b0c1a133d42c6fc1f5fbcf5c91331657a1ff822e87989bf4a6e2e39b818d0ee9 \ + --hash=sha256:b58229a844931bca61b3a20efd2be2a2acb4ad1622fc026504309a6883686fbf \ + --hash=sha256:bb2f144c6d98bb5cbc94adeb0447cfd4c0f991341baa68eee3f3b0c9c0e83767 \ + --hash=sha256:be90c94570840939fecedf99fa72839aed70b0ced449b415c85e01ae67422c90 \ + --hash=sha256:bf0d9a171908f32d54f651648c7290397b8792f4303821c42a74e7805bfb813c \ + --hash=sha256:bf15fc0b45914d9d1b706f7c9c4f66f2b7b053e9517e40123e137e8ca8958b3d \ + --hash=sha256:bf4298f366ca7e1ad1d21bbb58300a6985015909964077afd37559084590c929 \ + --hash=sha256:c441c841e82c5ba7a85ad25986014be8d7849c3cfbdb6004541873505929a74e \ + --hash=sha256:cacea77ef7a2195f04f9279297684955e3d1ae4241092ff0cfcef532bb7a1c32 \ + --hash=sha256:cd54895e4ae7d32f1e3dd91261df46ee7483a735017dc6f987904f194aa5fd14 \ + --hash=sha256:d1323cd04d6e92150bcc79d0174ce347ed4b349d748b9358fd2e497b121e03c8 \ + --hash=sha256:d383bf5e045d7f9d239b38e6acadd7b7fdf6c0087259a84ae3475d18e9a2ae8b \ + --hash=sha256:d3e7420211f5a65a54675fd860ea04173cde60a7cc20ccfbafcccd155225f8bc \ + --hash=sha256:d8074c5dd61c8a3e915fa8fc04754fa55cfa5978200d2daa1e2d4294c1f136aa \ + --hash=sha256:df03cd88f95b1b99052b52b1bb92173229d7a674df0ab06d2b25765ee8404bce \ + --hash=sha256:e45377d5d6fefe1677da2a2c07b024a6dac782088e37c0b1efea4cfe2b1be19b \ + --hash=sha256:e53d19c2bf7d0d1e6998a7e693c7e87300dd971808e6618964621ccd0e01fe4e \ + --hash=sha256:e560fd75aaf3e5693b91bcaddd8b314f4d57e99aef8a6c6dc692f935cc1e6bbf \ + --hash=sha256:ec5060592d83454e8063e487696ac3783cc48c9a329498bafae0d972bc7816c9 \ + 
--hash=sha256:ecc2920630283e0783c22e2ac94427f8cca29a04cfdf331467d4f661f4072dac \ + --hash=sha256:ed7161bccab7696a473fe7ddb619c1d75963732b37da4618ba12e60899fefe4f \ + --hash=sha256:ee0bd3a7b2e184e88d25c9baa6a9dc609ba25b76daae942edfb14499ac7ec374 \ + --hash=sha256:ee25f1ac091def37c4b59d192bbe3a206298feeb89132a470325bf76ad122a1e \ + --hash=sha256:efa44f64c37cc30c9f05932c740a8b40ce359f51882c70883cc95feac842da4d \ + --hash=sha256:f47d52fd9b2ac418c4890aad2f6d21a6b96183c98021f0a48497a904199f006e \ + --hash=sha256:f857034dc68d5ceb30fb60afb6ff2103087aea10a01b613985610e007053a121 \ + --hash=sha256:fb91d20fa2d3b13deea98a690534697742029f4fb83673a3501ae6e3746508b5 \ + --hash=sha256:fddb8870bdb83456a489ab67c6b3040a8d5a55069aa6f72f9d872235fbc52f54 + # via + # -c python/requirements_compiled.txt + # aiohttp + # yarl +proto-plus==1.22.3 \ + --hash=sha256:a49cd903bc0b6ab41f76bf65510439d56ca76f868adf0274e738bfdd096894df \ + --hash=sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b + # via + # -c python/requirements_compiled.txt + # google-api-core +protobuf==4.25.8 \ + --hash=sha256:077ff8badf2acf8bc474406706ad890466274191a48d0abd3bd6987107c9cde5 \ + --hash=sha256:15a0af558aa3b13efef102ae6e4f3efac06f1eea11afb3a57db2901447d9fb59 \ + --hash=sha256:27d498ffd1f21fb81d987a041c32d07857d1d107909f5134ba3350e1ce80a4af \ + --hash=sha256:504435d831565f7cfac9f0714440028907f1975e4bed228e58e72ecfff58a1e0 \ + --hash=sha256:6135cf8affe1fc6f76cced2641e4ea8d3e59518d1f24ae41ba97bcad82d397cd \ + --hash=sha256:83e6e54e93d2b696a92cad6e6efc924f3850f82b52e1563778dfab8b355101b0 \ + --hash=sha256:9ad7ef62d92baf5a8654fbb88dac7fa5594cfa70fd3440488a5ca3bfc6d795a7 \ + --hash=sha256:bd551eb1fe1d7e92c1af1d75bdfa572eff1ab0e5bf1736716814cdccdb2360f9 \ + --hash=sha256:ca809b42f4444f144f2115c4c1a747b9a404d590f18f37e9402422033e464e0f \ + --hash=sha256:d552c53d0415449c8d17ced5c341caba0d89dbf433698e1436c8fa0aae7808a3 \ + --hash=sha256:f4510b93a3bec6eba8fd8f1093e9d7fb0d4a24d1a81377c10c0e5bbfe9e4ed24 + 
# via + # -c python/requirements_compiled.txt + # google-api-core + # googleapis-common-protos + # opentelemetry-proto + # proto-plus + # ray + # tensorboardx +py-spy==0.4.1 \ + --hash=sha256:1fb8bf71ab8df95a95cc387deed6552934c50feef2cf6456bc06692a5508fd0c \ + --hash=sha256:4972c21890b6814017e39ac233c22572c4a61fd874524ebc5ccab0f2237aee0a \ + --hash=sha256:532d3525538254d1859b49de1fbe9744df6b8865657c9f0e444bf36ce3f19226 \ + --hash=sha256:6a80ec05eb8a6883863a367c6a4d4f2d57de68466f7956b6367d4edd5c61bb29 \ + --hash=sha256:809094208c6256c8f4ccadd31e9a513fe2429253f48e20066879239ba12cd8cc \ + --hash=sha256:d92e522bd40e9bf7d87c204033ce5bb5c828fca45fa28d970f58d71128069fdc \ + --hash=sha256:e53aa53daa2e47c2eef97dd2455b47bb3a7e7f962796a86cc3e7dbde8e6f4db4 \ + --hash=sha256:ee776b9d512a011d1ad3907ed53ae32ce2f3d9ff3e1782236554e22103b5c084 + # via ray +pyarrow==19.0.1 \ + --hash=sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466 \ + --hash=sha256:0148bb4fc158bfbc3d6dfe5001d93ebeed253793fff4435167f6ce1dc4bddeae \ + --hash=sha256:1b93ef2c93e77c442c979b0d596af45e4665d8b96da598db145b0fec014b9136 \ + --hash=sha256:1c7556165bd38cf0cd992df2636f8bcdd2d4b26916c6b7e646101aff3c16f76f \ + --hash=sha256:335d170e050bcc7da867a1ed8ffb8b44c57aaa6e0843b156a501298657b1e972 \ + --hash=sha256:3bf266b485df66a400f282ac0b6d1b500b9d2ae73314a153dbe97d6d5cc8a99e \ + --hash=sha256:41f9706fbe505e0abc10e84bf3a906a1338905cbbcf1177b71486b03e6ea6608 \ + --hash=sha256:4982f8e2b7afd6dae8608d70ba5bd91699077323f812a0448d8b7abdff6cb5d3 \ + --hash=sha256:49a3aecb62c1be1d822f8bf629226d4a96418228a42f5b40835c1f10d42e4db6 \ + --hash=sha256:4d5d1ec7ec5324b98887bdc006f4d2ce534e10e60f7ad995e7875ffa0ff9cb14 \ + --hash=sha256:58d9397b2e273ef76264b45531e9d552d8ec8a6688b7390b5be44c02a37aade8 \ + --hash=sha256:5a9137cf7e1640dce4c190551ee69d478f7121b5c6f323553b319cac936395f6 \ + --hash=sha256:5bd1618ae5e5476b7654c7b55a6364ae87686d4724538c24185bbb2952679960 \ + 
--hash=sha256:65cf9feebab489b19cdfcfe4aa82f62147218558d8d3f0fc1e9dea0ab8e7905a \ + --hash=sha256:699799f9c80bebcf1da0983ba86d7f289c5a2a5c04b945e2f2bcf7e874a91911 \ + --hash=sha256:6c5941c1aac89a6c2f2b16cd64fe76bcdb94b2b1e99ca6459de4e6f07638d755 \ + --hash=sha256:6ebfb5171bb5f4a52319344ebbbecc731af3f021e49318c74f33d520d31ae0c4 \ + --hash=sha256:7a544ec12de66769612b2d6988c36adc96fb9767ecc8ee0a4d270b10b1c51e00 \ + --hash=sha256:7c1bca1897c28013db5e4c83944a2ab53231f541b9e0c3f4791206d0c0de389a \ + --hash=sha256:80b2ad2b193e7d19e81008a96e313fbd53157945c7be9ac65f44f8937a55427b \ + --hash=sha256:8464c9fbe6d94a7fe1599e7e8965f350fd233532868232ab2596a71586c5a429 \ + --hash=sha256:8f04d49a6b64cf24719c080b3c2029a3a5b16417fd5fd7c4041f94233af732f3 \ + --hash=sha256:96606c3ba57944d128e8a8399da4812f56c7f61de8c647e3470b417f795d0ef9 \ + --hash=sha256:99bc1bec6d234359743b01e70d4310d0ab240c3d6b0da7e2a93663b0158616f6 \ + --hash=sha256:ad76aef7f5f7e4a757fddcdcf010a8290958f09e3470ea458c80d26f4316ae89 \ + --hash=sha256:b4c4156a625f1e35d6c0b2132635a237708944eb41df5fbe7d50f20d20c17832 \ + --hash=sha256:b9766a47a9cb56fefe95cb27f535038b5a195707a08bf61b180e642324963b46 \ + --hash=sha256:c0fe3dbbf054a00d1f162fda94ce236a899ca01123a798c561ba307ca38af5f0 \ + --hash=sha256:c6cb2335a411b713fdf1e82a752162f72d4a7b5dbc588e32aa18383318b05866 \ + --hash=sha256:cc55d71898ea30dc95900297d191377caba257612f384207fe9f8293b5850f90 \ + --hash=sha256:d03c9d6f2a3dffbd62671ca070f13fc527bb1867b4ec2b98c7eeed381d4f389a \ + --hash=sha256:d383591f3dcbe545f6cc62daaef9c7cdfe0dff0fb9e1c8121101cabe9098cfa6 \ + --hash=sha256:d9d46e06846a41ba906ab25302cf0fd522f81aa2a85a71021826f34639ad31ef \ + --hash=sha256:d9dedeaf19097a143ed6da37f04f4051aba353c95ef507764d344229b2b740ae \ + --hash=sha256:e45274b20e524ae5c39d7fc1ca2aa923aab494776d2d4b316b49ec7572ca324c \ + --hash=sha256:ee8dec072569f43835932a3b10c55973593abc00936c202707a4ad06af7cb294 \ + --hash=sha256:f24faab6ed18f216a37870d8c5623f9c044566d75ec586ef884e13a02a9d62c5 \ + 
--hash=sha256:f2a21d39fbdb948857f67eacb5bbaaf36802de044ec36fbef7a1c8f0dd3a4ab2 \ + --hash=sha256:f3ad4c0eb4e2a9aeb990af6c09e6fa0b195c8c0e7b272ecc8d4d2b6574809d34 \ + --hash=sha256:fc28912a2dc924dddc2087679cc8b7263accc71b9ff025a1362b004711661a69 \ + --hash=sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec \ + --hash=sha256:fd44d66093a239358d07c42a91eebf5015aa54fccba959db899f932218ac9cc8 + # via + # -c python/requirements_compiled.txt + # ray +pyasn1==0.5.1 \ + --hash=sha256:4439847c58d40b1d0a573d07e3856e95333f1976294494c325775aeca506eb58 \ + --hash=sha256:6d391a96e59b23130a5cfa74d6fd7f388dbbe26cc8f1edf39fdddf08d9d6676c + # via + # -c python/requirements_compiled.txt + # pyasn1-modules + # rsa +pyasn1-modules==0.3.0 \ + --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ + --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d + # via + # -c python/requirements_compiled.txt + # google-auth +pycparser==2.21 ; platform_python_implementation != 'PyPy' \ + --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ + --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 + # via + # -c python/requirements_compiled.txt + # cffi +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b + # via + # -c python/requirements_compiled.txt + # fastapi + # ray +pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + 
--hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + 
--hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + --hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + --hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + 
--hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + --hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + --hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + 
--hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + --hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + --hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d + # via + # -c python/requirements_compiled.txt + # pydantic +pygments==2.18.0 ; sys_platform != 'win32' \ + 
--hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ + --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a + # via + # -c python/requirements_compiled.txt + # rich +pyopenssl==25.0.0 \ + --hash=sha256:424c247065e46e76a37411b9ab1782541c23bb658bf003772c3405fbaa128e90 \ + --hash=sha256:cd2cef799efa3936bb08e8ccb9433a575722b9dd986023f1cabc4ae64e9dac16 + # via + # -c python/requirements_compiled.txt + # ray +python-dateutil==2.8.2 \ + --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ + --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 + # via + # -c python/requirements_compiled.txt + # celery + # pandas +python-dotenv==1.1.1 \ + --hash=sha256:31f23644fe2602f88ff55e1f5c79ba497e01224ee7737937930c448e4d0e24dc \ + --hash=sha256:a8a6399716257f45be6a007360200409fce5cda2661e3dec71d23dc15f6189ab + # via uvicorn +pytz==2022.7.1 \ + --hash=sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0 \ + --hash=sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a + # via + # -c python/requirements_compiled.txt + # pandas +pyyaml==6.0.1 \ + --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ + --hash=sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc \ + --hash=sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df \ + --hash=sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741 \ + --hash=sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206 \ + --hash=sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27 \ + --hash=sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595 \ + --hash=sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62 \ + --hash=sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98 \ + 
--hash=sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696 \ + --hash=sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290 \ + --hash=sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9 \ + --hash=sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d \ + --hash=sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6 \ + --hash=sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867 \ + --hash=sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47 \ + --hash=sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486 \ + --hash=sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6 \ + --hash=sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3 \ + --hash=sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007 \ + --hash=sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938 \ + --hash=sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0 \ + --hash=sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c \ + --hash=sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735 \ + --hash=sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d \ + --hash=sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28 \ + --hash=sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4 \ + --hash=sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba \ + --hash=sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8 \ + --hash=sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef \ + --hash=sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5 \ + --hash=sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd \ + --hash=sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3 \ + 
--hash=sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0 \ + --hash=sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515 \ + --hash=sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c \ + --hash=sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c \ + --hash=sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924 \ + --hash=sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34 \ + --hash=sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43 \ + --hash=sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859 \ + --hash=sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673 \ + --hash=sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54 \ + --hash=sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a \ + --hash=sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b \ + --hash=sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab \ + --hash=sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa \ + --hash=sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c \ + --hash=sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585 \ + --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \ + --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f + # via + # -c python/requirements_compiled.txt + # ray + # uvicorn +ray==100.0.0.dev0 \ + --hash=sha256:4fef4f9d9cc8b516f22c2eea42b7fa244ef1dace261809d9f175875de0ec9fed +referencing==0.36.2 \ + --hash=sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa \ + --hash=sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 + # via + # -c python/requirements_compiled.txt + # jsonschema + # jsonschema-specifications +requests==2.32.3 \ + 
--hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ + --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 + # via + # -c python/requirements_compiled.txt + # google-api-core + # ray +rich==13.3.2 ; sys_platform != 'win32' \ + --hash=sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001 \ + --hash=sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f + # via + # -c python/requirements_compiled.txt + # memray +rpds-py==0.22.3 \ + --hash=sha256:009de23c9c9ee54bf11303a966edf4d9087cd43a6003672e6aa7def643d06518 \ + --hash=sha256:02fbb9c288ae08bcb34fb41d516d5eeb0455ac35b5512d03181d755d80810059 \ + --hash=sha256:0a0461200769ab3b9ab7e513f6013b7a97fdeee41c29b9db343f3c5a8e2b9e61 \ + --hash=sha256:0b09865a9abc0ddff4e50b5ef65467cd94176bf1e0004184eb915cbc10fc05c5 \ + --hash=sha256:0b8db6b5b2d4491ad5b6bdc2bc7c017eec108acbf4e6785f42a9eb0ba234f4c9 \ + --hash=sha256:0c150c7a61ed4a4f4955a96626574e9baf1adf772c2fb61ef6a5027e52803543 \ + --hash=sha256:0f3cec041684de9a4684b1572fe28c7267410e02450f4561700ca5a3bc6695a2 \ + --hash=sha256:1352ae4f7c717ae8cba93421a63373e582d19d55d2ee2cbb184344c82d2ae55a \ + --hash=sha256:177c7c0fce2855833819c98e43c262007f42ce86651ffbb84f37883308cb0e7d \ + --hash=sha256:1978d0021e943aae58b9b0b196fb4895a25cc53d3956b8e35e0b7682eefb6d56 \ + --hash=sha256:1a60bce91f81ddaac922a40bbb571a12c1070cb20ebd6d49c48e0b101d87300d \ + --hash=sha256:1aef18820ef3e4587ebe8b3bc9ba6e55892a6d7b93bac6d29d9f631a3b4befbd \ + --hash=sha256:1e9663daaf7a63ceccbbb8e3808fe90415b0757e2abddbfc2e06c857bf8c5e2b \ + --hash=sha256:20070c65396f7373f5df4005862fa162db5d25d56150bddd0b3e8214e8ef45b4 \ + --hash=sha256:214b7a953d73b5e87f0ebece4a32a5bd83c60a3ecc9d4ec8f1dca968a2d91e99 \ + --hash=sha256:22bebe05a9ffc70ebfa127efbc429bc26ec9e9b4ee4d15a740033efda515cf3d \ + --hash=sha256:24e8abb5878e250f2eb0d7859a8e561846f98910326d06c0d51381fed59357bd \ + 
--hash=sha256:26fd7cac7dd51011a245f29a2cc6489c4608b5a8ce8d75661bb4a1066c52dfbe \ + --hash=sha256:27b1d3b3915a99208fee9ab092b8184c420f2905b7d7feb4aeb5e4a9c509b8a1 \ + --hash=sha256:27e98004595899949bd7a7b34e91fa7c44d7a97c40fcaf1d874168bb652ec67e \ + --hash=sha256:2b8f60e1b739a74bab7e01fcbe3dddd4657ec685caa04681df9d562ef15b625f \ + --hash=sha256:2de29005e11637e7a2361fa151f780ff8eb2543a0da1413bb951e9f14b699ef3 \ + --hash=sha256:2e8b55d8517a2fda8d95cb45d62a5a8bbf9dd0ad39c5b25c8833efea07b880ca \ + --hash=sha256:2fa4331c200c2521512595253f5bb70858b90f750d39b8cbfd67465f8d1b596d \ + --hash=sha256:3445e07bf2e8ecfeef6ef67ac83de670358abf2996916039b16a218e3d95e97e \ + --hash=sha256:3453e8d41fe5f17d1f8e9c383a7473cd46a63661628ec58e07777c2fff7196dc \ + --hash=sha256:378753b4a4de2a7b34063d6f95ae81bfa7b15f2c1a04a9518e8644e81807ebea \ + --hash=sha256:3af6e48651c4e0d2d166dc1b033b7042ea3f871504b6805ba5f4fe31581d8d38 \ + --hash=sha256:3dfcbc95bd7992b16f3f7ba05af8a64ca694331bd24f9157b49dadeeb287493b \ + --hash=sha256:3f21f0495edea7fdbaaa87e633a8689cd285f8f4af5c869f27bc8074638ad69c \ + --hash=sha256:4041711832360a9b75cfb11b25a6a97c8fb49c07b8bd43d0d02b45d0b499a4ff \ + --hash=sha256:44d61b4b7d0c2c9ac019c314e52d7cbda0ae31078aabd0f22e583af3e0d79723 \ + --hash=sha256:4617e1915a539a0d9a9567795023de41a87106522ff83fbfaf1f6baf8e85437e \ + --hash=sha256:4b232061ca880db21fa14defe219840ad9b74b6158adb52ddf0e87bead9e8493 \ + --hash=sha256:5246b14ca64a8675e0a7161f7af68fe3e910e6b90542b4bfb5439ba752191df6 \ + --hash=sha256:5725dd9cc02068996d4438d397e255dcb1df776b7ceea3b9cb972bdb11260a83 \ + --hash=sha256:583f6a1993ca3369e0f80ba99d796d8e6b1a3a2a442dd4e1a79e652116413091 \ + --hash=sha256:59259dc58e57b10e7e18ce02c311804c10c5a793e6568f8af4dead03264584d1 \ + --hash=sha256:593eba61ba0c3baae5bc9be2f5232430453fb4432048de28399ca7376de9c627 \ + --hash=sha256:59f4a79c19232a5774aee369a0c296712ad0e77f24e62cad53160312b1c1eaa1 \ + --hash=sha256:5f0e260eaf54380380ac3808aa4ebe2d8ca28b9087cf411649f96bad6900c728 \ + 
--hash=sha256:62d9cfcf4948683a18a9aff0ab7e1474d407b7bab2ca03116109f8464698ab16 \ + --hash=sha256:64607d4cbf1b7e3c3c8a14948b99345eda0e161b852e122c6bb71aab6d1d798c \ + --hash=sha256:655ca44a831ecb238d124e0402d98f6212ac527a0ba6c55ca26f616604e60a45 \ + --hash=sha256:666ecce376999bf619756a24ce15bb14c5bfaf04bf00abc7e663ce17c3f34fe7 \ + --hash=sha256:68049202f67380ff9aa52f12e92b1c30115f32e6895cd7198fa2a7961621fc5a \ + --hash=sha256:69803198097467ee7282750acb507fba35ca22cc3b85f16cf45fb01cb9097730 \ + --hash=sha256:6c7b99ca52c2c1752b544e310101b98a659b720b21db00e65edca34483259967 \ + --hash=sha256:6dd9412824c4ce1aca56c47b0991e65bebb7ac3f4edccfd3f156150c96a7bf25 \ + --hash=sha256:70eb60b3ae9245ddea20f8a4190bd79c705a22f8028aaf8bbdebe4716c3fab24 \ + --hash=sha256:70fb28128acbfd264eda9bf47015537ba3fe86e40d046eb2963d75024be4d055 \ + --hash=sha256:7b2513ba235829860b13faa931f3b6846548021846ac808455301c23a101689d \ + --hash=sha256:7ef9d9da710be50ff6809fed8f1963fecdfecc8b86656cadfca3bc24289414b0 \ + --hash=sha256:81e69b0a0e2537f26d73b4e43ad7bc8c8efb39621639b4434b76a3de50c6966e \ + --hash=sha256:8633e471c6207a039eff6aa116e35f69f3156b3989ea3e2d755f7bc41754a4a7 \ + --hash=sha256:8bd7c8cfc0b8247c8799080fbff54e0b9619e17cdfeb0478ba7295d43f635d7c \ + --hash=sha256:9253fc214112405f0afa7db88739294295f0e08466987f1d70e29930262b4c8f \ + --hash=sha256:99b37292234e61325e7a5bb9689e55e48c3f5f603af88b1642666277a81f1fbd \ + --hash=sha256:9bd7228827ec7bb817089e2eb301d907c0d9827a9e558f22f762bb690b131652 \ + --hash=sha256:9beeb01d8c190d7581a4d59522cd3d4b6887040dcfc744af99aa59fef3e041a8 \ + --hash=sha256:a63cbdd98acef6570c62b92a1e43266f9e8b21e699c363c0fef13bd530799c11 \ + --hash=sha256:a76e42402542b1fae59798fab64432b2d015ab9d0c8c47ba7addddbaf7952333 \ + --hash=sha256:ac0a03221cdb5058ce0167ecc92a8c89e8d0decdc9e99a2ec23380793c4dcb96 \ + --hash=sha256:b0b4136a252cadfa1adb705bb81524eee47d9f6aab4f2ee4fa1e9d3cd4581f64 \ + --hash=sha256:b25bc607423935079e05619d7de556c91fb6adeae9d5f80868dde3468657994b \ + 
--hash=sha256:b3d504047aba448d70cf6fa22e06cb09f7cbd761939fdd47604f5e007675c24e \ + --hash=sha256:bb47271f60660803ad11f4c61b42242b8c1312a31c98c578f79ef9387bbde21c \ + --hash=sha256:bbb232860e3d03d544bc03ac57855cd82ddf19c7a07651a7c0fdb95e9efea8b9 \ + --hash=sha256:bc27863442d388870c1809a87507727b799c8460573cfbb6dc0eeaef5a11b5ec \ + --hash=sha256:bc51abd01f08117283c5ebf64844a35144a0843ff7b2983e0648e4d3d9f10dbb \ + --hash=sha256:be2eb3f2495ba669d2a985f9b426c1797b7d48d6963899276d22f23e33d47e37 \ + --hash=sha256:bf9db5488121b596dbfc6718c76092fda77b703c1f7533a226a5a9f65248f8ad \ + --hash=sha256:c58e2339def52ef6b71b8f36d13c3688ea23fa093353f3a4fee2556e62086ec9 \ + --hash=sha256:cfbc454a2880389dbb9b5b398e50d439e2e58669160f27b60e5eca11f68ae17c \ + --hash=sha256:cff63a0272fcd259dcc3be1657b07c929c466b067ceb1c20060e8d10af56f5bf \ + --hash=sha256:d115bffdd417c6d806ea9069237a4ae02f513b778e3789a359bc5856e0404cc4 \ + --hash=sha256:d20cfb4e099748ea39e6f7b16c91ab057989712d31761d3300d43134e26e165f \ + --hash=sha256:d48424e39c2611ee1b84ad0f44fb3b2b53d473e65de061e3f460fc0be5f1939d \ + --hash=sha256:e0fa2d4ec53dc51cf7d3bb22e0aa0143966119f42a0c3e4998293a3dd2856b09 \ + --hash=sha256:e32fee8ab45d3c2db6da19a5323bc3362237c8b653c70194414b892fd06a080d \ + --hash=sha256:e35ba67d65d49080e8e5a1dd40101fccdd9798adb9b050ff670b7d74fa41c566 \ + --hash=sha256:e3fb866d9932a3d7d0c82da76d816996d1667c44891bd861a0f97ba27e84fc74 \ + --hash=sha256:e61b02c3f7a1e0b75e20c3978f7135fd13cb6cf551bf4a6d29b999a88830a338 \ + --hash=sha256:e67ba3c290821343c192f7eae1d8fd5999ca2dc99994114643e2f2d3e6138b15 \ + --hash=sha256:e79dd39f1e8c3504be0607e5fc6e86bb60fe3584bec8b782578c3b0fde8d932c \ + --hash=sha256:e89391e6d60251560f0a8f4bd32137b077a80d9b7dbe6d5cab1cd80d2746f648 \ + --hash=sha256:ea7433ce7e4bfc3a85654aeb6747babe3f66eaf9a1d0c1e7a4435bbdf27fea84 \ + --hash=sha256:eaf16ae9ae519a0e237a0f528fd9f0197b9bb70f40263ee57ae53c2b8d48aeb3 \ + --hash=sha256:eb0c341fa71df5a4595f9501df4ac5abfb5a09580081dffbd1ddd4654e6e9123 \ + 
--hash=sha256:f276b245347e6e36526cbd4a266a417796fc531ddf391e43574cf6466c492520 \ + --hash=sha256:f47ad3d5f3258bd7058d2d506852217865afefe6153a36eb4b6928758041d831 \ + --hash=sha256:f56a6b404f74ab372da986d240e2e002769a7d7102cc73eb238a4f72eec5284e \ + --hash=sha256:f5cf2a0c2bdadf3791b5c205d55a37a54025c6e18a71c71f82bb536cf9a454bf \ + --hash=sha256:f5d36399a1b96e1a5fdc91e0522544580dbebeb1f77f27b2b0ab25559e103b8b \ + --hash=sha256:f60bd8423be1d9d833f230fdbccf8f57af322d96bcad6599e5a771b151398eb2 \ + --hash=sha256:f612463ac081803f243ff13cccc648578e2279295048f2a8d5eb430af2bae6e3 \ + --hash=sha256:f73d3fef726b3243a811121de45193c0ca75f6407fe66f3f4e183c983573e130 \ + --hash=sha256:f82a116a1d03628a8ace4859556fb39fd1424c933341a08ea3ed6de1edb0283b \ + --hash=sha256:fb0ba113b4983beac1a2eb16faffd76cb41e176bf58c4afe3e14b9c681f702de \ + --hash=sha256:fb4f868f712b2dd4bcc538b0a0c1f63a2b1d584c925e69a224d759e7070a12d5 \ + --hash=sha256:fb6116dfb8d1925cbdb52595560584db42a7f664617a1f7d7f6e32f138cdf37d \ + --hash=sha256:fda7cb070f442bf80b642cd56483b5548e43d366fe3f39b98e67cce780cded00 \ + --hash=sha256:feea821ee2a9273771bae61194004ee2fc33f8ec7db08117ef9147d4bbcbca8e + # via + # -c python/requirements_compiled.txt + # jsonschema + # referencing +rsa==4.7.2 \ + --hash=sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2 \ + --hash=sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9 + # via + # -c python/requirements_compiled.txt + # google-auth +scipy==1.11.4 \ + --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ + --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ + --hash=sha256:1b7c3dca977f30a739e0409fb001056484661cb2541a01aba0bb0029f7b68db8 \ + --hash=sha256:2c6ff6ef9cc27f9b3db93a6f8b38f97387e6e0591600369a297a50a8e96e835d \ + --hash=sha256:36750b7733d960d7994888f0d148d31ea3017ac15eef664194b4ef68d36a4a97 \ + --hash=sha256:530f9ad26440e85766509dbf78edcfe13ffd0ab7fec2560ee5c36ff74d6269ff \ + 
--hash=sha256:5e347b14fe01003d3b78e196e84bd3f48ffe4c8a7b8a1afbcb8f5505cb710993 \ + --hash=sha256:6550466fbeec7453d7465e74d4f4b19f905642c89a7525571ee91dd7adabb5a3 \ + --hash=sha256:6df1468153a31cf55ed5ed39647279beb9cfb5d3f84369453b49e4b8502394fd \ + --hash=sha256:6e619aba2df228a9b34718efb023966da781e89dd3d21637b27f2e54db0410d7 \ + --hash=sha256:8fce70f39076a5aa62e92e69a7f62349f9574d8405c0a5de6ed3ef72de07f446 \ + --hash=sha256:90a2b78e7f5733b9de748f589f09225013685f9b218275257f8a8168ededaeaa \ + --hash=sha256:91af76a68eeae0064887a48e25c4e616fa519fa0d38602eda7e0f97d65d57937 \ + --hash=sha256:933baf588daa8dc9a92c20a0be32f56d43faf3d1a60ab11b3f08c356430f6e56 \ + --hash=sha256:acf8ed278cc03f5aff035e69cb511741e0418681d25fbbb86ca65429c4f4d9cd \ + --hash=sha256:ad669df80528aeca5f557712102538f4f37e503f0c5b9541655016dd0932ca79 \ + --hash=sha256:b030c6674b9230d37c5c60ab456e2cf12f6784596d15ce8da9365e70896effc4 \ + --hash=sha256:b9999c008ccf00e8fbcce1236f85ade5c569d13144f77a1946bef8863e8f6eb4 \ + --hash=sha256:bc9a714581f561af0848e6b69947fda0614915f072dfd14142ed1bfe1b806710 \ + --hash=sha256:ce7fff2e23ab2cc81ff452a9444c215c28e6305f396b2ba88343a567feec9660 \ + --hash=sha256:cf00bd2b1b0211888d4dc75656c0412213a8b25e80d73898083f402b50f47e41 \ + --hash=sha256:d10e45a6c50211fe256da61a11c34927c68f277e03138777bdebedd933712fea \ + --hash=sha256:ee410e6de8f88fd5cf6eadd73c135020bfbbbdfcd0f6162c36a7638a1ea8cc65 \ + --hash=sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be \ + --hash=sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec + # via + # -c python/requirements_compiled.txt + # ray +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via + # -c python/requirements_compiled.txt + # opencensus + # python-dateutil +smart-open==6.2.0 \ + --hash=sha256:088bf00f9327c71e549bc2f86567d3320df5d89667f009ce1c16568976068ef7 \ + 
--hash=sha256:1b4df5c8365218f3852c507451920ccad606c80b0acb4e67508e50ba9b5d2632 + # via + # -c python/requirements_compiled.txt + # ray +sniffio==1.3.1 \ + --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ + --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc + # via + # -c python/requirements_compiled.txt + # anyio +starlette==0.46.2 \ + --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ + --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5 + # via + # -c python/requirements_compiled.txt + # fastapi + # ray +tensorboardx==2.6.2.2 \ + --hash=sha256:160025acbf759ede23fd3526ae9d9bfbfd8b68eb16c38a010ebe326dc6395db8 \ + --hash=sha256:c6476d7cd0d529b0b72f4acadb1269f9ed8b22f441e87a84f2a3b940bb87b666 + # via + # -c python/requirements_compiled.txt + # ray +typing-extensions==4.12.2 \ + --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d + # via + # -c python/requirements_compiled.txt + # fastapi + # gymnasium + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # pydantic + # pydantic-core + # pyopenssl + # referencing + # typing-inspection +typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c python/requirements_compiled.txt + # pydantic +tzdata==2025.2 \ + --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c python/requirements_compiled.txt + # kombu + # pandas +urllib3==1.26.19 \ + --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ + --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 + # via + # -c python/requirements_compiled.txt + # requests +uvicorn==0.22.0 \ + 
--hash=sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8 \ + --hash=sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996 + # via + # -c python/requirements_compiled.txt + # ray +uvloop==0.21.0 ; platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32' \ + --hash=sha256:0878c2640cf341b269b7e128b1a5fed890adc4455513ca710d77d5e93aa6d6a0 \ + --hash=sha256:10d66943def5fcb6e7b37310eb6b5639fd2ccbc38df1177262b0640c3ca68c1f \ + --hash=sha256:10da8046cc4a8f12c91a1c39d1dd1585c41162a15caaef165c2174db9ef18bdc \ + --hash=sha256:17df489689befc72c39a08359efac29bbee8eee5209650d4b9f34df73d22e414 \ + --hash=sha256:183aef7c8730e54c9a3ee3227464daed66e37ba13040bb3f350bc2ddc040f22f \ + --hash=sha256:196274f2adb9689a289ad7d65700d37df0c0930fd8e4e743fa4834e850d7719d \ + --hash=sha256:221f4f2a1f46032b403bf3be628011caf75428ee3cc204a22addf96f586b19fd \ + --hash=sha256:2d1f581393673ce119355d56da84fe1dd9d2bb8b3d13ce792524e1607139feff \ + --hash=sha256:359ec2c888397b9e592a889c4d72ba3d6befba8b2bb01743f72fffbde663b59c \ + --hash=sha256:3bf12b0fda68447806a7ad847bfa591613177275d35b6724b1ee573faa3704e3 \ + --hash=sha256:4509360fcc4c3bd2c70d87573ad472de40c13387f5fda8cb58350a1d7475e58d \ + --hash=sha256:460def4412e473896ef179a1671b40c039c7012184b627898eea5072ef6f017a \ + --hash=sha256:461d9ae6660fbbafedd07559c6a2e57cd553b34b0065b6550685f6653a98c1cb \ + --hash=sha256:46923b0b5ee7fc0020bef24afe7836cb068f5050ca04caf6b487c513dc1a20b2 \ + --hash=sha256:53e420a3afe22cdcf2a0f4846e377d16e718bc70103d7088a4f7623567ba5fb0 \ + --hash=sha256:5ee4d4ef48036ff6e5cfffb09dd192c7a5027153948d85b8da7ff705065bacc6 \ + --hash=sha256:67dd654b8ca23aed0a8e99010b4c34aca62f4b7fce88f39d452ed7622c94845c \ + --hash=sha256:787ae31ad8a2856fc4e7c095341cccc7209bd657d0e71ad0dc2ea83c4a6fa8af \ + --hash=sha256:86975dca1c773a2c9864f4c52c5a55631038e387b47eaf56210f873887b6c8dc \ + --hash=sha256:87c43e0f13022b998eb9b973b5e97200c8b90823454d4bc06ab33829e09fb9bb \ + 
--hash=sha256:88cb67cdbc0e483da00af0b2c3cdad4b7c61ceb1ee0f33fe00e09c81e3a6cb75 \ + --hash=sha256:8a375441696e2eda1c43c44ccb66e04d61ceeffcd76e4929e527b7fa401b90fb \ + --hash=sha256:a5c39f217ab3c663dc699c04cbd50c13813e31d917642d459fdcec07555cc553 \ + --hash=sha256:b9fb766bb57b7388745d8bcc53a359b116b8a04c83a2288069809d2b3466c37e \ + --hash=sha256:baa0e6291d91649c6ba4ed4b2f982f9fa165b5bbd50a9e203c416a2797bab3c6 \ + --hash=sha256:baa4dcdbd9ae0a372f2167a207cd98c9f9a1ea1188a8a526431eef2f8116cc8d \ + --hash=sha256:bc09f0ff191e61c2d592a752423c767b4ebb2986daa9ed62908e2b1b9a9ae206 \ + --hash=sha256:bd53ecc9a0f3d87ab847503c2e1552b690362e005ab54e8a48ba97da3924c0dc \ + --hash=sha256:bfd55dfcc2a512316e65f16e503e9e450cab148ef11df4e4e679b5e8253a5281 \ + --hash=sha256:c097078b8031190c934ed0ebfee8cc5f9ba9642e6eb88322b9958b649750f72b \ + --hash=sha256:c0f3fa6200b3108919f8bdabb9a7f87f20e7097ea3c543754cabc7d717d95cf8 \ + --hash=sha256:e678ad6fe52af2c58d2ae3c73dc85524ba8abe637f134bf3564ed07f555c5e79 \ + --hash=sha256:ec7e6b09a6fdded42403182ab6b832b71f4edaf7f37a9a0e371a01db5f0cb45f \ + --hash=sha256:f0ce1b49560b1d2d8a2977e3ba4afb2414fb46b86a1b64056bc4ab929efdafbe \ + --hash=sha256:f38b2e090258d051d68a5b14d1da7203a3c3677321cf32a95a6f4db4dd8b6f26 \ + --hash=sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816 \ + --hash=sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2 + # via + # -c python/requirements_compiled.txt + # uvicorn +vine==5.1.0 \ + --hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c python/requirements_compiled.txt + # amqp + # celery + # kombu +virtualenv==20.29.1 \ + --hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ + --hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 + # via + # -c python/requirements_compiled.txt + # ray +watchfiles==0.19.0 \ + 
--hash=sha256:0089c6dc24d436b373c3c57657bf4f9a453b13767150d17284fc6162b2791911 \ + --hash=sha256:09ea3397aecbc81c19ed7f025e051a7387feefdb789cf768ff994c1228182fda \ + --hash=sha256:176a9a7641ec2c97b24455135d58012a5be5c6217fc4d5fef0b2b9f75dbf5154 \ + --hash=sha256:18b28f6ad871b82df9542ff958d0c86bb0d8310bb09eb8e87d97318a3b5273af \ + --hash=sha256:20b44221764955b1e703f012c74015306fb7e79a00c15370785f309b1ed9aa8d \ + --hash=sha256:3d7d267d27aceeeaa3de0dd161a0d64f0a282264d592e335fff7958cc0cbae7c \ + --hash=sha256:5471582658ea56fca122c0f0d0116a36807c63fefd6fdc92c71ca9a4491b6b48 \ + --hash=sha256:5569fc7f967429d4bc87e355cdfdcee6aabe4b620801e2cf5805ea245c06097c \ + --hash=sha256:68dce92b29575dda0f8d30c11742a8e2b9b8ec768ae414b54f7453f27bdf9545 \ + --hash=sha256:79c533ff593db861ae23436541f481ec896ee3da4e5db8962429b441bbaae16e \ + --hash=sha256:7f3920b1285a7d3ce898e303d84791b7bf40d57b7695ad549dc04e6a44c9f120 \ + --hash=sha256:91633e64712df3051ca454ca7d1b976baf842d7a3640b87622b323c55f3345e7 \ + --hash=sha256:945be0baa3e2440151eb3718fd8846751e8b51d8de7b884c90b17d271d34cae8 \ + --hash=sha256:9afd0d69429172c796164fd7fe8e821ade9be983f51c659a38da3faaaaac44dc \ + --hash=sha256:9c75eff897786ee262c9f17a48886f4e98e6cfd335e011c591c305e5d083c056 \ + --hash=sha256:b538014a87f94d92f98f34d3e6d2635478e6be6423a9ea53e4dd96210065e193 \ + --hash=sha256:b6577b8c6c8701ba8642ea9335a129836347894b666dd1ec2226830e263909d3 \ + --hash=sha256:c0376deac92377817e4fb8f347bf559b7d44ff556d9bc6f6208dd3f79f104aaf \ + --hash=sha256:cae3dde0b4b2078f31527acff6f486e23abed307ba4d3932466ba7cdd5ecec79 \ + --hash=sha256:cb5d45c4143c1dd60f98a16187fd123eda7248f84ef22244818c18d531a249d1 \ + --hash=sha256:d9b073073e048081e502b6c6b0b88714c026a1a4c890569238d04aca5f9ca74b \ + --hash=sha256:fac19dc9cbc34052394dbe81e149411a62e71999c0a19e1e09ce537867f95ae0 + # via + # -c python/requirements_compiled.txt + # ray + # uvicorn +wcwidth==0.2.13 \ + --hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \ + 
--hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5 + # via + # -c python/requirements_compiled.txt + # prompt-toolkit +websockets==11.0.3 \ + --hash=sha256:01f5567d9cf6f502d655151645d4e8b72b453413d3819d2b6f1185abc23e82dd \ + --hash=sha256:03aae4edc0b1c68498f41a6772d80ac7c1e33c06c6ffa2ac1c27a07653e79d6f \ + --hash=sha256:0ac56b661e60edd453585f4bd68eb6a29ae25b5184fd5ba51e97652580458998 \ + --hash=sha256:0ee68fe502f9031f19d495dae2c268830df2760c0524cbac5d759921ba8c8e82 \ + --hash=sha256:1553cb82942b2a74dd9b15a018dce645d4e68674de2ca31ff13ebc2d9f283788 \ + --hash=sha256:1a073fc9ab1c8aff37c99f11f1641e16da517770e31a37265d2755282a5d28aa \ + --hash=sha256:1d2256283fa4b7f4c7d7d3e84dc2ece74d341bce57d5b9bf385df109c2a1a82f \ + --hash=sha256:1d5023a4b6a5b183dc838808087033ec5df77580485fc533e7dab2567851b0a4 \ + --hash=sha256:1fdf26fa8a6a592f8f9235285b8affa72748dc12e964a5518c6c5e8f916716f7 \ + --hash=sha256:2529338a6ff0eb0b50c7be33dc3d0e456381157a31eefc561771ee431134a97f \ + --hash=sha256:279e5de4671e79a9ac877427f4ac4ce93751b8823f276b681d04b2156713b9dd \ + --hash=sha256:2d903ad4419f5b472de90cd2d40384573b25da71e33519a67797de17ef849b69 \ + --hash=sha256:332d126167ddddec94597c2365537baf9ff62dfcc9db4266f263d455f2f031cb \ + --hash=sha256:34fd59a4ac42dff6d4681d8843217137f6bc85ed29722f2f7222bd619d15e95b \ + --hash=sha256:3580dd9c1ad0701169e4d6fc41e878ffe05e6bdcaf3c412f9d559389d0c9e016 \ + --hash=sha256:3ccc8a0c387629aec40f2fc9fdcb4b9d5431954f934da3eaf16cdc94f67dbfac \ + --hash=sha256:41f696ba95cd92dc047e46b41b26dd24518384749ed0d99bea0a941ca87404c4 \ + --hash=sha256:42cc5452a54a8e46a032521d7365da775823e21bfba2895fb7b77633cce031bb \ + --hash=sha256:4841ed00f1026dfbced6fca7d963c4e7043aa832648671b5138008dc5a8f6d99 \ + --hash=sha256:4b253869ea05a5a073ebfdcb5cb3b0266a57c3764cf6fe114e4cd90f4bfa5f5e \ + --hash=sha256:54c6e5b3d3a8936a4ab6870d46bdd6ec500ad62bde9e44462c32d18f1e9a8e54 \ + --hash=sha256:619d9f06372b3a42bc29d0cd0354c9bb9fb39c2cbc1a9c5025b4538738dbffaf \ + 
--hash=sha256:6505c1b31274723ccaf5f515c1824a4ad2f0d191cec942666b3d0f3aa4cb4007 \ + --hash=sha256:660e2d9068d2bedc0912af508f30bbeb505bbbf9774d98def45f68278cea20d3 \ + --hash=sha256:6681ba9e7f8f3b19440921e99efbb40fc89f26cd71bf539e45d8c8a25c976dc6 \ + --hash=sha256:68b977f21ce443d6d378dbd5ca38621755f2063d6fdb3335bda981d552cfff86 \ + --hash=sha256:69269f3a0b472e91125b503d3c0b3566bda26da0a3261c49f0027eb6075086d1 \ + --hash=sha256:6f1a3f10f836fab6ca6efa97bb952300b20ae56b409414ca85bff2ad241d2a61 \ + --hash=sha256:7622a89d696fc87af8e8d280d9b421db5133ef5b29d3f7a1ce9f1a7bf7fcfa11 \ + --hash=sha256:777354ee16f02f643a4c7f2b3eff8027a33c9861edc691a2003531f5da4f6bc8 \ + --hash=sha256:84d27a4832cc1a0ee07cdcf2b0629a8a72db73f4cf6de6f0904f6661227f256f \ + --hash=sha256:8531fdcad636d82c517b26a448dcfe62f720e1922b33c81ce695d0edb91eb931 \ + --hash=sha256:86d2a77fd490ae3ff6fae1c6ceaecad063d3cc2320b44377efdde79880e11526 \ + --hash=sha256:88fc51d9a26b10fc331be344f1781224a375b78488fc343620184e95a4b27016 \ + --hash=sha256:8a34e13a62a59c871064dfd8ffb150867e54291e46d4a7cf11d02c94a5275bae \ + --hash=sha256:8c82f11964f010053e13daafdc7154ce7385ecc538989a354ccc7067fd7028fd \ + --hash=sha256:92b2065d642bf8c0a82d59e59053dd2fdde64d4ed44efe4870fa816c1232647b \ + --hash=sha256:97b52894d948d2f6ea480171a27122d77af14ced35f62e5c892ca2fae9344311 \ + --hash=sha256:9d9acd80072abcc98bd2c86c3c9cd4ac2347b5a5a0cae7ed5c0ee5675f86d9af \ + --hash=sha256:9f59a3c656fef341a99e3d63189852be7084c0e54b75734cde571182c087b152 \ + --hash=sha256:aa5003845cdd21ac0dc6c9bf661c5beddd01116f6eb9eb3c8e272353d45b3288 \ + --hash=sha256:b16fff62b45eccb9c7abb18e60e7e446998093cdcb50fed33134b9b6878836de \ + --hash=sha256:b30c6590146e53149f04e85a6e4fcae068df4289e31e4aee1fdf56a0dead8f97 \ + --hash=sha256:b58cbf0697721120866820b89f93659abc31c1e876bf20d0b3d03cef14faf84d \ + --hash=sha256:b67c6f5e5a401fc56394f191f00f9b3811fe843ee93f4a70df3c389d1adf857d \ + --hash=sha256:bceab846bac555aff6427d060f2fcfff71042dba6f5fca7dc4f75cac815e57ca \ + 
--hash=sha256:bee9fcb41db2a23bed96c6b6ead6489702c12334ea20a297aa095ce6d31370d0 \ + --hash=sha256:c114e8da9b475739dde229fd3bc6b05a6537a88a578358bc8eb29b4030fac9c9 \ + --hash=sha256:c1f0524f203e3bd35149f12157438f406eff2e4fb30f71221c8a5eceb3617b6b \ + --hash=sha256:c792ea4eabc0159535608fc5658a74d1a81020eb35195dd63214dcf07556f67e \ + --hash=sha256:c7f3cb904cce8e1be667c7e6fef4516b98d1a6a0635a58a57528d577ac18a128 \ + --hash=sha256:d67ac60a307f760c6e65dad586f556dde58e683fab03323221a4e530ead6f74d \ + --hash=sha256:dcacf2c7a6c3a84e720d1bb2b543c675bf6c40e460300b628bab1b1efc7c034c \ + --hash=sha256:de36fe9c02995c7e6ae6efe2e205816f5f00c22fd1fbf343d4d18c3d5ceac2f5 \ + --hash=sha256:def07915168ac8f7853812cc593c71185a16216e9e4fa886358a17ed0fd9fcf6 \ + --hash=sha256:df41b9bc27c2c25b486bae7cf42fccdc52ff181c8c387bfd026624a491c2671b \ + --hash=sha256:e052b8467dd07d4943936009f46ae5ce7b908ddcac3fda581656b1b19c083d9b \ + --hash=sha256:e063b1865974611313a3849d43f2c3f5368093691349cf3c7c8f8f75ad7cb280 \ + --hash=sha256:e1459677e5d12be8bbc7584c35b992eea142911a6236a3278b9b5ce3326f282c \ + --hash=sha256:e1a99a7a71631f0efe727c10edfba09ea6bee4166a6f9c19aafb6c0b5917d09c \ + --hash=sha256:e590228200fcfc7e9109509e4d9125eace2042fd52b595dd22bbc34bb282307f \ + --hash=sha256:e6316827e3e79b7b8e7d8e3b08f4e331af91a48e794d5d8b099928b6f0b85f20 \ + --hash=sha256:e7837cb169eca3b3ae94cc5787c4fed99eef74c0ab9506756eea335e0d6f3ed8 \ + --hash=sha256:e848f46a58b9fcf3d06061d17be388caf70ea5b8cc3466251963c8345e13f7eb \ + --hash=sha256:ed058398f55163a79bb9f06a90ef9ccc063b204bb346c4de78efc5d15abfe602 \ + --hash=sha256:f2e58f2c36cc52d41f2659e4c0cbf7353e28c8c9e63e30d8c6d3494dc9fdedcf \ + --hash=sha256:f467ba0050b7de85016b43f5a22b46383ef004c4f672148a8abf32bc999a87f0 \ + --hash=sha256:f61bdb1df43dc9c131791fbc2355535f9024b9a04398d3bd0684fc16ab07df74 \ + --hash=sha256:fb06eea71a00a7af0ae6aefbb932fb8a7df3cb390cc217d51a9ad7343de1b8d0 \ + --hash=sha256:ffd7dcaf744f25f82190856bc26ed81721508fc5cbf2a330751e135ff1283564 + # via + # 
-c python/requirements_compiled.txt + # uvicorn +yarl==1.18.3 \ + --hash=sha256:00e5a1fea0fd4f5bfa7440a47eff01d9822a65b4488f7cff83155a0f31a2ecba \ + --hash=sha256:02ddb6756f8f4517a2d5e99d8b2f272488e18dd0bfbc802f31c16c6c20f22193 \ + --hash=sha256:045b8482ce9483ada4f3f23b3774f4e1bf4f23a2d5c912ed5170f68efb053318 \ + --hash=sha256:09c7907c8548bcd6ab860e5f513e727c53b4a714f459b084f6580b49fa1b9cee \ + --hash=sha256:0b0cad37311123211dc91eadcb322ef4d4a66008d3e1bdc404808992260e1a0e \ + --hash=sha256:0b3c92fa08759dbf12b3a59579a4096ba9af8dd344d9a813fc7f5070d86bbab1 \ + --hash=sha256:0fb2171a4486bb075316ee754c6d8382ea6eb8b399d4ec62fde2b591f879778a \ + --hash=sha256:1a74a13a4c857a84a845505fd2d68e54826a2cd01935a96efb1e9d86c728e186 \ + --hash=sha256:1d407181cfa6e70077df3377938c08012d18893f9f20e92f7d2f314a437c30b1 \ + --hash=sha256:1dd4bdd05407ced96fed3d7f25dbbf88d2ffb045a0db60dbc247f5b3c5c25d50 \ + --hash=sha256:25b411eddcfd56a2f0cd6a384e9f4f7aa3efee14b188de13048c25b5e91f1640 \ + --hash=sha256:2d06d3005e668744e11ed80812e61efd77d70bb7f03e33c1598c301eea20efbb \ + --hash=sha256:2ec9bbba33b2d00999af4631a3397d1fd78290c48e2a3e52d8dd72db3a067ac8 \ + --hash=sha256:3236da9272872443f81fedc389bace88408f64f89f75d1bdb2256069a8730ccc \ + --hash=sha256:35098b24e0327fc4ebdc8ffe336cee0a87a700c24ffed13161af80124b7dc8e5 \ + --hash=sha256:41f7ce59d6ee7741af71d82020346af364949314ed3d87553763a2df1829cc58 \ + --hash=sha256:436c4fc0a4d66b2badc6c5fc5ef4e47bb10e4fd9bf0c79524ac719a01f3607c2 \ + --hash=sha256:4891ed92157e5430874dad17b15eb1fda57627710756c27422200c52d8a4e393 \ + --hash=sha256:4ac515b860c36becb81bb84b667466885096b5fc85596948548b667da3bf9f24 \ + --hash=sha256:5094d9206c64181d0f6e76ebd8fb2f8fe274950a63890ee9e0ebfd58bf9d787b \ + --hash=sha256:54d6921f07555713b9300bee9c50fb46e57e2e639027089b1d795ecd9f7fa910 \ + --hash=sha256:578e281c393af575879990861823ef19d66e2b1d0098414855dd367e234f5b3c \ + --hash=sha256:5a3f356548e34a70b0172d8890006c37be92995f62d95a07b4a42e90fba54272 \ + 
--hash=sha256:602d98f2c2d929f8e697ed274fbadc09902c4025c5a9963bf4e9edfc3ab6f7ed \ + --hash=sha256:61b1a825a13bef4a5f10b1885245377d3cd0bf87cba068e1d9a88c2ae36880e1 \ + --hash=sha256:61e5e68cb65ac8f547f6b5ef933f510134a6bf31bb178be428994b0cb46c2a04 \ + --hash=sha256:61ee62ead9b68b9123ec24bc866cbef297dd266175d53296e2db5e7f797f902d \ + --hash=sha256:6333c5a377c8e2f5fae35e7b8f145c617b02c939d04110c76f29ee3676b5f9a5 \ + --hash=sha256:6748dbf9bfa5ba1afcc7556b71cda0d7ce5f24768043a02a58846e4a443d808d \ + --hash=sha256:67a283dd2882ac98cc6318384f565bffc751ab564605959df4752d42483ad889 \ + --hash=sha256:75674776d96d7b851b6498f17824ba17849d790a44d282929c42dbb77d4f17ae \ + --hash=sha256:757e81cae69244257d125ff31663249b3013b5dc0a8520d73694aed497fb195b \ + --hash=sha256:77a6e85b90a7641d2e07184df5557132a337f136250caafc9ccaa4a2a998ca2c \ + --hash=sha256:7c33dd1931a95e5d9a772d0ac5e44cac8957eaf58e3c8da8c1414de7dd27c576 \ + --hash=sha256:7df647e8edd71f000a5208fe6ff8c382a1de8edfbccdbbfe649d263de07d8c34 \ + --hash=sha256:7e2ee16578af3b52ac2f334c3b1f92262f47e02cc6193c598502bd46f5cd1477 \ + --hash=sha256:80316a8bd5109320d38eef8833ccf5f89608c9107d02d2a7f985f98ed6876990 \ + --hash=sha256:82123d0c954dc58db301f5021a01854a85bf1f3bb7d12ae0c01afc414a882ca2 \ + --hash=sha256:84b2deecba4a3f1a398df819151eb72d29bfeb3b69abb145a00ddc8d30094512 \ + --hash=sha256:8503ad47387b8ebd39cbbbdf0bf113e17330ffd339ba1144074da24c545f0069 \ + --hash=sha256:877d209b6aebeb5b16c42cbb377f5f94d9e556626b1bfff66d7b0d115be88d0a \ + --hash=sha256:8874027a53e3aea659a6d62751800cf6e63314c160fd607489ba5c2edd753cf6 \ + --hash=sha256:88a19f62ff30117e706ebc9090b8ecc79aeb77d0b1f5ec10d2d27a12bc9f66d0 \ + --hash=sha256:8d39d351e7faf01483cc7ff7c0213c412e38e5a340238826be7e0e4da450fdc8 \ + --hash=sha256:90adb47ad432332d4f0bc28f83a5963f426ce9a1a8809f5e584e704b82685dcb \ + --hash=sha256:913829534200eb0f789d45349e55203a091f45c37a2674678744ae52fae23efa \ + --hash=sha256:93b2e109287f93db79210f86deb6b9bbb81ac32fc97236b16f7433db7fc437d8 \ + 
--hash=sha256:9d41beda9dc97ca9ab0b9888cb71f7539124bc05df02c0cff6e5acc5a19dcc6e \ + --hash=sha256:a440a2a624683108a1b454705ecd7afc1c3438a08e890a1513d468671d90a04e \ + --hash=sha256:a4bb030cf46a434ec0225bddbebd4b89e6471814ca851abb8696170adb163985 \ + --hash=sha256:a9ca04806f3be0ac6d558fffc2fdf8fcef767e0489d2684a21912cc4ed0cd1b8 \ + --hash=sha256:ac1801c45cbf77b6c99242eeff4fffb5e4e73a800b5c4ad4fc0be5def634d2e1 \ + --hash=sha256:ac36703a585e0929b032fbaab0707b75dc12703766d0b53486eabd5139ebadd5 \ + --hash=sha256:b1771de9944d875f1b98a745bc547e684b863abf8f8287da8466cf470ef52690 \ + --hash=sha256:b464c4ab4bfcb41e3bfd3f1c26600d038376c2de3297760dfe064d2cb7ea8e10 \ + --hash=sha256:b4f6450109834af88cb4cc5ecddfc5380ebb9c228695afc11915a0bf82116789 \ + --hash=sha256:b57f4f58099328dfb26c6a771d09fb20dbbae81d20cfb66141251ea063bd101b \ + --hash=sha256:b643562c12680b01e17239be267bc306bbc6aac1f34f6444d1bded0c5ce438ca \ + --hash=sha256:b958ddd075ddba5b09bb0be8a6d9906d2ce933aee81100db289badbeb966f54e \ + --hash=sha256:b9d60031cf568c627d028239693fd718025719c02c9f55df0a53e587aab951b5 \ + --hash=sha256:ba23302c0c61a9999784e73809427c9dbedd79f66a13d84ad1b1943802eaaf59 \ + --hash=sha256:ba87babd629f8af77f557b61e49e7c7cac36f22f871156b91e10a6e9d4f829e9 \ + --hash=sha256:c017a3b6df3a1bd45b9fa49a0f54005e53fbcad16633870104b66fa1a30a29d8 \ + --hash=sha256:c1e1cc06da1491e6734f0ea1e6294ce00792193c463350626571c287c9a704db \ + --hash=sha256:c654d5207c78e0bd6d749f6dae1dcbbfde3403ad3a4b11f3c5544d9906969dde \ + --hash=sha256:c69697d3adff5aa4f874b19c0e4ed65180ceed6318ec856ebc423aa5850d84f7 \ + --hash=sha256:c7d79f7d9aabd6011004e33b22bc13056a3e3fb54794d138af57f5ee9d9032cb \ + --hash=sha256:ccaa3a4b521b780a7e771cc336a2dba389a0861592bbce09a476190bb0c8b4b3 \ + --hash=sha256:ccd17349166b1bee6e529b4add61727d3f55edb7babbe4069b5764c9587a8cc6 \ + --hash=sha256:ce1af883b94304f493698b00d0f006d56aea98aeb49d75ec7d98cd4a777e9285 \ + --hash=sha256:d0e883008013c0e4aef84dcfe2a0b172c4d23c2669412cf5b3371003941f72bb \ + 
--hash=sha256:d980e0325b6eddc81331d3f4551e2a333999fb176fd153e075c6d1c2530aa8a8 \ + --hash=sha256:e17c9361d46a4d5addf777c6dd5eab0715a7684c2f11b88c67ac37edfba6c482 \ + --hash=sha256:e2c08cc9b16f4f4bc522771d96734c7901e7ebef70c6c5c35dd0f10845270bcd \ + --hash=sha256:e35ef8683211db69ffe129a25d5634319a677570ab6b2eba4afa860f54eeaf75 \ + --hash=sha256:e3b9fd71836999aad54084906f8663dffcd2a7fb5cdafd6c37713b2e72be1760 \ + --hash=sha256:ef9f7768395923c3039055c14334ba4d926f3baf7b776c923c93d80195624782 \ + --hash=sha256:f52a265001d830bc425f82ca9eabda94a64a4d753b07d623a9f2863fde532b53 \ + --hash=sha256:f91c4803173928a25e1a55b943c81f55b8872f0018be83e3ad4938adffb77dd2 \ + --hash=sha256:fbd6748e8ab9b41171bb95c6142faf068f5ef1511935a0aa07025438dd9a9bc1 \ + --hash=sha256:fe57328fbc1bfd0bd0514470ac692630f3901c0ee39052ae47acd1d90a436719 \ + --hash=sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62 + # via + # -c python/requirements_compiled.txt + # aiohttp +zipp==3.19.2 \ + --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c + # via + # -c python/requirements_compiled.txt + # importlib-metadata diff --git a/python/deplocks/ray_img/ray_img_py39.lock b/python/deplocks/ray_img/ray_img_py39.lock new file mode 100644 index 000000000000..9787b8fc5971 --- /dev/null +++ b/python/deplocks/ray_img/ray_img_py39.lock @@ -0,0 +1,2173 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile --generate-hashes --strip-extras --unsafe-package setuptools --index-url https://pypi.org/simple --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-best-match --no-strip-markers --emit-index-url --emit-find-links --python-version=3.9 --find-links=.whl/ -c python/requirements_compiled.txt - -o python/deplocks/ray_img/ray_img_py39.lock +--index-url https://pypi.org/simple +--extra-index-url https://download.pytorch.org/whl/cpu +--find-links .whl/ +--find-links https://data.pyg.org/whl/torch-2.3.0+cpu.html + 
+aiohappyeyeballs==2.6.1 \ + --hash=sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558 \ + --hash=sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8 + # via + # -c python/requirements_compiled.txt + # aiohttp +aiohttp==3.11.16 \ + --hash=sha256:004511d3413737700835e949433536a2fe95a7d0297edd911a1e9705c5b5ea43 \ + --hash=sha256:0902e887b0e1d50424112f200eb9ae3dfed6c0d0a19fc60f633ae5a57c809656 \ + --hash=sha256:09b00dd520d88eac9d1768439a59ab3d145065c91a8fab97f900d1b5f802895e \ + --hash=sha256:0a2f451849e6b39e5c226803dcacfa9c7133e9825dcefd2f4e837a2ec5a3bb98 \ + --hash=sha256:0a950c2eb8ff17361abd8c85987fd6076d9f47d040ebffce67dce4993285e973 \ + --hash=sha256:0ad1fb47da60ae1ddfb316f0ff16d1f3b8e844d1a1e154641928ea0583d486ed \ + --hash=sha256:13ceac2c5cdcc3f64b9015710221ddf81c900c5febc505dbd8f810e770011540 \ + --hash=sha256:14461157d8426bcb40bd94deb0450a6fa16f05129f7da546090cebf8f3123b0f \ + --hash=sha256:16f8a2c9538c14a557b4d309ed4d0a7c60f0253e8ed7b6c9a2859a7582f8b1b8 \ + --hash=sha256:17ae4664031aadfbcb34fd40ffd90976671fa0c0286e6c4113989f78bebab37a \ + --hash=sha256:1ce63ae04719513dd2651202352a2beb9f67f55cb8490c40f056cea3c5c355ce \ + --hash=sha256:23a15727fbfccab973343b6d1b7181bfb0b4aa7ae280f36fd2f90f5476805682 \ + --hash=sha256:2540ddc83cc724b13d1838026f6a5ad178510953302a49e6d647f6e1de82bc34 \ + --hash=sha256:37dcee4906454ae377be5937ab2a66a9a88377b11dd7c072df7a7c142b63c37c \ + --hash=sha256:38bea84ee4fe24ebcc8edeb7b54bf20f06fd53ce4d2cc8b74344c5b9620597fd \ + --hash=sha256:3ab3367bb7f61ad18793fea2ef71f2d181c528c87948638366bf1de26e239183 \ + --hash=sha256:3ad1d59fd7114e6a08c4814983bb498f391c699f3c78712770077518cae63ff7 \ + --hash=sha256:3b4e6db8dc4879015b9955778cfb9881897339c8fab7b3676f8433f849425913 \ + --hash=sha256:3e061b09f6fa42997cf627307f220315e313ece74907d35776ec4373ed718b86 \ + --hash=sha256:42864e70a248f5f6a49fdaf417d9bc62d6e4d8ee9695b24c5916cb4bb666c802 \ + 
--hash=sha256:493910ceb2764f792db4dc6e8e4b375dae1b08f72e18e8f10f18b34ca17d0979 \ + --hash=sha256:4d0c970c0d602b1017e2067ff3b7dac41c98fef4f7472ec2ea26fd8a4e8c2149 \ + --hash=sha256:54eb3aead72a5c19fad07219acd882c1643a1027fbcdefac9b502c267242f955 \ + --hash=sha256:56a3443aca82abda0e07be2e1ecb76a050714faf2be84256dae291182ba59049 \ + --hash=sha256:576f5ca28d1b3276026f7df3ec841ae460e0fc3aac2a47cbf72eabcfc0f102e1 \ + --hash=sha256:58ede86453a6cf2d6ce40ef0ca15481677a66950e73b0a788917916f7e35a0bb \ + --hash=sha256:61c721764e41af907c9d16b6daa05a458f066015abd35923051be8705108ed17 \ + --hash=sha256:634d96869be6c4dc232fc503e03e40c42d32cfaa51712aee181e922e61d74814 \ + --hash=sha256:696ef00e8a1f0cec5e30640e64eca75d8e777933d1438f4facc9c0cdf288a810 \ + --hash=sha256:69a2cbd61788d26f8f1e626e188044834f37f6ae3f937bd9f08b65fc9d7e514e \ + --hash=sha256:6a792ce34b999fbe04a7a71a90c74f10c57ae4c51f65461a411faa70e154154e \ + --hash=sha256:6ac13b71761e49d5f9e4d05d33683bbafef753e876e8e5a7ef26e937dd766713 \ + --hash=sha256:6fdec0213244c39973674ca2a7f5435bf74369e7d4e104d6c7473c81c9bcc8c4 \ + --hash=sha256:72b1b03fb4655c1960403c131740755ec19c5898c82abd3961c364c2afd59fe7 \ + --hash=sha256:745f1ed5e2c687baefc3c5e7b4304e91bf3e2f32834d07baaee243e349624b24 \ + --hash=sha256:776c8e959a01e5e8321f1dec77964cb6101020a69d5a94cd3d34db6d555e01f7 \ + --hash=sha256:780df0d837276276226a1ff803f8d0fa5f8996c479aeef52eb040179f3156cbd \ + --hash=sha256:78e6e23b954644737e385befa0deb20233e2dfddf95dd11e9db752bdd2a294d3 \ + --hash=sha256:7951decace76a9271a1ef181b04aa77d3cc309a02a51d73826039003210bdc86 \ + --hash=sha256:7ba92a2d9ace559a0a14b03d87f47e021e4fa7681dc6970ebbc7b447c7d4b7cd \ + --hash=sha256:7f6428fee52d2bcf96a8aa7b62095b190ee341ab0e6b1bcf50c615d7966fd45b \ + --hash=sha256:87944bd16b7fe6160607f6a17808abd25f17f61ae1e26c47a491b970fb66d8cb \ + --hash=sha256:87a6e922b2b2401e0b0cf6b976b97f11ec7f136bfed445e16384fbf6fd5e8602 \ + --hash=sha256:8cb0688a8d81c63d716e867d59a9ccc389e97ac7037ebef904c2b89334407180 \ + 
--hash=sha256:8df6612df74409080575dca38a5237282865408016e65636a76a2eb9348c2567 \ + --hash=sha256:911a6e91d08bb2c72938bc17f0a2d97864c531536b7832abee6429d5296e5b27 \ + --hash=sha256:92b7ee222e2b903e0a4b329a9943d432b3767f2d5029dbe4ca59fb75223bbe2e \ + --hash=sha256:938f756c2b9374bbcc262a37eea521d8a0e6458162f2a9c26329cc87fdf06534 \ + --hash=sha256:9756d9b9d4547e091f99d554fbba0d2a920aab98caa82a8fb3d3d9bee3c9ae85 \ + --hash=sha256:98b88a2bf26965f2015a771381624dd4b0839034b70d406dc74fd8be4cc053e3 \ + --hash=sha256:9b751a6306f330801665ae69270a8a3993654a85569b3469662efaad6cf5cc50 \ + --hash=sha256:a2a450bcce4931b295fc0848f384834c3f9b00edfc2150baafb4488c27953de6 \ + --hash=sha256:a3814760a1a700f3cfd2f977249f1032301d0a12c92aba74605cfa6ce9f78489 \ + --hash=sha256:a5abcbba9f4b463a45c8ca8b7720891200658f6f46894f79517e6cd11f3405ca \ + --hash=sha256:a6db7458ab89c7d80bc1f4e930cc9df6edee2200127cfa6f6e080cf619eddfbd \ + --hash=sha256:ad497f38a0d6c329cb621774788583ee12321863cd4bd9feee1effd60f2ad133 \ + --hash=sha256:ad9509ffb2396483ceacb1eee9134724443ee45b92141105a4645857244aecc8 \ + --hash=sha256:bbcba75fe879ad6fd2e0d6a8d937f34a571f116a0e4db37df8079e738ea95c71 \ + --hash=sha256:c10d85e81d0b9ef87970ecbdbfaeec14a361a7fa947118817fcea8e45335fa46 \ + --hash=sha256:c15b2271c44da77ee9d822552201180779e5e942f3a71fb74e026bf6172ff287 \ + --hash=sha256:ca37057625693d097543bd88076ceebeb248291df9d6ca8481349efc0b05dcd0 \ + --hash=sha256:cc3a145479a76ad0ed646434d09216d33d08eef0d8c9a11f5ae5cdc37caa3540 \ + --hash=sha256:ccf10f16ab498d20e28bc2b5c1306e9c1512f2840f7b6a67000a517a4b37d5ee \ + --hash=sha256:cd464ba806e27ee24a91362ba3621bfc39dbbb8b79f2e1340201615197370f7c \ + --hash=sha256:d007aa39a52d62373bd23428ba4a2546eed0e7643d7bf2e41ddcefd54519842c \ + --hash=sha256:d0666afbe984f6933fe72cd1f1c3560d8c55880a0bdd728ad774006eb4241ecd \ + --hash=sha256:d07502cc14ecd64f52b2a74ebbc106893d9a9717120057ea9ea1fd6568a747e7 \ + --hash=sha256:d489d9778522fbd0f8d6a5c6e48e3514f11be81cb0a5954bdda06f7e1594b321 \ + 
--hash=sha256:df7db76400bf46ec6a0a73192b14c8295bdb9812053f4fe53f4e789f3ea66bbb \ + --hash=sha256:e3538bc9fe1b902bef51372462e3d7c96fce2b566642512138a480b7adc9d508 \ + --hash=sha256:e87fd812899aa78252866ae03a048e77bd11b80fb4878ce27c23cade239b42b2 \ + --hash=sha256:ecdb8173e6c7aa09eee342ac62e193e6904923bd232e76b4157ac0bfa670609f \ + --hash=sha256:f244b8e541f414664889e2c87cac11a07b918cb4b540c36f7ada7bfa76571ea2 \ + --hash=sha256:f4065145bf69de124accdd17ea5f4dc770da0a6a6e440c53f6e0a8c27b3e635c \ + --hash=sha256:f420bfe862fb357a6d76f2065447ef6f484bc489292ac91e29bc65d2d7a2c84d \ + --hash=sha256:f6ddd90d9fb4b501c97a4458f1c1720e42432c26cb76d28177c5b5ad4e332601 \ + --hash=sha256:fa73e8c2656a3653ae6c307b3f4e878a21f87859a9afab228280ddccd7369d71 \ + --hash=sha256:fadbb8f1d4140825069db3fedbbb843290fd5f5bc0a5dbd7eaf81d91bf1b003b \ + --hash=sha256:fb3d0cc5cdb926090748ea60172fa8a213cec728bd6c54eae18b96040fcd6227 \ + --hash=sha256:fb46bb0f24813e6cede6cc07b1961d4b04f331f7112a23b5e21f567da4ee50aa \ + --hash=sha256:fd36c119c5d6551bce374fcb5c19269638f8d09862445f85a5a48596fd59f4bb + # via + # -c python/requirements_compiled.txt + # aiohttp-cors + # ray +aiohttp-cors==0.7.0 \ + --hash=sha256:0451ba59fdf6909d0e2cd21e4c0a43752bc0703d33fc78ae94d9d9321710193e \ + --hash=sha256:4d39c6d7100fd9764ed1caf8cebf0eb01bf5e3f24e2e073fda6234bc48b19f5d + # via + # -c python/requirements_compiled.txt + # ray +aiosignal==1.3.1 \ + --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ + --hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 + # via + # -c python/requirements_compiled.txt + # aiohttp +amqp==5.3.1 \ + --hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \ + --hash=sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432 + # via + # -c python/requirements_compiled.txt + # kombu +annotated-types==0.6.0 \ + --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ + 
--hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d + # via + # -c python/requirements_compiled.txt + # pydantic +anyio==3.7.1 \ + --hash=sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780 \ + --hash=sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5 + # via + # -c python/requirements_compiled.txt + # starlette + # watchfiles +async-timeout==4.0.3 ; python_full_version < '3.11' \ + --hash=sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f \ + --hash=sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028 + # via + # -c python/requirements_compiled.txt + # aiohttp +attrs==25.1.0 \ + --hash=sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e \ + --hash=sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a + # via + # -c python/requirements_compiled.txt + # aiohttp + # jsonschema + # referencing +billiard==4.2.1 \ + --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ + --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb + # via + # -c python/requirements_compiled.txt + # celery +cachetools==5.5.2 \ + --hash=sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4 \ + --hash=sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a + # via + # -c python/requirements_compiled.txt + # google-auth +celery==5.5.3 \ + --hash=sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525 \ + --hash=sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5 + # via + # -c python/requirements_compiled.txt + # ray +certifi==2025.1.31 \ + --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ + --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe + # via + # -c python/requirements_compiled.txt + # requests +cffi==1.16.0 ; platform_python_implementation != 'PyPy' \ + 
--hash=sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc \ + --hash=sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a \ + --hash=sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417 \ + --hash=sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab \ + --hash=sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520 \ + --hash=sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36 \ + --hash=sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743 \ + --hash=sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8 \ + --hash=sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed \ + --hash=sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684 \ + --hash=sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56 \ + --hash=sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324 \ + --hash=sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d \ + --hash=sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235 \ + --hash=sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e \ + --hash=sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088 \ + --hash=sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000 \ + --hash=sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7 \ + --hash=sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e \ + --hash=sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673 \ + --hash=sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c \ + --hash=sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe \ + --hash=sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2 \ + --hash=sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098 \ + 
--hash=sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8 \ + --hash=sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a \ + --hash=sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0 \ + --hash=sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b \ + --hash=sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896 \ + --hash=sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e \ + --hash=sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9 \ + --hash=sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2 \ + --hash=sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b \ + --hash=sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6 \ + --hash=sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404 \ + --hash=sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f \ + --hash=sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0 \ + --hash=sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4 \ + --hash=sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc \ + --hash=sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936 \ + --hash=sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba \ + --hash=sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872 \ + --hash=sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb \ + --hash=sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614 \ + --hash=sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1 \ + --hash=sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d \ + --hash=sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969 \ + --hash=sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b \ + 
--hash=sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4 \ + --hash=sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627 \ + --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ + --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 + # via + # -c python/requirements_compiled.txt + # cryptography +charset-normalizer==3.3.2 \ + --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ + --hash=sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087 \ + --hash=sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786 \ + --hash=sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8 \ + --hash=sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09 \ + --hash=sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185 \ + --hash=sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574 \ + --hash=sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e \ + --hash=sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519 \ + --hash=sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898 \ + --hash=sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269 \ + --hash=sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3 \ + --hash=sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f \ + --hash=sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6 \ + --hash=sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8 \ + --hash=sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a \ + --hash=sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73 \ + --hash=sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc \ + --hash=sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714 \ 
+ --hash=sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2 \ + --hash=sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc \ + --hash=sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce \ + --hash=sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d \ + --hash=sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e \ + --hash=sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6 \ + --hash=sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269 \ + --hash=sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96 \ + --hash=sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d \ + --hash=sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a \ + --hash=sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4 \ + --hash=sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77 \ + --hash=sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d \ + --hash=sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0 \ + --hash=sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed \ + --hash=sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068 \ + --hash=sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac \ + --hash=sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25 \ + --hash=sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8 \ + --hash=sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab \ + --hash=sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26 \ + --hash=sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2 \ + --hash=sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db \ + --hash=sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f \ + 
--hash=sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5 \ + --hash=sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99 \ + --hash=sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c \ + --hash=sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d \ + --hash=sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811 \ + --hash=sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa \ + --hash=sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a \ + --hash=sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03 \ + --hash=sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b \ + --hash=sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04 \ + --hash=sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c \ + --hash=sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001 \ + --hash=sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458 \ + --hash=sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389 \ + --hash=sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99 \ + --hash=sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985 \ + --hash=sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537 \ + --hash=sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238 \ + --hash=sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f \ + --hash=sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d \ + --hash=sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796 \ + --hash=sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a \ + --hash=sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143 \ + --hash=sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8 \ + 
--hash=sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c \ + --hash=sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5 \ + --hash=sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5 \ + --hash=sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711 \ + --hash=sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4 \ + --hash=sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6 \ + --hash=sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c \ + --hash=sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7 \ + --hash=sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4 \ + --hash=sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b \ + --hash=sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae \ + --hash=sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12 \ + --hash=sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c \ + --hash=sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae \ + --hash=sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8 \ + --hash=sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887 \ + --hash=sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b \ + --hash=sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4 \ + --hash=sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f \ + --hash=sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5 \ + --hash=sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33 \ + --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ + --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 + # via + # -c python/requirements_compiled.txt + # requests +click==8.1.7 \ + 
--hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ + --hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de + # via + # -c python/requirements_compiled.txt + # celery + # click-didyoumean + # click-plugins + # click-repl + # ray + # uvicorn +click-didyoumean==0.3.1 \ + --hash=sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463 \ + --hash=sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c + # via + # -c python/requirements_compiled.txt + # celery +click-plugins==1.1.1.2 \ + --hash=sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6 \ + --hash=sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261 + # via + # -c python/requirements_compiled.txt + # celery +click-repl==0.3.0 \ + --hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \ + --hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812 + # via + # -c python/requirements_compiled.txt + # celery +cloudpickle==2.2.0 \ + --hash=sha256:3f4219469c55453cfe4737e564b67c2a149109dabf7f242478948b895f61106f \ + --hash=sha256:7428798d5926d8fcbfd092d18d01a2a03daf8237d8fcdc8095d256b8490796f0 + # via + # -c python/requirements_compiled.txt + # gymnasium +colorful==0.5.5 \ + --hash=sha256:62c187e27c1433db9463ff93b1451898d1e7e23a7e553583fd9daeb6325182e4 \ + --hash=sha256:66f8c1264b2a26f7293b96a03bb7a76c4bc8b9634369a0bffdcd12d618056a1d + # via + # -c python/requirements_compiled.txt + # ray +cryptography==44.0.3 \ + --hash=sha256:02f55fb4f8b79c1221b0961488eaae21015b69b210e18c386b69de182ebb1259 \ + --hash=sha256:157f1f3b8d941c2bd8f3ffee0af9b049c9665c39d3da9db2dc338feca5e98a43 \ + --hash=sha256:192ed30fac1728f7587c6f4613c29c584abdc565d7417c13904708db10206645 \ + --hash=sha256:21a83f6f35b9cc656d71b5de8d519f566df01e660ac2578805ab245ffd8523f8 \ + --hash=sha256:25cd194c39fa5a0aa4169125ee27d1172097857b27109a45fadc59653ec06f44 \ + 
--hash=sha256:3883076d5c4cc56dbef0b898a74eb6992fdac29a7b9013870b34efe4ddb39a0d \ + --hash=sha256:3bb0847e6363c037df8f6ede57d88eaf3410ca2267fb12275370a76f85786a6f \ + --hash=sha256:3be3f649d91cb182c3a6bd336de8b61a0a71965bd13d1a04a0e15b39c3d5809d \ + --hash=sha256:3f07943aa4d7dad689e3bb1638ddc4944cc5e0921e3c227486daae0e31a05e54 \ + --hash=sha256:479d92908277bed6e1a1c69b277734a7771c2b78633c224445b5c60a9f4bc1d9 \ + --hash=sha256:4ffc61e8f3bf5b60346d89cd3d37231019c17a081208dfbbd6e1605ba03fa137 \ + --hash=sha256:5639c2b16764c6f76eedf722dbad9a0914960d3489c0cc38694ddf9464f1bb2f \ + --hash=sha256:58968d331425a6f9eedcee087f77fd3c927c88f55368f43ff7e0a19891f2642c \ + --hash=sha256:5d186f32e52e66994dce4f766884bcb9c68b8da62d61d9d215bfe5fb56d21334 \ + --hash=sha256:5d20cc348cca3a8aa7312f42ab953a56e15323800ca3ab0706b8cd452a3a056c \ + --hash=sha256:6866df152b581f9429020320e5eb9794c8780e90f7ccb021940d7f50ee00ae0b \ + --hash=sha256:7d5fe7195c27c32a64955740b949070f21cba664604291c298518d2e255931d2 \ + --hash=sha256:896530bc9107b226f265effa7ef3f21270f18a2026bc09fed1ebd7b66ddf6375 \ + --hash=sha256:962bc30480a08d133e631e8dfd4783ab71cc9e33d5d7c1e192f0b7c06397bb88 \ + --hash=sha256:978631ec51a6bbc0b7e58f23b68a8ce9e5f09721940933e9c217068388789fe5 \ + --hash=sha256:9b4d4a5dbee05a2c390bf212e78b99434efec37b17a4bff42f50285c5c8c9647 \ + --hash=sha256:ab0b005721cc0039e885ac3503825661bd9810b15d4f374e473f8c89b7d5460c \ + --hash=sha256:af653022a0c25ef2e3ffb2c673a50e5a0d02fecc41608f4954176f1933b12359 \ + --hash=sha256:b0cc66c74c797e1db750aaa842ad5b8b78e14805a9b5d1348dc603612d3e3ff5 \ + --hash=sha256:b424563394c369a804ecbee9b06dfb34997f19d00b3518e39f83a5642618397d \ + --hash=sha256:c138abae3a12a94c75c10499f1cbae81294a6f983b3af066390adee73f433028 \ + --hash=sha256:c6cd67722619e4d55fdb42ead64ed8843d64638e9c07f4011163e46bc512cf01 \ + --hash=sha256:c91fc8e8fd78af553f98bc7f2a1d8db977334e4eea302a4bfd75b9461c2d8904 \ + --hash=sha256:cad399780053fb383dc067475135e41c9fe7d901a97dd5d9c5dfb5611afc0d7d \ + 
--hash=sha256:cb90f60e03d563ca2445099edf605c16ed1d5b15182d21831f58460c48bffb93 \ + --hash=sha256:dad80b45c22e05b259e33ddd458e9e2ba099c86ccf4e88db7bbab4b747b18d06 \ + --hash=sha256:dd3db61b8fe5be220eee484a17233287d0be6932d056cf5738225b9c05ef4fff \ + --hash=sha256:e28d62e59a4dbd1d22e747f57d4f00c459af22181f0b2f787ea83f5a876d7c76 \ + --hash=sha256:e909df4053064a97f1e6565153ff8bb389af12c5c8d29c343308760890560aff \ + --hash=sha256:f3ffef566ac88f75967d7abd852ed5f182da252d23fac11b4766da3957766759 \ + --hash=sha256:fc3c9babc1e1faefd62704bb46a69f359a9819eb0292e40df3fb6e3574715cd4 \ + --hash=sha256:fe19d8bc5536a91a24a8133328880a41831b6c5df54599a8417b62fe015d3053 + # via + # -c python/requirements_compiled.txt + # pyopenssl +cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ + --hash=sha256:230f8a8e99c81a653baa0ed00819990c0ed1f0cf0298214786b5e323461dc61a \ + --hash=sha256:2d16eaa2d086e416ac13467d4ff3184b9a081fe76b761ce51d4a46ec1c4bd28a \ + --hash=sha256:432273fd4b61a284f7d705d08b8291403548fd422bcbd945635cc155bc6a923d \ + --hash=sha256:4c51a1062a3c5a826b0425952d229ffe73b1791656a31de95b318117e67a9576 \ + --hash=sha256:4c8e9fdb1f3ffc3151808f8bb8c871518d2783e1be8b53792b698a840543d60c \ + --hash=sha256:51b1d6cb83d82dfa306c9efaeb4d57f24bad3041ebd8716d61072676abbcf67b \ + --hash=sha256:52185a2cf95d3bac2c3fda95c9c8e06a985b5a00cd2e587d3caace337db33899 \ + --hash=sha256:5afb6658faa22f21479ae2c0a07254df31c0aebc36907a64a1f6be4ecc9e96da \ + --hash=sha256:d3dc91ef9c4104652195eea4b282d343ecad653021efe20d1c8dd8dfe8ccfd86 \ + --hash=sha256:d60d1e124592cb82a5f3f45b3e7bee7bda7b72a743029f275e9d6b125f338c60 \ + --hash=sha256:dac0284fecb90b5731f514e569a6fcf6674a730ae95b9490781a713b60a34423 \ + --hash=sha256:e7a25ef1b44ae6276b5105affc2289edb34f1aa6676babd5bcd80907348c4cfa + # via + # -c python/requirements_compiled.txt + # ray +distlib==0.3.7 \ + --hash=sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057 \ + 
--hash=sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8 + # via + # -c python/requirements_compiled.txt + # virtualenv +dm-tree==0.1.8 \ + --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ + --hash=sha256:09964470f76a5201aff2e8f9b26842976de7889300676f927930f6285e256760 \ + --hash=sha256:0d3172394079a86c3a759179c65f64c48d1a42b89495fcf38976d11cc3bb952c \ + --hash=sha256:0e9620ccf06393eb6b613b5e366469304622d4ea96ae6540b28a33840e6c89cf \ + --hash=sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430 \ + --hash=sha256:1607ce49aa42f010d1e5e616d92ce899d66835d4d8bea49679582435285515de \ + --hash=sha256:181c35521d480d0365f39300542cb6cd7fd2b77351bb43d7acfda15aef63b317 \ + --hash=sha256:1d7c26e431fc93cc7e0cba867eb000db6a05f6f2b25af11ac4e9dada88fc5bca \ + --hash=sha256:1fe962015b2fe1282892b28ebe962faed53c7f98d942da9a4625cbf27baef913 \ + --hash=sha256:250b692fb75f45f02e2f58fbef9ab338904ef334b90557565621fa251df267cf \ + --hash=sha256:2869228d9c619074de501a3c10dc7f07c75422f8fab36ecdcb859b6f1b1ec3ef \ + --hash=sha256:28c52cbf4f8b3dbd0beaedf44f69fa85eec5e9dede612e08035e06ada6ec9426 \ + --hash=sha256:2f7915660f59c09068e428613c480150180df1060561fd0d1470684ae7007bd1 \ + --hash=sha256:343a4a4ebaa127451ff971254a4be4084eb4bdc0b2513c32b46f6f728fd03f9e \ + --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ + --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ + --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ + --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ + --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ + --hash=sha256:75c5d528bb992981c20793b6b453e91560784215dffb8a5440ba999753c14ceb \ + --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ + 
--hash=sha256:81fce77f22a302d7a5968aebdf4efafef4def7ce96528719a354e6990dcd49c7 \ + --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ + --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ + --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ + --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ + --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ + --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ + --hash=sha256:b095ba4f8ca1ba19350fd53cf1f8f3eb0bd406aa28af64a6dfc86707b32a810a \ + --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ + --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ + --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ + --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ + --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ + --hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ + --hash=sha256:d20f2faa3672b52e5013f4077117bfb99c4cfc0b445d3bde1584c34032b57436 \ + --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ + --hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ + --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ + --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ + --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d + # via + # 
-c python/requirements_compiled.txt + # ray +exceptiongroup==1.3.0 ; python_full_version < '3.11' \ + --hash=sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10 \ + --hash=sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88 + # via anyio +farama-notifications==0.0.4 \ + --hash=sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18 \ + --hash=sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae + # via + # -c python/requirements_compiled.txt + # gymnasium +fastapi==0.115.12 \ + --hash=sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681 \ + --hash=sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d + # via + # -c python/requirements_compiled.txt + # ray +fastrlock==0.8.2 ; sys_platform != 'darwin' \ + --hash=sha256:067edb0a0805bf61e17a251d5046af59f6e9d2b8ad01222e0ef7a0b7937d5548 \ + --hash=sha256:07ed3c7b3867c05a3d6be4ced200c7767000f3431b9be6da66972822dd86e8be \ + --hash=sha256:08315bde19d0c2e6b06593d5a418be3dc8f9b1ee721afa96867b9853fceb45cf \ + --hash=sha256:11bbbbc526363955aeddb9eec4cee2a0012322b7b2f15b54f44454fcf4fd398a \ + --hash=sha256:17734e2e5af4c07ddb0fb10bd484e062c22de3be6b67940b9cc6ec2f18fa61ba \ + --hash=sha256:1b15430b93d7eb3d56f6ff690d2ebecb79ed0e58248427717eba150a508d1cd7 \ + --hash=sha256:1fed2f4797ad68e9982038423018cf08bec5f4ce9fed63a94a790773ed6a795c \ + --hash=sha256:2074548a335fcf7d19ebb18d9208da9e33b06f745754466a7e001d2b1c58dd19 \ + --hash=sha256:2587cedbb36c7988e707d83f0f1175c1f882f362b5ebbee25d70218ea33d220d \ + --hash=sha256:25945f962c7bd808415cfde3da624d4399d4ea71ed8918538375f16bceb79e1c \ + --hash=sha256:27786c62a400e282756ae1b090bcd7cfa35f28270cff65a9e7b27a5327a32561 \ + --hash=sha256:2c1719ddc8218b01e82fb2e82e8451bd65076cb96d7bef4477194bbb4305a968 \ + --hash=sha256:2d5595903444c854b99c42122b87edfe8a37cd698a4eae32f4fd1d2a7b6c115d \ + --hash=sha256:30bdbe4662992348132d03996700e1cf910d141d629179b967b146a22942264e \ + 
--hash=sha256:31a27a2edf482df72b91fe6c6438314d2c65290aa7becc55589d156c9b91f0da \ + --hash=sha256:320fd55bafee3eb069cfb5d6491f811a912758387ef2193840e2663e80e16f48 \ + --hash=sha256:33145acbad8317584cd64588131c7e1e286beef6280c0009b4544c91fce171d2 \ + --hash=sha256:43a241655e83e4603a152192cf022d5ca348c2f4e56dfb02e5c9c4c1a32f9cdb \ + --hash=sha256:4d63b6596368dab9e0cc66bf047e7182a56f33b34db141816a4f21f5bf958228 \ + --hash=sha256:4fb04442b6d1e2b36c774919c6bcbe3339c61b337261d4bd57e27932589095af \ + --hash=sha256:4fb2e77ff04bc4beb71d63c8e064f052ce5a6ea1e001d528d4d7f4b37d736f2e \ + --hash=sha256:5460c5ee6ced6d61ec8cd2324ebbe793a4960c4ffa2131ffff480e3b61c99ec5 \ + --hash=sha256:59344c1d46b7dec97d3f22f1cc930fafe8980b3c5bc9c9765c56738a5f1559e4 \ + --hash=sha256:5dfb78dd600a12f23fc0c3ec58f81336229fdc74501ecf378d1ce5b3f2f313ea \ + --hash=sha256:643e1e65b4f5b284427e61a894d876d10459820e93aa1e724dfb415117be24e0 \ + --hash=sha256:644ec9215cf9c4df8028d8511379a15d9c1af3e16d80e47f1b6fdc6ba118356a \ + --hash=sha256:66f2662c640bb71a1016a031eea6eef9d25c2bcdf7ffd1d1ddc5a58f9a1ced04 \ + --hash=sha256:685e656048b59d8dfde8c601f188ad53a4d719eb97080cafc8696cda6d75865e \ + --hash=sha256:7269bb3fc15587b0c191eecd95831d771a7d80f0c48929e560806b038ff3066c \ + --hash=sha256:73426f5eb2ecc10626c67cf86bd0af9e00d53e80e5c67d5ce8e18376d6abfa09 \ + --hash=sha256:75c07726c8b1a52147fd7987d6baaa318c5dced1416c3f25593e40f56e10755b \ + --hash=sha256:790fc19bccbd39426060047e53629f171a44745613bf360a045e9f9c8c4a2cea \ + --hash=sha256:7a2ccaf88ac0db153e84305d1ef0aa138cea82c6a88309066f6eaa3bc98636cd \ + --hash=sha256:87f4e01b042c84e6090dbc4fbe3415ddd69f6bc0130382323f9d3f1b8dd71b46 \ + --hash=sha256:88f079335e9da631efa64486c8207564a7bcd0c00526bb9e842e9d5b7e50a6cc \ + --hash=sha256:8c1c91a68926421f5ccbc82c85f83bd3ba593b121a46a1b9a554b3f0dd67a4bf \ + --hash=sha256:9121a894d74e65557e47e777060a495ab85f4b903e80dd73a3c940ba042920d7 \ + --hash=sha256:94e348c72a1fd1f8191f25ea056448e4f5a87b8fbf005b39d290dcb0581a48cd \ + 
--hash=sha256:98195866d3a9949915935d40a88e4f1c166e82e378f622c88025f2938624a90a \ + --hash=sha256:99dd6652bd6f730beadf74ef769d38c6bbd8ee6d1c15c8d138ea680b0594387f \ + --hash=sha256:9af691a9861027181d4de07ed74f0aee12a9650ac60d0a07f4320bff84b5d95f \ + --hash=sha256:a3b8b5d2935403f1b4b25ae324560e94b59593a38c0d2e7b6c9872126a9622ed \ + --hash=sha256:a3dcc876050b8f5cbc0ee84ef1e7f0c1dfe7c148f10098828bc4403683c33f10 \ + --hash=sha256:a74f5a92fa6e51c4f3c69b29c4662088b97be12f40652a21109605a175c81824 \ + --hash=sha256:ab91b0c36e95d42e1041a4907e3eefd06c482d53af3c7a77be7e214cc7cd4a63 \ + --hash=sha256:ad1bc61c7f6b0e58106aaab034916b6cb041757f708b07fbcdd9d6e1ac629225 \ + --hash=sha256:adcb9e77aa132cc6c9de2ffe7cf880a20aa8cdba21d367d1da1a412f57bddd5d \ + --hash=sha256:b22ea9bf5f9fad2b0077e944a7813f91593a4f61adf8faf734a70aed3f2b3a40 \ + --hash=sha256:b2a1c354f13f22b737621d914f3b4a8434ae69d3027a775e94b3e671756112f9 \ + --hash=sha256:b32fdf874868326351a75b1e4c02f97e802147119ae44c52d3d9da193ec34f5b \ + --hash=sha256:b3853ed4ce522598dc886160a7bab432a093051af85891fa2f5577c1dcac8ed6 \ + --hash=sha256:b443e73a4dfc7b6e0800ea4c13567b9694358e86f53bb2612a51c9e727cac67b \ + --hash=sha256:b4c9083ea89ab236b06e9ef2263971db3b4b507195fc7d5eecab95828dcae325 \ + --hash=sha256:b8ca0fe21458457077e4cb2d81e1ebdb146a00b3e9e2db6180a773f7ea905032 \ + --hash=sha256:c393af77c659a38bffbca215c0bcc8629ba4299568308dd7e4ff65d62cabed39 \ + --hash=sha256:c6bffa978793bea5e1b00e677062e53a62255439339591b70e209fa1552d5ee0 \ + --hash=sha256:ccf39ad5702e33e4d335b48ef9d56e21619b529b7f7471b5211419f380329b62 \ + --hash=sha256:cf81e0278b645004388873e0a1f9e3bc4c9ab8c18e377b14ed1a544be4b18c9a \ + --hash=sha256:d34546ad2e4a480b94b6797bcc5a322b3c705c4c74c3e4e545c4a3841c1b2d59 \ + --hash=sha256:d47713ffe6d4a627fbf078be9836a95ac106b4a0543e3841572c91e292a5d885 \ + --hash=sha256:d918dfe473291e8bfd8e13223ea5cb9b317bd9f50c280923776c377f7c64b428 \ + --hash=sha256:dbdce852e6bb66e1b8c36679d482971d69d93acf1785657522e51b7de30c3356 \ + 
--hash=sha256:dcc1bf0ac8a194313cf6e645e300a8a379674ceed8e0b1e910a2de3e3c28989e \ + --hash=sha256:dd961a32a7182c3891cdebca417fda67496d5d5de6ae636962254d22723bdf52 \ + --hash=sha256:ddf5d247f686aec853ddcc9a1234bfcc6f57b0a0670d2ad82fc25d8ae7e6a15f \ + --hash=sha256:e27c3cd27fbd25e5223c5c992b300cd4ee8f0a75c6f222ce65838138d853712c \ + --hash=sha256:e380ec4e6d8b26e389713995a43cb7fe56baea2d25fe073d4998c4821a026211 \ + --hash=sha256:e4bbde174a0aff5f6eeba75cf8c4c5d2a316316bc21f03a0bddca0fc3659a6f3 \ + --hash=sha256:e8b49b5743ede51e0bcf6805741f39f5e0e0fd6a172ba460cb39e3097ba803bb \ + --hash=sha256:e9904b5b37c3e5bb4a245c56bc4b7e497da57ffb8528f4fc39af9dcb168ee2e1 \ + --hash=sha256:ea96503b918fceaf40443182742b8964d47b65c5ebdea532893cb9479620000c \ + --hash=sha256:eb31fe390f03f7ae886dcc374f1099ec88526631a4cb891d399b68181f154ff0 \ + --hash=sha256:ebb32d776b61acd49f859a1d16b9e3d84e7b46d0d92aebd58acd54dc38e96664 \ + --hash=sha256:fb5363cf0fddd9b50525ddbf64a1e1b28ec4c6dfb28670a940cb1cf988a6786b \ + --hash=sha256:ff75c90663d6e8996610d435e71487daa853871ad1770dd83dc0f2fc4997241e + # via + # -c python/requirements_compiled.txt + # cupy-cuda12x +filelock==3.17.0 \ + --hash=sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338 \ + --hash=sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e + # via + # -c python/requirements_compiled.txt + # ray + # virtualenv +frozenlist==1.4.1 \ + --hash=sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7 \ + --hash=sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98 \ + --hash=sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad \ + --hash=sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5 \ + --hash=sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae \ + --hash=sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e \ + --hash=sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a \ + 
--hash=sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701 \ + --hash=sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d \ + --hash=sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6 \ + --hash=sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6 \ + --hash=sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106 \ + --hash=sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75 \ + --hash=sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868 \ + --hash=sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a \ + --hash=sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0 \ + --hash=sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1 \ + --hash=sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826 \ + --hash=sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec \ + --hash=sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6 \ + --hash=sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950 \ + --hash=sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19 \ + --hash=sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0 \ + --hash=sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8 \ + --hash=sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a \ + --hash=sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09 \ + --hash=sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86 \ + --hash=sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c \ + --hash=sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5 \ + --hash=sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b \ + --hash=sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b \ + 
--hash=sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d \ + --hash=sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0 \ + --hash=sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea \ + --hash=sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776 \ + --hash=sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a \ + --hash=sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897 \ + --hash=sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7 \ + --hash=sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09 \ + --hash=sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9 \ + --hash=sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe \ + --hash=sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd \ + --hash=sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742 \ + --hash=sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09 \ + --hash=sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0 \ + --hash=sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932 \ + --hash=sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1 \ + --hash=sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a \ + --hash=sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49 \ + --hash=sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d \ + --hash=sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7 \ + --hash=sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480 \ + --hash=sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89 \ + --hash=sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e \ + --hash=sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b \ + 
--hash=sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82 \ + --hash=sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb \ + --hash=sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068 \ + --hash=sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8 \ + --hash=sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b \ + --hash=sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb \ + --hash=sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2 \ + --hash=sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11 \ + --hash=sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b \ + --hash=sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc \ + --hash=sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0 \ + --hash=sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497 \ + --hash=sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17 \ + --hash=sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0 \ + --hash=sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2 \ + --hash=sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439 \ + --hash=sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5 \ + --hash=sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac \ + --hash=sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825 \ + --hash=sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887 \ + --hash=sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced \ + --hash=sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74 + # via + # -c python/requirements_compiled.txt + # aiohttp + # aiosignal +fsspec==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ 
+ --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 + # via + # -c python/requirements_compiled.txt + # ray +google-api-core==2.24.2 \ + --hash=sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9 \ + --hash=sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696 + # via + # -c python/requirements_compiled.txt + # opencensus +google-auth==2.23.4 \ + --hash=sha256:79905d6b1652187def79d491d6e23d0cbb3a21d3c7ba0dbaa9c8a01906b13ff3 \ + --hash=sha256:d4bbc92fe4b8bfd2f3e8d88e5ba7085935da208ee38a134fc280e7ce682a05f2 + # via + # -c python/requirements_compiled.txt + # google-api-core +googleapis-common-protos==1.61.0 \ + --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ + --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b + # via + # -c python/requirements_compiled.txt + # google-api-core +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + --hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + 
--hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + --hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + 
--hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e + # via ray +gymnasium==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a + # via + # -c python/requirements_compiled.txt + # ray +h11==0.16.0 \ + --hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \ + --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86 + # via + # -c python/requirements_compiled.txt + # uvicorn +httptools==0.6.4 \ + --hash=sha256:0614154d5454c21b6410fdf5262b4a3ddb0f53f1e1721cfd59d55f32138c578a \ + --hash=sha256:0e563e54979e97b6d13f1bbc05a96109923e76b901f786a5eae36e99c01237bd \ + --hash=sha256:16e603a3bff50db08cd578d54f07032ca1631450ceb972c2f834c2b860c28ea2 
\ + --hash=sha256:288cd628406cc53f9a541cfaf06041b4c71d751856bab45e3702191f931ccd17 \ + --hash=sha256:28908df1b9bb8187393d5b5db91435ccc9c8e891657f9cbb42a2541b44c82fc8 \ + --hash=sha256:322d20ea9cdd1fa98bd6a74b77e2ec5b818abdc3d36695ab402a0de8ef2865a3 \ + --hash=sha256:342dd6946aa6bda4b8f18c734576106b8a31f2fe31492881a9a160ec84ff4bd5 \ + --hash=sha256:345c288418f0944a6fe67be8e6afa9262b18c7626c3ef3c28adc5eabc06a68da \ + --hash=sha256:3c73ce323711a6ffb0d247dcd5a550b8babf0f757e86a52558fe5b86d6fefcc0 \ + --hash=sha256:40a5ec98d3f49904b9fe36827dcf1aadfef3b89e2bd05b0e35e94f97c2b14721 \ + --hash=sha256:40b0f7fe4fd38e6a507bdb751db0379df1e99120c65fbdc8ee6c1d044897a636 \ + --hash=sha256:40dc6a8e399e15ea525305a2ddba998b0af5caa2566bcd79dcbe8948181eeaff \ + --hash=sha256:4b36913ba52008249223042dca46e69967985fb4051951f94357ea681e1f5dc0 \ + --hash=sha256:4d87b29bd4486c0093fc64dea80231f7c7f7eb4dc70ae394d70a495ab8436071 \ + --hash=sha256:4e93eee4add6493b59a5c514da98c939b244fce4a0d8879cd3f466562f4b7d5c \ + --hash=sha256:59e724f8b332319e2875efd360e61ac07f33b492889284a3e05e6d13746876f4 \ + --hash=sha256:69422b7f458c5af875922cdb5bd586cc1f1033295aa9ff63ee196a87519ac8e1 \ + --hash=sha256:703c346571fa50d2e9856a37d7cd9435a25e7fd15e236c397bf224afaa355fe9 \ + --hash=sha256:85071a1e8c2d051b507161f6c3e26155b5c790e4e28d7f236422dbacc2a9cc44 \ + --hash=sha256:856f4bc0478ae143bad54a4242fccb1f3f86a6e1be5548fecfd4102061b3a083 \ + --hash=sha256:85797e37e8eeaa5439d33e556662cc370e474445d5fab24dcadc65a8ffb04003 \ + --hash=sha256:90d96a385fa941283ebd231464045187a31ad932ebfa541be8edf5b3c2328959 \ + --hash=sha256:94978a49b8f4569ad607cd4946b759d90b285e39c0d4640c6b36ca7a3ddf2efc \ + --hash=sha256:aafe0f1918ed07b67c1e838f950b1c1fabc683030477e60b335649b8020e1076 \ + --hash=sha256:ab9ba8dcf59de5181f6be44a77458e45a578fc99c31510b8c65b7d5acc3cf490 \ + --hash=sha256:ade273d7e767d5fae13fa637f4d53b6e961fb7fd93c7797562663f0171c26660 \ + --hash=sha256:b799de31416ecc589ad79dd85a0b2657a8fe39327944998dea368c1d4c9e55e6 \ + 
--hash=sha256:c26f313951f6e26147833fc923f78f95604bbec812a43e5ee37f26dc9e5a686c \ + --hash=sha256:ca80b7485c76f768a3bc83ea58373f8db7b015551117375e4918e2aa77ea9b50 \ + --hash=sha256:d1ffd262a73d7c28424252381a5b854c19d9de5f56f075445d33919a637e3547 \ + --hash=sha256:d3f0d369e7ffbe59c4b6116a44d6a8eb4783aae027f2c0b366cf0aa964185dba \ + --hash=sha256:d54efd20338ac52ba31e7da78e4a72570cf729fac82bc31ff9199bedf1dc7440 \ + --hash=sha256:dacdd3d10ea1b4ca9df97a0a303cbacafc04b5cd375fa98732678151643d4988 \ + --hash=sha256:db353d22843cf1028f43c3651581e4bb49374d85692a85f95f7b9a130e1b2cab \ + --hash=sha256:db78cb9ca56b59b016e64b6031eda5653be0589dba2b1b43453f6e8b405a0970 \ + --hash=sha256:deee0e3343f98ee8047e9f4c5bc7cedbf69f5734454a94c38ee829fb2d5fa3c1 \ + --hash=sha256:df017d6c780287d5c80601dafa31f17bddb170232d85c066604d8558683711a2 \ + --hash=sha256:df959752a0c2748a65ab5387d08287abf6779ae9165916fe053e68ae1fbdc47f \ + --hash=sha256:ec4f178901fa1834d4a060320d2f3abc5c9e39766953d038f1458cb885f47e81 \ + --hash=sha256:f47f8ed67cc0ff862b84a1189831d1d33c963fb3ce1ee0c65d3b0cbe7b711069 \ + --hash=sha256:f8787367fbdfccae38e35abf7641dafc5310310a5987b689f4c32cc8cc3ee975 \ + --hash=sha256:f9eb89ecf8b290f2e293325c646a211ff1c2493222798bb80a530c5e7502494f \ + --hash=sha256:fc411e1c0a7dcd2f902c7c48cf079947a7e65b5485dea9decb82b9105ca71a43 + # via uvicorn +idna==3.7 \ + --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ + --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 + # via + # -c python/requirements_compiled.txt + # anyio + # requests + # yarl +importlib-metadata==6.11.0 \ + --hash=sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443 \ + --hash=sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b + # via + # -c python/requirements_compiled.txt + # gymnasium + # opentelemetry-api +jinja2==3.1.6 ; sys_platform != 'win32' \ + --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d 
\ + --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 + # via + # -c python/requirements_compiled.txt + # memray +jsonschema==4.23.0 \ + --hash=sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4 \ + --hash=sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566 + # via + # -c python/requirements_compiled.txt + # ray +jsonschema-specifications==2024.10.1 \ + --hash=sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272 \ + --hash=sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf + # via + # -c python/requirements_compiled.txt + # jsonschema +kombu==5.5.4 \ + --hash=sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363 \ + --hash=sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8 + # via + # -c python/requirements_compiled.txt + # celery +lz4==4.3.3 \ + --hash=sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e \ + --hash=sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2 \ + --hash=sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0 \ + --hash=sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563 \ + --hash=sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f \ + --hash=sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa \ + --hash=sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d \ + --hash=sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61 \ + --hash=sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6 \ + --hash=sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2 \ + --hash=sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1 \ + --hash=sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809 \ + --hash=sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394 \ + 
--hash=sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2 \ + --hash=sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775 \ + --hash=sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f \ + --hash=sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba \ + --hash=sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc \ + --hash=sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd \ + --hash=sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c \ + --hash=sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24 \ + --hash=sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071 \ + --hash=sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201 \ + --hash=sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf \ + --hash=sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6 \ + --hash=sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21 \ + --hash=sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d \ + --hash=sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e \ + --hash=sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807 \ + --hash=sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7 \ + --hash=sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205 \ + --hash=sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604 \ + --hash=sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d \ + --hash=sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05 \ + --hash=sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0 \ + --hash=sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7 + # via + # -c python/requirements_compiled.txt + # ray +markdown-it-py==2.2.0 ; sys_platform 
!= 'win32' \ + --hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ + --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 + # via + # -c python/requirements_compiled.txt + # rich +markupsafe==2.1.3 ; sys_platform != 'win32' \ + --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ + --hash=sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686 \ + --hash=sha256:1577735524cdad32f9f694208aa75e422adba74f1baee7551620e43a3141f559 \ + --hash=sha256:1b40069d487e7edb2676d3fbdb2b0829ffa2cd63a2ec26c4938b2d34391b4ecc \ + --hash=sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb \ + --hash=sha256:2c1b19b3aaacc6e57b7e25710ff571c24d6c3613a45e905b1fde04d691b98ee0 \ + --hash=sha256:2ef12179d3a291be237280175b542c07a36e7f60718296278d8593d21ca937d4 \ + --hash=sha256:3c0fae6c3be832a0a0473ac912810b2877c8cb9d76ca48de1ed31e1c68386575 \ + --hash=sha256:3fd4abcb888d15a94f32b75d8fd18ee162ca0c064f35b11134be77050296d6ba \ + --hash=sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd \ + --hash=sha256:65c1a9bcdadc6c28eecee2c119465aebff8f7a584dd719facdd9e825ec61ab52 \ + --hash=sha256:68e78619a61ecf91e76aa3e6e8e33fc4894a2bebe93410754bd28fce0a8a4f9f \ + --hash=sha256:6b2b56950d93e41f33b4223ead100ea0fe11f8e6ee5f641eb753ce4b77a7042b \ + --hash=sha256:8023faf4e01efadfa183e863fefde0046de576c6f14659e8782065bcece22198 \ + --hash=sha256:8afafd99945ead6e075b973fefa56379c5b5c53fd8937dad92c662da5d8fd5ee \ + --hash=sha256:8c41976a29d078bb235fea9b2ecd3da465df42a562910f9022f1a03107bd02be \ + --hash=sha256:9dcdfd0eaf283af041973bff14a2e143b8bd64e069f4c383416ecd79a81aab58 \ + --hash=sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823 \ + --hash=sha256:ad9e82fb8f09ade1c3e1b996a6337afac2b8b9e365f926f5a61aacc71adc5b3c \ + --hash=sha256:b076b6226fb84157e3f7c971a47ff3a679d837cf338547532ab866c57930dbee \ + 
--hash=sha256:bfce63a9e7834b12b87c64d6b155fdd9b3b96191b6bd334bf37db7ff1fe457f2 \ + --hash=sha256:cd0f502fe016460680cd20aaa5a76d241d6f35a1c3350c474bac1273803893fa \ + --hash=sha256:e09031c87a1e51556fdcb46e5bd4f59dfb743061cf93c4d6831bf894f125eb57 \ + --hash=sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc \ + --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 + # via + # -c python/requirements_compiled.txt + # jinja2 +mdurl==0.1.2 ; sys_platform != 'win32' \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via + # -c python/requirements_compiled.txt + # markdown-it-py +memray==1.10.0 ; sys_platform != 'win32' \ + --hash=sha256:0a21745fb516b7a6efcd40aa7487c59e9313fcfc782d0193fcfcf00b48426874 \ + --hash=sha256:22f2a47871c172a0539bd72737bb6b294fc10c510464066b825d90fcd3bb4916 \ + --hash=sha256:23e8c402625cfb32d0e9edb5ec0945f3e5e54bc6b0c5699f6284302082b80bd4 \ + --hash=sha256:2ce59ef485db3634de98b3a026d2450fc0a875e3a58a9ea85f7a89098841defe \ + --hash=sha256:322ed0b69014a0969b777768d461a785203f81f9864386b666b5b26645d9c294 \ + --hash=sha256:38322e052b882790993412f1840517a51818aa55c47037f69915b2007f2c4cee \ + --hash=sha256:38393c86ce6d0a08e6ec0eb1401d49803b7c0c950c2565386751cdc81568cba8 \ + --hash=sha256:391aac6c9f744528d3186bc82d708a1acc83525778f804045d7c96f860f8ec98 \ + --hash=sha256:3a8bb7fbd8303c4f0017ba7faef6b88f904cda2931ed667cbf3b98f024b3bc44 \ + --hash=sha256:3c401c57f49c4c5f1fecaee1e746f537cdc6680da05fb963dc143bd08ee109bf \ + --hash=sha256:4eba29179772b4a2e440a065b320b03bc2e73fe2648bdf7936aa3b9a086fab4a \ + --hash=sha256:53a8f66af18b1f3bcf5c9f3c95ae4134dd675903a38f9d0e6341b7bca01b63d0 \ + --hash=sha256:566602b2143e06b3d592901d98c52ce4599e71aa2555146eeb5cec03506f9498 \ + --hash=sha256:663d463e89a64bae4a6b2f8c837d11a3d094834442d536a4165e1d31899a3500 \ + 
--hash=sha256:68bd8df023c8a32f44c11d997e5c536837e27c0955daf557d3a377edd55a1dd3 \ + --hash=sha256:6937d7ef67d18ccc01c3250cdf3b4ef1445b859ee8756f09e3d11bd3ff0c7d67 \ + --hash=sha256:6b311e91203be71e1a0ce5e4f978137765bcb1045f3bf5646129c83c5b96ab3c \ + --hash=sha256:6fd13ef666c7fced9768d1cfabf71dc6dfa6724935a8dff463495ac2dc5e13a4 \ + --hash=sha256:8196c684f1be8fe423e5cdd2356d4255a2cb482a1f3e89612b70d2a2862cf5bb \ + --hash=sha256:843a688877691746f9d1835cfa8a65139948471bdd78720435808d20bc30a1cc \ + --hash=sha256:85c32d6613d81b075f740e398c4d653e0803cd48e82c33dcd584c109d6782666 \ + --hash=sha256:898acd60f57a10dc5aaf1fd64aa2f821f0420114f3f60c3058083788603f173a \ + --hash=sha256:8d56f37a34125684746c13d24bd7a3fb17549b0bb355eb50969eb11e05e3ba62 \ + --hash=sha256:92c372cb262eddd23049f945ca9527f0e4cc7c40a070aade1802d066f680885b \ + --hash=sha256:95e563d9c976e429ad597ad2720d95cebbe8bac891a3082465439143e2740772 \ + --hash=sha256:9627184c926252c8f719c301f1fefe970f0d033c643a6448b93fed2889d1ea94 \ + --hash=sha256:a9e985fb7646b0475c303919d19211d2aa54e5a9e2cd2a102472299be5dbebd3 \ + --hash=sha256:b681519357d94f5f0857fbc6029e7c44d3f41436109e955a14fd312d8317bc35 \ + --hash=sha256:b75040f28e8678d0e9c4907d55c95cf26db8ef5adc9941a228f1b280a9efd9c0 \ + --hash=sha256:c3a14960838d89a91747885897d34134afb65883cc3b0ed7ff30fe1af00f9fe6 \ + --hash=sha256:c7aeb47174c42e99740a8e2b3b6fe0932c95d987258d48a746974ead19176c26 \ + --hash=sha256:ce22a887a585ef5020896de89ffc793e531b65ccc81fbafcc7886010c2c562b3 \ + --hash=sha256:cf6d683c4f8d25c6ad06ae18715f218983c5eb86803953615e902d632fdf6ec1 \ + --hash=sha256:e356af93e3b031c83957e9ac1a653f5aaba5df1e357dd17142f5ed19bb3dc660 \ + --hash=sha256:f16c5c8730b616613dc8bafe32649ca6bd7252606251eb00148582011758d0b5 + # via + # -c python/requirements_compiled.txt + # ray +msgpack==1.0.7 \ + --hash=sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862 \ + --hash=sha256:0bfdd914e55e0d2c9e1526de210f6fe8ffe9705f2b1dfcc4aecc92a4cb4b533d \ + 
--hash=sha256:1dc93e8e4653bdb5910aed79f11e165c85732067614f180f70534f056da97db3 \ + --hash=sha256:1e2d69948e4132813b8d1131f29f9101bc2c915f26089a6d632001a5c1349672 \ + --hash=sha256:235a31ec7db685f5c82233bddf9858748b89b8119bf4538d514536c485c15fe0 \ + --hash=sha256:27dcd6f46a21c18fa5e5deed92a43d4554e3df8d8ca5a47bf0615d6a5f39dbc9 \ + --hash=sha256:28efb066cde83c479dfe5a48141a53bc7e5f13f785b92ddde336c716663039ee \ + --hash=sha256:3476fae43db72bd11f29a5147ae2f3cb22e2f1a91d575ef130d2bf49afd21c46 \ + --hash=sha256:36e17c4592231a7dbd2ed09027823ab295d2791b3b1efb2aee874b10548b7524 \ + --hash=sha256:384d779f0d6f1b110eae74cb0659d9aa6ff35aaf547b3955abf2ab4c901c4819 \ + --hash=sha256:38949d30b11ae5f95c3c91917ee7a6b239f5ec276f271f28638dec9156f82cfc \ + --hash=sha256:3967e4ad1aa9da62fd53e346ed17d7b2e922cba5ab93bdd46febcac39be636fc \ + --hash=sha256:3e7bf4442b310ff154b7bb9d81eb2c016b7d597e364f97d72b1acc3817a0fdc1 \ + --hash=sha256:3f0c8c6dfa6605ab8ff0611995ee30d4f9fcff89966cf562733b4008a3d60d82 \ + --hash=sha256:484ae3240666ad34cfa31eea7b8c6cd2f1fdaae21d73ce2974211df099a95d81 \ + --hash=sha256:4a7b4f35de6a304b5533c238bee86b670b75b03d31b7797929caa7a624b5dda6 \ + --hash=sha256:4cb14ce54d9b857be9591ac364cb08dc2d6a5c4318c1182cb1d02274029d590d \ + --hash=sha256:4e71bc4416de195d6e9b4ee93ad3f2f6b2ce11d042b4d7a7ee00bbe0358bd0c2 \ + --hash=sha256:52700dc63a4676669b341ba33520f4d6e43d3ca58d422e22ba66d1736b0a6e4c \ + --hash=sha256:572efc93db7a4d27e404501975ca6d2d9775705c2d922390d878fcf768d92c87 \ + --hash=sha256:576eb384292b139821c41995523654ad82d1916da6a60cff129c715a6223ea84 \ + --hash=sha256:5b0bf0effb196ed76b7ad883848143427a73c355ae8e569fa538365064188b8e \ + --hash=sha256:5b6ccc0c85916998d788b295765ea0e9cb9aac7e4a8ed71d12e7d8ac31c23c95 \ + --hash=sha256:5ed82f5a7af3697b1c4786053736f24a0efd0a1b8a130d4c7bfee4b9ded0f08f \ + --hash=sha256:6d4c80667de2e36970ebf74f42d1088cc9ee7ef5f4e8c35eee1b40eafd33ca5b \ + --hash=sha256:730076207cb816138cf1af7f7237b208340a2c5e749707457d70705715c93b93 \ + 
--hash=sha256:7687e22a31e976a0e7fc99c2f4d11ca45eff652a81eb8c8085e9609298916dcf \ + --hash=sha256:822ea70dc4018c7e6223f13affd1c5c30c0f5c12ac1f96cd8e9949acddb48a61 \ + --hash=sha256:84b0daf226913133f899ea9b30618722d45feffa67e4fe867b0b5ae83a34060c \ + --hash=sha256:85765fdf4b27eb5086f05ac0491090fc76f4f2b28e09d9350c31aac25a5aaff8 \ + --hash=sha256:8dd178c4c80706546702c59529ffc005681bd6dc2ea234c450661b205445a34d \ + --hash=sha256:8f5b234f567cf76ee489502ceb7165c2a5cecec081db2b37e35332b537f8157c \ + --hash=sha256:98bbd754a422a0b123c66a4c341de0474cad4a5c10c164ceed6ea090f3563db4 \ + --hash=sha256:993584fc821c58d5993521bfdcd31a4adf025c7d745bbd4d12ccfecf695af5ba \ + --hash=sha256:a40821a89dc373d6427e2b44b572efc36a2778d3f543299e2f24eb1a5de65415 \ + --hash=sha256:b291f0ee7961a597cbbcc77709374087fa2a9afe7bdb6a40dbbd9b127e79afee \ + --hash=sha256:b573a43ef7c368ba4ea06050a957c2a7550f729c31f11dd616d2ac4aba99888d \ + --hash=sha256:b610ff0f24e9f11c9ae653c67ff8cc03c075131401b3e5ef4b82570d1728f8a9 \ + --hash=sha256:bdf38ba2d393c7911ae989c3bbba510ebbcdf4ecbdbfec36272abe350c454075 \ + --hash=sha256:bfef2bb6ef068827bbd021017a107194956918ab43ce4d6dc945ffa13efbc25f \ + --hash=sha256:cab3db8bab4b7e635c1c97270d7a4b2a90c070b33cbc00c99ef3f9be03d3e1f7 \ + --hash=sha256:cb70766519500281815dfd7a87d3a178acf7ce95390544b8c90587d76b227681 \ + --hash=sha256:cca1b62fe70d761a282496b96a5e51c44c213e410a964bdffe0928e611368329 \ + --hash=sha256:ccf9a39706b604d884d2cb1e27fe973bc55f2890c52f38df742bc1d79ab9f5e1 \ + --hash=sha256:dc43f1ec66eb8440567186ae2f8c447d91e0372d793dfe8c222aec857b81a8cf \ + --hash=sha256:dd632777ff3beaaf629f1ab4396caf7ba0bdd075d948a69460d13d44357aca4c \ + --hash=sha256:e45ae4927759289c30ccba8d9fdce62bb414977ba158286b5ddaf8df2cddb5c5 \ + --hash=sha256:e50ebce52f41370707f1e21a59514e3375e3edd6e1832f5e5235237db933c98b \ + --hash=sha256:ebbbba226f0a108a7366bf4b59bf0f30a12fd5e75100c630267d94d7f0ad20e5 \ + --hash=sha256:ec79ff6159dffcc30853b2ad612ed572af86c92b5168aa3fc01a67b0fa40665e \ + 
--hash=sha256:f0936e08e0003f66bfd97e74ee530427707297b0d0361247e9b4f59ab78ddc8b \ + --hash=sha256:f26a07a6e877c76a88e3cecac8531908d980d3d5067ff69213653649ec0f60ad \ + --hash=sha256:f64e376cd20d3f030190e8c32e1c64582eba56ac6dc7d5b0b49a9d44021b52fd \ + --hash=sha256:f6ffbc252eb0d229aeb2f9ad051200668fc3a9aaa8994e49f0cb2ffe2b7867e7 \ + --hash=sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002 \ + --hash=sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc + # via + # -c python/requirements_compiled.txt + # ray +multidict==6.0.5 \ + --hash=sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556 \ + --hash=sha256:0275e35209c27a3f7951e1ce7aaf93ce0d163b28948444bec61dd7badc6d3f8c \ + --hash=sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29 \ + --hash=sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b \ + --hash=sha256:09a892e4a9fb47331da06948690ae38eaa2426de97b4ccbfafbdcbe5c8f37ff8 \ + --hash=sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7 \ + --hash=sha256:107c0cdefe028703fb5dafe640a409cb146d44a6ae201e55b35a4af8e95457dd \ + --hash=sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40 \ + --hash=sha256:14c2976aa9038c2629efa2c148022ed5eb4cb939e15ec7aace7ca932f48f9ba6 \ + --hash=sha256:19fe01cea168585ba0f678cad6f58133db2aa14eccaf22f88e4a6dccadfad8b3 \ + --hash=sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c \ + --hash=sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9 \ + --hash=sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5 \ + --hash=sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae \ + --hash=sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442 \ + --hash=sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9 \ + --hash=sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc \ + 
--hash=sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c \ + --hash=sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea \ + --hash=sha256:2ffc42c922dbfddb4a4c3b438eb056828719f07608af27d163191cb3e3aa6cc5 \ + --hash=sha256:37b15024f864916b4951adb95d3a80c9431299080341ab9544ed148091b53f50 \ + --hash=sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182 \ + --hash=sha256:3d25f19500588cbc47dc19081d78131c32637c25804df8414463ec908631e453 \ + --hash=sha256:403c0911cd5d5791605808b942c88a8155c2592e05332d2bf78f18697a5fa15e \ + --hash=sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600 \ + --hash=sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733 \ + --hash=sha256:435a0984199d81ca178b9ae2c26ec3d49692d20ee29bc4c11a2a8d4514c67eda \ + --hash=sha256:4a6a4f196f08c58c59e0b8ef8ec441d12aee4125a7d4f4fef000ccb22f8d7241 \ + --hash=sha256:4cc0ef8b962ac7a5e62b9e826bd0cd5040e7d401bc45a6835910ed699037a461 \ + --hash=sha256:51d035609b86722963404f711db441cf7134f1889107fb171a970c9701f92e1e \ + --hash=sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e \ + --hash=sha256:55205d03e8a598cfc688c71ca8ea5f66447164efff8869517f175ea632c7cb7b \ + --hash=sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e \ + --hash=sha256:5cb241881eefd96b46f89b1a056187ea8e9ba14ab88ba632e68d7a2ecb7aadf7 \ + --hash=sha256:60d698e8179a42ec85172d12f50b1668254628425a6bd611aba022257cac1386 \ + --hash=sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd \ + --hash=sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9 \ + --hash=sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf \ + --hash=sha256:69db76c09796b313331bb7048229e3bee7928eb62bab5e071e9f7fcc4879caee \ + --hash=sha256:6bf7a982604375a8d49b6cc1b781c1747f243d91b81035a9b43a2126c04766f5 \ + --hash=sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a \ + 
--hash=sha256:76c0de87358b192de7ea9649beb392f107dcad9ad27276324c24c91774ca5271 \ + --hash=sha256:76f067f5121dcecf0d63a67f29080b26c43c71a98b10c701b0677e4a065fbd54 \ + --hash=sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4 \ + --hash=sha256:79660376075cfd4b2c80f295528aa6beb2058fd289f4c9252f986751a4cd0496 \ + --hash=sha256:79a6d2ba910adb2cbafc95dad936f8b9386e77c84c35bc0add315b856d7c3abb \ + --hash=sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319 \ + --hash=sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3 \ + --hash=sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f \ + --hash=sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527 \ + --hash=sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed \ + --hash=sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604 \ + --hash=sha256:92d16a3e275e38293623ebf639c471d3e03bb20b8ebb845237e0d3664914caef \ + --hash=sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8 \ + --hash=sha256:9fe7b0653ba3d9d65cbe7698cca585bf0f8c83dbbcc710db9c90f478e175f2d5 \ + --hash=sha256:a3145cb08d8625b2d3fee1b2d596a8766352979c9bffe5d7833e0503d0f0b5e5 \ + --hash=sha256:aeaf541ddbad8311a87dd695ed9642401131ea39ad7bc8cf3ef3967fd093b626 \ + --hash=sha256:b55358304d7a73d7bdf5de62494aaf70bd33015831ffd98bc498b433dfe5b10c \ + --hash=sha256:b82cc8ace10ab5bd93235dfaab2021c70637005e1ac787031f4d1da63d493c1d \ + --hash=sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c \ + --hash=sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc \ + --hash=sha256:c9bf56195c6bbd293340ea82eafd0071cb3d450c703d2c93afb89f93b8386ccc \ + --hash=sha256:cbebcd5bcaf1eaf302617c114aa67569dd3f090dd0ce8ba9e35e9985b41ac35b \ + --hash=sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38 \ + --hash=sha256:ceb3b7e6a0135e092de86110c5a74e46bda4bd4fbfeeb3a3bcec79c0f861e450 \ + 
--hash=sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1 \ + --hash=sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f \ + --hash=sha256:d65f25da8e248202bd47445cec78e0025c0fe7582b23ec69c3b27a640dd7a8e3 \ + --hash=sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755 \ + --hash=sha256:d84a5c3a5f7ce6db1f999fb9438f686bc2e09d38143f2d93d8406ed2dd6b9226 \ + --hash=sha256:d946b0a9eb8aaa590df1fe082cee553ceab173e6cb5b03239716338629c50c7a \ + --hash=sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046 \ + --hash=sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf \ + --hash=sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479 \ + --hash=sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e \ + --hash=sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1 \ + --hash=sha256:e4428b29611e989719874670fd152b6625500ad6c686d464e99f5aaeeaca175a \ + --hash=sha256:e4972624066095e52b569e02b5ca97dbd7a7ddd4294bf4e7247d52635630dd83 \ + --hash=sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929 \ + --hash=sha256:e8e94e6912639a02ce173341ff62cc1201232ab86b8a8fcc05572741a5dc7d93 \ + --hash=sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a \ + --hash=sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c \ + --hash=sha256:eca2e9d0cc5a889850e9bbd68e98314ada174ff6ccd1129500103df7a94a7a44 \ + --hash=sha256:edd08e6f2f1a390bf137080507e44ccc086353c8e98c657e666c017718561b89 \ + --hash=sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba \ + --hash=sha256:f2a1dee728b52b33eebff5072817176c172050d44d67befd681609b4746e1c2e \ + --hash=sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da \ + --hash=sha256:fb616be3538599e797a2017cccca78e354c767165e8858ab5116813146041a24 \ + --hash=sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423 \ + 
--hash=sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef + # via + # -c python/requirements_compiled.txt + # aiohttp + # yarl +numpy==1.26.4 \ + --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ + --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ + --hash=sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20 \ + --hash=sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0 \ + --hash=sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010 \ + --hash=sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a \ + --hash=sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea \ + --hash=sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c \ + --hash=sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71 \ + --hash=sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110 \ + --hash=sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be \ + --hash=sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a \ + --hash=sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a \ + --hash=sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5 \ + --hash=sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed \ + --hash=sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd \ + --hash=sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c \ + --hash=sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e \ + --hash=sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0 \ + --hash=sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c \ + --hash=sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a \ + --hash=sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b \ + 
--hash=sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0 \ + --hash=sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6 \ + --hash=sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2 \ + --hash=sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a \ + --hash=sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30 \ + --hash=sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218 \ + --hash=sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5 \ + --hash=sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07 \ + --hash=sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2 \ + --hash=sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4 \ + --hash=sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764 \ + --hash=sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef \ + --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ + --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f + # via + # -c python/requirements_compiled.txt + # cupy-cuda12x + # gymnasium + # pandas + # ray + # scipy + # tensorboardx +opencensus==0.11.4 \ + --hash=sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 \ + --hash=sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2 + # via + # -c python/requirements_compiled.txt + # ray +opencensus-context==0.1.3 \ + --hash=sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039 \ + --hash=sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c + # via + # -c python/requirements_compiled.txt + # opencensus +opentelemetry-api==1.34.1 \ + --hash=sha256:64f0bd06d42824843731d05beea88d4d4b6ae59f9fe347ff7dfa2cc14233bbb3 \ + --hash=sha256:b7df4cb0830d5a6c29ad0c0691dbae874d8daefa934b8b1d642de48323d32a8c + # via + # -c 
python/requirements_compiled.txt + # opentelemetry-exporter-prometheus + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-prometheus==0.55b1 \ + --hash=sha256:d13ec0b22bf394113ff1ada5da98133a4b051779b803dae183188e26c4bd9ee0 \ + --hash=sha256:f364fbbff9e5de37a112ff104d1185fb1d7e2046c5ab5911e5afebc7ab3ddf0e + # via + # -c python/requirements_compiled.txt + # ray +opentelemetry-proto==1.27.0 \ + --hash=sha256:33c9345d91dafd8a74fc3d7576c5a38f18b7fdf8d02983ac67485386132aedd6 \ + --hash=sha256:b133873de5581a50063e1e4b29cdcf0c5e253a8c2d8dc1229add20a4c3830ace + # via + # -c python/requirements_compiled.txt + # ray +opentelemetry-sdk==1.34.1 \ + --hash=sha256:308effad4059562f1d92163c61c8141df649da24ce361827812c40abb2a1e96e \ + --hash=sha256:8091db0d763fcd6098d4781bbc80ff0971f94e260739aa6afe6fd379cdf3aa4d + # via + # -c python/requirements_compiled.txt + # opentelemetry-exporter-prometheus + # ray +opentelemetry-semantic-conventions==0.55b1 \ + --hash=sha256:5da81dfdf7d52e3d37f8fe88d5e771e191de924cfff5f550ab0b8f7b2409baed \ + --hash=sha256:ef95b1f009159c28d7a7849f5cbc71c4c34c845bb514d66adfdf1b3fff3598b3 + # via + # -c python/requirements_compiled.txt + # opentelemetry-sdk +ormsgpack==1.7.0 \ + --hash=sha256:0d88307ab45d95416ce4071b1b99326ca31362af01c3d206f15a0551a7a874bd \ + --hash=sha256:22418a4d399027a72fb2e6b873559b1886cf2e63323ca7afc17b222c454413b7 \ + --hash=sha256:2c22c62a6bc93bcb194b7f91864ca0b39455b2cbbfc1538a3da0f9ec3c11d184 \ + --hash=sha256:3a6a97937d2cf21496d7689b90a43df83c5062bbe846aaa39197cc9ad73eaa7b \ + --hash=sha256:462089a419dbde654915ccb0b859c0dbe3c178b0ac580018e82befea6ccd73f4 \ + --hash=sha256:4b353204e99b56c1d33f1cf4767bd1fe1195596181a1cc789f25aa26c0b50f3d \ + --hash=sha256:5ec763096d978d35eedcef0af13991a10741717c2e236b26f4c2047b0740ea7b \ + --hash=sha256:5fefa1ca842dbba258401ea958113fe62c6b70a7a4d46edac440113f68dc431e \ + --hash=sha256:65525438b4a8b3b64ccfcda25e758ea3db392d1c206b5e09ef70efbbafa6dbf9 \ + 
--hash=sha256:6b4c98839cb7fc2a212037d2258f3a22857155249eb293d45c45cb974cfba834 \ + --hash=sha256:6d114652dadd81802b8a35a49e07a3e9ef2a47aed6123fb5031f2220d1c8e434 \ + --hash=sha256:77bc2ea387d85cfad045b9bcb8040bae43ad32dafe9363360f732cc19d489bbe \ + --hash=sha256:7e6ada21f5c7a20ff7cf9b061c44e3814352f819947a12022ad8cb52a9f2a809 \ + --hash=sha256:8d301e47565fe0e52a60052e730a9bb7669dfbd2a94643b8be925e3928c64c15 \ + --hash=sha256:90aabfd816db60dadab1100d583d061e0238209015bf684f8170c0fca4eb445a \ + --hash=sha256:91ebb7d3609db249cdff629ffef83ec3d025b1384749a297cf3b6a8240cf22ac \ + --hash=sha256:97723786755a7df85fcf6e68d7b5359dacea98d5c26b1d9af219a3cc05df4734 \ + --hash=sha256:9b0945523ccc75aa6907f38f2240d36818618baccb8633923bd7740a5a929e67 \ + --hash=sha256:a0ca6a64d47073f22ecc1dd96b384e44f98796d3f88ee383e92dfbcdf18c2efd \ + --hash=sha256:a5e12b51a590be47ccef67907905653e679fc2f920854b456edc216690ecc09c \ + --hash=sha256:a8fbe7bb50ee8381df030823d9366984fac718447947c2327969405d1d799b95 \ + --hash=sha256:c683071bf4527ffa7b6cfcf28f750d1a82eb77846d106743c09261ab1b79b193 \ + --hash=sha256:ca4d35b694f32112eb33ac0b733cb903dbbc59f019d05ca3d74f6ad2f587b0bf \ + --hash=sha256:e8385181bf195af80fc270e64fd477f1c414ffb05837320382e2ec9ca34be0ec \ + --hash=sha256:e86124cdbc8ed249806347c2fba96843e8941122b161b429139a0c973d270de4 \ + --hash=sha256:f9967a7f3647ad118751abf090f8397fda3e4bca6833340cab95a3f2bec598cd + # via + # -c python/requirements_compiled.txt + # ray +packaging==23.0 \ + --hash=sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2 \ + --hash=sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97 + # via + # -c python/requirements_compiled.txt + # kombu + # ray + # tensorboardx +pandas==1.5.3 \ + --hash=sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813 \ + --hash=sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792 \ + --hash=sha256:3749077d86e3a2f0ed51367f30bf5b82e131cc0f14260c4d3e499186fccc4406 \ + 
--hash=sha256:41179ce559943d83a9b4bbacb736b04c928b095b5f25dd2b7389eda08f46f373 \ + --hash=sha256:478ff646ca42b20376e4ed3fa2e8d7341e8a63105586efe54fa2508ee087f328 \ + --hash=sha256:50869a35cbb0f2e0cd5ec04b191e7b12ed688874bd05dd777c19b28cbea90996 \ + --hash=sha256:565fa34a5434d38e9d250af3c12ff931abaf88050551d9fbcdfafca50d62babf \ + --hash=sha256:5f2b952406a1588ad4cad5b3f55f520e82e902388a6d5a4a91baa8d38d23c7f6 \ + --hash=sha256:5fbcb19d6fceb9e946b3e23258757c7b225ba450990d9ed63ccceeb8cae609f7 \ + --hash=sha256:6973549c01ca91ec96199e940495219c887ea815b2083722821f1d7abfa2b4dc \ + --hash=sha256:74a3fd7e5a7ec052f183273dc7b0acd3a863edf7520f5d3a1765c04ffdb3b0b1 \ + --hash=sha256:7a0a56cef15fd1586726dace5616db75ebcfec9179a3a55e78f72c5639fa2a23 \ + --hash=sha256:7cec0bee9f294e5de5bbfc14d0573f65526071029d036b753ee6507d2a21480a \ + --hash=sha256:87bd9c03da1ac870a6d2c8902a0e1fd4267ca00f13bc494c9e5a9020920e1d51 \ + --hash=sha256:972d8a45395f2a2d26733eb8d0f629b2f90bebe8e8eddbb8829b180c09639572 \ + --hash=sha256:9842b6f4b8479e41968eced654487258ed81df7d1c9b7b870ceea24ed9459b31 \ + --hash=sha256:9f69c4029613de47816b1bb30ff5ac778686688751a5e9c99ad8c7031f6508e5 \ + --hash=sha256:a50d9a4336a9621cab7b8eb3fb11adb82de58f9b91d84c2cd526576b881a0c5a \ + --hash=sha256:bc4c368f42b551bf72fac35c5128963a171b40dce866fb066540eeaf46faa003 \ + --hash=sha256:c39a8da13cede5adcd3be1182883aea1c925476f4e84b2807a46e2775306305d \ + --hash=sha256:c3ac844a0fe00bfaeb2c9b51ab1424e5c8744f89860b138434a363b1f620f354 \ + --hash=sha256:c4c00e0b0597c8e4f59e8d461f797e5d70b4d025880516a8261b2817c47759ee \ + --hash=sha256:c74a62747864ed568f5a82a49a23a8d7fe171d0c69038b38cedf0976831296fa \ + --hash=sha256:dd05f7783b3274aa206a1af06f0ceed3f9b412cf665b7247eacd83be41cf7bf0 \ + --hash=sha256:dfd681c5dc216037e0b0a2c821f5ed99ba9f03ebcf119c7dac0e9a7b960b9ec9 \ + --hash=sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae \ + --hash=sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc + # via + # 
-c python/requirements_compiled.txt + # ray +platformdirs==3.11.0 \ + --hash=sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3 \ + --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e + # via + # -c python/requirements_compiled.txt + # virtualenv +prometheus-client==0.19.0 \ + --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ + --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 + # via + # -c python/requirements_compiled.txt + # opentelemetry-exporter-prometheus + # ray +prompt-toolkit==3.0.41 \ + --hash=sha256:941367d97fc815548822aa26c2a269fdc4eb21e9ec05fc5d447cf09bad5d75f0 \ + --hash=sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2 + # via + # -c python/requirements_compiled.txt + # click-repl +propcache==0.3.0 \ + --hash=sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e \ + --hash=sha256:03acd9ff19021bd0567582ac88f821b66883e158274183b9e5586f678984f8fe \ + --hash=sha256:03c091bb752349402f23ee43bb2bff6bd80ccab7c9df6b88ad4322258d6960fc \ + --hash=sha256:07700939b2cbd67bfb3b76a12e1412405d71019df00ca5697ce75e5ef789d829 \ + --hash=sha256:0c3e893c4464ebd751b44ae76c12c5f5c1e4f6cbd6fbf67e3783cd93ad221863 \ + --hash=sha256:119e244ab40f70a98c91906d4c1f4c5f2e68bd0b14e7ab0a06922038fae8a20f \ + --hash=sha256:11ae6a8a01b8a4dc79093b5d3ca2c8a4436f5ee251a9840d7790dccbd96cb649 \ + --hash=sha256:15010f29fbed80e711db272909a074dc79858c6d28e2915704cfc487a8ac89c6 \ + --hash=sha256:19d36bb351ad5554ff20f2ae75f88ce205b0748c38b146c75628577020351e3c \ + --hash=sha256:1c8f7d896a16da9455f882870a507567d4f58c53504dc2d4b1e1d386dfe4588a \ + --hash=sha256:2383a17385d9800b6eb5855c2f05ee550f803878f344f58b6e194de08b96352c \ + --hash=sha256:24c04f8fbf60094c531667b8207acbae54146661657a1b1be6d3ca7773b7a545 \ + --hash=sha256:2578541776769b500bada3f8a4eeaf944530516b6e90c089aa368266ed70c49e \ + 
--hash=sha256:26a67e5c04e3119594d8cfae517f4b9330c395df07ea65eab16f3d559b7068fe \ + --hash=sha256:2b975528998de037dfbc10144b8aed9b8dd5a99ec547f14d1cb7c5665a43f075 \ + --hash=sha256:2d15bc27163cd4df433e75f546b9ac31c1ba7b0b128bfb1b90df19082466ff57 \ + --hash=sha256:2d913d36bdaf368637b4f88d554fb9cb9d53d6920b9c5563846555938d5450bf \ + --hash=sha256:3302c5287e504d23bb0e64d2a921d1eb4a03fb93a0a0aa3b53de059f5a5d737d \ + --hash=sha256:36ca5e9a21822cc1746023e88f5c0af6fce3af3b85d4520efb1ce4221bed75cc \ + --hash=sha256:3b812b3cb6caacd072276ac0492d249f210006c57726b6484a1e1805b3cfeea0 \ + --hash=sha256:3c6ec957025bf32b15cbc6b67afe233c65b30005e4c55fe5768e4bb518d712f1 \ + --hash=sha256:41de3da5458edd5678b0f6ff66691507f9885f5fe6a0fb99a5d10d10c0fd2d64 \ + --hash=sha256:42924dc0c9d73e49908e35bbdec87adedd651ea24c53c29cac103ede0ea1d340 \ + --hash=sha256:4544699674faf66fb6b4473a1518ae4999c1b614f0b8297b1cef96bac25381db \ + --hash=sha256:46ed02532cb66612d42ae5c3929b5e98ae330ea0f3900bc66ec5f4862069519b \ + --hash=sha256:49ea05212a529c2caffe411e25a59308b07d6e10bf2505d77da72891f9a05641 \ + --hash=sha256:4fa0e7c9c3cf7c276d4f6ab9af8adddc127d04e0fcabede315904d2ff76db626 \ + --hash=sha256:507c5357a8d8b4593b97fb669c50598f4e6cccbbf77e22fa9598aba78292b4d7 \ + --hash=sha256:549722908de62aa0b47a78b90531c022fa6e139f9166be634f667ff45632cc92 \ + --hash=sha256:58e6d2a5a7cb3e5f166fd58e71e9a4ff504be9dc61b88167e75f835da5764d07 \ + --hash=sha256:5a16167118677d94bb48bfcd91e420088854eb0737b76ec374b91498fb77a70e \ + --hash=sha256:5d62c4f6706bff5d8a52fd51fec6069bef69e7202ed481486c0bc3874912c787 \ + --hash=sha256:5fa159dcee5dba00c1def3231c249cf261185189205073bde13797e57dd7540a \ + --hash=sha256:6032231d4a5abd67c7f71168fd64a47b6b451fbcb91c8397c2f7610e67683810 \ + --hash=sha256:63f26258a163c34542c24808f03d734b338da66ba91f410a703e505c8485791d \ + --hash=sha256:65a37714b8ad9aba5780325228598a5b16c47ba0f8aeb3dc0514701e4413d7c0 \ + --hash=sha256:67054e47c01b7b349b94ed0840ccae075449503cf1fdd0a1fdd98ab5ddc2667b \ + 
--hash=sha256:67dda3c7325691c2081510e92c561f465ba61b975f481735aefdfc845d2cd043 \ + --hash=sha256:6985a593417cdbc94c7f9c3403747335e450c1599da1647a5af76539672464d3 \ + --hash=sha256:6a1948df1bb1d56b5e7b0553c0fa04fd0e320997ae99689488201f19fa90d2e7 \ + --hash=sha256:6b5b7fd6ee7b54e01759f2044f936dcf7dea6e7585f35490f7ca0420fe723c0d \ + --hash=sha256:6c929916cbdb540d3407c66f19f73387f43e7c12fa318a66f64ac99da601bcdf \ + --hash=sha256:6f4d7a7c0aff92e8354cceca6fe223973ddf08401047920df0fcb24be2bd5138 \ + --hash=sha256:728af36011bb5d344c4fe4af79cfe186729efb649d2f8b395d1572fb088a996c \ + --hash=sha256:742840d1d0438eb7ea4280f3347598f507a199a35a08294afdcc560c3739989d \ + --hash=sha256:75e872573220d1ee2305b35c9813626e620768248425f58798413e9c39741f46 \ + --hash=sha256:794c3dd744fad478b6232289c866c25406ecdfc47e294618bdf1697e69bd64a6 \ + --hash=sha256:7c0fdbdf6983526e269e5a8d53b7ae3622dd6998468821d660d0daf72779aefa \ + --hash=sha256:7c5f5290799a3f6539cc5e6f474c3e5c5fbeba74a5e1e5be75587746a940d51e \ + --hash=sha256:7c6e7e4f9167fddc438cd653d826f2222222564daed4116a02a184b464d3ef05 \ + --hash=sha256:7cedd25e5f678f7738da38037435b340694ab34d424938041aa630d8bac42663 \ + --hash=sha256:7e2e068a83552ddf7a39a99488bcba05ac13454fb205c847674da0352602082f \ + --hash=sha256:8319293e85feadbbfe2150a5659dbc2ebc4afdeaf7d98936fb9a2f2ba0d4c35c \ + --hash=sha256:8526b0941ec5a40220fc4dfde76aed58808e2b309c03e9fa8e2260083ef7157f \ + --hash=sha256:8884ba1a0fe7210b775106b25850f5e5a9dc3c840d1ae9924ee6ea2eb3acbfe7 \ + --hash=sha256:8cb625bcb5add899cb8ba7bf716ec1d3e8f7cdea9b0713fa99eadf73b6d4986f \ + --hash=sha256:8d663fd71491dde7dfdfc899d13a067a94198e90695b4321084c6e450743b8c7 \ + --hash=sha256:8ee1983728964d6070ab443399c476de93d5d741f71e8f6e7880a065f878e0b9 \ + --hash=sha256:997e7b8f173a391987df40f3b52c423e5850be6f6df0dcfb5376365440b56667 \ + --hash=sha256:9be90eebc9842a93ef8335291f57b3b7488ac24f70df96a6034a13cb58e6ff86 \ + --hash=sha256:9ddd49258610499aab83b4f5b61b32e11fce873586282a0e972e5ab3bcadee51 \ + 
--hash=sha256:9ecde3671e62eeb99e977f5221abcf40c208f69b5eb986b061ccec317c82ebd0 \ + --hash=sha256:9ff4e9ecb6e4b363430edf2c6e50173a63e0820e549918adef70515f87ced19a \ + --hash=sha256:a254537b9b696ede293bfdbc0a65200e8e4507bc9f37831e2a0318a9b333c85c \ + --hash=sha256:a2b9bf8c79b660d0ca1ad95e587818c30ccdb11f787657458d6f26a1ea18c568 \ + --hash=sha256:a61a68d630e812b67b5bf097ab84e2cd79b48c792857dc10ba8a223f5b06a2af \ + --hash=sha256:a7080b0159ce05f179cfac592cda1a82898ca9cd097dacf8ea20ae33474fbb25 \ + --hash=sha256:a8fd93de4e1d278046345f49e2238cdb298589325849b2645d4a94c53faeffc5 \ + --hash=sha256:a94ffc66738da99232ddffcf7910e0f69e2bbe3a0802e54426dbf0714e1c2ffe \ + --hash=sha256:aa806bbc13eac1ab6291ed21ecd2dd426063ca5417dd507e6be58de20e58dfcf \ + --hash=sha256:b0c1a133d42c6fc1f5fbcf5c91331657a1ff822e87989bf4a6e2e39b818d0ee9 \ + --hash=sha256:b58229a844931bca61b3a20efd2be2a2acb4ad1622fc026504309a6883686fbf \ + --hash=sha256:bb2f144c6d98bb5cbc94adeb0447cfd4c0f991341baa68eee3f3b0c9c0e83767 \ + --hash=sha256:be90c94570840939fecedf99fa72839aed70b0ced449b415c85e01ae67422c90 \ + --hash=sha256:bf0d9a171908f32d54f651648c7290397b8792f4303821c42a74e7805bfb813c \ + --hash=sha256:bf15fc0b45914d9d1b706f7c9c4f66f2b7b053e9517e40123e137e8ca8958b3d \ + --hash=sha256:bf4298f366ca7e1ad1d21bbb58300a6985015909964077afd37559084590c929 \ + --hash=sha256:c441c841e82c5ba7a85ad25986014be8d7849c3cfbdb6004541873505929a74e \ + --hash=sha256:cacea77ef7a2195f04f9279297684955e3d1ae4241092ff0cfcef532bb7a1c32 \ + --hash=sha256:cd54895e4ae7d32f1e3dd91261df46ee7483a735017dc6f987904f194aa5fd14 \ + --hash=sha256:d1323cd04d6e92150bcc79d0174ce347ed4b349d748b9358fd2e497b121e03c8 \ + --hash=sha256:d383bf5e045d7f9d239b38e6acadd7b7fdf6c0087259a84ae3475d18e9a2ae8b \ + --hash=sha256:d3e7420211f5a65a54675fd860ea04173cde60a7cc20ccfbafcccd155225f8bc \ + --hash=sha256:d8074c5dd61c8a3e915fa8fc04754fa55cfa5978200d2daa1e2d4294c1f136aa \ + --hash=sha256:df03cd88f95b1b99052b52b1bb92173229d7a674df0ab06d2b25765ee8404bce \ + 
--hash=sha256:e45377d5d6fefe1677da2a2c07b024a6dac782088e37c0b1efea4cfe2b1be19b \ + --hash=sha256:e53d19c2bf7d0d1e6998a7e693c7e87300dd971808e6618964621ccd0e01fe4e \ + --hash=sha256:e560fd75aaf3e5693b91bcaddd8b314f4d57e99aef8a6c6dc692f935cc1e6bbf \ + --hash=sha256:ec5060592d83454e8063e487696ac3783cc48c9a329498bafae0d972bc7816c9 \ + --hash=sha256:ecc2920630283e0783c22e2ac94427f8cca29a04cfdf331467d4f661f4072dac \ + --hash=sha256:ed7161bccab7696a473fe7ddb619c1d75963732b37da4618ba12e60899fefe4f \ + --hash=sha256:ee0bd3a7b2e184e88d25c9baa6a9dc609ba25b76daae942edfb14499ac7ec374 \ + --hash=sha256:ee25f1ac091def37c4b59d192bbe3a206298feeb89132a470325bf76ad122a1e \ + --hash=sha256:efa44f64c37cc30c9f05932c740a8b40ce359f51882c70883cc95feac842da4d \ + --hash=sha256:f47d52fd9b2ac418c4890aad2f6d21a6b96183c98021f0a48497a904199f006e \ + --hash=sha256:f857034dc68d5ceb30fb60afb6ff2103087aea10a01b613985610e007053a121 \ + --hash=sha256:fb91d20fa2d3b13deea98a690534697742029f4fb83673a3501ae6e3746508b5 \ + --hash=sha256:fddb8870bdb83456a489ab67c6b3040a8d5a55069aa6f72f9d872235fbc52f54 + # via + # -c python/requirements_compiled.txt + # aiohttp + # yarl +proto-plus==1.22.3 \ + --hash=sha256:a49cd903bc0b6ab41f76bf65510439d56ca76f868adf0274e738bfdd096894df \ + --hash=sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b + # via + # -c python/requirements_compiled.txt + # google-api-core +protobuf==4.25.8 \ + --hash=sha256:077ff8badf2acf8bc474406706ad890466274191a48d0abd3bd6987107c9cde5 \ + --hash=sha256:15a0af558aa3b13efef102ae6e4f3efac06f1eea11afb3a57db2901447d9fb59 \ + --hash=sha256:27d498ffd1f21fb81d987a041c32d07857d1d107909f5134ba3350e1ce80a4af \ + --hash=sha256:504435d831565f7cfac9f0714440028907f1975e4bed228e58e72ecfff58a1e0 \ + --hash=sha256:6135cf8affe1fc6f76cced2641e4ea8d3e59518d1f24ae41ba97bcad82d397cd \ + --hash=sha256:83e6e54e93d2b696a92cad6e6efc924f3850f82b52e1563778dfab8b355101b0 \ + --hash=sha256:9ad7ef62d92baf5a8654fbb88dac7fa5594cfa70fd3440488a5ca3bfc6d795a7 \ 
+ --hash=sha256:bd551eb1fe1d7e92c1af1d75bdfa572eff1ab0e5bf1736716814cdccdb2360f9 \ + --hash=sha256:ca809b42f4444f144f2115c4c1a747b9a404d590f18f37e9402422033e464e0f \ + --hash=sha256:d552c53d0415449c8d17ced5c341caba0d89dbf433698e1436c8fa0aae7808a3 \ + --hash=sha256:f4510b93a3bec6eba8fd8f1093e9d7fb0d4a24d1a81377c10c0e5bbfe9e4ed24 + # via + # -c python/requirements_compiled.txt + # google-api-core + # googleapis-common-protos + # opentelemetry-proto + # proto-plus + # ray + # tensorboardx +py-spy==0.4.0 ; python_full_version < '3.12' \ + --hash=sha256:47cdda4c34d9b6cb01f3aaeceb2e88faf57da880207fe72ff6ff97e9bb6cc8a9 \ + --hash=sha256:77d8f637ade38367d944874776f45b703b7ac5938b1f7be8891f3a5876ddbb96 \ + --hash=sha256:806602ce7972782cc9c1e383f339bfc27bfb822d42485e6a3e0530ae5040e1f0 \ + --hash=sha256:87573e64dbfdfc89ba2e0f5e2f525aa84e0299c7eb6454b47ea335fde583a7a0 \ + --hash=sha256:8bf2f3702cef367a489faa45177b41a6c31b2a3e5bd78c978d44e29340152f5a \ + --hash=sha256:c5f06ffce4c9c98b7fc9f5e67e5e7db591173f1351837633f3f23d9378b1d18a \ + --hash=sha256:eee3d0bde85ca5cf4f01f012d461180ca76c24835a96f7b5c4ded64eb6a008ab \ + --hash=sha256:f2cf3f7130e7d780471faa5957441d3b4e0ec39a79b2c00f4c33d494f7728428 + # via + # -c python/requirements_compiled.txt + # ray +pyarrow==19.0.1 \ + --hash=sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466 \ + --hash=sha256:0148bb4fc158bfbc3d6dfe5001d93ebeed253793fff4435167f6ce1dc4bddeae \ + --hash=sha256:1b93ef2c93e77c442c979b0d596af45e4665d8b96da598db145b0fec014b9136 \ + --hash=sha256:1c7556165bd38cf0cd992df2636f8bcdd2d4b26916c6b7e646101aff3c16f76f \ + --hash=sha256:335d170e050bcc7da867a1ed8ffb8b44c57aaa6e0843b156a501298657b1e972 \ + --hash=sha256:3bf266b485df66a400f282ac0b6d1b500b9d2ae73314a153dbe97d6d5cc8a99e \ + --hash=sha256:41f9706fbe505e0abc10e84bf3a906a1338905cbbcf1177b71486b03e6ea6608 \ + --hash=sha256:4982f8e2b7afd6dae8608d70ba5bd91699077323f812a0448d8b7abdff6cb5d3 \ + 
--hash=sha256:49a3aecb62c1be1d822f8bf629226d4a96418228a42f5b40835c1f10d42e4db6 \ + --hash=sha256:4d5d1ec7ec5324b98887bdc006f4d2ce534e10e60f7ad995e7875ffa0ff9cb14 \ + --hash=sha256:58d9397b2e273ef76264b45531e9d552d8ec8a6688b7390b5be44c02a37aade8 \ + --hash=sha256:5a9137cf7e1640dce4c190551ee69d478f7121b5c6f323553b319cac936395f6 \ + --hash=sha256:5bd1618ae5e5476b7654c7b55a6364ae87686d4724538c24185bbb2952679960 \ + --hash=sha256:65cf9feebab489b19cdfcfe4aa82f62147218558d8d3f0fc1e9dea0ab8e7905a \ + --hash=sha256:699799f9c80bebcf1da0983ba86d7f289c5a2a5c04b945e2f2bcf7e874a91911 \ + --hash=sha256:6c5941c1aac89a6c2f2b16cd64fe76bcdb94b2b1e99ca6459de4e6f07638d755 \ + --hash=sha256:6ebfb5171bb5f4a52319344ebbbecc731af3f021e49318c74f33d520d31ae0c4 \ + --hash=sha256:7a544ec12de66769612b2d6988c36adc96fb9767ecc8ee0a4d270b10b1c51e00 \ + --hash=sha256:7c1bca1897c28013db5e4c83944a2ab53231f541b9e0c3f4791206d0c0de389a \ + --hash=sha256:80b2ad2b193e7d19e81008a96e313fbd53157945c7be9ac65f44f8937a55427b \ + --hash=sha256:8464c9fbe6d94a7fe1599e7e8965f350fd233532868232ab2596a71586c5a429 \ + --hash=sha256:8f04d49a6b64cf24719c080b3c2029a3a5b16417fd5fd7c4041f94233af732f3 \ + --hash=sha256:96606c3ba57944d128e8a8399da4812f56c7f61de8c647e3470b417f795d0ef9 \ + --hash=sha256:99bc1bec6d234359743b01e70d4310d0ab240c3d6b0da7e2a93663b0158616f6 \ + --hash=sha256:ad76aef7f5f7e4a757fddcdcf010a8290958f09e3470ea458c80d26f4316ae89 \ + --hash=sha256:b4c4156a625f1e35d6c0b2132635a237708944eb41df5fbe7d50f20d20c17832 \ + --hash=sha256:b9766a47a9cb56fefe95cb27f535038b5a195707a08bf61b180e642324963b46 \ + --hash=sha256:c0fe3dbbf054a00d1f162fda94ce236a899ca01123a798c561ba307ca38af5f0 \ + --hash=sha256:c6cb2335a411b713fdf1e82a752162f72d4a7b5dbc588e32aa18383318b05866 \ + --hash=sha256:cc55d71898ea30dc95900297d191377caba257612f384207fe9f8293b5850f90 \ + --hash=sha256:d03c9d6f2a3dffbd62671ca070f13fc527bb1867b4ec2b98c7eeed381d4f389a \ + --hash=sha256:d383591f3dcbe545f6cc62daaef9c7cdfe0dff0fb9e1c8121101cabe9098cfa6 \ + 
--hash=sha256:d9d46e06846a41ba906ab25302cf0fd522f81aa2a85a71021826f34639ad31ef \ + --hash=sha256:d9dedeaf19097a143ed6da37f04f4051aba353c95ef507764d344229b2b740ae \ + --hash=sha256:e45274b20e524ae5c39d7fc1ca2aa923aab494776d2d4b316b49ec7572ca324c \ + --hash=sha256:ee8dec072569f43835932a3b10c55973593abc00936c202707a4ad06af7cb294 \ + --hash=sha256:f24faab6ed18f216a37870d8c5623f9c044566d75ec586ef884e13a02a9d62c5 \ + --hash=sha256:f2a21d39fbdb948857f67eacb5bbaaf36802de044ec36fbef7a1c8f0dd3a4ab2 \ + --hash=sha256:f3ad4c0eb4e2a9aeb990af6c09e6fa0b195c8c0e7b272ecc8d4d2b6574809d34 \ + --hash=sha256:fc28912a2dc924dddc2087679cc8b7263accc71b9ff025a1362b004711661a69 \ + --hash=sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec \ + --hash=sha256:fd44d66093a239358d07c42a91eebf5015aa54fccba959db899f932218ac9cc8 + # via + # -c python/requirements_compiled.txt + # ray +pyasn1==0.5.1 \ + --hash=sha256:4439847c58d40b1d0a573d07e3856e95333f1976294494c325775aeca506eb58 \ + --hash=sha256:6d391a96e59b23130a5cfa74d6fd7f388dbbe26cc8f1edf39fdddf08d9d6676c + # via + # -c python/requirements_compiled.txt + # pyasn1-modules + # rsa +pyasn1-modules==0.3.0 \ + --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ + --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d + # via + # -c python/requirements_compiled.txt + # google-auth +pycparser==2.21 ; platform_python_implementation != 'PyPy' \ + --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ + --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 + # via + # -c python/requirements_compiled.txt + # cffi +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b + # via + # -c python/requirements_compiled.txt + # fastapi + # ray +pydantic-core==2.33.2 \ + 
--hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + 
--hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + --hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + --hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + --hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + 
--hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + --hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + --hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + --hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + 
--hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + --hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + --hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + --hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + 
--hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d + # via + # -c python/requirements_compiled.txt + # pydantic +pygments==2.18.0 ; sys_platform != 'win32' \ + --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ + --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a + # via + # -c python/requirements_compiled.txt + # rich +pyopenssl==25.0.0 \ + --hash=sha256:424c247065e46e76a37411b9ab1782541c23bb658bf003772c3405fbaa128e90 \ + --hash=sha256:cd2cef799efa3936bb08e8ccb9433a575722b9dd986023f1cabc4ae64e9dac16 + # via + # -c python/requirements_compiled.txt + # ray +python-dateutil==2.8.2 \ + --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ + --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 + # via + # -c python/requirements_compiled.txt + # celery + # pandas +python-dotenv==1.1.1 \ + --hash=sha256:31f23644fe2602f88ff55e1f5c79ba497e01224ee7737937930c448e4d0e24dc \ + --hash=sha256:a8a6399716257f45be6a007360200409fce5cda2661e3dec71d23dc15f6189ab + # via uvicorn +pytz==2022.7.1 \ + --hash=sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0 \ + --hash=sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a + # via + # -c python/requirements_compiled.txt + # pandas +pyyaml==6.0.1 \ + --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ + --hash=sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc \ + --hash=sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df \ + --hash=sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741 \ + --hash=sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206 \ + 
--hash=sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27 \ + --hash=sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595 \ + --hash=sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62 \ + --hash=sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98 \ + --hash=sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696 \ + --hash=sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290 \ + --hash=sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9 \ + --hash=sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d \ + --hash=sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6 \ + --hash=sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867 \ + --hash=sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47 \ + --hash=sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486 \ + --hash=sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6 \ + --hash=sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3 \ + --hash=sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007 \ + --hash=sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938 \ + --hash=sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0 \ + --hash=sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c \ + --hash=sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735 \ + --hash=sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d \ + --hash=sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28 \ + --hash=sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4 \ + --hash=sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba \ + --hash=sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8 \ + 
--hash=sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef \ + --hash=sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5 \ + --hash=sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd \ + --hash=sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3 \ + --hash=sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0 \ + --hash=sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515 \ + --hash=sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c \ + --hash=sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c \ + --hash=sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924 \ + --hash=sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34 \ + --hash=sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43 \ + --hash=sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859 \ + --hash=sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673 \ + --hash=sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54 \ + --hash=sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a \ + --hash=sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b \ + --hash=sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab \ + --hash=sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa \ + --hash=sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c \ + --hash=sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585 \ + --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \ + --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f + # via + # -c python/requirements_compiled.txt + # ray + # uvicorn +ray==100.0.0.dev0 \ + --hash=sha256:09b6b63a28bde8dfce18d07c3316c1330ecb81d57d4e2831a4d3e83883b6267d 
+referencing==0.36.2 \ + --hash=sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa \ + --hash=sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 + # via + # -c python/requirements_compiled.txt + # jsonschema + # jsonschema-specifications +requests==2.32.3 \ + --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ + --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 + # via + # -c python/requirements_compiled.txt + # google-api-core + # ray +rich==13.3.2 ; sys_platform != 'win32' \ + --hash=sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001 \ + --hash=sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f + # via + # -c python/requirements_compiled.txt + # memray +rpds-py==0.22.3 \ + --hash=sha256:009de23c9c9ee54bf11303a966edf4d9087cd43a6003672e6aa7def643d06518 \ + --hash=sha256:02fbb9c288ae08bcb34fb41d516d5eeb0455ac35b5512d03181d755d80810059 \ + --hash=sha256:0a0461200769ab3b9ab7e513f6013b7a97fdeee41c29b9db343f3c5a8e2b9e61 \ + --hash=sha256:0b09865a9abc0ddff4e50b5ef65467cd94176bf1e0004184eb915cbc10fc05c5 \ + --hash=sha256:0b8db6b5b2d4491ad5b6bdc2bc7c017eec108acbf4e6785f42a9eb0ba234f4c9 \ + --hash=sha256:0c150c7a61ed4a4f4955a96626574e9baf1adf772c2fb61ef6a5027e52803543 \ + --hash=sha256:0f3cec041684de9a4684b1572fe28c7267410e02450f4561700ca5a3bc6695a2 \ + --hash=sha256:1352ae4f7c717ae8cba93421a63373e582d19d55d2ee2cbb184344c82d2ae55a \ + --hash=sha256:177c7c0fce2855833819c98e43c262007f42ce86651ffbb84f37883308cb0e7d \ + --hash=sha256:1978d0021e943aae58b9b0b196fb4895a25cc53d3956b8e35e0b7682eefb6d56 \ + --hash=sha256:1a60bce91f81ddaac922a40bbb571a12c1070cb20ebd6d49c48e0b101d87300d \ + --hash=sha256:1aef18820ef3e4587ebe8b3bc9ba6e55892a6d7b93bac6d29d9f631a3b4befbd \ + --hash=sha256:1e9663daaf7a63ceccbbb8e3808fe90415b0757e2abddbfc2e06c857bf8c5e2b \ + --hash=sha256:20070c65396f7373f5df4005862fa162db5d25d56150bddd0b3e8214e8ef45b4 \ + 
--hash=sha256:214b7a953d73b5e87f0ebece4a32a5bd83c60a3ecc9d4ec8f1dca968a2d91e99 \ + --hash=sha256:22bebe05a9ffc70ebfa127efbc429bc26ec9e9b4ee4d15a740033efda515cf3d \ + --hash=sha256:24e8abb5878e250f2eb0d7859a8e561846f98910326d06c0d51381fed59357bd \ + --hash=sha256:26fd7cac7dd51011a245f29a2cc6489c4608b5a8ce8d75661bb4a1066c52dfbe \ + --hash=sha256:27b1d3b3915a99208fee9ab092b8184c420f2905b7d7feb4aeb5e4a9c509b8a1 \ + --hash=sha256:27e98004595899949bd7a7b34e91fa7c44d7a97c40fcaf1d874168bb652ec67e \ + --hash=sha256:2b8f60e1b739a74bab7e01fcbe3dddd4657ec685caa04681df9d562ef15b625f \ + --hash=sha256:2de29005e11637e7a2361fa151f780ff8eb2543a0da1413bb951e9f14b699ef3 \ + --hash=sha256:2e8b55d8517a2fda8d95cb45d62a5a8bbf9dd0ad39c5b25c8833efea07b880ca \ + --hash=sha256:2fa4331c200c2521512595253f5bb70858b90f750d39b8cbfd67465f8d1b596d \ + --hash=sha256:3445e07bf2e8ecfeef6ef67ac83de670358abf2996916039b16a218e3d95e97e \ + --hash=sha256:3453e8d41fe5f17d1f8e9c383a7473cd46a63661628ec58e07777c2fff7196dc \ + --hash=sha256:378753b4a4de2a7b34063d6f95ae81bfa7b15f2c1a04a9518e8644e81807ebea \ + --hash=sha256:3af6e48651c4e0d2d166dc1b033b7042ea3f871504b6805ba5f4fe31581d8d38 \ + --hash=sha256:3dfcbc95bd7992b16f3f7ba05af8a64ca694331bd24f9157b49dadeeb287493b \ + --hash=sha256:3f21f0495edea7fdbaaa87e633a8689cd285f8f4af5c869f27bc8074638ad69c \ + --hash=sha256:4041711832360a9b75cfb11b25a6a97c8fb49c07b8bd43d0d02b45d0b499a4ff \ + --hash=sha256:44d61b4b7d0c2c9ac019c314e52d7cbda0ae31078aabd0f22e583af3e0d79723 \ + --hash=sha256:4617e1915a539a0d9a9567795023de41a87106522ff83fbfaf1f6baf8e85437e \ + --hash=sha256:4b232061ca880db21fa14defe219840ad9b74b6158adb52ddf0e87bead9e8493 \ + --hash=sha256:5246b14ca64a8675e0a7161f7af68fe3e910e6b90542b4bfb5439ba752191df6 \ + --hash=sha256:5725dd9cc02068996d4438d397e255dcb1df776b7ceea3b9cb972bdb11260a83 \ + --hash=sha256:583f6a1993ca3369e0f80ba99d796d8e6b1a3a2a442dd4e1a79e652116413091 \ + --hash=sha256:59259dc58e57b10e7e18ce02c311804c10c5a793e6568f8af4dead03264584d1 \ + 
--hash=sha256:593eba61ba0c3baae5bc9be2f5232430453fb4432048de28399ca7376de9c627 \ + --hash=sha256:59f4a79c19232a5774aee369a0c296712ad0e77f24e62cad53160312b1c1eaa1 \ + --hash=sha256:5f0e260eaf54380380ac3808aa4ebe2d8ca28b9087cf411649f96bad6900c728 \ + --hash=sha256:62d9cfcf4948683a18a9aff0ab7e1474d407b7bab2ca03116109f8464698ab16 \ + --hash=sha256:64607d4cbf1b7e3c3c8a14948b99345eda0e161b852e122c6bb71aab6d1d798c \ + --hash=sha256:655ca44a831ecb238d124e0402d98f6212ac527a0ba6c55ca26f616604e60a45 \ + --hash=sha256:666ecce376999bf619756a24ce15bb14c5bfaf04bf00abc7e663ce17c3f34fe7 \ + --hash=sha256:68049202f67380ff9aa52f12e92b1c30115f32e6895cd7198fa2a7961621fc5a \ + --hash=sha256:69803198097467ee7282750acb507fba35ca22cc3b85f16cf45fb01cb9097730 \ + --hash=sha256:6c7b99ca52c2c1752b544e310101b98a659b720b21db00e65edca34483259967 \ + --hash=sha256:6dd9412824c4ce1aca56c47b0991e65bebb7ac3f4edccfd3f156150c96a7bf25 \ + --hash=sha256:70eb60b3ae9245ddea20f8a4190bd79c705a22f8028aaf8bbdebe4716c3fab24 \ + --hash=sha256:70fb28128acbfd264eda9bf47015537ba3fe86e40d046eb2963d75024be4d055 \ + --hash=sha256:7b2513ba235829860b13faa931f3b6846548021846ac808455301c23a101689d \ + --hash=sha256:7ef9d9da710be50ff6809fed8f1963fecdfecc8b86656cadfca3bc24289414b0 \ + --hash=sha256:81e69b0a0e2537f26d73b4e43ad7bc8c8efb39621639b4434b76a3de50c6966e \ + --hash=sha256:8633e471c6207a039eff6aa116e35f69f3156b3989ea3e2d755f7bc41754a4a7 \ + --hash=sha256:8bd7c8cfc0b8247c8799080fbff54e0b9619e17cdfeb0478ba7295d43f635d7c \ + --hash=sha256:9253fc214112405f0afa7db88739294295f0e08466987f1d70e29930262b4c8f \ + --hash=sha256:99b37292234e61325e7a5bb9689e55e48c3f5f603af88b1642666277a81f1fbd \ + --hash=sha256:9bd7228827ec7bb817089e2eb301d907c0d9827a9e558f22f762bb690b131652 \ + --hash=sha256:9beeb01d8c190d7581a4d59522cd3d4b6887040dcfc744af99aa59fef3e041a8 \ + --hash=sha256:a63cbdd98acef6570c62b92a1e43266f9e8b21e699c363c0fef13bd530799c11 \ + --hash=sha256:a76e42402542b1fae59798fab64432b2d015ab9d0c8c47ba7addddbaf7952333 \ + 
--hash=sha256:ac0a03221cdb5058ce0167ecc92a8c89e8d0decdc9e99a2ec23380793c4dcb96 \ + --hash=sha256:b0b4136a252cadfa1adb705bb81524eee47d9f6aab4f2ee4fa1e9d3cd4581f64 \ + --hash=sha256:b25bc607423935079e05619d7de556c91fb6adeae9d5f80868dde3468657994b \ + --hash=sha256:b3d504047aba448d70cf6fa22e06cb09f7cbd761939fdd47604f5e007675c24e \ + --hash=sha256:bb47271f60660803ad11f4c61b42242b8c1312a31c98c578f79ef9387bbde21c \ + --hash=sha256:bbb232860e3d03d544bc03ac57855cd82ddf19c7a07651a7c0fdb95e9efea8b9 \ + --hash=sha256:bc27863442d388870c1809a87507727b799c8460573cfbb6dc0eeaef5a11b5ec \ + --hash=sha256:bc51abd01f08117283c5ebf64844a35144a0843ff7b2983e0648e4d3d9f10dbb \ + --hash=sha256:be2eb3f2495ba669d2a985f9b426c1797b7d48d6963899276d22f23e33d47e37 \ + --hash=sha256:bf9db5488121b596dbfc6718c76092fda77b703c1f7533a226a5a9f65248f8ad \ + --hash=sha256:c58e2339def52ef6b71b8f36d13c3688ea23fa093353f3a4fee2556e62086ec9 \ + --hash=sha256:cfbc454a2880389dbb9b5b398e50d439e2e58669160f27b60e5eca11f68ae17c \ + --hash=sha256:cff63a0272fcd259dcc3be1657b07c929c466b067ceb1c20060e8d10af56f5bf \ + --hash=sha256:d115bffdd417c6d806ea9069237a4ae02f513b778e3789a359bc5856e0404cc4 \ + --hash=sha256:d20cfb4e099748ea39e6f7b16c91ab057989712d31761d3300d43134e26e165f \ + --hash=sha256:d48424e39c2611ee1b84ad0f44fb3b2b53d473e65de061e3f460fc0be5f1939d \ + --hash=sha256:e0fa2d4ec53dc51cf7d3bb22e0aa0143966119f42a0c3e4998293a3dd2856b09 \ + --hash=sha256:e32fee8ab45d3c2db6da19a5323bc3362237c8b653c70194414b892fd06a080d \ + --hash=sha256:e35ba67d65d49080e8e5a1dd40101fccdd9798adb9b050ff670b7d74fa41c566 \ + --hash=sha256:e3fb866d9932a3d7d0c82da76d816996d1667c44891bd861a0f97ba27e84fc74 \ + --hash=sha256:e61b02c3f7a1e0b75e20c3978f7135fd13cb6cf551bf4a6d29b999a88830a338 \ + --hash=sha256:e67ba3c290821343c192f7eae1d8fd5999ca2dc99994114643e2f2d3e6138b15 \ + --hash=sha256:e79dd39f1e8c3504be0607e5fc6e86bb60fe3584bec8b782578c3b0fde8d932c \ + --hash=sha256:e89391e6d60251560f0a8f4bd32137b077a80d9b7dbe6d5cab1cd80d2746f648 \ + 
--hash=sha256:ea7433ce7e4bfc3a85654aeb6747babe3f66eaf9a1d0c1e7a4435bbdf27fea84 \ + --hash=sha256:eaf16ae9ae519a0e237a0f528fd9f0197b9bb70f40263ee57ae53c2b8d48aeb3 \ + --hash=sha256:eb0c341fa71df5a4595f9501df4ac5abfb5a09580081dffbd1ddd4654e6e9123 \ + --hash=sha256:f276b245347e6e36526cbd4a266a417796fc531ddf391e43574cf6466c492520 \ + --hash=sha256:f47ad3d5f3258bd7058d2d506852217865afefe6153a36eb4b6928758041d831 \ + --hash=sha256:f56a6b404f74ab372da986d240e2e002769a7d7102cc73eb238a4f72eec5284e \ + --hash=sha256:f5cf2a0c2bdadf3791b5c205d55a37a54025c6e18a71c71f82bb536cf9a454bf \ + --hash=sha256:f5d36399a1b96e1a5fdc91e0522544580dbebeb1f77f27b2b0ab25559e103b8b \ + --hash=sha256:f60bd8423be1d9d833f230fdbccf8f57af322d96bcad6599e5a771b151398eb2 \ + --hash=sha256:f612463ac081803f243ff13cccc648578e2279295048f2a8d5eb430af2bae6e3 \ + --hash=sha256:f73d3fef726b3243a811121de45193c0ca75f6407fe66f3f4e183c983573e130 \ + --hash=sha256:f82a116a1d03628a8ace4859556fb39fd1424c933341a08ea3ed6de1edb0283b \ + --hash=sha256:fb0ba113b4983beac1a2eb16faffd76cb41e176bf58c4afe3e14b9c681f702de \ + --hash=sha256:fb4f868f712b2dd4bcc538b0a0c1f63a2b1d584c925e69a224d759e7070a12d5 \ + --hash=sha256:fb6116dfb8d1925cbdb52595560584db42a7f664617a1f7d7f6e32f138cdf37d \ + --hash=sha256:fda7cb070f442bf80b642cd56483b5548e43d366fe3f39b98e67cce780cded00 \ + --hash=sha256:feea821ee2a9273771bae61194004ee2fc33f8ec7db08117ef9147d4bbcbca8e + # via + # -c python/requirements_compiled.txt + # jsonschema + # referencing +rsa==4.7.2 \ + --hash=sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2 \ + --hash=sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9 + # via + # -c python/requirements_compiled.txt + # google-auth +scipy==1.11.4 \ + --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ + --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ + --hash=sha256:1b7c3dca977f30a739e0409fb001056484661cb2541a01aba0bb0029f7b68db8 \ + 
--hash=sha256:2c6ff6ef9cc27f9b3db93a6f8b38f97387e6e0591600369a297a50a8e96e835d \ + --hash=sha256:36750b7733d960d7994888f0d148d31ea3017ac15eef664194b4ef68d36a4a97 \ + --hash=sha256:530f9ad26440e85766509dbf78edcfe13ffd0ab7fec2560ee5c36ff74d6269ff \ + --hash=sha256:5e347b14fe01003d3b78e196e84bd3f48ffe4c8a7b8a1afbcb8f5505cb710993 \ + --hash=sha256:6550466fbeec7453d7465e74d4f4b19f905642c89a7525571ee91dd7adabb5a3 \ + --hash=sha256:6df1468153a31cf55ed5ed39647279beb9cfb5d3f84369453b49e4b8502394fd \ + --hash=sha256:6e619aba2df228a9b34718efb023966da781e89dd3d21637b27f2e54db0410d7 \ + --hash=sha256:8fce70f39076a5aa62e92e69a7f62349f9574d8405c0a5de6ed3ef72de07f446 \ + --hash=sha256:90a2b78e7f5733b9de748f589f09225013685f9b218275257f8a8168ededaeaa \ + --hash=sha256:91af76a68eeae0064887a48e25c4e616fa519fa0d38602eda7e0f97d65d57937 \ + --hash=sha256:933baf588daa8dc9a92c20a0be32f56d43faf3d1a60ab11b3f08c356430f6e56 \ + --hash=sha256:acf8ed278cc03f5aff035e69cb511741e0418681d25fbbb86ca65429c4f4d9cd \ + --hash=sha256:ad669df80528aeca5f557712102538f4f37e503f0c5b9541655016dd0932ca79 \ + --hash=sha256:b030c6674b9230d37c5c60ab456e2cf12f6784596d15ce8da9365e70896effc4 \ + --hash=sha256:b9999c008ccf00e8fbcce1236f85ade5c569d13144f77a1946bef8863e8f6eb4 \ + --hash=sha256:bc9a714581f561af0848e6b69947fda0614915f072dfd14142ed1bfe1b806710 \ + --hash=sha256:ce7fff2e23ab2cc81ff452a9444c215c28e6305f396b2ba88343a567feec9660 \ + --hash=sha256:cf00bd2b1b0211888d4dc75656c0412213a8b25e80d73898083f402b50f47e41 \ + --hash=sha256:d10e45a6c50211fe256da61a11c34927c68f277e03138777bdebedd933712fea \ + --hash=sha256:ee410e6de8f88fd5cf6eadd73c135020bfbbbdfcd0f6162c36a7638a1ea8cc65 \ + --hash=sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be \ + --hash=sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec + # via + # -c python/requirements_compiled.txt + # ray +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + 
--hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via + # -c python/requirements_compiled.txt + # opencensus + # python-dateutil +smart-open==6.2.0 \ + --hash=sha256:088bf00f9327c71e549bc2f86567d3320df5d89667f009ce1c16568976068ef7 \ + --hash=sha256:1b4df5c8365218f3852c507451920ccad606c80b0acb4e67508e50ba9b5d2632 + # via + # -c python/requirements_compiled.txt + # ray +sniffio==1.3.1 \ + --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ + --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc + # via + # -c python/requirements_compiled.txt + # anyio +starlette==0.46.2 \ + --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ + --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5 + # via + # -c python/requirements_compiled.txt + # fastapi + # ray +tensorboardx==2.6.2.2 \ + --hash=sha256:160025acbf759ede23fd3526ae9d9bfbfd8b68eb16c38a010ebe326dc6395db8 \ + --hash=sha256:c6476d7cd0d529b0b72f4acadb1269f9ed8b22f441e87a84f2a3b940bb87b666 + # via + # -c python/requirements_compiled.txt + # ray +typing-extensions==4.12.2 \ + --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d + # via + # -c python/requirements_compiled.txt + # exceptiongroup + # fastapi + # gymnasium + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # pydantic + # pydantic-core + # pyopenssl + # referencing + # starlette + # typing-inspection +typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c python/requirements_compiled.txt + # pydantic +tzdata==2025.2 \ + --hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \ + --hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9 + # via + # -c 
python/requirements_compiled.txt + # kombu +urllib3==1.26.19 \ + --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ + --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 + # via + # -c python/requirements_compiled.txt + # requests +uvicorn==0.22.0 \ + --hash=sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8 \ + --hash=sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996 + # via + # -c python/requirements_compiled.txt + # ray +uvloop==0.21.0 ; platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32' \ + --hash=sha256:0878c2640cf341b269b7e128b1a5fed890adc4455513ca710d77d5e93aa6d6a0 \ + --hash=sha256:10d66943def5fcb6e7b37310eb6b5639fd2ccbc38df1177262b0640c3ca68c1f \ + --hash=sha256:10da8046cc4a8f12c91a1c39d1dd1585c41162a15caaef165c2174db9ef18bdc \ + --hash=sha256:17df489689befc72c39a08359efac29bbee8eee5209650d4b9f34df73d22e414 \ + --hash=sha256:183aef7c8730e54c9a3ee3227464daed66e37ba13040bb3f350bc2ddc040f22f \ + --hash=sha256:196274f2adb9689a289ad7d65700d37df0c0930fd8e4e743fa4834e850d7719d \ + --hash=sha256:221f4f2a1f46032b403bf3be628011caf75428ee3cc204a22addf96f586b19fd \ + --hash=sha256:2d1f581393673ce119355d56da84fe1dd9d2bb8b3d13ce792524e1607139feff \ + --hash=sha256:359ec2c888397b9e592a889c4d72ba3d6befba8b2bb01743f72fffbde663b59c \ + --hash=sha256:3bf12b0fda68447806a7ad847bfa591613177275d35b6724b1ee573faa3704e3 \ + --hash=sha256:4509360fcc4c3bd2c70d87573ad472de40c13387f5fda8cb58350a1d7475e58d \ + --hash=sha256:460def4412e473896ef179a1671b40c039c7012184b627898eea5072ef6f017a \ + --hash=sha256:461d9ae6660fbbafedd07559c6a2e57cd553b34b0065b6550685f6653a98c1cb \ + --hash=sha256:46923b0b5ee7fc0020bef24afe7836cb068f5050ca04caf6b487c513dc1a20b2 \ + --hash=sha256:53e420a3afe22cdcf2a0f4846e377d16e718bc70103d7088a4f7623567ba5fb0 \ + --hash=sha256:5ee4d4ef48036ff6e5cfffb09dd192c7a5027153948d85b8da7ff705065bacc6 \ + 
--hash=sha256:67dd654b8ca23aed0a8e99010b4c34aca62f4b7fce88f39d452ed7622c94845c \ + --hash=sha256:787ae31ad8a2856fc4e7c095341cccc7209bd657d0e71ad0dc2ea83c4a6fa8af \ + --hash=sha256:86975dca1c773a2c9864f4c52c5a55631038e387b47eaf56210f873887b6c8dc \ + --hash=sha256:87c43e0f13022b998eb9b973b5e97200c8b90823454d4bc06ab33829e09fb9bb \ + --hash=sha256:88cb67cdbc0e483da00af0b2c3cdad4b7c61ceb1ee0f33fe00e09c81e3a6cb75 \ + --hash=sha256:8a375441696e2eda1c43c44ccb66e04d61ceeffcd76e4929e527b7fa401b90fb \ + --hash=sha256:a5c39f217ab3c663dc699c04cbd50c13813e31d917642d459fdcec07555cc553 \ + --hash=sha256:b9fb766bb57b7388745d8bcc53a359b116b8a04c83a2288069809d2b3466c37e \ + --hash=sha256:baa0e6291d91649c6ba4ed4b2f982f9fa165b5bbd50a9e203c416a2797bab3c6 \ + --hash=sha256:baa4dcdbd9ae0a372f2167a207cd98c9f9a1ea1188a8a526431eef2f8116cc8d \ + --hash=sha256:bc09f0ff191e61c2d592a752423c767b4ebb2986daa9ed62908e2b1b9a9ae206 \ + --hash=sha256:bd53ecc9a0f3d87ab847503c2e1552b690362e005ab54e8a48ba97da3924c0dc \ + --hash=sha256:bfd55dfcc2a512316e65f16e503e9e450cab148ef11df4e4e679b5e8253a5281 \ + --hash=sha256:c097078b8031190c934ed0ebfee8cc5f9ba9642e6eb88322b9958b649750f72b \ + --hash=sha256:c0f3fa6200b3108919f8bdabb9a7f87f20e7097ea3c543754cabc7d717d95cf8 \ + --hash=sha256:e678ad6fe52af2c58d2ae3c73dc85524ba8abe637f134bf3564ed07f555c5e79 \ + --hash=sha256:ec7e6b09a6fdded42403182ab6b832b71f4edaf7f37a9a0e371a01db5f0cb45f \ + --hash=sha256:f0ce1b49560b1d2d8a2977e3ba4afb2414fb46b86a1b64056bc4ab929efdafbe \ + --hash=sha256:f38b2e090258d051d68a5b14d1da7203a3c3677321cf32a95a6f4db4dd8b6f26 \ + --hash=sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816 \ + --hash=sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2 + # via + # -c python/requirements_compiled.txt + # uvicorn +vine==5.1.0 \ + --hash=sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc \ + --hash=sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0 + # via + # -c 
python/requirements_compiled.txt + # amqp + # celery + # kombu +virtualenv==20.29.1 \ + --hash=sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779 \ + --hash=sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35 + # via + # -c python/requirements_compiled.txt + # ray +watchfiles==0.19.0 \ + --hash=sha256:0089c6dc24d436b373c3c57657bf4f9a453b13767150d17284fc6162b2791911 \ + --hash=sha256:09ea3397aecbc81c19ed7f025e051a7387feefdb789cf768ff994c1228182fda \ + --hash=sha256:176a9a7641ec2c97b24455135d58012a5be5c6217fc4d5fef0b2b9f75dbf5154 \ + --hash=sha256:18b28f6ad871b82df9542ff958d0c86bb0d8310bb09eb8e87d97318a3b5273af \ + --hash=sha256:20b44221764955b1e703f012c74015306fb7e79a00c15370785f309b1ed9aa8d \ + --hash=sha256:3d7d267d27aceeeaa3de0dd161a0d64f0a282264d592e335fff7958cc0cbae7c \ + --hash=sha256:5471582658ea56fca122c0f0d0116a36807c63fefd6fdc92c71ca9a4491b6b48 \ + --hash=sha256:5569fc7f967429d4bc87e355cdfdcee6aabe4b620801e2cf5805ea245c06097c \ + --hash=sha256:68dce92b29575dda0f8d30c11742a8e2b9b8ec768ae414b54f7453f27bdf9545 \ + --hash=sha256:79c533ff593db861ae23436541f481ec896ee3da4e5db8962429b441bbaae16e \ + --hash=sha256:7f3920b1285a7d3ce898e303d84791b7bf40d57b7695ad549dc04e6a44c9f120 \ + --hash=sha256:91633e64712df3051ca454ca7d1b976baf842d7a3640b87622b323c55f3345e7 \ + --hash=sha256:945be0baa3e2440151eb3718fd8846751e8b51d8de7b884c90b17d271d34cae8 \ + --hash=sha256:9afd0d69429172c796164fd7fe8e821ade9be983f51c659a38da3faaaaac44dc \ + --hash=sha256:9c75eff897786ee262c9f17a48886f4e98e6cfd335e011c591c305e5d083c056 \ + --hash=sha256:b538014a87f94d92f98f34d3e6d2635478e6be6423a9ea53e4dd96210065e193 \ + --hash=sha256:b6577b8c6c8701ba8642ea9335a129836347894b666dd1ec2226830e263909d3 \ + --hash=sha256:c0376deac92377817e4fb8f347bf559b7d44ff556d9bc6f6208dd3f79f104aaf \ + --hash=sha256:cae3dde0b4b2078f31527acff6f486e23abed307ba4d3932466ba7cdd5ecec79 \ + --hash=sha256:cb5d45c4143c1dd60f98a16187fd123eda7248f84ef22244818c18d531a249d1 \ + 
--hash=sha256:d9b073073e048081e502b6c6b0b88714c026a1a4c890569238d04aca5f9ca74b \ + --hash=sha256:fac19dc9cbc34052394dbe81e149411a62e71999c0a19e1e09ce537867f95ae0 + # via + # -c python/requirements_compiled.txt + # ray + # uvicorn +wcwidth==0.2.13 \ + --hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \ + --hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5 + # via + # -c python/requirements_compiled.txt + # prompt-toolkit +websockets==11.0.3 \ + --hash=sha256:01f5567d9cf6f502d655151645d4e8b72b453413d3819d2b6f1185abc23e82dd \ + --hash=sha256:03aae4edc0b1c68498f41a6772d80ac7c1e33c06c6ffa2ac1c27a07653e79d6f \ + --hash=sha256:0ac56b661e60edd453585f4bd68eb6a29ae25b5184fd5ba51e97652580458998 \ + --hash=sha256:0ee68fe502f9031f19d495dae2c268830df2760c0524cbac5d759921ba8c8e82 \ + --hash=sha256:1553cb82942b2a74dd9b15a018dce645d4e68674de2ca31ff13ebc2d9f283788 \ + --hash=sha256:1a073fc9ab1c8aff37c99f11f1641e16da517770e31a37265d2755282a5d28aa \ + --hash=sha256:1d2256283fa4b7f4c7d7d3e84dc2ece74d341bce57d5b9bf385df109c2a1a82f \ + --hash=sha256:1d5023a4b6a5b183dc838808087033ec5df77580485fc533e7dab2567851b0a4 \ + --hash=sha256:1fdf26fa8a6a592f8f9235285b8affa72748dc12e964a5518c6c5e8f916716f7 \ + --hash=sha256:2529338a6ff0eb0b50c7be33dc3d0e456381157a31eefc561771ee431134a97f \ + --hash=sha256:279e5de4671e79a9ac877427f4ac4ce93751b8823f276b681d04b2156713b9dd \ + --hash=sha256:2d903ad4419f5b472de90cd2d40384573b25da71e33519a67797de17ef849b69 \ + --hash=sha256:332d126167ddddec94597c2365537baf9ff62dfcc9db4266f263d455f2f031cb \ + --hash=sha256:34fd59a4ac42dff6d4681d8843217137f6bc85ed29722f2f7222bd619d15e95b \ + --hash=sha256:3580dd9c1ad0701169e4d6fc41e878ffe05e6bdcaf3c412f9d559389d0c9e016 \ + --hash=sha256:3ccc8a0c387629aec40f2fc9fdcb4b9d5431954f934da3eaf16cdc94f67dbfac \ + --hash=sha256:41f696ba95cd92dc047e46b41b26dd24518384749ed0d99bea0a941ca87404c4 \ + --hash=sha256:42cc5452a54a8e46a032521d7365da775823e21bfba2895fb7b77633cce031bb \ + 
--hash=sha256:4841ed00f1026dfbced6fca7d963c4e7043aa832648671b5138008dc5a8f6d99 \ + --hash=sha256:4b253869ea05a5a073ebfdcb5cb3b0266a57c3764cf6fe114e4cd90f4bfa5f5e \ + --hash=sha256:54c6e5b3d3a8936a4ab6870d46bdd6ec500ad62bde9e44462c32d18f1e9a8e54 \ + --hash=sha256:619d9f06372b3a42bc29d0cd0354c9bb9fb39c2cbc1a9c5025b4538738dbffaf \ + --hash=sha256:6505c1b31274723ccaf5f515c1824a4ad2f0d191cec942666b3d0f3aa4cb4007 \ + --hash=sha256:660e2d9068d2bedc0912af508f30bbeb505bbbf9774d98def45f68278cea20d3 \ + --hash=sha256:6681ba9e7f8f3b19440921e99efbb40fc89f26cd71bf539e45d8c8a25c976dc6 \ + --hash=sha256:68b977f21ce443d6d378dbd5ca38621755f2063d6fdb3335bda981d552cfff86 \ + --hash=sha256:69269f3a0b472e91125b503d3c0b3566bda26da0a3261c49f0027eb6075086d1 \ + --hash=sha256:6f1a3f10f836fab6ca6efa97bb952300b20ae56b409414ca85bff2ad241d2a61 \ + --hash=sha256:7622a89d696fc87af8e8d280d9b421db5133ef5b29d3f7a1ce9f1a7bf7fcfa11 \ + --hash=sha256:777354ee16f02f643a4c7f2b3eff8027a33c9861edc691a2003531f5da4f6bc8 \ + --hash=sha256:84d27a4832cc1a0ee07cdcf2b0629a8a72db73f4cf6de6f0904f6661227f256f \ + --hash=sha256:8531fdcad636d82c517b26a448dcfe62f720e1922b33c81ce695d0edb91eb931 \ + --hash=sha256:86d2a77fd490ae3ff6fae1c6ceaecad063d3cc2320b44377efdde79880e11526 \ + --hash=sha256:88fc51d9a26b10fc331be344f1781224a375b78488fc343620184e95a4b27016 \ + --hash=sha256:8a34e13a62a59c871064dfd8ffb150867e54291e46d4a7cf11d02c94a5275bae \ + --hash=sha256:8c82f11964f010053e13daafdc7154ce7385ecc538989a354ccc7067fd7028fd \ + --hash=sha256:92b2065d642bf8c0a82d59e59053dd2fdde64d4ed44efe4870fa816c1232647b \ + --hash=sha256:97b52894d948d2f6ea480171a27122d77af14ced35f62e5c892ca2fae9344311 \ + --hash=sha256:9d9acd80072abcc98bd2c86c3c9cd4ac2347b5a5a0cae7ed5c0ee5675f86d9af \ + --hash=sha256:9f59a3c656fef341a99e3d63189852be7084c0e54b75734cde571182c087b152 \ + --hash=sha256:aa5003845cdd21ac0dc6c9bf661c5beddd01116f6eb9eb3c8e272353d45b3288 \ + --hash=sha256:b16fff62b45eccb9c7abb18e60e7e446998093cdcb50fed33134b9b6878836de \ + 
--hash=sha256:b30c6590146e53149f04e85a6e4fcae068df4289e31e4aee1fdf56a0dead8f97 \ + --hash=sha256:b58cbf0697721120866820b89f93659abc31c1e876bf20d0b3d03cef14faf84d \ + --hash=sha256:b67c6f5e5a401fc56394f191f00f9b3811fe843ee93f4a70df3c389d1adf857d \ + --hash=sha256:bceab846bac555aff6427d060f2fcfff71042dba6f5fca7dc4f75cac815e57ca \ + --hash=sha256:bee9fcb41db2a23bed96c6b6ead6489702c12334ea20a297aa095ce6d31370d0 \ + --hash=sha256:c114e8da9b475739dde229fd3bc6b05a6537a88a578358bc8eb29b4030fac9c9 \ + --hash=sha256:c1f0524f203e3bd35149f12157438f406eff2e4fb30f71221c8a5eceb3617b6b \ + --hash=sha256:c792ea4eabc0159535608fc5658a74d1a81020eb35195dd63214dcf07556f67e \ + --hash=sha256:c7f3cb904cce8e1be667c7e6fef4516b98d1a6a0635a58a57528d577ac18a128 \ + --hash=sha256:d67ac60a307f760c6e65dad586f556dde58e683fab03323221a4e530ead6f74d \ + --hash=sha256:dcacf2c7a6c3a84e720d1bb2b543c675bf6c40e460300b628bab1b1efc7c034c \ + --hash=sha256:de36fe9c02995c7e6ae6efe2e205816f5f00c22fd1fbf343d4d18c3d5ceac2f5 \ + --hash=sha256:def07915168ac8f7853812cc593c71185a16216e9e4fa886358a17ed0fd9fcf6 \ + --hash=sha256:df41b9bc27c2c25b486bae7cf42fccdc52ff181c8c387bfd026624a491c2671b \ + --hash=sha256:e052b8467dd07d4943936009f46ae5ce7b908ddcac3fda581656b1b19c083d9b \ + --hash=sha256:e063b1865974611313a3849d43f2c3f5368093691349cf3c7c8f8f75ad7cb280 \ + --hash=sha256:e1459677e5d12be8bbc7584c35b992eea142911a6236a3278b9b5ce3326f282c \ + --hash=sha256:e1a99a7a71631f0efe727c10edfba09ea6bee4166a6f9c19aafb6c0b5917d09c \ + --hash=sha256:e590228200fcfc7e9109509e4d9125eace2042fd52b595dd22bbc34bb282307f \ + --hash=sha256:e6316827e3e79b7b8e7d8e3b08f4e331af91a48e794d5d8b099928b6f0b85f20 \ + --hash=sha256:e7837cb169eca3b3ae94cc5787c4fed99eef74c0ab9506756eea335e0d6f3ed8 \ + --hash=sha256:e848f46a58b9fcf3d06061d17be388caf70ea5b8cc3466251963c8345e13f7eb \ + --hash=sha256:ed058398f55163a79bb9f06a90ef9ccc063b204bb346c4de78efc5d15abfe602 \ + --hash=sha256:f2e58f2c36cc52d41f2659e4c0cbf7353e28c8c9e63e30d8c6d3494dc9fdedcf \ + 
--hash=sha256:f467ba0050b7de85016b43f5a22b46383ef004c4f672148a8abf32bc999a87f0 \ + --hash=sha256:f61bdb1df43dc9c131791fbc2355535f9024b9a04398d3bd0684fc16ab07df74 \ + --hash=sha256:fb06eea71a00a7af0ae6aefbb932fb8a7df3cb390cc217d51a9ad7343de1b8d0 \ + --hash=sha256:ffd7dcaf744f25f82190856bc26ed81721508fc5cbf2a330751e135ff1283564 + # via + # -c python/requirements_compiled.txt + # uvicorn +yarl==1.18.3 \ + --hash=sha256:00e5a1fea0fd4f5bfa7440a47eff01d9822a65b4488f7cff83155a0f31a2ecba \ + --hash=sha256:02ddb6756f8f4517a2d5e99d8b2f272488e18dd0bfbc802f31c16c6c20f22193 \ + --hash=sha256:045b8482ce9483ada4f3f23b3774f4e1bf4f23a2d5c912ed5170f68efb053318 \ + --hash=sha256:09c7907c8548bcd6ab860e5f513e727c53b4a714f459b084f6580b49fa1b9cee \ + --hash=sha256:0b0cad37311123211dc91eadcb322ef4d4a66008d3e1bdc404808992260e1a0e \ + --hash=sha256:0b3c92fa08759dbf12b3a59579a4096ba9af8dd344d9a813fc7f5070d86bbab1 \ + --hash=sha256:0fb2171a4486bb075316ee754c6d8382ea6eb8b399d4ec62fde2b591f879778a \ + --hash=sha256:1a74a13a4c857a84a845505fd2d68e54826a2cd01935a96efb1e9d86c728e186 \ + --hash=sha256:1d407181cfa6e70077df3377938c08012d18893f9f20e92f7d2f314a437c30b1 \ + --hash=sha256:1dd4bdd05407ced96fed3d7f25dbbf88d2ffb045a0db60dbc247f5b3c5c25d50 \ + --hash=sha256:25b411eddcfd56a2f0cd6a384e9f4f7aa3efee14b188de13048c25b5e91f1640 \ + --hash=sha256:2d06d3005e668744e11ed80812e61efd77d70bb7f03e33c1598c301eea20efbb \ + --hash=sha256:2ec9bbba33b2d00999af4631a3397d1fd78290c48e2a3e52d8dd72db3a067ac8 \ + --hash=sha256:3236da9272872443f81fedc389bace88408f64f89f75d1bdb2256069a8730ccc \ + --hash=sha256:35098b24e0327fc4ebdc8ffe336cee0a87a700c24ffed13161af80124b7dc8e5 \ + --hash=sha256:41f7ce59d6ee7741af71d82020346af364949314ed3d87553763a2df1829cc58 \ + --hash=sha256:436c4fc0a4d66b2badc6c5fc5ef4e47bb10e4fd9bf0c79524ac719a01f3607c2 \ + --hash=sha256:4891ed92157e5430874dad17b15eb1fda57627710756c27422200c52d8a4e393 \ + --hash=sha256:4ac515b860c36becb81bb84b667466885096b5fc85596948548b667da3bf9f24 \ + 
--hash=sha256:5094d9206c64181d0f6e76ebd8fb2f8fe274950a63890ee9e0ebfd58bf9d787b \ + --hash=sha256:54d6921f07555713b9300bee9c50fb46e57e2e639027089b1d795ecd9f7fa910 \ + --hash=sha256:578e281c393af575879990861823ef19d66e2b1d0098414855dd367e234f5b3c \ + --hash=sha256:5a3f356548e34a70b0172d8890006c37be92995f62d95a07b4a42e90fba54272 \ + --hash=sha256:602d98f2c2d929f8e697ed274fbadc09902c4025c5a9963bf4e9edfc3ab6f7ed \ + --hash=sha256:61b1a825a13bef4a5f10b1885245377d3cd0bf87cba068e1d9a88c2ae36880e1 \ + --hash=sha256:61e5e68cb65ac8f547f6b5ef933f510134a6bf31bb178be428994b0cb46c2a04 \ + --hash=sha256:61ee62ead9b68b9123ec24bc866cbef297dd266175d53296e2db5e7f797f902d \ + --hash=sha256:6333c5a377c8e2f5fae35e7b8f145c617b02c939d04110c76f29ee3676b5f9a5 \ + --hash=sha256:6748dbf9bfa5ba1afcc7556b71cda0d7ce5f24768043a02a58846e4a443d808d \ + --hash=sha256:67a283dd2882ac98cc6318384f565bffc751ab564605959df4752d42483ad889 \ + --hash=sha256:75674776d96d7b851b6498f17824ba17849d790a44d282929c42dbb77d4f17ae \ + --hash=sha256:757e81cae69244257d125ff31663249b3013b5dc0a8520d73694aed497fb195b \ + --hash=sha256:77a6e85b90a7641d2e07184df5557132a337f136250caafc9ccaa4a2a998ca2c \ + --hash=sha256:7c33dd1931a95e5d9a772d0ac5e44cac8957eaf58e3c8da8c1414de7dd27c576 \ + --hash=sha256:7df647e8edd71f000a5208fe6ff8c382a1de8edfbccdbbfe649d263de07d8c34 \ + --hash=sha256:7e2ee16578af3b52ac2f334c3b1f92262f47e02cc6193c598502bd46f5cd1477 \ + --hash=sha256:80316a8bd5109320d38eef8833ccf5f89608c9107d02d2a7f985f98ed6876990 \ + --hash=sha256:82123d0c954dc58db301f5021a01854a85bf1f3bb7d12ae0c01afc414a882ca2 \ + --hash=sha256:84b2deecba4a3f1a398df819151eb72d29bfeb3b69abb145a00ddc8d30094512 \ + --hash=sha256:8503ad47387b8ebd39cbbbdf0bf113e17330ffd339ba1144074da24c545f0069 \ + --hash=sha256:877d209b6aebeb5b16c42cbb377f5f94d9e556626b1bfff66d7b0d115be88d0a \ + --hash=sha256:8874027a53e3aea659a6d62751800cf6e63314c160fd607489ba5c2edd753cf6 \ + --hash=sha256:88a19f62ff30117e706ebc9090b8ecc79aeb77d0b1f5ec10d2d27a12bc9f66d0 \ + 
--hash=sha256:8d39d351e7faf01483cc7ff7c0213c412e38e5a340238826be7e0e4da450fdc8 \ + --hash=sha256:90adb47ad432332d4f0bc28f83a5963f426ce9a1a8809f5e584e704b82685dcb \ + --hash=sha256:913829534200eb0f789d45349e55203a091f45c37a2674678744ae52fae23efa \ + --hash=sha256:93b2e109287f93db79210f86deb6b9bbb81ac32fc97236b16f7433db7fc437d8 \ + --hash=sha256:9d41beda9dc97ca9ab0b9888cb71f7539124bc05df02c0cff6e5acc5a19dcc6e \ + --hash=sha256:a440a2a624683108a1b454705ecd7afc1c3438a08e890a1513d468671d90a04e \ + --hash=sha256:a4bb030cf46a434ec0225bddbebd4b89e6471814ca851abb8696170adb163985 \ + --hash=sha256:a9ca04806f3be0ac6d558fffc2fdf8fcef767e0489d2684a21912cc4ed0cd1b8 \ + --hash=sha256:ac1801c45cbf77b6c99242eeff4fffb5e4e73a800b5c4ad4fc0be5def634d2e1 \ + --hash=sha256:ac36703a585e0929b032fbaab0707b75dc12703766d0b53486eabd5139ebadd5 \ + --hash=sha256:b1771de9944d875f1b98a745bc547e684b863abf8f8287da8466cf470ef52690 \ + --hash=sha256:b464c4ab4bfcb41e3bfd3f1c26600d038376c2de3297760dfe064d2cb7ea8e10 \ + --hash=sha256:b4f6450109834af88cb4cc5ecddfc5380ebb9c228695afc11915a0bf82116789 \ + --hash=sha256:b57f4f58099328dfb26c6a771d09fb20dbbae81d20cfb66141251ea063bd101b \ + --hash=sha256:b643562c12680b01e17239be267bc306bbc6aac1f34f6444d1bded0c5ce438ca \ + --hash=sha256:b958ddd075ddba5b09bb0be8a6d9906d2ce933aee81100db289badbeb966f54e \ + --hash=sha256:b9d60031cf568c627d028239693fd718025719c02c9f55df0a53e587aab951b5 \ + --hash=sha256:ba23302c0c61a9999784e73809427c9dbedd79f66a13d84ad1b1943802eaaf59 \ + --hash=sha256:ba87babd629f8af77f557b61e49e7c7cac36f22f871156b91e10a6e9d4f829e9 \ + --hash=sha256:c017a3b6df3a1bd45b9fa49a0f54005e53fbcad16633870104b66fa1a30a29d8 \ + --hash=sha256:c1e1cc06da1491e6734f0ea1e6294ce00792193c463350626571c287c9a704db \ + --hash=sha256:c654d5207c78e0bd6d749f6dae1dcbbfde3403ad3a4b11f3c5544d9906969dde \ + --hash=sha256:c69697d3adff5aa4f874b19c0e4ed65180ceed6318ec856ebc423aa5850d84f7 \ + --hash=sha256:c7d79f7d9aabd6011004e33b22bc13056a3e3fb54794d138af57f5ee9d9032cb \ + 
--hash=sha256:ccaa3a4b521b780a7e771cc336a2dba389a0861592bbce09a476190bb0c8b4b3 \ + --hash=sha256:ccd17349166b1bee6e529b4add61727d3f55edb7babbe4069b5764c9587a8cc6 \ + --hash=sha256:ce1af883b94304f493698b00d0f006d56aea98aeb49d75ec7d98cd4a777e9285 \ + --hash=sha256:d0e883008013c0e4aef84dcfe2a0b172c4d23c2669412cf5b3371003941f72bb \ + --hash=sha256:d980e0325b6eddc81331d3f4551e2a333999fb176fd153e075c6d1c2530aa8a8 \ + --hash=sha256:e17c9361d46a4d5addf777c6dd5eab0715a7684c2f11b88c67ac37edfba6c482 \ + --hash=sha256:e2c08cc9b16f4f4bc522771d96734c7901e7ebef70c6c5c35dd0f10845270bcd \ + --hash=sha256:e35ef8683211db69ffe129a25d5634319a677570ab6b2eba4afa860f54eeaf75 \ + --hash=sha256:e3b9fd71836999aad54084906f8663dffcd2a7fb5cdafd6c37713b2e72be1760 \ + --hash=sha256:ef9f7768395923c3039055c14334ba4d926f3baf7b776c923c93d80195624782 \ + --hash=sha256:f52a265001d830bc425f82ca9eabda94a64a4d753b07d623a9f2863fde532b53 \ + --hash=sha256:f91c4803173928a25e1a55b943c81f55b8872f0018be83e3ad4938adffb77dd2 \ + --hash=sha256:fbd6748e8ab9b41171bb95c6142faf068f5ef1511935a0aa07025438dd9a9bc1 \ + --hash=sha256:fe57328fbc1bfd0bd0514470ac692630f3901c0ee39052ae47acd1d90a436719 \ + --hash=sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62 + # via + # -c python/requirements_compiled.txt + # aiohttp +zipp==3.19.2 \ + --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c + # via + # -c python/requirements_compiled.txt + # importlib-metadata diff --git a/rllib/utils/deprecation.py b/python/ray/_common/deprecation.py similarity index 98% rename from rllib/utils/deprecation.py rename to python/ray/_common/deprecation.py index 7f5dd0e78b8b..fb89526d87a9 100644 --- a/rllib/utils/deprecation.py +++ b/python/ray/_common/deprecation.py @@ -71,7 +71,7 @@ def Deprecated(old=None, *, new=None, help=None, error): .. 
testcode:: :skipif: True - from ray.rllib.utils.deprecation import Deprecated + from ray._common.deprecation import Deprecated # Deprecated class: Patches the constructor to warn if the class is # used. @Deprecated(new="NewAndMuchCoolerClass", error=False) diff --git a/python/ray/_private/ray_logging/filters.py b/python/ray/_common/filters.py similarity index 100% rename from python/ray/_private/ray_logging/filters.py rename to python/ray/_common/filters.py diff --git a/python/ray/_private/ray_logging/formatters.py b/python/ray/_common/formatters.py similarity index 98% rename from python/ray/_private/ray_logging/formatters.py rename to python/ray/_common/formatters.py index 9c1cc8a51e40..bf67a309bb5c 100644 --- a/python/ray/_private/ray_logging/formatters.py +++ b/python/ray/_common/formatters.py @@ -63,6 +63,7 @@ def generate_record_format_attrs( LogKey.MESSAGE.value: record.getMessage(), LogKey.FILENAME.value: record.filename, LogKey.LINENO.value: record.lineno, + LogKey.PROCESS.value: record.process, } ) if record.exc_info: diff --git a/python/ray/_common/network_utils.py b/python/ray/_common/network_utils.py index 8b3c510a09e7..b97eb55042d8 100644 --- a/python/ray/_common/network_utils.py +++ b/python/ray/_common/network_utils.py @@ -1,5 +1,7 @@ from typing import Optional, Tuple, Union +from ray._raylet import build_address as _build_address, parse_address as _parse_address + def parse_address(address: str) -> Optional[Tuple[str, str]]: """Parse a network address string into host and port. @@ -10,21 +12,7 @@ def parse_address(address: str) -> Optional[Tuple[str, str]]: Returns: Tuple with (host, port) if port found, None if no colon separator. """ - pos = address.rfind(":") - if pos == -1: - return None - - host = address[:pos] - port = address[pos + 1 :] - - if ":" in host: - if host.startswith("[") and host.endswith("]"): - host = host[1:-1] - else: - # Invalid IPv6 (missing brackets) or colon is part of the address, not a host:port split. 
- return None - - return (host, port) + return _parse_address(address) def build_address(host: str, port: Union[int, str]) -> str: @@ -37,8 +25,16 @@ def build_address(host: str, port: Union[int, str]) -> str: Returns: Formatted address string (e.g., "localhost:8000" or "[::1]:8000"). """ - if host is not None and ":" in host: - # IPv6 address - return f"[{host}]:{port}" - # IPv4 address or hostname - return f"{host}:{port}" + return _build_address(host, port) + + +def is_localhost(host: str) -> bool: + """Check if the given host string represents a localhost address. + + Args: + host: The hostname or IP address to check. + + Returns: + True if the host is a localhost address, False otherwise. + """ + return host in ("localhost", "127.0.0.1", "::1") diff --git a/python/ray/_common/ray_option_utils.py b/python/ray/_common/ray_option_utils.py index 6ff64108f795..8d56f0d0f675 100644 --- a/python/ray/_common/ray_option_utils.py +++ b/python/ray/_common/ray_option_utils.py @@ -222,7 +222,7 @@ def issubclass_safe(obj: Any, cls_: type) -> bool: _actor_only_options = { "concurrency_groups": Option((list, dict, type(None))), - "enable_tensor_transport": Option(bool, default_value=False), + "enable_tensor_transport": Option((bool, type(None)), default_value=None), "lifetime": Option( (str, type(None)), lambda x: None diff --git a/python/ray/_common/test_utils.py b/python/ray/_common/test_utils.py index 957a73be0158..c5e6020b1c98 100644 --- a/python/ray/_common/test_utils.py +++ b/python/ray/_common/test_utils.py @@ -6,21 +6,20 @@ """ import asyncio -from collections.abc import Awaitable -from contextlib import contextmanager import inspect import os import time import traceback -from typing import Any, Callable, Dict, Iterator, List, Optional, Set import uuid +from collections.abc import Awaitable +from contextlib import contextmanager from enum import Enum - +from typing import Any, Callable, Dict, Iterator, List, Optional, Set import ray -from ray._common.network_utils 
import build_address -import ray._private.utils import ray._common.usage.usage_lib as ray_usage_lib +import ray._private.utils +from ray._common.network_utils import build_address @ray.remote(num_cpus=0) diff --git a/python/ray/_common/tests/BUILD b/python/ray/_common/tests/BUILD.bazel similarity index 89% rename from python/ray/_common/tests/BUILD rename to python/ray/_common/tests/BUILD.bazel index 8535dd58cb07..d9aba47b5eb9 100644 --- a/python/ray/_common/tests/BUILD +++ b/python/ray/_common/tests/BUILD.bazel @@ -14,6 +14,9 @@ py_library( py_test_module_list( size = "small", files = [ + "test_deprecation.py", + "test_filters.py", + "test_formatters.py", "test_network_utils.py", "test_ray_option_utils.py", "test_signal_semaphore_utils.py", @@ -26,6 +29,7 @@ py_test_module_list( "team:core", ], deps = [ + ":conftest", "//:ray_lib", ], ) diff --git a/python/ray/_common/tests/conftest.py b/python/ray/_common/tests/conftest.py new file mode 100644 index 000000000000..07810c3694dc --- /dev/null +++ b/python/ray/_common/tests/conftest.py @@ -0,0 +1,2 @@ +# Imports for filters and formatters tests +pytest_plugins = ["ray.tests.conftest"] diff --git a/python/ray/_common/tests/test_deprecation.py b/python/ray/_common/tests/test_deprecation.py new file mode 100644 index 000000000000..a6d9d7a13f54 --- /dev/null +++ b/python/ray/_common/tests/test_deprecation.py @@ -0,0 +1,97 @@ +import sys +from unittest.mock import patch + +import pytest + +from ray._common.deprecation import ( + DEPRECATED_VALUE, + Deprecated, + deprecation_warning, +) + + +def test_deprecation_warning_warn(): + with patch("ray._common.deprecation.logger.warning") as mock_warning: + deprecation_warning("old_feature", "new_feature") + + mock_warning.assert_called_once() + args, _ = mock_warning.call_args + assert ( + "DeprecationWarning: `old_feature` has been deprecated. Use `new_feature` instead." 
+ in args[0] + ) + + +def test_deprecation_warning_error(): + with pytest.raises(ValueError) as excinfo: + deprecation_warning("old_feature", error=True) + assert "`old_feature` has been deprecated." in str(excinfo.value) + + +def test_deprecated_decorator_function(): + with patch("ray._common.deprecation.logger.warning") as mock_warning, patch( + "ray._common.deprecation.log_once" + ) as mock_log_once: + mock_log_once.return_value = True + + @Deprecated(old="old_func", new="new_func", error=False) + def old_func(): + return "result" + + result = old_func() + assert result == "result" + mock_warning.assert_called_once() + + +def test_deprecated_decorator_class(): + with patch("ray._common.deprecation.logger.warning") as mock_warning, patch( + "ray._common.deprecation.log_once" + ) as mock_log_once: + mock_log_once.return_value = True + + @Deprecated(old="OldClass", new="NewClass", error=False) + class OldClass: + pass + + instance = OldClass() + assert isinstance(instance, OldClass) + mock_warning.assert_called_once() + + +def test_deprecated_decorator_method(): + with patch("ray._common.deprecation.logger.warning") as mock_warning, patch( + "ray._common.deprecation.log_once" + ) as mock_log_once: + mock_log_once.return_value = True + + class MyClass: + @Deprecated(old="old_method", new="new_method", error=False) + def old_method(self): + return "method_result" + + instance = MyClass() + result = instance.old_method() + assert result == "method_result" + mock_warning.assert_called_once() + + +def test_deprecated_decorator_error(): + with patch("ray._common.deprecation.log_once") as mock_log_once: + mock_log_once.return_value = True + + @Deprecated(old="old_func", error=True) + def old_func(): + pass + + with pytest.raises(ValueError): + old_func() + + +def test_deprecated_value_constant(): + assert ( + DEPRECATED_VALUE == -1 + ), f"DEPRECATED_VALUE should be -1, but got {DEPRECATED_VALUE}" + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", __file__])) 
diff --git a/python/ray/_common/tests/test_filters.py b/python/ray/_common/tests/test_filters.py new file mode 100644 index 000000000000..330cc69a1696 --- /dev/null +++ b/python/ray/_common/tests/test_filters.py @@ -0,0 +1,106 @@ +import logging +import logging.config +import sys + +import pytest + +import ray +from ray._common.filters import CoreContextFilter + + +class TestCoreContextFilter: + def test_driver_process(self, shutdown_only): + log_context = ["job_id", "worker_id", "node_id"] + filter = CoreContextFilter() + record = logging.makeLogRecord({}) + assert filter.filter(record) + # Ray is not initialized so no context except PID which should be available + for attr in log_context: + assert not hasattr(record, attr) + # PID should be available even when Ray is not initialized + assert hasattr(record, "process") + assert hasattr(record, "_ray_timestamp_ns") + + ray.init() + record = logging.makeLogRecord({}) + assert filter.filter(record) + runtime_context = ray.get_runtime_context() + expected_values = { + "job_id": runtime_context.get_job_id(), + "worker_id": runtime_context.get_worker_id(), + "node_id": runtime_context.get_node_id(), + "process": record.process, + } + for attr in log_context: + assert hasattr(record, attr) + assert getattr(record, attr) == expected_values[attr] + # This is not a worker process, so actor_id and task_id should not exist. 
+ for attr in ["actor_id", "task_id"]: + assert not hasattr(record, attr) + assert hasattr(record, "_ray_timestamp_ns") + + def test_task_process(self, shutdown_only): + @ray.remote + def f(): + filter = CoreContextFilter() + record = logging.makeLogRecord({}) + assert filter.filter(record) + should_exist = ["job_id", "worker_id", "node_id", "task_id", "process"] + runtime_context = ray.get_runtime_context() + expected_values = { + "job_id": runtime_context.get_job_id(), + "worker_id": runtime_context.get_worker_id(), + "node_id": runtime_context.get_node_id(), + "task_id": runtime_context.get_task_id(), + "task_name": runtime_context.get_task_name(), + "task_func_name": runtime_context.get_task_function_name(), + "process": record.process, + } + for attr in should_exist: + assert hasattr(record, attr) + assert getattr(record, attr) == expected_values[attr] + assert not hasattr(record, "actor_id") + assert not hasattr(record, "actor_name") + assert hasattr(record, "_ray_timestamp_ns") + + obj_ref = f.remote() + ray.get(obj_ref) + + def test_actor_process(self, shutdown_only): + @ray.remote + class A: + def f(self): + filter = CoreContextFilter() + record = logging.makeLogRecord({}) + assert filter.filter(record) + should_exist = [ + "job_id", + "worker_id", + "node_id", + "actor_id", + "task_id", + "process", + ] + runtime_context = ray.get_runtime_context() + expected_values = { + "job_id": runtime_context.get_job_id(), + "worker_id": runtime_context.get_worker_id(), + "node_id": runtime_context.get_node_id(), + "actor_id": runtime_context.get_actor_id(), + "actor_name": runtime_context.get_actor_name(), + "task_id": runtime_context.get_task_id(), + "task_name": runtime_context.get_task_name(), + "task_func_name": runtime_context.get_task_function_name(), + "process": record.process, + } + for attr in should_exist: + assert hasattr(record, attr) + assert getattr(record, attr) == expected_values[attr] + assert hasattr(record, "_ray_timestamp_ns") + + actor = 
A.remote() + ray.get(actor.f.remote()) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/_common/tests/test_formatters.py b/python/ray/_common/tests/test_formatters.py new file mode 100644 index 000000000000..f81dcdffe84d --- /dev/null +++ b/python/ray/_common/tests/test_formatters.py @@ -0,0 +1,162 @@ +import json +import logging +import logging.config +import sys + +import pytest + +from ray._common.formatters import JSONFormatter, TextFormatter + + +class TestJSONFormatter: + def test_empty_record(self, shutdown_only): + formatter = JSONFormatter() + record = logging.makeLogRecord({}) + formatted = formatter.format(record) + + record_dict = json.loads(formatted) + should_exist = [ + "process", + "asctime", + "levelname", + "message", + "filename", + "lineno", + "timestamp_ns", + ] + for key in should_exist: + assert key in record_dict + assert len(record_dict) == len(should_exist) + assert "exc_text" not in record_dict + + def test_record_with_exception(self, shutdown_only): + formatter = JSONFormatter() + record = logging.makeLogRecord({}) + try: + raise ValueError("test") + except ValueError: + record.exc_info = sys.exc_info() + formatted = formatter.format(record) + record_dict = json.loads(formatted) + should_exist = [ + "process", + "asctime", + "levelname", + "message", + "filename", + "lineno", + "exc_text", + "timestamp_ns", + ] + for key in should_exist: + assert key in record_dict + assert "Traceback (most recent call last):" in record_dict["exc_text"] + assert len(record_dict) == len(should_exist) + + def test_record_with_user_provided_context(self, shutdown_only): + formatter = JSONFormatter() + record = logging.makeLogRecord({"user": "ray"}) + formatted = formatter.format(record) + record_dict = json.loads(formatted) + should_exist = [ + "process", + "asctime", + "levelname", + "message", + "filename", + "lineno", + "user", + "timestamp_ns", + ] + for key in should_exist: + assert key in record_dict + 
assert record_dict["user"] == "ray" + assert len(record_dict) == len(should_exist) + assert "exc_text" not in record_dict + + def test_record_with_flatten_keys_invalid_value(self, shutdown_only): + formatter = JSONFormatter() + record = logging.makeLogRecord({"ray_serve_extra_fields": "not_a_dict"}) + with pytest.raises(ValueError): + formatter.format(record) + + def test_record_with_flatten_keys_valid_dict(self, shutdown_only): + formatter = JSONFormatter() + record = logging.makeLogRecord( + {"ray_serve_extra_fields": {"key1": "value1", "key2": 2}} + ) + formatted = formatter.format(record) + record_dict = json.loads(formatted) + should_exist = [ + "process", + "asctime", + "levelname", + "message", + "filename", + "lineno", + "key1", + "key2", + "timestamp_ns", + ] + for key in should_exist: + assert key in record_dict + assert record_dict["key1"] == "value1", record_dict + assert record_dict["key2"] == 2 + assert "ray_serve_extra_fields" not in record_dict + assert len(record_dict) == len(should_exist) + assert "exc_text" not in record_dict + + def test_record_with_valid_additional_log_standard_attrs(self, shutdown_only): + formatter = JSONFormatter() + formatter.set_additional_log_standard_attrs(["name"]) + record = logging.makeLogRecord({}) + formatted = formatter.format(record) + + record_dict = json.loads(formatted) + should_exist = [ + "process", + "asctime", + "levelname", + "message", + "filename", + "lineno", + "timestamp_ns", + "name", + ] + for key in should_exist: + assert key in record_dict + assert len(record_dict) == len(should_exist) + + +class TestTextFormatter: + def test_record_with_user_provided_context(self): + formatter = TextFormatter() + record = logging.makeLogRecord({"user": "ray"}) + formatted = formatter.format(record) + assert "user=ray" in formatted + + def test_record_with_exception(self): + formatter = TextFormatter() + record = logging.LogRecord( + name="test_logger", + level=logging.INFO, + pathname="test.py", + lineno=1000, + 
msg="Test message", + args=None, + exc_info=None, + ) + formatted = formatter.format(record) + for s in ["INFO", "Test message", "test.py:1000", "--"]: + assert s in formatted + + def test_record_with_valid_additional_log_standard_attrs(self, shutdown_only): + formatter = TextFormatter() + formatter.set_additional_log_standard_attrs(["name"]) + record = logging.makeLogRecord({}) + formatted = formatter.format(record) + assert "name=" in formatted + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/_common/tests/test_network_utils.py b/python/ray/_common/tests/test_network_utils.py index 2eee263f9a29..8aac0e1be420 100644 --- a/python/ray/_common/tests/test_network_utils.py +++ b/python/ray/_common/tests/test_network_utils.py @@ -1,70 +1,16 @@ -import pytest import sys -from ray._common.network_utils import parse_address, build_address - - -class TestBuildAddress: - """Test cases for build_address function, matching C++ tests exactly.""" - - @pytest.mark.parametrize( - "host,port,expected", - [ - # IPv4 - ("192.168.1.1", 8080, "192.168.1.1:8080"), - ("192.168.1.1", "8080", "192.168.1.1:8080"), - # IPv6 - ("::1", 8080, "[::1]:8080"), - ("::1", "8080", "[::1]:8080"), - ("2001:db8::1", 8080, "[2001:db8::1]:8080"), - ("2001:db8::1", "8080", "[2001:db8::1]:8080"), - # Hostname - ("localhost", 9000, "localhost:9000"), - ("localhost", "9000", "localhost:9000"), - ], - ) - def test_build_address(self, host, port, expected): - """Test building address strings from host and port.""" - result = build_address(host, port) - assert result == expected - +import pytest -class TestParseAddress: - """Test cases for parse_address function, matching C++ tests exactly.""" +from ray._common.network_utils import is_localhost - @pytest.mark.parametrize( - "address,expected", - [ - # IPv4 - ("192.168.1.1:8080", ("192.168.1.1", "8080")), - # IPv6:loopback address - ("[::1]:8080", ("::1", "8080")), - # IPv6 - ("[2001:db8::1]:8080", ("2001:db8::1", 
"8080")), - # Hostname:Port - ("localhost:9000", ("localhost", "9000")), - ], - ) - def test_parse_valid_addresses(self, address, expected): - """Test parsing valid addresses.""" - result = parse_address(address) - assert result == expected - @pytest.mark.parametrize( - "address", - [ - # bare IP or hostname - # should return None when no port is found - "::1", - "2001:db8::1", - "192.168.1.1", - "localhost", - ], - ) - def test_parse_bare_addresses(self, address): - """Test parsing bare addresses returns None.""" - result = parse_address(address) - assert result is None +def test_is_localhost(): + assert is_localhost("localhost") + assert is_localhost("127.0.0.1") + assert is_localhost("::1") + assert not is_localhost("8.8.8.8") + assert not is_localhost("2001:db8::1") if __name__ == "__main__": diff --git a/python/ray/_common/tests/test_ray_option_utils.py b/python/ray/_common/tests/test_ray_option_utils.py index 48d48f385927..5cf52057a1f7 100644 --- a/python/ray/_common/tests/test_ray_option_utils.py +++ b/python/ray/_common/tests/test_ray_option_utils.py @@ -1,20 +1,21 @@ -import pytest import re import sys from unittest.mock import patch -from ray.util.placement_group import PlacementGroup +import pytest + from ray._common.ray_option_utils import ( Option, + _check_deprecate_placement_group, _counting_option, - _validate_resource_quantity, _resource_option, + _validate_resource_quantity, _validate_resources, - validate_task_options, - validate_actor_options, update_options, - _check_deprecate_placement_group, + validate_actor_options, + validate_task_options, ) +from ray.util.placement_group import PlacementGroup class TestOptionValidation: diff --git a/python/ray/_common/tests/test_signal_semaphore_utils.py b/python/ray/_common/tests/test_signal_semaphore_utils.py index 3c798c783678..dec2a21800b5 100644 --- a/python/ray/_common/tests/test_signal_semaphore_utils.py +++ b/python/ray/_common/tests/test_signal_semaphore_utils.py @@ -5,13 +5,14 @@ and 
synchronization in Ray tests. """ -import pytest import sys -import ray -from ray._common.test_utils import SignalActor, Semaphore -from ray._common.test_utils import wait_for_condition import time +import pytest + +import ray +from ray._common.test_utils import Semaphore, SignalActor, wait_for_condition + @pytest.fixture(scope="module") def ray_init(): diff --git a/python/ray/_common/tests/test_signature.py b/python/ray/_common/tests/test_signature.py index 8e0173fc38e6..e4691eebeae7 100644 --- a/python/ray/_common/tests/test_signature.py +++ b/python/ray/_common/tests/test_signature.py @@ -6,18 +6,19 @@ """ import inspect -import pytest import sys from typing import Any, Optional from unittest.mock import Mock, patch +import pytest + from ray._common.signature import ( - get_signature, + DUMMY_TYPE, extract_signature, - validate_args, flatten_args, + get_signature, recover_args, - DUMMY_TYPE, + validate_args, ) diff --git a/python/ray/_common/tests/test_usage_stats.py b/python/ray/_common/tests/test_usage_stats.py index fc871a2e9638..99fa1b1d6cc7 100644 --- a/python/ray/_common/tests/test_usage_stats.py +++ b/python/ray/_common/tests/test_usage_stats.py @@ -2,34 +2,34 @@ import os import pathlib import sys -import time import threading +import time from dataclasses import asdict +from http.server import BaseHTTPRequestHandler, HTTPServer from pathlib import Path from unittest.mock import Mock, patch -from ray._common.test_utils import wait_for_condition -from ray._raylet import GcsClient -from ray.tests.conftest import * # noqa: F403 -import requests import pytest +import requests from jsonschema import validate -from http.server import BaseHTTPRequestHandler, HTTPServer import ray import ray._common.usage.usage_constants as usage_constants import ray._common.usage.usage_lib as ray_usage_lib +from ray._common.test_utils import wait_for_condition +from ray._common.usage.usage_lib import ClusterConfigToReport, UsageStatsEnabledness +from ray._private.accelerators 
import NvidiaGPUAcceleratorManager from ray._private.test_utils import ( format_web_url, run_string_as_driver, wait_until_server_available, ) -from ray._common.usage.usage_lib import ClusterConfigToReport, UsageStatsEnabledness +from ray._raylet import GcsClient from ray.autoscaler._private.cli_logger import cli_logger +from ray.tests.conftest import * # noqa: F403 from ray.util.placement_group import ( placement_group, ) -from ray._private.accelerators import NvidiaGPUAcceleratorManager schema = { "$schema": "http://json-schema.org/draft-07/schema#", diff --git a/python/ray/_common/tests/test_utils.py b/python/ray/_common/tests/test_utils.py index c3d437bb7586..491781924df9 100644 --- a/python/ray/_common/tests/test_utils.py +++ b/python/ray/_common/tests/test_utils.py @@ -6,20 +6,20 @@ """ import asyncio -import warnings -import sys import os +import sys import tempfile +import warnings import pytest from ray._common.utils import ( + _BACKGROUND_TASKS, get_or_create_event_loop, + get_system_memory, + load_class, run_background_task, - _BACKGROUND_TASKS, try_to_create_directory, - load_class, - get_system_memory, ) # Optional imports for testing diff --git a/python/ray/_common/tests/test_wait_for_condition.py b/python/ray/_common/tests/test_wait_for_condition.py index 52cb8c9cd2b0..045817ca1aa2 100644 --- a/python/ray/_common/tests/test_wait_for_condition.py +++ b/python/ray/_common/tests/test_wait_for_condition.py @@ -1,9 +1,10 @@ import asyncio -import time import sys +import time + import pytest -from ray._common.test_utils import wait_for_condition, async_wait_for_condition +from ray._common.test_utils import async_wait_for_condition, wait_for_condition class TestWaitForCondition: diff --git a/python/ray/_common/usage/usage_lib.py b/python/ray/_common/usage/usage_lib.py index c2e7f2345f33..cb1536721186 100644 --- a/python/ray/_common/usage/usage_lib.py +++ b/python/ray/_common/usage/usage_lib.py @@ -57,8 +57,8 @@ import yaml import ray -import 
ray._private.ray_constants as ray_constants import ray._common.usage.usage_constants as usage_constant +import ray._private.ray_constants as ray_constants from ray._raylet import GcsClient from ray.core.generated import gcs_pb2, usage_pb2 from ray.experimental.internal_kv import ( diff --git a/python/ray/_common/utils.py b/python/ray/_common/utils.py index 103c40397801..28a05a356549 100644 --- a/python/ray/_common/utils.py +++ b/python/ray/_common/utils.py @@ -3,15 +3,16 @@ import errno import importlib import inspect -from inspect import signature import os -import psutil import random import string import sys import tempfile +from inspect import signature from typing import Any, Coroutine, Dict, Optional +import psutil + def import_attr(full_path: str, *, reload_module: bool = False): """Given a full import path to a module attr, return the imported attr. diff --git a/python/ray/_private/BUILD b/python/ray/_private/BUILD.bazel similarity index 100% rename from python/ray/_private/BUILD rename to python/ray/_private/BUILD.bazel diff --git a/python/ray/_private/accelerators/__init__.py b/python/ray/_private/accelerators/__init__.py index 003074ad71fb..4cb14fef7956 100644 --- a/python/ray/_private/accelerators/__init__.py +++ b/python/ray/_private/accelerators/__init__.py @@ -1,6 +1,9 @@ from typing import Optional, Set -from ray._private.accelerators.accelerator import AcceleratorManager +from ray._private.accelerators.accelerator import ( + RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO_ENV_VAR, + AcceleratorManager, +) from ray._private.accelerators.amd_gpu import AMDGPUAcceleratorManager from ray._private.accelerators.hpu import HPUAcceleratorManager from ray._private.accelerators.intel_gpu import IntelGPUAcceleratorManager @@ -77,4 +80,5 @@ def get_accelerator_manager_for_resource( "get_all_accelerator_managers", "get_all_accelerator_resource_names", "get_accelerator_manager_for_resource", + "RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO_ENV_VAR", ] diff --git 
a/python/ray/_private/accelerators/accelerator.py b/python/ray/_private/accelerators/accelerator.py index a2cd98565fd2..4b5332cb8a07 100644 --- a/python/ray/_private/accelerators/accelerator.py +++ b/python/ray/_private/accelerators/accelerator.py @@ -1,6 +1,19 @@ from abc import ABC, abstractmethod from typing import Dict, List, Optional, Tuple +# https://github.com/ray-project/ray/issues/54868 +# In the future, ray will avoid overriding the accelerator ids environment variables +# when the number of accelerators is zero. +# For example, when this environment variable is set, if a user sets `num_gpus=0` +# in the `ray.init()` call, the environment variable `CUDA_VISIBLE_DEVICES` will +# not be set to an empty string. +# +# This environment variable is used to disable this behavior temporarily. +# And to avoid breaking changes, this environment variable is set to True by default +# to follow the previous behavior. +# +RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO_ENV_VAR = "RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO" + class AcceleratorManager(ABC): """This class contains all the functions needed for supporting diff --git a/python/ray/_private/accelerators/amd_gpu.py b/python/ray/_private/accelerators/amd_gpu.py index 662e858c71b9..0e820b28a666 100644 --- a/python/ray/_private/accelerators/amd_gpu.py +++ b/python/ray/_private/accelerators/amd_gpu.py @@ -11,6 +11,7 @@ NOSET_HIP_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES" amd_product_dict = { + "0x66a1": "AMD-Instinct-MI50", "0x738c": "AMD-Instinct-MI100", "0x7408": "AMD-Instinct-MI250X", "0x740c": "AMD-Instinct-MI250X-MI250", @@ -20,6 +21,8 @@ "0x74a2": "AMD-Instinct-MI308X-OAM", "0x74a9": "AMD-Instinct-MI300X-HF", "0x74a5": "AMD-Instinct-MI325X-OAM", + "0x75a0": "AMD-Instinct-MI350X-OAM", + "0x75a3": "AMD-Instinct-MI355X-OAM", "0x6798": "AMD-Radeon-R9-200-HD-7900", "0x6799": "AMD-Radeon-HD-7900", "0x679A": "AMD-Radeon-HD-7900", diff --git a/python/ray/_private/accelerators/tpu.py 
b/python/ray/_private/accelerators/tpu.py index c6df2c858779..83da22475879 100644 --- a/python/ray/_private/accelerators/tpu.py +++ b/python/ray/_private/accelerators/tpu.py @@ -9,6 +9,7 @@ import ray from ray._private.accelerators.accelerator import AcceleratorManager +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy logger = logging.getLogger(__name__) @@ -110,6 +111,91 @@ def get_tpu_cores_per_chip(accelerator_type: str) -> int: return DEFAULT_TPU_NUM_CORES_PER_CHIP +def infer_tpu_pod_type_from_topology( + topology: str, accelerator_type: str +) -> Optional[str]: + """Infer the TPU pod type (e.g. v4-32) from topology and accelerator type.""" + try: + num_chips = 1 + for value in topology.strip().lower().split("x"): + num_chips *= int(value) + generation = accelerator_type.lower().replace("tpu-", "") + return f"{generation}-{num_chips}" + except Exception as e: + logger.warning( + f"Failed to infer pod type from topology {topology} and type {accelerator_type}: {e}" + ) + return None + + +def fetch_tpu_slice_name_from_pg(pg): + @ray.remote(num_cpus=0) + def _get_tpu_slice_name(): + return TPUAcceleratorManager.get_current_node_tpu_name() + + tpu_name_ref = _get_tpu_slice_name.options( + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=pg, placement_group_bundle_index=0 + ) + ).remote() + + return ray.get(tpu_name_ref) + + +def reserve_tpu_slice( + topology: str, + accelerator_type: str, +) -> Optional[str]: + """Reserves a TPU slice using its head resource and returns the slice name. + This enables gang scheduling of training workers with multi-host TPUs. + This is used by JaxTrainer with TPUs in Ray Train. + + Args: + topology: The TPU topology string (e.g. "2x2x2"). + accelerator_type: The accelerator type of the node (e.g. "TPU-V4"). + + Returns: + A string representing a unique TPU slice name. 
+ """ + pod_type = infer_tpu_pod_type_from_topology(topology, accelerator_type) + if pod_type is None: + return None + + # Reserve a slice by creating a placement group on the TPU head. + head_label_selector = { + "ray.io/tpu-worker-id": "0", + "ray.io/tpu-pod-type": pod_type, + } + head_placement_group = ray.util.placement_group( + bundles=[{f"TPU-{pod_type}-head": 1}], + bundle_label_selector=[head_label_selector], + ) + + logger.debug("Waiting to reserve multi-host slice head.") + timeout = 100 + ready, _ = ray.wait([head_placement_group.ready()], timeout=timeout) + + if not ready: + raise TimeoutError( + "Failed to reserve TPU head for slice with shape: {}. " + "Ensure your cluster has sufficient resources. Requesting TPU " + "head node with labels: {}. Current resources: {}".format( + pod_type, head_label_selector, ray.available_resources() + ) + ) + + # Retrieve the unique slice ID. + slice_name = fetch_tpu_slice_name_from_pg(head_placement_group) + if slice_name is None: + raise RuntimeError( + "Failed to retrieve TPU slice name after reserving head placement group. " + "Ensure that TPU slice metadata is available and correctly configured on multi-host nodes." + ) + + # TODO: return both the slice name and reference to the PG reservation. 
+ return slice_name + + class TPUAcceleratorManager(AcceleratorManager): """Google TPU accelerators.""" diff --git a/python/ray/_private/custom_types.py b/python/ray/_private/custom_types.py index 2237972c3b6b..9b20312ec3c1 100644 --- a/python/ray/_private/custom_types.py +++ b/python/ray/_private/custom_types.py @@ -125,6 +125,7 @@ class TensorTransportEnum(Enum): OBJECT_STORE = TensorTransport.Value("OBJECT_STORE") NCCL = TensorTransport.Value("NCCL") GLOO = TensorTransport.Value("GLOO") + NIXL = TensorTransport.Value("NIXL") @classmethod def from_str(cls, name: str) -> "TensorTransportEnum": diff --git a/python/ray/_private/event/export_event_logger.py b/python/ray/_private/event/export_event_logger.py index 4d47c68fb833..4e77ca1421ce 100644 --- a/python/ray/_private/event/export_event_logger.py +++ b/python/ray/_private/event/export_event_logger.py @@ -13,6 +13,9 @@ from ray.core.generated.export_dataset_metadata_pb2 import ( ExportDatasetMetadata, ) +from ray.core.generated.export_dataset_operator_event_pb2 import ( + ExportDatasetOperatorEventData, +) from ray.core.generated.export_event_pb2 import ExportEvent from ray.core.generated.export_submission_job_event_pb2 import ( ExportSubmissionJobEventData, @@ -31,6 +34,7 @@ ExportTrainRunEventData, ExportTrainRunAttemptEventData, ExportDatasetMetadata, + ExportDatasetOperatorEventData, ] @@ -43,6 +47,7 @@ class EventLogType(Enum): TRAIN_STATE: Export events related to training state, supporting train run and attempt events. SUBMISSION_JOB: Export events related to job submissions. DATASET_METADATA: Export events related to dataset metadata. + DATASET_OPERATOR_EVENT: Export events related to Ray Data operator. 
""" TRAIN_STATE = ( @@ -51,6 +56,10 @@ class EventLogType(Enum): ) SUBMISSION_JOB = ("EXPORT_SUBMISSION_JOB", {ExportSubmissionJobEventData}) DATASET_METADATA = ("EXPORT_DATASET_METADATA", {ExportDatasetMetadata}) + DATASET_OPERATOR_EVENT = ( + "EXPORT_DATASET_OPERATOR_EVENT", + {ExportDatasetOperatorEventData}, + ) def __init__(self, log_type_name: str, event_types: set[ExportEventDataType]): """Initialize an EventLogType enum value. @@ -119,6 +128,9 @@ def _create_export_event(self, event_data: ExportEventDataType) -> ExportEvent: elif isinstance(event_data, ExportDatasetMetadata): event.dataset_metadata.CopyFrom(event_data) event.source_type = ExportEvent.SourceType.EXPORT_DATASET_METADATA + elif isinstance(event_data, ExportDatasetOperatorEventData): + event.dataset_operator_event_data.CopyFrom(event_data) + event.source_type = ExportEvent.SourceType.EXPORT_DATASET_OPERATOR_EVENT else: raise TypeError(f"Invalid event_data type: {type(event_data)}") if not self.log_type.supports_event_type(event_data): diff --git a/python/ray/_private/function_manager.py b/python/ray/_private/function_manager.py index 854a50249d0a..76d98d536d62 100644 --- a/python/ray/_private/function_manager.py +++ b/python/ray/_private/function_manager.py @@ -600,7 +600,11 @@ def _create_fake_actor_class( self, actor_class_name, actor_method_names, traceback_str ): class TemporaryActor: - pass + async def __dummy_method(self): + """Dummy method for this fake actor class to work for async actors. 
+ Without this method, this temporary actor class fails to initialize + if the original actor class was async.""" + pass def temporary_actor_method(*args, **kwargs): raise RuntimeError( diff --git a/python/ray/_private/gc_collect_manager.py b/python/ray/_private/gc_collect_manager.py new file mode 100644 index 000000000000..d9bb723b88b0 --- /dev/null +++ b/python/ray/_private/gc_collect_manager.py @@ -0,0 +1,65 @@ +import gc +import logging +import threading +import time +from typing import Callable, Optional + +logger = logging.getLogger(__name__) + + +class PythonGCThread(threading.Thread): + """A background thread that triggers Python garbage collection. + + This thread waits for GC events from CoreWorker and triggers `gc.collect()` when + requested, ensuring that collections are spaced out by at least + `min_interval_s` seconds.""" + + def __init__( + self, *, min_interval_s: int = 5, gc_collect_func: Optional[Callable] = None + ): + logger.debug("Starting Python GC thread") + super().__init__(name="PythonGCThread", daemon=True) + self._should_exit = False + self._last_gc_time = float("-inf") + self._min_gc_interval = min_interval_s + self._gc_event = threading.Event() + # Set the gc_collect_func for UT, defaulting to gc.collect if None + self._gc_collect_func = gc_collect_func or gc.collect + + def trigger_gc(self) -> None: + self._gc_event.set() + + def run(self): + while not self._should_exit: + self._gc_event.wait() + self._gc_event.clear() + + if self._should_exit: + break + + time_since_last_gc = time.monotonic() - self._last_gc_time + if time_since_last_gc < self._min_gc_interval: + logger.debug( + f"Skipping GC, only {time_since_last_gc:.2f}s since last GC" + ) + continue + + try: + start = time.monotonic() + num_freed = self._gc_collect_func() + self._last_gc_time = time.monotonic() + if num_freed > 0: + logger.debug( + "gc.collect() freed {} refs in {} seconds".format( + num_freed, self._last_gc_time - start + ) + ) + except Exception as e: + 
logger.error(f"Error during GC: {e}") + self._last_gc_time = time.monotonic() + + def stop(self): + logger.debug("Stopping Python GC thread") + self._should_exit = True + self._gc_event.set() + self.join() diff --git a/python/ray/_private/metrics_agent.py b/python/ray/_private/metrics_agent.py index 1de034dfd49c..6116687e391a 100644 --- a/python/ray/_private/metrics_agent.py +++ b/python/ray/_private/metrics_agent.py @@ -775,8 +775,18 @@ def __init__(self, gcs_address, temp_dir): ray._private.state.state._initialize_global_state(gcs_client_options) self.temp_dir = temp_dir self.default_service_discovery_flush_period = 5 + + # The last service discovery content that PrometheusServiceDiscoveryWriter has seen + self.latest_service_discovery_content = [] + self._content_lock = threading.RLock() + super().__init__() + def get_latest_service_discovery_content(self): + """Return the latest stored service discovery content.""" + with self._content_lock: + return self.latest_service_discovery_content + def get_file_discovery_content(self): """Return the content for Prometheus service discovery.""" nodes = ray.nodes() @@ -792,9 +802,10 @@ def get_file_discovery_content(self): dashboard_addr = gcs_client.internal_kv_get(b"DashboardMetricsAddress", None) if dashboard_addr: metrics_export_addresses.append(dashboard_addr.decode("utf-8")) - return json.dumps( - [{"labels": {"job": "ray"}, "targets": metrics_export_addresses}] - ) + content = [{"labels": {"job": "ray"}, "targets": metrics_export_addresses}] + with self._content_lock: + self.latest_service_discovery_content = content + return json.dumps(content) def write(self): # Write a file based on https://prometheus.io/docs/guides/file-sd/ diff --git a/python/ray/_private/node.py b/python/ray/_private/node.py index b11a4664400a..49a50e2ff4bb 100644 --- a/python/ray/_private/node.py +++ b/python/ray/_private/node.py @@ -211,28 +211,13 @@ def __init__( node_ip_address = ray.util.get_node_ip_address() assert node_ip_address is 
not None - ray_params.update_if_absent( - node_ip_address=node_ip_address, raylet_ip_address=node_ip_address - ) + ray_params.update_if_absent(node_ip_address=node_ip_address) self._node_ip_address = node_ip_address if not connect_only: ray._private.services.write_node_ip_address( self.get_session_dir_path(), node_ip_address ) - if ray_params.raylet_ip_address: - raylet_ip_address = ray_params.raylet_ip_address - else: - raylet_ip_address = node_ip_address - - if raylet_ip_address != node_ip_address and (not connect_only or head): - raise ValueError( - "The raylet IP address should only be different than the node " - "IP address when connecting to an existing raylet; i.e., when " - "head=False and connect_only=True." - ) - self._raylet_ip_address = raylet_ip_address - self._object_spilling_config = self._get_object_spilling_config() logger.debug( f"Starting node with object spilling config: {self._object_spilling_config}" @@ -272,7 +257,7 @@ def __init__( # from Redis or GCS. node_info = ray._private.services.get_node_to_connect_for_driver( self.gcs_address, - self._raylet_ip_address, + self._node_ip_address, ) self._plasma_store_socket_name = node_info["object_store_socket_name"] self._raylet_socket_name = node_info["raylet_socket_name"] @@ -553,7 +538,7 @@ def node_id(self): @property def session_name(self): - """Get the session name (cluster ID).""" + """Get the current Ray session name.""" return self._session_name @property @@ -561,11 +546,6 @@ def node_ip_address(self): """Get the IP address of this node.""" return self._node_ip_address - @property - def raylet_ip_address(self): - """Get the IP address of the raylet that this node connects to.""" - return self._raylet_ip_address - @property def address(self): """Get the address for bootstrapping, e.g. 
the address to pass to @@ -633,7 +613,7 @@ def runtime_env_agent_port(self): @property def runtime_env_agent_address(self): """Get the address that exposes runtime env agent as http""" - return f"http://{build_address(self._raylet_ip_address, self._runtime_env_agent_port)}" + return f"http://{build_address(self._node_ip_address, self._runtime_env_agent_port)}" @property def dashboard_agent_listen_port(self): @@ -653,7 +633,6 @@ def address_info(self): """Get a dictionary of addresses.""" return { "node_ip_address": self._node_ip_address, - "raylet_ip_address": self._raylet_ip_address, "redis_address": self.redis_address, "object_store_address": self._plasma_store_socket_name, "raylet_socket_name": self._raylet_socket_name, @@ -1443,10 +1422,11 @@ def start_ray_processes(self): if self.resource_isolation_config.is_enabled(): self.resource_isolation_config.add_object_store_memory(object_store_memory) - self.start_raylet(plasma_directory, fallback_directory, object_store_memory) if self._ray_params.include_log_monitor: self.start_log_monitor() + self.start_raylet(plasma_directory, fallback_directory, object_store_memory) + def _kill_process_type( self, process_type, diff --git a/python/ray/_private/parameter.py b/python/ray/_private/parameter.py index 7f415c391b0d..60ab5ab9aa81 100644 --- a/python/ray/_private/parameter.py +++ b/python/ray/_private/parameter.py @@ -36,8 +36,6 @@ class RayParams: node_manager_port: The port to use for the node manager. gcs_server_port: The port to use for the GCS server. node_ip_address: The IP address of the node that we are on. - raylet_ip_address: The IP address of the raylet that this node - connects to. min_worker_port: The lowest port number that workers will bind on. If not set or set to 0, random ports will be chosen. max_worker_port: The highest port number that workers will bind @@ -114,7 +112,7 @@ class RayParams: worker available externally to the node it is running on. This will bind on 0.0.0.0 instead of localhost. 
env_vars: Override environment variables for the raylet. - session_name: The name of the session of the ray cluster. + session_name: The current Ray session name. webui: The url of the UI. cluster_id: The cluster ID in hex string. resource_isolation_config: settings for cgroupv2 based isolation of ray @@ -138,7 +136,6 @@ def __init__( gcs_server_port: Optional[int] = None, node_ip_address: Optional[str] = None, node_name: Optional[str] = None, - raylet_ip_address: Optional[str] = None, min_worker_port: Optional[int] = None, max_worker_port: Optional[int] = None, worker_port_list: Optional[List[int]] = None, @@ -196,7 +193,6 @@ def __init__( self.gcs_server_port = gcs_server_port self.node_ip_address = node_ip_address self.node_name = node_name - self.raylet_ip_address = raylet_ip_address self.min_worker_port = min_worker_port self.max_worker_port = max_worker_port self.worker_port_list = worker_port_list diff --git a/python/ray/_private/protobuf_compat.py b/python/ray/_private/protobuf_compat.py index 66971d8812d9..01256a5a82c3 100644 --- a/python/ray/_private/protobuf_compat.py +++ b/python/ray/_private/protobuf_compat.py @@ -1,6 +1,6 @@ import inspect -from google.protobuf.json_format import MessageToDict +from google.protobuf.json_format import MessageToDict, MessageToJson """ This module provides a compatibility layer for different versions of the protobuf @@ -21,7 +21,7 @@ def _protobuf_has_old_arg_name(): def rename_always_print_fields_with_no_presence(kwargs): """ - Protobuf version 5.26.0rc2 renamed argument for `MessageToDict`: + Protobuf version 5.26.0rc2 renamed argument for `MessageToDict` and `MessageToJson`: `including_default_value_fields` -> `always_print_fields_with_no_presence`. 
See https://github.com/protocolbuffers/protobuf/commit/06e7caba58ede0220b110b89d08f329e5f8a7537#diff-8de817c14d6a087981503c9aea38730b1b3e98f4e306db5ff9d525c7c304f234L129 # noqa: E501 @@ -45,3 +45,8 @@ def rename_always_print_fields_with_no_presence(kwargs): def message_to_dict(*args, **kwargs): kwargs = rename_always_print_fields_with_no_presence(kwargs) return MessageToDict(*args, **kwargs) + + +def message_to_json(*args, **kwargs): + kwargs = rename_always_print_fields_with_no_presence(kwargs) + return MessageToJson(*args, **kwargs) diff --git a/python/ray/_private/ray_constants.py b/python/ray/_private/ray_constants.py index 35b8ea4e43ac..c161a95c38f9 100644 --- a/python/ray/_private/ray_constants.py +++ b/python/ray/_private/ray_constants.py @@ -107,11 +107,6 @@ def env_set_by_user(key): OBJECT_STORE_MINIMUM_MEMORY_BYTES = 75 * 1024 * 1024 # Each ObjectRef currently uses about 3KB of caller memory. CALLER_MEMORY_USAGE_PER_OBJECT_REF = 3000 -# Match max_direct_call_object_size in -# src/ray/common/ray_config_def.h. -# TODO(swang): Ideally this should be pulled directly from the -# config in case the user overrides it. -DEFAULT_MAX_DIRECT_CALL_OBJECT_SIZE = 100 * 1024 # Above this number of bytes, raise an error by default unless the user sets # RAY_ALLOW_SLOW_STORAGE=1. This avoids swapping with large object stores. REQUIRE_SHM_SIZE_THRESHOLD = 10**10 @@ -126,6 +121,7 @@ def env_set_by_user(key): DEFAULT_PORT = 6379 RAY_ADDRESS_ENVIRONMENT_VARIABLE = "RAY_ADDRESS" +RAY_API_SERVER_ADDRESS_ENVIRONMENT_VARIABLE = "RAY_API_SERVER_ADDRESS" RAY_NAMESPACE_ENVIRONMENT_VARIABLE = "RAY_NAMESPACE" RAY_RUNTIME_ENV_ENVIRONMENT_VARIABLE = "RAY_RUNTIME_ENV" RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_ENV_VAR = ( @@ -577,20 +573,9 @@ def gcs_actor_scheduling_enabled(): # Defaults to False to use pynvml to collect usage. 
RAY_METRIC_ENABLE_GPU_NVSMI = env_bool("RAY_metric_enable_gpu_nvsmi", False) -# Whether enable OpenTelemetry as the metrics collection backend on the driver -# component. This flag is only used during the migration of the metric collection -# backend from OpenCensus to OpenTelemetry. It will be removed in the future. -RAY_EXPERIMENTAL_ENABLE_OPEN_TELEMETRY_ON_AGENT = env_bool( - "RAY_experimental_enable_open_telemetry_on_agent", False -) - -# Whether enable OpenTelemetry as the metrics collection backend on the core -# components (core workers, gcs server, raylet, etc.). This flag is only used during -# the migration of the metric collection backend from OpenCensus to OpenTelemetry. -# It will be removed in the future. -RAY_EXPERIMENTAL_ENABLE_OPEN_TELEMETRY_ON_CORE = env_bool( - "RAY_experimental_enable_open_telemetry_on_core", False -) +# Whether enable OpenTelemetry as the metrics collection backend. The default is +# using OpenCensus. +RAY_ENABLE_OPEN_TELEMETRY = env_bool("RAY_enable_open_telemetry", False) # How long to wait for a fetch to complete during ray.get before timing out and raising an exception to the user. 
# @@ -599,3 +584,5 @@ def gcs_actor_scheduling_enabled(): FETCH_FAIL_TIMEOUT_SECONDS = ( env_integer("RAY_fetch_fail_timeout_milliseconds", 60000) / 1000 ) + +RAY_GC_MIN_COLLECT_INTERVAL = env_float("RAY_GC_MIN_COLLECT_INTERVAL_S", 5) diff --git a/python/ray/_private/ray_logging/constants.py b/python/ray/_private/ray_logging/constants.py index 6accad120006..a5bf5850a708 100644 --- a/python/ray/_private/ray_logging/constants.py +++ b/python/ray/_private/ray_logging/constants.py @@ -53,6 +53,7 @@ class LogKey(str, Enum): FILENAME = "filename" LINENO = "lineno" EXC_TEXT = "exc_text" + PROCESS = "process" # Ray logging context TIMESTAMP_NS = "timestamp_ns" diff --git a/python/ray/_private/ray_logging/logging_config.py b/python/ray/_private/ray_logging/logging_config.py index 843935d6b415..67453163a027 100644 --- a/python/ray/_private/ray_logging/logging_config.py +++ b/python/ray/_private/ray_logging/logging_config.py @@ -3,10 +3,10 @@ from dataclasses import dataclass, field from typing import Set +from ray._common.filters import CoreContextFilter +from ray._common.formatters import JSONFormatter, TextFormatter from ray._private.ray_logging import default_impl from ray._private.ray_logging.constants import LOGRECORD_STANDARD_ATTRS -from ray._private.ray_logging.filters import CoreContextFilter -from ray._private.ray_logging.formatters import JSONFormatter, TextFormatter from ray.util.annotations import PublicAPI diff --git a/python/ray/_private/resource_and_label_spec.py b/python/ray/_private/resource_and_label_spec.py index 03c8efd0e119..d737b70961da 100644 --- a/python/ray/_private/resource_and_label_spec.py +++ b/python/ray/_private/resource_and_label_spec.py @@ -10,7 +10,6 @@ from ray._common.utils import RESOURCE_CONSTRAINT_PREFIX from ray._private import accelerators from ray._private.accelerators import AcceleratorManager -from ray._private.accelerators.tpu import TPUAcceleratorManager logger = logging.getLogger(__name__) @@ -292,10 +291,11 @@ def 
_get_default_labels( ray._raylet.RAY_NODE_ACCELERATOR_TYPE_KEY ] = accelerator_type - # Set TPU specific default labels to enable SPMD scheduling. - if isinstance(accelerator_manager, TPUAcceleratorManager): + # Set TPU specific default labels to enable multi-host scheduling. + if accelerator_manager.get_resource_name() == "TPU": tpu_labels = accelerator_manager.get_current_node_accelerator_labels() - default_labels.update(tpu_labels) + if tpu_labels: + default_labels.update(tpu_labels) return default_labels diff --git a/python/ray/_private/runtime_env/BUILD b/python/ray/_private/runtime_env/BUILD.bazel similarity index 100% rename from python/ray/_private/runtime_env/BUILD rename to python/ray/_private/runtime_env/BUILD.bazel diff --git a/python/ray/_private/runtime_env/agent/main.py b/python/ray/_private/runtime_env/agent/main.py index f9beaa6167c9..e65de4d63bd4 100644 --- a/python/ray/_private/runtime_env/agent/main.py +++ b/python/ray/_private/runtime_env/agent/main.py @@ -218,13 +218,10 @@ def parent_dead_callback(msg): check_raylet_task = create_check_raylet_task( args.log_dir, gcs_client, parent_dead_callback, loop ) - runtime_env_agent_ip = ( - "127.0.0.1" if args.node_ip_address == "127.0.0.1" else "0.0.0.0" - ) try: web.run_app( app, - host=runtime_env_agent_ip, + host=args.node_ip_address, port=args.runtime_env_agent_port, loop=loop, ) diff --git a/python/ray/_private/runtime_env/default_impl.py b/python/ray/_private/runtime_env/default_impl.py index 331dc7fce01e..f0d1567530af 100644 --- a/python/ray/_private/runtime_env/default_impl.py +++ b/python/ray/_private/runtime_env/default_impl.py @@ -3,9 +3,3 @@ def get_image_uri_plugin_cls(): return ImageURIPlugin - - -def get_protocols_provider(): - from ray._private.runtime_env.protocol import ProtocolsProvider - - return ProtocolsProvider diff --git a/python/ray/_private/runtime_env/protocol.py b/python/ray/_private/runtime_env/protocol.py index b61dea8f71fa..663e9d4366da 100644 --- 
a/python/ray/_private/runtime_env/protocol.py +++ b/python/ray/_private/runtime_env/protocol.py @@ -1,7 +1,6 @@ import enum import os - -from ray._private.runtime_env.default_impl import get_protocols_provider +from urllib.parse import urlparse class ProtocolsProvider: @@ -30,13 +29,15 @@ def get_protocols(cls): "gs", # Remote azure blob storage path, assumes everything packed in one zip file. "azure", + # Remote Azure Blob File System Secure path, assumes everything packed in one zip file. + "abfss", # File storage path, assumes everything packed in one zip file. "file", } @classmethod def get_remote_protocols(cls): - return {"https", "s3", "gs", "azure", "file"} + return {"https", "s3", "gs", "azure", "abfss", "file"} @classmethod def _handle_s3_protocol(cls): @@ -57,7 +58,20 @@ def _handle_s3_protocol(cls): "to fetch URIs in s3 bucket. " + cls._MISSING_DEPENDENCIES_WARNING ) - transport_params = {"client": boto3.client("s3")} + # Create S3 client, falling back to unsigned for public buckets + session = boto3.Session() + # session.get_credentials() will return None if no credentials can be found. + if session.get_credentials(): + # If credentials are found, use a standard signed client. + s3_client = session.client("s3") + else: + # No credentials found, fall back to an unsigned client for public buckets. + from botocore import UNSIGNED + from botocore.config import Config + + s3_client = boto3.client("s3", config=Config(signature_version=UNSIGNED)) + + transport_params = {"client": s3_client} return open_file, transport_params @classmethod @@ -122,6 +136,67 @@ def _handle_azure_protocol(cls): return open_file, transport_params + @classmethod + def _handle_abfss_protocol(cls): + """Set up Azure Blob File System Secure (ABFSS) protocol handling. + + Returns: + tuple: (open_file function, transport_params) + + Raises: + ImportError: If required dependencies are not installed. + ValueError: If the ABFSS URI format is invalid. 
+ """ + try: + import adlfs + from azure.identity import DefaultAzureCredential + except ImportError: + raise ImportError( + "You must `pip install adlfs azure-identity` " + "to fetch URIs in Azure Blob File System Secure. " + + cls._MISSING_DEPENDENCIES_WARNING + ) + + def open_file(uri, mode, *, transport_params=None): + # Parse and validate the ABFSS URI + parsed = urlparse(uri) + + # Validate ABFSS URI format: abfss://container@account.dfs.core.windows.net/path + if not parsed.netloc or "@" not in parsed.netloc: + raise ValueError( + f"Invalid ABFSS URI format - missing container@account: {uri}" + ) + + container_part, hostname_part = parsed.netloc.split("@", 1) + + # Validate container name (must be non-empty) + if not container_part: + raise ValueError( + f"Invalid ABFSS URI format - empty container name: {uri}" + ) + + # Validate hostname format + if not hostname_part or not hostname_part.endswith(".dfs.core.windows.net"): + raise ValueError( + f"Invalid ABFSS URI format - invalid hostname (must end with .dfs.core.windows.net): {uri}" + ) + + # Extract and validate account name + azure_storage_account_name = hostname_part.split(".")[0] + if not azure_storage_account_name: + raise ValueError( + f"Invalid ABFSS URI format - empty account name: {uri}" + ) + + # Handle ABFSS URI with adlfs + filesystem = adlfs.AzureBlobFileSystem( + account_name=azure_storage_account_name, + credential=DefaultAzureCredential(), + ) + return filesystem.open(uri, mode) + + return open_file, None + @classmethod def download_remote_uri(cls, protocol: str, source_uri: str, dest_file: str): """Download file from remote URI to destination file. 
@@ -151,6 +226,8 @@ def open_file(uri, mode, *, transport_params=None): open_file, tp = cls._handle_gs_protocol() elif protocol == "azure": open_file, tp = cls._handle_azure_protocol() + elif protocol == "abfss": + open_file, tp = cls._handle_abfss_protocol() else: try: from smart_open import open as open_file @@ -162,15 +239,13 @@ def open_file(uri, mode, *, transport_params=None): ) with open_file(source_uri, "rb", transport_params=tp) as fin: - with open_file(dest_file, "wb") as fout: + with open(dest_file, "wb") as fout: fout.write(fin.read()) -_protocols_provider = get_protocols_provider() - Protocol = enum.Enum( "Protocol", - {protocol.upper(): protocol for protocol in _protocols_provider.get_protocols()}, + {protocol.upper(): protocol for protocol in ProtocolsProvider.get_protocols()}, ) @@ -179,7 +254,7 @@ def _remote_protocols(cls): # Returns a list of protocols that support remote storage # These protocols should only be used with paths that end in ".zip" or ".whl" return [ - cls[protocol.upper()] for protocol in _protocols_provider.get_remote_protocols() + cls[protocol.upper()] for protocol in ProtocolsProvider.get_remote_protocols() ] @@ -187,7 +262,7 @@ def _remote_protocols(cls): def _download_remote_uri(self, source_uri, dest_file): - return _protocols_provider.download_remote_uri(self.value, source_uri, dest_file) + return ProtocolsProvider.download_remote_uri(self.value, source_uri, dest_file) Protocol.download_remote_uri = _download_remote_uri diff --git a/python/ray/_private/serialization.py b/python/ray/_private/serialization.py index bc2ac1af8d63..e2729ac38ab8 100644 --- a/python/ray/_private/serialization.py +++ b/python/ray/_private/serialization.py @@ -7,8 +7,6 @@ if TYPE_CHECKING: import torch - from ray.experimental.gpu_object_manager.gpu_object_store import GPUObject - import google.protobuf.message import ray._private.utils @@ -84,7 +82,9 @@ def pickle_dumps(obj: Any, error_msg: str): raise 
ray.exceptions.OufOfBandObjectRefSerializationException(msg) -def _object_ref_deserializer(binary, call_site, owner_address, object_status): +def _object_ref_deserializer( + binary, call_site, owner_address, object_status, tensor_transport_val +): # NOTE(suquark): This function should be a global function so # cloudpickle can access it directly. Otherwise cloudpickle # has to dump the whole function definition, which is inefficient. @@ -93,7 +93,9 @@ def _object_ref_deserializer(binary, call_site, owner_address, object_status): # the core worker to resolve the value. This is to make sure # that the ref count for the ObjectRef is greater than 0 by the # time the core worker resolves the value of the object. - obj_ref = ray.ObjectRef(binary, owner_address, call_site) + obj_ref = ray.ObjectRef( + binary, owner_address, call_site, tensor_transport_val=tensor_transport_val + ) # TODO(edoakes): we should be able to just capture a reference # to 'self' here instead, but this function is itself pickled @@ -113,6 +115,40 @@ def _object_ref_deserializer(binary, call_site, owner_address, object_status): return obj_ref +def _gpu_object_ref_deserializer( + binary, + call_site, + owner_address, + object_status, + tensor_transport_val, + gpu_object_meta, +): + """ + Deserialize a GPU object ref. When the GPU object ref is deserialized, + it firstly deserialize the normal object ref, and then add metadata of + the GPU object to the GPU object manager, which will be used to fetch + the GPU object later. + + Args: + binary: The binary data of the object ref. + call_site: The call site of the object ref. + owner_address: The owner address of the object ref. + object_status: The object status of the object ref. + tensor_transport_val: The tensor transport value of the GPU object ref. + gpu_object_meta: The GPU object metadata. This is used to fetch the GPU object later. + + Returns: + The deserialized GPU object ref. 
+ """ + obj_ref = _object_ref_deserializer( + binary, call_site, owner_address, object_status, tensor_transport_val + ) + gpu_object_manager = ray._private.worker.global_worker.gpu_object_manager + gpu_object_manager.add_gpu_object_metadata(obj_ref, gpu_object_meta) + + return obj_ref + + def _actor_handle_deserializer(serialized_obj, weak_ref): # If this actor handle was stored in another object, then tell the # core worker. @@ -163,6 +199,7 @@ def compiled_dag_ref_reducer(obj): def object_ref_reducer(obj): worker = ray._private.worker.global_worker worker.check_connected() + self.add_contained_object_ref( obj, allow_out_of_band_serialization=( @@ -170,14 +207,35 @@ def object_ref_reducer(obj): ), call_site=obj.call_site(), ) + obj, owner_address, object_status = worker.core_worker.serialize_object_ref( obj ) + # Check if this is a GPU ObjectRef being serialized inside a collection + if ( + self.is_in_band_serialization() + and worker.gpu_object_manager.is_managed_object(obj.hex()) + ): + + gpu_object_manager = ( + ray._private.worker.global_worker.gpu_object_manager + ) + gpu_object_meta = gpu_object_manager._get_gpu_object_metadata(obj) + return _gpu_object_ref_deserializer, ( + obj.binary(), + obj.call_site(), + owner_address, + object_status, + obj.tensor_transport(), + gpu_object_meta, + ) + return _object_ref_deserializer, ( obj.binary(), obj.call_site(), owner_address, object_status, + obj.tensor_transport(), ) self._register_cloudpickle_reducer(ray.ObjectRef, object_ref_reducer) @@ -506,7 +564,7 @@ def deserialize_objects( self, serialized_ray_objects: List[SerializedRayObject], object_refs, - gpu_objects: Dict[str, "GPUObject"], + gpu_objects: Dict[str, List["torch.Tensor"]], ): assert len(serialized_ray_objects) == len(object_refs) # initialize the thread-local field @@ -524,11 +582,7 @@ def deserialize_objects( if object_ref is not None: object_id = object_ref.hex() if object_id in gpu_objects: - gpu_object = gpu_objects[object_id] - object_tensors = 
gpu_object.data - gpu_object.num_readers -= 1 - if gpu_object.num_readers == 0: - gpu_objects.pop(object_id) + object_tensors = gpu_objects[object_id] obj = self._deserialize_object( data, metadata, @@ -618,24 +672,19 @@ def _python_serializer(o): metadata, msgpack_data, contained_object_refs, pickle5_serialized_object ) - def serialize_and_store_gpu_objects( + def serialize_gpu_objects( self, value: Any, - obj_id: bytes, - ) -> MessagePackSerializedObject: + ) -> Tuple[MessagePackSerializedObject, List["torch.Tensor"]]: """Retrieve GPU data from `value` and store it in the GPU object store. Then, return the serialized value. Args: value: The value to serialize. - obj_id: The object ID of the value. `obj_id` is required, and the GPU data (e.g. tensors) in `value` - will be stored in the GPU object store with the key `obj_id`. Returns: Serialized value. """ - assert ( - obj_id is not None - ), "`obj_id` is required, and it is the key to retrieve corresponding tensors from the GPU object store." + if not self._torch_custom_serializer_registered: # Register a custom serializer for torch.Tensor. If the method is # decorated with `@ray.method(tensor_transport="xxx")`, it will @@ -648,16 +697,28 @@ def serialize_and_store_gpu_objects( self._torch_custom_serializer_registered = True serialized_val, tensors = self._serialize_and_retrieve_tensors(value) + + return serialized_val, tensors + + def store_gpu_objects(self, obj_id: str, tensors: List["torch.Tensor"]): + """ + Store GPU objects in the GPU object store. + + Args: + obj_id: The object ID of the value. `obj_id` is required, and the GPU data (e.g. tensors) in `value` + will be stored in the GPU object store with the key `obj_id`. + tensors: The tensors to store in the GPU object store. + """ + assert ( + obj_id is not None + ), "`obj_id` is required, and it is the key to retrieve corresponding tensors from the GPU object store." 
# Regardless of whether `tensors` is empty, we always store the GPU object # in the GPU object store. This ensures that `_get_tensor_meta` is not # blocked indefinitely. - obj_id = obj_id.decode("ascii") worker = ray._private.worker.global_worker gpu_object_manager = worker.gpu_object_manager gpu_object_manager.gpu_object_store.add_object(obj_id, tensors, is_primary=True) - return serialized_val - def serialize( self, value: Any ) -> Union[RawSerializedObject, MessagePackSerializedObject]: diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index 6785a78c571f..94040866676d 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -1469,7 +1469,7 @@ def start_gcs_server( If None, stdout is not redirected. stderr_filepath: The file path to dump gcs server stderr. If None, stderr is not redirected. - session_name: The session name (cluster id) of this cluster. + session_name: The current Ray session name. redis_username: The username of the Redis server. redis_password: The password of the Redis server. config: Optional configuration that will @@ -1606,7 +1606,7 @@ def start_raylet( fallback_directory: A directory where the Object store fallback files will be created. object_store_memory: The amount of memory (in bytes) to start the object store with. - session_name: The session name (cluster id) of this cluster. + session_name: The current Ray session name. resource_isolation_config: Resource isolation configuration for reserving memory and cpu resources for ray system processes through cgroupv2 is_head_node: whether this node is the head node. 
@@ -1792,7 +1792,7 @@ def start_raylet( os.path.join(RAY_PATH, "dashboard", "agent.py"), f"--node-ip-address={node_ip_address}", f"--metrics-export-port={metrics_export_port}", - f"--dashboard-agent-port={metrics_agent_port}", + f"--grpc-port={metrics_agent_port}", f"--listen-port={dashboard_agent_listen_port}", "--node-manager-port=RAY_NODE_MANAGER_PORT_PLACEHOLDER", f"--object-store-name={plasma_store_name}", diff --git a/python/ray/_private/state.py b/python/ray/_private/state.py index 8c17054b5730..331ba4a1fea6 100644 --- a/python/ray/_private/state.py +++ b/python/ray/_private/state.py @@ -138,12 +138,12 @@ def _gen_actor_info(self, actor_table_data): "Address": { "IPAddress": actor_table_data.address.ip_address, "Port": actor_table_data.address.port, - "NodeID": binary_to_hex(actor_table_data.address.raylet_id), + "NodeID": binary_to_hex(actor_table_data.address.node_id), }, "OwnerAddress": { "IPAddress": actor_table_data.owner_address.ip_address, "Port": actor_table_data.owner_address.port, - "NodeID": binary_to_hex(actor_table_data.owner_address.raylet_id), + "NodeID": binary_to_hex(actor_table_data.owner_address.node_id), }, "State": gcs_pb2.ActorTableData.ActorState.DESCRIPTOR.values_by_number[ actor_table_data.state diff --git a/python/ray/_private/telemetry/open_telemetry_metric_recorder.py b/python/ray/_private/telemetry/open_telemetry_metric_recorder.py index 3f8e64fbfe08..1076ed5c0857 100644 --- a/python/ray/_private/telemetry/open_telemetry_metric_recorder.py +++ b/python/ray/_private/telemetry/open_telemetry_metric_recorder.py @@ -23,17 +23,29 @@ class OpenTelemetryMetricRecorder: It uses OpenTelemetry's Prometheus exporter to export metrics. 
""" + _metrics_initialized = False + _metrics_initialized_lock = threading.Lock() + def __init__(self): self._lock = threading.Lock() self._registered_instruments = {} self._observations_by_name = defaultdict(dict) self._histogram_bucket_midpoints = defaultdict(list) - - prometheus_reader = PrometheusMetricReader() - provider = MeterProvider(metric_readers=[prometheus_reader]) - metrics.set_meter_provider(provider) + self._init_metrics() self.meter = metrics.get_meter(__name__) + def _init_metrics(self): + # Initialize the global metrics provider and meter. We only do this once on + # the first initialization of the class, because re-setting the meter provider + # can result in loss of metrics. + with self._metrics_initialized_lock: + if self._metrics_initialized: + return + prometheus_reader = PrometheusMetricReader() + provider = MeterProvider(metric_readers=[prometheus_reader]) + metrics.set_meter_provider(provider) + self._metrics_initialized = True + def register_gauge_metric(self, name: str, description: str) -> None: with self._lock: if name in self._registered_instruments: @@ -47,6 +59,8 @@ def callback(options): # Take snapshot of current observations. with self._lock: observations = self._observations_by_name[name] + # Clear the observations to avoid emitting dead observations. 
+ self._observations_by_name[name] = {} # Drop high cardinality from tag_set and sum up the value for # same tag set after dropping aggregated_observations = defaultdict(float) diff --git a/python/ray/_private/utils.py b/python/ray/_private/utils.py index 84ab89094544..7793787e488f 100644 --- a/python/ray/_private/utils.py +++ b/python/ray/_private/utils.py @@ -35,7 +35,7 @@ get_ray_address_file, get_system_memory, ) -from ray.core.generated.runtime_env_common_pb2 import ( +from ray.core.generated.runtime_environment_pb2 import ( RuntimeEnvInfo as ProtoRuntimeEnvInfo, ) @@ -266,18 +266,46 @@ def set_omp_num_threads_if_unset() -> bool: return True -def set_visible_accelerator_ids() -> None: +def set_visible_accelerator_ids() -> Mapping[str, Optional[str]]: """Set (CUDA_VISIBLE_DEVICES, ONEAPI_DEVICE_SELECTOR, HIP_VISIBLE_DEVICES, NEURON_RT_VISIBLE_CORES, TPU_VISIBLE_CHIPS , HABANA_VISIBLE_MODULES ,...) - environment variables based on the accelerator runtime. + environment variables based on the accelerator runtime. Return the original + environment variables. """ + from ray._private.ray_constants import env_bool + + original_visible_accelerator_env_vars = {} + override_on_zero = env_bool( + ray._private.accelerators.RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO_ENV_VAR, + True, + ) for resource_name, accelerator_ids in ( ray.get_runtime_context().get_accelerator_ids().items() ): + # If no accelerator ids are set, skip overriding the environment variable. 
+ if not override_on_zero and len(accelerator_ids) == 0: + continue + env_var = ray._private.accelerators.get_accelerator_manager_for_resource( + resource_name + ).get_visible_accelerator_ids_env_var() + original_visible_accelerator_env_vars[env_var] = os.environ.get(env_var, None) ray._private.accelerators.get_accelerator_manager_for_resource( resource_name ).set_current_process_visible_accelerator_ids(accelerator_ids) + return original_visible_accelerator_env_vars + + +def reset_visible_accelerator_env_vars( + original_visible_accelerator_env_vars: Mapping[str, Optional[str]] +) -> None: + """Reset the visible accelerator env vars to the original values.""" + for env_var, env_value in original_visible_accelerator_env_vars.items(): + if env_value is None: + os.environ.pop(env_var, None) + else: + os.environ[env_var] = env_value + class Unbuffered(object): """There's no "built-in" solution to programatically disabling buffering of @@ -342,9 +370,10 @@ def _get_docker_cpus( # See: https://bugs.openjdk.java.net/browse/JDK-8146115 if os.path.exists(cpu_quota_file_name) and os.path.exists(cpu_period_file_name): try: - with open(cpu_quota_file_name, "r") as quota_file, open( - cpu_period_file_name, "r" - ) as period_file: + with ( + open(cpu_quota_file_name, "r") as quota_file, + open(cpu_period_file_name, "r") as period_file, + ): cpu_quota = float(quota_file.read()) / float(period_file.read()) except Exception: logger.exception("Unexpected error calculating docker cpu quota.") diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index e7fa2ef7782d..f40a8c4f0464 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -16,6 +16,7 @@ from collections.abc import Mapping from contextlib import contextmanager from dataclasses import dataclass +from functools import wraps from typing import ( TYPE_CHECKING, Any, @@ -37,6 +38,9 @@ ) from urllib.parse import urlparse +if TYPE_CHECKING: + import torch + import colorama import ray 
@@ -100,9 +104,6 @@ from ray.widgets import Template from ray.widgets.util import repr_with_fallback -if TYPE_CHECKING: - from ray.experimental.gpu_object_manager import GPUObject - SCRIPT_MODE = 0 WORKER_MODE = 1 LOCAL_MODE = 2 @@ -796,6 +797,7 @@ def put_object( value: Any, owner_address: Optional[str] = None, _is_experimental_channel: bool = False, + _tensor_transport: str = "object_store", ): """Put value in the local object store. @@ -812,7 +814,7 @@ def put_object( objects. If True, then the returned object will not have a valid value. The object must be written to using the ray.experimental.channel API before readers can read. - + _tensor_transport: [Alpha] The tensor transport backend to use. Currently, this supports "object_store" and "nixl". Returns: ObjectRef: The object ref the object was put under. @@ -828,9 +830,25 @@ def put_object( "If you really want to do this, you can wrap the " "ray.ObjectRef in a list and call 'put' on it." ) - + tensors = None + tensor_transport: TensorTransportEnum = TensorTransportEnum.from_str( + _tensor_transport + ) + if tensor_transport not in [ + TensorTransportEnum.OBJECT_STORE, + TensorTransportEnum.NIXL, + ]: + raise ValueError( + "Currently, Ray Direct Transport only supports 'object_store' and 'nixl' for tensor transport in ray.put()." + ) try: - serialized_value = self.get_serialization_context().serialize(value) + if tensor_transport != TensorTransportEnum.OBJECT_STORE: + ( + serialized_value, + tensors, + ) = self.get_serialization_context().serialize_gpu_objects(value) + else: + serialized_value = self.get_serialization_context().serialize(value) except TypeError as e: sio = io.StringIO() ray.util.inspect_serializability(value, print_file=sio) @@ -851,16 +869,17 @@ def put_object( # reference will be created. If another reference is created and # removed before this one, it will corrupt the state in the # reference counter. 
- return ray.ObjectRef( - self.core_worker.put_serialized_object_and_increment_local_ref( - serialized_value, - pin_object=pin_object, - owner_address=owner_address, - _is_experimental_channel=_is_experimental_channel, - ), - # The initial local reference is already acquired internally. - skip_adding_local_ref=True, + ret = self.core_worker.put_object( + serialized_value, + pin_object=pin_object, + owner_address=owner_address, + inline_small_object=True, + _is_experimental_channel=_is_experimental_channel, + tensor_transport_val=tensor_transport.value, ) + if tensors: + self.gpu_object_manager.put_object(ret, tensor_transport, tensors) + return ret def raise_errors(self, serialized_objects, object_refs): out = self.deserialize_objects(serialized_objects, object_refs) @@ -869,23 +888,48 @@ def raise_errors(self, serialized_objects, object_refs): for e in out: _unhandled_error_handler(e) - def deserialize_objects(self, serialized_objects, object_refs): - gpu_objects: Dict[str, GPUObject] = {} + def deserialize_objects( + self, + serialized_objects, + object_refs, + tensor_transport_hint: Optional[TensorTransportEnum] = None, + ): + gpu_objects: Dict[str, List["torch.Tensor"]] = {} for obj_ref, (_, _, tensor_transport) in zip(object_refs, serialized_objects): - # If using a non-object store transport, then tensors will be sent - # out-of-band. Get them before deserializing the object store data. + # TODO: Here tensor_transport_hint is set by the user in ray.get(), tensor_transport is set + # in serialize_objects by ray.method(tensor_transport="xxx"), and obj_ref.tensor_transport() + # is set by ray.put(). We may clean up this logic in the future. if ( tensor_transport is None or tensor_transport == TensorTransportEnum.OBJECT_STORE + ) and ( + obj_ref is None + or obj_ref.tensor_transport() == TensorTransportEnum.OBJECT_STORE.value ): + # The object is not a gpu object, so we cannot use other external transport to + # fetch it. 
continue + # If the object is a gpu object, we can choose to use the object store or other external + # transport to fetch it. The `tensor_transport_hint` has the highest priority, then the + # tensor_transport in obj_ref.tensor_transport(), then the tensor_transport in serialize_objects, + # then the default value `OBJECT_STORE`. + chosen_tensor_transport = ( + tensor_transport_hint + or ( + TensorTransportEnum(obj_ref.tensor_transport()) if obj_ref else None + ) + or tensor_transport + or TensorTransportEnum.OBJECT_STORE + ) + object_id = obj_ref.hex() if object_id not in gpu_objects: + # If using a non-object store transport, then tensors will be sent + # out-of-band. Get them before deserializing the object store data. gpu_objects[object_id] = self.gpu_object_manager.get_gpu_object( - object_id + object_id, tensor_transport == chosen_tensor_transport ) - gpu_objects[object_id].num_readers += 1 # Function actor manager or the import thread may call pickle.loads # at the same time which can lead to failed imports @@ -903,6 +947,7 @@ def get_objects( timeout: Optional[float] = None, return_exceptions: bool = False, skip_deserialization: bool = False, + _tensor_transport: Optional[str] = None, ) -> Tuple[List[serialization.SerializedRayObject], bytes]: """Get the values in the object store associated with the IDs. @@ -921,6 +966,7 @@ def get_objects( raised. skip_deserialization: If true, only the buffer will be released and the object associated with the buffer will not be deserialized. + _tensor_transport: [Alpha] The tensor transport to use to fetch `torch.Tensors` found in the Ray Direct Transport object. Currently, this supports "object_store" and "nixl". Returns: list: List of deserialized objects or None if skip_deserialization is True. bytes: UUID of the debugger breakpoint we should drop @@ -933,7 +979,16 @@ def get_objects( f"Attempting to call `get` on the value {object_ref}, " "which is not an ray.ObjectRef." 
) - + tensor_transport: TensorTransportEnum = ( + TensorTransportEnum.from_str(_tensor_transport) + if _tensor_transport is not None + else None + ) + assert tensor_transport in [ + TensorTransportEnum.OBJECT_STORE, + TensorTransportEnum.NIXL, + None, + ], "Currently, RDT only supports 'object_store' and 'nixl' for tensor transport in ray.get()." timeout_ms = ( int(timeout * 1000) if timeout is not None and timeout != -1 else -1 ) @@ -957,7 +1012,9 @@ def get_objects( if skip_deserialization: return None, debugger_breakpoint - values = self.deserialize_objects(serialized_objects, object_refs) + values = self.deserialize_objects( + serialized_objects, object_refs, tensor_transport_hint=tensor_transport + ) if not return_exceptions: # Raise exceptions instead of returning them to the user. for i, value in enumerate(values): @@ -1086,6 +1143,18 @@ def get_accelerator_ids_for_accelerator_resource( return list(assigned_ids) +_connect_or_shutdown_lock = threading.RLock() + + +def with_connect_or_shutdown_lock(func: Callable) -> Callable: + @wraps(func) + def wrapper(*args, **kwargs): + with _connect_or_shutdown_lock: + return func(*args, **kwargs) + + return wrapper + + @PublicAPI @client_mode_hook def get_gpu_ids() -> Union[List[int], List[str]]: @@ -1967,6 +2036,21 @@ def sigterm_handler(signum, frame): for hook in _post_init_hooks: hook() + # Check and show accelerator override warning during driver initialization + from ray._private.ray_constants import env_bool + + override_on_zero = env_bool( + ray._private.accelerators.RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO_ENV_VAR, + True, + ) + if override_on_zero and log_once("ray_accel_env_var_override_on_zero"): + warnings.warn( + "Tip: In future versions of Ray, Ray will no longer override accelerator " + "visible devices env var if num_gpus=0 or num_gpus=None (default). 
To enable " + "this behavior and turn off this error message, set RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO=0", + FutureWarning, + ) + node_id = global_worker.core_worker.get_current_node_id() global_node_address_info = _global_node.address_info.copy() global_node_address_info["webui_url"] = _remove_protocol_from_url(dashboard_url) @@ -1979,6 +2063,7 @@ def sigterm_handler(signum, frame): @PublicAPI @client_mode_hook +@with_connect_or_shutdown_lock def shutdown(_exiting_interpreter: bool = False): """Disconnect the worker, and terminate processes started by ray.init(). @@ -2355,7 +2440,7 @@ def is_initialized() -> bool: return ray._private.worker.global_worker.connected -# TODO(hjiang): Add cgroup path along with [enable_resource_isolation]. +@with_connect_or_shutdown_lock def connect( node, session_name: str, @@ -2373,13 +2458,12 @@ def connect( worker_launch_time_ms: int = -1, worker_launched_time_ms: int = -1, debug_source: str = "", - enable_resource_isolation: bool = False, ): """Connect this worker to the raylet, to Plasma, and to GCS. Args: node (ray._private.node.Node): The node to connect. - session_name: The session name (cluster id) of this cluster. + session_name: The current Ray session name. mode: The mode of the worker. One of SCRIPT_MODE, WORKER_MODE, and LOCAL_MODE. log_to_driver: If true, then output from all of the worker processes on all nodes will be directed to the driver. @@ -2402,7 +2486,6 @@ def connect( finshes launching. If the worker is not launched by raylet (e.g., driver), this must be -1 (default value). debug_source: Source information for `CoreWorker`, used for debugging and informational purpose, rather than functional purpose. - enable_resource_isolation: If true, core worker enables resource isolation by adding itself into appropriate cgroup. """ # Do some basic checking to make sure we didn't call ray.init twice. error_message = "Perhaps you called ray.init twice by accident?" 
@@ -2569,7 +2652,6 @@ def connect( logs_dir, node.node_ip_address, node.node_manager_port, - node.raylet_ip_address, (mode == LOCAL_MODE), driver_name, serialized_job_config, @@ -2582,7 +2664,6 @@ def connect( worker_launch_time_ms, worker_launched_time_ms, debug_source, - enable_resource_isolation, ) if mode == SCRIPT_MODE: @@ -2772,6 +2853,7 @@ def get( ], *, timeout: Optional[float] = None, + _tensor_transport: Optional[str] = None, ) -> Union[Any, List[Any]]: """Get a remote object or a list of remote objects from the object store. @@ -2807,6 +2889,7 @@ def get( corresponding object becomes available. Setting ``timeout=0`` will return the object immediately if it's available, else raise GetTimeoutError in accordance with the above docstring. + _tensor_transport: [Alpha] The tensor transport to use to fetch `torch.Tensors` found in the Ray Direct Transport object. Currently, this supports "object_store" and "nixl". Returns: A Python object or a list of Python objects. @@ -2866,7 +2949,9 @@ def get( "'object_refs' must either be an ObjectRef or a list of ObjectRefs. " ) - values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout) + values, debugger_breakpoint = worker.get_objects( + object_refs, timeout=timeout, _tensor_transport=_tensor_transport + ) for i, value in enumerate(values): if isinstance(value, RayError): if isinstance(value, ray.exceptions.ObjectLostError): @@ -2902,6 +2987,7 @@ def put( value: Any, *, _owner: Optional["ray.actor.ActorHandle"] = None, + _tensor_transport: str = "object_store", ) -> "ray.ObjectRef": """Store an object in the object store. @@ -2921,6 +3007,7 @@ def put( object prior to the object creator exiting, otherwise the reference will still be lost. *Note that this argument is an experimental API and should be avoided if possible.* + _tensor_transport: [Alpha] The tensor transport to use for the GPU object. Currently, this supports "object_store" and "nixl" for tensor transport in ray.put(). 
Returns: The object ref assigned to this value. @@ -2947,7 +3034,11 @@ def put( with profiling.profile("ray.put"): try: - object_ref = worker.put_object(value, owner_address=serialize_owner_address) + object_ref = worker.put_object( + value, + owner_address=serialize_owner_address, + _tensor_transport=_tensor_transport, + ) except ObjectStoreFullError: logger.info( "Put failed since the value was either too large or the " diff --git a/python/ray/_private/workers/default_worker.py b/python/ray/_private/workers/default_worker.py index 03ea6e456e24..12cf83040574 100644 --- a/python/ray/_private/workers/default_worker.py +++ b/python/ray/_private/workers/default_worker.py @@ -165,7 +165,9 @@ action="store_true", help="True if Ray debugger is made available externally.", ) -parser.add_argument("--session-name", required=False, help="The current session name") +parser.add_argument( + "--session-name", required=False, help="The current Ray session name" +) parser.add_argument( "--webui", required=False, @@ -218,12 +220,8 @@ # for asyncio try_install_uvloop() - raylet_ip_address = args.raylet_ip_address - if raylet_ip_address is None: - raylet_ip_address = args.node_ip_address ray_params = RayParams( node_ip_address=args.node_ip_address, - raylet_ip_address=raylet_ip_address, node_manager_port=args.node_manager_port, redis_address=args.redis_address, redis_username=args.redis_username, @@ -273,7 +271,6 @@ ray_debugger_external=args.ray_debugger_external, worker_launch_time_ms=args.worker_launch_time_ms, worker_launched_time_ms=worker_launched_time_ms, - enable_resource_isolation=args.enable_resource_isolation, ) worker = ray._private.worker.global_worker diff --git a/python/ray/_raylet.pxd b/python/ray/_raylet.pxd index a45f127a3291..98b445fc3abb 100644 --- a/python/ray/_raylet.pxd +++ b/python/ray/_raylet.pxd @@ -110,9 +110,12 @@ cdef class ObjectRef(BaseID): # it up. 
c_bool in_core_worker c_string call_site_data + int tensor_transport_val cdef CObjectID native(self) + cdef CTensorTransport c_tensor_transport(self) + cdef class ActorID(BaseID): cdef CActorID data @@ -137,6 +140,7 @@ cdef class CoreWorker: object _task_id_to_future_lock dict _task_id_to_future object event_loop_executor + object _gc_thread cdef unique_ptr[CAddress] _convert_python_address(self, address=*) cdef store_task_output( diff --git a/python/ray/_raylet.pyi b/python/ray/_raylet.pyi index c28976409578..fff69c451b67 100644 --- a/python/ray/_raylet.pyi +++ b/python/ray/_raylet.pyi @@ -1,11 +1,37 @@ -from typing import Awaitable, TypeVar +from ray.includes.object_ref import ObjectRef, _set_future_helper +from ray.includes.unique_ids import ( + ActorClassID, + ActorID, + BaseID, + ClusterID, + FunctionID, + JobID, + NodeID, + ObjectID, + PlacementGroupID, + TaskID, + UniqueID, + WorkerID, + check_id, +) -R = TypeVar("R") +__all__ = [ + # ray.includes.unique_ids + "ActorClassID", + "ActorID", + "BaseID", + "ClusterID", + "FunctionID", + "JobID", + "NodeID", + "ObjectID", + "PlacementGroupID", + "TaskID", + "UniqueID", + "WorkerID", + "check_id", - -class ObjectRef(Awaitable[R]): # type: ignore - pass - - -class ObjectID(Awaitable[R]): # type: ignore - pass + # ray.includes.object_ref + "_set_future_helper", + "ObjectRef", +] diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index e51d12958700..995a417b472f 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -184,6 +184,22 @@ from ray.includes.optional cimport ( optional, nullopt ) +cimport cpython + +include "includes/network_util.pxi" +include "includes/object_ref.pxi" +include "includes/unique_ids.pxi" +include "includes/ray_config.pxi" +include "includes/function_descriptor.pxi" +include "includes/buffer.pxi" +include "includes/common.pxi" +include "includes/gcs_client.pxi" +include "includes/serialization.pxi" +include "includes/libcoreworker.pxi" +include 
"includes/global_state_accessor.pxi" +include "includes/metric.pxi" +include "includes/setproctitle.pxi" + import ray from ray.exceptions import ( RayActorError, @@ -230,24 +246,9 @@ import ray._private.profiling as profiling from ray._common.utils import decode from ray._private.utils import DeferSigint from ray._private.object_ref_generator import DynamicObjectRefGenerator -from ray._common.network_utils import build_address, parse_address from ray.util.annotations import PublicAPI from ray._private.custom_types import TensorTransportEnum - -cimport cpython - -include "includes/object_ref.pxi" -include "includes/unique_ids.pxi" -include "includes/ray_config.pxi" -include "includes/function_descriptor.pxi" -include "includes/buffer.pxi" -include "includes/common.pxi" -include "includes/gcs_client.pxi" -include "includes/serialization.pxi" -include "includes/libcoreworker.pxi" -include "includes/global_state_accessor.pxi" -include "includes/metric.pxi" -include "includes/setproctitle.pxi" +from ray._private.gc_collect_manager import PythonGCThread # Expose GCC & Clang macro to report # whether C++ optimizations were enabled during compilation. @@ -261,6 +262,13 @@ GRPC_STATUS_CODE_UNIMPLEMENTED = CGrpcStatusCode.UNIMPLEMENTED logger = logging.getLogger(__name__) +import warnings +class NumReturnsWarning(UserWarning): + """Warning when num_returns=0 but the task returns a non-None value.""" + pass + +warnings.filterwarnings("once", category=NumReturnsWarning) + # The currently running task, if any. These are used to synchronize task # interruption for ray.cancel. 
current_task_id = None @@ -598,7 +606,7 @@ class SerializedRayObject(NamedTuple): cdef RayObjectsToSerializedRayObjects( - const c_vector[shared_ptr[CRayObject]] objects): + const c_vector[shared_ptr[CRayObject]] objects, object_refs: Optional[List[ObjectRef]] = None): serialized_ray_objects = [] for i in range(objects.size()): # core_worker will return a nullptr for objects that couldn't be @@ -614,6 +622,11 @@ cdef RayObjectsToSerializedRayObjects( metadata = Buffer.make( objects[i].get().GetMetadata()).to_pybytes() tensor_transport = TensorTransportEnum((objects[i].get().GetTensorTransport())) + if ( + tensor_transport == TensorTransportEnum.OBJECT_STORE + and object_refs is not None + ): + tensor_transport = TensorTransportEnum(object_refs[i].tensor_transport()) serialized_ray_objects.append(SerializedRayObject(data, metadata, tensor_transport)) return serialized_ray_objects @@ -622,11 +635,13 @@ cdef VectorToObjectRefs(const c_vector[CObjectReference] &object_refs, skip_adding_local_ref): result = [] for i in range(object_refs.size()): + tensor_transport_val = object_refs[i].tensor_transport() result.append(ObjectRef( object_refs[i].object_id(), object_refs[i].owner_address().SerializeAsString(), object_refs[i].call_site(), - skip_adding_local_ref=skip_adding_local_ref)) + skip_adding_local_ref=skip_adding_local_ref, + tensor_transport_val=tensor_transport_val)) return result @@ -771,6 +786,7 @@ cdef int prepare_labels( if label_dict is None: return 0 + label_map[0].reserve(len(label_dict)) for key, value in label_dict.items(): if not isinstance(key, str): raise ValueError(f"Label key must be string, but got {type(key)}") @@ -787,6 +803,7 @@ cdef int prepare_label_selector( if label_selector_dict is None: return 0 + label_selector[0].reserve(len(label_selector_dict)) for key, value in label_selector_dict.items(): if not isinstance(key, str): raise ValueError(f"Label selector key type must be string, but got {type(key)}") @@ -814,6 +831,7 @@ cdef int 
prepare_resources( if resource_dict is None: raise ValueError("Must provide resource map.") + resource_map[0].reserve(len(resource_dict)) for key, value in resource_dict.items(): if not (isinstance(value, int) or isinstance(value, float)): raise ValueError("Resource quantities may only be ints or floats.") @@ -840,6 +858,7 @@ cdef c_vector[CFunctionDescriptor] prepare_function_descriptors(pyfd_list): c_vector[CFunctionDescriptor] fd_list CRayFunction ray_function + fd_list.reserve(len(pyfd_list)) for pyfd in pyfd_list: fd_list.push_back(CFunctionDescriptorBuilder.BuildPython( pyfd.module_name, pyfd.class_name, pyfd.function_name, b"")) @@ -851,17 +870,16 @@ cdef int prepare_actor_concurrency_groups( c_vector[CConcurrencyGroup] *concurrency_groups): cdef: - CConcurrencyGroup cg c_vector[CFunctionDescriptor] c_fd_list if concurrency_groups_dict is None: raise ValueError("Must provide it...") + concurrency_groups.reserve(len(concurrency_groups_dict)) for key, value in concurrency_groups_dict.items(): c_fd_list = prepare_function_descriptors(value["function_descriptors"]) - cg = CConcurrencyGroup( - key.encode("ascii"), value["max_concurrency"], c_fd_list) - concurrency_groups.push_back(cg) + concurrency_groups.push_back(CConcurrencyGroup( + key.encode("ascii"), value["max_concurrency"], move(c_fd_list))) return 1 @@ -937,11 +955,13 @@ cdef prepare_args_internal( op_status = CCoreWorkerProcess.GetCoreWorker().GetOwnerAddress( c_arg, &c_owner_address) check_status(op_status) + c_tensor_transport = (arg).c_tensor_transport() args_vector.push_back( unique_ptr[CTaskArg](new CTaskArgByReference( c_arg, c_owner_address, - arg.call_site()))) + arg.call_site(), + c_tensor_transport))) else: try: @@ -998,11 +1018,11 @@ cdef prepare_args_internal( new CTaskArgByReference( put_id, CCoreWorkerProcess.GetCoreWorker().GetRpcAddress(), - put_arg_call_site + put_arg_call_site, + TENSOR_TRANSPORT_OBJECT_STORE ))) incremented_put_arg_ids.push_back(put_id) - cdef 
raise_if_dependency_failed(arg): """This method is used to improve the readability of backtrace. @@ -1029,7 +1049,7 @@ def serialize_retry_exception_allowlist(retry_exception_allowlist, function_desc cdef c_bool determine_if_retryable( c_bool should_retry_exceptions, - Exception e, + e: BaseException, const c_string serialized_retry_exception_allowlist, FunctionDescriptor function_descriptor, ): @@ -1883,10 +1903,10 @@ cdef void execute_task( if c_args.empty(): args, kwargs = [], {} else: - metadata_pairs = RayObjectsToSerializedRayObjects(c_args) object_refs = VectorToObjectRefs( c_arg_refs, skip_adding_local_ref=False) + metadata_pairs = RayObjectsToSerializedRayObjects(c_args, object_refs) if core_worker.current_actor_is_asyncio(): # We deserialize objects in event loop thread to # prevent segfaults. See #7799 @@ -1919,9 +1939,6 @@ cdef void execute_task( if (task_type == TASK_TYPE_ACTOR_CREATION_TASK): actor_id = core_worker.get_actor_id() actor = worker.actors[actor_id] - class_name = actor.__class__.__name__ - actor_title = f"{class_name}({args!r}, {kwargs!r})" - core_worker.set_actor_title(actor_title.encode("utf-8")) worker.record_task_log_start(task_id, attempt_number) @@ -2012,7 +2029,10 @@ cdef void execute_task( task_exception = False except AsyncioActorExit as e: exit_current_actor_if_asyncio() - except Exception as e: + except (KeyboardInterrupt, SystemExit): + # Special casing these two because Ray can raise them + raise + except BaseException as e: is_retryable_error[0] = determine_if_retryable( should_retry_exceptions, e, @@ -2121,8 +2141,10 @@ cdef void execute_task( None, # ref_generator_id c_tensor_transport ) - - except Exception as e: + except (KeyboardInterrupt, SystemExit): + # Special casing these two because Ray can raise them + raise + except BaseException as e: num_errors_stored = store_task_errors( worker, e, task_exception, actor, actor_id, function_name, task_type, title, caller_address, returns, application_error, c_tensor_transport) 
@@ -2174,16 +2196,15 @@ cdef execute_task_with_cancellation_handler( title = f"ray::{task_name}" # Automatically restrict the GPUs (CUDA), neuron_core, TPU accelerator - # runtime_ids to restrict availability to this task. + # runtime_ids, OMP_NUM_THREADS to restrict availability to this task. # Once actor is created, users can change the visible accelerator ids within # an actor task and we don't want to reset it. if (task_type != TASK_TYPE_ACTOR_TASK): - ray._private.utils.set_visible_accelerator_ids() - - # Automatically configure OMP_NUM_THREADS to the assigned CPU number. - # It will be unset after the task execution if it was overwridden here. - # No-op if already set. - omp_num_threads_overriden = ray._private.utils.set_omp_num_threads_if_unset() + original_visible_accelerator_env_vars = ray._private.utils.set_visible_accelerator_ids() + omp_num_threads_overriden = ray._private.utils.set_omp_num_threads_if_unset() + else: + original_visible_accelerator_env_vars = None + omp_num_threads_overriden = False # Initialize the actor if this is an actor creation task. We do this here # before setting the current task ID so that we can get the execution info, @@ -2195,6 +2216,7 @@ cdef execute_task_with_cancellation_handler( actor_id = core_worker.get_actor_id() actor = actor_class.__new__(actor_class) worker.actors[actor_id] = actor + # Record the actor class via :actor_name: magic token in the log. # # (Phase 1): this covers code run before __init__ finishes. @@ -2283,9 +2305,14 @@ cdef execute_task_with_cancellation_handler( with current_task_id_lock: current_task_id = None - if omp_num_threads_overriden: - # Reset the OMP_NUM_THREADS environ if it was set. - os.environ.pop("OMP_NUM_THREADS", None) + if (task_type == TASK_TYPE_NORMAL_TASK): + if original_visible_accelerator_env_vars: + # Reset the visible accelerator env vars for normal tasks, since they may be reused. 
+ ray._private.utils.reset_visible_accelerator_env_vars(original_visible_accelerator_env_vars) + if omp_num_threads_overriden: + # Reset the OMP_NUM_THREADS environ if it was set. + os.environ.pop("OMP_NUM_THREADS", None) + if execution_info.max_calls != 0: # Reset the state of the worker for the next task to execute. @@ -2480,14 +2507,21 @@ cdef CRayStatus check_signals() nogil: cdef void gc_collect(c_bool triggered_by_global_gc) nogil: - with gil: - start = time.perf_counter() - num_freed = gc.collect() - end = time.perf_counter() - if num_freed > 0: - logger.debug( - "gc.collect() freed {} refs in {} seconds".format( - num_freed, end - start)) + with gil: + if RayConfig.instance().start_python_gc_manager_thread(): + start = time.perf_counter() + worker = ray._private.worker.global_worker + worker.core_worker.trigger_gc() + end = time.perf_counter() + logger.debug("GC event triggered in {} seconds".format(end - start)) + else: + start = time.perf_counter() + num_freed = gc.collect() + end = time.perf_counter() + if num_freed > 0: + logger.debug( + "gc.collect() freed {} refs in {} seconds".format( + num_freed, end - start)) cdef c_vector[c_string] spill_objects_handler( @@ -2739,6 +2773,38 @@ cdef void terminate_asyncio_thread() nogil: core_worker = ray._private.worker.global_worker.core_worker core_worker.stop_and_join_asyncio_threads_if_exist() + +cdef void call_actor_shutdown() noexcept nogil: + """C++ wrapper function that calls the Python actor shutdown callback.""" + with gil: + _call_actor_shutdown() + + +def _call_actor_shutdown(): + """Internal function that calls actor's __ray_shutdown__ method.""" + try: + worker = ray._private.worker.global_worker + + if not worker.actors: + return + + actor_id, actor_instance = next(iter(worker.actors.items())) + if actor_instance is not None: + # Only call __ray_shutdown__ if the method exists and is callable + # This preserves backward compatibility: actors without __ray_shutdown__ + # use Python's normal exit flow 
(including atexit handlers) + if hasattr(actor_instance, '__ray_shutdown__') and callable(getattr(actor_instance, '__ray_shutdown__')): + try: + actor_instance.__ray_shutdown__() + except Exception: + logger.exception("Error during actor __ray_shutdown__ method") + # Always clean up the actor instance + worker.actors.pop(actor_id, None) + except Exception: + # Catch any system-level exceptions to prevent propagation to C++ + logger.exception("System error during actor shutdown callback") + + cdef class StreamRedirector: @staticmethod def redirect_stdout(const c_string &file_path, uint64_t rotation_max_size, uint64_t rotation_max_file_count, c_bool tee_to_stdout, c_bool tee_to_stderr): @@ -2796,7 +2862,7 @@ cdef class GcsClient: cdef class _GcsSubscriber: - """Cython wrapper class of C++ `ray::gcs::PythonGcsSubscriber`.""" + """Cython wrapper class of C++ `ray::pubsub::PythonGcsSubscriber`.""" cdef: shared_ptr[CPythonGcsSubscriber] inner @@ -2911,73 +2977,35 @@ cdef class GcsLogSubscriber(_GcsSubscriber): with nogil: check_status(self.inner.get().PollLogs(&key_id, timeout_ms, &log_batch)) - c_log_lines = PythonGetLogBatchLines(log_batch) - - log_lines = [] - for c_log_line in c_log_lines: - log_lines.append(c_log_line.decode()) - - return { + result = { "ip": log_batch.ip().decode(), "pid": log_batch.pid().decode(), "job": log_batch.job_id().decode(), "is_err": log_batch.is_error(), - "lines": log_lines, "actor_name": log_batch.actor_name().decode(), "task_name": log_batch.task_name().decode(), } - -# This class should only be used for tests -cdef class _TestOnly_GcsActorSubscriber(_GcsSubscriber): - """Subscriber to actor updates. Thread safe. - - Usage example: - subscriber = GcsActorSubscriber() - # Subscribe to the actor channel. - subscriber.subscribe() - ... - while running: - actor_data = subscriber.poll() - ...... - # Unsubscribe from the channel. 
- subscriber.close() - """ - - def __init__(self, address, worker_id=None): - self._construct(address, GCS_ACTOR_CHANNEL, worker_id) - - def poll(self, timeout=None): - """Polls for new actor messages. - - Returns: - A byte string of function key. - None if polling times out or subscriber closed. - """ - cdef: - CActorTableData actor_data - c_string key_id - int64_t timeout_ms = round(1000 * timeout) if timeout else -1 - with nogil: - check_status(self.inner.get().PollActor( - &key_id, timeout_ms, &actor_data)) + c_log_lines = PythonGetLogBatchLines(move(log_batch)) - info = ActorTableData.FromString( - actor_data.SerializeAsString()) + log_lines = [] + for c_log_line in c_log_lines: + log_lines.append(c_log_line.decode()) - return [(key_id, info)] + result["lines"] = log_lines + return result cdef class CoreWorker: def __cinit__(self, worker_type, store_socket, raylet_socket, JobID job_id, GcsClientOptions gcs_options, log_dir, - node_ip_address, node_manager_port, raylet_ip_address, + node_ip_address, node_manager_port, local_mode, driver_name, serialized_job_config, metrics_agent_port, runtime_env_hash, startup_token, session_name, cluster_id, entrypoint, - worker_launch_time_ms, worker_launched_time_ms, debug_source, enable_resource_isolation): + worker_launch_time_ms, worker_launched_time_ms, debug_source): self.is_local_mode = local_mode cdef CCoreWorkerOptions options = CCoreWorkerOptions() @@ -3007,7 +3035,6 @@ cdef class CoreWorker: options.interactive = hasattr(sys, "ps1") options.node_ip_address = node_ip_address.encode("utf-8") options.node_manager_port = node_manager_port - options.raylet_ip_address = raylet_ip_address.encode("utf-8") options.driver_name = driver_name options.initialize_thread_callback = initialize_pygilstate_for_thread options.task_execution_callback = task_execution_handler @@ -3023,6 +3050,7 @@ cdef class CoreWorker: options.is_local_mode = local_mode options.kill_main = kill_main_task options.terminate_asyncio_thread = 
terminate_asyncio_thread + options.actor_shutdown_callback = call_actor_shutdown options.serialized_job_config = serialized_job_config options.metrics_agent_port = metrics_agent_port options.runtime_env_hash = runtime_env_hash @@ -3033,7 +3061,6 @@ cdef class CoreWorker: options.worker_launch_time_ms = worker_launch_time_ms options.worker_launched_time_ms = worker_launched_time_ms options.debug_source = debug_source - options.enable_resource_isolation = enable_resource_isolation CCoreWorkerProcess.Initialize(options) self.cgname_to_eventloop_dict = None @@ -3044,6 +3071,11 @@ cdef class CoreWorker: self._task_id_to_future = {} self.event_loop_executor = None + self._gc_thread = None + if RayConfig.instance().start_python_gc_manager_thread(): + self._gc_thread = PythonGCThread(min_interval_s=ray_constants.RAY_GC_MIN_COLLECT_INTERVAL) + self._gc_thread.start() + def shutdown_driver(self): # If it's a worker, the core worker process should have been # shutdown. So we can't call @@ -3051,6 +3083,9 @@ cdef class CoreWorker: # Instead, we use the cached `is_driver` flag to test if it's a # driver. 
assert self.is_driver + if self._gc_thread is not None: + self._gc_thread.stop() + self._gc_thread = None with nogil: CCoreWorkerProcess.Shutdown() @@ -3192,9 +3227,6 @@ cdef class CoreWorker: def set_webui_display(self, key, message): CCoreWorkerProcess.GetCoreWorker().SetWebuiDisplay(key, message) - def set_actor_title(self, title): - CCoreWorkerProcess.GetCoreWorker().SetActorTitle(title) - def set_actor_repr_name(self, repr_name): CCoreWorkerProcess.GetCoreWorker().SetActorReprName(repr_name) @@ -3384,6 +3416,32 @@ cdef class CoreWorker: CCoreWorkerProcess.GetCoreWorker() .ExperimentalRegisterMutableObjectReader(c_object_id)) + def put_object( + self, + serialized_object, + *, + c_bool pin_object, + owner_address, + c_bool inline_small_object, + c_bool _is_experimental_channel, + int tensor_transport_val=0 + ): + """Create an object reference with the current worker as the owner. + """ + created_object = self.put_serialized_object_and_increment_local_ref( + serialized_object, pin_object, owner_address, inline_small_object, _is_experimental_channel, tensor_transport_val) + if owner_address is None: + owner_address = CCoreWorkerProcess.GetCoreWorker().GetRpcAddress().SerializeAsString() + + # skip_adding_local_ref is True because it's already added through the call to + # put_serialized_object_and_increment_local_ref. 
+ return ObjectRef( + created_object, + owner_address, + skip_adding_local_ref=True, + tensor_transport_val=tensor_transport_val + ) + def put_serialized_object_and_increment_local_ref( self, serialized_object, @@ -3391,6 +3449,7 @@ cdef class CoreWorker: owner_address=None, c_bool inline_small_object=True, c_bool _is_experimental_channel=False, + int tensor_transport_val=0 ): cdef: CObjectID c_object_id @@ -3404,6 +3463,7 @@ cdef class CoreWorker: serialized_object.contained_object_refs) size_t total_bytes = serialized_object.total_bytes + c_tensor_transport_val = tensor_transport_val with nogil: check_status(CCoreWorkerProcess.GetCoreWorker() .CreateOwnedAndIncrementLocalRef( @@ -3414,7 +3474,8 @@ cdef class CoreWorker: &c_object_id, &data, c_owner_address, - inline_small_object)) + inline_small_object, + c_tensor_transport_val)) if (data.get() == NULL): # Object already exists @@ -3778,6 +3839,7 @@ cdef class CoreWorker: labels, label_selector, c_bool allow_out_of_order_execution, + c_bool enable_tensor_transport, ): cdef: CRayFunction ray_function @@ -3832,6 +3894,7 @@ cdef class CoreWorker: c_concurrency_groups, allow_out_of_order_execution, max_pending_calls, + enable_tensor_transport, enable_task_events, c_labels, c_label_selector), @@ -3859,7 +3922,6 @@ cdef class CoreWorker: c_vector[unordered_map[c_string, double]] bundles, c_string strategy, c_bool is_detached, - double max_cpu_fraction_per_node, soft_target_node_id, c_vector[unordered_map[c_string, c_string]] bundle_label_selector): cdef: @@ -3891,7 +3953,6 @@ cdef class CoreWorker: c_strategy, bundles, is_detached, - max_cpu_fraction_per_node, c_soft_target_node_id, bundle_label_selector), &c_placement_group_id)) @@ -4113,6 +4174,7 @@ cdef class CoreWorker: max_task_retries = dereference(c_actor_handle).MaxTaskRetries() enable_task_events = dereference(c_actor_handle).EnableTaskEvents() allow_out_of_order_execution = dereference(c_actor_handle).AllowOutOfOrderExecution() + enable_tensor_transport = 
dereference(c_actor_handle).EnableTensorTransport() if language == Language.PYTHON: assert isinstance(actor_creation_function_descriptor, PythonFunctionDescriptor) @@ -4136,6 +4198,7 @@ cdef class CoreWorker: method_meta.retry_exceptions, method_meta.generator_backpressure_num_objects, # noqa method_meta.enable_task_events, + enable_tensor_transport, method_meta.method_name_to_tensor_transport, actor_method_cpu, actor_creation_function_descriptor, @@ -4154,6 +4217,7 @@ cdef class CoreWorker: {}, # method retry_exceptions {}, # generator_backpressure_num_objects {}, # enable_task_events + False, # enable_tensor_transport None, # method_name_to_tensor_transport 0, # actor method cpu actor_creation_function_descriptor, @@ -4362,6 +4426,17 @@ cdef class CoreWorker: num_returns = returns[0].size() if num_returns == 0: + if outputs is not None and len(outputs) > 0: + # Warn if num_returns=0 but the task returns a non-None value (likely unintended). + task_name = self.get_current_task_name() + obj_value = repr(outputs) + warnings.warn( + f"Task '{task_name}' has num_returns=0 but returned a non-None value '{obj_value}'. " + "The return value will be ignored.", + NumReturnsWarning, + stacklevel=2 + ) + return num_outputs_stored task_output_inlined_bytes = 0 @@ -4401,7 +4476,9 @@ cdef class CoreWorker: if c_tensor_transport != TENSOR_TRANSPORT_OBJECT_STORE: # `output` contains tensors. We need to retrieve these tensors from `output` # and store them in the GPUObjectManager. 
- serialized_object = context.serialize_and_store_gpu_objects(output, return_id.Hex()) + serialized_object, tensors = context.serialize_gpu_objects(output) + context.store_gpu_objects(return_id.Hex().decode("ascii"), tensors) + else: serialized_object = context.serialize(output) data_size = serialized_object.total_bytes @@ -4684,6 +4761,9 @@ cdef class CoreWorker: return self.current_runtime_env + def trigger_gc(self): + self._gc_thread.trigger_gc() + def get_pending_children_task_ids(self, parent_task_id: TaskID): cdef: CTaskID c_parent_task_id = parent_task_id.native() diff --git a/python/ray/actor.py b/python/ray/actor.py index f2a00efe5deb..07f808c90352 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -434,13 +434,18 @@ def bar(self): to use for the actor method. By default, the actor is single-threaded and runs all actor tasks on the same thread. See :ref:`Defining Concurrency Groups `. - tensor_transport: [Experimental] The tensor transport protocol to + tensor_transport: [Alpha] The tensor transport protocol to use for the actor method. The valid values are "OBJECT_STORE" - (default), "NCCL", or "GLOO" (case-insensitive). torch.Tensors - returned by this task will be sent to other tasks using the - specified transport. NCCL and GLOO transports require first creating - a collective with the involved actors using - `ray.experimental.collective.create_collective_group`. + (default), "NCCL", "GLOO", or "NIXL" (case-insensitive). If a + non-object store transport is specified, Ray will store a + *reference* instead of a copy of any torch.Tensors found inside + values returned by this task, and the tensors will be sent directly + to other tasks using the specified transport. NCCL and GLOO + transports require first creating a collective with the involved + actors using + :func:`ray.experimental.collective.create_collective_group`. + See :ref:`Ray Direct Transport (RDT) ` for more + details. 
""" valid_kwargs = [ "num_returns", @@ -815,11 +820,26 @@ def _remote( if tensor_transport is None: tensor_transport = self._tensor_transport - if tensor_transport != TensorTransportEnum.OBJECT_STORE and num_returns != 1: - raise ValueError( - f"Currently, methods with tensor_transport={tensor_transport.name} only support 1 return value. " - "Please make sure the actor method is decorated with `@ray.method(num_returns=1)` (the default)." - ) + if tensor_transport != TensorTransportEnum.OBJECT_STORE: + if num_returns != 1: + raise ValueError( + f"Currently, methods with tensor_transport={tensor_transport.name} only support 1 return value. " + "Please make sure the actor method is decorated with `@ray.method(num_returns=1)` (the default)." + ) + if not self._actor._ray_enable_tensor_transport: + raise ValueError( + f'Currently, methods with .options(tensor_transport="{tensor_transport.name}") are not supported when enable_tensor_transport=False. ' + "Please set @ray.remote(enable_tensor_transport=True) on the actor class definition." + ) + gpu_object_manager = ray._private.worker.global_worker.gpu_object_manager + if not gpu_object_manager.actor_has_tensor_transport( + self._actor, tensor_transport + ): + raise ValueError( + f'{self._actor} does not have tensor transport {tensor_transport.name} available. If using a collective-based transport ("nccl" or "gloo"), please create a communicator with ' + "`ray.experimental.collective.create_collective_group` " + "before calling actor tasks with non-default tensor_transport." + ) args = args or [] kwargs = kwargs or {} @@ -855,14 +875,18 @@ def invocation(args, kwargs): if self._decorator is not None: invocation = self._decorator(invocation) - obj_ref = invocation(args, kwargs) + object_refs = invocation(args, kwargs) if tensor_transport != TensorTransportEnum.OBJECT_STORE: + # Currently, we only support transfer tensor out-of-band when + # num_returns is 1. 
+ assert isinstance(object_refs, ObjectRef) + object_ref = object_refs gpu_object_manager = ray._private.worker.global_worker.gpu_object_manager gpu_object_manager.add_gpu_object_ref( - obj_ref, self._actor, tensor_transport + object_ref, self._actor, tensor_transport ) - return obj_ref + return object_refs def __getstate__(self): return { @@ -875,6 +899,7 @@ def __getstate__(self): "is_generator": self._is_generator, "generator_backpressure_num_objects": self._generator_backpressure_num_objects, # noqa "enable_task_events": self._enable_task_events, + "_tensor_transport": self._tensor_transport, } def __setstate__(self, state): @@ -888,6 +913,7 @@ def __setstate__(self, state): state["generator_backpressure_num_objects"], state["enable_task_events"], state["decorator"], + state["_tensor_transport"], ) @@ -929,7 +955,6 @@ def create( cls, modified_class, actor_creation_function_descriptor, - enable_tensor_transport: bool = False, ): # Try to create an instance from cache. cached_meta = cls._cache.get(actor_creation_function_descriptor) @@ -956,6 +981,15 @@ def create( self.concurrency_group_for_methods = {} self.method_name_to_tensor_transport: Dict[str, TensorTransportEnum] = {} + # Check whether any actor methods specify a non-default tensor transport. + self.has_tensor_transport_methods = any( + getattr( + method, "__ray_tensor_transport__", TensorTransportEnum.OBJECT_STORE + ) + != TensorTransportEnum.OBJECT_STORE + for _, method in actor_methods + ) + for method_name, method in actor_methods: # Whether or not this method requires binding of its first # argument. 
For class and static methods, we do not want to bind @@ -1015,15 +1049,6 @@ def create( method_name ] = method.__ray_tensor_transport__ - method_tensor_transport = self.method_name_to_tensor_transport.get( - method_name, None - ) - if not enable_tensor_transport and method_tensor_transport is not None: - if method_tensor_transport != TensorTransportEnum.OBJECT_STORE: - raise ValueError( - f"Method {method_name} has tensor_transport={method_tensor_transport.name} but enable_tensor_transport is False" - ) - # Update cache. cls._cache[actor_creation_function_descriptor] = self return self @@ -1039,6 +1064,7 @@ class _ActorClassMetadata: actor_creation_function_descriptor: The function descriptor for the actor creation task. class_id: The ID of this actor class. + method_meta: The actor method metadata. class_name: The name of this class. num_cpus: The default number of CPUs required by the actor creation task. @@ -1054,14 +1080,14 @@ class _ActorClassMetadata: See :ref:`accelerator types `. runtime_env: The runtime environment for this actor. scheduling_strategy: Strategy about how to schedule this actor. - enable_tensor_transport: Whether to enable out-of-band tensor transport for this actor. last_export_cluster_and_job: A pair of the last exported cluster and job to help us to know whether this function was exported. This is an imperfect mechanism used to determine if we need to export the remote function again. It is imperfect in the sense that the actor class definition could be exported multiple times by different workers. - method_meta: The actor method metadata. + enable_tensor_transport: Whether to enable out-of-band tensor transport + for this actor. 
""" def __init__( @@ -1070,6 +1096,7 @@ def __init__( modified_class, actor_creation_function_descriptor, class_id, + method_meta, max_restarts, max_task_retries, num_cpus, @@ -1082,11 +1109,12 @@ def __init__( runtime_env, concurrency_groups, scheduling_strategy: SchedulingStrategyT, - enable_tensor_transport: bool = False, + enable_tensor_transport: bool, ): self.language = language self.modified_class = modified_class self.actor_creation_function_descriptor = actor_creation_function_descriptor + self.method_meta = method_meta self.class_name = actor_creation_function_descriptor.class_name self.is_cross_language = language != Language.PYTHON self.class_id = class_id @@ -1102,13 +1130,8 @@ def __init__( self.runtime_env = runtime_env self.concurrency_groups = concurrency_groups self.scheduling_strategy = scheduling_strategy - self.enable_tensor_transport = enable_tensor_transport self.last_export_cluster_and_job = None - self.method_meta = _ActorClassMethodMetadata.create( - modified_class, - actor_creation_function_descriptor, - self.enable_tensor_transport, - ) + self.enable_tensor_transport = enable_tensor_transport @PublicAPI @@ -1116,7 +1139,7 @@ class ActorClassInheritanceException(TypeError): pass -def _process_option_dict(actor_options): +def _process_option_dict(actor_options, has_tensor_transport_methods): _filled_options = {} arg_names = set(inspect.getfullargspec(_ActorClassMetadata.__init__)[0]) for k, v in ray_option_utils.actor_options.items(): @@ -1125,15 +1148,28 @@ def _process_option_dict(actor_options): _filled_options["runtime_env"] = parse_runtime_env_for_task_or_actor( _filled_options["runtime_env"] ) + # If any actor method has a non-default tensor transport, automatically + # enable tensor transport, unless it was explicitly set to False by the + # user. + if has_tensor_transport_methods: + if _filled_options["enable_tensor_transport"] is False: + raise ValueError( + "Actor class has methods with @ray.method(tensor_transport=...) 
decorator but @ray.remote(enable_tensor_transport=False). " + "Either set enable_tensor_transport=True or remove the @ray.method(tensor_transport=...) decorator from the methods." + ) + _filled_options["enable_tensor_transport"] = True # Ray GPU objects requires a background thread for data transfer. However, # currently by default the background thread will be blocked if the main - # thread does not yield. For now, we explicitly create the background - # thread, which forces Ray to execute all tasks on background threads - # instead of the main thread. + # thread does not yield. For now, we explicitly create the background thread + # if `@ray.remote(enable_tensor_transport=True)` or if any methods are + # decorated with `@ray.method(tensor_transport=...)` and a non-default + # tensor transport. This forces Ray to execute all tasks on background + # threads instead of the main thread. # TODO(swang): Remove this code once # https://github.com/ray-project/ray/issues/54639 is fixed. - if _filled_options.get("enable_tensor_transport", False): + enable_tensor_transport = _filled_options.get("enable_tensor_transport", False) + if enable_tensor_transport: if _filled_options.get("concurrency_groups", None) is None: _filled_options["concurrency_groups"] = {} _filled_options["concurrency_groups"]["_ray_system"] = 1 @@ -1249,12 +1285,19 @@ def __init__(self, *args, **kwargs): modified_class.__ray_actor_class__ ) + actor_method_meta = _ActorClassMethodMetadata.create( + modified_class, + actor_creation_function_descriptor, + ) self.__ray_metadata__ = _ActorClassMetadata( Language.PYTHON, modified_class, actor_creation_function_descriptor, class_id, - **_process_option_dict(actor_options), + actor_method_meta, + **_process_option_dict( + actor_options, actor_method_meta.has_tensor_transport_methods + ), ) self._default_options = actor_options if "runtime_env" in self._default_options: @@ -1270,12 +1313,20 @@ def _ray_from_function_descriptor( actor_options, ): self = 
ActorClass.__new__(ActorClass) + modified_class = None + actor_method_meta = _ActorClassMethodMetadata.create( + modified_class, + actor_creation_function_descriptor, + ) self.__ray_metadata__ = _ActorClassMetadata( language, - None, + modified_class, actor_creation_function_descriptor, None, - **_process_option_dict(actor_options), + actor_method_meta, + **_process_option_dict( + actor_options, actor_method_meta.has_tensor_transport_methods + ), ) self._default_options = actor_options if "runtime_env" in self._default_options: @@ -1296,7 +1347,7 @@ def remote(self, *args, **kwargs) -> ActorProxy[T]: """ return self._remote(args=args, kwargs=kwargs, **self._default_options) - def options(self, **actor_options): + def options(self, **actor_options) -> "ActorClass[T]": """Configures and overrides the actor instantiation parameters. The arguments are the same as those that can be passed @@ -1751,6 +1802,7 @@ def _remote(self, args=None, kwargs=None, **actor_options) -> ActorProxy[T]: labels=actor_options.get("_labels"), label_selector=actor_options.get("label_selector"), allow_out_of_order_execution=allow_out_of_order_execution, + enable_tensor_transport=meta.enable_tensor_transport, ) if _actor_launch_hook: @@ -1771,6 +1823,7 @@ def _remote(self, args=None, kwargs=None, **actor_options) -> ActorProxy[T]: meta.method_meta.retry_exceptions, meta.method_meta.generator_backpressure_num_objects, meta.method_meta.enable_task_events, + meta.enable_tensor_transport, meta.method_meta.method_name_to_tensor_transport, actor_method_cpu, meta.actor_creation_function_descriptor, @@ -1847,6 +1900,7 @@ class ActorHandle(Generic[T]): _ray_actor_creation_function_descriptor: The function descriptor of the actor creation task. _ray_allow_out_of_order_execution: Whether the actor can execute tasks out of order. + _ray_enable_tensor_transport: Whether tensor transport is enabled for this actor. 
""" def __init__( @@ -1863,6 +1917,7 @@ def __init__( method_retry_exceptions: Dict[str, Union[bool, list, tuple]], method_generator_backpressure_num_objects: Dict[str, int], method_enable_task_events: Dict[str, bool], + enable_tensor_transport: bool, method_name_to_tensor_transport: Dict[str, TensorTransportEnum], actor_method_cpus: int, actor_creation_function_descriptor, @@ -1886,6 +1941,10 @@ def __init__( method_retry_exceptions: Dictionary mapping method names to their retry exception settings. method_generator_backpressure_num_objects: Dictionary mapping method names to their generator backpressure settings. method_enable_task_events: Dictionary mapping method names to whether task events are enabled. + enable_tensor_transport: Whether tensor transport is enabled for + this actor. If True, then methods can be called with + .options(tensor_transport=...) to specify a non-default tensor + transport. method_name_to_tensor_transport: Dictionary mapping method names to their tensor transport settings. actor_method_cpus: The number of CPUs required by actor methods. actor_creation_function_descriptor: The function descriptor for actor creation. 
@@ -1912,6 +1971,7 @@ def __init__( method_generator_backpressure_num_objects ) self._ray_method_enable_task_events = method_enable_task_events + self._ray_enable_tensor_transport = enable_tensor_transport self._ray_method_name_to_tensor_transport = method_name_to_tensor_transport self._ray_actor_method_cpus = actor_method_cpus self._ray_cluster_and_job = cluster_and_job @@ -2234,6 +2294,8 @@ def _serialization_helper(self): self._ray_method_generator_backpressure_num_objects ), "method_enable_task_events": self._ray_method_enable_task_events, + "enable_tensor_transport": self._ray_enable_tensor_transport, + "method_name_to_tensor_transport": self._ray_method_name_to_tensor_transport, "actor_method_cpus": self._ray_actor_method_cpus, "actor_creation_function_descriptor": self._ray_actor_creation_function_descriptor, # noqa: E501 }, @@ -2283,6 +2345,8 @@ def _deserialization_helper(cls, state, weak_ref: bool, outer_object_ref=None): state["method_retry_exceptions"], state["method_generator_backpressure_num_objects"], state["method_enable_task_events"], + state["enable_tensor_transport"], + state["method_name_to_tensor_transport"], state["actor_method_cpus"], state["actor_creation_function_descriptor"], state["current_cluster_and_job"], diff --git a/python/ray/air/BUILD b/python/ray/air/BUILD.bazel similarity index 100% rename from python/ray/air/BUILD rename to python/ray/air/BUILD.bazel diff --git a/python/ray/air/_internal/device_manager/npu.py b/python/ray/air/_internal/device_manager/npu.py index 3a3c554da44f..0a40594e14f1 100644 --- a/python/ray/air/_internal/device_manager/npu.py +++ b/python/ray/air/_internal/device_manager/npu.py @@ -6,8 +6,8 @@ import ray import ray._private.ray_constants as ray_constants -from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager from ray._private.accelerators.npu import ASCEND_RT_VISIBLE_DEVICES_ENV_VAR +from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager def 
is_package_present(package_name: str) -> bool: diff --git a/python/ray/air/_internal/torch_utils.py b/python/ray/air/_internal/torch_utils.py index 2264e8c17e96..96fe7bd84c74 100644 --- a/python/ray/air/_internal/torch_utils.py +++ b/python/ray/air/_internal/torch_utils.py @@ -1,24 +1,23 @@ import warnings -from typing import Any, Dict, List, Optional, Union, Sequence +from typing import Any, Dict, List, Optional, Sequence, Union import numpy as np import pandas as pd -import torch import pyarrow +import torch +from ray._private.ray_constants import env_bool from ray.air._internal.device_manager import get_torch_device_manager_by_context from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed from ray.data.collate_fn import ( - TensorBatchType, TensorBatchReturnType, - _is_tensor, - _is_tensor_sequence, + TensorBatchType, _is_nested_tensor_sequence, + _is_tensor, _is_tensor_mapping, + _is_tensor_sequence, _is_tensor_sequence_mapping, ) -from ray._private.ray_constants import env_bool - # Default non-blocking transfer for tensors. DEFAULT_TENSOR_NON_BLOCKING_TRANSFER = env_bool( @@ -385,8 +384,8 @@ def arrow_batch_to_tensors( A dictionary of column name to list of tensors. For non-chunked columns, the list will contain a single tensor. 
""" - from ray.data._internal.arrow_ops import transform_pyarrow from ray.data._internal.arrow_block import ArrowBlockAccessor + from ray.data._internal.arrow_ops import transform_pyarrow if combine_chunks: numpy_batch = ArrowBlockAccessor(batch).to_batch_format("numpy") diff --git a/python/ray/air/_internal/usage.py b/python/ray/air/_internal/usage.py index 9e921d40f803..4933a7517631 100644 --- a/python/ray/air/_internal/usage.py +++ b/python/ray/air/_internal/usage.py @@ -24,6 +24,7 @@ TRAIN_V2_TRAINERS = { "DataParallelTrainer", + "JaxTrainer", "LightGBMTrainer", "TensorflowTrainer", "TorchTrainer", diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 727ceb5c8cfb..01c93e3c354c 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -1,7 +1,8 @@ import logging +import os +import warnings from collections import Counter, defaultdict from dataclasses import _MISSING_TYPE, dataclass, fields -import os from pathlib import Path from typing import ( TYPE_CHECKING, @@ -14,7 +15,6 @@ Tuple, Union, ) -import warnings import pyarrow.fs diff --git a/python/ray/air/tests/test_air_usage.py b/python/ray/air/tests/test_air_usage.py index 6a1d65b96ac3..bc6bbc194bd4 100644 --- a/python/ray/air/tests/test_air_usage.py +++ b/python/ray/air/tests/test_air_usage.py @@ -210,10 +210,10 @@ def test_tag_air_entrypoint(ray_start_4_cpus, mock_record, entrypoint, tuner, tr ) def test_tag_train_entrypoint(mock_record): """Test that Train v2 entrypoints are recorded correctly.""" - from ray.train.v2.torch.torch_trainer import TorchTrainer + from ray.train.v2.lightgbm.lightgbm_trainer import LightGBMTrainer from ray.train.v2.tensorflow.tensorflow_trainer import TensorflowTrainer + from ray.train.v2.torch.torch_trainer import TorchTrainer from ray.train.v2.xgboost.xgboost_trainer import XGBoostTrainer - from ray.train.v2.lightgbm.lightgbm_trainer import LightGBMTrainer trainer_classes = [ TorchTrainer, diff --git a/python/ray/air/tests/test_arrow.py 
b/python/ray/air/tests/test_arrow.py index 31d533155c3a..4c2ebc3099e9 100644 --- a/python/ray/air/tests/test_arrow.py +++ b/python/ray/air/tests/test_arrow.py @@ -9,10 +9,10 @@ from ray._private.arrow_utils import get_pyarrow_version from ray.air.util.tensor_extensions.arrow import ( ArrowConversionError, + ArrowTensorArray, _convert_to_pyarrow_native_array, _infer_pyarrow_type, convert_to_pyarrow_array, - ArrowTensorArray, ) from ray.air.util.tensor_extensions.utils import create_ragged_ndarray from ray.data import DataContext diff --git a/python/ray/air/tests/test_integration_wandb.py b/python/ray/air/tests/test_integration_wandb.py index 04228162d2fd..05a64ee82d34 100644 --- a/python/ray/air/tests/test_integration_wandb.py +++ b/python/ray/air/tests/test_integration_wandb.py @@ -50,10 +50,10 @@ WANDB_POPULATE_RUN_LOCATION_HOOK, WANDB_PROJECT_ENV_VAR, WANDB_SETUP_API_KEY_HOOK, + RunDisabled, WandbLoggerCallback, _QueueItem, _WandbLoggingActor, - RunDisabled, setup_wandb, ) from ray.air.tests.mocked_wandb_integration import ( diff --git a/python/ray/air/tests/test_tensor_extension.py b/python/ray/air/tests/test_tensor_extension.py index fb5b6bbd43ab..1f0c8ff08756 100644 --- a/python/ray/air/tests/test_tensor_extension.py +++ b/python/ray/air/tests/test_tensor_extension.py @@ -800,6 +800,32 @@ def test_large_arrow_tensor_array(restore_data_context, tensor_format): assert np.asarray(arr).shape == (1000, 550) +@pytest.mark.parametrize("tensor_format", ["v1", "v2"]) +def test_tensor_array_string_tensors_simple(restore_data_context, tensor_format): + """Simple test for fixed-shape string tensor arrays with pandas/arrow roundtrip.""" + DataContext.get_current().use_arrow_tensor_v2 = tensor_format == "v2" + + # Create fixed-shape string tensor + string_tensors = np.array( + [["hello", "world"], ["arrow", "pandas"], ["tensor", "string"]] + ) + + # Create pandas DataFrame with TensorArray + df_pandas = pd.DataFrame({"id": [1, 2, 3], "strings": TensorArray(string_tensors)}) 
+ # Convert to Arrow table + arrow_table = pa.Table.from_pandas(df_pandas) + + # Convert back to pandas + df_roundtrip = arrow_table.to_pandas(ignore_metadata=True) + + # Verify the roundtrip preserves the data + original_strings = df_pandas["strings"].to_numpy() + roundtrip_strings = df_roundtrip["strings"].to_numpy() + + np.testing.assert_array_equal(original_strings, roundtrip_strings) + np.testing.assert_array_equal(roundtrip_strings, string_tensors) + + if __name__ == "__main__": import sys diff --git a/python/ray/air/util/object_extensions/arrow.py b/python/ray/air/util/object_extensions/arrow.py index 180fcfc96367..47867e54f5a3 100644 --- a/python/ray/air/util/object_extensions/arrow.py +++ b/python/ray/air/util/object_extensions/arrow.py @@ -6,8 +6,8 @@ from packaging.version import parse as parse_version import ray.air.util.object_extensions.pandas -from ray._private.serialization import pickle_dumps from ray._private.arrow_utils import get_pyarrow_version +from ray._private.serialization import pickle_dumps from ray.util.annotations import PublicAPI MIN_PYARROW_VERSION_SCALAR_SUBCLASS = parse_version("9.0.0") @@ -71,6 +71,9 @@ def __reduce__(self): self.__arrow_ext_serialize__(), ) + def __hash__(self) -> int: + return hash((type(self), self.storage_type.id, self.extension_name)) + @PublicAPI(stability="alpha") class ArrowPythonObjectScalar(pa.ExtensionScalar): diff --git a/python/ray/air/util/tensor_extensions/arrow.py b/python/ray/air/util/tensor_extensions/arrow.py index be0a88c83bb8..712b2af080c0 100644 --- a/python/ray/air/util/tensor_extensions/arrow.py +++ b/python/ray/air/util/tensor_extensions/arrow.py @@ -1,34 +1,32 @@ import abc -from datetime import datetime - import itertools import json import logging import sys +from datetime import datetime +from enum import Enum from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union import numpy as np import pyarrow as pa from packaging.version import parse as parse_version -import 
ray.cloudpickle as cloudpickle -from enum import Enum +import ray.cloudpickle as cloudpickle from ray._private.arrow_utils import get_pyarrow_version +from ray._private.ray_constants import env_integer from ray.air.util.tensor_extensions.utils import ( + ArrayLike, _is_ndarray_variable_shaped_tensor, - create_ragged_ndarray, _should_convert_to_tensor, - ArrayLike, + create_ragged_ndarray, ) from ray.data._internal.numpy_support import ( - convert_to_numpy, _convert_datetime_to_np_datetime, + convert_to_numpy, ) from ray.util import log_once from ray.util.annotations import DeveloperAPI, PublicAPI from ray.util.common import INT32_MAX -from ray._private.ray_constants import env_integer - PYARROW_VERSION = get_pyarrow_version() # Minimum version of Arrow that supports subclassable ExtensionScalars. @@ -574,6 +572,9 @@ def _need_variable_shaped_tensor_array( shape = arr_type.shape return False + def __hash__(self) -> int: + return hash((type(self), self.extension_name, self.storage_type, self._shape)) + @PublicAPI(stability="beta") class ArrowTensorType(_BaseFixedShapeArrowTensorType): @@ -584,6 +585,7 @@ class ArrowTensorType(_BaseFixedShapeArrowTensorType): """ OFFSET_DTYPE = np.int32 + __hash__ = _BaseFixedShapeArrowTensorType.__hash__ def __init__(self, shape: Tuple[int, ...], dtype: pa.DataType): """ @@ -614,6 +616,7 @@ class ArrowTensorTypeV2(_BaseFixedShapeArrowTensorType): """Arrow ExtensionType (v2) for tensors (supporting tensors > 4Gb).""" OFFSET_DTYPE = np.int64 + __hash__ = _BaseFixedShapeArrowTensorType.__hash__ def __init__(self, shape: Tuple[int, ...], dtype: pa.DataType): """ @@ -1125,6 +1128,9 @@ def _extension_scalar_to_ndarray(self, scalar: "pa.ExtensionScalar") -> np.ndarr data_buffer = raw_values.buffers()[1] return _to_ndarray_helper(shape, value_type, offset, data_buffer) + def __hash__(self) -> int: + return hash((type(self), self.extension_name, self.storage_type, self._ndim)) + # NOTE: We need to inherit from the mixin before 
pa.ExtensionArray to ensure that the # mixin's overriding methods appear first in the MRO. diff --git a/python/ray/air/util/tensor_extensions/utils.py b/python/ray/air/util/tensor_extensions/utils.py index 8468f721751e..142814285ffd 100644 --- a/python/ray/air/util/tensor_extensions/utils.py +++ b/python/ray/air/util/tensor_extensions/utils.py @@ -1,5 +1,5 @@ import warnings -from typing import TYPE_CHECKING, Any, Sequence, Union, List, Protocol +from typing import TYPE_CHECKING, Any, List, Protocol, Sequence, Union import numpy as np diff --git a/python/ray/air/util/torch_dist.py b/python/ray/air/util/torch_dist.py index acc7d78a47f8..c133d8133425 100644 --- a/python/ray/air/util/torch_dist.py +++ b/python/ray/air/util/torch_dist.py @@ -15,10 +15,10 @@ import torch.distributed as dist import ray +from ray._common.network_utils import build_address from ray.actor import ActorHandle from ray.air._internal.torch_utils import get_devices from ray.train._internal.utils import get_address_and_port -from ray._common.network_utils import build_address class TorchDistributedWorker(ABC): diff --git a/python/ray/autoscaler/BUILD b/python/ray/autoscaler/BUILD.bazel similarity index 100% rename from python/ray/autoscaler/BUILD rename to python/ray/autoscaler/BUILD.bazel diff --git a/python/ray/autoscaler/_private/_azure/config.py b/python/ray/autoscaler/_private/_azure/config.py index 5320b1698ec3..1c2b1a91a90f 100644 --- a/python/ray/autoscaler/_private/_azure/config.py +++ b/python/ray/autoscaler/_private/_azure/config.py @@ -1,5 +1,6 @@ import json import logging +import os import random from hashlib import sha256 from pathlib import Path @@ -10,6 +11,12 @@ from azure.mgmt.resource import ResourceManagementClient from azure.mgmt.resource.resources.models import DeploymentMode +from ray.autoscaler._private.util import ( + generate_rsa_key_pair, + generate_ssh_key_name, + generate_ssh_key_paths, +) + UNIQUE_ID_LEN = 4 logger = logging.getLogger(__name__) @@ -218,22 +225,104 
@@ def _configure_resource_group(config): def _configure_key_pair(config): + """ + Configure SSH keypair. Use user specified custom paths, otherwise, + generate Ray-specific keypair in this format: "ray-autoscaler_azure_{region}_{resource_group}_{ssh_user}" + """ ssh_user = config["auth"]["ssh_user"] public_key = None - # search if the keys exist - for key_type in ["ssh_private_key", "ssh_public_key"]: - try: - key_path = Path(config["auth"][key_type]).expanduser() - except KeyError: - raise Exception("Config must define {}".format(key_type)) - except TypeError: - raise Exception("Invalid config value for {}".format(key_type)) - assert key_path.is_file(), "Could not find ssh key: {}".format(key_path) + # Check if user specified custom SSH key paths + user_specified_private_key = "ssh_private_key" in config["auth"] + user_specified_public_key = "ssh_public_key" in config["auth"] + + # Validate that the user either specified both keys or none, but not just one + if user_specified_private_key != user_specified_public_key: + if user_specified_private_key: + missing_key, specified_key = "ssh_public_key", "ssh_private_key" + else: + missing_key, specified_key = "ssh_private_key", "ssh_public_key" + raise ValueError( + f"{specified_key} is specified but {missing_key} is missing. " + "Both SSH key paths must be specified together, or omit both from " + "your config to use auto-generated keys." 
+ ) + + if user_specified_private_key and user_specified_public_key: + # User specified custom paths + private_key_path = Path(config["auth"]["ssh_private_key"]).expanduser() + public_key_path = Path(config["auth"]["ssh_public_key"]).expanduser() + + # Validate that user-specified keys exist + missing_keys = [] + if not private_key_path.is_file(): + missing_keys.append(f"ssh_private_key: {private_key_path}") + if not public_key_path.is_file(): + missing_keys.append(f"ssh_public_key: {public_key_path}") + + if missing_keys: + raise ValueError( + "SSH key files from config do not exist: {}. " + "Please create the keys or remove the custom paths from your config " + "to use auto-generated keys.".format(", ".join(missing_keys)) + ) + logger.info( + "Using specified SSH keys from config: {} and {}".format( + private_key_path, public_key_path + ) + ) - if key_type == "ssh_public_key": - with open(key_path, "r") as f: + with open(public_key_path, "r") as f: + public_key = f.read() + else: + # Generate Ray-specific keys + region = config["provider"]["location"] + resource_group = config["provider"]["resource_group"] + + # Generate single deterministic key name for this configuration + key_name = generate_ssh_key_name( + "azure", None, region, resource_group, ssh_user + ) + public_key_path, private_key_path = generate_ssh_key_paths(key_name) + + # Check if this key pair already exists + if os.path.exists(private_key_path) and os.path.exists(public_key_path): + logger.info( + "Using existing Ray-specific SSH keys: {} and {}".format( + private_key_path, public_key_path + ) + ) + with open(public_key_path, "r") as f: public_key = f.read() + else: + # Create a key pair since it doesn't exist locally + logger.info( + "Generating new Ray-specific SSH key pair at {} and {}".format( + private_key_path, public_key_path + ) + ) + os.makedirs(os.path.dirname(private_key_path), exist_ok=True) + public_key, private_key = generate_rsa_key_pair() + with open( + private_key_path, + "w", + 
opener=lambda path, flags: os.open(path, flags, 0o600), + ) as f: + f.write(private_key) + with open(public_key_path, "w") as f: + f.write(public_key) + + assert os.path.exists( + private_key_path + ), "Private key file {} not found for user {}".format( + private_key_path, ssh_user + ) + + config["auth"]["ssh_private_key"] = private_key_path + config["auth"]["ssh_public_key"] = public_key_path + if "file_mounts" not in config: + config["file_mounts"] = {} + config["file_mounts"]["~/.ssh/id_rsa.pub"] = public_key_path for node_type in config["available_node_types"].values(): azure_arm_parameters = node_type["node_config"].setdefault( diff --git a/python/ray/autoscaler/_private/autoscaler.py b/python/ray/autoscaler/_private/autoscaler.py index e352479b9308..2051977bf655 100644 --- a/python/ray/autoscaler/_private/autoscaler.py +++ b/python/ray/autoscaler/_private/autoscaler.py @@ -207,7 +207,7 @@ def __init__( config_reader: Path to a Ray Autoscaler YAML, or a function to read and return the latest config. load_metrics: Provides metrics for the Ray cluster. - session_name: The session name of the cluster this autoscaler + session_name: The current Ray session name when this autoscaler is deployed. max_launch_batch: Max number of nodes to launch in one request. max_concurrent_launches: Max number of nodes that can be @@ -635,10 +635,10 @@ def drain_nodes_via_gcs(self, provider_node_ids_to_drain: List[NodeID]): # For type checking, assert that this object has been instantitiated. assert self.provider - # The GCS expects Raylet ids in the request, rather than NodeProvider - # ids. To get the Raylet ids of the nodes to we're draining, we make + # The GCS expects Node ids in the request, rather than NodeProvider + # ids. To get the Node ids of the nodes we're draining, we make # the following translations of identifiers: - # node provider node id -> ip -> raylet id + # node provider node id -> ip -> node id # Convert node provider node ids to ips. 
node_ips = set() @@ -660,29 +660,29 @@ def drain_nodes_via_gcs(self, provider_node_ids_to_drain: List[NodeID]): # Only attempt to drain connected nodes, i.e. nodes with ips in # LoadMetrics. - connected_node_ips = node_ips & self.load_metrics.raylet_id_by_ip.keys() + connected_node_ips = node_ips & self.load_metrics.node_id_by_ip.keys() - # Convert ips to Raylet ids. - # (The assignment ip->raylet_id is well-defined under current + # Convert ips to Node ids. + # (The assignment ip->node_id is well-defined under current # assumptions. See "use_node_id_as_ip" in monitor.py) - raylet_ids_to_drain = { - self.load_metrics.raylet_id_by_ip[ip] for ip in connected_node_ips + node_ids_to_drain = { + self.load_metrics.node_id_by_ip[ip] for ip in connected_node_ips } - if not raylet_ids_to_drain: + if not node_ids_to_drain: return - logger.info(f"Draining {len(raylet_ids_to_drain)} raylet(s).") + logger.info(f"Draining {len(node_ids_to_drain)} raylet(s).") try: # A successful response indicates that the GCS has marked the # desired nodes as "drained." The cloud provider can then terminate # the nodes without the GCS printing an error. # Check if we succeeded in draining all of the intended nodes by # looking at the RPC response. 
- drained_raylet_ids = set( - self.gcs_client.drain_nodes(raylet_ids_to_drain, timeout=5) + drained_node_ids = set( + self.gcs_client.drain_nodes(node_ids_to_drain, timeout=5) ) - failed_to_drain = raylet_ids_to_drain - drained_raylet_ids + failed_to_drain = node_ids_to_drain - drained_node_ids if failed_to_drain: self.prom_metrics.drain_node_exceptions.inc() logger.error(f"Failed to drain {len(failed_to_drain)} raylet(s).") diff --git a/python/ray/autoscaler/_private/aws/node_provider.py b/python/ray/autoscaler/_private/aws/node_provider.py index f11e8a6dbcdb..3b8673dbb730 100644 --- a/python/ray/autoscaler/_private/aws/node_provider.py +++ b/python/ray/autoscaler/_private/aws/node_provider.py @@ -127,6 +127,8 @@ def __init__(self, provider_config, cluster_name): self.ready_for_new_batch.set() self.tag_cache_lock = threading.Lock() self.count_lock = threading.Lock() + # Prevent concurrent create_node calls to get the same stopped/stopping node to reuse. + self._reuse_node_lock = threading.Lock() # Cache of node objects from the last nodes() call. This avoids # excessive DescribeInstances requests. @@ -290,32 +292,35 @@ def create_node(self, node_config, tags, count) -> Dict[str, Any]: } ) - reuse_nodes = list(self.ec2.instances.filter(Filters=filters))[:count] - reuse_node_ids = [n.id for n in reuse_nodes] - reused_nodes_dict = {n.id: n for n in reuse_nodes} - if reuse_nodes: - cli_logger.print( - # todo: handle plural vs singular? - "Reusing nodes {}. " - "To disable reuse, set `cache_stopped_nodes: False` " - "under `provider` in the cluster configuration.", - cli_logger.render_list(reuse_node_ids), - ) + with self._reuse_node_lock: + reuse_nodes = list(self.ec2.instances.filter(Filters=filters))[:count] + reuse_node_ids = [n.id for n in reuse_nodes] + reused_nodes_dict = {n.id: n for n in reuse_nodes} + if reuse_nodes: + cli_logger.print( + # todo: handle plural vs singular? + "Reusing nodes {}. 
" + "To disable reuse, set `cache_stopped_nodes: False` " + "under `provider` in the cluster configuration.", + cli_logger.render_list(reuse_node_ids), + ) - # todo: timed? - with cli_logger.group("Stopping instances to reuse"): - for node in reuse_nodes: - self.tag_cache[node.id] = from_aws_format( - {x["Key"]: x["Value"] for x in node.tags} - ) - if node.state["Name"] == "stopping": - cli_logger.print("Waiting for instance {} to stop", node.id) - node.wait_until_stopped() - - self.ec2.meta.client.start_instances(InstanceIds=reuse_node_ids) - for node_id in reuse_node_ids: - self.set_node_tags(node_id, tags) - count -= len(reuse_node_ids) + # todo: timed? + with cli_logger.group("Stopping instances to reuse"): + for node in reuse_nodes: + self.tag_cache[node.id] = from_aws_format( + {x["Key"]: x["Value"] for x in node.tags} + ) + if node.state["Name"] == "stopping": + cli_logger.print( + "Waiting for instance {} to stop", node.id + ) + node.wait_until_stopped() + + self.ec2.meta.client.start_instances(InstanceIds=reuse_node_ids) + for node_id in reuse_node_ids: + self.set_node_tags(node_id, tags) + count -= len(reuse_node_ids) created_nodes_dict = {} if count: diff --git a/python/ray/autoscaler/_private/command_runner.py b/python/ray/autoscaler/_private/command_runner.py index 44ca7369efe3..55ffd5d9c838 100644 --- a/python/ray/autoscaler/_private/command_runner.py +++ b/python/ray/autoscaler/_private/command_runner.py @@ -133,13 +133,18 @@ def __init__(self, ssh_key, control_path=None, **kwargs): "ServerAliveCountMax": 3, } if control_path: - self.arg_dict.update( - { - "ControlMaster": "auto", - "ControlPath": "{}/%C".format(control_path), - "ControlPersist": "10s", - } - ) + if sys.platform == "win32": + # Don't set any control path options on Windows + pass + else: + self.arg_dict.update( + { + "ControlMaster": "auto", + "ControlPath": "{}/%C".format(control_path), + "ControlPersist": "10s", + } + ) + self.arg_dict.update(kwargs) def to_ssh_options_list(self, 
*, timeout=60): @@ -170,9 +175,13 @@ def __init__( ssh_control_hash = hashlib.sha1(cluster_name.encode()).hexdigest() ssh_user_hash = hashlib.sha1(getuser().encode()).hexdigest() - ssh_control_path = "/tmp/ray_ssh_{}/{}".format( - ssh_user_hash[:HASH_MAX_LENGTH], ssh_control_hash[:HASH_MAX_LENGTH] - ) + if sys.platform == "win32": + # Disable SSH control paths on Windows - using them currently causes socket errors + ssh_control_path = None + else: + ssh_control_path = "/tmp/ray_ssh_{}/{}".format( + ssh_user_hash[:HASH_MAX_LENGTH], ssh_control_hash[:HASH_MAX_LENGTH] + ) self.cluster_name = cluster_name self.log_prefix = log_prefix @@ -238,10 +247,11 @@ def _set_ssh_ip_if_required(self): # This should run before any SSH commands and therefore ensure that # the ControlPath directory exists, allowing SSH to maintain # persistent sessions later on. - try: - os.makedirs(self.ssh_control_path, mode=0o700, exist_ok=True) - except OSError as e: - cli_logger.warning("{}", str(e)) # todo: msg + if self.ssh_control_path is not None: + try: + os.makedirs(self.ssh_control_path, mode=0o700, exist_ok=True) + except OSError as e: + cli_logger.warning("{}", str(e)) # todo: msg def _run_helper( self, @@ -406,32 +416,48 @@ def run_rsync_up(self, source, target, options=None): self._set_ssh_ip_if_required() options = options or {} - command = ["rsync"] - command += [ - "--rsh", - subprocess.list2cmdline( - ["ssh"] + self.ssh_options.to_ssh_options_list(timeout=120) - ), - ] - command += ["-avz"] - command += self._create_rsync_filter_args(options=options) - command += [source, "{}@{}:{}".format(self.ssh_user, self.ssh_ip, target)] + # on Windows use scp -r instead of rsync + if sys.platform == "win32": + # Use scp as fallback for Windows + command = ["scp", "-r"] + command += self.ssh_options.to_ssh_options_list(timeout=120) + command += [source, "{}@{}:{}".format(self.ssh_user, self.ssh_ip, target)] + else: + command = ["rsync"] + command += [ + "--rsh", + subprocess.list2cmdline( + 
["ssh"] + self.ssh_options.to_ssh_options_list(timeout=120) + ), + ] + command += ["-avz"] + command += self._create_rsync_filter_args(options=options) + command += [source, "{}@{}:{}".format(self.ssh_user, self.ssh_ip, target)] + cli_logger.verbose("Running `{}`", cf.bold(" ".join(command))) self._run_helper(command, silent=is_rsync_silent()) def run_rsync_down(self, source, target, options=None): self._set_ssh_ip_if_required() - command = ["rsync"] - command += [ - "--rsh", - subprocess.list2cmdline( - ["ssh"] + self.ssh_options.to_ssh_options_list(timeout=120) - ), - ] - command += ["-avz"] - command += self._create_rsync_filter_args(options=options) - command += ["{}@{}:{}".format(self.ssh_user, self.ssh_ip, source), target] + # on Windows use scp -r instead of rsync + if sys.platform == "win32": + # Use scp as fallback for Windows + command = ["scp", "-r"] + command += self.ssh_options.to_ssh_options_list(timeout=120) + command += ["{}@{}:{}".format(self.ssh_user, self.ssh_ip, source), target] + else: + command = ["rsync"] + command += [ + "--rsh", + subprocess.list2cmdline( + ["ssh"] + self.ssh_options.to_ssh_options_list(timeout=120) + ), + ] + command += ["-avz"] + command += self._create_rsync_filter_args(options=options) + command += ["{}@{}:{}".format(self.ssh_user, self.ssh_ip, source), target] + cli_logger.verbose("Running `{}`", cf.bold(" ".join(command))) self._run_helper(command, silent=is_rsync_silent()) @@ -510,8 +536,13 @@ def run_rsync_up(self, source, target, options=None): self._get_docker_host_mount_location(self.ssh_command_runner.cluster_name), target.lstrip("/"), ) - host_mount_location = os.path.dirname(host_destination.rstrip("/")) + if sys.platform == "win32": + # fix paths if running on Windows + source = source.replace("\\", "/") + host_mount_location = host_mount_location.replace("\\", "/") + host_destination = host_destination.replace("\\", "/") + self.ssh_command_runner.run( f"mkdir -p {host_mount_location} && chown -R " 
f"{self.ssh_command_runner.ssh_user} {host_mount_location}", @@ -558,9 +589,11 @@ def run_rsync_down(self, source, target, options=None): source.lstrip("/"), ) host_mount_location = os.path.dirname(host_source.rstrip("/")) + # Convert Windows paths to Unix-style for remote commands + host_mount_location_unix = host_mount_location.replace("\\", "/") self.ssh_command_runner.run( - f"mkdir -p {host_mount_location} && chown -R " - f"{self.ssh_command_runner.ssh_user} {host_mount_location}", + f"mkdir -p {host_mount_location_unix} && chown -R " + f"{self.ssh_command_runner.ssh_user} {host_mount_location_unix}", silent=is_rsync_silent(), ) if source[-1] == "/": @@ -575,7 +608,9 @@ def run_rsync_down(self, source, target, options=None): self.docker_cmd, self.container_name, self._docker_expand_user(source), - host_source, + host_source.replace( + "\\", "/" + ), # Convert Windows paths to Unix-style for rsync ), silent=is_rsync_silent(), ) @@ -728,7 +763,6 @@ def run_init( "{} pull {}".format(self.docker_cmd, specific_image), run_env="host" ) else: - self.run( f"{self.docker_cmd} image inspect {specific_image} " "1> /dev/null 2>&1 || " @@ -750,9 +784,9 @@ def run_init( specific_image, cleaned_bind_mounts ) if requires_re_init: - self.run( - f"{self.docker_cmd} stop {self.container_name}", run_env="host" - ) + docker_stop_cmd = f"{self.docker_cmd} stop {self.container_name}" + logger.info("Executing Docker command: %s", docker_stop_cmd) + self.run(docker_stop_cmd, run_env="host") if (not container_running) or requires_re_init: if not sync_run_yet: @@ -821,7 +855,9 @@ def run_init( self.ssh_command_runner.cluster_name ), mount, - ), + ).replace( + "\\", "/" + ), # Convert Windows paths to Unix-style for rsync container=self.container_name, dst=self._docker_expand_user(mount), ) diff --git a/python/ray/autoscaler/_private/commands.py b/python/ray/autoscaler/_private/commands.py index 52f87f2e31a0..fd15218e65e2 100644 --- a/python/ray/autoscaler/_private/commands.py +++ 
b/python/ray/autoscaler/_private/commands.py @@ -824,8 +824,16 @@ def get_or_create_head_node( # Use RAY_UP_enable_autoscaler_v2 instead of RAY_enable_autoscaler_v2 # to avoid accidentally enabling autoscaler v2 for ray up - # due to env inheritance. - if os.getenv("RAY_UP_enable_autoscaler_v2", "0") == "1": + # due to env inheritance. The default value is 1 since Ray 2.50.0. + if os.getenv("RAY_UP_enable_autoscaler_v2", "1") == "1": + if "RAY_UP_enable_autoscaler_v2" not in os.environ: + # TODO (rueian): Remove this notice after Ray 2.52.0. + cli_logger.print( + "Autoscaler v2 is now enabled by default (since Ray 2.50.0). " + "To switch back to v1, set {}=0. This message can be suppressed by setting {} explicitly.", + cf.bold("RAY_UP_enable_autoscaler_v2"), + cf.bold("RAY_UP_enable_autoscaler_v2"), + ) ray_start_commands = with_envs( ray_start_commands, { @@ -926,6 +934,18 @@ def get_or_create_head_node( ) cli_logger.newline() + # Clean up temporary config file if it was created + # Clean up temporary config file if it was created on Windows + if ( + sys.platform == "win32" + and not no_monitor_on_head + and "remote_config_file" in locals() + ): + try: + os.remove(remote_config_file.name) + except OSError: + pass # Ignore cleanup errors + def _should_create_new_head( head_node_id: Optional[str], @@ -1025,9 +1045,14 @@ def _set_up_config_for_head_node( remote_config = provider.prepare_for_head_node(remote_config) # Now inject the rewritten config and SSH key into the head node - remote_config_file = tempfile.NamedTemporaryFile("w", prefix="ray-bootstrap-") + is_windows = sys.platform == "win32" + remote_config_file = tempfile.NamedTemporaryFile( + "w", prefix="ray-bootstrap-", delete=not is_windows + ) remote_config_file.write(json.dumps(remote_config)) remote_config_file.flush() + if is_windows: + remote_config_file.close() # Close the file handle to ensure it's accessible config["file_mounts"].update( {"~/ray_bootstrap_config.yaml": remote_config_file.name} ) 
diff --git a/python/ray/autoscaler/_private/gcp/config.py b/python/ray/autoscaler/_private/gcp/config.py index b48a7e984762..2e646526cb34 100644 --- a/python/ray/autoscaler/_private/gcp/config.py +++ b/python/ray/autoscaler/_private/gcp/config.py @@ -9,17 +9,18 @@ import google_auth_httplib2 import googleapiclient import httplib2 -from cryptography.hazmat.backends import default_backend -from cryptography.hazmat.primitives import serialization -from cryptography.hazmat.primitives.asymmetric import rsa from google.oauth2 import service_account from google.oauth2.credentials import Credentials as OAuthCredentials from googleapiclient import discovery, errors -from ray._private.accelerators import TPUAcceleratorManager -from ray._private.accelerators import tpu +from ray._private.accelerators import TPUAcceleratorManager, tpu from ray.autoscaler._private.gcp.node import MAX_POLLS, POLL_INTERVAL, GCPNodeType -from ray.autoscaler._private.util import check_legacy_fields +from ray.autoscaler._private.util import ( + check_legacy_fields, + generate_rsa_key_pair, + generate_ssh_key_name, + generate_ssh_key_paths, +) logger = logging.getLogger(__name__) @@ -244,43 +245,6 @@ def wait_for_compute_global_operation(project_name, operation, compute): return result -def key_pair_name(i, region, project_id, ssh_user): - """Returns the ith default gcp_key_pair_name.""" - key_name = "{}_gcp_{}_{}_{}_{}".format(RAY, region, project_id, ssh_user, i) - return key_name - - -def key_pair_paths(key_name): - """Returns public and private key paths for a given key_name.""" - public_key_path = os.path.expanduser("~/.ssh/{}.pub".format(key_name)) - private_key_path = os.path.expanduser("~/.ssh/{}.pem".format(key_name)) - return public_key_path, private_key_path - - -def generate_rsa_key_pair(): - """Create public and private ssh-keys.""" - - key = rsa.generate_private_key( - backend=default_backend(), public_exponent=65537, key_size=2048 - ) - - public_key = ( - key.public_key() - 
.public_bytes( - serialization.Encoding.OpenSSH, serialization.PublicFormat.OpenSSH - ) - .decode("utf-8") - ) - - pem = key.private_bytes( - encoding=serialization.Encoding.PEM, - format=serialization.PrivateFormat.TraditionalOpenSSL, - encryption_algorithm=serialization.NoEncryption(), - ).decode("utf-8") - - return public_key, pem - - def _has_tpus_in_node_configs(config: dict) -> bool: """Check if any nodes in config are TPUs.""" node_configs = [ @@ -555,10 +519,14 @@ def _configure_key_pair(config, compute): # Try a few times to get or create a good key pair. key_found = False for i in range(10): - key_name = key_pair_name( - i, config["provider"]["region"], config["provider"]["project_id"], ssh_user + key_name = generate_ssh_key_name( + "gcp", + i, + config["provider"]["region"], + config["provider"]["project_id"], + ssh_user, ) - public_key_path, private_key_path = key_pair_paths(key_name) + public_key_path, private_key_path = generate_ssh_key_paths(key_name) for ssh_key in ssh_keys: key_parts = ssh_key.split(" ") diff --git a/python/ray/autoscaler/_private/gcp/node_provider.py b/python/ray/autoscaler/_private/gcp/node_provider.py index 2d7147a60619..56398433f624 100644 --- a/python/ray/autoscaler/_private/gcp/node_provider.py +++ b/python/ray/autoscaler/_private/gcp/node_provider.py @@ -18,8 +18,8 @@ # The logic has been abstracted away here to allow for different GCP resources # (API endpoints), which can differ widely, making it impossible to use # the same logic for everything. 
-from ray.autoscaler._private.gcp.node import GCPTPU # noqa from ray.autoscaler._private.gcp.node import ( + GCPTPU, # noqa GCPCompute, GCPNode, GCPNodeType, diff --git a/python/ray/autoscaler/_private/kuberay/node_provider.py b/python/ray/autoscaler/_private/kuberay/node_provider.py index b62b5ca78fa2..b3715f0dd9fa 100644 --- a/python/ray/autoscaler/_private/kuberay/node_provider.py +++ b/python/ray/autoscaler/_private/kuberay/node_provider.py @@ -8,6 +8,7 @@ import requests +from ray._common.network_utils import build_address from ray.autoscaler._private.constants import WORKER_LIVENESS_CHECK_KEY from ray.autoscaler._private.util import NodeID, NodeIP, NodeKind, NodeStatus, NodeType from ray.autoscaler.batching_node_provider import ( @@ -22,7 +23,6 @@ STATUS_UPDATE_FAILED, TAG_RAY_USER_NODE_TYPE, ) -from ray._common.network_utils import build_address # Key for KubeRay label that identifies a Ray pod as head or worker. KUBERAY_LABEL_KEY_KIND = "ray.io/node-type" diff --git a/python/ray/autoscaler/_private/kuberay/run_autoscaler.py b/python/ray/autoscaler/_private/kuberay/run_autoscaler.py index dcc810073797..37b09db1f46a 100644 --- a/python/ray/autoscaler/_private/kuberay/run_autoscaler.py +++ b/python/ray/autoscaler/_private/kuberay/run_autoscaler.py @@ -4,15 +4,15 @@ import time import ray -from ray._private import ray_constants +from ray._common.network_utils import build_address from ray._common.ray_constants import ( - LOGGING_ROTATE_BYTES, LOGGING_ROTATE_BACKUP_COUNT, + LOGGING_ROTATE_BYTES, ) +from ray._common.utils import try_to_create_directory +from ray._private import ray_constants from ray._private.ray_logging import setup_component_logger from ray._private.services import get_node_ip_address -from ray._common.network_utils import build_address -from ray._common.utils import try_to_create_directory from ray._raylet import GcsClient from ray.autoscaler._private.kuberay.autoscaling_config import AutoscalingConfigProducer from 
ray.autoscaler._private.monitor import Monitor diff --git a/python/ray/autoscaler/_private/load_metrics.py b/python/ray/autoscaler/_private/load_metrics.py index 2e083730aa30..ec94647bda9c 100644 --- a/python/ray/autoscaler/_private/load_metrics.py +++ b/python/ray/autoscaler/_private/load_metrics.py @@ -73,7 +73,7 @@ def __init__(self): self.last_heartbeat_time_by_ip = {} self.static_resources_by_ip = {} self.dynamic_resources_by_ip = {} - self.raylet_id_by_ip = {} + self.node_id_by_ip = {} self.waiting_bundles = [] self.infeasible_bundles = [] self.pending_placement_groups = [] @@ -85,12 +85,12 @@ def __bool__(self): """A load metrics instance is Falsey iff the autoscaler process has not received a resource message from the GCS. """ - return bool(self.raylet_id_by_ip) + return bool(self.node_id_by_ip) def update( self, ip: str, - raylet_id: bytes, + node_id: bytes, static_resources: Dict[str, Dict], dynamic_resources: Dict[str, Dict], node_idle_duration_s: float, @@ -100,7 +100,7 @@ def update( cluster_full_of_actors_detected: bool = False, ): self.static_resources_by_ip[ip] = static_resources - self.raylet_id_by_ip[ip] = raylet_id + self.node_id_by_ip[ip] = node_id self.cluster_full_of_actors_detected = cluster_full_of_actors_detected if not waiting_bundles: @@ -163,7 +163,7 @@ def prune(mapping, should_log): prune(self.ray_nodes_last_used_time_by_ip, should_log=True) prune(self.static_resources_by_ip, should_log=False) - prune(self.raylet_id_by_ip, should_log=False) + prune(self.node_id_by_ip, should_log=False) prune(self.dynamic_resources_by_ip, should_log=False) prune(self.last_heartbeat_time_by_ip, should_log=False) diff --git a/python/ray/autoscaler/_private/monitor.py b/python/ray/autoscaler/_private/monitor.py index d62886e3a669..55a922293fca 100644 --- a/python/ray/autoscaler/_private/monitor.py +++ b/python/ray/autoscaler/_private/monitor.py @@ -14,14 +14,15 @@ import ray import ray._private.ray_constants as ray_constants +from ray._common.network_utils 
import build_address, parse_address from ray._common.ray_constants import ( - LOGGING_ROTATE_BYTES, LOGGING_ROTATE_BACKUP_COUNT, + LOGGING_ROTATE_BYTES, ) +from ray._private import logging_utils from ray._private.event.event_logger import get_event_logger from ray._private.ray_logging import setup_component_logger from ray._raylet import GcsClient -from ray._common.network_utils import parse_address, build_address from ray.autoscaler._private.autoscaler import StandardAutoscaler from ray.autoscaler._private.commands import teardown_cluster from ray.autoscaler._private.constants import ( @@ -44,7 +45,6 @@ _internal_kv_initialized, _internal_kv_put, ) -from ray._private import logging_utils try: import prometheus_client diff --git a/python/ray/autoscaler/_private/spark/node_provider.py b/python/ray/autoscaler/_private/spark/node_provider.py index 9fbea1f525ab..9c59ba4ed9fd 100644 --- a/python/ray/autoscaler/_private/spark/node_provider.py +++ b/python/ray/autoscaler/_private/spark/node_provider.py @@ -6,6 +6,7 @@ import requests +from ray._common.network_utils import build_address from ray.autoscaler.node_launch_exception import NodeLaunchException from ray.autoscaler.node_provider import NodeProvider from ray.autoscaler.tags import ( @@ -18,7 +19,6 @@ TAG_RAY_NODE_STATUS, TAG_RAY_USER_NODE_TYPE, ) -from ray._common.network_utils import build_address logger = logging.getLogger(__name__) diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py index 7da96de2b14b..396dd409cdf4 100644 --- a/python/ray/autoscaler/_private/util.py +++ b/python/ray/autoscaler/_private/util.py @@ -4,6 +4,7 @@ import json import logging import os +import sys import threading from dataclasses import dataclass from datetime import datetime @@ -12,8 +13,8 @@ from typing import Any, Dict, List, Optional, Tuple, Union import ray -from ray._common.utils import PLACEMENT_GROUP_BUNDLE_RESOURCE_NAME import ray._private.services as services +from ray._common.utils 
import PLACEMENT_GROUP_BUNDLE_RESOURCE_NAME from ray._private.utils import ( PLACEMENT_GROUP_INDEXED_BUNDLED_RESOURCE_PATTERN, PLACEMENT_GROUP_WILDCARD_RESOURCE_PATTERN, @@ -193,6 +194,12 @@ def validate_config(config: Dict[str, Any]) -> None: "sum of `min_workers` of all the available node types." ) + if sys.platform == "win32" and config.get("file_mounts_sync_continuously", False): + raise ValueError( + "`file_mounts_sync_continuously` is not supported on Windows. " + "Please set this to False when running on Windows." + ) + def check_legacy_fields(config: Dict[str, Any]) -> None: """For use in providers that have completed the migration to @@ -749,7 +756,7 @@ def get_constraint_report(request_demand: List[DictCount]): if len(constraint_lines) > 0: constraints_report = "\n".join(constraint_lines) else: - constraints_report = " (no request_resources() constraints)" + constraints_report = " (none)" return constraints_report @@ -941,9 +948,9 @@ def format_info_string( {separator} Total Usage: {usage_report} -Total Constraints: +From request_resources: {constraints_report} -Total Demands: +Pending Demands: {demand_report}""" if verbose: @@ -990,3 +997,47 @@ def format_no_node_type_string(node_type: dict): output_lines.append(output_line) return "\n ".join(output_lines) + + +def generate_rsa_key_pair(): + from cryptography.hazmat.backends import default_backend + from cryptography.hazmat.primitives import serialization + from cryptography.hazmat.primitives.asymmetric import rsa + + key = rsa.generate_private_key( + backend=default_backend(), public_exponent=65537, key_size=2048 + ) + + public_key = ( + key.public_key() + .public_bytes( + serialization.Encoding.OpenSSH, serialization.PublicFormat.OpenSSH + ) + .decode("utf-8") + ) + + pem = key.private_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PrivateFormat.TraditionalOpenSSL, + encryption_algorithm=serialization.NoEncryption(), + ).decode("utf-8") + + return public_key, pem + + +def 
generate_ssh_key_paths(key_name): + public_key_path = os.path.expanduser("~/.ssh/{}.pub".format(key_name)) + private_key_path = os.path.expanduser("~/.ssh/{}.pem".format(key_name)) + return public_key_path, private_key_path + + +def generate_ssh_key_name(provider, i, region, identifier, ssh_user): + RAY_PREFIX = "ray-autoscaler" + if i is not None: + return "{}_{}_{}_{}_{}_{}".format( + RAY_PREFIX, provider, region, identifier, ssh_user, i + ) + else: + return "{}_{}_{}_{}_{}".format( + RAY_PREFIX, provider, region, identifier, ssh_user + ) diff --git a/python/ray/autoscaler/aws/BUILD b/python/ray/autoscaler/aws/BUILD.bazel similarity index 100% rename from python/ray/autoscaler/aws/BUILD rename to python/ray/autoscaler/aws/BUILD.bazel diff --git a/python/ray/autoscaler/azure/BUILD b/python/ray/autoscaler/azure/BUILD.bazel similarity index 100% rename from python/ray/autoscaler/azure/BUILD rename to python/ray/autoscaler/azure/BUILD.bazel diff --git a/python/ray/autoscaler/azure/defaults.yaml b/python/ray/autoscaler/azure/defaults.yaml index 592a0f02e681..1c0e32655a3a 100644 --- a/python/ray/autoscaler/azure/defaults.yaml +++ b/python/ray/autoscaler/azure/defaults.yaml @@ -36,11 +36,10 @@ provider: # How Ray will authenticate with newly launched nodes. 
auth: ssh_user: ubuntu - # you must specify paths to matching private and public key pair files - # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair - ssh_private_key: ~/.ssh/id_rsa - # changes to this should match what is specified in file_mounts - ssh_public_key: ~/.ssh/id_rsa.pub + # SSH keys will be auto-generated with Ray-specific names if not specified + # Uncomment and specify custom paths if you want to use different existing keys: + # ssh_private_key: /path/to/your/key.pem + # ssh_public_key: /path/to/your/key.pub # More specific customization to node configurations can be made using the ARM template azure-vm-template.json file # See documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines @@ -92,7 +91,6 @@ head_node_type: ray.head.default file_mounts: { # "/path1/on/remote/machine": "/path1/on/local/machine", # "/path2/on/remote/machine": "/path2/on/local/machine", - "~/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub" } # Files or directories to copy from the head node to the worker nodes. The format is a diff --git a/python/ray/autoscaler/azure/example-full.yaml b/python/ray/autoscaler/azure/example-full.yaml index 6bb911268f05..fbcf05f82b17 100644 --- a/python/ray/autoscaler/azure/example-full.yaml +++ b/python/ray/autoscaler/azure/example-full.yaml @@ -59,11 +59,10 @@ provider: # How Ray will authenticate with newly launched nodes. auth: ssh_user: ubuntu - # You must specify paths to matching private and public key pair files. - # Use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair. - ssh_private_key: ~/.ssh/id_rsa - # Changes to this should match what is specified in file_mounts. 
- ssh_public_key: ~/.ssh/id_rsa.pub + # SSH keys will be auto-generated with Ray-specific names if not specified + # Uncomment and specify custom paths if you want to use different existing keys: + # ssh_private_key: /path/to/your/key.pem + # ssh_public_key: /path/to/your/key.pub # You can make more specific customization to node configurations can be made using the ARM template azure-vm-template.json file. # See this documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines @@ -127,7 +126,7 @@ head_node_type: ray.head.default file_mounts: { # "/path1/on/remote/machine": "/path1/on/local/machine", # "/path2/on/remote/machine": "/path2/on/local/machine", - "~/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"} +} # Files or directories to copy from the head node to the worker nodes. The format is a # list of paths. Ray copies the same path on the head node to the worker node. diff --git a/python/ray/autoscaler/azure/example-gpu-docker.yaml b/python/ray/autoscaler/azure/example-gpu-docker.yaml index 3ebc763e7d26..6f322324b98f 100644 --- a/python/ray/autoscaler/azure/example-gpu-docker.yaml +++ b/python/ray/autoscaler/azure/example-gpu-docker.yaml @@ -43,11 +43,10 @@ provider: # How Ray will authenticate with newly launched nodes. auth: ssh_user: ubuntu - # you must specify paths to matching private and public key pair files - # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair - ssh_private_key: ~/.ssh/id_rsa - # changes to this should match what is specified in file_mounts - ssh_public_key: ~/.ssh/id_rsa.pub + # SSH keys will be auto-generated with Ray-specific names if not specified + # Uncomment and specify custom paths if you want to use different existing keys: + # ssh_private_key: /path/to/your/key.pem + # ssh_public_key: /path/to/your/key.pub # Tell the autoscaler the allowed node types and the resources they provide. # The key is the name of the node type, which is just for debugging purposes. 
@@ -98,7 +97,6 @@ head_node_type: ray.head.gpu file_mounts: { # "/path1/on/remote/machine": "/path1/on/local/machine", # "/path2/on/remote/machine": "/path2/on/local/machine", - "~/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub" } # List of commands that will be run before `setup_commands`. If docker is diff --git a/python/ray/autoscaler/azure/example-minimal.yaml b/python/ray/autoscaler/azure/example-minimal.yaml index 768eef14a325..de51922482df 100644 --- a/python/ray/autoscaler/azure/example-minimal.yaml +++ b/python/ray/autoscaler/azure/example-minimal.yaml @@ -14,13 +14,14 @@ provider: # How Ray will authenticate with newly launched nodes. auth: ssh_user: ubuntu - # you must specify paths to matching private and public key pair files - # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair - ssh_private_key: ~/.ssh/id_rsa - # changes to this should match what is specified in file_mounts - ssh_public_key: ~/.ssh/id_rsa.pub + # SSH keys will be auto-generated with Ray-specific names if not specified + # Uncomment and specify custom paths if you want to use different existing keys: + # ssh_private_key: /path/to/your/key.pem + # ssh_public_key: /path/to/your/key.pub # Files or directories to copy to the head and worker nodes. The format is a # dictionary from REMOTE_PATH: LOCAL_PATH, e.g. 
file_mounts: { - "~/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"} +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} diff --git a/python/ray/autoscaler/gcp/BUILD b/python/ray/autoscaler/gcp/BUILD.bazel similarity index 100% rename from python/ray/autoscaler/gcp/BUILD rename to python/ray/autoscaler/gcp/BUILD.bazel diff --git a/python/ray/autoscaler/local/BUILD b/python/ray/autoscaler/local/BUILD.bazel similarity index 100% rename from python/ray/autoscaler/local/BUILD rename to python/ray/autoscaler/local/BUILD.bazel diff --git a/python/ray/autoscaler/local/coordinator_server.py b/python/ray/autoscaler/local/coordinator_server.py index 6ea69d71857b..7cca12645631 100644 --- a/python/ray/autoscaler/local/coordinator_server.py +++ b/python/ray/autoscaler/local/coordinator_server.py @@ -6,12 +6,12 @@ import argparse import json import logging -import threading import socket +import threading from http.server import HTTPServer, SimpleHTTPRequestHandler -from ray.autoscaler._private.local.node_provider import LocalNodeProvider from ray._common.network_utils import build_address +from ray.autoscaler._private.local.node_provider import LocalNodeProvider logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) diff --git a/python/ray/autoscaler/sdk/sdk.py b/python/ray/autoscaler/sdk/sdk.py index 276e85892c0e..437538bbeb3b 100644 --- a/python/ray/autoscaler/sdk/sdk.py +++ b/python/ray/autoscaler/sdk/sdk.py @@ -8,8 +8,10 @@ from ray.autoscaler._private import commands from ray.autoscaler._private.cli_logger import cli_logger -from ray.autoscaler._private.event_system import CreateClusterEvent # noqa: F401 -from ray.autoscaler._private.event_system import global_event_system # noqa: F401 +from ray.autoscaler._private.event_system import ( + CreateClusterEvent, # noqa: F401 + global_event_system, # noqa: F401 +) from ray.util.annotations import DeveloperAPI diff --git 
a/python/ray/autoscaler/v2/BUILD b/python/ray/autoscaler/v2/BUILD.bazel similarity index 100% rename from python/ray/autoscaler/v2/BUILD rename to python/ray/autoscaler/v2/BUILD.bazel diff --git a/python/ray/autoscaler/v2/autoscaler.py b/python/ray/autoscaler/v2/autoscaler.py index c55646a46d8e..cdc8620bc2b7 100644 --- a/python/ray/autoscaler/v2/autoscaler.py +++ b/python/ray/autoscaler/v2/autoscaler.py @@ -1,6 +1,7 @@ import logging from queue import Queue from typing import List, Optional +from urllib.parse import urlsplit from ray._raylet import GcsClient from ray.autoscaler._private.providers import _get_node_provider @@ -39,7 +40,6 @@ from ray.autoscaler.v2.scheduler import ResourceDemandScheduler from ray.autoscaler.v2.sdk import get_cluster_resource_state from ray.core.generated.autoscaler_pb2 import AutoscalingState -from urllib.parse import urlsplit logger = logging.getLogger(__name__) @@ -55,7 +55,7 @@ def __init__( ) -> None: """ Args: - session_name: The name of the ray session. + session_name: The current Ray session name. config_reader: The config reader. gcs_client: The GCS client. event_logger: The event logger for emitting cluster events. 
diff --git a/python/ray/autoscaler/v2/event_logger.py b/python/ray/autoscaler/v2/event_logger.py index 316378867e27..b4db3d2b798b 100644 --- a/python/ray/autoscaler/v2/event_logger.py +++ b/python/ray/autoscaler/v2/event_logger.py @@ -3,8 +3,6 @@ from typing import Dict, List, Optional from ray._private.event.event_logger import EventLoggerAdapter -from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig -from ray.autoscaler.v2.schema import NodeType from ray.autoscaler.v2.utils import ResourceRequestUtil from ray.core.generated.autoscaler_pb2 import ( ClusterResourceConstraint, @@ -31,8 +29,7 @@ def __init__(self, logger: EventLoggerAdapter): def log_cluster_scheduling_update( self, - node_type_configs: Dict[NodeType, NodeTypeConfig], - cluster_shape: Dict[NodeType, int], + cluster_resources: Dict[str, float], launch_requests: Optional[List[LaunchRequest]] = None, terminate_requests: Optional[List[TerminationRequest]] = None, infeasible_requests: Optional[List[ResourceRequest]] = None, @@ -42,7 +39,29 @@ def log_cluster_scheduling_update( ] = None, ) -> None: """ - Log any update of the cluster scheduling state. + Log updates to the autoscaler scheduling state. + + Emits: + - info logs for node launches and terminations (counts grouped by node type). + - an info log summarizing the cluster size after a resize (CPUs/GPUs/TPUs). + - warnings describing infeasible single resource requests, infeasible gang + (placement group) requests, and infeasible cluster resource constraints. + + Args: + cluster_resources: Mapping of resource name to total resources for the + current cluster state. + launch_requests: Node launch requests issued in this scheduling step. + terminate_requests: Node termination requests issued in this scheduling + step. + infeasible_requests: Resource requests that could not be satisfied by + any available node type. + infeasible_gang_requests: Gang/placement group requests that could not + be scheduled. 
+ infeasible_cluster_resource_constraints: Cluster-level resource + constraints that could not be satisfied. + + Returns: + None """ # Log any launch events. @@ -78,23 +97,16 @@ def log_cluster_scheduling_update( # Cluster shape changes. if launch_requests or terminate_requests: - total_resources = defaultdict(float) - - for node_type, count in cluster_shape.items(): - node_config = node_type_configs[node_type] - for resource_name, resource_quantity in node_config.resources.items(): - total_resources[resource_name] += resource_quantity * count - - num_cpus = total_resources.get("CPU", 0) + num_cpus = cluster_resources.get("CPU", 0) log_str = f"Resized to {int(num_cpus)} CPUs" - if "GPU" in total_resources: - log_str += f", {int(total_resources['GPU'])} GPUs" - if "TPU" in total_resources: - log_str += f", {int(total_resources['TPU'])} TPUs" + if "GPU" in cluster_resources: + log_str += f", {int(cluster_resources['GPU'])} GPUs" + if "TPU" in cluster_resources: + log_str += f", {int(cluster_resources['TPU'])} TPUs" self._logger.info(f"{log_str}.") - self._logger.debug(f"Current cluster shape: {dict(cluster_shape)}.") + self._logger.debug(f"Current cluster resources: {dict(cluster_resources)}.") # Log any infeasible requests. 
if infeasible_requests: diff --git a/python/ray/autoscaler/v2/instance_manager/config.py b/python/ray/autoscaler/v2/instance_manager/config.py index ef329f804e71..d94d157c86fa 100644 --- a/python/ray/autoscaler/v2/instance_manager/config.py +++ b/python/ray/autoscaler/v2/instance_manager/config.py @@ -8,8 +8,8 @@ import yaml -from ray._private.ray_constants import env_integer from ray._common.utils import binary_to_hex +from ray._private.ray_constants import env_integer from ray._raylet import GcsClient from ray.autoscaler._private.constants import ( AUTOSCALER_MAX_CONCURRENT_LAUNCHES, diff --git a/python/ray/autoscaler/v2/instance_manager/ray_installer.py b/python/ray/autoscaler/v2/instance_manager/ray_installer.py index 3daf5d3e0b71..e99b2b1492ca 100644 --- a/python/ray/autoscaler/v2/instance_manager/ray_installer.py +++ b/python/ray/autoscaler/v2/instance_manager/ray_installer.py @@ -2,9 +2,9 @@ import subprocess from ray.autoscaler._private.updater import ( - NodeUpdater, - TAG_RAY_NODE_STATUS, STATUS_UP_TO_DATE, + TAG_RAY_NODE_STATUS, + NodeUpdater, ) from ray.autoscaler._private.util import with_envs, with_head_node_ip from ray.autoscaler.node_provider import NodeProvider as NodeProviderV1 diff --git a/python/ray/autoscaler/v2/instance_manager/reconciler.py b/python/ray/autoscaler/v2/instance_manager/reconciler.py index f274854eca6a..b403803e577b 100644 --- a/python/ray/autoscaler/v2/instance_manager/reconciler.py +++ b/python/ray/autoscaler/v2/instance_manager/reconciler.py @@ -21,10 +21,10 @@ LaunchNodeError, TerminateNodeError, ) +from ray.autoscaler.v2.instance_manager.subscribers.ray_stopper import RayStopError from ray.autoscaler.v2.instance_manager.subscribers.threaded_ray_installer import ( RayInstallError, ) -from ray.autoscaler.v2.instance_manager.subscribers.ray_stopper import RayStopError from ray.autoscaler.v2.metrics_reporter import AutoscalerMetricsReporter from ray.autoscaler.v2.scheduler import IResourceScheduler, SchedulingRequest from 
ray.autoscaler.v2.schema import AutoscalerInstance, NodeType @@ -38,12 +38,10 @@ PendingInstance, PendingInstanceRequest, ) -from ray.core.generated.instance_manager_pb2 import GetInstanceManagerStateRequest -from ray.core.generated.instance_manager_pb2 import Instance as IMInstance from ray.core.generated.instance_manager_pb2 import ( + GetInstanceManagerStateRequest, + Instance as IMInstance, InstanceUpdateEvent as IMInstanceUpdateEvent, -) -from ray.core.generated.instance_manager_pb2 import ( NodeKind, StatusCode, UpdateInstanceManagerStateRequest, diff --git a/python/ray/autoscaler/v2/instance_manager/subscribers/threaded_ray_installer.py b/python/ray/autoscaler/v2/instance_manager/subscribers/threaded_ray_installer.py index d364ccea07e0..d525b1aeccaa 100644 --- a/python/ray/autoscaler/v2/instance_manager/subscribers/threaded_ray_installer.py +++ b/python/ray/autoscaler/v2/instance_manager/subscribers/threaded_ray_installer.py @@ -2,8 +2,8 @@ import logging import time from concurrent.futures import ThreadPoolExecutor -from typing import List from queue import Queue +from typing import List from ray.autoscaler.v2.instance_manager.instance_manager import ( InstanceUpdatedSubscriber, @@ -11,9 +11,9 @@ from ray.autoscaler.v2.instance_manager.instance_storage import InstanceStorage from ray.autoscaler.v2.instance_manager.ray_installer import RayInstaller from ray.core.generated.instance_manager_pb2 import ( - NodeKind, Instance, InstanceUpdateEvent, + NodeKind, ) logger = logging.getLogger(__name__) diff --git a/python/ray/autoscaler/v2/monitor.py b/python/ray/autoscaler/v2/monitor.py index e771d6e7e404..34e31e7ac649 100644 --- a/python/ray/autoscaler/v2/monitor.py +++ b/python/ray/autoscaler/v2/monitor.py @@ -13,16 +13,17 @@ import ray import ray._private.ray_constants as ray_constants +from ray._common.network_utils import build_address, parse_address from ray._common.ray_constants import ( - LOGGING_ROTATE_BYTES, LOGGING_ROTATE_BACKUP_COUNT, + 
LOGGING_ROTATE_BYTES, ) +from ray._common.usage.usage_lib import record_extra_usage_tag +from ray._private import logging_utils from ray._private.event.event_logger import get_event_logger from ray._private.ray_logging import setup_component_logger -from ray._common.usage.usage_lib import record_extra_usage_tag from ray._private.worker import SCRIPT_MODE from ray._raylet import GcsClient -from ray._common.network_utils import parse_address, build_address from ray.autoscaler._private.constants import ( AUTOSCALER_METRIC_PORT, AUTOSCALER_UPDATE_INTERVAL_S, @@ -39,7 +40,6 @@ from ray.core.generated.autoscaler_pb2 import AutoscalingState from ray.core.generated.event_pb2 import Event as RayEvent from ray.core.generated.usage_pb2 import TagKey -from ray._private import logging_utils try: import prometheus_client diff --git a/python/ray/autoscaler/v2/scheduler.py b/python/ray/autoscaler/v2/scheduler.py index 641baa2c81a9..6dbacd893619 100644 --- a/python/ray/autoscaler/v2/scheduler.py +++ b/python/ray/autoscaler/v2/scheduler.py @@ -20,13 +20,13 @@ from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig from ray.autoscaler.v2.schema import AutoscalerInstance, NodeType from ray.autoscaler.v2.utils import ProtobufUtil, ResourceRequestUtil -from ray.core.generated.common_pb2 import LabelSelectorOperator from ray.core.generated.autoscaler_pb2 import ( ClusterResourceConstraint, GangResourceRequest, ResourceRequest, ResourceRequestByCount, ) +from ray.core.generated.common_pb2 import LabelSelectorOperator from ray.core.generated.instance_manager_pb2 import ( Instance, LaunchRequest, @@ -161,6 +161,8 @@ class SchedulingNode: # The node's current resource capacity. total_resources: Dict[str, float] = field(default_factory=dict) # Node's labels, including static or dynamic labels. + # Note that dynamic labels are a deprecated feature. 
And it is only used for the + # autoscaler’s strict-spread placement group scheduling (antiaffinity) labels: Dict[str, str] = field(default_factory=dict) # Observability descriptive message for why the node was launched in the # first place. @@ -278,6 +280,9 @@ def new( available_resources=dict(instance.ray_node.available_resources), labels={ **(instance.ray_node.labels or {}), + # DEPRECATED: Dynamic labels are a deprecated feature. This field + # is used here only for the autoscaler’s strict-spread placement + # group scheduling (antiaffinity). **(instance.ray_node.dynamic_labels or {}), }, status=SchedulingNodeStatus.SCHEDULABLE, @@ -606,7 +611,7 @@ def _try_schedule_one( # Add the request to the node. self.add_sched_request(request, resource_request_source) - # Update the dynamic labels if there's any + # Update the placement group in labels if there's any for constraint in request.placement_constraints: # We don't need to check for affinity constraints here since # we have already combined resource requests with the affinity @@ -825,6 +830,26 @@ def get_cluster_shape(self) -> Dict[NodeType, int]: cluster_shape[node.node_type] += 1 return cluster_shape + def get_cluster_resources(self) -> Dict[str, float]: + """ + Aggregate total cluster resources. + + Sums each node's `total_resources` across the current context, + excluding nodes marked `TO_TERMINATE`. + + Returns: + A dict mapping resource names to their summed resources. + """ + cluster_resources = defaultdict(float) + for node in self._nodes: + if node.status == SchedulingNodeStatus.TO_TERMINATE: + # Skip the nodes that are to be terminated. 
+ continue + + for key, value in node.total_resources.items(): + cluster_resources[key] += value + return cluster_resources + def get_idle_timeout_s(self) -> Optional[float]: return self._idle_timeout_s @@ -949,8 +974,7 @@ def schedule(self, request: SchedulingRequest) -> SchedulingReply: infeasible_requests=infeasible_requests, infeasible_gang_requests=infeasible_gang_requests, infeasible_cluster_resource_constraints=infeasible_constraints, - cluster_shape=ctx.get_cluster_shape(), - node_type_configs=ctx.get_node_type_configs(), + cluster_resources=ctx.get_cluster_resources(), ) except Exception: logger.exception("Failed to emit event logs.") diff --git a/python/ray/autoscaler/v2/tests/test_e2e.py b/python/ray/autoscaler/v2/tests/test_e2e.py index 4283af09a091..afa5b67baa1c 100644 --- a/python/ray/autoscaler/v2/tests/test_e2e.py +++ b/python/ray/autoscaler/v2/tests/test_e2e.py @@ -7,10 +7,10 @@ import pytest import ray -from ray._common.test_utils import wait_for_condition from ray._common.constants import HEAD_NODE_RESOURCE_NAME -from ray._private.test_utils import run_string_as_driver_nonblocking +from ray._common.test_utils import wait_for_condition from ray._common.usage.usage_lib import get_extra_usage_tags_to_report +from ray._private.test_utils import run_string_as_driver_nonblocking from ray._raylet import GcsClient from ray.autoscaler.v2.sdk import get_cluster_status from ray.cluster_utils import AutoscalingCluster diff --git a/python/ray/autoscaler/v2/tests/test_event_logger.py b/python/ray/autoscaler/v2/tests/test_event_logger.py index da127b1a2be0..1f7a339aa903 100644 --- a/python/ray/autoscaler/v2/tests/test_event_logger.py +++ b/python/ray/autoscaler/v2/tests/test_event_logger.py @@ -5,7 +5,6 @@ import pytest from ray.autoscaler.v2.event_logger import AutoscalerEventLogger -from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig from ray.autoscaler.v2.tests.util import MockEventLogger from ray.autoscaler.v2.utils import 
ResourceRequestUtil from ray.core.generated.autoscaler_pb2 import ( @@ -83,21 +82,7 @@ def test_log_scheduling_updates(): ) ) ], - cluster_shape={"type-1": 1, "type-2": 2}, - node_type_configs={ - "type-1": NodeTypeConfig( - name="type-1", - max_worker_nodes=10, - min_worker_nodes=1, - resources={"CPU": 1, "GPU": 1}, - ), - "type-2": NodeTypeConfig( - name="type-2", - max_worker_nodes=10, - min_worker_nodes=1, - resources={"CPU": 2, "GPU": 2, "TPU": 1}, - ), - }, + cluster_resources={"CPU": 5, "GPU": 5, "TPU": 2}, ) assert mock_logger.get_logs("info") == [ @@ -117,7 +102,7 @@ def test_log_scheduling_updates(): assert mock_logger.get_logs("error") == [] assert mock_logger.get_logs("debug") == [ - "Current cluster shape: {'type-1': 1, 'type-2': 2}." + "Current cluster resources: {'CPU': 5, 'GPU': 5, 'TPU': 2}." ] diff --git a/python/ray/autoscaler/v2/tests/test_scheduler.py b/python/ray/autoscaler/v2/tests/test_scheduler.py index 1f95f83df3b2..a1ca0a9f0944 100644 --- a/python/ray/autoscaler/v2/tests/test_scheduler.py +++ b/python/ray/autoscaler/v2/tests/test_scheduler.py @@ -22,7 +22,6 @@ from ray.autoscaler.v2.schema import AutoscalerInstance, NodeType from ray.autoscaler.v2.tests.util import MockEventLogger, make_autoscaler_instance from ray.autoscaler.v2.utils import ResourceRequestUtil -from ray.core.generated.common_pb2 import LabelSelectorOperator from ray.core.generated.autoscaler_pb2 import ( ClusterResourceConstraint, GangResourceRequest, @@ -30,6 +29,7 @@ NodeStatus, ResourceRequest, ) +from ray.core.generated.common_pb2 import LabelSelectorOperator from ray.core.generated.instance_manager_pb2 import ( Instance, NodeKind, diff --git a/python/ray/autoscaler/v2/tests/test_threaded_ray_installer.py b/python/ray/autoscaler/v2/tests/test_threaded_ray_installer.py index 79cd43092d36..12594562d678 100644 --- a/python/ray/autoscaler/v2/tests/test_threaded_ray_installer.py +++ b/python/ray/autoscaler/v2/tests/test_threaded_ray_installer.py @@ -2,8 +2,8 @@ import os 
import sys import unittest -from unittest.mock import patch from queue import Queue +from unittest.mock import patch import pytest # noqa diff --git a/python/ray/autoscaler/v2/tests/test_utils.py b/python/ray/autoscaler/v2/tests/test_utils.py index 1322c5ee3b0d..2bec1c29e4e4 100644 --- a/python/ray/autoscaler/v2/tests/test_utils.py +++ b/python/ray/autoscaler/v2/tests/test_utils.py @@ -567,9 +567,9 @@ def test_cluster_status_formatter(): 0.0/4.0 GPU 5.42KiB/10.04KiB object_store_memory -Total Constraints: +From request_resources: {'GPU': 2, 'CPU': 100}: 2 from request_resources() -Total Demands: +Pending Demands: {'CPU': 1, 'GPU': 1}: 11+ pending tasks/actors {'CPU': 1, 'GPU': 1} * 1 (STRICT_SPREAD): 1+ pending placement groups {'GPU': 2} * 1 (STRICT_PACK): 2+ pending placement groups diff --git a/python/ray/autoscaler/v2/utils.py b/python/ray/autoscaler/v2/utils.py index d3128e961c63..bca10ddd1786 100644 --- a/python/ray/autoscaler/v2/utils.py +++ b/python/ray/autoscaler/v2/utils.py @@ -39,13 +39,11 @@ NodeStatus, PlacementConstraint, ResourceRequest, -) -from ray.core.generated.autoscaler_pb2 import ( ResourceRequestByCount as ResourceRequestByCountProto, ) from ray.core.generated.common_pb2 import ( - LabelSelectorConstraint, LabelSelector, + LabelSelectorConstraint, ) from ray.experimental.internal_kv import internal_kv_get_gcs_client @@ -400,9 +398,9 @@ def format(cls, data: ClusterStatus, verbose: bool = False) -> str: separator, "Total Usage:", cluster_usage_report, - "Total Constraints:", + "From request_resources:", constraints_report, - "Total Demands:", + "Pending Demands:", demand_report, node_usage_report, ] @@ -631,7 +629,7 @@ def _constraint_report( constraint_lines.append(f" {bundle}: {count} from request_resources()") if constraint_lines: return "\n".join(constraint_lines) - return " (no request_resources() constraints)" + return " (none)" @staticmethod def _demand_report(data: ClusterStatus) -> str: diff --git a/python/ray/client_builder.py 
b/python/ray/client_builder.py index 714a941699c3..48d7f4cd0718 100644 --- a/python/ray/client_builder.py +++ b/python/ray/client_builder.py @@ -15,8 +15,7 @@ RAY_RUNTIME_ENV_ENVIRONMENT_VARIABLE, ) from ray._private.utils import get_ray_client_dependency_error, split_address -from ray._private.worker import BaseContext -from ray._private.worker import init as ray_driver_init +from ray._private.worker import BaseContext, init as ray_driver_init from ray.job_config import JobConfig from ray.util.annotations import Deprecated, PublicAPI diff --git a/python/ray/dag/BUILD b/python/ray/dag/BUILD.bazel similarity index 100% rename from python/ray/dag/BUILD rename to python/ray/dag/BUILD.bazel diff --git a/python/ray/dag/class_node.py b/python/ray/dag/class_node.py index 63d29086d34a..1a5b78e8e706 100644 --- a/python/ray/dag/class_node.py +++ b/python/ray/dag/class_node.py @@ -1,19 +1,18 @@ +from typing import Any, Dict, List, Optional, Tuple, Union from weakref import ReferenceType import ray -from ray.dag.dag_node import DAGNode -from ray.dag.input_node import InputNode -from ray.dag.format_utils import get_dag_node_str from ray.dag.constants import ( - PARENT_CLASS_NODE_KEY, - PREV_CLASS_METHOD_CALL_KEY, BIND_INDEX_KEY, IS_CLASS_METHOD_OUTPUT_KEY, + PARENT_CLASS_NODE_KEY, + PREV_CLASS_METHOD_CALL_KEY, ) +from ray.dag.dag_node import DAGNode +from ray.dag.format_utils import get_dag_node_str +from ray.dag.input_node import InputNode from ray.util.annotations import DeveloperAPI -from typing import Any, Dict, List, Union, Tuple, Optional - @DeveloperAPI class ClassNode(DAGNode): diff --git a/python/ray/dag/collective_node.py b/python/ray/dag/collective_node.py index ad55b8c1a08c..03609b20cc2e 100644 --- a/python/ray/dag/collective_node.py +++ b/python/ray/dag/collective_node.py @@ -1,21 +1,21 @@ -from typing import Any, Dict, List, Union, Tuple, Optional, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union if TYPE_CHECKING: import 
torch import ray from ray.dag import ( - DAGNode, ClassMethodNode, + DAGNode, ) from ray.dag.constants import COLLECTIVE_OPERATION_KEY, IS_CLASS_METHOD_OUTPUT_KEY from ray.experimental.channel import ChannelContext from ray.experimental.channel.torch_tensor_type import Communicator, TorchTensorType from ray.experimental.util.types import ( - _CollectiveOp, AllGatherOp, AllReduceOp, ReduceScatterOp, + _CollectiveOp, ) from ray.util.annotations import DeveloperAPI diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index ce59c2c244da..2204cb40ab7b 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -1,89 +1,84 @@ -import weakref import asyncio +import logging +import threading +import time +import traceback +import uuid +import weakref from collections import defaultdict from contextlib import nullcontext -from dataclasses import dataclass, asdict +from dataclasses import asdict, dataclass from typing import ( Any, Dict, List, - Tuple, - Union, Optional, Set, + Tuple, + Union, ) -import logging -import threading -import time -import uuid -import traceback -from ray.experimental.channel.auto_transport_type import ( - AutoTransportType, - TypeHintResolver, -) +import ray import ray.exceptions -from ray.dag.dag_operation_future import GPUFuture, DAGOperationFuture, ResolvedFuture -from ray.experimental.channel.cached_channel import CachedChannel -from ray.experimental.channel.communicator import Communicator from ray.dag.constants import ( RAY_CGRAPH_ENABLE_NVTX_PROFILING, RAY_CGRAPH_ENABLE_TORCH_PROFILING, RAY_CGRAPH_VISUALIZE_SCHEDULE, ) -import ray +from ray.dag.dag_node_operation import ( + _build_dag_node_operation_graph, + _DAGNodeOperation, + _DAGNodeOperationType, + _DAGOperationGraphNode, + _extract_execution_schedule, + _generate_actor_to_execution_schedule, + _generate_overlapped_execution_schedule, + _visualize_execution_schedule, +) +from ray.dag.dag_operation_future import 
DAGOperationFuture, GPUFuture, ResolvedFuture from ray.exceptions import ( RayCgraphCapacityExceeded, - RayTaskError, RayChannelError, RayChannelTimeoutError, -) -from ray.experimental.compiled_dag_ref import ( - CompiledDAGRef, - CompiledDAGFuture, - _process_return_vals, + RayTaskError, ) from ray.experimental.channel import ( + AwaitableBackgroundReader, + AwaitableBackgroundWriter, ChannelContext, ChannelInterface, ChannelOutputType, - ReaderInterface, - SynchronousReader, - WriterInterface, - SynchronousWriter, - AwaitableBackgroundReader, - AwaitableBackgroundWriter, CompiledDAGArgs, CompositeChannel, IntraProcessChannel, + ReaderInterface, + SynchronousReader, + SynchronousWriter, + WriterInterface, ) -from ray.util.annotations import DeveloperAPI - +from ray.experimental.channel.accelerator_context import AcceleratorContext +from ray.experimental.channel.auto_transport_type import ( + AutoTransportType, + TypeHintResolver, +) +from ray.experimental.channel.cached_channel import CachedChannel +from ray.experimental.channel.communicator import Communicator from ray.experimental.channel.shared_memory_channel import ( SharedMemoryType, ) -from ray.experimental.channel.torch_tensor_type import TorchTensorType - from ray.experimental.channel.torch_tensor_accelerator_channel import ( - _init_communicator, _destroy_communicator, + _init_communicator, ) - -from ray.dag.dag_node_operation import ( - _DAGNodeOperation, - _DAGNodeOperationType, - _DAGOperationGraphNode, - _build_dag_node_operation_graph, - _extract_execution_schedule, - _generate_actor_to_execution_schedule, - _generate_overlapped_execution_schedule, - _visualize_execution_schedule, +from ray.experimental.channel.torch_tensor_type import TorchTensorType +from ray.experimental.compiled_dag_ref import ( + CompiledDAGFuture, + CompiledDAGRef, + _process_return_vals, ) - +from ray.util.annotations import DeveloperAPI from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy -from 
ray.experimental.channel.accelerator_context import AcceleratorContext - logger = logging.getLogger(__name__) # Keep tracking of every compiled dag created during the lifetime of @@ -370,6 +365,7 @@ def _device_context_manager(): return nullcontext() import torch + from ray.experimental.channel.accelerator_context import AcceleratorContext device = ChannelContext.get_current().torch_device @@ -1091,9 +1087,9 @@ def _preprocess(self) -> None: This function is idempotent. """ from ray.dag import ( - DAGNode, ClassMethodNode, CollectiveOutputNode, + DAGNode, FunctionNode, InputAttributeNode, InputNode, @@ -1491,8 +1487,8 @@ def _check_leaf_nodes(self) -> None: Check if there are leaf nodes in the DAG and raise an error if there are. """ from ray.dag import ( - DAGNode, ClassMethodNode, + DAGNode, ) leaf_nodes: List[DAGNode] = [] @@ -1565,11 +1561,11 @@ def _get_or_compile( outputs for the DAG. """ from ray.dag import ( + ClassMethodNode, DAGNode, - InputNode, InputAttributeNode, + InputNode, MultiOutputNode, - ClassMethodNode, ) if self.input_task_idx is None: @@ -2789,11 +2785,11 @@ def _visualize_ascii(self) -> str: """ from ray.dag import ( + ClassMethodNode, + DAGNode, InputAttributeNode, InputNode, MultiOutputNode, - ClassMethodNode, - DAGNode, ) # Check that the DAG has been compiled @@ -3097,11 +3093,11 @@ def visualize( "You can install it by running `pip install graphviz`." 
) from ray.dag import ( + ClassMethodNode, + DAGNode, InputAttributeNode, InputNode, MultiOutputNode, - ClassMethodNode, - DAGNode, ) # Check that the DAG has been compiled diff --git a/python/ray/dag/conftest.py b/python/ray/dag/conftest.py index a350eb5be2d7..a6a1a22b89a8 100644 --- a/python/ray/dag/conftest.py +++ b/python/ray/dag/conftest.py @@ -1,4 +1,5 @@ import os + import pytest import ray diff --git a/python/ray/dag/context.py b/python/ray/dag/context.py index 37e29521603c..89fb981eb019 100644 --- a/python/ray/dag/context.py +++ b/python/ray/dag/context.py @@ -1,7 +1,8 @@ -from dataclasses import dataclass import os import threading +from dataclasses import dataclass from typing import Optional + from ray.util.annotations import DeveloperAPI # The context singleton on this process. diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 8c43a7bf5f22..c2d0f0579df5 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -1,31 +1,29 @@ +import asyncio import copy -from ray.experimental.channel.auto_transport_type import AutoTransportType -from ray.experimental.channel.torch_tensor_type import TorchTensorType -import ray -from ray.dag.base import DAGNodeBase -from ray.dag.py_obj_scanner import _PyObjScanner -from ray.util.annotations import DeveloperAPI - +import uuid from itertools import chain - from typing import ( - Optional, - Union, - List, - Tuple, - Dict, Any, - TypeVar, Callable, + Dict, + List, Literal, + Optional, + Tuple, + TypeVar, + Union, ) -import uuid -import asyncio +import ray +from ray.dag.base import DAGNodeBase from ray.dag.compiled_dag_node import build_compiled_dag_from_ray_dag +from ray.dag.py_obj_scanner import _PyObjScanner from ray.experimental.channel import ChannelOutputType +from ray.experimental.channel.auto_transport_type import AutoTransportType from ray.experimental.channel.communicator import Communicator +from ray.experimental.channel.torch_tensor_type import TorchTensorType from 
ray.experimental.util.types import Device +from ray.util.annotations import DeveloperAPI T = TypeVar("T") diff --git a/python/ray/dag/dag_node_operation.py b/python/ray/dag/dag_node_operation.py index 52072eec12e9..5a192e9f5da2 100644 --- a/python/ray/dag/dag_node_operation.py +++ b/python/ray/dag/dag_node_operation.py @@ -1,12 +1,12 @@ -from functools import total_ordering -from enum import Enum -from typing import Set, Tuple, List, Dict, Optional import copy -import logging -import ray import heapq +import logging from collections import defaultdict +from enum import Enum +from functools import total_ordering +from typing import Dict, List, Optional, Set, Tuple +import ray logger = logging.getLogger(__name__) diff --git a/python/ray/dag/dag_operation_future.py b/python/ray/dag/dag_operation_future.py index acfc83d7c1d6..392c86286a99 100644 --- a/python/ray/dag/dag_operation_future.py +++ b/python/ray/dag/dag_operation_future.py @@ -1,8 +1,8 @@ from abc import ABC, abstractmethod -from typing import Any, Generic, TypeVar, Dict -from ray.util.annotations import DeveloperAPI -from ray.experimental.channel.accelerator_context import AcceleratorContext +from typing import Any, Dict, Generic, TypeVar +from ray.experimental.channel.accelerator_context import AcceleratorContext +from ray.util.annotations import DeveloperAPI T = TypeVar("T") diff --git a/python/ray/dag/function_node.py b/python/ray/dag/function_node.py index 4565fcffe8ff..b48c63509f2c 100644 --- a/python/ray/dag/function_node.py +++ b/python/ray/dag/function_node.py @@ -1,6 +1,5 @@ from typing import Any, Dict, List - import ray from ray.dag.dag_node import DAGNode from ray.dag.format_utils import get_dag_node_str diff --git a/python/ray/dag/input_node.py b/python/ray/dag/input_node.py index 83f212e4e58f..0386f84cb999 100644 --- a/python/ray/dag/input_node.py +++ b/python/ray/dag/input_node.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Union, Optional +from typing import Any, Dict, List, 
Optional, Union from ray.dag import DAGNode from ray.dag.format_utils import get_dag_node_str diff --git a/python/ray/dag/output_node.py b/python/ray/dag/output_node.py index f9abdf1643e0..fc0ec1a10026 100644 --- a/python/ray/dag/output_node.py +++ b/python/ray/dag/output_node.py @@ -1,6 +1,6 @@ -import ray -from typing import Any, Dict, List, Union, Tuple +from typing import Any, Dict, List, Tuple, Union +import ray from ray.dag import DAGNode from ray.dag.format_utils import get_dag_node_str from ray.util.annotations import DeveloperAPI diff --git a/python/ray/dag/py_obj_scanner.py b/python/ray/dag/py_obj_scanner.py index 6bd6b94ab535..d86b982c10e1 100644 --- a/python/ray/dag/py_obj_scanner.py +++ b/python/ray/dag/py_obj_scanner.py @@ -1,12 +1,10 @@ import io -from typing import Any, Dict, Generic, List, Tuple, Type, TypeVar, Union - import pickle # noqa: F401 +from typing import Any, Dict, Generic, List, Tuple, Type, TypeVar, Union import ray from ray.dag.base import DAGNodeBase - # Used in deserialization hooks to reference scanner instances. 
_instances: Dict[int, "_PyObjScanner"] = {} diff --git a/python/ray/dag/tests/experimental/actor_defs.py b/python/ray/dag/tests/experimental/actor_defs.py index 55603ef64268..a0446746bc78 100644 --- a/python/ray/dag/tests/experimental/actor_defs.py +++ b/python/ray/dag/tests/experimental/actor_defs.py @@ -1,7 +1,8 @@ -import ray import os -import time import random +import time + +import ray @ray.remote diff --git a/python/ray/dag/tests/experimental/test_collective_dag.py b/python/ray/dag/tests/experimental/test_collective_dag.py index cc14a3b36dfb..9c426791ec14 100644 --- a/python/ray/dag/tests/experimental/test_collective_dag.py +++ b/python/ray/dag/tests/experimental/test_collective_dag.py @@ -2,7 +2,8 @@ import logging import os import sys -from typing import Callable, List, Optional, Tuple, TYPE_CHECKING +from typing import TYPE_CHECKING, Callable, List, Optional, Tuple + import pytest import ray diff --git a/python/ray/dag/tests/experimental/test_compiled_graphs.py b/python/ray/dag/tests/experimental/test_compiled_graphs.py index 4195eb14a481..85b2a5083920 100644 --- a/python/ray/dag/tests/experimental/test_compiled_graphs.py +++ b/python/ray/dag/tests/experimental/test_compiled_graphs.py @@ -5,25 +5,22 @@ import re import sys import time -import numpy as np -import torch +import numpy as np import pytest +import torch - -from ray._private.test_utils import run_string_as_driver -from ray.exceptions import RayChannelTimeoutError import ray import ray._private import ray.cluster_utils -from ray.dag import DAGContext, InputNode, MultiOutputNode -from ray.tests.conftest import * # noqa from ray._common.utils import ( get_or_create_event_loop, ) - +from ray._private.test_utils import run_string_as_driver +from ray.dag import DAGContext, InputNode, MultiOutputNode from ray.dag.tests.experimental.actor_defs import Actor, Collector - +from ray.exceptions import RayChannelTimeoutError +from ray.tests.conftest import * # noqa logger = logging.getLogger(__name__) diff 
--git a/python/ray/dag/tests/experimental/test_cpu_communicator_dag.py b/python/ray/dag/tests/experimental/test_cpu_communicator_dag.py index 0ec0a2ebd9e3..64a375985069 100644 --- a/python/ray/dag/tests/experimental/test_cpu_communicator_dag.py +++ b/python/ray/dag/tests/experimental/test_cpu_communicator_dag.py @@ -1,16 +1,16 @@ import os import sys -import torch import pytest +import torch import ray import ray.cluster_utils -from ray.exceptions import RayChannelError, RayTaskError -from ray.experimental.channel.cpu_communicator import CPUCommunicator -from ray.dag import InputNode import ray.experimental.collective as collective +from ray.dag import InputNode from ray.dag.output_node import MultiOutputNode +from ray.exceptions import RayChannelError, RayTaskError +from ray.experimental.channel.cpu_communicator import CPUCommunicator from ray.tests.conftest import * # noqa diff --git a/python/ray/dag/tests/experimental/test_dag_error_handling.py b/python/ray/dag/tests/experimental/test_dag_error_handling.py index bb129a444ca3..e0753e2b4e22 100644 --- a/python/ray/dag/tests/experimental/test_dag_error_handling.py +++ b/python/ray/dag/tests/experimental/test_dag_error_handling.py @@ -3,18 +3,16 @@ import logging import pickle import re +import signal import sys import time import pytest - -from ray.exceptions import ActorDiedError, RayChannelError, RayChannelTimeoutError import ray import ray._private import ray.cluster_utils -from ray.dag import DAGContext, InputNode, MultiOutputNode -from ray.tests.conftest import * # noqa +from ray._common.test_utils import SignalActor from ray._common.utils import ( get_or_create_event_loop, ) @@ -22,10 +20,10 @@ run_string_as_driver_nonblocking, wait_for_pid_to_exit, ) -from ray._common.test_utils import SignalActor -import signal - +from ray.dag import DAGContext, InputNode, MultiOutputNode from ray.dag.tests.experimental.actor_defs import Actor +from ray.exceptions import ActorDiedError, RayChannelError, 
RayChannelTimeoutError +from ray.tests.conftest import * # noqa logger = logging.getLogger(__name__) diff --git a/python/ray/dag/tests/experimental/test_dag_visualization.py b/python/ray/dag/tests/experimental/test_dag_visualization.py index 4278df31a196..c2908ef63f1e 100644 --- a/python/ray/dag/tests/experimental/test_dag_visualization.py +++ b/python/ray/dag/tests/experimental/test_dag_visualization.py @@ -1,12 +1,13 @@ +import os import sys -import ray + import pydot -import os +import pytest + +import ray from ray.dag import InputNode, MultiOutputNode from ray.tests.conftest import * # noqa -import pytest - @pytest.fixture def cleanup_files(): diff --git a/python/ray/dag/tests/experimental/test_execution_schedule.py b/python/ray/dag/tests/experimental/test_execution_schedule.py index 46bd714d7f47..2c6e2a025dae 100644 --- a/python/ray/dag/tests/experimental/test_execution_schedule.py +++ b/python/ray/dag/tests/experimental/test_execution_schedule.py @@ -1,24 +1,24 @@ # coding: utf-8 import os import sys +from typing import Dict, List, Tuple import pytest -from ray.tests.conftest import * # noqa -from ray.dag import InputNode, MultiOutputNode, ClassMethodNode +from ray.actor import ActorHandle +from ray.dag import ClassMethodNode, InputNode, MultiOutputNode +from ray.dag.compiled_dag_node import CompiledTask from ray.dag.dag_node_operation import ( + _add_edge, + _build_dag_node_operation_graph, + _DAGNodeOperation, _DAGNodeOperationType, _DAGOperationGraphNode, - _DAGNodeOperation, _extract_execution_schedule, - _select_next_nodes, - _build_dag_node_operation_graph, - _add_edge, _generate_actor_to_execution_schedule, + _select_next_nodes, ) -from ray.dag.compiled_dag_node import CompiledTask -from typing import List, Dict, Tuple -from ray.actor import ActorHandle +from ray.tests.conftest import * # noqa if sys.platform != "linux" and sys.platform != "darwin": pytest.skip("Skipping, requires Linux or Mac.", allow_module_level=True) diff --git 
a/python/ray/dag/tests/experimental/test_execution_schedule_gpu.py b/python/ray/dag/tests/experimental/test_execution_schedule_gpu.py index 8bd3a9dbf751..639db895cff3 100644 --- a/python/ray/dag/tests/experimental/test_execution_schedule_gpu.py +++ b/python/ray/dag/tests/experimental/test_execution_schedule_gpu.py @@ -1,17 +1,17 @@ # coding: utf-8 import os import sys +from typing import Optional import pytest +import torch import ray import ray.cluster_utils -from ray.tests.conftest import * # noqa from ray.dag import InputNode, MultiOutputNode -from ray.dag.dag_node_operation import _DAGNodeOperationType -import torch -from typing import Optional from ray.dag.compiled_dag_node import CompiledDAG +from ray.dag.dag_node_operation import _DAGNodeOperationType +from ray.tests.conftest import * # noqa if sys.platform != "linux" and sys.platform != "darwin": pytest.skip("Skipping, requires Linux or Mac.", allow_module_level=True) diff --git a/python/ray/dag/tests/experimental/test_mocked_nccl_dag.py b/python/ray/dag/tests/experimental/test_mocked_nccl_dag.py index ad10e6d53c5f..0a5dae633792 100644 --- a/python/ray/dag/tests/experimental/test_mocked_nccl_dag.py +++ b/python/ray/dag/tests/experimental/test_mocked_nccl_dag.py @@ -2,20 +2,19 @@ import os import sys -from ray._common.test_utils import wait_for_condition -import torch - import pytest +import torch import ray import ray.cluster_utils +from ray._common.test_utils import wait_for_condition +from ray.dag import InputNode from ray.exceptions import RayChannelError, RayTaskError from ray.experimental.channel.conftest import ( Barrier, start_nccl_mock, ) from ray.tests.conftest import * # noqa -from ray.dag import InputNode def error_logged(capsys, msg): diff --git a/python/ray/dag/tests/experimental/test_multi_args_gpu.py b/python/ray/dag/tests/experimental/test_multi_args_gpu.py index 9a746b8b7f03..d0d88432c099 100644 --- a/python/ray/dag/tests/experimental/test_multi_args_gpu.py +++ 
b/python/ray/dag/tests/experimental/test_multi_args_gpu.py @@ -3,12 +3,12 @@ import sys import pytest +import torch import ray -from ray.dag import InputNode, MultiOutputNode import ray.cluster_utils +from ray.dag import InputNode, MultiOutputNode from ray.tests.conftest import * # noqa -import torch if sys.platform != "linux" and sys.platform != "darwin": pytest.skip("Skipping, requires Linux or Mac.", allow_module_level=True) diff --git a/python/ray/dag/tests/experimental/test_multi_node_dag.py b/python/ray/dag/tests/experimental/test_multi_node_dag.py index 92c38df66cb5..301f187115c9 100644 --- a/python/ray/dag/tests/experimental/test_multi_node_dag.py +++ b/python/ray/dag/tests/experimental/test_multi_node_dag.py @@ -1,14 +1,16 @@ -import random -import ray import os +import random import sys import time + import pytest + +import ray +import ray.remote_function from ray._common.test_utils import wait_for_condition from ray.dag import InputNode, MultiOutputNode -import ray.remote_function -from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy from ray.tests.conftest import * # noqa +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy if sys.platform != "linux" and sys.platform != "darwin": pytest.skip("Skipping, requires Linux or Mac.", allow_module_level=True) diff --git a/python/ray/dag/tests/experimental/test_torch_tensor_dag.py b/python/ray/dag/tests/experimental/test_torch_tensor_dag.py index 24c4d2524ff8..84ceb2d17f43 100644 --- a/python/ray/dag/tests/experimental/test_torch_tensor_dag.py +++ b/python/ray/dag/tests/experimental/test_torch_tensor_dag.py @@ -3,31 +3,31 @@ import os import socket import sys +import time from typing import List, Optional, Tuple import pytest +import torch + import ray import ray.cluster_utils import ray.experimental.collective as collective -import torch -import time +from ray._private.test_utils import ( + get_log_message, + init_log_pubsub, +) from ray.dag import InputNode -from 
ray.exceptions import RayChannelError, RayTaskError from ray.dag.output_node import MultiOutputNode +from ray.exceptions import RayChannelError, RayTaskError +from ray.experimental.channel.accelerator_context import AcceleratorContext from ray.experimental.channel.communicator import ( Communicator, TorchTensorAllocator, ) -from ray.experimental.channel.torch_tensor_type import TorchTensorType from ray.experimental.channel.nccl_group import _NcclGroup -from ray._private.test_utils import ( - get_log_message, - init_log_pubsub, -) - -from ray.tests.conftest import * # noqa +from ray.experimental.channel.torch_tensor_type import TorchTensorType from ray.experimental.util.types import ReduceOp -from ray.experimental.channel.accelerator_context import AcceleratorContext +from ray.tests.conftest import * # noqa logger = logging.getLogger(__name__) diff --git a/python/ray/dag/tests/experimental/test_torch_tensor_transport.py b/python/ray/dag/tests/experimental/test_torch_tensor_transport.py index 5ec1f2526e75..84722ef2f7db 100644 --- a/python/ray/dag/tests/experimental/test_torch_tensor_transport.py +++ b/python/ray/dag/tests/experimental/test_torch_tensor_transport.py @@ -1,13 +1,14 @@ -import ray import os import sys -import torch -import pytest from typing import Dict + +import pytest +import torch + +import ray from ray.dag import InputNode -from ray.exceptions import RayTaskError +from ray.exceptions import RaySystemError, RayTaskError from ray.tests.conftest import * # noqa -from ray.exceptions import RaySystemError if sys.platform != "linux" and sys.platform != "darwin": pytest.skip("Skipping, requires Linux or Mac.", allow_module_level=True) diff --git a/python/ray/dag/tests/test_input_node.py b/python/ray/dag/tests/test_input_node.py index 6874ff21cfd1..e5b54e6c60fd 100644 --- a/python/ray/dag/tests/test_input_node.py +++ b/python/ray/dag/tests/test_input_node.py @@ -3,12 +3,13 @@ request, for all DAGNode types. 
""" -import pytest -from ray.dag.dag_node import DAGNode -from ray.dag.input_node import InputNode from typing import Any, TypeVar +import pytest + import ray +from ray.dag.dag_node import DAGNode +from ray.dag.input_node import InputNode RayHandleLike = TypeVar("RayHandleLike") diff --git a/python/ray/dag/tests/test_output_node.py b/python/ray/dag/tests/test_output_node.py index 95890b5b6b30..795e736cdfa7 100644 --- a/python/ray/dag/tests/test_output_node.py +++ b/python/ray/dag/tests/test_output_node.py @@ -1,10 +1,10 @@ import pytest import ray +from ray._common.test_utils import wait_for_condition from ray.dag.input_node import InputNode from ray.dag.output_node import MultiOutputNode from ray.util.state import list_tasks -from ray._common.test_utils import wait_for_condition def test_output_node(shared_ray_instance): diff --git a/python/ray/dag/tests/test_plot.py b/python/ray/dag/tests/test_plot.py index d6e00f14b3ef..d3d1244e3ecf 100644 --- a/python/ray/dag/tests/test_plot.py +++ b/python/ray/dag/tests/test_plot.py @@ -1,8 +1,9 @@ import os -import pytest import sys import tempfile +import pytest + import ray diff --git a/python/ray/dag/tests/test_py_obj_scanner.py b/python/ray/dag/tests/test_py_obj_scanner.py index c07fdd499e38..104e6dc94d8f 100644 --- a/python/ray/dag/tests/test_py_obj_scanner.py +++ b/python/ray/dag/tests/test_py_obj_scanner.py @@ -1,7 +1,8 @@ -import pytest from typing import Any -from ray.dag.py_obj_scanner import _PyObjScanner, _instances +import pytest + +from ray.dag.py_obj_scanner import _instances, _PyObjScanner class Source: diff --git a/python/ray/dag/utils.py b/python/ray/dag/utils.py index ce96b3c27a8a..2fe1f3adf3d7 100644 --- a/python/ray/dag/utils.py +++ b/python/ray/dag/utils.py @@ -1,12 +1,12 @@ from typing import Dict from ray.dag import ( + ClassMethodNode, + ClassNode, DAGNode, - InputNode, - InputAttributeNode, FunctionNode, - ClassNode, - ClassMethodNode, + InputAttributeNode, + InputNode, MultiOutputNode, ) diff --git 
a/python/ray/dag/vis_utils.py b/python/ray/dag/vis_utils.py index c5a3b5cbc096..1274a53cc20d 100644 --- a/python/ray/dag/vis_utils.py +++ b/python/ray/dag/vis_utils.py @@ -1,8 +1,7 @@ -from ray.dag import DAGNode - import os import tempfile +from ray.dag import DAGNode from ray.dag.utils import _DAGNodeNameGenerator from ray.util.annotations import DeveloperAPI diff --git a/python/ray/dashboard/BUILD b/python/ray/dashboard/BUILD.bazel similarity index 94% rename from python/ray/dashboard/BUILD rename to python/ray/dashboard/BUILD.bazel index a443e8f6d8dc..f0e5fc640fb5 100644 --- a/python/ray/dashboard/BUILD +++ b/python/ray/dashboard/BUILD.bazel @@ -40,6 +40,7 @@ py_test_run_all_subdirectory( "tests/test_dashboard.py", "tests/test_state_head.py", "modules/serve/tests/**/*.py", + "modules/job/tests/test_job_manager.py", ], extra_srcs = [], tags = [ @@ -60,6 +61,17 @@ py_test( deps = [":conftest"], ) +py_test( + name = "test_job_manager", + size = "large", + srcs = ["modules/job/tests/test_job_manager.py"], + tags = [ + "exclusive", + "team:core", + ], + deps = [":conftest"], +) + py_test( name = "test_http_job_server", size = "large", diff --git a/python/ray/dashboard/agent.py b/python/ray/dashboard/agent.py index 9302f020e898..6b95ad4d1444 100644 --- a/python/ray/dashboard/agent.py +++ b/python/ray/dashboard/agent.py @@ -9,9 +9,9 @@ import ray._private.ray_constants as ray_constants import ray.dashboard.consts as dashboard_consts import ray.dashboard.utils as dashboard_utils +from ray._common.network_utils import build_address, is_localhost from ray._common.utils import get_or_create_event_loop from ray._private import logging_utils -from ray._common.network_utils import build_address from ray._private.process_watcher import create_check_raylet_task from ray._private.ray_constants import AGENT_GRPC_MAX_MESSAGE_LENGTH from ray._private.ray_logging import setup_component_logger @@ -24,12 +24,13 @@ class DashboardAgent: def __init__( self, node_ip_address, - 
dashboard_agent_port, + grpc_port, gcs_address, cluster_id_hex, minimal, metrics_export_port=None, node_manager_port=None, + events_export_addr=None, listen_port=ray_constants.DEFAULT_DASHBOARD_AGENT_LISTEN_PORT, disable_metrics_collection: bool = False, *, # the following are required kwargs @@ -53,9 +54,10 @@ def __init__( self.temp_dir = temp_dir self.session_dir = session_dir self.log_dir = log_dir - self.dashboard_agent_port = dashboard_agent_port + self.grpc_port = grpc_port self.metrics_export_port = metrics_export_port self.node_manager_port = node_manager_port + self.events_export_addr = events_export_addr self.listen_port = listen_port self.object_store_name = object_store_name self.raylet_name = raylet_name @@ -109,11 +111,10 @@ def _init_non_minimal(self): ), ) # noqa ) - grpc_ip = "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0" try: - self.grpc_port = add_port_to_grpc_server( - self.server, build_address(grpc_ip, self.dashboard_agent_port) - ) + add_port_to_grpc_server(self.server, build_address(self.ip, self.grpc_port)) + if not is_localhost(self.ip): + add_port_to_grpc_server(self.server, f"127.0.0.1:{self.grpc_port}") except Exception: # TODO(SongGuyang): Catch the exception here because there is # port conflict issue which brought from static port. 
We should @@ -127,7 +128,7 @@ def _init_non_minimal(self): else: logger.info( "Dashboard agent grpc address: %s", - build_address(grpc_ip, self.grpc_port), + build_address(self.ip, self.grpc_port), ) # If the agent is not minimal it should start the http server @@ -261,7 +262,7 @@ async def wait_forever(): help="The port to expose metrics through Prometheus.", ) parser.add_argument( - "--dashboard-agent-port", + "--grpc-port", required=True, type=int, help="The port on which the dashboard agent will receive GRPCs.", @@ -371,7 +372,7 @@ async def wait_forever(): required=False, type=str, default=None, - help="The session name (cluster id) of this cluster.", + help="The current Ray session name.", ) parser.add_argument( "--stdout-filepath", @@ -422,7 +423,7 @@ async def wait_forever(): agent = DashboardAgent( args.node_ip_address, - args.dashboard_agent_port, + args.grpc_port, args.gcs_address, args.cluster_id_hex, args.minimal, diff --git a/python/ray/dashboard/client/src/common/ProfilingLink.tsx b/python/ray/dashboard/client/src/common/ProfilingLink.tsx index 5c44c4547a61..5639bf827556 100644 --- a/python/ray/dashboard/client/src/common/ProfilingLink.tsx +++ b/python/ray/dashboard/client/src/common/ProfilingLink.tsx @@ -20,7 +20,7 @@ import { ClassNameProps } from "./props"; type CpuProfilingLinkProps = PropsWithChildren< { pid: string | number | null | undefined; - ip: string | null | undefined; + nodeId: string | null | undefined; type: string | null; } & ClassNameProps >; @@ -34,7 +34,7 @@ type TaskProfilingStackTraceProps = { type MemoryProfilingProps = PropsWithChildren< { pid: string | number | null | undefined; - ip: string | null | undefined; + nodeId: string | null | undefined; type?: string | null; } & ClassNameProps >; @@ -92,15 +92,20 @@ export const TaskCpuStackTraceLink = ({ export const CpuStackTraceLink = ({ pid, - ip, + nodeId, type = "", }: CpuProfilingLinkProps) => { - if (!pid || !ip || typeof pid === "undefined" || typeof ip === "undefined") { 
+ if ( + !pid || + !nodeId || + typeof pid === "undefined" || + typeof nodeId === "undefined" + ) { return
; } return ( { - if (!pid || !ip) { + if (!pid || !nodeId) { return
; } return ( { - if (!pid || !ip) { + if (!pid || !nodeId) { return
; } - const profilerUrl = `memory_profile?pid=${pid}&ip=${ip}`; + const profilerUrl = `memory_profile?pid=${pid}&node_id=${nodeId}`; return ; }; diff --git a/python/ray/dashboard/client/src/components/ActorTable.component.test.tsx b/python/ray/dashboard/client/src/components/ActorTable.component.test.tsx index cebe9825ed7a..ec58ad0f568c 100644 --- a/python/ray/dashboard/client/src/components/ActorTable.component.test.tsx +++ b/python/ray/dashboard/client/src/components/ActorTable.component.test.tsx @@ -10,7 +10,7 @@ const MOCK_ACTORS: { [actorId: string]: ActorDetail } = { actorId: "ACTOR_1", jobId: "01000000", address: { - rayletId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", + nodeId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", ipAddress: "172.31.11.178", port: 10003, workerId: "b8b276a03612644098ed7a929c3b0e50f5bde894eb0d8cab288fbb6d", @@ -61,7 +61,7 @@ const MOCK_ACTORS: { [actorId: string]: ActorDetail } = { actorId: "ACTOR_2", jobId: "01000000", address: { - rayletId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", + nodeId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", ipAddress: "172.31.11.178", port: 10003, workerId: "b8b276a03612644098ed7a929c3b0e50f5bde894eb0d8cab288fbb6d", @@ -120,7 +120,7 @@ describe("ActorTable", () => { ACTOR_2: { ...MOCK_ACTORS.ACTOR_2, address: { - rayletId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e2", + nodeId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e2", ipAddress: "172.31.11.178", port: 10003, workerId: "b8b276a03612644098ed7a929c3b0e50f5bde894eb0d8cab288fbb6e", diff --git a/python/ray/dashboard/client/src/components/ActorTable.tsx b/python/ray/dashboard/client/src/components/ActorTable.tsx index 5c8ebebfa8ed..bff6b40352ad 100644 --- a/python/ray/dashboard/client/src/components/ActorTable.tsx +++ b/python/ray/dashboard/client/src/components/ActorTable.tsx @@ -385,10 +385,10 @@ const ActorTable = ({ data-testid="nodeIdFilter" style={{ margin: 
8, width: 150 }} options={Array.from( - new Set(Object.values(actors).map((e) => e.address?.rayletId)), + new Set(Object.values(actors).map((e) => e.address?.nodeId)), )} onInputChange={(_: any, value: string) => { - changeFilter("address.rayletId", value.trim()); + changeFilter("address.nodeId", value.trim()); }} renderInput={(params: TextFieldProps) => ( @@ -655,19 +655,19 @@ const ActorTable = ({


@@ -684,14 +684,14 @@ const ActorTable = ({ {address?.ipAddress ? address?.ipAddress : "-"} - {address?.rayletId ? ( - + {address?.nodeId ? ( + - {address?.rayletId} + {address?.nodeId} diff --git a/python/ray/dashboard/client/src/pages/actor/ActorDetail.tsx b/python/ray/dashboard/client/src/pages/actor/ActorDetail.tsx index 2e42384d1151..6cf3096272e0 100644 --- a/python/ray/dashboard/client/src/pages/actor/ActorDetail.tsx +++ b/python/ray/dashboard/client/src/pages/actor/ActorDetail.tsx @@ -118,12 +118,12 @@ const ActorDetailPage = () => { }, { label: "Node ID", - content: actorDetail.address?.rayletId + content: actorDetail.address?.nodeId ? { - value: actorDetail.address?.rayletId, - copyableValue: actorDetail.address?.rayletId, - link: actorDetail.address.rayletId - ? generateNodeLink(actorDetail.address.rayletId) + value: actorDetail.address?.nodeId, + copyableValue: actorDetail.address?.nodeId, + link: actorDetail.address.nodeId + ? generateNodeLink(actorDetail.address.nodeId) : undefined, } : { value: "-" }, @@ -191,19 +191,19 @@ const ActorDetailPage = () => {


diff --git a/python/ray/dashboard/client/src/pages/actor/ActorLogs.tsx b/python/ray/dashboard/client/src/pages/actor/ActorLogs.tsx index 5191001d955f..23d9fe2ae438 100644 --- a/python/ray/dashboard/client/src/pages/actor/ActorLogs.tsx +++ b/python/ray/dashboard/client/src/pages/actor/ActorLogs.tsx @@ -13,7 +13,7 @@ export const ActorLogs = ({ actor: { actorId, pid, - address: { workerId, rayletId }, + address: { workerId, nodeId }, }, }: ActorLogsProps) => { const tabs: MultiTabLogViewerTabDetails[] = [ @@ -29,7 +29,7 @@ export const ActorLogs = ({ }, { title: "system", - nodeId: rayletId, + nodeId: nodeId, // TODO(aguo): Have API return the log file name. filename: `python-core-worker-${workerId}_${pid}.log`, }, diff --git a/python/ray/dashboard/client/src/pages/actor/hook/mockedUseActorList.ts b/python/ray/dashboard/client/src/pages/actor/hook/mockedUseActorList.ts index d4bb6061bd7d..384387228c4b 100644 --- a/python/ray/dashboard/client/src/pages/actor/hook/mockedUseActorList.ts +++ b/python/ray/dashboard/client/src/pages/actor/hook/mockedUseActorList.ts @@ -5,7 +5,7 @@ const MOCK_ACTORS: { [actorId: string]: Actor } = { actorId: "ACTOR_1", jobId: "01000000", address: { - rayletId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", + nodeId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", ipAddress: "172.31.11.178", port: 10003, workerId: "b8b276a03612644098ed7a929c3b0e50f5bde894eb0d8cab288fbb6d", @@ -28,7 +28,7 @@ const MOCK_ACTORS: { [actorId: string]: Actor } = { actorId: "ACTOR_2", jobId: "01000000", address: { - rayletId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", + nodeId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", ipAddress: "172.31.11.178", port: 10003, workerId: "b8b276a03612644098ed7a929c3b0e50f5bde894eb0d8cab288fbb6d", @@ -51,7 +51,7 @@ const MOCK_ACTORS: { [actorId: string]: Actor } = { actorId: "ACTOR_3", jobId: "01000000", address: { - rayletId: 
"426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", + nodeId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", ipAddress: "172.31.11.178", port: 10003, workerId: "b8b276a03612644098ed7a929c3b0e50f5bde894eb0d8cab288fbb6d", @@ -74,7 +74,7 @@ const MOCK_ACTORS: { [actorId: string]: Actor } = { actorId: "ACTOR_4", jobId: "01000000", address: { - rayletId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", + nodeId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", ipAddress: "172.31.11.178", port: 10003, workerId: "b8b276a03612644098ed7a929c3b0e50f5bde894eb0d8cab288fbb6d", @@ -97,7 +97,7 @@ const MOCK_ACTORS: { [actorId: string]: Actor } = { actorId: "ACTOR_5", jobId: "01000000", address: { - rayletId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", + nodeId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", ipAddress: "172.31.11.178", port: 10003, workerId: "b8b276a03612644098ed7a929c3b0e50f5bde894eb0d8cab288fbb6d", diff --git a/python/ray/dashboard/client/src/pages/job/JobDetailInfoPage.tsx b/python/ray/dashboard/client/src/pages/job/JobDetailInfoPage.tsx index 112b19ed5df6..4c0c1562267b 100644 --- a/python/ray/dashboard/client/src/pages/job/JobDetailInfoPage.tsx +++ b/python/ray/dashboard/client/src/pages/job/JobDetailInfoPage.tsx @@ -172,19 +172,19 @@ export const JobMetadataSection = ({ job }: JobMetadataSectionProps) => {


diff --git a/python/ray/dashboard/client/src/pages/job/JobRow.tsx b/python/ray/dashboard/client/src/pages/job/JobRow.tsx index 8440dcc26edf..dfb571fb5ec4 100644 --- a/python/ray/dashboard/client/src/pages/job/JobRow.tsx +++ b/python/ray/dashboard/client/src/pages/job/JobRow.tsx @@ -116,19 +116,19 @@ export const JobRow = ({ job }: JobRowProps) => { )}

diff --git a/python/ray/dashboard/client/src/pages/metrics/Metrics.tsx b/python/ray/dashboard/client/src/pages/metrics/Metrics.tsx index 07e7af851266..f446e79e0605 100644 --- a/python/ray/dashboard/client/src/pages/metrics/Metrics.tsx +++ b/python/ray/dashboard/client/src/pages/metrics/Metrics.tsx @@ -221,10 +221,6 @@ const DATA_METRICS_CONFIG: MetricsSectionConfig[] = [ title: "Bytes Spilled", pathParams: "theme=light&panelId=1", }, - { - title: "Bytes Allocated", - pathParams: "theme=light&panelId=2", - }, { title: "Bytes Freed", pathParams: "theme=light&panelId=3", @@ -329,20 +325,8 @@ const DATA_METRICS_CONFIG: MetricsSectionConfig[] = [ pathParams: "theme=light&panelId=37", }, { - title: "(p50) Task Completion Time", - pathParams: "theme=light&panelId=40", - }, - { - title: "(p75) Task Completion Time", - pathParams: "theme=light&panelId=41", - }, - { - title: "(p99) Task Completion Time", - pathParams: "theme=light&panelId=44", - }, - { - title: "(p100) Task Completion Time", - pathParams: "theme=light&panelId=45", + title: "Task Completion Time", + pathParams: "theme=light&panelId=38", }, ], }, diff --git a/python/ray/dashboard/client/src/pages/node/NodeRow.tsx b/python/ray/dashboard/client/src/pages/node/NodeRow.tsx index d8a135510503..b873884564a0 100644 --- a/python/ray/dashboard/client/src/pages/node/NodeRow.tsx +++ b/python/ray/dashboard/client/src/pages/node/NodeRow.tsx @@ -227,7 +227,6 @@ type WorkerRowProps = { */ export const WorkerRow = ({ node, worker }: WorkerRowProps) => { const { - ip, mem, raylet: { nodeId }, } = node; @@ -278,11 +277,11 @@ export const WorkerRow = ({ node, worker }: WorkerRowProps) => { Log
- +
- +
- + diff --git a/python/ray/dashboard/client/src/pages/serve/ServeSystemActorDetailPage.tsx b/python/ray/dashboard/client/src/pages/serve/ServeSystemActorDetailPage.tsx index c326e2c86cd3..3e7c1ecdab5c 100644 --- a/python/ray/dashboard/client/src/pages/serve/ServeSystemActorDetailPage.tsx +++ b/python/ray/dashboard/client/src/pages/serve/ServeSystemActorDetailPage.tsx @@ -237,14 +237,14 @@ const ServeSystemActorLogs = ({ actor: { actorId, pid, - address: { workerId, rayletId }, + address: { workerId, nodeId }, }, systemLogFilePath, }: ServeSystemActorLogsProps) => { const tabs: MultiTabLogViewerTabDetails[] = [ { title: type === "controller" ? "Controller logs" : "proxy logs", - nodeId: rayletId, + nodeId: nodeId, filename: systemLogFilePath.startsWith("/") ? systemLogFilePath.substring(1) : systemLogFilePath, diff --git a/python/ray/dashboard/client/src/type/actor.ts b/python/ray/dashboard/client/src/type/actor.ts index 52c8527ab94e..b1242ed86c74 100644 --- a/python/ray/dashboard/client/src/type/actor.ts +++ b/python/ray/dashboard/client/src/type/actor.ts @@ -9,7 +9,7 @@ export enum ActorEnum { } export type Address = { - rayletId: string; + nodeId: string; ipAddress: string; port: number; workerId: string; diff --git a/python/ray/dashboard/client/src/type/worker.d.ts b/python/ray/dashboard/client/src/type/worker.d.ts index 8f4d89e685e9..f8822f75b733 100644 --- a/python/ray/dashboard/client/src/type/worker.d.ts +++ b/python/ray/dashboard/client/src/type/worker.d.ts @@ -15,7 +15,6 @@ export type CoreWorkerStats = { numExecutedTasks: number; numPendingTasks: number; workerId: string; - actorTitle: string; jobId: string; numObjectRefsInScope: number; numInPlasma: number; diff --git a/python/ray/dashboard/consts.py b/python/ray/dashboard/consts.py index 1c5fdb9386b2..30505878cb80 100644 --- a/python/ray/dashboard/consts.py +++ b/python/ray/dashboard/consts.py @@ -65,7 +65,9 @@ # Port that dashboard prometheus metrics will be exported to DASHBOARD_METRIC_PORT = 
env_integer("DASHBOARD_METRIC_PORT", 44227) -NODE_TAG_KEYS = ["ip", "Version", "SessionName", "IsHeadNode"] +# We use RayNodeType to mark head/worker nodes. IsHeadNode is retained +# for backward compatibility for user-customized dashboards that might rely on it +NODE_TAG_KEYS = ["ip", "Version", "SessionName", "IsHeadNode", "RayNodeType"] GPU_TAG_KEYS = NODE_TAG_KEYS + ["GpuDeviceName", "GpuIndex"] # TpuDeviceName and TpuIndex are expected to be equal to the number of TPU diff --git a/python/ray/dashboard/dashboard.py b/python/ray/dashboard/dashboard.py index 921a6069c88e..f774e22300d5 100644 --- a/python/ray/dashboard/dashboard.py +++ b/python/ray/dashboard/dashboard.py @@ -9,13 +9,13 @@ import ray import ray._private.ray_constants as ray_constants -from ray._common.ray_constants import ( - LOGGING_ROTATE_BYTES, - LOGGING_ROTATE_BACKUP_COUNT, -) import ray.dashboard.consts as dashboard_consts import ray.dashboard.head as dashboard_head import ray.dashboard.utils as dashboard_utils +from ray._common.ray_constants import ( + LOGGING_ROTATE_BACKUP_COUNT, + LOGGING_ROTATE_BYTES, +) from ray._common.utils import get_or_create_event_loop from ray._private import logging_utils from ray._private.ray_logging import setup_component_logger diff --git a/python/ray/dashboard/head.py b/python/ray/dashboard/head.py index 90469fd5c94d..94a8bb3cf380 100644 --- a/python/ray/dashboard/head.py +++ b/python/ray/dashboard/head.py @@ -9,10 +9,11 @@ import ray.dashboard.consts as dashboard_consts import ray.dashboard.utils as dashboard_utils import ray.experimental.internal_kv as internal_kv +from ray._common.network_utils import build_address +from ray._common.usage.usage_lib import TagKey, record_extra_usage_tag from ray._private import ray_constants from ray._private.async_utils import enable_monitor_loop_lag from ray._private.ray_constants import env_integer -from ray._common.usage.usage_lib import TagKey, record_extra_usage_tag from ray._raylet import GcsClient from 
ray.dashboard.consts import ( AVAILABLE_COMPONENT_NAMES_FOR_METRICS, @@ -24,7 +25,6 @@ DashboardHeadModuleConfig, async_loop_forever, ) -from ray._common.network_utils import build_address import psutil diff --git a/python/ray/dashboard/http_server_agent.py b/python/ray/dashboard/http_server_agent.py index 846df9c565b9..b9146066933f 100644 --- a/python/ray/dashboard/http_server_agent.py +++ b/python/ray/dashboard/http_server_agent.py @@ -6,8 +6,8 @@ from packaging.version import Version import ray.dashboard.optional_utils as dashboard_optional_utils +from ray._common.network_utils import build_address, is_localhost from ray._common.utils import get_or_create_event_loop -from ray._common.network_utils import build_address from ray.dashboard.optional_deps import aiohttp, aiohttp_cors, hdrs logger = logging.getLogger(__name__) @@ -44,10 +44,17 @@ async def _start_site_with_retry( try: site = aiohttp.web.TCPSite( self.runner, - "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0", + self.ip, self.listen_port, ) await site.start() + if not is_localhost(self.ip): + local_site = aiohttp.web.TCPSite( + self.runner, + "127.0.0.1", + self.listen_port, + ) + await local_site.start() if attempt > 0: logger.info( f"Successfully started agent on port {self.listen_port} " diff --git a/python/ray/dashboard/http_server_head.py b/python/ray/dashboard/http_server_head.py index ffac41f4d7d1..49f748309271 100644 --- a/python/ray/dashboard/http_server_head.py +++ b/python/ray/dashboard/http_server_head.py @@ -17,10 +17,9 @@ import ray.dashboard.timezone_utils as timezone_utils import ray.dashboard.utils as dashboard_utils from ray import ray_constants -from ray._common.utils import get_or_create_event_loop -from ray._common.network_utils import build_address +from ray._common.network_utils import build_address, parse_address from ray._common.usage.usage_lib import TagKey, record_extra_usage_tag -from ray._common.network_utils import parse_address +from ray._common.utils import 
get_or_create_event_loop from ray.dashboard.dashboard_metrics import DashboardPrometheusMetrics from ray.dashboard.head import DashboardHeadModule diff --git a/python/ray/dashboard/memory_utils.py b/python/ray/dashboard/memory_utils.py index ef2d51a1de1c..e81532f5af9e 100644 --- a/python/ray/dashboard/memory_utils.py +++ b/python/ray/dashboard/memory_utils.py @@ -430,7 +430,7 @@ def memory_summary( "Type", "Call Site", "Status", - "Attampt", + "Attempt", "Size", "Reference Type", "Object Ref", @@ -444,7 +444,7 @@ def memory_summary( mem += f"Grouping by {group_by}...\ Sorting by {sort_by}...\ - Display {num_entries if num_entries is not None else 'all'}\ + Display {num_entries if num_entries is not None else 'all'} \ entries per group...\n\n\n" for key, group in memory_table["group"].items(): diff --git a/python/ray/dashboard/modules/aggregator/aggregator_agent.py b/python/ray/dashboard/modules/aggregator/aggregator_agent.py index 97ba70bfabe8..c5dbde841470 100644 --- a/python/ray/dashboard/modules/aggregator/aggregator_agent.py +++ b/python/ray/dashboard/modules/aggregator/aggregator_agent.py @@ -1,17 +1,18 @@ import asyncio -import signal -import time -import os import json +import logging +import os import queue -from concurrent.futures import ThreadPoolExecutor +import signal import threading -import logging -from urllib3.util import Retry +import time +from concurrent.futures import ThreadPoolExecutor + from requests import Session from requests.adapters import HTTPAdapter +from urllib3.util import Retry -from google.protobuf.json_format import MessageToJson +from ray._private.protobuf_compat import message_to_json try: import prometheus_client @@ -20,14 +21,14 @@ prometheus_client = None import ray +import ray.dashboard.consts as dashboard_consts +import ray.dashboard.utils as dashboard_utils from ray._common.utils import get_or_create_event_loop from ray._private import ray_constants -import ray.dashboard.utils as dashboard_utils -import ray.dashboard.consts 
as dashboard_consts from ray.core.generated import ( + events_base_event_pb2, events_event_aggregator_service_pb2, events_event_aggregator_service_pb2_grpc, - events_base_event_pb2, ) logger = logging.getLogger(__name__) @@ -66,12 +67,8 @@ REQUEST_BACKOFF_FACTOR = ray_constants.env_float( f"{env_var_prefix}_REQUEST_BACKOFF_FACTOR", 1.0 ) -# Address of the external service to send events -EVENT_SEND_ADDR = os.environ.get( - f"{env_var_prefix}_EVENT_SEND_ADDR", "http://127.0.0.1" -) -# Port of the external service to send events -EVENT_SEND_PORT = ray_constants.env_integer(f"{env_var_prefix}_EVENT_SEND_PORT", 12345) +# Address of the external service to send events with format of "http://<ip>:<port>" +EVENTS_EXPORT_ADDR = os.environ.get(f"{env_var_prefix}_EVENTS_EXPORT_ADDR", "") # Interval to update metrics METRICS_UPDATE_INTERVAL_SECONDS = ray_constants.env_float( f"{env_var_prefix}_METRICS_UPDATE_INTERVAL_SECONDS", 0.1 @@ -79,9 +76,14 @@ # Event filtering configurations # Comma-separated list of event types that are allowed to be exposed to external services # Valid values: TASK_DEFINITION_EVENT, TASK_EXECUTION_EVENT, ACTOR_TASK_DEFINITION_EVENT, ACTOR_TASK_EXECUTION_EVENT -# The list of all supported event types can be found in src/ray/protobuf/events_base_event.proto (EventType enum) +# The list of all supported event types can be found in src/ray/protobuf/public/events_base_event.proto (EventType enum) # By default TASK_PROFILE_EVENT is not exposed to external services -DEFAULT_EXPOSABLE_EVENT_TYPES = "TASK_DEFINITION_EVENT,TASK_EXECUTION_EVENT,ACTOR_TASK_DEFINITION_EVENT,ACTOR_TASK_EXECUTION_EVENT" +DEFAULT_EXPOSABLE_EVENT_TYPES = ( + "TASK_DEFINITION_EVENT,TASK_EXECUTION_EVENT," + "ACTOR_TASK_DEFINITION_EVENT,ACTOR_TASK_EXECUTION_EVENT," + "DRIVER_JOB_DEFINITION_EVENT,DRIVER_JOB_EXECUTION_EVENT," + "ACTOR_DEFINITION_EVENT,ACTOR_LIFECYCLE_EVENT" +) EXPOSABLE_EVENT_TYPES = os.environ.get( f"{env_var_prefix}_EXPOSABLE_EVENT_TYPES", DEFAULT_EXPOSABLE_EVENT_TYPES ) @@ 
-119,7 +121,9 @@ ) events_filtered_out = Counter( f"{metrics_prefix}_events_filtered_out", - "Total number of events filtered out before publishing to external server.", + "Total number of events filtered out before publishing to external server. The " + "metric counts the events that are received by the aggregator agent but are " + "not part of the public API yet.", tuple(dashboard_consts.COMPONENT_METRICS_TAG_KEYS), namespace="ray", ) @@ -164,13 +168,22 @@ def __init__(self, dashboard_agent) -> None: self._events_dropped_at_event_aggregator_since_last_metrics_update = 0 self._events_published_since_last_metrics_update = 0 self._events_filtered_out_since_last_metrics_update = 0 - - self._orig_sigterm_handler = signal.signal( - signal.SIGTERM, self._sigterm_handler + self._events_export_addr = ( + dashboard_agent.events_export_addr or EVENTS_EXPORT_ADDR ) - self._is_cleanup = False - self._cleanup_finished_event = threading.Event() + self._event_http_target_enabled = bool(self._events_export_addr) + if not self._event_http_target_enabled: + logger.info( + "Event HTTP target not set, skipping sending events to " + f"external http service. 
events_export_addr: {self._events_export_addr}" + ) + + self._event_processing_enabled = self._event_http_target_enabled + if self._event_processing_enabled: + logger.info("Event processing enabled") + else: + logger.info("Event processing disabled") self._exposable_event_types = { event_type.strip() @@ -178,6 +191,13 @@ def __init__(self, dashboard_agent) -> None: if event_type.strip() } + self._orig_sigterm_handler = signal.signal( + signal.SIGTERM, self._sigterm_handler + ) + + self._is_cleanup = False + self._cleanup_finished_event = threading.Event() + async def AddEvents(self, request, context) -> None: """ gRPC handler for adding events to the event aggregator @@ -192,6 +212,9 @@ def _receive_events(self, request): """ Receives events from the request, adds them to the event buffer, """ + if not self._event_processing_enabled: + return events_event_aggregator_service_pb2.AddEventsReply() + # TODO(myan) #54515: Considering adding a mechanism to also send out the events # metadata (e.g. dropped task attempts) to help with event processing at the # downstream @@ -202,6 +225,7 @@ def _receive_events(self, request): try: self._event_buffer.put_nowait(event) except queue.Full: + # Remove the oldest event to make room for the new event. 
self._event_buffer.get_nowait() self._event_buffer.put_nowait(event) with self._lock: @@ -234,7 +258,7 @@ def _send_events_to_external_service(self, event_batch) -> None: """ Sends a batch of events to the external service via HTTP POST request """ - if not event_batch: + if not event_batch or not self._event_http_target_enabled: return filtered_event_batch = [ @@ -249,12 +273,15 @@ def _send_events_to_external_service(self, event_batch) -> None: # Convert protobuf objects to JSON dictionaries for HTTP POST filtered_event_batch_json = [ - json.loads(MessageToJson(event)) for event in filtered_event_batch + json.loads( + message_to_json(event, always_print_fields_with_no_presence=True) + ) + for event in filtered_event_batch ] try: response = self._http_session.post( - f"{EVENT_SEND_ADDR}:{EVENT_SEND_PORT}", json=filtered_event_batch_json + f"{self._events_export_addr}", json=filtered_event_batch_json ) response.raise_for_status() with self._lock: diff --git a/python/ray/dashboard/modules/aggregator/tests/test_aggregator_agent.py b/python/ray/dashboard/modules/aggregator/tests/test_aggregator_agent.py index 94d29ab0840c..4d5bf15ed4ba 100644 --- a/python/ray/dashboard/modules/aggregator/tests/test_aggregator_agent.py +++ b/python/ray/dashboard/modules/aggregator/tests/test_aggregator_agent.py @@ -1,43 +1,79 @@ -import sys -import json -import time import base64 +import json +import sys +from unittest.mock import MagicMock import pytest from google.protobuf.timestamp_pb2 import Timestamp -from ray.dashboard.tests.conftest import * # noqa - -from ray._private import ray_constants -from ray._private.utils import init_grpc_channel -from ray._private.test_utils import wait_for_condition -from ray._raylet import GcsClient import ray.dashboard.consts as dashboard_consts +from ray._private import ray_constants from ray._private.test_utils import ( - wait_until_server_available, find_free_port, + wait_for_condition, ) -from ray._common.network_utils import parse_address, 
build_address - -from ray.core.generated.events_event_aggregator_service_pb2_grpc import ( - EventAggregatorServiceStub, +from ray._private.utils import init_grpc_channel +from ray._raylet import GcsClient +from ray.core.generated.common_pb2 import ( + ErrorType, + FunctionDescriptor, + Language, + PythonFunctionDescriptor, + RayErrorInfo, + TaskStatus, + TaskType, +) +from ray.core.generated.events_base_event_pb2 import RayEvent +from ray.core.generated.events_driver_job_definition_event_pb2 import ( + DriverJobDefinitionEvent, +) +from ray.core.generated.events_driver_job_execution_event_pb2 import ( + DriverJobExecutionEvent, ) from ray.core.generated.events_event_aggregator_service_pb2 import ( AddEventsRequest, RayEventsData, TaskEventsMetadata, ) -from ray.core.generated.events_base_event_pb2 import RayEvent -from ray.core.generated.profile_events_pb2 import ProfileEvents, ProfileEventEntry +from ray.core.generated.events_event_aggregator_service_pb2_grpc import ( + EventAggregatorServiceStub, +) +from ray.core.generated.events_task_definition_event_pb2 import ( + TaskDefinitionEvent, +) +from ray.core.generated.events_task_execution_event_pb2 import ( + TaskExecutionEvent, +) from ray.core.generated.events_task_profile_events_pb2 import TaskProfileEvents - +from ray.core.generated.profile_events_pb2 import ProfileEventEntry, ProfileEvents +from ray.core.generated.runtime_environment_pb2 import ( + RuntimeEnvConfig, + RuntimeEnvInfo, + RuntimeEnvUris, +) +from ray.dashboard.modules.aggregator.aggregator_agent import AggregatorAgent +from ray.dashboard.tests.conftest import * # noqa _EVENT_AGGREGATOR_AGENT_TARGET_PORT = find_free_port() +_EVENT_AGGREGATOR_AGENT_TARGET_IP = "127.0.0.1" +_EVENT_AGGREGATOR_AGENT_TARGET_ADDR = ( + f"http://{_EVENT_AGGREGATOR_AGENT_TARGET_IP}:{_EVENT_AGGREGATOR_AGENT_TARGET_PORT}" +) @pytest.fixture(scope="module") def httpserver_listen_address(): - return ("127.0.0.1", _EVENT_AGGREGATOR_AGENT_TARGET_PORT) + return 
(_EVENT_AGGREGATOR_AGENT_TARGET_IP, _EVENT_AGGREGATOR_AGENT_TARGET_PORT) + + +@pytest.fixture +def fake_timestamp(): + """ + Returns a fake proto timestamp and the expected timestamp string in the event JSON. + """ + test_time = 1751302230130457542 + seconds, nanos = divmod(test_time, 10**9) + return Timestamp(seconds=seconds, nanos=nanos), "2025-06-30T16:50:30.130457542Z" _with_aggregator_port = pytest.mark.parametrize( @@ -45,9 +81,7 @@ def httpserver_listen_address(): [ { "env_vars": { - "RAY_DASHBOARD_AGGREGATOR_AGENT_EVENT_SEND_PORT": str( - _EVENT_AGGREGATOR_AGENT_TARGET_PORT - ), + "RAY_DASHBOARD_AGGREGATOR_AGENT_EVENTS_EXPORT_ADDR": _EVENT_AGGREGATOR_AGENT_TARGET_ADDR, }, }, ], @@ -55,43 +89,104 @@ def httpserver_listen_address(): ) -def get_event_aggregator_grpc_stub(webui_url, gcs_address, head_node_id): +def get_event_aggregator_grpc_stub(gcs_address, head_node_id): """ An helper function to get the gRPC stub for the event aggregator agent. Should only be used in tests. """ - ip, _ = parse_address(webui_url) - agent_address = build_address(ip, ray_constants.DEFAULT_DASHBOARD_AGENT_LISTEN_PORT) - assert wait_until_server_available(agent_address) gcs_address = gcs_address gcs_client = GcsClient(address=gcs_address) - agent_addr = gcs_client.internal_kv_get( - f"{dashboard_consts.DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX}{head_node_id}".encode(), - namespace=ray_constants.KV_NAMESPACE_DASHBOARD, - timeout=dashboard_consts.GCS_RPC_TIMEOUT_SECONDS, - ) - ip, http_port, grpc_port = json.loads(agent_addr) + + def get_addr(): + return gcs_client.internal_kv_get( + f"{dashboard_consts.DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX}{head_node_id}".encode(), + namespace=ray_constants.KV_NAMESPACE_DASHBOARD, + timeout=dashboard_consts.GCS_RPC_TIMEOUT_SECONDS, + ) + + wait_for_condition(lambda: get_addr() is not None) + ip, _, grpc_port = json.loads(get_addr()) options = ray_constants.GLOBAL_GRPC_OPTIONS channel = init_grpc_channel(f"{ip}:{grpc_port}", options=options) return 
EventAggregatorServiceStub(channel) +@pytest.mark.parametrize( + ( + "export_addr", + "expected_http_target_enabled", + "expected_event_processing_enabled", + ), + [ + ("", False, False), + ("http://127.0.0.1:" + str(_EVENT_AGGREGATOR_AGENT_TARGET_PORT), True, True), + ], +) +def test_aggregator_agent_http_target_not_enabled( + export_addr, + expected_http_target_enabled, + expected_event_processing_enabled, +): + dashboard_agent = MagicMock() + dashboard_agent.events_export_addr = export_addr + agent = AggregatorAgent(dashboard_agent) + assert agent._event_http_target_enabled == expected_http_target_enabled + assert agent._event_processing_enabled == expected_event_processing_enabled + + +@pytest.mark.parametrize( + "ray_start_cluster_head_with_env_vars", + [ + { + "env_vars": { + "RAY_DASHBOARD_AGGREGATOR_AGENT_EVENTS_EXPORT_ADDR": "", + }, + }, + ], + indirect=True, +) +def test_aggregator_agent_event_processing_disabled( + ray_start_cluster_head_with_env_vars, httpserver, fake_timestamp +): + cluster = ray_start_cluster_head_with_env_vars + stub = get_event_aggregator_grpc_stub( + cluster.gcs_address, cluster.head_node.node_id + ) + + httpserver.expect_request("/", method="POST").respond_with_data("", status=200) + + request = AddEventsRequest( + events_data=RayEventsData( + events=[ + RayEvent( + event_id=b"1", + source_type=RayEvent.SourceType.CORE_WORKER, + event_type=RayEvent.EventType.TASK_DEFINITION_EVENT, + timestamp=fake_timestamp[0], + severity=RayEvent.Severity.INFO, + message="hello", + ), + ], + task_events_metadata=TaskEventsMetadata( + dropped_task_attempts=[], + ), + ) + ) + stub.AddEvents(request) + + @_with_aggregator_port def test_aggregator_agent_receive_publish_events_normally( - ray_start_cluster_head_with_env_vars, httpserver + ray_start_cluster_head_with_env_vars, httpserver, fake_timestamp ): cluster = ray_start_cluster_head_with_env_vars stub = get_event_aggregator_grpc_stub( - cluster.webui_url, cluster.gcs_address, 
cluster.head_node.node_id + cluster.gcs_address, cluster.head_node.node_id ) httpserver.expect_request("/", method="POST").respond_with_data("", status=200) - test_time = 1751302230130457542 - seconds, nanos = divmod(test_time, 10**9) - timestamp = Timestamp(seconds=seconds, nanos=nanos) - request = AddEventsRequest( events_data=RayEventsData( events=[ @@ -99,7 +194,7 @@ def test_aggregator_agent_receive_publish_events_normally( event_id=b"1", source_type=RayEvent.SourceType.CORE_WORKER, event_type=RayEvent.EventType.TASK_DEFINITION_EVENT, - timestamp=timestamp, + timestamp=fake_timestamp[0], severity=RayEvent.Severity.INFO, message="hello", ), @@ -110,8 +205,7 @@ def test_aggregator_agent_receive_publish_events_normally( ) ) - reply = stub.AddEvents(request) - assert reply is not None + stub.AddEvents(request) wait_for_condition(lambda: len(httpserver.log) == 1) req, _ = httpserver.log[0] @@ -123,7 +217,7 @@ def test_aggregator_agent_receive_publish_events_normally( assert req_json[0]["eventType"] == "TASK_DEFINITION_EVENT" assert req_json[0]["severity"] == "INFO" assert req_json[0]["message"] == "hello" - assert req_json[0]["timestamp"] == "2025-06-30T16:50:30.130457542Z" + assert req_json[0]["timestamp"] == fake_timestamp[1] @pytest.mark.parametrize( @@ -132,28 +226,22 @@ def test_aggregator_agent_receive_publish_events_normally( { "env_vars": { "RAY_DASHBOARD_AGGREGATOR_AGENT_MAX_EVENT_BUFFER_SIZE": 1, - "RAY_DASHBOARD_AGGREGATOR_AGENT_EVENT_SEND_PORT": str( - _EVENT_AGGREGATOR_AGENT_TARGET_PORT - ), + "RAY_DASHBOARD_AGGREGATOR_AGENT_EVENTS_EXPORT_ADDR": _EVENT_AGGREGATOR_AGENT_TARGET_ADDR, }, }, ], indirect=True, ) def test_aggregator_agent_receive_event_full( - ray_start_cluster_head_with_env_vars, httpserver + ray_start_cluster_head_with_env_vars, httpserver, fake_timestamp ): cluster = ray_start_cluster_head_with_env_vars stub = get_event_aggregator_grpc_stub( - cluster.webui_url, cluster.gcs_address, cluster.head_node.node_id + cluster.gcs_address, 
cluster.head_node.node_id ) httpserver.expect_request("/", method="POST").respond_with_data("", status=200) - test_time = 1751302230130457542 - seconds, nanos = divmod(test_time, 10**9) - timestamp = Timestamp(seconds=seconds, nanos=nanos) - request = AddEventsRequest( events_data=RayEventsData( events=[ @@ -161,7 +249,7 @@ def test_aggregator_agent_receive_event_full( event_id=b"2", source_type=RayEvent.SourceType.CORE_WORKER, event_type=RayEvent.EventType.TASK_DEFINITION_EVENT, - timestamp=timestamp, + timestamp=fake_timestamp[0], severity=RayEvent.Severity.INFO, message="hello", ), @@ -169,7 +257,7 @@ def test_aggregator_agent_receive_event_full( event_id=b"3", source_type=RayEvent.SourceType.CORE_WORKER, event_type=RayEvent.EventType.TASK_DEFINITION_EVENT, - timestamp=timestamp, + timestamp=fake_timestamp[0], severity=RayEvent.Severity.INFO, message="hello", ), @@ -180,8 +268,7 @@ def test_aggregator_agent_receive_event_full( ) ) - reply = stub.AddEvents(request) - assert reply is not None + stub.AddEvents(request) wait_for_condition(lambda: len(httpserver.log) == 1) req, _ = httpserver.log[0] @@ -193,17 +280,14 @@ def test_aggregator_agent_receive_event_full( @_with_aggregator_port def test_aggregator_agent_receive_multiple_events( - ray_start_cluster_head_with_env_vars, httpserver + ray_start_cluster_head_with_env_vars, httpserver, fake_timestamp ): cluster = ray_start_cluster_head_with_env_vars stub = get_event_aggregator_grpc_stub( - cluster.webui_url, cluster.gcs_address, cluster.head_node.node_id + cluster.gcs_address, cluster.head_node.node_id ) httpserver.expect_request("/", method="POST").respond_with_data("", status=200) - now = time.time_ns() - seconds, nanos = divmod(now, 10**9) - timestamp = Timestamp(seconds=seconds, nanos=nanos) request = AddEventsRequest( events_data=RayEventsData( events=[ @@ -211,7 +295,7 @@ def test_aggregator_agent_receive_multiple_events( event_id=b"4", source_type=RayEvent.SourceType.CORE_WORKER, 
event_type=RayEvent.EventType.TASK_DEFINITION_EVENT, - timestamp=timestamp, + timestamp=fake_timestamp[0], severity=RayEvent.Severity.INFO, message="event1", ), @@ -219,7 +303,7 @@ def test_aggregator_agent_receive_multiple_events( event_id=b"5", source_type=RayEvent.SourceType.CORE_WORKER, event_type=RayEvent.EventType.TASK_DEFINITION_EVENT, - timestamp=timestamp, + timestamp=fake_timestamp[0], severity=RayEvent.Severity.INFO, message="event2", ), @@ -229,8 +313,7 @@ def test_aggregator_agent_receive_multiple_events( ), ) ) - reply = stub.AddEvents(request) - assert reply is not None + stub.AddEvents(request) wait_for_condition(lambda: len(httpserver.log) == 1) req, _ = httpserver.log[0] req_json = json.loads(req.data) @@ -247,25 +330,20 @@ def test_aggregator_agent_receive_multiple_events( { "env_vars": { "RAY_DASHBOARD_AGGREGATOR_AGENT_MAX_EVENT_BUFFER_SIZE": 1, - "RAY_DASHBOARD_AGGREGATOR_AGENT_EVENT_SEND_PORT": str( - _EVENT_AGGREGATOR_AGENT_TARGET_PORT - ), + "RAY_DASHBOARD_AGGREGATOR_AGENT_EVENTS_EXPORT_ADDR": _EVENT_AGGREGATOR_AGENT_TARGET_ADDR, }, }, ], indirect=True, ) def test_aggregator_agent_receive_multiple_events_failures( - ray_start_cluster_head_with_env_vars, httpserver + ray_start_cluster_head_with_env_vars, httpserver, fake_timestamp ): cluster = ray_start_cluster_head_with_env_vars stub = get_event_aggregator_grpc_stub( - cluster.webui_url, cluster.gcs_address, cluster.head_node.node_id + cluster.gcs_address, cluster.head_node.node_id ) httpserver.expect_request("/", method="POST").respond_with_data("", status=200) - now = time.time_ns() - seconds, nanos = divmod(now, 10**9) - timestamp = Timestamp(seconds=seconds, nanos=nanos) request = AddEventsRequest( events_data=RayEventsData( events=[ @@ -273,7 +351,7 @@ def test_aggregator_agent_receive_multiple_events_failures( event_id=b"1", source_type=RayEvent.SourceType.CORE_WORKER, event_type=RayEvent.EventType.TASK_DEFINITION_EVENT, - timestamp=timestamp, + timestamp=fake_timestamp[0], 
severity=RayEvent.Severity.INFO, message="event1", ), @@ -281,7 +359,7 @@ def test_aggregator_agent_receive_multiple_events_failures( event_id=b"2", source_type=RayEvent.SourceType.CORE_WORKER, event_type=RayEvent.EventType.TASK_DEFINITION_EVENT, - timestamp=timestamp, + timestamp=fake_timestamp[0], severity=RayEvent.Severity.INFO, message="event2", ), @@ -289,15 +367,14 @@ def test_aggregator_agent_receive_multiple_events_failures( event_id=b"3", source_type=RayEvent.SourceType.CORE_WORKER, event_type=RayEvent.EventType.TASK_DEFINITION_EVENT, - timestamp=timestamp, + timestamp=fake_timestamp[0], severity=RayEvent.Severity.INFO, message="event3", ), ], ) ) - reply = stub.AddEvents(request) - assert reply is not None + stub.AddEvents(request) wait_for_condition(lambda: len(httpserver.log) == 1) req, _ = httpserver.log[0] req_json = json.loads(req.data) @@ -311,7 +388,7 @@ def test_aggregator_agent_receive_empty_events( ): cluster = ray_start_cluster_head_with_env_vars stub = get_event_aggregator_grpc_stub( - cluster.webui_url, cluster.gcs_address, cluster.head_node.node_id + cluster.gcs_address, cluster.head_node.node_id ) httpserver.expect_request("/", method="POST").respond_with_data("", status=200) request = AddEventsRequest( @@ -322,35 +399,29 @@ def test_aggregator_agent_receive_empty_events( ), ) ) - reply = stub.AddEvents(request) - assert reply is not None + stub.AddEvents(request) @_with_aggregator_port def test_aggregator_agent_profile_events_not_exposed( - ray_start_cluster_head_with_env_vars, httpserver + ray_start_cluster_head_with_env_vars, httpserver, fake_timestamp ): """Test that profile events are not sent when not in exposable event types.""" cluster = ray_start_cluster_head_with_env_vars stub = get_event_aggregator_grpc_stub( - cluster.webui_url, cluster.gcs_address, cluster.head_node.node_id + cluster.gcs_address, cluster.head_node.node_id ) httpserver.expect_request("/", method="POST").respond_with_data("", status=200) - - now = time.time_ns() 
- seconds, nanos = divmod(now, 10**9) - timestamp = Timestamp(seconds=seconds, nanos=nanos) - request = AddEventsRequest( events_data=RayEventsData( events=[ - _create_profile_event_request(), + _create_profile_event_request(fake_timestamp[0]), RayEvent( event_id=b"1", source_type=RayEvent.SourceType.CORE_WORKER, event_type=RayEvent.EventType.TASK_DEFINITION_EVENT, - timestamp=timestamp, + timestamp=fake_timestamp[0], severity=RayEvent.Severity.INFO, message="event1", ), @@ -361,8 +432,7 @@ def test_aggregator_agent_profile_events_not_exposed( ) ) - reply = stub.AddEvents(request) - assert reply is not None + stub.AddEvents(request) # Wait for exactly one event to be received (the TASK_DEFINITION_EVENT) wait_for_condition(lambda: len(httpserver.log) == 1) @@ -376,55 +446,178 @@ def test_aggregator_agent_profile_events_not_exposed( assert req_json[0]["eventType"] == "TASK_DEFINITION_EVENT" -@pytest.mark.parametrize( - "ray_start_cluster_head_with_env_vars", - [ - { - "env_vars": { - "RAY_DASHBOARD_AGGREGATOR_AGENT_EVENT_SEND_PORT": str( - _EVENT_AGGREGATOR_AGENT_TARGET_PORT +def _create_task_definition_event_proto(timestamp): + return RayEvent( + event_id=b"1", + source_type=RayEvent.SourceType.CORE_WORKER, + event_type=RayEvent.EventType.TASK_DEFINITION_EVENT, + timestamp=timestamp, + severity=RayEvent.Severity.INFO, + session_name="test_session", + task_definition_event=TaskDefinitionEvent( + task_id=b"1", + task_attempt=1, + task_type=TaskType.NORMAL_TASK, + language=Language.PYTHON, + task_func=FunctionDescriptor( + python_function_descriptor=PythonFunctionDescriptor( + module_name="test_module", + class_name="test_class", + function_name="test_function", + function_hash="test_hash", ), - "RAY_DASHBOARD_AGGREGATOR_AGENT_EXPOSABLE_EVENT_TYPES": "TASK_DEFINITION_EVENT,TASK_EXECUTION_EVENT,ACTOR_TASK_DEFINITION_EVENT,ACTOR_TASK_EXECUTION_EVENT,TASK_PROFILE_EVENT", + ), + task_name="test_task", + required_resources={ + "CPU": 1.0, + "GPU": 0.0, }, - }, - ], - 
indirect=True, -) -def test_aggregator_agent_receive_profile_events( - ray_start_cluster_head_with_env_vars, httpserver -): - cluster = ray_start_cluster_head_with_env_vars - stub = get_event_aggregator_grpc_stub( - cluster.webui_url, cluster.gcs_address, cluster.head_node.node_id + runtime_env_info=RuntimeEnvInfo( + serialized_runtime_env="{}", + ), + job_id=b"1", + parent_task_id=b"1", + placement_group_id=b"1", + ref_ids={ + "key1": b"value1", + "key2": b"value2", + }, + ), ) - httpserver.expect_request("/", method="POST").respond_with_data("", status=200) - request = AddEventsRequest( - events_data=RayEventsData( - events=[_create_profile_event_request()], - task_events_metadata=TaskEventsMetadata( - dropped_task_attempts=[], - ), - ) +def _verify_task_definition_event_json(req_json, expected_timestamp): + assert len(req_json) == 1 + + # Verify the base event fields + assert req_json[0]["eventId"] == base64.b64encode(b"1").decode() + assert req_json[0]["sourceType"] == "CORE_WORKER" + assert req_json[0]["eventType"] == "TASK_DEFINITION_EVENT" + assert req_json[0]["timestamp"] == expected_timestamp + assert req_json[0]["severity"] == "INFO" + assert ( + req_json[0]["message"] == "" + ) # Make sure the default value is included when it is not set + assert req_json[0]["sessionName"] == "test_session" + + # Verify the task definition event specific fields + assert ( + req_json[0]["taskDefinitionEvent"]["taskId"] == base64.b64encode(b"1").decode() + ) + assert req_json[0]["taskDefinitionEvent"]["taskAttempt"] == 1 + assert req_json[0]["taskDefinitionEvent"]["taskType"] == "NORMAL_TASK" + assert req_json[0]["taskDefinitionEvent"]["language"] == "PYTHON" + assert ( + req_json[0]["taskDefinitionEvent"]["taskFunc"]["pythonFunctionDescriptor"][ + "moduleName" + ] + == "test_module" + ) + assert ( + req_json[0]["taskDefinitionEvent"]["taskFunc"]["pythonFunctionDescriptor"][ + "className" + ] + == "test_class" + ) + assert ( + 
req_json[0]["taskDefinitionEvent"]["taskFunc"]["pythonFunctionDescriptor"][ + "functionName" + ] + == "test_function" + ) + assert ( + req_json[0]["taskDefinitionEvent"]["taskFunc"]["pythonFunctionDescriptor"][ + "functionHash" + ] + == "test_hash" + ) + assert req_json[0]["taskDefinitionEvent"]["taskName"] == "test_task" + assert req_json[0]["taskDefinitionEvent"]["requiredResources"] == { + "CPU": 1.0, + "GPU": 0.0, + } + assert ( + req_json[0]["taskDefinitionEvent"]["runtimeEnvInfo"]["serializedRuntimeEnv"] + == "{}" + ) + assert ( + req_json[0]["taskDefinitionEvent"]["jobId"] == base64.b64encode(b"1").decode() ) + assert ( + req_json[0]["taskDefinitionEvent"]["parentTaskId"] + == base64.b64encode(b"1").decode() + ) + assert ( + req_json[0]["taskDefinitionEvent"]["placementGroupId"] + == base64.b64encode(b"1").decode() + ) + assert req_json[0]["taskDefinitionEvent"]["refIds"] == { + "key1": base64.b64encode(b"value1").decode(), + "key2": base64.b64encode(b"value2").decode(), + } - reply = stub.AddEvents(request) - assert reply is not None - wait_for_condition(lambda: len(httpserver.log) == 1) +def _create_task_execution_event_proto(timestamp): + return RayEvent( + event_id=b"1", + source_type=RayEvent.SourceType.CORE_WORKER, + event_type=RayEvent.EventType.TASK_EXECUTION_EVENT, + timestamp=timestamp, + severity=RayEvent.Severity.INFO, + session_name="test_session", + task_execution_event=TaskExecutionEvent( + task_id=b"1", + task_attempt=1, + task_state={ + TaskStatus.RUNNING: timestamp, + }, + ray_error_info=RayErrorInfo( + error_type=ErrorType.TASK_EXECUTION_EXCEPTION, + ), + node_id=b"1", + worker_id=b"1", + worker_pid=1, + ), + ) - req, _ = httpserver.log[0] - req_json = json.loads(req.data) - _verify_profile_event_json(req_json) +def _verify_task_execution_event_json(req_json, expected_timestamp): + assert len(req_json) == 1 + + # Verify the base event fields + assert req_json[0]["eventId"] == base64.b64encode(b"1").decode() + assert 
req_json[0]["sourceType"] == "CORE_WORKER" + assert req_json[0]["eventType"] == "TASK_EXECUTION_EVENT" + assert req_json[0]["timestamp"] == expected_timestamp + assert req_json[0]["severity"] == "INFO" + assert ( + req_json[0]["message"] == "" + ) # Make sure the default value is included when it is not set + assert req_json[0]["sessionName"] == "test_session" + + # Verify the task execution event specific fields + assert ( + req_json[0]["taskExecutionEvent"]["taskId"] == base64.b64encode(b"1").decode() + ) + assert req_json[0]["taskExecutionEvent"]["taskAttempt"] == 1 + assert req_json[0]["taskExecutionEvent"]["taskState"] == { + "8": expected_timestamp, + } + assert ( + req_json[0]["taskExecutionEvent"]["rayErrorInfo"]["errorType"] + == "TASK_EXECUTION_EXCEPTION" + ) + assert ( + req_json[0]["taskExecutionEvent"]["nodeId"] == base64.b64encode(b"1").decode() + ) + assert ( + req_json[0]["taskExecutionEvent"]["workerId"] == base64.b64encode(b"1").decode() + ) + assert req_json[0]["taskExecutionEvent"]["workerPid"] == 1 -def _create_profile_event_request(): +def _create_profile_event_request(timestamp): """Helper function to create a profile event request.""" - test_time = 1751302230130457542 - seconds, nanos = (test_time // 10**9, test_time % 10**9) - timestamp = Timestamp(seconds=seconds, nanos=nanos) return RayEvent( event_id=b"1", @@ -454,7 +647,7 @@ def _create_profile_event_request(): ) -def _verify_profile_event_json(req_json): +def _verify_profile_event_json(req_json, expected_timestamp): """Helper function to verify profile event JSON structure.""" assert len(req_json) == 1 assert req_json[0]["eventId"] == base64.b64encode(b"1").decode() @@ -462,7 +655,7 @@ def _verify_profile_event_json(req_json): assert req_json[0]["eventType"] == "TASK_PROFILE_EVENT" assert req_json[0]["severity"] == "INFO" assert req_json[0]["message"] == "profile event test" - assert req_json[0]["timestamp"] == "2025-06-30T16:50:30.130457542Z" + assert req_json[0]["timestamp"] == 
expected_timestamp # Verify task profile event specific fields assert "taskProfileEvents" in req_json[0] @@ -485,5 +678,192 @@ def _verify_profile_event_json(req_json): assert event_entry["extraData"] == '{"cpu_usage": 0.8}' +# tuple: (create_event, verify) +EVENT_TYPES_TO_TEST = [ + pytest.param( + _create_task_definition_event_proto, + _verify_task_definition_event_json, + id="task_definition_event", + ), + pytest.param( + _create_task_execution_event_proto, + _verify_task_execution_event_json, + id="task_execution_event", + ), + pytest.param( + _create_profile_event_request, _verify_profile_event_json, id="profile_event" + ), +] + + +@pytest.mark.parametrize("create_event, verify_event", EVENT_TYPES_TO_TEST) +@pytest.mark.parametrize( + "ray_start_cluster_head_with_env_vars", + [ + { + "env_vars": { + "RAY_DASHBOARD_AGGREGATOR_AGENT_EVENTS_EXPORT_ADDR": _EVENT_AGGREGATOR_AGENT_TARGET_ADDR, + "RAY_DASHBOARD_AGGREGATOR_AGENT_EXPOSABLE_EVENT_TYPES": "TASK_DEFINITION_EVENT,TASK_EXECUTION_EVENT,ACTOR_TASK_DEFINITION_EVENT,ACTOR_TASK_EXECUTION_EVENT,TASK_PROFILE_EVENT", + }, + }, + ], + indirect=True, +) +def test_aggregator_agent_receive_events( + create_event, + verify_event, + ray_start_cluster_head_with_env_vars, + httpserver, + fake_timestamp, +): + cluster = ray_start_cluster_head_with_env_vars + stub = get_event_aggregator_grpc_stub( + cluster.gcs_address, cluster.head_node.node_id + ) + httpserver.expect_request("/", method="POST").respond_with_data("", status=200) + request = AddEventsRequest( + events_data=RayEventsData( + events=[create_event(fake_timestamp[0])], + task_events_metadata=TaskEventsMetadata( + dropped_task_attempts=[], + ), + ) + ) + + stub.AddEvents(request) + wait_for_condition(lambda: len(httpserver.log) == 1) + req, _ = httpserver.log[0] + req_json = json.loads(req.data) + verify_event(req_json, fake_timestamp[1]) + + +@_with_aggregator_port +def test_aggregator_agent_receive_driver_job_definition_event( + 
ray_start_cluster_head_with_env_vars, httpserver +): + cluster = ray_start_cluster_head_with_env_vars + stub = get_event_aggregator_grpc_stub( + cluster.gcs_address, cluster.head_node.node_id + ) + httpserver.expect_request("/", method="POST").respond_with_data("", status=200) + test_time = 1751302230130457542 + seconds, nanos = divmod(test_time, 10**9) + timestamp = Timestamp(seconds=seconds, nanos=nanos) + request = AddEventsRequest( + events_data=RayEventsData( + events=[ + RayEvent( + event_id=b"1", + source_type=RayEvent.SourceType.CORE_WORKER, + event_type=RayEvent.EventType.DRIVER_JOB_DEFINITION_EVENT, + timestamp=timestamp, + severity=RayEvent.Severity.INFO, + message="driver job event", + driver_job_definition_event=DriverJobDefinitionEvent( + job_id=b"1", + config=DriverJobDefinitionEvent.Config( + runtime_env_info=RuntimeEnvInfo( + serialized_runtime_env="{}", + uris=RuntimeEnvUris( + working_dir_uri="file:///tmp/ray/runtime_env", + py_modules_uris=[], + ), + runtime_env_config=RuntimeEnvConfig( + setup_timeout_seconds=10, + eager_install=True, + log_files=[], + ), + ), + metadata={}, + ), + ), + ), + ], + task_events_metadata=TaskEventsMetadata( + dropped_task_attempts=[], + ), + ) + ) + stub.AddEvents(request) + wait_for_condition(lambda: len(httpserver.log) == 1) + req, _ = httpserver.log[0] + req_json = json.loads(req.data) + assert req_json[0]["message"] == "driver job event" + assert ( + req_json[0]["driverJobDefinitionEvent"]["config"]["runtimeEnvInfo"][ + "serializedRuntimeEnv" + ] + == "{}" + ) + assert ( + req_json[0]["driverJobDefinitionEvent"]["config"]["runtimeEnvInfo"]["uris"][ + "workingDirUri" + ] + == "file:///tmp/ray/runtime_env" + ) + assert ( + req_json[0]["driverJobDefinitionEvent"]["config"]["runtimeEnvInfo"][ + "runtimeEnvConfig" + ]["setupTimeoutSeconds"] + == 10.0 + ) + + +@_with_aggregator_port +def test_aggregator_agent_receive_driver_job_execution_event( + ray_start_cluster_head_with_env_vars, httpserver +): + cluster = 
ray_start_cluster_head_with_env_vars + stub = get_event_aggregator_grpc_stub( + cluster.gcs_address, cluster.head_node.node_id + ) + httpserver.expect_request("/", method="POST").respond_with_data("", status=200) + test_time = 1751302230130457542 + seconds, nanos = divmod(test_time, 10**9) + timestamp = Timestamp(seconds=seconds, nanos=nanos) + request = AddEventsRequest( + events_data=RayEventsData( + events=[ + RayEvent( + event_id=b"1", + source_type=RayEvent.SourceType.CORE_WORKER, + event_type=RayEvent.EventType.DRIVER_JOB_EXECUTION_EVENT, + timestamp=timestamp, + severity=RayEvent.Severity.INFO, + message="driver job execution event", + driver_job_execution_event=DriverJobExecutionEvent( + job_id=b"1", + states=[ + DriverJobExecutionEvent.StateTimestamp( + state=DriverJobExecutionEvent.State.CREATED, + timestamp=Timestamp(seconds=1234567890), + ), + DriverJobExecutionEvent.StateTimestamp( + state=DriverJobExecutionEvent.State.FINISHED, + timestamp=Timestamp(seconds=1234567890), + ), + ], + ), + ), + ], + task_events_metadata=TaskEventsMetadata( + dropped_task_attempts=[], + ), + ) + ) + stub.AddEvents(request) + wait_for_condition(lambda: len(httpserver.log) == 1) + req, _ = httpserver.log[0] + req_json = json.loads(req.data) + assert req_json[0]["message"] == "driver job execution event" + assert ( + req_json[0]["driverJobExecutionEvent"]["jobId"] + == base64.b64encode(b"1").decode() + ) + assert len(req_json[0]["driverJobExecutionEvent"]["states"]) == 2 + assert req_json[0]["driverJobExecutionEvent"]["states"][0]["state"] == "CREATED" + assert req_json[0]["driverJobExecutionEvent"]["states"][1]["state"] == "FINISHED" + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/dashboard/modules/aggregator/tests/test_ray_job_events.py b/python/ray/dashboard/modules/aggregator/tests/test_ray_job_events.py new file mode 100644 index 000000000000..006bccd2f19d --- /dev/null +++ 
b/python/ray/dashboard/modules/aggregator/tests/test_ray_job_events.py @@ -0,0 +1,69 @@ +import base64 +import json +import sys + +import pytest + +import ray +import ray.dashboard.consts as dashboard_consts +from ray._private import ray_constants +from ray._private.test_utils import wait_for_condition +from ray._raylet import GcsClient +from ray.dashboard.tests.conftest import * # noqa + +_RAY_EVENT_PORT = 12345 + + +@pytest.fixture(scope="session") +def httpserver_listen_address(): + return ("127.0.0.1", _RAY_EVENT_PORT) + + +def wait_for_dashboard_agent_available(cluster): + gcs_client = GcsClient(address=cluster.address) + + def get_dashboard_agent_address(): + return gcs_client.internal_kv_get( + f"{dashboard_consts.DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX}{cluster.head_node.node_id}".encode(), + namespace=ray_constants.KV_NAMESPACE_DASHBOARD, + timeout=dashboard_consts.GCS_RPC_TIMEOUT_SECONDS, + ) + + wait_for_condition(lambda: get_dashboard_agent_address() is not None) + + +def test_ray_job_events(ray_start_cluster, httpserver): + cluster = ray_start_cluster + cluster.add_node( + env_vars={ + "RAY_DASHBOARD_AGGREGATOR_AGENT_EVENTS_EXPORT_ADDR": f"http://127.0.0.1:{_RAY_EVENT_PORT}", + "RAY_DASHBOARD_AGGREGATOR_AGENT_EXPOSABLE_EVENT_TYPES": "DRIVER_JOB_DEFINITION_EVENT,DRIVER_JOB_EXECUTION_EVENT", + }, + _system_config={ + "enable_ray_event": True, + }, + ) + cluster.wait_for_nodes() + ray.init(address=cluster.address) + wait_for_dashboard_agent_available(cluster) + + # Submit a ray job + @ray.remote + def f(): + return 1 + + ray.get(f.remote()) + + # Check that a driver job event with the correct job id is published. 
+ httpserver.expect_request("/", method="POST").respond_with_data("", status=200) + wait_for_condition(lambda: len(httpserver.log) >= 1) + req, _ = httpserver.log[0] + req_json = json.loads(req.data) + assert ( + base64.b64decode(req_json[0]["driverJobDefinitionEvent"]["jobId"]).hex() + == ray.get_runtime_context().get_job_id() + ) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/dashboard/modules/event/event_head.py b/python/ray/dashboard/modules/event/event_head.py index 0a8a9712992e..b9e3d90eebba 100644 --- a/python/ray/dashboard/modules/event/event_head.py +++ b/python/ray/dashboard/modules/event/event_head.py @@ -13,9 +13,9 @@ import ray import ray.dashboard.optional_utils as dashboard_optional_utils import ray.dashboard.utils as dashboard_utils +from ray._common.usage.usage_lib import TagKey, record_extra_usage_tag from ray._common.utils import get_or_create_event_loop from ray._private.ray_constants import env_integer -from ray._common.usage.usage_lib import TagKey, record_extra_usage_tag from ray.dashboard.consts import ( RAY_STATE_SERVER_MAX_HTTP_REQUEST, RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED, diff --git a/python/ray/dashboard/modules/event/tests/test_event.py b/python/ray/dashboard/modules/event/tests/test_event.py index 87200781a9f5..afff1116de8d 100644 --- a/python/ray/dashboard/modules/event/tests/test_event.py +++ b/python/ray/dashboard/modules/event/tests/test_event.py @@ -14,10 +14,11 @@ import numpy as np import pytest -from ray._common.test_utils import wait_for_condition import requests import ray +from ray._common.test_utils import wait_for_condition +from ray._common.utils import binary_to_hex from ray._private.event.event_logger import ( filter_event_by_level, get_event_id, @@ -33,7 +34,6 @@ format_web_url, wait_until_server_available, ) -from ray._common.utils import binary_to_hex from ray.cluster_utils import AutoscalingCluster from ray.core.generated import ( event_pb2, diff --git 
a/python/ray/dashboard/modules/event/tests/test_export_task.py b/python/ray/dashboard/modules/event/tests/test_export_task.py index 50b47601090c..6698ffa9703f 100644 --- a/python/ray/dashboard/modules/event/tests/test_export_task.py +++ b/python/ray/dashboard/modules/event/tests/test_export_task.py @@ -10,6 +10,7 @@ from ray.dashboard.tests.conftest import * # noqa os.environ["RAY_enable_export_api_write"] = "1" +os.environ["RAY_enable_core_worker_ray_event_to_aggregator"] = "0" @pytest.mark.asyncio diff --git a/python/ray/dashboard/modules/job/cli.py b/python/ray/dashboard/modules/job/cli.py index c90c30c30789..f6219ad1cca9 100644 --- a/python/ray/dashboard/modules/job/cli.py +++ b/python/ray/dashboard/modules/job/cli.py @@ -8,10 +8,10 @@ import click -from ray._common.utils import load_class import ray._private.ray_constants as ray_constants from ray._common.utils import ( get_or_create_event_loop, + load_class, ) from ray._private.utils import ( parse_metadata_json, @@ -115,7 +115,7 @@ def job_cli_group(): required=False, help=( "Address of the Ray cluster to connect to. Can also be specified " - "using the RAY_ADDRESS environment variable." + "using the RAY_API_SERVER_ADDRESS environment variable (falls back to RAY_ADDRESS)." ), ) @click.option( @@ -333,7 +333,7 @@ def submit( required=False, help=( "Address of the Ray cluster to connect to. Can also be specified " - "using the `RAY_ADDRESS` environment variable." + "using the RAY_API_SERVER_ADDRESS environment variable (falls back to RAY_ADDRESS)." ), ) @click.argument("job-id", type=str) @@ -363,7 +363,7 @@ def status( required=False, help=( "Address of the Ray cluster to connect to. Can also be specified " - "using the `RAY_ADDRESS` environment variable." + "using the RAY_API_SERVER_ADDRESS environment variable (falls back to RAY_ADDRESS)." ), ) @click.option( @@ -418,7 +418,7 @@ def stop( required=False, help=( "Address of the Ray cluster to connect to. 
Can also be specified " - "using the RAY_ADDRESS environment variable." + "using the RAY_API_SERVER_ADDRESS environment variable (falls back to RAY_ADDRESS)." ), ) @click.argument("job-id", type=str) @@ -455,7 +455,7 @@ def delete( required=False, help=( "Address of the Ray cluster to connect to. Can also be specified " - "using the RAY_ADDRESS environment variable." + "using the RAY_API_SERVER_ADDRESS environment variable (falls back to RAY_ADDRESS)." ), ) @click.argument("job-id", type=str) @@ -508,7 +508,7 @@ def logs( required=False, help=( "Address of the Ray cluster to connect to. Can also be specified " - "using the RAY_ADDRESS environment variable." + "using the RAY_API_SERVER_ADDRESS environment variable (falls back to RAY_ADDRESS)." ), ) @add_common_job_options diff --git a/python/ray/dashboard/modules/job/common.py b/python/ray/dashboard/modules/job/common.py index 9b543cd049d2..4466740f0ed8 100644 --- a/python/ray/dashboard/modules/job/common.py +++ b/python/ray/dashboard/modules/job/common.py @@ -66,6 +66,28 @@ def is_terminal(self) -> bool: return self.value in {"STOPPED", "SUCCEEDED", "FAILED"} +@PublicAPI(stability="stable") +class JobErrorType(str, Enum): + """An enumeration for describing the error type of a job.""" + + # Runtime environment failed to be set up + RUNTIME_ENV_SETUP_FAILURE = "RUNTIME_ENV_SETUP_FAILURE" + # Job supervisor actor launched, but job failed to start within timeout + JOB_SUPERVISOR_ACTOR_START_TIMEOUT = "JOB_SUPERVISOR_ACTOR_START_TIMEOUT" + # Job supervisor actor failed to start + JOB_SUPERVISOR_ACTOR_START_FAILURE = "JOB_SUPERVISOR_ACTOR_START_FAILURE" + # Job supervisor actor failed to be scheduled + JOB_SUPERVISOR_ACTOR_UNSCHEDULABLE = "JOB_SUPERVISOR_ACTOR_UNSCHEDULABLE" + # Job supervisor actor failed for unknown exception + JOB_SUPERVISOR_ACTOR_UNKNOWN_FAILURE = "JOB_SUPERVISOR_ACTOR_UNKNOWN_FAILURE" + # Job supervisor actor died + JOB_SUPERVISOR_ACTOR_DIED = "JOB_SUPERVISOR_ACTOR_DIED" + # Job driver script failed 
to start due to exception + JOB_ENTRYPOINT_COMMAND_START_ERROR = "JOB_ENTRYPOINT_COMMAND_START_ERROR" + # Job driver script failed due to non-zero exit code + JOB_ENTRYPOINT_COMMAND_ERROR = "JOB_ENTRYPOINT_COMMAND_ERROR" + + # TODO(aguo): Convert to pydantic model @PublicAPI(stability="stable") @dataclass @@ -81,9 +103,8 @@ class JobInfo: entrypoint: str #: A message describing the status in more detail. message: Optional[str] = None - # TODO(architkulkarni): Populate this field with e.g. Runtime env setup failure, #: Internal error, user script error - error_type: Optional[str] = None + error_type: Optional[JobErrorType] = None #: The time when the job was started. A Unix timestamp in ms. start_time: Optional[int] = None #: The time when the job moved into a terminal state. A Unix timestamp in ms. @@ -157,6 +178,9 @@ def to_json(self) -> Dict[str, Any]: # Convert enum values to strings. json_dict["status"] = str(json_dict["status"]) + json_dict["error_type"] = ( + json_dict["error_type"].value if json_dict.get("error_type") else None + ) # Convert runtime_env to a JSON-serialized string. if "runtime_env" in json_dict: @@ -181,6 +205,11 @@ def from_json(cls, json_dict: Dict[str, Any]) -> None: """ # Convert enum values to enum objects. json_dict["status"] = JobStatus(json_dict["status"]) + json_dict["error_type"] = ( + JobErrorType(json_dict["error_type"]) + if json_dict.get("error_type") + else None + ) # Convert runtime_env from a JSON-serialized string to a dictionary. if "runtime_env_json" in json_dict: @@ -231,7 +260,11 @@ def __init__( ) async def put_info( - self, job_id: str, job_info: JobInfo, overwrite: bool = True + self, + job_id: str, + job_info: JobInfo, + overwrite: bool = True, + timeout: Optional[int] = 30, ) -> bool: """Put job info to the internal kv store. @@ -239,6 +272,7 @@ async def put_info( job_id: The job id. job_info: The job info. overwrite: Whether to overwrite the existing job info. 
+ timeout: The timeout in seconds for the GCS operation. Returns: True if a new key is added. @@ -248,6 +282,7 @@ async def put_info( json.dumps(job_info.to_json()).encode(), overwrite, namespace=ray_constants.KV_NAMESPACE_JOB, + timeout=timeout, ) if added_num == 1 or overwrite: # Write export event if data was updated in the KV store @@ -322,16 +357,21 @@ async def put_status( status: JobStatus, message: Optional[str] = None, driver_exit_code: Optional[int] = None, + error_type: Optional[JobErrorType] = None, jobinfo_replace_kwargs: Optional[Dict[str, Any]] = None, + timeout: Optional[int] = 30, ): """Puts or updates job status. Sets end_time if status is terminal.""" - old_info = await self.get_info(job_id) + old_info = await self.get_info(job_id, timeout=timeout) if jobinfo_replace_kwargs is None: jobinfo_replace_kwargs = dict() jobinfo_replace_kwargs.update( - status=status, message=message, driver_exit_code=driver_exit_code + status=status, + message=message, + driver_exit_code=driver_exit_code, + error_type=error_type, ) if old_info is not None: if status != old_info.status and old_info.status.is_terminal(): @@ -345,10 +385,10 @@ async def put_status( if status.is_terminal(): new_info.end_time = int(time.time() * 1000) - await self.put_info(job_id, new_info) + await self.put_info(job_id, new_info, timeout=timeout) - async def get_status(self, job_id: str) -> Optional[JobStatus]: - job_info = await self.get_info(job_id) + async def get_status(self, job_id: str, timeout: int = 30) -> Optional[JobStatus]: + job_info = await self.get_info(job_id, timeout) if job_info is None: return None else: diff --git a/python/ray/dashboard/modules/job/job_head.py b/python/ray/dashboard/modules/job/job_head.py index 333075b10679..e91fda0fd3aa 100644 --- a/python/ray/dashboard/modules/job/job_head.py +++ b/python/ray/dashboard/modules/job/job_head.py @@ -7,25 +7,23 @@ import time import traceback from datetime import datetime -from random import choice -from typing import 
AsyncIterator, Dict, List, Optional, Tuple +from typing import AsyncIterator, Dict, Optional, Tuple import aiohttp.web from aiohttp.client import ClientResponse from aiohttp.web import Request, Response, StreamResponse import ray -import ray.dashboard.consts as dashboard_consts from ray import NodeID -from ray._common.utils import get_or_create_event_loop, load_class +from ray._common.network_utils import build_address from ray._common.pydantic_compat import BaseModel, Extra, Field, validator -from ray._private.ray_constants import KV_NAMESPACE_DASHBOARD, env_bool +from ray._common.utils import get_or_create_event_loop, load_class +from ray._private.ray_constants import KV_NAMESPACE_DASHBOARD from ray._private.runtime_env.packaging import ( package_exists, pin_runtime_env_uri, upload_package_to_gcs, ) -from ray._common.network_utils import build_address from ray.dashboard.consts import ( DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX, GCS_RPC_TIMEOUT_SECONDS, @@ -57,12 +55,6 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -# Feature flag controlling whether critical Ray Job control operations are performed -# exclusively by the Job Agent running on the Head node (or randomly sampled Worker one) -# -# NOTE: This flag serves as a temporary kill-switch and should be eventually cleaned up -RAY_JOB_AGENT_USE_HEAD_NODE_ONLY = env_bool("RAY_JOB_AGENT_USE_HEAD_NODE_ONLY", True) - class RayActivityStatus(str, enum.Enum): ACTIVE = "ACTIVE" @@ -249,86 +241,7 @@ async def get_target_agent( Raises: TimeoutError: If the operation times out. """ - if RAY_JOB_AGENT_USE_HEAD_NODE_ONLY: - return await self._get_head_node_agent(timeout_s) - - return await self._pick_random_agent(timeout_s) - - async def _pick_random_agent( - self, timeout_s: float - ) -> Optional[JobAgentSubmissionClient]: - """ - Try to disperse as much as possible to select one of - the `CANDIDATE_AGENT_NUMBER` agents to solve requests. - the agents will not pop from `self._agents` unless - it's dead. 
Saved in `self._agents` is the agent that was - used before. - Strategy: - 1. if the number of `self._agents` has reached - `CANDIDATE_AGENT_NUMBER`, randomly select one agent from - `self._agents`. - 2. if not, randomly select one agent from all available agents, - it is possible that the selected one already exists in - `self._agents`. - - If there's no agent available at all, or there's exception, it will retry every - `TRY_TO_GET_AGENT_INFO_INTERVAL_SECONDS` seconds indefinitely. - - Args: - timeout_s: The timeout for the operation. - - Returns: - A `JobAgentSubmissionClient` for interacting with jobs via an agent process. - - Raises: - TimeoutError: If the operation times out. - """ - start_time_s = time.time() - last_exception = None - while time.time() < start_time_s + timeout_s: - try: - return await self._pick_random_agent_once() - except Exception as e: - last_exception = e - logger.exception( - f"Failed to pick a random agent, retrying in {TRY_TO_GET_AGENT_INFO_INTERVAL_SECONDS} seconds..." - ) - await asyncio.sleep(TRY_TO_GET_AGENT_INFO_INTERVAL_SECONDS) - raise TimeoutError( - f"Failed to pick a random agent within {timeout_s} seconds. The last exception is {last_exception}" - ) - - async def _pick_random_agent_once(self) -> JobAgentSubmissionClient: - """ - Query the internal kv for all agent infos, and pick agents randomly. May raise - exception if there's no agent available at all or there's network error. - """ - # NOTE: Following call will block until there's at least 1 agent info - # being populated from GCS - agent_node_ids = await self._fetch_all_agent_node_ids() - - # delete dead agents. 
- for dead_node in set(self._agents) - set(agent_node_ids): - client = self._agents.pop(dead_node) - await client.close() - - if len(self._agents) >= dashboard_consts.CANDIDATE_AGENT_NUMBER: - node_id = choice(list(self._agents)) - return self._agents[node_id] - else: - # Randomly select one from among all agents, it is possible that - # the selected one already exists in `self._agents` - node_id = choice(list(agent_node_ids)) - - if node_id not in self._agents: - # Fetch agent info from InternalKV, and create a new - # JobAgentSubmissionClient. May raise if the node_id is removed in - # InternalKV after the _fetch_all_agent_node_ids, though unlikely. - ip, http_port, _ = await self._fetch_agent_info(node_id) - agent_http_address = f"http://{build_address(ip, http_port)}" - self._agents[node_id] = JobAgentSubmissionClient(agent_http_address) - - return self._agents[node_id] + return await self._get_head_node_agent(timeout_s) async def _get_head_node_agent_once(self) -> JobAgentSubmissionClient: head_node_id_hex = await get_head_node_id(self.gcs_client) @@ -374,26 +287,6 @@ async def _get_head_node_agent(self, timeout_s: float) -> JobAgentSubmissionClie f"Failed to get head node agent within {timeout_s} seconds. The last exception is {exception}" ) - async def _fetch_all_agent_node_ids(self) -> List[NodeID]: - """ - Fetches all NodeIDs with agent infos in the cluster. - - May raise exception if there's no agent available at all or there's network error. 
- Returns: List[NodeID] - """ - keys = await self.gcs_client.async_internal_kv_keys( - f"{DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX}".encode(), - namespace=KV_NAMESPACE_DASHBOARD, - timeout=GCS_RPC_TIMEOUT_SECONDS, - ) - if not keys: - # No agent keys found, retry - raise Exception("No agents found in InternalKV.") - return [ - NodeID.from_hex(key[len(DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX) :].decode()) - for key in keys - ] - async def _fetch_agent_info(self, target_node_id: NodeID) -> Tuple[str, int, int]: """ Fetches agent info by the Node ID. May raise exception if there's network error or the diff --git a/python/ray/dashboard/modules/job/job_manager.py b/python/ray/dashboard/modules/job/job_manager.py index 5479757ea1cd..22692757e4eb 100644 --- a/python/ray/dashboard/modules/job/job_manager.py +++ b/python/ray/dashboard/modules/job/job_manager.py @@ -32,8 +32,8 @@ from ray.dashboard.modules.job.job_supervisor import JobSupervisor from ray.dashboard.modules.job.utils import get_head_node_id from ray.dashboard.utils import close_logger_file_descriptor -from ray.exceptions import ActorUnschedulableError, RuntimeEnvSetupError -from ray.job_submission import JobStatus +from ray.exceptions import ActorDiedError, ActorUnschedulableError, RuntimeEnvSetupError +from ray.job_submission import JobErrorType, JobStatus from ray.runtime_env import RuntimeEnvConfig from ray.util.scheduling_strategies import ( NodeAffinitySchedulingStrategy, @@ -145,6 +145,9 @@ async def _monitor_job( self.monitored_jobs.add(job_id) try: await self._monitor_job_internal(job_id, job_supervisor) + except Exception as e: + logger.error("Unhandled exception in job monitoring!", exc_info=e) + raise e finally: self.monitored_jobs.remove(job_id) @@ -158,16 +161,29 @@ async def _monitor_job_internal( ) ) - is_alive = True + job_status = None + job_info = None + ping_obj_ref = None - while is_alive: + while True: try: - job_status = await self._job_info_client.get_status(job_id) + # NOTE: Job monitoring loop 
sleeps before proceeding with monitoring + # sequence to consolidate the control-flow of the pacing + # in a single place, rather than having it spread across + # many branches + await asyncio.sleep(self.JOB_MONITOR_LOOP_PERIOD_S) + + job_status = await self._job_info_client.get_status( + job_id, timeout=None + ) if job_status == JobStatus.PENDING: # Compare the current time with the job start time. # If the job is still pending, we will set the status # to FAILED. - job_info = await self._job_info_client.get_info(job_id) + if job_info is None: + job_info = await self._job_info_client.get_info( + job_id, timeout=None + ) if time.time() - job_info.start_time / 1000 > timeout: err_msg = ( @@ -208,10 +224,11 @@ async def _monitor_job_internal( job_id, JobStatus.FAILED, message=err_msg, + error_type=JobErrorType.JOB_SUPERVISOR_ACTOR_START_TIMEOUT, + timeout=None, ) - is_alive = False logger.error(err_msg) - continue + break if job_supervisor is None: job_supervisor = self._get_actor_for_job(job_id) @@ -234,80 +251,100 @@ async def _monitor_job_internal( "Unexpected error occurred: " "failed to get job supervisor." ), + error_type=JobErrorType.JOB_SUPERVISOR_ACTOR_START_FAILURE, + timeout=None, ) - is_alive = False - continue - - await job_supervisor.ping.remote() + break + + # Check to see if `JobSupervisor` is alive and reachable + if ping_obj_ref is None: + ping_obj_ref = job_supervisor.ping.options( + max_task_retries=-1 + ).remote() + ready, _ = ray.wait([ping_obj_ref], timeout=0) + if ready: + ray.get(ping_obj_ref) + ping_obj_ref = None + else: + continue - await asyncio.sleep(self.JOB_MONITOR_LOOP_PERIOD_S) except Exception as e: - is_alive = False - job_status = await self._job_info_client.get_status(job_id) - job_error_message = None - if job_status == JobStatus.FAILED: - job_error_message = ( - "See more details from the dashboard " - "`Job` page or the state API `ray list jobs`." 
- ) - - job_error_message = "" - if job_status.is_terminal(): + job_status = await self._job_info_client.get_status( + job_id, timeout=None + ) + target_job_error_message = "" + target_job_error_type: Optional[JobErrorType] = None + if job_status is not None and job_status.is_terminal(): # If the job is already in a terminal state, then the actor # exiting is expected. pass - elif isinstance(e, RuntimeEnvSetupError): - logger.info(f"Failed to set up runtime_env for job {job_id}.") - job_error_message = f"runtime_env setup failed: {e}" - job_status = JobStatus.FAILED - await self._job_info_client.put_status( - job_id, - job_status, - message=job_error_message, - ) - elif isinstance(e, ActorUnschedulableError): - logger.info( - f"Failed to schedule job {job_id} because the supervisor actor " - f"could not be scheduled: {e}" - ) - job_error_message = ( - f"Job supervisor actor could not be scheduled: {e}" - ) - await self._job_info_client.put_status( - job_id, - JobStatus.FAILED, - message=job_error_message, - ) else: - logger.warning( - f"Job supervisor for job {job_id} failed unexpectedly: {e}." 
- ) - job_error_message = f"Unexpected error occurred: {e}" + if isinstance(e, RuntimeEnvSetupError): + logger.error(f"Failed to set up runtime_env for job {job_id}.") + + target_job_error_message = f"runtime_env setup failed: {e}" + target_job_error_type = JobErrorType.RUNTIME_ENV_SETUP_FAILURE + + elif isinstance(e, ActorUnschedulableError): + logger.error( + f"Failed to schedule job {job_id} because the supervisor " + f"actor could not be scheduled: {e}" + ) + + target_job_error_message = ( + f"Job supervisor actor could not be scheduled: {e}" + ) + target_job_error_type = ( + JobErrorType.JOB_SUPERVISOR_ACTOR_UNSCHEDULABLE + ) + + elif isinstance(e, ActorDiedError): + logger.error(f"Job supervisor actor for {job_id} died: {e}") + target_job_error_message = f"Job supervisor actor died: {e}" + target_job_error_type = JobErrorType.JOB_SUPERVISOR_ACTOR_DIED + + else: + logger.error( + f"Job monitoring for job {job_id} failed " + f"unexpectedly: {e}.", + exc_info=e, + ) + + target_job_error_message = f"Unexpected error occurred: {e}" + target_job_error_type = ( + JobErrorType.JOB_SUPERVISOR_ACTOR_UNKNOWN_FAILURE + ) + job_status = JobStatus.FAILED await self._job_info_client.put_status( job_id, job_status, - message=job_error_message, + message=target_job_error_message, + error_type=target_job_error_type + or JobErrorType.JOB_SUPERVISOR_ACTOR_UNKNOWN_FAILURE, + timeout=None, ) # Log error message to the job driver file for easy access. - if job_error_message: + if target_job_error_message: log_path = self._log_client.get_log_file_path(job_id) os.makedirs(os.path.dirname(log_path), exist_ok=True) with open(log_path, "a") as log_file: - log_file.write(job_error_message) + log_file.write(target_job_error_message) # Log events if self.event_logger: event_log = ( f"Completed a ray job {job_id} with a status {job_status}." 
) - if job_error_message: - event_log += f" {job_error_message}" + if target_job_error_message: + event_log += f" {target_job_error_message}" self.event_logger.error(event_log, submission_id=job_id) else: self.event_logger.info(event_log, submission_id=job_id) + break + # Kill the actor defensively to avoid leaking actors in unexpected error cases. if job_supervisor is not None: ray.kill(job_supervisor, no_restart=True) @@ -575,6 +612,7 @@ async def submit_job( f"Failed to start supervisor actor {submission_id}: '{e}'" f". Full traceback:\n{tb_str}" ), + error_type=JobErrorType.JOB_SUPERVISOR_ACTOR_START_FAILURE, ) finally: close_logger_file_descriptor(driver_logger) diff --git a/python/ray/dashboard/modules/job/job_supervisor.py b/python/ray/dashboard/modules/job/job_supervisor.py index 9b9536a4129e..60766ee93935 100644 --- a/python/ray/dashboard/modules/job/job_supervisor.py +++ b/python/ray/dashboard/modules/job/job_supervisor.py @@ -11,9 +11,10 @@ import ray import ray._private.ray_constants as ray_constants +from ray._common.filters import CoreContextFilter +from ray._common.formatters import JSONFormatter, TextFormatter +from ray._common.network_utils import build_address from ray._private.accelerators.nvidia_gpu import NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR -from ray._private.ray_logging.filters import CoreContextFilter -from ray._private.ray_logging.formatters import JSONFormatter, TextFormatter from ray._private.runtime_env.constants import RAY_JOB_CONFIG_JSON_ENV_VAR from ray._private.utils import remove_ray_internal_flags_from_env from ray._raylet import GcsClient @@ -24,8 +25,7 @@ JobInfoStorageClient, ) from ray.dashboard.modules.job.job_log_storage_client import JobLogStorageClient -from ray.job_submission import JobStatus -from ray._common.network_utils import build_address +from ray.job_submission import JobErrorType, JobStatus import psutil @@ -450,6 +450,7 @@ async def run( JobStatus.FAILED, message=message, driver_exit_code=return_code, + 
error_type=JobErrorType.JOB_ENTRYPOINT_COMMAND_ERROR, ) except Exception: self._logger.error( @@ -461,6 +462,7 @@ async def run( self._job_id, JobStatus.FAILED, message=traceback.format_exc(), + error_type=JobErrorType.JOB_ENTRYPOINT_COMMAND_START_ERROR, ) except Exception: self._logger.error( diff --git a/python/ray/dashboard/modules/job/sdk.py b/python/ray/dashboard/modules/job/sdk.py index f8442e09dbc8..e01c880a0e87 100644 --- a/python/ray/dashboard/modules/job/sdk.py +++ b/python/ray/dashboard/modules/job/sdk.py @@ -46,7 +46,7 @@ class JobSubmissionClient(SubmissionClient): ray.init(), e.g. a Ray Client address (ray://:10001), or "auto", or "localhost:". If unspecified, will try to connect to a running local Ray cluster. This argument is always overridden by the - RAY_ADDRESS environment variable. + RAY_API_SERVER_ADDRESS or RAY_ADDRESS environment variable. create_cluster_if_needed: Indicates whether the cluster at the specified address needs to already be running. Ray doesn't start a cluster before interacting with jobs, but third-party job managers may do so. 
diff --git a/python/ray/dashboard/modules/job/tests/test_cli_integration.py b/python/ray/dashboard/modules/job/tests/test_cli_integration.py index 883671af0415..872a1c823d57 100644 --- a/python/ray/dashboard/modules/job/tests/test_cli_integration.py +++ b/python/ray/dashboard/modules/job/tests/test_cli_integration.py @@ -142,11 +142,38 @@ def test_empty_ray_address(self, ray_start_stop): assert "succeeded" in stdout @pytest.mark.parametrize( - "ray_client_address", ["127.0.0.1:8265", "ray://127.0.0.1:8265"] + "ray_api_server_address,should_fail", + [ + ("http://127.0.0.1:8265", False), # correct API server + ("127.0.0.1:8265", True), # wrong format without http + ("http://127.0.0.1:9999", True), # wrong port + ], ) - def test_ray_client_address(self, ray_start_stop, ray_client_address: str): + def test_ray_api_server_address( + self, + ray_start_stop, + ray_api_server_address: str, + should_fail: bool, + ): + # Set a `RAY_ADDRESS` that would not work with the `ray job submit` CLI because it uses the `ray://` prefix. + # This verifies that the `RAY_API_SERVER_ADDRESS` env var takes precedence. 
+ with set_env_var("RAY_ADDRESS", "ray://127.0.0.1:8265"): + with set_env_var("RAY_API_SERVER_ADDRESS", ray_api_server_address): + _run_cmd("ray job submit -- echo hello", should_fail=should_fail) + + @pytest.mark.parametrize( + "ray_client_address,should_fail", + [ + ("127.0.0.1:8265", True), + ("ray://127.0.0.1:8265", True), + ("http://127.0.0.1:8265", False), + ], + ) + def test_ray_client_address( + self, ray_start_stop, ray_client_address: str, should_fail: bool + ): with set_env_var("RAY_ADDRESS", ray_client_address): - _run_cmd("ray job submit -- echo hello", should_fail=True) + _run_cmd("ray job submit -- echo hello", should_fail=should_fail) def test_valid_http_ray_address(self, ray_start_stop): stdout, _ = _run_cmd("ray job submit -- echo hello") diff --git a/python/ray/dashboard/modules/job/tests/test_common.py b/python/ray/dashboard/modules/job/tests/test_common.py index 1bd9d51b9f87..036ad19386a3 100644 --- a/python/ray/dashboard/modules/job/tests/test_common.py +++ b/python/ray/dashboard/modules/job/tests/test_common.py @@ -5,6 +5,7 @@ from ray.core.generated.gcs_pb2 import JobsAPIInfo from ray.dashboard.modules.job.common import ( + JobErrorType, JobInfo, JobStatus, JobSubmitRequest, @@ -179,7 +180,7 @@ def test_job_info_json_to_proto(): info = JobInfo( status=JobStatus.PENDING, entrypoint="echo hi", - error_type="error_type", + error_type=JobErrorType.JOB_SUPERVISOR_ACTOR_UNSCHEDULABLE, start_time=123, end_time=456, metadata={"hi": "hi2"}, @@ -208,7 +209,7 @@ def test_job_info_json_to_proto(): "(CPUs, GPUs, memory, custom resources) to become available. " "It may be waiting for the runtime environment to be set up." 
) - assert info_proto.error_type == "error_type" + assert info_proto.error_type == "JOB_SUPERVISOR_ACTOR_UNSCHEDULABLE" assert info_proto.driver_agent_http_address == "http://localhost:1234" assert info_proto.driver_node_id == "node_id" diff --git a/python/ray/dashboard/modules/job/tests/test_component_activities.py b/python/ray/dashboard/modules/job/tests/test_component_activities.py index c0bbabca2086..9aac1651b116 100644 --- a/python/ray/dashboard/modules/job/tests/test_component_activities.py +++ b/python/ray/dashboard/modules/job/tests/test_component_activities.py @@ -5,9 +5,9 @@ import jsonschema import pytest -from ray._common.test_utils import wait_for_condition import requests +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( format_web_url, run_string_as_driver, diff --git a/python/ray/dashboard/modules/job/tests/test_http_job_server.py b/python/ray/dashboard/modules/job/tests/test_http_job_server.py index 07151a341038..84d3d09d319f 100644 --- a/python/ray/dashboard/modules/job/tests/test_http_job_server.py +++ b/python/ray/dashboard/modules/job/tests/test_http_job_server.py @@ -1,4 +1,3 @@ -import asyncio import json import logging import os @@ -8,16 +7,15 @@ import tempfile import time from pathlib import Path -from typing import Dict, List, Optional, Union +from typing import Optional from unittest.mock import patch import pytest -from ray._common.test_utils import wait_for_condition import requests import yaml import ray -from ray import NodeID +from ray._common.test_utils import wait_for_condition from ray._private.runtime_env.packaging import ( create_package, download_and_unpack_package, @@ -26,16 +24,10 @@ from ray._private.test_utils import ( chdir, format_web_url, - ray_constants, wait_until_server_available, ) -from ray.dashboard.consts import ( - DASHBOARD_AGENT_ADDR_IP_PREFIX, - DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX, -) from ray.dashboard.modules.dashboard_sdk import ClusterInfo, parse_cluster_info 
from ray.dashboard.modules.job.common import uri_to_http_components -from ray.dashboard.modules.job.job_head import JobHead from ray.dashboard.modules.job.pydantic_models import JobDetails from ray.dashboard.modules.job.tests.test_cli_integration import set_env_var from ray.dashboard.modules.version import CURRENT_VERSION @@ -746,202 +738,6 @@ def test_jobs_env_hook(job_sdk_client: JobSubmissionClient): assert f.read().strip() == "Ray rocks!" -@pytest.mark.asyncio -async def test_job_head_pick_random_job_agent(monkeypatch): - with set_env_var("CANDIDATE_AGENT_NUMBER", "2"): - import importlib - - importlib.reload(ray.dashboard.consts) - - # Fake GCS client - class _FakeGcsClient: - def __init__(self): - self._kv: Dict[bytes, bytes] = {} - - @staticmethod - def ensure_bytes(key: Union[bytes, str]) -> bytes: - return key.encode() if isinstance(key, str) else key - - async def async_internal_kv_put( - self, key: Union[bytes, str], value: bytes, **kwargs - ): - key = self.ensure_bytes(key) - self._kv[key] = value - - async def async_internal_kv_get(self, key: Union[bytes, str], **kwargs): - key = self.ensure_bytes(key) - return self._kv.get(key, None) - - async def async_internal_kv_multi_get( - self, keys: List[Union[bytes, str]], **kwargs - ): - return {key: self.internal_kv_get(key) for key in keys} - - async def async_internal_kv_del(self, key: Union[bytes, str], **kwargs): - key = self.ensure_bytes(key) - self._kv.pop(key) - - async def async_internal_kv_keys(self, prefix: Union[bytes, str], **kwargs): - prefix = self.ensure_bytes(prefix) - return [key for key in self._kv.keys() if key.startswith(prefix)] - - class MockJobHead(JobHead): - def __init__(self): - self._agents = dict() - self._gcs_client = _FakeGcsClient() - - @property - def gcs_client(self): - # Overrides JobHead.gcs_client - return self._gcs_client - - job_head = MockJobHead() - job_head._gcs_client = _FakeGcsClient() - - async def add_agent(agent): - node_id = agent[0] - node_ip = 
agent[1]["ipAddress"] - http_port = agent[1]["httpPort"] - grpc_port = agent[1]["grpcPort"] - - await job_head._gcs_client.async_internal_kv_put( - f"{DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX}{node_id.hex()}".encode(), - json.dumps([node_ip, http_port, grpc_port]).encode(), - namespace=ray_constants.KV_NAMESPACE_DASHBOARD, - ) - await job_head._gcs_client.async_internal_kv_put( - f"{DASHBOARD_AGENT_ADDR_IP_PREFIX}{node_ip}".encode(), - json.dumps([node_id.hex(), http_port, grpc_port]).encode(), - namespace=ray_constants.KV_NAMESPACE_DASHBOARD, - ) - - async def del_agent(agent): - node_id = agent[0] - node_ip = agent[1]["ipAddress"] - await job_head._gcs_client.async_internal_kv_del( - f"{DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX}{node_id.hex()}".encode(), - namespace=ray_constants.KV_NAMESPACE_DASHBOARD, - ) - await job_head._gcs_client.async_internal_kv_del( - f"{DASHBOARD_AGENT_ADDR_IP_PREFIX}{node_ip}".encode(), - namespace=ray_constants.KV_NAMESPACE_DASHBOARD, - ) - - head_node_id = NodeID.from_random() - await job_head._gcs_client.async_internal_kv_put( - ray_constants.KV_HEAD_NODE_ID_KEY, - head_node_id.hex().encode(), - namespace=ray_constants.KV_NAMESPACE_JOB, - ) - - agent_1 = ( - head_node_id, - dict( - ipAddress="1.1.1.1", - httpPort=1, - grpcPort=1, - httpAddress="1.1.1.1:1", - ), - ) - agent_2 = ( - NodeID.from_random(), - dict( - ipAddress="2.2.2.2", - httpPort=2, - grpcPort=2, - httpAddress="2.2.2.2:2", - ), - ) - agent_3 = ( - NodeID.from_random(), - dict( - ipAddress="3.3.3.3", - httpPort=3, - grpcPort=3, - httpAddress="3.3.3.3:3", - ), - ) - - # Disable Head-node routing for the Ray job critical ops (enabling - # random agent sampling) - monkeypatch.setattr( - f"{JobHead.__module__}.RAY_JOB_AGENT_USE_HEAD_NODE_ONLY", False - ) - - # Check only 1 agent present, only agent being returned - await add_agent(agent_1) - job_agent_client = await job_head.get_target_agent() - assert job_agent_client._agent_address == "http://1.1.1.1:1" - - # Remove only agent, no 
agents present, should time out - await del_agent(agent_1) - with pytest.raises(asyncio.TimeoutError): - await asyncio.wait_for(job_head.get_target_agent(), timeout=3) - - # Enable Head-node routing for the Ray job critical ops (disabling - # random agent sampling) - monkeypatch.setattr( - f"{JobHead.__module__}.RAY_JOB_AGENT_USE_HEAD_NODE_ONLY", True - ) - - # Add 3 agents - await add_agent(agent_1) - await add_agent(agent_2) - await add_agent(agent_3) - - # Make sure returned agent is a head-node - # NOTE: We run 3 tims to make sure we're not hitting branch probabilistically - for _ in range(3): - job_agent_client = await job_head.get_target_agent() - assert job_agent_client._agent_address == "http://1.1.1.1:1" - - # Disable Head-node routing for the Ray job critical ops (enabling - # random agent sampling) - monkeypatch.setattr( - f"{JobHead.__module__}.RAY_JOB_AGENT_USE_HEAD_NODE_ONLY", False - ) - - # Theoretically, the probability of failure is 1/3^100 - addresses_1 = set() - for address in range(100): - job_agent_client = await job_head.get_target_agent() - addresses_1.add(job_agent_client._agent_address) - assert len(addresses_1) == 2 - addresses_2 = set() - for address in range(100): - job_agent_client = await job_head.get_target_agent() - addresses_2.add(job_agent_client._agent_address) - assert len(addresses_2) == 2 and addresses_1 == addresses_2 - - for agent in [agent_1, agent_2, agent_3]: - if f"http://{agent[1]['httpAddress']}" in addresses_2: - break - await del_agent(agent) - - # Theoretically, the probability of failure is 1/2^100 - addresses_3 = set() - for address in range(100): - job_agent_client = await job_head.get_target_agent() - addresses_3.add(job_agent_client._agent_address) - assert len(addresses_3) == 2 - assert addresses_2 - addresses_3 == {f"http://{agent[1]['httpAddress']}"} - addresses_4 = set() - for address in range(100): - job_agent_client = await job_head.get_target_agent() - addresses_4.add(job_agent_client._agent_address) - 
assert addresses_4 == addresses_3 - - for agent in [agent_1, agent_2, agent_3]: - if f"http://{agent[1]['httpAddress']}" in addresses_4: - break - await del_agent(agent) - address = None - for _ in range(3): - job_agent_client = await job_head.get_target_agent() - assert address is None or address == job_agent_client._agent_address - address = job_agent_client._agent_address - - @pytest.mark.asyncio async def test_get_upload_package(ray_start_context, tmp_path): assert wait_until_server_available(ray_start_context["webui_url"]) diff --git a/python/ray/dashboard/modules/job/tests/test_job_agent.py b/python/ray/dashboard/modules/job/tests/test_job_agent.py index 7922c1cdff97..3a1c012d7030 100644 --- a/python/ray/dashboard/modules/job/tests/test_job_agent.py +++ b/python/ray/dashboard/modules/job/tests/test_job_agent.py @@ -9,11 +9,12 @@ import pytest import pytest_asyncio -from ray._common.test_utils import async_wait_for_condition, wait_for_condition import requests import yaml import ray +from ray._common.network_utils import build_address +from ray._common.test_utils import async_wait_for_condition, wait_for_condition from ray._common.utils import get_or_create_event_loop from ray._private.ray_constants import DEFAULT_DASHBOARD_AGENT_LISTEN_PORT from ray._private.runtime_env.py_modules import upload_py_modules_if_needed @@ -25,7 +26,6 @@ run_string_as_driver_nonblocking, wait_until_server_available, ) -from ray._common.network_utils import parse_address, build_address from ray.dashboard.modules.job.common import ( JOB_ACTOR_NAME_TEMPLATE, SUPERVISOR_ACTOR_RAY_NAMESPACE, @@ -77,8 +77,8 @@ def __init__(self, *args, **kwargs): @pytest_asyncio.fixture async def job_sdk_client(make_sure_dashboard_http_port_unused): with _ray_start(include_dashboard=True, num_cpus=1) as ctx: - ip, _ = parse_address(ctx.address_info["webui_url"]) - agent_address = build_address(ip, DEFAULT_DASHBOARD_AGENT_LISTEN_PORT) + node_ip = ctx.address_info["node_ip_address"] + agent_address = 
build_address(node_ip, DEFAULT_DASHBOARD_AGENT_LISTEN_PORT) assert wait_until_server_available(agent_address) head_address = ctx.address_info["webui_url"] assert wait_until_server_available(head_address) @@ -469,8 +469,8 @@ async def test_job_log_in_multiple_node( dashboard_agent_listen_port=DEFAULT_DASHBOARD_AGENT_LISTEN_PORT + 2 ) - ip, _ = parse_address(cluster.webui_url) - agent_address = build_address(ip, DEFAULT_DASHBOARD_AGENT_LISTEN_PORT) + node_ip = cluster.head_node.node_ip_address + agent_address = build_address(node_ip, DEFAULT_DASHBOARD_AGENT_LISTEN_PORT) assert wait_until_server_available(agent_address) client = JobAgentSubmissionClient(format_web_url(agent_address)) @@ -595,18 +595,18 @@ async def test_non_default_dashboard_agent_http_port(tmp_path): """ import subprocess - cmd = ( - "ray start --head " f"--dashboard-agent-listen-port {get_current_unused_port()}" - ) + dashboard_agent_port = get_current_unused_port() + cmd = "ray start --head " f"--dashboard-agent-listen-port {dashboard_agent_port}" subprocess.check_output(cmd, shell=True) try: # We will need to wait for the ray to be started in the subprocess. 
address_info = ray.init("auto", ignore_reinit_error=True).address_info - ip, _ = parse_address(address_info["webui_url"]) + node_ip = address_info["node_ip_address"] + dashboard_agent_listen_port = address_info["dashboard_agent_listen_port"] - agent_address = build_address(ip, dashboard_agent_listen_port) + agent_address = build_address(node_ip, dashboard_agent_listen_port) print("agent address = ", agent_address) agent_client = JobAgentSubmissionClient(format_web_url(agent_address)) diff --git a/python/ray/dashboard/modules/job/tests/test_job_manager.py b/python/ray/dashboard/modules/job/tests/test_job_manager.py index 4481acb667b8..8f6c37d73d13 100644 --- a/python/ray/dashboard/modules/job/tests/test_job_manager.py +++ b/python/ray/dashboard/modules/job/tests/test_job_manager.py @@ -10,17 +10,18 @@ import pytest import ray -from ray._common.test_utils import SignalActor, wait_for_condition +from ray._common.network_utils import build_address +from ray._common.test_utils import ( + SignalActor, + async_wait_for_condition, + wait_for_condition, +) from ray._private.ray_constants import ( DEFAULT_DASHBOARD_AGENT_LISTEN_PORT, KV_HEAD_NODE_ID_KEY, KV_NAMESPACE_JOB, RAY_ADDRESS_ENVIRONMENT_VARIABLE, ) -from ray._common.network_utils import build_address -from ray._common.test_utils import ( - async_wait_for_condition, -) from ray.dashboard.consts import ( RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES_ENV_VAR, RAY_JOB_START_TIMEOUT_SECONDS_ENV_VAR, @@ -37,7 +38,7 @@ create_job_manager, create_ray_cluster, ) -from ray.job_submission import JobStatus +from ray.job_submission import JobErrorType, JobStatus from ray.tests.conftest import call_ray_start # noqa: F401 from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy # noqa: F401 from ray.util.state import list_tasks @@ -350,14 +351,46 @@ async def test_runtime_env_setup_logged_to_job_driver_logs( assert start_message in logs -@pytest.fixture(scope="module") +@pytest.mark.asyncio +@pytest.mark.parametrize( + 
"call_ray_start", + [ + { + "cmd": "ray start --head", + "env": { + "RAY_testing_rpc_failure": "ray::rpc::InternalKVGcsService.grpc_client.InternalKVGet=2:50:50,CoreWorkerService.grpc_client.PushTask=3:50:50" + }, + }, + ], + indirect=True, +) +async def test_job_manager_network_fault_tolerance( + call_ray_start, tmp_path # noqa: F811 +): + """Test that the job manager is tolerant to transient network failures + when making RPCs to GCS and supervisor actor.""" + + ray.init(address=call_ray_start) + gcs_client = ray._private.worker.global_worker.gcs_client + job_manager = JobManager(gcs_client, tmp_path) + + job_id = await job_manager.submit_job( + entrypoint="echo hello 1", + ) + await async_wait_for_condition( + check_job_succeeded, job_manager=job_manager, job_id=job_id + ) + + +@pytest.fixture def shared_ray_instance(): # Remove ray address for test ray cluster in case we have # lingering RAY_ADDRESS="http://127.0.0.1:8265" from previous local job # submissions. old_ray_address = os.environ.pop(RAY_ADDRESS_ENVIRONMENT_VARIABLE, None) - yield create_ray_cluster() + with create_ray_cluster() as cluster: + yield cluster if old_ray_address is not None: os.environ[RAY_ADDRESS_ENVIRONMENT_VARIABLE] = old_ray_address @@ -365,7 +398,10 @@ def shared_ray_instance(): @pytest.fixture def job_manager(shared_ray_instance, tmp_path): - yield create_job_manager(shared_ray_instance, tmp_path) + job_manager = create_job_manager(shared_ray_instance, tmp_path) + job_manager.JOB_MONITOR_LOOP_PERIOD_S = 0.01 + + yield job_manager async def _run_hanging_command(job_manager, tmp_dir, start_signal_actor=None): @@ -400,7 +436,14 @@ async def _run_hanging_command(job_manager, tmp_dir, start_signal_actor=None): async def check_job_succeeded(job_manager, job_id): - data = await job_manager.get_job_info(job_id) + return await _check_job_succeeded( + get_job_info=job_manager.get_job_info, job_id=job_id + ) + + +async def _check_job_succeeded(*, get_job_info, job_id: str): + data = await 
get_job_info(job_id) + status = data.status if status == JobStatus.FAILED: raise RuntimeError(f"Job failed! {data.message}") @@ -412,9 +455,20 @@ async def check_job_succeeded(job_manager, job_id): return status == JobStatus.SUCCEEDED -async def check_job_failed(job_manager, job_id): - status = await job_manager.get_job_status(job_id) +async def check_job_failed(job_manager, job_id, expected_error_type=None): + return await _check_job_failed( + get_job_info=job_manager.get_job_info, + job_id=job_id, + expected_error_type=expected_error_type, + ) + + +async def _check_job_failed(*, get_job_info, job_id: str, expected_error_type=None): + data = await get_job_info(job_id) + status = data.status assert status in {JobStatus.PENDING, JobStatus.RUNNING, JobStatus.FAILED} + if expected_error_type: + assert data.error_type == expected_error_type return status == JobStatus.FAILED @@ -720,7 +774,10 @@ async def test_failed_runtime_env_setup(self, job_manager): ) await async_wait_for_condition( - check_job_failed, job_manager=job_manager, job_id=job_id + check_job_failed, + job_manager=job_manager, + job_id=job_id, + expected_error_type=JobErrorType.RUNTIME_ENV_SETUP_FAILURE, ) data = await job_manager.get_job_info(job_id) @@ -880,7 +937,10 @@ async def test_kill_job_actor_in_before_driver_finish(self, job_manager): actor = job_manager._get_actor_for_job(job_id) ray.kill(actor, no_restart=True) await async_wait_for_condition( - check_job_failed, job_manager=job_manager, job_id=job_id + check_job_failed, + job_manager=job_manager, + job_id=job_id, + expected_error_type=JobErrorType.JOB_SUPERVISOR_ACTOR_DIED, ) data = await job_manager.get_job_info(job_id) assert data.driver_exit_code is None @@ -934,10 +994,18 @@ async def test_kill_job_actor_in_pending(self, job_manager): actor = job_manager._get_actor_for_job(job_id) ray.kill(actor, no_restart=True) await async_wait_for_condition( - check_job_failed, job_manager=job_manager, job_id=job_id + check_job_failed, + 
job_manager=job_manager, + job_id=job_id, + expected_error_type=JobErrorType.JOB_SUPERVISOR_ACTOR_DIED, ) data = await job_manager.get_job_info(job_id) + assert data.driver_exit_code is None + assert data.message.startswith( + "Job supervisor actor died: The actor died unexpectedly before " + "finishing this task" + ) async def test_stop_job_subprocess_cleanup_upon_stop(self, job_manager): """ @@ -1040,7 +1108,10 @@ async def test_failed_job(self, job_manager): print(lines, end="") await async_wait_for_condition( - check_job_failed, job_manager=job_manager, job_id=job_id + check_job_failed, + job_manager=job_manager, + job_id=job_id, + expected_error_type=JobErrorType.JOB_ENTRYPOINT_COMMAND_ERROR, ) # check if the driver is killed data = await job_manager.get_job_info(job_id) @@ -1255,7 +1326,10 @@ async def test_failed_job_logs_max_char(job_manager): ) await async_wait_for_condition( - check_job_failed, job_manager=job_manager, job_id=job_id + check_job_failed, + job_manager=job_manager, + job_id=job_id, + expected_error_type=JobErrorType.JOB_ENTRYPOINT_COMMAND_ERROR, ) # Verify the status message length @@ -1309,6 +1383,44 @@ async def test_monitor_job_pending(job_manager): ) +@pytest.mark.asyncio +@pytest.mark.parametrize( + "call_ray_start", + ["ray start --head --num-cpus=1"], + indirect=True, +) +async def test_job_timeout_lack_of_entrypoint_resources( + call_ray_start, tmp_path, monkeypatch # noqa: F811 +): + """Test the timeout when there are not enough resources to schedule the supervisor actor)""" + + monkeypatch.setenv(RAY_JOB_START_TIMEOUT_SECONDS_ENV_VAR, "1") + + ray.init(address=call_ray_start) + gcs_client = ray._private.worker.global_worker.gcs_client + job_manager = JobManager(gcs_client, tmp_path) + + # Submit a job with unsatisfied resource. + job_id = await job_manager.submit_job( + entrypoint="echo 'hello world'", + entrypoint_num_cpus=2, + ) + + # Wait for the job to timeout. 
+ await async_wait_for_condition( + check_job_failed, + job_manager=job_manager, + job_id=job_id, + expected_error_type=JobErrorType.JOB_SUPERVISOR_ACTOR_START_TIMEOUT, + ) + + # Check that the job timed out. + job_info = await job_manager.get_job_info(job_id) + assert job_info.status == JobStatus.FAILED + assert "Job supervisor actor failed to start within" in job_info.message + assert job_info.driver_exit_code is None + + @pytest.mark.asyncio async def test_job_pending_timeout(job_manager, monkeypatch): """Test the timeout for pending jobs.""" @@ -1330,7 +1442,10 @@ async def test_job_pending_timeout(job_manager, monkeypatch): # Wait for the job to timeout. await async_wait_for_condition( - check_job_failed, job_manager=job_manager, job_id=job_id + check_job_failed, + job_manager=job_manager, + job_id=job_id, + expected_error_type=JobErrorType.JOB_SUPERVISOR_ACTOR_START_TIMEOUT, ) # Check that the job timed out. @@ -1355,7 +1470,10 @@ async def test_failed_driver_exit_code(job_manager): job_id = await job_manager.submit_job(entrypoint=exit_code_cmd) # Wait for the job to timeout. 
await async_wait_for_condition( - check_job_failed, job_manager=job_manager, job_id=job_id + check_job_failed, + job_manager=job_manager, + job_id=job_id, + expected_error_type=JobErrorType.JOB_ENTRYPOINT_COMMAND_ERROR, ) # Check that the job failed diff --git a/python/ray/dashboard/modules/job/tests/test_sdk.py b/python/ray/dashboard/modules/job/tests/test_sdk.py index 41507e3530de..fdbffad0654b 100644 --- a/python/ray/dashboard/modules/job/tests/test_sdk.py +++ b/python/ray/dashboard/modules/job/tests/test_sdk.py @@ -8,11 +8,9 @@ import pytest -import ray -from ray._common.test_utils import wait_for_condition import ray.experimental.internal_kv as kv +from ray._common.test_utils import wait_for_condition from ray._private.ray_constants import ( - DEFAULT_DASHBOARD_AGENT_LISTEN_PORT, KV_NAMESPACE_DASHBOARD, ) from ray._private.test_utils import ( @@ -37,8 +35,6 @@ from ray.tests.conftest import _ray_start from ray.util.state import list_nodes -import psutil - def _check_job_succeeded(client: JobSubmissionClient, job_id: str) -> bool: status = client.get_job_status(job_id) @@ -166,13 +162,6 @@ def test_temporary_uri_reference(monkeypatch, expiration_s): print("Internal KV was GC'ed at time ", time.time() - start) -@pytest.fixture -def mock_candidate_number(): - os.environ["CANDIDATE_AGENT_NUMBER"] = "2" - yield - os.environ.pop("CANDIDATE_AGENT_NUMBER", None) - - def get_register_agents_number(gcs_client): keys = gcs_client.internal_kv_keys( prefix=DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX, @@ -182,132 +171,6 @@ def get_register_agents_number(gcs_client): return len(keys) -@pytest.mark.parametrize( - "ray_start_cluster_head_with_env_vars", - [ - { - "include_dashboard": True, - "env_vars": { - "CANDIDATE_AGENT_NUMBER": "2", - RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES_ENV_VAR: "1", - "RAY_health_check_initial_delay_ms": "0", - "RAY_health_check_period_ms": "1000", - "RAY_JOB_AGENT_USE_HEAD_NODE_ONLY": "0", - }, - } - ], - indirect=True, -) -def 
test_job_head_choose_job_agent_E2E(ray_start_cluster_head_with_env_vars): - cluster = ray_start_cluster_head_with_env_vars - assert wait_until_server_available(cluster.webui_url) is True - webui_url = cluster.webui_url - webui_url = format_web_url(webui_url) - client = JobSubmissionClient(webui_url) - gcs_client = GcsClient(address=cluster.gcs_address) - - def submit_job_and_wait_finish(): - submission_id = client.submit_job(entrypoint="echo hello") - - wait_for_condition( - _check_job_succeeded, client=client, job_id=submission_id, timeout=30 - ) - - head_http_port = DEFAULT_DASHBOARD_AGENT_LISTEN_PORT - worker_1_http_port = 52366 - cluster.add_node(dashboard_agent_listen_port=worker_1_http_port) - wait_for_condition(lambda: get_register_agents_number(gcs_client) == 2, timeout=20) - assert len(cluster.worker_nodes) == 1 - node_try_to_kill = list(cluster.worker_nodes)[0] - - def make_sure_worker_node_run_job(port): - actors = ray.state.actors() - - def _kill_all_driver(): - for _, actor_info in actors.items(): - if actor_info["State"] != "ALIVE": - continue - if actor_info["Name"].startswith("_ray_internal_job_actor"): - proc = psutil.Process(actor_info["Pid"]) - try: - proc.kill() - except Exception: - pass - - try: - for _, actor_info in actors.items(): - if actor_info["State"] != "ALIVE": - continue - if actor_info["Name"].startswith("_ray_internal_job_actor"): - proc = psutil.Process(actor_info["Pid"]) - parent_proc = proc.parent() - if f"--listen-port={port}" in " ".join(parent_proc.cmdline()): - _kill_all_driver() - return True - except Exception as ex: - print("Got exception:", ex) - raise - client.submit_job(entrypoint="sleep 3600") - return False - - # Make `list(cluster.worker_nodes)[0]` and head node called at least once - wait_for_condition( - lambda: make_sure_worker_node_run_job(worker_1_http_port), timeout=60 - ) - wait_for_condition( - lambda: make_sure_worker_node_run_job(head_http_port), timeout=60 - ) - - worker_2_http_port = 52367 - 
cluster.add_node(dashboard_agent_listen_port=worker_2_http_port) - wait_for_condition(lambda: get_register_agents_number(gcs_client) == 3, timeout=20) - - # The third `JobAgent` will not be called here. - submit_job_and_wait_finish() - submit_job_and_wait_finish() - submit_job_and_wait_finish() - - def get_all_new_supervisor_actor_info(old_supervisor_actor_ids): - all_actors = ray.state.state.actor_table(None) - res = dict() - for actor_id, actor_info in all_actors.items(): - if actor_id in old_supervisor_actor_ids: - continue - if not actor_info["Name"].startswith("_ray_internal_job_actor"): - continue - res[actor_id] = actor_info - return res - - old_supervisor_actor_ids = set() - new_supervisor_actor = get_all_new_supervisor_actor_info(old_supervisor_actor_ids) - new_owner_port = set() - for actor_id, actor_info in new_supervisor_actor.items(): - old_supervisor_actor_ids.add(actor_id) - new_owner_port.add(actor_info["OwnerAddress"]["Port"]) - - assert len(new_owner_port) == 2 - old_owner_port = new_owner_port - - node_try_to_kill.kill_raylet() - - # make sure the head updates the info of the dead node. - wait_for_condition(lambda: get_register_agents_number(gcs_client) == 2, timeout=20) - - # Make sure the third JobAgent will be called here. 
- wait_for_condition( - lambda: make_sure_worker_node_run_job(worker_2_http_port), timeout=60 - ) - - new_supervisor_actor = get_all_new_supervisor_actor_info(old_supervisor_actor_ids) - new_owner_port = set() - for actor_id, actor_info in new_supervisor_actor.items(): - old_supervisor_actor_ids.add(actor_id) - new_owner_port.add(actor_info["OwnerAddress"]["Port"]) - assert len(new_owner_port) == 2 - assert len(old_owner_port - new_owner_port) == 1 - assert len(new_owner_port - old_owner_port) == 1 - - @pytest.mark.parametrize( "ray_start_cluster_head_with_env_vars", [ diff --git a/python/ray/dashboard/modules/log/log_manager.py b/python/ray/dashboard/modules/log/log_manager.py index 783ee1c3a760..60f503888ffb 100644 --- a/python/ray/dashboard/modules/log/log_manager.py +++ b/python/ray/dashboard/modules/log/log_manager.py @@ -231,7 +231,7 @@ async def _resolve_actor_filename( "Actor is not scheduled yet." ) worker_id = WorkerID(worker_id_binary) - node_id_binary = actor_data.address.raylet_id + node_id_binary = actor_data.address.node_id if not node_id_binary: raise ValueError( f"Node ID for Actor ID {actor_id} not found. " diff --git a/python/ray/dashboard/modules/metrics/dashboards/common.py b/python/ray/dashboard/modules/metrics/dashboards/common.py index 4e6b82a21330..a4a1e4b4787e 100644 --- a/python/ray/dashboard/modules/metrics/dashboards/common.py +++ b/python/ray/dashboard/modules/metrics/dashboards/common.py @@ -2,7 +2,10 @@ from enum import Enum from typing import List, Optional +from ray.util.annotations import DeveloperAPI + +@DeveloperAPI @dataclass class GridPos: x: int @@ -30,11 +33,13 @@ class GridPos: } +@DeveloperAPI class TargetTemplate(Enum): GRAPH = GRAPH_TARGET_TEMPLATE HEATMAP = HEATMAP_TARGET_TEMPLATE +@DeveloperAPI @dataclass class Target: """Defines a Grafana target (time-series query) within a panel. 
@@ -360,6 +365,7 @@ class Target: } +@DeveloperAPI class PanelTemplate(Enum): GRAPH = GRAPH_PANEL_TEMPLATE HEATMAP = HEATMAP_TEMPLATE @@ -368,6 +374,7 @@ class PanelTemplate(Enum): GAUGE = GAUGE_PANEL_TEMPLATE +@DeveloperAPI @dataclass class Panel: """Defines a Grafana panel (graph) for the Ray dashboard page. @@ -397,6 +404,7 @@ class Panel: template: Optional[PanelTemplate] = PanelTemplate.GRAPH +@DeveloperAPI @dataclass class Row: """Defines a Grafana row that can contain multiple panels. @@ -413,6 +421,7 @@ class Row: collapsed: bool = False +@DeveloperAPI @dataclass class DashboardConfig: # This dashboard name is an internal key used to determine which env vars diff --git a/python/ray/dashboard/modules/metrics/dashboards/data_dashboard_panels.py b/python/ray/dashboard/modules/metrics/dashboards/data_dashboard_panels.py index 47c52d65c57d..b9b9e14abc17 100644 --- a/python/ray/dashboard/modules/metrics/dashboards/data_dashboard_panels.py +++ b/python/ray/dashboard/modules/metrics/dashboards/data_dashboard_panels.py @@ -3,6 +3,7 @@ from ray.dashboard.modules.metrics.dashboards.common import ( DashboardConfig, Panel, + Row, Target, ) @@ -15,7 +16,7 @@ # targets=[ # Target( # expr=f"sum(ray_data_{metric.name}" -# + "{{{global_filters}}}) by (dataset, operator)", +# + "{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)", # legend=legend, # ) # ], @@ -24,766 +25,961 @@ # ) -DATA_GRAFANA_PANELS = [ - # Ray Data Metrics (Overview) - Panel( - id=1, - title="Bytes Spilled", - description="Amount spilled by dataset operators. 
DataContext.enable_get_object_locations_for_metrics must be set to True to report this metric", - unit="bytes", - targets=[ - Target( - expr="sum(ray_data_spilled_bytes{{{global_filters}}}) by (dataset, operator)", - legend="Bytes Spilled: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=3, - title="Bytes Freed", - description="Amount freed by dataset operators.", - unit="bytes", - targets=[ - Target( - expr="sum(ray_data_freed_bytes{{{global_filters}}}) by (dataset, operator)", - legend="Bytes Freed: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=4, - title="Object Store Memory", - description="Amount of memory store used by dataset operators.", - unit="bytes", - targets=[ - Target( - expr="sum(ray_data_current_bytes{{{global_filters}}}) by (dataset, operator)", - legend="Current Usage: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=5, - title="CPUs (logical slots)", - description="Logical CPUs allocated to dataset operators.", - unit="cores", - targets=[ - Target( - expr="sum(ray_data_cpu_usage_cores{{{global_filters}}}) by (dataset, operator)", - legend="CPU Usage: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=6, - title="GPUs (logical slots)", - description="Logical GPUs allocated to dataset operators.", - unit="cores", - targets=[ - Target( - expr="sum(ray_data_gpu_usage_cores{{{global_filters}}}) by (dataset, operator)", - legend="GPU Usage: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=7, - title="Bytes Output / Second", - description="Bytes output per second by dataset operators.", - unit="Bps", - targets=[ - Target( - expr="sum(rate(ray_data_output_bytes{{{global_filters}}}[1m])) by (dataset, operator)", - legend="Bytes Output / Second: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=55, - title="Max Bytes to Read", - description="Maximum bytes to read from 
streaming generator buffer.", - unit="bytes", - targets=[ - Target( - expr="sum(ray_data_max_bytes_to_read{{{global_filters}}}) by (dataset, operator)", - legend="Max Bytes to Read: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=11, - title="Rows Output / Second", - description="Total rows output per second by dataset operators.", - unit="rows/sec", - targets=[ - Target( - expr="sum(rate(ray_data_output_rows{{{global_filters}}}[1m])) by (dataset, operator)", - legend="Rows Output / Second: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - # Ray Data Metrics (Inputs) - Panel( - id=17, - title="Input Blocks Received by Operator / Second", - description="Number of input blocks received by operator per second.", - unit="blocks/sec", - targets=[ - Target( - expr="sum(rate(ray_data_num_inputs_received{{{global_filters}}}[1m])) by (dataset, operator)", - legend="Blocks Received / Second: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=18, - title="Input Bytes Received by Operator / Second", - description="Byte size of input blocks received by operator per second.", - unit="Bps", - targets=[ - Target( - expr="sum(rate(ray_data_bytes_inputs_received{{{global_filters}}}[1m])) by (dataset, operator)", - legend="Bytes Received / Second: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=19, - title="Input Blocks Processed by Tasks / Second", - description=( - "Number of input blocks that operator's tasks have finished processing per second." - ), - unit="blocks/sec", - targets=[ - Target( - expr="sum(rate(ray_data_num_task_inputs_processed{{{global_filters}}}[1m])) by (dataset, operator)", - legend="Blocks Processed / Second: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=20, - title="Input Bytes Processed by Tasks / Second", - description=( - "Byte size of input blocks that operator's tasks have finished processing per second." 
+# Ray Data Metrics (Overview) +BYTES_SPILLED_PANEL = Panel( + id=1, + title="Bytes Spilled", + description="Amount spilled by dataset operators. DataContext.enable_get_object_locations_for_metrics must be set to True to report this metric", + unit="bytes", + targets=[ + Target( + expr='sum(ray_data_spilled_bytes{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Bytes Spilled: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +BYTES_FREED_PANEL = Panel( + id=3, + title="Bytes Freed", + description="Amount freed by dataset operators.", + unit="bytes", + targets=[ + Target( + expr='sum(ray_data_freed_bytes{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Bytes Freed: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +OBJECT_STORE_MEMORY_PANEL = Panel( + id=4, + title="Object Store Memory", + description="Amount of memory store used by dataset operators.", + unit="bytes", + targets=[ + Target( + expr='sum(ray_data_current_bytes{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Current Usage: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +CPU_USAGE_PANEL = Panel( + id=5, + title="Logical Slots Being Used (CPU)", + description="Logical CPUs currently being used by dataset operators.", + unit="cores", + targets=[ + Target( + expr='sum(ray_data_cpu_usage_cores{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="CPU Usage: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +GPU_USAGE_PANEL = Panel( + id=6, + title="Logical Slots Being Used (GPU)", + description="Logical GPUs currently being used by dataset operators.", + unit="cores", + targets=[ + Target( + expr='sum(ray_data_gpu_usage_cores{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="GPU Usage: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +BYTES_OUTPUT_PER_SECOND_PANEL = Panel( + 
id=7, + title="Bytes Output / Second", + description="Bytes output per second by dataset operators.", + unit="Bps", + targets=[ + Target( + expr='sum(rate(ray_data_output_bytes{{{global_filters}, operator=~"$Operator"}}[1m])) by (dataset, operator)', + legend="Bytes Output / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +ROWS_OUTPUT_PER_SECOND_PANEL = Panel( + id=11, + title="Rows Output / Second", + description="Total rows output per second by dataset operators.", + unit="rows/sec", + targets=[ + Target( + expr='sum(rate(ray_data_output_rows{{{global_filters}, operator=~"$Operator"}}[1m])) by (dataset, operator)', + legend="Rows Output / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +# Ray Data Metrics (Inputs) +INPUT_BLOCKS_RECEIVED_PANEL = Panel( + id=17, + title="Input Blocks Received by Operator / Second", + description="Number of input blocks received by operator per second.", + unit="blocks/sec", + targets=[ + Target( + expr='sum(rate(ray_data_num_inputs_received{{{global_filters}, operator=~"$Operator"}}[1m])) by (dataset, operator)', + legend="Blocks Received / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +INPUT_BYTES_RECEIVED_PANEL = Panel( + id=18, + title="Input Bytes Received by Operator / Second", + description="Byte size of input blocks received by operator per second.", + unit="Bps", + targets=[ + Target( + expr='sum(rate(ray_data_bytes_inputs_received{{{global_filters}, operator=~"$Operator"}}[1m])) by (dataset, operator)', + legend="Bytes Received / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +INPUT_BLOCKS_PROCESSED_PANEL = Panel( + id=19, + title="Input Blocks Processed by Tasks / Second", + description=( + "Number of input blocks that operator's tasks have finished processing per second." 
+ ), + unit="blocks/sec", + targets=[ + Target( + expr='sum(rate(ray_data_num_task_inputs_processed{{{global_filters}, operator=~"$Operator"}}[1m])) by (dataset, operator)', + legend="Blocks Processed / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +INPUT_BYTES_PROCESSED_PANEL = Panel( + id=20, + title="Input Bytes Processed by Tasks / Second", + description=( + "Byte size of input blocks that operator's tasks have finished processing per second." + ), + unit="Bps", + targets=[ + Target( + expr='sum(rate(ray_data_bytes_task_inputs_processed{{{global_filters}, operator=~"$Operator"}}[1m])) by (dataset, operator)', + legend="Bytes Processed / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +INPUT_BYTES_SUBMITTED_PANEL = Panel( + id=21, + title="Input Bytes Submitted to Tasks / Second", + description="Byte size of input blocks passed to submitted tasks per second.", + unit="Bps", + targets=[ + Target( + expr='sum(rate(ray_data_bytes_inputs_of_submitted_tasks{{{global_filters}, operator=~"$Operator"}}[1m])) by (dataset, operator)', + legend="Bytes Submitted / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +# Ray Data Metrics (Outputs) +BLOCKS_GENERATED_PANEL = Panel( + id=22, + title="Blocks Generated by Tasks / Second", + description="Number of output blocks generated by tasks per second.", + unit="blocks/sec", + targets=[ + Target( + expr='sum(rate(ray_data_num_task_outputs_generated{{{global_filters}, operator=~"$Operator"}}[1m])) by (dataset, operator)', + legend="Blocks Generated / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +BYTES_GENERATED_PANEL = Panel( + id=23, + title="Bytes Generated by Tasks / Second", + description="Byte size of output blocks generated by tasks per second.", + unit="Bps", + targets=[ + Target( + expr='sum(rate(ray_data_bytes_task_outputs_generated{{{global_filters}, operator=~"$Operator"}}[1m])) by (dataset, operator)', + 
legend="Bytes Generated / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +ROWS_GENERATED_PANEL = Panel( + id=24, + title="Rows Generated by Tasks / Second", + description="Number of rows in generated output blocks from finished tasks per second.", + unit="rows/sec", + targets=[ + Target( + expr='sum(rate(ray_data_rows_task_outputs_generated{{{global_filters}, operator=~"$Operator"}}[1m])) by (dataset, operator)', + legend="Rows Generated / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +OUTPUT_BLOCKS_TAKEN_PANEL = Panel( + id=25, + title="Output Blocks Taken by Downstream Operators / Second", + description="Number of output blocks taken by downstream operators per second.", + unit="blocks/sec", + targets=[ + Target( + expr='sum(rate(ray_data_num_outputs_taken{{{global_filters}, operator=~"$Operator"}}[1m])) by (dataset, operator)', + legend="Blocks Taken / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +OUTPUT_BYTES_TAKEN_PANEL = Panel( + id=26, + title="Output Bytes Taken by Downstream Operators / Second", + description=( + "Byte size of output blocks taken by downstream operators per second." 
+ ), + unit="Bps", + targets=[ + Target( + expr='sum(rate(ray_data_bytes_outputs_taken{{{global_filters}, operator=~"$Operator"}}[1m])) by (dataset, operator)', + legend="Bytes Taken / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +AVERAGE_BYTES_PER_BLOCK_PANEL = Panel( + id=49, + title="Average Bytes Generated / Output Block", + description="Average byte size of output blocks generated by tasks.", + unit="bytes", + targets=[ + Target( + expr='increase(ray_data_bytes_task_outputs_generated{{{global_filters}, operator=~"$Operator"}}[5m]) / increase(ray_data_num_task_outputs_generated{{{global_filters}, operator=~"$Operator"}}[5m])', + legend="Average Bytes Generated / Output Block: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +AVERAGE_BLOCKS_PER_TASK_PANEL = Panel( + id=50, + title="Average Number of Output Blocks / Task", + description="Average number of output blocks generated by tasks.", + unit="blocks", + targets=[ + Target( + expr='increase(ray_data_num_task_outputs_generated{{{global_filters}, operator=~"$Operator"}}[5m]) / increase(ray_data_num_tasks_finished{{{global_filters}, operator=~"$Operator"}}[5m])', + legend="Average Number of Output Blocks / Task: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +OUTPUT_BYTES_BY_NODE_PANEL = Panel( + id=43, + title="Output Bytes from Finished Tasks / Second (by Node)", + description=( + "Byte size of output blocks from finished tasks per second, grouped by node." + ), + unit="Bps", + targets=[ + Target( + expr='sum(rate(ray_data_bytes_outputs_of_finished_tasks_per_node{{{global_filters}, operator=~"$Operator"}}[1m])) by (dataset, node_ip)', + legend="Bytes output / Second: {{dataset}}, {{node_ip}}", + ) + ], + fill=0, + stack=False, +) + +BLOCKS_BY_NODE_PANEL = Panel( + id=48, + title="Blocks from Finished Tasks / Second (by Node)", + description=( + "Number of output blocks from finished tasks per second, grouped by node." 
+ ), + unit="blocks/s", + targets=[ + Target( + expr='sum(rate(ray_data_blocks_outputs_of_finished_tasks_per_node{{{global_filters}, operator=~"$Operator"}}[1m])) by (dataset, node_ip)', + legend="Blocks output / Second: {{dataset}}, {{node_ip}}", + ) + ], + fill=0, + stack=False, +) + +# Ray Data Metrics (Tasks) +SUBMITTED_TASKS_PANEL = Panel( + id=29, + title="Submitted Tasks", + description="Number of submitted tasks.", + unit="tasks", + targets=[ + Target( + expr='sum(ray_data_num_tasks_submitted{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Submitted Tasks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +RUNNING_TASKS_PANEL = Panel( + id=30, + title="Running Tasks", + description="Number of running tasks.", + unit="tasks", + targets=[ + Target( + expr='sum(ray_data_num_tasks_running{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Running Tasks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +TASKS_WITH_OUTPUT_PANEL = Panel( + id=31, + title="Tasks with output blocks", + description="Number of tasks that already have output.", + unit="tasks", + targets=[ + Target( + expr='sum(ray_data_num_tasks_have_outputs{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Tasks with output blocks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +FINISHED_TASKS_PANEL = Panel( + id=32, + title="Finished Tasks", + description="Number of finished tasks.", + unit="tasks", + targets=[ + Target( + expr='sum(ray_data_num_tasks_finished{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Finished Tasks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +FAILED_TASKS_PANEL = Panel( + id=33, + title="Failed Tasks", + description="Number of failed tasks.", + unit="tasks", + targets=[ + Target( + expr='sum(ray_data_num_tasks_failed{{{global_filters}, operator=~"$Operator"}}) by (dataset, 
operator)', + legend="Failed Tasks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +TASK_THROUGHPUT_BY_NODE_PANEL = Panel( + id=46, + title="Task Throughput (by Node)", + description="Number of finished tasks per second, grouped by node.", + unit="tasks/s", + targets=[ + Target( + expr='sum(rate(ray_data_num_tasks_finished_per_node{{{global_filters}, operator=~"$Operator"}}[1m])) by (dataset, node_ip)', + legend="Finished Tasks: {{dataset}}, {{node_ip}}", + ) + ], + fill=0, + stack=False, +) + +BLOCK_GENERATION_TIME_PANEL = Panel( + id=8, + title="Block Generation Time", + description="Time spent generating blocks in tasks.", + unit="seconds", + targets=[ + Target( + expr='sum(ray_data_block_generation_time{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Block Generation Time: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +TASK_SUBMISSION_BACKPRESSURE_PANEL = Panel( + id=37, + title="Task Submission Backpressure Time", + description="Time spent in task submission backpressure.", + unit="seconds", + targets=[ + Target( + expr='sum(ray_data_task_submission_backpressure_time{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Backpressure Time: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=True, +) + +# Task Completion Time Percentiles +TASK_COMPLETION_TIME_PANEL = Panel( + id=38, + title="Task Completion Time", + description="Time spent running tasks to completion w/ backpressure.", + unit="seconds", + targets=[ + Target( + expr='increase(ray_data_task_completion_time{{{global_filters}, operator=~"$Operator"}}[5m]) / increase(ray_data_num_tasks_finished{{{global_filters}, operator=~"$Operator"}}[5m])', + legend="Task Completion Time: {{dataset}}, {{operator}}", ), - unit="Bps", - targets=[ - Target( - expr="sum(rate(ray_data_bytes_task_inputs_processed{{{global_filters}}}[1m])) by (dataset, operator)", - legend="Bytes Processed / Second: {{dataset}}, 
{{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=21, - title="Input Bytes Submitted to Tasks / Second", - description="Byte size of input blocks passed to submitted tasks per second.", - unit="Bps", - targets=[ - Target( - expr="sum(rate(ray_data_bytes_inputs_of_submitted_tasks{{{global_filters}}}[1m])) by (dataset, operator)", - legend="Bytes Submitted / Second: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=22, - title="Blocks Generated by Tasks / Second", - description="Number of output blocks generated by tasks per second.", - unit="blocks/sec", - targets=[ - Target( - expr="sum(rate(ray_data_num_task_outputs_generated{{{global_filters}}}[1m])) by (dataset, operator)", - legend="Blocks Generated / Second: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=23, - title="Bytes Generated by Tasks / Second", - description="Byte size of output blocks generated by tasks per second.", - unit="Bps", - targets=[ - Target( - expr="sum(rate(ray_data_bytes_task_outputs_generated{{{global_filters}}}[1m])) by (dataset, operator)", - legend="Bytes Generated / Second: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=49, - title="Average Bytes Generated / Output Block", - description="Average byte size of output blocks generated by tasks.", - unit="bytes", - targets=[ - Target( - expr="increase(ray_data_bytes_task_outputs_generated{{{global_filters}}}[5m]) / increase(ray_data_num_task_outputs_generated{{{global_filters}}}[5m])", - legend="Average Bytes Generated / Output Block: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=50, - title="Average Number of Output Blocks / Task", - description="Average number of output blocks generated by tasks.", - unit="blocks", - targets=[ - Target( - expr="increase(ray_data_num_task_outputs_generated{{{global_filters}}}[5m]) / increase(ray_data_num_tasks_finished{{{global_filters}}}[5m])", - 
legend="Average Number of Output Blocks / Task: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=24, - title="Rows Generated by Tasks / Second", - description="Number of rows in generated output blocks from finished tasks per second.", - unit="rows/sec", - targets=[ - Target( - expr="sum(rate(ray_data_rows_task_outputs_generated{{{global_filters}}}[1m])) by (dataset, operator)", - legend="Rows Generated / Second: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=25, - title="Output Blocks Taken by Downstream Operators / Second", - description="Number of output blocks taken by downstream operators per second.", - unit="blocks/sec", - targets=[ - Target( - expr="sum(rate(ray_data_num_outputs_taken{{{global_filters}}}[1m])) by (dataset, operator)", - legend="Blocks Taken / Second: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=26, - title="Output Bytes Taken by Downstream Operators / Second", - description=( - "Byte size of output blocks taken by downstream operators per second." + ], + fill=0, + stack=False, +) + +TASK_OUTPUT_BACKPRESSURE_TIME_PANEL = Panel( + id=39, + title="Task Output Backpressure Time", + description="Time spent in output backpressure.", + unit="seconds", + targets=[ + Target( + expr='increase(ray_data_task_output_backpressure_time{{{global_filters}, operator=~"$Operator"}}[5m]) / increase(ray_data_num_tasks_finished{{{global_filters}, operator=~"$Operator"}}[5m])', + legend="Task Output Backpressure Time: {{dataset}}, {{operator}}", ), - unit="Bps", - targets=[ - Target( - expr="sum(rate(ray_data_bytes_outputs_taken{{{global_filters}}}[1m])) by (dataset, operator)", - legend="Bytes Taken / Second: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=43, - title="Output Bytes from Finished Tasks / Second (by Node)", - description=( - "Byte size of output blocks from finished tasks per second, grouped by node." 
+ ], + fill=0, + stack=False, +) + +TASK_COMPLETION_TIME_WITHOUT_BACKPRESSURE_PANEL = Panel( + id=40, + title="Task Completion Time Without Backpressure", + description="Time spent running tasks to completion w/o backpressure.", + unit="seconds", + targets=[ + Target( + expr='increase(ray_data_task_completion_time_without_backpressure{{{global_filters}, operator=~"$Operator"}}[5m]) / increase(ray_data_num_tasks_finished{{{global_filters}, operator=~"$Operator"}}[5m])', + legend="Task Completion Time w/o Backpressure: {{dataset}}, {{operator}}", ), - unit="Bps", - targets=[ - Target( - expr="sum(rate(ray_data_bytes_outputs_of_finished_tasks_per_node{{{global_filters}}}[1m])) by (dataset, node_ip)", - legend="Bytes output / Second: {{dataset}}, {{node_ip}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=48, - title="Blocks from Finished Tasks / Second (by Node)", - description=( - "Number of output blocks from finished tasks per second, grouped by node." + ], + fill=0, + stack=False, +) + +# Ray Data Metrics (Object Store Memory) +INTERNAL_INQUEUE_BLOCKS_PANEL = Panel( + id=13, + title="Operator Internal Inqueue Size (Blocks)", + description="Number of blocks in operator's internal input queue", + unit="blocks", + targets=[ + Target( + expr='sum(ray_data_obj_store_mem_internal_inqueue_blocks{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Number of Blocks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +INTERNAL_INQUEUE_BYTES_PANEL = Panel( + id=14, + title="Operator Internal Inqueue Size (Bytes)", + description="Byte size of input blocks in the operator's internal input queue.", + unit="bytes", + targets=[ + Target( + expr='sum(ray_data_obj_store_mem_internal_inqueue{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Bytes Size: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=True, +) + +INTERNAL_OUTQUEUE_BLOCKS_PANEL = Panel( + id=15, + title="Operator Internal Outqueue 
Size (Blocks)", + description="Number of blocks in operator's internal output queue", + unit="blocks", + targets=[ + Target( + expr='sum(ray_data_obj_store_mem_internal_outqueue_blocks{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Number of Blocks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +INTERNAL_OUTQUEUE_BYTES_PANEL = Panel( + id=16, + title="Operator Internal Outqueue Size (Bytes)", + description=("Byte size of output blocks in the operator's internal output queue."), + unit="bytes", + targets=[ + Target( + expr='sum(ray_data_obj_store_mem_internal_outqueue{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Bytes Size: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=True, +) + +EXTERNAL_INQUEUE_BLOCKS_PANEL = Panel( + id=2, + title="Operator External InQueue Size (Blocks)", + description="Number of blocks in operator's external input queue", + unit="blocks", + targets=[ + Target( + expr='sum(ray_data_num_external_inqueue_blocks{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Number of Blocks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +EXTERNAL_INQUEUE_BYTES_PANEL = Panel( + id=27, + title="Operator External InQueue Size (bytes)", + description="Byte size of blocks in operator's external input queue", + unit="bytes", + targets=[ + Target( + expr='sum(ray_data_num_external_inqueue_bytes{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Number of Bytes: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +# Combined Inqueue and Outqueue Blocks Panel +COMBINED_INQUEUE_OUTQUEUE_BLOCKS_PANEL = Panel( + id=56, + title="Operator Combined Internal + External Inqueue Size (Blocks)", + description="Total number of blocks in operator's internal + external input queue.", + unit="blocks", + targets=[ + Target( + 
expr='sum(ray_data_obj_store_mem_internal_inqueue_blocks{{{global_filters}, operator=~"$Operator"}} + ray_data_num_external_inqueue_blocks{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Combined Blocks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +PENDING_TASK_INPUTS_PANEL = Panel( + id=34, + title="Size of Blocks used in Pending Tasks (Bytes)", + description="Byte size of input blocks used by pending tasks.", + unit="bytes", + targets=[ + Target( + expr='sum(ray_data_obj_store_mem_pending_task_inputs{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Bytes Size: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=True, +) + +FREED_MEMORY_PANEL = Panel( + id=35, + title="Freed Memory in Object Store (Bytes)", + description="Byte size of freed memory in object store.", + unit="bytes", + targets=[ + Target( + expr='sum(ray_data_obj_store_mem_freed{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Bytes Size: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=True, +) + +SPILLED_MEMORY_PANEL = Panel( + id=36, + title="Spilled Memory in Object Store (Bytes)", + description="Byte size of spilled memory in object store.", + unit="bytes", + targets=[ + Target( + expr='sum(ray_data_obj_store_mem_spilled{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Bytes Size: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=True, +) + +# Ray Data Metrics (Iteration) +ITERATION_INITIALIZATION_PANEL = Panel( + id=12, + title="Iteration Initialization Time", + description="Seconds spent in iterator initialization code", + unit="seconds", + targets=[ + Target( + expr="sum(ray_data_iter_initialize_seconds{{{global_filters}}}) by (dataset)", + legend="Seconds: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +ITERATION_BLOCKED_PANEL = Panel( + id=9, + title="Iteration Blocked Time", + description="Seconds user thread is 
blocked by iter_batches()", + unit="seconds", + targets=[ + Target( + expr="sum(ray_data_iter_total_blocked_seconds{{{global_filters}}}) by (dataset)", + legend="Seconds: {{dataset}}", + ) + ], + fill=0, + stack=False, +) + +ITERATION_USER_PANEL = Panel( + id=10, + title="Iteration User Time", + description="Seconds spent in user code", + unit="seconds", + targets=[ + Target( + expr="sum(ray_data_iter_user_seconds{{{global_filters}}}) by (dataset)", + legend="Seconds: {{dataset}}", + ) + ], + fill=0, + stack=False, +) + +# Ray Data Metrics (Miscellaneous) +SCHEDULING_LOOP_DURATION_PANEL = Panel( + id=47, + title="Scheduling Loop Duration", + description=("Duration of the scheduling loop in seconds."), + unit="seconds", + targets=[ + Target( + expr="sum(ray_data_sched_loop_duration_s{{{global_filters}}}) by (dataset)", + legend="Scheduling Loop Duration: {{dataset}}", + ) + ], + fill=0, + stack=False, +) + +MAX_BYTES_TO_READ_PANEL = Panel( + id=55, + title="Max Bytes to Read", + description="Maximum bytes to read from streaming generator buffer.", + unit="bytes", + targets=[ + Target( + expr='sum(ray_data_max_bytes_to_read{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Max Bytes to Read: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +# Budget Panels +CPU_BUDGET_PANEL = Panel( + id=51, + title="Budget (CPU)", + description=("Budget (CPU) for the operator."), + unit="cpu", + targets=[ + Target( + expr='sum(ray_data_cpu_budget{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Budget (CPU): {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +GPU_BUDGET_PANEL = Panel( + id=52, + title="Budget (GPU)", + description=("Budget (GPU) for the operator."), + unit="gpu", + targets=[ + Target( + expr='sum(ray_data_gpu_budget{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Budget (GPU): {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + 
+MEMORY_BUDGET_PANEL = Panel( + id=53, + title="Budget (Memory)", + description=("Budget (Memory) for the operator."), + unit="bytes", + targets=[ + Target( + expr='sum(ray_data_memory_budget{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Budget (Memory): {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +OBJECT_STORE_MEMORY_BUDGET_PANEL = Panel( + id=54, + title="Budget (Object Store Memory)", + description=("Budget (Object Store Memory) for the operator."), + unit="bytes", + targets=[ + Target( + expr='sum(ray_data_object_store_memory_budget{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="Budget (Object Store Memory): {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, +) + +ALL_RESOURCES_UTILIZATION_PANEL = Panel( + id=57, + title="All logical resources utilization", + description=( + "Shows all logical resources utilization on a single graph. Filtering by operator is recommended." + ), + unit="cores", + targets=[ + Target( + expr='sum(ray_data_cpu_usage_cores{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="CPU: {{dataset}}, {{operator}}", ), - unit="blocks/s", - targets=[ - Target( - expr="sum(rate(ray_data_blocks_outputs_of_finished_tasks_per_node{{{global_filters}}}[1m])) by (dataset, node_ip)", - legend="Blocks output / Second: {{dataset}}, {{node_ip}}", - ) - ], - fill=0, - stack=False, - ), - # Ray Data Metrics (Tasks) - Panel( - id=29, - title="Submitted Tasks", - description="Number of submitted tasks.", - unit="tasks", - targets=[ - Target( - expr="sum(ray_data_num_tasks_submitted{{{global_filters}}}) by (dataset, operator)", - legend="Submitted Tasks: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=30, - title="Running Tasks", - description="Number of running tasks.", - unit="tasks", - targets=[ - Target( - expr="sum(ray_data_num_tasks_running{{{global_filters}}}) by (dataset, operator)", - 
legend="Running Tasks: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=31, - title="Tasks with output blocks", - description="Number of tasks that already have output.", - unit="tasks", - targets=[ - Target( - expr="sum(ray_data_num_tasks_have_outputs{{{global_filters}}}) by (dataset, operator)", - legend="Tasks with output blocks: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=32, - title="Finished Tasks", - description="Number of finished tasks.", - unit="tasks", - targets=[ - Target( - expr="sum(ray_data_num_tasks_finished{{{global_filters}}}) by (dataset, operator)", - legend="Finished Tasks: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=46, - title="Task Throughput (by Node)", - description="Number of finished tasks per second, grouped by node.", - unit="tasks/s", - targets=[ - Target( - expr="sum(rate(ray_data_num_tasks_finished_per_node{{{global_filters}}}[1m])) by (dataset, node_ip)", - legend="Finished Tasks: {{dataset}}, {{node_ip}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=33, - title="Failed Tasks", - description="Number of failed tasks.", - unit="tasks", - targets=[ - Target( - expr="sum(ray_data_num_tasks_failed{{{global_filters}}}) by (dataset, operator)", - legend="Failed Tasks: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=8, - title="Block Generation Time", - description="Time spent generating blocks in tasks.", - unit="seconds", - targets=[ - Target( - expr="sum(ray_data_block_generation_time{{{global_filters}}}) by (dataset, operator)", - legend="Block Generation Time: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=37, - title="Task Submission Backpressure Time", - description="Time spent in task submission backpressure.", - unit="seconds", - targets=[ - Target( - expr="sum(ray_data_task_submission_backpressure_time{{{global_filters}}}) by (dataset, operator)", - 
legend="Backpressure Time: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=True, - ), - Panel( - id=38, - title="(p00) Task Completion Time", - description="Time spent running tasks to completion.", - unit="seconds", - targets=[ - Target( - expr="histogram_quantile(0, sum by (dataset, operator, le) (rate(ray_data_task_completion_time_bucket{{{global_filters}}}[5m])))", - legend="(p00) Completion Time: {{dataset}}, {{operator}}", - ), - ], - fill=0, - stack=False, - ), - Panel( - id=39, - title="(p05) Task Completion Time", - description="Time spent running tasks to completion.", - unit="seconds", - targets=[ - Target( - expr="histogram_quantile(0.05, sum by (dataset, operator, le) (rate(ray_data_task_completion_time_bucket{{{global_filters}}}[5m])))", - legend="(p05) Completion Time: {{dataset}}, {{operator}}", - ), - ], - fill=0, - stack=False, - ), - Panel( - id=40, - title="(p50) Task Completion Time", - description="Time spent running tasks to completion.", - unit="seconds", - targets=[ - Target( - expr="histogram_quantile(0.50, sum by (dataset, operator, le) (rate(ray_data_task_completion_time_bucket{{{global_filters}}}[5m])))", - legend="(p50) Completion Time: {{dataset}}, {{operator}}", - ), - ], - fill=0, - stack=False, - ), - Panel( - id=41, - title="(p75) Task Completion Time", - description="Time spent running tasks to completion.", - unit="seconds", - targets=[ - Target( - expr="histogram_quantile(0.75, sum by (dataset, operator, le) (rate(ray_data_task_completion_time_bucket{{{global_filters}}}[5m])))", - legend="(p75) Completion Time: {{dataset}}, {{operator}}", - ), - ], - fill=0, - stack=False, - ), - Panel( - id=42, - title="(p90) Task Completion Time", - description="Time spent running tasks to completion.", - unit="seconds", - targets=[ - Target( - expr="histogram_quantile(0.9, sum by (dataset, operator, le) (rate(ray_data_task_completion_time_bucket{{{global_filters}}}[5m])))", - legend="(p90) Completion Time: {{dataset}}, {{operator}}", 
- ), - ], - fill=0, - stack=False, - ), - Panel( - id=44, - title="p(99) Task Completion Time", - description="Time spent running tasks to completion.", - unit="seconds", - targets=[ - Target( - expr="histogram_quantile(0.99, sum by (dataset, operator, le) (rate(ray_data_task_completion_time_bucket{{{global_filters}}}[5m])))", - legend="(p99) Completion Time: {{dataset}}, {{operator}}", - ), - ], - fill=0, - stack=False, - ), - Panel( - id=45, - title="p(100) Task Completion Time", - description="Time spent running tasks to completion.", - unit="seconds", - targets=[ - Target( - expr="histogram_quantile(1, sum by (dataset, operator, le) (rate(ray_data_task_completion_time_bucket{{{global_filters}}}[5m])))", - legend="(p100) Completion Time: {{dataset}}, {{operator}}", - ), - ], - fill=0, - stack=False, - ), - # Ray Data Metrics (Object Store Memory) - Panel( - id=13, - title="Operator Internal Inqueue Size (Blocks)", - description="Number of blocks in operator's internal input queue", - unit="blocks", - targets=[ - Target( - expr="sum(ray_data_obj_store_mem_internal_inqueue_blocks{{{global_filters}}}) by (dataset, operator)", - legend="Number of Blocks: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=14, - title="Operator Internal Inqueue Size (Bytes)", - description="Byte size of input blocks in the operator's internal input queue.", - unit="bytes", - targets=[ - Target( - expr="sum(ray_data_obj_store_mem_internal_inqueue{{{global_filters}}}) by (dataset, operator)", - legend="Bytes Size: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=True, - ), - Panel( - id=15, - title="Operator Internal Outqueue Size (Blocks)", - description="Number of blocks in operator's internal output queue", - unit="blocks", - targets=[ - Target( - expr="sum(ray_data_obj_store_mem_internal_outqueue_blocks{{{global_filters}}}) by (dataset, operator)", - legend="Number of Blocks: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - 
Panel( - id=16, - title="Operator Internal Outqueue Size (Bytes)", - description=( - "Byte size of output blocks in the operator's internal output queue." + Target( + expr='sum(ray_data_gpu_usage_cores{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', + legend="GPU: {{dataset}}, {{operator}}", ), - unit="bytes", - targets=[ - Target( - expr="sum(ray_data_obj_store_mem_internal_outqueue{{{global_filters}}}) by (dataset, operator)", - legend="Bytes Size: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=True, - ), - Panel( - id=34, - title="Size of Blocks used in Pending Tasks (Bytes)", - description="Byte size of input blocks used by pending tasks.", - unit="bytes", - targets=[ - Target( - expr="sum(ray_data_obj_store_mem_pending_task_inputs{{{global_filters}}}) by (dataset, operator)", - legend="Bytes Size: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=True, - ), - Panel( - id=35, - title="Freed Memory in Object Store (Bytes)", - description="Byte size of freed memory in object store.", - unit="bytes", - targets=[ - Target( - expr="sum(ray_data_obj_store_mem_freed{{{global_filters}}}) by (dataset, operator)", - legend="Bytes Size: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=True, - ), - Panel( - id=36, - title="Spilled Memory in Object Store (Bytes)", - description="Byte size of spilled memory in object store.", - unit="bytes", - targets=[ - Target( - expr="sum(ray_data_obj_store_mem_spilled{{{global_filters}}}) by (dataset, operator)", - legend="Bytes Size: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=True, - ), - # Ray Data Metrics (Iteration) - Panel( - id=12, - title="Iteration Initialization Time", - description="Seconds spent in iterator initialization code", - unit="seconds", - targets=[ - Target( - expr="sum(ray_data_iter_initialize_seconds{{{global_filters}}}) by (dataset)", - legend="Seconds: {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=9, - title="Iteration Blocked 
Time", - description="Seconds user thread is blocked by iter_batches()", - unit="seconds", - targets=[ - Target( - expr="sum(ray_data_iter_total_blocked_seconds{{{global_filters}}}) by (dataset)", - legend="Seconds: {{dataset}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=10, - title="Iteration User Time", - description="Seconds spent in user code", - unit="seconds", - targets=[ - Target( - expr="sum(ray_data_iter_user_seconds{{{global_filters}}}) by (dataset)", - legend="Seconds: {{dataset}}", - ) - ], - fill=0, - stack=False, - ), - # Ray Data Metrics (Miscellaneous) - Panel( - id=47, - title="Scheduling Loop Duration", - description=("Duration of the scheduling loop in seconds."), - unit="seconds", - targets=[ - Target( - expr="sum(ray_data_sched_loop_duration_s{{{global_filters}}}) by (dataset)", - legend="Scheduling Loop Duration: {{dataset}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=51, - title="Budget (CPU)", - description=("Budget (CPU) for the operator."), - unit="cpu", - targets=[ - Target( - expr="sum(ray_data_cpu_budget{{{global_filters}}}) by (dataset, operator)", - legend="Budget (CPU): {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=52, - title="Budget (GPU)", - description=("Budget (GPU) for the operator."), - unit="gpu", - targets=[ - Target( - expr="sum(ray_data_gpu_budget{{{global_filters}}}) by (dataset, operator)", - legend="Budget (GPU): {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=53, - title="Budget (Memory)", - description=("Budget (Memory) for the operator."), - unit="bytes", - targets=[ - Target( - expr="sum(ray_data_memory_budget{{{global_filters}}}) by (dataset, operator)", - legend="Budget (Memory): {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, - ), - Panel( - id=54, - title="Budget (Object Store Memory)", - description=("Budget (Object Store Memory) for the operator."), - unit="bytes", - targets=[ - Target( - 
expr="sum(ray_data_object_store_memory_budget{{{global_filters}}}) by (dataset, operator)", - legend="Budget (Object Store Memory): {{dataset}}, {{operator}}", - ) - ], - fill=0, - stack=False, + ], + fill=0, + stack=False, +) + +OPERATOR_PANELS = [ROWS_OUTPUT_PER_SECOND_PANEL, ALL_RESOURCES_UTILIZATION_PANEL] + +DATA_GRAFANA_ROWS = [ + # Overview Row + Row( + title="Overview", + id=99, + panels=[ + BYTES_GENERATED_PANEL, + BLOCKS_GENERATED_PANEL, + ROWS_GENERATED_PANEL, + OBJECT_STORE_MEMORY_PANEL, + RUNNING_TASKS_PANEL, + COMBINED_INQUEUE_OUTQUEUE_BLOCKS_PANEL, + ], + collapsed=False, + ), + # Pending Inputs Row + Row( + title="Pending Inputs", + id=100, + panels=[ + INTERNAL_INQUEUE_BLOCKS_PANEL, + INTERNAL_INQUEUE_BYTES_PANEL, + EXTERNAL_INQUEUE_BLOCKS_PANEL, + EXTERNAL_INQUEUE_BYTES_PANEL, + PENDING_TASK_INPUTS_PANEL, + ], + collapsed=True, + ), + # Inputs Row + Row( + title="Inputs", + id=101, + panels=[ + INPUT_BLOCKS_RECEIVED_PANEL, + INPUT_BYTES_RECEIVED_PANEL, + INPUT_BLOCKS_PROCESSED_PANEL, + INPUT_BYTES_PROCESSED_PANEL, + INPUT_BYTES_SUBMITTED_PANEL, + ], + collapsed=True, + ), + # Pending Outputs Row + Row( + title="Pending Outputs", + id=102, + panels=[ + INTERNAL_OUTQUEUE_BLOCKS_PANEL, + INTERNAL_OUTQUEUE_BYTES_PANEL, + MAX_BYTES_TO_READ_PANEL, + ], + collapsed=True, + ), + # Outputs Row + Row( + title="Outputs", + id=103, + panels=[ + OUTPUT_BLOCKS_TAKEN_PANEL, + OUTPUT_BYTES_TAKEN_PANEL, + OUTPUT_BYTES_BY_NODE_PANEL, + BLOCKS_BY_NODE_PANEL, + BYTES_OUTPUT_PER_SECOND_PANEL, + ROWS_OUTPUT_PER_SECOND_PANEL, + AVERAGE_BYTES_PER_BLOCK_PANEL, + AVERAGE_BLOCKS_PER_TASK_PANEL, + BLOCK_GENERATION_TIME_PANEL, + ], + collapsed=True, + ), + # Tasks + Row( + title="Tasks", + id=104, + panels=[ + TASK_COMPLETION_TIME_PANEL, + TASK_COMPLETION_TIME_WITHOUT_BACKPRESSURE_PANEL, + TASK_OUTPUT_BACKPRESSURE_TIME_PANEL, + TASK_SUBMISSION_BACKPRESSURE_PANEL, + TASK_THROUGHPUT_BY_NODE_PANEL, + TASKS_WITH_OUTPUT_PANEL, + SUBMITTED_TASKS_PANEL, + FINISHED_TASKS_PANEL, + 
FAILED_TASKS_PANEL, + ], + collapsed=True, + ), + # Resource Budget / Usage Row + Row( + title="Resource Budget / Usage", + id=105, + panels=[ + CPU_USAGE_PANEL, + GPU_USAGE_PANEL, + CPU_BUDGET_PANEL, + GPU_BUDGET_PANEL, + MEMORY_BUDGET_PANEL, + OBJECT_STORE_MEMORY_BUDGET_PANEL, + FREED_MEMORY_PANEL, + SPILLED_MEMORY_PANEL, + BYTES_SPILLED_PANEL, + BYTES_FREED_PANEL, + ], + collapsed=True, + ), + # Scheduling Loop Row + Row( + title="Scheduling Loop", + id=106, + panels=[ + SCHEDULING_LOOP_DURATION_PANEL, + ], + collapsed=True, + ), + # Iteration Row + Row( + title="Iteration", + id=107, + panels=[ + ITERATION_INITIALIZATION_PANEL, + ITERATION_BLOCKED_PANEL, + ITERATION_USER_PANEL, + ], + collapsed=True, + ), + # Operator Panels Row (these graphs should only be viewed when filtering down to a single operator) + Row( + title="Operator Panels", + id=108, + panels=[ALL_RESOURCES_UTILIZATION_PANEL], + collapsed=True, ), ] -ids = [] -for panel in DATA_GRAFANA_PANELS: - ids.append(panel.id) -assert len(ids) == len( - set(ids) -), f"Duplicated id found. Use unique id for each panel. {ids}" +# Get all panel IDs from both top-level panels and panels within rows +all_panel_ids = [] +for row in DATA_GRAFANA_ROWS: + all_panel_ids.append(row.id) + all_panel_ids.extend(panel.id for panel in row.panels) + +assert len(all_panel_ids) == len( + set(all_panel_ids) +), f"Duplicated id found. Use unique id for each panel. 
{all_panel_ids}" data_dashboard_config = DashboardConfig( name="DATA", default_uid="rayDataDashboard", - panels=DATA_GRAFANA_PANELS, + rows=DATA_GRAFANA_ROWS, standard_global_filters=[ 'dataset=~"$DatasetID"', 'SessionName=~"$SessionName"', diff --git a/python/ray/dashboard/modules/metrics/dashboards/data_grafana_dashboard_base.json b/python/ray/dashboard/modules/metrics/dashboards/data_grafana_dashboard_base.json index dea96d4513b2..23606bb9120f 100644 --- a/python/ray/dashboard/modules/metrics/dashboards/data_grafana_dashboard_base.json +++ b/python/ray/dashboard/modules/metrics/dashboards/data_grafana_dashboard_base.json @@ -104,6 +104,41 @@ "type": "query", "useTags": false }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "query_result(count by (operator)(last_over_time(ray_data_output_bytes{{SessionName=~\"$SessionName\",{global_filters}}}[$__range])))", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Operator", + "options": [], + "query": { + "query": "query_result(count by (operator)(last_over_time(ray_data_output_bytes{{SessionName=~\"$SessionName\",{global_filters}}}[$__range])))", + "refId": "Prometheus-Dataset-Variable-Query" + }, + "refresh": 2, + "regex": "{operator=\"(?.*)\".*", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "current": { "selected": false diff --git a/python/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py b/python/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py index dd4e702b34d7..72d43ccbd8b5 100644 --- a/python/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py +++ b/python/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py @@ -3,6 +3,7 @@ from 
ray.dashboard.modules.metrics.dashboards.common import ( DashboardConfig, Panel, + Row, Target, ) @@ -30,11 +31,86 @@ def max_plus_pending(max_resource, pending_resource): # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # IMPORTANT: Please keep this in sync with Metrics.tsx and ray-metrics.rst # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -DEFAULT_GRAFANA_PANELS = [ +OVERVIEW_AND_HEALTH_PANELS = [ + Panel( + id=24, + title="Node Count", + description='Note: not impacted by "Instance" variable.\n\nA total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there\'s no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. The node is unavailable now because it is being provisioned and initialized.', + unit="nodes", + targets=[ + Target( + expr="sum(autoscaler_active_nodes{{{global_filters}}}) by (NodeType)", + legend="Active Nodes: {{NodeType}}", + ), + Target( + expr="sum(autoscaler_recently_failed_nodes{{{global_filters}}}) by (NodeType)", + legend="Failed Nodes: {{NodeType}}", + ), + Target( + expr="sum(autoscaler_pending_nodes{{{global_filters}}}) by (NodeType)", + legend="Pending Nodes: {{NodeType}}", + ), + ], + ), + Panel( + id=41, + title="Cluster Utilization", + description="Aggregated utilization of all physical resources (CPU, GPU, memory, disk, or etc.) 
across the cluster.", + unit="%", + targets=[ + # CPU + Target( + expr='avg(ray_node_cpu_utilization{{instance=~"$Instance",{global_filters}}})', + legend="CPU (physical)", + ), + # GPU + Target( + expr='sum(ray_node_gpus_utilization{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_gpus_available{{instance=~"$Instance",{global_filters}}}) or vector(0))', + legend="GPU (physical)", + ), + # Memory + Target( + expr='sum(ray_node_mem_used{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})) * 100', + legend="Memory (RAM)", + ), + # GRAM + Target( + expr='sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}})) * 100', + legend="GRAM", + ), + # Object Store + Target( + expr='sum(ray_object_store_memory{{instance=~"$Instance",{global_filters}}}) / on() sum(ray_resources{{Name="object_store_memory",instance=~"$Instance",{global_filters}}}) * 100', + legend="Object Store Memory", + ), + # Disk + Target( + expr='sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_disk_free{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}})) * 100', + legend="Disk", + ), + ], + fill=0, + stack=False, + ), + Panel( + id=44, + title="Ray OOM Kills (Tasks and Actors)", + description="The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and the name. 
https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html.", + unit="failures", + targets=[ + Target( + expr='sum(ray_memory_manager_worker_eviction_total{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (Name, instance, RayNodeType)', + legend="OOM Killed: {{Name}}, {{instance}} ({{RayNodeType}})", + ), + ], + ), +] + +RAY_TASKS_ACTORS_PLACEMENT_GROUPS_PANELS = [ Panel( id=26, - title="Scheduler Task State", - description="Current number of tasks in a particular state.\n\nState: the task state, as described by rpc::TaskState proto in common.proto. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", + title="All Tasks by State", + description="Current count of tasks, grouped by scheduler state (e.g., pending, running, finished).\n\nState: the task state, as described by rpc::TaskStatus proto in common.proto. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", unit="tasks", targets=[ Target( @@ -51,8 +127,8 @@ def max_plus_pending(max_resource, pending_resource): ), Panel( id=35, - title="Requested Live Tasks by Name", - description="Current number of (live) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", + title="Active Tasks by Name", + description="Current count of active tasks (i.e. pending or running; not finished), grouped by task name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", unit="tasks", targets=[ Target( @@ -70,7 +146,7 @@ def max_plus_pending(max_resource, pending_resource): Panel( id=38, title="Running Tasks by Name", - description="Current number of (running) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", + description="Current count of tasks that are currently executing, grouped by task name. 
Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", unit="tasks", targets=[ Target( @@ -87,8 +163,8 @@ def max_plus_pending(max_resource, pending_resource): ), Panel( id=33, - title="Scheduler Actor State", - description='Note: not impacted by "Instance" variable.\n\nCurrent number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.', + title="All Actors by State", + description='Note: not impacted by "Instance" variable.\n\nCurrent count of actors, grouped by lifecycle state (e.g., alive, restarting, dead/terminated).\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.', unit="actors", targets=[ Target( @@ -99,8 +175,8 @@ def max_plus_pending(max_resource, pending_resource): ), Panel( id=42, - title="Live Actor State", - description="Current number of alive actors in a particular state.\n\nState: IDLE, RUNNING_TASK, RUNNING_IN_RAY_GET, RUNNING_IN_RAY_WAIT", + title="Active Actors by State", + description="Current count of alive actors (i.e. 
not dead/terminated), grouped by state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.", unit="actors", targets=[ Target( @@ -111,8 +187,8 @@ def max_plus_pending(max_resource, pending_resource): ), Panel( id=36, - title="Live Actors by Name", - description="Current number of alive actors with a particular name.", + title="Active Actors by Name", + description="Current count of alive actors, grouped by actor name.", unit="actors", targets=[ Target( @@ -121,9 +197,24 @@ def max_plus_pending(max_resource, pending_resource): ) ], ), + Panel( + id=40, + title="All Placement Groups by State", + description='Note: not impacted by "Instance" variable.\n\nCurrent count of placement groups, grouped by state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTableData proto in gcs.proto.', + unit="placement groups", + targets=[ + Target( + expr="sum(ray_placement_groups{{{global_filters}}}) by (State)", + legend="{{State}}", + ) + ], + ), +] + +RAY_RESOURCES_PANELS = [ Panel( id=27, - title="Scheduler CPUs (logical slots)", + title="Logical CPUs used", description="Logical CPU usage of Ray. The dotted line indicates the total number of CPUs. The logical CPU is allocated by `num_cpus` arguments from tasks and actors. PENDING means the number of CPUs that will be available when new nodes are up after the autoscaler scales up.\n\nNOTE: Ray's logical CPU is different from physical CPU usage. Ray's logical CPU is allocated by `num_cpus` arguments.", unit="cores", targets=[ @@ -143,25 +234,9 @@ def max_plus_pending(max_resource, pending_resource): ), ], ), - Panel( - id=29, - title="Object Store Memory", - description="Object store memory usage by location. 
The dotted line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate memory-mapped page, SPILLED to indicate spillage to disk, and WORKER_HEAP for objects small enough to be inlined in worker memory. Refer to metric_defs.cc for more information.", - unit="bytes", - targets=[ - Target( - expr='sum(ray_object_store_memory{{instance=~"$Instance",{global_filters}}}) by (Location)', - legend="{{Location}}", - ), - Target( - expr='sum(ray_resources{{Name="object_store_memory",instance=~"$Instance",{global_filters}}})', - legend="MAX", - ), - ], - ), Panel( id=28, - title="Scheduler GPUs (logical slots)", + title="Logical GPUs used", description="Logical GPU usage of Ray. The dotted line indicates the total number of GPUs. The logical GPU is allocated by `num_gpus` arguments from tasks and actors. PENDING means the number of GPUs that will be available when new nodes are up after the autoscaler scales up.", unit="GPUs", targets=[ @@ -182,30 +257,34 @@ def max_plus_pending(max_resource, pending_resource): ], ), Panel( - id=40, - title="Scheduler Placement Groups", - description='Note: not impacted by "Instance" variable.\n\nCurrent number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.', - unit="placement groups", + id=29, + title="Object Store Memory", + description="Object store memory usage by location. The dotted line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate memory-mapped page, SPILLED to indicate spillage to disk, and WORKER_HEAP for objects small enough to be inlined in worker memory. 
Refer to metric_defs.cc for more information.", + unit="bytes", targets=[ Target( - expr="sum(ray_placement_groups{{{global_filters}}}) by (State)", - legend="{{State}}", - ) + expr='sum(ray_object_store_memory{{instance=~"$Instance",{global_filters}}}) by (Location)', + legend="{{Location}}", + ), + Target( + expr='sum(ray_resources{{Name="object_store_memory",instance=~"$Instance",{global_filters}}})', + legend="MAX", + ), ], ), +] + +NODE_HARDWARE_UTILIZATION_BY_RAY_COMPONENT_PANELS = [ Panel( - id=2, - title="Node CPU (hardware utilization)", - description="", + id=37, + title="Node CPU by Component", + description="The physical (hardware) CPU usage across the cluster, broken down by component. This reports the summed CPU usage per Ray component. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", unit="cores", targets=[ Target( - expr='sum(ray_node_cpu_utilization{{instance=~"$Instance", IsHeadNode="false", {global_filters}}} * ray_node_cpu_count{{instance=~"$Instance",{global_filters}}} / 100) by (instance)', - legend="CPU Usage: {{instance}}", - ), - Target( - expr='sum(ray_node_cpu_utilization{{instance=~"$Instance", IsHeadNode="true", {global_filters}}} * ray_node_cpu_count{{instance=~"$Instance",{global_filters}}} / 100) by (instance)', - legend="CPU Usage: {{instance}} (head)", + # ray_component_cpu_percentage returns a percentage that can be > 100. It means that it uses more than 1 CPU. + expr='sum(ray_component_cpu_percentage{{instance=~"$Instance",{global_filters}}}) by (Component) / 100', + legend="{{Component}}", ), Target( expr='sum(ray_node_cpu_count{{instance=~"$Instance",{global_filters}}})', @@ -214,66 +293,85 @@ def max_plus_pending(max_resource, pending_resource): ], ), Panel( - id=8, - title="Node GPU (hardware utilization)", - description="Node's physical (hardware) GPU usage. 
The dotted line means the total number of hardware GPUs from the cluster. ", - unit="GPUs", + id=34, + title="Node Memory by Component", + description="The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS-SHM per Ray component, which corresponds to an approximate memory usage per proc. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", + unit="bytes", targets=[ Target( - expr='sum(ray_node_gpus_utilization{{instance=~"$Instance", IsHeadNode="false", {global_filters}}} / 100) by (instance, GpuIndex, GpuDeviceName)', - legend="GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", + expr='(sum(ray_component_rss_mb{{instance=~"$Instance",{global_filters}}} * 1024 * 1024) by (Component)) - (sum(ray_component_mem_shared_bytes{{instance=~"$Instance",{global_filters}}}) by (Component))', + legend="{{Component}}", ), Target( - expr='sum(ray_node_gpus_utilization{{instance=~"$Instance", IsHeadNode="true", {global_filters}}} / 100) by (instance, GpuIndex, GpuDeviceName)', - legend="GPU Usage: {{instance}} (head), gpu.{{GpuIndex}}, {{GpuDeviceName}}", + expr='sum(ray_node_mem_shared_bytes{{instance=~"$Instance",{global_filters}}})', + legend="shared_memory", ), Target( - expr='sum(ray_node_gpus_available{{instance=~"$Instance",{global_filters}}})', + expr='sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})', legend="MAX", ), ], ), Panel( - id=6, - title="Node Disk", - description="Node's physical (hardware) disk usage. The dotted line means the total amount of disk space from the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage from the host machine. ", - unit="bytes", + id=45, + title="Node GPU by Component", + description="The physical (hardware) GPU usage across the cluster, broken down by component. 
This reports the summed GPU usage per Ray component.", + unit="GPUs", targets=[ Target( - expr='sum(ray_node_disk_usage{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}) by (instance)', - legend="Disk Used: {{instance}}", + expr="sum(ray_component_gpu_percentage{{{global_filters}}} / 100) by (Component)", + legend="{{Component}}", ), + ], + ), + Panel( + id=46, + title="Node GPU Memory by Component", + description="The physical (hardware) GPU memory usage across the cluster, broken down by component. This reports the summed GPU memory usage per Ray component.", + unit="bytes", + targets=[ Target( - expr='sum(ray_node_disk_usage{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}) by (instance)', - legend="Disk Used: {{instance}} (head)", + expr="sum(ray_component_gpu_memory_mb{{{global_filters}}} * 1024 * 1024) by (Component)", + legend="{{Component}}", ), Target( - expr='sum(ray_node_disk_free{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}})', + expr='(sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}})) * 1024 * 1024', legend="MAX", ), ], ), +] + +NODE_HARDWARE_UTILIZATION_PANELS = [ Panel( - id=32, - title="Node Disk IO Speed", - description="Disk IO per node.", - unit="Bps", + id=2, + title="Node CPU utilization", + description="", + unit="cores", targets=[ Target( - expr='sum(ray_node_disk_io_write_speed{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}) by (instance)', - legend="Write: {{instance}}", + expr='sum(ray_node_cpu_utilization{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}} * ray_node_cpu_count{{instance=~"$Instance", RayNodeType=~"$RayNodeType",{global_filters}}} / 100) by (instance, RayNodeType)', + legend="CPU Usage: {{instance}} ({{RayNodeType}})", ), Target( - expr='sum(ray_node_disk_io_write_speed{{instance=~"$Instance", 
IsHeadNode="true", {global_filters}}}) by (instance)', - legend="Write: {{instance}} (head)", + expr='sum(ray_node_cpu_count{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}})', + legend="MAX", ), + ], + ), + Panel( + id=8, + title="Node GPU utilization", + description="Node's physical (hardware) GPU usage. The dotted line means the total number of hardware GPUs from the cluster. ", + unit="GPUs", + targets=[ Target( - expr='sum(ray_node_disk_io_read_speed{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}) by (instance)', - legend="Read: {{instance}}", + expr='sum(ray_node_gpus_utilization{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}} / 100) by (instance, RayNodeType, GpuIndex, GpuDeviceName)', + legend="GPU Usage: {{instance}} ({{RayNodeType}}), gpu.{{GpuIndex}}, {{GpuDeviceName}}", ), Target( - expr='sum(ray_node_disk_io_read_speed{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}) by (instance)', - legend="Read: {{instance}} (head)", + expr='sum(ray_node_gpus_available{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}})', + legend="MAX", ), ], ), @@ -284,105 +382,99 @@ def max_plus_pending(max_resource, pending_resource): unit="bytes", targets=[ Target( - expr='sum(ray_node_mem_used{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}) by (instance)', - legend="Memory Used: {{instance}}", - ), - Target( - expr='sum(ray_node_mem_used{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}) by (instance)', - legend="Memory Used: {{instance}} (head)", + expr='sum(ray_node_mem_used{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Memory Used: {{instance}} ({{RayNodeType}})", ), Target( - expr='sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})', + expr='sum(ray_node_mem_total{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}})', legend="MAX", ), ], ), Panel( 
id=48, - title="Node Memory Percentage (heap + object store)", + title="Node Memory % (heap + object store)", description="The percentage of physical (hardware) memory usage for each node.", unit="%", targets=[ Target( - expr='sum(ray_node_mem_used{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}/ray_node_mem_total{{instance=~"$Instance", IsHeadNode="false", {global_filters}}} * 100) by (instance)', - legend="Memory Used: {{instance}}", - ), - Target( - expr='sum(ray_node_mem_used{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}/ray_node_mem_total{{instance=~"$Instance", IsHeadNode="true", {global_filters}}} * 100) by (instance)', - legend="Memory Used: {{instance}} (head)", + expr='sum(ray_node_mem_used{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}/ray_node_mem_total{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}} * 100) by (instance, RayNodeType)', + legend="Memory Used: {{instance}} ({{RayNodeType}})", ), ], fill=0, stack=False, ), Panel( - id=44, - title="Node Out of Memory Failures by Name", - description="The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and the name. https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html.", - unit="failures", + id=18, + title="Node GPU Memory (GRAM)", + description="The physical (hardware) GPU memory usage for each node. 
The dotted line means the total amount of GPU memory from the cluster.", + unit="bytes", targets=[ Target( - expr='sum(ray_memory_manager_worker_eviction_total{{instance=~"$Instance",{global_filters}}}) by (Name, instance)', - legend="OOM Killed: {{Name}}, {{instance}}", + expr='sum(ray_node_gram_used{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}} * 1024 * 1024) by (instance, RayNodeType, GpuIndex, GpuDeviceName)', + legend="Used GRAM: {{instance}} ({{RayNodeType}}), gpu.{{GpuIndex}}, {{GpuDeviceName}}", + ), + Target( + expr='(sum(ray_node_gram_available{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}})) * 1024 * 1024', + legend="MAX", ), ], ), Panel( - id=34, - title="Node Memory by Component", - description="The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS-SHM per Ray component, which corresponds to an approximate memory usage per proc. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", + id=6, + title="Node Disk", + description="Node's physical (hardware) disk usage. The dotted line means the total amount of disk space from the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage from the host machine. 
", unit="bytes", targets=[ Target( - expr='(sum(ray_component_rss_mb{{instance=~"$Instance",{global_filters}}} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{{instance=~"$Instance",{global_filters}}}) by (Component))', - legend="{{Component}}", - ), - Target( - expr='sum(ray_node_mem_shared_bytes{{instance=~"$Instance",{global_filters}}})', - legend="shared_memory", + expr='sum(ray_node_disk_usage{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Disk Used: {{instance}} ({{RayNodeType}})", ), Target( - expr='sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})', + expr='sum(ray_node_disk_free{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}})', legend="MAX", ), ], ), Panel( - id=37, - title="Node CPU by Component", - description="The physical (hardware) CPU usage across the cluster, broken down by component. This reports the summed CPU usage per Ray component. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", - unit="cores", + id=32, + title="Node Disk IO Speed", + description="Disk IO per node.", + unit="Bps", targets=[ Target( - # ray_component_cpu_percentage returns a percentage that can be > 100. It means that it uses more than 1 CPU. 
- expr='sum(ray_component_cpu_percentage{{instance=~"$Instance",{global_filters}}}) by (Component) / 100', - legend="{{Component}}", + expr='sum(ray_node_disk_io_write_speed{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Write: {{instance}} ({{RayNodeType}})", ), Target( - expr='sum(ray_node_cpu_count{{instance=~"$Instance",{global_filters}}})', - legend="MAX", + expr='sum(ray_node_disk_io_read_speed{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Read: {{instance}} ({{RayNodeType}})", ), ], ), Panel( - id=18, - title="Node GPU Memory (GRAM)", - description="The physical (hardware) GPU memory usage for each node. The dotted line means the total amount of GPU memory from the cluster.", - unit="bytes", + id=20, + title="Node Network", + description="Network speed per node", + unit="Bps", targets=[ Target( - expr='sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}} * 1024 * 1024) by (instance, GpuIndex, GpuDeviceName)', - legend="Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", + expr='sum(ray_node_network_receive_speed{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Recv: {{instance}} ({{RayNodeType}})", ), Target( - expr='(sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}})) * 1024 * 1024', - legend="MAX", + expr='sum(ray_node_network_send_speed{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Send: {{instance}} ({{RayNodeType}})", ), ], ), +] + +NODE_TPU_UTILIZATION_PANELS = [ Panel( id=50, - title="Node TPU Tensorcore Utilization (Percentage)", + title="Node TPU Tensorcore Utilization %", description="Percentage of tensorcore utilization for the TPUs on this node. 
Computed by dividing the number of tensorcore operations by the maximum supported number of operations during the sample period.", unit="%", targets=[ @@ -394,7 +486,7 @@ def max_plus_pending(max_resource, pending_resource): ), Panel( id=51, - title="Node TPU High Bandwidth Memory Utilization (Percentage)", + title="Node TPU High Bandwidth Memory Utilization %", description="Percentage of bandwidth memory utilization for the TPUs on this node. Computed by dividing the memory bandwidth used by the maximum supported memory bandwidth limit during the sample period.", unit="%", targets=[ @@ -406,7 +498,7 @@ def max_plus_pending(max_resource, pending_resource): ), Panel( id=52, - title="Node TPU Duty Cycle (Percentage)", + title="Node TPU Duty Cycle %", description="Percentage of time over the sample period during which the TPU is actively processing.", unit="%", targets=[ @@ -432,116 +524,51 @@ def max_plus_pending(max_resource, pending_resource): ), ], ), - Panel( - id=20, - title="Node Network", - description="Network speed per node", - unit="Bps", - targets=[ - Target( - expr='sum(ray_node_network_receive_speed{{instance=~"$Instance",{global_filters}}}) by (instance)', - legend="Recv: {{instance}}", - ), - Target( - expr='sum(ray_node_network_send_speed{{instance=~"$Instance",{global_filters}}}) by (instance)', - legend="Send: {{instance}}", - ), - ], +] + +DEFAULT_GRAFANA_ROWS = [ + Row( + title="Overview and Health", + id=1001, + panels=OVERVIEW_AND_HEALTH_PANELS, + collapsed=False, ), - Panel( - id=24, - title="Node Count", - description='Note: not impacted by "Instance" variable.\n\nA total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. 
The node will get into the failed state if it cannot be provided (e.g., there\'s no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. The node is unavailable now because it is being provisioned and initialized.', - unit="nodes", - targets=[ - Target( - expr="sum(autoscaler_active_nodes{{{global_filters}}}) by (NodeType)", - legend="Active Nodes: {{NodeType}}", - ), - Target( - expr="sum(autoscaler_recently_failed_nodes{{{global_filters}}}) by (NodeType)", - legend="Failed Nodes: {{NodeType}}", - ), - Target( - expr="sum(autoscaler_pending_nodes{{{global_filters}}}) by (NodeType)", - legend="Pending Nodes: {{NodeType}}", - ), - ], + Row( + title="Ray Tasks, Actors and Placement Groups", + id=1002, + panels=RAY_TASKS_ACTORS_PLACEMENT_GROUPS_PANELS, + collapsed=False, ), - Panel( - id=41, - title="Cluster Utilization", - description="Aggregated utilization of all physical resources (CPU, GPU, memory, disk, or etc.) 
across the cluster.", - unit="%", - targets=[ - # CPU - Target( - expr='avg(ray_node_cpu_utilization{{instance=~"$Instance",{global_filters}}})', - legend="CPU (physical)", - ), - # GPU - Target( - expr='sum(ray_node_gpus_utilization{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_gpus_available{{instance=~"$Instance",{global_filters}}}) or vector(0))', - legend="GPU (physical)", - ), - # Memory - Target( - expr='sum(ray_node_mem_used{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})) * 100', - legend="Memory (RAM)", - ), - # GRAM - Target( - expr='sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}})) * 100', - legend="GRAM", - ), - # Object Store - Target( - expr='sum(ray_object_store_memory{{instance=~"$Instance",{global_filters}}}) / on() sum(ray_resources{{Name="object_store_memory",instance=~"$Instance",{global_filters}}}) * 100', - legend="Object Store Memory", - ), - # Disk - Target( - expr='sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_disk_free{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}})) * 100', - legend="Disk", - ), - ], - fill=0, - stack=False, + Row( + title="Ray Resources", + id=1003, + panels=RAY_RESOURCES_PANELS, + collapsed=False, ), - Panel( - id=45, - title="Node GPU by Component", - description="The physical (hardware) GPU usage across the cluster, broken down by component. 
This reports the summed GPU usage per Ray component.", - unit="GPUs", - targets=[ - Target( - expr="sum(ray_component_gpu_percentage{{{global_filters}}} / 100) by (Component)", - legend="{{Component}}", - ), - ], + Row( + title="Hardware Utilization by Ray Component", + id=1004, + panels=NODE_HARDWARE_UTILIZATION_BY_RAY_COMPONENT_PANELS, + collapsed=False, ), - Panel( - id=46, - title="Node GPU Memory by Component", - description="The physical (hardware) GPU memory usage across the cluster, broken down by component. This reports the summed GPU memory usage per Ray component.", - unit="bytes", - targets=[ - Target( - expr="sum(ray_component_gpu_memory_mb{{{global_filters}}}) by (Component)", - legend="{{Component}}", - ), - Target( - expr='(sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}}))*1024*1024', - legend="MAX", - ), - ], + Row( + title="Hardware Utilization by Node", + id=1005, + panels=NODE_HARDWARE_UTILIZATION_PANELS, + collapsed=False, + ), + Row( + title="TPU Utilization by Node", + id=1006, + panels=NODE_TPU_UTILIZATION_PANELS, + collapsed=True, ), ] - ids = [] -for panel in DEFAULT_GRAFANA_PANELS: - ids.append(panel.id) +for row in DEFAULT_GRAFANA_ROWS: + ids.append(row.id) + ids.extend(panel.id for panel in row.panels) assert len(ids) == len( set(ids) ), f"Duplicated id found. Use unique id for each panel. 
{ids}" @@ -549,7 +576,7 @@ def max_plus_pending(max_resource, pending_resource): default_dashboard_config = DashboardConfig( name="DEFAULT", default_uid="rayDefaultDashboard", - panels=DEFAULT_GRAFANA_PANELS, + rows=DEFAULT_GRAFANA_ROWS, standard_global_filters=[ 'SessionName=~"$SessionName"', 'ray_io_cluster=~"$Cluster"', diff --git a/python/ray/dashboard/modules/metrics/dashboards/default_grafana_dashboard_base.json b/python/ray/dashboard/modules/metrics/dashboards/default_grafana_dashboard_base.json index 76cf304f21b0..3771296cd2a1 100644 --- a/python/ray/dashboard/modules/metrics/dashboards/default_grafana_dashboard_base.json +++ b/python/ray/dashboard/modules/metrics/dashboards/default_grafana_dashboard_base.json @@ -47,7 +47,7 @@ }, "datasource": "${datasource}", "definition": "label_values(ray_node_network_receive_speed{{{global_filters}}}, SessionName)", - "description": "Filter queries to specific ray sessions.", + "description": "Filter queries to specific Ray sessions.", "error": null, "hide": 0, "includeAll": true, @@ -78,7 +78,7 @@ }, "datasource": "${datasource}", "definition": "label_values(ray_node_network_receive_speed{{SessionName=~\"$SessionName\",{global_filters}}}, instance)", - "description": null, + "description": "Filter queries to specific Ray nodes by their IP address.", "error": null, "hide": 0, "includeAll": true, @@ -106,7 +106,7 @@ }, "datasource": "${datasource}", "definition": "label_values(ray_node_network_receive_speed{{{global_filters}}}, ray_io_cluster)", - "description": "Filter queries to specific Ray clusters for KubeRay. When ingesting metrics across multiple ray clusters, the ray_io_cluster label should be set per cluster. For KubeRay users, this is done automaticaly with Prometheus PodMonitor.", + "description": "Filter queries to specific Ray clusters for KubeRay. When ingesting metrics across multiple Ray clusters, the ray_io_cluster label should be set per cluster. 
For KubeRay users, this is done automatically with Prometheus PodMonitor.", "error": null, "hide": 0, "includeAll": true, @@ -127,9 +127,43 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "description": "Filter queries to specific Ray node types (head or worker).", + "includeAll": true, + "multi": true, + "name": "RayNodeType", + "options": [ + { + "selected": false, + "text": "All", + "value": "$__all" + }, + { + "selected": false, + "text": "Head Node", + "value": "head" + }, + { + "selected": false, + "text": "Worker Node", + "value": "worker" + } + ], + "query": "head, worker", + "type": "custom" } ] }, + "rayMeta": ["supportsFullGrafanaView"], "time": { "from": "now-30m", "to": "now" diff --git a/python/ray/dashboard/modules/metrics/dashboards/serve_llm_dashboard_panels.py b/python/ray/dashboard/modules/metrics/dashboards/serve_llm_dashboard_panels.py index dff60f13a117..cc876bd17771 100644 --- a/python/ray/dashboard/modules/metrics/dashboards/serve_llm_dashboard_panels.py +++ b/python/ray/dashboard/modules/metrics/dashboards/serve_llm_dashboard_panels.py @@ -274,6 +274,43 @@ stack=False, grid_pos=GridPos(12, 40, 12, 8), ), + Panel( + id=28, + title="vLLM: Prefix Cache Hit Rate", + description="Percentage of prefix cache queries that resulted in a cache hit (GPU).", + unit="percentunit", + targets=[ + Target( + expr='increase(ray_vllm:gpu_prefix_cache_hits_total{{model_name=~"$vllm_model_name", WorkerId=~"$workerid", {global_filters}}}[30s]) / increase(ray_vllm:gpu_prefix_cache_queries_total{{model_name=~"$vllm_model_name", WorkerId=~"$workerid", {global_filters}}}[30s])', + legend="GPU: {{model_name}} - {{WorkerId}}", + ), + ], + fill=1, + linewidth=2, + stack=False, + grid_pos=GridPos(0, 48, 12, 8), + ), + Panel( + id=27, + title="Tokens Per Request Per Model Last 7 Days", + description="", + unit="Tokens", + targets=[ + Target( + expr='sum by (model_name) 
(delta(ray_vllm:prompt_tokens_total{{WorkerId=~"$workerid", {global_filters}}}[1w])) / sum by (model_name) (delta(ray_vllm:request_success_total{{WorkerId=~"$workerid", {global_filters}}}[1w]))', + legend="In: {{ model_name}}", + ), + Target( + expr='sum by (model_name) (delta(ray_vllm:generation_tokens_total{{WorkerId=~"$workerid", {global_filters}}}[1w])) / sum by (model_name) (delta(ray_vllm:request_success_total{{WorkerId=~"$workerid", {global_filters}}}[1w]))', + legend="Out: {{ model_name}}", + ), + ], + fill=1, + linewidth=2, + stack=False, + grid_pos=GridPos(12, 48, 12, 8), + template=PanelTemplate.GAUGE, + ), Panel( id=14, title="Tokens Last 24 Hours", @@ -292,7 +329,7 @@ fill=1, linewidth=2, stack=False, - grid_pos=GridPos(0, 48, 12, 8), + grid_pos=GridPos(0, 56, 12, 8), template=PanelTemplate.STAT, ), Panel( @@ -313,7 +350,7 @@ fill=1, linewidth=2, stack=False, - grid_pos=GridPos(12, 48, 12, 8), + grid_pos=GridPos(12, 56, 12, 8), template=PanelTemplate.STAT, ), Panel( @@ -330,7 +367,7 @@ fill=1, linewidth=2, stack=False, - grid_pos=GridPos(12, 56, 12, 8), + grid_pos=GridPos(0, 64, 12, 8), template=PanelTemplate.PIE_CHART, ), Panel( @@ -347,7 +384,7 @@ fill=1, linewidth=2, stack=False, - grid_pos=GridPos(0, 64, 12, 8), + grid_pos=GridPos(12, 64, 12, 8), template=PanelTemplate.STAT, ), Panel( @@ -364,7 +401,7 @@ fill=1, linewidth=2, stack=False, - grid_pos=GridPos(12, 64, 12, 8), + grid_pos=GridPos(0, 72, 12, 8), template=PanelTemplate.STAT, ), Panel( @@ -381,7 +418,7 @@ fill=1, linewidth=2, stack=False, - grid_pos=GridPos(0, 72, 12, 8), + grid_pos=GridPos(12, 72, 12, 8), template=PanelTemplate.STAT, ), Panel( @@ -398,7 +435,7 @@ fill=1, linewidth=2, stack=False, - grid_pos=GridPos(12, 72, 12, 8), + grid_pos=GridPos(0, 80, 12, 8), template=PanelTemplate.GAUGE, ), Panel( @@ -415,7 +452,7 @@ fill=1, linewidth=2, stack=False, - grid_pos=GridPos(0, 80, 12, 8), + grid_pos=GridPos(12, 80, 12, 8), template=PanelTemplate.GAUGE, ), Panel( @@ -432,7 +469,7 @@ 
fill=1, linewidth=2, stack=False, - grid_pos=GridPos(12, 80, 12, 8), + grid_pos=GridPos(0, 88, 12, 8), template=PanelTemplate.GAUGE, ), Panel( @@ -453,27 +490,6 @@ fill=1, linewidth=2, stack=False, - grid_pos=GridPos(0, 88, 12, 8), - template=PanelTemplate.GAUGE, - ), - Panel( - id=27, - title="Tokens Per Request Per Model Last 7 Days", - description="", - unit="Tokens", - targets=[ - Target( - expr='sum by (model_name) (delta(ray_vllm:prompt_tokens_total{{WorkerId=~"$workerid", {global_filters}}}[1w])) / sum by (model_name) (delta(ray_vllm:request_success_total{{WorkerId=~"$workerid", {global_filters}}}[1w]))', - legend="In: {{ model_name}}", - ), - Target( - expr='sum by (model_name) (delta(ray_vllm:generation_tokens_total{{WorkerId=~"$workerid", {global_filters}}}[1w])) / sum by (model_name) (delta(ray_vllm:request_success_total{{WorkerId=~"$workerid", {global_filters}}}[1w]))', - legend="Out: {{ model_name}}", - ), - ], - fill=1, - linewidth=2, - stack=False, grid_pos=GridPos(12, 88, 12, 8), template=PanelTemplate.GAUGE, ), diff --git a/python/ray/dashboard/modules/metrics/dashboards/train_dashboard_panels.py b/python/ray/dashboard/modules/metrics/dashboards/train_dashboard_panels.py index ed53c44ee4bd..d8f0be9a5f1d 100644 --- a/python/ray/dashboard/modules/metrics/dashboards/train_dashboard_panels.py +++ b/python/ray/dashboard/modules/metrics/dashboards/train_dashboard_panels.py @@ -2,27 +2,13 @@ from ray.dashboard.modules.metrics.dashboards.common import ( DashboardConfig, Panel, - Target, Row, + Target, ) - -class PanelId: - """ - A class to generate unique panel IDs. 
- """ - - id = 0 - - @staticmethod - def next(): - PanelId.id += 1 - return PanelId.id - - # Ray Train Metrics (Controller) CONTROLLER_STATE_PANEL = Panel( - id=PanelId.next(), + id=1, title="Controller State", description="Current state of the train controller.", unit="", @@ -35,7 +21,7 @@ def next(): ) CONTROLLER_OPERATION_TIME_PANEL = Panel( - id=PanelId.next(), + id=2, title="Controller Operation Time", description="Time taken by the controller for worker group operations.", unit="seconds", @@ -55,7 +41,7 @@ def next(): # Ray Train Metrics (Worker) WORKER_CHECKPOINT_REPORT_TIME_PANEL = Panel( - id=PanelId.next(), + id=3, title="Checkpoint Report Time", description="Time taken to report a checkpoint to storage.", unit="seconds", @@ -71,52 +57,52 @@ def next(): # Core System Resources CPU_UTILIZATION_PANEL = Panel( - id=PanelId.next(), + id=4, title="CPU Usage", description="CPU core utilization across all workers.", unit="cores", targets=[ Target( - expr='sum(ray_node_cpu_utilization{{instance=~"$Instance", {global_filters}}} * ray_node_cpu_count{{instance=~"$Instance", {global_filters}}} / 100) by (instance)', - legend="CPU Usage: {{instance}}", + expr='sum(ray_node_cpu_utilization{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}} * ray_node_cpu_count{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}} / 100) by (instance, RayNodeType)', + legend="CPU Usage: {{instance}} ({{RayNodeType}})", ), Target( - expr='sum(ray_node_cpu_count{{instance=~"$Instance", {global_filters}}})', + expr='sum(ray_node_cpu_count{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}})', legend="MAX", ), ], ) MEMORY_UTILIZATION_PANEL = Panel( - id=PanelId.next(), + id=5, title="Total Memory Usage", description="Total physical memory used vs total available memory.", unit="bytes", targets=[ Target( - expr='sum(ray_node_mem_used{{instance=~"$Instance", {global_filters}}}) by (instance)', - legend="Memory Used: {{instance}}", + 
expr='sum(ray_node_mem_used{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Memory Used: {{instance}} ({{RayNodeType}})", ), Target( - expr='sum(ray_node_mem_total{{instance=~"$Instance", {global_filters}}})', + expr='sum(ray_node_mem_total{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}})', legend="MAX", ), ], ) MEMORY_DETAILED_PANEL = Panel( - id=PanelId.next(), + id=6, title="Memory Allocation Details", description="Memory allocation details including available and shared memory.", unit="bytes", targets=[ Target( - expr='sum(ray_node_mem_available{{instance=~"$Instance", {global_filters}}}) by (instance)', - legend="Available Memory: {{instance}}", + expr='sum(ray_node_mem_available{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Available Memory: {{instance}} ({{RayNodeType}})", ), Target( - expr='sum(ray_node_mem_shared_bytes{{instance=~"$Instance", {global_filters}}}) by (instance)', - legend="Shared Memory: {{instance}}", + expr='sum(ray_node_mem_shared_bytes{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Shared Memory: {{instance}} ({{RayNodeType}})", ), ], ) @@ -124,34 +110,34 @@ def next(): # GPU Resources # TODO: Add GPU Device/Index as a filter. 
GPU_UTILIZATION_PANEL = Panel( - id=PanelId.next(), + id=7, title="GPU Usage", description="GPU utilization across all workers.", unit="GPUs", targets=[ Target( - expr='sum(ray_node_gpus_utilization{{instance=~"$Instance", GpuIndex=~"$GpuIndex", GpuDeviceName=~"$GpuDeviceName", {global_filters}}} / 100) by (instance, GpuIndex, GpuDeviceName)', - legend="GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", + expr='sum(ray_node_gpus_utilization{{instance=~"$Instance", RayNodeType=~"$RayNodeType", GpuIndex=~"$GpuIndex", GpuDeviceName=~"$GpuDeviceName", {global_filters}}} / 100) by (instance, RayNodeType, GpuIndex, GpuDeviceName)', + legend="GPU Usage: {{instance}} ({{RayNodeType}}), gpu.{{GpuIndex}}, {{GpuDeviceName}}", ), Target( - expr='sum(ray_node_gpus_available{{instance=~"$Instance", GpuIndex=~"$GpuIndex", GpuDeviceName=~"$GpuDeviceName", {global_filters}}})', + expr='sum(ray_node_gpus_available{{instance=~"$Instance", RayNodeType=~"$RayNodeType", GpuIndex=~"$GpuIndex", GpuDeviceName=~"$GpuDeviceName", {global_filters}}})', legend="MAX", ), ], ) GPU_MEMORY_UTILIZATION_PANEL = Panel( - id=PanelId.next(), + id=8, title="GPU Memory Usage", description="GPU memory usage across all workers.", unit="bytes", targets=[ Target( - expr='sum(ray_node_gram_used{{instance=~"$Instance", GpuIndex=~"$GpuIndex", GpuDeviceName=~"$GpuDeviceName", {global_filters}}} * 1024 * 1024) by (instance, GpuIndex, GpuDeviceName)', - legend="Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", + expr='sum(ray_node_gram_used{{instance=~"$Instance", RayNodeType=~"$RayNodeType", GpuIndex=~"$GpuIndex", GpuDeviceName=~"$GpuDeviceName", {global_filters}}} * 1024 * 1024) by (instance, RayNodeType, GpuIndex, GpuDeviceName)', + legend="Used GRAM: {{instance}} ({{RayNodeType}}), gpu.{{GpuIndex}}, {{GpuDeviceName}}", ), Target( - expr='(sum(ray_node_gram_available{{instance=~"$Instance", GpuIndex=~"$GpuIndex", GpuDeviceName=~"$GpuDeviceName", {global_filters}}}) + 
sum(ray_node_gram_used{{instance=~"$Instance", GpuIndex=~"$GpuIndex", GpuDeviceName=~"$GpuDeviceName", {global_filters}}})) * 1024 * 1024', + expr='(sum(ray_node_gram_available{{instance=~"$Instance", RayNodeType=~"$RayNodeType", GpuIndex=~"$GpuIndex", GpuDeviceName=~"$GpuDeviceName", {global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance", RayNodeType=~"$RayNodeType", GpuIndex=~"$GpuIndex", GpuDeviceName=~"$GpuDeviceName", {global_filters}}})) * 1024 * 1024', legend="MAX", ), ], @@ -159,87 +145,87 @@ def next(): # Storage Resources DISK_UTILIZATION_PANEL = Panel( - id=PanelId.next(), + id=9, title="Disk Space Usage", description="Disk space usage across all workers.", unit="bytes", targets=[ Target( - expr='sum(ray_node_disk_usage{{instance=~"$Instance", {global_filters}}}) by (instance)', - legend="Disk Used: {{instance}}", + expr='sum(ray_node_disk_usage{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Disk Used: {{instance}} ({{RayNodeType}})", ), Target( - expr='sum(ray_node_disk_free{{instance=~"$Instance", {global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance", {global_filters}}})', + expr='sum(ray_node_disk_free{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}})', legend="MAX", ), ], ) DISK_THROUGHPUT_PANEL = Panel( - id=PanelId.next(), + id=10, title="Disk Throughput", description="Current disk read/write throughput.", unit="Bps", targets=[ Target( - expr='sum(ray_node_disk_io_read_speed{{instance=~"$Instance", {global_filters}}}) by (instance)', - legend="Read Speed: {{instance}}", + expr='sum(ray_node_disk_io_read_speed{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Read Speed: {{instance}} ({{RayNodeType}})", ), Target( - expr='sum(ray_node_disk_io_write_speed{{instance=~"$Instance", 
{global_filters}}}) by (instance)', - legend="Write Speed: {{instance}}", + expr='sum(ray_node_disk_io_write_speed{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Write Speed: {{instance}} ({{RayNodeType}})", ), ], ) DISK_OPERATIONS_PANEL = Panel( - id=PanelId.next(), + id=11, title="Disk Operations", description="Current disk read/write operations per second.", unit="ops/s", targets=[ Target( - expr='sum(ray_node_disk_read_iops{{instance=~"$Instance", {global_filters}}}) by (instance)', - legend="Read IOPS: {{instance}}", + expr='sum(ray_node_disk_read_iops{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Read IOPS: {{instance}} ({{RayNodeType}})", ), Target( - expr='sum(ray_node_disk_write_iops{{instance=~"$Instance", {global_filters}}}) by (instance)', - legend="Write IOPS: {{instance}}", + expr='sum(ray_node_disk_write_iops{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Write IOPS: {{instance}} ({{RayNodeType}})", ), ], ) # Network Resources NETWORK_THROUGHPUT_PANEL = Panel( - id=PanelId.next(), + id=12, title="Network Throughput", description="Current network send/receive throughput.", unit="Bps", targets=[ Target( - expr='sum(ray_node_network_receive_speed{{instance=~"$Instance", {global_filters}}}) by (instance)', - legend="Receive Speed: {{instance}}", + expr='sum(ray_node_network_receive_speed{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Receive Speed: {{instance}} ({{RayNodeType}})", ), Target( - expr='sum(ray_node_network_send_speed{{instance=~"$Instance", {global_filters}}}) by (instance)', - legend="Send Speed: {{instance}}", + expr='sum(ray_node_network_send_speed{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Send Speed: 
{{instance}} ({{RayNodeType}})", ), ], ) NETWORK_TOTAL_PANEL = Panel( - id=PanelId.next(), + id=13, title="Network Total Traffic", description="Total network traffic sent/received.", unit="bytes", targets=[ Target( - expr='sum(ray_node_network_sent{{instance=~"$Instance", {global_filters}}}) by (instance)', - legend="Total Sent: {{instance}}", + expr='sum(ray_node_network_sent{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Total Sent: {{instance}} ({{RayNodeType}})", ), Target( - expr='sum(ray_node_network_received{{instance=~"$Instance", {global_filters}}}) by (instance)', - legend="Total Received: {{instance}}", + expr='sum(ray_node_network_received{{instance=~"$Instance", RayNodeType=~"$RayNodeType", {global_filters}}}) by (instance, RayNodeType)', + legend="Total Received: {{instance}} ({{RayNodeType}})", ), ], ) @@ -250,7 +236,7 @@ def next(): # Train Metrics Row Row( title="Train Metrics", - id=PanelId.next(), + id=14, panels=[ # Ray Train Metrics (Controller) CONTROLLER_STATE_PANEL, @@ -263,7 +249,7 @@ def next(): # System Resources Row Row( title="Resource Utilization", - id=PanelId.next(), + id=15, panels=[ CPU_UTILIZATION_PANEL, MEMORY_UTILIZATION_PANEL, diff --git a/python/ray/dashboard/modules/metrics/dashboards/train_grafana_dashboard_base.json b/python/ray/dashboard/modules/metrics/dashboards/train_grafana_dashboard_base.json index d94d8816ad99..82570ed428e8 100644 --- a/python/ray/dashboard/modules/metrics/dashboards/train_grafana_dashboard_base.json +++ b/python/ray/dashboard/modules/metrics/dashboards/train_grafana_dashboard_base.json @@ -227,8 +227,41 @@ "text": ["All"], "value": ["$__all"] } - } + }, + { + "current": { + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "description": "Filter queries to specific Ray node types (head or worker).", + "includeAll": true, + "multi": true, + "name": "RayNodeType", + "options": [ + { + "selected": false, + "text": "All", + 
"value": "$__all" + }, + { + "selected": false, + "text": "Head Node", + "value": "head" + }, + { + "selected": false, + "text": "Worker Node", + "value": "worker" + } + ], + "query": "head, worker", + "type": "custom" + } ] } } diff --git a/python/ray/dashboard/modules/metrics/grafana_dashboard_factory.py b/python/ray/dashboard/modules/metrics/grafana_dashboard_factory.py index 2d57abe853ce..0ffa6b1cec15 100644 --- a/python/ray/dashboard/modules/metrics/grafana_dashboard_factory.py +++ b/python/ray/dashboard/modules/metrics/grafana_dashboard_factory.py @@ -66,7 +66,10 @@ def _read_configs_for_dashboard( ) or "" ) - global_filters = global_filters_str.split(",") + if global_filters_str == "": + global_filters = [] + else: + global_filters = global_filters_str.split(",") return uid, global_filters diff --git a/python/ray/dashboard/modules/node/datacenter.py b/python/ray/dashboard/modules/node/datacenter.py index dcdd0c286060..3fa86de65e92 100644 --- a/python/ray/dashboard/modules/node/datacenter.py +++ b/python/ray/dashboard/modules/node/datacenter.py @@ -198,22 +198,18 @@ async def get_actor_infos(cls, actor_ids: Optional[List[str]] = None): } @staticmethod - async def _get_actor_info(actor): + async def _get_actor_info(actor: Optional[dict]) -> Optional[dict]: if actor is None: return None - actor = dict(actor) + actor = actor.copy() worker_id = actor["address"]["workerId"] core_worker_stats = DataSource.core_worker_stats.get(worker_id, {}) - actor_constructor = core_worker_stats.get( - "actorTitle", "Unknown actor constructor" - ) - actor["actorConstructor"] = actor_constructor actor.update(core_worker_stats) # TODO(fyrestone): remove this, give a link from actor # info to worker info in front-end. 
- node_id = actor["address"]["rayletId"] + node_id = actor["address"]["nodeId"] pid = core_worker_stats.get("pid") node_physical_stats = DataSource.node_physical_stats.get(node_id, {}) actor_process_stats = None @@ -225,7 +221,7 @@ async def _get_actor_info(actor): break for gpu_stats in node_physical_stats.get("gpus", []): - # gpu_stats.get("processes") can be None, an empty list or a + # gpu_stats.get("processesPids") can be None, an empty list or a # list of dictionaries. for process in gpu_stats.get("processesPids") or []: if process["pid"] == pid: diff --git a/python/ray/dashboard/modules/node/node_head.py b/python/ray/dashboard/modules/node/node_head.py index bc696afc343f..d774ca7cac36 100644 --- a/python/ray/dashboard/modules/node/node_head.py +++ b/python/ray/dashboard/modules/node/node_head.py @@ -87,7 +87,7 @@ def _actor_table_data_to_dict(message): "parentId", "jobId", "workerId", - "rayletId", + "nodeId", "callerId", "taskId", "parentTaskId", @@ -576,7 +576,7 @@ async def _update_actors(self): # Update node actors and job actors. node_actors = defaultdict(dict) for actor_id_bytes, updated_actor_table in actor_dicts.items(): - node_id = updated_actor_table["address"]["rayletId"] + node_id = updated_actor_table["address"]["nodeId"] # Update only when node_id is not Nil. 
if node_id != actor_consts.NIL_NODE_ID: node_actors[node_id][actor_id_bytes] = updated_actor_table @@ -653,7 +653,7 @@ def _process_updated_actor_table( actor_table_data = actor actor_id = actor_table_data["actorId"] - node_id = actor_table_data["address"]["rayletId"] + node_id = actor_table_data["address"]["nodeId"] if actor_table_data["state"] == "DEAD": self._destroyed_actors_queue.append(actor_id) @@ -688,7 +688,7 @@ async def _cleanup_actors(self): actor_id = self._destroyed_actors_queue.popleft() if actor_id in DataSource.actors: actor = DataSource.actors.pop(actor_id) - node_id = actor["address"].get("rayletId") + node_id = actor["address"].get("nodeId") if node_id and node_id != actor_consts.NIL_NODE_ID: del DataSource.node_actors[node_id][actor_id] await asyncio.sleep(ACTOR_CLEANUP_FREQUENCY) diff --git a/python/ray/dashboard/modules/node/tests/test_actor.py b/python/ray/dashboard/modules/node/tests/test_actor.py index 3b8c4cbaf888..e374a28a2c0a 100644 --- a/python/ray/dashboard/modules/node/tests/test_actor.py +++ b/python/ray/dashboard/modules/node/tests/test_actor.py @@ -7,7 +7,6 @@ import requests import ray -import ray.dashboard.utils as dashboard_utils from ray._private.test_utils import format_web_url, wait_until_server_available from ray.dashboard.modules.node import actor_consts from ray.dashboard.tests.conftest import * # noqa @@ -102,7 +101,7 @@ def get_placement_group_id(self): assert "Foo" in actor_response["className"] assert "address" in actor_response assert type(actor_response["address"]) is dict - assert actor_response["address"]["rayletId"] == node_id + assert actor_response["address"]["nodeId"] == node_id assert actor_response["state"] == "ALIVE" assert actor_response["name"] == "first" assert actor_response["numRestarts"] == "0" @@ -239,98 +238,6 @@ def get_actor_id(self): raise Exception(f"Timed out while testing, {ex_stack}") -def test_actor_pubsub(disable_aiohttp_cache, ray_start_with_dashboard): - timeout = 5 - assert 
wait_until_server_available(ray_start_with_dashboard["webui_url"]) - address_info = ray_start_with_dashboard - - sub = ray._raylet._TestOnly_GcsActorSubscriber(address=address_info["gcs_address"]) - sub.subscribe() - - @ray.remote - class DummyActor: - def __init__(self): - pass - - # Create a dummy actor. - a = DummyActor.remote() - - def handle_pub_messages(msgs, timeout, expect_num): - start_time = time.time() - while time.time() - start_time < timeout and len(msgs) < expect_num: - published = sub.poll(timeout=timeout) - for _, actor_data in published: - if actor_data is None: - continue - msgs.append(actor_data) - - msgs = [] - handle_pub_messages(msgs, timeout, 3) - # Assert we received published actor messages with state - # DEPENDENCIES_UNREADY, PENDING_CREATION and ALIVE. - assert len(msgs) == 3, msgs - - # Kill actor. - ray.kill(a) - handle_pub_messages(msgs, timeout, 4) - - # Assert we received published actor messages with state DEAD. - assert len(msgs) == 4 - - def actor_table_data_to_dict(message): - return dashboard_utils.message_to_dict( - message, - { - "actorId", - "parentId", - "jobId", - "workerId", - "rayletId", - "callerId", - "taskId", - "parentTaskId", - "sourceActorId", - "placementGroupId", - }, - always_print_fields_with_no_presence=False, - ) - - non_state_keys = ("actorId", "jobId") - - for msg in msgs: - actor_data_dict = actor_table_data_to_dict(msg) - # DEPENDENCIES_UNREADY is 0, which would not be kept in dict. We - # need check its original value. - if msg.state == 0: - assert len(actor_data_dict) > 5 - for k in non_state_keys: - assert k in actor_data_dict - # For status that is not DEPENDENCIES_UNREADY, only states fields will - # be published. 
- elif actor_data_dict["state"] in ("ALIVE", "DEAD"): - assert actor_data_dict.keys() >= { - "state", - "address", - "timestamp", - "pid", - "rayNamespace", - } - elif actor_data_dict["state"] == "PENDING_CREATION": - assert actor_data_dict.keys() == { - "state", - "address", - "actorId", - "jobId", - "ownerAddress", - "className", - "serializedRuntimeEnv", - "rayNamespace", - "functionDescriptor", - } - else: - raise Exception("Unknown state: {}".format(actor_data_dict["state"])) - - def test_nil_node(enable_test_module, disable_aiohttp_cache, ray_start_with_dashboard): assert wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True webui_url = ray_start_with_dashboard["webui_url"] diff --git a/python/ray/dashboard/modules/node/tests/test_node.py b/python/ray/dashboard/modules/node/tests/test_node.py index 3016d65491d0..df4f980f4125 100644 --- a/python/ray/dashboard/modules/node/tests/test_node.py +++ b/python/ray/dashboard/modules/node/tests/test_node.py @@ -8,10 +8,10 @@ from datetime import datetime, timedelta import pytest -from ray._common.test_utils import wait_for_condition import requests import ray +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( format_web_url, wait_until_server_available, diff --git a/python/ray/dashboard/modules/reporter/gpu_profile_manager.py b/python/ray/dashboard/modules/reporter/gpu_profile_manager.py index de66113b86dd..3c0fe01b7402 100644 --- a/python/ray/dashboard/modules/reporter/gpu_profile_manager.py +++ b/python/ray/dashboard/modules/reporter/gpu_profile_manager.py @@ -106,7 +106,7 @@ def node_has_gpus(cls) -> bool: try: subprocess.check_output(["nvidia-smi"], stderr=subprocess.DEVNULL) return True - except (subprocess.CalledProcessError, FileNotFoundError): + except Exception: return False @classmethod diff --git a/python/ray/dashboard/modules/reporter/gpu_providers.py b/python/ray/dashboard/modules/reporter/gpu_providers.py index b50a417c1c48..d26ea4597fa6 
100644 --- a/python/ray/dashboard/modules/reporter/gpu_providers.py +++ b/python/ray/dashboard/modules/reporter/gpu_providers.py @@ -8,8 +8,8 @@ import enum import logging import subprocess -from typing import Dict, List, Optional, Union, TypedDict from collections import defaultdict +from typing import Dict, List, Optional, TypedDict, Union from ray._private.ray_constants import RAY_METRIC_ENABLE_GPU_NVSMI @@ -230,9 +230,9 @@ def _parse_nvsmi_pmon_output( 1 7175 C 86 26 - - - - ray::TorchGPUWo 2 - - - - - - - - - - Returns a dict mapping GPU index to list of ProcessGPUInfo. + Returns a dict mapping GPU index to dict of pid to ProcessGPUInfo. """ - process_utilizations = defaultdict(list) + process_utilizations = defaultdict(dict) lines = nvsmi_stdout.splitlines() # Get the first line that is started with # table_header = None @@ -275,7 +275,7 @@ def _parse_nvsmi_pmon_output( ), # Convert percentage to MB gpu_utilization=sm, ) - process_utilizations[gpu_id].append(process_info) + process_utilizations[gpu_id][pid] = process_info return process_utilizations def _get_pynvml_gpu_usage(self) -> List[GpuUtilizationInfo]: diff --git a/python/ray/dashboard/modules/reporter/healthz_agent.py b/python/ray/dashboard/modules/reporter/healthz_agent.py index 09581852404d..cff4edaf33d1 100644 --- a/python/ray/dashboard/modules/reporter/healthz_agent.py +++ b/python/ray/dashboard/modules/reporter/healthz_agent.py @@ -3,8 +3,8 @@ import ray.dashboard.optional_utils as optional_utils import ray.dashboard.utils as dashboard_utils import ray.exceptions -from ray.dashboard.modules.reporter.utils import HealthChecker from ray._raylet import NodeID +from ray.dashboard.modules.reporter.utils import HealthChecker routes = optional_utils.DashboardAgentRouteTable diff --git a/python/ray/dashboard/modules/reporter/reporter_agent.py b/python/ray/dashboard/modules/reporter/reporter_agent.py index 877257b78b1e..663a897794e1 100644 --- a/python/ray/dashboard/modules/reporter/reporter_agent.py +++ 
b/python/ray/dashboard/modules/reporter/reporter_agent.py @@ -3,7 +3,6 @@ import json import logging import os -import requests import socket import sys import traceback @@ -11,43 +10,37 @@ from concurrent.futures import ThreadPoolExecutor from typing import List, Optional, Tuple +import requests +from grpc.aio import ServicerContext from opencensus.stats import stats as stats_module -from prometheus_client.core import REGISTRY -from prometheus_client.parser import text_string_to_metric_families from opentelemetry.proto.collector.metrics.v1 import ( metrics_service_pb2, metrics_service_pb2_grpc, ) from opentelemetry.proto.metrics.v1.metrics_pb2 import Metric -from grpc.aio import ServicerContext - +from prometheus_client.core import REGISTRY +from prometheus_client.parser import text_string_to_metric_families import ray import ray._private.prometheus_exporter as prometheus_exporter import ray.dashboard.modules.reporter.reporter_consts as reporter_consts import ray.dashboard.utils as dashboard_utils +from ray._common.network_utils import parse_address from ray._common.utils import ( get_or_create_event_loop, get_user_temp_dir, ) -from ray._common.network_utils import parse_address -from ray._private.utils import get_system_memory -from ray.dashboard.modules.reporter.gpu_providers import ( - GpuMetricProvider, - GpuUtilizationInfo, - TpuUtilizationInfo, -) from ray._private import utils from ray._private.metrics_agent import Gauge, MetricsAgent, Record from ray._private.ray_constants import ( DEBUG_AUTOSCALING_STATUS, - RAY_EXPERIMENTAL_ENABLE_OPEN_TELEMETRY_ON_AGENT, - RAY_EXPERIMENTAL_ENABLE_OPEN_TELEMETRY_ON_CORE, + RAY_ENABLE_OPEN_TELEMETRY, env_integer, ) from ray._private.telemetry.open_telemetry_metric_recorder import ( OpenTelemetryMetricRecorder, ) +from ray._private.utils import get_system_memory from ray._raylet import GCS_PID_KEY, WorkerID from ray.core.generated import reporter_pb2, reporter_pb2_grpc from ray.dashboard import k8s_utils @@ -57,10 +50,15 
@@ COMPONENT_METRICS_TAG_KEYS, GCS_RPC_TIMEOUT_SECONDS, GPU_TAG_KEYS, - TPU_TAG_KEYS, NODE_TAG_KEYS, + TPU_TAG_KEYS, ) from ray.dashboard.modules.reporter.gpu_profile_manager import GpuProfilingManager +from ray.dashboard.modules.reporter.gpu_providers import ( + GpuMetricProvider, + GpuUtilizationInfo, + TpuUtilizationInfo, +) from ray.dashboard.modules.reporter.profile_manager import ( CpuProfilingManager, MemoryProfilingManager, @@ -475,6 +473,7 @@ def __init__(self, dashboard_agent): thread_name_prefix="reporter_agent_executor", ) self._gcs_pid = None + self._gcs_proc = None self._gpu_profiling_manager = GpuProfilingManager( profile_dir_path=self._log_dir, ip_address=self._ip @@ -535,6 +534,17 @@ async def MemoryProfiling(self, request, context): output=output, success=success, warning=warning ) + async def HealthCheck( + self, + _request: reporter_pb2.HealthCheckRequest, + _context: ServicerContext, + ) -> reporter_pb2.HealthCheckReply: + """This is a health check endpoint for the reporter agent. + + It is used to check if the reporter agent is ready to receive requests. + """ + return reporter_pb2.HealthCheckReply() + async def ReportOCMetrics(self, request, context): # Do nothing if metrics collection is disabled. 
if self._metrics_collection_disabled: @@ -883,7 +893,7 @@ def _get_agent_proc(self) -> psutil.Process: def _generate_worker_key(self, proc: psutil.Process) -> Tuple[int, float]: return (proc.pid, proc.create_time()) - def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None): + def _get_worker_processes(self): raylet_proc = self._get_raylet_proc() if raylet_proc is None: return [] @@ -900,7 +910,13 @@ def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None): self._generate_worker_key(proc): proc for proc in raylet_proc.children() } + return workers + def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None): + workers = self._get_worker_processes() + if not workers: + return [] + else: # We should keep `raylet_proc.children()` in `self` because # when `cpu_percent` is first called, it returns the meaningless 0. # See more: https://github.com/ray-project/ray/issues/29848 @@ -927,7 +943,7 @@ def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None): processes = gpu.get("processes_pids") if processes: for proc in processes.values(): - gpu_pid_mapping[proc.pid].append(proc) + gpu_pid_mapping[proc["pid"]].append(proc) result = [] for w in self._workers.values(): @@ -988,9 +1004,11 @@ def _get_raylet_proc(self): def _get_gcs(self): if self._gcs_pid: - gcs_proc = psutil.Process(self._gcs_pid) - if gcs_proc: - return gcs_proc.as_dict(attrs=PSUTIL_PROCESS_ATTRS) + if not self._gcs_proc or self._gcs_pid != self._gcs_proc.pid: + self._gcs_proc = psutil.Process(self._gcs_pid) + if self._gcs_proc: + dictionary = self._gcs_proc.as_dict(attrs=PSUTIL_PROCESS_ATTRS) + return dictionary return {} def _get_raylet(self): @@ -1320,10 +1338,12 @@ def generate_worker_stats_record(self, worker_stats: List[dict]) -> List[Record] def _to_records(self, stats, cluster_stats) -> List[Record]: records_reported = [] ip = stats["ip"] - is_head_node = str(self._is_head_node).lower() + ray_node_type = "head" if self._is_head_node else 
"worker" + is_head_node = "true" if self._is_head_node else "false" # Common tags for node-level metrics - node_tags = {"ip": ip, "IsHeadNode": is_head_node} + # We use RayNodeType to mark head/worker node, IsHeadNode is retained for backward compatibility + node_tags = {"ip": ip, "RayNodeType": ray_node_type, "IsHeadNode": is_head_node} # -- Instance count of cluster -- # Only report cluster stats on head node @@ -1730,7 +1750,7 @@ def _compose_stats_payload( records = self._to_records(stats, cluster_stats) - if RAY_EXPERIMENTAL_ENABLE_OPEN_TELEMETRY_ON_AGENT: + if RAY_ENABLE_OPEN_TELEMETRY: self._open_telemetry_metric_recorder.record_and_export( records, global_tags={ @@ -1749,12 +1769,21 @@ def _compose_stats_payload( self._metrics_agent.clean_all_dead_worker_metrics() + # Convert processes_pids back to a list of dictionaries to maintain backwards-compatibility + for gpu in stats["gpus"]: + if isinstance(gpu.get("processes_pids"), dict): + gpu["processes_pids"] = list(gpu["processes_pids"].values()) + + # TODO(aguo): Add a pydantic model for this dict to maintain compatibility + # with the Ray Dashboard API and UI code. 
+ + # NOTE: This converts keys to "Google style", (e.g: "processes_pids" -> "processesPids") return jsonify_asdict(stats) async def run(self, server): if server: reporter_pb2_grpc.add_ReporterServiceServicer_to_server(self, server) - if RAY_EXPERIMENTAL_ENABLE_OPEN_TELEMETRY_ON_CORE: + if RAY_ENABLE_OPEN_TELEMETRY: metrics_service_pb2_grpc.add_MetricsServiceServicer_to_server( self, server ) diff --git a/python/ray/dashboard/modules/reporter/reporter_head.py b/python/ray/dashboard/modules/reporter/reporter_head.py index d8e76b25bb07..8971991c4dea 100644 --- a/python/ray/dashboard/modules/reporter/reporter_head.py +++ b/python/ray/dashboard/modules/reporter/reporter_head.py @@ -12,8 +12,9 @@ import ray.dashboard.optional_utils as dashboard_optional_utils import ray.dashboard.utils as dashboard_utils from ray import ActorID, NodeID -from ray._private.metrics_agent import PrometheusServiceDiscoveryWriter from ray._common.network_utils import build_address +from ray._common.usage.usage_constants import CLUSTER_METADATA_KEY +from ray._private.metrics_agent import PrometheusServiceDiscoveryWriter from ray._private.ray_constants import ( DEBUG_AUTOSCALING_ERROR, DEBUG_AUTOSCALING_STATUS, @@ -23,7 +24,6 @@ KV_NAMESPACE_DASHBOARD, env_integer, ) -from ray._common.usage.usage_constants import CLUSTER_METADATA_KEY from ray._private.utils import init_grpc_channel from ray.autoscaler._private.commands import debug_status from ray.core.generated import reporter_pb2, reporter_pb2_grpc @@ -413,21 +413,32 @@ async def get_traceback(self, req: aiohttp.web.Request) -> aiohttp.web.Response: Params: pid: Required. The PID of the worker. - ip: Required. The IP address of the node. + ip or node_id: Required. The IP address or hex ID of the node. 
""" pid = req.query.get("pid") ip = req.query.get("ip") + node_id_hex = req.query.get("node_id") if not pid: raise ValueError("pid is required") - if not ip: - raise ValueError("ip is required") + if not node_id_hex and not ip: + raise ValueError("ip or node_id is required") - addrs = await self._get_stub_address_by_ip(ip) - if not addrs: - raise aiohttp.web.HTTPInternalServerError( - text=f"Failed to get agent address for node at IP {ip}" + if node_id_hex: + addrs = await self._get_stub_address_by_node_id( + NodeID.from_hex(node_id_hex) ) + if not addrs: + raise aiohttp.web.HTTPInternalServerError( + text=f"Failed to get agent address for node at node_id {node_id_hex}" + ) + else: + addrs = await self._get_stub_address_by_ip(ip) + if not addrs: + raise aiohttp.web.HTTPInternalServerError( + text=f"Failed to get agent address for node at IP {ip}" + ) + node_id, ip, http_port, grpc_port = addrs reporter_stub = self._make_stub(build_address(ip, grpc_port)) # Default not using `--native` for profiling @@ -451,29 +462,40 @@ async def cpu_profile(self, req: aiohttp.web.Request) -> aiohttp.web.Response: Params: pid: Required. The PID of the worker. - ip: Required. The IP address of the node. + ip or node_id: Required. The IP address or hex ID of the node. duration: Optional. Duration in seconds for profiling (default: 5, max: 60). format: Optional. Output format (default: "flamegraph"). native: Optional. Whether to use native profiling (default: false). Raises: ValueError: If pid is not provided. - ValueError: If ip is not provided. + ValueError: If ip or node_id is not provided. ValueError: If duration exceeds 60 seconds. aiohttp.web.HTTPInternalServerError: If there is an internal server error during the profile retrieval. 
""" pid = req.query.get("pid") ip = req.query.get("ip") + node_id_hex = req.query.get("node_id") if not pid: raise ValueError("pid is required") - if not ip: - raise ValueError("ip is required") + if not node_id_hex and not ip: + raise ValueError("ip or node_id is required") - addrs = await self._get_stub_address_by_ip(ip) - if not addrs: - raise aiohttp.web.HTTPInternalServerError( - text=f"Failed to get agent address for node at IP {ip}" + if node_id_hex: + addrs = await self._get_stub_address_by_node_id( + NodeID.from_hex(node_id_hex) ) + if not addrs: + raise aiohttp.web.HTTPInternalServerError( + text=f"Failed to get agent address for node at node_id {node_id_hex}" + ) + else: + addrs = await self._get_stub_address_by_ip(ip) + if not addrs: + raise aiohttp.web.HTTPInternalServerError( + text=f"Failed to get agent address for node at IP {ip}" + ) + node_id, ip, http_port, grpc_port = addrs reporter_stub = self._make_stub(build_address(ip, grpc_port)) @@ -517,7 +539,7 @@ async def gpu_profile(self, req: aiohttp.web.Request) -> aiohttp.web.Response: Params: req: A request with the following query parameters: pid: Required. The PID of the GPU training worker. - ip: Required. The IP address of the node where the GPU training worker is running. + ip or node_id: Required. The IP address or hex ID of the node where the GPU training worker is running. num_iterations: Number of training steps for profiling. Defaults to 4 This is the number of calls to the torch Optimizer.step(). 
@@ -536,16 +558,27 @@ async def gpu_profile(self, req: aiohttp.web.Request) -> aiohttp.web.Response: pid = req.query.get("pid") ip = req.query.get("ip") + node_id_hex = req.query.get("node_id") if not pid: raise ValueError("pid is required") - if not ip: - raise ValueError("ip is required") + if not node_id_hex and not ip: + raise ValueError("ip or node_id is required") - addrs = await self._get_stub_address_by_ip(ip) - if not addrs: - raise aiohttp.web.HTTPInternalServerError( - text=f"Failed to get agent address for node at IP {ip}, pid {pid}" + if node_id_hex: + addrs = await self._get_stub_address_by_node_id( + NodeID.from_hex(node_id_hex) ) + if not addrs: + raise aiohttp.web.HTTPInternalServerError( + text=f"Failed to get agent address for node at node_id {node_id_hex}, pid {pid}" + ) + else: + addrs = await self._get_stub_address_by_ip(ip) + if not addrs: + raise aiohttp.web.HTTPInternalServerError( + text=f"Failed to get agent address for node at IP {ip}, pid {pid}" + ) + node_id, ip, http_port, grpc_port = addrs reporter_stub = self._make_stub(build_address(ip, grpc_port)) @@ -592,7 +625,7 @@ async def memory_profile(self, req: aiohttp.web.Request) -> aiohttp.web.Response Params (1): pid: The PID of the worker. - ip: The IP address of the node. + ip or node_id: The IP address or hex ID of the node. Params (2): task_id: The ID of the task. @@ -601,7 +634,7 @@ async def memory_profile(self, req: aiohttp.web.Request) -> aiohttp.web.Response Raises: aiohttp.web.HTTPInternalServerError: If no stub - found from the given IP value + found from the given IP address or hex ID value aiohttp.web.HTTPInternalServerError: If the "task_id" parameter exists but either "attempt_number" or "node id" is missing in the request query. 
@@ -652,12 +685,27 @@ async def memory_profile(self, req: aiohttp.web.Request) -> aiohttp.web.Response else: pid = int(req.query["pid"]) ip = req.query.get("ip") - addrs = await self._get_stub_address_by_ip(ip) - if not addrs: - return aiohttp.web.HTTPInternalServerError( - text=f"Failed to execute: no agent address found for node IP {ip}" + node_id_hex = req.query.get("node_id") + + if not node_id_hex and not ip: + raise ValueError("ip or node_id is required") + + if node_id_hex: + addrs = await self._get_stub_address_by_node_id( + NodeID.from_hex(node_id_hex) ) - _, ip, _, grpc_port = addrs + if not addrs: + return aiohttp.web.HTTPInternalServerError( + text=f"Failed to execute: no agent address found for node {node_id_hex}" + ) + _, ip, _, grpc_port = addrs + else: + addrs = await self._get_stub_address_by_ip(ip) + if not addrs: + return aiohttp.web.HTTPInternalServerError( + text=f"Failed to execute: no agent address found for node IP {ip}" + ) + _, ip, _, grpc_port = addrs assert pid is not None ip_port = build_address(ip, grpc_port) @@ -785,6 +833,28 @@ async def kill_actor_gcs(self, req: aiohttp.web.Request) -> aiohttp.web.Response status_code=status_code, message=message ) + @routes.get("/api/prometheus/sd") + async def prometheus_service_discovery(self, req) -> aiohttp.web.Response: + """ + Expose Prometheus metrics targets through HTTP Service Discovery. 
+ """ + content = self.service_discovery.get_latest_service_discovery_content() + if not isinstance(content, list): + error_message = "service discovery error: content is not a list" + logger.warning(error_message) + return aiohttp.web.json_response( + {"error": error_message}, + status=dashboard_utils.HTTPStatusCode.INTERNAL_ERROR, + headers={"Cache-Control": "no-store"}, + ) + return aiohttp.web.Response( + text=json.dumps(content), + content_type="application/json", + charset="utf-8", + status=dashboard_utils.HTTPStatusCode.OK, + headers={"Cache-Control": "no-store"}, + ) + async def _get_stub_address_by_node_id( self, node_id: NodeID ) -> Optional[Tuple[NodeID, str, int, int]]: diff --git a/python/ray/dashboard/modules/reporter/tests/test_actors.py b/python/ray/dashboard/modules/reporter/tests/test_actors.py index 47eea650b2c0..ea1693430da0 100644 --- a/python/ray/dashboard/modules/reporter/tests/test_actors.py +++ b/python/ray/dashboard/modules/reporter/tests/test_actors.py @@ -7,32 +7,21 @@ import requests import ray +from ray._common.test_utils import wait_for_condition +from ray._private.state_api_test_utils import _is_actor_task_running from ray._private.test_utils import format_web_url, wait_until_server_available from ray.dashboard.tests.conftest import * # noqa +import psutil + logger = logging.getLogger(__name__) KILL_ACTOR_ENDPOINT = "/api/actors/kill" def _actor_killed(pid: str) -> bool: - """Check For the existence of a unix pid.""" - try: - os.kill(pid, 0) - except OSError: - return True - else: - return False - - -def _actor_killed_loop(worker_pid: str, timeout_secs=3) -> bool: - dead = False - for _ in range(timeout_secs): - time.sleep(1) - if _actor_killed(worker_pid): - dead = True - break - return dead + """Check if a process with given pid is running.""" + return not psutil.pid_exists(int(pid)) def _kill_actor_using_dashboard_gcs( @@ -44,6 +33,7 @@ def _kill_actor_using_dashboard_gcs( "actor_id": actor_id, "force_kill": force_kill, }, + 
timeout=5, ) assert resp.status_code == expected_status_code resp_json = resp.json() @@ -78,7 +68,7 @@ def loop(self): OK = 200 NOT_FOUND = 404 - # Kill an non-existent actor + # Kill a non-existent actor resp = _kill_actor_using_dashboard_gcs( webui_url, "non-existent-actor-id", NOT_FOUND ) @@ -87,7 +77,7 @@ def loop(self): # Kill the actor resp = _kill_actor_using_dashboard_gcs(webui_url, actor_id, OK, force_kill=False) assert "It will exit once running tasks complete" in resp["msg"] - assert _actor_killed_loop(worker_pid) + wait_for_condition(lambda: _actor_killed(worker_pid)) # Create an actor and have it loop a = Actor.remote() @@ -95,15 +85,21 @@ def loop(self): actor_id = a._ray_actor_id.hex() a.loop.remote() + # wait for loop() to start + wait_for_condition(lambda: _is_actor_task_running(worker_pid, "Actor.loop")) + # Try to kill the actor, it should not die since a task is running resp = _kill_actor_using_dashboard_gcs(webui_url, actor_id, OK, force_kill=False) assert "It will exit once running tasks complete" in resp["msg"] - assert not _actor_killed_loop(worker_pid, timeout_secs=1) + with pytest.raises( + RuntimeError, match="The condition wasn't met before the timeout expired." 
+ ): + wait_for_condition(lambda: _actor_killed(worker_pid), 1) # Force kill the actor resp = _kill_actor_using_dashboard_gcs(webui_url, actor_id, OK, force_kill=True) assert "Force killed actor with id" in resp["msg"] - assert _actor_killed_loop(worker_pid) + wait_for_condition(lambda: _actor_killed(worker_pid)) if __name__ == "__main__": diff --git a/python/ray/dashboard/modules/reporter/tests/test_gpu_providers.py b/python/ray/dashboard/modules/reporter/tests/test_gpu_providers.py index a49e97d421ab..516be688a746 100644 --- a/python/ray/dashboard/modules/reporter/tests/test_gpu_providers.py +++ b/python/ray/dashboard/modules/reporter/tests/test_gpu_providers.py @@ -4,14 +4,14 @@ from unittest.mock import Mock, patch from ray.dashboard.modules.reporter.gpu_providers import ( + MB, + AmdGpuProvider, + GpuMetricProvider, GpuProvider, GpuProviderType, + GpuUtilizationInfo, NvidiaGpuProvider, - AmdGpuProvider, - GpuMetricProvider, ProcessGPUInfo, - GpuUtilizationInfo, - MB, ) diff --git a/python/ray/dashboard/modules/reporter/tests/test_healthz.py b/python/ray/dashboard/modules/reporter/tests/test_healthz.py index 9087a5581674..aed2c9515ab3 100644 --- a/python/ray/dashboard/modules/reporter/tests/test_healthz.py +++ b/python/ray/dashboard/modules/reporter/tests/test_healthz.py @@ -1,10 +1,10 @@ import sys import pytest -from ray._common.test_utils import wait_for_condition import requests import ray._private.ray_constants as ray_constants +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import find_free_port from ray.tests.conftest import * # noqa: F401 F403 @@ -25,7 +25,7 @@ def test_healthz_head(monkeypatch, ray_start_cluster): def test_healthz_agent_1(monkeypatch, ray_start_cluster): agent_port = find_free_port() h = ray_start_cluster.add_node(dashboard_agent_listen_port=agent_port) - uri = f"http://localhost:{agent_port}/api/local_raylet_healthz" + uri = f"http://{h.node_ip_address}:{agent_port}/api/local_raylet_healthz" 
wait_for_condition(lambda: requests.get(uri).status_code == 200) @@ -43,7 +43,7 @@ def test_healthz_agent_2(monkeypatch, ray_start_cluster): agent_port = find_free_port() h = ray_start_cluster.add_node(dashboard_agent_listen_port=agent_port) - uri = f"http://localhost:{agent_port}/api/local_raylet_healthz" + uri = f"http://{h.node_ip_address}:{agent_port}/api/local_raylet_healthz" wait_for_condition(lambda: requests.get(uri).status_code == 200) diff --git a/python/ray/dashboard/modules/reporter/tests/test_reporter.py b/python/ray/dashboard/modules/reporter/tests/test_reporter.py index 4ea6b4070f2f..8bcb52e76156 100644 --- a/python/ray/dashboard/modules/reporter/tests/test_reporter.py +++ b/python/ray/dashboard/modules/reporter/tests/test_reporter.py @@ -9,22 +9,22 @@ import numpy as np import pytest -from ray._common.test_utils import wait_for_condition import requests from google.protobuf import text_format import ray import ray._common.usage.usage_lib as ray_usage_lib +from ray._common.network_utils import build_address +from ray._common.test_utils import wait_for_condition from ray._private import ray_constants from ray._private.metrics_agent import fix_grpc_metric -from ray._common.network_utils import build_address from ray._private.test_utils import ( fetch_prometheus, format_web_url, wait_until_server_available, ) from ray.core.generated.metrics_pb2 import Metric -from ray.dashboard.modules.reporter.gpu_providers import NvidiaGpuProvider, MB +from ray.dashboard.modules.reporter.gpu_providers import MB, NvidiaGpuProvider from ray.dashboard.modules.reporter.reporter_agent import ( ReporterAgent, TpuUtilizationInfo, @@ -200,11 +200,11 @@ def enable_open_telemetry(request): Fixture to enable OpenTelemetry for the test. 
""" if request.param: - os.environ["RAY_experimental_enable_open_telemetry_on_agent"] = "1" + os.environ["RAY_enable_open_telemetry"] = "1" else: - os.environ["RAY_experimental_enable_open_telemetry_on_agent"] = "0" + os.environ["RAY_enable_open_telemetry"] = "0" yield - os.environ.pop("RAY_experimental_enable_open_telemetry_on_agent", None) + os.environ.pop("RAY_enable_open_telemetry", None) @pytest.mark.skipif(prometheus_client is None, reason="prometheus_client not installed") @@ -217,7 +217,7 @@ def test_prometheus_physical_stats_record( ): addresses = ray.init(include_dashboard=True, num_cpus=1) metrics_export_port = addresses["metrics_export_port"] - addr = addresses["raylet_ip_address"] + addr = addresses["node_ip_address"] prom_addresses = [build_address(addr, metrics_export_port)] def test_case_stats_exist(): @@ -270,11 +270,8 @@ def test_case_ip_correct(): break return str(raylet_proc.process.pid) == str(raylet_pid) - wait_for_condition( - lambda: test_case_stats_exist() and test_case_ip_correct(), - timeout=30, - retry_interval_ms=1000, - ) + wait_for_condition(test_case_stats_exist, timeout=30, retry_interval_ms=1000) + wait_for_condition(test_case_ip_correct, timeout=30, retry_interval_ms=1000) @pytest.mark.skipif( @@ -284,7 +281,7 @@ def test_case_ip_correct(): def test_prometheus_export_worker_and_memory_stats(enable_test_module, shutdown_only): addresses = ray.init(include_dashboard=True, num_cpus=1) metrics_export_port = addresses["metrics_export_port"] - addr = addresses["raylet_ip_address"] + addr = addresses["node_ip_address"] prom_addresses = [build_address(addr, metrics_export_port)] @ray.remote @@ -313,6 +310,7 @@ def test_worker_stats(): def test_report_stats(): dashboard_agent = MagicMock() + dashboard_agent.gcs_address = build_address("127.0.0.1", 6379) agent = ReporterAgent(dashboard_agent) # Assume it is a head node. 
agent._is_head_node = True @@ -335,9 +333,11 @@ def test_report_stats(): print(record.gauge.name) print(record) assert len(records) == 41 - # Verify IsHeadNode tag + # Verify RayNodeType and IsHeadNode tags for record in records: if record.gauge.name.startswith("node_"): + assert "RayNodeType" in record.tags + assert record.tags["RayNodeType"] == "head" assert "IsHeadNode" in record.tags assert record.tags["IsHeadNode"] == "true" # Test stats without raylets @@ -372,6 +372,7 @@ def test_report_stats(): def test_report_stats_gpu(): dashboard_agent = MagicMock() + dashboard_agent.gcs_address = build_address("127.0.0.1", 6379) agent = ReporterAgent(dashboard_agent) # Assume it is a head node. agent._is_head_node = True @@ -458,13 +459,19 @@ def test_report_stats_gpu(): index = 0 for record in records: if record.tags["GpuIndex"] == "3": - assert record.tags == {"ip": ip, "GpuIndex": "3", "IsHeadNode": "true"} + assert record.tags == { + "ip": ip, + "GpuIndex": "3", + "IsHeadNode": "true", + "RayNodeType": "head", + } else: assert record.tags == { "ip": ip, # The tag value must be string for prometheus. "GpuIndex": str(index), "GpuDeviceName": "NVIDIA A10G", + "RayNodeType": "head", "IsHeadNode": "true", } @@ -486,6 +493,7 @@ def test_report_stats_gpu(): def test_report_per_component_stats_gpu(): dashboard_agent = MagicMock() + dashboard_agent.gcs_address = build_address("127.0.0.1", 6379) agent = ReporterAgent(dashboard_agent) # Assume it is a head node. 
agent._is_head_node = True @@ -501,58 +509,256 @@ def test_report_per_component_stats_gpu(): """ GPU_MEMORY = 22731 - STATS_TEMPLATE["gpus"] = [ - { - "index": 0, - "uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b396", - "name": "NVIDIA A10G", - "utilization_gpu": 0, # NOTE: this is a dummy value - "memory_used": 0, - "memory_total": GPU_MEMORY, - "processes_pids": { - 2297322: { - "pid": 2297322, - "gpu_memory_usage": 26, - "gpu_utilization": None, - } + # Prepare the stats data that would be collected by _collect_stats + mock_collected_stats = { + "now": 1614826393.975763, + "hostname": "fake_hostname.local", + "ip": "127.0.0.1", + "cpu": 57.4, + "cpus": (8, 4), + "mem": (17179869184, 5723353088, 66.7, 9234341888), + "shm": 456, + "workers": [ + { + "memory_info": Bunch( + rss=55934976, vms=7026937856, pfaults=15354, pageins=0 + ), + "memory_full_info": Bunch(uss=51428381), + "cpu_percent": 0.0, + "num_fds": 10, + "cmdline": ["ray::IDLE", "", "", "", "", "", "", "", "", "", "", ""], + "create_time": 1614826391.338613, + "pid": 7174, + "cpu_times": Bunch( + user=0.607899328, + system=0.274044032, + children_user=0.0, + children_system=0.0, + ), }, - }, - { - "index": 1, - "uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b397", - "name": "NVIDIA A10G", - "utilization_gpu": 1, - "memory_used": 1, - "memory_total": GPU_MEMORY, - "processes_pids": { - 2297332: { - "pid": 2297332, - "gpu_memory_usage": 26, - "gpu_utilization": None, - } + { + "memory_info": Bunch( + rss=55934976, vms=7026937856, pfaults=15354, pageins=0 + ), + "memory_full_info": Bunch(uss=51428381), + "cpu_percent": 10.0, + "num_fds": 5, + "cmdline": [ + "ray::TorchGPUWorker.dummy_method", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], + "create_time": 1614826391.338613, + "pid": 7175, + "cpu_times": Bunch( + user=0.607899328, + system=0.274044032, + children_user=0.0, + children_system=0.0, + ), }, + ], + "gcs": { + "memory_info": Bunch(rss=18354171, vms=6921486336, pfaults=6203, 
pageins=2), + "memory_full_info": Bunch(uss=51428384), + "cpu_percent": 5.0, + "num_fds": 14, + "cmdline": ["fake gcs cmdline"], + "create_time": 1614826395.274854, + "pid": 7154, + "cpu_times": Bunch( + user=0.01683138, + system=0.045913716, + children_user=0.0, + children_system=0.0, + ), }, - ] - gpu_worker = STATS_TEMPLATE["workers"][0].copy() - gpu_worker.update( - {"pid": 7175, "cmdline": ["ray::TorchGPUWorker.dummy_method", ""]} - ) + "raylet": { + "memory_info": Bunch(rss=18354176, vms=6921486336, pfaults=6206, pageins=3), + "cpu_percent": 0.0, + "num_fds": 10, + "cmdline": ["fake raylet cmdline"], + "create_time": 1614826390.274854, + "pid": 7153, + "cpu_times": Bunch( + user=0.03683138, + system=0.035913716, + children_user=0.0, + children_system=0.0, + ), + }, + "agent": { + "memory_info": Bunch(rss=18354176, vms=6921486336, pfaults=6206, pageins=3), + "cpu_percent": 0.0, + "num_fds": 10, + "cmdline": ["fake raylet cmdline"], + "create_time": 1614826390.274854, + "pid": 7154, + "cpu_times": Bunch( + user=0.03683138, + system=0.035913716, + children_user=0.0, + children_system=0.0, + ), + }, + "bootTime": 1612934656.0, + "loadAvg": ((4.4521484375, 3.61083984375, 3.5400390625), (0.56, 0.45, 0.44)), + "disk_io": (100, 100, 100, 100), + "disk_io_speed": (100, 100, 100, 100), + "disk": { + "/": Bunch( + total=250790436864, used=11316781056, free=22748921856, percent=33.2 + ), + "/tmp": Bunch( + total=250790436864, used=209532035072, free=22748921856, percent=90.2 + ), + }, + "gpus": [ + { + "index": 0, + "uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b396", + "name": "NVIDIA A10G", + "utilization_gpu": 0, # NOTE: this is a dummy value + "memory_used": 0, + "memory_total": GPU_MEMORY, + "processes_pids": { + 2297322: { + "pid": 2297322, + "gpu_memory_usage": 26, + "gpu_utilization": None, + } + }, + }, + { + "index": 1, + "uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b397", + "name": "NVIDIA A10G", + "utilization_gpu": 1, + "memory_used": 1, + "memory_total": 
GPU_MEMORY, + "processes_pids": { + 2297332: { + "pid": 2297332, + "gpu_memory_usage": 26, + "gpu_utilization": None, + } + }, + }, + ], + "gpu_processes": {}, + "tpus": [], + "network": (13621160960, 11914936320), + "network_speed": (8.435062128545095, 7.378462703142336), + "cmdline": ["fake raylet cmdline"], + } + gpu_metrics_aggregatd = { "component_gpu_utilization": 0, "component_gpu_memory_usage": 0, } - STATS_TEMPLATE["workers"].append(gpu_worker) + def create_mock_agent_proc(): + """Helper function to create a mock agent process.""" + mock_agent_proc = MagicMock() + mock_agent_proc.pid = agent_proc_pid + mock_agent_proc.create_time.return_value = agent_proc_create_time + return mock_agent_proc + + agent_proc_pid = 22334 + agent_proc_create_time = 1614826392.338613 + agent_proc_mock = create_mock_agent_proc() + + def create_mock_worker_processes(): + """Helper function to create mock worker processes for testing.""" + mock_workers = {} + + # Create mock worker processes that match what _get_workers expects + for i, worker_data in enumerate(mock_collected_stats["workers"]): + mock_proc = MagicMock() + mock_proc.status.return_value = psutil.STATUS_RUNNING + mock_proc.as_dict.return_value = { + "pid": worker_data["pid"], + "cmdline": worker_data["cmdline"], + "cpu_percent": worker_data["cpu_percent"], + "memory_info": worker_data["memory_info"], + "memory_full_info": worker_data["memory_full_info"], + "num_fds": worker_data["num_fds"], + "create_time": worker_data["create_time"], + "cpu_times": worker_data["cpu_times"], + } + mock_workers[f"worker_{i}"] = mock_proc + + # Add the agent process to the mock workers + mock_workers[agent._generate_worker_key(agent_proc_mock)] = agent_proc_mock + return mock_workers + + # Mock all the individual methods that _collect_stats calls to return predictable data + mock_patches = { + "_get_network_stats": lambda: (13621160960, 11914936320), + "_get_disk_io_stats": lambda: (100, 100, 100, 100), + "_get_gpu_usage": lambda: 
mock_collected_stats["gpus"], + "_get_cpu_percent": lambda _: 57.4, + "_get_mem_usage": lambda: (17179869184, 5723353088, 66.7, 9234341888), + "_get_shm_usage": lambda: 456, + "_get_raylet": lambda: mock_collected_stats["raylet"], + "_get_agent": lambda: mock_collected_stats["agent"], + "_get_boot_time": lambda: 1612934656.0, + "_get_load_avg": lambda: ( + (4.4521484375, 3.61083984375, 3.5400390625), + (0.56, 0.45, 0.44), + ), + "_get_disk_usage": lambda: mock_collected_stats["disk"], + "_get_tpu_usage": lambda: [], + "_get_gcs": lambda: mock_collected_stats["gcs"], + "_get_worker_processes": lambda: create_mock_worker_processes(), + "_get_agent_proc": lambda: agent_proc_mock, + } + + with patch.multiple(agent, **mock_patches): + # Call _collect_stats to actually run through the collection process + collected_stats_result = agent._collect_stats() + + # Verify that _collect_stats was called and returned the expected structure + assert "gpus" in collected_stats_result + assert "workers" in collected_stats_result + assert "gcs" in collected_stats_result # Should be present for head node + assert len(collected_stats_result["gpus"]) == 2 + assert len(collected_stats_result["workers"]) == 2 + assert collected_stats_result["cpu"] == 57.4 + assert collected_stats_result["mem"] == ( + 17179869184, + 5723353088, + 66.7, + 9234341888, + ) + assert collected_stats_result["shm"] == 456 + assert collected_stats_result["network"] == (13621160960, 11914936320) + assert collected_stats_result["disk_io"] == (100, 100, 100, 100) + + # Now add the GPU processes data to the collected stats result NVSMI_OUTPUT_TWO_TASK_ON_TWO_GPUS = ( "# gpu pid type sm mem enc dec jpg ofa command \n" "# Idx # C/G % % % % % % name \n" " 0 7175 C 84 26 - - - - ray::TorchGPUWo\n" " 1 7175 C 86 26 - - - - ray::TorchGPUWo\n" ) - STATS_TEMPLATE["gpu_processes"] = NvidiaGpuProvider._parse_nvsmi_pmon_output( - NVSMI_OUTPUT_TWO_TASK_ON_TWO_GPUS, STATS_TEMPLATE["gpus"] + collected_stats_result[ + "gpu_processes" 
+ ] = NvidiaGpuProvider._parse_nvsmi_pmon_output( + NVSMI_OUTPUT_TWO_TASK_ON_TWO_GPUS, collected_stats_result["gpus"] ) - records = agent._to_records(STATS_TEMPLATE, {}) + + # Use the collected stats result for _to_records instead of STATS_TEMPLATE + records = agent._to_records(collected_stats_result, {}) gpu_component_records = defaultdict(list) @@ -579,21 +785,50 @@ def test_report_per_component_stats_gpu(): " 0 7176 C 77 22 - - - - ray::TorchGPUWo\n" " 1 - - - - - - - - - \n" ) - STATS_TEMPLATE["gpu_processes"] = NvidiaGpuProvider._parse_nvsmi_pmon_output( - NVSMI_OUTPUT_TWO_TASK_ON_ONE_GPUS, STATS_TEMPLATE["gpus"] + + # Update the collected stats result for the second test scenario + collected_stats_result[ + "gpu_processes" + ] = NvidiaGpuProvider._parse_nvsmi_pmon_output( + NVSMI_OUTPUT_TWO_TASK_ON_ONE_GPUS, collected_stats_result["gpus"] ) # Move process from GPU 1 to GPU 0 - gpu1_process = STATS_TEMPLATE["gpus"][1]["processes_pids"][2297332] - STATS_TEMPLATE["gpus"][0]["processes_pids"][2297332] = gpu1_process - STATS_TEMPLATE["gpus"][1]["processes_pids"] = {} + gpu1_process = collected_stats_result["gpus"][1]["processes_pids"][2297332] + collected_stats_result["gpus"][0]["processes_pids"][2297332] = gpu1_process + collected_stats_result["gpus"][1]["processes_pids"] = {} - gpu_worker = gpu_worker.copy() - gpu_worker.update( - {"pid": 7176, "cmdline": ["ray::TorchGPUWorker.dummy_method_2", ""]} - ) - STATS_TEMPLATE["workers"].append(gpu_worker) + # Add the second GPU worker to the collected stats result + gpu_worker_2 = { + "memory_info": Bunch(rss=55934976, vms=7026937856, pfaults=15354, pageins=0), + "memory_full_info": Bunch(uss=51428381), + "cpu_percent": 15.0, + "num_fds": 6, + "cmdline": [ + "ray::TorchGPUWorker.dummy_method_2", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], + "create_time": 1614826391.338613, + "pid": 7176, + "cpu_times": Bunch( + user=0.607899328, + system=0.274044032, + children_user=0.0, + 
children_system=0.0, + ), + } + collected_stats_result["workers"].append(gpu_worker_2) - records = agent._to_records(STATS_TEMPLATE, {}) + records = agent._to_records(collected_stats_result, {}) gpu_component_records = defaultdict(list) for record in records: @@ -622,6 +857,7 @@ def test_report_per_component_stats_gpu(): def test_get_tpu_usage(): dashboard_agent = MagicMock() + dashboard_agent.gcs_address = build_address("127.0.0.1", 6379) agent = ReporterAgent(dashboard_agent) fake_metrics_content = """ @@ -678,6 +914,7 @@ def test_get_tpu_usage(): def test_report_stats_tpu(): dashboard_agent = MagicMock() + dashboard_agent.gcs_address = build_address("127.0.0.1", 6379) agent = ReporterAgent(dashboard_agent) STATS_TEMPLATE["tpus"] = [ @@ -750,6 +987,7 @@ def test_report_stats_tpu(): def test_report_per_component_stats(): dashboard_agent = MagicMock() + dashboard_agent.gcs_address = build_address("127.0.0.1", 6379) agent = ReporterAgent(dashboard_agent) # Assume it is a head node. agent._is_head_node = True @@ -1009,6 +1247,9 @@ def _get_agent_proc(self): def _generate_worker_key(self, proc): return (proc.pid, proc.create_time()) + def _get_worker_processes(self): + return ReporterAgent._get_worker_processes(self) + obj = ReporterAgentDummy() try: diff --git a/python/ray/dashboard/modules/serve/serve_head.py b/python/ray/dashboard/modules/serve/serve_head.py index 151a19908002..02e940bc7601 100644 --- a/python/ray/dashboard/modules/serve/serve_head.py +++ b/python/ray/dashboard/modules/serve/serve_head.py @@ -3,6 +3,7 @@ import json import logging from functools import wraps +from typing import Optional import aiohttp from aiohttp.web import Request, Response @@ -81,7 +82,27 @@ async def get_version(self, req: Request) -> Response: @dashboard_optional_utils.init_ray_and_catch_exceptions() @validate_endpoint() async def get_serve_instance_details(self, req: Request) -> Response: - from ray.serve.schema import ServeInstanceDetails + from ray.serve.schema import 
APIType, ServeInstanceDetails + + api_type: Optional[APIType] = None + api_type_str = req.query.get("api_type") + + if api_type_str: + api_type_lower = api_type_str.lower() + valid_values = APIType.get_valid_user_values() + + if api_type_lower not in valid_values: + # Explicitly check against valid user values (excludes 'unknown') + return Response( + status=400, + text=( + f"Invalid 'api_type' value: '{api_type_str}'. " + f"Must be one of: {', '.join(valid_values)}" + ), + content_type="text/plain", + ) + + api_type = APIType(api_type_lower) controller = await self.get_serve_controller() @@ -90,7 +111,9 @@ async def get_serve_instance_details(self, req: Request) -> Response: details = ServeInstanceDetails.get_empty_schema_dict() else: try: - details = await controller.get_serve_instance_details.remote() + details = await controller.get_serve_instance_details.remote( + source=api_type + ) except ray.exceptions.RayTaskError as e: # Task failure sometimes are due to GCS # failure. When GCS failed, we expect a longer time diff --git a/python/ray/dashboard/modules/serve/tests/test_serve_dashboard.py b/python/ray/dashboard/modules/serve/tests/test_serve_dashboard.py index f0d31bdf3618..2d019f25dee2 100644 --- a/python/ray/dashboard/modules/serve/tests/test_serve_dashboard.py +++ b/python/ray/dashboard/modules/serve/tests/test_serve_dashboard.py @@ -6,9 +6,9 @@ from typing import Dict import pytest -from ray._common.test_utils import wait_for_condition import requests +from ray._common.test_utils import wait_for_condition from ray.serve._private.common import ( DeploymentStatus, DeploymentStatusTrigger, @@ -572,5 +572,171 @@ def applications_running(): print("Finished checking application details.") +@pytest.mark.skipif( + sys.platform == "darwin" and not TEST_ON_DARWIN, reason="Flaky on OSX." +) +def test_get_serve_instance_details_api_type_filtering(ray_start_stop): + """ + Test the api_type query parameter for filtering applications by API type. 
+ Tests both declarative and imperative applications. + """ + # First, deploy declarative applications + world_import_path = "ray.serve.tests.test_config_files.world.DagNode" + declarative_config = { + "applications": [ + { + "name": "declarative_app1", + "route_prefix": "/declarative1", + "import_path": world_import_path, + }, + { + "name": "declarative_app2", + "route_prefix": "/declarative2", + "import_path": world_import_path, + }, + ], + } + + deploy_config_multi_app(declarative_config, SERVE_HEAD_URL) + + # Wait for declarative apps to be running + def declarative_apps_running(): + response = requests.get(SERVE_HEAD_URL, timeout=15) + assert response.status_code == 200 + serve_details = ServeInstanceDetails(**response.json()) + return len(serve_details.applications) == 2 and all( + app.status == ApplicationStatus.RUNNING + for app in serve_details.applications.values() + ) + + wait_for_condition(declarative_apps_running, timeout=15) + print("Declarative applications are running.") + + # Deploy imperative applications using subprocess + deploy = subprocess.run( + [ + sys.executable, + str(Path(__file__).parent / "deploy_imperative_serve_apps.py"), + ], + capture_output=True, + universal_newlines=True, + ) + assert deploy.returncode == 0 + + # Wait for imperative apps to be running + def all_apps_running(): + response = requests.get(SERVE_HEAD_URL, timeout=15) + assert response.status_code == 200 + serve_details = ServeInstanceDetails(**response.json()) + return len( + serve_details.applications + ) == 4 and all( # 2 declarative + 2 imperative + app.status == ApplicationStatus.RUNNING + for app in serve_details.applications.values() + ) + + wait_for_condition(all_apps_running, timeout=15) + print("All applications (declarative + imperative) are running.") + + # Test 1: No api_type parameter - should return all applications + response = requests.get(SERVE_HEAD_URL, timeout=15) + assert response.status_code == 200 + serve_details = 
ServeInstanceDetails(**response.json()) + assert len(serve_details.applications) == 4 + app_names = set(serve_details.applications.keys()) + assert app_names == {"declarative_app1", "declarative_app2", "app1", "app2"} + + # Test 2: Filter by declarative applications + response = requests.get(SERVE_HEAD_URL + "?api_type=declarative", timeout=15) + assert response.status_code == 200 + serve_details = ServeInstanceDetails(**response.json()) + assert len(serve_details.applications) == 2 + app_names = set(serve_details.applications.keys()) + assert app_names == {"declarative_app1", "declarative_app2"} + for app in serve_details.applications.values(): + assert app.source == "declarative" + + # Test 3: Filter by imperative applications + response = requests.get(SERVE_HEAD_URL + "?api_type=imperative", timeout=15) + assert response.status_code == 200 + serve_details = ServeInstanceDetails(**response.json()) + assert len(serve_details.applications) == 2 + app_names = set(serve_details.applications.keys()) + assert app_names == {"app1", "app2"} + for app in serve_details.applications.values(): + assert app.source == "imperative" + + # Test 4: Filter by unknown - should return 400 error (unknown is not a valid user input) + response = requests.get(SERVE_HEAD_URL + "?api_type=unknown", timeout=15) + assert response.status_code == 400 + assert "Invalid 'api_type' value" in response.text + assert "Must be one of: imperative, declarative" in response.text + + +@pytest.mark.skipif( + sys.platform == "darwin" and not TEST_ON_DARWIN, reason="Flaky on OSX." +) +def test_get_serve_instance_details_invalid_api_type(ray_start_stop): + """ + Test that invalid api_type values return appropriate error responses. 
+ """ + # Test with invalid api_type value + response = requests.get(SERVE_HEAD_URL + "?api_type=invalid_type", timeout=15) + assert response.status_code == 400 + assert "Invalid 'api_type' value" in response.text + assert "Must be one of: imperative, declarative" in response.text + + # Test with another invalid value + response = requests.get(SERVE_HEAD_URL + "?api_type=python", timeout=15) + assert response.status_code == 400 + assert "Invalid 'api_type' value" in response.text + + +@pytest.mark.skipif( + sys.platform == "darwin" and not TEST_ON_DARWIN, reason="Flaky on OSX." +) +def test_get_serve_instance_details_api_type_case_insensitive(ray_start_stop): + """ + Test that api_type parameter is case insensitive. + """ + # Deploy a declarative application + world_import_path = "ray.serve.tests.test_config_files.world.DagNode" + config = { + "applications": [ + { + "name": "test_app", + "route_prefix": "/test", + "import_path": world_import_path, + } + ], + } + + deploy_config_multi_app(config, SERVE_HEAD_URL) + + def app_running(): + response = requests.get(SERVE_HEAD_URL, timeout=15) + assert response.status_code == 200 + serve_details = ServeInstanceDetails(**response.json()) + return ( + len(serve_details.applications) == 1 + and serve_details.applications["test_app"].status + == ApplicationStatus.RUNNING + ) + + wait_for_condition(app_running, timeout=15) + + # Test case insensitive filtering + test_cases = ["DECLARATIVE", "Declarative", "declarative", "DeClArAtIvE"] + + for api_type_value in test_cases: + response = requests.get( + f"{SERVE_HEAD_URL}?api_type={api_type_value}", timeout=15 + ) + assert response.status_code == 200 + serve_details = ServeInstanceDetails(**response.json()) + assert len(serve_details.applications) == 1 + assert "test_app" in serve_details.applications + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/dashboard/modules/serve/tests/test_serve_dashboard_2.py 
b/python/ray/dashboard/modules/serve/tests/test_serve_dashboard_2.py index 17e25c6c5b52..a21cad86ad32 100644 --- a/python/ray/dashboard/modules/serve/tests/test_serve_dashboard_2.py +++ b/python/ray/dashboard/modules/serve/tests/test_serve_dashboard_2.py @@ -7,12 +7,12 @@ import grpc import pytest -from ray._common.test_utils import wait_for_condition import requests import ray import ray._private.ray_constants as ray_constants from ray import serve +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import generate_system_config_map from ray.serve.generated import serve_pb2, serve_pb2_grpc from ray.serve.schema import HTTPOptionsSchema, ServeInstanceDetails diff --git a/python/ray/dashboard/modules/state/state_head.py b/python/ray/dashboard/modules/state/state_head.py index 7ef52b3a7cdf..4fa2755d5839 100644 --- a/python/ray/dashboard/modules/state/state_head.py +++ b/python/ray/dashboard/modules/state/state_head.py @@ -10,8 +10,8 @@ import ray from ray import ActorID -from ray._private.ray_constants import env_integer from ray._common.usage.usage_lib import TagKey, record_extra_usage_tag +from ray._private.ray_constants import env_integer from ray.core.generated.gcs_pb2 import ActorTableData from ray.dashboard.consts import ( RAY_STATE_SERVER_MAX_HTTP_REQUEST, diff --git a/python/ray/dashboard/modules/usage_stats/usage_stats_head.py b/python/ray/dashboard/modules/usage_stats/usage_stats_head.py index 0cecf0fba77a..91c500fb1f6f 100644 --- a/python/ray/dashboard/modules/usage_stats/usage_stats_head.py +++ b/python/ray/dashboard/modules/usage_stats/usage_stats_head.py @@ -9,9 +9,9 @@ import ray import ray._common.usage.usage_lib as ray_usage_lib import ray.dashboard.utils as dashboard_utils +from ray._common.network_utils import build_address from ray._common.utils import get_or_create_event_loop from ray.dashboard.utils import async_loop_forever -from ray._common.network_utils import build_address logger = logging.getLogger(__name__) 
diff --git a/python/ray/dashboard/state_aggregator.py b/python/ray/dashboard/state_aggregator.py index 70b939ddde4f..a33dd4e7b3be 100644 --- a/python/ray/dashboard/state_aggregator.py +++ b/python/ray/dashboard/state_aggregator.py @@ -243,10 +243,10 @@ def transform(reply) -> ListApiResponse: result = [] for message in reply.worker_table_data: data = protobuf_message_to_dict( - message=message, fields_to_decode=["worker_id", "raylet_id"] + message=message, fields_to_decode=["worker_id", "node_id"] ) data["worker_id"] = data["worker_address"]["worker_id"] - data["node_id"] = data["worker_address"]["raylet_id"] + data["node_id"] = data["worker_address"]["node_id"] data["ip"] = data["worker_address"]["ip_address"] data["start_time_ms"] = int(data["start_time_ms"]) data["end_time_ms"] = int(data["end_time_ms"]) diff --git a/python/ray/dashboard/subprocesses/tests/test_e2e.py b/python/ray/dashboard/subprocesses/tests/test_e2e.py index 05d3cd0a39cb..1f1567f70320 100644 --- a/python/ray/dashboard/subprocesses/tests/test_e2e.py +++ b/python/ray/dashboard/subprocesses/tests/test_e2e.py @@ -6,14 +6,13 @@ import pytest -from ray._common.test_utils import wait_for_condition import ray._private.ray_constants as ray_constants +import ray.dashboard.consts as dashboard_consts from ray._common.ray_constants import ( - LOGGING_ROTATE_BYTES, LOGGING_ROTATE_BACKUP_COUNT, + LOGGING_ROTATE_BYTES, ) -import ray.dashboard.consts as dashboard_consts -from ray._common.test_utils import async_wait_for_condition +from ray._common.test_utils import async_wait_for_condition, wait_for_condition from ray.dashboard.optional_deps import aiohttp from ray.dashboard.subprocesses.handle import SubprocessModuleHandle from ray.dashboard.subprocesses.module import SubprocessModule, SubprocessModuleConfig diff --git a/python/ray/dashboard/tests/test_dashboard.py b/python/ray/dashboard/tests/test_dashboard.py index 43afbe941b26..3219606b96bc 100644 --- a/python/ray/dashboard/tests/test_dashboard.py +++ 
b/python/ray/dashboard/tests/test_dashboard.py @@ -14,23 +14,23 @@ from urllib.parse import quote_plus import pytest -from ray._common.test_utils import wait_for_condition import requests from click.testing import CliRunner from requests.exceptions import ConnectionError, HTTPError import ray +import ray._private.ray_constants as ray_constants import ray.dashboard.consts as dashboard_consts import ray.dashboard.modules import ray.dashboard.utils as dashboard_utils import ray.scripts.scripts as scripts -from ray._common.utils import get_or_create_event_loop -import ray._private.ray_constants as ray_constants +from ray._common.network_utils import build_address, parse_address from ray._common.ray_constants import ( - LOGGING_ROTATE_BYTES, LOGGING_ROTATE_BACKUP_COUNT, + LOGGING_ROTATE_BYTES, ) -from ray._common.network_utils import build_address, parse_address +from ray._common.test_utils import wait_for_condition +from ray._common.utils import get_or_create_event_loop from ray._private.ray_constants import ( DEBUG_AUTOSCALING_ERROR, DEBUG_AUTOSCALING_STATUS_LEGACY, @@ -1315,7 +1315,7 @@ async def make_blocking_call(): await asyncio.gather(*tasks) # Fetch the metrics from the dashboard. 
- addr = ray_context["raylet_ip_address"] + addr = ray_context["node_ip_address"] prom_addresses = [build_address(addr, dashboard_consts.DASHBOARD_METRIC_PORT)] def check_lag_metrics(): diff --git a/python/ray/dashboard/utils.py b/python/ray/dashboard/utils.py index 47c6340133d4..4cb60681abeb 100644 --- a/python/ray/dashboard/utils.py +++ b/python/ray/dashboard/utils.py @@ -26,9 +26,9 @@ import ray._private.ray_constants as ray_constants import ray._private.services as services import ray.experimental.internal_kv as internal_kv +from ray._common.network_utils import parse_address from ray._common.utils import get_or_create_event_loop from ray._private.gcs_utils import GcsChannel -from ray._common.network_utils import parse_address from ray._private.utils import ( get_dashboard_dependency_error, split_address, @@ -360,7 +360,7 @@ def node_stats_to_dict( "parentTaskId", "sourceActorId", "callerId", - "rayletId", + "nodeId", "workerId", "placementGroupId", } @@ -709,9 +709,15 @@ def get_address_for_submission_client(address: Optional[str]) -> str: Returns: API server HTTP URL, e.g. "http://:8265". 
""" - if os.environ.get("RAY_ADDRESS"): - logger.debug(f"Using RAY_ADDRESS={os.environ['RAY_ADDRESS']}") - address = os.environ["RAY_ADDRESS"] + if api_server_address := os.environ.get( + ray_constants.RAY_API_SERVER_ADDRESS_ENVIRONMENT_VARIABLE + ): + address = api_server_address + logger.debug(f"Using RAY_API_SERVER_ADDRESS={address}") + # Fall back to RAY_ADDRESS if RAY_API_SERVER_ADDRESS not set + elif ray_address := os.environ.get(ray_constants.RAY_ADDRESS_ENVIRONMENT_VARIABLE): + address = ray_address + logger.debug(f"Using RAY_ADDRESS={address}") if address and "://" in address: module_string, _ = split_address(address) diff --git a/python/ray/data/BUILD b/python/ray/data/BUILD.bazel similarity index 96% rename from python/ray/data/BUILD rename to python/ray/data/BUILD.bazel index 6b0083700fd0..a3b44dadf21b 100644 --- a/python/ray/data/BUILD +++ b/python/ray/data/BUILD.bazel @@ -111,6 +111,16 @@ py_test( ], ) +py_test( + name = "test_datatype", + size = "small", + srcs = ["tests/test_datatype.py"], + tags = [ + "exclusive", + "team:data", + ], +) + py_test( name = "test_sql", size = "small", @@ -214,9 +224,9 @@ py_test( ) py_test( - name = "test_arrow_block_scaling", + name = "test_jumbo_arrow_block", size = "large", - srcs = ["tests/test_arrow_block_scaling.py"], + srcs = ["tests/test_jumbo_arrow_block.py"], tags = [ "data_non_parallel", "exclusive", @@ -373,6 +383,7 @@ py_test( size = "enormous", srcs = ["tests/test_groupby_e2e.py"], tags = [ + "data_non_parallel", "exclusive", "team:data", ], @@ -1239,6 +1250,20 @@ py_test( ], ) +py_test( + name = "test_unify_schemas_performance", + size = "small", + srcs = ["tests/test_unify_schemas_performance.py"], + tags = [ + "exclusive", + "team:data", + ], + deps = [ + ":conftest", + "//:ray_lib", + ], +) + py_test( name = "test_util", size = "small", @@ -1323,6 +1348,20 @@ py_test( ], ) +py_test( + name = "test_downstream_capacity_backpressure_policy", + size = "medium", + srcs = 
["tests/test_downstream_capacity_backpressure_policy.py"], + tags = [ + "exclusive", + "team:data", + ], + deps = [ + ":conftest", + "//:ray_lib", + ], +) + py_test( name = "test_backpressure_e2e", size = "large", @@ -1421,6 +1460,20 @@ py_test( ], ) +py_test( + name = "test_download_expression", + size = "small", + srcs = ["tests/test_download_expression.py"], + tags = [ + "exclusive", + "team:data", + ], + deps = [ + ":conftest", + "//:ray_lib", + ], +) + py_test( name = "test_context", size = "small", diff --git a/python/ray/data/__init__.py b/python/ray/data/__init__.py index c4962fd7db54..96a774f5a63f 100644 --- a/python/ray/data/__init__.py +++ b/python/ray/data/__init__.py @@ -1,6 +1,9 @@ # Short term workaround for https://github.com/ray-project/ray/issues/32435 # Dataset has a hard dependency on pandas, so it doesn't need to be delayed. import pandas # noqa +from packaging.version import parse as parse_version + +from ray._private.arrow_utils import get_pyarrow_version from ray.data._internal.compute import ActorPoolStrategy from ray.data._internal.datasource.tfrecords_datasource import TFXReadOptions @@ -75,6 +78,40 @@ configure_logging() +try: + import pyarrow as pa + + # Import these arrow extension types to ensure that they are registered. + from ray.air.util.tensor_extensions.arrow import ( # noqa + ArrowTensorType, + ArrowVariableShapedTensorType, + ) + + # https://github.com/apache/arrow/pull/38608 deprecated `PyExtensionType`, and + # disabled it's deserialization by default. To ensure that users can load data + # written with earlier version of Ray Data, we enable auto-loading of serialized + # tensor extensions. 
+ # + # NOTE: `PyExtensionType` is deleted from Arrow >= 21.0 + pyarrow_version = get_pyarrow_version() + if pyarrow_version is None or pyarrow_version >= parse_version("21.0.0"): + pass + else: + from ray._private.ray_constants import env_bool + + RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE = env_bool( + "RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE", False + ) + + if ( + pyarrow_version >= parse_version("14.0.1") + and RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE + ): + pa.PyExtensionType.set_auto_load(True) + +except ModuleNotFoundError: + pass + __all__ = [ "ActorPoolStrategy", diff --git a/python/ray/data/_expression_evaluator.py b/python/ray/data/_expression_evaluator.py index 370ef90b83bc..26642055aa2e 100644 --- a/python/ray/data/_expression_evaluator.py +++ b/python/ray/data/_expression_evaluator.py @@ -3,16 +3,19 @@ import operator from typing import Any, Callable, Dict +import numpy as np import pandas as pd import pyarrow as pa import pyarrow.compute as pc +from ray.data.block import DataBatch from ray.data.expressions import ( BinaryExpr, ColumnExpr, Expr, LiteralExpr, Operation, + UDFExpr, ) _PANDAS_EXPR_OPS_MAP = { @@ -44,7 +47,9 @@ } -def _eval_expr_recursive(expr: "Expr", batch, ops: Dict["Operation", Callable]) -> Any: +def _eval_expr_recursive( + expr: "Expr", batch: DataBatch, ops: Dict["Operation", Callable[..., Any]] +) -> Any: """Generic recursive expression evaluator.""" # TODO: Separate unresolved expressions (arbitrary AST with unresolved refs) # and resolved expressions (bound to a schema) for better error handling @@ -58,10 +63,26 @@ def _eval_expr_recursive(expr: "Expr", batch, ops: Dict["Operation", Callable]) _eval_expr_recursive(expr.left, batch, ops), _eval_expr_recursive(expr.right, batch, ops), ) - raise TypeError(f"Unsupported expression node: {type(expr).__name__}") + if isinstance(expr, UDFExpr): + args = [_eval_expr_recursive(arg, batch, ops) for arg in expr.args] + kwargs = { + k: _eval_expr_recursive(v, batch, ops) for k, v in expr.kwargs.items() + } + 
result = expr.fn(*args, **kwargs) + # Can't perform type validation for unions if python version is < 3.10 + if not isinstance(result, (pd.Series, np.ndarray, pa.Array, pa.ChunkedArray)): + function_name = expr.fn.__name__ + raise TypeError( + f"UDF '{function_name}' returned invalid type {type(result).__name__}. " + f"Expected type (pandas.Series, numpy.ndarray, pyarrow.Array, or pyarrow.ChunkedArray)" + ) -def eval_expr(expr: "Expr", batch) -> Any: + return result + raise TypeError(f"Unsupported expression node: {type(expr).__name__}") + + +def eval_expr(expr: "Expr", batch: DataBatch) -> Any: """Recursively evaluate *expr* against a batch of the appropriate type.""" if isinstance(batch, pd.DataFrame): return _eval_expr_recursive(expr, batch, _PANDAS_EXPR_OPS_MAP) diff --git a/python/ray/data/_internal/actor_autoscaler/__init__.py b/python/ray/data/_internal/actor_autoscaler/__init__.py new file mode 100644 index 000000000000..6d29cbc9e78c --- /dev/null +++ b/python/ray/data/_internal/actor_autoscaler/__init__.py @@ -0,0 +1,30 @@ +from typing import TYPE_CHECKING + +from .autoscaling_actor_pool import ActorPoolScalingRequest, AutoscalingActorPool +from .base_actor_autoscaler import ActorAutoscaler +from .default_actor_autoscaler import DefaultActorAutoscaler + +if TYPE_CHECKING: + from ray.data._internal.execution.resource_manager import ResourceManager + from ray.data._internal.execution.streaming_executor_state import Topology + from ray.data.context import AutoscalingConfig + + +def create_actor_autoscaler( + topology: "Topology", + resource_manager: "ResourceManager", + config: "AutoscalingConfig", +) -> ActorAutoscaler: + return DefaultActorAutoscaler( + topology, + resource_manager, + config=config, + ) + + +__all__ = [ + "ActorAutoscaler", + "ActorPoolScalingRequest", + "AutoscalingActorPool", + "create_actor_autoscaler", +] diff --git a/python/ray/data/_internal/execution/autoscaler/autoscaling_actor_pool.py 
b/python/ray/data/_internal/actor_autoscaler/autoscaling_actor_pool.py similarity index 89% rename from python/ray/data/_internal/execution/autoscaler/autoscaling_actor_pool.py rename to python/ray/data/_internal/actor_autoscaler/autoscaling_actor_pool.py index 57bd47932b20..f6145a4c175f 100644 --- a/python/ray/data/_internal/execution/autoscaler/autoscaling_actor_pool.py +++ b/python/ray/data/_internal/actor_autoscaler/autoscaling_actor_pool.py @@ -10,6 +10,7 @@ class ActorPoolScalingRequest: delta: int + force: bool = field(default=False) reason: Optional[str] = field(default=None) @classmethod @@ -22,9 +23,11 @@ def upscale(cls, *, delta: int, reason: Optional[str] = None): return ActorPoolScalingRequest(delta=delta, reason=reason) @classmethod - def downscale(cls, *, delta: int, reason: Optional[str] = None): + def downscale( + cls, *, delta: int, force: bool = False, reason: Optional[str] = None + ): assert delta < 0, "For scale down delta is expected to be negative!" - return ActorPoolScalingRequest(delta=delta, reason=reason) + return ActorPoolScalingRequest(delta=delta, force=force, reason=reason) @DeveloperAPI @@ -106,3 +109,6 @@ def per_actor_resource_usage(self) -> ExecutionResources: def get_pool_util(self) -> float: """Calculate the utilization of the given actor pool.""" ... 
+ + def max_concurrent_tasks(self) -> int: + return self.max_actor_concurrency() * self.num_running_actors() diff --git a/python/ray/data/_internal/actor_autoscaler/base_actor_autoscaler.py b/python/ray/data/_internal/actor_autoscaler/base_actor_autoscaler.py new file mode 100644 index 000000000000..aebdb89bb431 --- /dev/null +++ b/python/ray/data/_internal/actor_autoscaler/base_actor_autoscaler.py @@ -0,0 +1,31 @@ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +from ray.util.annotations import DeveloperAPI + +if TYPE_CHECKING: + from ray.data._internal.execution.resource_manager import ResourceManager + from ray.data._internal.execution.streaming_executor_state import Topology + + +@DeveloperAPI +class ActorAutoscaler(ABC): + """Abstract interface for Ray Data actor autoscaler.""" + + def __init__( + self, + topology: "Topology", + resource_manager: "ResourceManager", + ): + self._topology = topology + self._resource_manager = resource_manager + + @abstractmethod + def try_trigger_scaling(self): + """Try trigger autoscaling. + + This method will be called each time when StreamingExecutor makes + a scheduling decision. A subclass should override this method to + handle the autoscaling of `AutoscalingActorPool`s. + """ + ... 
diff --git a/python/ray/data/_internal/execution/autoscaler/default_autoscaler.py b/python/ray/data/_internal/actor_autoscaler/default_actor_autoscaler.py similarity index 58% rename from python/ray/data/_internal/execution/autoscaler/default_autoscaler.py rename to python/ray/data/_internal/actor_autoscaler/default_actor_autoscaler.py index fd385f97ba33..50cc1662fcdb 100644 --- a/python/ray/data/_internal/execution/autoscaler/default_autoscaler.py +++ b/python/ray/data/_internal/actor_autoscaler/default_actor_autoscaler.py @@ -1,15 +1,9 @@ import logging import math -import time -from typing import TYPE_CHECKING, Dict +from typing import TYPE_CHECKING, Optional -import ray -from .autoscaler import Autoscaler from .autoscaling_actor_pool import ActorPoolScalingRequest, AutoscalingActorPool -from .util import get_max_scale_up -from ray.data._internal.execution.autoscaling_requester import ( - get_or_create_autoscaling_requester_actor, -) +from .base_actor_autoscaler import ActorAutoscaler from ray.data._internal.execution.interfaces.execution_options import ExecutionResources from ray.data.context import WARN_PREFIX, AutoscalingConfig @@ -18,24 +12,18 @@ from ray.data._internal.execution.resource_manager import ResourceManager from ray.data._internal.execution.streaming_executor_state import OpState, Topology - logger = logging.getLogger(__name__) -class DefaultAutoscaler(Autoscaler): - - # Min number of seconds between two autoscaling requests. 
- MIN_GAP_BETWEEN_AUTOSCALING_REQUESTS = 20 - +class DefaultActorAutoscaler(ActorAutoscaler): def __init__( self, topology: "Topology", resource_manager: "ResourceManager", *, - execution_id: str, config: AutoscalingConfig, ): - super().__init__(topology, resource_manager, execution_id) + super().__init__(topology, resource_manager) self._actor_pool_scaling_up_threshold = ( config.actor_pool_util_upscaling_threshold @@ -46,12 +34,14 @@ def __init__( self._validate_autoscaling_config() - # Last time when a request was sent to Ray's autoscaler. - self._last_request_time = 0 - def try_trigger_scaling(self): - self._try_scale_up_cluster() - self._try_scale_up_or_down_actor_pool() + for op, state in self._topology.items(): + actor_pools = op.get_autoscaling_actor_pools() + for actor_pool in actor_pools: + # Trigger auto-scaling + actor_pool.scale( + self._derive_target_scaling_config(actor_pool, op, state) + ) def _derive_target_scaling_config( self, @@ -64,7 +54,7 @@ def _derive_target_scaling_config( op._inputs_complete and op_state.total_enqueued_input_bundles() == 0 ): return ActorPoolScalingRequest.downscale( - delta=-1, reason="consumed all inputs" + delta=-1, force=True, reason="consumed all inputs" ) if actor_pool.current_size() < actor_pool.min_size(): @@ -99,7 +89,7 @@ def _derive_target_scaling_config( reason="operator exceeding resource quota" ) budget = self._resource_manager.get_budget(op) - if get_max_scale_up(actor_pool, budget) == 0: + if _get_max_scale_up(actor_pool, budget) == 0: return ActorPoolScalingRequest.no_op(reason="exceeded resource limits") return ActorPoolScalingRequest.upscale( @@ -129,86 +119,6 @@ def _derive_target_scaling_config( ) ) - def _try_scale_up_or_down_actor_pool(self): - for op, state in self._topology.items(): - actor_pools = op.get_autoscaling_actor_pools() - for actor_pool in actor_pools: - # Trigger auto-scaling - actor_pool.scale( - self._derive_target_scaling_config(actor_pool, op, state) - ) - - def 
_try_scale_up_cluster(self): - """Try to scale up the cluster to accomodate the provided in-progress workload. - - This makes a resource request to Ray's autoscaler consisting of the current, - aggregate usage of all operators in the DAG + the incremental usage of all - operators that are ready for dispatch (i.e. that have inputs queued). If the - autoscaler were to grant this resource request, it would allow us to dispatch - one task for every ready operator. - - Note that this resource request does not take the global resource limits or the - liveness policy into account; it only tries to make the existing resource usage - + one more task per ready operator feasible in the cluster. - """ - # Limit the frequency of autoscaling requests. - now = time.time() - if now - self._last_request_time < self.MIN_GAP_BETWEEN_AUTOSCALING_REQUESTS: - return - - # Scale up the cluster, if no ops are allowed to run, but there are still data - # in the input queues. - no_runnable_op = all( - not op_state._scheduling_status.runnable - for _, op_state in self._topology.items() - ) - any_has_input = any( - op_state._pending_dispatch_input_bundles_count() > 0 - for _, op_state in self._topology.items() - ) - if not (no_runnable_op and any_has_input): - return - - self._last_request_time = now - - # Get resource usage for all ops + additional resources needed to launch one - # more task for each ready op. - resource_request = [] - - def to_bundle(resource: ExecutionResources) -> Dict: - req = {} - if resource.cpu: - req["CPU"] = math.ceil(resource.cpu) - if resource.gpu: - req["GPU"] = math.ceil(resource.gpu) - return req - - for op, state in self._topology.items(): - per_task_resource = op.incremental_resource_usage() - task_bundle = to_bundle(per_task_resource) - resource_request.extend([task_bundle] * op.num_active_tasks()) - # Only include incremental resource usage for ops that are ready for - # dispatch. 
- if state._pending_dispatch_input_bundles_count() > 0: - # TODO(Clark): Scale up more aggressively by adding incremental resource - # usage for more than one bundle in the queue for this op? - resource_request.append(task_bundle) - - self._send_resource_request(resource_request) - - def _send_resource_request(self, resource_request): - # Make autoscaler resource request. - actor = get_or_create_autoscaling_requester_actor() - actor.request_resources.remote(resource_request, self._execution_id) - - def on_executor_shutdown(self): - # Make request for zero resources to autoscaler for this execution. - actor = get_or_create_autoscaling_requester_actor() - actor.request_resources.remote({}, self._execution_id) - - def get_total_resources(self) -> ExecutionResources: - return ExecutionResources.from_resource_dict(ray.cluster_resources()) - def _validate_autoscaling_config(self): for op, state in self._topology.items(): for actor_pool in op.get_autoscaling_actor_pools(): @@ -229,3 +139,46 @@ def _validate_actor_pool_autoscaling_config( f"actor pool is configured to avoid buffering (its " f"`max_tasks_in_flight_per_actor` == `max_concurrency`)" ) + + +def _get_max_scale_up( + actor_pool: AutoscalingActorPool, + budget: Optional[ExecutionResources], +) -> Optional[int]: + """Get the maximum number of actors that can be scaled up. + + Args: + actor_pool: The actor pool to scale up. + budget: The budget to scale up. + + Returns: + The maximum number of actors that can be scaled up, or `None` if you can + scale up infinitely. 
+ """ + if budget is None: + return None + + assert budget.cpu >= 0 and budget.gpu >= 0 + + num_cpus_per_actor = actor_pool.per_actor_resource_usage().cpu + num_gpus_per_actor = actor_pool.per_actor_resource_usage().gpu + assert num_cpus_per_actor >= 0 and num_gpus_per_actor >= 0 + + max_cpu_scale_up: float = float("inf") + if num_cpus_per_actor > 0 and not math.isinf(budget.cpu): + max_cpu_scale_up = budget.cpu // num_cpus_per_actor + + max_gpu_scale_up: float = float("inf") + if num_gpus_per_actor > 0 and not math.isinf(budget.gpu): + max_gpu_scale_up = budget.gpu // num_gpus_per_actor + + max_scale_up = min(max_cpu_scale_up, max_gpu_scale_up) + if math.isinf(max_scale_up): + return None + else: + assert not math.isnan(max_scale_up), ( + budget, + num_cpus_per_actor, + num_gpus_per_actor, + ) + return int(max_scale_up) diff --git a/python/ray/data/_internal/arrow_block.py b/python/ray/data/_internal/arrow_block.py index 9d18db5bf028..14a11b8b0fab 100644 --- a/python/ray/data/_internal/arrow_block.py +++ b/python/ray/data/_internal/arrow_block.py @@ -187,7 +187,7 @@ def _get_max_chunk_size( if table.nbytes == 0: return None else: - avg_row_size = int(table.nbytes / table.num_rows) + avg_row_size = table.nbytes / table.num_rows return max(1, int(max_chunk_size_bytes / avg_row_size)) @@ -263,8 +263,10 @@ def schema(self) -> "pyarrow.lib.Schema": def to_pandas(self) -> "pandas.DataFrame": from ray.air.util.data_batch_conversion import _cast_tensor_columns_to_ndarrays - df = self._table.to_pandas() + # We specify ignore_metadata=True because pyarrow will use the metadata + # to build the Table. 
This is handled incorrectly for older pyarrow versions ctx = DataContext.get_current() + df = self._table.to_pandas(ignore_metadata=ctx.pandas_block_ignore_metadata) if ctx.enable_tensor_extension_casting: df = _cast_tensor_columns_to_ndarrays(df) return df @@ -335,6 +337,19 @@ def _zip(self, acc: BlockAccessor) -> "Block": r = r.append_column(col_name, col) return r + def upsert_column( + self, column_name: str, column_data: BlockColumn + ) -> "pyarrow.Table": + assert isinstance( + column_data, (pyarrow.Array, pyarrow.ChunkedArray) + ), f"Expected either a pyarrow.Array or pyarrow.ChunkedArray, got: {type(column_data)}" + + column_idx = self._table.schema.get_field_index(column_name) + if column_idx == -1: + return self._table.append_column(column_name, column_data) + else: + return self._table.set_column(column_idx, column_name, column_data) + @staticmethod def builder() -> ArrowBlockBuilder: return ArrowBlockBuilder() diff --git a/python/ray/data/_internal/arrow_ops/transform_pyarrow.py b/python/ray/data/_internal/arrow_ops/transform_pyarrow.py index 71a85c91d168..d52f97eb9d41 100644 --- a/python/ray/data/_internal/arrow_ops/transform_pyarrow.py +++ b/python/ray/data/_internal/arrow_ops/transform_pyarrow.py @@ -1,5 +1,6 @@ import logging -from typing import TYPE_CHECKING, Dict, List, Optional, Union +from collections import defaultdict +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import numpy as np from packaging.version import parse as parse_version @@ -10,6 +11,7 @@ from ray.air.util.tensor_extensions.arrow import ( MIN_PYARROW_VERSION_CHUNKED_ARRAY_TO_NUMPY_ZERO_COPY_ONLY, PYARROW_VERSION, + get_arrow_extension_tensor_types, ) try: @@ -67,6 +69,21 @@ def _create_empty_table(schema: "pyarrow.Schema"): return pa.table(arrays, schema=schema) +def _hash_partition( + table: "pyarrow.Table", + num_partitions: int, +) -> np.ndarray: + + partitions = np.zeros((table.num_rows,), dtype=np.int64) + for i in range(table.num_rows): + _tuple = 
tuple(c[i] for c in table.columns) + partitions[i] = hash(_tuple) % num_partitions + + # Convert to ndarray to compute hash partition indices + # more efficiently + return partitions + + def hash_partition( table: "pyarrow.Table", *, @@ -90,15 +107,7 @@ def hash_partition( return {0: table} projected_table = table.select(hash_cols) - - partitions = np.zeros((projected_table.num_rows,)) - for i in range(projected_table.num_rows): - _tuple = tuple(c[i] for c in projected_table.columns) - partitions[i] = hash(_tuple) % num_partitions - - # Convert to ndarray to compute hash partition indices - # more efficiently - partitions_array = np.asarray(partitions) + partitions_array = _hash_partition(projected_table, num_partitions=num_partitions) # For every partition compile list of indices of rows falling # under that partition indices = [np.where(partitions_array == p)[0] for p in range(num_partitions)] @@ -148,169 +157,210 @@ def take_table( return table -def unify_schemas( - schemas: List["pyarrow.Schema"], *, promote_types: bool = False -) -> "pyarrow.Schema": - """Version of `pyarrow.unify_schemas()` which also handles checks for - variable-shaped tensors in the given schemas. - - This function scans all input schemas to identify columns that contain - variable-shaped tensors or objects. For tensor columns, it ensures the - use of appropriate tensor types (including variable-shaped tensor types). - For object columns, it uses a specific object type to accommodate any - objects present. Additionally, it handles columns with null-typed lists - by determining their actual types from the given schemas. - - Currently, it disallows the concatenation of tensor columns and - pickled object columsn for performance reasons. +def _reconcile_diverging_fields( + unique_schemas: List["pyarrow.Schema"], + promote_types: bool, +) -> Dict[str, Any]: """ - import pyarrow as pa + Identify and reconcile fields whose presence or types differ across the provided schemas. 
- from ray.air.util.object_extensions.arrow import ArrowPythonObjectType - from ray.air.util.tensor_extensions.arrow import ( - ArrowTensorType, - ArrowVariableShapedTensorType, - ) + Args: + unique_schemas: List of PyArrow schemas to find diverging fields in. + promote_types: Whether to promote types. - schemas_to_unify = [] - schema_field_overrides = {} + Returns: + A dictionary of diverging fields with their reconciled types. + """ + from ray.air.util.object_extensions.arrow import ArrowPythonObjectType - # Rollup columns with opaque (null-typed) lists, to override types in - # the following for-loop. - cols_with_null_list = set() + reconciled_fields = {} + field_types = defaultdict(set) # field_name -> set of types seen so far + field_flags = defaultdict( + lambda: defaultdict(bool) + ) # field_name -> dict of boolean flags + + # Process schemas and reconcile on-the-fly + for schema in unique_schemas: + for field_name in schema.names: + if field_name in reconciled_fields: + # If the field has already been reconciled, skip it. 
+ continue + + field_type = schema.field(field_name).type + field_types[field_name].add(field_type) + flags = field_flags[field_name] + + # Update flags + flags["has_object"] |= isinstance(field_type, ArrowPythonObjectType) + flags["has_tensor"] |= isinstance( + field_type, get_arrow_extension_tensor_types() + ) + flags["has_list"] |= pyarrow.types.is_list(field_type) + flags["has_null"] |= pyarrow.types.is_null(field_type) + flags["has_struct"] |= pyarrow.types.is_struct(field_type) - all_columns = set() - for schema in schemas: - for col_name in schema.names: - # Check for duplicate field names in this schema - if schema.names.count(col_name) > 1: - # This is broken for Pandas blocks and broken with the logic here + # Check for object-tensor conflict + if flags["has_object"] and flags["has_tensor"]: raise ValueError( - f"Schema {schema} has multiple fields with the same name: {col_name}" + f"Found columns with both objects and tensors: {field_name}" ) - col_type = schema.field(col_name).type - if pa.types.is_list(col_type) and pa.types.is_null(col_type.value_type): - cols_with_null_list.add(col_name) - all_columns.add(col_name) + # Reconcile immediately if it's a special type and if it's divergent. + if any(flags.values()) and len(field_types[field_name]) > 1: + reconciled_value = _reconcile_field( + non_null_types=field_types[field_name], + promote_types=promote_types, + ) + if reconciled_value is not None: + reconciled_fields[field_name] = reconciled_value + + return reconciled_fields + + +def _reconcile_field( + non_null_types: List[pyarrow.DataType], + promote_types: bool = False, +) -> Optional[pyarrow.DataType]: + """ + Reconcile a single divergent field across schemas. + + Returns reconciled type or None if default PyArrow handling is sufficient. 
+ """ + from ray.air.util.object_extensions.arrow import ArrowPythonObjectType from ray.air.util.tensor_extensions.arrow import ( - get_arrow_extension_fixed_shape_tensor_types, + ArrowTensorType, + ArrowVariableShapedTensorType, get_arrow_extension_tensor_types, ) - arrow_tensor_types = get_arrow_extension_tensor_types() - arrow_fixed_shape_tensor_types = get_arrow_extension_fixed_shape_tensor_types() - - columns_with_objects = set() - columns_with_tensor_array = set() - columns_with_struct = set() - for col_name in all_columns: - for s in schemas: - if col_name in s.names: - if isinstance(s.field(col_name).type, ArrowPythonObjectType): - columns_with_objects.add(col_name) - if isinstance(s.field(col_name).type, arrow_tensor_types): - columns_with_tensor_array.add(col_name) - if isinstance(s.field(col_name).type, pa.StructType): - columns_with_struct.add(col_name) - - if len(columns_with_objects.intersection(columns_with_tensor_array)) > 0: - # This is supportable if we use object type, but it will be expensive - raise ValueError( - "Found columns with both objects and tensors: " - f"{columns_with_tensor_array.intersection(columns_with_objects)}" + if not non_null_types: + return None + + tensor_types = get_arrow_extension_tensor_types() + + # Handle special cases in priority order + + # 1. 
Tensor fields + tensor_field_types = [t for t in non_null_types if isinstance(t, tensor_types)] + if tensor_field_types: + needs_variable_shape = ArrowTensorType._need_variable_shaped_tensor_array( + tensor_field_types ) - for col_name in columns_with_tensor_array: - tensor_array_types = [ - s.field(col_name).type - for s in schemas - if col_name in s.names - and isinstance(s.field(col_name).type, arrow_tensor_types) - ] - # Check if we have missing tensor fields (some schemas don't have this field) - has_missing_fields = len(tensor_array_types) < len(schemas) - - # Convert to variable-shaped if needed or if we have missing fields - if ( - ArrowTensorType._need_variable_shaped_tensor_array(tensor_array_types) - or has_missing_fields - ): - if isinstance(tensor_array_types[0], ArrowVariableShapedTensorType): - new_type = tensor_array_types[0] - elif isinstance(tensor_array_types[0], arrow_fixed_shape_tensor_types): - new_type = ArrowVariableShapedTensorType( - dtype=tensor_array_types[0].scalar_type, - ndim=len(tensor_array_types[0].shape), - ) + if needs_variable_shape: + first_tensor = tensor_field_types[0] + if isinstance(first_tensor, ArrowVariableShapedTensorType): + return first_tensor else: - raise ValueError( - "Detected need for variable shaped tensor representation, " - f"but schema is not ArrayTensorType: {tensor_array_types[0]}" + # Convert fixed-shape to variable-shape + return ArrowVariableShapedTensorType( + dtype=first_tensor.scalar_type, ndim=len(first_tensor.shape) ) - schema_field_overrides[col_name] = new_type - - for col_name in columns_with_objects: - schema_field_overrides[col_name] = ArrowPythonObjectType() - for col_name in columns_with_struct: - field_types = [s.field(col_name).type for s in schemas] + # 2. Object fields + if any(isinstance(t, ArrowPythonObjectType) for t in non_null_types): + return ArrowPythonObjectType() - # Unify struct schemas + # 3. 
Struct fields (recursive unification) + struct_types = [t for t in non_null_types if pyarrow.types.is_struct(t)] + if struct_types: + # Convert struct types to schemas struct_schemas = [] - for t in field_types: - if t is not None and pa.types.is_struct(t): - struct_schemas.append(pa.schema(list(t))) - else: - struct_schemas.append(pa.schema([])) + for t in non_null_types: + if pyarrow.types.is_struct(t): + struct_schemas.append(pyarrow.schema(list(t))) + # Recursively unify + unified_struct = unify_schemas(struct_schemas, promote_types=promote_types) + return pyarrow.struct(list(unified_struct)) + + # 4. Null-typed list fields (Need this pyarrow < 14.0.0) + null_lists = [ + t + for t in non_null_types + if pyarrow.types.is_list(t) and pyarrow.types.is_null(t.value_type) + ] + if null_lists: + # Find first non-null list type + for t in non_null_types: + if not (pyarrow.types.is_list(t) and pyarrow.types.is_null(t.value_type)): + return t + # At this phase, we have no special types to reconcile, so return None. Arrow will fail to unify. + return None + + +def _unify_schemas_pyarrow( + schemas: List["pyarrow.Schema"], promote_types: bool = False +) -> "pyarrow.Schema": + """Wrapper for pyarrow.unify_schemas with version compatibility.""" + if get_pyarrow_version() < MIN_PYARROW_VERSION_TYPE_PROMOTION: + return pyarrow.unify_schemas(schemas) - unified_struct_schema = unify_schemas( - struct_schemas, promote_types=promote_types - ) + promote_options = "permissive" if promote_types else "default" + return pyarrow.unify_schemas(schemas, promote_options=promote_options) - schema_field_overrides[col_name] = pa.struct(list(unified_struct_schema)) - - if cols_with_null_list: - # For each opaque list column, iterate through all schemas until we find - # a valid value_type that can be used to override the column types in - # the following for-loop. 
- for col_name in cols_with_null_list: - for schema in schemas: - col_type = schema.field(col_name).type - if not pa.types.is_list(col_type) or not pa.types.is_null( - col_type.value_type - ): - schema_field_overrides[col_name] = col_type - break - - if schema_field_overrides: - # Go through all schemas and update the types of columns from the above loop. - for schema in schemas: - for col_name, col_new_type in schema_field_overrides.items(): - if col_name in schema.names: - var_shaped_col = schema.field(col_name).with_type(col_new_type) - col_idx = schema.get_field_index(col_name) - schema = schema.set(col_idx, var_shaped_col) - schemas_to_unify.append(schema) - else: - schemas_to_unify = schemas - try: - if get_pyarrow_version() < MIN_PYARROW_VERSION_TYPE_PROMOTION: - return pyarrow.unify_schemas(schemas_to_unify) +def unify_schemas( + schemas: List["pyarrow.Schema"], *, promote_types: bool = False +) -> "pyarrow.Schema": + """ + Unify schemas handling Ray-specific types (tensors, objects, etc.). + + Falls back to PyArrow's unify_schemas when possible, with custom + handling for tensor arrays, object types, and recursive struct unification. + """ + if not schemas: + raise ValueError("No schemas provided for unify_schemas") - # NOTE: By default type promotion (from "smaller" to "larger" types) is disabled, - # allowing only promotion b/w nullable and non-nullable ones - arrow_promote_types_mode = "permissive" if promote_types else "default" + # Deduplicate schemas. Calling this before PyArrow's unify_schemas is more efficient (100x faster). 
- return pyarrow.unify_schemas( - schemas_to_unify, promote_options=arrow_promote_types_mode - ) - except Exception as e: - schemas_str = "\n-----\n".join([str(s) for s in schemas_to_unify]) + # Remove metadata for hashability + schemas[0].remove_metadata() + schemas_to_unify = [schemas[0]] + for schema in schemas[1:]: + schema.remove_metadata() + if not schema.equals(schemas[0]): + schemas_to_unify.append(schema) - logger.error(f"Failed to unify schemas: {schemas_str}", exc_info=e) + pyarrow_exception = None + # If there is only one schema, return it + if len(schemas_to_unify) == 1: + return schemas_to_unify[0] + # Try PyArrow's unification first, only reconcile for tensor fields + try: + return _unify_schemas_pyarrow(schemas_to_unify, promote_types) + except (pyarrow.lib.ArrowTypeError, pyarrow.lib.ArrowInvalid) as e: + # If we raise only on non tensor errors, it fails to unify PythonObjectType and pyarrow primitives. + # Look at test_pyarrow_conversion_error_handling for an example. + pyarrow_exception = e + pass + + # Reconcile diverging fields + overrides = _reconcile_diverging_fields(schemas_to_unify, promote_types) + + # At this point, we're not able to reconcile the fields, so raise the original exception. 
+ if not overrides: + raise pyarrow_exception + + # Apply overrides to schemas + updated_schemas = [] + for schema in schemas_to_unify: + for name, new_type in overrides.items(): + try: + idx = schema.get_field_index(name) + field = schema.field(name).with_type(new_type) + schema = schema.set(idx, field) + except KeyError: + pass + updated_schemas.append(schema) + schemas_to_unify = updated_schemas + + # Final unification with overrides applied + try: + return _unify_schemas_pyarrow(schemas_to_unify, promote_types) + except Exception as e: + schemas_str = "\n-----\n".join(str(s) for s in schemas_to_unify) + logger.error(f"Failed to unify schemas: {schemas_str}", exc_info=e) raise diff --git a/python/ray/data/_internal/block_batching/interfaces.py b/python/ray/data/_internal/block_batching/interfaces.py index 452b6d850b93..4f0bed6b3dd4 100644 --- a/python/ray/data/_internal/block_batching/interfaces.py +++ b/python/ray/data/_internal/block_batching/interfaces.py @@ -7,30 +7,38 @@ @dataclass -class Batch: - """A batch of data with a corresponding index. +class BatchMetadata: + """Metadata associated with a batch. Attributes: batch_idx: The global index of this batch so that downstream operations can maintain ordering. - data: The batch of data. """ batch_idx: int + + +@dataclass +class Batch: + """A batch of data. + + Attributes: + metadata: Metadata associated with this batch. + data: The batch of data. + """ + + metadata: BatchMetadata data: DataBatch class CollatedBatch(Batch): - """A batch of collated data with a corresponding index. + """A batch of collated data. Attributes: - batch_idx: The global index of this batch so that downstream operations can - maintain ordering. data: The batch of data which is the output of a user provided collate_fn Therefore, the type of this data can be Any. 
""" - batch_idx: int data: Any diff --git a/python/ray/data/_internal/block_batching/iter_batches.py b/python/ray/data/_internal/block_batching/iter_batches.py index 824f17aecbcb..9dc052d12aaa 100644 --- a/python/ray/data/_internal/block_batching/iter_batches.py +++ b/python/ray/data/_internal/block_batching/iter_batches.py @@ -1,5 +1,5 @@ import collections -from contextlib import nullcontext +from contextlib import contextmanager, nullcontext from typing import Any, Callable, Dict, Iterator, Optional import ray @@ -9,42 +9,28 @@ WaitBlockPrefetcher, blocks_to_batches, collate, - extract_data_from_batch, finalize_batches, format_batches, resolve_block_refs, ) from ray.data._internal.execution.interfaces.ref_bundle import RefBundle from ray.data._internal.memory_tracing import trace_deallocation -from ray.data._internal.stats import DatasetStats +from ray.data._internal.stats import DatasetStats, StatsManager from ray.data._internal.util import make_async_gen from ray.data.block import Block, DataBatch from ray.data.context import DataContext from ray.types import ObjectRef -def iter_batches( - ref_bundles: Iterator[RefBundle], - *, - stats: Optional[DatasetStats] = None, - clear_block_after_read: bool = False, - batch_size: Optional[int] = None, - batch_format: Optional[str] = "default", - drop_last: bool = False, - collate_fn: Optional[Callable[[DataBatch], Any]] = None, - finalize_fn: Optional[Callable[[Any], Any]] = None, - shuffle_buffer_min_size: Optional[int] = None, - shuffle_seed: Optional[int] = None, - ensure_copy: bool = False, - prefetch_batches: int = 1, -) -> Iterator[DataBatch]: - """Create formatted batches of data from an iterator of block object references and - corresponding metadata. +class BatchIterator: + """Defines an iterator pipeline to convert a stream of block object references + into a stream of formatted batches ready to be consumed by the user. 
This takes a block iterator and creates batch_size batches, slicing, unioning, shuffling, prefetching, and formatting blocks as needed. - The algorithm uses both pipeline parallelism and data parallelism: + This involves both pipeline parallelism (e.g. prefetching) + and data parallelism (e.g. threadpool operations): If prefetch_batches=2, these are all the batches in flight: @@ -74,6 +60,7 @@ def iter_batches( Args: ref_bundles: An iterator over RefBundles. stats: DatasetStats object to record timing and other statistics. + dataset_tag: The tag of the dataset to record timing and other statistics. clear_block_after_read: Whether to clear the block from object store manually (i.e. without waiting for Python's automatic GC) after it is read. Doing so will reclaim memory faster and hence reduce the @@ -103,86 +90,181 @@ def iter_batches( the specified amount of formatted batches from blocks. This improves performance for non-CPU bound UDFs, allowing batch fetching compute and formatting to be overlapped with the UDF. Defaults to 1. - - Returns: - An iterator over record batches. 
""" - context = DataContext.get_current() - if ( - prefetch_batches > 0 - and context.actor_prefetcher_enabled - and not ray.util.client.ray.is_connected() + def __init__( + self, + ref_bundles: Iterator[RefBundle], + *, + stats: Optional[DatasetStats] = None, + dataset_tag: Optional[str] = None, + clear_block_after_read: bool = False, + batch_size: Optional[int] = None, + batch_format: Optional[str] = "default", + drop_last: bool = False, + collate_fn: Optional[Callable[[DataBatch], Any]] = None, + finalize_fn: Optional[Callable[[Any], Any]] = None, + shuffle_buffer_min_size: Optional[int] = None, + shuffle_seed: Optional[int] = None, + ensure_copy: bool = False, + prefetch_batches: int = 1, ): - prefetcher = ActorBlockPrefetcher() - else: - prefetcher = WaitBlockPrefetcher() + self._ref_bundles = ref_bundles + self._stats = stats + self._dataset_tag = dataset_tag + self._batch_size = batch_size + self._batch_format = batch_format + self._drop_last = drop_last + self._collate_fn = collate_fn + self._finalize_fn = finalize_fn + self._shuffle_buffer_min_size = shuffle_buffer_min_size + self._shuffle_seed = shuffle_seed + self._ensure_copy = ensure_copy + self._prefetch_batches = prefetch_batches + self._eager_free = ( + clear_block_after_read and DataContext.get_current().eager_free + ) - eager_free = clear_block_after_read and DataContext.get_current().eager_free + actor_prefetcher_enabled = ( + prefetch_batches > 0 + and DataContext.get_current().actor_prefetcher_enabled + and not ray.util.client.ray.is_connected() + ) + self._prefetcher = ( + ActorBlockPrefetcher() + if actor_prefetcher_enabled + else WaitBlockPrefetcher() + ) + self._yielded_first_batch = False - def _async_iter_batches( - ref_bundles: Iterator[RefBundle], - ) -> Iterator[DataBatch]: - # Step 1: Prefetch logical batches locally. 
- block_iter = prefetch_batches_locally( + def _prefetch_blocks( + self, ref_bundles: Iterator[RefBundle] + ) -> Iterator[ObjectRef[Block]]: + return prefetch_batches_locally( ref_bundles=ref_bundles, - prefetcher=prefetcher, - num_batches_to_prefetch=prefetch_batches, - batch_size=batch_size, - eager_free=eager_free, + prefetcher=self._prefetcher, + num_batches_to_prefetch=self._prefetch_batches, + batch_size=self._batch_size, + eager_free=self._eager_free, + ) + + def _resolve_block_refs( + self, block_refs: Iterator[ObjectRef[Block]] + ) -> Iterator[Block]: + return resolve_block_refs(block_ref_iter=block_refs, stats=self._stats) + + def _blocks_to_batches(self, blocks: Iterator[Block]) -> Iterator[Batch]: + return blocks_to_batches( + block_iter=blocks, + stats=self._stats, + batch_size=self._batch_size, + drop_last=self._drop_last, + shuffle_buffer_min_size=self._shuffle_buffer_min_size, + shuffle_seed=self._shuffle_seed, + ensure_copy=self._ensure_copy, + ) + + def _format_batches(self, batches: Iterator[Batch]) -> Iterator[Batch]: + return _format_in_threadpool( + batch_iter=batches, + stats=self._stats, + batch_format=self._batch_format, + collate_fn=self._collate_fn, + num_threadpool_workers=self._prefetch_batches, + ) + + def _finalize_batches( + self, + batch_iter: Iterator[Batch], + ) -> Iterator[Batch]: + if self._finalize_fn is None: + return batch_iter + + return finalize_batches( + batch_iter, finalize_fn=self._finalize_fn, stats=self._stats ) + def _restore_original_batch_order( + self, batches: Iterator[Batch] + ) -> Iterator[Batch]: + return restore_original_order(batches) + + def _pipeline(self, ref_bundles: Iterator[RefBundle]) -> Iterator[Batch]: + # Step 1: Prefetch logical batches locally. + block_iter = self._prefetch_blocks(ref_bundles) + # Step 2: Resolve the blocks. - block_iter = resolve_block_refs(block_ref_iter=block_iter, stats=stats) + block_iter = self._resolve_block_refs(block_iter) # Step 3: Batch and shuffle the resolved blocks. 
- batch_iter = blocks_to_batches( - block_iter=block_iter, - stats=stats, - batch_size=batch_size, - drop_last=drop_last, - shuffle_buffer_min_size=shuffle_buffer_min_size, - shuffle_seed=shuffle_seed, - ensure_copy=ensure_copy, - ) + batch_iter = self._blocks_to_batches(block_iter) + + # Step 4: Format and collate the batches in a threadpool. + batch_iter = self._format_batches(batch_iter) + + # Step 5: Finalize the batches (e.g., move to GPU). + batch_iter = self._finalize_batches(batch_iter) + + # Step 6: Restore the original order of the batches, as the prior + # threadpool operations may have reordered the batches non-deterministically. + batch_iter = self._restore_original_batch_order(batch_iter) + + yield from batch_iter - # Step 4: Use a threadpool for formatting and collation. - batch_iter = _format_in_threadpool( - batch_iter, - stats=stats, - batch_format=batch_format, - collate_fn=collate_fn, - num_threadpool_workers=prefetch_batches, + def _iter_batches(self) -> Iterator[DataBatch]: + async_batch_iter = make_async_gen( + self._ref_bundles, + fn=self._pipeline, + num_workers=1, + preserve_ordering=False, ) - # Step 5: Finalize each batch. - if finalize_fn is not None: - batch_iter = finalize_batches( - batch_iter, finalize_fn=finalize_fn, stats=stats - ) + self.before_epoch_start() - # Step 6: Restore original order. - batch_iter: Iterator[Batch] = restore_original_order(batch_iter) + while True: + with self.get_next_batch_context(): + try: + batch = next(async_batch_iter) + except StopIteration: + break + with self.yield_batch_context(batch): + yield batch.data - yield from extract_data_from_batch(batch_iter) + self.after_epoch_end() - # Run everything in a separate thread to not block the main thread when waiting - # for streaming results. 
- async_batch_iter = make_async_gen( - ref_bundles, - fn=_async_iter_batches, - num_workers=1, - preserve_ordering=False, - ) + def __iter__(self) -> Iterator[DataBatch]: + return self._iter_batches() - while True: - with stats.iter_total_blocked_s.timer() if stats else nullcontext(): - try: - next_batch = next(async_batch_iter) - except StopIteration: - break - with stats.iter_user_s.timer() if stats else nullcontext(): - yield next_batch + def before_epoch_start(self): + self._yielded_first_batch = False + + def after_epoch_end(self): + StatsManager.clear_iteration_metrics(self._dataset_tag) + + @contextmanager + def get_next_batch_context(self): + try: + if self._stats: + # Always track total blocked time + total_timer = self._stats.iter_total_blocked_s.timer() + # Also track the time until the first batch is ready + first_batch_ready_timer = ( + self._stats.iter_time_to_first_batch_s.timer() + if not self._yielded_first_batch + else nullcontext() + ) + with total_timer, first_batch_ready_timer: + yield + else: + yield + finally: + self._yielded_first_batch = True + + @contextmanager + def yield_batch_context(self, batch: Batch): + with self._stats.iter_user_s.timer() if self._stats else nullcontext(): + yield + StatsManager.update_iteration_metrics(self._stats, self._dataset_tag) def _format_in_threadpool( @@ -315,8 +397,8 @@ def restore_original_order(batch_iter: Iterator[Batch]) -> Iterator[Batch]: next_index_required = 0 buffer: Dict[int, Batch] = {} for batch in batch_iter: - assert batch.batch_idx not in buffer - buffer[batch.batch_idx] = batch + assert batch.metadata.batch_idx not in buffer + buffer[batch.metadata.batch_idx] = batch while next_index_required in buffer: yield buffer.pop(next_index_required) next_index_required += 1 diff --git a/python/ray/data/_internal/block_batching/util.py b/python/ray/data/_internal/block_batching/util.py index 4cea60abca80..a1678b569062 100644 --- a/python/ray/data/_internal/block_batching/util.py +++ 
b/python/ray/data/_internal/block_batching/util.py @@ -1,3 +1,4 @@ +import dataclasses import logging import threading from contextlib import nullcontext @@ -8,6 +9,7 @@ from ray.data._internal.batcher import Batcher, ShufflingBatcher from ray.data._internal.block_batching.interfaces import ( Batch, + BatchMetadata, BlockPrefetcher, CollatedBatch, ) @@ -120,7 +122,7 @@ def get_iter_next_batch_s_timer(): while batcher.has_batch(): with get_iter_next_batch_s_timer(): batch = batcher.next_batch() - yield Batch(global_counter, batch) + yield Batch(metadata=BatchMetadata(batch_idx=global_counter), data=batch) global_counter += 1 # Signal to the batcher that there are no more blocks to add. @@ -130,38 +132,38 @@ def get_iter_next_batch_s_timer(): while batcher.has_batch(): with get_iter_next_batch_s_timer(): batch = batcher.next_batch() - yield Batch(global_counter, batch) + yield Batch(metadata=BatchMetadata(batch_idx=global_counter), data=batch) global_counter += 1 # Get any remaining data. if not drop_last and batcher.has_any(): with get_iter_next_batch_s_timer(): batch = batcher.next_batch() - yield Batch(global_counter, batch) + yield Batch(metadata=BatchMetadata(batch_idx=global_counter), data=batch) global_counter += 1 def format_batches( - block_iter: Iterator[Batch], + batch_iter: Iterator[Batch], batch_format: Optional[str], stats: Optional[DatasetStats] = None, ) -> Iterator[Batch]: """Given an iterator of blocks, returns an iterator of formatted batches. Args: - block_iter: An iterator over blocks. + batch_iter: An iterator over batches. batch_format: The batch format to use. stats: An optional stats object to record formatting times. Returns: An iterator over batch index and the formatted batch. 
""" - for batch in block_iter: + for batch in batch_iter: with stats.iter_format_batch_s.timer() if stats else nullcontext(): formatted_batch = BlockAccessor.for_block(batch.data).to_batch_format( batch_format ) - yield Batch(batch.batch_idx, formatted_batch) + yield dataclasses.replace(batch, data=formatted_batch) def collate( @@ -180,7 +182,7 @@ def collate( for batch in batch_iter: with stats.iter_collate_batch_s.timer() if stats else nullcontext(): collated_batch = collate_fn(batch.data) - yield CollatedBatch(batch.batch_idx, collated_batch) + yield CollatedBatch(metadata=batch.metadata, data=collated_batch) def finalize_batches( @@ -204,7 +206,7 @@ def finalize_batches( for batch in batch_iter: with stats.iter_finalize_batch_s.timer() if stats else nullcontext(): finalized_batch = finalize_fn(batch.data) - yield CollatedBatch(batch.batch_idx, finalized_batch) + yield dataclasses.replace(batch, data=finalized_batch) def extract_data_from_batch(batch_iter: Iterator[Batch]) -> Iterator[Any]: diff --git a/python/ray/data/_internal/cluster_autoscaler/__init__.py b/python/ray/data/_internal/cluster_autoscaler/__init__.py new file mode 100644 index 000000000000..01e15270a0c4 --- /dev/null +++ b/python/ray/data/_internal/cluster_autoscaler/__init__.py @@ -0,0 +1,19 @@ +from typing import TYPE_CHECKING + +from .base_cluster_autoscaler import ClusterAutoscaler +from .default_cluster_autoscaler import DefaultClusterAutoscaler + +if TYPE_CHECKING: + from ray.data._internal.execution.resource_manager import ResourceManager + from ray.data._internal.execution.streaming_executor_state import Topology + + +def create_cluster_autoscaler( + topology: "Topology", resource_manager: "ResourceManager", *, execution_id: str +) -> ClusterAutoscaler: + return DefaultClusterAutoscaler( + topology, resource_manager, execution_id=execution_id + ) + + +__all__ = ["ClusterAutoscaler"] diff --git a/python/ray/data/_internal/execution/autoscaler/autoscaler.py 
b/python/ray/data/_internal/cluster_autoscaler/base_cluster_autoscaler.py similarity index 88% rename from python/ray/data/_internal/execution/autoscaler/autoscaler.py rename to python/ray/data/_internal/cluster_autoscaler/base_cluster_autoscaler.py index ea49631472c7..cdca9187a70f 100644 --- a/python/ray/data/_internal/execution/autoscaler/autoscaler.py +++ b/python/ray/data/_internal/cluster_autoscaler/base_cluster_autoscaler.py @@ -12,8 +12,8 @@ @DeveloperAPI -class Autoscaler(ABC): - """Abstract interface for Ray Data autoscaler.""" +class ClusterAutoscaler(ABC): + """Abstract interface for Ray Data cluster autoscaler.""" def __init__( self, @@ -31,7 +31,7 @@ def try_trigger_scaling(self): This method will be called each time when StreamingExecutor makes a scheduling decision. A subclass should override this method to - handle the autoscaling of both the cluster and `AutoscalingActorPool`s. + handle the autoscaling of the cluster. """ ... diff --git a/python/ray/data/_internal/cluster_autoscaler/default_cluster_autoscaler.py b/python/ray/data/_internal/cluster_autoscaler/default_cluster_autoscaler.py new file mode 100644 index 000000000000..0ab1d3ea56ce --- /dev/null +++ b/python/ray/data/_internal/cluster_autoscaler/default_cluster_autoscaler.py @@ -0,0 +1,107 @@ +import logging +import math +import time +from typing import TYPE_CHECKING, Dict + +import ray +from .base_cluster_autoscaler import ClusterAutoscaler +from ray.data._internal.execution.autoscaling_requester import ( + get_or_create_autoscaling_requester_actor, +) +from ray.data._internal.execution.interfaces import ExecutionResources + +if TYPE_CHECKING: + from ray.data._internal.execution.resource_manager import ResourceManager + from ray.data._internal.execution.streaming_executor_state import Topology + + +logger = logging.getLogger(__name__) + + +class DefaultClusterAutoscaler(ClusterAutoscaler): + + # Min number of seconds between two autoscaling requests. 
+ MIN_GAP_BETWEEN_AUTOSCALING_REQUESTS = 20 + + def __init__( + self, + topology: "Topology", + resource_manager: "ResourceManager", + *, + execution_id: str, + ): + super().__init__(topology, resource_manager, execution_id) + + # Last time when a request was sent to Ray's autoscaler. + self._last_request_time = 0 + + def try_trigger_scaling(self): + """Try to scale up the cluster to accomodate the provided in-progress workload. + + This makes a resource request to Ray's autoscaler consisting of the current, + aggregate usage of all operators in the DAG + the incremental usage of all + operators that are ready for dispatch (i.e. that have inputs queued). If the + autoscaler were to grant this resource request, it would allow us to dispatch + one task for every ready operator. + + Note that this resource request does not take the global resource limits or the + liveness policy into account; it only tries to make the existing resource usage + + one more task per ready operator feasible in the cluster. + """ + # Limit the frequency of autoscaling requests. + now = time.time() + if now - self._last_request_time < self.MIN_GAP_BETWEEN_AUTOSCALING_REQUESTS: + return + + # Scale up the cluster, if no ops are allowed to run, but there are still data + # in the input queues. + no_runnable_op = all( + not op_state._scheduling_status.runnable + for _, op_state in self._topology.items() + ) + any_has_input = any( + op_state._pending_dispatch_input_bundles_count() > 0 + for _, op_state in self._topology.items() + ) + if not (no_runnable_op and any_has_input): + return + + self._last_request_time = now + + # Get resource usage for all ops + additional resources needed to launch one + # more task for each ready op. 
+ resource_request = [] + + def to_bundle(resource: ExecutionResources) -> Dict: + req = {} + if resource.cpu: + req["CPU"] = math.ceil(resource.cpu) + if resource.gpu: + req["GPU"] = math.ceil(resource.gpu) + return req + + for op, state in self._topology.items(): + per_task_resource = op.incremental_resource_usage() + task_bundle = to_bundle(per_task_resource) + resource_request.extend([task_bundle] * op.num_active_tasks()) + # Only include incremental resource usage for ops that are ready for + # dispatch. + if state._pending_dispatch_input_bundles_count() > 0: + # TODO(Clark): Scale up more aggressively by adding incremental resource + # usage for more than one bundle in the queue for this op? + resource_request.append(task_bundle) + + self._send_resource_request(resource_request) + + def _send_resource_request(self, resource_request): + # Make autoscaler resource request. + actor = get_or_create_autoscaling_requester_actor() + actor.request_resources.remote(resource_request, self._execution_id) + + def on_executor_shutdown(self): + # Make request for zero resources to autoscaler for this execution. + actor = get_or_create_autoscaling_requester_actor() + actor.request_resources.remote({}, self._execution_id) + + def get_total_resources(self) -> ExecutionResources: + return ExecutionResources.from_resource_dict(ray.cluster_resources()) diff --git a/python/ray/data/_internal/compute.py b/python/ray/data/_internal/compute.py index 333662c35f9f..0644023bb58a 100644 --- a/python/ray/data/_internal/compute.py +++ b/python/ray/data/_internal/compute.py @@ -76,6 +76,7 @@ def __init__( size: Optional[int] = None, min_size: Optional[int] = None, max_size: Optional[int] = None, + initial_size: Optional[int] = None, max_tasks_in_flight_per_actor: Optional[int] = None, ): """Construct ActorPoolStrategy for a Dataset transform. @@ -85,6 +86,8 @@ def __init__( specify both `size` and `min_size` or `max_size`. min_size: The minimum size of the actor pool. 
max_size: The maximum size of the actor pool. + initial_size: The initial number of actors to start with. If not specified, + defaults to min_size. Must be between min_size and max_size. max_tasks_in_flight_per_actor: The maximum number of tasks to concurrently send to a single actor worker. Increasing this will increase opportunities for pipelining task dependency prefetching with @@ -94,12 +97,13 @@ def __init__( if size is not None: if size < 1: raise ValueError("size must be >= 1", size) - if max_size is not None or min_size is not None: + if max_size is not None or min_size is not None or initial_size is not None: raise ValueError( - "min_size and max_size cannot be set at the same time as `size`" + "min_size, max_size, and initial_size cannot be set at the same time as `size`" ) min_size = size max_size = size + initial_size = size if min_size is not None and min_size < 1: raise ValueError("min_size must be >= 1", min_size) if max_size is not None: @@ -115,8 +119,22 @@ def __init__( "max_tasks_in_flight_per_actor must be >= 1, got: ", max_tasks_in_flight_per_actor, ) + self.min_size = min_size or 1 self.max_size = max_size or float("inf") + + # Validate and set initial_size + if initial_size is not None: + if initial_size < self.min_size: + raise ValueError( + f"initial_size ({initial_size}) must be >= min_size ({self.min_size})" + ) + if self.max_size != float("inf") and initial_size > self.max_size: + raise ValueError( + f"initial_size ({initial_size}) must be <= max_size ({self.max_size})" + ) + + self.initial_size = initial_size or self.min_size self.max_tasks_in_flight_per_actor = max_tasks_in_flight_per_actor self.num_workers = 0 self.ready_to_total_workers_ratio = 0.8 @@ -125,6 +143,7 @@ def __eq__(self, other: Any) -> bool: return isinstance(other, ActorPoolStrategy) and ( self.min_size == other.min_size and self.max_size == other.max_size + and self.initial_size == other.initial_size and self.max_tasks_in_flight_per_actor == 
other.max_tasks_in_flight_per_actor ) @@ -133,6 +152,7 @@ def __repr__(self) -> str: return ( f"ActorPoolStrategy(min_size={self.min_size}, " f"max_size={self.max_size}, " + f"initial_size={self.initial_size}, " f"max_tasks_in_flight_per_actor={self.max_tasks_in_flight_per_actor})" f"num_workers={self.num_workers}, " f"ready_to_total_workers_ratio={self.ready_to_total_workers_ratio})" diff --git a/python/ray/data/_internal/datasource/parquet_datasource.py b/python/ray/data/_internal/datasource/parquet_datasource.py index 548f6b6a88dc..ae66e7cab478 100644 --- a/python/ray/data/_internal/datasource/parquet_datasource.py +++ b/python/ray/data/_internal/datasource/parquet_datasource.py @@ -1,4 +1,6 @@ import logging +import math +import os import warnings from dataclasses import dataclass from typing import ( @@ -6,6 +8,7 @@ Any, Callable, Dict, + Iterable, Iterator, List, Literal, @@ -15,28 +18,29 @@ ) import numpy as np +from packaging.version import parse as parse_version import ray -import ray.cloudpickle as cloudpickle +from ray._private.arrow_utils import get_pyarrow_version +from ray.data._internal.arrow_block import ArrowBlockAccessor from ray.data._internal.progress_bar import ProgressBar from ray.data._internal.remote_fn import cached_remote_fn from ray.data._internal.util import ( RetryingPyFileSystem, _check_pyarrow_version, _is_local_scheme, - call_with_retry, iterate_with_retry, ) -from ray.data.block import Block, BlockAccessor +from ray.data.block import Block, BlockAccessor, BlockMetadata from ray.data.context import DataContext from ray.data.datasource import Datasource from ray.data.datasource.datasource import ReadTask from ray.data.datasource.file_based_datasource import FileShuffleConfig from ray.data.datasource.file_meta_provider import ( - DefaultFileMetadataProvider, + FileMetadataProvider, _handle_read_os_error, + _list_files, ) -from ray.data.datasource.parquet_meta_provider import ParquetMetadataProvider from ray.data.datasource.partitioning 
import ( PartitionDataType, Partitioning, @@ -51,18 +55,23 @@ if TYPE_CHECKING: import pyarrow + from pyarrow import parquet as pq from pyarrow.dataset import ParquetFileFragment logger = logging.getLogger(__name__) + +MIN_PYARROW_TO_BATCHES_READAHEAD = parse_version("10.0.0") + + # The `num_cpus` for each metadata prefetching task. # Default to 0.5 instead of 1 because it is cheaper than normal read task. NUM_CPUS_FOR_META_FETCH_TASK = 0.5 # The number of rows to read per batch. This is sized to generate 10MiB batches # for rows about 1KiB in size. -PARQUET_READER_ROW_BATCH_SIZE = 10_000 +DEFAULT_PARQUET_READER_ROW_BATCH_SIZE = 10_000 FILE_READING_RETRY = 8 # The default size multiplier for reading Parquet data source in Arrow. @@ -95,36 +104,39 @@ PARQUET_ENCODING_RATIO_ESTIMATE_NUM_ROWS = 1024 -@dataclass(frozen=True) -class _SampleInfo: - actual_bytes_per_row: Optional[int] - estimated_bytes_per_row: Optional[int] +_BATCH_SIZE_PRESERVING_STUB_COL_NAME = "__bsp_stub" -# TODO(ekl) this is a workaround for a pyarrow serialization bug, where serializing a -# raw pyarrow file fragment causes S3 network calls. -class SerializedFragment: - def __init__(self, frag: "ParquetFileFragment"): - self._data = cloudpickle.dumps( - (frag.format, frag.path, frag.filesystem, frag.partition_expression) - ) +class _ParquetFragment: + """This wrapper class is created to avoid utilizing `ParquetFileFragment` original + serialization protocol that actually does network RPCs during serialization + (to fetch actual parquet metadata)""" - def deserialize(self) -> "ParquetFileFragment": - # Implicitly trigger S3 subsystem initialization by importing - # pyarrow.fs. 
- import pyarrow.fs # noqa: F401 + def __init__(self, f: "ParquetFileFragment", file_size: int): + self._fragment = f + self._file_size = file_size - (file_format, path, filesystem, partition_expression) = cloudpickle.loads( - self._data - ) - return file_format.make_fragment(path, filesystem, partition_expression) + @property + def file_size(self) -> int: + return self._file_size + @property + def original(self) -> "ParquetFileFragment": + return self._fragment + + def __reduce__(self): + return _ParquetFragment.make_fragment, ( + self._fragment.format, + self._fragment.path, + self._fragment.filesystem, + self._fragment.partition_expression, + self._file_size, + ) -# Visible for test mocking. -def _deserialize_fragments( - serialized_fragments: List[SerializedFragment], -) -> List["pyarrow._dataset.ParquetFileFragment"]: - return [p.deserialize() for p in serialized_fragments] + @staticmethod + def make_fragment(format, path, filesystem, partition_expression, file_size): + fragment = format.make_fragment(path, filesystem, partition_expression) + return _ParquetFragment(fragment, file_size) def check_for_legacy_tensor_type(schema): @@ -172,7 +184,7 @@ def __init__( _block_udf: Optional[Callable[[Block], Block]] = None, filesystem: Optional["pyarrow.fs.FileSystem"] = None, schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None, - meta_provider: ParquetMetadataProvider = ParquetMetadataProvider(), + meta_provider: Optional[FileMetadataProvider] = None, partition_filter: PathPartitionFilter = None, partitioning: Optional[Partitioning] = Partitioning("hive"), shuffle: Union[Literal["files"], None] = None, @@ -197,35 +209,30 @@ def __init__( ray.get_runtime_context().get_node_id(), soft=False ) - self._unresolved_paths = paths paths, self._filesystem = _resolve_paths_and_filesystem(paths, filesystem) filesystem = RetryingPyFileSystem.wrap( self._filesystem, retryable_errors=DataContext.get_current().retried_io_errors, ) - # HACK: PyArrow's `ParquetDataset` errors 
if input paths contain non-parquet - # files. To avoid this, we expand the input paths with the default metadata - # provider and then apply the partition filter or file extensions. - if partition_filter is not None or file_extensions is not None: - default_meta_provider = DefaultFileMetadataProvider() - expanded_paths, _ = map( - list, zip(*default_meta_provider.expand_paths(paths, filesystem)) - ) - - paths = list(expanded_paths) - if partition_filter is not None: - paths = partition_filter(paths) - if file_extensions is not None: - paths = [ - path for path in paths if _has_file_extension(path, file_extensions) - ] + listed_files = _list_files( + paths, + filesystem, + partition_filter=partition_filter, + file_extensions=file_extensions, + ) - filtered_paths = set(expanded_paths) - set(paths) - if filtered_paths: - logger.info(f"Filtered out {len(filtered_paths)} paths") + if listed_files: + paths, file_sizes = zip(*listed_files) + else: + paths, file_sizes = [], [] - if dataset_kwargs is None: + if dataset_kwargs is not None: + logger.warning( + "Please note that `ParquetDatasource.__init__`s `dataset_kwargs` " + "is a deprecated parameter and will be removed in the future." + ) + else: dataset_kwargs = {} if "partitioning" in dataset_kwargs: @@ -238,7 +245,9 @@ def __init__( # duplicating the partition data, we disable PyArrow's partitioning. dataset_kwargs["partitioning"] = None - pq_ds = get_parquet_dataset(paths, filesystem, dataset_kwargs) + # NOTE: ParquetDataset only accepts list of paths, hence we need to convert + # it to a list + pq_ds = get_parquet_dataset(list(paths), filesystem, dataset_kwargs) # `read_schema` is the schema object that will be used to perform # read operations. 
@@ -263,38 +272,17 @@ def __init__( columns, pq_ds.fragments[0], partitioning ) - try: - prefetch_remote_args = {} - prefetch_remote_args["num_cpus"] = NUM_CPUS_FOR_META_FETCH_TASK - if self._local_scheduling: - prefetch_remote_args["scheduling_strategy"] = self._local_scheduling - else: - # Use the scheduling strategy ("SPREAD" by default) provided in - # `DataContext``, to spread out prefetch tasks in cluster, avoid - # AWS S3 throttling error. - # Note: this is the same scheduling strategy used by read tasks. - prefetch_remote_args[ - "scheduling_strategy" - ] = DataContext.get_current().scheduling_strategy - - self._metadata = ( - meta_provider.prefetch_file_metadata( - pq_ds.fragments, **prefetch_remote_args - ) - or [] - ) - except OSError as e: - _handle_read_os_error(e, paths) - if to_batch_kwargs is None: to_batch_kwargs = {} # NOTE: Store the custom serialized `ParquetFileFragment` to avoid unexpected # network calls when `_ParquetDatasourceReader` is serialized. See # `_SerializedFragment()` implementation for more details. 
- self._pq_fragments = [SerializedFragment(p) for p in pq_ds.fragments] + self._pq_fragments = [ + _ParquetFragment(fragment, file_size) + for fragment, file_size in zip(pq_ds.fragments, file_sizes) + ] self._pq_paths = [p.path for p in pq_ds.fragments] - self._meta_provider = meta_provider self._block_udf = _block_udf self._to_batches_kwargs = to_batch_kwargs self._data_columns = data_columns @@ -304,21 +292,35 @@ def __init__( self._file_metadata_shuffler = None self._include_paths = include_paths self._partitioning = partitioning + if shuffle == "files": self._file_metadata_shuffler = np.random.default_rng() elif isinstance(shuffle, FileShuffleConfig): self._file_metadata_shuffler = np.random.default_rng(shuffle.seed) - sample_infos = sample_fragments( + # Sample small number of parquet files to estimate + # - Encoding ratio: ratio of file size on disk to approximate expected + # size of the corresponding block in memory + # - Default batch-size: number of rows to be read from a file at a time, + # used to limit amount of memory pressure + sampled_fragments = _sample_fragments( self._pq_fragments, - to_batches_kwargs=to_batch_kwargs, - columns=data_columns, - schema=self._read_schema, + ) + + sampled_file_infos = _fetch_file_infos( + sampled_fragments, + columns=self._data_columns, + schema=schema, local_scheduling=self._local_scheduling, ) - self._encoding_ratio = estimate_files_encoding_ratio(sample_infos) - self._default_read_batch_size_rows = estimate_default_read_batch_size_rows( - sample_infos + + self._encoding_ratio = _estimate_files_encoding_ratio( + sampled_fragments, + sampled_file_infos, + ) + + self._default_batch_size = _estimate_reader_batch_size( + sampled_file_infos, DataContext.get_current().target_max_block_size ) if file_extensions is None: @@ -329,60 +331,41 @@ def __init__( emit_file_extensions_future_warning(self._FUTURE_FILE_EXTENSIONS) break - def estimate_inmemory_data_size(self) -> Optional[int]: - total_size = 0 - for file_metadata in 
self._metadata: - total_size += file_metadata.total_byte_size - return total_size * self._encoding_ratio + def estimate_inmemory_data_size(self) -> int: + return self._estimate_in_mem_size(self._pq_fragments) def get_read_tasks(self, parallelism: int) -> List[ReadTask]: # NOTE: We override the base class FileBasedDatasource.get_read_tasks() # method in order to leverage pyarrow's ParquetDataset abstraction, # which simplifies partitioning logic. We still use # FileBasedDatasource's write side, however. - pq_metadata = self._metadata - if len(pq_metadata) < len(self._pq_fragments): - # Pad `pq_metadata` to be same length of `self._pq_fragments`. - # This can happen when no file metadata being prefetched. - pq_metadata += [None] * (len(self._pq_fragments) - len(pq_metadata)) - if self._file_metadata_shuffler is not None: - files_metadata = list(zip(self._pq_fragments, self._pq_paths, pq_metadata)) + files_metadata = list(zip(self._pq_fragments, self._pq_paths)) shuffled_files_metadata = [ files_metadata[i] for i in self._file_metadata_shuffler.permutation(len(files_metadata)) ] - pq_fragments, pq_paths, pq_metadata = list( - map(list, zip(*shuffled_files_metadata)) - ) + pq_fragments, pq_paths = list(map(list, zip(*shuffled_files_metadata))) else: - pq_fragments, pq_paths, pq_metadata = ( + pq_fragments, pq_paths = ( self._pq_fragments, self._pq_paths, - pq_metadata, ) read_tasks = [] - for fragments, paths, metadata in zip( + for fragments, paths in zip( np.array_split(pq_fragments, parallelism), np.array_split(pq_paths, parallelism), - np.array_split(pq_metadata, parallelism), ): if len(fragments) <= 0: continue - meta = self._meta_provider( - paths, - num_fragments=len(fragments), - prefetched_metadata=metadata, + meta = BlockMetadata( + num_rows=None, + size_bytes=self._estimate_in_mem_size(fragments), + input_files=paths, + exec_stats=None, ) - # If there is a filter operation, reset the calculated row count, - # since the resulting row count is unknown. 
- if self._to_batches_kwargs.get("filter") is not None: - meta.num_rows = None - - if meta.size_bytes is not None: - meta.size_bytes = int(meta.size_bytes * self._encoding_ratio) ( block_udf, @@ -396,7 +379,7 @@ def get_read_tasks(self, parallelism: int) -> List[ReadTask]: ) = ( self._block_udf, self._to_batches_kwargs, - self._default_read_batch_size_rows, + self._default_batch_size, self._data_columns, self._partition_columns, self._read_schema, @@ -435,6 +418,11 @@ def get_name(self): def supports_distributed_reads(self) -> bool: return self._supports_distributed_reads + def _estimate_in_mem_size(self, fragments: List[_ParquetFragment]) -> int: + in_mem_size = sum([f.file_size for f in fragments]) * self._encoding_ratio + + return round(in_mem_size) + def read_fragments( block_udf, @@ -443,65 +431,35 @@ def read_fragments( data_columns, partition_columns, schema, - serialized_fragments: List[SerializedFragment], + fragments: List[_ParquetFragment], include_paths: bool, partitioning: Partitioning, ) -> Iterator["pyarrow.Table"]: # This import is necessary to load the tensor extension type. from ray.data.extensions.tensor_extension import ArrowTensorType # noqa - # Deserialize after loading the filesystem class. - fragments: List[ - "pyarrow._dataset.ParquetFileFragment" - ] = _deserialize_fragments_with_retry(serialized_fragments) - # Ensure that we're reading at least one dataset fragment. assert len(fragments) > 0 - import pyarrow as pa - logger.debug(f"Reading {len(fragments)} parquet fragments") - use_threads = to_batches_kwargs.pop("use_threads", False) - batch_size = to_batches_kwargs.pop("batch_size", default_read_batch_size_rows) for fragment in fragments: - partitions = {} - if partitioning is not None: - parse = PathPartitionParser(partitioning) - partitions = parse(fragment.path) - - # Filter out partitions that aren't in the user-specified columns list. 
- if partition_columns is not None: - partitions = { - field_name: value - for field_name, value in partitions.items() - if field_name in partition_columns - } - - def get_batch_iterable(): - if batch_size is not None: - to_batches_kwargs["batch_size"] = batch_size - - return fragment.to_batches( - use_threads=use_threads, - columns=data_columns, - schema=schema, - **to_batches_kwargs, - ) - # S3 can raise transient errors during iteration, and PyArrow doesn't expose a # way to retry specific batches. ctx = ray.data.DataContext.get_current() - for batch in iterate_with_retry( - get_batch_iterable, "load batch", match=ctx.retried_io_errors + for table in iterate_with_retry( + lambda: _read_batches_from( + fragment.original, + schema=schema, + data_columns=data_columns, + partition_columns=partition_columns, + partitioning=partitioning, + include_path=include_paths, + batch_size=default_read_batch_size_rows, + to_batches_kwargs=to_batches_kwargs, + ), + "reading batches", + match=ctx.retried_io_errors, ): - table = pa.Table.from_batches([batch], schema=schema) - if include_paths: - table = BlockAccessor.for_block(table).fill_column( - "path", fragment.path - ) - if partitions: - table = _add_partitions_to_table(partitions, table) - # If the table is empty, drop it. if table.num_rows > 0: if block_udf is not None: @@ -510,74 +468,183 @@ def get_batch_iterable(): yield table -def _deserialize_fragments_with_retry(fragments): - # The deserialization retry helps when the upstream datasource is not able to - # handle overloaded read request or failed with some retriable failures. - # For example when reading data from HA hdfs service, hdfs might - # lose connection for some unknown reason expecially when - # simutaneously running many hyper parameter tuning jobs - # with ray.data parallelism setting at high value like the default 200 - # Such connection failure can be restored with some waiting and retry. 
- return call_with_retry( - lambda: _deserialize_fragments(fragments), - description="deserialize fragments", - max_attempts=FILE_READING_RETRY, +def _read_batches_from( + fragment: "ParquetFileFragment", + *, + schema: "pyarrow.Schema", + data_columns: Optional[List[str]], + partition_columns: Optional[List[str]], + partitioning: Partitioning, + filter_expr: Optional["pyarrow.dataset.Expression"] = None, + batch_size: Optional[int] = None, + include_path: bool = False, + use_threads: bool = False, + to_batches_kwargs: Optional[Dict[str, Any]] = None, +) -> Iterable["pyarrow.Table"]: + """Get an iterable of batches from a parquet fragment.""" + + import pyarrow as pa + + # Copy to avoid modifying passed in arg + to_batches_kwargs = dict(to_batches_kwargs or {}) + + # NOTE: Passed in kwargs overrides always take precedence + # TODO deprecate to_batches_kwargs + use_threads = to_batches_kwargs.pop("use_threads", use_threads) + filter_expr = to_batches_kwargs.pop("filter", filter_expr) + # NOTE: Arrow's ``to_batches`` expects ``batch_size`` as an int + if batch_size is not None: + to_batches_kwargs.setdefault("batch_size", batch_size) + + partition_col_values = _parse_partition_column_values( + fragment, partition_columns, partitioning ) + try: + for batch in fragment.to_batches( + columns=data_columns, + filter=filter_expr, + schema=schema, + use_threads=use_threads, + **to_batches_kwargs, + ): + table = pa.Table.from_batches([batch]) + + if include_path: + table = ArrowBlockAccessor.for_block(table).fill_column( + "path", fragment.path + ) + + if partition_col_values: + table = _add_partitions_to_table(partition_col_values, table) + + # ``ParquetFileFragment.to_batches`` returns ``RecordBatch``, + # which could have empty projection (ie ``num_columns`` == 0) + # while having non-empty rows (ie ``num_rows`` > 0), which + # could occur when list of requested columns is empty. 
+ # + # However, when ``RecordBatches`` are concatenated using + # ``pyarrow.concat_tables`` it will return a single ``Table`` + # with 0 columns and therefore 0 rows (since ``Table``s number of + # rows is determined as the length of its columns). + # + # To avoid running into this pitfall, we introduce a stub column + # holding just nulls to maintain invariance of the number of rows. + # + # NOTE: There's no impact from this as the binary size of the + # extra column is basically 0 + if table.num_columns == 0 and table.num_rows > 0: + table = table.append_column( + _BATCH_SIZE_PRESERVING_STUB_COL_NAME, pa.nulls(table.num_rows) + ) + + yield table + + except pa.lib.ArrowInvalid as e: + error_message = str(e) + if "No match for FieldRef.Name" in error_message and filter_expr is not None: + filename = os.path.basename(fragment.path) + file_columns = set(fragment.physical_schema.names) + raise RuntimeError( + f"Filter expression: '{filter_expr}' failed on parquet " + f"file: '{filename}' with columns: {file_columns}" + ) + raise -def _sample_fragment( - to_batches_kwargs, - columns, - schema, - file_fragment: SerializedFragment, -) -> _SampleInfo: - # Sample the first rows batch from file fragment `serialized_fragment`. - fragment = _deserialize_fragments_with_retry([file_fragment])[0] +def _parse_partition_column_values( + fragment: "ParquetFileFragment", + partition_columns: Optional[List[str]], + partitioning: Partitioning, +): + partitions = {} + + if partitioning is not None: + parse = PathPartitionParser(partitioning) + partitions = parse(fragment.path) + + # Filter out partitions that aren't in the user-specified columns list. 
+ if partition_columns is not None: + partitions = { + field_name: value + for field_name, value in partitions.items() + if field_name in partition_columns + } + + return partitions + + +def _fetch_parquet_file_info( + fragment: _ParquetFragment, + *, + columns: Optional[List[str]], + schema: Optional["pyarrow.Schema"], +) -> Optional["_ParquetFileInfo"]: # If the fragment has no row groups, it's an empty or metadata-only file. # Skip it by returning empty sample info. - if fragment.metadata.num_row_groups == 0: - return _SampleInfo(actual_bytes_per_row=None, estimated_bytes_per_row=None) + # + # NOTE: Accessing `ParquetFileFragment.metadata` does fetch a parquet footer + # from storage + metadata = fragment.original.metadata + + if metadata.num_row_groups == 0: + return None # Only sample the first row group. - fragment = fragment.subset(row_group_ids=[0]) + row_group_fragment = fragment.original.subset(row_group_ids=[0]) batch_size = max( - min(fragment.metadata.num_rows, PARQUET_ENCODING_RATIO_ESTIMATE_NUM_ROWS), 1 + min( + row_group_fragment.metadata.num_rows, + PARQUET_ENCODING_RATIO_ESTIMATE_NUM_ROWS, + ), + 1, ) - # Use the batch_size calculated above, and ignore the one specified by user if set. - # This is to avoid sampling too few or too many rows. - to_batches_kwargs.pop("batch_size", None) - batches = fragment.to_batches( + + to_batches_kwargs = {} + + if get_pyarrow_version() >= MIN_PYARROW_TO_BATCHES_READAHEAD: + # Limit prefetching to just 1 batch + to_batches_kwargs["batch_readahead"] = 1 + + batches_iter = row_group_fragment.to_batches( columns=columns, schema=schema, batch_size=batch_size, **to_batches_kwargs, ) - # Use first batch in-memory size for estimation. 
- try: - batch = next(batches) - except StopIteration: - sample_data = _SampleInfo( - actual_bytes_per_row=None, estimated_bytes_per_row=None - ) - else: + + avg_row_size: Optional[int] = None + # Use first batch non-empty batch to estimate the avg size of the + # row in-memory + for batch in batches_iter: if batch.num_rows > 0: - metadata = fragment.metadata - total_size = 0 - for idx in range(metadata.num_row_groups): - total_size += metadata.row_group(idx).total_byte_size - sample_data = _SampleInfo( - actual_bytes_per_row=batch.nbytes / batch.num_rows, - estimated_bytes_per_row=total_size / metadata.num_rows, - ) - else: - sample_data = _SampleInfo( - actual_bytes_per_row=None, estimated_bytes_per_row=None - ) - return sample_data + avg_row_size = math.ceil(batch.nbytes / batch.num_rows) + break + + return _ParquetFileInfo( + avg_row_in_mem_bytes=avg_row_size, + metadata=metadata, + ) + +@dataclass +class _ParquetFileInfo: + # Estimated avg byte size of a row (in-memory) + avg_row_in_mem_bytes: Optional[int] + # Corresponding file metadata + metadata: "pyarrow._parquet.FileMetaData" -def estimate_files_encoding_ratio(sample_infos: List[_SampleInfo]) -> float: + def estimate_in_memory_bytes(self) -> Optional[int]: + if self.avg_row_in_mem_bytes is None: + return None + + return self.avg_row_in_mem_bytes * self.metadata.num_rows + + +def _estimate_files_encoding_ratio( + fragments: List[_ParquetFragment], + file_infos: List[_ParquetFileInfo], +) -> float: """Return an estimate of the Parquet files encoding ratio. To avoid OOMs, it is safer to return an over-estimate than an underestimate. 
@@ -585,46 +652,90 @@ def estimate_files_encoding_ratio(sample_infos: List[_SampleInfo]) -> float: if not DataContext.get_current().decoding_size_estimation: return PARQUET_ENCODING_RATIO_ESTIMATE_DEFAULT - def compute_encoding_ratio(sample_info: _SampleInfo) -> float: - if ( - sample_info.actual_bytes_per_row is None - or sample_info.estimated_bytes_per_row is None - ): - return PARQUET_ENCODING_RATIO_ESTIMATE_LOWER_BOUND - else: - return ( - sample_info.actual_bytes_per_row / sample_info.estimated_bytes_per_row + assert len(file_infos) == len(fragments) + + # Estimate size of the rows in a file in memory + estimated_in_mem_size_arr = [ + fi.estimate_in_memory_bytes() if fi is not None else None for fi in file_infos + ] + + file_size_arr = [f.file_size for f in fragments] + + estimated_encoding_ratios = [ + float(in_mem_size) / file_size + for in_mem_size, file_size in zip(estimated_in_mem_size_arr, file_size_arr) + if file_size > 0 and in_mem_size is not None + ] + + # Return default estimate of 5 if all sampled files turned out to be empty + if not estimated_encoding_ratios: + return PARQUET_ENCODING_RATIO_ESTIMATE_DEFAULT + + estimated_ratio = np.mean(estimated_encoding_ratios) + + logger.info(f"Estimated parquet encoding ratio is {estimated_ratio:.3f}.") + + return max(estimated_ratio, PARQUET_ENCODING_RATIO_ESTIMATE_LOWER_BOUND) + + +def _fetch_file_infos( + sampled_fragments: List[_ParquetFragment], + *, + columns: Optional[List[str]], + schema: Optional["pyarrow.Schema"], + local_scheduling: Optional[bool], +) -> List[Optional[_ParquetFileInfo]]: + fetc_file_info = cached_remote_fn(_fetch_parquet_file_info) + futures = [] + + for fragment in sampled_fragments: + # Sample the first rows batch in i-th file. + # Use SPREAD scheduling strategy to avoid packing many sampling tasks on + # same machine to cause OOM issue, as sampling can be memory-intensive. 
+ futures.append( + fetc_file_info.options( + scheduling_strategy=local_scheduling + or DataContext.get_current().scheduling_strategy, + # Retry in case of transient errors during sampling. + retry_exceptions=[OSError], + ).remote( + fragment, + columns=columns, + schema=schema, ) + ) - ratio = np.mean(list(map(compute_encoding_ratio, sample_infos))) - logger.debug(f"Estimated Parquet encoding ratio from sampling is {ratio}.") - return max(ratio, PARQUET_ENCODING_RATIO_ESTIMATE_LOWER_BOUND) + sample_bar = ProgressBar("Parquet dataset sampling", len(futures), unit="file") + file_infos = sample_bar.fetch_until_complete(futures) + sample_bar.close() + return file_infos -def estimate_default_read_batch_size_rows( - sample_infos: List[_SampleInfo], + +def _estimate_reader_batch_size( + file_infos: List[Optional[_ParquetFileInfo]], target_block_size: Optional[int] ) -> Optional[int]: - ctx = DataContext.get_current() - if ctx.target_max_block_size is None: + if target_block_size is None: return None - def compute_batch_size_rows(sample_info: _SampleInfo) -> int: - # 'actual_bytes_per_row' is None if the sampled file was empty and 0 if the data - # was all null. 
- if not sample_info.actual_bytes_per_row: - return PARQUET_READER_ROW_BATCH_SIZE - else: - max_parquet_reader_row_batch_size_bytes = ctx.target_max_block_size // 10 - return max( - 1, - min( - PARQUET_READER_ROW_BATCH_SIZE, - max_parquet_reader_row_batch_size_bytes - // sample_info.actual_bytes_per_row, - ), - ) + avg_num_rows_per_block = [ + target_block_size / fi.avg_row_in_mem_bytes + for fi in file_infos + if ( + fi is not None + and fi.avg_row_in_mem_bytes is not None + and fi.avg_row_in_mem_bytes > 0 + ) + ] + + if not avg_num_rows_per_block: + return DEFAULT_PARQUET_READER_ROW_BATCH_SIZE - return np.mean(list(map(compute_batch_size_rows, sample_infos))) + estimated_batch_size: int = max(math.ceil(np.mean(avg_num_rows_per_block)), 1) + + logger.info(f"Estimated parquet reader batch size at {estimated_batch_size} rows") + + return estimated_batch_size def get_parquet_dataset(paths, filesystem, dataset_kwargs): @@ -648,19 +759,10 @@ def get_parquet_dataset(paths, filesystem, dataset_kwargs): return dataset -def sample_fragments( - serialized_fragments, - *, - to_batches_kwargs, - columns, - schema, - local_scheduling=None, -) -> List[_SampleInfo]: - # Sample a few rows from Parquet files to estimate the encoding ratio. - # Launch tasks to sample multiple files remotely in parallel. - # Evenly distributed to sample N rows in i-th row group in i-th file. - # TODO(ekl/cheng) take into account column pruning. - num_files = len(serialized_fragments) +def _sample_fragments( + fragments: List[_ParquetFragment], +) -> List[_ParquetFragment]: + num_files = len(fragments) num_samples = int(num_files * PARQUET_ENCODING_RATIO_ESTIMATE_SAMPLING_RATIO) min_num_samples = min(PARQUET_ENCODING_RATIO_ESTIMATE_MIN_NUM_SAMPLES, num_files) max_num_samples = min(PARQUET_ENCODING_RATIO_ESTIMATE_MAX_NUM_SAMPLES, num_files) @@ -668,45 +770,25 @@ def sample_fragments( # Evenly distributed to choose which file to sample, to avoid biased prediction # if data is skewed. 
- file_samples = [ - serialized_fragments[idx] + return [ + fragments[idx] for idx in np.linspace(0, num_files - 1, num_samples).astype(int).tolist() ] - sample_fragment = cached_remote_fn(_sample_fragment) - futures = [] - scheduling = local_scheduling or DataContext.get_current().scheduling_strategy - for sample in file_samples: - # Sample the first rows batch in i-th file. - # Use SPREAD scheduling strategy to avoid packing many sampling tasks on - # same machine to cause OOM issue, as sampling can be memory-intensive. - futures.append( - sample_fragment.options( - scheduling_strategy=scheduling, - # Retry in case of transient errors during sampling. - retry_exceptions=[OSError], - ).remote( - to_batches_kwargs, - columns, - schema, - sample, - ) - ) - sample_bar = ProgressBar("Parquet Files Sample", len(futures), unit="file") - sample_infos = sample_bar.fetch_until_complete(futures) - sample_bar.close() - - return sample_infos - def _add_partitions_to_table( - partitions: Dict[str, PartitionDataType], table: "pyarrow.Table" + partition_col_values: Dict[str, PartitionDataType], table: "pyarrow.Table" ) -> "pyarrow.Table": - for field_name, value in partitions.items(): - field_index = table.schema.get_field_index(field_name) + for partition_col, value in partition_col_values.items(): + field_index = table.schema.get_field_index(partition_col) if field_index == -1: - table = BlockAccessor.for_block(table).fill_column(field_name, value) + table = BlockAccessor.for_block(table).fill_column(partition_col, value) + elif log_once(f"duplicate_partition_field_{partition_col}"): + logger.warning( + f"The partition field '{partition_col}' also exists in the Parquet " + f"file. Ray Data will default to using the value in the Parquet file." 
+ ) return table @@ -757,7 +839,11 @@ def emit_file_extensions_future_warning(future_file_extensions: List[str]): def _infer_schema( - parquet_dataset, schema, columns, partitioning, _block_udf + parquet_dataset: "pq.ParquetDataset", + schema: "pyarrow.Schema", + columns: Optional[List[str]], + partitioning, + _block_udf, ) -> "pyarrow.Schema": """Infer the schema of read data using the user-specified parameters.""" import pyarrow as pa @@ -770,7 +856,7 @@ def _infer_schema( partitioning, inferred_schema, parquet_dataset ) - if columns: + if columns is not None: inferred_schema = pa.schema( [inferred_schema.field(column) for column in columns], inferred_schema.metadata, @@ -825,4 +911,7 @@ def _infer_data_and_partition_columns( partition_columns = [ column for column in user_specified_columns if column in partitions ] + else: + partition_columns = [] + return data_columns, partition_columns diff --git a/python/ray/data/_internal/equalize.py b/python/ray/data/_internal/equalize.py index 92f6ff6c2afa..52561020dfab 100644 --- a/python/ray/data/_internal/equalize.py +++ b/python/ray/data/_internal/equalize.py @@ -2,8 +2,12 @@ from ray.data._internal.execution.interfaces import RefBundle from ray.data._internal.split import _calculate_blocks_rows, _split_at_indices -from ray.data._internal.util import unify_ref_bundles_schema -from ray.data.block import Block, BlockMetadata, BlockPartition +from ray.data.block import ( + Block, + BlockMetadata, + BlockPartition, + _take_first_non_empty_schema, +) from ray.types import ObjectRef @@ -41,7 +45,7 @@ def _equalize( # phase 2: based on the num rows needed for each shaved split, split the leftovers # in the shape that exactly matches the rows needed. 
- schema = unify_ref_bundles_schema(per_split_bundles) + schema = _take_first_non_empty_schema(bundle.schema for bundle in per_split_bundles) leftover_bundle = RefBundle(leftovers, owns_blocks=owned_by_consumer, schema=schema) leftover_splits = _split_leftovers(leftover_bundle, per_split_needed_rows) diff --git a/python/ray/data/_internal/execution/autoscaler/__init__.py b/python/ray/data/_internal/execution/autoscaler/__init__.py deleted file mode 100644 index 5a566026d591..000000000000 --- a/python/ray/data/_internal/execution/autoscaler/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import TYPE_CHECKING - -from .autoscaler import Autoscaler -from .autoscaling_actor_pool import AutoscalingActorPool -from .default_autoscaler import DefaultAutoscaler - -if TYPE_CHECKING: - from ..resource_manager import ResourceManager - from ..streaming_executor_state import Topology - from ray.data.context import AutoscalingConfig - - -def create_autoscaler( - topology: "Topology", - resource_manager: "ResourceManager", - config: "AutoscalingConfig", - *, - execution_id: str -) -> Autoscaler: - return DefaultAutoscaler( - topology, - resource_manager, - config=config, - execution_id=execution_id, - ) - - -__all__ = [ - "Autoscaler", - "DefaultAutoscaler", - "create_autoscaler", - "AutoscalingActorPool", -] diff --git a/python/ray/data/_internal/execution/autoscaler/util.py b/python/ray/data/_internal/execution/autoscaler/util.py deleted file mode 100644 index 550e2a0066e5..000000000000 --- a/python/ray/data/_internal/execution/autoscaler/util.py +++ /dev/null @@ -1,48 +0,0 @@ -import math -from typing import Optional - -from .autoscaling_actor_pool import AutoscalingActorPool -from ray.data._internal.execution.interfaces import ExecutionResources - - -def get_max_scale_up( - actor_pool: AutoscalingActorPool, - budget: Optional[ExecutionResources], -) -> Optional[int]: - """Get the maximum number of actors that can be scaled up. 
- - Args: - actor_pool: The actor pool to scale up. - budget: The budget to scale up. - - Returns: - The maximum number of actors that can be scaled up, or `None` if you can - scale up infinitely. - """ - if budget is None: - return None - - assert budget.cpu >= 0 and budget.gpu >= 0 - - num_cpus_per_actor = actor_pool.per_actor_resource_usage().cpu - num_gpus_per_actor = actor_pool.per_actor_resource_usage().gpu - assert num_cpus_per_actor >= 0 and num_gpus_per_actor >= 0 - - max_cpu_scale_up: float = float("inf") - if num_cpus_per_actor > 0 and not math.isinf(budget.cpu): - max_cpu_scale_up = budget.cpu // num_cpus_per_actor - - max_gpu_scale_up: float = float("inf") - if num_gpus_per_actor > 0 and not math.isinf(budget.gpu): - max_gpu_scale_up = budget.gpu // num_gpus_per_actor - - max_scale_up = min(max_cpu_scale_up, max_gpu_scale_up) - if math.isinf(max_scale_up): - return None - else: - assert not math.isnan(max_scale_up), ( - budget, - num_cpus_per_actor, - num_gpus_per_actor, - ) - return int(max_scale_up) diff --git a/python/ray/data/_internal/execution/backpressure_policy/__init__.py b/python/ray/data/_internal/execution/backpressure_policy/__init__.py index e08fd3a5f55b..c0aad671df10 100644 --- a/python/ray/data/_internal/execution/backpressure_policy/__init__.py +++ b/python/ray/data/_internal/execution/backpressure_policy/__init__.py @@ -2,6 +2,9 @@ from .backpressure_policy import BackpressurePolicy from .concurrency_cap_backpressure_policy import ConcurrencyCapBackpressurePolicy +from .downstream_capacity_backpressure_policy import ( + DownstreamCapacityBackpressurePolicy, +) from .resource_budget_backpressure_policy import ResourceBudgetBackpressurePolicy from ray.data.context import DataContext @@ -14,6 +17,7 @@ ENABLED_BACKPRESSURE_POLICIES = [ ConcurrencyCapBackpressurePolicy, ResourceBudgetBackpressurePolicy, + DownstreamCapacityBackpressurePolicy, ] ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY = "backpressure_policies.enabled" @@ -33,6 +37,7 @@ def 
get_backpressure_policies( __all__ = [ "BackpressurePolicy", "ConcurrencyCapBackpressurePolicy", + "DownstreamCapacityBackpressurePolicy", "ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY", "get_backpressure_policies", ] diff --git a/python/ray/data/_internal/execution/backpressure_policy/downstream_capacity_backpressure_policy.py b/python/ray/data/_internal/execution/backpressure_policy/downstream_capacity_backpressure_policy.py new file mode 100644 index 000000000000..bf1fb57dc852 --- /dev/null +++ b/python/ray/data/_internal/execution/backpressure_policy/downstream_capacity_backpressure_policy.py @@ -0,0 +1,94 @@ +import logging +from typing import TYPE_CHECKING + +from .backpressure_policy import BackpressurePolicy +from ray.data._internal.execution.operators.actor_pool_map_operator import ( + ActorPoolMapOperator, +) +from ray.data.context import DataContext + +if TYPE_CHECKING: + from ray.data._internal.execution.interfaces.physical_operator import ( + PhysicalOperator, + ) + from ray.data._internal.execution.resource_manager import ResourceManager + from ray.data._internal.execution.streaming_executor_state import Topology + +logger = logging.getLogger(__name__) + + +class DownstreamCapacityBackpressurePolicy(BackpressurePolicy): + """Backpressure policy based on downstream processing capacity. + + This policy triggers backpressure when the output bundles size exceeds both: + 1. A ratio threshold multiplied by the number of running tasks in downstream operators + 2. An absolute threshold for the output bundles size + + The policy monitors actual downstream processing capacity by tracking the number + of currently running tasks rather than configured parallelism. This approach + ensures effective backpressure even when cluster resources are insufficient or + scaling is slow, preventing memory pressure and maintaining pipeline stability. 
+ + Key benefits: + - Prevents memory bloat from unprocessed output objects + - Adapts to actual cluster conditions and resource availability + - Maintains balanced throughput across pipeline operators + - Reduces object spilling and unnecessary rebuilds + """ + + def __init__( + self, + data_context: DataContext, + topology: "Topology", + resource_manager: "ResourceManager", + ): + super().__init__(data_context, topology, resource_manager) + self._backpressure_concurrency_ratio = ( + self._data_context.downstream_capacity_backpressure_ratio + ) + self._backpressure_max_queued_bundles = ( + self._data_context.downstream_capacity_backpressure_max_queued_bundles + ) + self._backpressure_disabled = ( + self._backpressure_concurrency_ratio is None + or self._backpressure_max_queued_bundles is None + ) + + def _max_concurrent_tasks(self, op: "PhysicalOperator") -> int: + if isinstance(op, ActorPoolMapOperator): + return sum( + [ + actor_pool.max_concurrent_tasks() + for actor_pool in op.get_autoscaling_actor_pools() + ] + ) + return op.num_active_tasks() + + def can_add_input(self, op: "PhysicalOperator") -> bool: + """Determine if we can add input to the operator based on downstream capacity.""" + if self._backpressure_disabled: + return True + for output_dependency in op.output_dependencies: + total_enqueued_input_bundles = self._topology[ + output_dependency + ].total_enqueued_input_bundles() + + avg_inputs_per_task = ( + output_dependency.metrics.num_task_inputs_processed + / max(output_dependency.metrics.num_tasks_finished, 1) + ) + outstanding_tasks = total_enqueued_input_bundles / max( + avg_inputs_per_task, 1 + ) + max_allowed_outstanding = ( + self._max_concurrent_tasks(output_dependency) + * self._backpressure_concurrency_ratio + ) + + if ( + total_enqueued_input_bundles > self._backpressure_max_queued_bundles + and outstanding_tasks > max_allowed_outstanding + ): + return False + + return True diff --git a/python/ray/data/_internal/execution/dataset_state.py 
b/python/ray/data/_internal/execution/dataset_state.py new file mode 100644 index 000000000000..702963234baf --- /dev/null +++ b/python/ray/data/_internal/execution/dataset_state.py @@ -0,0 +1,22 @@ +import enum + + +class DatasetState(enum.IntEnum): + """Enum representing the possible states of a dataset during execution.""" + + UNKNOWN = 0 + RUNNING = 1 + FINISHED = 2 + FAILED = 3 + PENDING = 4 + + def __str__(self): + return self.name + + @classmethod + def from_string(cls, text): + """Get enum by name.""" + try: + return cls[text] # This uses the name to lookup the enum + except KeyError: + return cls.UNKNOWN diff --git a/python/ray/data/_internal/execution/interfaces/execution_options.py b/python/ray/data/_internal/execution/interfaces/execution_options.py index 0485a12ec68c..3edfa2dceda5 100644 --- a/python/ray/data/_internal/execution/interfaces/execution_options.py +++ b/python/ray/data/_internal/execution/interfaces/execution_options.py @@ -49,6 +49,15 @@ def from_resource_dict( memory=resource_dict.get("memory", None), ) + def to_resource_dict(self) -> Dict[str, float]: + """Convert this ExecutionResources object to a resource dict.""" + return { + "CPU": self.cpu, + "GPU": self.gpu, + "object_store_memory": self.object_store_memory, + "memory": self.memory, + } + @classmethod def for_limits( cls, @@ -102,6 +111,16 @@ def __eq__(self, other: "ExecutionResources") -> bool: and self.memory == other.memory ) + def __hash__(self) -> int: + return hash( + ( + self.cpu, + self.gpu, + self.object_store_memory, + self.memory, + ) + ) + @classmethod def zero(cls) -> "ExecutionResources": """Returns an ExecutionResources object with zero resources.""" diff --git a/python/ray/data/_internal/execution/interfaces/op_runtime_metrics.py b/python/ray/data/_internal/execution/interfaces/op_runtime_metrics.py index e9c048fd8e84..eeffc116216c 100644 --- a/python/ray/data/_internal/execution/interfaces/op_runtime_metrics.py +++ 
b/python/ray/data/_internal/execution/interfaces/op_runtime_metrics.py @@ -130,6 +130,7 @@ class RunningTaskInfo: bytes_outputs: int num_rows_produced: int start_time: float + cum_block_gen_time: float @dataclass @@ -323,6 +324,16 @@ class OpRuntimeMetrics(metaclass=OpRuntimesMetricsMeta): description=("Number of rows generated by finished tasks."), metrics_group=MetricsGroup.OUTPUTS, ) + num_external_inqueue_blocks: int = metric_field( + default=0, + description="Number of blocks in the external inqueue", + metrics_group=MetricsGroup.OUTPUTS, + ) + num_external_inqueue_bytes: int = metric_field( + default=0, + description="Byte size of blocks in the external inqueue", + metrics_group=MetricsGroup.OUTPUTS, + ) # === Tasks-related metrics === num_tasks_submitted: int = metric_field( @@ -386,13 +397,15 @@ class OpRuntimeMetrics(metaclass=OpRuntimesMetricsMeta): 2500.0, 5000.0, ] - - mean_task_completion_time: float = metric_field( + task_completion_time: float = metric_field( default=0, description="Time spent running tasks to completion.", metrics_group=MetricsGroup.TASKS, - metrics_type=MetricsType.Histogram, - metrics_args={"boundaries": histogram_buckets_s}, + ) + task_completion_time_without_backpressure: float = metric_field( + default=0, + description="Time spent running tasks to completion without backpressure.", + metrics_group=MetricsGroup.TASKS, ) # === Actor-related metrics === @@ -509,6 +522,31 @@ def average_num_outputs_per_task(self) -> Optional[float]: else: return self.num_outputs_of_finished_tasks / self.num_tasks_finished + @metric_property( + description="Average number of blocks generated per task.", + metrics_group=MetricsGroup.INPUTS, + ) + def average_num_inputs_per_task(self) -> Optional[float]: + """Average number of input blocks per task, or None if no task has finished.""" + if self.num_tasks_finished == 0: + return None + else: + return self.num_task_inputs_processed / self.num_tasks_finished + + @metric_property( + description="Average 
number of output blocks per task per second.", + metrics_group=MetricsGroup.OUTPUTS, + ) + def num_output_blocks_per_task_s(self) -> Optional[float]: + """Average number of output blocks per task per second. + + If the operator hasn't produced any output yet, this metric returns `None`. + """ + if self.block_generation_time == 0: + return None + else: + return self.num_task_outputs_generated / self.block_generation_time + @metric_property( description="Average size of task output in bytes.", metrics_group=MetricsGroup.OUTPUTS, @@ -721,9 +759,8 @@ def on_toggle_task_output_backpressure(self, in_backpressure): self._task_output_backpressure_start_time = time.perf_counter() elif self._task_output_backpressure_start_time != -1: # backpressure stopping, stop timer - self.task_output_backpressure_time += ( - time.perf_counter() - self._task_output_backpressure_start_time - ) + delta = time.perf_counter() - self._task_output_backpressure_start_time + self.task_output_backpressure_time += delta self._task_output_backpressure_start_time = -1 def on_output_taken(self, output: RefBundle): @@ -746,6 +783,7 @@ def on_task_submitted(self, task_index: int, inputs: RefBundle): bytes_outputs=0, num_rows_produced=0, start_time=time.perf_counter(), + cum_block_gen_time=0, ) def on_task_output_generated(self, task_index: int, output: RefBundle): @@ -771,6 +809,7 @@ def on_task_output_generated(self, task_index: int, output: RefBundle): meta.exec_stats is not None and meta.exec_stats.wall_time_s is not None ) self.block_generation_time += meta.exec_stats.wall_time_s + task_info.cum_block_gen_time += meta.exec_stats.wall_time_s assert meta.num_rows is not None trace_allocation(block_ref, "operator_output") if meta.exec_stats.max_uss_bytes is not None: @@ -802,8 +841,10 @@ def on_task_finished(self, task_index: int, exception: Optional[Exception]): self.rows_outputs_of_finished_tasks += task_info.num_rows_produced task_time_delta = time.perf_counter() - task_info.start_time - 
self._op_task_duration_stats.add_duration(task_time_delta) - self.mean_task_completion_time = self._op_task_duration_stats.mean() + self.task_completion_time += task_time_delta + + assert task_info.cum_block_gen_time is not None + self.task_completion_time_without_backpressure += task_info.cum_block_gen_time inputs = self._running_tasks[task_index].inputs self.num_task_inputs_processed += len(inputs) total_input_size = inputs.size_bytes() diff --git a/python/ray/data/_internal/execution/interfaces/physical_operator.py b/python/ray/data/_internal/execution/interfaces/physical_operator.py index 7de458c239b1..769f7f967812 100644 --- a/python/ray/data/_internal/execution/interfaces/physical_operator.py +++ b/python/ray/data/_internal/execution/interfaces/physical_operator.py @@ -18,7 +18,7 @@ import ray from .ref_bundle import RefBundle from ray._raylet import ObjectRefGenerator -from ray.data._internal.execution.autoscaler.autoscaling_actor_pool import ( +from ray.data._internal.actor_autoscaler.autoscaling_actor_pool import ( AutoscalingActorPool, ) from ray.data._internal.execution.interfaces.execution_options import ( @@ -251,15 +251,15 @@ def __init__( name: str, input_dependencies: List["PhysicalOperator"], data_context: DataContext, - target_max_block_size: Optional[int], + target_max_block_size_override: Optional[int] = None, ): super().__init__(name, input_dependencies) for x in input_dependencies: assert isinstance(x, PhysicalOperator), x self._inputs_complete = not input_dependencies - self._output_block_size_option = None - self.set_target_max_block_size(target_max_block_size) + self._output_block_size_option_override = None + self.override_target_max_block_size(target_max_block_size_override) self._started = False self._shutdown = False self._in_task_submission_backpressure = False @@ -307,15 +307,15 @@ def set_logical_operators( self._logical_operators = list(logical_ops) @property - def target_max_block_size(self) -> Optional[int]: + def 
target_max_block_size_override(self) -> Optional[int]: """ Target max block size output by this operator. If this returns None, then the default from DataContext should be used. """ - if self._output_block_size_option is None: + if self._output_block_size_option_override is None: return None else: - return self._output_block_size_option.target_max_block_size + return self._output_block_size_option_override.target_max_block_size @property def actual_target_max_block_size(self) -> Optional[int]: @@ -325,18 +325,18 @@ def actual_target_max_block_size(self) -> Optional[int]: `None` if the target max block size is not set, otherwise the target max block size. `None` means the block size is infinite. """ - target_max_block_size = self.target_max_block_size + target_max_block_size = self.target_max_block_size_override if target_max_block_size is None: target_max_block_size = self.data_context.target_max_block_size return target_max_block_size - def set_target_max_block_size(self, target_max_block_size: Optional[int]): + def override_target_max_block_size(self, target_max_block_size: Optional[int]): if target_max_block_size is not None: - self._output_block_size_option = OutputBlockSizeOption( + self._output_block_size_option_override = OutputBlockSizeOption( target_max_block_size=target_max_block_size ) - elif self._output_block_size_option is not None: - self._output_block_size_option = None + elif self._output_block_size_option_override is not None: + self._output_block_size_option_override = None def mark_execution_finished(self): """Manually mark that this operator has finished execution.""" @@ -402,6 +402,40 @@ def _get_logical_args(self) -> Dict[str, Dict[str, Any]]: res[logical_op_id] = logical_op._get_args() return res + # TODO(@balaji): Disambiguate this with `incremental_resource_usage`. + def per_task_resource_allocation( + self: "PhysicalOperator", + ) -> ExecutionResources: + """The amount of logical resources used by each task. 
+ + For regular tasks, these are the resources required to schedule a task. For + actor tasks, these are the resources required to schedule an actor divided by + the number of actor threads (i.e., `max_concurrency`). + + Returns: + The resource requirement per task. + """ + return ExecutionResources.zero() + + def max_task_concurrency(self: "PhysicalOperator") -> Optional[int]: + """The maximum number of tasks that can be run concurrently. + + Some operators manually configure a maximum concurrency. For example, if you + specify `concurrency` in `map_batches`. + """ + return None + + # TODO(@balaji): Disambiguate this with `base_resource_usage`. + def min_scheduling_resources( + self: "PhysicalOperator", + ) -> ExecutionResources: + """The minimum resource bundle required to schedule a worker. + + For regular tasks, this is the resources required to schedule a task. For actor + tasks, this is the resources required to schedule an actor. + """ + return ExecutionResources.zero() + def progress_str(self) -> str: """Return any extra status to be displayed in the operator progress bar. diff --git a/python/ray/data/_internal/execution/interfaces/task_context.py b/python/ray/data/_internal/execution/interfaces/task_context.py index 9fb4ffe6e20f..7ff0f60f9670 100644 --- a/python/ray/data/_internal/execution/interfaces/task_context.py +++ b/python/ray/data/_internal/execution/interfaces/task_context.py @@ -44,8 +44,8 @@ class TaskContext: # This should be set if upstream_map_transformer is set. upstream_map_ray_remote_args: Optional[Dict[str, Any]] = None - # The target maximum number of bytes to include in the task's output block. - target_max_block_size: Optional[int] = None + # Override of the target max-block-size for the task + target_max_block_size_override: Optional[int] = None # Additional keyword arguments passed to the task. 
kwargs: Dict[str, Any] = field(default_factory=dict) diff --git a/python/ray/data/_internal/execution/legacy_compat.py b/python/ray/data/_internal/execution/legacy_compat.py index af651c797352..fefb1592c736 100644 --- a/python/ray/data/_internal/execution/legacy_compat.py +++ b/python/ray/data/_internal/execution/legacy_compat.py @@ -16,10 +16,11 @@ from ray.data._internal.logical.util import record_operators_usage from ray.data._internal.plan import ExecutionPlan from ray.data._internal.stats import DatasetStats -from ray.data._internal.util import ( - unify_schemas_with_validation, +from ray.data.block import ( + BlockMetadata, + BlockMetadataWithSchema, + _take_first_non_empty_schema, ) -from ray.data.block import BlockMetadata, BlockMetadataWithSchema # Warn about tasks larger than this. TASK_SIZE_WARN_THRESHOLD_BYTES = 100000 @@ -171,18 +172,18 @@ def _get_initial_stats_from_plan(plan: ExecutionPlan) -> DatasetStats: def _bundles_to_block_list(bundles: Iterator[RefBundle]) -> BlockList: blocks, metadata = [], [] owns_blocks = True - schemas = [] + bundle_list = list(bundles) + schema = _take_first_non_empty_schema( + ref_bundle.schema for ref_bundle in bundle_list + ) - for ref_bundle in bundles: + for ref_bundle in bundle_list: if not ref_bundle.owns_blocks: owns_blocks = False blocks.extend(ref_bundle.block_refs) metadata.extend(ref_bundle.metadata) - schemas.append(ref_bundle.schema) - unified_schema = unify_schemas_with_validation(schemas) - return BlockList( - blocks, metadata, owned_by_consumer=owns_blocks, schema=unified_schema - ) + + return BlockList(blocks, metadata, owned_by_consumer=owns_blocks, schema=schema) def _set_stats_uuid_recursive(stats: DatasetStats, dataset_uuid: str) -> None: diff --git a/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py b/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py index 818ee7ac7fe7..231046c984e3 100644 --- 
a/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py +++ b/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py @@ -10,11 +10,13 @@ import ray from ray.actor import ActorHandle from ray.core.generated import gcs_pb2 -from ray.data._internal.compute import ActorPoolStrategy -from ray.data._internal.execution.autoscaler import AutoscalingActorPool -from ray.data._internal.execution.autoscaler.default_autoscaler import ( +from ray.data._internal.actor_autoscaler import ( + AutoscalingActorPool, +) +from ray.data._internal.actor_autoscaler.autoscaling_actor_pool import ( ActorPoolScalingRequest, ) +from ray.data._internal.compute import ActorPoolStrategy from ray.data._internal.execution.bundle_queue import create_bundle_queue from ray.data._internal.execution.bundle_queue.bundle_queue import BundleQueue from ray.data._internal.execution.interfaces import ( @@ -63,7 +65,6 @@ def __init__( map_transformer: MapTransformer, input_op: PhysicalOperator, data_context: DataContext, - target_max_block_size: Optional[int], compute_strategy: ActorPoolStrategy, name: str = "ActorPoolMap", min_rows_per_bundle: Optional[int] = None, @@ -71,6 +72,7 @@ def __init__( map_task_kwargs: Optional[Dict[str, Any]] = None, ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None, ray_remote_args: Optional[Dict[str, Any]] = None, + target_max_block_size_override: Optional[int] = None, ): """Create an ActorPoolMapOperator instance. @@ -79,8 +81,6 @@ def __init__( to each ref bundle input. input_op: Operator generating input data for this op. data_context: The DataContext instance containing configuration settings. - target_max_block_size: The target maximum number of bytes to - include in an output block. compute_strategy: `ComputeStrategy` used for this operator. name: The name of this operator. min_rows_per_bundle: The number of rows to gather per batch passed to the @@ -98,13 +98,15 @@ def __init__( advanced, experimental feature. 
ray_remote_args: Customize the ray remote args for this op's tasks. See :func:`ray.remote` for details. + target_max_block_size_override: The target maximum number of bytes to + include in an output block. """ super().__init__( map_transformer, input_op, data_context, name, - target_max_block_size, + target_max_block_size_override, min_rows_per_bundle, supports_fusion, map_task_kwargs, @@ -148,6 +150,7 @@ def __init__( per_actor_resource_usage, min_size=compute_strategy.min_size, max_size=compute_strategy.max_size, + initial_size=compute_strategy.initial_size, max_actor_concurrency=max_actor_concurrency, max_tasks_in_flight_per_actor=( # NOTE: Unless explicitly configured by the user, max tasks-in-flight config @@ -201,7 +204,7 @@ def start(self, options: ExecutionOptions): self._actor_cls = ray.remote(**self._ray_remote_args)(self._map_worker_cls) self._actor_pool.scale( ActorPoolScalingRequest( - delta=self._actor_pool.min_size(), reason="scaling to min size" + delta=self._actor_pool.initial_size(), reason="scaling to initial size" ) ) @@ -298,7 +301,7 @@ def _dispatch_tasks(self): ctx = TaskContext( task_idx=self._next_data_task_idx, op_name=self.name, - target_max_block_size=self.actual_target_max_block_size, + target_max_block_size_override=self.actual_target_max_block_size, ) gen = actor.submit.options( num_returns="streaming", @@ -476,6 +479,22 @@ def _apply_default_remote_args( def get_autoscaling_actor_pools(self) -> List[AutoscalingActorPool]: return [self._actor_pool] + def per_task_resource_allocation( + self: "PhysicalOperator", + ) -> ExecutionResources: + max_concurrency = self._ray_remote_args.get("max_concurrency", 1) + per_actor_resource_usage = self._actor_pool.per_actor_resource_usage() + return per_actor_resource_usage.scale(1 / max_concurrency) + + def max_task_concurrency(self: "PhysicalOperator") -> Optional[int]: + max_concurrency = self._ray_remote_args.get("max_concurrency", 1) + return max_concurrency * self._actor_pool.max_size() + + 
def min_scheduling_resources( + self: "PhysicalOperator", + ) -> ExecutionResources: + return self._actor_pool.per_actor_resource_usage() + def update_resource_usage(self) -> None: """Updates resources usage.""" for actor in self._actor_pool.get_running_actor_refs(): @@ -700,7 +719,7 @@ class _ActorPool(AutoscalingActorPool): actors when the operator is done submitting work to the pool. """ - _ACTOR_POOL_SCALE_DOWN_DEBOUNCE_PERIOD_S = 30 + _ACTOR_POOL_SCALE_DOWN_DEBOUNCE_PERIOD_S = 10 _ACTOR_POOL_GRACEFUL_SHUTDOWN_TIMEOUT_S = 30 _LOGICAL_ACTOR_ID_LABEL_KEY = "__ray_data_logical_actor_id" @@ -711,6 +730,7 @@ def __init__( *, min_size: int, max_size: int, + initial_size: int, max_actor_concurrency: int, max_tasks_in_flight_per_actor: int, _enable_actor_pool_on_exit_hook: bool = False, @@ -726,8 +746,9 @@ def __init__( in the pool. Note, that this constraint could be violated when no new work is available for scheduling in the actor pool (ie when operator completes execution). - max_size: The minimum number of running actors to be maintained + max_size: The maximum number of running actors to be maintained in the pool. + initial_size: The initial number of actors to start with. max_actor_concurrency: The maximum number of concurrent tasks a single actor can execute (derived from `ray_remote_args` passed to the operator). 
@@ -739,6 +760,7 @@ def __init__( self._min_size: int = min_size self._max_size: int = max_size + self._initial_size: int = initial_size self._max_actor_concurrency: int = max_actor_concurrency self._max_tasks_in_flight: int = max_tasks_in_flight_per_actor self._create_actor_fn = create_actor_fn @@ -746,11 +768,13 @@ def __init__( assert self._min_size >= 1 assert self._max_size >= self._min_size + assert self._initial_size <= self._max_size + assert self._initial_size >= self._min_size assert self._max_tasks_in_flight >= 1 assert self._create_actor_fn is not None # Timestamp of the last scale up action - self._last_upscaling_ts: Optional[float] = None + self._last_upscaled_at: Optional[float] = None self._last_downscaling_debounce_warning_ts: Optional[float] = None # Actors that have started running, including alive and restarting actors. self._running_actors: Dict[ray.actor.ActorHandle, _ActorState] = {} @@ -802,6 +826,9 @@ def max_actor_concurrency(self) -> int: def num_tasks_in_flight(self) -> int: return self._total_num_tasks_in_flight + def initial_size(self) -> int: + return self._initial_size + def _can_apply(self, config: ActorPoolScalingRequest) -> bool: """Returns whether Actor Pool is able to execute scaling request""" @@ -815,21 +842,21 @@ def _can_apply(self, config: ActorPoolScalingRequest) -> bool: # scaling up, ie if actor pool just scaled down, it'd still be able # to scale back up immediately. 
if ( - self._last_upscaling_ts is not None - and time.time() - <= self._last_upscaling_ts - + self._ACTOR_POOL_SCALE_DOWN_DEBOUNCE_PERIOD_S + not config.force + and self._last_upscaled_at is not None + and ( + time.time() + <= self._last_upscaled_at + + self._ACTOR_POOL_SCALE_DOWN_DEBOUNCE_PERIOD_S + ) ): # NOTE: To avoid spamming logs unnecessarily, debounce log is produced once # per upscaling event - if ( - self._last_upscaling_ts - != self._last_downscaling_debounce_warning_ts - ): + if self._last_upscaled_at != self._last_downscaling_debounce_warning_ts: logger.debug( - f"Ignoring scaling down request (request={config}; reason=debounced from scaling up at {self._last_upscaling_ts})" + f"Ignoring scaling down request (request={config}; reason=debounced from scaling up at {self._last_upscaled_at})" ) - self._last_downscaling_debounce_warning_ts = self._last_upscaling_ts + self._last_downscaling_debounce_warning_ts = self._last_upscaled_at return False @@ -853,7 +880,7 @@ def scale(self, req: ActorPoolScalingRequest) -> Optional[int]: self.add_pending_actor(actor, ready_ref) # Capture last scale up timestamp - self._last_upscaling_ts = time.time() + self._last_upscaled_at = time.time() return target_num_actors diff --git a/python/ray/data/_internal/execution/operators/aggregate_num_rows.py b/python/ray/data/_internal/execution/operators/aggregate_num_rows.py index 674012b00990..68084d2d0ad7 100644 --- a/python/ray/data/_internal/execution/operators/aggregate_num_rows.py +++ b/python/ray/data/_internal/execution/operators/aggregate_num_rows.py @@ -23,7 +23,6 @@ def __init__( "AggregateNumRows", input_dependencies, data_context, - target_max_block_size=None, ) self._column_name = column_name diff --git a/python/ray/data/_internal/execution/operators/base_physical_operator.py b/python/ray/data/_internal/execution/operators/base_physical_operator.py index df934ae323f6..948a34d92fef 100644 --- a/python/ray/data/_internal/execution/operators/base_physical_operator.py 
+++ b/python/ray/data/_internal/execution/operators/base_physical_operator.py @@ -31,16 +31,16 @@ def __init__( name: str, input_op: PhysicalOperator, data_context: DataContext, - target_max_block_size: Optional[int], + target_max_block_size_override: Optional[int] = None, ): """Create a OneToOneOperator. Args: input_op: Operator generating input data for this op. name: The name of this operator. - target_max_block_size: The target maximum number of bytes to + target_max_block_size_override: The target maximum number of bytes to include in an output block. """ - super().__init__(name, [input_op], data_context, target_max_block_size) + super().__init__(name, [input_op], data_context, target_max_block_size_override) @property def input_dependency(self) -> PhysicalOperator: @@ -58,7 +58,7 @@ def __init__( bulk_fn: AllToAllTransformFn, input_op: PhysicalOperator, data_context: DataContext, - target_max_block_size: Optional[int], + target_max_block_size_override: Optional[int] = None, num_outputs: Optional[int] = None, sub_progress_bar_names: Optional[List[str]] = None, name: str = "AllToAll", @@ -69,6 +69,9 @@ def __init__( list of input ref bundles, and the outputs are the output ref bundles and a stats dict. input_op: Operator generating input data for this op. + data_context: The DataContext instance containing configuration settings. + target_max_block_size_override: The target maximum number of bytes to + include in an output block. num_outputs: The number of expected output bundles for progress bar. sub_progress_bar_names: The names of internal sub progress bars. name: The name of this operator. 
@@ -82,7 +85,7 @@ def __init__( self._input_buffer: List[RefBundle] = [] self._output_buffer: List[RefBundle] = [] self._stats: StatsDict = {} - super().__init__(name, [input_op], data_context, target_max_block_size) + super().__init__(name, [input_op], data_context, target_max_block_size_override) def num_outputs_total(self) -> Optional[int]: return ( @@ -112,7 +115,7 @@ def all_inputs_done(self) -> None: task_idx=self._next_task_index, op_name=self.name, sub_progress_bar_dict=self._sub_progress_bar_dict, - target_max_block_size=self.actual_target_max_block_size, + target_max_block_size_override=self.actual_target_max_block_size, ) # NOTE: We don't account object store memory use from intermediate `bulk_fn` # outputs (e.g., map outputs for map-reduce). @@ -191,6 +194,4 @@ def __init__( """ input_names = ", ".join([op._name for op in input_ops]) op_name = f"{self.__class__.__name__}({input_names})" - super().__init__( - op_name, list(input_ops), data_context, target_max_block_size=None - ) + super().__init__(op_name, list(input_ops), data_context) diff --git a/python/ray/data/_internal/execution/operators/hash_shuffle.py b/python/ray/data/_internal/execution/operators/hash_shuffle.py index 50cb0136228e..3ada3a7bd2ad 100644 --- a/python/ray/data/_internal/execution/operators/hash_shuffle.py +++ b/python/ray/data/_internal/execution/operators/hash_shuffle.py @@ -426,7 +426,6 @@ def __init__( name=name, input_dependencies=input_ops, data_context=data_context, - target_max_block_size=None, ) if shuffle_progress_bar_name is None: @@ -469,6 +468,11 @@ def __init__( data_context=data_context, ) + # We track the running usage total because iterating + # and summing over all shuffling tasks can be expensive + # if the # of shuffling tasks is large + self._shuffling_resource_usage = ExecutionResources.zero() + self._input_block_transformer = input_block_transformer self._next_shuffle_tasks_idx: int = 0 @@ -585,6 +589,11 @@ def _do_add_input_inner(self, input_bundle: 
RefBundle, input_index: int): def _on_partitioning_done(cur_shuffle_task_idx: int): task = self._shuffling_tasks[input_index].pop(cur_shuffle_task_idx) + self._shuffling_resource_usage = ( + self._shuffling_resource_usage.subtract( + task.get_requested_resource_bundle() + ) + ) # Fetch input block and resulting partition shards block metadata and # handle obtained metadata # @@ -614,16 +623,22 @@ def _on_partitioning_done(cur_shuffle_task_idx: int): self.shuffle_bar.update(i=input_block_metadata.num_rows) # TODO update metrics - self._shuffling_tasks[input_index][cur_shuffle_task_idx] = MetadataOpTask( + task = self._shuffling_tasks[input_index][ + cur_shuffle_task_idx + ] = MetadataOpTask( task_index=cur_shuffle_task_idx, object_ref=input_block_partition_shards_metadata_tuple_ref, task_done_callback=functools.partial( _on_partitioning_done, cur_shuffle_task_idx ), - task_resource_bundle=( - ExecutionResources.from_resource_dict(shuffle_task_resource_bundle) + task_resource_bundle=ExecutionResources.from_resource_dict( + shuffle_task_resource_bundle ), ) + if task.get_requested_resource_bundle() is not None: + self._shuffling_resource_usage = self._shuffling_resource_usage.add( + task.get_requested_resource_bundle() + ) # Update Shuffle Metrics on task submission self.shuffle_metrics.on_task_submitted( @@ -634,7 +649,13 @@ def _on_partitioning_done(cur_shuffle_task_idx: int): ) # Update Shuffle progress bar - self.shuffle_bar.update(total=self.shuffle_metrics.num_row_inputs_received) + _, _, num_rows = estimate_total_num_of_blocks( + cur_shuffle_task_idx + 1, + self.upstream_op_num_outputs(), + self.shuffle_metrics, + total_num_tasks=None, + ) + self.shuffle_bar.update(total=num_rows) def has_next(self) -> bool: self._try_finalize() @@ -850,19 +871,13 @@ def current_processor_usage(self) -> ExecutionResources: # `base_resource_usage` method) # - Active shuffling tasks # - Active finalizing tasks (actor tasks) - base_usage = self.base_resource_usage() - - 
shuffling_tasks = self._get_active_shuffling_tasks() - shuffling_tasks_cpus_used = sum( - [t.get_requested_resource_bundle().cpu for t in shuffling_tasks] - ) + base_usage = self.base_resource_usage + running_usage = self._shuffling_resource_usage # TODO add memory to resources being tracked - return ExecutionResources( - cpu=base_usage.cpu + shuffling_tasks_cpus_used, - gpu=0, - ) + return base_usage.add(running_usage) + @property def base_resource_usage(self) -> ExecutionResources: # TODO add memory to resources being tracked return ExecutionResources( diff --git a/python/ray/data/_internal/execution/operators/input_data_buffer.py b/python/ray/data/_internal/execution/operators/input_data_buffer.py index 4aa6c4a63688..b2213f9321d0 100644 --- a/python/ray/data/_internal/execution/operators/input_data_buffer.py +++ b/python/ray/data/_internal/execution/operators/input_data_buffer.py @@ -33,7 +33,7 @@ def __init__( num_output_blocks: The number of output blocks. If not specified, progress bars total will be set based on num output bundles instead. """ - super().__init__("Input", [], data_context, target_max_block_size=None) + super().__init__("Input", [], data_context) if input_data is not None: assert input_data_factory is None # Copy the input data to avoid mutating the original list. 
diff --git a/python/ray/data/_internal/execution/operators/limit_operator.py b/python/ray/data/_internal/execution/operators/limit_operator.py index c4702323d565..246f34b82453 100644 --- a/python/ray/data/_internal/execution/operators/limit_operator.py +++ b/python/ray/data/_internal/execution/operators/limit_operator.py @@ -29,7 +29,7 @@ def __init__( self._name = f"limit={limit}" self._output_blocks_stats: List[BlockStats] = [] self._cur_output_bundles = 0 - super().__init__(self._name, input_op, data_context, target_max_block_size=None) + super().__init__(self._name, input_op, data_context) if self._limit <= 0: self.mark_execution_finished() diff --git a/python/ray/data/_internal/execution/operators/map_operator.py b/python/ray/data/_internal/execution/operators/map_operator.py index 3a9f9bd1bced..9af913687c5f 100644 --- a/python/ray/data/_internal/execution/operators/map_operator.py +++ b/python/ray/data/_internal/execution/operators/map_operator.py @@ -48,13 +48,14 @@ ) from ray.data._internal.execution.util import memory_string from ray.data._internal.stats import StatsDict -from ray.data._internal.util import MemoryProfiler, unify_ref_bundles_schema +from ray.data._internal.util import MemoryProfiler from ray.data.block import ( Block, BlockAccessor, BlockExecStats, BlockMetadataWithSchema, BlockStats, + _take_first_non_empty_schema, to_stats, ) from ray.data.context import DataContext @@ -238,7 +239,7 @@ def create( map_transformer, input_op, data_context, - target_max_block_size=target_max_block_size, + target_max_block_size_override=target_max_block_size, compute_strategy=compute_strategy, name=name, min_rows_per_bundle=min_rows_per_bundle, @@ -541,8 +542,6 @@ def _map_task( A generator of blocks, followed by the list of BlockMetadata for the blocks as the last generator return. 
""" - from ray.data.block import BlockMetadataWithSchema - logger.debug( "Executing map task of operator %s with task index %d", ctx.op_name, @@ -552,7 +551,7 @@ def _map_task( ctx.kwargs.update(kwargs) TaskContext.set_current(ctx) stats = BlockExecStats.builder() - map_transformer.set_target_max_block_size(ctx.target_max_block_size) + map_transformer.override_target_max_block_size(ctx.target_max_block_size_override) with MemoryProfiler(data_context.memory_usage_poll_interval_s) as profiler: for b_out in map_transformer.apply_transform(iter(blocks), ctx): # TODO(Clark): Add input file propagation from input blocks. @@ -662,14 +661,13 @@ def _get_bundle_size(bundle: RefBundle): def _merge_ref_bundles(*bundles: RefBundle) -> RefBundle: """Merge N ref bundles into a single bundle of multiple blocks.""" # Check that at least one bundle is non-null. - assert any(bundle is not None for bundle in bundles) + bundles = [bundle for bundle in bundles if bundle is not None] + assert len(bundles) > 0 blocks = list( - itertools.chain( - block for bundle in bundles if bundle is not None for block in bundle.blocks - ) + itertools.chain(block for bundle in bundles for block in bundle.blocks) ) - owns_blocks = all(bundle.owns_blocks for bundle in bundles if bundle is not None) - schema = unify_ref_bundles_schema(bundles) + owns_blocks = all(bundle.owns_blocks for bundle in bundles) + schema = _take_first_non_empty_schema(bundle.schema for bundle in bundles) return RefBundle(blocks, owns_blocks=owns_blocks, schema=schema) diff --git a/python/ray/data/_internal/execution/operators/map_transformer.py b/python/ray/data/_internal/execution/operators/map_transformer.py index db3e5fc5353e..4d8870c4d3b8 100644 --- a/python/ray/data/_internal/execution/operators/map_transformer.py +++ b/python/ray/data/_internal/execution/operators/map_transformer.py @@ -89,7 +89,7 @@ def category(self) -> MapTransformFnCategory: def output_block_size_option(self): return self._output_block_size_option - def 
set_target_max_block_size(self, target_max_block_size: Optional[int]): + def override_target_max_block_size(self, target_max_block_size: Optional[int]): self._output_block_size_option = OutputBlockSizeOption( target_max_block_size=target_max_block_size ) @@ -163,7 +163,7 @@ def get_transform_fns(self) -> List[MapTransformFn]: """Get the transform functions.""" return self._transform_fns - def set_target_max_block_size(self, target_max_block_size: int): + def override_target_max_block_size(self, target_max_block_size: Optional[int]): if target_max_block_size is not None: self._output_block_size_option = OutputBlockSizeOption( target_max_block_size=target_max_block_size @@ -221,7 +221,7 @@ def apply_transform( """Apply the transform functions to the input blocks.""" for transform_fn in self._transform_fns: if not transform_fn.output_block_size_option: - transform_fn.set_target_max_block_size(self.target_max_block_size) + transform_fn.override_target_max_block_size(self.target_max_block_size) iter = input_blocks # Apply the transform functions sequentially to the input iterable. @@ -251,7 +251,7 @@ def fused_init_fn(): fused_transform_fns = self._transform_fns + other._transform_fns transformer = MapTransformer(fused_transform_fns, init_fn=fused_init_fn) - transformer.set_target_max_block_size(target_max_block_size) + transformer.override_target_max_block_size(target_max_block_size) return transformer def udf_time(self) -> float: diff --git a/python/ray/data/_internal/execution/operators/output_splitter.py b/python/ray/data/_internal/execution/operators/output_splitter.py index 160ab0558ebd..df8c7e0ec962 100644 --- a/python/ray/data/_internal/execution/operators/output_splitter.py +++ b/python/ray/data/_internal/execution/operators/output_splitter.py @@ -47,7 +47,6 @@ def __init__( f"split({n}, equal={equal})", [input_op], data_context, - target_max_block_size=None, ) self._equal = equal # Buffer of bundles not yet assigned to output splits. 
diff --git a/python/ray/data/_internal/execution/operators/task_pool_map_operator.py b/python/ray/data/_internal/execution/operators/task_pool_map_operator.py index 3ec90eb32f71..fd402d73cd05 100644 --- a/python/ray/data/_internal/execution/operators/task_pool_map_operator.py +++ b/python/ray/data/_internal/execution/operators/task_pool_map_operator.py @@ -85,7 +85,7 @@ def _add_bundled_input(self, bundle: RefBundle): ctx = TaskContext( task_idx=self._next_data_task_idx, op_name=self.name, - target_max_block_size=self.actual_target_max_block_size, + target_max_block_size_override=self.actual_target_max_block_size, ) dynamic_ray_remote_args = self._get_runtime_ray_remote_args(input_bundle=bundle) @@ -140,6 +140,19 @@ def incremental_resource_usage(self) -> ExecutionResources: or 0, ) + def per_task_resource_allocation( + self: "PhysicalOperator", + ) -> ExecutionResources: + return self.incremental_resource_usage() + + def max_task_concurrency(self: "PhysicalOperator") -> Optional[int]: + return self._concurrency + + def min_scheduling_resources( + self: "PhysicalOperator", + ) -> ExecutionResources: + return self.incremental_resource_usage() + def get_concurrency(self) -> Optional[int]: return self._concurrency diff --git a/python/ray/data/_internal/execution/operators/zip_operator.py b/python/ray/data/_internal/execution/operators/zip_operator.py index 01f2a94774d9..d37ecd59a821 100644 --- a/python/ray/data/_internal/execution/operators/zip_operator.py +++ b/python/ray/data/_internal/execution/operators/zip_operator.py @@ -52,7 +52,6 @@ def __init__( "Zip", [left_input_op, right_input_op], data_context, - target_max_block_size=None, ) def num_outputs_total(self) -> Optional[int]: diff --git a/python/ray/data/_internal/execution/resource_manager.py b/python/ray/data/_internal/execution/resource_manager.py index f18e3a3f9771..4b2bfe4af666 100644 --- a/python/ray/data/_internal/execution/resource_manager.py +++ 
b/python/ray/data/_internal/execution/resource_manager.py @@ -493,18 +493,56 @@ def _get_eligible_ops(self) -> List[PhysicalOperator]: op for op in self._resource_manager._topology if self._is_op_eligible(op) ] + def _get_ineligible_ops_with_usage(self) -> List[PhysicalOperator]: + """ + Resource reservation is based on the number of eligible operators. + However, there might be completed operators that still have blocks in their output queue, which we need to exclude them from the reservation. + And we also need to exclude the downstream ineligible operators. + + E.g., for the following pipeline: + ``` + map1 (completed, but still has blocks in its output queue) -> limit1 (ineligible, not completed) -> map2 (not completed) -> limit2 -> map3 + ``` + + The reservation is based on the number of eligible operators (map2 and map3), but we need to exclude map1 and limit1 from the reservation. + """ + last_completed_ops = [] + ops_to_exclude_from_reservation = [] + # Traverse operator tree collecting all operators that have already finished + for op in self._resource_manager._topology: + if not op.execution_finished(): + for dep in op.input_dependencies: + if dep.execution_finished(): + last_completed_ops.append(dep) + + # In addition to completed operators, + # filter out downstream ineligible operators since they are omitted from reservation calculations. 
+ for op in last_completed_ops: + ops_to_exclude_from_reservation.extend( + list(self._get_downstream_ineligible_ops(op)) + ) + ops_to_exclude_from_reservation.append(op) + return list(set(ops_to_exclude_from_reservation)) + def _update_reservation(self): - global_limits = self._resource_manager.get_global_limits() + global_limits = self._resource_manager.get_global_limits().copy() eligible_ops = self._get_eligible_ops() self._op_reserved.clear() self._reserved_for_op_outputs.clear() self._reserved_min_resources.clear() - remaining = global_limits.copy() if len(eligible_ops) == 0: return + op_to_exclude_from_reservation = self._get_ineligible_ops_with_usage() + for completed_op in op_to_exclude_from_reservation: + global_limits = global_limits.subtract( + self._resource_manager.get_op_usage(completed_op) + ) + global_limits = global_limits.max(ExecutionResources.zero()) + remaining = global_limits.copy() + # Reserve `reservation_ratio * global_limits / num_ops` resources for each # operator. 
default_reserved = global_limits.scale( diff --git a/python/ray/data/_internal/execution/streaming_executor.py b/python/ray/data/_internal/execution/streaming_executor.py index 6806c69bdd9d..7d95c7e4fc72 100644 --- a/python/ray/data/_internal/execution/streaming_executor.py +++ b/python/ray/data/_internal/execution/streaming_executor.py @@ -4,11 +4,15 @@ import time from typing import Dict, List, Optional, Tuple -from ray.data._internal.execution.autoscaler import create_autoscaler +from ray.data._internal.actor_autoscaler import ( + create_actor_autoscaler, +) +from ray.data._internal.cluster_autoscaler import create_cluster_autoscaler from ray.data._internal.execution.backpressure_policy import ( BackpressurePolicy, get_backpressure_policies, ) +from ray.data._internal.execution.dataset_state import DatasetState from ray.data._internal.execution.execution_callback import get_execution_callbacks from ray.data._internal.execution.interfaces import ( ExecutionResources, @@ -37,7 +41,7 @@ ) from ray.data._internal.metadata_exporter import Topology as TopologyMetadata from ray.data._internal.progress_bar import ProgressBar -from ray.data._internal.stats import DatasetState, DatasetStats, StatsManager, Timer +from ray.data._internal.stats import DatasetStats, StatsManager, Timer from ray.data.context import OK_PREFIX, WARN_PREFIX, DataContext from ray.util.metrics import Gauge @@ -174,18 +178,22 @@ def execute( self._resource_manager = ResourceManager( self._topology, self._options, - lambda: self._autoscaler.get_total_resources(), + lambda: self._cluster_autoscaler.get_total_resources(), self._data_context, ) self._backpressure_policies = get_backpressure_policies( self._data_context, self._topology, self._resource_manager ) - self._autoscaler = create_autoscaler( + self._cluster_autoscaler = create_cluster_autoscaler( self._topology, self._resource_manager, - config=self._data_context.autoscaling_config, execution_id=self._dataset_id, ) + self._actor_autoscaler = 
create_actor_autoscaler( + self._topology, + self._resource_manager, + config=self._data_context.autoscaling_config, + ) self._has_op_completed = dict.fromkeys(self._topology, False) @@ -293,7 +301,7 @@ def shutdown(self, force: bool, exception: Optional[Exception] = None): for callback in get_execution_callbacks(self._data_context): callback.after_execution_fails(self, exception) - self._autoscaler.on_executor_shutdown() + self._cluster_autoscaler.on_executor_shutdown() dur = time.perf_counter() - start @@ -461,7 +469,8 @@ def _scheduling_loop_step(self, topology: Topology) -> bool: self._refresh_progress_bars(topology) # Trigger autoscaling - self._autoscaler.try_trigger_scaling() + self._cluster_autoscaler.try_trigger_scaling() + self._actor_autoscaler.try_trigger_scaling() update_operator_states(topology) self._refresh_progress_bars(topology) @@ -548,7 +557,9 @@ def _get_state_dict(self, state): "progress": last_state.num_completed_tasks, "total": last_op.num_outputs_total(), "total_rows": last_op.num_output_rows_total(), - "end_time": time.time() if state != DatasetState.RUNNING.name else None, + "end_time": time.time() + if state in (DatasetState.FINISHED.name, DatasetState.FAILED.name) + else None, "operators": { f"{self._get_operator_id(op, i)}": { "name": op.name, diff --git a/python/ray/data/_internal/execution/streaming_executor_state.py b/python/ray/data/_internal/execution/streaming_executor_state.py index 6475b0b72631..c3d7ae8e95dc 100644 --- a/python/ray/data/_internal/execution/streaming_executor_state.py +++ b/python/ray/data/_internal/execution/streaming_executor_state.py @@ -284,7 +284,10 @@ def add_output(self, ref: RefBundle) -> None: """Move a bundle produced by the operator to its outqueue.""" ref, diverged = dedupe_schemas_with_validation( - self._schema, ref, warn=not self._warned_on_schema_divergence + self._schema, + ref, + warn=not self._warned_on_schema_divergence, + enforce_schemas=self.op.data_context.enforce_schemas, ) self._schema = 
ref.schema self._warned_on_schema_divergence |= diverged @@ -303,6 +306,9 @@ def add_output(self, ref: RefBundle) -> None: self.op.metrics.num_alive_actors = actor_info.running self.op.metrics.num_restarting_actors = actor_info.restarting self.op.metrics.num_pending_actors = actor_info.pending + for next_op in self.op.output_dependencies: + next_op.metrics.num_external_inqueue_blocks = self.output_queue.num_blocks + next_op.metrics.num_external_inqueue_bytes = self.output_queue.memory_usage def refresh_progress_bar(self, resource_manager: ResourceManager) -> None: """Update the console with the latest operator progress.""" @@ -754,7 +760,7 @@ def dedupe_schemas_with_validation( old_schema: Optional["Schema"], bundle: "RefBundle", warn: bool = True, - allow_divergent: bool = False, + enforce_schemas: bool = False, ) -> Tuple["RefBundle", bool]: """Unify/Dedupe two schemas, warning if warn=True @@ -763,7 +769,7 @@ def dedupe_schemas_with_validation( the new schema will be used as the old schema. bundle: The new `RefBundle` to unify with the old schema. warn: Raise a warning if the schemas diverge. - allow_divergent: If `True`, allow the schemas to diverge and return unified schema. + enforce_schemas: If `True`, allow the schemas to diverge and return unified schema. If `False`, but keep the old schema. Returns: @@ -784,13 +790,13 @@ def dedupe_schemas_with_validation( return bundle, diverged diverged = True - if warn: + if warn and enforce_schemas: logger.warning( f"Operator produced a RefBundle with a different schema " f"than the previous one. Previous schema: {old_schema}, " f"new schema: {bundle.schema}. This may lead to unexpected behavior." 
) - if allow_divergent: + if enforce_schemas: old_schema = unify_schemas_with_validation([old_schema, bundle.schema]) return ( diff --git a/python/ray/data/_internal/issue_detection/issue_detector_configuration.py b/python/ray/data/_internal/issue_detection/issue_detector_configuration.py index 53aea74dbd63..6b59a7318d23 100644 --- a/python/ray/data/_internal/issue_detection/issue_detector_configuration.py +++ b/python/ray/data/_internal/issue_detection/issue_detector_configuration.py @@ -13,10 +13,10 @@ @dataclass class IssueDetectorsConfiguration: hanging_detector_config: HangingExecutionIssueDetectorConfig = field( - default=HangingExecutionIssueDetectorConfig + default_factory=HangingExecutionIssueDetectorConfig ) high_memory_detector_config: HighMemoryIssueDetectorConfig = field( - default=HighMemoryIssueDetectorConfig + default_factory=HighMemoryIssueDetectorConfig ) detectors: List[Type[IssueDetector]] = field( default_factory=lambda: [HangingExecutionIssueDetector, HighMemoryIssueDetector] diff --git a/python/ray/data/_internal/issue_detection/issue_detector_manager.py b/python/ray/data/_internal/issue_detection/issue_detector_manager.py index 91569e16deac..33ebbc69dafe 100644 --- a/python/ray/data/_internal/issue_detection/issue_detector_manager.py +++ b/python/ray/data/_internal/issue_detection/issue_detector_manager.py @@ -2,11 +2,19 @@ import time from typing import TYPE_CHECKING, Dict, List +from ray.core.generated.export_dataset_operator_event_pb2 import ( + ExportDatasetOperatorEventData as ProtoOperatorEventData, +) from ray.data._internal.issue_detection.issue_detector import ( Issue, IssueDetector, IssueType, ) +from ray.data._internal.operator_event_exporter import ( + OperatorEvent, + format_export_issue_event_name, + get_operator_event_exporter, +) if TYPE_CHECKING: from ray.data._internal.execution.interfaces.physical_operator import ( @@ -27,6 +35,7 @@ def __init__(self, executor: "StreamingExecutor"): detector: time.perf_counter() for 
detector in self._issue_detectors } self.executor = executor + self._operator_event_exporter = get_operator_event_exporter() def invoke_detectors(self) -> None: curr_time = time.perf_counter() @@ -47,8 +56,10 @@ def invoke_detectors(self) -> None: def _report_issues(self, issues: List[Issue]) -> None: operators: Dict[str, "PhysicalOperator"] = {} - for operator in self.executor._topology.keys(): + op_to_id: Dict["PhysicalOperator", str] = {} + for i, operator in enumerate(self.executor._topology.keys()): operators[operator.id] = operator + op_to_id[operator] = self.executor._get_operator_id(operator, i) # Reset issue detector metrics for each operator so that previous issues # don't affect the current ones. operator.metrics._issue_detector_hanging = 0 @@ -59,6 +70,24 @@ def _report_issues(self, issues: List[Issue]) -> None: operator = operators.get(issue.operator_id) if not operator: continue + + issue_event_type = format_export_issue_event_name(issue.issue_type) + if ( + self._operator_event_exporter is not None + and issue_event_type + in ProtoOperatorEventData.DatasetOperatorEventType.keys() + ): + event_time = time.time() + operator_event = OperatorEvent( + dataset_id=issue.dataset_name, + operator_id=op_to_id[operator], + operator_name=operator.name, + event_time=event_time, + event_type=issue_event_type, + message=issue.message, + ) + self._operator_event_exporter.export_operator_event(operator_event) + if issue.issue_type == IssueType.HANGING: operator.metrics._issue_detector_hanging += 1 if issue.issue_type == IssueType.HIGH_MEMORY: diff --git a/python/ray/data/_internal/iterator/stream_split_iterator.py b/python/ray/data/_internal/iterator/stream_split_iterator.py index ab804886a49f..93f041994872 100644 --- a/python/ray/data/_internal/iterator/stream_split_iterator.py +++ b/python/ray/data/_internal/iterator/stream_split_iterator.py @@ -139,9 +139,13 @@ def __init__( locality_hints: Optional[List[NodeIdStr]], ): dataset = dataset_wrapper._dataset + # Set 
current DataContext. - self._data_context = dataset.context + # This needs to be a deep copy so that updates to the base dataset's + # context does not affect this process's global DataContext. + self._data_context = dataset.context.copy() ray.data.DataContext._set_current(self._data_context) + if self._data_context.execution_options.locality_with_output is True: self._data_context.execution_options.locality_with_output = locality_hints logger.info(f"Auto configuring locality_with_output={locality_hints}") diff --git a/python/ray/data/_internal/logging.py b/python/ray/data/_internal/logging.py index 0184ac58e5d6..9c3b5abc0301 100644 --- a/python/ray/data/_internal/logging.py +++ b/python/ray/data/_internal/logging.py @@ -10,7 +10,7 @@ DEFAULT_TEXT_FORMATTER = ( "%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s" # noqa: E501 ) -DEFAULT_JSON_FORMATTER = ray._private.ray_logging.formatters.JSONFormatter +DEFAULT_JSON_FORMATTER = ray._common.formatters.JSONFormatter DEFAULT_CONFIG = { "version": 1, "disable_existing_loggers": False, @@ -22,9 +22,7 @@ }, "filters": { "console_filter": {"()": "ray.data._internal.logging.HiddenRecordFilter"}, - "core_context_filter": { - "()": "ray._private.ray_logging.filters.CoreContextFilter" - }, + "core_context_filter": {"()": "ray._common.filters.CoreContextFilter"}, }, "handlers": { "file": { diff --git a/python/ray/data/_internal/logical/operators/count_operator.py b/python/ray/data/_internal/logical/operators/count_operator.py index 409c99e3c000..39ec706f7e50 100644 --- a/python/ray/data/_internal/logical/operators/count_operator.py +++ b/python/ray/data/_internal/logical/operators/count_operator.py @@ -1,5 +1,3 @@ -from typing import List - from ray.data._internal.logical.interfaces import LogicalOperator @@ -15,6 +13,6 @@ class Count(LogicalOperator): def __init__( self, - input_dependencies: List["LogicalOperator"], + input_op: LogicalOperator, ): - super().__init__("Count", input_dependencies) + 
super().__init__("Count", [input_op]) diff --git a/python/ray/data/_internal/logical/operators/from_operators.py b/python/ray/data/_internal/logical/operators/from_operators.py index 05d0d668d3a6..0f6ec1a7a2d2 100644 --- a/python/ray/data/_internal/logical/operators/from_operators.py +++ b/python/ray/data/_internal/logical/operators/from_operators.py @@ -4,8 +4,12 @@ from ray.data._internal.execution.interfaces import RefBundle from ray.data._internal.logical.interfaces import LogicalOperator, SourceOperator -from ray.data._internal.util import unify_block_metadata_schema -from ray.data.block import Block, BlockMetadata, BlockMetadataWithSchema +from ray.data._internal.util import unify_ref_bundles_schema +from ray.data.block import ( + Block, + BlockMetadata, + BlockMetadataWithSchema, +) from ray.types import ObjectRef if TYPE_CHECKING: @@ -28,12 +32,11 @@ def __init__( len(input_metadata), ) # `owns_blocks` is False because this op may be shared by multiple Datasets. - self._schema = unify_block_metadata_schema(input_metadata) self._input_data = [ RefBundle( [(input_blocks[i], input_metadata[i])], owns_blocks=False, - schema=self._schema, + schema=input_metadata[i].schema, ) for i in range(len(input_blocks)) ] @@ -71,7 +74,7 @@ def infer_metadata(self) -> BlockMetadata: return self._cached_output_metadata def infer_schema(self): - return self._schema + return unify_ref_bundles_schema(self._input_data) def is_lineage_serializable(self) -> bool: # This operator isn't serializable because it contains ObjectRefs. 
diff --git a/python/ray/data/_internal/logical/operators/input_data_operator.py b/python/ray/data/_internal/logical/operators/input_data_operator.py index f779d582706b..373a12e84961 100644 --- a/python/ray/data/_internal/logical/operators/input_data_operator.py +++ b/python/ray/data/_internal/logical/operators/input_data_operator.py @@ -3,7 +3,7 @@ from ray.data._internal.execution.interfaces import RefBundle from ray.data._internal.logical.interfaces import LogicalOperator, SourceOperator -from ray.data._internal.util import unify_ref_bundles_schema +from ray.data._internal.util import unify_schemas_with_validation from ray.data.block import BlockMetadata @@ -49,7 +49,7 @@ def _size_bytes(self): return None def infer_schema(self): - return unify_ref_bundles_schema(self.input_data) + return unify_schemas_with_validation([data.schema for data in self.input_data]) def is_lineage_serializable(self) -> bool: # This operator isn't serializable because it contains ObjectRefs. diff --git a/python/ray/data/_internal/logical/operators/one_to_one_operator.py b/python/ray/data/_internal/logical/operators/one_to_one_operator.py index d9195aebcbdb..79da7094793a 100644 --- a/python/ray/data/_internal/logical/operators/one_to_one_operator.py +++ b/python/ray/data/_internal/logical/operators/one_to_one_operator.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional from ray.data._internal.logical.interfaces import LogicalOperator from ray.data.block import BlockMetadata @@ -83,3 +83,34 @@ def _input_files(self): assert len(self._input_dependencies) == 1, len(self._input_dependencies) assert isinstance(self._input_dependencies[0], LogicalOperator) return self._input_dependencies[0].infer_metadata().input_files + + +class Download(AbstractOneToOne): + """Logical operator for download operation.""" + + def __init__( + self, + input_op: LogicalOperator, + uri_column_name: str, + output_bytes_column_name: str, + 
ray_remote_args: Optional[Dict[str, Any]] = None, + ): + super().__init__("Download", input_op) + self._uri_column_name = uri_column_name + self._output_bytes_column_name = output_bytes_column_name + self._ray_remote_args = ray_remote_args or {} + + def can_modify_num_rows(self) -> bool: + return False + + @property + def uri_column_name(self) -> str: + return self._uri_column_name + + @property + def output_bytes_column_name(self) -> str: + return self._output_bytes_column_name + + @property + def ray_remote_args(self) -> Dict[str, Any]: + return self._ray_remote_args diff --git a/python/ray/data/_internal/logical/operators/read_operator.py b/python/ray/data/_internal/logical/operators/read_operator.py index b3ba9c42b498..aef39c554d23 100644 --- a/python/ray/data/_internal/logical/operators/read_operator.py +++ b/python/ray/data/_internal/logical/operators/read_operator.py @@ -3,7 +3,10 @@ from ray.data._internal.logical.interfaces import SourceOperator from ray.data._internal.logical.operators.map_operator import AbstractMap -from ray.data.block import BlockMetadata, BlockMetadataWithSchema +from ray.data.block import ( + BlockMetadata, + BlockMetadataWithSchema, +) from ray.data.datasource.datasource import Datasource, Reader diff --git a/python/ray/data/_internal/logical/rules/inherit_target_max_block_size.py b/python/ray/data/_internal/logical/rules/inherit_target_max_block_size.py index 298ff6c4edbf..a7d55ccb0ead 100644 --- a/python/ray/data/_internal/logical/rules/inherit_target_max_block_size.py +++ b/python/ray/data/_internal/logical/rules/inherit_target_max_block_size.py @@ -16,13 +16,13 @@ def apply(self, plan: PhysicalPlan) -> PhysicalPlan: def _propagate_target_max_block_size_to_upstream_ops( self, dag: PhysicalOperator, target_max_block_size: Optional[int] = None ): - if dag.target_max_block_size is not None: + if dag.target_max_block_size_override is not None: # Set the target block size to inherit for # upstream ops. 
- target_max_block_size = dag.target_max_block_size + target_max_block_size = dag.target_max_block_size_override elif target_max_block_size is not None: # Inherit from downstream op. - dag.set_target_max_block_size(target_max_block_size) + dag.override_target_max_block_size(target_max_block_size) for upstream_op in dag.input_dependencies: self._propagate_target_max_block_size_to_upstream_ops( diff --git a/python/ray/data/_internal/logical/rules/operator_fusion.py b/python/ray/data/_internal/logical/rules/operator_fusion.py index 13104de04be8..ba1e07797d4c 100644 --- a/python/ray/data/_internal/logical/rules/operator_fusion.py +++ b/python/ray/data/_internal/logical/rules/operator_fusion.py @@ -212,8 +212,8 @@ def _can_fuse(self, down_op: PhysicalOperator, up_op: PhysicalOperator) -> bool: return False if not self._can_merge_target_max_block_size( - up_op.target_max_block_size, - down_op.target_max_block_size, + up_op.target_max_block_size_override, + down_op.target_max_block_size_override, up_op.data_context, ): return False @@ -302,7 +302,7 @@ def _get_fused_map_operator( ) target_max_block_size = self._get_merged_target_max_block_size( - up_op.target_max_block_size, down_op.target_max_block_size + up_op.target_max_block_size_override, down_op.target_max_block_size_override ) compute = self._fuse_compute_strategy( @@ -436,7 +436,7 @@ def fused_all_to_all_transform_fn( input_op = input_deps[0] target_max_block_size = self._get_merged_target_max_block_size( - up_op.target_max_block_size, down_op.target_max_block_size + up_op.target_max_block_size_override, down_op.target_max_block_size_override ) assert up_op.data_context is down_op.data_context @@ -444,7 +444,7 @@ def fused_all_to_all_transform_fn( fused_all_to_all_transform_fn, input_op, up_op.data_context, - target_max_block_size=target_max_block_size, + target_max_block_size_override=target_max_block_size, num_outputs=down_op._num_outputs, # Transfer over the existing sub-progress bars from # the 
AllToAllOperator (if any) into the fused operator. diff --git a/python/ray/data/_internal/metadata_exporter.py b/python/ray/data/_internal/metadata_exporter.py index dfc2a60bcffc..59f90db33164 100644 --- a/python/ray/data/_internal/metadata_exporter.py +++ b/python/ray/data/_internal/metadata_exporter.py @@ -1,10 +1,9 @@ """Metadata exporter API for Ray Data datasets.""" -import json import logging import os from abc import ABC, abstractmethod -from dataclasses import dataclass, field +from dataclasses import asdict, dataclass, field, is_dataclass from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Sequence import ray @@ -13,6 +12,14 @@ check_export_api_enabled, get_export_event_logger, ) +from ray.core.generated.export_dataset_metadata_pb2 import ( + ExportDatasetMetadata as ProtoDatasetMetadata, +) +from ray.dashboard.modules.metrics.dashboards.common import Panel +from ray.dashboard.modules.metrics.dashboards.data_dashboard_panels import ( + OPERATOR_PANELS, +) +from ray.data._internal.execution.dataset_state import DatasetState from ray.data.context import DataContext if TYPE_CHECKING: @@ -60,11 +67,17 @@ class Operator: sub_stages: List of sub-stages contained within this operator. args: User-specified arguments associated with the operator, which may include configuration settings, options, or other relevant data for the operator. + execution_start_time: The timestamp when the operator execution begins. + execution_end_time: The timestamp when the operator execution ends. + state: The state of the operator. 
""" name: str id: str uuid: str + execution_start_time: Optional[float] + execution_end_time: Optional[float] + state: str input_dependencies: List[str] = field(default_factory=list) sub_stages: List[SubStage] = field(default_factory=list) args: Dict[str, Any] = field(default_factory=dict) @@ -108,6 +121,9 @@ def create_topology_metadata( op_to_id[dep] for dep in op.input_dependencies if dep in op_to_id ], args=sanitize_for_struct(op._get_logical_args()), + execution_start_time=None, + execution_end_time=None, + state=DatasetState.PENDING.name, ) # Add sub-stages if they exist @@ -131,8 +147,11 @@ class DatasetMetadata: job_id: The ID of the job running this dataset. topology: The structure of the dataset's operator DAG. dataset_id: The unique ID of the dataset. - start_time: The timestamp when the dataset execution started. + start_time: The timestamp when the dataset is registered. data_context: The DataContext attached to the dataset. + execution_start_time: The timestamp when the dataset execution starts. + execution_end_time: The timestamp when the dataset execution ends. + state: The state of the dataset. """ job_id: str @@ -140,32 +159,47 @@ class DatasetMetadata: dataset_id: str start_time: float data_context: DataContext + execution_start_time: Optional[float] + execution_end_time: Optional[float] + state: str -def _add_ellipsis(s, truncate_length): +def _add_ellipsis_for_string(s: str, truncate_length: int) -> str: if len(s) > truncate_length: return s[:truncate_length] + "..." return s def sanitize_for_struct(obj, truncate_length=DEFAULT_TRUNCATION_LENGTH): + """Prepares the obj for Struct Protobuf format by recursively + going through dictionaries, lists, etc... 
+ + - Dataclasses will be converted to dicts + - Dictionary keys will be converted to strings + - Lists, tuples, sets, bytes, bytearrays will be converted to lists + """ if isinstance(obj, Mapping): - return {k: sanitize_for_struct(v, truncate_length) for k, v in obj.items()} - elif isinstance(obj, (int, float, bool)) or obj is None: - return obj + # protobuf Struct key names must be strings. + return {str(k): sanitize_for_struct(v, truncate_length) for k, v in obj.items()} elif isinstance(obj, str): - return _add_ellipsis(obj, truncate_length) - elif isinstance(obj, Sequence): - return [sanitize_for_struct(v, truncate_length) for v in obj] + return _add_ellipsis_for_string(obj, truncate_length) + elif isinstance(obj, (Sequence, set)): + # Convert all sequence-like types (lists, tuples, sets, bytes, other sequences) to lists + res = [] + for i, v in enumerate(obj): + if i >= truncate_length: + res.append("...") + break + res.append(sanitize_for_struct(v, truncate_length)) + return res else: - # Convert unhandled types to string try: - return _add_ellipsis(json.dumps(obj), truncate_length) - except (TypeError, OverflowError): - try: - return _add_ellipsis(str(obj), truncate_length) - except Exception: - return UNKNOWN + if is_dataclass(obj): + return sanitize_for_struct(asdict(obj), truncate_length) + return _add_ellipsis_for_string(str(obj), truncate_length) + except Exception: + unk_name = f"{UNKNOWN}: {type(obj).__name__}" + return _add_ellipsis_for_string(unk_name, truncate_length) def dataset_metadata_to_proto(dataset_metadata: DatasetMetadata) -> Any: @@ -178,7 +212,6 @@ def dataset_metadata_to_proto(dataset_metadata: DatasetMetadata) -> Any: Returns: The protobuf message representing the dataset metadata. 
""" - from dataclasses import asdict from google.protobuf.struct_pb2 import Struct @@ -202,6 +235,9 @@ def dataset_metadata_to_proto(dataset_metadata: DatasetMetadata) -> Any: id=op.id, uuid=op.uuid, args=args, + execution_start_time=op.execution_start_time, + execution_end_time=op.execution_end_time, + state=ProtoOperator.OperatorState.Value(op.state), ) # Add input dependencies @@ -221,18 +257,34 @@ def dataset_metadata_to_proto(dataset_metadata: DatasetMetadata) -> Any: # Populate the data metadata proto data_context = Struct() - data_context.update(sanitize_for_struct(asdict(dataset_metadata.data_context))) + data_context.update(sanitize_for_struct(dataset_metadata.data_context)) proto_dataset_metadata = ProtoDatasetMetadata( dataset_id=dataset_metadata.dataset_id, job_id=dataset_metadata.job_id, start_time=dataset_metadata.start_time, data_context=data_context, + execution_start_time=dataset_metadata.execution_start_time, + execution_end_time=dataset_metadata.execution_end_time, + state=ProtoDatasetMetadata.DatasetState.Value(dataset_metadata.state), + operator_panels=[_to_proto_dashboard_panel(p) for p in OPERATOR_PANELS], ) proto_dataset_metadata.topology.CopyFrom(proto_topology) return proto_dataset_metadata +def _to_proto_dashboard_panel( + panel: Panel, +) -> ProtoDatasetMetadata.DashboardPanelMetadata: + """Convert Dashboard Panel to protobuf format.""" + proto_panel = ProtoDatasetMetadata.DashboardPanelMetadata( + id=str(panel.id), + title=panel.title, + ) + + return proto_panel + + def get_dataset_metadata_exporter() -> "DatasetMetadataExporter": """Get the dataset metadata exporter instance. 
diff --git a/python/ray/data/_internal/operator_event_exporter.py b/python/ray/data/_internal/operator_event_exporter.py new file mode 100644 index 000000000000..5ee60f2131b9 --- /dev/null +++ b/python/ray/data/_internal/operator_event_exporter.py @@ -0,0 +1,164 @@ +"""Exporter API for Ray Data operator events.""" + +import logging +import os +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Optional + +import ray +from ray._private.event.export_event_logger import ( + EventLogType, + check_export_api_enabled, + get_export_event_logger, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class OperatorEvent: + """Represents an Ray Data operator event, such as issue detection + + Attributes: + dataset_id: The id of the dataset. + operator_id: The id of the operator within the DAG structure, typically + incorporating a position or index (e.g., "ReadParquet_0") + operator_name: The name of the operator. + event_time: The timestamp when the event is emitted (in seconds since epoch). + event_type: The type of the event. + message: The content of the event message. + """ + + dataset_id: str + operator_id: str + operator_name: str + event_time: float + event_type: str + message: str + + +def operator_event_to_proto(operator_event: OperatorEvent) -> Any: + """Convert the operator event to a protobuf message. + + Args: + operator_event: OperatorEvent object containing the event details + + Returns: + The protobuf message representing the operator event. 
+ """ + + from ray.core.generated.export_dataset_operator_event_pb2 import ( + ExportDatasetOperatorEventData as ProtoOperatorEventData, + ) + + # Create the protobuf message + proto_operator_event_data = ProtoOperatorEventData( + dataset_id=operator_event.dataset_id, + operator_id=operator_event.operator_id, + operator_name=operator_event.operator_name, + event_time=operator_event.event_time, + event_type=ProtoOperatorEventData.DatasetOperatorEventType.Value( + operator_event.event_type + ), + message=operator_event.message, + ) + + return proto_operator_event_data + + +def format_export_issue_event_name(issue_name: str) -> str: + return "ISSUE_DETECTION_" + issue_name.upper().replace(" ", "_") + + +def get_operator_event_exporter() -> "OperatorEventExporter": + """Get the operator event exporter instance. + + Returns: + The operator event exporter instance. + """ + return LoggerOperatorEventExporter.create_if_enabled() + + +class OperatorEventExporter(ABC): + """Abstract base class for operator event exporters. + + Implementations of this interface can export Ray Data operator event to various + destinations like log files, databases, or monitoring systems. + """ + + @abstractmethod + def export_operator_event(self, operator_event: OperatorEvent) -> None: + """Export operator event to the destination. + + Args: + operator_event: OperatorEvent object containing operator event details. + """ + pass + + @classmethod + @abstractmethod + def create_if_enabled(cls) -> Optional["OperatorEventExporter"]: + """Create an event exporter instance if the export functionality is enabled. + + Returns: + An event exporter instance if enabled, none otherwise. + """ + pass + + +class LoggerOperatorEventExporter(OperatorEventExporter): + """Operator event exporter implementation that uses the Ray export event logger. + + This exporter writes operator event to log files using Ray's export event system. 
+ """ + + def __init__(self, logger: logging.Logger): + """Initialize with a configured export event logger. + + Args: + logger: The export event logger to use for writing events. + """ + self._export_logger = logger + + def export_operator_event(self, operator_event: OperatorEvent) -> None: + """Export operator event using the export event logger. + + Args: + operator_event: OperatorEvent object containing operator event details. + """ + operator_event_proto = operator_event_to_proto(operator_event) + self._export_logger.send_event(operator_event_proto) + + @classmethod + def create_if_enabled(cls) -> Optional["LoggerOperatorEventExporter"]: + """Create a logger-based exporter if the export API is enabled. + + Returns: + A LoggerOperatorEventExporter instance, none otherwise. + """ + from ray.core.generated.export_event_pb2 import ExportEvent + + is_operator_event_export_api_enabled = check_export_api_enabled( + ExportEvent.SourceType.EXPORT_DATASET_OPERATOR_EVENT + ) + if not is_operator_event_export_api_enabled: + # The export API is not enabled, so we shouldn't create an exporter + return None + + log_directory = os.path.join( + ray._private.worker._global_node.get_session_dir_path(), "logs" + ) + + try: + logger = get_export_event_logger( + EventLogType.DATASET_OPERATOR_EVENT, + log_directory, + ) + return LoggerOperatorEventExporter(logger) + except Exception: + logger.exception( + "Unable to initialize the export event logger, so no operator export " + "events will be written." 
+ ) + return None diff --git a/python/ray/data/_internal/output_buffer.py b/python/ray/data/_internal/output_buffer.py index 20265ebd35b2..f8332b30d2b8 100644 --- a/python/ray/data/_internal/output_buffer.py +++ b/python/ray/data/_internal/output_buffer.py @@ -90,7 +90,7 @@ def has_next(self) -> bool: self._exceeded_buffer_row_limit() or self._exceeded_buffer_size_limit() ) - def _exceeded_block_size_slice_limit(self, block: Block) -> bool: + def _exceeded_block_size_slice_limit(self, block: BlockAccessor) -> bool: # Slice a block to respect the target max block size. We only do this if we are # more than 50% above the target block size, because this ensures that the last # block produced will be at least half the target block size. @@ -101,7 +101,7 @@ def _exceeded_block_size_slice_limit(self, block: Block) -> bool: * self._output_block_size_option.target_max_block_size ) - def _exceeded_block_row_slice_limit(self, block: Block) -> bool: + def _exceeded_block_row_slice_limit(self, block: BlockAccessor) -> bool: # Slice a block to respect the target max rows per block. We only do this if we # are more than 50% above the target rows per block, because this ensures that # the last block produced will be at least half the target row count. 
diff --git a/python/ray/data/_internal/pandas_block.py b/python/ray/data/_internal/pandas_block.py index a95ba56691e1..1c82b10cc5c5 100644 --- a/python/ray/data/_internal/pandas_block.py +++ b/python/ray/data/_internal/pandas_block.py @@ -317,6 +317,14 @@ def select(self, columns: List[str]) -> "pandas.DataFrame": def rename_columns(self, columns_rename: Dict[str, str]) -> "pandas.DataFrame": return self._table.rename(columns=columns_rename, inplace=False, copy=False) + def upsert_column( + self, column_name: str, column_data: BlockColumn + ) -> "pandas.DataFrame": + if isinstance(column_data, (pyarrow.Array, pyarrow.ChunkedArray)): + column_data = column_data.to_pandas() + + return self._table.assign(**{column_name: column_data}) + def random_shuffle(self, random_seed: Optional[int]) -> "pandas.DataFrame": table = self._table.sample(frac=1, random_state=random_seed) table.reset_index(drop=True, inplace=True) diff --git a/python/ray/data/_internal/plan.py b/python/ray/data/_internal/plan.py index 148d144d6d10..ba4d6df6727f 100644 --- a/python/ray/data/_internal/plan.py +++ b/python/ray/data/_internal/plan.py @@ -11,10 +11,10 @@ from ray.data._internal.logical.interfaces import SourceOperator from ray.data._internal.logical.interfaces.logical_operator import LogicalOperator from ray.data._internal.logical.interfaces.logical_plan import LogicalPlan +from ray.data._internal.logical.interfaces.operator import Operator from ray.data._internal.logical.operators.read_operator import Read from ray.data._internal.stats import DatasetStats -from ray.data._internal.util import unify_ref_bundles_schema -from ray.data.block import BlockMetadataWithSchema +from ray.data.block import BlockMetadataWithSchema, _take_first_non_empty_schema from ray.data.context import DataContext from ray.data.exceptions import omit_traceback_stdout from ray.util.debug import log_once @@ -111,6 +111,54 @@ def __repr__(self) -> str: f")" ) + def explain(self) -> str: + """Return a string 
representation of the logical and physical plan.""" + from ray.data._internal.logical.optimizers import get_execution_plan + + logical_plan = self._logical_plan + logical_plan_str, _ = self.generate_plan_string(logical_plan.dag) + logical_plan_str = "-------- Logical Plan --------\n" + logical_plan_str + + physical_plan = get_execution_plan(self._logical_plan) + physical_plan_str, _ = self.generate_plan_string( + physical_plan.dag, show_op_repr=True + ) + physical_plan_str = "-------- Physical Plan --------\n" + physical_plan_str + + return logical_plan_str + physical_plan_str + + @staticmethod + def generate_plan_string( + op: Operator, + curr_str: str = "", + depth: int = 0, + including_source: bool = True, + show_op_repr: bool = False, + ): + """Traverse (DFS) the Plan DAG and + return a string representation of the operators.""" + if not including_source and isinstance(op, SourceOperator): + return curr_str, depth + + curr_max_depth = depth + + # For logical plan, only show the operator name like "Aggregate". + # But for physical plan, show the operator class name as well like "AllToAllOperator[Aggregate]". + op_str = repr(op) if show_op_repr else op.name + + if depth == 0: + curr_str += f"{op_str}\n" + else: + trailing_space = " " * ((depth - 1) * 3) + curr_str += f"{trailing_space}+- {op_str}\n" + + for input in op.input_dependencies: + curr_str, input_max_depth = ExecutionPlan.generate_plan_string( + input, curr_str, depth + 1, including_source, show_op_repr + ) + curr_max_depth = max(curr_max_depth, input_max_depth) + return curr_str, curr_max_depth + def get_plan_as_string(self, dataset_cls: Type["Dataset"]) -> str: """Create a cosmetic string representation of this execution plan. 
@@ -128,35 +176,9 @@ def get_plan_as_string(self, dataset_cls: Type["Dataset"]) -> str: plan_str = "" plan_max_depth = 0 if not self.has_computed_output(): - - def generate_logical_plan_string( - op: LogicalOperator, - curr_str: str = "", - depth: int = 0, - ): - """Traverse (DFS) the LogicalPlan DAG and - return a string representation of the operators.""" - if isinstance(op, SourceOperator): - return curr_str, depth - - curr_max_depth = depth - op_name = op.name - if depth == 0: - curr_str += f"{op_name}\n" - else: - trailing_space = " " * ((depth - 1) * 3) - curr_str += f"{trailing_space}+- {op_name}\n" - - for input in op.input_dependencies: - curr_str, input_max_depth = generate_logical_plan_string( - input, curr_str, depth + 1 - ) - curr_max_depth = max(curr_max_depth, input_max_depth) - return curr_str, curr_max_depth - - # generate_logical_plan_string(self._logical_plan.dag) - plan_str, plan_max_depth = generate_logical_plan_string( - self._logical_plan.dag + # using dataset as source here, so don't generate source operator in generate_plan_string + plan_str, plan_max_depth = self.generate_plan_string( + self._logical_plan.dag, including_source=False ) if self._snapshot_bundle is not None: @@ -378,10 +400,9 @@ def schema( iter_ref_bundles, _, executor = self.execute_to_iterator() # Make sure executor is fully shutdown upon exiting with executor: - for bundle in iter_ref_bundles: - if bundle.schema is not None: - schema = bundle.schema - break + schema = _take_first_non_empty_schema( + bundle.schema for bundle in iter_ref_bundles + ) self.cache_schema(schema) return self._schema @@ -493,9 +514,10 @@ def execute( # `List[RefBundle]` instead of `RefBundle`. Among other reasons, it'd # allow us to remove the unwrapping logic below. 
output_bundles = self._logical_plan.dag.output_data() - schema = self._logical_plan.dag.infer_schema() owns_blocks = all(bundle.owns_blocks for bundle in output_bundles) - schema = unify_ref_bundles_schema(output_bundles) + schema = _take_first_non_empty_schema( + bundle.schema for bundle in output_bundles + ) bundle = RefBundle( [ (block, metadata) diff --git a/python/ray/data/_internal/planner/aggregate.py b/python/ray/data/_internal/planner/aggregate.py index 75c42657391f..199382e226f0 100644 --- a/python/ray/data/_internal/planner/aggregate.py +++ b/python/ray/data/_internal/planner/aggregate.py @@ -52,6 +52,7 @@ def fn( metadata.extend(ref_bundle.metadata) if len(blocks) == 0: return (blocks, {}) + unified_schema = unify_ref_bundles_schema(refs) for agg_fn in aggs: agg_fn._validate(unified_schema) diff --git a/python/ray/data/_internal/planner/exchange/push_based_shuffle_task_scheduler.py b/python/ray/data/_internal/planner/exchange/push_based_shuffle_task_scheduler.py index eac3c1e88e30..901eb69bd969 100644 --- a/python/ray/data/_internal/planner/exchange/push_based_shuffle_task_scheduler.py +++ b/python/ray/data/_internal/planner/exchange/push_based_shuffle_task_scheduler.py @@ -14,7 +14,6 @@ from ray.data._internal.stats import StatsDict from ray.data._internal.util import ( convert_bytes_to_human_readable_str, - unify_schemas_with_validation, unzip, ) from ray.data.block import ( @@ -23,6 +22,7 @@ BlockExecStats, BlockMetadata, BlockMetadataWithSchema, + _take_first_non_empty_schema, to_stats, ) from ray.data.context import DataContext @@ -743,13 +743,14 @@ def _merge( del block schemas.append(meta_with_schema.schema) + schema = _take_first_non_empty_schema(iter(schemas)) + meta = BlockMetadata( num_rows=num_rows, size_bytes=size_bytes, input_files=None, exec_stats=stats.build(), ) - schema = unify_schemas_with_validation(schemas) meta_with_schema = BlockMetadataWithSchema(metadata=meta, schema=schema) yield meta_with_schema diff --git 
a/python/ray/data/_internal/planner/plan_all_to_all_op.py b/python/ray/data/_internal/planner/plan_all_to_all_op.py index 0fbef1188079..d3c4c0ae74dc 100644 --- a/python/ray/data/_internal/planner/plan_all_to_all_op.py +++ b/python/ray/data/_internal/planner/plan_all_to_all_op.py @@ -161,7 +161,6 @@ def plan_all_to_all_op( fn, input_physical_dag, data_context, - target_max_block_size=None, num_outputs=op._num_outputs, sub_progress_bar_names=op._sub_progress_bar_names, name=op.name, diff --git a/python/ray/data/_internal/planner/plan_download_op.py b/python/ray/data/_internal/planner/plan_download_op.py new file mode 100644 index 000000000000..279e90ab9470 --- /dev/null +++ b/python/ray/data/_internal/planner/plan_download_op.py @@ -0,0 +1,247 @@ +import logging +import math +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Iterator, List +from urllib.parse import urlparse + +import pyarrow as pa + +import ray +from ray.data._internal.compute import ActorPoolStrategy, TaskPoolStrategy +from ray.data._internal.execution.interfaces import PhysicalOperator +from ray.data._internal.execution.operators.map_operator import MapOperator +from ray.data._internal.execution.operators.map_transformer import ( + BlockMapTransformFn, + MapTransformer, +) +from ray.data._internal.logical.operators.one_to_one_operator import Download +from ray.data._internal.util import RetryingPyFileSystem, make_async_gen +from ray.data.block import BlockAccessor +from ray.data.context import DataContext +from ray.data.datasource.path_util import _resolve_paths_and_filesystem + +logger = logging.getLogger(__name__) + +URI_DOWNLOAD_MAX_WORKERS = 16 + + +def plan_download_op( + op: Download, + physical_children: List[PhysicalOperator], + data_context: DataContext, +) -> MapOperator: + """Plan the download operation with partitioning and downloading stages.""" + assert len(physical_children) == 1 + input_physical_dag = physical_children[0] + + upstream_op_is_download 
= False + if len(input_physical_dag._logical_operators) == 1 and isinstance( + input_physical_dag._logical_operators[0], Download + ): + upstream_op_is_download = True + + uri_column_name = op.uri_column_name + output_bytes_column_name = op.output_bytes_column_name + ray_remote_args = op.ray_remote_args + + # Import _get_udf from the main planner file + from ray.data._internal.planner.plan_udf_map_op import ( + _generate_transform_fn_for_map_batches, + _get_udf, + ) + + # If we have multiple download operators in a row, we should only include the partition actor + # at the start of the chain. This is primarily done to prevent partition actors from bottlenecking + # the chain because the interleaved operators would be a single actor. As a result, the + # URIDownloader physical operator is responsible for outputting appropriately sized blocks. + partition_map_operator = None + if not upstream_op_is_download: + # PartitionActor is a callable class, so we need ActorPoolStrategy + partition_compute = ActorPoolStrategy( + size=1 + ) # Use single actor for partitioning + + fn, init_fn = _get_udf( + PartitionActor, (), {}, (uri_column_name, data_context), {} + ) + block_fn = _generate_transform_fn_for_map_batches(fn) + partition_transform_fns = [ + BlockMapTransformFn(block_fn), + ] + partition_map_transformer = MapTransformer(partition_transform_fns, init_fn) + partition_map_operator = MapOperator.create( + partition_map_transformer, + input_physical_dag, + data_context, + name="URIPartitioner", + compute_strategy=partition_compute, # Use actor-based compute for callable class + ray_remote_args=ray_remote_args, + ) + + fn, init_fn = _get_udf( + download_bytes_threaded, + (uri_column_name, output_bytes_column_name, data_context), + {}, + None, + None, + ) + download_transform_fn = _generate_transform_fn_for_map_batches(fn) + transform_fns = [ + BlockMapTransformFn(download_transform_fn), + ] + download_map_transformer = MapTransformer(transform_fns, init_fn) +
download_compute = TaskPoolStrategy() + download_map_operator = MapOperator.create( + download_map_transformer, + partition_map_operator if partition_map_operator else input_physical_dag, + data_context, + name="URIDownloader", + compute_strategy=download_compute, + ray_remote_args=ray_remote_args, + ) + + return download_map_operator + + +def uri_to_path(uri: str) -> str: + """Convert a URI to a filesystem path.""" + # TODO(mowen): urlparse might be slow. in the future we could use a faster alternative. + parsed = urlparse(uri) + if parsed.scheme == "file": + return parsed.path + return parsed.netloc + parsed.path + + +def _arrow_batcher(table: pa.Table, output_batch_size: int): + """Batch a PyArrow table into smaller tables of size n using zero-copy slicing.""" + num_rows = table.num_rows + for i in range(0, num_rows, output_batch_size): + end_idx = min(i + output_batch_size, num_rows) + # Use PyArrow's zero-copy slice operation + batch_table = table.slice(i, end_idx - i) + yield batch_table + + +def download_bytes_threaded( + block: pa.Table, + uri_column_name: str, + output_bytes_column_name, + data_context: DataContext, +) -> Iterator[pa.Table]: + """Optimized version that uses make_async_gen for concurrent downloads.""" + if not isinstance(block, pa.Table): + block = BlockAccessor.for_block(block).to_arrow() + + # Extract URIs from PyArrow table + uris = block.column(uri_column_name).to_pylist() + + if len(uris) == 0: + yield block + return + + paths, fs = _resolve_paths_and_filesystem(uris) + fs = RetryingPyFileSystem.wrap(fs, retryable_errors=data_context.retried_io_errors) + + def load_uri_bytes(uri_path_iterator): + """Function that takes an iterator of URI paths and yields downloaded bytes for each.""" + for uri_path in uri_path_iterator: + with fs.open_input_file(uri_path) as f: + yield f.read() + + # Use make_async_gen to download URI bytes concurrently + # This preserves the order of results to match the input URIs + uri_bytes = list( + 
make_async_gen( + base_iterator=iter(paths), + fn=load_uri_bytes, + preserve_ordering=True, + num_workers=URI_DOWNLOAD_MAX_WORKERS, + ) + ) + + # Add the new column to the PyArrow table + output_block = block.add_column( + len(block.column_names), output_bytes_column_name, pa.array(uri_bytes) + ) + output_block_size = output_block.nbytes + ctx = ray.data.context.DatasetContext.get_current() + max_bytes = ctx.target_max_block_size + if max_bytes is not None and output_block_size > max_bytes: + num_blocks = math.ceil(output_block_size / max_bytes) + num_rows = output_block.num_rows + yield from _arrow_batcher(output_block, int(math.ceil(num_rows / num_blocks))) + else: + yield output_block + + +class PartitionActor: + """Actor that partitions download operations based on estimated file sizes.""" + + INIT_SAMPLE_BATCH_SIZE = 25 + + def __init__(self, uri_column_name: str, data_context: DataContext): + self._uri_column_name = uri_column_name + self._data_context = data_context + self._batch_size_estimate = None + + def _sample_sizes(self, uris: List[str]) -> List[int]: + """Fetch file sizes in parallel using ThreadPoolExecutor.""" + + def get_file_size(uri_path, fs): + try: + return fs.get_file_info(uri_path).size + except Exception: + return None + + # If no URIs, return empty list + if not uris: + return [] + + # Get the filesystem from the first URI + paths, fs = _resolve_paths_and_filesystem(uris) + fs = RetryingPyFileSystem.wrap( + fs, retryable_errors=self._data_context.retried_io_errors + ) + + # Use ThreadPoolExecutor for concurrent size fetching + file_sizes = [] + with ThreadPoolExecutor(max_workers=URI_DOWNLOAD_MAX_WORKERS) as executor: + # Submit all size fetch tasks + futures = [ + executor.submit(get_file_size, uri_path, fs) for uri_path in paths + ] + + # Collect results as they complete (order doesn't matter) + for future in as_completed(futures): + try: + size = future.result() + if size is not None: + file_sizes.append(size) + except Exception as e: + 
logger.warning(f"Error fetching file size for download: {e}") + + return file_sizes + + def __call__(self, block: pa.Table) -> Iterator[pa.Table]: + if not isinstance(block, pa.Table): + block = BlockAccessor.for_block(block).to_arrow() + + if self._batch_size_estimate is None: + # Extract URIs from PyArrow table for sampling + uris = block.column(self._uri_column_name).to_pylist() + sample_uris = uris[: self.INIT_SAMPLE_BATCH_SIZE] + file_sizes = self._sample_sizes(sample_uris) + if not file_sizes or sum(file_sizes) == 0: + # Fallback to incoming block size if no file sizes could be determined + # or if the total size sampled is 0 + logger.warning( + "No file sizes could be determined, using incoming block size" + ) + self._batch_size_estimate = block.num_rows + else: + file_size_estimate = sum(file_sizes) / len(file_sizes) + ctx = ray.data.context.DatasetContext.get_current() + max_bytes = ctx.target_max_block_size + self._batch_size_estimate = math.floor(max_bytes / file_size_estimate) + + yield from _arrow_batcher(block, self._batch_size_estimate) diff --git a/python/ray/data/_internal/planner/plan_read_op.py b/python/ray/data/_internal/planner/plan_read_op.py index 360dce30cea8..34bb16e49f67 100644 --- a/python/ray/data/_internal/planner/plan_read_op.py +++ b/python/ray/data/_internal/planner/plan_read_op.py @@ -42,7 +42,7 @@ def _derive_metadata(read_task: ReadTask, read_task_ref: ObjectRef) -> BlockMeta warnings.warn( "The serialized size of your read function named " f"'{read_task.read_fn.__name__}' is {memory_string(task_size)}. This size " - "relatively large. As a result, Ray might excessively " + "is relatively large. As a result, Ray might excessively " "spill objects during execution. To fix this issue, avoid accessing " f"`self` or other large objects in '{read_task.read_fn.__name__}'." 
) diff --git a/python/ray/data/_internal/planner/plan_udf_map_op.py b/python/ray/data/_internal/planner/plan_udf_map_op.py index 945d67717f6b..3809bcafd8a5 100644 --- a/python/ray/data/_internal/planner/plan_udf_map_op.py +++ b/python/ray/data/_internal/planner/plan_udf_map_op.py @@ -5,7 +5,17 @@ import queue from threading import Thread from types import GeneratorType -from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, TypeVar +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Optional, + Tuple, + TypeVar, +) import numpy as np import pandas as pd @@ -118,19 +128,14 @@ def fn(block: Block) -> Block: # 1. evaluate / add expressions if exprs: block_accessor = BlockAccessor.for_block(block) - new_columns = {} - for col_name in block_accessor.column_names(): - # For Arrow blocks, block[col_name] gives us a ChunkedArray - # For Pandas blocks, block[col_name] gives us a Series - new_columns[col_name] = block[col_name] - # Add/update with expression results + result_block = block for name, expr in exprs.items(): - result = eval_expr(expr, block) - new_columns[name] = result + result = eval_expr(expr, result_block) + result_block_accessor = BlockAccessor.for_block(result_block) + result_block = result_block_accessor.upsert_column(name, result) - # Create a new block from the combined columns and add it - block = BlockAccessor.batch_to_block(new_columns) + block = result_block # 2. 
(optional) column projection if columns: @@ -144,7 +149,7 @@ def fn(block: Block) -> Block: return block except Exception as e: - _try_wrap_udf_exception(e, block) + _try_wrap_udf_exception(e) compute = get_compute(op._compute) transform_fn = _generate_transform_fn_for_map_block(fn) @@ -203,7 +208,7 @@ def filter_batch_fn(block: "pa.Table") -> "pa.Table": try: return block.filter(expression) except Exception as e: - _try_wrap_udf_exception(e, block) + _try_wrap_udf_exception(e) transform_fn = _generate_transform_fn_for_map_batches(filter_batch_fn) map_transformer = _create_map_transformer_for_map_batches_op( @@ -213,7 +218,14 @@ def filter_batch_fn(block: "pa.Table") -> "pa.Table": zero_copy_batch=True, ) else: - filter_fn, init_fn = _get_udf(op) + udf_is_callable_class = isinstance(op._fn, CallableClass) + filter_fn, init_fn = _get_udf( + op._fn, + op._fn_args, + op._fn_kwargs, + op._fn_constructor_args if udf_is_callable_class else None, + op._fn_constructor_kwargs if udf_is_callable_class else None, + ) transform_fn = _generate_transform_fn_for_filter(filter_fn) map_transformer = _create_map_transformer_for_row_based_map_op( transform_fn, init_fn @@ -244,7 +256,14 @@ def plan_udf_map_op( input_physical_dag = physical_children[0] compute = get_compute(op._compute) - fn, init_fn = _get_udf(op) + udf_is_callable_class = isinstance(op._fn, CallableClass) + fn, init_fn = _get_udf( + op._fn, + op._fn_args, + op._fn_kwargs, + op._fn_constructor_args if udf_is_callable_class else None, + op._fn_constructor_kwargs if udf_is_callable_class else None, + ) if isinstance(op, MapBatches): transform_fn = _generate_transform_fn_for_map_batches(fn) @@ -280,17 +299,23 @@ def plan_udf_map_op( ) -def _get_udf(op: AbstractUDFMap): +def _get_udf( + op_fn: Callable, + op_fn_args: Tuple[Any, ...], + op_fn_kwargs: Dict[str, Any], + op_fn_constructor_args: Optional[Tuple[Any, ...]], + op_fn_constructor_kwargs: Optional[Dict[str, Any]], +): # Note, it's important to define these 
standalone variables. # So the parsed functions won't need to capture the entire operator, which may not # be serializable. - udf = op._fn - fn_args = op._fn_args or () - fn_kwargs = op._fn_kwargs or {} + udf = op_fn + fn_args = op_fn_args or () + fn_kwargs = op_fn_kwargs or {} if isinstance(udf, CallableClass): - fn_constructor_args = op._fn_constructor_args or () - fn_constructor_kwargs = op._fn_constructor_kwargs or {} + fn_constructor_args = op_fn_constructor_args or () + fn_constructor_kwargs = op_fn_constructor_kwargs or {} is_async_udf = _is_async_udf(udf.__call__) @@ -323,7 +348,7 @@ async def _wrapped_udf_map_fn(item: Any) -> Any: **fn_kwargs, ) except Exception as e: - _try_wrap_udf_exception(e, item) + _try_wrap_udf_exception(e) elif inspect.isasyncgenfunction(udf.__call__): @@ -358,7 +383,7 @@ def _wrapped_udf_map_fn(item: Any) -> Any: **fn_kwargs, ) except Exception as e: - _try_wrap_udf_exception(e, item) + _try_wrap_udf_exception(e) else: @@ -366,7 +391,7 @@ def _wrapped_udf_map_fn(item: Any) -> Any: try: return udf(item, *fn_args, **fn_kwargs) except Exception as e: - _try_wrap_udf_exception(e, item) + _try_wrap_udf_exception(e) def init_fn(): pass @@ -378,14 +403,11 @@ def _try_wrap_udf_exception(e: Exception, item: Any = None): """If the Ray Debugger is enabled, keep the full stack trace unmodified so that the debugger can stop at the initial unhandled exception. Otherwise, clear the stack trace to omit noisy internal code path.""" - error_message = f"Failed to process the following data block: {item}" - ctx = ray.data.DataContext.get_current() if _is_ray_debugger_post_mortem_enabled() or ctx.raise_original_map_exception: - logger.error(error_message) raise e else: - raise UserCodeException(error_message) from e + raise UserCodeException("UDF failed to process a data block.") from e # Following are util functions for converting UDFs to `MapTransformCallable`s. 
diff --git a/python/ray/data/_internal/planner/planner.py b/python/ray/data/_internal/planner/planner.py index 3b42c4ab7f91..5cc78fb7aa3a 100644 --- a/python/ray/data/_internal/planner/planner.py +++ b/python/ray/data/_internal/planner/planner.py @@ -31,11 +31,12 @@ StreamingRepartition, ) from ray.data._internal.logical.operators.n_ary_operator import Union, Zip -from ray.data._internal.logical.operators.one_to_one_operator import Limit +from ray.data._internal.logical.operators.one_to_one_operator import Download, Limit from ray.data._internal.logical.operators.read_operator import Read from ray.data._internal.logical.operators.streaming_split_operator import StreamingSplit from ray.data._internal.logical.operators.write_operator import Write from ray.data._internal.planner.plan_all_to_all_op import plan_all_to_all_op +from ray.data._internal.planner.plan_download_op import plan_download_op from ray.data._internal.planner.plan_read_op import plan_read_op from ray.data._internal.planner.plan_udf_map_op import ( plan_filter_op, @@ -157,6 +158,7 @@ class Planner: StreamingRepartition: plan_streaming_repartition_op, Join: plan_join_op, StreamingSplit: plan_streaming_split_op, + Download: plan_download_op, } def plan(self, logical_plan: LogicalPlan) -> PhysicalPlan: @@ -214,8 +216,11 @@ def _plan_recursively( break curr_physical_op.set_logical_operators(logical_op) - queue.extend(physical_op.input_dependencies) + # Add this operator to the op_map so optimizer can find it + op_map[curr_physical_op] = logical_op + queue.extend(curr_physical_op.input_dependencies) + # Also add the final operator (in case the loop didn't catch it) op_map[physical_op] = logical_op return physical_op, op_map diff --git a/python/ray/data/_internal/planner/random_shuffle.py b/python/ray/data/_internal/planner/random_shuffle.py index 8a78fa587840..b698a3ecc91b 100644 --- a/python/ray/data/_internal/planner/random_shuffle.py +++ b/python/ray/data/_internal/planner/random_shuffle.py @@ -54,7 
+54,7 @@ def fn( # overhead. This can be removed once dynamic block splitting is # supported for all-to-all ops. # See https://github.com/ray-project/ray/issues/40518. - map_transformer.set_target_max_block_size(float("inf")) + map_transformer.override_target_max_block_size(float("inf")) def upstream_map_fn(blocks): return map_transformer.apply_transform(blocks, ctx) @@ -64,7 +64,7 @@ def upstream_map_fn(blocks): ray_remote_args = ctx.upstream_map_ray_remote_args shuffle_spec = ShuffleTaskSpec( - ctx.target_max_block_size, + ctx.target_max_block_size_override, random_shuffle=True, random_seed=seed, upstream_map_fn=upstream_map_fn, diff --git a/python/ray/data/_internal/planner/repartition.py b/python/ray/data/_internal/planner/repartition.py index 6ba3afd0e147..5a119f540c83 100644 --- a/python/ray/data/_internal/planner/repartition.py +++ b/python/ray/data/_internal/planner/repartition.py @@ -48,13 +48,13 @@ def shuffle_repartition_fn( # overhead. This can be removed once dynamic block splitting is # supported for all-to-all ops. # See https://github.com/ray-project/ray/issues/40518. 
- map_transformer.set_target_max_block_size(float("inf")) + map_transformer.override_target_max_block_size(float("inf")) def upstream_map_fn(blocks): return map_transformer.apply_transform(blocks, ctx) shuffle_spec = ShuffleTaskSpec( - ctx.target_max_block_size, + ctx.target_max_block_size_override, random_shuffle=False, upstream_map_fn=upstream_map_fn, ) @@ -77,7 +77,9 @@ def split_repartition_fn( refs: List[RefBundle], ctx: TaskContext, ) -> AllToAllTransformFnResult: - shuffle_spec = ShuffleTaskSpec(ctx.target_max_block_size, random_shuffle=False) + shuffle_spec = ShuffleTaskSpec( + ctx.target_max_block_size_override, random_shuffle=False + ) scheduler = SplitRepartitionTaskScheduler(shuffle_spec) return scheduler.execute(refs, num_outputs, ctx) diff --git a/python/ray/data/_internal/planner/sort.py b/python/ray/data/_internal/planner/sort.py index ece9a76e01ce..852154c66c36 100644 --- a/python/ray/data/_internal/planner/sort.py +++ b/python/ray/data/_internal/planner/sort.py @@ -36,6 +36,7 @@ def fn( blocks.extend(ref_bundle.block_refs) if len(blocks) == 0: return (blocks, {}) + sort_key.validate_schema(unify_ref_bundles_schema(refs)) num_mappers = len(blocks) diff --git a/python/ray/data/_internal/stats.py b/python/ray/data/_internal/stats.py index 1b129f81c5c0..088acffc2e0f 100644 --- a/python/ray/data/_internal/stats.py +++ b/python/ray/data/_internal/stats.py @@ -1,5 +1,5 @@ import collections -import enum +import copy import logging import threading import time @@ -14,6 +14,7 @@ import ray from ray.actor import ActorHandle from ray.data._internal.block_list import BlockList +from ray.data._internal.execution.dataset_state import DatasetState from ray.data._internal.execution.interfaces.op_runtime_metrics import ( NODE_UNKNOWN, MetricsGroup, @@ -21,7 +22,11 @@ NodeMetrics, OpRuntimeMetrics, ) -from ray.data._internal.metadata_exporter import Topology, get_dataset_metadata_exporter +from ray.data._internal.metadata_exporter import ( + DatasetMetadata, + 
Topology, + get_dataset_metadata_exporter, +) from ray.data._internal.util import capfirst from ray.data.block import BlockStats from ray.data.context import DataContext @@ -158,7 +163,6 @@ def __init__(self, max_stats=1000): self.last_time = {} self.start_time = {} self.max_stats = max_stats - self.fifo_queue = [] # Assign dataset uuids with a global counter. self.next_dataset_id = 0 @@ -170,6 +174,11 @@ def __init__(self, max_stats=1000): # Initialize the metadata exporter self._metadata_exporter = get_dataset_metadata_exporter() + self.dataset_metadatas: Dict[str, DatasetMetadata] = {} + + # A FIFO queue of dataset_tags for finished datasets. This is used to + # efficiently evict the oldest finished datasets when max_stats is reached. + self.finished_datasets_queue = collections.deque() # Ray Data dashboard metrics # Everything is a gauge because we need to reset all of @@ -274,6 +283,12 @@ def __init__(self, max_stats=1000): description="Seconds user thread is blocked by iter_batches()", tag_keys=iter_tag_keys, ) + self.time_to_first_batch_s = Gauge( + "data_iter_time_to_first_batch_seconds", + description="Total time spent waiting for the first batch after starting iteration. " + "This includes the dataset pipeline warmup time. 
This metric is accumulated across different epochs.", + tag_keys=iter_tag_keys, + ) self.iter_user_s = Gauge( "data_iter_user_seconds", description="Seconds spent in user code", @@ -463,6 +478,7 @@ def update_iteration_metrics( ): tags = self._create_tags(dataset_tag) self.iter_total_blocked_s.set(stats.iter_total_blocked_s.get(), tags) + self.time_to_first_batch_s.set(stats.iter_time_to_first_batch_s.get(), tags) self.iter_user_s.set(stats.iter_user_s.get(), tags) self.iter_initialize_s.set(stats.iter_initialize_s.get(), tags) @@ -477,7 +493,7 @@ def register_dataset( start_time = time.time() self.datasets[dataset_tag] = { "job_id": job_id, - "state": DatasetState.RUNNING.name, + "state": DatasetState.PENDING.name, "progress": 0, "total": 0, "total_rows": 0, @@ -485,7 +501,7 @@ def register_dataset( "end_time": None, "operators": { operator: { - "state": DatasetState.RUNNING.name, + "state": DatasetState.PENDING.name, "progress": 0, "total": 0, "queued_blocks": 0, @@ -494,16 +510,19 @@ def register_dataset( }, } if self._metadata_exporter is not None: - from ray.data._internal.metadata_exporter import DatasetMetadata - - dataset_metadata = DatasetMetadata( + self.dataset_metadatas[dataset_tag] = DatasetMetadata( job_id=job_id, topology=topology, dataset_id=dataset_tag, start_time=start_time, data_context=data_context, + execution_start_time=None, + execution_end_time=None, + state=DatasetState.PENDING.name, + ) + self._metadata_exporter.export_dataset_metadata( + self.dataset_metadatas[dataset_tag] ) - self._metadata_exporter.export_dataset_metadata(dataset_metadata) def update_dataset(self, dataset_tag: str, state: Dict[str, Any]): self.datasets[dataset_tag].update(state) @@ -527,8 +546,10 @@ def update_dataset(self, dataset_tag: str, state: Dict[str, Any]): state_string = state.get("state", DatasetState.UNKNOWN.name) state_enum = DatasetState.from_string(state_string) self.data_dataset_state.set(state_enum.value, dataset_tags) + 
self.update_dataset_metadata_state(dataset_tag, state_string) # Update operator-level metrics + operator_states: Dict[str, str] = {} for operator, op_state in state.get("operators", {}).items(): operator_tags = { "dataset": dataset_tag, @@ -548,12 +569,87 @@ def update_dataset(self, dataset_tag: str, state: Dict[str, Any]): state_string = op_state.get("state", DatasetState.UNKNOWN.name) state_enum = DatasetState.from_string(state_string) self.data_operator_state.set(state_enum.value, operator_tags) + operator_states[operator] = state_string + + self.update_dataset_metadata_operator_states(dataset_tag, operator_states) + + # Evict the oldest finished datasets to ensure the `max_stats` limit is enforced. + if state["state"] in {DatasetState.FINISHED.name, DatasetState.FAILED.name}: + self.finished_datasets_queue.append(dataset_tag) + while len(self.datasets) > self.max_stats and self.finished_datasets_queue: + tag_to_evict = self.finished_datasets_queue.popleft() + self.datasets.pop(tag_to_evict, None) + self.dataset_metadatas.pop(tag_to_evict, None) def get_datasets(self, job_id: Optional[str] = None): if not job_id: return self.datasets return {k: v for k, v in self.datasets.items() if v["job_id"] == job_id} + def update_dataset_metadata_state(self, dataset_id: str, new_state: str): + if dataset_id not in self.dataset_metadatas: + return + update_time = time.time() + dataset_metadata = self.dataset_metadatas[dataset_id] + if dataset_metadata.state == new_state: + return + updated_dataset_metadata = copy.deepcopy(dataset_metadata) + updated_dataset_metadata.state = new_state + if new_state == DatasetState.RUNNING.name: + updated_dataset_metadata.execution_start_time = update_time + elif new_state in (DatasetState.FINISHED.name, DatasetState.FAILED.name): + updated_dataset_metadata.execution_end_time = update_time + # Update metadata of running operators + for operator in updated_dataset_metadata.topology.operators: + if operator.state == DatasetState.RUNNING.name: + 
operator.state = new_state + operator.execution_end_time = update_time + + self.dataset_metadatas[dataset_id] = updated_dataset_metadata + self._metadata_exporter.export_dataset_metadata(updated_dataset_metadata) + + def update_dataset_metadata_operator_states( + self, dataset_id: str, operator_states: Dict[str, str] + ): + if dataset_id not in self.dataset_metadatas: + return + + dataset_metadata = self.dataset_metadatas[dataset_id] + update_needed = False + for operator in dataset_metadata.topology.operators: + if ( + operator.id in operator_states + and operator.state != operator_states[operator.id] + ): + update_needed = True + break + + if not update_needed: + return + + updated_dataset_metadata = copy.deepcopy(dataset_metadata) + update_time = time.time() + for operator in updated_dataset_metadata.topology.operators: + if operator.id in operator_states: + new_state = operator_states[operator.id] + if operator.state == new_state: + continue + operator.state = new_state + if new_state == DatasetState.RUNNING.name: + operator.execution_start_time = update_time + elif new_state in ( + DatasetState.FINISHED.name, + DatasetState.FAILED.name, + ): + operator.execution_end_time = update_time + # Handle outlier case for InputDataBuffer, which is marked as finished immediately and does not have a RUNNING state. 
+ # Set the execution time the same as its end time + if not operator.execution_start_time: + operator.execution_start_time = update_time + + self.dataset_metadatas[dataset_id] = updated_dataset_metadata + self._metadata_exporter.export_dataset_metadata(updated_dataset_metadata) + def _create_tags( self, dataset_tag: str, @@ -633,7 +729,9 @@ def __init__(self): self._update_thread: Optional[threading.Thread] = None self._update_thread_lock: threading.Lock = threading.Lock() - def _get_stats_actor(self, skip_cache: bool = False) -> Optional[ActorHandle]: + def _get_or_create_stats_actor( + self, skip_cache: bool = False + ) -> Optional[ActorHandle]: if ray._private.worker._global_node is None: raise RuntimeError( "Global node is not initialized. Driver might be not connected to Ray." @@ -650,27 +748,13 @@ def _get_stats_actor(self, skip_cache: bool = False) -> Optional[ActorHandle]: self._stats_actor_handle = ray.get_actor( name=STATS_ACTOR_NAME, namespace=STATS_ACTOR_NAMESPACE ) + self._stats_actor_cluster_id = current_cluster_id except ValueError: - return None - self._stats_actor_cluster_id = current_cluster_id - - return self._stats_actor_handle - - def _get_or_create_stats_actor(self) -> Optional[ActorHandle]: - if ray._private.worker._global_node is None: - raise RuntimeError( - "Global node is not initialized. Driver might be not connected to Ray." - ) - - # NOTE: In some cases (for ex, when registering dataset) actor might be gone - # (for ex, when prior driver disconnects) and therefore to avoid using - # stale handle we force looking up the actor with Ray to determine if - # we should create a new one. 
- actor = self._get_stats_actor(skip_cache=True) - - if actor is None: - self._stats_actor_handle = _get_or_create_stats_actor() - self._stats_actor_cluster_id = ray._private.worker._global_node.cluster_id + # Create an actor if it doesn't exist + self._stats_actor_handle = _get_or_create_stats_actor() + self._stats_actor_cluster_id = ( + ray._private.worker._global_node.cluster_id + ) return self._stats_actor_handle @@ -684,11 +768,7 @@ def _run_update_loop(): while True: if self._last_iteration_stats or self._last_execution_stats: try: - # Do not create _StatsActor if it doesn't exist because - # this thread can be running even after the cluster is - # shutdown. Creating an actor will automatically start - # a new cluster. - stats_actor = self._get_stats_actor() + stats_actor = self._get_or_create_stats_actor() if stats_actor is None: continue stats_actor.update_metrics.remote( @@ -806,7 +886,14 @@ def register_dataset_to_stats_actor( topology: Optional Topology representing the DAG structure to export data_context: The DataContext attached to the dataset """ - self._get_or_create_stats_actor().register_dataset.remote( + + # NOTE: In some cases (for ex, when registering dataset) actor might be gone + # (for ex, when prior driver disconnects) and therefore to avoid using + # stale handle we force looking up the actor with Ray to determine if + # we should create a new one. 
+ stats_actor = self._get_or_create_stats_actor(skip_cache=True) + + stats_actor.register_dataset.remote( ray.get_runtime_context().get_job_id(), dataset_tag, operator_tags, @@ -816,7 +903,13 @@ def register_dataset_to_stats_actor( def get_dataset_id_from_stats_actor(self) -> str: try: - return ray.get(self._get_or_create_stats_actor().get_dataset_id.remote()) + # NOTE: In some cases (for ex, when registering dataset) actor might be gone + # (for ex, when prior driver disconnects) and therefore to avoid using + # stale handle we force looking up the actor with Ray to determine if + # we should create a new one. + stats_actor = self._get_or_create_stats_actor(skip_cache=True) + + return ray.get(stats_actor.get_dataset_id.remote()) except Exception: # Getting dataset id from _StatsActor may fail, in this case # fall back to uuid4 @@ -826,26 +919,6 @@ def get_dataset_id_from_stats_actor(self) -> str: StatsManager = _StatsManager() -class DatasetState(enum.IntEnum): - """Enum representing the possible states of a dataset during execution.""" - - UNKNOWN = 0 - RUNNING = 1 - FINISHED = 2 - FAILED = 3 - - def __str__(self): - return self.name - - @classmethod - def from_string(cls, text): - """Get enum by name.""" - try: - return cls[text] # This uses the name to lookup the enum - except KeyError: - return cls.UNKNOWN - - class DatasetStats: """Holds the execution times for a given Dataset. 
@@ -893,6 +966,7 @@ def __init__( self.iter_format_batch_s: Timer = Timer() self.iter_collate_batch_s: Timer = Timer() self.iter_finalize_batch_s: Timer = Timer() + self.iter_time_to_first_batch_s: Timer = Timer() self.iter_total_blocked_s: Timer = Timer() self.iter_user_s: Timer = Timer() self.iter_initialize_s: Timer = Timer() @@ -932,14 +1006,6 @@ def to_summary(self) -> "DatasetStatsSummary": object, which can be used to generate a summary string.""" operators_stats = [] is_sub_operator = len(self.metadata) > 1 - for name, stats in self.metadata.items(): - operators_stats.append( - OperatorStatsSummary.from_block_metadata( - name, - stats, - is_sub_operator=is_sub_operator, - ) - ) iter_stats = IterStatsSummary( self.iter_wait_s, @@ -948,6 +1014,7 @@ def to_summary(self) -> "DatasetStatsSummary": self.iter_format_batch_s, self.iter_collate_batch_s, self.iter_finalize_batch_s, + self.iter_time_to_first_batch_s, self.iter_total_blocked_s, self.iter_user_s, self.iter_initialize_s, @@ -957,9 +1024,56 @@ def to_summary(self) -> "DatasetStatsSummary": self.iter_blocks_remote, self.iter_unknown_location, ) + stats_summary_parents = [] if self.parents is not None: stats_summary_parents = [p.to_summary() for p in self.parents] + + # Collect the sum of the final output row counts from all parent nodes + parent_total_output = 0 + for i, parent_summary in enumerate(stats_summary_parents): + if parent_summary.operators_stats: + # Get the last operator stats from the current parent summary + last_parent_op = parent_summary.operators_stats[-1] + # Extract output row count (handle dict type with "sum" key) + op_output = ( + last_parent_op.output_num_rows.get("sum", 0) + if isinstance(last_parent_op.output_num_rows, dict) + else 0 + ) + logger.debug( + f"Parent {i + 1} (operator: {last_parent_op.operator_name}) contributes {op_output} rows to input" + ) + parent_total_output += op_output + + # Create temporary operator stats objects from block metadata + op_stats = [ + 
OperatorStatsSummary.from_block_metadata( + name, stats, is_sub_operator=is_sub_operator + ) + for name, stats in self.metadata.items() + ] + + for i, op_stat in enumerate(op_stats): + # For sub-operators: inherit input based on the order in the current list + if is_sub_operator: + if i == 0: + # Input of the first sub-operator is the total output from parent nodes + op_stat.total_input_num_rows = parent_total_output + else: + # Input of subsequent sub-operators is the output of the previous sub-operator + prev_op = op_stats[i - 1] + op_stat.total_input_num_rows = ( + prev_op.output_num_rows["sum"] + if ( + prev_op.output_num_rows and "sum" in prev_op.output_num_rows + ) + else 0 + ) + else: + # Single operator scenario: input rows = total output from all parent nodes + op_stat.total_input_num_rows = parent_total_output + operators_stats.append(op_stat) streaming_exec_schedule_s = ( self.streaming_exec_schedule_s.get() if self.streaming_exec_schedule_s @@ -1261,6 +1375,8 @@ class OperatorStatsSummary: udf_time: Optional[Dict[str, float]] = None # memory: no "sum" stat memory: Optional[Dict[str, float]] = None + # Use the output_num_rows of the parent Operator as output_num_rows + total_input_num_rows: Optional[int] = None output_num_rows: Optional[Dict[str, float]] = None output_size_bytes: Optional[Dict[str, float]] = None # node_count: "count" stat instead of "sum" @@ -1395,6 +1511,9 @@ def from_block_metadata( "count": len(node_counts), } + # Assign a value in to_summary and initialize it as None. 
+ total_input_num_rows = None + return OperatorStatsSummary( operator_name=operator_name, is_sub_operator=is_sub_operator, @@ -1406,6 +1525,7 @@ def from_block_metadata( cpu_time=cpu_stats, udf_time=udf_stats, memory=memory_stats, + total_input_num_rows=total_input_num_rows, output_num_rows=output_num_rows_stats, output_size_bytes=output_size_bytes_stats, node_count=node_counts_stats, @@ -1519,9 +1639,18 @@ def __str__(self) -> str: # total number of rows produced by the sum of the wall times across all # blocks of the operator. This assumes that on a single node the work done # would be equivalent, with no concurrency. + total_num_in_rows = ( + self.total_input_num_rows if self.total_input_num_rows else 0 + ) total_num_out_rows = output_num_rows_stats["sum"] out += indent out += "* Operator throughput:\n" + out += ( + indent + "\t* Total input num rows:" f" {total_num_in_rows} " "rows\n" + ) + out += ( + indent + "\t* Total output num rows:" f" {total_num_out_rows} " "rows\n" + ) out += ( indent + "\t* Ray Data throughput:" f" {total_num_out_rows / self.time_total_s} " @@ -1587,6 +1716,8 @@ class IterStatsSummary: collate_time: Timer # Time spent in finalize_fn, in seconds finalize_batch_time: Timer + # Time user thread is blocked waiting for first batch + time_to_first_batch: Timer # Total time user thread is blocked by iter_batches block_time: Timer # Time spent in user code, in seconds @@ -1610,6 +1741,7 @@ def to_string(self) -> str: out = "" if ( self.block_time.get() + or self.time_to_first_batch.get() or self.total_time.get() or self.get_time.get() or self.next_time.get() @@ -1630,6 +1762,11 @@ def to_string(self) -> str: " * Total time user thread is blocked by Ray Data iter_batches: " "{}\n".format(fmt(self.block_time.get())) ) + if self.time_to_first_batch.get(): + out += ( + " * Total time spent waiting for the first batch after starting iteration: " + "{}\n".format(fmt(self.time_to_first_batch.get())) + ) if self.user_time.get(): out += " * Total 
execution time for user thread: {}\n".format( fmt(self.user_time.get()) diff --git a/python/ray/data/_internal/util.py b/python/ray/data/_internal/util.py index d5220441f9f7..3db54be3d547 100644 --- a/python/ray/data/_internal/util.py +++ b/python/ray/data/_internal/util.py @@ -571,7 +571,7 @@ def get_compute_strategy( fn: "UserDefinedFunction", fn_constructor_args: Optional[Iterable[Any]] = None, compute: Optional[Union[str, "ComputeStrategy"]] = None, - concurrency: Optional[Union[int, Tuple[int, int]]] = None, + concurrency: Optional[Union[int, Tuple[int, int], Tuple[int, int, int]]] = None, ) -> "ComputeStrategy": """Get `ComputeStrategy` based on the function or class, and concurrency information. @@ -630,26 +630,34 @@ def get_compute_strategy( return compute elif concurrency is not None: if isinstance(concurrency, tuple): - if ( - len(concurrency) == 2 - and isinstance(concurrency[0], int) - and isinstance(concurrency[1], int) + # Validate tuple length and that all elements are integers + if len(concurrency) not in (2, 3) or not all( + isinstance(c, int) for c in concurrency ): - if is_callable_class: - return ActorPoolStrategy( - min_size=concurrency[0], max_size=concurrency[1] - ) - else: - raise ValueError( - "``concurrency`` is set as a tuple of integers, but ``fn`` " - f"is not a callable class: {fn}. Use ``concurrency=n`` to " - "control maximum number of workers to use." - ) - else: raise ValueError( "``concurrency`` is expected to be set as a tuple of " f"integers, but got: {concurrency}." ) + + # Check if function is callable class (common validation) + if not is_callable_class: + raise ValueError( + "``concurrency`` is set as a tuple of integers, but ``fn`` " + f"is not a callable class: {fn}. Use ``concurrency=n`` to " + "control maximum number of workers to use." 
+ ) + + # Create ActorPoolStrategy based on tuple length + if len(concurrency) == 2: + return ActorPoolStrategy( + min_size=concurrency[0], max_size=concurrency[1] + ) + else: # len(concurrency) == 3 + return ActorPoolStrategy( + min_size=concurrency[0], + max_size=concurrency[1], + initial_size=concurrency[2], + ) elif isinstance(concurrency, int): if is_callable_class: return ActorPoolStrategy(size=concurrency) diff --git a/python/ray/data/block.py b/python/ray/data/block.py index 64b09334f4f2..6cbeeeae387b 100644 --- a/python/ray/data/block.py +++ b/python/ray/data/block.py @@ -57,6 +57,11 @@ # Represents a single column of the ``Block`` BlockColumn = Union["pyarrow.ChunkedArray", "pyarrow.Array", "pandas.Series"] +# Represents a single column of the ``Batch`` +BatchColumn = Union[ + "pandas.Series", "np.ndarray", "pyarrow.Array", "pyarrow.ChunkedArray" +] + logger = logging.getLogger(__name__) @@ -114,6 +119,21 @@ def _is_empty_schema(schema: Optional[Schema]) -> bool: ) +def _take_first_non_empty_schema(schemas: Iterator["Schema"]) -> Optional["Schema"]: + """Return the first non-empty schema from an iterator of schemas. + + Args: + schemas: Iterator of schemas to check. + + Returns: + The first non-empty schema, or None if all schemas are empty. + """ + for schema in schemas: + if not _is_empty_schema(schema): + return schema + return None + + def _apply_batch_format(given_batch_format: Optional[str]) -> str: if given_batch_format == "default": given_batch_format = DEFAULT_BATCH_FORMAT @@ -323,6 +343,19 @@ def rename_columns(self, columns_rename: Dict[str, str]) -> Block: """Return the block reflecting the renamed columns.""" raise NotImplementedError + def upsert_column(self, column_name: str, column_data: BlockColumn) -> Block: + """ + Upserts a column into the block. If the column already exists, it will be replaced. + + Args: + column_name: The name of the column to upsert. + column_data: The data to upsert into the column. 
(Arrow Array/ChunkedArray for Arrow blocks, Series or array-like for Pandas blocks) + + Returns: + The updated block. + """ + raise NotImplementedError() + def random_shuffle(self, random_seed: Optional[int]) -> Block: """Randomly shuffle this block.""" raise NotImplementedError diff --git a/python/ray/data/context.py b/python/ray/data/context.py index 4058adf1ba2f..30daaf92249a 100644 --- a/python/ray/data/context.py +++ b/python/ray/data/context.py @@ -70,6 +70,10 @@ class ShuffleStrategy(str, enum.Enum): DEFAULT_ENABLE_PANDAS_BLOCK = True +DEFAULT_PANDAS_BLOCK_IGNORE_METADATA = bool( + os.environ.get("RAY_DATA_PANDAS_BLOCK_IGNORE_METADATA", 0) +) + DEFAULT_READ_OP_MIN_NUM_BLOCKS = 200 DEFAULT_ACTOR_PREFETCHER_ENABLED = False @@ -140,6 +144,8 @@ class ShuffleStrategy(str, enum.Enum): "RAY_DATA_ENABLE_PROGRESS_BAR_NAME_TRUNCATION", True ) +DEFAULT_ENFORCE_SCHEMAS = env_bool("RAY_DATA_ENFORCE_SCHEMAS", False) + DEFAULT_ENABLE_GET_OBJECT_LOCATIONS_FOR_METRICS = False @@ -230,23 +236,24 @@ class ShuffleStrategy(str, enum.Enum): @DeveloperAPI @dataclass class AutoscalingConfig: - # Actor Pool utilization threshold for upscaling. Once Actor Pool - # exceeds this utilization threshold it will start adding new actors. - # - # NOTE: Actor Pool utilization is defined as ratio of - # - # - Number of submitted tasks to - # - Max number of tasks the current set of Actors in the pool could run - # (defined as Ray Actor's `max_concurrency` * `pool.num_running_actors`) - # - # This utilization value could exceed 100%, when the number of submitted tasks - # exceed available concurrency-slots to run them in the current set of actors. 
- # - # This is possible when `max_tasks_in_flight_per_actor` (defaults to 2 x - # of `max_concurrency`) > Actor's `max_concurrency` and allows to overlap - # task execution with the fetching of the blocks for the next task providing - # for ability to negotiate a trade-off between autoscaling speed and resource - # efficiency (ie making tasks wait instead of immediately triggering execution) + """Configuration for autoscaling of Ray Data. + + Args: + actor_pool_util_upscaling_threshold: Actor Pool utilization threshold for upscaling. + Once Actor Pool exceeds this utilization threshold it will start adding new actors. + Actor Pool utilization is defined as ratio of number of submitted tasks to the + number of available concurrency-slots to run them in the current set of actors. + This utilization value could exceed 100%, when the number of submitted tasks + exceed available concurrency-slots to run them in the current set of actors. + This is possible when `max_tasks_in_flight_per_actor` + (defaults to 2 x of `max_concurrency`) > Actor's `max_concurrency` + and allows to overlap task execution with the fetching of the blocks + for the next task providing for ability to negotiate a trade-off + between autoscaling speed and resource efficiency (i.e., + making tasks wait instead of immediately triggering execution). + actor_pool_util_downscaling_threshold: Actor Pool utilization threshold for downscaling. + """ + actor_pool_util_upscaling_threshold: float = ( DEFAULT_ACTOR_POOL_UTIL_UPSCALING_THRESHOLD ) @@ -357,6 +364,8 @@ class DataContext: to use. use_ray_tqdm: Whether to enable distributed tqdm. enable_progress_bars: Whether to enable progress bars. + enable_operator_progress_bars: Whether to enable progress bars for individual + operators during execution. enable_progress_bar_name_truncation: If True, the name of the progress bar (often the operator name) will be truncated if it exceeds `ProgressBar.MAX_NAME_LENGTH`. Otherwise, the full operator name is shown. 
@@ -372,7 +381,8 @@ class DataContext: retry. This follows same format as :ref:`retry_exceptions ` in Ray Core. Default to `False` to not retry on any errors. Set to `True` to retry all errors, or set to a list of errors to retry. - enable_op_resource_reservation: Whether to reserve resources for each operator. + op_resource_reservation_enabled: Whether to enable resource reservation for + operators to prevent resource contention. op_resource_reservation_ratio: The ratio of the total resources to reserve for each operator. max_errored_blocks: Max number of blocks that are allowed to have errors, @@ -402,10 +412,42 @@ class DataContext: retried_io_errors: A list of substrings of error messages that should trigger a retry when reading or writing files. This is useful for handling transient errors when reading from remote storage systems. + default_hash_shuffle_parallelism: Default parallelism level for hash-based + shuffle operations if the number of partitions is unspecified. + max_hash_shuffle_aggregators: Maximum number of aggregating actors that can be + provisioned for hash-shuffle aggregations. + min_hash_shuffle_aggregator_wait_time_in_s: Minimum time to wait for hash + shuffle aggregators to become available, in seconds. + hash_shuffle_aggregator_health_warning_interval_s: Interval for health warning + checks on hash shuffle aggregators, in seconds. + max_hash_shuffle_finalization_batch_size: Maximum batch size for concurrent + hash-shuffle finalization tasks. If `None`, defaults to + `max_hash_shuffle_aggregators`. + join_operator_actor_num_cpus_per_partition_override: Override CPU allocation + per partition for join operator actors. + hash_shuffle_operator_actor_num_cpus_per_partition_override: Override CPU + allocation per partition for hash shuffle operator actors. + hash_aggregate_operator_actor_num_cpus_per_partition_override: Override CPU + allocation per partition for hash aggregate operator actors. 
+ use_polars_sort: Whether to use Polars for tabular dataset sorting operations. enable_per_node_metrics: Enable per node metrics reporting for Ray Data, disabled by default. + override_object_store_memory_limit_fraction: Override the fraction of object + store memory limit. If `None`, uses Ray's default. memory_usage_poll_interval_s: The interval to poll the USS of map tasks. If `None`, map tasks won't record memory stats. + dataset_logger_id: Optional logger ID for dataset operations. If `None`, uses + default logging configuration. + issue_detectors_config: Configuration for issue detection and monitoring during + dataset operations. + downstream_capacity_backpressure_ratio: Ratio for downstream capacity + backpressure control. A higher ratio causes backpressure to kick-in + later. If `None`, this type of backpressure is disabled. + downstream_capacity_backpressure_max_queued_bundles: Maximum number of queued + bundles before applying backpressure. If `None`, no limit is applied. + enforce_schemas: Whether to enforce schema consistency across dataset operations. + pandas_block_ignore_metadata: Whether to ignore pandas metadata when converting + between Arrow and pandas formats for better type inference. """ # `None` means the block size is infinite. 
@@ -433,7 +475,7 @@ class DataContext: # Default hash-shuffle parallelism level (will be used when not # provided explicitly) - default_hash_shuffle_parallelism = DEFAULT_MIN_PARALLELISM + default_hash_shuffle_parallelism: int = DEFAULT_MIN_PARALLELISM # Max number of aggregating actors that could be provisioned # to perform aggregations on partitions produced during hash-shuffling @@ -533,6 +575,13 @@ class DataContext: default_factory=_issue_detectors_config_factory ) + downstream_capacity_backpressure_ratio: float = None + downstream_capacity_backpressure_max_queued_bundles: int = None + + enforce_schemas: bool = DEFAULT_ENFORCE_SCHEMAS + + pandas_block_ignore_metadata: bool = DEFAULT_PANDAS_BLOCK_IGNORE_METADATA + def __post_init__(self): # The additonal ray remote args that should be added to # the task-pool-based data tasks. diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py index 8766434c425a..53fcc3b0ef73 100644 --- a/python/ray/data/dataset.py +++ b/python/ray/data/dataset.py @@ -284,7 +284,7 @@ def map( num_cpus: Optional[float] = None, num_gpus: Optional[float] = None, memory: Optional[float] = None, - concurrency: Optional[Union[int, Tuple[int, int]]] = None, + concurrency: Optional[Union[int, Tuple[int, int], Tuple[int, int, int]]] = None, ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None, **ray_remote_args, ) -> "Dataset": @@ -371,6 +371,9 @@ def parse_filename(row: Dict[str, Any]) -> Dict[str, Any]: * If ``fn`` is a class and ``concurrency`` is a tuple ``(m, n)``, Ray Data uses an autoscaling actor pool from ``m`` to ``n`` workers. + * If ``fn`` is a class and ``concurrency`` is a tuple ``(m, n, initial)``, Ray + Data uses an autoscaling actor pool from ``m`` to ``n`` workers, with an initial size of ``initial``. + * If ``fn`` is a class and ``concurrency`` isn't set (default), this method raises an error. 
@@ -467,7 +470,7 @@ def map_batches( num_cpus: Optional[float] = None, num_gpus: Optional[float] = None, memory: Optional[float] = None, - concurrency: Optional[Union[int, Tuple[int, int]]] = None, + concurrency: Optional[Union[int, Tuple[int, int], Tuple[int, int, int]]] = None, ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None, **ray_remote_args, ) -> "Dataset": @@ -632,6 +635,9 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: * If ``fn`` is a class and ``concurrency`` is a tuple ``(m, n)``, Ray Data uses an autoscaling actor pool from ``m`` to ``n`` workers. + * If ``fn`` is a class and ``concurrency`` is a tuple ``(m, n, initial)``, Ray + Data uses an autoscaling actor pool from ``m`` to ``n`` workers, with an initial size of ``initial``. + * If ``fn`` is a class and ``concurrency`` isn't set (default), this method raises an error. @@ -722,7 +728,7 @@ def _map_batches_without_batch_size_validation( num_cpus: Optional[float], num_gpus: Optional[float], memory: Optional[float], - concurrency: Optional[Union[int, Tuple[int, int]]], + concurrency: Optional[Union[int, Tuple[int, int], Tuple[int, int, int]]], ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]], **ray_remote_args, ): @@ -783,42 +789,75 @@ def _map_batches_without_batch_size_validation( return Dataset(plan, logical_plan) @PublicAPI(api_group=EXPRESSION_API_GROUP, stability="alpha") - def with_column(self, column_name: str, expr: Expr, **ray_remote_args) -> "Dataset": + def with_column( + self, + column_name: str, + expr: Expr, + **ray_remote_args, + ) -> "Dataset": """ Add a new column to the dataset via an expression. - Examples: + This method allows you to add a new column to a dataset by applying an + expression. The expression can be composed of existing columns, literals, + and user-defined functions (UDFs). 
+ Examples: >>> import ray >>> from ray.data.expressions import col >>> ds = ray.data.range(100) - >>> ds.with_column("id_2", (col("id") * 2)).schema() - Column Type - ------ ---- - id int64 - id_2 int64 + >>> # Add a new column 'id_2' by multiplying 'id' by 2. + >>> ds.with_column("id_2", col("id") * 2).show(2) + {'id': 0, 'id_2': 0} + {'id': 1, 'id_2': 2} + + >>> # Using a UDF with with_column + >>> from ray.data.datatype import DataType + >>> from ray.data.expressions import udf + >>> import pyarrow.compute as pc + >>> + >>> @udf(return_dtype=DataType.int32()) + ... def add_one(column): + ... return pc.add(column, 1) + >>> + >>> ds.with_column("id_plus_one", add_one(col("id"))).show(2) + {'id': 0, 'id_plus_one': 1} + {'id': 1, 'id_plus_one': 2} Args: column_name: The name of the new column. expr: An expression that defines the new column values. **ray_remote_args: Additional resource requirements to request from - Ray (e.g., num_gpus=1 to request GPUs for the map tasks). See - :func:`ray.remote` for details. + Ray for the map tasks (e.g., `num_gpus=1`). Returns: A new dataset with the added column evaluated via the expression. """ + # TODO: update schema based on the expression AST. from ray.data._internal.logical.operators.map_operator import Project + from ray.data._internal.logical.operators.one_to_one_operator import Download + + # TODO: Once the expression API supports UDFs, we can clean up the code here. 
+ from ray.data.expressions import DownloadExpr plan = self._plan.copy() - project_op = Project( - self._logical_plan.dag, - cols=None, - cols_rename=None, - exprs={column_name: expr}, - ray_remote_args=ray_remote_args, - ) - logical_plan = LogicalPlan(project_op, self.context) + if isinstance(expr, DownloadExpr): + download_op = Download( + self._logical_plan.dag, + uri_column_name=expr.uri_column_name, + output_bytes_column_name=column_name, + ray_remote_args=ray_remote_args, + ) + logical_plan = LogicalPlan(download_op, self.context) + else: + project_op = Project( + self._logical_plan.dag, + cols=None, + cols_rename=None, + exprs={column_name: expr}, + ray_remote_args=ray_remote_args, + ) + logical_plan = LogicalPlan(project_op, self.context) return Dataset(plan, logical_plan) @PublicAPI(api_group=BT_API_GROUP) @@ -892,11 +931,7 @@ def add_column(batch: DataBatch) -> DataBatch: # The index of the column must be set # to align with the index of the batch. - if ( - isinstance(column, pd.Series) - or isinstance(column, pd.DataFrame) - or isinstance(column, pd.Index) - ): + if isinstance(column, (pd.DataFrame, pd.Index, pd.Series)): column.index = batch.index batch.loc[:, col] = column return batch @@ -917,8 +952,7 @@ def add_column(batch: DataBatch) -> DataBatch: column_idx = batch.schema.get_field_index(col) if column_idx == -1: return batch.append_column(col, column) - else: - return batch.set_column(column_idx, col, column) + return batch.set_column(column_idx, col, column) else: # batch format is assumed to be numpy since we checked at the @@ -1089,7 +1123,7 @@ def rename_columns( self, names: Union[List[str], Dict[str, str]], *, - concurrency: Optional[Union[int, Tuple[int, int]]] = None, + concurrency: Optional[Union[int, Tuple[int, int], Tuple[int, int, int]]] = None, **ray_remote_args, ): """Rename columns in the dataset. 
@@ -1223,7 +1257,7 @@ def flat_map( num_cpus: Optional[float] = None, num_gpus: Optional[float] = None, memory: Optional[float] = None, - concurrency: Optional[Union[int, Tuple[int, int]]] = None, + concurrency: Optional[Union[int, Tuple[int, int], Tuple[int, int, int]]] = None, ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None, **ray_remote_args, ) -> "Dataset": @@ -1304,6 +1338,9 @@ def duplicate_row(row: Dict[str, Any]) -> List[Dict[str, Any]]: * If ``fn`` is a class and ``concurrency`` is a tuple ``(m, n)``, Ray Data uses an autoscaling actor pool from ``m`` to ``n`` workers. + * If ``fn`` is a class and ``concurrency`` is a tuple ``(m, n, initial)``, Ray + Data uses an autoscaling actor pool from ``m`` to ``n`` workers, with an initial size of ``initial``. + * If ``fn`` is a class and ``concurrency`` isn't set (default), this method raises an error. @@ -1366,7 +1403,7 @@ def filter( fn_kwargs: Optional[Dict[str, Any]] = None, fn_constructor_args: Optional[Iterable[Any]] = None, fn_constructor_kwargs: Optional[Dict[str, Any]] = None, - concurrency: Optional[Union[int, Tuple[int, int]]] = None, + concurrency: Optional[Union[int, Tuple[int, int], Tuple[int, int, int]]] = None, ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None, **ray_remote_args, ) -> "Dataset": @@ -1422,6 +1459,9 @@ def filter( * If ``fn`` is a class and ``concurrency`` is a tuple ``(m, n)``, Ray Data uses an autoscaling actor pool from ``m`` to ``n`` workers. + * If ``fn`` is a class and ``concurrency`` is a tuple ``(m, n, initial)``, Ray + Data uses an autoscaling actor pool from ``m`` to ``n`` workers, with an initial size of ``initial``. + * If ``fn`` is a class and ``concurrency`` isn't set (default), this method raises an error. @@ -1494,7 +1534,6 @@ def filter( logical_plan = LogicalPlan(op, self.context) return Dataset(plan, logical_plan) - @AllToAllAPI @PublicAPI(api_group=SSR_API_GROUP) def repartition( self, @@ -1517,9 +1556,11 @@ def repartition( .. 
note:: - Repartition has two modes. If ``shuffle=False``, Ray Data performs the - minimal data movement needed to equalize block sizes. Otherwise, Ray Data - performs a full distributed shuffle. + Repartition has three modes: + + * When ``num_blocks`` and ``shuffle=True`` are specified Ray Data performs a full distributed shuffle producing exactly ``num_blocks`` blocks. + * When ``num_blocks`` and ``shuffle=False`` are specified, Ray Data does NOT perform full shuffle, instead opting in for splitting and combining of the blocks attempting to minimize the necessary data movement (relative to full-blown shuffle). Exactly ``num_blocks`` will be produced. + * If ``target_num_rows_per_block`` is set (exclusive with ``num_blocks`` and ``shuffle``), streaming repartitioning will be executed, where blocks will be made to carry no more than ``target_num_rows_per_block``. Smaller blocks will be combined into bigger ones up to ``target_num_rows_per_block`` as well. .. image:: /data/images/dataset-shuffle.svg :align: center @@ -1538,7 +1579,8 @@ def repartition( Args: num_blocks: Number of blocks after repartitioning. target_num_rows_per_block: [Experimental] The target number of rows per block to - repartition. Note that either `num_blocks` or + repartition. Performs streaming repartitioning of the dataset (no shuffling). + Note that either `num_blocks` or `target_num_rows_per_block` must be set, but not both. 
When `target_num_rows_per_block` is set, it only repartitions :class:`Dataset` :ref:`blocks ` that are larger than @@ -3363,7 +3405,10 @@ def count(self) -> int: return meta_count plan = self._plan.copy() - count_op = Count([self._logical_plan.dag]) + + # NOTE: Project the dataset to avoid the need to carrying actual + # data when we're only interested in the total count + count_op = Count(Project(self._logical_plan.dag, cols=[])) logical_plan = LogicalPlan(count_op, self.context) count_ds = Dataset(plan, logical_plan) @@ -5181,137 +5226,6 @@ def iter_tf_batches( local_shuffle_seed=local_shuffle_seed, ) - @ConsumptionAPI(pattern="Time complexity:") - @Deprecated - def to_torch( - self, - *, - label_column: Optional[str] = None, - feature_columns: Optional[ - Union[List[str], List[List[str]], Dict[str, List[str]]] - ] = None, - label_column_dtype: Optional["torch.dtype"] = None, - feature_column_dtypes: Optional[ - Union["torch.dtype", List["torch.dtype"], Dict[str, "torch.dtype"]] - ] = None, - batch_size: int = 1, - prefetch_batches: int = 1, - drop_last: bool = False, - local_shuffle_buffer_size: Optional[int] = None, - local_shuffle_seed: Optional[int] = None, - unsqueeze_label_tensor: bool = True, - unsqueeze_feature_tensors: bool = True, - ) -> "torch.utils.data.IterableDataset": - """Return a - `Torch IterableDataset `_ - over this :class:`~ray.data.Dataset`. - - This is only supported for datasets convertible to Arrow records. - - It is recommended to use the returned ``IterableDataset`` directly - instead of passing it into a torch ``DataLoader``. - - Each element in ``IterableDataset`` is a tuple consisting of 2 - elements. The first item contains the feature tensor(s), and the - second item is the label tensor. Those can take on different - forms, depending on the specified arguments. 
- - For the features tensor (N is the ``batch_size`` and n, m, k - are the number of features per tensor): - - * If ``feature_columns`` is a ``List[str]``, the features is - a tensor of shape (N, n), with columns corresponding to - ``feature_columns`` - - * If ``feature_columns`` is a ``List[List[str]]``, the features is - a list of tensors of shape [(N, m),...,(N, k)], with columns of each - tensor corresponding to the elements of ``feature_columns`` - - * If ``feature_columns`` is a ``Dict[str, List[str]]``, the features - is a dict of key-tensor pairs of shape - {key1: (N, m),..., keyN: (N, k)}, with columns of each - tensor corresponding to the value of ``feature_columns`` under the - key. - - If ``unsqueeze_label_tensor=True`` (default), the label tensor is - of shape (N, 1). Otherwise, it is of shape (N,). - If ``label_column`` is specified as ``None``, then no column from the - ``Dataset`` is treated as the label, and the output label tensor - is ``None``. - - Note that you probably want to call :meth:`Dataset.split` on this dataset if - there are to be multiple Torch workers consuming the data. - - Time complexity: O(1) - - Args: - label_column: The name of the column used as the - label (second element of the output list). Can be None for - prediction, in which case the second element of returned - tuple will also be None. - feature_columns: The names of the columns - to use as the features. Can be a list of lists or - a dict of string-list pairs for multi-tensor output. - If ``None``, then use all columns except the label column as - the features. - label_column_dtype: The torch dtype to - use for the label column. If ``None``, then automatically infer - the dtype. - feature_column_dtypes: The dtypes to use for the feature - tensors. This should match the format of ``feature_columns``, - or be a single dtype, in which case it is applied to - all tensors. If ``None``, then automatically infer the dtype. 
- batch_size: How many samples per batch to yield at a time. - Defaults to 1. - prefetch_batches: The number of batches to fetch ahead of the current batch - to fetch. If set to greater than 0, a separate threadpool is used - to fetch the objects to the local node, format the batches, and apply - the collate_fn. Defaults to 1. - drop_last: Set to True to drop the last incomplete batch, - if the dataset size is not divisible by the batch size. If - False and the size of the stream is not divisible by the batch - size, then the last batch is smaller. Defaults to False. - local_shuffle_buffer_size: If non-None, the data is randomly shuffled - using a local in-memory shuffle buffer, and this value will serve as the - minimum number of rows that must be in the local in-memory shuffle - buffer in order to yield a batch. When there are no more rows to add to - the buffer, the remaining rows in the buffer are drained. This - buffer size must be greater than or equal to ``batch_size``, and - therefore ``batch_size`` must also be specified when using local - shuffling. - local_shuffle_seed: The seed to use for the local random shuffle. - unsqueeze_label_tensor: If set to True, the label tensor - is unsqueezed (reshaped to (N, 1)). Otherwise, it will - be left as is, that is (N, ). In general, regression loss - functions expect an unsqueezed tensor, while classification - loss functions expect a squeezed one. Defaults to True. - unsqueeze_feature_tensors: If set to True, the features tensors - are unsqueezed (reshaped to (N, 1)) before being concatenated into - the final features tensor. Otherwise, they are left as is, that is - (N, ). Defaults to True. - - Returns: - A `Torch IterableDataset`_. - """ # noqa: E501 - warnings.warn( - "`to_torch` is deprecated and will be removed after May 2025. 
Use " - "`iter_torch_batches` instead.", - DeprecationWarning, - ) - return self.iterator().to_torch( - label_column=label_column, - feature_columns=feature_columns, - label_column_dtype=label_column_dtype, - feature_column_dtypes=feature_column_dtypes, - batch_size=batch_size, - prefetch_batches=prefetch_batches, - drop_last=drop_last, - local_shuffle_buffer_size=local_shuffle_buffer_size, - local_shuffle_seed=local_shuffle_seed, - unsqueeze_label_tensor=unsqueeze_label_tensor, - unsqueeze_feature_tensors=unsqueeze_feature_tensors, - ) - @ConsumptionAPI @PublicAPI(api_group=IOC_API_GROUP) def to_tf( @@ -5965,6 +5879,32 @@ def stats(self) -> str: return self._write_ds.stats() return self._get_stats_summary().to_string() + @PublicAPI(api_group=IM_API_GROUP, stability="alpha") + def explain(self): + """Show the logical plan and physical plan of the dataset. + + Examples: + + .. testcode:: + + import ray + from ray.data import Dataset + ds: Dataset = ray.data.range(10, override_num_blocks=10) + ds = ds.map(lambda x: x + 1) + ds.explain() + + .. 
testoutput:: + + -------- Logical Plan -------- + Map() + +- ReadRange + -------- Physical Plan -------- + TaskPoolMapOperator[ReadRange->Map()] + +- InputDataBuffer[Input] + + """ + print(self._plan.explain()) + def _get_stats_summary(self) -> DatasetStatsSummary: return self._plan.stats().to_summary() diff --git a/python/ray/data/datasource/__init__.py b/python/ray/data/datasource/__init__.py index ef2eca5977ed..1e76f1bfd9d0 100644 --- a/python/ray/data/datasource/__init__.py +++ b/python/ray/data/datasource/__init__.py @@ -28,7 +28,6 @@ FileMetadataProvider, ) from ray.data.datasource.filename_provider import FilenameProvider -from ray.data.datasource.parquet_meta_provider import ParquetMetadataProvider from ray.data.datasource.partitioning import ( Partitioning, PartitionStyle, @@ -53,7 +52,6 @@ "FileShuffleConfig", "FileMetadataProvider", "FilenameProvider", - "ParquetMetadataProvider", "PartitionStyle", "PathPartitionFilter", "PathPartitionParser", diff --git a/python/ray/data/datasource/file_based_datasource.py b/python/ray/data/datasource/file_based_datasource.py index 521872ac888b..d65fe8d98293 100644 --- a/python/ray/data/datasource/file_based_datasource.py +++ b/python/ray/data/datasource/file_based_datasource.py @@ -140,7 +140,6 @@ def __init__( self._partitioning = partitioning self._ignore_missing_paths = ignore_missing_paths self._include_paths = include_paths - self._unresolved_paths = paths paths, self._filesystem = _resolve_paths_and_filesystem(paths, filesystem) self._filesystem = RetryingPyFileSystem.wrap( self._filesystem, retryable_errors=self._data_context.retried_io_errors @@ -273,8 +272,7 @@ def read_task_fn(): num_threads = 0 if num_threads > 0: - if len(read_paths) < num_threads: - num_threads = len(read_paths) + num_threads = min(num_threads, len(read_paths)) logger.debug( f"Reading {len(read_paths)} files with {num_threads} threads." 
diff --git a/python/ray/data/datasource/file_meta_provider.py b/python/ray/data/datasource/file_meta_provider.py index ed5ae26f903c..5d3b2b55cb45 100644 --- a/python/ray/data/datasource/file_meta_provider.py +++ b/python/ray/data/datasource/file_meta_provider.py @@ -20,7 +20,8 @@ from ray.data._internal.remote_fn import cached_remote_fn from ray.data._internal.util import RetryingPyFileSystem from ray.data.block import BlockMetadata -from ray.data.datasource.partitioning import Partitioning +from ray.data.datasource.partitioning import Partitioning, PathPartitionFilter +from ray.data.datasource.path_util import _has_file_extension from ray.util.annotations import DeveloperAPI if TYPE_CHECKING: @@ -36,7 +37,6 @@ class FileMetadataProvider: Current subclasses: - :class:`BaseFileMetadataProvider` - - :class:`ParquetMetadataProvider` """ def _get_block_metadata( @@ -243,6 +243,46 @@ def _handle_read_os_error(error: OSError, paths: Union[str, List[str]]) -> str: raise error +def _list_files( + paths: List[str], + filesystem: "RetryingPyFileSystem", + *, + partition_filter: Optional[PathPartitionFilter], + file_extensions: Optional[List[str]], +) -> List[Tuple[str, int]]: + return list( + _list_files_internal( + paths, + filesystem, + partition_filter=partition_filter, + file_extensions=file_extensions, + ) + ) + + +def _list_files_internal( + paths: List[str], + filesystem: "RetryingPyFileSystem", + *, + partition_filter: Optional[PathPartitionFilter], + file_extensions: Optional[List[str]], +) -> Iterator[Tuple[str, int]]: + default_meta_provider = DefaultFileMetadataProvider() + + for path, file_size in default_meta_provider.expand_paths(paths, filesystem): + # HACK: PyArrow's `ParquetDataset` errors if input paths contain non-parquet + # files. To avoid this, we expand the input paths with the default metadata + # provider and then apply the partition filter or file extensions. 
+ if ( + partition_filter + and not partition_filter.apply(path) + or not _has_file_extension(path, file_extensions) + ): + continue + + yield path, file_size + + def _expand_paths( paths: List[str], filesystem: "RetryingPyFileSystem", diff --git a/python/ray/data/datasource/parquet_meta_provider.py b/python/ray/data/datasource/parquet_meta_provider.py deleted file mode 100644 index c8484574da18..000000000000 --- a/python/ray/data/datasource/parquet_meta_provider.py +++ /dev/null @@ -1,247 +0,0 @@ -from typing import TYPE_CHECKING, List, Optional - -import ray.cloudpickle as cloudpickle -from ray.data._internal.util import call_with_retry -from ray.data.block import BlockMetadata -from ray.data.datasource.file_meta_provider import ( - FileMetadataProvider, - _fetch_metadata_parallel, -) -from ray.util.annotations import DeveloperAPI - -if TYPE_CHECKING: - import pyarrow - - from ray.data._internal.datasource.parquet_datasource import SerializedFragment - - -FRAGMENTS_PER_META_FETCH = 6 -PARALLELIZE_META_FETCH_THRESHOLD = 24 - -# The application-level exceptions to retry for metadata prefetching task. -# Default to retry on access denied and read timeout errors because AWS S3 would throw -# these transient errors when load is too high. -RETRY_EXCEPTIONS_FOR_META_FETCH_TASK = ["AWS Error ACCESS_DENIED", "Timeout"] -# Maximum number of retries for metadata prefetching task due to transient errors. -RETRY_MAX_ATTEMPTS_FOR_META_FETCH_TASK = 32 -# Maximum retry back-off interval in seconds for failed metadata prefetching task. -RETRY_MAX_BACKOFF_S_FOR_META_FETCH_TASK = 64 - - -class _ParquetFileFragmentMetaData: - """Class to store metadata of a Parquet file fragment. 
This includes - all attributes from `pyarrow.parquet.FileMetaData` except for `schema`, - which is stored in `self.schema_pickled` as a pickled object from - `cloudpickle.loads()`, used in deduplicating schemas across multiple fragments.""" - - def __init__(self, fragment_metadata: "pyarrow.parquet.FileMetaData"): - self.created_by = fragment_metadata.created_by - self.format_version = fragment_metadata.format_version - self.num_columns = fragment_metadata.num_columns - self.num_row_groups = fragment_metadata.num_row_groups - self.num_rows = fragment_metadata.num_rows - self.serialized_size = fragment_metadata.serialized_size - # This is a pickled schema object, to be set later with - # `self.set_schema_pickled()`. To get the underlying schema, use - # `cloudpickle.loads(self.schema_pickled)`. - self.schema_pickled = None - - # Calculate the total byte size of the file fragment using the original - # object, as it is not possible to access row groups from this class. - self.total_byte_size = 0 - for row_group_idx in range(fragment_metadata.num_row_groups): - row_group_metadata = fragment_metadata.row_group(row_group_idx) - self.total_byte_size += row_group_metadata.total_byte_size - - def set_schema_pickled(self, schema_pickled: bytes): - """Note: to get the underlying schema, use - `cloudpickle.loads(self.schema_pickled)`.""" - self.schema_pickled = schema_pickled - - -@DeveloperAPI -class ParquetMetadataProvider(FileMetadataProvider): - """Provides block metadata for Arrow Parquet file fragments.""" - - def _get_block_metadata( - self, - paths: List[str], - *, - num_fragments: int, - prefetched_metadata: Optional[List["_ParquetFileFragmentMetaData"]], - ) -> BlockMetadata: - """Resolves and returns block metadata for files of a single dataset block. - - Args: - paths: The file paths for a single dataset block. - num_fragments: The number of Parquet file fragments derived from the input - file paths. 
- prefetched_metadata: Metadata previously returned from - `prefetch_file_metadata()` for each file fragment, where - `prefetched_metadata[i]` contains the metadata for `fragments[i]`. - - Returns: - BlockMetadata aggregated across the given file paths. - """ - if ( - prefetched_metadata is not None - and len(prefetched_metadata) == num_fragments - and all(m is not None for m in prefetched_metadata) - ): - # Fragment metadata was available, construct a normal - # BlockMetadata. - block_metadata = BlockMetadata( - num_rows=sum(m.num_rows for m in prefetched_metadata), - size_bytes=sum(m.total_byte_size for m in prefetched_metadata), - input_files=paths, - exec_stats=None, - ) # Exec stats filled in later. - else: - # Fragment metadata was not available, construct an empty - # BlockMetadata. - block_metadata = BlockMetadata( - num_rows=None, - size_bytes=None, - input_files=paths, - exec_stats=None, - ) - return block_metadata - - def prefetch_file_metadata( - self, - fragments: List["pyarrow.dataset.ParquetFileFragment"], - **ray_remote_args, - ) -> Optional[List[_ParquetFileFragmentMetaData]]: - """Pre-fetches file metadata for all Parquet file fragments in a single batch. - - Subsets of the metadata returned will be provided as input to subsequent calls - to ``_get_block_metadata`` together with their corresponding Parquet file - fragments. - - Args: - fragments: The Parquet file fragments to fetch metadata for. - - Returns: - Metadata resolved for each input file fragment, or `None`. Metadata - must be returned in the same order as all input file fragments, such - that `metadata[i]` always contains the metadata for `fragments[i]`. - """ - from ray.data._internal.datasource.parquet_datasource import SerializedFragment - - if len(fragments) > PARALLELIZE_META_FETCH_THRESHOLD: - # Wrap Parquet fragments in serialization workaround. - fragments = [SerializedFragment(fragment) for fragment in fragments] - # Fetch Parquet metadata in parallel using Ray tasks. 
- - def fetch_func(fragments): - return _fetch_metadata_serialization_wrapper( - fragments, - # Ensure that retry settings are propagated to remote tasks. - retry_match=RETRY_EXCEPTIONS_FOR_META_FETCH_TASK, - retry_max_attempts=RETRY_MAX_ATTEMPTS_FOR_META_FETCH_TASK, - retry_max_interval=RETRY_MAX_BACKOFF_S_FOR_META_FETCH_TASK, - ) - - raw_metadata = list( - _fetch_metadata_parallel( - fragments, - fetch_func, - FRAGMENTS_PER_META_FETCH, - **ray_remote_args, - ) - ) - else: - raw_metadata = _fetch_metadata(fragments) - - return _dedupe_metadata(raw_metadata) - - -def _fetch_metadata_serialization_wrapper( - fragments: List["SerializedFragment"], - retry_match: Optional[List[str]], - retry_max_attempts: int, - retry_max_interval: int, -) -> List["pyarrow.parquet.FileMetaData"]: - from ray.data._internal.datasource.parquet_datasource import ( - _deserialize_fragments_with_retry, - ) - - deserialized_fragments = _deserialize_fragments_with_retry(fragments) - try: - metadata = call_with_retry( - lambda: _fetch_metadata(deserialized_fragments), - description="fetch metdata", - match=retry_match, - max_attempts=retry_max_attempts, - max_backoff_s=retry_max_interval, - ) - except OSError as e: - raise RuntimeError( - f"Exceeded maximum number of attempts ({retry_max_attempts}) to retry " - "metadata fetching task. Metadata fetching tasks can fail due to transient " - "errors like rate limiting.\n" - "\n" - "To increase the maximum number of attempts, configure " - "`RETRY_MAX_ATTEMPTS_FOR_META_FETCH_TASK`. For example:\n" - "```\n" - "ray.data._internal.datasource.parquet_datasource.RETRY_MAX_ATTEMPTS_FOR_META_FETCH_TASK = 64\n" # noqa: E501 - "```\n" - "To increase the maximum retry backoff interval, configure " - "`RETRY_MAX_BACKOFF_S_FOR_META_FETCH_TASK`. 
For example:\n" - "```\n" - "ray.data._internal.datasource.parquet_datasource.RETRY_MAX_BACKOFF_S_FOR_META_FETCH_TASK = 128\n" # noqa: E501 - "```\n" - "If the error continues to occur, you can also try decresasing the " - "concurency of metadata fetching tasks by setting " - "`NUM_CPUS_FOR_META_FETCH_TASK` to a larger value. For example:\n" - "```\n" - "ray.data._internal.datasource.parquet_datasource.NUM_CPUS_FOR_META_FETCH_TASK = 4.\n" # noqa: E501 - "```\n" - "To change which exceptions to retry on, set " - "`RETRY_EXCEPTIONS_FOR_META_FETCH_TASK` to a list of error messages. For " - "example:\n" - "```\n" - 'ray.data._internal.datasource.parquet_datasource.RETRY_EXCEPTIONS_FOR_META_FETCH_TASK = ["AWS Error ACCESS_DENIED", "Timeout"]\n' # noqa: E501 - "```" - ) from e - return metadata - - -def _fetch_metadata( - fragments: List["pyarrow.dataset.ParquetFileFragment"], -) -> List["pyarrow.parquet.FileMetaData"]: - fragment_metadata = [] - for f in fragments: - try: - fragment_metadata.append(f.metadata) - except AttributeError: - break - return fragment_metadata - - -def _dedupe_metadata( - raw_metadatas: List["pyarrow.parquet.FileMetaData"], -) -> List[_ParquetFileFragmentMetaData]: - """For datasets with a large number of columns, the FileMetaData - (in particular the schema) can be very large. We can reduce the - memory usage by only keeping unique schema objects across all - file fragments. 
This method deduplicates the schemas and returns - a list of `_ParquetFileFragmentMetaData` objects.""" - schema_to_id = {} # schema_id -> serialized_schema - id_to_schema = {} # serialized_schema -> schema_id - stripped_metadatas = [] - for fragment_metadata in raw_metadatas: - stripped_md = _ParquetFileFragmentMetaData(fragment_metadata) - - schema_ser = cloudpickle.dumps(fragment_metadata.schema.to_arrow_schema()) - if schema_ser not in schema_to_id: - schema_id = len(schema_to_id) - schema_to_id[schema_ser] = schema_id - id_to_schema[schema_id] = schema_ser - stripped_md.set_schema_pickled(schema_ser) - else: - schema_id = schema_to_id.get(schema_ser) - existing_schema_ser = id_to_schema[schema_id] - stripped_md.set_schema_pickled(existing_schema_ser) - stripped_metadatas.append(stripped_md) - return stripped_metadatas diff --git a/python/ray/data/datasource/partitioning.py b/python/ray/data/datasource/partitioning.py index 2d83fe6b67de..20a626b09bce 100644 --- a/python/ray/data/datasource/partitioning.py +++ b/python/ray/data/datasource/partitioning.py @@ -434,11 +434,12 @@ def __call__(self, paths: List[str]) -> List[str]: """ filtered_paths = paths if self._filter_fn is not None: - filtered_paths = [ - path for path in paths if self._filter_fn(self._parser(path)) - ] + filtered_paths = [path for path in paths if self.apply(path)] return filtered_paths + def apply(self, path: str) -> bool: + return self._filter_fn(self._parser(path)) + @property def parser(self) -> PathPartitionParser: """Returns the path partition parser for this filter.""" diff --git a/python/ray/data/datasource/path_util.py b/python/ray/data/datasource/path_util.py index 6498300caa9f..5d9527243f36 100644 --- a/python/ray/data/datasource/path_util.py +++ b/python/ray/data/datasource/path_util.py @@ -39,7 +39,7 @@ def _has_file_extension(path: str, extensions: Optional[List[str]]) -> bool: def _resolve_paths_and_filesystem( paths: Union[str, List[str]], - filesystem: "pyarrow.fs.FileSystem" 
= None, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, ) -> Tuple[List[str], "pyarrow.fs.FileSystem"]: """ Resolves and normalizes all provided paths, infers a filesystem from the @@ -69,7 +69,7 @@ def _resolve_paths_and_filesystem( elif not isinstance(paths, list) or any(not isinstance(p, str) for p in paths): raise ValueError( "Expected `paths` to be a `str`, `pathlib.Path`, or `list[str]`, but got " - f"`{paths}`." + f"`{paths}`" ) elif len(paths) == 0: raise ValueError("Must provide at least one path.") diff --git a/python/ray/data/datatype.py b/python/ray/data/datatype.py new file mode 100644 index 000000000000..4c9fb79defce --- /dev/null +++ b/python/ray/data/datatype.py @@ -0,0 +1,255 @@ +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import pyarrow as pa + +from ray.air.util.tensor_extensions.arrow import ( + _infer_pyarrow_type, +) +from ray.util.annotations import PublicAPI + +PYARROW_TYPE_DEFINITIONS: Dict[str, Tuple[callable, str]] = { + "int8": (pa.int8, "an 8-bit signed integer"), + "int16": (pa.int16, "a 16-bit signed integer"), + "int32": (pa.int32, "a 32-bit signed integer"), + "int64": (pa.int64, "a 64-bit signed integer"), + "uint8": (pa.uint8, "an 8-bit unsigned integer"), + "uint16": (pa.uint16, "a 16-bit unsigned integer"), + "uint32": (pa.uint32, "a 32-bit unsigned integer"), + "uint64": (pa.uint64, "a 64-bit unsigned integer"), + "float32": (pa.float32, "a 32-bit floating point number"), + "float64": (pa.float64, "a 64-bit floating point number"), + "string": (pa.string, "a variable-length string"), + "bool": (pa.bool_, "a boolean value"), + "binary": (pa.binary, "variable-length binary data"), +} + + +def _factory_methods(cls: type): + """Metaprogramming: Class decorator to generate factory methods for PyArrow types using from_arrow. + + This decorator automatically creates class methods for common PyArrow data types. 
+ Each generated method is a convenient factory that calls cls.from_arrow(pa.type()). + + Generated methods include: + - Signed integers: int8, int16, int32, int64 + - Unsigned integers: uint8, uint16, uint32, uint64 + - Floating point: float32, float64 + - Other types: string, bool, binary + + Examples of generated methods:: + + @classmethod + def int32(cls): + \"\"\"Create a DataType representing a 32-bit signed integer. + + Returns: + DataType: A DataType with PyArrow int32 type + \"\"\" + return cls.from_arrow(pa.int32()) + + @classmethod + def string(cls): + \"\"\"Create a DataType representing a variable-length string. + + Returns: + DataType: A DataType with PyArrow string type + \"\"\" + return cls.from_arrow(pa.string()) + + Usage: + Instead of DataType.from_arrow(pa.int32()), you can use DataType.int32() + """ + + for method_name, (pa_func, description) in PYARROW_TYPE_DEFINITIONS.items(): + + def create_method(name, func, desc): + def factory_method(cls): + return cls.from_arrow(func()) + + factory_method.__doc__ = f"""Create a DataType representing {desc}. + + Returns: + DataType: A DataType with PyArrow {name} type + """ + factory_method.__name__ = name + factory_method.__qualname__ = f"{cls.__name__}.{name}" + return classmethod(factory_method) + + setattr(cls, method_name, create_method(method_name, pa_func, description)) + + return cls + + +@PublicAPI(stability="alpha") +@dataclass +@_factory_methods +class DataType: + """A simplified Ray Data DataType supporting Arrow, NumPy, and Python types.""" + + _internal_type: Union[pa.DataType, np.dtype, type] + + def __post_init__(self): + """Validate the _internal_type after initialization.""" + # TODO: Support Pandas extension types + if not isinstance( + self._internal_type, + (pa.DataType, np.dtype, type), + ): + raise TypeError( + f"DataType supports only PyArrow DataType, NumPy dtype, or Python type, but was given type {type(self._internal_type)}." 
+ ) + + # Type checking methods + def is_arrow_type(self) -> bool: + return isinstance(self._internal_type, pa.DataType) + + def is_numpy_type(self) -> bool: + return isinstance(self._internal_type, np.dtype) + + def is_python_type(self) -> bool: + return isinstance(self._internal_type, type) + + # Conversion methods + def to_arrow_dtype(self, values: Optional[List[Any]] = None) -> pa.DataType: + """ + Convert the DataType to a PyArrow DataType. + + Args: + values: Optional list of values to infer the Arrow type from. Required if the DataType is a Python type. + + Returns: + A PyArrow DataType + """ + if self.is_arrow_type(): + return self._internal_type + else: + if isinstance(self._internal_type, np.dtype): + return pa.from_numpy_dtype(self._internal_type) + else: + assert ( + values is not None and len(values) > 0 + ), "Values are required to infer Arrow type if the provided type is a Python type" + return _infer_pyarrow_type(values) + + def to_numpy_dtype(self) -> np.dtype: + if self.is_numpy_type(): + return self._internal_type + elif self.is_arrow_type(): + try: + # For most basic arrow types, this will work + pandas_dtype = self._internal_type.to_pandas_dtype() + if isinstance(pandas_dtype, np.dtype): + return pandas_dtype + else: + # If pandas returns an extension dtype, fall back to object + return np.dtype("object") + except (TypeError, NotImplementedError, pa.ArrowNotImplementedError): + return np.dtype("object") + else: + return np.dtype("object") + + def to_python_type(self) -> type: + if self.is_python_type(): + return self._internal_type + else: + raise ValueError(f"DataType {self} is not a Python type") + + # Factory methods from external systems + @classmethod + def from_arrow(cls, arrow_type: pa.DataType) -> "DataType": + """Create a DataType from a PyArrow DataType. 
+ + Args: + arrow_type: A PyArrow DataType to wrap + + Returns: + DataType: A DataType wrapping the given PyArrow type + + Examples: + >>> import pyarrow as pa + >>> from ray.data.datatype import DataType + >>> DataType.from_arrow(pa.timestamp('s')) + DataType(arrow:timestamp[s]) + >>> DataType.from_arrow(pa.int64()) + DataType(arrow:int64) + """ + return cls(_internal_type=arrow_type) + + @classmethod + def from_numpy(cls, numpy_dtype: Union[np.dtype, str]) -> "DataType": + """Create a DataType from a NumPy dtype. + + Args: + numpy_dtype: A NumPy dtype object or string representation + + Returns: + DataType: A DataType wrapping the given NumPy dtype + + Examples: + >>> import numpy as np + >>> from ray.data.datatype import DataType + >>> DataType.from_numpy(np.dtype('int32')) + DataType(numpy:int32) + >>> DataType.from_numpy('float64') + DataType(numpy:float64) + """ + if isinstance(numpy_dtype, str): + numpy_dtype = np.dtype(numpy_dtype) + return cls(_internal_type=numpy_dtype) + + @classmethod + def infer_dtype(cls, value: Any) -> "DataType": + """Infer DataType from a Python value, handling numpy, Arrow, and Python types. + + Args: + value: Any Python value to infer the type from + + Returns: + DataType: The inferred data type + + Examples: + >>> import numpy as np + >>> from ray.data.datatype import DataType + >>> DataType.infer_dtype(5) + DataType(arrow:int64) + >>> DataType.infer_dtype("hello") + DataType(arrow:string) + >>> DataType.infer_dtype(np.int32(42)) + DataType(numpy:int32) + """ + # 1. Handle numpy arrays and scalars + if isinstance(value, (np.ndarray, np.generic)): + return cls.from_numpy(value.dtype) + # 3. 
Try PyArrow type inference for regular Python values + try: + inferred_arrow_type = _infer_pyarrow_type([value]) + if inferred_arrow_type is not None: + return cls.from_arrow(inferred_arrow_type) + except Exception: + return cls(type(value)) + + def __repr__(self) -> str: + if self.is_arrow_type(): + return f"DataType(arrow:{self._internal_type})" + elif self.is_numpy_type(): + return f"DataType(numpy:{self._internal_type})" + else: + return f"DataType(python:{self._internal_type.__name__})" + + def __eq__(self, other) -> bool: + if not isinstance(other, DataType): + return False + + # Ensure they're from the same type system by checking the actual type + # of the internal type object, not just the value + if type(self._internal_type) is not type(other._internal_type): + return False + + return self._internal_type == other._internal_type + + def __hash__(self) -> int: + # Include the type of the internal type in the hash to ensure + # different type systems don't collide + return hash((type(self._internal_type), self._internal_type)) diff --git a/python/ray/data/expressions.py b/python/ray/data/expressions.py index 3ba8f48356da..94a799802068 100644 --- a/python/ray/data/expressions.py +++ b/python/ray/data/expressions.py @@ -1,10 +1,13 @@ from __future__ import annotations +import functools from abc import ABC, abstractmethod -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum -from typing import Any +from typing import Any, Callable, Dict, List +from ray.data.block import BatchColumn +from ray.data.datatype import DataType from ray.util.annotations import DeveloperAPI, PublicAPI @@ -67,6 +70,8 @@ class Expr(ABC): subclasses like ColumnExpr, LiteralExpr, etc. 
""" + data_type: DataType + @abstractmethod def structurally_equals(self, other: Any) -> bool: """Compare two expression ASTs for structural equality.""" @@ -172,6 +177,7 @@ class ColumnExpr(Expr): """ name: str + data_type: DataType = field(default_factory=lambda: DataType(object), init=False) def structurally_equals(self, other: Any) -> bool: return isinstance(other, ColumnExpr) and self.name == other.name @@ -190,12 +196,22 @@ class LiteralExpr(Expr): Example: >>> from ray.data.expressions import lit + >>> import numpy as np >>> # Create a literal value >>> five = lit(5) # Creates LiteralExpr(value=5) >>> name = lit("John") # Creates LiteralExpr(value="John") + >>> numpy_val = lit(np.int32(42)) # Creates LiteralExpr with numpy type """ value: Any + data_type: DataType = field(init=False) + + def __post_init__(self): + # Infer the type from the value using DataType.infer_dtype + inferred_dtype = DataType.infer_dtype(self.value) + + # Use object.__setattr__ since the dataclass is frozen + object.__setattr__(self, "data_type", inferred_dtype) def structurally_equals(self, other: Any) -> bool: return ( @@ -230,6 +246,8 @@ class BinaryExpr(Expr): left: Expr right: Expr + data_type: DataType = field(default_factory=lambda: DataType(object), init=False) + def structurally_equals(self, other: Any) -> bool: return ( isinstance(other, BinaryExpr) @@ -239,6 +257,163 @@ def structurally_equals(self, other: Any) -> bool: ) +@DeveloperAPI(stability="alpha") +@dataclass(frozen=True, eq=False) +class UDFExpr(Expr): + """Expression that represents a user-defined function call. + + This expression type wraps a UDF with schema inference capabilities, + allowing UDFs to be used seamlessly within the expression system. + + UDFs operate on batches of data, where each column argument is passed + as a PyArrow Array containing multiple values from that column across the batch. 
+ + Args: + fn: The user-defined function to call + args: List of argument expressions (positional arguments) + kwargs: Dictionary of keyword argument expressions + function_name: Optional name for the function (for debugging) + + Example: + >>> from ray.data.expressions import col, udf + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> @udf(return_dtype=DataType.int32()) + ... def add_one(x: pa.Array) -> pa.Array: + ... return pc.add(x, 1) + >>> + >>> # Use in expressions + >>> expr = add_one(col("value")) + """ + + fn: Callable[..., BatchColumn] + args: List[Expr] + kwargs: Dict[str, Expr] + + def structurally_equals(self, other: Any) -> bool: + return ( + isinstance(other, UDFExpr) + and self.fn == other.fn + and len(self.args) == len(other.args) + and all(a.structurally_equals(b) for a, b in zip(self.args, other.args)) + and self.kwargs.keys() == other.kwargs.keys() + and all( + self.kwargs[k].structurally_equals(other.kwargs[k]) + for k in self.kwargs.keys() + ) + ) + + +def _create_udf_callable( + fn: Callable[..., BatchColumn], return_dtype: DataType +) -> Callable[..., UDFExpr]: + """Create a callable that generates UDFExpr when called with expressions.""" + + def udf_callable(*args, **kwargs) -> UDFExpr: + # Convert arguments to expressions if they aren't already + expr_args = [] + for arg in args: + if isinstance(arg, Expr): + expr_args.append(arg) + else: + expr_args.append(LiteralExpr(arg)) + + expr_kwargs = {} + for k, v in kwargs.items(): + if isinstance(v, Expr): + expr_kwargs[k] = v + else: + expr_kwargs[k] = LiteralExpr(v) + + return UDFExpr( + fn=fn, + args=expr_args, + kwargs=expr_kwargs, + data_type=return_dtype, + ) + + # Preserve original function metadata + functools.update_wrapper(udf_callable, fn) + + # Store the original function for access if needed + udf_callable._original_fn = fn + + return udf_callable + + +@PublicAPI(stability="alpha") +def udf(return_dtype: DataType) -> Callable[..., UDFExpr]: + """ + Decorator 
to convert a UDF into an expression-compatible function. + + This decorator allows UDFs to be used seamlessly within the expression system, + enabling schema inference and integration with other expressions. + + IMPORTANT: UDFs operate on batches of data, not individual rows. When your UDF + is called, each column argument will be passed as a PyArrow Array containing + multiple values from that column across the batch. Under the hood, when working + with multiple columns, they get translated to PyArrow arrays (one array per column). + + Args: + return_dtype: The data type of the return value of the UDF + + Returns: + A callable that creates UDFExpr instances when called with expressions + + Example: + >>> from ray.data.expressions import col, udf + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> import ray + >>> + >>> # UDF that operates on a batch of values (PyArrow Array) + >>> @udf(return_dtype=DataType.int32()) + ... def add_one(x: pa.Array) -> pa.Array: + ... return pc.add(x, 1) # Vectorized operation on the entire Array + >>> + >>> # UDF that combines multiple columns (each as a PyArrow Array) + >>> @udf(return_dtype=DataType.string()) + ... def format_name(first: pa.Array, last: pa.Array) -> pa.Array: + ... return pc.binary_join_element_wise(first, last, " ") # Vectorized string concatenation + >>> + >>> # Use in dataset operations + >>> ds = ray.data.from_items([ + ... {"value": 5, "first": "John", "last": "Doe"}, + ... {"value": 10, "first": "Jane", "last": "Smith"} + ... 
]) + >>> + >>> # Single column transformation (operates on batches) + >>> ds_incremented = ds.with_column("value_plus_one", add_one(col("value"))) + >>> + >>> # Multi-column transformation (each column becomes a PyArrow Array) + >>> ds_formatted = ds.with_column("full_name", format_name(col("first"), col("last"))) + >>> + >>> # Can also be used in complex expressions + >>> ds_complex = ds.with_column("doubled_plus_one", add_one(col("value")) * 2) + """ + + def decorator(func: Callable[..., BatchColumn]) -> Callable[..., UDFExpr]: + return _create_udf_callable(func, return_dtype) + + return decorator + + +@DeveloperAPI(stability="alpha") +@dataclass(frozen=True, eq=False) +class DownloadExpr(Expr): + """Expression that represents a download operation.""" + + uri_column_name: str + data_type: DataType = field(default_factory=lambda: DataType.binary(), init=False) + + def structurally_equals(self, other: Any) -> bool: + return ( + isinstance(other, DownloadExpr) + and self.uri_column_name == other.uri_column_name + ) + + @PublicAPI(stability="beta") def col(name: str) -> ColumnExpr: """ @@ -301,6 +476,34 @@ def lit(value: Any) -> LiteralExpr: return LiteralExpr(value) +@DeveloperAPI(stability="alpha") +def download(uri_column_name: str) -> DownloadExpr: + """ + Create a download expression that downloads content from URIs. + + This creates an expression that will download bytes from URIs stored in + a specified column. When evaluated, it will fetch the content from each URI + and return the downloaded bytes. + + Args: + uri_column_name: The name of the column containing URIs to download from + Returns: + A DownloadExpr that will download content from the specified URI column + + Example: + >>> from ray.data.expressions import download + >>> import ray + >>> # Create dataset with URIs + >>> ds = ray.data.from_items([ + ... {"uri": "s3://bucket/file1.jpg", "id": "1"}, + ... {"uri": "s3://bucket/file2.jpg", "id": "2"} + ... 
]) + >>> # Add downloaded bytes column + >>> ds_with_bytes = ds.with_column("bytes", download("uri")) + """ + return DownloadExpr(uri_column_name=uri_column_name) + + # ────────────────────────────────────── # Public API for evaluation # ────────────────────────────────────── @@ -314,6 +517,10 @@ def lit(value: Any) -> LiteralExpr: "ColumnExpr", "LiteralExpr", "BinaryExpr", + "UDFExpr", + "udf", + "DownloadExpr", "col", "lit", + "download", ] diff --git a/python/ray/data/grouped_data.py b/python/ray/data/grouped_data.py index 0771c6bac3f6..0d7fb0ed5e2b 100644 --- a/python/ray/data/grouped_data.py +++ b/python/ray/data/grouped_data.py @@ -108,7 +108,7 @@ def map_groups( num_cpus: Optional[float] = None, num_gpus: Optional[float] = None, memory: Optional[float] = None, - concurrency: Optional[Union[int, Tuple[int, int]]] = None, + concurrency: Optional[Union[int, Tuple[int, int], Tuple[int, int, int]]] = None, ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None, **ray_remote_args, ) -> "Dataset": @@ -201,6 +201,9 @@ def map_groups( * If ``fn`` is a class and ``concurrency`` is a tuple ``(m, n)``, Ray Data uses an autoscaling actor pool from ``m`` to ``n`` workers. + * If ``fn`` is a class and ``concurrency`` is a tuple ``(m, n, initial)``, Ray + Data uses an autoscaling actor pool from ``m`` to ``n`` workers, with an initial size of ``initial``. + * If ``fn`` is a class and ``concurrency`` isn't set (default), this method raises an error. 
diff --git a/python/ray/data/iterator.py b/python/ray/data/iterator.py index 8602baf10f0a..7ed94eef077e 100644 --- a/python/ray/data/iterator.py +++ b/python/ray/data/iterator.py @@ -17,7 +17,7 @@ import numpy as np -from ray.data._internal.block_batching.iter_batches import iter_batches +from ray.data._internal.block_batching.iter_batches import BatchIterator from ray.data._internal.execution.interfaces import RefBundle from ray.data._internal.logical.interfaces import LogicalPlan from ray.data._internal.logical.operators.input_data_operator import InputData @@ -158,6 +158,11 @@ def iter_batches( local_shuffle_seed=local_shuffle_seed, ) + def _create_batch_iterator( + self, ref_bundles_iter: Iterator[RefBundle], **kwargs + ) -> BatchIterator: + return BatchIterator(ref_bundles_iter, **kwargs) + def _iter_batches( self, *, @@ -184,31 +189,27 @@ def _create_iterator() -> Iterator[DataBatch]: blocks_owned_by_consumer, ) = self._to_ref_bundle_iterator() - iterator = iter( - iter_batches( - ref_bundles_iterator, - stats=stats, - clear_block_after_read=blocks_owned_by_consumer, - batch_size=batch_size, - batch_format=batch_format, - drop_last=drop_last, - collate_fn=_collate_fn, - finalize_fn=_finalize_fn, - shuffle_buffer_min_size=local_shuffle_buffer_size, - shuffle_seed=local_shuffle_seed, - prefetch_batches=prefetch_batches, - ) - ) - dataset_tag = self._get_dataset_tag() + batch_iterator = self._create_batch_iterator( + ref_bundles_iterator, + stats=stats, + dataset_tag=dataset_tag, + clear_block_after_read=blocks_owned_by_consumer, + batch_size=batch_size, + batch_format=batch_format, + drop_last=drop_last, + collate_fn=_collate_fn, + finalize_fn=_finalize_fn, + shuffle_buffer_min_size=local_shuffle_buffer_size, + shuffle_seed=local_shuffle_seed, + prefetch_batches=prefetch_batches, + ) + if stats: stats.iter_initialize_s.add(time.perf_counter() - time_start) - for batch in iterator: - yield batch - StatsManager.update_iteration_metrics(stats, dataset_tag) - 
StatsManager.clear_iteration_metrics(dataset_tag) + yield from batch_iterator if stats: stats.iter_total_s.add(time.perf_counter() - time_start) diff --git a/python/ray/data/llm.py b/python/ray/data/llm.py index bdae67778cfb..a0718c597e98 100644 --- a/python/ray/data/llm.py +++ b/python/ray/data/llm.py @@ -5,6 +5,7 @@ HttpRequestProcessorConfig as _HttpRequestProcessorConfig, Processor, ProcessorConfig as _ProcessorConfig, + ServeDeploymentProcessorConfig as _ServeDeploymentProcessorConfig, SGLangEngineProcessorConfig as _SGLangEngineProcessorConfig, vLLMEngineProcessorConfig as _vLLMEngineProcessorConfig, ) @@ -27,6 +28,11 @@ class ProcessorConfig(_ProcessorConfig): accelerator_type: The accelerator type used by the LLM stage in a processor. Default to None, meaning that only the CPU will be used. concurrency: The number of workers for data parallelism. Default to 1. + If ``concurrency`` is a ``tuple`` ``(m, n)``, Ray creates an autoscaling + actor pool that scales between ``m`` and ``n`` workers (``1 <= m <= n``). + If ``concurrency`` is an ``int`` ``n``, Ray uses either a fixed pool of ``n`` + workers or an autoscaling pool from ``1`` to ``n`` workers, depending on + the processor and stage. """ pass @@ -40,7 +46,9 @@ class HttpRequestProcessorConfig(_HttpRequestProcessorConfig): batch_size: The batch size to send to the HTTP request. url: The URL to send the HTTP request to. headers: The headers to send with the HTTP request. - concurrency: The number of concurrent requests to send. + concurrency: The number of concurrent requests to send. Default to 1. + If ``concurrency`` is a ``tuple`` ``(m, n)``, + autoscaling strategy is used (``1 <= m <= n``). Examples: .. testcode:: @@ -115,6 +123,10 @@ class vLLMEngineProcessorConfig(_vLLMEngineProcessorConfig): accelerator_type: The accelerator type used by the LLM stage in a processor. Default to None, meaning that only the CPU will be used. concurrency: The number of workers for data parallelism. Default to 1. 
+ If ``concurrency`` is a tuple ``(m, n)``, Ray creates an autoscaling + actor pool that scales between ``m`` and ``n`` workers (``1 <= m <= n``). + If ``concurrency`` is an ``int`` ``n``, CPU stages use an autoscaling + pool from ``(1, n)``, while GPU stages use a fixed pool of ``n`` workers. Examples: @@ -176,7 +188,7 @@ class SGLangEngineProcessorConfig(_SGLangEngineProcessorConfig): Args: model_source: The model source to use for the SGLang engine. - batch_size: The batch size to send to the vLLM engine. Large batch sizes are + batch_size: The batch size to send to the SGLang engine. Large batch sizes are likely to saturate the compute resources and could achieve higher throughput. On the other hand, small batch sizes are more fault-tolerant and could reduce bubbles in the data pipeline. You can tune the batch size to balance @@ -196,12 +208,16 @@ class SGLangEngineProcessorConfig(_SGLangEngineProcessorConfig): apply_chat_template: Whether to apply chat template. chat_template: The chat template to use. This is usually not needed if the model checkpoint already contains the chat template. - tokenize: Whether to tokenize the input before passing it to the vLLM engine. - If not, vLLM will tokenize the prompt in the engine. + tokenize: Whether to tokenize the input before passing it to the SGLang engine. + If not, SGLang will tokenize the prompt in the engine. detokenize: Whether to detokenize the output. accelerator_type: The accelerator type used by the LLM stage in a processor. Default to None, meaning that only the CPU will be used. concurrency: The number of workers for data parallelism. Default to 1. + If ``concurrency`` is a tuple ``(m, n)``, Ray creates an autoscaling + actor pool that scales between ``m`` and ``n`` workers (``1 <= m <= n``). + If ``concurrency`` is an ``int`` ``n``, CPU stages use an autoscaling + pool from ``(1, n)``, while GPU stages use a fixed pool of ``n`` workers. Examples: .. 
testcode:: @@ -244,6 +260,109 @@ class SGLangEngineProcessorConfig(_SGLangEngineProcessorConfig): pass +@PublicAPI(stability="alpha") +class ServeDeploymentProcessorConfig(_ServeDeploymentProcessorConfig): + """The configuration for the serve deployment processor. + + This processor enables sharing serve deployments across multiple processors. This is useful + for sharing the same LLM engine across multiple processors. + + Args: + deployment_name: The name of the serve deployment to use. + app_name: The name of the serve application to use. + batch_size: The batch size to send to the serve deployment. Large batch sizes are + likely to saturate the compute resources and could achieve higher throughput. + On the other hand, small batch sizes are more fault-tolerant and could + reduce bubbles in the data pipeline. You can tune the batch size to balance + the throughput and fault-tolerance based on your use case. + dtype_mapping: The mapping of the request class name to the request class. If this is + not provided, the serve deployment is expected to accept a dict as the request. + concurrency: The number of workers for data parallelism. Default to 1. Note that this is + not the concurrency of the underlying serve deployment. + + Examples: + + .. 
testcode:: + :skipif: True + + import ray + from ray import serve + from ray.data.llm import ServeDeploymentProcessorConfig, build_llm_processor + from ray.serve.llm import ( + LLMConfig, + ModelLoadingConfig, + build_llm_deployment, + ) + from ray.serve.llm.openai_api_models import CompletionRequest + + llm_config = LLMConfig( + model_loading_config=ModelLoadingConfig( + model_id="facebook/opt-1.3b", + model_source="facebook/opt-1.3b", + ), + accelerator_type="A10G", + deployment_config=dict( + name="facebook", + autoscaling_config=dict( + min_replicas=1, + max_replicas=1, + ), + ), + engine_kwargs=dict( + enable_prefix_caching=True, + enable_chunked_prefill=True, + max_num_batched_tokens=4096, + ), + ) + + APP_NAME = "facebook_opt_app" + DEPLOYMENT_NAME = "facebook_deployment" + override_serve_options = dict(name=DEPLOYMENT_NAME) + + llm_app = build_llm_deployment( + llm_config, override_serve_options=override_serve_options + ) + app = serve.run(llm_app, name=APP_NAME) + + config = ServeDeploymentProcessorConfig( + deployment_name=DEPLOYMENT_NAME, + app_name=APP_NAME, + dtype_mapping={ + "CompletionRequest": CompletionRequest, + }, + concurrency=1, + batch_size=64, + ) + processor = build_llm_processor( + config, + preprocess=lambda row: dict( + method="completions", + dtype="CompletionRequest", + request_kwargs=dict( + model="facebook/opt-1.3b", + prompt=f"This is a prompt for {row['id']}", + stream=False, + ), + ), + postprocess=lambda row: dict( + resp=row["choices"][0]["text"], + ), + ) + + # The processor requires specific input columns, which depend on + # your processor config. 
You can use the following API to check + # the required input columns: + processor.log_input_column_names() + + ds = ray.data.range(10) + ds = processor(ds) + for row in ds.take_all(): + print(row) + """ + + pass + + @PublicAPI(stability="alpha") def build_llm_processor( config: ProcessorConfig, @@ -324,5 +443,6 @@ def build_llm_processor( "HttpRequestProcessorConfig", "vLLMEngineProcessorConfig", "SGLangEngineProcessorConfig", + "ServeDeploymentProcessorConfig", "build_llm_processor", ] diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index 227a1378f68f..5ead3a4eead5 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -95,8 +95,8 @@ from ray.data.datasource.file_meta_provider import ( DefaultFileMetadataProvider, FastFileMetadataProvider, + FileMetadataProvider, ) -from ray.data.datasource.parquet_meta_provider import ParquetMetadataProvider from ray.data.datasource.partitioning import Partitioning from ray.types import ObjectRef from ray.util.annotations import Deprecated, DeveloperAPI, PublicAPI @@ -801,7 +801,7 @@ def read_parquet( parallelism: int = -1, ray_remote_args: Dict[str, Any] = None, tensor_column_schema: Optional[Dict[str, Tuple[np.dtype, Tuple[int, ...]]]] = None, - meta_provider: Optional[ParquetMetadataProvider] = None, + meta_provider: Optional[FileMetadataProvider] = None, partition_filter: Optional[PathPartitionFilter] = None, partitioning: Optional[Partitioning] = Partitioning("hive"), shuffle: Optional[Union[Literal["files"], FileShuffleConfig]] = None, @@ -937,8 +937,6 @@ def read_parquet( _emit_meta_provider_deprecation_warning(meta_provider) _validate_shuffle_arg(shuffle) - if meta_provider is None: - meta_provider = ParquetMetadataProvider() arrow_parquet_args = _resolve_parquet_args( tensor_column_schema, **arrow_parquet_args, @@ -3428,7 +3426,7 @@ def from_huggingface( hf_ds_arrow = dataset.with_format("arrow") ray_ds = from_arrow(hf_ds_arrow[:], override_num_blocks=override_num_blocks) 
return ray_ds - elif isinstance(dataset, (datasets.DatasetDict, datasets.IterableDatasetDict)): + if isinstance(dataset, (datasets.DatasetDict, datasets.IterableDatasetDict)): available_keys = list(dataset.keys()) raise DeprecationWarning( "You provided a Hugging Face DatasetDict or IterableDatasetDict, " @@ -3886,7 +3884,7 @@ def read_delta( columns: Optional[List[str]] = None, parallelism: int = -1, ray_remote_args: Optional[Dict[str, Any]] = None, - meta_provider: Optional[ParquetMetadataProvider] = None, + meta_provider: Optional[FileMetadataProvider] = None, partition_filter: Optional[PathPartitionFilter] = None, partitioning: Optional[Partitioning] = Partitioning("hive"), shuffle: Union[Literal["files"], None] = None, diff --git a/python/ray/data/tests/block_batching/test_iter_batches.py b/python/ray/data/tests/block_batching/test_iter_batches.py index ecbdd7e16173..7ee6812fab9a 100644 --- a/python/ray/data/tests/block_batching/test_iter_batches.py +++ b/python/ray/data/tests/block_batching/test_iter_batches.py @@ -8,9 +8,13 @@ import pytest import ray -from ray.data._internal.block_batching.interfaces import Batch, BlockPrefetcher +from ray.data._internal.block_batching.interfaces import ( + Batch, + BatchMetadata, + BlockPrefetcher, +) from ray.data._internal.block_batching.iter_batches import ( - iter_batches, + BatchIterator, prefetch_batches_locally, restore_original_order, ) @@ -95,14 +99,14 @@ def prefetch_blocks(self, block_refs: List[ObjectRef[Block]]): def test_restore_from_original_order(): base_iterator = [ - Batch(1, None), - Batch(0, None), - Batch(3, None), - Batch(2, None), + Batch(BatchMetadata(batch_idx=1), None), + Batch(BatchMetadata(batch_idx=0), None), + Batch(BatchMetadata(batch_idx=3), None), + Batch(BatchMetadata(batch_idx=2), None), ] ordered = list(restore_original_order(iter(base_iterator))) - idx = [batch.batch_idx for batch in ordered] + idx = [batch.metadata.batch_idx for batch in ordered] assert idx == [0, 1, 2, 3] @@ -123,7 
+127,7 @@ def finalize_enforce_single_thread(batch): # Test that finalize_fn is called in a single thread, # even if prefetch_batches is set. - output_batches = iter_batches( + output_batches = BatchIterator( ref_bundles_iter, collate_fn=lambda batch: batch, finalize_fn=finalize_enforce_single_thread, @@ -156,7 +160,7 @@ def collate_fn(batch: pd.DataFrame): ref_bundles_iter = ref_bundle_generator(num_blocks=4, num_rows=2) - output_batches = iter_batches( + output_batches = BatchIterator( ref_bundles_iter, batch_size=batch_size, prefetch_batches=prefetch_batches, @@ -198,7 +202,7 @@ def collate_fn(batch): ref_bundles = ref_bundle_generator(num_blocks=20, num_rows=2) start_time = time.time() - output_batches = iter_batches( + output_batches = BatchIterator( ref_bundles, batch_size=None, collate_fn=collate_fn, diff --git a/python/ray/data/tests/block_batching/test_util.py b/python/ray/data/tests/block_batching/test_util.py index 098ed64a4004..f8be82e43281 100644 --- a/python/ray/data/tests/block_batching/test_util.py +++ b/python/ray/data/tests/block_batching/test_util.py @@ -10,7 +10,7 @@ import pytest import ray -from ray.data._internal.block_batching.interfaces import Batch +from ray.data._internal.block_batching.interfaces import Batch, BatchMetadata from ray.data._internal.block_batching.util import ( _calculate_ref_hits, blocks_to_batches, @@ -64,13 +64,17 @@ def test_blocks_to_batches(block_size, drop_last): assert leftover_batches == 1 assert full_batches == (dataset_size // batch_size) - assert [batch.batch_idx for batch in batch_iter] == list(range(len(batch_iter))) + assert [batch.metadata.batch_idx for batch in batch_iter] == list( + range(len(batch_iter)) + ) @pytest.mark.parametrize("batch_format", ["pandas", "numpy", "pyarrow"]) def test_format_batches(batch_format): block_iter = block_generator(num_rows=2, num_blocks=2) - batch_iter = (Batch(i, block) for i, block in enumerate(block_iter)) + batch_iter = ( + Batch(BatchMetadata(batch_idx=i), block) for 
i, block in enumerate(block_iter) + ) batch_iter = list(format_batches(batch_iter, batch_format=batch_format)) for batch in batch_iter: @@ -82,7 +86,9 @@ def test_format_batches(batch_format): assert isinstance(batch.data, dict) assert isinstance(batch.data["foo"], np.ndarray) - assert [batch.batch_idx for batch in batch_iter] == list(range(len(batch_iter))) + assert [batch.metadata.batch_idx for batch in batch_iter] == list( + range(len(batch_iter)) + ) def test_collate(): @@ -90,13 +96,13 @@ def collate_fn(batch): return pa.table({"bar": [1] * 2}) batches = [ - Batch(i, data) + Batch(BatchMetadata(batch_idx=i), data) for i, data in enumerate(block_generator(num_rows=2, num_blocks=2)) ] batch_iter = collate(batches, collate_fn=collate_fn) for i, batch in enumerate(batch_iter): - assert batch.batch_idx == i + assert batch.metadata.batch_idx == i assert batch.data == pa.table({"bar": [1] * 2}) @@ -105,13 +111,13 @@ def finalize_fn(batch): return pa.table({"bar": [1] * 2}) batches = [ - Batch(i, data) + Batch(BatchMetadata(batch_idx=i), data) for i, data in enumerate(block_generator(num_rows=2, num_blocks=2)) ] batch_iter = finalize_batches(batches, finalize_fn=finalize_fn) for i, batch in enumerate(batch_iter): - assert batch.batch_idx == i + assert batch.metadata.batch_idx == i assert batch.data == pa.table({"bar": [1] * 2}) diff --git a/python/ray/data/tests/conftest.py b/python/ray/data/tests/conftest.py index 53145fa06e98..6ee6e4df6bfc 100644 --- a/python/ray/data/tests/conftest.py +++ b/python/ray/data/tests/conftest.py @@ -16,7 +16,6 @@ from ray._private.internal_api import get_memory_info_reply, get_state_from_address from ray.air.constants import TENSOR_COLUMN_NAME from ray.air.util.tensor_extensions.arrow import ArrowTensorArray -from ray.data import Schema from ray.data.block import BlockExecStats, BlockMetadata from ray.data.context import DEFAULT_TARGET_MAX_BLOCK_SIZE, DataContext, ShuffleStrategy from ray.data.tests.mock_server import * # noqa @@ 
-145,19 +144,38 @@ def _s3_fs(aws_credentials, s3_server, s3_path): kwargs["allow_bucket_creation"] = True kwargs["allow_bucket_deletion"] = True - fs = pa.fs.S3FileSystem( - region="us-west-2", - endpoint_override=s3_server, - **kwargs, - ) - if s3_path.startswith("s3://"): - if "@" in s3_path: - s3_path = s3_path.split("@")[-1] - else: - s3_path = s3_path[len("s3://") :] - s3_path = urllib.parse.quote(s3_path) - fs.create_dir(s3_path) - yield fs + fs = None + try: + fs = pa.fs.S3FileSystem( + region="us-west-2", + endpoint_override=s3_server, + **kwargs, + ) + if s3_path.startswith("s3://"): + if "@" in s3_path: + s3_path = s3_path.split("@")[-1] + else: + s3_path = s3_path[len("s3://") :] + s3_path = urllib.parse.quote(s3_path) + fs.create_dir(s3_path) + yield fs + + finally: + # Explicit cleanup for S3FileSystem resources + if fs is not None: + try: + # Clean up test directory if it exists + try: + file_info = fs.get_file_info(s3_path) + if file_info.type != pa.fs.FileType.NotFound: + fs.delete_dir(s3_path) + except (OSError, pa.lib.ArrowIOError): + # Directory doesn't exist or can't be deleted, that's fine + pass + except Exception as e: + print(f"Warning: S3 filesystem cleanup error: {e}") + finally: + fs = None @pytest.fixture(scope="function") @@ -213,65 +231,6 @@ def _write_partitioned_df( yield _write_partitioned_df -@pytest.fixture(scope="function") -def write_base_partitioned_df(base_partitioned_df, write_partitioned_df): - def _write_base_partitioned_df( - partition_keys, - partition_path_encoder, - file_writer_fn, - ): - write_partitioned_df( - base_partitioned_df, - partition_keys, - partition_path_encoder, - file_writer_fn, - ) - - yield _write_base_partitioned_df - - -@pytest.fixture(scope="function") -def assert_base_partitioned_ds(): - def _assert_base_partitioned_ds( - ds, - count=6, - num_input_files=2, - num_rows=6, - schema=Schema(pa.schema([("one", pa.int64()), ("two", pa.string())])), - sorted_values=None, - ds_take_transform_fn=None, - 
sorted_values_transform_fn=None, - ): - if ds_take_transform_fn is None: - ds_take_transform_fn = lambda taken: [ # noqa: E731 - [s["one"], s["two"]] for s in taken - ] - - if sorted_values_transform_fn is None: - sorted_values_transform_fn = ( # noqa: E731 - lambda sorted_values: sorted_values - ) - - if sorted_values is None: - sorted_values = [[1, "a"], [1, "b"], [1, "c"], [3, "e"], [3, "f"], [3, "g"]] - # Test metadata ops. - assert not ds._plan.has_started_execution - assert ds.count() == count, f"{ds.count()} != {count}" - assert ds.size_bytes() > 0, f"{ds.size_bytes()} <= 0" - assert ds.schema() == schema - actual_input_files = ds.input_files() - assert len(actual_input_files) == num_input_files, actual_input_files - - # Force a data read. - values = ds_take_transform_fn(ds.take_all()) - actual_sorted_values = sorted_values_transform_fn(sorted(values)) - assert ( - actual_sorted_values == sorted_values - ), f"{actual_sorted_values} != {sorted_values}" - - yield _assert_base_partitioned_ds - - @pytest.fixture def restore_data_context(request): """Restore any DataContext changes after the test runs""" diff --git a/python/ray/data/tests/mock_server.py b/python/ray/data/tests/mock_server.py index f8a5e22bfa12..337aa1f87a72 100644 --- a/python/ray/data/tests/mock_server.py +++ b/python/ray/data/tests/mock_server.py @@ -1,5 +1,6 @@ import shutil import signal +import socket import subprocess as sp import time @@ -16,10 +17,51 @@ } +def _is_port_available(host, port): + """Check if a port is available for use.""" + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + s.bind((host, port)) + return True + except OSError: + return False + + +def _find_available_port(host, preferred_port, max_attempts=10): + """Find an available port starting from preferred_port.""" + + # Try the preferred port first + if _is_port_available(host, preferred_port): + return preferred_port + + # Try a wider range 
if preferred port is busy + for i in range(1, max_attempts): + port = preferred_port + i + if _is_port_available(host, port): + return port + + # If all else fails, let the OS pick a port + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + s.bind((host, 0)) # Let OS pick port + _, port = s.getsockname() + return port + except OSError as e: + raise RuntimeError( + f"Could not find any available port starting from " f"{preferred_port}: {e}" + ) from e + + def start_service(service_name, host, port): moto_svr_path = shutil.which("moto_server") if not moto_svr_path: pytest.skip("moto not installed") + + # Always use port conflict resolution to be safe + port = _find_available_port(host, port) + args = [moto_svr_path, service_name, "-H", host, "-p", str(port)] # For debugging # args = '{0} {1} -H {2} -p {3} 2>&1 | \ @@ -48,21 +90,25 @@ def start_service(service_name, host, port): stop_process(process) # pytest.fail doesn't call stop_process pytest.fail("Can not start service: {}".format(service_name)) - return process + return process, url def stop_process(process): + """Stop process with shorter timeout to prevent test hangs.""" + if process is None or process.poll() is not None: + return # Already stopped + try: process.send_signal(signal.SIGTERM) process.communicate(timeout=20) except sp.TimeoutExpired: process.kill() - outs, errors = process.communicate(timeout=20) - exit_code = process.returncode - msg = "Child process finished {} not in clean way: {} {}".format( - exit_code, outs, errors - ) - raise RuntimeError(msg) + try: + process.communicate(timeout=5) # Short timeout for kill + except sp.TimeoutExpired: + print("Warning: Process cleanup timed out") + except Exception as e: + print(f"Warning: Error during process cleanup: {e}") # TODO(Clark): We should be able to use "session" scope here, but we've found @@ -75,7 +121,6 @@ def stop_process(process): def s3_server(): host = 
"localhost" port = 5002 - url = f"http://{build_address(host, port)}" - process = start_service("s3", host, port) + process, url = start_service("s3", host, port) yield url stop_process(process) diff --git a/python/ray/data/tests/test_actor_pool_map_operator.py b/python/ray/data/tests/test_actor_pool_map_operator.py index cf68f0ee05c8..e6b149c153a5 100644 --- a/python/ray/data/tests/test_actor_pool_map_operator.py +++ b/python/ray/data/tests/test_actor_pool_map_operator.py @@ -4,6 +4,7 @@ import threading import time import unittest +from dataclasses import replace from typing import Any, Dict, Optional, Tuple from unittest.mock import MagicMock @@ -13,11 +14,9 @@ import ray from ray._common.test_utils import wait_for_condition from ray.actor import ActorHandle -from ray.data._internal.execution.autoscaler.default_autoscaler import ( - ActorPoolScalingRequest, -) +from ray.data._internal.actor_autoscaler import ActorPoolScalingRequest from ray.data._internal.execution.bundle_queue import FIFOBundleQueue -from ray.data._internal.execution.interfaces import ExecutionResources +from ray.data._internal.execution.interfaces import ExecutionOptions, ExecutionResources from ray.data._internal.execution.interfaces.physical_operator import _ActorPoolInfo from ray.data._internal.execution.interfaces.ref_bundle import RefBundle from ray.data._internal.execution.operators.actor_pool_map_operator import ( @@ -93,11 +92,13 @@ def _create_actor_pool( self, min_size=1, max_size=4, + initial_size=1, max_tasks_in_flight=4, ): pool = _ActorPool( min_size=min_size, max_size=max_size, + initial_size=initial_size, max_actor_concurrency=1, max_tasks_in_flight_per_actor=max_tasks_in_flight, create_actor_fn=self._create_actor_fn, @@ -161,7 +162,11 @@ def test_can_scale_down(self): pool.scale(ActorPoolScalingRequest(delta=1, reason="scaling up")) # Assert we can't scale down immediately after scale up assert not pool._can_apply(downscaling_request) - assert pool._last_upscaling_ts == 
time.time() + assert pool._last_upscaled_at == time.time() + + # Check that we can still scale down if downscaling request + # is a forced one + assert pool._can_apply(replace(downscaling_request, force=True)) # Advance clock f.tick( @@ -588,13 +593,32 @@ def test_locality_based_actor_ranking_no_locations(self): assert res5 is None +def test_setting_initial_size_for_actor_pool(): + data_context = ray.data.DataContext.get_current() + op = ActorPoolMapOperator( + map_transformer=MagicMock(), + input_op=InputDataBuffer(data_context, input_data=MagicMock()), + data_context=data_context, + compute_strategy=ray.data.ActorPoolStrategy( + min_size=1, max_size=4, initial_size=2 + ), + ray_remote_args={"num_cpus": 1}, + ) + + op.start(ExecutionOptions()) + + assert op._actor_pool.get_actor_info() == _ActorPoolInfo( + running=0, pending=2, restarting=0 + ) + ray.shutdown() + + def test_min_max_resource_requirements(restore_data_context): data_context = ray.data.DataContext.get_current() op = ActorPoolMapOperator( map_transformer=MagicMock(), input_op=InputDataBuffer(data_context, input_data=MagicMock()), data_context=data_context, - target_max_block_size=None, compute_strategy=ray.data.ActorPoolStrategy( min_size=1, max_size=2, diff --git a/python/ray/data/tests/test_arrow_block.py b/python/ray/data/tests/test_arrow_block.py index 3790964d19b3..f599bfc58ee5 100644 --- a/python/ray/data/tests/test_arrow_block.py +++ b/python/ray/data/tests/test_arrow_block.py @@ -542,6 +542,7 @@ def test_arrow_nan_element(): "table_data,max_chunk_size_bytes,expected", [ ({"a": []}, 100, None), + ({"a": list(range(100))}, 7, 1), ({"a": list(range(100))}, 10, 1), ({"a": list(range(100))}, 25, 3), ({"a": list(range(100))}, 50, 6), diff --git a/python/ray/data/tests/test_autoscaler.py b/python/ray/data/tests/test_autoscaler.py index 736fff47fb58..a41c7d3bebff 100644 --- a/python/ray/data/tests/test_autoscaler.py +++ b/python/ray/data/tests/test_autoscaler.py @@ -8,10 +8,11 @@ import ray from 
ray.data import ExecutionResources -from ray.data._internal.execution.autoscaler.default_autoscaler import ( +from ray.data._internal.actor_autoscaler import ( ActorPoolScalingRequest, - DefaultAutoscaler, + DefaultActorAutoscaler, ) +from ray.data._internal.cluster_autoscaler import DefaultClusterAutoscaler from ray.data._internal.execution.operators.actor_pool_map_operator import _ActorPool from ray.data._internal.execution.operators.base_physical_operator import ( InternalQueueOperatorMixin, @@ -30,10 +31,9 @@ def test_actor_pool_scaling(): resource_manager = MagicMock( spec=ResourceManager, get_budget=MagicMock(return_value=None) ) - autoscaler = DefaultAutoscaler( + autoscaler = DefaultActorAutoscaler( topology=MagicMock(), resource_manager=resource_manager, - execution_id="execution_id", config=AutoscalingConfig( actor_pool_util_upscaling_threshold=1.0, actor_pool_util_downscaling_threshold=0.5, @@ -80,14 +80,16 @@ def patch(mock, attr, value, is_method=True): yield setattr(mock, attr, original) - def assert_autoscaling_action(*, delta: int, expected_reason: Optional[str]): + def assert_autoscaling_action( + *, delta: int, expected_reason: Optional[str], force: bool = False + ): nonlocal actor_pool, op, op_state assert autoscaler._derive_target_scaling_config( actor_pool=actor_pool, op=op, op_state=op_state, - ) == ActorPoolScalingRequest(delta=delta, reason=expected_reason) + ) == ActorPoolScalingRequest(delta=delta, force=force, reason=expected_reason) # Should scale up since the util above the threshold. 
assert actor_pool.get_pool_util() == 1.5 @@ -141,6 +143,7 @@ def assert_autoscaling_action(*, delta: int, expected_reason: Optional[str]): assert_autoscaling_action( delta=-1, expected_reason="consumed all inputs", + force=True, ) # Should scale down only once all inputs have been already dispatched AND @@ -150,6 +153,7 @@ def assert_autoscaling_action(*, delta: int, expected_reason: Optional[str]): with patch(op, "_inputs_complete", True, is_method=False): assert_autoscaling_action( delta=-1, + force=True, expected_reason="consumed all inputs", ) @@ -236,15 +240,14 @@ def test_cluster_scaling(): op2: op_state2, } - autoscaler = DefaultAutoscaler( + autoscaler = DefaultClusterAutoscaler( topology=topology, resource_manager=MagicMock(), execution_id="execution_id", - config=AutoscalingConfig(), ) autoscaler._send_resource_request = MagicMock() - autoscaler._try_scale_up_cluster() + autoscaler.try_trigger_scaling() autoscaler._send_resource_request.assert_called_once_with( [{"CPU": 1}, {"CPU": 2}, {"CPU": 2}] diff --git a/python/ray/data/tests/test_binary.py b/python/ray/data/tests/test_binary.py index f1735da802f7..18e07200306a 100644 --- a/python/ray/data/tests/test_binary.py +++ b/python/ray/data/tests/test_binary.py @@ -1,45 +1,22 @@ import os from io import BytesIO -import pandas as pd import pyarrow as pa import pytest import requests import snappy import ray -from ray.data import Schema from ray.data.datasource import ( BaseFileMetadataProvider, FastFileMetadataProvider, - Partitioning, - PartitionStyle, - PathPartitionFilter, ) from ray.data.tests.conftest import * # noqa from ray.data.tests.mock_http_server import * # noqa -from ray.data.tests.test_partitioning import PathPartitionEncoder from ray.data.tests.util import extract_values, gen_bin_files from ray.tests.conftest import * # noqa -def test_read_binary_files_partitioning(ray_start_regular_shared, tmp_path): - os.mkdir(os.path.join(tmp_path, "country=us")) - path = os.path.join(tmp_path, "country=us", 
"file.bin") - with open(path, "wb") as f: - f.write(b"foo") - - ds = ray.data.read_binary_files(path, partitioning=Partitioning("hive")) - - assert ds.take() == [{"bytes": b"foo", "country": "us"}] - - ds = ray.data.read_binary_files( - path, include_paths=True, partitioning=Partitioning("hive") - ) - - assert ds.take() == [{"bytes": b"foo", "path": path, "country": "us"}] - - def test_read_binary_files(ray_start_regular_shared): with gen_bin_files(10) as (_, paths): ds = ray.data.read_binary_files(paths) @@ -52,24 +29,6 @@ def test_read_binary_files(ray_start_regular_shared): assert "bytes" in str(ds), ds -@pytest.mark.parametrize("ignore_missing_paths", [True, False]) -def test_read_binary_files_ignore_missing_paths( - ray_start_regular_shared, ignore_missing_paths -): - with gen_bin_files(1) as (_, paths): - paths = paths + ["missing_file"] - if ignore_missing_paths: - ds = ray.data.read_binary_files( - paths, ignore_missing_paths=ignore_missing_paths - ) - assert ds.input_files() == [paths[0]] - else: - with pytest.raises(FileNotFoundError): - ds = ray.data.read_binary_files( - paths, ignore_missing_paths=ignore_missing_paths - ).materialize() - - def test_read_binary_files_with_fs(ray_start_regular_shared): with gen_bin_files(10) as (tempdir, paths): # All the paths are absolute, so we want the root file system. 
@@ -142,59 +101,6 @@ def test_read_binary_meta_provider( ) -@pytest.mark.parametrize("style", [PartitionStyle.HIVE, PartitionStyle.DIRECTORY]) -def test_read_binary_snappy_partitioned_with_filter( - style, - ray_start_regular_shared, - tmp_path, - write_base_partitioned_df, - assert_base_partitioned_ds, -): - def df_to_binary(dataframe, path, **kwargs): - with open(path, "wb") as f: - df_string = dataframe.to_string(index=False, header=False, **kwargs) - byte_str = df_string.encode() - bytes = BytesIO(byte_str) - snappy.stream_compress(bytes, f) - - partition_keys = ["one"] - - def skip_unpartitioned(kv_dict): - return bool(kv_dict) - - base_dir = os.path.join(tmp_path, style.value) - partition_path_encoder = PathPartitionEncoder.of( - style=style, - base_dir=base_dir, - field_names=partition_keys, - ) - write_base_partitioned_df( - partition_keys, - partition_path_encoder, - df_to_binary, - ) - df_to_binary(pd.DataFrame({"1": [1]}), os.path.join(base_dir, "test.snappy")) - partition_path_filter = PathPartitionFilter.of( - style=style, - base_dir=base_dir, - field_names=partition_keys, - filter_fn=skip_unpartitioned, - ) - ds = ray.data.read_binary_files( - base_dir, - partition_filter=partition_path_filter, - arrow_open_stream_args=dict(compression="snappy"), - ) - assert_base_partitioned_ds( - ds, - count=2, - num_rows=2, - schema=Schema(pa.schema([("bytes", pa.binary())])), - sorted_values=[b"1 a\n1 b\n1 c", b"3 e\n3 f\n3 g"], - ds_take_transform_fn=lambda t: extract_values("bytes", t), - ) - - if __name__ == "__main__": import sys diff --git a/python/ray/data/tests/test_consumption.py b/python/ray/data/tests/test_consumption.py index 4176bce5aa7b..c8437d5aaa22 100644 --- a/python/ray/data/tests/test_consumption.py +++ b/python/ray/data/tests/test_consumption.py @@ -535,6 +535,50 @@ def my_dummy_fn(x): ) +def test_dataset_explain(ray_start_regular_shared, capsys): + ds = ray.data.range(10, override_num_blocks=10) + ds = ds.map(lambda x: x) + + ds.explain() + 
captured = capsys.readouterr() + assert captured.out.rstrip() == ( + "-------- Logical Plan --------\n" + "Map()\n" + "+- ReadRange\n" + "-------- Physical Plan --------\n" + "TaskPoolMapOperator[ReadRange->Map()]\n" + "+- InputDataBuffer[Input]" + ) + + ds = ds.filter(lambda x: x["id"] > 0) + ds.explain() + captured = capsys.readouterr() + assert captured.out.rstrip() == ( + "-------- Logical Plan --------\n" + "Filter()\n" + "+- Map()\n" + " +- ReadRange\n" + "-------- Physical Plan --------\n" + "TaskPoolMapOperator[ReadRange->Map()->Filter()]\n" + "+- InputDataBuffer[Input]" + ) + ds = ds.random_shuffle().map(lambda x: x) + ds.explain() + captured = capsys.readouterr() + assert captured.out.rstrip() == ( + "-------- Logical Plan --------\n" + "Map()\n" + "+- RandomShuffle\n" + " +- Filter()\n" + " +- Map()\n" + " +- ReadRange\n" + "-------- Physical Plan --------\n" + "TaskPoolMapOperator[Map()]\n" + "+- AllToAllOperator[ReadRange->Map()->Filter()->RandomShuffle]\n" + " +- InputDataBuffer[Input]" + ) + + @pytest.mark.parametrize("lazy", [False, True]) def test_limit(ray_start_regular_shared, lazy): ds = ray.data.range(100, override_num_blocks=20) @@ -1819,7 +1863,7 @@ def test_dataset_plan_as_string(ray_start_cluster): ds = ray.data.read_parquet("example://iris.parquet", override_num_blocks=8) assert ds._plan.get_plan_as_string(type(ds)) == ( "Dataset(\n" - " num_rows=150,\n" + " num_rows=?,\n" " schema={\n" " sepal.length: double,\n" " sepal.width: double,\n" @@ -1838,7 +1882,7 @@ def test_dataset_plan_as_string(ray_start_cluster): " +- MapBatches()\n" " +- MapBatches()\n" " +- Dataset(\n" - " num_rows=150,\n" + " num_rows=?,\n" " schema={\n" " sepal.length: double,\n" " sepal.width: double,\n" diff --git a/python/ray/data/tests/test_csv.py b/python/ray/data/tests/test_csv.py index 93ef35261e8f..6beca52fe113 100644 --- a/python/ray/data/tests/test_csv.py +++ b/python/ray/data/tests/test_csv.py @@ -1,14 +1,11 @@ -import itertools import os import shutil -from 
functools import partial import pandas as pd import pyarrow as pa import pyarrow.parquet as pq import pytest from packaging.version import Version -from pytest_lazy_fixtures import lf as lazy_fixture import ray from ray.data import Schema @@ -17,8 +14,6 @@ from ray.data.datasource import ( BaseFileMetadataProvider, FastFileMetadataProvider, - PartitionStyle, - PathPartitionFilter, ) from ray.data.datasource.file_based_datasource import ( FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD, @@ -26,7 +21,6 @@ from ray.data.datasource.path_util import _unwrap_protocol from ray.data.tests.conftest import * # noqa from ray.data.tests.mock_http_server import * # noqa -from ray.data.tests.test_partitioning import PathPartitionEncoder from ray.tests.conftest import * # noqa @@ -34,55 +28,14 @@ def df_to_csv(dataframe, path, **kwargs): dataframe.to_csv(path, **kwargs) -def test_csv_read_partitioning(ray_start_regular_shared, tmp_path): - path = os.path.join(tmp_path, "country=us", "file.csv") - os.mkdir(os.path.dirname(path)) - df = pd.DataFrame({"numbers": [1, 2, 3], "letters": ["a", "b", "c"]}) - df.to_csv(path, index=False) - - ds = ray.data.read_csv(path) - - assert ds.take() == [ - {"numbers": 1, "letters": "a", "country": "us"}, - {"numbers": 2, "letters": "b", "country": "us"}, - {"numbers": 3, "letters": "c", "country": "us"}, - ] - - -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ( - lazy_fixture("s3_fs_with_space"), - lazy_fixture("s3_path_with_space"), - lazy_fixture("s3_server"), - ), - ( - lazy_fixture("s3_fs_with_special_chars"), - lazy_fixture("s3_path_with_special_chars"), - lazy_fixture("s3_server"), - ), - ], -) def test_csv_read( - ray_start_regular_shared, - fs, - data_path, - endpoint_url, - target_max_block_size_infinite_or_default, + ray_start_regular_shared, 
tmp_path, target_max_block_size_infinite_or_default ): - if endpoint_url is None: - storage_options = {} - else: - storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) # Single file. df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - path1 = os.path.join(data_path, "test1.csv") - df1.to_csv(path1, index=False, storage_options=storage_options) - ds = ray.data.read_csv(path1, filesystem=fs, partitioning=None) + path1 = os.path.join(tmp_path, "test1.csv") + df1.to_csv(path1, index=False) + ds = ray.data.read_csv(path1, partitioning=None) dsdf = ds.to_pandas().sort_values(by=["one", "two"]).reset_index(drop=True) assert df1.equals(dsdf) # Test metadata ops. @@ -92,11 +45,9 @@ def test_csv_read( # Two files, override_num_blocks=2. df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) - path2 = os.path.join(data_path, "test2.csv") - df2.to_csv(path2, index=False, storage_options=storage_options) - ds = ray.data.read_csv( - [path1, path2], override_num_blocks=2, filesystem=fs, partitioning=None - ) + path2 = os.path.join(tmp_path, "test2.csv") + df2.to_csv(path2, index=False) + ds = ray.data.read_csv([path1, path2], override_num_blocks=2, partitioning=None) dsdf = ds.to_pandas().sort_values(by=["one", "two"]).reset_index(drop=True) df = pd.concat([df1, df2], ignore_index=True) assert df.equals(dsdf) @@ -106,12 +57,11 @@ def test_csv_read( # Three files, override_num_blocks=2. df3 = pd.DataFrame({"one": [7, 8, 9], "two": ["h", "i", "j"]}) - path3 = os.path.join(data_path, "test3.csv") - df3.to_csv(path3, index=False, storage_options=storage_options) + path3 = os.path.join(tmp_path, "test3.csv") + df3.to_csv(path3, index=False) ds = ray.data.read_csv( [path1, path2, path3], override_num_blocks=2, - filesystem=fs, partitioning=None, ) df = pd.concat([df1, df2, df3], ignore_index=True) @@ -119,136 +69,89 @@ def test_csv_read( assert df.equals(dsdf) # Directory, two files. 
- path = os.path.join(data_path, "test_csv_dir") - if fs is None: - os.mkdir(path) - else: - fs.create_dir(_unwrap_protocol(path)) + path = os.path.join(tmp_path, "test_csv_dir") + os.mkdir(path) df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) path1 = os.path.join(path, "data0.csv") - df1.to_csv(path1, index=False, storage_options=storage_options) + df1.to_csv(path1, index=False) df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) path2 = os.path.join(path, "data1.csv") - df2.to_csv(path2, index=False, storage_options=storage_options) - ds = ray.data.read_csv(path, filesystem=fs, partitioning=None) + df2.to_csv(path2, index=False) + ds = ray.data.read_csv(path, partitioning=None) df = pd.concat([df1, df2], ignore_index=True) dsdf = ds.to_pandas().sort_values(by=["one", "two"]).reset_index(drop=True) pd.testing.assert_frame_equal(df, dsdf) - if fs is None: - shutil.rmtree(path) - else: - fs.delete_dir(_unwrap_protocol(path)) + shutil.rmtree(path) # Two directories, three files. 
- path1 = os.path.join(data_path, "test_csv_dir1") - path2 = os.path.join(data_path, "test_csv_dir2") - if fs is None: - os.mkdir(path1) - os.mkdir(path2) - else: - fs.create_dir(_unwrap_protocol(path1)) - fs.create_dir(_unwrap_protocol(path2)) + path1 = os.path.join(tmp_path, "test_csv_dir1") + path2 = os.path.join(tmp_path, "test_csv_dir2") + os.mkdir(path1) + os.mkdir(path2) df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) file_path1 = os.path.join(path1, "data0.csv") - df1.to_csv(file_path1, index=False, storage_options=storage_options) + df1.to_csv(file_path1, index=False) df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) file_path2 = os.path.join(path2, "data1.csv") - df2.to_csv(file_path2, index=False, storage_options=storage_options) + df2.to_csv(file_path2, index=False) df3 = pd.DataFrame({"one": [7, 8, 9], "two": ["h", "i", "j"]}) file_path3 = os.path.join(path2, "data2.csv") - df3.to_csv(file_path3, index=False, storage_options=storage_options) - ds = ray.data.read_csv([path1, path2], filesystem=fs, partitioning=None) + df3.to_csv(file_path3, index=False) + ds = ray.data.read_csv([path1, path2], partitioning=None) df = pd.concat([df1, df2, df3], ignore_index=True) dsdf = ds.to_pandas().sort_values(by=["one", "two"]).reset_index(drop=True) assert df.equals(dsdf) - if fs is None: - shutil.rmtree(path1) - shutil.rmtree(path2) - else: - fs.delete_dir(_unwrap_protocol(path1)) - fs.delete_dir(_unwrap_protocol(path2)) + shutil.rmtree(path1) + shutil.rmtree(path2) # Directory and file, two files. 
- dir_path = os.path.join(data_path, "test_csv_dir") - if fs is None: - os.mkdir(dir_path) - else: - fs.create_dir(_unwrap_protocol(dir_path)) + dir_path = os.path.join(tmp_path, "test_csv_dir") + os.mkdir(dir_path) df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) path1 = os.path.join(dir_path, "data0.csv") - df1.to_csv(path1, index=False, storage_options=storage_options) + df1.to_csv(path1, index=False) df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) - path2 = os.path.join(data_path, "data1.csv") - df2.to_csv(path2, index=False, storage_options=storage_options) - ds = ray.data.read_csv([dir_path, path2], filesystem=fs, partitioning=None) + path2 = os.path.join(tmp_path, "data1.csv") + df2.to_csv(path2, index=False) + ds = ray.data.read_csv([dir_path, path2], partitioning=None) df = pd.concat([df1, df2], ignore_index=True) dsdf = ds.to_pandas().sort_values(by=["one", "two"]).reset_index(drop=True) assert df.equals(dsdf) - if fs is None: - shutil.rmtree(dir_path) - else: - fs.delete_dir(_unwrap_protocol(dir_path)) + shutil.rmtree(dir_path) # Directory, two files and non-csv file (test extension-based path filtering). - path = os.path.join(data_path, "test_csv_dir") - if fs is None: - os.mkdir(path) - else: - fs.create_dir(_unwrap_protocol(path)) + path = os.path.join(tmp_path, "test_csv_dir") + os.mkdir(path) df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) path1 = os.path.join(path, "data0.csv") - df1.to_csv(path1, index=False, storage_options=storage_options) + df1.to_csv(path1, index=False) df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) path2 = os.path.join(path, "data1.csv") - df2.to_csv(path2, index=False, storage_options=storage_options) + df2.to_csv(path2, index=False) # Add a file with a non-matching file extension. This file should be ignored. 
df_txt = pd.DataFrame({"foobar": [1, 2, 3]}) df_txt.to_json( os.path.join(path, "foo.txt"), - storage_options=storage_options, ) ds = ray.data.read_csv( path, - filesystem=fs, file_extensions=["csv"], partitioning=None, ) df = pd.concat([df1, df2], ignore_index=True) dsdf = ds.to_pandas().sort_values(by=["one", "two"]).reset_index(drop=True) assert df.equals(dsdf) - if fs is None: - shutil.rmtree(path) - else: - fs.delete_dir(_unwrap_protocol(path)) - - -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ], -) -def test_csv_read_meta_provider( - ray_start_regular_shared, - fs, - data_path, - endpoint_url, -): - if endpoint_url is None: - storage_options = {} - else: - storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) + shutil.rmtree(path) + +def test_csv_read_meta_provider(ray_start_regular_shared, tmp_path): df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - path1 = os.path.join(data_path, "test1.csv") - df1.to_csv(path1, index=False, storage_options=storage_options) + path1 = os.path.join(tmp_path, "test1.csv") + df1.to_csv(path1, index=False) ds = ray.data.read_csv( path1, - filesystem=fs, meta_provider=FastFileMetadataProvider(), ) @@ -263,142 +166,35 @@ def test_csv_read_meta_provider( with pytest.raises(NotImplementedError): ray.data.read_csv( path1, - filesystem=fs, meta_provider=BaseFileMetadataProvider(), ) -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ], -) -def test_csv_read_many_files_basic( - ray_start_regular_shared, - fs, - data_path, - endpoint_url, -): - if endpoint_url is None: - storage_options = {} - else: - 
storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) - +def test_csv_read_many_files_basic(ray_start_regular_shared, tmp_path): paths = [] dfs = [] num_dfs = 4 * FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD for i in range(num_dfs): df = pd.DataFrame({"one": list(range(i * 3, (i + 1) * 3))}) dfs.append(df) - path = os.path.join(data_path, f"test_{i}.csv") + path = os.path.join(tmp_path, f"test_{i}.csv") paths.append(path) - df.to_csv(path, index=False, storage_options=storage_options) - ds = ray.data.read_csv(paths, filesystem=fs) + df.to_csv(path, index=False) + ds = ray.data.read_csv(paths) dsdf = ds.to_pandas() df = pd.concat(dfs).reset_index(drop=True) pd.testing.assert_frame_equal(df, dsdf) -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ], -) -def test_csv_read_many_files_partitioned( - ray_start_regular_shared, - fs, - data_path, - endpoint_url, - write_partitioned_df, - assert_base_partitioned_ds, -): - if endpoint_url is None: - storage_options = {} - else: - storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) - - partition_keys = ["one"] - partition_path_encoder = PathPartitionEncoder.of( - base_dir=data_path, - field_names=partition_keys, - filesystem=fs, - ) - paths = [] - dfs = [] - num_dfs = FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD - num_rows = 6 * num_dfs - num_files = 2 * num_dfs - for i in range(num_dfs): - df = pd.DataFrame( - {"one": [1, 1, 1, 3, 3, 3], "two": list(range(6 * i, 6 * (i + 1)))} - ) - df_paths = write_partitioned_df( - df, - partition_keys, - partition_path_encoder, - partial(df_to_csv, storage_options=storage_options, index=False), - file_name_suffix=i, - ) - dfs.append(df) - paths.extend(df_paths) - - ds = ray.data.read_csv( - paths, - filesystem=fs, - partitioning=partition_path_encoder.scheme, - 
override_num_blocks=num_files, - ) - - assert_base_partitioned_ds( - ds, - count=num_rows, - num_input_files=num_files, - schema=Schema(pa.schema([("one", pa.int64()), ("two", pa.int64())])), - sorted_values=sorted( - itertools.chain.from_iterable( - list( - map(list, zip([1, 1, 1, 3, 3, 3], list(range(6 * i, 6 * (i + 1))))) - ) - for i in range(num_dfs) - ) - ), - ) - - -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ], -) def test_csv_read_many_files_diff_dirs( ray_start_regular_shared, - fs, - data_path, - endpoint_url, + tmp_path, ): - if endpoint_url is None: - storage_options = {} - else: - storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) - - dir1 = os.path.join(data_path, "dir1") - dir2 = os.path.join(data_path, "dir2") - if fs is None: - os.mkdir(dir1) - os.mkdir(dir2) - else: - fs.create_dir(_unwrap_protocol(dir1)) - fs.create_dir(_unwrap_protocol(dir2)) + dir1 = os.path.join(tmp_path, "dir1") + dir2 = os.path.join(tmp_path, "dir2") + os.mkdir(dir1) + os.mkdir(dir2) paths = [] dfs = [] @@ -409,234 +205,14 @@ def test_csv_read_many_files_diff_dirs( dfs.append(df) path = os.path.join(dir_path, f"test_{j}.csv") paths.append(path) - df.to_csv(path, index=False, storage_options=storage_options) - ds = ray.data.read_csv([dir1, dir2], filesystem=fs) + df.to_csv(path, index=False) + ds = ray.data.read_csv([dir1, dir2]) dsdf = ds.to_pandas().sort_values(by=["one"]).reset_index(drop=True) df = pd.concat(dfs).reset_index(drop=True) pd.testing.assert_frame_equal(df, dsdf) -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ( - 
lazy_fixture("s3_fs_with_anonymous_crendential"), - lazy_fixture("s3_path_with_anonymous_crendential"), - lazy_fixture("s3_server"), - ), - ], -) -def test_csv_read_partitioned_hive_implicit( - ray_start_regular_shared, - fs, - data_path, - endpoint_url, - write_base_partitioned_df, - assert_base_partitioned_ds, -): - storage_options = ( - {} - if endpoint_url is None - else dict(client_kwargs=dict(endpoint_url=endpoint_url)) - ) - partition_keys = ["one"] - partition_path_encoder = PathPartitionEncoder.of( - base_dir=data_path, - field_names=partition_keys, - filesystem=fs, - ) - write_base_partitioned_df( - partition_keys, - partition_path_encoder, - partial(df_to_csv, storage_options=storage_options, index=False), - ) - ds = ray.data.read_csv( - data_path, - partition_filter=PathPartitionFilter.of(None, filesystem=fs), - filesystem=fs, - ) - assert_base_partitioned_ds(ds) - - -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ( - lazy_fixture("s3_fs_with_anonymous_crendential"), - lazy_fixture("s3_path_with_anonymous_crendential"), - lazy_fixture("s3_server"), - ), - ], -) -def test_csv_read_partitioned_styles_explicit( - ray_start_regular_shared, - fs, - data_path, - endpoint_url, - write_base_partitioned_df, - assert_base_partitioned_ds, -): - storage_options = ( - {} - if endpoint_url is None - else dict(client_kwargs=dict(endpoint_url=endpoint_url)) - ) - partition_keys = ["one"] - for style in [PartitionStyle.HIVE, PartitionStyle.DIRECTORY]: - base_dir = os.path.join(data_path, style.value) - partition_path_encoder = PathPartitionEncoder.of( - style=style, - base_dir=base_dir, - field_names=partition_keys, - filesystem=fs, - ) - write_base_partitioned_df( - partition_keys, - partition_path_encoder, - partial(df_to_csv, storage_options=storage_options, 
index=False), - ) - partition_path_filter = PathPartitionFilter.of( - None, - style=style, - base_dir=base_dir, - field_names=partition_keys, - filesystem=fs, - ) - ds = ray.data.read_csv( - base_dir, - partition_filter=partition_path_filter, - filesystem=fs, - ) - assert_base_partitioned_ds(ds) - - -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ], -) -@pytest.mark.parametrize("style", [PartitionStyle.HIVE, PartitionStyle.DIRECTORY]) -def test_csv_read_partitioned_with_filter( - style, - ray_start_regular_shared, - fs, - data_path, - endpoint_url, - write_base_partitioned_df, - assert_base_partitioned_ds, -): - storage_options = ( - {} - if endpoint_url is None - else dict(client_kwargs=dict(endpoint_url=endpoint_url)) - ) - partition_keys = ["one"] - file_writer_fn = partial(df_to_csv, storage_options=storage_options, index=False) - - def skip_unpartitioned(kv_dict): - return bool(kv_dict) - - base_dir = os.path.join(data_path, style.value) - partition_path_encoder = PathPartitionEncoder.of( - style=style, - base_dir=base_dir, - field_names=partition_keys, - filesystem=fs, - ) - write_base_partitioned_df( - partition_keys, - partition_path_encoder, - file_writer_fn, - ) - file_writer_fn(pd.DataFrame({"1": [1]}), os.path.join(base_dir, "test.csv")) - partition_path_filter = PathPartitionFilter.of( - style=style, - base_dir=base_dir, - field_names=partition_keys, - filesystem=fs, - filter_fn=skip_unpartitioned, - ) - ds = ray.data.read_csv( - base_dir, - partition_filter=partition_path_filter, - filesystem=fs, - ) - assert_base_partitioned_ds(ds) - - -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), 
lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ], -) -@pytest.mark.parametrize("style", [PartitionStyle.HIVE, PartitionStyle.DIRECTORY]) -def test_csv_read_partitioned_with_filter_multikey( - style, - ray_start_regular_shared, - fs, - data_path, - endpoint_url, - write_base_partitioned_df, - assert_base_partitioned_ds, -): - storage_options = ( - {} - if endpoint_url is None - else dict(client_kwargs=dict(endpoint_url=endpoint_url)) - ) - partition_keys = ["one", "two"] - file_writer_fn = partial(df_to_csv, storage_options=storage_options, index=False) - - def keep_expected_partitions(kv_dict): - keep = bool(kv_dict) and ( - (kv_dict["one"] == "1" and kv_dict["two"] in {"a", "b", "c"}) - or (kv_dict["one"] == "3" and kv_dict["two"] in {"e", "f", "g"}) - ) - return keep - - base_dir = os.path.join(data_path, style.value) - partition_path_encoder = PathPartitionEncoder.of( - style=style, - base_dir=base_dir, - field_names=partition_keys, - filesystem=fs, - ) - write_base_partitioned_df( - partition_keys, - partition_path_encoder, - file_writer_fn, - ) - df = pd.DataFrame({"1": [1]}) - file_writer_fn(df, os.path.join(data_path, "test0.csv")) - partition_path_filter = PathPartitionFilter.of( - style=style, - base_dir=base_dir, - field_names=partition_keys, - filesystem=fs, - filter_fn=keep_expected_partitions, - ) - ds = ray.data.read_csv( - data_path, - partition_filter=partition_path_filter, - filesystem=fs, - override_num_blocks=6, - ) - assert_base_partitioned_ds(ds, num_input_files=6) - - def test_csv_write( ray_start_regular_shared, tmp_path, target_max_block_size_infinite_or_default ): diff --git a/python/ray/data/tests/test_datatype.py b/python/ray/data/tests/test_datatype.py new file mode 100644 index 000000000000..ceb3a2650941 --- /dev/null +++ b/python/ray/data/tests/test_datatype.py @@ -0,0 +1,392 @@ +import numpy as np +import pyarrow as pa +import pytest + +from ray.data.datatype import DataType + + +class TestDataTypeFactoryMethods: + """Test the 
generated factory methods.""" + + @pytest.mark.parametrize( + "method_name,pa_type,description", + [ + ("int8", pa.int8(), "8-bit signed integer"), + ("int16", pa.int16(), "16-bit signed integer"), + ("int32", pa.int32(), "32-bit signed integer"), + ("int64", pa.int64(), "64-bit signed integer"), + ("uint8", pa.uint8(), "8-bit unsigned integer"), + ("uint16", pa.uint16(), "16-bit unsigned integer"), + ("uint32", pa.uint32(), "32-bit unsigned integer"), + ("uint64", pa.uint64(), "64-bit unsigned integer"), + ("float32", pa.float32(), "32-bit floating point number"), + ("float64", pa.float64(), "64-bit floating point number"), + ("string", pa.string(), "variable-length string"), + ("bool", pa.bool_(), "boolean value"), + ("binary", pa.binary(), "variable-length binary data"), + ], + ) + def test_factory_method_creates_correct_type( + self, method_name, pa_type, description + ): + """Test that factory methods create DataType with correct PyArrow type.""" + factory_method = getattr(DataType, method_name) + result = factory_method() + + assert isinstance(result, DataType) + assert result.is_arrow_type() + assert result._internal_type == pa_type + + @pytest.mark.parametrize( + "method_name", + [ + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "float32", + "float64", + "string", + "bool", + "binary", + ], + ) + def test_factory_method_has_proper_docstring(self, method_name): + """Test that generated factory methods have proper docstrings.""" + factory_method = getattr(DataType, method_name) + doc = factory_method.__doc__ + + assert "Create a DataType representing" in doc + assert "Returns:" in doc + assert f"DataType with PyArrow {method_name} type" in doc + + +class TestDataTypeValidation: + """Test DataType validation and initialization.""" + + @pytest.mark.parametrize( + "valid_type", + [ + pa.int64(), + pa.string(), + pa.timestamp("s"), + np.dtype("int32"), + np.dtype("float64"), + int, + str, + float, + ], + ) + def 
test_post_init_accepts_valid_types(self, valid_type): + """Test that __post_init__ accepts valid type objects.""" + # Should not raise + dt = DataType(valid_type) + assert dt._internal_type == valid_type + + @pytest.mark.parametrize( + "invalid_type", + [ + "string", + 123, + [1, 2, 3], + {"key": "value"}, + None, + object(), + ], + ) + def test_post_init_rejects_invalid_types(self, invalid_type): + """Test that __post_init__ rejects invalid type objects.""" + with pytest.raises( + TypeError, + match="DataType supports only PyArrow DataType, NumPy dtype, or Python type", + ): + DataType(invalid_type) + + +class TestDataTypeCheckers: + """Test type checking methods.""" + + @pytest.mark.parametrize( + "datatype,is_arrow,is_numpy,is_python", + [ + (DataType.from_arrow(pa.int64()), True, False, False), + (DataType.from_arrow(pa.string()), True, False, False), + (DataType.from_numpy(np.dtype("int32")), False, True, False), + (DataType.from_numpy(np.dtype("float64")), False, True, False), + (DataType(int), False, False, True), + (DataType(str), False, False, True), + ], + ) + def test_type_checkers(self, datatype, is_arrow, is_numpy, is_python): + """Test is_arrow_type, is_numpy_type, and is_python_type methods.""" + assert datatype.is_arrow_type() == is_arrow + assert datatype.is_numpy_type() == is_numpy + assert datatype.is_python_type() == is_python + + +class TestDataTypeFactories: + """Test factory methods from external systems.""" + + @pytest.mark.parametrize( + "pa_type", + [ + pa.int32(), + pa.string(), + pa.timestamp("s"), + pa.list_(pa.int32()), + pa.decimal128(10, 2), + ], + ) + def test_from_arrow(self, pa_type): + """Test from_arrow factory method.""" + dt = DataType.from_arrow(pa_type) + + assert isinstance(dt, DataType) + assert dt.is_arrow_type() + assert dt._internal_type == pa_type + + @pytest.mark.parametrize( + "numpy_input,expected_dtype", + [ + (np.dtype("int32"), np.dtype("int32")), + (np.dtype("float64"), np.dtype("float64")), + ("int64", 
np.dtype("int64")), + ("float32", np.dtype("float32")), + ], + ) + def test_from_numpy(self, numpy_input, expected_dtype): + """Test from_numpy factory method.""" + dt = DataType.from_numpy(numpy_input) + + assert isinstance(dt, DataType) + assert dt.is_numpy_type() + assert dt._internal_type == expected_dtype + + +class TestDataTypeConversions: + """Test type conversion methods.""" + + def test_to_arrow_dtype_arrow_passthrough(self): + """Test that Arrow types return themselves.""" + dt = DataType.from_arrow(pa.int64()) + result = dt.to_arrow_dtype() + assert result == pa.int64() + + def test_to_arrow_dtype_numpy_conversion(self): + """Test conversion from NumPy to Arrow types.""" + dt = DataType.from_numpy(np.dtype("int32")) + result = dt.to_arrow_dtype() + assert result == pa.int32() + + def test_to_arrow_dtype_python_conversion(self): + """Test conversion from Python to Arrow types.""" + dt = DataType(int) + result = dt.to_arrow_dtype([1]) + # Python int should map to int64 in Arrow + assert result == pa.int64() + + @pytest.mark.parametrize( + "source_dt,expected_result", + [ + # NumPy types should return themselves + (DataType.from_numpy(np.dtype("int32")), np.dtype("int32")), + (DataType.from_numpy(np.dtype("float64")), np.dtype("float64")), + # Python types should fall back to object + (DataType(str), np.dtype("object")), + (DataType(list), np.dtype("object")), + ], + ) + def test_to_numpy_dtype(self, source_dt, expected_result): + """Test to_numpy_dtype conversion.""" + result = source_dt.to_numpy_dtype() + assert result == expected_result + + def test_to_numpy_dtype_arrow_basic_types(self): + """Test Arrow to NumPy conversion for types that should work.""" + # Test basic types that should convert properly + test_cases = [ + (pa.int32(), np.dtype("int32")), + (pa.float64(), np.dtype("float64")), + (pa.bool_(), np.dtype("bool")), + ] + + for pa_type, expected_np_dtype in test_cases: + dt = DataType.from_arrow(pa_type) + result = dt.to_numpy_dtype() + # Some 
Arrow types may not convert exactly as expected, + # so let's just verify the result is a valid numpy dtype + assert isinstance(result, np.dtype) + + def test_to_numpy_dtype_complex_arrow_fallback(self): + """Test that complex Arrow types fall back to object dtype.""" + complex_dt = DataType.from_arrow(pa.list_(pa.int32())) + result = complex_dt.to_numpy_dtype() + assert result == np.dtype("object") + + @pytest.mark.parametrize("python_type", [int, str, float, bool, list]) + def test_to_python_type_success(self, python_type): + """Test to_python_type returns the original Python type.""" + dt = DataType(python_type) + result = dt.to_python_type() + assert result == python_type + + @pytest.mark.parametrize( + "non_python_dt", + [ + DataType.from_arrow(pa.int64()), + DataType.from_numpy(np.dtype("float32")), + ], + ) + def test_to_python_type_failure(self, non_python_dt): + """Test to_python_type raises ValueError for non-Python types.""" + with pytest.raises(ValueError, match="is not a Python type"): + non_python_dt.to_python_type() + + +class TestDataTypeInference: + """Test type inference from values.""" + + @pytest.mark.parametrize( + "numpy_value,expected_dtype", + [ + (np.array([1, 2, 3], dtype="int32"), np.dtype("int32")), + (np.array([1.0, 2.0], dtype="float64"), np.dtype("float64")), + (np.int64(42), np.dtype("int64")), + (np.float32(3.14), np.dtype("float32")), + ], + ) + def test_infer_dtype_numpy_values(self, numpy_value, expected_dtype): + """Test inference of NumPy arrays and scalars.""" + dt = DataType.infer_dtype(numpy_value) + + assert dt.is_numpy_type() + assert dt._internal_type == expected_dtype + + # Removed test_infer_dtype_pyarrow_scalar - no longer works with current implementation + + @pytest.mark.parametrize( + "python_value", + [ + 42, # int + 3.14, # float + "hello", # str + True, # bool + [1, 2, 3], # list + ], + ) + def test_infer_dtype_python_values_arrow_success(self, python_value): + """Test inference of Python values that Arrow can 
handle.""" + dt = DataType.infer_dtype(python_value) + + # Should infer to Arrow type for basic Python values + assert dt.is_arrow_type() + + # Removed test_infer_dtype_fallback_to_python_type - no longer supported + + +class TestDataTypeStringRepresentation: + """Test string representation methods.""" + + @pytest.mark.parametrize( + "datatype,expected_repr", + [ + (DataType.from_arrow(pa.int64()), "DataType(arrow:int64)"), + (DataType.from_arrow(pa.string()), "DataType(arrow:string)"), + (DataType.from_numpy(np.dtype("float32")), "DataType(numpy:float32)"), + (DataType.from_numpy(np.dtype("int64")), "DataType(numpy:int64)"), + (DataType(str), "DataType(python:str)"), + (DataType(int), "DataType(python:int)"), + ], + ) + def test_repr(self, datatype, expected_repr): + """Test __repr__ method for different type categories.""" + assert repr(datatype) == expected_repr + + +class TestDataTypeEqualityAndHashing: + """Test equality and hashing behavior.""" + + @pytest.mark.parametrize( + "dt1,dt2,should_be_equal", + [ + # Same types should be equal + (DataType.from_arrow(pa.int64()), DataType.from_arrow(pa.int64()), True), + ( + DataType.from_numpy(np.dtype("float32")), + DataType.from_numpy(np.dtype("float32")), + True, + ), + (DataType(str), DataType(str), True), + # Different Arrow types should not be equal + (DataType.from_arrow(pa.int64()), DataType.from_arrow(pa.int32()), False), + # Same conceptual type but different systems should not be equal + ( + DataType.from_arrow(pa.int64()), + DataType.from_numpy(np.dtype("int64")), + False, + ), + ], + ) + def test_equality(self, dt1, dt2, should_be_equal): + """Test __eq__ method.""" + if should_be_equal: + assert dt1 == dt2 + assert hash(dt1) == hash(dt2) + else: + assert dt1 != dt2 + + def test_numpy_vs_python_inequality(self): + """Test that numpy int64 and python int are not equal.""" + numpy_dt = DataType.from_numpy(np.dtype("int64")) + python_dt = DataType(int) + + # These represent the same conceptual type but 
with different systems + # so they should not be equal + + # First verify they have different internal types + assert type(numpy_dt._internal_type) is not type(python_dt._internal_type) + assert numpy_dt._internal_type is not python_dt._internal_type + + # Test the type checkers return different results + assert numpy_dt.is_numpy_type() and not python_dt.is_numpy_type() + assert python_dt.is_python_type() and not numpy_dt.is_python_type() + + # They should not be equal + assert numpy_dt != python_dt + + @pytest.mark.parametrize( + "non_datatype_value", + [ + "not_a_datatype", + 42, + [1, 2, 3], + {"key": "value"}, + None, + ], + ) + def test_inequality_with_non_datatype(self, non_datatype_value): + """Test that DataType is not equal to non-DataType objects.""" + dt = DataType.from_arrow(pa.int64()) + assert dt != non_datatype_value + + def test_hashability(self): + """Test that DataType objects can be used in sets and as dict keys.""" + dt1 = DataType.from_arrow(pa.int64()) + dt2 = DataType.from_arrow(pa.int64()) # Same as dt1 + dt3 = DataType.from_arrow(pa.int32()) # Different + + # Test in set + dt_set = {dt1, dt2, dt3} + assert len(dt_set) == 2 # dt1 and dt2 are the same + + # Test as dict keys + dt_dict = {dt1: "first", dt3: "second"} + assert dt_dict[dt2] == "first" # dt2 should match dt1 + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/python/ray/data/tests/test_deduping_schema.py b/python/ray/data/tests/test_deduping_schema.py index 651efaf86223..2574c7195161 100644 --- a/python/ray/data/tests/test_deduping_schema.py +++ b/python/ray/data/tests/test_deduping_schema.py @@ -7,7 +7,8 @@ from ray.data._internal.execution.streaming_executor_state import ( dedupe_schemas_with_validation, ) -from ray.data.block import Schema +from ray.data._internal.pandas_block import PandasBlockSchema +from ray.data.block import Schema, _is_empty_schema @pytest.mark.parametrize( @@ -15,6 +16,8 @@ [ pa.schema([pa.field("uuid", pa.string())]), # NOTE: 
diff from old_schema pa.schema([]), # Empty Schema + PandasBlockSchema(names=["col1"], types=[int]), + PandasBlockSchema(names=[], types=[]), None, # Null Schema ], ) @@ -23,6 +26,8 @@ [ pa.schema([pa.field("id", pa.int64())]), pa.schema([]), # Empty Schema + PandasBlockSchema(names=["col2"], types=[int]), + PandasBlockSchema(names=[], types=[]), None, # Null Schema ], ) @@ -33,10 +38,10 @@ def test_dedupe_schema_handle_empty( incoming_bundle = RefBundle([], owns_blocks=False, schema=incoming_schema) out_bundle, diverged = dedupe_schemas_with_validation( - old_schema, incoming_bundle, allow_divergent=False, warn=False + old_schema, incoming_bundle, enforce_schemas=False, warn=False ) - if old_schema is None or len(old_schema) == 0: + if _is_empty_schema(old_schema): # old_schema is invalid assert not diverged, (old_schema, incoming_schema) assert out_bundle.schema == incoming_schema, (old_schema, incoming_schema) @@ -47,25 +52,25 @@ def test_dedupe_schema_handle_empty( assert old_schema == out_bundle.schema, (old_schema, incoming_schema) -@pytest.mark.parametrize("allow_divergent", [False, True]) +@pytest.mark.parametrize("enforce_schemas", [False, True]) @pytest.mark.parametrize( "incoming_schema", [pa.schema([pa.field("uuid", pa.string())])] ) @pytest.mark.parametrize("old_schema", [pa.schema([pa.field("id", pa.int64())])]) def test_dedupe_schema_divergence( - allow_divergent: bool, + enforce_schemas: bool, old_schema: Optional["Schema"], incoming_schema: Optional["Schema"], ): incoming_bundle = RefBundle([], owns_blocks=False, schema=incoming_schema) out_bundle, diverged = dedupe_schemas_with_validation( - old_schema, incoming_bundle, allow_divergent=allow_divergent, warn=False + old_schema, incoming_bundle, enforce_schemas=enforce_schemas, warn=False ) assert diverged - if allow_divergent: + if enforce_schemas: assert out_bundle.schema == pa.schema(list(old_schema) + list(incoming_schema)) else: assert out_bundle.schema == old_schema diff --git 
a/python/ray/data/tests/test_delta.py b/python/ray/data/tests/test_delta.py index c00882b1bda9..60851c2f6573 100644 --- a/python/ray/data/tests/test_delta.py +++ b/python/ray/data/tests/test_delta.py @@ -2,23 +2,14 @@ import pyarrow as pa import pytest -from pytest_lazy_fixtures import lf as lazy_fixture import ray from ray.data import Schema -from ray.data.datasource.path_util import _unwrap_protocol from ray.data.tests.conftest import * # noqa from ray.data.tests.mock_http_server import * # noqa from ray.tests.conftest import * # noqa -@pytest.mark.parametrize( - "data_path", - [ - lazy_fixture("local_path"), - lazy_fixture("s3_path"), - ], -) @pytest.mark.parametrize( "batch_size", [1, 100], @@ -27,13 +18,12 @@ "write_mode", ["append", "overwrite"], ) -def test_delta_read_basic(data_path, batch_size, write_mode): +def test_delta_read_basic(tmp_path, batch_size, write_mode): import pandas as pd from deltalake import write_deltalake # Parse the data path. - setup_data_path = _unwrap_protocol(data_path) - path = os.path.join(setup_data_path, "tmp_test_delta") + path = os.path.join(tmp_path, "tmp_test_delta") # Create a sample Delta Lake table df = pd.DataFrame( diff --git a/python/ray/data/tests/test_download_expression.py b/python/ray/data/tests/test_download_expression.py new file mode 100644 index 000000000000..2c593190de11 --- /dev/null +++ b/python/ray/data/tests/test_download_expression.py @@ -0,0 +1,293 @@ +import io + +import pyarrow as pa +import pytest +from PIL import Image + +import ray +from ray.data.expressions import DownloadExpr, col, download + + +class TestDownloadExpressionStructure: + """Test DownloadExpr structural equality and basic properties.""" + + def test_download_expression_creation(self): + """Test that download() creates a DownloadExpr with correct properties.""" + expr = download("uri_column") + + assert isinstance(expr, DownloadExpr) + assert expr.uri_column_name == "uri_column" + + def 
test_download_expression_structural_equality(self): + """Test structural equality comparison for download expressions.""" + # Same expressions should be equal + expr1 = download("uri") + expr2 = download("uri") + assert expr1.structurally_equals(expr2) + assert expr2.structurally_equals(expr1) + + # Different URI column names should not be equal + expr3 = download("different_uri") + assert not expr1.structurally_equals(expr3) + assert not expr3.structurally_equals(expr1) + + # Compare with non-DownloadExpr + non_download_expr = col("uri") + assert not expr1.structurally_equals(non_download_expr) + assert not non_download_expr.structurally_equals(expr1) + + +class TestDownloadExpressionFunctionality: + """Test actual download functionality with real and mocked data.""" + + def test_download_expression_with_local_files(self, tmp_path): + """Test basic download expression functionality with local files.""" + # Create sample files with different content types + sample_data = [ + b"This is test file 1 content", + b"Different content for file 2", + b"File 3 has some binary data: \x00\x01\x02\x03", + ] + + file_paths = [] + for i, data in enumerate(sample_data): + file_path = tmp_path / f"test_file_{i}.txt" + file_path.write_bytes(data) + file_paths.append(str(file_path)) + + # Create dataset with file URIs and metadata + table = pa.Table.from_arrays( + [ + pa.array([f"local://{path}" for path in file_paths]), + pa.array([f"id_{i}" for i in range(len(file_paths))]), + pa.array([f"metadata_{i}" for i in range(len(file_paths))]), + pa.array(range(len(file_paths))), + ], + names=["file_uri", "file_id", "metadata", "index"], + ) + + ds = ray.data.from_arrow(table) + + # Add download column using expression + ds_with_downloads = ds.with_column("file_bytes", download("file_uri")) + + # Verify results + results = ds_with_downloads.take_all() + assert len(results) == len(sample_data) + + for i, result in enumerate(results): + # Download column should be added correctly + assert 
"file_bytes" in result + assert result["file_bytes"] == sample_data[i] + + # All original columns should be preserved + assert result["file_id"] == f"id_{i}" + assert result["metadata"] == f"metadata_{i}" + assert result["index"] == i + assert result["file_uri"] == f"local://{file_paths[i]}" + + def test_download_expression_empty_dataset(self): + """Test download expression with empty dataset.""" + # Create empty dataset with correct schema + table = pa.Table.from_arrays( + [ + pa.array([], type=pa.string()), + ], + names=["uri"], + ) + + ds = ray.data.from_arrow(table) + ds_with_downloads = ds.with_column("bytes", download("uri")) + + results = ds_with_downloads.take_all() + assert len(results) == 0 + + def test_download_expression_with_different_file_types(self, tmp_path): + """Test download expression with various file types including actual images.""" + # Create a small 8x8 RGB image + small_image = Image.new("RGB", (8, 8), color=(255, 0, 0)) # Red 8x8 image + image_buffer = io.BytesIO() + small_image.save(image_buffer, format="PNG") + image_bytes = image_buffer.getvalue() + + # Create files with different types of content + test_files = [ + ("text_file.txt", b"Simple text content"), + ("binary_file.dat", b"\x00\x01\x02\x03\x04\x05"), + ("json_file.json", b'{"key": "value", "number": 123}'), + ("small_image.png", image_bytes), # Actual PNG image (primary use case) + ("empty_file.txt", b""), # Empty file edge case + ] + + file_paths = [] + expected_data = [] + for filename, content in test_files: + file_path = tmp_path / filename + file_path.write_bytes(content) + file_paths.append(str(file_path)) + expected_data.append(content) + + # Create dataset + table = pa.Table.from_arrays( + [ + pa.array([f"local://{path}" for path in file_paths]), + pa.array( + [f.split(".")[0] for f, _ in test_files] + ), # filename without extension + ], + names=["file_uri", "file_type"], + ) + + ds = ray.data.from_arrow(table) + ds_with_downloads = ds.with_column("content", 
download("file_uri")) + + results = ds_with_downloads.take_all() + assert len(results) == len(test_files) + + for i, result in enumerate(results): + assert result["content"] == expected_data[i] + assert result["file_type"] == test_files[i][0].split(".")[0] + + # Special verification for image file - ensure it can be loaded as an image + if test_files[i][0].endswith(".png"): + downloaded_image = Image.open(io.BytesIO(result["content"])) + assert downloaded_image.size == (8, 8) + assert downloaded_image.mode == "RGB" + + def test_chained_download_expressions(self, tmp_path): + """Test chained download expressions functionality.""" + # Create sample files with different content + sample_data = [ + b"Content for file 1", + b"Content for file 2", + b"Content for file 3", + ] + + file_paths = [] + for i, data in enumerate(sample_data): + file_path = tmp_path / f"test_file_{i}.txt" + file_path.write_bytes(data) + file_paths.append(str(file_path)) + + # Create dataset with file URIs + table = pa.Table.from_arrays( + [ + pa.array([f"local://{path}" for path in file_paths]), + pa.array([f"id_{i}" for i in range(len(file_paths))]), + ], + names=["file_uri", "file_id"], + ) + + ds = ray.data.from_arrow(table) + + # Chain multiple download expressions from the same URI column + ds_with_chained_downloads = ( + ds.with_column("file_bytes_1", download("file_uri")) + .with_column("file_bytes_2", download("file_uri")) + .with_column("file_bytes_3", download("file_uri")) + ) + + # Verify results + results = ds_with_chained_downloads.take_all() + assert len(results) == len(sample_data) + + for i, result in enumerate(results): + # All download columns should have the same content + assert "file_bytes_1" in result + assert "file_bytes_2" in result + assert "file_bytes_3" in result + assert result["file_bytes_1"] == sample_data[i] + assert result["file_bytes_2"] == sample_data[i] + assert result["file_bytes_3"] == sample_data[i] + + # Original columns should be preserved + assert 
result["file_id"] == f"id_{i}" + assert result["file_uri"] == f"local://{file_paths[i]}" + + +class TestDownloadExpressionErrors: + """Test error conditions and edge cases for download expressions.""" + + def test_download_expression_invalid_uri_column(self): + """Test download expression with non-existent URI column.""" + table = pa.Table.from_arrays( + [ + pa.array(["local://test.txt"]), + ], + names=["existing_column"], + ) + + ds = ray.data.from_arrow(table) + ds_with_downloads = ds.with_column("bytes", download("non_existent_column")) + + # Should raise error when trying to execute + with pytest.raises(Exception): # Could be KeyError or similar + ds_with_downloads.take_all() + + def test_download_expression_with_null_uris(self): + """Test download expression handling of null/empty URIs.""" + table = pa.Table.from_arrays( + [ + pa.array(["local://test.txt", None, ""]), + ], + names=["uri"], + ) + + ds = ray.data.from_arrow(table) + ds_with_downloads = ds.with_column("bytes", download("uri")) + + # Should handle nulls gracefully (exact behavior may vary) + # This test mainly ensures no crash occurs + try: + results = ds_with_downloads.take_all() + # If it succeeds, verify structure is reasonable + assert len(results) == 3 + for result in results: + assert "bytes" in result + except Exception as e: + # If it fails, should be a reasonable error (not a crash) + assert isinstance(e, (ValueError, KeyError, RuntimeError)) + + +class TestDownloadExpressionIntegration: + """Integration tests combining download expressions with other Ray Data operations.""" + + def test_download_expression_with_map_batches(self, tmpdir): + """Test download expression followed by map_batches processing.""" + # Create a test file + test_file = tmpdir.join("test.txt") + test_content = b"Hello, World!" 
+ test_file.write_binary(test_content) + + # Create dataset + table = pa.Table.from_arrays( + [ + pa.array([f"local://{test_file}"]), + ], + names=["uri"], + ) + + ds = ray.data.from_arrow(table) + + # Download then process + ds_with_content = ds.with_column("raw_bytes", download("uri")) + + def decode_bytes(batch): + # Access the specific column containing the bytes data + batch["decoded_text"] = [ + data.decode("utf-8") for data in batch["raw_bytes"] + ] + return batch + + ds_decoded = ds_with_content.map_batches(decode_bytes) + results = ds_decoded.take_all() + + assert len(results) == 1 + assert results[0]["decoded_text"] == "Hello, World!" + assert results[0]["raw_bytes"] == test_content + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/data/tests/test_downstream_capacity_backpressure_policy.py b/python/ray/data/tests/test_downstream_capacity_backpressure_policy.py new file mode 100644 index 000000000000..4acec6c01852 --- /dev/null +++ b/python/ray/data/tests/test_downstream_capacity_backpressure_policy.py @@ -0,0 +1,127 @@ +from unittest.mock import MagicMock + +import pytest + +from ray.data._internal.execution.backpressure_policy.downstream_capacity_backpressure_policy import ( + DownstreamCapacityBackpressurePolicy, +) +from ray.data._internal.execution.interfaces.physical_operator import ( + OpRuntimeMetrics, + PhysicalOperator, +) +from ray.data._internal.execution.operators.actor_pool_map_operator import ( + ActorPoolMapOperator, +) +from ray.data._internal.execution.streaming_executor_state import OpState, Topology +from ray.data.context import DataContext + + +class TestDownstreamCapacityBackpressurePolicy: + def _mock_operator( + self, + op_class: PhysicalOperator = PhysicalOperator, + num_enqueued_input_bundles: int = 0, + num_task_inputs_processed: int = 0, + num_tasks_finished: int = 0, + max_concurrent_tasks: int = 100, + ): + """Helper method to create mock operator.""" + 
mock_operator = MagicMock(spec=op_class) + mock_operator.metrics = MagicMock(spec=OpRuntimeMetrics) + mock_operator.metrics.num_task_inputs_processed = num_task_inputs_processed + mock_operator.metrics.num_tasks_finished = num_tasks_finished + mock_operator.num_active_tasks.return_value = max_concurrent_tasks + + op_state = MagicMock(spec=OpState) + op_state.total_enqueued_input_bundles.return_value = num_enqueued_input_bundles + return mock_operator, op_state + + def _mock_actor_pool_map_operator( + self, + num_enqueued_input_bundles: int, + num_task_inputs_processed: int, + num_tasks_finished: int, + max_concurrent_tasks: int = 100, + ): + """Helper method to create mock actor pool map operator.""" + op, op_state = self._mock_operator( + ActorPoolMapOperator, + num_enqueued_input_bundles, + num_task_inputs_processed, + num_tasks_finished, + max_concurrent_tasks, + ) + actor_pool = MagicMock( + spec="ray.data._internal.execution.operators.actor_pool_map_operator._ActorPool" + ) + actor_pool.max_concurrent_tasks = MagicMock(return_value=max_concurrent_tasks) + op.get_autoscaling_actor_pools.return_value = [actor_pool] + return op, op_state + + def _create_policy( + self, data_context: DataContext = None, topology: Topology = None + ): + """Helper method to create policy instance.""" + context = data_context or self.context + return DownstreamCapacityBackpressurePolicy( + data_context=context, + topology=topology, + resource_manager=MagicMock(), + ) + + @pytest.mark.parametrize( + "mock_method", + [ + (_mock_operator), + (_mock_actor_pool_map_operator), + ], + ) + @pytest.mark.parametrize( + "num_enqueued, num_task_inputs_processed, num_tasks_finished, backpressure_ratio, max_queued_bundles, expected_result, test_name", + [ + (100, 100, 10, 2, 4000, True, "no_backpressure_low_queue"), + (5000, 100, 10, 2, 4000, False, "high_queue_pressure"), + (100, 0, 0, 2, 400, True, "zero_inputs_protection"), + (1000000, 1, 1, None, None, True, "default disabled"), + ], + ) + def 
test_backpressure_conditions( + self, + mock_method, + num_enqueued, + num_task_inputs_processed, + num_tasks_finished, + backpressure_ratio, + max_queued_bundles, + expected_result, + test_name, + ): + """Parameterized test covering various backpressure conditions.""" + context = DataContext() + context.downstream_capacity_backpressure_ratio = backpressure_ratio + context.downstream_capacity_backpressure_max_queued_bundles = max_queued_bundles + + op, op_state = self._mock_operator(PhysicalOperator) + op_output_dep, op_output_state = mock_method( + self, + num_enqueued_input_bundles=num_enqueued, + num_task_inputs_processed=num_task_inputs_processed, + num_tasks_finished=num_tasks_finished, + ) + op.output_dependencies = [op_output_dep] + + policy = self._create_policy( + context, topology={op: op_state, op_output_dep: op_output_state} + ) + result = policy.can_add_input(op) + + assert result == expected_result, test_name + assert ( + backpressure_ratio is None or max_queued_bundles is None + ) == policy._backpressure_disabled, test_name + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/data/tests/test_exceptions.py b/python/ray/data/tests/test_exceptions.py index 7d57586ca1d2..9818c3f379e0 100644 --- a/python/ray/data/tests/test_exceptions.py +++ b/python/ray/data/tests/test_exceptions.py @@ -9,22 +9,6 @@ from ray.tests.conftest import * # noqa -def test_handle_debugger_exception(ray_start_regular_shared): - def _bad(batch): - if batch["id"][0] == 5: - raise Exception("Test exception") - - return batch - - dataset = ray.data.range(8, override_num_blocks=8).map_batches(_bad) - - with pytest.raises( - UserCodeException, - match=r"Failed to process the following data block: \{'id': array\(\[5\]\)\}", - ): - dataset.materialize() - - @pytest.mark.parametrize("log_internal_stack_trace_to_stdout", [True, False]) def test_user_exception( log_internal_stack_trace_to_stdout, diff --git 
a/python/ray/data/tests/test_executor_resource_management.py b/python/ray/data/tests/test_executor_resource_management.py index d863842ec412..3a2836cce545 100644 --- a/python/ray/data/tests/test_executor_resource_management.py +++ b/python/ray/data/tests/test_executor_resource_management.py @@ -1,10 +1,8 @@ import pytest import ray +from ray.data._internal.actor_autoscaler import ActorPoolScalingRequest from ray.data._internal.compute import ActorPoolStrategy, TaskPoolStrategy -from ray.data._internal.execution.autoscaler.default_autoscaler import ( - ActorPoolScalingRequest, -) from ray.data._internal.execution.interfaces import ExecutionOptions, ExecutionResources from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer from ray.data._internal.execution.operators.limit_operator import LimitOperator @@ -583,6 +581,16 @@ def test_output_splitter_resource_reporting(ray_start_10_cpus_shared): assert op.metrics.obj_store_mem_internal_outqueue == 0 +def test_execution_resources_to_resource_dict(): + resources = ExecutionResources(cpu=1, gpu=2, object_store_memory=3, memory=4) + assert resources.to_resource_dict() == { + "CPU": 1, + "GPU": 2, + "object_store_memory": 3, + "memory": 4, + } + + if __name__ == "__main__": import sys diff --git a/python/ray/data/tests/test_file_based_datasource.py b/python/ray/data/tests/test_file_based_datasource.py index a73d72fbee5b..fbf8c548bb44 100644 --- a/python/ray/data/tests/test_file_based_datasource.py +++ b/python/ray/data/tests/test_file_based_datasource.py @@ -1,13 +1,21 @@ import os -from typing import Iterator +from typing import Any, Dict, Iterator, List +from urllib.parse import urlparse import pyarrow import pytest +from pytest_lazy_fixtures import lf as lazy_fixture import ray from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder -from ray.data.block import Block +from ray.data.block import Block, BlockAccessor +from ray.data.datasource.datasource import ReadTask from 
ray.data.datasource.file_based_datasource import FileBasedDatasource +from ray.data.datasource.partitioning import ( + Partitioning, + PartitionStyle, + PathPartitionFilter, +) class MockFileBasedDatasource(FileBasedDatasource): @@ -17,6 +25,198 @@ def _read_stream(self, f: "pyarrow.NativeFile", path: str) -> Iterator[Block]: yield builder.build() +def execute_read_tasks(tasks: List[ReadTask]) -> List[Dict[str, Any]]: + """Execute the read tasks and return the resulting rows. + + The motivation for this utility function is so that we can test datasources without + scheduling Ray tasks. + """ + builder = DelegatingBlockBuilder() + for task in tasks: + for block in task(): + builder.add_block(block) + block = builder.build() + + block_accessor = BlockAccessor.for_block(block) + rows = list(block_accessor.iter_rows(public_row_format=True)) + + return rows + + +def strip_scheme(uri): + """Remove scheme from a URI, if it exists.""" + parsed = urlparse(uri) + if parsed.scheme: + return uri.split("://", 1)[1] # remove scheme + return uri # no scheme, return as-is + + +@pytest.mark.parametrize( + "filesystem,dir_path,endpoint_url", + [ + (None, lazy_fixture("local_path"), None), + (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), + (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), + ( + lazy_fixture("s3_fs_with_space"), + lazy_fixture("s3_path_with_space"), + lazy_fixture("s3_server"), + ), + ( + lazy_fixture("s3_fs_with_special_chars"), + lazy_fixture("s3_path_with_special_chars"), + lazy_fixture("s3_server"), + ), + ], +) +def test_read_single_file(ray_start_regular_shared, filesystem, dir_path, endpoint_url): + # `FileBasedDatasource` should read from the local filesystem if you don't specify + # one. + write_filesystem = filesystem + if write_filesystem is None: + write_filesystem = pyarrow.fs.LocalFileSystem() + + # PyArrow filesystems expect paths without schemes. 
`FileBasedDatasource` handles + # this internally, but we need to manually strip the scheme for the test setup. + write_path = strip_scheme(os.path.join(dir_path, "file.txt")) + with write_filesystem.open_output_stream(write_path) as f: + f.write(b"spam") + + datasource = MockFileBasedDatasource(dir_path, filesystem=filesystem) + tasks = datasource.get_read_tasks(1) + + rows = execute_read_tasks(tasks) + + assert rows == [{"data": b"spam"}] + + +def test_partitioning_hive(ray_start_regular_shared, tmp_path): + path = os.path.join(tmp_path, "country=us") + os.mkdir(path) + with open(os.path.join(path, "file.txt"), "wb") as file: + file.write(b"") + + datasource = MockFileBasedDatasource(tmp_path, partitioning=Partitioning("hive")) + + tasks = datasource.get_read_tasks(1) + rows = execute_read_tasks(tasks) + + assert rows == [{"data": b"", "country": "us"}] + + +def test_partition_filter_hive(ray_start_regular_shared, tmp_path): + for country in ["us", "jp"]: + path = os.path.join(tmp_path, f"country={country}") + os.mkdir(path) + with open(os.path.join(path, "file.txt"), "wb") as file: + file.write(b"") + + filter = PathPartitionFilter.of( + style=PartitionStyle.HIVE, + filter_fn=lambda partitions: partitions["country"] == "us", + ) + datasource = MockFileBasedDatasource( + tmp_path, partitioning=Partitioning("hive"), partition_filter=filter + ) + + tasks = datasource.get_read_tasks(1) + rows = execute_read_tasks(tasks) + + assert rows == [{"data": b"", "country": "us"}] + + +def test_partitioning_dir(ray_start_regular_shared, tmp_path): + path = os.path.join(tmp_path, "us") + os.mkdir(path) + with open(os.path.join(path, "file.txt"), "wb") as file: + file.write(b"") + + datasource = MockFileBasedDatasource( + tmp_path, + partitioning=Partitioning("dir", field_names=["country"], base_dir=tmp_path), + ) + + tasks = datasource.get_read_tasks(1) + rows = execute_read_tasks(tasks) + + assert rows == [{"data": b"", "country": "us"}] + + +def 
test_partition_filter_dir(ray_start_regular_shared, tmp_path): + for country in ["us", "jp"]: + path = os.path.join(tmp_path, country) + os.mkdir(path) + with open(os.path.join(path, "file.txt"), "wb") as file: + file.write(b"") + + filter = PathPartitionFilter.of( + style=PartitionStyle.DIRECTORY, + base_dir=tmp_path, + field_names=["country"], + filter_fn=lambda partitions: partitions["country"] == "us", + ) + partitioning = Partitioning("dir", field_names=["country"], base_dir=tmp_path) + datasource = MockFileBasedDatasource( + tmp_path, partitioning=partitioning, partition_filter=filter + ) + + tasks = datasource.get_read_tasks(1) + rows = execute_read_tasks(tasks) + + assert rows == [{"data": b"", "country": "us"}] + + +def test_partitioning_raises_on_mismatch(ray_start_regular_shared, tmp_path): + """Test when the partition key already exists in the data.""" + + class StubDatasource(FileBasedDatasource): + def _read_stream(self, f: "pyarrow.NativeFile", path: str) -> Iterator[Block]: + builder = DelegatingBlockBuilder() + builder.add({"country": f.readall()}) + yield builder.build() + + path = os.path.join(tmp_path, "country=us") + os.mkdir(path) + with open(os.path.join(path, "file.txt"), "wb") as file: + file.write(b"jp") + + datasource = StubDatasource(tmp_path, partitioning=Partitioning("hive")) + + # The data is `jp`, but the path contains `us`. Since the values are different, + # the datasource should raise a ValueError. 
+ with pytest.raises(ValueError): + tasks = datasource.get_read_tasks(1) + execute_read_tasks(tasks) + + +def test_ignore_missing_paths_true(ray_start_regular_shared, tmp_path): + path = os.path.join(tmp_path, "file.txt") + with open(path, "wb") as file: + file.write(b"") + + datasource = MockFileBasedDatasource( + [path, "missing.txt"], ignore_missing_paths=True + ) + + tasks = datasource.get_read_tasks(1) + rows = execute_read_tasks(tasks) + + assert rows == [{"data": b""}] + + +def test_ignore_missing_paths_false(ray_start_regular_shared, tmp_path): + path = os.path.join(tmp_path, "file.txt") + with open(path, "wb") as file: + file.write(b"") + + with pytest.raises(FileNotFoundError): + datasource = MockFileBasedDatasource( + [path, "missing.txt"], ignore_missing_paths=False + ) + tasks = datasource.get_read_tasks(1) + execute_read_tasks(tasks) + + def test_local_paths(ray_start_regular_shared, tmp_path): path = os.path.join(tmp_path, "test.txt") with open(path, "w"): diff --git a/python/ray/data/tests/test_formats.py b/python/ray/data/tests/test_formats.py index 19d2d7aa36ae..c0ac1f8f4df7 100644 --- a/python/ray/data/tests/test_formats.py +++ b/python/ray/data/tests/test_formats.py @@ -5,7 +5,6 @@ import pyarrow as pa import pyarrow.parquet as pq import pytest -import torchvision from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem @@ -249,79 +248,6 @@ def test_from_tf(ray_start_regular_shared): tf.debugging.assert_equal(expected_label, actual_label) -@pytest.mark.parametrize("local_read", [True, False]) -def test_from_torch(shutdown_only, local_read, tmp_path): - torch_dataset = torchvision.datasets.FashionMNIST(tmp_path, download=True) - expected_data = list(torch_dataset) - - ray_dataset = ray.data.from_torch(torch_dataset, local_read=local_read) - - actual_data = extract_values("item", list(ray_dataset.take_all())) - assert actual_data == expected_data - - import torch - - class 
IterFashionMNIST(torch.utils.data.IterableDataset): - def __len__(self): - return len(torch_dataset) - - def __iter__(self): - return iter(torch_dataset) - - iter_torch_dataset = IterFashionMNIST() - ray_dataset = ray.data.from_torch(iter_torch_dataset) - - actual_data = extract_values("item", list(ray_dataset.take_all())) - assert actual_data == expected_data - - -@pytest.mark.parametrize("local_read", [True, False]) -def test_from_torch_boundary_conditions(shutdown_only, local_read): - """ - Tests that from_torch respects __len__ for map-style datasets - """ - from torch.utils.data import Dataset - - class BoundaryTestMapDataset(Dataset): - """A map-style dataset where __len__ is less than the underlying data size.""" - - def __init__(self, data, length): - super().__init__() - self._data = data - self._length = length - assert self._length <= len( - self._data - ), "Length must be <= data size to properly test boundary conditions" - - def __len__(self): - return self._length - - def __getitem__(self, index): - if not (0 <= index < self._length): - # Note: don't use IndexError because we want to fail clearly if - # Ray Data tries to access beyond __len__ - 1 - raise RuntimeError( - f"Index {index} out of bounds for dataset with length {self._length}" - ) - return self._data[index] - - source_data = list(range(10)) - dataset_len = 8 # Intentionally less than len(source_data) - - # --- Test MapDataset --- - map_ds = BoundaryTestMapDataset(source_data, dataset_len) - # Expected data only includes elements up to dataset_len - 1 - expected_items = source_data[:dataset_len] - - ray_ds_map = ray.data.from_torch(map_ds, local_read=local_read) - actual_items_map = extract_values("item", list(ray_ds_map.take_all())) - - # This assertion verifies that ray_ds_map didn't try to access index 8 or 9, - # which would have raised an IndexError in BoundaryTestMapDataset.__getitem__ - assert actual_items_map == expected_items - assert len(actual_items_map) == dataset_len - - def 
test_read_s3_file_error(shutdown_only, s3_path): dummy_path = s3_path + "_dummy" error_message = "Please check that file exists and has properly configured access." diff --git a/python/ray/data/tests/test_image.py b/python/ray/data/tests/test_image.py index a7e72b10081c..535c3d6ab64a 100644 --- a/python/ray/data/tests/test_image.py +++ b/python/ray/data/tests/test_image.py @@ -3,7 +3,6 @@ from typing import Dict import numpy as np -import pyarrow as pa import pytest from fsspec.implementations.local import LocalFileSystem from PIL import Image @@ -16,7 +15,6 @@ ImageDatasource, ImageFileMetadataProvider, ) -from ray.data.datasource import Partitioning from ray.data.datasource.file_meta_provider import FastFileMetadataProvider from ray.data.tests.conftest import * # noqa from ray.data.tests.mock_http_server import * # noqa @@ -71,27 +69,6 @@ def test_file_metadata_provider(self, ray_start_regular_shared): ) assert ds.count() == 3 - @pytest.mark.parametrize("ignore_missing_paths", [True, False]) - def test_ignore_missing_paths(self, ray_start_regular_shared, ignore_missing_paths): - paths = [ - "example://image-datasets/simple/image1.jpg", - "example://missing.jpg", - "example://image-datasets/missing/", - ] - - if ignore_missing_paths: - ds = ray.data.read_images(paths, ignore_missing_paths=ignore_missing_paths) - # example:// directive redirects to /ray/python/ray/data/examples/data - assert len(ds.input_files()) == 1 and ds.input_files()[0].endswith( - "ray/data/examples/data/image-datasets/simple/image1.jpg", - ) - else: - with pytest.raises(FileNotFoundError): - ds = ray.data.read_images( - paths, ignore_missing_paths=ignore_missing_paths - ) - ds.materialize() - def test_filtering(self, ray_start_regular_shared): # "different-extensions" contains three images and two non-images. 
ds = ray.data.read_images("example://image-datasets/different-extensions") @@ -130,27 +107,6 @@ def test_mode( ds = ray.data.read_images("example://image-datasets/different-modes", mode=mode) assert all([record["image"].shape == expected_shape for record in ds.take()]) - def test_partitioning( - self, ray_start_regular_shared, enable_automatic_tensor_extension_cast - ): - root = "example://image-datasets/dir-partitioned" - partitioning = Partitioning("dir", base_dir=root, field_names=["label"]) - - ds = ray.data.read_images(root, partitioning=partitioning) - - assert ds.schema().names == ["image", "label"] - - image_type, label_type = ds.schema().types - assert isinstance(image_type, get_arrow_extension_fixed_shape_tensor_types()) - assert pa.types.is_string(label_type) - - df = ds.to_pandas() - assert sorted(df["label"]) == ["cat", "cat", "dog"] - if enable_automatic_tensor_extension_cast: - assert all(tensor.shape == (32, 32, 3) for tensor in df["image"]) - else: - assert all(tensor.numpy_shape == (32, 32, 3) for tensor in df["image"]) - def test_random_shuffle(self, ray_start_regular_shared, restore_data_context): # NOTE: set preserve_order to True to allow consistent output behavior. 
context = ray.data.DataContext.get_current() diff --git a/python/ray/data/tests/test_issue_detection_manager.py b/python/ray/data/tests/test_issue_detection_manager.py index 3dc1712aaeff..fbcd392bb2a2 100644 --- a/python/ray/data/tests/test_issue_detection_manager.py +++ b/python/ray/data/tests/test_issue_detection_manager.py @@ -1,14 +1,19 @@ +import json +import os import sys from unittest.mock import MagicMock import pytest +import ray +from ray._private import ray_constants from ray.data._internal.execution.operators.input_data_buffer import ( InputDataBuffer, ) from ray.data._internal.execution.operators.task_pool_map_operator import ( MapOperator, ) +from ray.data._internal.execution.streaming_executor import StreamingExecutor from ray.data._internal.issue_detection.issue_detector import ( Issue, IssueType, @@ -16,10 +21,30 @@ from ray.data._internal.issue_detection.issue_detector_manager import ( IssueDetectorManager, ) +from ray.data._internal.operator_event_exporter import ( + format_export_issue_event_name, +) from ray.data.context import DataContext +def _get_exported_data(): + exported_file = os.path.join( + ray._private.worker._global_node.get_session_dir_path(), + "logs", + "export_events", + "event_EXPORT_DATASET_OPERATOR_EVENT.log", + ) + assert os.path.isfile(exported_file) + + with open(exported_file, "r") as f: + data = f.readlines() + + return [json.loads(line) for line in data] + + def test_report_issues(): + ray.init() + ray_constants.RAY_ENABLE_EXPORT_API_WRITE_CONFIG = "EXPORT_DATASET_OPERATOR_EVENT" ctx = DataContext.get_current() input_operator = InputDataBuffer(ctx, input_data=[]) map_operator = MapOperator.create( @@ -29,7 +54,8 @@ def test_report_issues(): ray_remote_args={}, ) topology = {input_operator: MagicMock(), map_operator: MagicMock()} - executor = MagicMock(_topology=topology) + executor = StreamingExecutor(ctx) + executor._topology = topology detector = IssueDetectorManager(executor) detector._report_issues( @@ -53,6 +79,23 
@@ def test_report_issues(): assert map_operator.metrics.issue_detector_hanging == 0 assert map_operator.metrics.issue_detector_high_memory == 1 + data = _get_exported_data() + assert len(data) == 2 + assert data[0]["event_data"]["dataset_id"] == "dataset" + assert data[0]["event_data"]["operator_id"] == f"{input_operator.name}_0" + assert data[0]["event_data"]["operator_name"] == input_operator.name + assert data[0]["event_data"]["event_type"] == format_export_issue_event_name( + IssueType.HANGING + ) + assert data[0]["event_data"]["message"] == "Hanging detected" + assert data[1]["event_data"]["dataset_id"] == "dataset" + assert data[1]["event_data"]["operator_id"] == f"{map_operator.name}_1" + assert data[1]["event_data"]["operator_name"] == map_operator.name + assert data[1]["event_data"]["event_type"] == format_export_issue_event_name( + IssueType.HIGH_MEMORY + ) + assert data[1]["event_data"]["message"] == "High memory usage detected" + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/data/tests/test_json.py b/python/ray/data/tests/test_json.py index 891f88ca46ea..a10dc93658b1 100644 --- a/python/ray/data/tests/test_json.py +++ b/python/ray/data/tests/test_json.py @@ -2,14 +2,12 @@ import json import os import shutil -from functools import partial import pandas as pd import pyarrow as pa import pyarrow.fs as fs import pyarrow.json as pajson import pytest -from pytest_lazy_fixtures import lf as lazy_fixture import ray from ray.data import Schema @@ -20,72 +18,37 @@ from ray.data.datasource import ( BaseFileMetadataProvider, FastFileMetadataProvider, - PartitionStyle, - PathPartitionFilter, ) from ray.data.datasource.file_based_datasource import ( FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD, ) -from ray.data.datasource.path_util import _unwrap_protocol from ray.data.tests.conftest import * # noqa -from ray.data.tests.test_partitioning import PathPartitionEncoder from ray.tests.conftest import * # noqa - -def 
test_json_read_partitioning( - ray_start_regular_shared, tmp_path, target_max_block_size_infinite_or_default -): - path = os.path.join(tmp_path, "country=us") - os.mkdir(path) - with open(os.path.join(path, "file1.json"), "w") as file: - json.dump({"number": 0, "string": "foo"}, file) - with open(os.path.join(path, "file2.json"), "w") as file: - json.dump({"number": 1, "string": "bar"}, file) - - ds = ray.data.read_json(path) - - assert sorted(ds.take(), key=lambda row: row["number"]) == [ - {"number": 0, "string": "foo", "country": "us"}, - {"number": 1, "string": "bar", "country": "us"}, - ] +# Set the test timeout to 6 minutes +pytestmark = pytest.mark.timeout(360) -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ], -) def test_json_read( - ray_start_regular_shared, - fs, - data_path, - endpoint_url, - target_max_block_size_infinite_or_default, + ray_start_regular_shared, target_max_block_size_infinite_or_default, tmp_path ): - if endpoint_url is None: - storage_options = {} - else: - storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) # Single file. df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - path1 = os.path.join(data_path, "test1.json") - df1.to_json(path1, orient="records", lines=True, storage_options=storage_options) - ds = ray.data.read_json(path1, filesystem=fs) + path1 = os.path.join(tmp_path, "test1.json") + df1.to_json(path1, orient="records", lines=True) + ds = ray.data.read_json(path1) dsdf = ds.to_pandas() assert df1.equals(dsdf) # Test metadata ops. assert ds.count() == 3 - assert ds.input_files() == [_unwrap_protocol(path1)] + assert ds.input_files() == [path1] assert ds.schema() == Schema(pa.schema([("one", pa.int64()), ("two", pa.string())])) # Two files, override_num_blocks=2. 
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) - path2 = os.path.join(data_path, "test2.json") - df2.to_json(path2, orient="records", lines=True, storage_options=storage_options) - ds = ray.data.read_json([path1, path2], override_num_blocks=2, filesystem=fs) + path2 = os.path.join(tmp_path, "test2.json") + df2.to_json(path2, orient="records", lines=True) + ds = ray.data.read_json([path1, path2], override_num_blocks=2) dsdf = ds.to_pandas() df = pd.concat([df1, df2], ignore_index=True) assert df.equals(dsdf) @@ -95,102 +58,74 @@ def test_json_read( # Three files, override_num_blocks=2. df3 = pd.DataFrame({"one": [7, 8, 9], "two": ["h", "i", "j"]}) - path3 = os.path.join(data_path, "test3.json") - df3.to_json(path3, orient="records", lines=True, storage_options=storage_options) - ds = ray.data.read_json([path1, path2, path3], override_num_blocks=2, filesystem=fs) + path3 = os.path.join(tmp_path, "test3.json") + df3.to_json(path3, orient="records", lines=True) + ds = ray.data.read_json([path1, path2, path3], override_num_blocks=2) df = pd.concat([df1, df2, df3], ignore_index=True) dsdf = ds.to_pandas() assert df.equals(dsdf) # Directory, two files. 
- path = os.path.join(data_path, "test_json_dir") - if fs is None: - os.mkdir(path) - else: - fs.create_dir(_unwrap_protocol(path)) + path = os.path.join(tmp_path, "test_json_dir") + os.mkdir(path) + df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) path1 = os.path.join(path, "data0.json") - df1.to_json(path1, orient="records", lines=True, storage_options=storage_options) + df1.to_json(path1, orient="records", lines=True) df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) path2 = os.path.join(path, "data1.json") - df2.to_json(path2, orient="records", lines=True, storage_options=storage_options) - ds = ray.data.read_json(path, filesystem=fs) + df2.to_json(path2, orient="records", lines=True) + ds = ray.data.read_json(path) df = pd.concat([df1, df2], ignore_index=True) dsdf = ds.to_pandas().sort_values(by=["one", "two"]).reset_index(drop=True) assert df.equals(dsdf) - if fs is None: - shutil.rmtree(path) - else: - fs.delete_dir(_unwrap_protocol(path)) + shutil.rmtree(path) # Two directories, three files. 
- path1 = os.path.join(data_path, "test_json_dir1") - path2 = os.path.join(data_path, "test_json_dir2") - if fs is None: - os.mkdir(path1) - os.mkdir(path2) - else: - fs.create_dir(_unwrap_protocol(path1)) - fs.create_dir(_unwrap_protocol(path2)) + path1 = os.path.join(tmp_path, "test_json_dir1") + path2 = os.path.join(tmp_path, "test_json_dir2") + os.mkdir(path1) + os.mkdir(path2) df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) file_path1 = os.path.join(path1, "data0.json") - df1.to_json( - file_path1, orient="records", lines=True, storage_options=storage_options - ) + df1.to_json(file_path1, orient="records", lines=True) df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) file_path2 = os.path.join(path2, "data1.json") - df2.to_json( - file_path2, orient="records", lines=True, storage_options=storage_options - ) + df2.to_json(file_path2, orient="records", lines=True) df3 = pd.DataFrame({"one": [7, 8, 9], "two": ["h", "i", "j"]}) file_path3 = os.path.join(path2, "data2.json") - df3.to_json( - file_path3, orient="records", lines=True, storage_options=storage_options - ) - ds = ray.data.read_json([path1, path2], filesystem=fs) + df3.to_json(file_path3, orient="records", lines=True) + ds = ray.data.read_json([path1, path2]) df = pd.concat([df1, df2, df3], ignore_index=True) dsdf = ds.to_pandas().sort_values(by=["one", "two"]).reset_index(drop=True) assert df.equals(dsdf) - if fs is None: - shutil.rmtree(path1) - shutil.rmtree(path2) - else: - fs.delete_dir(_unwrap_protocol(path1)) - fs.delete_dir(_unwrap_protocol(path2)) + shutil.rmtree(path1) + shutil.rmtree(path2) # Directory and file, two files. 
- dir_path = os.path.join(data_path, "test_json_dir") - if fs is None: - os.mkdir(dir_path) - else: - fs.create_dir(_unwrap_protocol(dir_path)) + dir_path = os.path.join(tmp_path, "test_json_dir") + os.mkdir(dir_path) df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) path1 = os.path.join(dir_path, "data0.json") - df1.to_json(path1, orient="records", lines=True, storage_options=storage_options) + df1.to_json(path1, orient="records", lines=True) df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) - path2 = os.path.join(data_path, "data1.json") - df2.to_json(path2, orient="records", lines=True, storage_options=storage_options) - ds = ray.data.read_json([dir_path, path2], filesystem=fs) + path2 = os.path.join(tmp_path, "data1.json") + df2.to_json(path2, orient="records", lines=True) + ds = ray.data.read_json([dir_path, path2]) df = pd.concat([df1, df2], ignore_index=True) dsdf = ds.to_pandas().sort_values(by=["one", "two"]).reset_index(drop=True) assert df.equals(dsdf) - if fs is None: - shutil.rmtree(dir_path) - else: - fs.delete_dir(_unwrap_protocol(dir_path)) + shutil.rmtree(dir_path) # Directory, two files and non-json file (test default extension-based filtering). - path = os.path.join(data_path, "test_json_dir") - if fs is None: - os.mkdir(path) - else: - fs.create_dir(_unwrap_protocol(path)) + path = os.path.join(tmp_path, "test_json_dir") + os.mkdir(path) df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) path1 = os.path.join(path, "data0.json") - df1.to_json(path1, orient="records", lines=True, storage_options=storage_options) + df1.to_json(path1, orient="records", lines=True) df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) path2 = os.path.join(path, "data1.json") - df2.to_json(path2, orient="records", lines=True, storage_options=storage_options) + df2.to_json(path2, orient="records", lines=True) # Add a file with a non-matching file extension. This file should be ignored. 
df_txt = pd.DataFrame({"foobar": [1, 2, 3]}) @@ -198,42 +133,13 @@ def test_json_read( os.path.join(path, "foo.txt"), orient="records", lines=True, - storage_options=storage_options, ) - ds = ray.data.read_json(path, filesystem=fs) + ds = ray.data.read_json(path) df = pd.concat([df1, df2], ignore_index=True) dsdf = ds.to_pandas().sort_values(by=["one", "two"]).reset_index(drop=True) assert df.equals(dsdf) - if fs is None: - shutil.rmtree(path) - else: - fs.delete_dir(_unwrap_protocol(path)) - - -@pytest.mark.parametrize("ignore_missing_paths", [True, False]) -def test_read_json_ignore_missing_paths( - ray_start_regular_shared, - local_path, - ignore_missing_paths, - target_max_block_size_infinite_or_default, -): - df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - path1 = os.path.join(local_path, "test1.json") - df1.to_json(path1, orient="records", lines=True) - - paths = [ - path1, - "missing.json", - ] - - if ignore_missing_paths: - ds = ray.data.read_json(paths, ignore_missing_paths=ignore_missing_paths) - assert ds.input_files() == [path1] - else: - with pytest.raises(FileNotFoundError): - ds = ray.data.read_json(paths, ignore_missing_paths=ignore_missing_paths) - ds.materialize() + shutil.rmtree(path) def test_zipped_json_read( @@ -298,118 +204,71 @@ def test_read_json_fallback_from_pyarrow_failure( assert ds.take_all() == data -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ], -) def test_json_read_meta_provider( ray_start_regular_shared, - fs, - data_path, - endpoint_url, + tmp_path, target_max_block_size_infinite_or_default, ): - if endpoint_url is None: - storage_options = {} - else: - storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) - df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - path1 = 
os.path.join(data_path, "test1.json") - df1.to_json(path1, orient="records", lines=True, storage_options=storage_options) + path1 = os.path.join(tmp_path, "test1.json") + df1.to_json(path1, orient="records", lines=True) ds = ray.data.read_json( path1, - filesystem=fs, meta_provider=FastFileMetadataProvider(), ) # Expect to lazily compute all metadata correctly. assert ds.count() == 3 - assert ds.input_files() == [_unwrap_protocol(path1)] + assert ds.input_files() == [path1] assert ds.schema() == Schema(pa.schema([("one", pa.int64()), ("two", pa.string())])) with pytest.raises(NotImplementedError): ray.data.read_json( path1, - filesystem=fs, meta_provider=BaseFileMetadataProvider(), ) -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ], -) def test_json_read_with_read_options( ray_start_regular_shared, - fs, - data_path, - endpoint_url, + tmp_path, target_max_block_size_infinite_or_default, ): # Arrow's JSON ReadOptions isn't serializable in pyarrow < 8.0.0, so this test # covers our custom ReadOptions serializer. # TODO(Clark): Remove this test and our custom serializer once we require # pyarrow >= 8.0.0. - if endpoint_url is None: - storage_options = {} - else: - storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - path1 = os.path.join(data_path, "test1.json") - df1.to_json(path1, orient="records", lines=True, storage_options=storage_options) + path1 = os.path.join(tmp_path, "test1.json") + df1.to_json(path1, orient="records", lines=True) ds = ray.data.read_json( path1, - filesystem=fs, read_options=pajson.ReadOptions(use_threads=False, block_size=2**30), ) dsdf = ds.to_pandas() assert df1.equals(dsdf) # Test metadata ops. 
assert ds.count() == 3 - assert ds.input_files() == [_unwrap_protocol(path1)] + assert ds.input_files() == [path1] assert ds.schema() == Schema(pa.schema([("one", pa.int64()), ("two", pa.string())])) -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ], -) def test_json_read_with_parse_options( ray_start_regular_shared, - fs, - data_path, - endpoint_url, + tmp_path, target_max_block_size_infinite_or_default, ): # Arrow's JSON ParseOptions isn't serializable in pyarrow < 8.0.0, so this test # covers our custom ParseOptions serializer, similar to ReadOptions in above test. # TODO(chengsu): Remove this test and our custom serializer once we require # pyarrow >= 8.0.0. - if endpoint_url is None: - storage_options = {} - else: - storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - path1 = os.path.join(data_path, "test1.json") - df1.to_json(path1, orient="records", lines=True, storage_options=storage_options) + path1 = os.path.join(tmp_path, "test1.json") + df1.to_json(path1, orient="records", lines=True) ds = ray.data.read_json( path1, - filesystem=fs, parse_options=pajson.ParseOptions( explicit_schema=pa.schema([("two", pa.string())]), unexpected_field_behavior="ignore", @@ -420,77 +279,10 @@ def test_json_read_with_parse_options( assert (df1["two"]).equals(dsdf["two"]) # Test metadata ops. 
assert ds.count() == 3 - assert ds.input_files() == [_unwrap_protocol(path1)] + assert ds.input_files() == [path1] assert ds.schema() == Schema(pa.schema([("two", pa.string())])) -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ], -) -@pytest.mark.parametrize("style", [PartitionStyle.HIVE, PartitionStyle.DIRECTORY]) -def test_json_read_partitioned_with_filter( - style, - ray_start_regular_shared, - fs, - data_path, - endpoint_url, - write_base_partitioned_df, - assert_base_partitioned_ds, - target_max_block_size_infinite_or_default, -): - def df_to_json(dataframe, path, **kwargs): - dataframe.to_json(path, **kwargs) - - storage_options = ( - {} - if endpoint_url is None - else dict(client_kwargs=dict(endpoint_url=endpoint_url)) - ) - file_writer_fn = partial( - df_to_json, - orient="records", - lines=True, - storage_options=storage_options, - ) - partition_keys = ["one"] - - def skip_unpartitioned(kv_dict): - return bool(kv_dict) - - base_dir = os.path.join(data_path, style.value) - partition_path_encoder = PathPartitionEncoder.of( - style=style, - base_dir=base_dir, - field_names=partition_keys, - filesystem=fs, - ) - write_base_partitioned_df( - partition_keys, - partition_path_encoder, - file_writer_fn, - ) - file_writer_fn(pd.DataFrame({"1": [1]}), os.path.join(base_dir, "test.json")) - partition_path_filter = PathPartitionFilter.of( - style=style, - base_dir=base_dir, - field_names=partition_keys, - filter_fn=skip_unpartitioned, - filesystem=fs, - ) - ds = ray.data.read_json( - base_dir, - partition_filter=partition_path_filter, - file_extensions=None, - filesystem=fs, - ) - assert_base_partitioned_ds(ds) - - @pytest.mark.parametrize("override_num_blocks", [None, 1, 3]) def test_jsonl_lists( ray_start_regular_shared, @@ -580,62 +372,31 @@ def 
test_json_roundtrip( assert BlockAccessor.for_block(ray.get(block)).size_bytes() == meta.size_bytes -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ], -) def test_json_read_small_file_unit_block_size( ray_start_regular_shared, - fs, - data_path, - endpoint_url, + tmp_path, target_max_block_size_infinite_or_default, ): """Test reading a small JSON file with unit block_size.""" - if endpoint_url is None: - storage_options = {} - else: - storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - path1 = os.path.join(data_path, "test1.json") - df1.to_json(path1, orient="records", lines=True, storage_options=storage_options) - ds = ray.data.read_json( - path1, filesystem=fs, read_options=pajson.ReadOptions(block_size=1) - ) + path1 = os.path.join(tmp_path, "test1.json") + df1.to_json(path1, orient="records", lines=True) + ds = ray.data.read_json(path1, read_options=pajson.ReadOptions(block_size=1)) dsdf = ds.to_pandas() assert df1.equals(dsdf) # Test metadata ops. 
assert ds.count() == 3 - assert ds.input_files() == [_unwrap_protocol(path1)] + assert ds.input_files() == [path1] assert ds.schema() == Schema(pa.schema([("one", pa.int64()), ("two", pa.string())])) -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ], -) def test_json_read_file_larger_than_block_size( ray_start_regular_shared, - fs, - data_path, - endpoint_url, + tmp_path, target_max_block_size_infinite_or_default, ): """Test reading a JSON file larger than the block size.""" - if endpoint_url is None: - storage_options = {} - else: - storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) - block_size = 1024 num_chars = 2500 num_rows = 3 @@ -645,84 +406,48 @@ def test_json_read_file_larger_than_block_size( "two": ["b" * num_chars for _ in range(num_rows)], } ) - path2 = os.path.join(data_path, "test2.json") - df2.to_json(path2, orient="records", lines=True, storage_options=storage_options) + path2 = os.path.join(tmp_path, "test2.json") + df2.to_json(path2, orient="records", lines=True) ds = ray.data.read_json( - path2, filesystem=fs, read_options=pajson.ReadOptions(block_size=block_size) + path2, read_options=pajson.ReadOptions(block_size=block_size) ) dsdf = ds.to_pandas() assert df2.equals(dsdf) # Test metadata ops. 
assert ds.count() == num_rows - assert ds.input_files() == [_unwrap_protocol(path2)] + assert ds.input_files() == [path2] assert ds.schema() == Schema( pa.schema([("one", pa.string()), ("two", pa.string())]) ) -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ], -) def test_json_read_negative_block_size_fallback( - ray_start_regular_shared, - fs, - data_path, - endpoint_url, - target_max_block_size_infinite_or_default, + ray_start_regular_shared, tmp_path, target_max_block_size_infinite_or_default ): """Test reading JSON with negative block_size triggers fallback to json.load().""" - if endpoint_url is None: - storage_options = {} - else: - storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) df3 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - path3 = os.path.join(data_path, "test3.json") - df3.to_json(path3, orient="records", lines=True, storage_options=storage_options) + path3 = os.path.join(tmp_path, "test3.json") + df3.to_json(path3, orient="records", lines=True) # Negative Buffer Size, fails with arrow but succeeds in fallback to json.load() - ds = ray.data.read_json( - path3, filesystem=fs, read_options=pajson.ReadOptions(block_size=-1) - ) + ds = ray.data.read_json(path3, read_options=pajson.ReadOptions(block_size=-1)) dsdf = ds.to_pandas() assert df3.equals(dsdf) -@pytest.mark.parametrize( - "fs,data_path,endpoint_url", - [ - (None, lazy_fixture("local_path"), None), - (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), - ], -) def test_json_read_zero_block_size_failure( - ray_start_regular_shared, - fs, - data_path, - endpoint_url, - target_max_block_size_infinite_or_default, + ray_start_regular_shared, tmp_path, 
target_max_block_size_infinite_or_default ): """Test reading JSON with zero block_size fails in both arrow and fallback.""" - if endpoint_url is None: - storage_options = {} - else: - storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) df3 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - path3 = os.path.join(data_path, "test3.json") - df3.to_json(path3, orient="records", lines=True, storage_options=storage_options) + path3 = os.path.join(tmp_path, "test3.json") + df3.to_json(path3, orient="records", lines=True) # Zero Buffer Size, fails with arrow and fails in fallback to json.load() with pytest.raises(json.decoder.JSONDecodeError, match="Extra data"): - ds = ray.data.read_json( - path3, filesystem=fs, read_options=pajson.ReadOptions(block_size=0) - ) + ds = ray.data.read_json(path3, read_options=pajson.ReadOptions(block_size=0)) dsdf = ds.to_pandas() assert dsdf.equals(df3) diff --git a/python/ray/data/tests/test_arrow_block_scaling.py b/python/ray/data/tests/test_jumbo_arrow_block.py similarity index 82% rename from python/ray/data/tests/test_arrow_block_scaling.py rename to python/ray/data/tests/test_jumbo_arrow_block.py index 75282dcdb085..ba7fd0b586d3 100644 --- a/python/ray/data/tests/test_arrow_block_scaling.py +++ b/python/ray/data/tests/test_jumbo_arrow_block.py @@ -10,6 +10,7 @@ import ray from ray.data import DataContext from ray.data._internal.util import GiB, MiB +from ray.tests.conftest import _ray_start @pytest.fixture(scope="module") @@ -45,6 +46,18 @@ def parquet_dataset_single_column_gt_2gb(): print(f">>> Cleaning up dataset at {dataset_path}") +@pytest.fixture(scope="module") +def ray_cluster_3gb_object_store(): + original_limit = ray._private.ray_constants.MAC_DEGRADED_PERF_MMAP_SIZE_LIMIT + + ray._private.ray_constants.MAC_DEGRADED_PERF_MMAP_SIZE_LIMIT = 3 * GiB + + with _ray_start(object_store_memory=3 * GiB) as res: + yield res + + ray._private.ray_constants.MAC_DEGRADED_PERF_MMAP_SIZE_LIMIT = original_limit + + 
@pytest.mark.parametrize( "op", [ @@ -54,7 +67,7 @@ def parquet_dataset_single_column_gt_2gb(): ) @pytest.mark.timeout(300) def test_arrow_batch_gt_2gb( - ray_start_regular, + ray_cluster_3gb_object_store, parquet_dataset_single_column_gt_2gb, restore_data_context, op, @@ -76,9 +89,9 @@ def _id(x): # numpy format ds = ds.map_batches( _id, - batch_format="numpy", + batch_format="pyarrow", batch_size=num_rows, - zero_copy_batch=False, + zero_copy_batch=True, ) batch = ds.take_batch() diff --git a/python/ray/data/tests/test_map.py b/python/ray/data/tests/test_map.py index 0bfb35f68f17..439d8d991170 100644 --- a/python/ray/data/tests/test_map.py +++ b/python/ray/data/tests/test_map.py @@ -34,8 +34,9 @@ _MapActorContext, ) from ray.data.context import DataContext +from ray.data.datatype import DataType from ray.data.exceptions import UserCodeException -from ray.data.expressions import col, lit +from ray.data.expressions import col, lit, udf from ray.data.tests.conftest import * # noqa from ray.data.tests.test_util import ConcurrencyCounter # noqa from ray.data.tests.util import column_udf, extract_values @@ -300,8 +301,8 @@ def __call__(self, x): # Test function and class. for fn in [udf, UDFClass]: # Test concurrency with None, single integer and a tuple of integers. - for concurrency in [2, (2, 4)]: - if fn == udf and concurrency == (2, 4): + for concurrency in [2, (2, 4), (2, 6, 4)]: + if fn == udf and (concurrency == (2, 4) or concurrency == (2, 6, 4)): error_message = "``concurrency`` is set as a tuple of integers" with pytest.raises(ValueError, match=error_message): ds.map(fn, concurrency=concurrency).take_all() @@ -311,7 +312,7 @@ def __call__(self, x): # Test concurrency with an illegal value. 
error_message = "``concurrency`` is expected to be set a" - for concurrency in ["dummy", (1, 3, 5)]: + for concurrency in ["dummy", (1, 3, 5, 7)]: with pytest.raises(ValueError, match=error_message): ds.map(UDFClass, concurrency=concurrency).take_all() @@ -2368,6 +2369,348 @@ def test_with_column_multiple_expressions( assert set(ds.schema().names) == {"id", "plus_one", "times_two", "ten_minus_id"} +@pytest.mark.skipif( + get_pyarrow_version() < parse_version("20.0.0"), + reason="with_column requires PyArrow >= 20.0.0", +) +@pytest.mark.parametrize( + "udf_function, column_name, expected_result", + [ + # Single column UDF - add one to each value + pytest.param( + lambda: udf(DataType.int64())(lambda x: pc.add(x, 1)), + "add_one", + 1, # 0 + 1 = 1 + id="single_column_add_one", + ), + # Single column UDF - multiply by 2 + pytest.param( + lambda: udf(DataType.int64())(lambda x: pc.multiply(x, 2)), + "times_two", + 0, # 0 * 2 = 0 + id="single_column_multiply", + ), + # Single column UDF - square the value + pytest.param( + lambda: udf(DataType.int64())(lambda x: pc.multiply(x, x)), + "squared", + 0, # 0 * 0 = 0 + id="single_column_square", + ), + # Single column UDF with string return type + pytest.param( + lambda: udf(DataType.string())(lambda x: pc.cast(x, pa.string())), + "id_str", + "0", # Convert 0 to "0" + id="single_column_to_string", + ), + # Single column UDF with float return type + pytest.param( + lambda: udf(DataType.float64())(lambda x: pc.divide(x, 2.0)), + "half", + 0.0, # 0 / 2.0 = 0.0 + id="single_column_divide_float", + ), + ], +) +def test_with_column_udf_single_column( + ray_start_regular_shared, + udf_function, + column_name, + expected_result, + target_max_block_size_infinite_or_default, +): + """Test UDFExpr functionality with single column operations in with_column.""" + ds = ray.data.range(5) + udf_fn = udf_function() + + # Apply the UDF to the "id" column + ds_with_udf = ds.with_column(column_name, udf_fn(col("id"))) + + result = 
ds_with_udf.take(1)[0] + assert result["id"] == 0 + assert result[column_name] == expected_result + + +@pytest.mark.skipif( + get_pyarrow_version() < parse_version("20.0.0"), + reason="with_column requires PyArrow >= 20.0.0", +) +@pytest.mark.parametrize( + "test_scenario", + [ + # Multi-column UDF - add two columns + pytest.param( + { + "data": [{"a": 1, "b": 2}, {"a": 3, "b": 4}], + "udf": lambda: udf(DataType.int64())(lambda x, y: pc.add(x, y)), + "column_name": "sum_ab", + "expected_first": 3, # 1 + 2 = 3 + "expected_second": 7, # 3 + 4 = 7 + }, + id="multi_column_add", + ), + # Multi-column UDF - multiply two columns + pytest.param( + { + "data": [{"x": 2, "y": 3}, {"x": 4, "y": 5}], + "udf": lambda: udf(DataType.int64())(lambda x, y: pc.multiply(x, y)), + "column_name": "product_xy", + "expected_first": 6, # 2 * 3 = 6 + "expected_second": 20, # 4 * 5 = 20 + }, + id="multi_column_multiply", + ), + # Multi-column UDF - string concatenation + pytest.param( + { + "data": [ + {"first": "John", "last": "Doe"}, + {"first": "Jane", "last": "Smith"}, + ], + "udf": lambda: udf(DataType.string())( + lambda first, last: pc.binary_join_element_wise(first, last, " ") + ), + "column_name": "full_name", + "expected_first": "John Doe", + "expected_second": "Jane Smith", + }, + id="multi_column_string_concat", + ), + ], +) +def test_with_column_udf_multi_column( + ray_start_regular_shared, + test_scenario, + target_max_block_size_infinite_or_default, +): + """Test UDFExpr functionality with multi-column operations in with_column.""" + data = test_scenario["data"] + udf_fn = test_scenario["udf"]() + column_name = test_scenario["column_name"] + expected_first = test_scenario["expected_first"] + expected_second = test_scenario["expected_second"] + + ds = ray.data.from_items(data) + + # Apply UDF to multiple columns based on the scenario + if "a" in data[0] and "b" in data[0]: + ds_with_udf = ds.with_column(column_name, udf_fn(col("a"), col("b"))) + elif "x" in data[0] and "y" in 
data[0]: + ds_with_udf = ds.with_column(column_name, udf_fn(col("x"), col("y"))) + else: # first/last name scenario + ds_with_udf = ds.with_column(column_name, udf_fn(col("first"), col("last"))) + + results = ds_with_udf.take(2) + assert results[0][column_name] == expected_first + assert results[1][column_name] == expected_second + + +@pytest.mark.skipif( + get_pyarrow_version() < parse_version("20.0.0"), + reason="with_column requires PyArrow >= 20.0.0", +) +@pytest.mark.parametrize( + "expression_scenario", + [ + # UDF in arithmetic expression + pytest.param( + { + "expression_factory": lambda add_one_udf: add_one_udf(col("id")) * 2, + "expected": 2, # (0 + 1) * 2 = 2 + "column_name": "udf_times_two", + }, + id="udf_in_arithmetic", + ), + # UDF with literal addition + pytest.param( + { + "expression_factory": lambda add_one_udf: add_one_udf(col("id")) + + lit(10), + "expected": 11, # (0 + 1) + 10 = 11 + "column_name": "udf_plus_literal", + }, + id="udf_plus_literal", + ), + # UDF in comparison + pytest.param( + { + "expression_factory": lambda add_one_udf: add_one_udf(col("id")) > 0, + "expected": True, # (0 + 1) > 0 = True + "column_name": "udf_comparison", + }, + id="udf_in_comparison", + ), + # Nested UDF operations (UDF + regular expression) + pytest.param( + { + "expression_factory": lambda add_one_udf: add_one_udf(col("id") + 5), + "expected": 6, # add_one(0 + 5) = add_one(5) = 6 + "column_name": "nested_udf", + }, + id="nested_udf_expression", + ), + ], +) +def test_with_column_udf_in_complex_expressions( + ray_start_regular_shared, + expression_scenario, + target_max_block_size_infinite_or_default, +): + """Test UDFExpr functionality in complex expressions with with_column.""" + ds = ray.data.range(5) + + # Create a simple add_one UDF for use in expressions + @udf(DataType.int64()) + def add_one(x: pa.Array) -> pa.Array: + return pc.add(x, 1) + + expression = expression_scenario["expression_factory"](add_one) + expected = expression_scenario["expected"] + 
column_name = expression_scenario["column_name"] + + ds_with_expr = ds.with_column(column_name, expression) + + result = ds_with_expr.take(1)[0] + assert result["id"] == 0 + assert result[column_name] == expected + + +@pytest.mark.skipif( + get_pyarrow_version() < parse_version("20.0.0"), + reason="with_column requires PyArrow >= 20.0.0", +) +def test_with_column_udf_multiple_udfs( + ray_start_regular_shared, target_max_block_size_infinite_or_default +): + """Test applying multiple UDFs in sequence with with_column.""" + ds = ray.data.range(5) + + # Define multiple UDFs + @udf(DataType.int64()) + def add_one(x: pa.Array) -> pa.Array: + return pc.add(x, 1) + + @udf(DataType.int64()) + def multiply_by_two(x: pa.Array) -> pa.Array: + return pc.multiply(x, 2) + + @udf(DataType.float64()) + def divide_by_three(x: pa.Array) -> pa.Array: + return pc.divide(x, 3.0) + + # Apply UDFs in sequence + ds = ds.with_column("plus_one", add_one(col("id"))) + ds = ds.with_column("times_two", multiply_by_two(col("plus_one"))) + ds = ds.with_column("div_three", divide_by_three(col("times_two"))) + + # Convert to pandas and compare with expected result + result_df = ds.to_pandas() + + expected_df = pd.DataFrame( + { + "id": [0, 1, 2, 3, 4], + "plus_one": [1, 2, 3, 4, 5], # id + 1 + "times_two": [2, 4, 6, 8, 10], # (id + 1) * 2 + "div_three": [ + 2.0 / 3.0, + 4.0 / 3.0, + 2.0, + 8.0 / 3.0, + 10.0 / 3.0, + ], # ((id + 1) * 2) / 3 + } + ) + + pd.testing.assert_frame_equal(result_df, expected_df) + + +@pytest.mark.skipif( + get_pyarrow_version() < parse_version("20.0.0"), + reason="with_column requires PyArrow >= 20.0.0", +) +def test_with_column_mixed_udf_and_regular_expressions( + ray_start_regular_shared, target_max_block_size_infinite_or_default +): + """Test mixing UDF expressions and regular expressions in with_column operations.""" + ds = ray.data.range(5) + + # Define a UDF for testing + @udf(DataType.int64()) + def multiply_by_three(x: pa.Array) -> pa.Array: + return pc.multiply(x, 
3) + + # Mix regular expressions and UDF expressions + ds = ds.with_column("plus_ten", col("id") + 10) # Regular expression + ds = ds.with_column("times_three", multiply_by_three(col("id"))) # UDF expression + ds = ds.with_column("minus_five", col("id") - 5) # Regular expression + ds = ds.with_column( + "udf_plus_regular", multiply_by_three(col("id")) + col("plus_ten") + ) # Mixed: UDF + regular + ds = ds.with_column( + "comparison", col("times_three") > col("plus_ten") + ) # Regular expression using UDF result + + # Convert to pandas and compare with expected result + result_df = ds.to_pandas() + + expected_df = pd.DataFrame( + { + "id": [0, 1, 2, 3, 4], + "plus_ten": [10, 11, 12, 13, 14], # id + 10 + "times_three": [0, 3, 6, 9, 12], # id * 3 + "minus_five": [-5, -4, -3, -2, -1], # id - 5 + "udf_plus_regular": [10, 14, 18, 22, 26], # (id * 3) + (id + 10) + "comparison": [False, False, False, False, False], # times_three > plus_ten + } + ) + + pd.testing.assert_frame_equal(result_df, expected_df) + + +@pytest.mark.skipif( + get_pyarrow_version() < parse_version("20.0.0"), + reason="with_column requires PyArrow >= 20.0.0", +) +def test_with_column_udf_invalid_return_type_validation( + ray_start_regular_shared, target_max_block_size_infinite_or_default +): + """Test that UDFs returning invalid types raise TypeError with clear message.""" + ds = ray.data.range(3) + + # Test UDF returning invalid type (dict) - expecting string but returning dict + @udf(DataType.string()) + def invalid_dict_return(x: pa.Array) -> dict: + return {"invalid": "return_type"} + + # Test UDF returning invalid type (str) - expecting string but returning plain str + @udf(DataType.string()) + def invalid_str_return(x: pa.Array) -> str: + return "invalid_string" + + # Test UDF returning invalid type (int) - expecting int64 but returning plain int + @udf(DataType.int64()) + def invalid_int_return(x: pa.Array) -> int: + return 42 + + # Test each invalid return type + test_cases = [ + 
(invalid_dict_return, "dict"), + (invalid_str_return, "str"), + (invalid_int_return, "int"), + ] + + for invalid_udf, expected_type_name in test_cases: + with pytest.raises((RayTaskError, UserCodeException)) as exc_info: + ds.with_column("invalid_col", invalid_udf(col("id"))).take(1) + + # The actual TypeError gets wrapped, so we need to check the exception chain + error_message = str(exc_info.value) + assert f"returned invalid type {expected_type_name}" in error_message + assert "Expected type" in error_message + assert "pandas.Series" in error_message and "numpy.ndarray" in error_message + + if __name__ == "__main__": import sys diff --git a/python/ray/data/tests/test_metadata_provider.py b/python/ray/data/tests/test_metadata_provider.py index ef4899abd085..b8d49544c9da 100644 --- a/python/ray/data/tests/test_metadata_provider.py +++ b/python/ray/data/tests/test_metadata_provider.py @@ -6,8 +6,6 @@ from unittest.mock import patch import pandas as pd -import pyarrow as pa -import pyarrow.parquet as pq import pytest from pyarrow.fs import LocalFileSystem from pytest_lazy_fixtures import lf as lazy_fixture @@ -17,7 +15,6 @@ DefaultFileMetadataProvider, FastFileMetadataProvider, FileMetadataProvider, - ParquetMetadataProvider, ) from ray.data.datasource.file_based_datasource import ( FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD, @@ -40,13 +37,6 @@ def df_to_csv(dataframe, path, **kwargs): dataframe.to_csv(path, **kwargs) -def _get_parquet_file_meta_size_bytes(file_metas): - return sum( - sum(m.row_group(i).total_byte_size for i in range(m.num_row_groups)) - for m in file_metas - ) - - def _get_file_sizes_bytes(paths, fs): from pyarrow.fs import FileType @@ -71,55 +61,6 @@ def test_file_metadata_providers_not_implemented(): meta_provider.expand_paths(["/foo/bar.csv"], None) -@pytest.mark.parametrize( - "fs,data_path", - [ - (None, lazy_fixture("local_path")), - (lazy_fixture("local_fs"), lazy_fixture("local_path")), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path")), - ( 
- lazy_fixture("s3_fs_with_space"), - lazy_fixture("s3_path_with_space"), - ), # Path contains space. - ( - lazy_fixture("s3_fs_with_special_chars"), - lazy_fixture("s3_path_with_special_chars"), - ), - ], -) -def test_default_parquet_metadata_provider(fs, data_path): - path_module = os.path if urllib.parse.urlparse(data_path).scheme else posixpath - paths = [ - path_module.join(data_path, "test1.parquet"), - path_module.join(data_path, "test2.parquet"), - ] - paths, fs = _resolve_paths_and_filesystem(paths, fs) - - df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - table = pa.Table.from_pandas(df1) - pq.write_table(table, paths[0], filesystem=fs) - df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) - table = pa.Table.from_pandas(df2) - pq.write_table(table, paths[1], filesystem=fs) - - meta_provider = ParquetMetadataProvider() - pq_ds = pq.ParquetDataset(paths, filesystem=fs) - fragment_file_metas = meta_provider.prefetch_file_metadata(pq_ds.fragments) - - meta = meta_provider( - [p.path for p in pq_ds.fragments], - num_fragments=len(pq_ds.fragments), - prefetched_metadata=fragment_file_metas, - ) - expected_meta_size_bytes = _get_parquet_file_meta_size_bytes( - [f.metadata for f in pq_ds.fragments] - ) - assert meta.size_bytes == expected_meta_size_bytes - assert meta.num_rows == 6 - assert len(paths) == 2 - assert all(path in meta.input_files for path in paths) - - @pytest.mark.parametrize( "fs,data_path,endpoint_url", [ diff --git a/python/ray/data/tests/test_numpy.py b/python/ray/data/tests/test_numpy.py index a4240c18041a..fa6a26fd404a 100644 --- a/python/ray/data/tests/test_numpy.py +++ b/python/ray/data/tests/test_numpy.py @@ -4,7 +4,6 @@ import pandas as pd import pyarrow as pa import pytest -from pytest_lazy_fixtures import lf as lazy_fixture import ray from ray.air.util.tensor_extensions.arrow import ArrowTensorTypeV2 @@ -12,14 +11,10 @@ from ray.data.datasource import ( BaseFileMetadataProvider, FastFileMetadataProvider, - 
Partitioning, - PartitionStyle, - PathPartitionFilter, ) from ray.data.extensions.tensor_extension import ArrowTensorType from ray.data.tests.conftest import * # noqa from ray.data.tests.mock_http_server import * # noqa -from ray.data.tests.test_partitioning import PathPartitionEncoder from ray.data.tests.util import extract_values from ray.tests.conftest import * # noqa @@ -32,17 +27,6 @@ def _get_tensor_type(): ) -def test_numpy_read_partitioning(ray_start_regular_shared, tmp_path): - path = os.path.join(tmp_path, "country=us", "data.npy") - os.mkdir(os.path.dirname(path)) - np.save(path, np.arange(4).reshape([2, 2])) - - ds = ray.data.read_numpy(path, partitioning=Partitioning("hive")) - - assert ds.schema().names == ["data", "country"] - assert [r["country"] for r in ds.take()] == ["us", "us"] - - @pytest.mark.parametrize("from_ref", [False, True]) def test_from_numpy(ray_start_regular_shared, from_ref): arr1 = np.expand_dims(np.arange(0, 4), axis=1) @@ -109,24 +93,12 @@ def test_to_numpy_refs(ray_start_regular_shared): ) -@pytest.mark.parametrize( - "fs,data_path", - [ - (None, lazy_fixture("local_path")), - (lazy_fixture("local_fs"), lazy_fixture("local_path")), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path")), - ( - lazy_fixture("s3_fs_with_anonymous_crendential"), - lazy_fixture("s3_path_with_anonymous_crendential"), - ), - ], -) -def test_numpy_roundtrip(ray_start_regular_shared, fs, data_path): +def test_numpy_roundtrip(ray_start_regular_shared, tmp_path): tensor_type = _get_tensor_type() ds = ray.data.range_tensor(10, override_num_blocks=2) - ds.write_numpy(data_path, filesystem=fs, column="data") - ds = ray.data.read_numpy(data_path, filesystem=fs) + ds.write_numpy(tmp_path, column="data") + ds = ray.data.read_numpy(tmp_path) assert ds.count() == 10 assert ds.schema() == Schema(pa.schema([("data", tensor_type((1,), pa.int64()))])) assert sorted(ds.take_all(), key=lambda row: row["data"]) == [ @@ -158,28 +130,6 @@ def 
test_numpy_read_x(ray_start_regular_shared, tmp_path): assert [v["data"].item() for v in ds.take(2)] == [0, 1] -@pytest.mark.parametrize("ignore_missing_paths", [True, False]) -def test_numpy_read_ignore_missing_paths( - ray_start_regular_shared, tmp_path, ignore_missing_paths -): - path = os.path.join(tmp_path, "test_np_dir") - os.mkdir(path) - np.save(os.path.join(path, "test.npy"), np.expand_dims(np.arange(0, 10), 1)) - - paths = [ - os.path.join(path, "test.npy"), - "missing.npy", - ] - - if ignore_missing_paths: - ds = ray.data.read_numpy(paths, ignore_missing_paths=ignore_missing_paths) - assert ds.input_files() == [paths[0]] - else: - with pytest.raises(FileNotFoundError): - ds = ray.data.read_numpy(paths, ignore_missing_paths=ignore_missing_paths) - ds.materialize() - - def test_numpy_read_meta_provider(ray_start_regular_shared, tmp_path): tensor_type = _get_tensor_type() @@ -203,64 +153,6 @@ def test_numpy_read_meta_provider(ray_start_regular_shared, tmp_path): ) -@pytest.mark.parametrize("style", [PartitionStyle.HIVE, PartitionStyle.DIRECTORY]) -def test_numpy_read_partitioned_with_filter( - style, - ray_start_regular_shared, - tmp_path, - write_partitioned_df, - assert_base_partitioned_ds, -): - tensor_type = _get_tensor_type() - - def df_to_np(dataframe, path, **kwargs): - np.save(path, dataframe.to_numpy(dtype=np.dtype(np.int8)), **kwargs) - - df = pd.DataFrame({"one": [1, 1, 1, 3, 3, 3], "two": [0, 1, 2, 3, 4, 5]}) - partition_keys = ["one"] - - def skip_unpartitioned(kv_dict): - return bool(kv_dict) - - base_dir = os.path.join(tmp_path, style.value) - partition_path_encoder = PathPartitionEncoder.of( - style=style, - base_dir=base_dir, - field_names=partition_keys, - ) - write_partitioned_df( - df, - partition_keys, - partition_path_encoder, - df_to_np, - ) - df_to_np(df, os.path.join(base_dir, "test.npy")) - partition_path_filter = PathPartitionFilter.of( - style=style, - base_dir=base_dir, - field_names=partition_keys, - 
filter_fn=skip_unpartitioned, - ) - ds = ray.data.read_numpy(base_dir, partition_filter=partition_path_filter) - - def sorted_values_transform_fn(sorted_values): - # HACK: `assert_base_partitioned_ds` doesn't properly sort the values. This is a - # hack to make the test pass. - # TODO(@bveeramani): Clean this up. - actually_sorted_values = sorted(sorted_values[0], key=lambda item: tuple(item)) - return str([actually_sorted_values]) - - vals = [[1, 0], [1, 1], [1, 2], [3, 3], [3, 4], [3, 5]] - val_str = "".join(f"array({v}, dtype=int8), " for v in vals)[:-2] - assert_base_partitioned_ds( - ds, - schema=Schema(pa.schema([("data", tensor_type((2,), pa.int8()))])), - sorted_values=f"[[{val_str}]]", - ds_take_transform_fn=lambda taken: [extract_values("data", taken)], - sorted_values_transform_fn=sorted_values_transform_fn, - ) - - def test_numpy_write(ray_start_regular_shared, tmp_path): ds = ray.data.range_tensor(1) diff --git a/python/ray/data/tests/test_object_gc.py b/python/ray/data/tests/test_object_gc.py index fd6b5e3f49a4..2994a57bcf77 100644 --- a/python/ray/data/tests/test_object_gc.py +++ b/python/ray/data/tests/test_object_gc.py @@ -33,17 +33,6 @@ def _all_executor_threads_exited(): wait_for_condition(_all_executor_threads_exited, timeout=10, retry_interval_ms=1000) -def check_to_torch_no_spill(ctx, dataset): - # Iterate over the dataset for 10 epochs to stress test that - # no spilling will happen. - max_epoch = 10 - for _ in range(max_epoch): - for _ in dataset.to_torch(batch_size=None): - pass - meminfo = memory_summary(ctx.address_info["address"], stats_only=True) - assert "Spilled" not in meminfo, meminfo - - def check_iter_torch_batches_no_spill(ctx, dataset): # Iterate over the dataset for 10 epochs to stress test that # no spilling will happen. @@ -93,8 +82,6 @@ def test_torch_iteration(shutdown_only): # The size of dataset is 500*(80*80*4)*8B, about 100MB. 
ds = ray.data.range_tensor(500, shape=(80, 80, 4), override_num_blocks=100) - # to_torch - check_to_torch_no_spill(ctx, ds) # iter_torch_batches check_iter_torch_batches_no_spill(ctx, ds) diff --git a/python/ray/data/tests/test_operators.py b/python/ray/data/tests/test_operators.py index e361b237fc37..8ef0f2267386 100644 --- a/python/ray/data/tests/test_operators.py +++ b/python/ray/data/tests/test_operators.py @@ -11,10 +11,8 @@ import ray from ray._common.test_utils import wait_for_condition +from ray.data._internal.actor_autoscaler import ActorPoolScalingRequest from ray.data._internal.compute import ActorPoolStrategy, TaskPoolStrategy -from ray.data._internal.execution.autoscaler.default_autoscaler import ( - ActorPoolScalingRequest, -) from ray.data._internal.execution.interfaces import ( ExecutionOptions, PhysicalOperator, @@ -123,7 +121,7 @@ def dummy_all_transform(bundles: List[RefBundle], ctx): dummy_all_transform, input_op, DataContext.get_current(), - target_max_block_size=DataContext.get_current().target_max_block_size, + target_max_block_size_override=DataContext.get_current().target_max_block_size, num_outputs=2, sub_progress_bar_names=["Test1", "Test2"], name="TestAll", @@ -141,7 +139,9 @@ def dummy_all_transform(bundles: List[RefBundle], ctx): # Check we return transformed bundles. 
assert not op.completed() - assert _take_outputs(op) == [[1, 2], [3, 4]] + outputs = _take_outputs(op) + expected = [[1, 2], [3, 4]] + assert sorted(outputs) == expected, f"Expected {expected}, got {outputs}" stats = op.get_stats() assert "FooStats" in stats assert op.completed() @@ -172,7 +172,7 @@ def dummy_all_transform(bundles: List[RefBundle]): dummy_all_transform, input_op=op1, data_context=DataContext.get_current(), - target_max_block_size=DataContext.get_current().target_max_block_size, + target_max_block_size_override=DataContext.get_current().target_max_block_size, name="TestAll", ) assert op2.num_outputs_total() is None @@ -517,7 +517,9 @@ def test_map_operator_ray_args(shutdown_only, use_actors): run_op_tasks_sync(op) # Check we don't hang and complete with num_gpus=1. - assert _take_outputs(op) == [[i * 2] for i in range(10)] + outputs = _take_outputs(op) + expected = [[i * 2] for i in range(10)] + assert sorted(outputs) == expected, f"Expected {expected}, got {outputs}" assert op.completed() diff --git a/python/ray/data/tests/test_parquet.py b/python/ray/data/tests/test_parquet.py index dd43f22e1ee2..59de820688c8 100644 --- a/python/ray/data/tests/test_parquet.py +++ b/python/ray/data/tests/test_parquet.py @@ -2,7 +2,7 @@ import shutil import time from dataclasses import dataclass -from typing import Any, Optional +from typing import Optional import numpy as np import pandas as pd @@ -21,19 +21,15 @@ from ray.data import FileShuffleConfig, Schema from ray.data._internal.datasource.parquet_bulk_datasource import ParquetBulkDatasource from ray.data._internal.datasource.parquet_datasource import ( - NUM_CPUS_FOR_META_FETCH_TASK, ParquetDatasource, - SerializedFragment, - _deserialize_fragments_with_retry, ) from ray.data._internal.execution.interfaces.ref_bundle import ( _ref_bundles_iterator_to_block_refs_list, ) from ray.data._internal.util import rows_same -from ray.data.block import BlockAccessor, BlockMetadata +from ray.data.block import 
BlockAccessor from ray.data.context import DataContext -from ray.data.datasource import DefaultFileMetadataProvider, ParquetMetadataProvider -from ray.data.datasource.parquet_meta_provider import PARALLELIZE_META_FETCH_THRESHOLD +from ray.data.datasource import DefaultFileMetadataProvider from ray.data.datasource.partitioning import Partitioning, PathPartitionFilter from ray.data.datasource.path_util import _unwrap_protocol from ray.data.tests.conftest import * # noqa @@ -114,70 +110,6 @@ def test_include_paths( assert paths == [path, path] -@pytest.mark.parametrize( - "fs,data_path", - [ - (lazy_fixture("local_fs"), lazy_fixture("local_path")), - ], -) -def test_parquet_deserialize_fragments_with_retry( - ray_start_regular_shared, fs, data_path, monkeypatch -): - setup_data_path = _unwrap_protocol(data_path) - df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - table = pa.Table.from_pandas(df1) - path1 = os.path.join(setup_data_path, "test1.parquet") - pq.write_table(table, path1, filesystem=fs) - df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) - table = pa.Table.from_pandas(df2) - path2 = os.path.join(setup_data_path, "test2.parquet") - pq.write_table(table, path2, filesystem=fs) - - dataset_kwargs = {} - pq_ds = pq.ParquetDataset( - data_path, - **dataset_kwargs, - filesystem=fs, - ) - serialized_fragments = [SerializedFragment(p) for p in pq_ds.fragments] - - # test 1st attempt succeed - fragments = _deserialize_fragments_with_retry(serialized_fragments) - assert "test1.parquet" in fragments[0].path - assert "test2.parquet" in fragments[1].path - - # test the 3rd attempt succeed with a mock function constructed - # to throw in the first two attempts - class MockDeserializer: - def __init__(self, planned_exp_or_return): - self.planned_exp_or_return = planned_exp_or_return - self.cur_index = 0 - - def __call__(self, *args: Any, **kwds: Any) -> Any: - exp_or_ret = self.planned_exp_or_return[self.cur_index] - self.cur_index += 1 - if 
isinstance(exp_or_ret, Exception): - raise exp_or_ret - else: - return exp_or_ret - - mock_deserializer = MockDeserializer( - [ - Exception("1st mock failed attempt"), - Exception("2nd mock failed attempt"), - fragments, - ] - ) - monkeypatch.setattr( - ray.data._internal.datasource.parquet_datasource, - "_deserialize_fragments", - mock_deserializer, - ) - retried_fragments = _deserialize_fragments_with_retry(serialized_fragments) - assert "test1.parquet" in retried_fragments[0].path - assert "test2.parquet" in retried_fragments[1].path - - @pytest.mark.parametrize( "fs,data_path", [ @@ -243,137 +175,6 @@ def test_parquet_read_basic( assert sorted(values) == [1, 2, 3, 4, 5, 6] -@pytest.mark.parametrize( - "fs,data_path", - [ - (None, lazy_fixture("local_path")), - (lazy_fixture("local_fs"), lazy_fixture("local_path")), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path")), - ( - lazy_fixture("s3_fs_with_anonymous_crendential"), - lazy_fixture("s3_path_with_anonymous_crendential"), - ), - ], -) -def test_parquet_read_meta_provider(ray_start_regular_shared, fs, data_path): - df1 = pd.DataFrame({"one": range(30_000), "two": ["a", "b", "c"] * 10_000}) - table = pa.Table.from_pandas(df1) - setup_data_path = _unwrap_protocol(data_path) - path1 = os.path.join(setup_data_path, "test1.parquet") - pq.write_table(table, path1, filesystem=fs) - df2 = pd.DataFrame({"one": range(30_000, 60_000), "two": ["e", "f", "g"] * 10000}) - table = pa.Table.from_pandas(df2) - path2 = os.path.join(setup_data_path, "test2.parquet") - pq.write_table(table, path2, filesystem=fs) - - expected_num_rows = len(df1) + len(df2) - expected_byte_size = 787500 - - # - # Case 1: Test metadata fetching happy path (obtaining, caching and propagating - # metadata) - # - - class AssertingMetadataProvider(ParquetMetadataProvider): - def prefetch_file_metadata(self, fragments, **ray_remote_args): - assert ray_remote_args["num_cpus"] == NUM_CPUS_FOR_META_FETCH_TASK - assert ( - 
ray_remote_args["scheduling_strategy"] - == DataContext.get_current().scheduling_strategy - ) - return super().prefetch_file_metadata(fragments, **ray_remote_args) - - ds = ray.data.read_parquet( - data_path, - filesystem=fs, - meta_provider=AssertingMetadataProvider(), - ) - - # Expect precomputed row counts and block sizes to be missing. - assert ds._meta_count() == expected_num_rows - - read_op = ds._plan._logical_plan.dag - - # Assert Read op metadata propagation - assert read_op.infer_metadata() == BlockMetadata( - num_rows=expected_num_rows, - size_bytes=expected_byte_size, - exec_stats=None, - input_files=[path1, path2], - ) - - expected_schema = pa.schema({"one": pa.int64(), "two": pa.string()}) - - assert read_op.infer_schema().equals(expected_schema) - - # Expected - # - Fetched Parquet metadata to be reused - # - *No* dataset execution performed - assert ds.count() == expected_num_rows - assert ds.size_bytes() == expected_byte_size - assert ds.schema() == Schema(expected_schema) - assert set(ds.input_files()) == {path1, path2} - - assert not ds._plan.has_computed_output() - - expected_values = list( - zip(range(60_000), ["a", "b", "c"] * 10_000 + ["e", "f", "g"] * 10_000) - ) - - values = [(s["one"], s["two"]) for s in ds.take(60000)] - - exec_stats = ds._plan._snapshot_stats - read_stats = exec_stats.parents[0] - - # Assert that ref-bundles - # - Passed to ReadParquet hold metadata matching actual bundle - # - Produced by ReadParquet reflects actual amount of bytes read - assert read_stats.base_name == "ReadParquet" - # NOTE: Size of the task should be ~5kb, but could vary from platform to platform - # alas for different Python versions. 
However, it is substantially smaller - # than the dataset itself (~750kb) - assert read_stats.extra_metrics["average_bytes_inputs_per_task"] < 10_000 - - # TODO stats are broken for iteration-based executions due to the fact - # that returned stats object is obtained before iteration completes, - # hence not capturing the final state of the pipeline - # assert ( - # read_stats.extra_metrics["bytes_task_outputs_generated"] == expected_byte_size - # ) - - assert sorted(values) == expected_values - - # - # Case 2: Test metadata fetching *failing* (falling back to actually - # executing the dataset) - # - - class FailingMetadataProvider(ParquetMetadataProvider): - def prefetch_file_metadata(self, fragments, **ray_remote_args): - assert ray_remote_args["num_cpus"] == NUM_CPUS_FOR_META_FETCH_TASK - assert ( - ray_remote_args["scheduling_strategy"] - == DataContext.get_current().scheduling_strategy - ) - return None - - ds = ray.data.read_parquet( - data_path, - filesystem=fs, - meta_provider=FailingMetadataProvider(), - ) - - # Expected - # - Fetched Parquet metadata is not used (returns null), hence - # - Dataset execution has to be performed - assert ds.count() == expected_num_rows - assert ds.size_bytes() == expected_byte_size - assert ds.schema() == Schema(expected_schema) - assert set(ds.input_files()) == {path1, path2} - - assert ds._plan.has_computed_output() - - @pytest.mark.parametrize( "fs,data_path", [ @@ -800,6 +601,32 @@ def test_parquet_read_partitioned_explicit( ] +def test_proper_projection_for_partitioned_datasets(temp_dir): + ds = ray.data.read_parquet("example://iris.parquet").materialize() + + partitioned_ds_path = f"{temp_dir}/partitioned_iris" + # Write out partitioned dataset + ds.write_parquet(partitioned_ds_path, partition_cols=["variety"]) + + partitioned_ds = ray.data.read_parquet( + partitioned_ds_path, columns=["variety"] + ).materialize() + + print(partitioned_ds.schema()) + + assert [ + "sepal.length", + "sepal.width", + "petal.length", + 
"petal.width", + "variety", + ] == ds.take_batch(batch_format="pyarrow").column_names + + assert ["variety"] == partitioned_ds.take_batch(batch_format="pyarrow").column_names + + assert ds.count() == partitioned_ds.count() + + def test_parquet_read_with_udf( ray_start_regular_shared, tmp_path, target_max_block_size_infinite_or_default ): @@ -850,59 +677,18 @@ def _block_udf(block: pa.Table): np.testing.assert_array_equal(sorted(ones), np.array(one_data[:2]) + 1) -@pytest.mark.parametrize( - "fs,data_path", - [ - (None, lazy_fixture("local_path")), - (lazy_fixture("local_fs"), lazy_fixture("local_path")), - (lazy_fixture("s3_fs"), lazy_fixture("s3_path")), - (lazy_fixture("s3_fs_with_space"), lazy_fixture("s3_path_with_space")), - ( - lazy_fixture("s3_fs_with_anonymous_crendential"), - lazy_fixture("s3_path_with_anonymous_crendential"), - ), - ], -) -def test_parquet_read_parallel_meta_fetch( - ray_start_regular_shared, fs, data_path, target_max_block_size_infinite_or_default -): - setup_data_path = _unwrap_protocol(data_path) - num_dfs = PARALLELIZE_META_FETCH_THRESHOLD + 1 - for idx in range(num_dfs): - df = pd.DataFrame({"one": list(range(3 * idx, 3 * (idx + 1)))}) - table = pa.Table.from_pandas(df) - path = os.path.join(setup_data_path, f"test_{idx}.parquet") - pq.write_table(table, path, filesystem=fs) - - parallelism = 8 - ds = ray.data.read_parquet( - data_path, filesystem=fs, override_num_blocks=parallelism - ) - - # Test metadata-only parquet ops. - assert ds.count() == num_dfs * 3 - assert ds.size_bytes() > 0 - # Schema information and input files are available from Parquet metadata, - # so we do not need to compute the first block. - assert ds.schema() is not None - input_files = ds.input_files() - assert len(input_files) == num_dfs, input_files - - # Forces a data read. 
- values = [s["one"] for s in ds.take(limit=3 * num_dfs)] - assert sorted(values) == list(range(3 * num_dfs)) - - def test_parquet_reader_estimate_data_size(shutdown_only, tmp_path): ctx = ray.data.context.DataContext.get_current() old_decoding_size_estimation = ctx.decoding_size_estimation ctx.decoding_size_estimation = True try: tensor_output_path = os.path.join(tmp_path, "tensor") - ray.data.range_tensor(1000, shape=(1000,)).write_parquet(tensor_output_path) - ds = ray.data.read_parquet( - tensor_output_path, meta_provider=ParquetMetadataProvider() - ) + # NOTE: It's crucial to override # of blocks to get stable # of files + # produced and make sure data size estimates are stable + ray.data.range_tensor( + 1000, shape=(1000,), override_num_blocks=10 + ).write_parquet(tensor_output_path) + ds = ray.data.read_parquet(tensor_output_path) assert ds._plan.initial_num_blocks() > 1 data_size = ds.size_bytes() assert ( @@ -913,9 +699,7 @@ def test_parquet_reader_estimate_data_size(shutdown_only, tmp_path): data_size >= 7_000_000 and data_size <= 10_000_000 ), "actual data size is out of expected bound" - datasource = ParquetDatasource( - tensor_output_path, meta_provider=ParquetMetadataProvider() - ) + datasource = ParquetDatasource(tensor_output_path) assert ( datasource._encoding_ratio >= 300 and datasource._encoding_ratio <= 600 ), "encoding ratio is out of expected bound" @@ -925,43 +709,35 @@ def test_parquet_reader_estimate_data_size(shutdown_only, tmp_path): ), "estimated data size is either out of expected bound" assert ( data_size - == ParquetDatasource( - tensor_output_path, meta_provider=ParquetMetadataProvider() - ).estimate_inmemory_data_size() + == ParquetDatasource(tensor_output_path).estimate_inmemory_data_size() ), "estimated data size is not deterministic in multiple calls." 
text_output_path = os.path.join(tmp_path, "text") ray.data.range(1000).map(lambda _: {"text": "a" * 1000}).write_parquet( text_output_path ) - ds = ray.data.read_parquet( - text_output_path, meta_provider=ParquetMetadataProvider() - ) + ds = ray.data.read_parquet(text_output_path) assert ds._plan.initial_num_blocks() > 1 data_size = ds.size_bytes() assert ( - data_size >= 800_000 and data_size <= 2_000_000 + data_size >= 700_000 and data_size <= 2_200_000 ), "estimated data size is out of expected bound" data_size = ds.materialize().size_bytes() assert ( data_size >= 1_000_000 and data_size <= 2_000_000 ), "actual data size is out of expected bound" - datasource = ParquetDatasource( - text_output_path, meta_provider=ParquetMetadataProvider() - ) + datasource = ParquetDatasource(text_output_path) assert ( - datasource._encoding_ratio >= 9 and datasource._encoding_ratio <= 300 + datasource._encoding_ratio >= 6 and datasource._encoding_ratio <= 300 ), "encoding ratio is out of expected bound" data_size = datasource.estimate_inmemory_data_size() assert ( - data_size >= 800_000 and data_size <= 2_000_000 + data_size >= 700_000 and data_size <= 2_200_000 ), "estimated data size is out of expected bound" assert ( data_size - == ParquetDatasource( - text_output_path, meta_provider=ParquetMetadataProvider() - ).estimate_inmemory_data_size() + == ParquetDatasource(text_output_path).estimate_inmemory_data_size() ), "estimated data size is not deterministic in multiple calls." finally: ctx.decoding_size_estimation = old_decoding_size_estimation @@ -2223,6 +1999,128 @@ def test_parquet_write_parallel_overwrite( assert result.count() == 1000 +def test_read_parquet_with_none_partitioning_and_columns(tmp_path): + # Test for https://github.com/ray-project/ray/issues/55279. 
+ table = pa.table({"column": [42]}) + path = os.path.join(tmp_path, "file.parquet") + pq.write_table(table, path) + + ds = ray.data.read_parquet(path, partitioning=None, columns=["column"]) + + assert ds.take_all() == [{"column": 42}] + + +def _create_test_data(num_rows: int) -> dict: + return { + "int_col": list(range(num_rows)), + "float_col": [float(i) for i in range(num_rows)], + "str_col": [f"str_{i}" for i in range(num_rows)], + } + + +@pytest.mark.parametrize( + "batch_size,filter_expr,expected_rows,description", + [ + # No batch size cases + (None, "int_col > 500", 499, "No batch size, int > 500"), + (None, "int_col < 200", 200, "No batch size, int < 200"), + ( + None, + "float_col == 42.0", + 1, + "No batch size, float == 42.0", + ), + ( + None, + "str_col == 'str_42'", + 1, + "No batch size, str == str_42", + ), + # Batch size cases + (100, "int_col > 500", 499, "Fixed batch size, int > 500"), + (200, "int_col < 200", 200, "Fixed batch size, int < 200"), + ( + 300, + "float_col == 42.0", + 1, + "Fixed batch size, float == 42.0", + ), + ( + 400, + "str_col == 'str_42'", + 1, + "Fixed batch size, str == str_42", + ), + ], +) +def test_read_parquet_with_filter_selectivity( + ray_start_regular_shared, + tmp_path, + batch_size, + filter_expr, + expected_rows, + description, +): + """Test reading parquet files with filter expressions and different batch sizes.""" + num_rows = 1000 + data = _create_test_data(num_rows) + table = pa.Table.from_pydict(data) + + file_path = os.path.join(tmp_path, "test.parquet") + pq.write_table(table, file_path, row_group_size=200) + + if batch_size is not None: + ray.data.DataContext.get_current().target_max_block_size = batch_size + ds = ray.data.read_parquet(file_path).filter(expr=filter_expr) + + assert ds.count() == expected_rows, ( + f"{description}: Filter '{filter_expr}' returned {ds.count()} rows, " + f"expected {expected_rows}" + ) + + # Verify schema has expected columns and types + assert ds.schema().base_schema == 
table.schema + + +@pytest.mark.parametrize("batch_size", [None, 100, 200, 10_000]) +@pytest.mark.parametrize( + "columns", + [ + # Empty projection + [], + ["int_col"], + ["int_col", "float_col", "str_col"], + ], +) +def test_read_parquet_with_columns_selectivity( + ray_start_regular_shared, + tmp_path, + batch_size, + columns, +): + """Test reading parquet files with different column selections and batch sizes.""" + num_rows = 1000 + data = _create_test_data(num_rows) + table = pa.Table.from_pydict(data) + + file_path = os.path.join(tmp_path, "test.parquet") + pq.write_table(table, file_path, row_group_size=200) + + if batch_size is not None: + ray.data.DataContext.get_current().target_max_block_size = batch_size + ds = ray.data.read_parquet(file_path, columns=columns) + + assert ds.count() == num_rows, ( + f"Column selection {columns} with batch_size={batch_size} " + f"returned {ds.count()} rows, expected {num_rows}" + ) + + assert set(ds.schema().names) == set(columns), ( + f"Column selection {columns} with batch_size={batch_size} " + f"returned columns {ds.schema().names}" + ) + + if __name__ == "__main__": import sys diff --git a/python/ray/data/tests/test_raydp.py b/python/ray/data/tests/test_raydp.py index 84576aedaed9..633018798182 100644 --- a/python/ray/data/tests/test_raydp.py +++ b/python/ray/data/tests/test_raydp.py @@ -1,7 +1,6 @@ import pandas import pytest import raydp -import torch import ray from ray.data.tests.conftest import * # noqa @@ -58,19 +57,6 @@ def test_from_spark_e2e(spark): _check_usage_record(["FromArrow"]) -def test_raydp_to_torch_iter(spark): - spark_df = spark.createDataFrame([(1, 0), (2, 0), (3, 1)], ["feature", "label"]) - data_size = spark_df.count() - features = [r["feature"] for r in spark_df.take(data_size)] - features = torch.tensor(features).reshape(data_size, 1) - labels = [r["label"] for r in spark_df.take(data_size)] - labels = torch.tensor(labels).reshape(data_size, 1) - ds = ray.data.from_spark(spark_df) - dataset = 
ds.to_torch(label_column="label", batch_size=3) - data_features, data_labels = next(dataset.__iter__()) - assert torch.equal(data_features, features) and torch.equal(data_labels, labels) - - def test_to_pandas(spark): df = spark.range(100) ds = ray.data.from_spark(df) diff --git a/python/ray/data/tests/test_resource_manager.py b/python/ray/data/tests/test_resource_manager.py index 90b69361e51b..fe03b8d1d3be 100644 --- a/python/ray/data/tests/test_resource_manager.py +++ b/python/ray/data/tests/test_resource_manager.py @@ -10,8 +10,10 @@ ExecutionResources, ) from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer +from ray.data._internal.execution.operators.join import JoinOperator from ray.data._internal.execution.operators.limit_operator import LimitOperator from ray.data._internal.execution.operators.map_operator import MapOperator +from ray.data._internal.execution.operators.union_operator import UnionOperator from ray.data._internal.execution.resource_manager import ( ReservationOpResourceAllocator, ResourceManager, @@ -45,6 +47,45 @@ def mock_map_op( return op +def mock_union_op( + input_ops, + incremental_resource_usage=None, +): + op = UnionOperator( + DataContext.get_current(), + *input_ops, + ) + op.start = MagicMock(side_effect=lambda _: None) + if incremental_resource_usage is not None: + op.incremental_resource_usage = MagicMock( + return_value=incremental_resource_usage + ) + return op + + +def mock_join_op( + left_input_op, + right_input_op, + incremental_resource_usage=None, +): + op = JoinOperator( + DataContext.get_current(), + left_input_op, + right_input_op, + ("id",), + ("id",), + "inner", + num_partitions=1, + ) + + op.start = MagicMock(side_effect=lambda _: None) + if incremental_resource_usage is not None: + op.incremental_resource_usage = MagicMock( + return_value=incremental_resource_usage + ) + return op + + class TestResourceManager: """Unit tests for ResourceManager.""" @@ -740,6 +781,318 @@ def 
test_gpu_usage_exceeds_global_limits(self, restore_data_context): assert allocator._op_budgets[o2].gpu == 0 + def test_get_ineligible_ops_with_usage(self, restore_data_context): + DataContext.get_current().op_resource_reservation_enabled = True + + o1 = InputDataBuffer(DataContext.get_current(), []) + o2 = mock_map_op( + o1, + ) + o3 = LimitOperator(1, o2, DataContext.get_current()) + o4 = mock_map_op( + o3, + ) + o5 = mock_map_op( + o4, + ) + o1.mark_execution_finished() + o2.mark_execution_finished() + + topo, _ = build_streaming_topology(o5, ExecutionOptions()) + + resource_manager = ResourceManager( + topo, ExecutionOptions(), MagicMock(), DataContext.get_current() + ) + + allocator = resource_manager._op_resource_allocator + + ops_to_exclude = allocator._get_ineligible_ops_with_usage() + assert len(ops_to_exclude) == 2 + assert set(ops_to_exclude) == {o2, o3} + + def test_get_ineligible_ops_with_usage_complex_graph(self, restore_data_context): + """ + o1 (InputDataBuffer) + | + v + o2 (MapOperator, completed) + | + v + o3 (LimitOperator) + | + v o4 (InputDataBuffer) + | | + | v + | o5 (MapOperator, completed) + | | + v v + o6 (UnionOperator) <-- + | + v + o8 (JoinOperator) <-- o7 (InputDataBuffer, completed) + """ + DataContext.get_current().op_resource_reservation_enabled = True + + o1 = InputDataBuffer(DataContext.get_current(), []) + o2 = mock_map_op( + o1, + ) + o3 = LimitOperator(1, o2, DataContext.get_current()) + o4 = InputDataBuffer(DataContext.get_current(), []) + o5 = mock_map_op( + o4, + ) + o6 = mock_union_op([o3, o5]) + o7 = InputDataBuffer(DataContext.get_current(), []) + o8 = mock_join_op(o7, o6) + + o1.mark_execution_finished() + o2.mark_execution_finished() + o4.mark_execution_finished() + o5.mark_execution_finished() + o7.mark_execution_finished() + + topo, _ = build_streaming_topology(o8, ExecutionOptions()) + + resource_manager = ResourceManager( + topo, ExecutionOptions(), MagicMock(), DataContext.get_current() + ) + + allocator = 
resource_manager._op_resource_allocator + + ops_to_exclude = allocator._get_ineligible_ops_with_usage() + assert len(ops_to_exclude) == 4 + assert set(ops_to_exclude) == {o2, o3, o5, o7} + + def test_reservation_accounts_for_completed_ops(self, restore_data_context): + """Test that resource reservation properly accounts for completed ops.""" + DataContext.get_current().op_resource_reservation_enabled = True + DataContext.get_current().op_resource_reservation_ratio = 0.5 + + o1 = InputDataBuffer(DataContext.get_current(), []) + o2 = mock_map_op(o1, incremental_resource_usage=ExecutionResources(1, 0, 10)) + o3 = mock_map_op(o2, incremental_resource_usage=ExecutionResources(1, 0, 10)) + o4 = mock_map_op(o3, incremental_resource_usage=ExecutionResources(1, 0, 10)) + o1.mark_execution_finished() + o2.mark_execution_finished() + + op_usages = { + o1: ExecutionResources.zero(), + o2: ExecutionResources(cpu=2, object_store_memory=50), + o3: ExecutionResources.zero(), + o4: ExecutionResources.zero(), + } + op_internal_usage = dict.fromkeys([o1, o2, o3, o4], 0) + op_outputs_usages = dict.fromkeys([o1, o2, o3, o4], 0) + + topo, _ = build_streaming_topology(o4, ExecutionOptions()) + + global_limits = ExecutionResources(cpu=10, object_store_memory=250) + + resource_manager = ResourceManager( + topo, ExecutionOptions(), MagicMock(), DataContext.get_current() + ) + resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op]) + resource_manager._mem_op_internal = op_internal_usage + resource_manager._mem_op_outputs = op_outputs_usages + resource_manager.get_global_limits = MagicMock(return_value=global_limits) + + allocator = resource_manager._op_resource_allocator + allocator.update_usages() + + # Check that o2's usage was subtracted from remaining resources + # global_limits (10 CPU, 250 mem) - o1 usage (0) - o2 usage (2 CPU, 50 mem) = remaining (8 CPU, 200 mem) + # With 2 eligible ops (o3, o4) and 50% reservation ratio: + # Each op gets reserved: (8 CPU, 200 
mem) * 0.5 / 2 = (2 CPU, 50 mem) + + # Verify that reservations are calculated correctly + assert allocator._op_reserved[o3].cpu == 2.0 + assert allocator._op_reserved[o4].cpu == 2.0 + + # The total reserved memory should account for o2's usage being subtracted + total_reserved_memory = ( + allocator._op_reserved[o3].object_store_memory + + allocator._reserved_for_op_outputs[o3] + + allocator._op_reserved[o4].object_store_memory + + allocator._reserved_for_op_outputs[o4] + ) + + assert abs(total_reserved_memory - 100) < 1.0 + + def test_reservation_accounts_for_completed_ops_complex_graph( + self, restore_data_context + ): + """ + o1 (InputDataBuffer) + | + v + o2 (MapOperator, completed) + | + v + o3 (LimitOperator) + | + v o4 (InputDataBuffer) + | | + | v + | o5 (MapOperator, completed) + | | + v v + o6 (UnionOperator) <-- + | + v + o8 (JoinOperator) <-- o7 (InputDataBuffer, completed) + """ + DataContext.get_current().op_resource_reservation_enabled = True + DataContext.get_current().op_resource_reservation_ratio = 0.5 + + o1 = InputDataBuffer(DataContext.get_current(), []) + o2 = mock_map_op(o1, incremental_resource_usage=ExecutionResources(1, 0, 15)) + o3 = LimitOperator(1, o2, DataContext.get_current()) + o4 = InputDataBuffer(DataContext.get_current(), []) + o5 = mock_map_op(o4, incremental_resource_usage=ExecutionResources(1, 0, 10)) + o6 = mock_union_op( + [o3, o5], incremental_resource_usage=ExecutionResources(1, 0, 20) + ) + o7 = InputDataBuffer(DataContext.get_current(), []) + o8 = mock_join_op( + o7, o6, incremental_resource_usage=ExecutionResources(1, 0, 30) + ) + + o1.mark_execution_finished() + o2.mark_execution_finished() + o4.mark_execution_finished() + o5.mark_execution_finished() + o7.mark_execution_finished() + + op_usages = { + o1: ExecutionResources.zero(), + o2: ExecutionResources(cpu=2, object_store_memory=150), + o3: ExecutionResources(cpu=2, object_store_memory=50), + o4: ExecutionResources.zero(), + o5: ExecutionResources(cpu=3, 
object_store_memory=100), + o6: ExecutionResources.zero(), + o7: ExecutionResources(cpu=1, object_store_memory=100), + o8: ExecutionResources.zero(), + } + op_internal_usage = dict.fromkeys([o1, o2, o3, o4, o5, o6, o7, o8], 0) + op_outputs_usages = dict.fromkeys([o1, o2, o3, o4, o5, o6, o7, o8], 0) + + topo, _ = build_streaming_topology(o8, ExecutionOptions()) + + global_limits = ExecutionResources.zero() + + def mock_get_global_limits(): + nonlocal global_limits + return global_limits + + resource_manager = ResourceManager( + topo, ExecutionOptions(), MagicMock(), DataContext.get_current() + ) + resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op]) + resource_manager.get_global_limits = MagicMock( + side_effect=mock_get_global_limits + ) + resource_manager._mem_op_internal = op_internal_usage + resource_manager._mem_op_outputs = op_outputs_usages + + allocator = resource_manager._op_resource_allocator + global_limits = ExecutionResources(cpu=20, object_store_memory=2000) + allocator.update_usages() + """ + global_limits (20 CPU, 2000 mem) - o2 usage (2 CPU, 150 mem) - o3 usage (2 CPU, 50 mem) - o5 usage (3 CPU, 100 mem) - o7 usage (1 CPU, 100 mem) = remaining (12 CPU, 1600 mem) + +-----+------------------+------------------+--------------+ + | | _op_reserved | _reserved_for | used shared | + | | (used/remaining) | _op_outputs | resources | + | | | (used/remaining) | | + +-----+------------------+------------------+--------------+ + | op6 | 0/200 | 0/200 | 0 | + +-----+------------------+------------------+--------------+ + | op8 | 0/200 | 0/200 | 0 | + +-----+------------------+------------------+--------------+ + """ + assert set(allocator._op_budgets.keys()) == {o6, o8} + assert set(allocator._op_reserved.keys()) == {o6, o8} + assert allocator._op_reserved[o6] == ExecutionResources( + cpu=3, object_store_memory=200 + ) + assert allocator._op_reserved[o8] == ExecutionResources( + cpu=3, object_store_memory=200 + ) + assert 
allocator._reserved_for_op_outputs[o6] == 200 + assert allocator._reserved_for_op_outputs[o8] == 200 + assert allocator._total_shared == ExecutionResources( + cpu=6, object_store_memory=800 + ) + assert allocator._op_budgets[o6] == ExecutionResources( + cpu=6, object_store_memory=600 + ) + assert allocator._op_budgets[o8] == ExecutionResources( + cpu=6, object_store_memory=600 + ) + + # Test when resources are used. + op_usages[o6] = ExecutionResources(2, 0, 500) + op_internal_usage[o6] = 300 + op_outputs_usages[o6] = 200 + op_usages[o8] = ExecutionResources(2, 0, 100) + op_internal_usage[o8] = 50 + op_outputs_usages[o8] = 50 + """ + +-----+------------------+------------------+--------------+ + | | _op_reserved | _reserved_for | used shared | + | | (used/remaining) | _op_outputs | resources | + | | | (used/remaining) | | + +-----+------------------+------------------+--------------+ + | op6 | 200/0 | 200/0 | 100 | + +-----+------------------+------------------+--------------+ + | op8 | 50/150 | 50/150 | 0 | + +-----+------------------+------------------+--------------+ + """ + allocator.update_usages() + assert allocator._op_budgets[o6] == ExecutionResources( + cpu=4, object_store_memory=350 + ) + assert allocator._op_budgets[o8] == ExecutionResources( + cpu=4, object_store_memory=500 + ) + + # Test when completed ops update the usage. 
+ op_usages[o5] = ExecutionResources.zero() + allocator.update_usages() + """ + global_limits (20 CPU, 2000 mem) - o2 usage (2 CPU, 150 mem) - o3 usage (2 CPU, 50 mem) - o5 usage (0 CPU, 0 mem) - o7 usage (1 CPU, 100 mem) = remaining (15 CPU, 1700 mem) + +-----+------------------+------------------+--------------+ + | | _op_reserved | _reserved_for | used shared | + | | (used/remaining) | _op_outputs | resources | + | | | (used/remaining) | | + +-----+------------------+------------------+--------------+ + | op6 | 213/0 | 200/13 | 300-213=87 | + +-----+------------------+------------------+--------------+ + | op8 | 50/163 | 50/163 | 0 | + +-----+------------------+------------------+--------------+ + """ + assert set(allocator._op_budgets.keys()) == {o6, o8} + assert set(allocator._op_reserved.keys()) == {o6, o8} + assert allocator._op_reserved[o6] == ExecutionResources( + cpu=3.75, object_store_memory=213 + ) + assert allocator._op_reserved[o8] == ExecutionResources( + cpu=3.75, object_store_memory=213 + ) + assert allocator._reserved_for_op_outputs[o6] == 212 + assert allocator._reserved_for_op_outputs[o8] == 212 + assert allocator._total_shared == ExecutionResources( + cpu=7.5, object_store_memory=850 + ) + # object_store_memory budget = 0 + (850 - 87) / 2 = 381 (rounded down) + assert allocator._op_budgets[o6] == ExecutionResources( + cpu=5.5, object_store_memory=381 + ) + # object_store_memory budget = 163 + (850 - 87) / 2 = 545 (rounded up) + assert allocator._op_budgets[o8] == ExecutionResources( + cpu=5.5, object_store_memory=545 + ) + if __name__ == "__main__": import sys diff --git a/python/ray/data/tests/test_size_estimation.py b/python/ray/data/tests/test_size_estimation.py index 7615b1a3beea..d23610b86261 100644 --- a/python/ray/data/tests/test_size_estimation.py +++ b/python/ray/data/tests/test_size_estimation.py @@ -146,7 +146,7 @@ def gen(name): nrow = ds2._block_num_rows() assert 2 < len(nrow) < 5, nrow for x in nrow[:-1]: - assert 50000 < x < 
95000, (x, nrow) + assert 50000 < x < 96000, (x, nrow) # 1MiB ctx.target_max_block_size = 1_000_000 diff --git a/python/ray/data/tests/test_sort.py b/python/ray/data/tests/test_sort.py index c97207e01f4b..7467b86c9f02 100644 --- a/python/ray/data/tests/test_sort.py +++ b/python/ray/data/tests/test_sort.py @@ -558,9 +558,9 @@ def options(**task_options): def patch_ray_get(callback): original_ray_get = ray.get - def ray_get_override(object_refs): + def ray_get_override(object_refs, *args, **kwargs): callback(object_refs) - return original_ray_get(object_refs) + return original_ray_get(object_refs, *args, **kwargs) ray.get = ray_get_override return original_ray_get diff --git a/python/ray/data/tests/test_state_export.py b/python/ray/data/tests/test_state_export.py index 58beeff858a9..4bacfbbb6ffe 100644 --- a/python/ray/data/tests/test_state_export.py +++ b/python/ray/data/tests/test_state_export.py @@ -1,11 +1,14 @@ import json import os -from dataclasses import asdict +from dataclasses import asdict, dataclass +from typing import Tuple import pytest import ray from ray.data import DataContext +from ray.data._internal.execution.dataset_state import DatasetState +from ray.data._internal.logical.interfaces import LogicalOperator from ray.data._internal.metadata_exporter import ( UNKNOWN, Operator, @@ -62,9 +65,81 @@ def ray_start_cluster_with_export_api_write(shutdown_only): yield res +@dataclass +class TestDataclass: + """A test dataclass for testing dataclass serialization.""" + + list_field: list = None + dict_field: dict = None + string_field: str = "test" + int_field: int = 1 + float_field: float = 1.0 + set_field: set = None + tuple_field: Tuple[int] = None + bool_field: bool = True + none_field: None = None + + def __post_init__(self): + self.list_field = [1, 2, 3] + self.dict_field = {1: 2, "3": "4"} + self.set_field = {1, 2, 3} + self.tuple_field = (1, 2, 3) + + +class DummyLogicalOperator(LogicalOperator): + """A dummy logical operator for testing 
_get_logical_args with various data types.""" + + def __init__(self, input_op=None): + super().__init__("DummyOperator", []) + + # Test various data types that might be returned by _get_logical_args + self._string_value = "test_string" + self._int_value = 42 + self._float_value = 3.14 + self._bool_value = True + self._none_value = None + self._list_value = [1, 2, 3, "string", None] + self._dict_value = {"key1": "value1", "key2": 123, "key3": None} + self._nested_dict = { + "level1": { + "level2": { + "level3": "deep_value", + "numbers": [1, 2, 3], + "mixed": {"a": 1, "b": "string", "c": None}, + } + } + } + self._tuple_value = (1, "string", None, 3.14) + self._set_value = {1} + self._bytes_value = b"binary_data" + self._complex_dict = { + "string_keys": {"a": 1, "b": 2}, + "int_keys": {1: "one", 2: "two"}, # This should cause issues if not handled + "mixed_keys": {"str": "value", 1: "int_key", None: "none_key"}, + } + self._empty_containers = { + "empty_list": [], + "empty_dict": {}, + "empty_tuple": (), + "empty_set": set(), + } + self._special_values = { + "zero": 0, + "negative": -1, + "large_int": 999999999999999999, + "small_float": 0.0000001, + "inf": float("inf"), + "neg_inf": float("-inf"), + "nan": float("nan"), + } + + self._data_class = TestDataclass() + + @pytest.fixture def dummy_dataset_topology(): """Create a dummy Topology.""" + dummy_operator = DummyLogicalOperator() dummy_topology = Topology( operators=[ Operator( @@ -73,6 +148,10 @@ def dummy_dataset_topology(): uuid="uuid_0", input_dependencies=[], sub_stages=[], + execution_start_time=1.0, + execution_end_time=1.0, + state="FINISHED", + args=sanitize_for_struct(dummy_operator._get_args()), ), Operator( name="ReadRange->Map()->Filter()", @@ -80,12 +159,188 @@ def dummy_dataset_topology(): uuid="uuid_1", input_dependencies=["Input_0"], sub_stages=[], + execution_start_time=0.0, + execution_end_time=0.0, + state="RUNNING", + args=sanitize_for_struct(dummy_operator._get_args()), ), ], ) return 
dummy_topology +@pytest.fixture +def dummy_dataset_topology_expected_output(): + return { + "operators": [ + { + "name": "Input", + "id": "Input_0", + "uuid": "uuid_0", + "args": { + "_num_outputs": "None", + "_int_value": "42", + "_special_values": { + "negative": "-1", + "inf": "inf", + "zero": "0", + "large_int": "999999999999999999", + "small_float": "1e-07", + "neg_inf": "-inf", + "nan": "nan", + }, + "_none_value": "None", + "_name": "DummyOperator", + "_output_dependencies": [], + "_float_value": "3.14", + "_list_value": ["1", "2", "3", "string", "None"], + "_dict_value": {"key1": "value1", "key3": "None", "key2": "123"}, + "_set_value": ["1"], + "_tuple_value": ["1", "string", "None", "3.14"], + "_bytes_value": [ + "98", + "105", + "110", + "97", + "114", + "121", + "95", + "100", + "97", + "116", + "97", + ], + "_input_dependencies": [], + "_empty_containers": { + "empty_set": [], + "empty_tuple": [], + "empty_dict": {}, + "empty_list": [], + }, + "_bool_value": "True", + "_nested_dict": { + "level1": { + "level2": { + "mixed": {"a": "1", "b": "string", "c": "None"}, + "numbers": ["1", "2", "3"], + "level3": "deep_value", + } + } + }, + "_string_value": "test_string", + "_complex_dict": { + "string_keys": {"a": "1", "b": "2"}, + "mixed_keys": { + "None": "none_key", + "str": "value", + "1": "int_key", + }, + "int_keys": {"1": "one", "2": "two"}, + }, + "_data_class": { + "list_field": ["1", "2", "3"], + "dict_field": {"3": "4", "1": "2"}, + "tuple_field": ["1", "2", "3"], + "set_field": ["1", "2", "3"], + "int_field": "1", + "none_field": "None", + "bool_field": "True", + "string_field": "test", + "float_field": "1.0", + }, + }, + "input_dependencies": [], + "sub_stages": [], + "execution_start_time": 1.0, + "execution_end_time": 1.0, + "state": "FINISHED", + }, + { + "name": "ReadRange->Map()->Filter()", + "id": "ReadRange->Map()->Filter()_1", + "uuid": "uuid_1", + "input_dependencies": ["Input_0"], + "args": { + "_num_outputs": "None", + "_int_value": 
"42", + "_special_values": { + "negative": "-1", + "inf": "inf", + "zero": "0", + "large_int": "999999999999999999", + "small_float": "1e-07", + "neg_inf": "-inf", + "nan": "nan", + }, + "_none_value": "None", + "_name": "DummyOperator", + "_output_dependencies": [], + "_float_value": "3.14", + "_list_value": ["1", "2", "3", "string", "None"], + "_dict_value": {"key1": "value1", "key3": "None", "key2": "123"}, + "_set_value": ["1"], + "_tuple_value": ["1", "string", "None", "3.14"], + "_bytes_value": [ + "98", + "105", + "110", + "97", + "114", + "121", + "95", + "100", + "97", + "116", + "97", + ], + "_input_dependencies": [], + "_empty_containers": { + "empty_set": [], + "empty_tuple": [], + "empty_dict": {}, + "empty_list": [], + }, + "_bool_value": "True", + "_nested_dict": { + "level1": { + "level2": { + "mixed": {"a": "1", "b": "string", "c": "None"}, + "numbers": ["1", "2", "3"], + "level3": "deep_value", + } + } + }, + "_string_value": "test_string", + "_complex_dict": { + "string_keys": {"a": "1", "b": "2"}, + "mixed_keys": { + "None": "none_key", + "str": "value", + "1": "int_key", + }, + "int_keys": {"1": "one", "2": "two"}, + }, + "_data_class": { + "list_field": ["1", "2", "3"], + "dict_field": {"3": "4", "1": "2"}, + "tuple_field": ["1", "2", "3"], + "set_field": ["1", "2", "3"], + "int_field": "1", + "none_field": "None", + "bool_field": "True", + "string_field": "test", + "float_field": "1.0", + }, + }, + "sub_stages": [], + "execution_start_time": 0.0, + "execution_end_time": 0.0, + "state": "RUNNING", + }, + ] + } + + def test_export_disabled(ray_start_regular, dummy_dataset_topology): """Test that no export files are created when export API is disabled.""" stats_actor = _get_or_create_stats_actor() @@ -105,7 +360,7 @@ def test_export_disabled(ray_start_regular, dummy_dataset_topology): assert not os.path.exists(_get_export_file_path()) -def _test_dataset_metadata_export(topology): +def _test_dataset_metadata_export(topology, 
dummy_dataset_topology_expected_output): """Test that dataset metadata export events are written when export API is enabled.""" stats_actor = _get_or_create_stats_actor() @@ -124,22 +379,30 @@ def _test_dataset_metadata_export(topology): data = _get_exported_data() assert len(data) == 1 assert data[0]["source_type"] == "EXPORT_DATASET_METADATA" - assert data[0]["event_data"]["topology"] == sanitize_for_struct(asdict(topology)) + assert data[0]["event_data"]["topology"] == dummy_dataset_topology_expected_output assert data[0]["event_data"]["dataset_id"] == STUB_DATASET_ID assert data[0]["event_data"]["job_id"] == STUB_JOB_ID assert data[0]["event_data"]["start_time"] is not None def test_export_dataset_metadata_enabled_by_config( - ray_start_cluster_with_export_api_config, dummy_dataset_topology + ray_start_cluster_with_export_api_config, + dummy_dataset_topology, + dummy_dataset_topology_expected_output, ): - _test_dataset_metadata_export(dummy_dataset_topology) + _test_dataset_metadata_export( + dummy_dataset_topology, dummy_dataset_topology_expected_output + ) def test_export_dataset_metadata( - ray_start_cluster_with_export_api_write, dummy_dataset_topology + ray_start_cluster_with_export_api_write, + dummy_dataset_topology, + dummy_dataset_topology_expected_output, ): - _test_dataset_metadata_export(dummy_dataset_topology) + _test_dataset_metadata_export( + dummy_dataset_topology, dummy_dataset_topology_expected_output + ) @pytest.mark.parametrize( @@ -181,7 +444,9 @@ def __call__(self, x): def test_export_multiple_datasets( - ray_start_cluster_with_export_api_write, dummy_dataset_topology + ray_start_cluster_with_export_api_write, + dummy_dataset_topology, + dummy_dataset_topology_expected_output, ): """Test that multiple datasets can be exported when export API is enabled.""" stats_actor = _get_or_create_stats_actor() @@ -195,6 +460,9 @@ def test_export_multiple_datasets( uuid="second_uuid_0", input_dependencies=[], sub_stages=[], + execution_start_time=1.0, 
+ execution_end_time=1.0, + state="FINISHED", ), Operator( name="ReadRange->Map()", @@ -202,6 +470,9 @@ def test_export_multiple_datasets( uuid="second_uuid_1", input_dependencies=["Input_0"], sub_stages=[], + execution_start_time=2.0, + execution_end_time=0.0, + state="RUNNING", ), ], ) @@ -245,8 +516,8 @@ def test_export_multiple_datasets( ), f"First dataset {first_dataset_id} not found in exported data" first_entry = datasets_by_id[first_dataset_id] assert first_entry["source_type"] == "EXPORT_DATASET_METADATA" - assert first_entry["event_data"]["topology"] == sanitize_for_struct( - asdict(dummy_dataset_topology) + assert ( + first_entry["event_data"]["topology"] == dummy_dataset_topology_expected_output ) assert first_entry["event_data"]["job_id"] == STUB_JOB_ID assert first_entry["event_data"]["start_time"] is not None @@ -257,9 +528,7 @@ def test_export_multiple_datasets( ), f"Second dataset {second_dataset_id} not found in exported data" second_entry = datasets_by_id[second_dataset_id] assert second_entry["source_type"] == "EXPORT_DATASET_METADATA" - assert second_entry["event_data"]["topology"] == sanitize_for_struct( - asdict(second_topology) - ) + assert second_entry["event_data"]["topology"] == asdict(second_topology) assert second_entry["event_data"]["job_id"] == STUB_JOB_ID assert second_entry["event_data"]["start_time"] is not None @@ -287,12 +556,12 @@ def __str__(self): @pytest.mark.parametrize( "input_obj,expected_output,truncate_length", [ - # Basic types - should return as-is - (42, 42, 100), - (3.14, 3.14, 100), - (True, True, 100), - (False, False, 100), - (None, None, 100), + # Basic types - should return as strings + (42, "42", 100), + (3.14, "3.14", 100), + (True, "True", 100), + (False, "False", 100), + (None, "None", 100), # Strings - short strings return as-is ("hello", "hello", 100), # Strings - long strings get truncated @@ -302,36 +571,157 @@ def __str__(self): ({"key": "value"}, {"key": "value"}, 100), ({"long_key": "a" * 150}, 
{"long_key": "a" * 100 + "..."}, 100), ({"nested": {"inner": "value"}}, {"nested": {"inner": "value"}}, 100), - # Sequences - should recursively sanitize elements - ([1, 2, 3], [1, 2, 3], 100), + # Sequences - should recursively sanitize elements (convert to strings) + ([1, 2, 3], ["1", "2", "3"], 100), (["short", "a" * 150], ["short", "a" * 100 + "..."], 100), # Complex nested structures ( {"list": [1, "a" * 150], "dict": {"key": "a" * 150}}, - {"list": [1, "a" * 100 + "..."], "dict": {"key": "a" * 100 + "..."}}, + {"list": ["1", "a" * 100 + "..."], "dict": {"key": "a" * 100 + "..."}}, 100, ), # Objects that can be converted to string (BasicObject("test"), "BasicObject(test)", 100), # Falls back to str() - # Objects that can't be JSON serialized but can be stringified - ({1, 2, 3}, "{1, 2, 3}", 100), # Falls back to str() + # Sets can be converted to Lists of strings + ({1, 2, 3}, ["1", "2", "3"], 100), + ((1, 2, 3), ["1", "2", "3"], 100), # Objects that can't be serialized or stringified - (UnserializableObject(), UNKNOWN, 100), + (UnserializableObject(), f"{UNKNOWN}: {UnserializableObject.__name__}", 100), # Empty containers ({}, {}, 100), ([], [], 100), - # Mixed type sequences + # Mixed type sequences - all converted to strings ( [1, "hello", {"key": "value"}, None], - [1, "hello", {"key": "value"}, None], + ["1", "hello", {"key": "value"}, "None"], 100, ), + # Bytearrays/bytes - should be converted to lists of string representations + (bytearray(b"hello"), ["104", "101", "108", "108", "111"], 100), + (bytearray([1, 2, 3, 4, 5]), ["1", "2", "3", "4", "5"], 100), + (bytes(b"test"), ["116", "101", "115", "116"], 100), + # Dataclass + ( + TestDataclass(), + { + "list_field": ["1", "2", "3"], + "dict_field": {"1": "2", "3": "4"}, # key should be strings + "string_field": "test", + "int_field": "1", + "float_field": "1.0", + "set_field": [ + "1", + "2", + "3", + ], # sets will be converted to Lists of strings + "tuple_field": [ + "1", + "2", + "3", + ], # tuples 
will be converted to Lists of strings + "bool_field": "True", + "none_field": "None", + }, + 100, + ), + # Test sequence truncation - list longer than truncate_length gets truncated + ( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + ["1", "2", "3", "..."], # Only first 3 elements after truncation + ... + 3, + ), ], ) def test_sanitize_for_struct(input_obj, expected_output, truncate_length): """Test sanitize_for_struct with various input types and truncation lengths.""" result = sanitize_for_struct(input_obj, truncate_length) - assert result == expected_output + assert result == expected_output, f"Expected {expected_output}, got {result}" + + +def test_update_dataset_metadata_state( + ray_start_cluster_with_export_api_write, dummy_dataset_topology +): + """Test dataset state update at the export API""" + stats_actor = _get_or_create_stats_actor() + # Register dataset + ray.get( + stats_actor.register_dataset.remote( + job_id=STUB_JOB_ID, + dataset_tag=STUB_DATASET_ID, + operator_tags=["Input_0", "ReadRange->Map()->Filter()_1"], + topology=dummy_dataset_topology, + data_context=DataContext.get_current(), + ) + ) + # Check that export files were created as expected + data = _get_exported_data() + assert len(data) == 1 + assert data[0]["event_data"]["state"] == DatasetState.PENDING.name + + # Test update state to RUNNING + ray.get( + stats_actor.update_dataset_metadata_state.remote( + dataset_id=STUB_DATASET_ID, new_state=DatasetState.RUNNING.name + ) + ) + data = _get_exported_data() + assert len(data) == 2 + assert data[1]["event_data"]["state"] == DatasetState.RUNNING.name + assert data[1]["event_data"]["execution_start_time"] > 0 + + # Test update to FINISHED + ray.get( + stats_actor.update_dataset_metadata_state.remote( + dataset_id=STUB_DATASET_ID, new_state=DatasetState.FINISHED.name + ) + ) + data = _get_exported_data() + assert len(data) == 3 + assert data[2]["event_data"]["state"] == DatasetState.FINISHED.name + assert data[2]["event_data"]["execution_end_time"] > 0 + 
assert ( + data[2]["event_data"]["topology"]["operators"][1]["state"] + == DatasetState.FINISHED.name + ) + assert data[2]["event_data"]["topology"]["operators"][1]["execution_end_time"] > 0 + + +def test_update_dataset_metadata_operator_states( + ray_start_cluster_with_export_api_write, dummy_dataset_topology +): + stats_actor = _get_or_create_stats_actor() + # Register dataset + ray.get( + stats_actor.register_dataset.remote( + dataset_tag=STUB_DATASET_ID, + operator_tags=["Input_0", "ReadRange->Map()->Filter()_1"], + topology=dummy_dataset_topology, + job_id=STUB_JOB_ID, + data_context=DataContext.get_current(), + ) + ) + data = _get_exported_data() + assert len(data) == 1 + assert ( + data[0]["event_data"]["topology"]["operators"][1]["state"] + == DatasetState.RUNNING.name + ) + + # Test update to FINISHED + operator_id = "ReadRange->Map()->Filter()_1" + ray.get( + stats_actor.update_dataset_metadata_operator_states.remote( + dataset_id=STUB_DATASET_ID, + operator_states={operator_id: DatasetState.FINISHED.name}, + ) + ) + data = _get_exported_data() + assert len(data) == 2 + assert ( + data[1]["event_data"]["topology"]["operators"][1]["state"] + == DatasetState.FINISHED.name + ) + assert data[1]["event_data"]["topology"]["operators"][1]["execution_end_time"] > 0 if __name__ == "__main__": diff --git a/python/ray/data/tests/test_stats.py b/python/ray/data/tests/test_stats.py index 23953efe059b..07cb5c359fd2 100644 --- a/python/ray/data/tests/test_stats.py +++ b/python/ray/data/tests/test_stats.py @@ -22,6 +22,7 @@ from ray.data._internal.execution.backpressure_policy.backpressure_policy import ( BackpressurePolicy, ) +from ray.data._internal.execution.dataset_state import DatasetState from ray.data._internal.execution.interfaces.op_runtime_metrics import TaskDurationStats from ray.data._internal.execution.interfaces.physical_operator import PhysicalOperator from ray.data._internal.stats import ( @@ -29,6 +30,7 @@ NodeMetrics, StatsManager, 
_get_or_create_stats_actor, + _StatsActor, ) from ray.data._internal.util import MemoryProfiler from ray.data.context import DataContext @@ -73,6 +75,8 @@ def gen_expected_metrics( if is_map: metrics = [ "'average_num_outputs_per_task': N", + "'average_num_inputs_per_task': N", + "'num_output_blocks_per_task_s': N", "'average_bytes_per_output': N", "'obj_store_mem_internal_inqueue': Z", "'obj_store_mem_internal_outqueue': Z", @@ -99,6 +103,8 @@ def gen_expected_metrics( "'num_outputs_of_finished_tasks': N", "'bytes_outputs_of_finished_tasks': N", "'rows_outputs_of_finished_tasks': N", + "'num_external_inqueue_blocks': N", + "'num_external_inqueue_bytes': N", "'num_tasks_submitted': N", "'num_tasks_running': Z", "'num_tasks_have_outputs': N", @@ -113,7 +119,11 @@ def gen_expected_metrics( "'task_output_backpressure_time': " f"{'N' if task_output_backpressure else 'Z'}" ), - ("'mean_task_completion_time': " f"{'N' if task_backpressure else 'Z'}"), + ("'task_completion_time': " f"{'N' if task_backpressure else 'Z'}"), + ( + "'task_completion_time_without_backpressure': " + f"{'N' if task_backpressure else 'Z'}" + ), "'num_alive_actors': Z", "'num_restarting_actors': Z", "'num_pending_actors': Z", @@ -128,6 +138,8 @@ def gen_expected_metrics( else: metrics = [ "'average_num_outputs_per_task': None", + "'average_num_inputs_per_task': None", + "'num_output_blocks_per_task_s': None", "'average_bytes_per_output': None", "'obj_store_mem_internal_inqueue': Z", "'obj_store_mem_internal_outqueue': Z", @@ -154,6 +166,8 @@ def gen_expected_metrics( "'num_outputs_of_finished_tasks': Z", "'bytes_outputs_of_finished_tasks': Z", "'rows_outputs_of_finished_tasks': Z", + "'num_external_inqueue_blocks': N", + "'num_external_inqueue_bytes': N", "'num_tasks_submitted': Z", "'num_tasks_running': Z", "'num_tasks_have_outputs': Z", @@ -168,7 +182,11 @@ def gen_expected_metrics( "'task_output_backpressure_time': " f"{'N' if task_output_backpressure else 'Z'}" ), - 
("'mean_task_completion_time': " f"{'N' if task_backpressure else 'Z'}"), + ("'task_completion_time': " f"{'N' if task_backpressure else 'Z'}"), + ( + "'task_completion_time_without_backpressure': " + f"{'N' if task_backpressure else 'Z'}" + ), "'num_alive_actors': Z", "'num_restarting_actors': Z", "'num_pending_actors': Z", @@ -282,6 +300,14 @@ def canonicalize(stats: str, filter_global_stats: bool = True) -> str: ) # Handle floats in (0, 1) canonicalized_stats = re.sub(r" (0\.0*[1-9][0-9]*)", " N", canonicalized_stats) + # Replace input rows value (0 or non-0) with 'N' while keeping key prefix + canonicalized_stats = re.sub( + r"(Total input num rows: )\d+(\.\d+)?", r"\g<1>N", canonicalized_stats + ) + # Replace output rows value (0 or non-0) with 'N' while keeping key prefix + canonicalized_stats = re.sub( + r"(Total output num rows: )\d+(\.\d+)?", r"\g<1>N", canonicalized_stats + ) # Handle zero values specially so we can check for missing values. canonicalized_stats = re.sub(r" [0]+(\.[0])?", " Z", canonicalized_stats) # Scientific notation for small or large numbers @@ -370,6 +396,8 @@ def test_streaming_split_stats(ray_start_regular_shared, restore_data_context): * Output rows per task: N min, N max, N mean, N tasks used * Tasks per node: N min, N max, N mean; N nodes used * Operator throughput: + * Total input num rows: N rows + * Total output num rows: N rows * Ray Data throughput: N rows/s * Estimated single node throughput: N rows/s * Extra metrics: {extra_metrics_1} @@ -383,6 +411,7 @@ def test_streaming_split_stats(ray_start_regular_shared, restore_data_context): * Total time overall: T * Total time in Ray Data iterator initialization code: T * Total time user thread is blocked by Ray Data iter_batches: T + * Total time spent waiting for the first batch after starting iteration: T * Total execution time for user thread: T * Batch iteration time breakdown (summed across prefetch threads): * In ray.get(): T min, T max, T avg, T total @@ -433,6 +462,8 @@ 
def test_large_args_scheduling_strategy( f"* Output rows per task: N min, N max, N mean, N tasks used\n" f"* Tasks per node: N min, N max, N mean; N nodes used\n" f"* Operator throughput:\n" + f" * Total input num rows: N rows\n" + f" * Total output num rows: N rows\n" f" * Ray Data throughput: N rows/s\n" f" * Estimated single node throughput: N rows/s\n" f"{read_extra_metrics}\n" @@ -446,6 +477,8 @@ def test_large_args_scheduling_strategy( f"* Output rows per task: N min, N max, N mean, N tasks used\n" f"* Tasks per node: N min, N max, N mean; N nodes used\n" f"* Operator throughput:\n" + f" * Total input num rows: N rows\n" + f" * Total output num rows: N rows\n" f" * Ray Data throughput: N rows/s\n" f" * Estimated single node throughput: N rows/s\n" f"{map_extra_metrics}" @@ -490,6 +523,8 @@ def test_dataset_stats_basic( f"* Output rows per task: N min, N max, N mean, N tasks used\n" f"* Tasks per node: N min, N max, N mean; N nodes used\n" f"* Operator throughput:\n" + f" * Total input num rows: N rows\n" + f" * Total output num rows: N rows\n" f" * Ray Data throughput: N rows/s\n" f" * Estimated single node throughput: N rows/s\n" f"{gen_extra_metrics_str(STANDARD_EXTRA_METRICS_TASK_BACKPRESSURE, verbose_stats_logs)}" # noqa: E501 @@ -515,6 +550,8 @@ def test_dataset_stats_basic( f"* Output rows per task: N min, N max, N mean, N tasks used\n" f"* Tasks per node: N min, N max, N mean; N nodes used\n" f"* Operator throughput:\n" + f" * Total input num rows: N rows\n" + f" * Total output num rows: N rows\n" f" * Ray Data throughput: N rows/s\n" f" * Estimated single node throughput: N rows/s\n" f"{gen_extra_metrics_str(STANDARD_EXTRA_METRICS_TASK_BACKPRESSURE, verbose_stats_logs)}" # noqa: E501 @@ -545,6 +582,8 @@ def test_dataset_stats_basic( f"* Output rows per task: N min, N max, N mean, N tasks used\n" f"* Tasks per node: N min, N max, N mean; N nodes used\n" f"* Operator throughput:\n" + f" * Total input num rows: N rows\n" + f" * Total output num rows: N 
rows\n" f" * Ray Data throughput: N rows/s\n" f" * Estimated single node throughput: N rows/s\n" f"{extra_metrics}\n" @@ -558,6 +597,8 @@ def test_dataset_stats_basic( f"* Output rows per task: N min, N max, N mean, N tasks used\n" f"* Tasks per node: N min, N max, N mean; N nodes used\n" f"* Operator throughput:\n" + f" * Total input num rows: N rows\n" + f" * Total output num rows: N rows\n" f" * Ray Data throughput: N rows/s\n" f" * Estimated single node throughput: N rows/s\n" f"{extra_metrics}\n" @@ -565,6 +606,7 @@ def test_dataset_stats_basic( f"* Total time overall: T\n" f" * Total time in Ray Data iterator initialization code: T\n" f" * Total time user thread is blocked by Ray Data iter_batches: T\n" + f" * Total time spent waiting for the first batch after starting iteration: T\n" f" * Total execution time for user thread: T\n" f"* Batch iteration time breakdown (summed across prefetch threads):\n" f" * In ray.get(): T min, T max, T avg, T total\n" @@ -599,6 +641,8 @@ def test_block_location_nums(ray_start_regular_shared, restore_data_context): f"* Output rows per task: N min, N max, N mean, N tasks used\n" f"* Tasks per node: N min, N max, N mean; N nodes used\n" f"* Operator throughput:\n" + f" * Total input num rows: N rows\n" + f" * Total output num rows: N rows\n" f" * Ray Data throughput: N rows/s\n" f" * Estimated single node throughput: N rows/s\n" f"\n" @@ -606,6 +650,7 @@ def test_block_location_nums(ray_start_regular_shared, restore_data_context): f"* Total time overall: T\n" f" * Total time in Ray Data iterator initialization code: T\n" f" * Total time user thread is blocked by Ray Data iter_batches: T\n" + f" * Total time spent waiting for the first batch after starting iteration: T\n" f" * Total execution time for user thread: T\n" f"* Batch iteration time breakdown (summed across prefetch threads):\n" f" * In ray.get(): T min, T max, T avg, T total\n" @@ -637,6 +682,8 @@ def test_dataset__repr__(ray_start_regular_shared, 
restore_data_context): " number=N,\n" " extra_metrics={\n" " average_num_outputs_per_task: N,\n" + " average_num_inputs_per_task: N,\n" + " num_output_blocks_per_task_s: N,\n" " average_bytes_per_output: N,\n" " obj_store_mem_internal_inqueue: Z,\n" " obj_store_mem_internal_outqueue: Z,\n" @@ -663,6 +710,8 @@ def test_dataset__repr__(ray_start_regular_shared, restore_data_context): " num_outputs_of_finished_tasks: N,\n" " bytes_outputs_of_finished_tasks: N,\n" " rows_outputs_of_finished_tasks: N,\n" + " num_external_inqueue_blocks: N,\n" + " num_external_inqueue_bytes: N,\n" " num_tasks_submitted: N,\n" " num_tasks_running: Z,\n" " num_tasks_have_outputs: N,\n" @@ -671,7 +720,8 @@ def test_dataset__repr__(ray_start_regular_shared, restore_data_context): " block_generation_time: N,\n" " task_submission_backpressure_time: N,\n" " task_output_backpressure_time: Z,\n" - " mean_task_completion_time: N,\n" + " task_completion_time: N,\n" + " task_completion_time_without_backpressure: N,\n" " num_alive_actors: Z,\n" " num_restarting_actors: Z,\n" " num_pending_actors: Z,\n" @@ -764,6 +814,8 @@ def check_stats(): " number=N,\n" " extra_metrics={\n" " average_num_outputs_per_task: N,\n" + " average_num_inputs_per_task: N,\n" + " num_output_blocks_per_task_s: N,\n" " average_bytes_per_output: N,\n" " obj_store_mem_internal_inqueue: Z,\n" " obj_store_mem_internal_outqueue: Z,\n" @@ -790,6 +842,8 @@ def check_stats(): " num_outputs_of_finished_tasks: N,\n" " bytes_outputs_of_finished_tasks: N,\n" " rows_outputs_of_finished_tasks: N,\n" + " num_external_inqueue_blocks: N,\n" + " num_external_inqueue_bytes: N,\n" " num_tasks_submitted: N,\n" " num_tasks_running: Z,\n" " num_tasks_have_outputs: N,\n" @@ -798,7 +852,8 @@ def check_stats(): " block_generation_time: N,\n" " task_submission_backpressure_time: N,\n" " task_output_backpressure_time: Z,\n" - " mean_task_completion_time: N,\n" + " task_completion_time: N,\n" + " task_completion_time_without_backpressure: N,\n" " 
num_alive_actors: Z,\n" " num_restarting_actors: Z,\n" " num_pending_actors: Z,\n" @@ -846,6 +901,8 @@ def check_stats(): " number=N,\n" " extra_metrics={\n" " average_num_outputs_per_task: N,\n" + " average_num_inputs_per_task: N,\n" + " num_output_blocks_per_task_s: N,\n" " average_bytes_per_output: N,\n" " obj_store_mem_internal_inqueue: Z,\n" " obj_store_mem_internal_outqueue: Z,\n" @@ -872,6 +929,8 @@ def check_stats(): " num_outputs_of_finished_tasks: N,\n" " bytes_outputs_of_finished_tasks: N,\n" " rows_outputs_of_finished_tasks: N,\n" + " num_external_inqueue_blocks: N,\n" + " num_external_inqueue_bytes: N,\n" " num_tasks_submitted: N,\n" " num_tasks_running: Z,\n" " num_tasks_have_outputs: N,\n" @@ -880,7 +939,8 @@ def check_stats(): " block_generation_time: N,\n" " task_submission_backpressure_time: N,\n" " task_output_backpressure_time: Z,\n" - " mean_task_completion_time: N,\n" + " task_completion_time: N,\n" + " task_completion_time_without_backpressure: N,\n" " num_alive_actors: Z,\n" " num_restarting_actors: Z,\n" " num_pending_actors: Z,\n" @@ -980,6 +1040,8 @@ def test_dataset_stats_shuffle(ray_start_regular_shared): * Output rows per task: N min, N max, N mean, N tasks used * Tasks per node: N min, N max, N mean; N nodes used * Operator throughput: + * Total input num rows: N rows + * Total output num rows: N rows * Ray Data throughput: N rows/s * Estimated single node throughput: N rows/s @@ -993,6 +1055,8 @@ def test_dataset_stats_shuffle(ray_start_regular_shared): * Output rows per task: N min, N max, N mean, N tasks used * Tasks per node: N min, N max, N mean; N nodes used * Operator throughput: + * Total input num rows: N rows + * Total output num rows: N rows * Ray Data throughput: N rows/s * Estimated single node throughput: N rows/s @@ -1008,6 +1072,8 @@ def test_dataset_stats_shuffle(ray_start_regular_shared): * Output rows per task: N min, N max, N mean, N tasks used * Tasks per node: N min, N max, N mean; N nodes used * Operator 
throughput: + * Total input num rows: N rows + * Total output num rows: N rows * Ray Data throughput: N rows/s * Estimated single node throughput: N rows/s @@ -1021,6 +1087,8 @@ def test_dataset_stats_shuffle(ray_start_regular_shared): * Output rows per task: N min, N max, N mean, N tasks used * Tasks per node: N min, N max, N mean; N nodes used * Operator throughput: + * Total input num rows: N rows + * Total output num rows: N rows * Ray Data throughput: N rows/s * Estimated single node throughput: N rows/s @@ -1080,6 +1148,8 @@ def test_dataset_stats_range(ray_start_regular_shared, tmp_path): f"* Output rows per task: N min, N max, N mean, N tasks used\n" f"* Tasks per node: N min, N max, N mean; N nodes used\n" f"* Operator throughput:\n" + f" * Total input num rows: N rows\n" + f" * Total output num rows: N rows\n" f" * Ray Data throughput: N rows/s\n" f" * Estimated single node throughput: N rows/s\n" f"\n" @@ -1109,6 +1179,8 @@ def test_dataset_split_stats(ray_start_regular_shared, tmp_path): f"* Output rows per task: N min, N max, N mean, N tasks used\n" f"* Tasks per node: N min, N max, N mean; N nodes used\n" f"* Operator throughput:\n" + f" * Total input num rows: N rows\n" + f" * Total output num rows: N rows\n" f" * Ray Data throughput: N rows/s\n" f" * Estimated single node throughput: N rows/s\n" f"\n" @@ -1122,6 +1194,8 @@ def test_dataset_split_stats(ray_start_regular_shared, tmp_path): f"* Output rows per task: N min, N max, N mean, N tasks used\n" f"* Tasks per node: N min, N max, N mean; N nodes used\n" f"* Operator throughput:\n" + f" * Total input num rows: N rows\n" + f" * Total output num rows: N rows\n" f" * Ray Data throughput: N rows/s\n" f" * Estimated single node throughput: N rows/s\n" f"\n" @@ -1135,6 +1209,8 @@ def test_dataset_split_stats(ray_start_regular_shared, tmp_path): f"* Output rows per task: N min, N max, N mean, N tasks used\n" f"* Tasks per node: N min, N max, N mean; N nodes used\n" f"* Operator throughput:\n" + f" * 
Total input num rows: N rows\n" + f" * Total output num rows: N rows\n" f" * Ray Data throughput: N rows/s\n" f" * Estimated single node throughput: N rows/s\n" f"\n" @@ -1335,6 +1411,8 @@ def test_streaming_stats_full(ray_start_regular_shared, restore_data_context): * Output rows per task: N min, N max, N mean, N tasks used * Tasks per node: N min, N max, N mean; N nodes used * Operator throughput: + * Total input num rows: N rows + * Total output num rows: N rows * Ray Data throughput: N rows/s * Estimated single node throughput: N rows/s @@ -1342,6 +1420,7 @@ def test_streaming_stats_full(ray_start_regular_shared, restore_data_context): * Total time overall: T * Total time in Ray Data iterator initialization code: T * Total time user thread is blocked by Ray Data iter_batches: T + * Total time spent waiting for the first batch after starting iteration: T * Total execution time for user thread: T * Batch iteration time breakdown (summed across prefetch threads): * In ray.get(): T min, T max, T avg, T total @@ -1372,6 +1451,8 @@ def test_write_ds_stats(ray_start_regular_shared, tmp_path): * Output rows per task: N min, N max, N mean, N tasks used * Tasks per node: N min, N max, N mean; N nodes used * Operator throughput: + * Total input num rows: N rows + * Total output num rows: N rows * Ray Data throughput: N rows/s * Estimated single node throughput: N rows/s @@ -1403,6 +1484,8 @@ def test_write_ds_stats(ray_start_regular_shared, tmp_path): * Output rows per task: N min, N max, N mean, N tasks used * Tasks per node: N min, N max, N mean; N nodes used * Operator throughput: + * Total input num rows: N rows + * Total output num rows: N rows * Ray Data throughput: N rows/s * Estimated single node throughput: N rows/s @@ -1416,6 +1499,8 @@ def test_write_ds_stats(ray_start_regular_shared, tmp_path): * Output rows per task: N min, N max, N mean, N tasks used * Tasks per node: N min, N max, N mean; N nodes used * Operator throughput: + * Total input num rows: N rows 
+ * Total output num rows: N rows * Ray Data throughput: N rows/s * Estimated single node throughput: N rows/s @@ -1613,9 +1698,8 @@ def test_dataset_throughput(shutdown_only): f = dummy_map_batches_sleep(0.01) ds = ray.data.range(100).map(f).materialize().map(f).materialize() - # Pattern to match operator throughput operator_pattern = re.compile( - r"Operator (\d+).*?Ray Data throughput: (\d+\.\d+) rows/s.*?Estimated single node throughput: (\d+\.\d+) rows/s", # noqa: E501 + r"Operator (\d+).*?\* Operator throughput:\s*.*?\* Ray Data throughput: (\d+\.\d+) rows/s.*?\* Estimated single node throughput: (\d+\.\d+) rows/s", re.DOTALL, ) @@ -1634,6 +1718,73 @@ def test_dataset_throughput(shutdown_only): assert float(dataset_match[1]) >= float(dataset_match[2]) +def test_individual_operator_num_rows(shutdown_only): + # The input num rows of an individual operator should be the same as the output num rows of its parent operator. + ray.shutdown() + ray.init(num_cpus=2) + + data = [{"id": i, "value": i * 1.5, "category": i % 5} for i in range(500)] + ds = ( + ray.data.from_items(data) + .map(lambda x: {**x, "value_squared": x["value"] ** 2}) + .filter(lambda x: x["value_squared"] > 300) + ) + + stats_output = ds.materialize().stats() + re_op0_output = re.compile(r"Operator 0.*?Total output num rows: (\d+)", re.DOTALL) + re_op1_input = re.compile(r"Operator 1.*?Total input num rows: (\d+)", re.DOTALL) + + op0_output = int(re_op0_output.search(stats_output).group(1)) + op1_input = int(re_op1_input.search(stats_output).group(1)) + + assert op0_output == 500 + assert op0_output == op1_input + + +def test_sub_operator_num_rows(shutdown_only): + # The input num rows of sub operator: + # The first sub-operator: total output from all parent nodes + # Subsequent sub-operators: output of the previous sub-operator + ray.shutdown() + ray.init(num_cpus=2) + + data1 = [{"id": i, "value1": i * 1.5, "category1": i % 5} for i in range(500)] + ds1 = ray.data.from_items(data1) + data2 = 
[{"id": i, "value2": i * 1.5, "category2": i % 5} for i in range(300)] + ds2 = ray.data.from_items(data2) + ds = ds1.join(ds2, join_type="left_outer", num_partitions=2) + + stats_output = ds.materialize().stats() + + patterns = { + "operator0_output": re.compile( + r"Operator 0.*?Total output num rows: (\d+)", re.DOTALL + ), + "subop0_input": re.compile( + r"Suboperator 0.*?Total input num rows: (\d+)", re.DOTALL + ), + "subop0_output": re.compile( + r"Suboperator 0.*?Total output num rows: (\d+)", re.DOTALL + ), + "subop1_input": re.compile( + r"Suboperator 1.*?Total input num rows: (\d+)", re.DOTALL + ), + } + + extracted_data = {} + for key, pattern in patterns.items(): + match = pattern.search(stats_output) + if match: + extracted_data[key] = int(match.group(1)) + else: + extracted_data[key] = None + + assert extracted_data["operator0_output"] == 500 + assert extracted_data["subop0_output"] == 800 + assert extracted_data["operator0_output"] == extracted_data["subop0_input"] + assert extracted_data["subop0_output"] == extracted_data["subop1_input"] + + @pytest.mark.parametrize("verbose_stats_logs", [True, False]) def test_spilled_stats(shutdown_only, verbose_stats_logs, restore_data_context): context = DataContext.get_current() @@ -1659,6 +1810,8 @@ def test_spilled_stats(shutdown_only, verbose_stats_logs, restore_data_context): f"* Output rows per task: N min, N max, N mean, N tasks used\n" f"* Tasks per node: N min, N max, N mean; N nodes used\n" f"* Operator throughput:\n" + f" * Total input num rows: N rows\n" + f" * Total output num rows: N rows\n" f" * Ray Data throughput: N rows/s\n" f" * Estimated single node throughput: N rows/s\n" f"{extra_metrics}\n" @@ -1831,6 +1984,13 @@ def test_op_metrics_logging(): "Operator InputDataBuffer[Input] completed. 
Operator Metrics:\n" + gen_expected_metrics(is_map=False) ) # .replace("'obj_store_mem_used': N", "'obj_store_mem_used': Z") + # InputDataBuffer has no inqueue, manually set to 0 + input_str = input_str.replace( + "'num_external_inqueue_blocks': N", "'num_external_inqueue_blocks': Z" + ) + input_str = input_str.replace( + "'num_external_inqueue_bytes': N", "'num_external_inqueue_bytes': Z" + ) map_str = ( "Operator TaskPoolMapOperator[ReadRange->MapBatches()] completed. " "Operator Metrics:\n" @@ -1883,6 +2043,79 @@ def test_stats_actor_datasets(ray_start_cluster): assert value["state"] == "FINISHED" +def test_stats_actor_datasets_eviction(ray_start_cluster): + """ + Tests that finished datasets are evicted from the _StatsActor when + the number of datasets exceeds the configured `max_stats` limit. + """ + # Set a low max_stats limit to easily trigger eviction. + max_stats = 2 + # Create a dedicated _StatsActor for this test to avoid interfering + # with the global actor. + stats_actor = _StatsActor.remote(max_stats=max_stats) + + # Patch the function that retrieves the stats actor to return our + # test-specific actor instance. + with patch( + "ray.data._internal.stats._get_or_create_stats_actor", + return_value=stats_actor, + ): + + def check_ds_finished(ds_name): + """Helper to check if a dataset is marked as FINISHED in the actor.""" + datasets = ray.get(stats_actor.get_datasets.remote()) + ds_tag = next((tag for tag in datasets if tag.startswith(ds_name)), None) + if not ds_tag: + return False + return datasets[ds_tag]["state"] == DatasetState.FINISHED.name + + # --- DS1 --- + # Create and materialize the first dataset. + ds1 = ray.data.range(1, override_num_blocks=1) + ds1.set_name("ds1") + ds1.materialize() + # Wait until the actor has been updated with the FINISHED state. + wait_for_condition(lambda: check_ds_finished("ds1")) + + # --- DS2 --- + # Create and materialize the second dataset. 
+ # This brings the total number of datasets to the `max_stats` limit. + ds2 = ray.data.range(1, override_num_blocks=1) + ds2.set_name("ds2") + ds2.materialize() + wait_for_condition(lambda: check_ds_finished("ds2")) + + # --- Verify state before eviction --- + # At this point, both ds1 and ds2 should be in the actor. + datasets = ray.get(stats_actor.get_datasets.remote()) + names_in_actor = {k.split("_")[0] for k in datasets.keys()} + assert names_in_actor == {"ds1", "ds2"} + + # --- DS3 --- + # Create and materialize the third dataset. This should trigger the + # eviction of the oldest finished dataset (ds1). + ds3 = ray.data.range(1, override_num_blocks=1) + ds3.set_name("ds3") + ds3.materialize() + + def check_eviction(): + """ + Helper to check that the actor state reflects the eviction. + The actor should now contain ds2 and ds3, but not ds1. + """ + datasets = ray.get(stats_actor.get_datasets.remote()) + # The eviction happens asynchronously, so we might briefly see 3 datasets. + # We wait until the count is back to 2. + if len(datasets) == max_stats + 1: + return False + names = {k.split("_")[0] for k in datasets.keys()} + assert names == {"ds2", "ds3"} + return True + + # Wait until the eviction has occurred and the actor state is correct. 
+ wait_for_condition(check_eviction) + + @patch.object(StatsManager, "STATS_ACTOR_UPDATE_INTERVAL_SECONDS", new=0.5) @patch.object(StatsManager, "_stats_actor_handle") @patch.object(StatsManager, "UPDATE_THREAD_INACTIVITY_LIMIT", new=1) diff --git a/python/ray/data/tests/test_streaming_executor.py b/python/ray/data/tests/test_streaming_executor.py index ae9f0e4ef286..38344b4f4134 100644 --- a/python/ray/data/tests/test_streaming_executor.py +++ b/python/ray/data/tests/test_streaming_executor.py @@ -142,7 +142,6 @@ def test_disallow_non_unique_operators(): "test_combine", [o2, o3], DataContext.get_current(), - target_max_block_size=None, ) with pytest.raises(ValueError): build_streaming_topology(o4, ExecutionOptions(verbose_progress=True)) @@ -579,7 +578,7 @@ def test_streaming_exec_schedule_s(): continue ds_stats = ds._plan.stats() - assert 0 < ds_stats.streaming_exec_schedule_s.get() < 1 + assert ds_stats.streaming_exec_schedule_s.get() > 0 def test_execution_callbacks(): @@ -798,7 +797,6 @@ def udf(row): assert isinstance(logical_ops[0], Read) datasource = logical_ops[0]._datasource assert isinstance(datasource, ParquetDatasource) - assert datasource._unresolved_paths == input_path assert isinstance(logical_ops[1], MapRows) assert logical_ops[1]._fn == udf diff --git a/python/ray/data/tests/test_text.py b/python/ray/data/tests/test_text.py index af7f142085db..3a56fc513dda 100644 --- a/python/ray/data/tests/test_text.py +++ b/python/ray/data/tests/test_text.py @@ -1,25 +1,17 @@ import os -import pandas as pd -import pyarrow as pa import pytest import ray -from ray.data import Schema from ray.data._internal.execution.interfaces.ref_bundle import ( _ref_bundles_iterator_to_block_refs_list, ) from ray.data.datasource import ( BaseFileMetadataProvider, FastFileMetadataProvider, - Partitioning, - PartitionStyle, - PathPartitionFilter, ) from ray.data.tests.conftest import * # noqa from ray.data.tests.mock_http_server import * # noqa -from 
ray.data.tests.test_partitioning import PathPartitionEncoder -from ray.data.tests.util import Counter from ray.tests.conftest import * # noqa @@ -27,20 +19,6 @@ def _to_lines(rows): return [row["text"] for row in rows] -def test_read_text_partitioning(ray_start_regular_shared, tmp_path): - path = os.path.join(tmp_path, "country=us") - os.mkdir(path) - with open(os.path.join(path, "file.txt"), "w") as f: - f.write("foo\nbar\nbaz") - - ds = ray.data.read_text(path, partitioning=Partitioning("hive")) - - df = ds.to_pandas() - assert list(df.columns) == ["text", "country"] - assert sorted(df["text"]) == ["bar", "baz", "foo"] - assert list(df["country"]) == ["us", "us", "us"] - - def test_empty_text_files(ray_start_regular_shared, tmp_path): path = os.path.join(tmp_path, "test_text") os.mkdir(path) @@ -69,30 +47,6 @@ def test_read_text(ray_start_regular_shared, tmp_path): assert ds.count() == 4 -@pytest.mark.parametrize("ignore_missing_paths", [True, False]) -def test_read_text_ignore_missing_paths( - ray_start_regular_shared, tmp_path, ignore_missing_paths -): - path = os.path.join(tmp_path, "test_text") - os.mkdir(path) - with open(os.path.join(path, "file1.txt"), "w") as f: - f.write("hello\n") - f.write("world") - - paths = [ - path, - "missing.txt", - ] - - if ignore_missing_paths: - ds = ray.data.read_text(paths, ignore_missing_paths=ignore_missing_paths) - assert ds.input_files() == [os.path.join(path, "file1.txt")] - else: - with pytest.raises(FileNotFoundError): - ds = ray.data.read_text(paths, ignore_missing_paths=ignore_missing_paths) - ds.materialize() - - def test_read_text_meta_provider( ray_start_regular_shared, tmp_path, @@ -117,57 +71,6 @@ def test_read_text_meta_provider( ) -def test_read_text_partitioned_with_filter( - shutdown_only, - tmp_path, - write_base_partitioned_df, - assert_base_partitioned_ds, -): - def df_to_text(dataframe, path, **kwargs): - dataframe.to_string(path, index=False, header=False, **kwargs) - - partition_keys = ["one"] - 
kept_file_counter = Counter.remote() - skipped_file_counter = Counter.remote() - - def skip_unpartitioned(kv_dict): - keep = bool(kv_dict) - counter = kept_file_counter if keep else skipped_file_counter - ray.get(counter.increment.remote()) - return keep - - for style in [PartitionStyle.HIVE, PartitionStyle.DIRECTORY]: - base_dir = os.path.join(tmp_path, style.value) - partition_path_encoder = PathPartitionEncoder.of( - style=style, - base_dir=base_dir, - field_names=partition_keys, - ) - write_base_partitioned_df( - partition_keys, - partition_path_encoder, - df_to_text, - ) - df_to_text(pd.DataFrame({"1": [1]}), os.path.join(base_dir, "test.txt")) - partition_path_filter = PathPartitionFilter.of( - style=style, - base_dir=base_dir, - field_names=partition_keys, - filter_fn=skip_unpartitioned, - ) - ds = ray.data.read_text(base_dir, partition_filter=partition_path_filter) - assert_base_partitioned_ds( - ds, - schema=Schema(pa.schema([("text", pa.string())])), - sorted_values=["1 a", "1 b", "1 c", "3 e", "3 f", "3 g"], - ds_take_transform_fn=_to_lines, - ) - assert ray.get(kept_file_counter.get.remote()) == 2 - assert ray.get(skipped_file_counter.get.remote()) == 1 - ray.get(kept_file_counter.reset.remote()) - ray.get(skipped_file_counter.reset.remote()) - - def test_read_text_remote_args(ray_start_cluster, tmp_path): cluster = ray_start_cluster cluster.add_node( @@ -177,6 +80,7 @@ def test_read_text_remote_args(ray_start_cluster, tmp_path): ) cluster.add_node(resources={"bar": 100}, num_cpus=1) + ray.shutdown() ray.init(cluster.address) @ray.remote diff --git a/python/ray/data/tests/test_tfrecords.py b/python/ray/data/tests/test_tfrecords.py index 3d39cd588fdc..31c355f95db3 100644 --- a/python/ray/data/tests/test_tfrecords.py +++ b/python/ray/data/tests/test_tfrecords.py @@ -489,36 +489,6 @@ def test_read_tfrecords_ray_remote_args( assert kwargs["ray_remote_args"] == ray_remote_args -@pytest.mark.parametrize("ignore_missing_paths", [True, False]) -def 
test_read_tfrecords_ignore_missing_paths( - ray_start_regular_shared, tmp_path, ignore_missing_paths -): - import tensorflow as tf - - example = tf_records_empty()[0] - - path = os.path.join(tmp_path, "data.tfrecords") - with tf.io.TFRecordWriter(path=path) as writer: - writer.write(example.SerializeToString()) - - paths = [ - path, - "missing.tfrecords", - ] - - if ignore_missing_paths: - ds = read_tfrecords_with_tfx_read_override( - path, ignore_missing_paths=ignore_missing_paths - ) - assert ds.input_files() == [path] - else: - with pytest.raises(FileNotFoundError): - ds = read_tfrecords_with_tfx_read_override( - paths, ignore_missing_paths=ignore_missing_paths - ) - ds.materialize() - - @pytest.mark.parametrize("with_tf_schema", (True, False)) def test_write_tfrecords( with_tf_schema, diff --git a/python/ray/data/tests/test_torch.py b/python/ray/data/tests/test_torch.py index 024a8f1044c3..41e69ec2a293 100644 --- a/python/ray/data/tests/test_torch.py +++ b/python/ray/data/tests/test_torch.py @@ -1,271 +1,14 @@ import numpy as np import pandas as pd import pytest +import torch import ray -from ray.data.extensions.tensor_extension import TensorArray from ray.data.tests.conftest import * # noqa +from ray.data.tests.util import extract_values from ray.tests.conftest import * # noqa -def test_to_torch_emits_deprecation_warning(ray_start_10_cpus_shared): - with pytest.warns(DeprecationWarning): - ray.data.range(1).to_torch() - - -def test_to_torch(ray_start_10_cpus_shared): - import torch - - df1 = pd.DataFrame( - {"one": [1, 2, 3], "two": [1.0, 2.0, 3.0], "label": [1.0, 2.0, 3.0]} - ) - df2 = pd.DataFrame( - {"one": [4, 5, 6], "two": [4.0, 5.0, 6.0], "label": [4.0, 5.0, 6.0]} - ) - df3 = pd.DataFrame({"one": [7, 8], "two": [7.0, 8.0], "label": [7.0, 8.0]}) - df = pd.concat([df1, df2, df3]) - ds = ray.data.from_pandas([df1, df2, df3]) - torchd = ds.to_torch(label_column="label", batch_size=3) - - num_epochs = 2 - for _ in range(num_epochs): - iterations = [] - for 
batch in iter(torchd): - iterations.append(torch.cat((batch[0], batch[1]), dim=1).numpy()) - combined_iterations = np.concatenate(iterations) - np.testing.assert_array_equal(np.sort(df.values), np.sort(combined_iterations)) - - -@pytest.mark.parametrize("input", ["single", "list", "dict"]) -@pytest.mark.parametrize("force_dtype", [False, True]) -@pytest.mark.parametrize("label_type", [None, "squeezed", "unsqueezed"]) -def test_to_torch_feature_columns( - ray_start_10_cpus_shared, input, force_dtype, label_type -): - import torch - - df1 = pd.DataFrame( - { - "one": [1, 2, 3], - "two": [1.0, 2.0, 3.0], - "three": [4.0, 5.0, 6.0], - "label": [1.0, 2.0, 3.0], - } - ) - df2 = pd.DataFrame( - { - "one": [4, 5, 6], - "two": [4.0, 5.0, 6.0], - "three": [7.0, 8.0, 9.0], - "label": [4.0, 5.0, 6.0], - } - ) - df3 = pd.DataFrame( - {"one": [7, 8], "two": [7.0, 8.0], "three": [10.0, 11.0], "label": [7.0, 8.0]} - ) - df = pd.concat([df1, df2, df3]).drop("three", axis=1) - ds = ray.data.from_pandas([df1, df2, df3]) - - feature_column_dtypes = None - label_column_dtype = None - if force_dtype: - label_column_dtype = torch.long - if input == "single": - feature_columns = ["one", "two"] - if force_dtype: - feature_column_dtypes = torch.long - elif input == "list": - feature_columns = [["one"], ["two"]] - if force_dtype: - feature_column_dtypes = [torch.long, torch.long] - elif input == "dict": - feature_columns = {"X1": ["one"], "X2": ["two"]} - if force_dtype: - feature_column_dtypes = {"X1": torch.long, "X2": torch.long} - - label_column = None if label_type is None else "label" - unsqueeze_label_tensor = label_type == "unsqueezed" - - torchd = ds.to_torch( - label_column=label_column, - feature_columns=feature_columns, - feature_column_dtypes=feature_column_dtypes, - label_column_dtype=label_column_dtype, - unsqueeze_label_tensor=unsqueeze_label_tensor, - batch_size=3, - ) - iterations = [] - - for batch in iter(torchd): - features, label = batch - - if input == "single": - 
assert isinstance(features, torch.Tensor) - if force_dtype: - assert features.dtype == torch.long - data = features - elif input == "list": - assert isinstance(features, list) - assert all(isinstance(item, torch.Tensor) for item in features) - if force_dtype: - assert all(item.dtype == torch.long for item in features) - data = torch.cat(tuple(features), dim=1) - elif input == "dict": - assert isinstance(features, dict) - assert all(isinstance(item, torch.Tensor) for item in features.values()) - if force_dtype: - assert all(item.dtype == torch.long for item in features.values()) - data = torch.cat(tuple(features.values()), dim=1) - - if not label_type: - assert label is None - else: - assert isinstance(label, torch.Tensor) - if force_dtype: - assert label.dtype == torch.long - if unsqueeze_label_tensor: - assert label.dim() == 2 - else: - assert label.dim() == 1 - label = label.view(-1, 1) - data = torch.cat((data, label), dim=1) - iterations.append(data.numpy()) - - combined_iterations = np.concatenate(iterations) - if not label_type: - df.drop("label", axis=1, inplace=True) - np.testing.assert_array_equal(df.values, combined_iterations) - - -def test_tensors_in_tables_to_torch(ray_start_10_cpus_shared): - outer_dim = 3 - inner_shape = (2, 2, 2) - shape = (outer_dim,) + inner_shape - num_items = np.prod(np.array(shape)) - arr = np.arange(num_items).reshape(shape) - df1 = pd.DataFrame( - {"one": TensorArray(arr), "two": TensorArray(arr + 1), "label": [1.0, 2.0, 3.0]} - ) - arr2 = np.arange(num_items, 2 * num_items).reshape(shape) - df2 = pd.DataFrame( - { - "one": TensorArray(arr2), - "two": TensorArray(arr2 + 1), - "label": [4.0, 5.0, 6.0], - } - ) - df = pd.concat([df1, df2]) - ds = ray.data.from_pandas([df1, df2]) - torchd = ds.to_torch( - label_column="label", batch_size=2, unsqueeze_label_tensor=False - ) - - num_epochs = 2 - for _ in range(num_epochs): - features, labels = [], [] - for batch in iter(torchd): - features.append(batch[0].numpy()) - 
labels.append(batch[1].numpy()) - features, labels = np.concatenate(features), np.concatenate(labels) - values = np.stack([df["one"].to_numpy(), df["two"].to_numpy()], axis=1) - np.testing.assert_array_equal(values, features) - np.testing.assert_array_equal(df["label"].to_numpy(), labels) - - -def test_tensors_in_tables_to_torch_mix(ray_start_10_cpus_shared): - outer_dim = 3 - inner_shape = (2, 2, 2) - shape = (outer_dim,) + inner_shape - num_items = np.prod(np.array(shape)) - arr = np.arange(num_items).reshape(shape) - df1 = pd.DataFrame( - { - "one": TensorArray(arr), - "two": [1, 2, 3], - "label": [1.0, 2.0, 3.0], - } - ) - arr2 = np.arange(num_items, 2 * num_items).reshape(shape) - df2 = pd.DataFrame( - { - "one": TensorArray(arr2), - "two": [4, 5, 6], - "label": [4.0, 5.0, 6.0], - } - ) - df = pd.concat([df1, df2]) - ds = ray.data.from_pandas([df1, df2]) - torchd = ds.to_torch( - label_column="label", - feature_columns=[["one"], ["two"]], - batch_size=2, - unsqueeze_label_tensor=False, - unsqueeze_feature_tensors=False, - ) - - num_epochs = 2 - for _ in range(num_epochs): - col1, col2, labels = [], [], [] - for batch in iter(torchd): - col1.append(batch[0][0].numpy()) - col2.append(batch[0][1].numpy()) - labels.append(batch[1].numpy()) - col1, col2 = np.concatenate(col1), np.concatenate(col2) - labels = np.concatenate(labels) - np.testing.assert_array_equal(col1, np.sort(df["one"].to_numpy())) - np.testing.assert_array_equal(col2, np.sort(df["two"].to_numpy())) - np.testing.assert_array_equal(labels, np.sort(df["label"].to_numpy())) - - -@pytest.mark.skip( - reason=( - "Waiting for Torch to support unsqueezing and concatenating nested tensors." 
- ) -) -def test_tensors_in_tables_to_torch_variable_shaped(ray_start_10_cpus_shared): - shapes = [(2, 2), (3, 3), (4, 4)] - cumsum_sizes = np.cumsum([0] + [np.prod(shape) for shape in shapes[:-1]]) - arrs1 = [ - np.arange(offset, offset + np.prod(shape)).reshape(shape) - for offset, shape in zip(cumsum_sizes, shapes) - ] - df1 = pd.DataFrame( - { - "one": TensorArray(arrs1), - "two": TensorArray([a + 1 for a in arrs1]), - "label": [1.0, 2.0, 3.0], - } - ) - base = cumsum_sizes[-1] - arrs2 = [ - np.arange(base + offset, base + offset + np.prod(shape)).reshape(shape) - for offset, shape in zip(cumsum_sizes, shapes) - ] - df2 = pd.DataFrame( - { - "one": TensorArray(arrs2), - "two": TensorArray([a + 1 for a in arrs2]), - "label": [4.0, 5.0, 6.0], - } - ) - df = pd.concat([df1, df2]) - ds = ray.data.from_pandas([df1, df2]) - torchd = ds.to_torch( - label_column="label", batch_size=2, unsqueeze_label_tensor=False - ) - - num_epochs = 2 - for _ in range(num_epochs): - features, labels = [], [] - for batch in iter(torchd): - features.append(batch[0].numpy()) - labels.append(batch[1].numpy()) - features, labels = np.concatenate(features), np.concatenate(labels) - values = np.stack([df["one"].to_numpy(), df["two"].to_numpy()], axis=1) - np.testing.assert_array_equal(values, features) - np.testing.assert_array_equal(df["label"].to_numpy(), labels) - - def test_iter_torch_batches(ray_start_10_cpus_shared): import torch @@ -336,6 +79,86 @@ def train_loop_per_worker(): my_trainer.fit() +@pytest.mark.parametrize("local_read", [True, False]) +def test_from_torch_map_style_dataset(ray_start_10_cpus_shared, local_read): + class StubDataset(torch.utils.data.Dataset): + def __len__(self): + return 1 + + def __getitem__(self, index): + return index + + torch_dataset = StubDataset() + + ray_dataset = ray.data.from_torch(torch_dataset, local_read=local_read) + + actual_data = ray_dataset.take_all() + assert actual_data == [{"item": 0}] + + +def 
test_from_torch_iterable_style_dataset(ray_start_10_cpus_shared): + class StubIterableDataset(torch.utils.data.IterableDataset): + def __len__(self): + return 1 + + def __iter__(self): + return iter([0]) + + iter_torch_dataset = StubIterableDataset() + + ray_dataset = ray.data.from_torch(iter_torch_dataset) + + actual_data = ray_dataset.take_all() + assert actual_data == [{"item": 0}] + + +@pytest.mark.parametrize("local_read", [True, False]) +def test_from_torch_boundary_conditions(ray_start_10_cpus_shared, local_read): + """ + Tests that from_torch respects __len__ for map-style datasets + """ + from torch.utils.data import Dataset + + class BoundaryTestMapDataset(Dataset): + """A map-style dataset where __len__ is less than the underlying data size.""" + + def __init__(self, data, length): + super().__init__() + self._data = data + self._length = length + assert self._length <= len( + self._data + ), "Length must be <= data size to properly test boundary conditions" + + def __len__(self): + return self._length + + def __getitem__(self, index): + if not (0 <= index < self._length): + # Note: don't use IndexError because we want to fail clearly if + # Ray Data tries to access beyond __len__ - 1 + raise RuntimeError( + f"Index {index} out of bounds for dataset with length {self._length}" + ) + return self._data[index] + + source_data = list(range(10)) + dataset_len = 8 # Intentionally less than len(source_data) + + # --- Test MapDataset --- + map_ds = BoundaryTestMapDataset(source_data, dataset_len) + # Expected data only includes elements up to dataset_len - 1 + expected_items = source_data[:dataset_len] + + ray_ds_map = ray.data.from_torch(map_ds, local_read=local_read) + actual_items_map = extract_values("item", list(ray_ds_map.take_all())) + + # This assertion verifies that ray_ds_map didn't try to access index 8 or 9, + # which would have raised a RuntimeError in BoundaryTestMapDataset.__getitem__ + assert actual_items_map == expected_items + assert
len(actual_items_map) == dataset_len + + if __name__ == "__main__": import sys diff --git a/python/ray/data/tests/test_transform_pyarrow.py b/python/ray/data/tests/test_transform_pyarrow.py index f2a58f1c6bcc..6cefc62ce113 100644 --- a/python/ray/data/tests/test_transform_pyarrow.py +++ b/python/ray/data/tests/test_transform_pyarrow.py @@ -101,7 +101,7 @@ def _concat_and_sort_partitions(parts: Iterable[pa.Table]) -> pa.Table: t, hash_cols=["structs"], num_partitions=101 ) - assert len(_structs_partition_dict) == 34 + assert len(_structs_partition_dict) <= 101 assert t == _concat_and_sort_partitions(_structs_partition_dict.values()) @@ -572,22 +572,6 @@ def test_unify_schemas(unify_schemas_basic_schemas, unify_schemas_multicol_schem ) -def test_unify_schemas_null_typed_lists(unify_schemas_null_typed_lists_schemas): - """Test handling of null-typed lists (cols_with_null_list functionality).""" - schemas = unify_schemas_null_typed_lists_schemas - - # Should find valid value_type from schema2 and override - result = unify_schemas([schemas["null_list"], schemas["int_list"]]) - assert result == schemas["expected"] - - # Test with multiple schemas, some with null types - result = unify_schemas( - [schemas["null_list"], schemas["int_list"], schemas["string_list"]] - ) - # Should use the first non-null type found (int32) - assert result == schemas["expected"] - - def test_unify_schemas_object_types(unify_schemas_object_types_schemas): """Test handling of object types (columns_with_objects functionality).""" schemas = unify_schemas_object_types_schemas @@ -603,12 +587,6 @@ def test_unify_schemas_object_types(unify_schemas_object_types_schemas): assert result == schemas["expected"] -def test_unify_schemas_duplicate_fields(unify_schemas_duplicate_fields_schema): - """Test error handling for duplicate field names.""" - with pytest.raises(ValueError, match="has multiple fields with the same name"): - unify_schemas([unify_schemas_duplicate_fields_schema]) - - @pytest.mark.skipif( 
get_pyarrow_version() < parse_version("17.0.0"), reason="Requires PyArrow version 17 or higher", @@ -632,6 +610,10 @@ def test_unify_schemas_objects_and_tensors(unify_schemas_objects_and_tensors_sch unify_schemas(unify_schemas_objects_and_tensors_schemas) +@pytest.mark.skipif( + get_pyarrow_version() < parse_version("17.0.0"), + reason="Requires PyArrow version 17 or higher", +) def test_unify_schemas_missing_tensor_fields( unify_schemas_missing_tensor_fields_schemas, ): @@ -2230,7 +2212,7 @@ def struct_with_null_tensor_values_expected(): "struct", pa.struct( [ - ("tensor", ArrowVariableShapedTensorType(pa.float32(), 2)), + ("tensor", ArrowTensorTypeV2((2,), pa.float32())), ("value", pa.int64()), ] ), @@ -2756,20 +2738,6 @@ def struct_variable_shaped_tensor_expected(): } -@pytest.fixture -def unify_schemas_null_typed_lists_schemas(): - """Fixture for null typed lists unify schemas test data.""" - schema1 = pa.schema([("list_col", pa.list_(pa.null()))]) - schema2 = pa.schema([("list_col", pa.list_(pa.int32()))]) - schema3 = pa.schema([("list_col", pa.list_(pa.string()))]) - return { - "null_list": schema1, - "int_list": schema2, - "string_list": schema3, - "expected": pa.schema([("list_col", pa.list_(pa.int32()))]), - } - - @pytest.fixture def unify_schemas_object_types_schemas(): """Fixture for object types unify schemas test data.""" @@ -2788,12 +2756,6 @@ def unify_schemas_object_types_schemas(): } -@pytest.fixture -def unify_schemas_duplicate_fields_schema(): - """Fixture for duplicate fields unify schemas test data.""" - return pa.schema([("col", pa.int32()), ("col", pa.int64())]) # Duplicate name - - @pytest.fixture def unify_schemas_incompatible_tensor_schemas(): """Fixture for incompatible tensor dtypes unify schemas test data.""" @@ -2837,7 +2799,7 @@ def unify_schemas_missing_tensor_fields_schemas(): "struct", pa.struct( [ - ("tensor", ArrowVariableShapedTensorType(pa.int32(), 2)), + ("tensor", ArrowTensorType((2, 2), pa.int32())), ("value", pa.int64()), ] 
), @@ -2899,7 +2861,7 @@ def unify_schemas_nested_struct_tensors_schemas(): [ ( "tensor", - ArrowVariableShapedTensorType(pa.float32(), 2), + ArrowTensorType((3, 3), pa.float32()), ), ("data", pa.string()), ] diff --git a/python/ray/data/tests/test_unify_schemas_performance.py b/python/ray/data/tests/test_unify_schemas_performance.py new file mode 100644 index 000000000000..704f15de5167 --- /dev/null +++ b/python/ray/data/tests/test_unify_schemas_performance.py @@ -0,0 +1,140 @@ +import pyarrow as pa +import pytest + +from ray.data._internal.arrow_ops.transform_pyarrow import ( + unify_schemas, +) +from ray.data.extensions import ( + ArrowPythonObjectType, + ArrowTensorType, + ArrowVariableShapedTensorType, +) + + +# Schema factory functions - just return schemas +def _create_simple_schema(num_columns): + return pa.schema([(f"col_{i}", pa.int64()) for i in range(num_columns)]) + + +def _create_tensor_fixed_schema(num_columns): + return pa.schema( + [ + (f"tensor_{i}", ArrowTensorType((2, 2), pa.float32())) + for i in range(num_columns) + ] + ) + + +def _create_tensor_variable_schema(num_columns): + return pa.schema( + [ + (f"tensor_{i}", ArrowVariableShapedTensorType(pa.float32(), 2)) + for i in range(num_columns) + ] + ) + + +def _create_object_schema(num_columns): + return pa.schema( + [(f"obj_{i}", ArrowPythonObjectType()) for i in range(num_columns)] + ) + + +def _create_nested_struct_schema(num_columns): + fields = [] + for i in range(num_columns): + inner_struct = pa.struct( + [("x", pa.int32()), ("y", pa.string()), ("z", pa.float64())] + ) + fields.append((f"struct_{i}", inner_struct)) + return pa.schema(fields) + + +def _create_deep_nested_schema(num_columns): + fields = [] + for i in range(num_columns): + level4 = pa.struct([("data", pa.int32()), ("meta", pa.string())]) + level3 = pa.struct([("level4", level4), ("id3", pa.int64())]) + level2 = pa.struct([("level3", level3), ("id2", pa.int64())]) + level1 = pa.struct([("level2", level2), ("id1", 
pa.int64())]) + fields.append((f"deep_{i}", level1)) + return pa.schema(fields) + + +def _create_mixed_complex_schema(num_columns): + fields = [] + for i in range(num_columns): + field_type = i % 5 + if field_type == 0: + fields.append((f"col_{i}", pa.int64())) + elif field_type == 1: + fields.append((f"col_{i}", ArrowTensorType((3, 3), pa.int32()))) + elif field_type == 2: + fields.append((f"col_{i}", ArrowPythonObjectType())) + elif field_type == 3: + inner_struct = pa.struct([("a", pa.int32()), ("b", pa.string())]) + fields.append((f"col_{i}", inner_struct)) + else: + fields.append((f"col_{i}", pa.list_(pa.float64()))) + return pa.schema(fields) + + +@pytest.mark.parametrize("num_schemas", [10, 100]) +@pytest.mark.parametrize("num_columns", [10, 100, 1000, 5000]) +@pytest.mark.parametrize( + "schema_factory,expected_time_per_schema_per_column", + [ + (_create_simple_schema, 0.00001), + (_create_tensor_fixed_schema, 0.00005), + (_create_tensor_variable_schema, 0.00005), + (_create_object_schema, 0.00005), + (_create_nested_struct_schema, 0.0001), + (_create_deep_nested_schema, 0.0002), + (_create_mixed_complex_schema, 0.0002), + ], +) +def test_unify_schemas_equivalent_performance( + num_schemas, num_columns, schema_factory, expected_time_per_schema_per_column +): + """Stress test for unify_schemas when ALL schemas are equivalent (identical). + + This tests the fast path where all schemas are the same and should be optimized + to return quickly without expensive comparisons. 
+ """ + import time + + # Create the base schema + base_schema = schema_factory(num_columns) + + # Create list of identical schemas + schemas = [base_schema] * num_schemas + + # Time the unification + start_time = time.time() + unified = unify_schemas(schemas) + elapsed_time = time.time() - start_time + + # Verify the result is correct (should be identical to base schema) + assert unified == base_schema + + # Performance assertions with scaling based on complexity + scale_factor = num_schemas * num_columns + max_allowed_time = expected_time_per_schema_per_column * scale_factor + buffer_factor = 2 + assert elapsed_time < buffer_factor * max_allowed_time, ( + f"unify_schemas took {elapsed_time:.4f}s for {num_schemas} identical " + f"{schema_factory.__name__} schemas with {num_columns} columns, " + f"should be < {max_allowed_time:.4f}s" + ) + + # Print timing info for large cases + if num_schemas >= 1000 or num_columns >= 100: + print( + f"\n{schema_factory.__name__}: {num_schemas} schemas x {num_columns} cols = {elapsed_time:.4f}s" + ) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/exceptions.py b/python/ray/exceptions.py index a91b010ff93b..2ea042b515c0 100644 --- a/python/ray/exceptions.py +++ b/python/ray/exceptions.py @@ -19,7 +19,6 @@ ) from ray.util.annotations import DeveloperAPI, PublicAPI - logger = logging.getLogger(__name__) @@ -48,9 +47,16 @@ def from_ray_exception(ray_exception): if ray_exception.language == PYTHON: try: return pickle.loads(ray_exception.serialized_exception) - except Exception as e: - msg = "Failed to unpickle serialized exception" - raise RuntimeError(msg) from e + except Exception: + # formatted_exception_string is set in to_bytes() above by calling + # traceback.format_exception() on the original exception. It contains + # the string representation and stack trace of the original error. 
+ original_stacktrace = getattr( + ray_exception, + "formatted_exception_string", + "No formatted exception string available.", + ) + return UnserializableException(original_stacktrace) else: return CrossLanguageError(ray_exception) @@ -900,6 +906,33 @@ class RayCgraphCapacityExceeded(RaySystemError): pass +@PublicAPI(stability="alpha") +class UnserializableException(RayError): + """Raised when there is an error deserializing a serialized exception. + + This occurs when deserializing (unpickling) a previously serialized exception + fails. In this case, we fall back to raising the string representation of + the original exception along with its stack trace that was captured at the + time of serialization. + + For more details and how to handle this with custom serializers, see :ref:`configuring custom exception serializers ` + + Args: + original_stack_trace: The string representation and stack trace of the + original exception that was captured during serialization. + """ + + def __init__(self, original_stack_trace: str): + self._original_stack_trace = original_stack_trace + + def __str__(self): + return ( + "Failed to deserialize exception.
Refer to https://docs.ray.io/en/latest/ray-core/objects/serialization.html#custom-serializers-for-exceptions for more information.\n" + "Original exception:\n" + f"{self._original_stack_trace}" + ) + + RAY_EXCEPTION_TYPES = [ PlasmaObjectNotAvailable, RayError, @@ -929,4 +962,5 @@ class RayCgraphCapacityExceeded(RaySystemError): RayChannelTimeoutError, OufOfBandObjectRefSerializationException, RayCgraphCapacityExceeded, + UnserializableException, ] diff --git a/python/ray/experimental/BUILD b/python/ray/experimental/BUILD.bazel similarity index 100% rename from python/ray/experimental/BUILD rename to python/ray/experimental/BUILD.bazel diff --git a/python/ray/experimental/__init__.py b/python/ray/experimental/__init__.py index 57e565dd2d44..37cb09a1513b 100644 --- a/python/ray/experimental/__init__.py +++ b/python/ray/experimental/__init__.py @@ -1,10 +1,11 @@ from ray.experimental.dynamic_resources import set_resource +from ray.experimental.gpu_object_manager import GPUObjectManager, wait_tensor_freed from ray.experimental.locations import get_local_object_locations, get_object_locations -from ray.experimental.gpu_object_manager import GPUObjectManager __all__ = [ "get_object_locations", "get_local_object_locations", "set_resource", "GPUObjectManager", + "wait_tensor_freed", ] diff --git a/python/ray/experimental/channel/accelerator_context.py b/python/ray/experimental/channel/accelerator_context.py index 9acc1ad67d33..f4aa622af2b5 100644 --- a/python/ray/experimental/channel/accelerator_context.py +++ b/python/ray/experimental/channel/accelerator_context.py @@ -1,10 +1,11 @@ -import threading import importlib -import ray -from typing import TYPE_CHECKING, Optional, Type, ContextManager, List +import threading from contextlib import nullcontext -from ray.experimental.channel.communicator import Communicator +from typing import TYPE_CHECKING, ContextManager, List, Optional, Type + +import ray from ray._private.accelerators import 
get_accelerator_manager_for_resource +from ray.experimental.channel.communicator import Communicator if TYPE_CHECKING: import torch diff --git a/python/ray/experimental/channel/common.py b/python/ray/experimental/channel/common.py index 8de5f5642312..0f1b916a7224 100644 --- a/python/ray/experimental/channel/common.py +++ b/python/ray/experimental/channel/common.py @@ -18,9 +18,9 @@ import ray import ray.exceptions +from ray.experimental.channel.accelerator_context import AcceleratorContext from ray.experimental.channel.communicator import Communicator from ray.experimental.channel.communicator_handle import CommunicatorHandle -from ray.experimental.channel.accelerator_context import AcceleratorContext from ray.experimental.channel.serialization_context import _SerializationContext from ray.util.annotations import DeveloperAPI, PublicAPI diff --git a/python/ray/experimental/channel/communicator_handle.py b/python/ray/experimental/channel/communicator_handle.py index 26dd000ad98c..c6d8865bfc44 100644 --- a/python/ray/experimental/channel/communicator_handle.py +++ b/python/ray/experimental/channel/communicator_handle.py @@ -1,4 +1,5 @@ from typing import List + import ray diff --git a/python/ray/experimental/channel/nccl_group.py b/python/ray/experimental/channel/nccl_group.py index f81c8c4bcadb..64b640818833 100644 --- a/python/ray/experimental/channel/nccl_group.py +++ b/python/ray/experimental/channel/nccl_group.py @@ -4,9 +4,9 @@ import ray from ray.exceptions import RayChannelError +from ray.experimental.channel.accelerator_context import AcceleratorContext from ray.experimental.channel.communicator import Communicator, TorchTensorAllocator from ray.experimental.util.types import ReduceOp -from ray.experimental.channel.accelerator_context import AcceleratorContext if TYPE_CHECKING: import torch diff --git a/python/ray/experimental/channel/serialization_context.py b/python/ray/experimental/channel/serialization_context.py index 40784b38f409..548d36301f6b 100644 
--- a/python/ray/experimental/channel/serialization_context.py +++ b/python/ray/experimental/channel/serialization_context.py @@ -97,7 +97,9 @@ def serialize_tensor( from ray.experimental.channel import ChannelContext ctx = ChannelContext.get_current() - if self._use_external_transport and tensor.device == ctx.torch_device: + if self._use_external_transport and ( + ctx._torch_device is None or ctx._torch_device == tensor.device + ): # External transport is enabled and we found a tensor that matches # our device. Add the actual tensor to a buffer. The buffer of # tensors should later be popped by the caller and sent via @@ -172,8 +174,8 @@ def deserialize_from_numpy_or_scalar( tensor_device_type: str, target_device: Device, ): - import torch import numpy as np + import torch if target_device == Device.DEFAULT: target_device_type = tensor_device_type diff --git a/python/ray/experimental/channel/torch_tensor_accelerator_channel.py b/python/ray/experimental/channel/torch_tensor_accelerator_channel.py index ae1cb3772b6c..08149406951a 100644 --- a/python/ray/experimental/channel/torch_tensor_accelerator_channel.py +++ b/python/ray/experimental/channel/torch_tensor_accelerator_channel.py @@ -3,24 +3,24 @@ import uuid from dataclasses import dataclass from types import ModuleType -from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union, Type +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type, Union import ray import ray.util.serialization from ray.experimental.channel import ChannelContext, utils +from ray.experimental.channel.accelerator_context import ( + AcceleratorContext, + is_accelerator_context_registered, + register_accelerator_context, +) from ray.experimental.channel.common import ChannelInterface from ray.experimental.channel.communicator import Communicator +from ray.experimental.channel.communicator_handle import CommunicatorHandle from ray.experimental.channel.cpu_communicator import CPUCommunicator from 
ray.experimental.channel.intra_process_channel import IntraProcessChannel -from ray.experimental.channel.communicator_handle import CommunicatorHandle from ray.experimental.channel.shared_memory_channel import SharedMemoryType from ray.experimental.channel.torch_tensor_type import TorchTensorType from ray.util.annotations import DeveloperAPI -from ray.experimental.channel.accelerator_context import ( - AcceleratorContext, - register_accelerator_context, - is_accelerator_context_registered, -) if TYPE_CHECKING: import torch diff --git a/python/ray/experimental/collective/__init__.py b/python/ray/experimental/collective/__init__.py index de2314acf6ab..42289cee1653 100644 --- a/python/ray/experimental/collective/__init__.py +++ b/python/ray/experimental/collective/__init__.py @@ -1,14 +1,15 @@ +from ray.experimental.collective.collective import ( + create_collective_group, + destroy_all_collective_groups, + destroy_collective_group, + get_collective_groups, +) from ray.experimental.collective.operations import ( allgather, allreduce, reducescatter, ) -from ray.experimental.collective.collective import ( - get_collective_groups, - create_collective_group, - destroy_collective_group, - destroy_all_collective_groups, -) +from ray.experimental.collective.util import get_tensor_transport_manager __all__ = [ "allgather", @@ -18,4 +19,5 @@ "create_collective_group", "destroy_collective_group", "destroy_all_collective_groups", + "get_tensor_transport_manager", ] diff --git a/python/ray/experimental/collective/collective.py b/python/ray/experimental/collective/collective.py index b2b05a931d39..db60fd500dab 100644 --- a/python/ray/experimental/collective/collective.py +++ b/python/ray/experimental/collective/collective.py @@ -1,17 +1,16 @@ -from typing import Dict, List, Optional, Union import threading import uuid +from typing import Dict, List, Optional, Union import ray +import ray.experimental.internal_kv as internal_kv from ray.experimental.collective.communicator import 
CommunicatorHandle from ray.experimental.collective.util import get_address_and_port -import ray.experimental.internal_kv as internal_kv -from ray.util.collective.types import Backend +from ray.util.annotations import PublicAPI from ray.util.collective.collective_group.torch_gloo_collective_group import ( get_master_address_metadata_key, ) -from ray.util.annotations import PublicAPI - +from ray.util.collective.types import Backend _remote_communicator_manager: "Optional[RemoteCommunicatorManager]" = None _remote_communicator_manager_lock = threading.Lock() diff --git a/python/ray/experimental/collective/collective_tensor_transport.py b/python/ray/experimental/collective/collective_tensor_transport.py new file mode 100644 index 000000000000..61996d608d68 --- /dev/null +++ b/python/ray/experimental/collective/collective_tensor_transport.py @@ -0,0 +1,180 @@ +from typing import TYPE_CHECKING, List, Optional + +import ray +from ray.experimental.collective.tensor_transport_manager import ( + TensorTransportManager, +) +from ray.util.collective.types import ( + Backend, + CollectiveCommunicatorMetadata, + CollectiveTransportMetadata, +) + +if TYPE_CHECKING: + import torch + + +class CollectiveTensorTransport(TensorTransportManager): + def __init__(self, tensor_transport_backend: Backend): + self._tensor_transport_backend = tensor_transport_backend + + @property + def tensor_transport_backend(self) -> Backend: + return self._tensor_transport_backend + + @staticmethod + def is_one_sided() -> bool: + return False + + def actor_has_tensor_transport(self, actor: "ray.actor.ActorHandle") -> bool: + from ray.experimental.collective import get_collective_groups + + communicators = get_collective_groups( + [actor], backend=self.tensor_transport_backend + ) + return len(communicators) > 0 + + @staticmethod + def extract_tensor_transport_metadata( + gpu_object: List["torch.Tensor"], + ) -> CollectiveTransportMetadata: + tensor_meta = [] + device = None + if gpu_object: + device = 
gpu_object[0].device + for t in gpu_object: + if t.device.type != device.type: + raise ValueError( + "All tensors in an RDT object must have the same device type." + ) + tensor_meta.append((t.shape, t.dtype)) + return CollectiveTransportMetadata( + tensor_meta=tensor_meta, + tensor_device=device, + ) + + @staticmethod + def get_tensor_transport_metadata( + src_actor: "ray.actor.ActorHandle", + obj_id: str, + ) -> CollectiveTransportMetadata: + def __ray_get_tensor_transport_metadata__( + self: "ray.actor.ActorHandle", + obj_id: str, + ) -> CollectiveTransportMetadata: + + from ray._private.worker import global_worker + + gpu_object_store = global_worker.gpu_object_manager.gpu_object_store + # NOTE: We do not specify a timeout here because the user task that returns + # it could take arbitrarily long and we don't want to trigger a spurious + # timeout. + gpu_object = gpu_object_store.wait_and_get_object(obj_id) + return CollectiveTensorTransport.extract_tensor_transport_metadata( + gpu_object + ) + + # Submit a Ray actor task to the source actor to get the tensor metadata. + # The metadata is a list of tuples, where each tuple contains the shape and dtype + # of a tensor in the GPU object store. This function returns an ObjectRef that + # points to the tensor metadata. + # NOTE(swang): We put this task on the background thread to avoid tasks + # executing on the main thread blocking this task. + + return src_actor.__ray_call__.options(concurrency_group="_ray_system").remote( + __ray_get_tensor_transport_metadata__, obj_id + ) + + @staticmethod + def get_communicator_metadata( + src_actor: "ray.actor.ActorHandle", + dst_actor: "ray.actor.ActorHandle", + backend: Optional[str] = None, + ) -> CollectiveCommunicatorMetadata: + + from ray.experimental.collective import get_collective_groups + + communicators = get_collective_groups( + [src_actor, dst_actor], + backend=backend, + ) + # TODO(kevin85421): Support multiple communicators. 
+ if len(communicators) == 0: + raise ValueError( + f"No communicators found for actors {src_actor} and {dst_actor}. " + "Create a communicator with " + "`ray.experimental.collective.create_collective_group` " + "before calling actor tasks. with non-default tensor_transport." + ) + elif len(communicators) > 1: + raise ValueError( + f"There are {len(communicators)} possible communicators that contain actors {src_actor} and {dst_actor}. " + "Currently, RDT objects only support one communicator. Please make sure only " + "one communicator exists." + ) + communicator = communicators[0] + src_rank = communicator.get_rank(src_actor) + if src_rank == -1: + raise ValueError( + f"Sender actor {src_actor} not found in communicator. " + "Please make sure the sender and receiver are in the same communicator." + ) + dst_rank = communicator.get_rank(dst_actor) + if dst_rank == -1: + raise ValueError( + f"Receiver actor {dst_actor} not found in communicator. " + "Please make sure the sender and receiver are in the same communicator." 
+ ) + + communicator_metadata = CollectiveCommunicatorMetadata( + communicator_name=communicator.name, + src_rank=src_rank, + dst_rank=dst_rank, + ) + return communicator_metadata + + @staticmethod + def recv_multiple_tensors( + tensors, + tensor_transport_metadata: CollectiveTransportMetadata, + communicator_metadata: CollectiveCommunicatorMetadata, + ): + from ray.util.collective import types + from ray.util.collective.collective import recv + + assert isinstance( + tensor_transport_metadata, types.CollectiveTransportMetadata + ), "metadata must be a CollectiveTransportMetadata object for non-NIXL transport" + assert isinstance( + communicator_metadata, types.CollectiveCommunicatorMetadata + ), "metadata must be a CollectiveCommunicatorMetadata object for non-NIXL transport" + + for tensor in tensors: + recv( + tensor, + communicator_metadata.src_rank, + communicator_metadata.communicator_name, + ) + + @staticmethod + def send_multiple_tensors( + tensors: List["torch.Tensor"], + tensor_transport_metadata: CollectiveTransportMetadata, + communicator_metadata: CollectiveCommunicatorMetadata, + ): + import ray.util.collective as collective + + device = tensors[0].device if tensors else None + + for tensor in tensors: + if tensor.device.type != device.type: + # TODO(swang): Right now there is no way to catch this error + # and the receiving Ray task will hang. 
+ raise ValueError( + f"tensor device {tensor.device} does not match device {device}" + ) + collective.send( + tensor, + communicator_metadata.dst_rank, + communicator_metadata.communicator_name, + ) diff --git a/python/ray/experimental/collective/communicator.py b/python/ray/experimental/collective/communicator.py index ba4aaecb87ae..2379bf220389 100644 --- a/python/ray/experimental/collective/communicator.py +++ b/python/ray/experimental/collective/communicator.py @@ -1,5 +1,5 @@ -from typing import List from dataclasses import dataclass +from typing import List import ray from ray.util.collective.types import Backend diff --git a/python/ray/experimental/collective/nixl_tensor_transport.py b/python/ray/experimental/collective/nixl_tensor_transport.py new file mode 100644 index 000000000000..40701590e9d0 --- /dev/null +++ b/python/ray/experimental/collective/nixl_tensor_transport.py @@ -0,0 +1,150 @@ +from typing import TYPE_CHECKING, List, Optional + +import ray +from ray.experimental.collective.tensor_transport_manager import ( + TensorTransportManager, +) +from ray.util.collective.collective import get_group_handle +from ray.util.collective.types import ( + NIXL_GROUP_NAME, + Backend, + NixlCommunicatorMetadata, + NixlTransportMetadata, +) + +if TYPE_CHECKING: + import torch + + +class NixlTensorTransport(TensorTransportManager): + @property + def tensor_transport_backend(self) -> Backend: + return Backend.NIXL + + @staticmethod + def is_one_sided() -> bool: + return True + + def actor_has_tensor_transport(self, actor: "ray.actor.ActorHandle") -> bool: + def __ray_actor_has_tensor_transport__( + self: "ray.actor.ActorHandle", + ) -> bool: + try: + nixl_backend = get_group_handle(NIXL_GROUP_NAME) + return nixl_backend is not None + except Exception: + return False + + return ray.get( + actor.__ray_call__.options(concurrency_group="_ray_system").remote( + __ray_actor_has_tensor_transport__ + ) + ) + + @staticmethod + def extract_tensor_transport_metadata( + 
gpu_object: List["torch.Tensor"], + ) -> NixlTransportMetadata: + from ray.util.collective.collective_group.nixl_backend import NixlBackend + from ray.util.collective.types import NixlTransportMetadata + + nixl_backend: NixlBackend = get_group_handle(NIXL_GROUP_NAME) + device = None + tensor_meta = [] + if gpu_object: + serialized_descs, agent_meta = nixl_backend.get_nixl_metadata(gpu_object) + # We assume all tensors in one GPU object have the same device type. + device = gpu_object[0].device + for t in gpu_object: + if t.device.type != device.type: + raise ValueError( + "All tensors in an RDT object must have the same device type." + ) + tensor_meta.append((t.shape, t.dtype)) + else: + serialized_descs, agent_meta = None, None + return NixlTransportMetadata( + tensor_meta=tensor_meta, + tensor_device=device, + nixl_serialized_descs=serialized_descs, + nixl_agent_meta=agent_meta, + ) + + @staticmethod + def get_tensor_transport_metadata( + src_actor: "ray.actor.ActorHandle", + obj_id: str, + ) -> NixlTransportMetadata: + def __ray_get_tensor_transport_metadata__( + self: "ray.actor.ActorHandle", + obj_id: str, + ) -> NixlTransportMetadata: + + from ray._private.worker import global_worker + + gpu_object_store = global_worker.gpu_object_manager.gpu_object_store + # NOTE: We do not specify a timeout here because the user task that returns + # it could take arbitrarily long and we don't want to trigger a spurious + # timeout. + gpu_object = gpu_object_store.wait_and_get_object(obj_id) + + return NixlTensorTransport.extract_tensor_transport_metadata(gpu_object) + + # Submit a Ray actor task to the source actor to get the tensor metadata. + # The metadata is a list of tuples, where each tuple contains the shape and dtype + # of a tensor in the GPU object store. This function returns an ObjectRef that + # points to the tensor metadata. + # NOTE(swang): We put this task on the background thread to avoid tasks + # executing on the main thread blocking this task. 
+ + return src_actor.__ray_call__.options(concurrency_group="_ray_system").remote( + __ray_get_tensor_transport_metadata__, obj_id + ) + + @staticmethod + def get_communicator_metadata( + src_actor: "ray.actor.ActorHandle", + dst_actor: "ray.actor.ActorHandle", + backend: Optional[str] = None, + ) -> NixlCommunicatorMetadata: + + communicator_metadata = NixlCommunicatorMetadata( + communicator_name=NIXL_GROUP_NAME, + ) + + return communicator_metadata + + @staticmethod + def recv_multiple_tensors( + tensors, + tensor_transport_metadata: NixlTransportMetadata, + communicator_metadata: NixlCommunicatorMetadata, + ): + from ray.util.collective import types + from ray.util.collective.collective import get_group_handle + + if tensors: + g = get_group_handle(communicator_metadata.communicator_name) + + assert isinstance( + tensor_transport_metadata, types.NixlTransportMetadata + ), "metadata must be a NixlTransportMetadata object for NIXL transport" + assert isinstance( + communicator_metadata, types.NixlCommunicatorMetadata + ), "metadata must be a NixlCommunicatorMetadata object for NIXL transport" + + g.recv( + tensors, + tensor_transport_metadata.nixl_serialized_descs, + tensor_transport_metadata.nixl_agent_meta, + ) + + @staticmethod + def send_multiple_tensors( + tensors: List["torch.Tensor"], + communicator_metadata: NixlCommunicatorMetadata, + device: "torch.device", + ): + raise NotImplementedError( + "NIXL transport does not support send_multiple_tensors, since it is a one-sided transport." 
+ ) diff --git a/python/ray/experimental/collective/operations.py b/python/ray/experimental/collective/operations.py index 31edbcb9eb28..aa8ba37ecc1f 100644 --- a/python/ray/experimental/collective/operations.py +++ b/python/ray/experimental/collective/operations.py @@ -6,14 +6,14 @@ from ray.dag.constants import ( BIND_INDEX_KEY, COLLECTIVE_OPERATION_KEY, - PARENT_CLASS_NODE_KEY, IS_CLASS_METHOD_OUTPUT_KEY, + PARENT_CLASS_NODE_KEY, ) from ray.experimental.channel.torch_tensor_type import Communicator, TorchTensorType from ray.experimental.util.types import ( - ReduceOp, AllGatherOp, AllReduceOp, + ReduceOp, ReduceScatterOp, _CollectiveOp, ) diff --git a/python/ray/experimental/collective/tensor_transport_manager.py b/python/ray/experimental/collective/tensor_transport_manager.py new file mode 100644 index 000000000000..c86dc4554f17 --- /dev/null +++ b/python/ray/experimental/collective/tensor_transport_manager.py @@ -0,0 +1,192 @@ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, List, Optional + +import ray +from ray.util.collective.types import ( + Backend, + CommunicatorMetadata, + TensorTransportMetadata, +) + +if TYPE_CHECKING: + import torch + + +class TensorTransportManager(ABC): + @property + @abstractmethod + def tensor_transport_backend(self) -> Backend: + """The tensor transport backend, e.g., NCCL. + + Returns: + Backend: The backend of the tensor transport. + """ + + @staticmethod + @abstractmethod + def is_one_sided() -> bool: + """Whether the backend is one-sided. + + Returns: + bool: True if the backend is one-sided, False otherwise. + """ + + @abstractmethod + def actor_has_tensor_transport(self, actor: "ray.actor.ActorHandle") -> bool: + """Whether the actor has the tensor transport available. + + Args: + actor: The actor to check. + + Returns: + bool: True if the actor has the tensor transport available, False otherwise. 
+ """ + + @staticmethod + @abstractmethod + def get_tensor_transport_metadata( + src_actor: "ray.actor.ActorHandle", + obj_id: str, + ) -> TensorTransportMetadata: + """ + Get the tensor transport metadata for the GPU object. + This function retrieves metadata about tensors stored in the GPU object store, + including their shapes, dtypes, and any transport-specific metadata, e.g., NIXL descriptors. + + Args: + src_actor: The actor that runs this function. + obj_id: The ID of the GPU object to get metadata for + + Returns: + TensorTransportMetadata: A named tuple containing the tensor metadata. + """ + + @staticmethod + @abstractmethod + def extract_tensor_transport_metadata( + gpu_object: List["torch.Tensor"], + ) -> TensorTransportMetadata: + """ + Extract the tensor transport metadata from the GPU object. + + Args: + gpu_object: The GPU object to extract the tensor transport metadata from. + + Returns: + TensorTransportMetadata: The tensor transport metadata. + """ + + @staticmethod + @abstractmethod + def get_communicator_metadata( + src_actor: "ray.actor.ActorHandle", + dst_actor: "ray.actor.ActorHandle", + backend: Optional[str] = None, + ) -> CommunicatorMetadata: + """ + Get the communicator metadata (e.g. communicator name, src/dst rank) for the send/recv operation. + This function is called before sending the GPU object. + + Args: + src_actor: The actor that runs this function. + dst_actor: The actor that runs this function. + backend: The backend to use for the collective operation. + + Returns: + CommunicatorMetadata: The communicator metadata. + """ + + @staticmethod + def send_object( + src_actor: "ray.actor.ActorHandle", + obj_id: str, + tensor_transport_meta: TensorTransportMetadata, + communicator_metadata_ref: CommunicatorMetadata, + ): + """ + Send the GPU object to the destination actor. + + Args: + src_actor: The actor that runs this function. + obj_id: The ID of the GPU object to send. 
+ tensor_transport_meta: The tensor transport metadata for the GPU object. + communicator_metadata_ref: The ObjectRef of communicator metadata for the send/recv operation. + """ + from ray.experimental.gpu_object_manager.gpu_object_store import __ray_send__ + + # Send tensors stored in the `src_actor`'s GPU object store to the + # destination rank `dst_rank`. + # NOTE(swang): We put this task on the background thread to avoid tasks + # executing on the main thread blocking the data transfer. + src_actor.__ray_call__.options(concurrency_group="_ray_system").remote( + __ray_send__, + obj_id, + tensor_transport_meta, + communicator_metadata_ref, + ) + + @staticmethod + def recv_object( + dst_actor: "ray.actor.ActorHandle", + obj_id: str, + tensor_transport_metadata_ref: TensorTransportMetadata, + communicator_metadata_ref: CommunicatorMetadata, + ): + """ + Receive the GPU object from the source actor. + This function receives tensors from the source rank and stores them in the + `dst_actor`'s GPU object store. + + Args: + dst_actor: The actor that runs this function. + obj_id: The ID of the GPU object to receive. + tensor_transport_metadata_ref: The ObjectRef of tensor transport metadata for the GPU object. + communicator_metadata_ref: The ObjectRef of communicator metadata for the send/recv operation. + """ + from ray.experimental.gpu_object_manager.gpu_object_store import __ray_recv__ + + # Receive tensors from the source rank and store them in the + # `dst_actor`'s GPU object store. + # + # NOTE(swang): We put this task on the background thread to avoid tasks + # executing on the main thread blocking the data transfer. Technically, + # this is only needed for the sender task, but we put the receiver task + # on the same background thread to ensure that all communication + # operations are executed in a global order. 
+ dst_actor.__ray_call__.options(concurrency_group="_ray_system").remote( + __ray_recv__, + obj_id, + tensor_transport_metadata_ref, + communicator_metadata_ref, + ) + + @staticmethod + @abstractmethod + def recv_multiple_tensors( + tensors: List["torch.Tensor"], + tensor_transport_metadata: TensorTransportMetadata, + communicator_metadata: CommunicatorMetadata, + ): + """ + Receive multiple tensors from the source actor. + + Args: + tensors: The pre-allocated tensor space to receive the tensors. + tensor_transport_metadata: The tensor transport metadata for the GPU object. + communicator_metadata: The communicator metadata for the send/recv operation. + + """ + + @staticmethod + @abstractmethod + def send_multiple_tensors( + tensors: List["torch.Tensor"], + communicator_metadata: CommunicatorMetadata, + ): + """ + Send multiple tensors to the destination actor. + + Args: + tensors: The tensors to send. + communicator_metadata: The communicator metadata for the send/recv operation. + """ diff --git a/python/ray/experimental/collective/util.py b/python/ray/experimental/collective/util.py index ea518a002458..5b09697247f3 100644 --- a/python/ray/experimental/collective/util.py +++ b/python/ray/experimental/collective/util.py @@ -1,8 +1,64 @@ -from typing import Tuple -from contextlib import closing import socket +from contextlib import closing +from typing import TYPE_CHECKING, Tuple import ray +from ray.experimental.collective.collective_tensor_transport import ( + CollectiveTensorTransport, +) +from ray.experimental.collective.nixl_tensor_transport import NixlTensorTransport +from ray.experimental.collective.tensor_transport_manager import TensorTransportManager +from ray.util.collective.types import Backend + +if TYPE_CHECKING: + import torch + +# Singleton instances for tensor transport managers +_nixl_tensor_transport_manager = None +_gloo_tensor_transport_manager = None +_nccl_tensor_transport_manager = None + + +def get_tensor_transport_manager( + 
tensor_transport: Backend, +) -> "TensorTransportManager": + """Get the tensor transport manager for the given tensor transport protocol. + + Args: + tensor_transport: The tensor transport protocol to use for the GPU object. + + Returns: + TensorTransportManager: The tensor transport manager for the given tensor transport protocol. + """ + if tensor_transport == Backend.NIXL: + global _nixl_tensor_transport_manager + if _nixl_tensor_transport_manager is None: + _nixl_tensor_transport_manager = NixlTensorTransport() + return _nixl_tensor_transport_manager + elif tensor_transport == Backend.TORCH_GLOO: + global _gloo_tensor_transport_manager + if _gloo_tensor_transport_manager is None: + _gloo_tensor_transport_manager = CollectiveTensorTransport(tensor_transport) + return _gloo_tensor_transport_manager + elif tensor_transport == Backend.NCCL: + global _nccl_tensor_transport_manager + if _nccl_tensor_transport_manager is None: + _nccl_tensor_transport_manager = CollectiveTensorTransport(tensor_transport) + return _nccl_tensor_transport_manager + else: + raise ValueError(f"Unsupported tensor transport protocol: {tensor_transport}") + + +def device_match_transport(device: "torch.device", tensor_transport: Backend) -> bool: + """Check if the device matches the transport.""" + if tensor_transport == Backend.NIXL: + return device.type == "cuda" or device.type == "cpu" + elif tensor_transport == Backend.TORCH_GLOO: + return device.type == "cpu" + elif tensor_transport == Backend.NCCL: + return device.type == "cuda" + else: + raise ValueError(f"Unsupported tensor transport protocol: {tensor_transport}") def find_free_port() -> int: diff --git a/python/ray/experimental/gpu_object_manager/__init__.py b/python/ray/experimental/gpu_object_manager/__init__.py index 9cc5b4d9c981..13be59395445 100644 --- a/python/ray/experimental/gpu_object_manager/__init__.py +++ b/python/ray/experimental/gpu_object_manager/__init__.py @@ -1,3 +1,6 @@ -from 
ray.experimental.gpu_object_manager.gpu_object_manager import GPUObjectManager +from ray.experimental.gpu_object_manager.gpu_object_manager import ( + GPUObjectManager, + wait_tensor_freed, +) -__all__ = ["GPUObjectManager"] +__all__ = ["GPUObjectManager", "wait_tensor_freed"] diff --git a/python/ray/experimental/gpu_object_manager/gpu_object_manager.py b/python/ray/experimental/gpu_object_manager/gpu_object_manager.py index 06a2b040e49e..73ac927e355e 100644 --- a/python/ray/experimental/gpu_object_manager/gpu_object_manager.py +++ b/python/ray/experimental/gpu_object_manager/gpu_object_manager.py @@ -1,21 +1,22 @@ -from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Tuple import threading +import warnings +from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Set, Tuple import ray +from ray._private import ray_constants from ray._private.custom_types import TensorTransportEnum from ray._raylet import ObjectRef -from ray._private import ray_constants - if TYPE_CHECKING: import torch + from ray.experimental.gpu_object_manager.gpu_object_store import ( GPUObjectStore, - GPUObject, ) + from ray.util.collective.types import TensorTransportMetadata # GPUObjectMeta is a named tuple containing the source actor, tensor transport -# backend, and tensor metadata. +# backend, tensor metadata, and other information that needs to be recorded. # - The tensor transport backend is the backend used to transport the tensors. # Currently, the supported backends are "nccl" and "torch_gloo". # - The tensor metadata is a list of tuples, each containing the shape and dtype @@ -25,7 +26,36 @@ class GPUObjectMeta(NamedTuple): # Must be a valid backend name as defined in # `ray.util.collective.types.Backend`. tensor_transport_backend: str - tensor_meta: List[Tuple["torch.Size", "torch.dtype"]] + tensor_transport_meta: "TensorTransportMetadata" + # sent_dest_actors tracks the set of actor IDs that this object has been sent to. 
+ sent_dest_actors: Set[str] + # sent_to_src_actor_and_others_warned indicates whether the object has already triggered a warning about being sent back to the source actor and other actors simultaneously. + sent_to_src_actor_and_others_warned: bool + + +# TODO(swang): Uncomment and add an API docs page and example usage. +# @PublicAPI(stability="alpha") +def wait_tensor_freed(tensor: "torch.Tensor", timeout: Optional[float] = None): + """ + Wait for the tensor to be freed from this actor's GPU object store. + + This function is useful for cases where an actor keeps a reference to a + tensor after returning the tensor from a task annotated with + `@ray.method(tensor_transport=...)`. Tensors that are returned by these + tasks may be sent to other actors while the corresponding `ray.ObjectRef` is + still in scope. If the actor modifies the tensor while it is still in the + actor's GPU object store, then Ray may end up sending invalid data to other + tasks. Call this function to ensure that the `ray.ObjectRef` has gone out of + scope and therefore the tensor is safe to write to again. + + Args: + tensor: The tensor to wait to be freed. + timeout: The timeout in seconds. Set to None to wait indefinitely. Note + that this function could then hang if the `ray.ObjectRef` that + refers to this tensor never goes out of scope. + """ + gpu_object_manager = ray.worker.global_worker.gpu_object_manager + gpu_object_manager.gpu_object_store.wait_tensor_freed(tensor, timeout) class GPUObjectManager: @@ -34,6 +64,7 @@ def __init__(self): # This dictionary is hosted on the "driver" process of the actors that # store and send/receive GPU objects. self.managed_gpu_object_metadata: Dict[str, GPUObjectMeta] = {} + # Per-actor local storage for GPU objects. 
We create the GPU object # store lazily, if a user specifies a non-default tensor_transport, to # avoid circular import and because it imports third-party dependencies @@ -53,23 +84,6 @@ def gpu_object_store(self) -> "ray.experimental.GPUObjectStore": self._gpu_object_store = GPUObjectStore() return self._gpu_object_store - def _get_tensor_meta( - self, src_actor: "ray.actor.ActorHandle", obj_id: str - ) -> ObjectRef: - from ray.experimental.gpu_object_manager.gpu_object_store import ( - __ray_get_tensor_meta__, - ) - - # Submit a Ray actor task to the source actor to get the tensor metadata. - # The metadata is a list of tuples, where each tuple contains the shape and dtype - # of a tensor in the GPU object store. This function returns an ObjectRef that - # points to the tensor metadata. - # NOTE(swang): We put this task on the background thread to avoid tasks - # executing on the main thread blocking this task. - return src_actor.__ray_call__.options(concurrency_group="_ray_system").remote( - __ray_get_tensor_meta__, obj_id - ) - def is_managed_object(self, obj_id: str) -> bool: """ Check if the GPU object is managed by this process. @@ -83,11 +97,25 @@ def is_managed_object(self, obj_id: str) -> bool: """ return obj_id in self.managed_gpu_object_metadata + def add_gpu_object_metadata( + self, obj_ref: ObjectRef, gpu_object_meta: GPUObjectMeta + ): + """ + Add the GPU object metadata to the GPU object manager. + + Args: + obj_ref: The ObjectRef of the GPU object. + gpu_object_meta: The GPU object metadata. + """ + obj_id = obj_ref.hex() + self.managed_gpu_object_metadata[obj_id] = gpu_object_meta + def add_gpu_object_ref( self, obj_ref: ObjectRef, src_actor: "ray.actor.ActorHandle", tensor_transport: TensorTransportEnum, + tensor_transport_meta: Optional["TensorTransportMetadata"] = None, ): """Add a GPU object reference to the GPU object manager. 
This should be called whenever the current process calls a task that is annotated with @@ -97,7 +125,9 @@ def add_gpu_object_ref( obj_ref: The ObjectRef of the task output. src_actor: The actor that executes the task and that creates the GPU object. tensor_transport: The tensor transport protocol to use for the GPU object. + tensor_transport_meta: The tensor transport metadata that is pre-computed. """ + from ray.experimental.collective import get_tensor_transport_manager from ray.experimental.gpu_object_manager.gpu_object_store import ( _tensor_transport_to_collective_backend, ) @@ -106,57 +136,32 @@ def add_gpu_object_ref( tensor_transport ) obj_id = obj_ref.hex() - tensor_meta = self._get_tensor_meta(src_actor, obj_id) + tensor_transport_manager = get_tensor_transport_manager( + tensor_transport_backend + ) + if not tensor_transport_meta: + tensor_meta = tensor_transport_manager.get_tensor_transport_metadata( + src_actor, obj_id + ) + else: + tensor_meta = tensor_transport_meta self.managed_gpu_object_metadata[obj_id] = GPUObjectMeta( src_actor=src_actor, tensor_transport_backend=tensor_transport_backend, - tensor_meta=tensor_meta, + tensor_transport_meta=tensor_meta, + sent_dest_actors=set(), + sent_to_src_actor_and_others_warned=False, ) def _get_gpu_object_metadata(self, obj_ref: ObjectRef) -> GPUObjectMeta: obj_id = obj_ref.hex() return self.managed_gpu_object_metadata[obj_id] - def _send_object( - self, - communicator_name: str, - src_actor: "ray.actor.ActorHandle", - obj_id: str, - dst_rank: int, - ): - from ray.experimental.gpu_object_manager.gpu_object_store import __ray_send__ - - # Send tensors stored in the `src_actor`'s GPU object store to the - # destination rank `dst_rank`. - # NOTE(swang): We put this task on the background thread to avoid tasks - # executing on the main thread blocking the data transfer. 
- src_actor.__ray_call__.options(concurrency_group="_ray_system").remote( - __ray_send__, communicator_name, obj_id, dst_rank - ) - - def _recv_object( + def fetch_object( self, - communicator_name: str, - dst_actor: "ray.actor.ActorHandle", obj_id: str, - src_rank: int, - tensor_meta: List[Tuple["torch.Size", "torch.dtype"]], + tensor_transport: TensorTransportEnum = TensorTransportEnum.OBJECT_STORE, ): - from ray.experimental.gpu_object_manager.gpu_object_store import __ray_recv__ - - # Receive tensors from the source rank and store them in the - # `dst_actor`'s GPU object store. - # - # NOTE(swang): We put this task on the background thread to avoid tasks - # executing on the main thread blocking the data transfer. Technically, - # this is only needed for the sender task, but we put the receiver task - # on the same background thread to ensure that all communication - # operations are executed in a global order. - dst_actor.__ray_call__.options(concurrency_group="_ray_system").remote( - __ray_recv__, communicator_name, obj_id, src_rank, tensor_meta - ) - - def fetch_object(self, obj_id: str): """ Fetches the GPU object from the source actor's GPU object store via the object store instead of out-of-band tensor transfer and stores the tensors in the local GPU object store. @@ -167,25 +172,45 @@ def fetch_object(self, obj_id: str): Args: obj_id: The object ID of the GPU object. + tensor_transport: The tensor transport to use to fetch the GPU object. 
Returns: None """ + from ray.experimental.collective import get_tensor_transport_manager from ray.experimental.gpu_object_manager.gpu_object_store import ( __ray_fetch_gpu_object__, ) if self.gpu_object_store.has_object(obj_id): return - gpu_object_meta = self.managed_gpu_object_metadata[obj_id] src_actor = gpu_object_meta.src_actor - tensors = ray.get( - src_actor.__ray_call__.options(concurrency_group="_ray_system").remote( - __ray_fetch_gpu_object__, obj_id - ) + tensor_transport_backend = gpu_object_meta.tensor_transport_backend + tensor_transport_manager = get_tensor_transport_manager( + tensor_transport_backend + ) + tensor_transport_meta = gpu_object_meta.tensor_transport_meta + use_object_store = ( + tensor_transport == TensorTransportEnum.OBJECT_STORE + or isinstance(tensor_transport_meta, ObjectRef) ) - self.gpu_object_store.add_object(obj_id, tensors) + if use_object_store: + tensors = ray.get( + src_actor.__ray_call__.options(concurrency_group="_ray_system").remote( + __ray_fetch_gpu_object__, obj_id + ) + ) + self.gpu_object_store.add_object(obj_id, tensors) + else: + from ray.experimental.gpu_object_manager.gpu_object_store import ( + __ray_recv__, + ) + + communicator_meta = tensor_transport_manager.get_communicator_metadata( + None, None, tensor_transport_backend + ) + __ray_recv__(None, obj_id, tensor_transport_meta, communicator_meta) def trigger_out_of_band_tensor_transfer( self, dst_actor: "ray.actor.ActorHandle", task_args: Tuple[Any, ...] @@ -207,6 +232,7 @@ def trigger_out_of_band_tensor_transfer( dst_actor: The target actor to receive tensors task_args: List of arguments for the target actor task that may contain ObjectRefs. 
""" + gpu_object_refs = set() for arg in task_args: # If an ObjectRef is managed, it means the actual value is a list of tensors stored @@ -216,65 +242,90 @@ def trigger_out_of_band_tensor_transfer( continue if self.is_managed_object(arg.hex()): gpu_object_refs.add(arg) + if gpu_object_refs: + from ray.experimental.collective import get_tensor_transport_manager # Count the number of readers for each GPU object. for obj_ref in gpu_object_refs: # Import get_collective_groups here to avoid dependency on # collective libraries for default Ray installation. - from ray.experimental.collective import get_collective_groups gpu_object_meta = self._get_gpu_object_metadata(obj_ref) src_actor = gpu_object_meta.src_actor - tensor_meta = gpu_object_meta.tensor_meta - communicators = get_collective_groups( - [src_actor, dst_actor], backend=gpu_object_meta.tensor_transport_backend - ) - # TODO(kevin85421): Support multiple communicators. - if len(communicators) == 0: - raise ValueError( - f"No communicators found for actors {src_actor} and {dst_actor}. " - "Create a communicator with " - "`ray.experimental.collective.create_collective_group` " - "before calling actor tasks." - ) - elif len(communicators) > 1: - raise ValueError( - f"There are {len(communicators)} possible communicators that contain actors {src_actor} and {dst_actor}. " - "Currently, GPU objects only support one communicator. Please make sure only " - "one communicator exists." - ) - communicator = communicators[0] - src_rank = communicator.get_rank(src_actor) - if src_rank == -1: - raise ValueError( - f"Sender actor {src_actor} not found in communicator. " - "Please make sure the sender and receiver are in the same communicator." 
+ tensor_transport_meta = gpu_object_meta.tensor_transport_meta + + obj_id = obj_ref.hex() + + # Update the set of destination actors for this object + # The set inside NamedTuple is mutable, so we can modify it directly + gpu_object_meta.sent_dest_actors.add(dst_actor._actor_id) + # Check if a warning should be triggered for this object: + # 1. object has not triggered a warning yet. + # 2. object is sent back to its source actor. + # 3. object is also sent to at least one other actor + if ( + not gpu_object_meta.sent_to_src_actor_and_others_warned + and src_actor._actor_id in gpu_object_meta.sent_dest_actors + and len(gpu_object_meta.sent_dest_actors) > 1 + ): + warnings.warn( + f"GPU ObjectRef({obj_id}) is being passed back to the actor that created it {src_actor}. " + "Note that GPU objects are mutable. If the tensor is modified, Ray's internal copy will also be updated, and subsequent passes to other actors " + "will receive the updated version instead of the original.", + UserWarning, ) - dst_rank = communicator.get_rank(dst_actor) - if dst_rank == -1: - raise ValueError( - f"Receiver actor {dst_actor} not found in communicator. " - "Please make sure the sender and receiver are in the same communicator." + # Mark the object as warned by creating a new NamedTuple instance + self.managed_gpu_object_metadata[obj_id] = gpu_object_meta._replace( + sent_to_src_actor_and_others_warned=True ) - if src_rank == dst_rank: - # If the source and destination ranks are the same, the tensors can + + if src_actor._actor_id == dst_actor._actor_id: + # If the source and destination actors are the same, the tensors can # be transferred intra-process, so we skip the out-of-band tensor # transfer. 
continue - obj_id = obj_ref.hex() - self._send_object(communicator.name, src_actor, obj_id, dst_rank) - self._recv_object( - communicator.name, dst_actor, obj_id, src_rank, tensor_meta + + tensor_transport_manager = get_tensor_transport_manager( + gpu_object_meta.tensor_transport_backend + ) + communicator_meta = tensor_transport_manager.get_communicator_metadata( + src_actor, + dst_actor, + gpu_object_meta.tensor_transport_backend, + ) + if not tensor_transport_manager.is_one_sided(): + tensor_transport_manager.send_object( + src_actor, + obj_id, + tensor_transport_meta, + communicator_meta, + ) + tensor_transport_manager.recv_object( + dst_actor, + obj_id, + tensor_transport_meta, + communicator_meta, ) - def get_gpu_object(self, object_id: str) -> "GPUObject": + def get_gpu_object( + self, + object_id: str, + tensor_transport: TensorTransportEnum = TensorTransportEnum.OBJECT_STORE, + ) -> List["torch.Tensor"]: """ Get the GPU object for a given object ID. + + Args: + object_id: The object ID of the GPU object. + tensor_transport: The tensor transport to use to fetch the GPU object. + + Returns: + The GPU object. """ gpu_object_store = self.gpu_object_store if self.is_managed_object(object_id): - self.fetch_object(object_id) + self.fetch_object(object_id, tensor_transport) # If the GPU object is the primary copy, it means the transfer is intra-actor. # In this case, we should not remove the GPU object after it is consumed once, @@ -290,3 +341,68 @@ def get_gpu_object(self, object_id: str) -> "GPUObject": object_id, timeout=ray_constants.FETCH_FAIL_TIMEOUT_SECONDS ) return gpu_object + + def actor_has_tensor_transport( + self, actor: "ray.actor.ActorHandle", tensor_transport: TensorTransportEnum + ): + """ + Check if the actor has a communicator for the given tensor transport backend. + + Args: + actor: The actor to check. + tensor_transport: The tensor transport backend to check. 
+ + Returns: + True if the actor has a communicator for the given tensor transport backend, False otherwise. + """ + # Import get_collective_groups here to avoid dependency on + # collective libraries for default Ray installation. + from ray.experimental.collective import get_tensor_transport_manager + from ray.experimental.gpu_object_manager.gpu_object_store import ( + _tensor_transport_to_collective_backend, + ) + + tensor_transport_backend = _tensor_transport_to_collective_backend( + tensor_transport + ) + tensor_transport_manager = get_tensor_transport_manager( + tensor_transport_backend + ) + return tensor_transport_manager.actor_has_tensor_transport(actor) + + def put_object( + self, + obj_ref: ObjectRef, + tensor_transport: TensorTransportEnum, + tensors: List["torch.Tensor"], + ): + """ + Put the GPU object into the GPU object manager. + + Args: + obj_ref: The object ref of the GPU object. + tensor_transport: The tensor transport backend to use. + tensors: The tensors to put into the GPU object manager. 
+ + """ + from ray.experimental.collective import get_tensor_transport_manager + from ray.experimental.gpu_object_manager.gpu_object_store import ( + _tensor_transport_to_collective_backend, + ) + + tensor_transport_backend = _tensor_transport_to_collective_backend( + tensor_transport + ) + transport_manager = get_tensor_transport_manager(tensor_transport_backend) + tensor_transport_meta = transport_manager.extract_tensor_transport_metadata( + tensors + ) + + src_actor = ray.get_runtime_context().current_actor + self.gpu_object_store.add_object(obj_ref.hex(), tensors, is_primary=True) + self.add_gpu_object_ref( + obj_ref, + src_actor, + tensor_transport, + tensor_transport_meta=tensor_transport_meta, + ) diff --git a/python/ray/experimental/gpu_object_manager/gpu_object_store.py b/python/ray/experimental/gpu_object_manager/gpu_object_store.py index b16e056685c7..3bd9f532ad1d 100644 --- a/python/ray/experimental/gpu_object_manager/gpu_object_store.py +++ b/python/ray/experimental/gpu_object_manager/gpu_object_store.py @@ -1,11 +1,17 @@ -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple import threading +from collections import defaultdict, deque +from dataclasses import dataclass +from typing import Dict, List, Optional, Set import ray.util.collective as collective from ray._private.custom_types import TensorTransportEnum -from ray.util.collective.types import Backend - +from ray.experimental.collective import get_tensor_transport_manager +from ray.experimental.collective.util import device_match_transport +from ray.util.collective.types import ( + Backend, + CommunicatorMetadata, + TensorTransportMetadata, +) try: import torch @@ -18,11 +24,7 @@ TENSOR_TRANSPORT_TO_COLLECTIVE_BACKEND = { TensorTransportEnum.NCCL: Backend.NCCL, TensorTransportEnum.GLOO: Backend.TORCH_GLOO, -} - -COLLECTIVE_BACKEND_TO_TORCH_DEVICE = { - Backend.NCCL: torch.device("cuda"), - Backend.TORCH_GLOO: torch.device("cpu"), + TensorTransportEnum.NIXL: 
Backend.NIXL, } @@ -37,7 +39,12 @@ def _tensor_transport_to_collective_backend( ) -def __ray_send__(self, communicator_name: str, obj_id: str, dst_rank: int): +def __ray_send__( + self, + obj_id: str, + tensor_transport_meta: TensorTransportMetadata, + communicator_meta: CommunicatorMetadata, +): """Helper function that runs on the src actor to send tensors to the dst actor.""" from ray._private.worker import global_worker @@ -45,54 +52,56 @@ def __ray_send__(self, communicator_name: str, obj_id: str, dst_rank: int): assert gpu_object_store.has_object( obj_id ), f"obj_id={obj_id} not found in GPU object store" - tensors = gpu_object_store.get_object(obj_id).data - backend = collective.get_group_handle(communicator_name).backend() - device = COLLECTIVE_BACKEND_TO_TORCH_DEVICE[backend] + tensors = gpu_object_store.get_object(obj_id) - for tensor in tensors: - if tensor.device.type != device.type: - # TODO(swang): Right now there is no way to catch this error - # and the receiving Ray task will hang. - raise ValueError( - f"tensor device {tensor.device} does not match device {device}" - ) - collective.send(tensor, dst_rank, group_name=communicator_name) + backend = collective.get_group_handle(communicator_meta.communicator_name).backend() + + tensor_transport_manager = get_tensor_transport_manager(backend) + if tensors and not device_match_transport(tensors[0].device, backend): + raise ValueError( + f"Tensor transport backend {backend} does not support tensor transfer on device {tensors[0].device}." 
+ ) + tensor_transport_manager.send_multiple_tensors( + tensors, + tensor_transport_meta, + communicator_meta, + ) def __ray_recv__( self, - communicator_name: str, obj_id: str, - src_rank: int, - tensor_meta: List[Tuple["torch.Size", "torch.dtype"]], + tensor_transport_meta: TensorTransportMetadata, + communicator_meta: CommunicatorMetadata, ): """Helper function that runs on the dst actor to receive tensors from the src actor.""" from ray._private.worker import global_worker - backend = collective.get_group_handle(communicator_name).backend() - device = COLLECTIVE_BACKEND_TO_TORCH_DEVICE[backend] + backend = collective.get_group_handle(communicator_meta.communicator_name).backend() + + device = tensor_transport_meta.tensor_device + tensor_meta = tensor_transport_meta.tensor_meta gpu_object_store = global_worker.gpu_object_manager.gpu_object_store + if tensor_meta and not device_match_transport(device, backend): + raise ValueError( + f"Tensor transport backend {backend} does not support tensor transfer on device {device}." + ) tensors = [] for meta in tensor_meta: shape, dtype = meta tensor = torch.zeros(shape, dtype=dtype, device=device) - collective.recv(tensor, src_rank, group_name=communicator_name) tensors.append(tensor) - gpu_object_store.add_object(obj_id, tensors) + tensor_transport_manager = get_tensor_transport_manager(backend) + tensor_transport_manager.recv_multiple_tensors( + tensors, + tensor_transport_meta, + communicator_meta, + ) -def __ray_get_tensor_meta__(self, obj_id: str): - """Helper function that runs on the src actor to get the tensor metadata.""" - from ray._private.worker import global_worker - - gpu_object_store = global_worker.gpu_object_manager.gpu_object_store - # NOTE: We do not specify a timeout here because the user task that returns - # it could take arbitrarily long and we don't want to trigger a spurious - # timeout. 
- gpu_object = gpu_object_store.wait_and_get_object(obj_id) - return [(t.shape, t.dtype) for t in gpu_object.data] + gpu_object_store.add_object(obj_id, tensors) def __ray_fetch_gpu_object__(self, obj_id: str): @@ -104,21 +113,15 @@ def __ray_fetch_gpu_object__(self, obj_id: str): obj_id ), f"obj_id={obj_id} not found in GPU object store" gpu_object = gpu_object_store.get_object(obj_id) - return gpu_object.data + return gpu_object @dataclass -class GPUObject: +class _GPUObject: # A list of tensors representing the GPU object. data: List["torch.Tensor"] # Whether the GPU object is the primary copy. is_primary: bool - # The number of reads allowed to the GPU object before it will be GCed from this actor. - # This is used to implement garbage collection for receiver actors, - # handling cases where the same GPU object reference is passed to the - # same actor task multiple times. For sender actors, we still rely on - # the object store's reference counting mechanism. - num_readers: int = 0 class GPUObjectStore: @@ -134,22 +137,33 @@ class GPUObjectStore: """ def __init__(self): - # A dictionary that maps from an object ID to a list of tensors. + # A dictionary that maps from an object ID to a queue of tensor lists. # - # Note: Currently, `gpu_object_store` is only supported for Ray Actors. - self._gpu_object_store: Dict[str, GPUObject] = {} + # Note: Currently, `_gpu_object_store` is only supported for Ray Actors. + self._gpu_object_store: Dict[str, deque[_GPUObject]] = defaultdict(deque) + # Mapping from tensor to the IDs of objects that contain it. + self._tensor_to_object_ids: Dict["torch.Tensor", Set[str]] = defaultdict(set) # Synchronization for GPU object store. self._lock = threading.RLock() # Signal when an object becomes present in the object store. self._object_present_cv = threading.Condition(self._lock) + # Signal when an object is freed from the object store. 
+ self._object_freed_cv = threading.Condition(self._lock) def has_object(self, obj_id: str) -> bool: with self._lock: - return obj_id in self._gpu_object_store + existed = obj_id in self._gpu_object_store + if existed: + return len(self._gpu_object_store[obj_id]) > 0 + return existed - def get_object(self, obj_id: str) -> Optional[GPUObject]: + def has_tensor(self, tensor: "torch.Tensor") -> bool: with self._lock: - return self._gpu_object_store[obj_id] + return tensor in self._tensor_to_object_ids + + def get_object(self, obj_id: str) -> Optional[List["torch.Tensor"]]: + with self._lock: + return self._gpu_object_store[obj_id][0].data def add_object( self, @@ -166,22 +180,26 @@ def add_object( is_primary: Whether the GPU object is the primary copy. """ with self._object_present_cv: - self._gpu_object_store[obj_id] = GPUObject( - gpu_object, - is_primary, + for tensor in gpu_object: + self._tensor_to_object_ids[tensor].add(obj_id) + # Append to the queue instead of overwriting + self._gpu_object_store[obj_id].append( + _GPUObject( + gpu_object, + is_primary, + ) ) self._object_present_cv.notify_all() def is_primary_copy(self, obj_id: str) -> bool: with self._lock: return ( - obj_id in self._gpu_object_store - and self._gpu_object_store[obj_id].is_primary + self.has_object(obj_id) and self._gpu_object_store[obj_id][0].is_primary ) def wait_and_get_object( self, obj_id: str, timeout: Optional[float] = None - ) -> GPUObject: + ) -> List["torch.Tensor"]: """Atomically waits for the GPU object to be present in the GPU object store, then gets it. If the object is not present after the optional timeout, raise a TimeoutError. @@ -200,7 +218,7 @@ def wait_and_get_object( def wait_and_pop_object( self, obj_id: str, timeout: Optional[float] = None - ) -> GPUObject: + ) -> List["torch.Tensor"]: """Atomically waits for the GPU object to be present in the GPU object store, then pops it. If the object is not present after the optional timeout, raise a TimeoutError. 
@@ -230,24 +248,48 @@ def _wait_object(self, obj_id: str, timeout: Optional[float] = None) -> None: indefinitely. """ with self._object_present_cv: - present = self._object_present_cv.wait_for( - lambda: obj_id in self._gpu_object_store, timeout=timeout - ) - if not present: + if not self._object_present_cv.wait_for( + lambda: self.has_object(obj_id), + timeout=timeout, + ): raise TimeoutError( - f"ObjectRef({obj_id}) not found in GPU object store after {timeout}s, transfer may have failed. Please report this issue on GitHub: https://github.com/ray-project/ray/issues/new/choose" + f"ObjectRef({obj_id}) not found in RDT object store after {timeout}s, transfer may have failed. Please report this issue on GitHub: https://github.com/ray-project/ray/issues/new/choose" ) - def pop_object(self, obj_id: str) -> GPUObject: + def pop_object(self, obj_id: str) -> List["torch.Tensor"]: with self._lock: - assert ( - obj_id in self._gpu_object_store + assert self.has_object( + obj_id ), f"obj_id={obj_id} not found in GPU object store" - return self._gpu_object_store.pop(obj_id) + queue = self._gpu_object_store.get(obj_id) + gpu_object = queue.popleft() + if len(queue) == 0: + del self._gpu_object_store[obj_id] + for tensor in gpu_object.data: + self._tensor_to_object_ids[tensor].remove(obj_id) + if len(self._tensor_to_object_ids[tensor]) == 0: + self._tensor_to_object_ids.pop(tensor) + self._object_freed_cv.notify_all() + return gpu_object.data + + def wait_tensor_freed( + self, tensor: "torch.Tensor", timeout: Optional[float] = None + ) -> None: + """ + Wait for the object to be freed from the GPU object store. + """ + with self._object_freed_cv: + if not self._object_freed_cv.wait_for( + lambda: tensor not in self._tensor_to_object_ids, timeout=timeout + ): + raise TimeoutError( + f"Tensor {tensor} not freed from RDT object store after {timeout}s. The tensor will not be freed until all ObjectRefs containing the tensor have gone out of scope." 
+ ) def get_num_objects(self) -> int: """ Return the number of objects in the GPU object store. """ with self._lock: - return len(self._gpu_object_store) + # Count total objects across all queues + return sum(len(queue) for queue in self._gpu_object_store.values()) diff --git a/python/ray/includes/array.pxd b/python/ray/includes/array.pxd new file mode 100644 index 000000000000..a6ce5e135a70 --- /dev/null +++ b/python/ray/includes/array.pxd @@ -0,0 +1,6 @@ +from libc.stddef cimport size_t +from libcpp.string cimport string + +cdef extern from "" namespace "std": + cdef cppclass array_string_2 "std::array": + string& operator[](size_t) except + diff --git a/python/ray/includes/common.pxd b/python/ray/includes/common.pxd index f9a6314dea1d..ec40d7c43f8c 100644 --- a/python/ray/includes/common.pxd +++ b/python/ray/includes/common.pxd @@ -158,7 +158,7 @@ cdef extern from "ray/common/id.h" namespace "ray" nogil: cdef extern from "src/ray/protobuf/common.pb.h" nogil: - cdef cppclass CLanguage "Language": + cdef cppclass CLanguage "ray::Language": pass cdef cppclass CWorkerType "ray::core::WorkerType": pass @@ -204,6 +204,7 @@ cdef extern from "src/ray/protobuf/common.pb.h" nogil: CAddress owner_address() const const c_string &object_id() const const c_string &call_site() const + CTensorTransport tensor_transport() const cdef cppclass CNodeLabelSchedulingStrategy "ray::rpc::NodeLabelSchedulingStrategy": # noqa: E501 CNodeLabelSchedulingStrategy() CLabelMatchExpressions* mutable_hard() @@ -242,9 +243,9 @@ cdef extern from "src/ray/protobuf/common.pb.h" nogil: # This is a workaround for C++ enum class since Cython has no corresponding # representation. 
cdef extern from "src/ray/protobuf/common.pb.h" nogil: - cdef CLanguage LANGUAGE_PYTHON "Language::PYTHON" - cdef CLanguage LANGUAGE_CPP "Language::CPP" - cdef CLanguage LANGUAGE_JAVA "Language::JAVA" + cdef CLanguage LANGUAGE_PYTHON "ray::Language::PYTHON" + cdef CLanguage LANGUAGE_CPP "ray::Language::CPP" + cdef CLanguage LANGUAGE_JAVA "ray::Language::JAVA" cdef extern from "src/ray/protobuf/common.pb.h" nogil: cdef CWorkerType WORKER_TYPE_WORKER "ray::core::WorkerType::WORKER" @@ -263,6 +264,9 @@ cdef extern from "src/ray/protobuf/common.pb.h" nogil: cdef extern from "src/ray/protobuf/common.pb.h" nogil: cdef CTensorTransport TENSOR_TRANSPORT_OBJECT_STORE "ray::rpc::TensorTransport::OBJECT_STORE" + cdef CTensorTransport TENSOR_TRANSPORT_NCCL "ray::rpc::TensorTransport::NCCL" + cdef CTensorTransport TENSOR_TRANSPORT_GLOO "ray::rpc::TensorTransport::GLOO" + cdef CTensorTransport TENSOR_TRANSPORT_NIXL "ray::rpc::TensorTransport::NIXL" cdef extern from "src/ray/protobuf/common.pb.h" nogil: cdef CPlacementStrategy PLACEMENT_STRATEGY_PACK \ @@ -318,7 +322,8 @@ cdef extern from "ray/core_worker/common.h" nogil: cdef cppclass CTaskArgByReference "ray::TaskArgByReference": CTaskArgByReference(const CObjectID &object_id, const CAddress &owner_address, - const c_string &call_site) + const c_string &call_site, + const CTensorTransport &tensor_transport) cdef cppclass CTaskArgByValue "ray::TaskArgByValue": CTaskArgByValue(const shared_ptr[CRayObject] &data) @@ -360,6 +365,7 @@ cdef extern from "ray/core_worker/common.h" nogil: const c_vector[CConcurrencyGroup] &concurrency_groups, c_bool allow_out_of_order_execution, int32_t max_pending_calls, + c_bool enable_tensor_transport, c_bool enable_task_events, const unordered_map[c_string, c_string] &labels, const unordered_map[c_string, c_string] &label_selector) @@ -372,7 +378,6 @@ cdef extern from "ray/core_worker/common.h" nogil: CPlacementStrategy strategy, const c_vector[unordered_map[c_string, double]] &bundles, c_bool 
is_detached, - double max_cpu_fraction_per_node, CNodeID soft_target_node_id, const c_vector[unordered_map[c_string, c_string]] &bundle_label_selector, ) @@ -386,7 +391,7 @@ cdef extern from "ray/core_worker/common.h" nogil: const CNodeID &GetSpilledNodeID() const const c_bool GetDidSpill() const -cdef extern from "ray/gcs/gcs_client/python_callbacks.h" namespace "ray::gcs": +cdef extern from "ray/gcs_client/python_callbacks.h" namespace "ray::gcs": cdef cppclass MultiItemPyCallback[T]: MultiItemPyCallback( object (*)(CRayStatus, c_vector[T]) nogil, @@ -405,7 +410,7 @@ cdef extern from "ray/gcs/gcs_client/python_callbacks.h" namespace "ray::gcs": void (object, object) nogil, object) nogil -cdef extern from "ray/gcs/gcs_client/accessor.h" nogil: +cdef extern from "ray/gcs_client/accessor.h" nogil: cdef cppclass CActorInfoAccessor "ray::gcs::ActorInfoAccessor": void AsyncGetAllByFilter( const optional[CActorID] &actor_id, @@ -611,7 +616,7 @@ cdef extern from "ray/gcs/gcs_client/accessor.h" nogil: ) -cdef extern from "ray/gcs/gcs_client/gcs_client.h" nogil: +cdef extern from "ray/gcs_client/gcs_client.h" nogil: cdef enum CGrpcStatusCode "grpc::StatusCode": UNAVAILABLE "grpc::StatusCode::UNAVAILABLE", UNKNOWN "grpc::StatusCode::UNKNOWN", @@ -641,12 +646,12 @@ cdef extern from "ray/gcs/gcs_client/gcs_client.h" nogil: cdef CRayStatus ConnectOnSingletonIoContext(CGcsClient &gcs_client, int timeout_ms) -cdef extern from "ray/gcs/gcs_client/gcs_client.h" namespace "ray::gcs" nogil: +cdef extern from "ray/gcs_client/gcs_client.h" namespace "ray::gcs" nogil: unordered_map[c_string, double] PythonGetResourcesTotal( const CGcsNodeInfo& node_info) -cdef extern from "ray/gcs/pubsub/gcs_pub_sub.h" nogil: - cdef cppclass CPythonGcsSubscriber "ray::gcs::PythonGcsSubscriber": +cdef extern from "ray/pubsub/python_gcs_subscriber.h" nogil: + cdef cppclass CPythonGcsSubscriber "ray::pubsub::PythonGcsSubscriber": CPythonGcsSubscriber( const c_string& gcs_address, int gcs_port, 
CChannelType channel_type, @@ -662,15 +667,12 @@ cdef extern from "ray/gcs/pubsub/gcs_pub_sub.h" nogil: CRayStatus PollLogs( c_string* key_id, int64_t timeout_ms, CLogBatch* data) - CRayStatus PollActor( - c_string* key_id, int64_t timeout_ms, CActorTableData* data) - CRayStatus Close() -cdef extern from "ray/gcs/pubsub/gcs_pub_sub.h" namespace "ray::gcs" nogil: - c_vector[c_string] PythonGetLogBatchLines(const CLogBatch& log_batch) +cdef extern from "ray/pubsub/python_gcs_subscriber.h" namespace "ray::pubsub" nogil: + c_vector[c_string] PythonGetLogBatchLines(CLogBatch log_batch) -cdef extern from "ray/gcs/gcs_client/gcs_client.h" namespace "ray::gcs" nogil: +cdef extern from "ray/gcs_client/gcs_client.h" namespace "ray::gcs" nogil: unordered_map[c_string, c_string] PythonGetNodeLabels( const CGcsNodeInfo& node_info) @@ -764,9 +766,9 @@ cdef extern from "src/ray/protobuf/autoscaler.pb.h" nogil: cdef extern from "ray/common/task/task_spec.h" nogil: cdef cppclass CConcurrencyGroup "ray::ConcurrencyGroup": CConcurrencyGroup( - const c_string &name, + c_string name, uint32_t max_concurrency, - const c_vector[CFunctionDescriptor] &c_fds) + c_vector[CFunctionDescriptor] c_fds) CConcurrencyGroup() c_string GetName() const uint32_t GetMaxConcurrency() const diff --git a/python/ray/includes/gcs_client.pxi b/python/ray/includes/gcs_client.pxi index 63fd99cf8c3c..bc47457b179b 100644 --- a/python/ray/includes/gcs_client.pxi +++ b/python/ray/includes/gcs_client.pxi @@ -14,7 +14,7 @@ Binding of C++ ray::gcs::GcsClient. # # We need to best-effort import everything we need. 
# -# For how async API are implemented, see src/ray/gcs/gcs_client/python_callbacks.h +# For how async API are implemented, see src/ray/gcs_client/python_callbacks.h from asyncio import Future from typing import List, Sequence from libcpp.utility cimport move @@ -628,9 +628,8 @@ cdef class InnerGcsClient: error_info.set_timestamp(time.time()) with nogil: - check_status_timeout_as_rpc_error( - self.inner.get().Publisher().PublishError( - move(c_key_id), move(error_info), timeout_ms)) + self.inner.get().Publisher().PublishError( + move(c_key_id), move(error_info), timeout_ms) def publish_logs(self, log_json: dict, timeout = None): cdef: diff --git a/python/ray/includes/global_state_accessor.pxd b/python/ray/includes/global_state_accessor.pxd index f6733151e800..44d2e3321c1c 100644 --- a/python/ray/includes/global_state_accessor.pxd +++ b/python/ray/includes/global_state_accessor.pxd @@ -24,7 +24,7 @@ from ray.includes.optional cimport ( optional ) -cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil: +cdef extern from "ray/gcs_client/global_state_accessor.h" nogil: cdef cppclass CGlobalStateAccessor "ray::gcs::GlobalStateAccessor": CGlobalStateAccessor(const CGcsClientOptions&) c_bool Connect() @@ -70,9 +70,9 @@ cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil: cdef extern from * namespace "ray::gcs" nogil: """ #include - #include "ray/gcs/gcs_server/store_client_kv.h" - #include "ray/gcs/redis_client.h" + #include "ray/gcs/store_client_kv.h" #include "ray/gcs/store_client/redis_store_client.h" + #include "ray/util/raii.h" namespace ray { namespace gcs { @@ -94,23 +94,17 @@ cdef extern from * namespace "ray::gcs" nogil: /*log_rotation_max_size=*/1ULL << 29, /*log_rotation_file_num=*/10); - RedisClientOptions options(host, port, username, password, use_ssl); - std::string config_list; RAY_CHECK(absl::Base64Unescape(config, &config_list)); RayConfig::instance().initialize(config_list); instrumented_io_context 
io_service{/*enable_lag_probe=*/false, /*running_on_single_thread=*/true}; - - auto redis_client = std::make_shared(options); - auto status = redis_client->Connect(io_service); - RAY_CHECK_OK(status) << "Failed to connect to redis."; - - auto cli = std::make_unique( - std::make_unique(std::move(redis_client))); + RedisClientOptions options{host, port, username, password, use_ssl}; + auto client = std::make_unique( + std::make_unique(io_service, options)); bool ret_val = false; - cli->Get("session", key, {[&](std::optional result) { + client->Get("session", key, {[&](std::optional result) { if (result.has_value()) { *data = result.value(); ret_val = true; diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 0d7b4fe68559..98ec283bbe81 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -117,6 +117,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: int MaxTaskRetries() const c_bool EnableTaskEvents() const c_bool AllowOutOfOrderExecution() const + c_bool EnableTensorTransport() const cdef cppclass CCoreWorker "ray::core::CoreWorker": CWorkerType GetWorkerType() @@ -212,7 +213,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: c_bool ShouldCaptureChildTasksInPlacementGroup() CActorID GetActorId() const const c_string GetActorName() - void SetActorTitle(const c_string &title) void SetActorReprName(const c_string &repr_name) void SetWebuiDisplay(const c_string &key, const c_string &message) const ResourceMappingType &GetResourceIDs() const @@ -261,7 +261,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: const c_vector[CObjectID] &contained_object_ids, CObjectID *object_id, shared_ptr[CBuffer] *data, const unique_ptr[CAddress] &owner_address, - c_bool inline_small_object) + c_bool inline_small_object, + CTensorTransport tensor_transport) CRayStatus CreateExisting(const shared_ptr[CBuffer] &metadata, const size_t data_size, const CObjectID &object_id, @@ 
-379,7 +380,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: c_bool interactive c_string node_ip_address int node_manager_port - c_string raylet_ip_address c_string driver_name (CRayStatus( const CAddress &caller_address, @@ -423,6 +423,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: const c_vector[c_string]&) nogil) run_on_util_worker_handler (void(const CRayObject&) nogil) unhandled_exception_handler (c_bool(const CTaskID &c_task_id) nogil) cancel_async_actor_task + (void() noexcept nogil) actor_shutdown_callback (void(c_string *stack_out) nogil) get_lang_stack c_bool is_local_mode int num_workers @@ -439,7 +440,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: int64_t worker_launch_time_ms int64_t worker_launched_time_ms c_string debug_source - c_bool enable_resource_isolation cdef cppclass CCoreWorkerProcess "ray::core::CoreWorkerProcess": @staticmethod diff --git a/python/ray/includes/network_util.pxd b/python/ray/includes/network_util.pxd new file mode 100644 index 000000000000..df4f8cb9d18d --- /dev/null +++ b/python/ray/includes/network_util.pxd @@ -0,0 +1,9 @@ +from libc.stddef cimport size_t +from libcpp.string cimport string +from ray.includes.array cimport array_string_2 +from ray.includes.optional cimport optional + +cdef extern from "ray/util/network_util.h" namespace "ray": + optional[array_string_2] ParseAddress(const string &address) + string BuildAddress(const string &host, const string &port) + string BuildAddress(const string &host, int port) diff --git a/python/ray/includes/network_util.pxi b/python/ray/includes/network_util.pxi new file mode 100644 index 000000000000..27e330eeace4 --- /dev/null +++ b/python/ray/includes/network_util.pxi @@ -0,0 +1,47 @@ +from ray.includes.network_util cimport ( + BuildAddress, + ParseAddress, + array_string_2, + optional, +) +from libcpp.string cimport string +from typing import Optional, Tuple, Union + +def parse_address(address: str) -> Optional[Tuple[str, str]]: + 
"""Parse a network address string into host and port. + + Args: + address: The address string to parse (e.g., "localhost:8000", "[::1]:8000"). + + Returns: + Tuple with (host, port) if port found, None if no colon separator. + """ + cdef optional[array_string_2] res = ParseAddress(address.encode('utf-8')) + if not res.has_value(): + return None + + cdef array_string_2 ip_port = res.value() + return (ip_port[0].decode('utf-8'), ip_port[1].decode('utf-8')) + + +def build_address(host: str, port: Union[int, str]) -> str: + """Build a network address string from host and port. + + Args: + host: The hostname or IP address. + port: The port number (int or string). + + Returns: + Formatted address string (e.g., "localhost:8000" or "[::1]:8000"). + """ + cdef string host_c = host.encode('utf-8') + cdef string result + cdef string port_c + + if isinstance(port, int): + result = BuildAddress(host_c, port) + else: + port_c = str(port).encode('utf-8') + result = BuildAddress(host_c, port_c) + + return result.decode('utf-8') diff --git a/python/ray/includes/object_ref.pxi b/python/ray/includes/object_ref.pxi index f447c9aaa0ce..fa498c14bf98 100644 --- a/python/ray/includes/object_ref.pxi +++ b/python/ray/includes/object_ref.pxi @@ -6,6 +6,7 @@ import functools import logging import threading from typing import Callable, Any, Union +from _collections_abc import GenericAlias import ray import cython @@ -34,18 +35,19 @@ def _set_future_helper( cdef class ObjectRef(BaseID): + __class_getitem__ = classmethod(GenericAlias) # should match how typing.Generic works def __cinit__(self): self.in_core_worker = False def __init__( self, id, owner_addr="", call_site_data="", - skip_adding_local_ref=False): + skip_adding_local_ref=False, tensor_transport_val=0): self._set_id(id) self.owner_addr = owner_addr self.in_core_worker = False self.call_site_data = call_site_data - + self.tensor_transport_val = tensor_transport_val worker = ray._private.worker.global_worker # TODO(edoakes): We should 
be able to remove the in_core_worker flag. # But there are still some dummy object refs being created outside the @@ -97,7 +99,8 @@ cdef class ObjectRef(BaseID): def call_site(self): return decode(self.call_site_data) - def size(self): + @classmethod + def size(cls): return CObjectID.Size() def _set_id(self, id): @@ -152,3 +155,9 @@ cdef class ObjectRef(BaseID): core_worker = ray._private.worker.global_worker.core_worker core_worker.set_get_async_callback(self, py_callback) return self + + def tensor_transport(self): + return self.tensor_transport_val + + cdef CTensorTransport c_tensor_transport(self): + return self.tensor_transport_val diff --git a/python/ray/includes/object_ref.pyi b/python/ray/includes/object_ref.pyi new file mode 100644 index 000000000000..78a744e8a856 --- /dev/null +++ b/python/ray/includes/object_ref.pyi @@ -0,0 +1,74 @@ +# source: object_ref.pxi +import asyncio +import concurrent.futures +from typing import Any, Awaitable, Callable, Generator, TypeVar, Union + +from ray.includes.unique_ids import BaseID, JobID, TaskID + +_T = TypeVar("_T") +def _set_future_helper( + result: _T, + *, + py_future: Union[asyncio.Future[_T], concurrent.futures.Future[_T]], +) -> None: ... + + +_OR = TypeVar("_OR", bound=ObjectRef) +class ObjectRef(BaseID, Awaitable[_T]): + + + def __init__( + self, id: bytes, owner_addr: str = "", call_site_data: str = "", + skip_adding_local_ref: bool = False, tensor_transport_val = 0) -> None: ... + + def __dealloc__(self) -> None: ... + + + def task_id(self) -> TaskID: ... + + def job_id(self) -> JobID: ... + + def owner_address(self) -> str: ... + + def call_site(self) -> str: ... + + @classmethod + def size(cls) -> int: ... + + def _set_id(self, id: bytes) -> None: ... + + @classmethod + def nil(cls: type[_OR]) -> _OR: ... + + @classmethod + def from_random(cls: type[_OR]) -> _OR: ... 
+ + def future(self) -> concurrent.futures.Future[_T]: + """Wrap ObjectRef with a concurrent.futures.Future + + Note that the future cancellation will not cancel the corresponding + task when the ObjectRef represents the return object of a task. + Additionally, future.running() will always be ``False`` even if the + underlying task is running. + """ + ... + + def __await__(self) -> Generator[Any, None, _T]: ... + + def as_future(self, _internal=False) -> asyncio.Future[_T]: + """Wrap ObjectRef with an asyncio.Future. + + Note that the future cancellation will not cancel the corresponding + task when the ObjectRef represents the return object of a task. + """ + ... + + def _on_completed(self, py_callback: Callable[[_T], None]): + """Register a callback that will be called after Object is ready. + If the ObjectRef is already ready, the callback will be called soon. + The callback should take the result as the only argument. The result + can be an exception object in case of task error. + """ + ... + + def tensor_transport(self) -> int: ... 
diff --git a/python/ray/includes/ray_config.pxd b/python/ray/includes/ray_config.pxd index 7189c2b5bd14..729395a22ee3 100644 --- a/python/ray/includes/ray_config.pxd +++ b/python/ray/includes/ray_config.pxd @@ -37,8 +37,6 @@ cdef extern from "ray/common/ray_config.h" nogil: int object_manager_push_timeout_ms() const - uint64_t object_manager_default_chunk_size() const - uint32_t maximum_gcs_deletion_batch_size() const int64_t max_direct_call_object_size() const @@ -71,12 +69,6 @@ cdef extern from "ray/common/ray_config.h" nogil: int64_t health_check_failure_threshold() const - uint64_t memory_monitor_refresh_ms() const - - int64_t grpc_keepalive_time_ms() const - - int64_t grpc_keepalive_timeout_ms() const - int64_t grpc_client_keepalive_time_ms() const int64_t grpc_client_keepalive_timeout_ms() const @@ -91,8 +83,8 @@ cdef extern from "ray/common/ray_config.h" nogil: int64_t py_gcs_connect_timeout_s() const - int gcs_rpc_server_reconnect_timeout_s() const - int maximum_gcs_destroyed_actor_cached_count() const c_bool record_task_actor_creation_sites() const + + c_bool start_python_gc_manager_thread() const diff --git a/python/ray/includes/ray_config.pxi b/python/ray/includes/ray_config.pxi index d83273b4800f..6915e4877962 100644 --- a/python/ray/includes/ray_config.pxi +++ b/python/ray/includes/ray_config.pxi @@ -61,10 +61,6 @@ cdef class Config: def object_manager_push_timeout_ms(): return RayConfig.instance().object_manager_push_timeout_ms() - @staticmethod - def object_manager_default_chunk_size(): - return RayConfig.instance().object_manager_default_chunk_size() - @staticmethod def maximum_gcs_deletion_batch_size(): return RayConfig.instance().maximum_gcs_deletion_batch_size() @@ -121,18 +117,6 @@ cdef class Config: def health_check_failure_threshold(): return RayConfig.instance().health_check_failure_threshold() - @staticmethod - def memory_monitor_refresh_ms(): - return (RayConfig.instance().memory_monitor_refresh_ms()) - - @staticmethod - def 
grpc_keepalive_time_ms(): - return RayConfig.instance().grpc_keepalive_time_ms() - - @staticmethod - def grpc_keepalive_timeout_ms(): - return RayConfig.instance().grpc_keepalive_timeout_ms() - @staticmethod def grpc_client_keepalive_time_ms(): return RayConfig.instance().grpc_client_keepalive_time_ms() @@ -153,10 +137,10 @@ cdef class Config: def py_gcs_connect_timeout_s(): return RayConfig.instance().py_gcs_connect_timeout_s() - @staticmethod - def gcs_rpc_server_reconnect_timeout_s(): - return RayConfig.instance().gcs_rpc_server_reconnect_timeout_s() - @staticmethod def maximum_gcs_destroyed_actor_cached_count(): return RayConfig.instance().maximum_gcs_destroyed_actor_cached_count() + + @staticmethod + def start_python_gc_manager_thread(): + return RayConfig.instance().start_python_gc_manager_thread() diff --git a/python/ray/includes/unique_ids.pxi b/python/ray/includes/unique_ids.pxi index 9e2adad94825..3c387833dc29 100644 --- a/python/ray/includes/unique_ids.pxi +++ b/python/ray/includes/unique_ids.pxi @@ -4,9 +4,6 @@ We define different types for different IDs for type safety. See https://github.com/ray-project/ray/issues/3721. """ -# WARNING: Any additional ID types defined in this file must be added to the -# _ID_TYPES list at the bottom of this file. - import logging import os @@ -430,17 +427,3 @@ cdef class PlacementGroupID(BaseID): cdef size_t hash(self): return self.data.Hash() - -_ID_TYPES = [ - ActorClassID, - ActorID, - NodeID, - JobID, - WorkerID, - FunctionID, - ObjectID, - TaskID, - UniqueID, - PlacementGroupID, - ClusterID, -] diff --git a/python/ray/includes/unique_ids.pyi b/python/ray/includes/unique_ids.pyi new file mode 100644 index 000000000000..5f04389f1aed --- /dev/null +++ b/python/ray/includes/unique_ids.pyi @@ -0,0 +1,146 @@ +from __future__ import annotations + +from typing import Tuple, TypeVar + +# backwards compatibility. 
Luckily circular references are fine in type stubs +from ray._raylet import ObjectRef + +ObjectID = ObjectRef + +# implementations are in unique_ids.pxi +def check_id(b: bytes, size: int = ...) -> None: ... + +_BID = TypeVar("_BID", bound=BaseID) +class BaseID: + + @classmethod + def from_binary(cls: type[_BID], id_bytes: bytes) -> _BID: ... + + @classmethod + def from_hex(cls: type[_BID], hex_id: str | bytes) -> _BID: ... + + def binary(self) -> bytes: ... + + @classmethod + def size(cls) -> int: ... + + def hex(self) -> str: ... + + def is_nil(self) -> bool: ... + + def __hash__(self) -> int: ... + + def __eq__(self, other: object) -> bool: ... + + def __ne__(self, other: object) -> bool: ... + + def __bytes__(self) -> bytes: ... + + def __hex__(self) -> str: ... + + def __repr__(self) -> str: ... + + def __str__(self) -> str: ... + + def __reduce__(self: _BID) -> Tuple[type[_BID], Tuple[bytes]]: ... + + def redis_shard_hash(self) -> int: ... + + +_UID = TypeVar("_UID", bound=UniqueID) +class UniqueID(BaseID): + + def __init__(self, id: bytes) -> None: ... + + @classmethod + def nil(cls: type[_UID]) -> _UID: ... + + @classmethod + def from_random(cls: type[_UID]) -> _UID: ... + + +_TID = TypeVar("_TID", bound=TaskID) +class TaskID(BaseID): + + def __init__(self, id: bytes) -> None: ... + + def actor_id(self) -> ActorID: ... + + def job_id(self) -> JobID: ... + + @classmethod + def nil(cls: type[_TID]) -> _TID: ... + + @classmethod + def for_fake_task(cls: type[_TID], job_id: JobID) -> _TID: ... + + @classmethod + def for_driver_task(cls: type[_TID], job_id: JobID) -> _TID: ... + + @classmethod + def for_actor_creation_task(cls: type[_TID], actor_id: ActorID) -> _TID: ... + + @classmethod + def for_actor_task(cls: type[_TID], job_id: JobID, parent_task_id: TaskID, + parent_task_counter: int, actor_id: ActorID) -> _TID: ... + + @classmethod + def for_normal_task(cls: type[_TID], job_id: JobID, parent_task_id: TaskID, parent_task_counter: int) -> _TID: ... 
+ + +class NodeID(UniqueID): ... + +_JID = TypeVar("_JID", bound=JobID) +class JobID(BaseID): + + def __init__(self, id: bytes) -> None: ... + + @classmethod + def from_int(cls: type[_JID], value: int) -> _JID: ... + + @classmethod + def nil(cls: type[_JID]) -> _JID: ... + + def int(self) -> int: ... + + +class WorkerID(UniqueID): ... + +_AID = TypeVar("_AID", bound=ActorID) +class ActorID(BaseID): + + def __init__(self, id: bytes) -> None: ... + + @classmethod + def of(cls: type[_AID], job_id: JobID, parent_task_id: TaskID, parent_task_counter: int) -> _AID: ... + + @classmethod + def nil(cls: type[_AID]) -> _AID: ... + + @classmethod + def from_random(cls: type[_AID]) -> _AID: ... + + def _set_id(self, id: bytes) -> None: ... + + @property + def job_id(self) -> JobID: ... + + +class FunctionID(UniqueID): ... +class ActorClassID(UniqueID): ... +class ClusterID(UniqueID): ... + + +_PGID = TypeVar("_PGID", bound=PlacementGroupID) +class PlacementGroupID(BaseID): + + def __init__(self, id: bytes) -> None: ... + + @classmethod + def from_random(cls: type[_PGID]) -> _PGID: ... + + @classmethod + def of(cls: type[_PGID], job_id: JobID) -> _PGID: ... + + @classmethod + def nil(cls: type[_PGID]) -> _PGID: ... 
diff --git a/python/ray/job_submission/__init__.py b/python/ray/job_submission/__init__.py index 6a86cf73c329..b3a76be1e535 100644 --- a/python/ray/job_submission/__init__.py +++ b/python/ray/job_submission/__init__.py @@ -1,10 +1,11 @@ -from ray.dashboard.modules.job.common import JobInfo, JobStatus +from ray.dashboard.modules.job.common import JobErrorType, JobInfo, JobStatus from ray.dashboard.modules.job.pydantic_models import DriverInfo, JobDetails, JobType from ray.dashboard.modules.job.sdk import JobSubmissionClient __all__ = [ "JobSubmissionClient", "JobStatus", + "JobErrorType", "JobInfo", "JobDetails", "DriverInfo", diff --git a/python/ray/llm/_internal/batch/observability/logging/__init__.py b/python/ray/llm/_internal/batch/observability/logging/__init__.py index 4a81025a613c..04cd4d26101f 100644 --- a/python/ray/llm/_internal/batch/observability/logging/__init__.py +++ b/python/ray/llm/_internal/batch/observability/logging/__init__.py @@ -1,7 +1,7 @@ import logging from typing import Optional -from ray._private.ray_logging.filters import CoreContextFilter +from ray._common.filters import CoreContextFilter def _setup_logger(logger_name: str): diff --git a/python/ray/llm/_internal/batch/observability/logging/setup.py b/python/ray/llm/_internal/batch/observability/logging/setup.py index 0c547e4a6305..75edff664939 100644 --- a/python/ray/llm/_internal/batch/observability/logging/setup.py +++ b/python/ray/llm/_internal/batch/observability/logging/setup.py @@ -1,7 +1,7 @@ import logging -from ray._private.ray_logging.filters import CoreContextFilter -from ray._private.ray_logging.formatters import JSONFormatter +from ray._common.filters import CoreContextFilter +from ray._common.formatters import JSONFormatter def _configure_stdlib_logging(): diff --git a/python/ray/llm/_internal/batch/processor/__init__.py b/python/ray/llm/_internal/batch/processor/__init__.py index 99388bbbaaef..fed7d021fe8e 100644 --- a/python/ray/llm/_internal/batch/processor/__init__.py 
+++ b/python/ray/llm/_internal/batch/processor/__init__.py @@ -1,5 +1,6 @@ from .base import Processor, ProcessorBuilder, ProcessorConfig from .http_request_proc import HttpRequestProcessorConfig +from .serve_deployment_proc import ServeDeploymentProcessorConfig from .sglang_engine_proc import SGLangEngineProcessorConfig from .vllm_engine_proc import vLLMEngineProcessorConfig @@ -9,5 +10,6 @@ "HttpRequestProcessorConfig", "vLLMEngineProcessorConfig", "SGLangEngineProcessorConfig", + "ServeDeploymentProcessorConfig", "Processor", ] diff --git a/python/ray/llm/_internal/batch/processor/base.py b/python/ray/llm/_internal/batch/processor/base.py index 049e5c2685f5..2029a1a0d33b 100644 --- a/python/ray/llm/_internal/batch/processor/base.py +++ b/python/ray/llm/_internal/batch/processor/base.py @@ -1,8 +1,8 @@ import logging from collections import OrderedDict -from typing import Any, Callable, Dict, List, Optional, Type +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union -from pydantic import Field +from pydantic import Field, field_validator import ray from ray.data import Dataset @@ -45,9 +45,14 @@ class ProcessorConfig(BaseModelExtended): description="The accelerator type used by the LLM stage in a processor. " "Default to None, meaning that only the CPU will be used.", ) - concurrency: Optional[int] = Field( + concurrency: Union[int, Tuple[int, int]] = Field( default=1, - description="The number of workers for data parallelism. Default to 1.", + description="The number of workers for data parallelism. Default to 1. " + "If ``concurrency`` is a ``tuple`` ``(m, n)``, Ray creates an autoscaling " + "actor pool that scales between ``m`` and ``n`` workers (``1 <= m <= n``). 
" + "If ``concurrency`` is an ``int`` ``n``, Ray uses either a fixed pool of ``n`` " + "workers or an autoscaling pool from ``1`` to ``n`` workers, depending on " + "the processor and stage.", ) experimental: Dict[str, Any] = Field( @@ -57,6 +62,71 @@ class ProcessorConfig(BaseModelExtended): "`max_tasks_in_flight_per_actor`: The maximum number of tasks in flight per actor. Default to 4.", ) + @field_validator("concurrency") + def validate_concurrency( + cls, concurrency: Union[int, Tuple[int, int]] + ) -> Union[int, Tuple[int, int]]: + """Validate that `concurrency` is either: + - a positive int, or + - a 2-tuple `(min, max)` of positive ints with `min <= max`. + """ + + def require(condition: bool, message: str) -> None: + if not condition: + raise ValueError(message) + + if isinstance(concurrency, int): + require( + concurrency > 0, + f"A positive integer for `concurrency` is expected! Got: `{concurrency}`.", + ) + elif isinstance(concurrency, tuple): + require( + all(c > 0 for c in concurrency), + f"`concurrency` tuple items must be positive integers! Got: `{concurrency}`.", + ) + + min_concurrency, max_concurrency = concurrency + require( + min_concurrency <= max_concurrency, + f"min > max in the concurrency tuple `{concurrency}`!", + ) + return concurrency + + def get_concurrency(self, autoscaling_enabled: bool = True) -> Tuple[int, int]: + """Return a normalized `(min, max)` worker range from `self.concurrency`. + + Behavior: + - If `concurrency` is an int `n`: + - `autoscaling_enabled` is True -> return `(1, n)` (autoscaling). + - `autoscaling_enabled` is False -> return `(n, n)` (fixed-size pool). + - If `concurrency` is a 2-tuple `(m, n)`, return it unchanged + (the `autoscaling_enabled` flag is ignored). + + Args: + autoscaling_enabled: When False, treat an integer `concurrency` as fixed `(n, n)`; + otherwise treat it as a range `(1, n)`. Defaults to True. + + Returns: + tuple[int, int]: The allowed worker range `(min, max)`. 
+ + Examples: + >>> self.concurrency = (2, 4) + >>> self.get_concurrency() + (2, 4) + >>> self.concurrency = 4 + >>> self.get_concurrency() + (1, 4) + >>> self.get_concurrency(autoscaling_enabled=False) + (4, 4) + """ + if isinstance(self.concurrency, int): + if autoscaling_enabled: + return 1, self.concurrency + else: + return self.concurrency, self.concurrency + return self.concurrency + class Config: validate_assignment = True arbitrary_types_allowed = True @@ -263,7 +333,7 @@ class ProcessorBuilder: @classmethod def register(cls, config_type: Type[ProcessorConfig], builder: Callable) -> None: - """A decorator to assoicate a particular pipeline config + """A decorator to associate a particular pipeline config with its build function. """ type_name = config_type.__name__ diff --git a/python/ray/llm/_internal/batch/processor/serve_deployment_proc.py b/python/ray/llm/_internal/batch/processor/serve_deployment_proc.py new file mode 100644 index 000000000000..5a0b4e930318 --- /dev/null +++ b/python/ray/llm/_internal/batch/processor/serve_deployment_proc.py @@ -0,0 +1,78 @@ +"""The processor that runs serve deployment.""" + +from typing import Any, Dict, Optional, Type + +from pydantic import Field + +from ray.data.block import UserDefinedFunction +from ray.llm._internal.batch.processor.base import ( + Processor, + ProcessorBuilder, + ProcessorConfig, +) +from ray.llm._internal.batch.stages import ( + ServeDeploymentStage, +) + + +class ServeDeploymentProcessorConfig(ProcessorConfig): + """The configuration for the serve deployment processor.""" + + # Configurations used to build the serve deployment + deployment_name: str = Field( + description="The name of the serve deployment to use.", + ) + app_name: str = Field( + description="The name of the serve application to use.", + default="default", + ) + dtype_mapping: Dict[str, Type[Any]] = Field( + description="A dictionary mapping data type names to their corresponding request classes for the serve deployment.", + 
default=None, + ) + + +def build_serve_deployment_processor( + config: ServeDeploymentProcessorConfig, + preprocess: Optional[UserDefinedFunction] = None, + postprocess: Optional[UserDefinedFunction] = None, +) -> Processor: + """ + Construct a processor that runs a serve deployment. + + Args: + config: The configuration for the processor. + preprocess: An optional lambda function that takes a row (dict) as input + and returns a preprocessed row (dict). The output row must contain the + required fields for the following processing stages. + postprocess: An optional lambda function that takes a row (dict) as input + and returns a postprocessed row (dict). + + Returns: + The constructed processor. + """ + stages = [ + ServeDeploymentStage( + fn_constructor_kwargs=dict( + deployment_name=config.deployment_name, + app_name=config.app_name, + dtype_mapping=config.dtype_mapping, + ), + map_batches_kwargs=dict( + concurrency=config.concurrency, + ), + ) + ] + # TODO (Kourosh): Add telemetry for ServeDeploymentStage + processor = Processor( + config, + stages, + preprocess=preprocess, + postprocess=postprocess, + ) + return processor + + +ProcessorBuilder.register( + ServeDeploymentProcessorConfig, build_serve_deployment_processor +) diff --git a/python/ray/llm/_internal/batch/processor/sglang_engine_proc.py b/python/ray/llm/_internal/batch/processor/sglang_engine_proc.py index 1156e830177d..602536fc0ad5 100644 --- a/python/ray/llm/_internal/batch/processor/sglang_engine_proc.py +++ b/python/ray/llm/_internal/batch/processor/sglang_engine_proc.py @@ -85,7 +85,7 @@ def build_sglang_engine_processor( ), map_batches_kwargs=dict( zero_copy_batch=True, - concurrency=(1, config.concurrency), + concurrency=config.get_concurrency(), batch_size=config.batch_size, runtime_env=config.runtime_env, ), @@ -100,7 +100,7 @@ def build_sglang_engine_processor( ), map_batches_kwargs=dict( zero_copy_batch=True, - concurrency=(1, config.concurrency), + concurrency=config.get_concurrency(), 
batch_size=config.batch_size, runtime_env=config.runtime_env, ), @@ -123,8 +123,8 @@ def build_sglang_engine_processor( # which initiates enough many overlapping UDF calls per actor, to # saturate `max_concurrency`. compute=ray.data.ActorPoolStrategy( - min_size=config.concurrency, - max_size=config.concurrency, + min_size=config.get_concurrency(autoscaling_enabled=False)[0], + max_size=config.get_concurrency(autoscaling_enabled=False)[1], max_tasks_in_flight_per_actor=config.experimental.get( "max_tasks_in_flight_per_actor", DEFAULT_MAX_TASKS_IN_FLIGHT ), @@ -148,7 +148,7 @@ def build_sglang_engine_processor( ), map_batches_kwargs=dict( zero_copy_batch=True, - concurrency=(1, config.concurrency), + concurrency=config.get_concurrency(), batch_size=config.batch_size, runtime_env=config.runtime_env, ), diff --git a/python/ray/llm/_internal/batch/processor/vllm_engine_proc.py b/python/ray/llm/_internal/batch/processor/vllm_engine_proc.py index 2fdd80c641d4..da88c482feb8 100644 --- a/python/ray/llm/_internal/batch/processor/vllm_engine_proc.py +++ b/python/ray/llm/_internal/batch/processor/vllm_engine_proc.py @@ -80,6 +80,7 @@ def build_vllm_engine_processor( required fields for the following processing stages. postprocess: An optional lambda function that takes a row (dict) as input and returns a postprocessed row (dict). + telemetry_agent: An optional telemetry agent for collecting usage telemetry. Returns: The constructed processor. @@ -87,21 +88,13 @@ def build_vllm_engine_processor( ray.init(runtime_env=config.runtime_env, ignore_reinit_error=True) stages = [] - if isinstance(config.concurrency, int): - # For CPU-only stages, we leverage auto-scaling to recycle resources. - processor_concurrency = (1, config.concurrency) - else: - raise ValueError( - "``concurrency`` is expected to be set as an integer," - f" but got: {config.concurrency}." 
- ) if config.has_image: stages.append( PrepareImageStage( map_batches_kwargs=dict( zero_copy_batch=True, - concurrency=processor_concurrency, + concurrency=config.get_concurrency(), batch_size=config.batch_size, ), ) @@ -115,7 +108,7 @@ def build_vllm_engine_processor( ), map_batches_kwargs=dict( zero_copy_batch=True, - concurrency=processor_concurrency, + concurrency=config.get_concurrency(), batch_size=config.batch_size, runtime_env=config.runtime_env, ), @@ -130,7 +123,7 @@ def build_vllm_engine_processor( ), map_batches_kwargs=dict( zero_copy_batch=True, - concurrency=processor_concurrency, + concurrency=config.get_concurrency(), batch_size=config.batch_size, runtime_env=config.runtime_env, ), @@ -157,10 +150,8 @@ def build_vllm_engine_processor( # which initiates enough many overlapping UDF calls per actor, to # saturate `max_concurrency`. compute=ray.data.ActorPoolStrategy( - # vLLM start up time is significant, so if user give fixed - # concurrency, start all instances without auto-scaling. 
- min_size=config.concurrency, - max_size=config.concurrency, + min_size=config.get_concurrency(autoscaling_enabled=False)[0], + max_size=config.get_concurrency(autoscaling_enabled=False)[1], max_tasks_in_flight_per_actor=config.experimental.get( "max_tasks_in_flight_per_actor", DEFAULT_MAX_TASKS_IN_FLIGHT ), @@ -184,7 +175,7 @@ def build_vllm_engine_processor( ), map_batches_kwargs=dict( zero_copy_batch=True, - concurrency=processor_concurrency, + concurrency=config.get_concurrency(), batch_size=config.batch_size, runtime_env=config.runtime_env, ), diff --git a/python/ray/llm/_internal/batch/stages/__init__.py b/python/ray/llm/_internal/batch/stages/__init__.py index 0742784cf592..a45d21fc7670 100644 --- a/python/ray/llm/_internal/batch/stages/__init__.py +++ b/python/ray/llm/_internal/batch/stages/__init__.py @@ -6,6 +6,7 @@ from ray.llm._internal.batch.stages.chat_template_stage import ChatTemplateStage from ray.llm._internal.batch.stages.http_request_stage import HttpRequestStage from ray.llm._internal.batch.stages.prepare_image_stage import PrepareImageStage +from ray.llm._internal.batch.stages.serve_deployment_stage import ServeDeploymentStage from ray.llm._internal.batch.stages.sglang_engine_stage import SGLangEngineStage from ray.llm._internal.batch.stages.tokenize_stage import DetokenizeStage, TokenizeStage from ray.llm._internal.batch.stages.vllm_engine_stage import vLLMEngineStage @@ -18,6 +19,7 @@ "DetokenizeStage", "vLLMEngineStage", "SGLangEngineStage", + "ServeDeploymentStage", "wrap_preprocess", "wrap_postprocess", "PrepareImageStage", diff --git a/python/ray/llm/_internal/batch/stages/prepare_image_stage.py b/python/ray/llm/_internal/batch/stages/prepare_image_stage.py index 8b1989863c42..7c5c27571486 100644 --- a/python/ray/llm/_internal/batch/stages/prepare_image_stage.py +++ b/python/ray/llm/_internal/batch/stages/prepare_image_stage.py @@ -1,4 +1,5 @@ """Prepare Image Stage""" + import asyncio import base64 import importlib @@ -322,12 +323,31 
@@ def extract_image_info(self, messages: List[Dict]) -> List[_ImageType]: image_info: List[_ImageType] = [] for message in messages: - if not isinstance(message["content"], list): + content = message["content"] + + # Convert PyArrow objects to Python objects if needed (like ChatTemplateStage). + # This handles the case where uniform content types are serialized with PyArrow + # instead of pickle; this happens when all messages have the same content structure + # (e.g., no system prompt + string content mixed with user messages with list content). + if hasattr(content, "tolist"): + content = content.tolist() + + if not isinstance(content, list): continue - for content in message["content"]: - if content["type"] not in ("image", "image_url"): + for content_item in content: + if content_item["type"] not in ("image", "image_url"): continue - image = content[content["type"]] + + image_data = content_item[content_item["type"]] + if content_item["type"] == "image_url" and isinstance(image_data, dict): + # OpenAI nested format: {"image_url": {"url": "..."}} + image = image_data.get("url") + if image is None: + raise ValueError("image_url dict must contain 'url' key") + else: + # Simple format: {"image": "..."} or {"image_url": "..."} + image = image_data + if not isinstance(image, str) and not isinstance( image, self.Image.Image ): diff --git a/python/ray/llm/_internal/batch/stages/serve_deployment_stage.py b/python/ray/llm/_internal/batch/stages/serve_deployment_stage.py new file mode 100644 index 000000000000..04626e734cd7 --- /dev/null +++ b/python/ray/llm/_internal/batch/stages/serve_deployment_stage.py @@ -0,0 +1,156 @@ +"""The stage that runs serve deployment.""" + +import asyncio +import logging +import time +import uuid +from typing import Any, AsyncIterator, Dict, List, Optional, Tuple, Type + +from pydantic import BaseModel + +from ray import serve +from ray.llm._internal.batch.stages.base import ( + StatefulStage, + StatefulStageUDF, +) + +logger = 
logging.getLogger(__name__) + + +class ServeDeploymentStageUDF(StatefulStageUDF): + def __init__( + self, + data_column: str, + expected_input_keys: List[str], + *, + deployment_name: str, + app_name: str, + dtype_mapping: Dict[str, Type[Any]], + ): + """ + Initialize the ServeDeploymentStageUDF. + + Args: + data_column: The data column name. + expected_input_keys: The expected input keys of the stage. + deployment_name: The name of the deployment. + app_name: The name of the deployment app. + dtype_mapping: The mapping of the request class name to the request class. + """ + super().__init__(data_column, expected_input_keys) + self._dtype_mapping = dtype_mapping + + # Using stream=True as LLM serve deployments return async generators. + # TODO (Kourosh): Generalize this to support non-streaming deployments. + self._dh = serve.get_deployment_handle(deployment_name, app_name).options( + stream=True + ) + self.request_id = 0 + + def _prepare_request( + self, row: Dict[str, Any] + ) -> Tuple[Dict[str, Any], Optional[Type[Any]], str]: + """ + Decorate the request with metadata related to the batch. + + Args: + row: The row. + + Returns: + A tuple of (decorated_request, dtype, method_name). dtype is the class of the request object and + can be None if the serve deployment accepts a raw dict. method_name is the name of the method to + invoke on the serve deployment. + """ + method = row.get("method") + dtype_name = row.get("dtype") + + dtype = None + if dtype_name is not None: + if not self._dtype_mapping or dtype_name not in self._dtype_mapping: + raise ValueError( + f"{dtype_name} must be provided in ServeDeploymentProcessorConfig's dtype_mapping." 
+ ) + dtype = self._dtype_mapping[dtype_name] + + request_kwargs = row.pop("request_kwargs") + request = { + "request_id": str(self.request_id), + "idx_in_batch": row[self.IDX_IN_BATCH_COLUMN], + **request_kwargs, + } + self.request_id += 1 + + return request, dtype, method + + async def generate_async( + self, row: Dict[str, Any] + ) -> Tuple[Dict[str, Any], Dict[str, Any], float]: + """ + Run the serve deployment. + + Args: + row: The row to run the serve deployment on. + + Returns: + The response from the serve deployment. + """ + request, dtype, method = self._prepare_request(row) + request_obj = dtype(**request) if dtype else request + + if getattr(self._dh, method) is None: + raise ValueError(f"Method {method} not found in the serve deployment.") + + t = time.perf_counter() + # Directly using anext() requires python3.10 and above + output_data = await getattr(self._dh, method).remote(request_obj).__anext__() + time_taken = time.perf_counter() - t + + # Convert the output data to a dict if it is a Pydantic model. + if isinstance(output_data, BaseModel): + output_data = output_data.model_dump() + + return request, output_data, time_taken + + async def udf(self, batch: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]]: + """ + Run the serve deployment. + + Args: + batch: A list of rows to run the serve deployment on. + + Yields: + Dict[str, Any]: A dictionary containing the response from the serve deployment + along with processing metadata. 
+ """ + batch_uuid = uuid.uuid4() + t = time.perf_counter() + tasks = [asyncio.create_task(self.generate_async(row)) for row in batch] + + for resp in asyncio.as_completed(tasks): + request, output, time_taken = await resp + + yield { + "request_id": request["request_id"], + self.IDX_IN_BATCH_COLUMN: request["idx_in_batch"], + "batch_uuid": batch_uuid.hex, + "time_taken": time_taken, + **output, + } + + batch_time_taken = time.perf_counter() - t + logger.info( + "[LLM Batch - Serve Deployment] Elapsed time for batch %s with size %d: %s", + batch_uuid.hex, + len(batch), + batch_time_taken, + ) + + +class ServeDeploymentStage(StatefulStage): + fn: Type[StatefulStageUDF] = ServeDeploymentStageUDF + + def get_required_input_keys(self) -> Dict[str, str]: + return { + "method": "Name of the method to invoke on the serve deployment.", + "request_kwargs": "The request_kwargs to construct the request to the serve deployment.", + } diff --git a/python/ray/llm/_internal/batch/stages/sglang_engine_stage.py b/python/ray/llm/_internal/batch/stages/sglang_engine_stage.py index a49ef7c18bce..f140b666b15d 100644 --- a/python/ray/llm/_internal/batch/stages/sglang_engine_stage.py +++ b/python/ray/llm/_internal/batch/stages/sglang_engine_stage.py @@ -177,22 +177,25 @@ async def _prepare_llm_request(self, row: Dict[str, Any]) -> SGLangEngineRequest async def generate_async( self, row: Dict[str, Any] - ) -> Tuple[SGLangEngineRequest, Dict[str, Any]]: + ) -> Tuple[SGLangEngineRequest, Dict[str, Any], float]: """Process a single request. Args: request: The request. Returns: - A tuple of index in batch, request output and bypassed custom fields. + A tuple of index in batch, request output and bypassed custom fields, and time taken. 
""" request = await self._prepare_llm_request(row) + t = time.perf_counter() async with self.semaphore: output = await self._generate_async(request) + time_taken = time.perf_counter() - t + output_data = SGLangOutputData.from_sglang_engine_output(output) - return request, output_data.model_dump() + return request, output_data.model_dump(), time_taken async def _generate_async(self, request: SGLangEngineRequest) -> Any: """Process a single request. @@ -321,29 +324,28 @@ async def udf(self, batch: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any] The response of the SGLang engine. """ batch_uuid = uuid.uuid4() - t = time.perf_counter() + batch_start_time = time.perf_counter() tasks = [asyncio.create_task(self.llm.generate_async(row)) for row in batch] - time_taken = -1.0 for resp in asyncio.as_completed(tasks): - request, output = await resp - time_taken = time.perf_counter() - t + request, output, time_taken_llm = await resp yield { **output, "request_id": request.request_id, self.IDX_IN_BATCH_COLUMN: request.idx_in_batch, "batch_uuid": batch_uuid.hex, - "time_taken_llm": time_taken, + "time_taken_llm": time_taken_llm, "params": str(request.params), } + batch_time_taken = time.perf_counter() - batch_start_time logger.info( "[SGLang] Elapsed time for batch %s with size %d: %s", batch_uuid.hex, len(batch), - time_taken, + batch_time_taken, ) def __del__(self): diff --git a/python/ray/llm/_internal/batch/stages/vllm_engine_stage.py b/python/ray/llm/_internal/batch/stages/vllm_engine_stage.py index 481da600da7b..18606714869f 100644 --- a/python/ray/llm/_internal/batch/stages/vllm_engine_stage.py +++ b/python/ray/llm/_internal/batch/stages/vllm_engine_stage.py @@ -11,6 +11,7 @@ from typing import Any, AsyncIterator, Dict, List, Optional, Tuple, Type import numpy as np +import torch from pydantic import BaseModel, Field, root_validator import ray @@ -109,6 +110,11 @@ def from_vllm_engine_output(cls, output: Any) -> "vLLMOutputData": data.num_generated_tokens = 
len(output.outputs[0].token_ids) elif isinstance(output, vllm.outputs.PoolingRequestOutput): data.embeddings = output.outputs.data.cpu() + if ( + isinstance(data.embeddings, torch.Tensor) + and data.embeddings.dtype == torch.bfloat16 + ): + data.embeddings = data.embeddings.to(torch.float32) else: raise ValueError(f"Unknown output type: {type(output)}") @@ -302,22 +308,25 @@ async def _prepare_llm_request(self, row: Dict[str, Any]) -> vLLMEngineRequest: async def generate_async( self, row: Dict[str, Any] - ) -> Tuple[vLLMEngineRequest, Dict[str, Any]]: + ) -> Tuple[vLLMEngineRequest, Dict[str, Any], float]: """Process a single request. Args: request: The request. Returns: - A tuple of index in batch, request output and bypassed custom fields. + A tuple of index in batch, request output and bypassed custom fields, and time taken. """ request = await self._prepare_llm_request(row) + t = time.perf_counter() async with self.semaphore: output = await self._generate_async(request) + time_taken = time.perf_counter() - t + output_data = vLLMOutputData.from_vllm_engine_output(output) - return request, output_data.model_dump() + return request, output_data.model_dump(), time_taken async def generate_async_v0(self, request: vLLMEngineRequest) -> Any: """Process a single request. @@ -462,11 +471,20 @@ def __init__( if self.max_pending_requests > 0: logger.info("Max pending requests is set to %d", self.max_pending_requests) + exclude_safetensors = self.engine_kwargs.get("load_format") in [ + "runai_streamer", + "tensorizer", + ] + if exclude_safetensors: + download_model = NodeModelDownloadable.EXCLUDE_SAFETENSORS + else: + download_model = NodeModelDownloadable.MODEL_AND_TOKENIZER + # Download the model if needed. 
model_source = download_model_files( model_id=self.model, mirror_config=None, - download_model=NodeModelDownloadable.MODEL_AND_TOKENIZER, + download_model=download_model, download_extra_files=False, ) @@ -475,7 +493,7 @@ def __init__( model=self.model, model_source=model_source, idx_in_batch_column=self.IDX_IN_BATCH_COLUMN, - disable_log_requests=True, + enable_log_requests=False, max_pending_requests=self.max_pending_requests, dynamic_lora_loading_path=dynamic_lora_loading_path, **self.engine_kwargs, @@ -539,31 +557,30 @@ async def udf(self, batch: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any] The response of the vLLM engine. """ batch_uuid = uuid.uuid4() - t = time.perf_counter() + batch_start_time = time.perf_counter() tasks = [asyncio.create_task(self.llm.generate_async(row)) for row in batch] - time_taken = -1.0 for resp in asyncio.as_completed(tasks): - request, output = await resp - time_taken = time.perf_counter() - t + request, output, time_taken_llm = await resp yield { **output, "request_id": request.request_id, self.IDX_IN_BATCH_COLUMN: request.idx_in_batch, "batch_uuid": batch_uuid.hex, - "time_taken_llm": time_taken, + "time_taken_llm": time_taken_llm, "params": str(request.params), } + batch_time_taken = time.perf_counter() - batch_start_time # TODO: Add metrics to the UDf wrapper so that we don't need # timer in UDFs anymore. logger.info( "[vLLM] Elapsed time for batch %s with size %d: %s", batch_uuid.hex, len(batch), - time_taken, + batch_time_taken, ) # Log engine stats after each batch is done conditioned on the flag diff --git a/python/ray/llm/_internal/common/base_pydantic.py b/python/ray/llm/_internal/common/base_pydantic.py index 7add5baee6d8..ce4d49d8f955 100644 --- a/python/ray/llm/_internal/common/base_pydantic.py +++ b/python/ray/llm/_internal/common/base_pydantic.py @@ -13,7 +13,10 @@ class BaseModelExtended(BaseModel): # namespace as not protected. 
This means we need to be careful about overriding # internal attributes starting with `model_`. # See: https://github.com/anyscale/ray-llm/issues/1425 - model_config = ConfigDict(protected_namespaces=tuple()) + model_config = ConfigDict( + protected_namespaces=tuple(), + extra="forbid", + ) @classmethod def parse_yaml(cls: Type[ModelT], file, **kwargs) -> ModelT: diff --git a/python/ray/llm/_internal/common/observability/logging/__init__.py b/python/ray/llm/_internal/common/observability/logging/__init__.py index cc1e3ce04cfd..789ae4e09e9c 100644 --- a/python/ray/llm/_internal/common/observability/logging/__init__.py +++ b/python/ray/llm/_internal/common/observability/logging/__init__.py @@ -1,7 +1,7 @@ import logging from typing import Optional -from ray._private.ray_logging.filters import CoreContextFilter +from ray._common.filters import CoreContextFilter def _setup_logger(logger_name: str): diff --git a/python/ray/llm/_internal/common/utils/cloud_utils.py b/python/ray/llm/_internal/common/utils/cloud_utils.py index f654e044cd6e..0db75809efd5 100644 --- a/python/ray/llm/_internal/common/utils/cloud_utils.py +++ b/python/ray/llm/_internal/common/utils/cloud_utils.py @@ -148,7 +148,13 @@ def get_fs_and_path(object_uri: str) -> Tuple[pa_fs.FileSystem, str]: object_uri = f"{scheme}://{parts[1]}" if object_uri.startswith("s3://"): - fs = pa_fs.S3FileSystem(anonymous=anonymous) + endpoint = os.getenv("AWS_ENDPOINT_URL_S3", None) + virtual_hosted_style = os.getenv("AWS_S3_ADDRESSING_STYLE", None) + fs = pa_fs.S3FileSystem( + anonymous=anonymous, + endpoint_override=endpoint, + force_virtual_addressing=(virtual_hosted_style == "virtual"), + ) path = object_uri[5:] # Remove "s3://" elif object_uri.startswith("gs://"): fs = pa_fs.GcsFileSystem(anonymous=anonymous) @@ -227,6 +233,7 @@ def download_files( path: str, bucket_uri: str, substrings_to_include: Optional[List[str]] = None, + suffixes_to_exclude: Optional[List[str]] = None, ) -> None: """Download files from cloud 
storage to a local directory. @@ -234,6 +241,7 @@ def download_files( path: Local directory where files will be downloaded bucket_uri: URI of cloud directory substrings_to_include: Only include files containing these substrings + suffixes_to_exclude: Exclude certain files from download (e.g .safetensors) """ try: fs, source_path = CloudFileSystem.get_fs_and_path(bucket_uri) @@ -260,6 +268,11 @@ def download_files( ): continue + # Check if file matches suffixes to exclude filter + if suffixes_to_exclude: + if any(rel_path.endswith(suffix) for suffix in suffixes_to_exclude): + continue + # Create destination directory if needed if "/" in rel_path: dest_dir = os.path.join(path, os.path.dirname(rel_path)) @@ -277,7 +290,10 @@ def download_files( @staticmethod def download_model( - destination_path: str, bucket_uri: str, tokenizer_only: bool + destination_path: str, + bucket_uri: str, + tokenizer_only: bool, + exclude_safetensors: bool = False, ) -> None: """Download a model from cloud storage. 
@@ -288,6 +304,7 @@ def download_model( destination_path: Path where the model will be stored bucket_uri: URI of the cloud directory containing the model tokenizer_only: If True, only download tokenizer-related files + exclude_safetensors: If True, skip download of safetensor files """ try: fs, source_path = CloudFileSystem.get_fs_and_path(bucket_uri) @@ -327,10 +344,14 @@ def download_model( tokenizer_file_substrings = ( ["tokenizer", "config.json"] if tokenizer_only else [] ) + + safetensors_to_exclude = [".safetensors"] if exclude_safetensors else None + CloudFileSystem.download_files( path=destination_dir, bucket_uri=bucket_uri, substrings_to_include=tokenizer_file_substrings, + suffixes_to_exclude=safetensors_to_exclude, ) except Exception as e: diff --git a/python/ray/llm/_internal/common/utils/download_utils.py b/python/ray/llm/_internal/common/utils/download_utils.py index 2d4e0db908d0..88d8e208a226 100644 --- a/python/ray/llm/_internal/common/utils/download_utils.py +++ b/python/ray/llm/_internal/common/utils/download_utils.py @@ -24,6 +24,7 @@ class NodeModelDownloadable(enum.Enum): MODEL_AND_TOKENIZER = enum.auto() TOKENIZER_ONLY = enum.auto() + EXCLUDE_SAFETENSORS = enum.auto() NONE = enum.auto() def __bool__(self): @@ -36,7 +37,11 @@ def union(self, other: "NodeModelDownloadable") -> "NodeModelDownloadable": or other == NodeModelDownloadable.MODEL_AND_TOKENIZER ): return NodeModelDownloadable.MODEL_AND_TOKENIZER - + if ( + self == NodeModelDownloadable.EXCLUDE_SAFETENSORS + or other == NodeModelDownloadable.EXCLUDE_SAFETENSORS + ): + return NodeModelDownloadable.EXCLUDE_SAFETENSORS if ( self == NodeModelDownloadable.TOKENIZER_ONLY or other == NodeModelDownloadable.TOKENIZER_ONLY @@ -111,11 +116,13 @@ class CloudModelDownloader(CloudModelAccessor): def get_model( self, tokenizer_only: bool, + exclude_safetensors: bool = False, ) -> str: """Gets a model from cloud storage and stores it locally. 
Args: tokenizer_only: whether to download only the tokenizer files. + exclude_safetensors: whether to download safetensors files to disk. Returns: file path of model if downloaded, else the model id. """ @@ -135,10 +142,13 @@ def get_model( # This ensures that subsequent processes don't duplicate work. with FileLock(lock_path, timeout=0): try: + if exclude_safetensors: + logger.info("Skipping download of safetensors files.") CloudFileSystem.download_model( destination_path=path, bucket_uri=bucket_uri, tokenizer_only=tokenizer_only, + exclude_safetensors=exclude_safetensors, ) logger.info( "Finished downloading %s for %s from %s storage", @@ -282,7 +292,9 @@ def download_model_files( if download_model != NodeModelDownloadable.NONE: model_path_or_id = downloader.get_model( - tokenizer_only=download_model == NodeModelDownloadable.TOKENIZER_ONLY + tokenizer_only=download_model == NodeModelDownloadable.TOKENIZER_ONLY, + exclude_safetensors=download_model + == NodeModelDownloadable.EXCLUDE_SAFETENSORS, ) if download_extra_files: diff --git a/python/ray/llm/_internal/serve/builders/application_builders.py b/python/ray/llm/_internal/serve/builders/application_builders.py index a0f7607e14fa..c8de292f5cda 100644 --- a/python/ray/llm/_internal/serve/builders/application_builders.py +++ b/python/ray/llm/_internal/serve/builders/application_builders.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Sequence +from typing import Any, Dict, List, Optional, Sequence, overload from ray.llm._internal.serve.configs.server_models import ( LLMConfig, @@ -21,14 +21,18 @@ def build_llm_deployment( *, name_prefix: Optional[str] = None, deployment_kwargs: Optional[dict] = None, + override_serve_options: Optional[dict] = None, ) -> Application: - name_prefix = name_prefix or "LLMDeployment" + name_prefix = name_prefix or "LLMServer:" deployment_kwargs = deployment_kwargs or {} deployment_options = llm_config.get_serve_options( name_prefix=name_prefix, ) + if override_serve_options: + 
deployment_options.update(override_serve_options) + return LLMDeployment.options(**deployment_options).bind( llm_config=llm_config, **deployment_kwargs ) @@ -52,6 +56,11 @@ def _get_llm_deployments( return llm_deployments +@overload +def build_openai_app(llm_serving_args: Dict[str, Any]) -> Application: + ... + + def build_openai_app(llm_serving_args: LLMServingArgs) -> Application: rayllm_args = LLMServingArgs.model_validate(llm_serving_args).parse_args() diff --git a/python/ray/llm/_internal/serve/configs/openai_api_models.py b/python/ray/llm/_internal/serve/configs/openai_api_models.py index bb0b195d93f4..78d4f4687e25 100644 --- a/python/ray/llm/_internal/serve/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/configs/openai_api_models.py @@ -21,7 +21,10 @@ EmbeddingChatRequest as vLLMEmbeddingChatRequest, EmbeddingCompletionRequest as vLLMEmbeddingCompletionRequest, EmbeddingResponse as vLLMEmbeddingResponse, + ErrorInfo as vLLMErrorInfo, ErrorResponse as vLLMErrorResponse, + ScoreRequest as vLLMScoreRequest, + ScoreResponse as vLLMScoreResponse, ) from vllm.utils import random_uuid @@ -41,6 +44,10 @@ class ChatCompletionStreamResponse(vLLMChatCompletionStreamResponse): model_config = ConfigDict(arbitrary_types_allowed=True) +class ErrorInfo(vLLMErrorInfo): + model_config = ConfigDict(arbitrary_types_allowed=True) + + class ErrorResponse(vLLMErrorResponse): model_config = ConfigDict(arbitrary_types_allowed=True) @@ -89,12 +96,24 @@ class EmbeddingResponse(vLLMEmbeddingResponse): model_config = ConfigDict(arbitrary_types_allowed=True) +class ScoreRequest(vLLMScoreRequest): + model_config = ConfigDict(arbitrary_types_allowed=True) + + +class ScoreResponse(vLLMScoreResponse): + model_config = ConfigDict(arbitrary_types_allowed=True) + + EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest] LLMEmbeddingsResponse = Union[ AsyncGenerator[Union[EmbeddingResponse, ErrorResponse], None], ] +LLMScoreResponse = Union[ + 
AsyncGenerator[Union[ScoreResponse, ErrorResponse], None], +] + LLMChatResponse = Union[ AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None], ] diff --git a/python/ray/llm/_internal/serve/configs/server_models.py b/python/ray/llm/_internal/serve/configs/server_models.py index cacdf4aeeba3..d7a7b9af9f75 100644 --- a/python/ray/llm/_internal/serve/configs/server_models.py +++ b/python/ray/llm/_internal/serve/configs/server_models.py @@ -13,7 +13,6 @@ import pydantic from pydantic import ( BaseModel, - ConfigDict, Field, PositiveInt, PrivateAttr, @@ -30,11 +29,15 @@ ) from ray.llm._internal.common.utils.import_utils import try_import from ray.llm._internal.serve.configs.constants import ( + DEFAULT_MAX_ONGOING_REQUESTS, DEFAULT_MULTIPLEX_DOWNLOAD_TIMEOUT_S, DEFAULT_MULTIPLEX_DOWNLOAD_TRIES, ENABLE_WORKER_PROCESS_SETUP_HOOK, MODEL_RESPONSE_BATCH_TIMEOUT_MS, ) +from ray.llm._internal.serve.deployments.llm.vllm.kv_transfer_backends import ( + SUPPORTED_BACKENDS as SUPPORTED_KV_CONNECTOR_BACKENDS, +) from ray.llm._internal.serve.observability.logging import get_logger from ray.serve._private.config import DeploymentConfig @@ -108,6 +111,7 @@ def validate_dynamic_lora_loading_path(cls, value: Optional[str]): class ModelLoadingConfig(BaseModelExtended): + model_id: str = Field( description="The ID that should be used by end users to access this model.", ) @@ -134,22 +138,17 @@ class ModelLoadingConfig(BaseModelExtended): class LLMConfig(BaseModelExtended): - # model_config is a Pydantic setting. This setting merges with - # model_configs in parent classes. - model_config = ConfigDict( - extra="forbid", - ) runtime_env: Optional[Dict[str, Any]] = Field( - None, + default=None, description=( "The runtime_env to use for the model deployment replica " "and the engine workers." ), ) - model_loading_config: ModelLoadingConfig = Field( - description="The settings for how to download and expose the model." 
+ model_loading_config: Union[Dict[str, Any], ModelLoadingConfig] = Field( + description="The settings for how to download and expose the model. Validated against ModelLoadingConfig." ) llm_engine: str = Field( @@ -179,8 +178,9 @@ class LLMConfig(BaseModelExtended): description=f"The type of accelerator runs the model on. Only the following values are supported: {str([t.value for t in GPUType])}", ) - lora_config: Optional[LoraConfig] = Field( - default=None, description="Settings for LoRA adapter." + lora_config: Optional[Union[Dict[str, Any], LoraConfig]] = Field( + default=None, + description="Settings for LoRA adapter. Validated against LoraConfig.", ) deployment_config: Dict[str, Any] = Field( @@ -192,7 +192,7 @@ class LLMConfig(BaseModelExtended): `autoscaling_config`, `max_queued_requests`, `user_config`, `health_check_period_s`, `health_check_timeout_s`, `graceful_shutdown_wait_loop_s`, `graceful_shutdown_timeout_s`, - `logging_config`. + `logging_config`, `request_router_config`. For more details, see the `Ray Serve Documentation `_. """, ) @@ -211,7 +211,7 @@ class LLMConfig(BaseModelExtended): ) log_engine_metrics: Optional[bool] = Field( - False, + default=False, description="Enable additional engine metrics via Ray Prometheus port. Only compatible with V1 vLLM engine. NOTE: once v1 is fully rolled out, we will remove this flag and turn it on by default.", ) @@ -225,8 +225,16 @@ def _infer_supports_vision(self, model_id_or_path: str) -> None: attribute based on whether the config has `vision_config`. All LVM models has `vision_config` setup. 
""" - hf_config = transformers.PretrainedConfig.from_pretrained(model_id_or_path) - self._supports_vision = hasattr(hf_config, "vision_config") + try: + hf_config = transformers.PretrainedConfig.from_pretrained(model_id_or_path) + self._supports_vision = hasattr(hf_config, "vision_config") + except Exception as e: + raise ValueError( + f"Failed to load Hugging Face config for model_id='{model_id_or_path}'.\ + Ensure `model_id` is a valid Hugging Face repo or a local path that \ + contains a valid `config.json` file. " + f"Original error: {repr(e)}" + ) from e def _set_model_architecture( self, @@ -238,9 +246,23 @@ def _set_model_architecture( attribute based on whether the config has `architectures`. """ if model_id_or_path: - hf_config = transformers.PretrainedConfig.from_pretrained(model_id_or_path) - if hasattr(hf_config, "architectures") and hf_config.architectures: - self._model_architecture = hf_config.architectures[0] + try: + hf_config = transformers.PretrainedConfig.from_pretrained( + model_id_or_path + ) + if ( + hf_config + and hasattr(hf_config, "architectures") + and hf_config.architectures + ): + self._model_architecture = hf_config.architectures[0] + except Exception as e: + raise ValueError( + f"Failed to load Hugging Face config for model_id='{model_id_or_path}'.\ + Ensure `model_id` is a valid Hugging Face repo or a local path that \ + contains a valid `config.json` file. 
" + f"Original error: {repr(e)}" + ) from e if model_architecture: self._model_architecture = model_architecture @@ -312,6 +334,36 @@ def validate_deployment_config(cls, value: Dict[str, Any]) -> Dict[str, Any]: return value + @field_validator("model_loading_config") + def validate_model_loading_config( + cls, value: Union[Dict[str, Any], ModelLoadingConfig] + ) -> ModelLoadingConfig: + """Validates the model loading config dictionary.""" + if isinstance(value, ModelLoadingConfig): + return value + + try: + model_loading_config = ModelLoadingConfig(**value) + except Exception as e: + raise ValueError(f"Invalid model_loading_config: {value}") from e + + return model_loading_config + + @field_validator("lora_config") + def validate_lora_config( + cls, value: Optional[Union[Dict[str, Any], LoraConfig]] + ) -> Optional[LoraConfig]: + """Validates the lora config dictionary.""" + if value is None or isinstance(value, LoraConfig): + return value + + try: + lora_config = LoraConfig(**value) + except Exception as e: + raise ValueError(f"Invalid lora_config: {value}") from e + + return lora_config + @model_validator(mode="after") def _check_log_stats_with_metrics(self): # Require disable_log_stats is not set to True when log_engine_metrics is enabled. @@ -360,6 +412,50 @@ def get_engine_config(self) -> EngineConfigType: return self._engine_config + def update_engine_kwargs(self, **kwargs: Any) -> None: + """Update the engine_kwargs and the engine_config engine_kwargs. + + This is typically called during engine starts, when certain engine_kwargs + (e.g., data_parallel_rank) become available. + """ + self.engine_kwargs.update(kwargs) + # engine_config may be created before engine starts, this makes sure + # the engine_config is updated with the latest engine_kwargs. 
+ if self._engine_config: + self._engine_config.engine_kwargs.update(kwargs) + + def _merge_replica_actor_and_child_actor_bundles( + self, + child_actor_bundles: List[Dict[str, float]], + replica_actor_bundle: Dict[str, float], + ) -> List[Dict[str, float]]: + """Sum up the bundles from replica actor bundles with the first bundle from child actor bundles. + + This is because the replica actor will use the first bundle in the list, and we want to collocate the replica actor with the child actor. + So we need to group them together. + + So for example: + child_actor_bundles = [{"GPU": 1, "CPU": 1}, {"GPU": 1, "CPU": 1}] + replica_actor_bundle = {"GPU": 0, "CPU": 1, "memory": 100} + return [{"GPU": 1, "CPU": 2, "memory": 100}, {"GPU": 1, "CPU": 1}] + """ + + if not child_actor_bundles: + return [replica_actor_bundle] + + if not replica_actor_bundle: + return child_actor_bundles + + first_bundle = child_actor_bundles[0] + bundle_key_set = set(first_bundle.keys()) | set(replica_actor_bundle.keys()) + + for key in bundle_key_set: + first_bundle[key] = replica_actor_bundle.get(key, 0) + first_bundle.get( + key, 0 + ) + + return [first_bundle] + child_actor_bundles[1:] + def _set_deployment_placement_options(self) -> Dict[str, Any]: deployment_config = self.deployment_config engine_config = self.get_engine_config() @@ -385,15 +481,17 @@ def _set_deployment_placement_options(self) -> Dict[str, Any]: ) try: - bundles = engine_config.placement_bundles + child_actor_bundles = engine_config.placement_bundles except ValueError: # May happen if all bundles are empty. 
- bundles = [] + child_actor_bundles = [] - bundles = [replica_actor_resources] + bundles + pg_bundles = self._merge_replica_actor_and_child_actor_bundles( + child_actor_bundles, replica_actor_resources + ) deployment_config.update( { - "placement_group_bundles": bundles, + "placement_group_bundles": pg_bundles, "placement_group_strategy": engine_config.placement_strategy, } ) @@ -436,7 +534,7 @@ def get_serve_options( The dictionary to use in .options() when creating the deployment. """ - deployment_config = self._set_deployment_placement_options() + deployment_options = self._set_deployment_placement_options() default_runtime_env = ray.get_runtime_context().runtime_env if ENABLE_WORKER_PROCESS_SETUP_HOOK: @@ -444,22 +542,73 @@ def get_serve_options( "worker_process_setup_hook" ] = "ray.llm._internal.serve._worker_process_setup_hook" - ray_actor_options = deployment_config.get("ray_actor_options", {}) + ray_actor_options = deployment_options.get("ray_actor_options", {}) ray_actor_options["runtime_env"] = { **default_runtime_env, # Existing runtime_env should take precedence over the default. **ray_actor_options.get("runtime_env", {}), **(self.runtime_env if self.runtime_env else {}), } - deployment_config["ray_actor_options"] = ray_actor_options + deployment_options["ray_actor_options"] = ray_actor_options # Set the name of the deployment config to map to the model ID. - if "name" not in deployment_config: - deployment_config["name"] = self._get_deployment_name() + if "name" not in deployment_options: + deployment_options["name"] = self._get_deployment_name() if name_prefix: - deployment_config["name"] = name_prefix + deployment_config["name"] + deployment_options["name"] = name_prefix + deployment_options["name"] - return deployment_config + # Configure DP deployment options. 
+ # TODO(rui): move the following to DPServer, e.g., + # deployment_options = DPServer.get_deployment_options(llm_config) + dp_size = self.engine_kwargs.get("data_parallel_size", 1) + if not (isinstance(dp_size, int) and dp_size > 0): + raise ValueError( + f"Invalid data_parallel_size: {dp_size}, expecting " "positive integer." + ) + if dp_size != 1: + if "num_replicas" in deployment_options: + raise ValueError( + "num_replicas should not be specified for DP deployment, " + f"use engine_kwargs.data_parallel_size={dp_size} instead." + ) + if "autoscaling_config" in deployment_options: + raise ValueError( + "autoscaling_config is not supported for DP deployment, " + f"use engine_kwargs.data_parallel_size={dp_size} to set a " + "fixed number of replicas instead." + ) + deployment_options["num_replicas"] = dp_size + deployment_options["max_ongoing_requests"] = DEFAULT_MAX_ONGOING_REQUESTS + if deployment_options["placement_group_strategy"] != "STRICT_PACK": + logger.warning( + f"DP deployment with placement_strategy={deployment_options['placement_group_strategy']} " + "is not supported. Using STRICT_PACK instead." + ) + deployment_options["placement_group_strategy"] = "STRICT_PACK" + + return deployment_options + + def setup_engine_backend(self): + self._setup_kv_connector_backend() + + def _setup_kv_connector_backend(self): + """Private method to setup kv connector depending on the local deployment state""" + # 1. validate that the backend is one of the backends supported (Nixl or LMCache) + kv_transfer_config = self.engine_kwargs.get("kv_transfer_config") + if not kv_transfer_config: + return + + kv_connector = kv_transfer_config.get("kv_connector") + if not kv_connector: + raise ValueError("Connector type is not specified.") + + kv_connector_backend_class = SUPPORTED_KV_CONNECTOR_BACKENDS.get(kv_connector) + if not kv_connector_backend_class: + raise ValueError(f"Unsupported connector type: {kv_connector}") + + # 2. 
Setup the backend + kv_connector_backend = kv_connector_backend_class(self) + kv_connector_backend.setup() def _is_yaml_file(filename: str) -> bool: @@ -537,7 +686,7 @@ def parse_args( return models -class LLMServingArgs(BaseModel): +class LLMServingArgs(BaseModelExtended): llm_configs: List[Union[str, LLMConfig]] = Field( description="A list of LLMConfigs, or paths to LLMConfigs, to run.", ) diff --git a/python/ray/train/v2/lightning/__init__.py b/python/ray/llm/_internal/serve/deployments/data_parallel/__init__.py similarity index 100% rename from python/ray/train/v2/lightning/__init__.py rename to python/ray/llm/_internal/serve/deployments/data_parallel/__init__.py diff --git a/python/ray/llm/_internal/serve/deployments/data_parallel/dp_rank_assigner.py b/python/ray/llm/_internal/serve/deployments/data_parallel/dp_rank_assigner.py new file mode 100644 index 000000000000..ac65ba259216 --- /dev/null +++ b/python/ray/llm/_internal/serve/deployments/data_parallel/dp_rank_assigner.py @@ -0,0 +1,131 @@ +import asyncio +import logging +from typing import Dict, List, Optional + +from ray import serve + +logger = logging.getLogger(__name__) + + +@serve.deployment(num_replicas=1) +class DPRankAssigner: + """ + Data Parallel Rank Assigner. + + This class is used to assign a rank to each replica in the data parallel + deployment. 
+ """ + + def __init__(self, dp_size: int, dp_size_per_node: Optional[int] = None): + self.dp_size: int = dp_size + self.dp_size_per_node: Optional[int] = dp_size_per_node + self.lock: asyncio.Lock = asyncio.Lock() + self.dp_address: Optional[str] = None + self.dp_rpc_port: Optional[int] = None + self.master_info_event: asyncio.Event = asyncio.Event() + + # Fields for _register_random_placement(): + # Next rank to assign + self.next_rank: Optional[int] = None + + # Fields for _register_node_pack_placement(): + # Number of nodes to assign to + self.num_nodes: Optional[int] = None + # Map from node id to available ranks + self.node_to_avail_ranks: Dict[str, List[int]] = {} + + if dp_size_per_node is None: + self.next_rank = 0 + logger.info( + f"Using random placement rank assigner for DP size {self.dp_size}" + ) + else: + if self.dp_size_per_node <= 0: + raise ValueError( + f"dp_size_per_node {self.dp_size_per_node} must be greater than 0" + ) + if self.dp_size % self.dp_size_per_node != 0: + raise ValueError( + f"dp_size {self.dp_size} must be divisible by dp_size_per_node {self.dp_size_per_node}" + ) + self.num_nodes = self.dp_size // self.dp_size_per_node + logger.info( + f"Using node pack placement rank assigner for DP size {self.dp_size}" + f"with dp_size_per_node {self.dp_size_per_node}" + ) + + async def register( + self, replica_ctx: "serve.context.ReplicaContext", node_id: Optional[str] = None + ): + """ + Register a replica and assign a rank to it. + + Args: + replica_ctx: The replica context. + node_id: The node id of the replica. + + Returns: + The rank of the replica. + """ + if self.dp_size_per_node is None: + return await self._register_random_placement() + else: + if node_id is None: + raise ValueError("node_id is required for node pack placement") + return await self._register_node_pack_placement(node_id) + + async def _register_random_placement(self): + """ + Assign a rank based on random placement. 
+ + The ranks are assigned in a random order, regardless of its node id. + """ + async with self.lock: + if self.next_rank >= self.dp_size: + raise ValueError( + f"Attempted to assign rank {self.next_rank} but dp_size is {self.dp_size}" + ) + # TODO(rui): instead of using the naive increment approach, + # we should use the Ray Serve Replica Rank API to assign ranks. + rank = self.next_rank + self.next_rank += 1 + return rank + + async def _register_node_pack_placement(self, node_id: str): + """ + Assign a rank based on node pack placement. + + This should be used for DeepEP which assumes that the ranks ranging from + [dp_rank_per_node * node_rank, dp_rank_per_node * (node_rank + 1) - 1] are + assigned to the same node. + + For example, if dp_size_per_node is 8, and there are 16 ranks in total, then + the ranks [0, 7] should be assigned to one node, and ranks [8, 15] should be + assigned to another node. + """ + async with self.lock: + if not self.node_to_avail_ranks: + self.node_to_avail_ranks[node_id] = list( + range(1, self.dp_size_per_node) + ) + return 0 + elif node_id not in self.node_to_avail_ranks: + node_rank = len(self.node_to_avail_ranks) + assert node_rank < self.num_nodes + rank = node_rank * self.dp_size_per_node + self.node_to_avail_ranks[node_id] = list( + range(rank + 1, rank + self.dp_size_per_node) + ) + return rank + else: + rank = self.node_to_avail_ranks[node_id].pop(0) + return rank + + async def set_dp_master_info(self, dp_address: str, dp_rpc_port: int): + self.dp_address = dp_address + self.dp_rpc_port = dp_rpc_port + self.master_info_event.set() + + async def get_dp_master_info(self): + await self.master_info_event.wait() + return self.dp_address, self.dp_rpc_port diff --git a/python/ray/llm/_internal/serve/deployments/data_parallel/dp_server.py b/python/ray/llm/_internal/serve/deployments/data_parallel/dp_server.py new file mode 100644 index 000000000000..11c8d8e25e98 --- /dev/null +++ 
b/python/ray/llm/_internal/serve/deployments/data_parallel/dp_server.py @@ -0,0 +1,102 @@ +import logging +import time +from typing import Optional + +from ray import serve +from ray.experimental.collective.util import get_address_and_port +from ray.llm._internal.serve.configs.server_models import LLMConfig +from ray.llm._internal.serve.deployments.data_parallel.dp_rank_assigner import ( + DPRankAssigner, +) +from ray.llm._internal.serve.deployments.llm.llm_server import LLMServer +from ray.runtime_context import get_runtime_context +from ray.serve.deployment import Application +from ray.serve.handle import DeploymentHandle + +logger = logging.getLogger(__name__) + + +class DPServer(LLMServer): + """ + Data Parallel LLM Server. + + This class is used to serve data parallel attention (DP Attention) + deployment paradigm, where the attention layers are replicated and + the MoE layers are sharded. DP Attention is typically used for models + like DeepSeek-V3. + """ + + async def __init__(self, llm_config: LLMConfig, dp_rank_assigner: DeploymentHandle): + self.dp_rank_assigner = dp_rank_assigner + + replica_ctx = serve.get_replica_context() + node_id = get_runtime_context().get_node_id() + self.dp_rank = await self.dp_rank_assigner.register.remote(replica_ctx, node_id) + + logger.info(f"DP rank {self.dp_rank} registered with rank assigner") + + if self.dp_rank == 0: + self.dp_address, self.dp_rpc_port = get_address_and_port() + await self.dp_rank_assigner.set_dp_master_info.remote( + self.dp_address, self.dp_rpc_port + ) + logger.info( + f"DP rank {self.dp_rank} has set DP master info: " + f"data_parallel_address={self.dp_address}, " + f"data_parallel_rpc_port={self.dp_rpc_port}" + ) + else: + timestamp = time.time() + ( + self.dp_address, + self.dp_rpc_port, + ) = await self.dp_rank_assigner.get_dp_master_info.remote() + logger.info( + f"DP rank {self.dp_rank} got DP master info: " + f"data_parallel_address={self.dp_address}, " + 
f"data_parallel_rpc_port={self.dp_rpc_port}, " + f"waited {time.time() - timestamp:.3f} seconds" + ) + + # Update the engine_kwargs to assign the DP information + llm_config.update_engine_kwargs( + data_parallel_rank=self.dp_rank, + data_parallel_address=self.dp_address, + data_parallel_rpc_port=self.dp_rpc_port, + ) + + await super().__init__(llm_config) + + @classmethod + def as_deployment(cls, deployment_options: dict) -> serve.Deployment: + return serve.deployment(cls).options(**deployment_options) + + +def build_dp_deployment( + llm_config: LLMConfig, + *, + name_prefix: Optional[str] = None, + options_override: Optional[dict] = None, +) -> Application: + """Build a data parallel LLM deployment.""" + dp_size = llm_config.engine_kwargs.get("data_parallel_size", 1) + if dp_size == 1: + raise ValueError( + "data_parallel_size should be greater than 1 for DP deployment." + ) + + # TODO(rui): figure out a better way to pass in dp_size_per_node. + # NOTE: we cannot use engine_kwargs.data_parallel_size_local to specify + # the number of ranks per node because that has special semantics in vLLM. 
+ dp_size_per_node = llm_config.experimental_configs.get("dp_size_per_node", None) + + dp_rank_assigner = DPRankAssigner.bind( + dp_size=dp_size, dp_size_per_node=dp_size_per_node + ) + deployment_options = llm_config.get_serve_options(name_prefix=name_prefix) + if options_override: + deployment_options.update(options_override) + + return DPServer.as_deployment(deployment_options).bind( + llm_config=llm_config, dp_rank_assigner=dp_rank_assigner + ) diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py index ea20ab3d18d3..89bf1a2f5cc9 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py @@ -36,6 +36,10 @@ async def resolve_lora(self, lora_model: DiskMultiplexConfig): """Mounts the LoRA model on the engine, given the local disk path.""" pass + @abc.abstractmethod + async def reset_prefix_cache(self) -> None: + """Reset the prefix cache of the underlying engine""" + @abc.abstractmethod async def chat( self, request: "ChatCompletionRequest" diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index 551329753526..faa9f98f87b5 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -52,6 +52,8 @@ EmbeddingRequest, EmbeddingResponse, ErrorResponse, + ScoreRequest, + ScoreResponse, ) logger = get_logger(__name__) @@ -104,6 +106,18 @@ async def check_health(self) -> None: """ ... + @abstractmethod + async def reset_prefix_cache(self) -> None: + """Reset the prefix cache of the underlying engine""" + + @abstractmethod + async def start_profile(self) -> None: + """Start profiling""" + + @abstractmethod + async def stop_profile(self) -> None: + """Stop profiling""" + # TODO (Kourosh): This does not belong here. 
async def llm_config(self) -> Optional[LLMConfig]: return None @@ -294,7 +308,10 @@ def _batch_output_stream( async def _run_request( self, request: Union[ - "ChatCompletionRequest", "CompletionRequest", "EmbeddingRequest" + "ChatCompletionRequest", + "CompletionRequest", + "EmbeddingRequest", + "ScoreRequest", ], *, engine_method: str, @@ -380,6 +397,24 @@ async def embeddings( request, engine_method="embeddings", batch_output_stream=False ) + async def score( + self, request: "ScoreRequest" + ) -> AsyncGenerator[Union["ScoreResponse", "ErrorResponse"], None]: + """Runs a score request to the engine and returns the response. + + Returns an AsyncGenerator over the ScoreResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, embeddings, and score. + + Args: + request: A ScoreRequest object. + + Returns: + An AsyncGenerator over the ScoreResponse object. + """ + # NOTE: Score does not need batching, similar to embeddings. + return await self._run_request( + request, engine_method="score", batch_output_stream=False + ) + async def check_health(self) -> None: """ Check the health of the replica. Does not return anything. 
Raise error when @@ -393,6 +428,41 @@ async def check_health(self) -> None: logger.error("Engine health check failed in LLMServer.check_health: %s", e) raise e + async def reset_prefix_cache(self) -> None: + """Reset the prefix cache of the underlying engine""" + if self.engine is None: + return + try: + await self.engine.reset_prefix_cache() + except Exception as e: + logger.error( + "Engine reset prefix cache failed in LLMServer.reset_prefix_cache: %s", + e, + ) + raise e + + async def start_profile(self) -> None: + """Start profiling""" + if self.engine is None: + return + try: + await self.engine.start_profile() + except Exception as e: + logger.error( + "Engine start profile failed in LLMServer.start_profile: %s", e + ) + raise e + + async def stop_profile(self) -> None: + """Stop profiling""" + if self.engine is None: + return + try: + await self.engine.stop_profile() + except Exception as e: + logger.error("Engine stop profile failed in LLMServer.stop_profile: %s", e) + raise e + async def llm_config(self) -> Optional[LLMConfig]: return self._llm_config diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/kv_transfer_backends/__init__.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/kv_transfer_backends/__init__.py new file mode 100644 index 000000000000..72e24a18bad5 --- /dev/null +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/kv_transfer_backends/__init__.py @@ -0,0 +1,16 @@ +from typing import Dict + +from ray.llm._internal.serve.deployments.llm.vllm.kv_transfer_backends.base import ( + BaseConnectorBackend, +) +from ray.llm._internal.serve.deployments.llm.vllm.kv_transfer_backends.lmcache_connector_v1 import ( + LMCacheConnectorV1Backend, +) +from ray.llm._internal.serve.deployments.llm.vllm.kv_transfer_backends.nixl_connector import ( + NixlConnectorBackend, +) + +SUPPORTED_BACKENDS: Dict[str, BaseConnectorBackend] = { + "LMCacheConnectorV1": LMCacheConnectorV1Backend, + "NixlConnector": NixlConnectorBackend, +} diff --git 
a/python/ray/llm/_internal/serve/deployments/llm/vllm/kv_transfer_backends/base.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/kv_transfer_backends/base.py new file mode 100644 index 000000000000..2999a147c887 --- /dev/null +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/kv_transfer_backends/base.py @@ -0,0 +1,44 @@ +import abc +import random +import string +from typing import TYPE_CHECKING, Any, Dict + +if TYPE_CHECKING: + from ray.llm._internal.serve.configs.server_models import LLMConfig + + +class BaseConnectorBackend(abc.ABC): + def __init__(self, llm_config: "LLMConfig"): + """Base class for connector backends. + + Args: + llm_config: The llm configuration for this engine + """ + self.llm_config = llm_config + + @property + def kv_transfer_config(self) -> Dict[str, Any]: + engine_kwargs = self.llm_config.engine_kwargs + kv_transfer_config = engine_kwargs.get("kv_transfer_config") + assert ( + kv_transfer_config is not None + ), "In Connector backend, kv_transfer_config is not set" + return kv_transfer_config + + def _get_unique_suffix(self, len: int = 6) -> str: + """Generates unique alphanumeric suffix. + + Args: + len: Length of the suffix to generate. + Returns: + A unique alphanumeric suffix string of specified length. + """ + return "".join(random.choices(string.ascii_letters + string.digits, k=len)) + + @abc.abstractmethod + def setup(self) -> None: + """Setup the connector backend. + + This method is called to setup the connector backend. 
+ """ + pass diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/kv_transfer_backends/lmcache_connector_v1.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/kv_transfer_backends/lmcache_connector_v1.py new file mode 100644 index 000000000000..85945c30eb14 --- /dev/null +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/kv_transfer_backends/lmcache_connector_v1.py @@ -0,0 +1,61 @@ +from ray.llm._internal.serve.deployments.llm.vllm.kv_transfer_backends.base import ( + BaseConnectorBackend, +) +from ray.llm._internal.serve.observability.logging import get_logger + +logger = get_logger(__name__) + + +def _check_lmcache_installed(): + try: + import lmcache # noqa: F401 + except ImportError: + raise ImportError( + "LMCache is not installed. Please install it with `pip install lmcache`." + ) + + +class LMCacheConnectorV1Backend(BaseConnectorBackend): + + KV_CONNECTOR_EXTRA_CONFIG_FIELD_NAME = "kv_connector_extra_config" + LMCACHE_RPC_PORT_FIELD_NAME = "lmcache_rpc_port" + DEFAULT_LMCACHE_RPC_PORT_NAME = "lmcache_rpc_port" + + def setup(self) -> None: + """Initialize the LMCache connector backend. + This method sets up the LMCache connector by: + 1. Checking if LMCache is installed. + 2. Configuring the LMCache RPC port if not already set. + 3. Creating a unique LMCache RPC port across replicas. + Raises: + ImportError: If LMCache is not installed. 
+ """ + _check_lmcache_installed() + + if ( + LMCacheConnectorV1Backend.KV_CONNECTOR_EXTRA_CONFIG_FIELD_NAME + not in self.kv_transfer_config + ): + return + + kv_connector_extra_config = self.kv_transfer_config[ + LMCacheConnectorV1Backend.KV_CONNECTOR_EXTRA_CONFIG_FIELD_NAME + ] + lmcache_rpc_port = ( + kv_connector_extra_config.get( + LMCacheConnectorV1Backend.LMCACHE_RPC_PORT_FIELD_NAME, + LMCacheConnectorV1Backend.DEFAULT_LMCACHE_RPC_PORT_NAME, + ) + + self._get_unique_suffix() + ) + if ( + LMCacheConnectorV1Backend.LMCACHE_RPC_PORT_FIELD_NAME + in kv_connector_extra_config + ): + logger.info( + f"Setting unique {lmcache_rpc_port=} for current replica LMCacheConnectorV1." + ) + + kv_connector_extra_config[ + LMCacheConnectorV1Backend.LMCACHE_RPC_PORT_FIELD_NAME + ] = lmcache_rpc_port diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/kv_transfer_backends/nixl_connector.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/kv_transfer_backends/nixl_connector.py new file mode 100644 index 000000000000..76036a5f3117 --- /dev/null +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/kv_transfer_backends/nixl_connector.py @@ -0,0 +1,64 @@ +import os + +from ray.llm._internal.serve.deployments.llm.vllm.kv_transfer_backends.base import ( + BaseConnectorBackend, +) + + +class NixlConnectorBackend(BaseConnectorBackend): + def _set_side_channel_port(self): + from vllm import envs as vllm_envs, utils as vllm_utils + + if not vllm_envs.is_set("VLLM_NIXL_SIDE_CHANNEL_PORT"): + base_port: int = int( + self.llm_config.experimental_configs.get( + "NIXL_SIDE_CHANNEL_PORT_BASE", vllm_utils.get_open_port() + ) + ) + # If dp_rank is set, we should use the + # base port + dp_rank as the side channel port + # due to a potential ray condition for getting the free ports. 
+ dp_rank = self.llm_config.engine_kwargs.get("data_parallel_rank", 0) + port = base_port + dp_rank + os.environ["VLLM_NIXL_SIDE_CHANNEL_PORT"] = str(port) + + def _set_side_channel_host(self): + from vllm import envs as vllm_envs, utils as vllm_utils + + if not vllm_envs.is_set("VLLM_NIXL_SIDE_CHANNEL_HOST"): + os.environ["VLLM_NIXL_SIDE_CHANNEL_HOST"] = vllm_utils.get_ip() + + def setup(self) -> None: + """Initialize the NIXL connector backend. + + This method sets up the NIXL (Network Interface for eXtended LLM) connector by: + 1. Verifying that the required vLLM environment variables are supported + 2. Configuring the side channel port and host if not already set + 3. Creating a unique engine ID across replicas + + The side channel is used for KV cache transfer between vLLM instances. + + Raises: + ValueError: If the current vLLM version doesn't support the required + NIXL environment variables. + """ + from vllm import envs as vllm_envs + + if ( + "VLLM_NIXL_SIDE_CHANNEL_PORT" not in vllm_envs.environment_variables + or "VLLM_NIXL_SIDE_CHANNEL_HOST" not in vllm_envs.environment_variables + ): + raise ValueError( + "This vLLM version does not support VLLM_NIXL_SIDE_CHANNEL_PORT" + "or VLLM_NIXL_SIDE_CHANNEL_HOST environment variable. It's likely" + "that you are using an older version of vLLM." + ) + + self._set_side_channel_port() + self._set_side_channel_host() + + # We need to overwrite the engine_id to make it unique across replicas. 
+ engine_id = self.kv_transfer_config.get("engine_id", self._get_unique_suffix()) + host = vllm_envs.VLLM_NIXL_SIDE_CHANNEL_HOST + port = vllm_envs.VLLM_NIXL_SIDE_CHANNEL_PORT + self.kv_transfer_config["engine_id"] = "-".join([engine_id, host, str(port)]) diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index b55c27120179..d8eeea45801a 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -1,11 +1,9 @@ import argparse import os -import uuid from typing import TYPE_CHECKING, AsyncGenerator, Optional, Tuple, Union from starlette.datastructures import State from starlette.requests import Request -from transformers.dynamic_module_utils import init_hf_modules from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.openai.cli_args import FrontendArgs from vllm.entrypoints.openai.protocol import ErrorResponse as VLLMErrorResponse @@ -19,7 +17,10 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, + ErrorInfo, ErrorResponse, + ScoreRequest, + ScoreResponse, ) from ray.llm._internal.serve.configs.server_models import ( DiskMultiplexConfig, @@ -44,6 +45,7 @@ from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_models import OpenAIServingModels + from vllm.entrypoints.openai.serving_score import ServingScores vllm = try_import("vllm") logger = get_logger(__name__) @@ -108,51 +110,20 @@ def __init__( """ super().__init__(llm_config) - # Ensure transformers_modules is initialized early in worker processes. - # This is critical for models with trust_remote_code=True to avoid pickle errors. - init_hf_modules() - self.llm_config = llm_config if vllm is None: raise ImportError( "vLLM is not installed. 
Please install it with `pip install ray[llm]`." ) - from vllm import envs as vllm_envs, utils as vllm_utils + from vllm import envs as vllm_envs if not vllm_envs.VLLM_USE_V1: logger.warning( "vLLM v0 is getting fully deprecated. As a result in Ray Serve LLM only v1 is supported. Only when you know what you are doing, you can set VLLM_USE_V1=0" ) - # TODO (Kourosh): This validation logic belongs to the PDProxy module. - # Pick a random port in P/D case. - kv_transfer_config = llm_config.engine_kwargs.get("kv_transfer_config", None) - if kv_transfer_config is not None: - connector_type = getattr(kv_transfer_config, "kv_connector", "") - if connector_type != "NixlConnector": - raise ValueError("Only NixlConnector is supported for kv transfer.") - if ( - "VLLM_NIXL_SIDE_CHANNEL_PORT" not in vllm_envs.environment_variables - or "VLLM_NIXL_SIDE_CHANNEL_HOST" not in vllm_envs.environment_variables - ): - raise ValueError( - "This vLLM version does not support VLLM_NIXL_SIDE_CHANNEL_PORT" - "or VLLM_NIXL_SIDE_CHANNEL_HOST environment variable. It's likely" - "that you are using an older version of vLLM." - ) - - if not vllm_envs.is_set("VLLM_NIXL_SIDE_CHANNEL_PORT"): - port: int = vllm_utils.get_open_port() - os.environ["VLLM_NIXL_SIDE_CHANNEL_PORT"] = str(port) - if not vllm_envs.is_set("VLLM_NIXL_SIDE_CHANNEL_HOST"): - os.environ["VLLM_NIXL_SIDE_CHANNEL_HOST"] = vllm_utils.get_ip() - - # We need to overwrite the engine_id to make it unique across replicas. 
- engine_id = getattr(kv_transfer_config, "engine_id", str(uuid.uuid4())) - host = vllm_envs.VLLM_NIXL_SIDE_CHANNEL_HOST - port = vllm_envs.VLLM_NIXL_SIDE_CHANNEL_PORT - kv_transfer_config.engine_id = "-".join([engine_id, host, str(port)]) + self.llm_config.setup_engine_backend() self._running = False @@ -162,6 +133,7 @@ def __init__( self._oai_serving_chat: Optional["OpenAIServingChat"] = None self._oai_serving_completion: Optional["OpenAIServingCompletion"] = None self._oai_serving_embedding: Optional["OpenAIServingEmbedding"] = None + self._oai_serving_scores: Optional["ServingScores"] = None async def start(self) -> None: """Start the vLLM engine. @@ -217,6 +189,7 @@ async def start(self) -> None: self._oai_serving_chat = state.openai_serving_chat self._oai_serving_completion = state.openai_serving_completion self._oai_serving_embedding = state.openai_serving_embedding + self._oai_serving_scores = state.openai_serving_scores self._validate_openai_serving_models() self._validate_engine_client() @@ -249,6 +222,11 @@ def _validate_openai_serving_embedding(self): self._oai_serving_embedding, "create_embedding" ), "oai_serving_embedding must have a create_embedding attribute" + def _validate_openai_serving_scores(self): + assert hasattr( + self._oai_serving_scores, "create_score" + ), "oai_serving_scores must have a create_score attribute" + def _validate_engine_client(self): assert hasattr( self._engine_client, "check_health" @@ -328,7 +306,7 @@ def _start_async_llm_engine( """Creates an async LLM engine from the engine arguments.""" from vllm import envs as vllm_envs - # NOTE: This is a temporary solution untill vLLM v1 supports embeddings. + # NOTE: This is a temporary solution until vLLM v1 supports embeddings. 
if not vllm_envs.VLLM_USE_V1: return self._start_async_llm_engine_v0( vllm_engine_args, vllm_engine_config, placement_group @@ -378,11 +356,13 @@ async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): ) if isinstance(lora_request, VLLMErrorResponse): - raise ValueError(f"Failed to load lora model: {lora_request.message}") + raise ValueError(f"Failed to load lora model: {lora_request.error.message}") def _create_raw_request( self, - request: Union[CompletionRequest, ChatCompletionRequest, EmbeddingRequest], + request: Union[ + CompletionRequest, ChatCompletionRequest, EmbeddingRequest, ScoreRequest + ], path: str, ) -> Request: scope = { @@ -418,7 +398,7 @@ async def chat( yield response else: if isinstance(chat_response, VLLMErrorResponse): - yield ErrorResponse(**chat_response.model_dump()) + yield ErrorResponse(error=ErrorInfo(**chat_response.error.model_dump())) else: yield ChatCompletionResponse(**chat_response.model_dump()) @@ -447,7 +427,9 @@ async def completions( yield response else: if isinstance(completion_response, VLLMErrorResponse): - yield ErrorResponse(**completion_response.model_dump()) + yield ErrorResponse( + error=ErrorInfo(**completion_response.error.model_dump()) + ) else: yield CompletionResponse(**completion_response.model_dump()) @@ -466,10 +448,28 @@ async def embeddings( ) if isinstance(embedding_response, VLLMErrorResponse): - yield ErrorResponse(**embedding_response.model_dump()) + yield ErrorResponse( + error=ErrorInfo(**embedding_response.error.model_dump()) + ) else: yield EmbeddingResponse(**embedding_response.model_dump()) + async def score( + self, request: ScoreRequest + ) -> AsyncGenerator[Union[ScoreResponse, ErrorResponse], None]: + self._validate_openai_serving_scores() + + raw_request = self._create_raw_request(request, "/score") + + score_response = await self._oai_serving_scores.create_score( + request, raw_request=raw_request + ) + + if isinstance(score_response, VLLMErrorResponse): + yield 
ErrorResponse(**score_response.model_dump()) + else: + yield ScoreResponse(**score_response.model_dump()) + async def check_health(self) -> None: assert self._engine_client is not None, "engine_client is not initialized" @@ -478,3 +478,15 @@ async def check_health(self) -> None: except BaseException as e: logger.error("Healthcheck failed. The replica will be restarted") raise e from None + + async def reset_prefix_cache(self) -> None: + assert self._engine_client is not None, "engine_client is not initialized" + await self._engine_client.reset_prefix_cache() + + async def start_profile(self) -> None: + assert self._engine_client is not None, "engine_client is not initialized" + await self._engine_client.start_profile() + + async def stop_profile(self) -> None: + assert self._engine_client is not None, "engine_client is not initialized" + await self._engine_client.stop_profile() diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py index 77e7237f3b56..610205de86c4 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py @@ -1,3 +1,4 @@ +import copy import dataclasses import os from typing import Any, Dict, List, Optional @@ -34,7 +35,6 @@ class VLLMEngineConfig(BaseModelExtended): model_config = ConfigDict( use_enum_values=True, - extra="forbid", ) model_id: str = Field( @@ -93,11 +93,17 @@ def get_initialization_kwargs(self) -> dict: else: engine_kwargs["distributed_executor_backend"] = "ray" - if "disable_log_requests" not in engine_kwargs: - logger.info( - "Disabling request logging by default. To enable, set to False in engine_kwargs." + # TODO (Nikhil): Remove this once vLLM fully deprecates disable_log_requests. + if "disable_log_requests" in engine_kwargs: + logger.warning( + "disable_log_requests is set in engine_kwargs, but vLLM " + "does not support it. 
Converting to enable_log_requests." ) - engine_kwargs["disable_log_requests"] = True + engine_kwargs["enable_log_requests"] = not engine_kwargs.pop( + "disable_log_requests" + ) + elif "enable_log_requests" not in engine_kwargs: + engine_kwargs["enable_log_requests"] = False return engine_kwargs @@ -180,13 +186,14 @@ def placement_strategy(self) -> str: @property def placement_bundles(self) -> List[Dict[str, float]]: + if self.resources_per_bundle: bundle = self.resources_per_bundle else: bundle = {"GPU": 1} if self.accelerator_type: bundle[self.ray_accelerator_type()] = 0.001 - bundles = [bundle for _ in range(self.num_devices)] + bundles = [copy.deepcopy(bundle) for _ in range(self.num_devices)] return bundles @@ -224,6 +231,7 @@ def get_or_create_pg(self) -> PlacementGroup: If we are already in a placement group, return the existing placement group. Else, create a new placement group based on the scaling config. """ + dp_rank = self.engine_kwargs.get("data_parallel_rank", None) pg = get_current_placement_group() if pg: logger.debug( @@ -238,8 +246,12 @@ def get_or_create_pg(self) -> PlacementGroup: "Change RAYLLM_ALLOW_NEW_PLACEMENT_GROUPS_IN_DEPLOYMENT " "if this is not intended." ) + name = "" if dp_rank is None else f"dp_{dp_rank}" + pg = placement_group( - self.placement_bundles, strategy=self.placement_strategy + bundles=self.placement_bundles, + strategy=self.placement_strategy, + name=name, ) logger.info(f"Using new placement group {pg}. 
{placement_group_table(pg)}") diff --git a/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py b/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py index ff11971dad74..0c0ff2cff450 100644 --- a/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py +++ b/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py @@ -4,10 +4,10 @@ import uuid from typing import Any, AsyncGenerator, Dict, Union -from pydantic import BaseModel, Field -from vllm.config import KVTransferConfig +from pydantic import Field from ray import serve +from ray.llm._internal.common.base_pydantic import BaseModelExtended from ray.llm._internal.serve.configs.openai_api_models import ( ChatCompletionRequest, ChatCompletionResponse, @@ -33,7 +33,7 @@ RequestType = Union[ChatCompletionRequest, CompletionRequest] -class PDServingArgs(BaseModel): +class PDServingArgs(BaseModelExtended): """Schema for P/D serving args.""" prefill_config: Union[str, LLMConfig] @@ -184,14 +184,12 @@ def build_pd_openai_app(pd_serving_args: dict) -> Application: for config in [pd_config.prefill_config, pd_config.decode_config]: if "kv_transfer_config" not in config.engine_kwargs: - config.engine_kwargs.update( - { - "kv_transfer_config": KVTransferConfig( - kv_connector="NixlConnector", - kv_role="kv_both", - engine_id=str(uuid.uuid4()), - ) - } + config.update_engine_kwargs( + kv_transfer_config=dict( + kv_connector="NixlConnector", + kv_role="kv_both", + engine_id=str(uuid.uuid4()), + ) ) prefill_deployment = build_llm_deployment( diff --git a/python/ray/llm/_internal/serve/deployments/routers/middleware.py b/python/ray/llm/_internal/serve/deployments/routers/middleware.py index 961e199332ff..6a2588d2a4dd 100644 --- a/python/ray/llm/_internal/serve/deployments/routers/middleware.py +++ b/python/ray/llm/_internal/serve/deployments/routers/middleware.py @@ -67,10 +67,10 @@ def 
_uncaught_exception_handler(request: Request, e: Exception): logger.error(f"Uncaught exception while handling request {request_id}", exc_info=e) - response_payload = get_response_for_error(e, request_id) + error_response = get_response_for_error(e, request_id) return JSONResponse( - content=response_payload.model_dump(), status_code=response_payload.code + content=error_response.model_dump(), status_code=error_response.error.code ) @@ -111,11 +111,11 @@ async def _handle_application_exceptions( return await _handle_validation_error(request, e) except Exception as e: request_id = get_request_id(request) - response_payload = get_response_for_error(e, request_id) + error_response = get_response_for_error(e, request_id) return JSONResponse( - content=response_payload.model_dump(), - status_code=response_payload.code, + content=error_response.model_dump(), + status_code=error_response.error.code, ) # This adds last-resort uncaught exception handler into Starlette diff --git a/python/ray/llm/_internal/serve/deployments/routers/router.py b/python/ray/llm/_internal/serve/deployments/routers/router.py index ac79d7e22a8c..5a859bc5e100 100644 --- a/python/ray/llm/_internal/serve/deployments/routers/router.py +++ b/python/ray/llm/_internal/serve/deployments/routers/router.py @@ -1,6 +1,7 @@ import asyncio import json import sys +from contextlib import asynccontextmanager from typing import ( Any, AsyncGenerator, @@ -46,9 +47,12 @@ LLMChatResponse, LLMCompletionsResponse, LLMEmbeddingsResponse, + LLMScoreResponse, ModelCard, ModelList, OpenAIHTTPException, + ScoreRequest, + ScoreResponse, to_model_metadata, ) from ray.llm._internal.serve.configs.server_models import LLMConfig @@ -77,6 +81,31 @@ logger = get_logger(__name__) T = TypeVar("T") + + +def _sanitize_chat_completion_request( + request: ChatCompletionRequest, +) -> ChatCompletionRequest: + """Sanitize ChatCompletionRequest to fix Pydantic ValidatorIterator serialization issue. 
+ + This addresses a known Pydantic bug where tool_calls fields become ValidatorIterator + objects that cannot be pickled for Ray remote calls. + + References: + - vLLM PR that introduces the workaround: https://github.com/vllm-project/vllm/pull/9951 + - Pydantic Issue: https://github.com/pydantic/pydantic/issues/9467 + - Related Issue: https://github.com/pydantic/pydantic/issues/9541 + - Official Workaround: https://github.com/pydantic/pydantic/issues/9467#issuecomment-2442097291 + + TODO(seiji): Remove when we update to Pydantic v2.11+ with the fix. + """ + from vllm.transformers_utils.tokenizers.mistral import maybe_serialize_tool_calls + + maybe_serialize_tool_calls(request) + + return request + + StreamResponseType = Union[ ChatCompletionStreamResponse, CompletionStreamResponse, @@ -190,6 +219,19 @@ async def _openai_json_wrapper( yield "data: [DONE]\n\n" +@asynccontextmanager +async def router_request_timeout(timeout_duration: float): + try: + async with timeout(timeout_duration): + yield + except asyncio.TimeoutError as e: + raise OpenAIHTTPException( + status_code=status.HTTP_408_REQUEST_TIMEOUT, + message="Request server side timeout", + internal_message=str(e), + ) + + class LLMRouter: def __init__( self, @@ -285,10 +327,18 @@ def _get_configured_serve_handle(self, model_id: str): async def _get_response( self, *, - body: Union[CompletionRequest, ChatCompletionRequest, EmbeddingRequest], + body: Union[ + CompletionRequest, ChatCompletionRequest, EmbeddingRequest, ScoreRequest + ], call_method: str, ) -> AsyncGenerator[ - Union[LLMChatResponse, LLMCompletionsResponse, LLMEmbeddingsResponse], None + Union[ + LLMChatResponse, + LLMCompletionsResponse, + LLMEmbeddingsResponse, + LLMScoreResponse, + ], + None, ]: """Calls the model deployment and returns the stream.""" model: str = body.model @@ -302,6 +352,11 @@ async def _get_response( model_handle = self._get_configured_serve_handle(model) + # TODO(seiji): Remove when we update to Pydantic v2.11+ with the 
fix + # for tool calling ValidatorIterator serialization issue. + if isinstance(body, ChatCompletionRequest): + body = _sanitize_chat_completion_request(body) + async for response in getattr(model_handle, call_method).remote(body): yield response @@ -377,7 +432,7 @@ async def _process_llm_request( ) call_method = "chat" if is_chat else "completions" - async with timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): + async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): gen = self._get_response(body=body, call_method=call_method) @@ -391,9 +446,9 @@ async def _process_llm_request( if isinstance(first_chunk, ErrorResponse): raise OpenAIHTTPException( - message=first_chunk.message, - status_code=first_chunk.code, - type=first_chunk.type, + message=first_chunk.error.message, + status_code=first_chunk.error.code, + type=first_chunk.error.type, ) if isinstance(first_chunk, NoneStreamingResponseType): @@ -435,9 +490,35 @@ async def embeddings(self, body: EmbeddingRequest) -> Response: Returns: A response object with embeddings. """ - async with timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): + async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): results = self._get_response(body=body, call_method="embeddings") result = await results.__anext__() + if isinstance(result, ErrorResponse): + raise OpenAIHTTPException( + message=result.error.message, + status_code=result.error.code, + type=result.error.type, + ) + + if isinstance(result, EmbeddingResponse): + return JSONResponse(content=result.model_dump()) + + @fastapi_router_app.post("/v1/score") + async def score(self, body: ScoreRequest) -> Response: + """Create scores for the provided text pairs. + + Note: This is a vLLM specific endpoint. + + Args: + body: The score request containing input text pairs to score. + + Returns: + A response object with scores. 
+ """ + + async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): + results = self._get_response(body=body, call_method="score") + result = await results.__anext__() if isinstance(result, ErrorResponse): raise OpenAIHTTPException( message=result.message, @@ -445,7 +526,7 @@ async def embeddings(self, body: EmbeddingRequest) -> Response: type=result.type, ) - if isinstance(result, EmbeddingResponse): + if isinstance(result, ScoreResponse): return JSONResponse(content=result.model_dump()) @classmethod diff --git a/python/ray/llm/_internal/serve/deployments/utils/server_utils.py b/python/ray/llm/_internal/serve/deployments/utils/server_utils.py index 121b5ea68118..4d490ec329f2 100644 --- a/python/ray/llm/_internal/serve/deployments/utils/server_utils.py +++ b/python/ray/llm/_internal/serve/deployments/utils/server_utils.py @@ -9,6 +9,7 @@ from ray import serve from ray.llm._internal.serve.configs.openai_api_models import ( + ErrorInfo, ErrorResponse, OpenAIHTTPException, ) @@ -110,11 +111,12 @@ def get_response_for_error( if "(Request ID: " not in internal_message: internal_message += f" (Request ID: {request_id})" - error_response = ErrorResponse( + error_info = ErrorInfo( message=f"Message: {message}, Internal exception: {internal_message}, original exception: {str(e)}", code=status_code, type=exc_type, ) + error_response = ErrorResponse(error=error_info) return error_response diff --git a/python/ray/llm/_internal/serve/observability/logging/__init__.py b/python/ray/llm/_internal/serve/observability/logging/__init__.py index 6e684874f33e..914e2a8dce9f 100644 --- a/python/ray/llm/_internal/serve/observability/logging/__init__.py +++ b/python/ray/llm/_internal/serve/observability/logging/__init__.py @@ -1,7 +1,7 @@ import logging from typing import Optional -from ray._private.ray_logging.filters import CoreContextFilter +from ray._common.filters import CoreContextFilter from ray.serve._private.logging_utils import ServeContextFilter diff --git 
a/python/ray/llm/_internal/serve/observability/logging/setup.py b/python/ray/llm/_internal/serve/observability/logging/setup.py index b57f7e149484..3b1915fd2ac6 100644 --- a/python/ray/llm/_internal/serve/observability/logging/setup.py +++ b/python/ray/llm/_internal/serve/observability/logging/setup.py @@ -1,7 +1,7 @@ import logging -from ray._private.ray_logging.filters import CoreContextFilter -from ray._private.ray_logging.formatters import JSONFormatter +from ray._common.filters import CoreContextFilter +from ray._common.formatters import JSONFormatter from ray.serve._private.logging_utils import ServeContextFilter diff --git a/python/ray/llm/_internal/serve/observability/usage_telemetry/usage.py b/python/ray/llm/_internal/serve/observability/usage_telemetry/usage.py index 061e34460d66..b011db4810cd 100644 --- a/python/ray/llm/_internal/serve/observability/usage_telemetry/usage.py +++ b/python/ray/llm/_internal/serve/observability/usage_telemetry/usage.py @@ -1,3 +1,5 @@ +import random +import time from enum import Enum from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Sequence @@ -203,9 +205,39 @@ def _get_or_create_telemetry_agent() -> TelemetryAgent: return telemetry_agent +def _retry_get_telemetry_agent( + max_retries: int = 5, base_delay: float = 0.1 +) -> TelemetryAgent: + max_retries = 5 + base_delay = 0.1 + + telemetry_agent = None + for attempt in range(max_retries): + try: + telemetry_agent = _get_or_create_telemetry_agent() + return telemetry_agent + except ValueError as e: + # Due to race conditions among multiple replicas, we may get: + # ValueError: Actor with name 'llm_serve_telemetry' already + # exists in the namespace llm_serve_telemetry + logger.info( + "Attempt %s/%s to get telemetry agent failed", attempt + 1, max_retries + ) + if attempt == max_retries - 1: + raise e + + # Exponential backoff with jitter + exponential_delay = base_delay * (2**attempt) + jitter = random.uniform(0, 0.5) + delay = exponential_delay + jitter + 
# Max total wait time is ~3.5 seconds for 5 attempts. + time.sleep(delay) + + def _push_telemetry_report(model: Optional[TelemetryModel] = None) -> None: """Push telemetry report for a model.""" - telemetry_agent = _get_or_create_telemetry_agent() + telemetry_agent = _retry_get_telemetry_agent() + assert telemetry_agent is not None ray.get(telemetry_agent.record.remote(model)) diff --git a/python/ray/llm/_internal/serve/request_router/prefix_aware/prefix_aware_router.py b/python/ray/llm/_internal/serve/request_router/prefix_aware/prefix_aware_router.py index 3fb897753ad4..6d3027b0bef5 100644 --- a/python/ray/llm/_internal/serve/request_router/prefix_aware/prefix_aware_router.py +++ b/python/ray/llm/_internal/serve/request_router/prefix_aware/prefix_aware_router.py @@ -2,6 +2,7 @@ import logging import time from typing import ( + Any, List, Optional, ) @@ -139,14 +140,47 @@ def _extract_text_from_request(self, pending_request: PendingRequest) -> str: "No request with message or prompt attribute found in pending_request.args" ) - # Convert list of messages to concatenated string + return self._normalize_prompt_to_string(prompt) + + def _coerce_to_text(self, value: Any) -> str: + if value is None: + return "" + if isinstance(value, str): + return value + if isinstance(value, list): + return "".join(self._coerce_to_text(item) for item in value) + if isinstance(value, dict): + text_value = value.get("text") + if isinstance(text_value, str): + return text_value + if "content" in value: + return self._coerce_to_text(value["content"]) + + return "" + + def _normalize_prompt_to_string(self, prompt: Any) -> str: + """Normalize prompt/messages a single string of characters. + This is not exhaustive (e.g. thinking parts, multimodal are not supported). + TODO(seiji): find a more maintainable way to normalize the prompt/messages. 
+ + Supported: + - string → return as-is + - list of strings → concat + - list of message dicts with 'content' as string → concat + - list of message dicts with 'content' as list of dicts → concat the 'text' fields from those parts + """ + if isinstance(prompt, str): + return prompt + if isinstance(prompt, list): - concatenated_messages = "".join( - msg.get("content", "") for msg in prompt if "content" in msg + return "".join( + self._coerce_to_text( + message.get("content") if isinstance(message, dict) else message + ) + for message in prompt ) - return concatenated_messages - else: - return prompt + + return "" async def _prefix_match_best_replicas( self, @@ -167,6 +201,7 @@ async def _prefix_match_best_replicas( ): input_text = self._extract_text_from_request(pending_request) if input_text is not None: + # Start Sphinx tag: __begin_load_balance_component__ # Check for imbalanced load. highest_queue_len = 0 lowest_queue_len = float("inf") @@ -195,6 +230,8 @@ async def _prefix_match_best_replicas( is_imbalanced = ( highest_queue_len - lowest_queue_len > self._imbalanced_threshold ) + # End Sphinx tag: __end_load_balance_component__ + # Start Sphinx tag: __begin_prefix_match_component__ if not is_imbalanced: # Convert candidate replica IDs to strings for prefix matching. 
candidate_replica_ids_strings = [ @@ -221,6 +258,7 @@ async def _prefix_match_best_replicas( and len(matched_tenant_id_strings) > 0 ): chosen_replica_id_strings = matched_tenant_id_strings + # End Sphinx tag: __end_prefix_match_component__ return [ [ self._replicas[ReplicaID.from_full_id_str(chosen_id_string)] @@ -228,11 +266,14 @@ async def _prefix_match_best_replicas( ] ] + # Start Sphinx tag: __begin_on_replica_actor_died__ def on_replica_actor_died(self, replica_id: ReplicaID): """Drop replica from replica set so it's not considered for future requests.""" super().on_replica_actor_died(replica_id) ray.get(self._tree_actor.remove_tenants.remote([replica_id.to_full_id_str()])) + # End Sphinx tag: __end_on_replica_actor_died__ + def update_replicas(self, replicas: List[RunningReplica]): """Update the set of available replicas to be considered for routing. @@ -283,6 +324,7 @@ async def choose_replicas( model ID are available after that timeout, it will fall back to the regular procedure. 
""" + # Start Sphinx tag: __begin_pow2_router_base__ # Get fallback replicas from PowerOfTwoChoicesRequestRouter fallback_replicas = await PowerOfTwoChoicesRequestRouter.choose_replicas( self, @@ -291,6 +333,7 @@ async def choose_replicas( ) if pending_request is None or not fallback_replicas: return fallback_replicas + # End Sphinx tag: __end_pow2_router_base__ if ( pending_request is not None @@ -324,6 +367,7 @@ async def choose_replicas( return fallback_replicas + # Start Sphinx tag: __begin_on_request_routed__ def on_request_routed( self, pending_request: PendingRequest, @@ -349,3 +393,5 @@ def on_request_routed( input_text, replica_id.to_full_id_str(), time.time() ) ) + + # End Sphinx tag: __end_on_request_routed__ diff --git a/python/ray/llm/tests/BUILD b/python/ray/llm/tests/BUILD.bazel similarity index 100% rename from python/ray/llm/tests/BUILD rename to python/ray/llm/tests/BUILD.bazel diff --git a/python/ray/llm/tests/batch/cpu/processor/test_processor_base.py b/python/ray/llm/tests/batch/cpu/processor/test_processor_base.py index 4e2b64323c64..ba93421d8252 100644 --- a/python/ray/llm/tests/batch/cpu/processor/test_processor_base.py +++ b/python/ray/llm/tests/batch/cpu/processor/test_processor_base.py @@ -190,17 +190,87 @@ def overrider(name: str, stage: StatefulStage): class TestProcessorConfig: def test_valid_concurrency(self): + config = vLLMEngineProcessorConfig( + model_source="unsloth/Llama-3.2-1B-Instruct", + concurrency=(1, 2), + ) + assert config.concurrency == (1, 2) - with pytest.raises(pydantic.ValidationError, match="should be a valid integer"): - config = vLLMEngineProcessorConfig( - model_source="unsloth/Llama-3.2-1B-Instruct", - concurrency=(1, 2), - ) config = vLLMEngineProcessorConfig( model_source="unsloth/Llama-3.2-1B-Instruct", ) assert config.concurrency == 1 + def test_invalid_concurrency(self): + with pytest.raises(pydantic.ValidationError): + vLLMEngineProcessorConfig( + model_source="unsloth/Llama-3.2-1B-Instruct", + 
concurrency=1.1, + ) + + with pytest.raises(pydantic.ValidationError): + vLLMEngineProcessorConfig( + model_source="unsloth/Llama-3.2-1B-Instruct", + concurrency=[1, 2, 3], + ) + + @pytest.mark.parametrize("n", [1, 2, 10]) + def test_positive_int_not_fail(self, n): + conf = ProcessorConfig(concurrency=n) + assert conf.concurrency == n + + def test_positive_int_unusual_not_fail(self): + assert ProcessorConfig(concurrency="1").concurrency == 1 + assert ProcessorConfig(concurrency=1.0).concurrency == 1 + assert ProcessorConfig(concurrency="1.0").concurrency == 1 + + @pytest.mark.parametrize("pair", [(1, 1), (1, 2), (2, 8)]) + def test_valid_tuple_not_fail(self, pair): + conf = ProcessorConfig(concurrency=pair) + assert conf.concurrency == pair + + def test_valid_tuple_unusual_not_fail(self): + assert ProcessorConfig(concurrency=("1", 2)).concurrency == (1, 2) + assert ProcessorConfig(concurrency=(1, "2")).concurrency == (1, 2) + assert ProcessorConfig(concurrency=[1, "2"]).concurrency == (1, 2) + + @pytest.mark.parametrize( + "bad,msg_part", + [ + (0, "positive integer"), + (-5, "positive integer"), + ((1, 2, 3), "at most 2 items"), + ((0, 1), "positive integers"), + ((1, 0), "positive integers"), + ((-1, 2), "positive integers"), + ((1, -2), "positive integers"), + ((1, 2.5), "a number with a fractional part"), + ("2.1", "unable to parse string"), + ((5, 2), "min > max"), + ], + ) + def test_invalid_inputs_raise(self, bad, msg_part): + with pytest.raises(pydantic.ValidationError) as e: + ProcessorConfig(concurrency=bad) + assert msg_part in str(e.value) + + @pytest.mark.parametrize( + "n,expected", [(1, (1, 1)), (4, (1, 4)), (10, (1, 10)), ("10", (1, 10))] + ) + def test_with_int_concurrency_scaling(self, n, expected): + conf = ProcessorConfig(concurrency=n) + assert conf.get_concurrency() == expected + + @pytest.mark.parametrize("n,expected", [(1, (1, 1)), (4, (4, 4)), (10, (10, 10))]) + def test_with_int_concurrency_fixed(self, n, expected): + conf = 
ProcessorConfig(concurrency=n) + assert conf.get_concurrency(autoscaling_enabled=False) == expected + + @pytest.mark.parametrize("pair", [(1, 1), (1, 3), (2, 8)]) + def test_with_tuple_concurrency(self, pair): + conf = ProcessorConfig(concurrency=pair) + assert conf.get_concurrency() == pair + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/llm/tests/batch/cpu/stages/test_prepare_image_stage.py b/python/ray/llm/tests/batch/cpu/stages/test_prepare_image_stage.py index 07b5182dfbcd..6e27850a7fd5 100644 --- a/python/ray/llm/tests/batch/cpu/stages/test_prepare_image_stage.py +++ b/python/ray/llm/tests/batch/cpu/stages/test_prepare_image_stage.py @@ -164,5 +164,119 @@ async def test_prepare_image_udf_invalid_image_type(mock_image_processor): pass +# Test that image extraction works consistently with both uniform content types +# (no system prompt) and mixed content types (with system prompt) +@pytest.mark.parametrize( + "messages,expected_images,test_description", + [ + # Test with system prompt + ( + [ + {"role": "system", "content": "You are an assistant"}, + { + "role": "user", + "content": [ + { + "type": "image", + "image": "https://example.com/test-image.jpg", + }, + { + "type": "text", + "text": "Can you describe this image in 1 words?", + }, + ], + }, + ], + ["https://example.com/test-image.jpg"], + "with_system_prompt", + ), + # Test without system prompt + ( + [ + { + "role": "user", + "content": [ + { + "type": "image", + "image": "https://example.com/test-image.jpg", + }, + { + "type": "text", + "text": "Can you describe this image in 1 words?", + }, + ], + } + ], + ["https://example.com/test-image.jpg"], + "without_system_prompt", + ), + # Test multiple images without system prompt + ( + [ + { + "role": "user", + "content": [ + {"type": "image", "image": "https://example.com/image1.jpg"}, + {"type": "text", "text": "Describe this image"}, + ], + }, + { + "role": "user", + "content": [ + {"type": "image", "image": 
"https://example.com/image2.jpg"}, + {"type": "text", "text": "What do you see?"}, + ], + }, + ], + ["https://example.com/image1.jpg", "https://example.com/image2.jpg"], + "multiple_images_no_system_prompt", + ), + # Test image_url format without system prompt + ( + [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": "https://example.com/image.jpg", + }, + {"type": "text", "text": "Describe this image"}, + ], + } + ], + ["https://example.com/image.jpg"], + "image_url_format_no_system_prompt", + ), + # Test OpenAI nested format without system prompt + # https://github.com/openai/openai-openapi/blob/manual_spec/openapi.yaml#L1937-L1940 + ( + [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": {"url": "https://example.com/image.jpg"}, + }, + {"type": "text", "text": "Describe this image"}, + ], + } + ], + ["https://example.com/image.jpg"], + "openai_image_url_format_no_system_prompt", + ), + ], + ids=lambda x: x if isinstance(x, str) else None, +) +def test_extract_image_info(messages, expected_images, test_description): + """Test image extraction with various message structures and formats.""" + udf = PrepareImageUDF(data_column="__data", expected_input_keys=["messages"]) + + image_info = udf.extract_image_info(messages) + assert len(image_info) == len(expected_images) + assert image_info == expected_images + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/llm/tests/batch/gpu/processor/test_serve_deployment_proc.py b/python/ray/llm/tests/batch/gpu/processor/test_serve_deployment_proc.py new file mode 100644 index 000000000000..74f72b1eb916 --- /dev/null +++ b/python/ray/llm/tests/batch/gpu/processor/test_serve_deployment_proc.py @@ -0,0 +1,236 @@ +import sys +from typing import Any, Dict + +import pytest + +import ray +from ray import serve +from ray.llm._internal.batch.processor import ProcessorBuilder +from ray.llm._internal.batch.processor.serve_deployment_proc 
import ( + ServeDeploymentProcessorConfig, +) +from ray.serve.llm.openai_api_models import ChatCompletionRequest, CompletionRequest + + +@pytest.mark.parametrize( + "dtype_mapping", [None, {"CompletionRequest": CompletionRequest}] +) +def test_serve_deployment_processor(dtype_mapping): + app_name = "test_serve_deployment_processor_app" + deployment_name = "test_serve_deployment_name" + + config_kwargs = dict( + deployment_name=deployment_name, + app_name=app_name, + batch_size=16, + concurrency=1, + ) + if dtype_mapping is not None: + config_kwargs["dtype_mapping"] = dtype_mapping + config = ServeDeploymentProcessorConfig(**config_kwargs) + + processor = ProcessorBuilder.build(config) + assert processor.list_stage_names() == [ + "ServeDeploymentStage", + ] + + stage = processor.get_stage_by_name("ServeDeploymentStage") + assert stage.fn_constructor_kwargs == { + "deployment_name": deployment_name, + "app_name": app_name, + "dtype_mapping": dtype_mapping, + } + + assert stage.map_batches_kwargs == { + "concurrency": 1, + } + + +def test_simple_serve_deployment(serve_cleanup): + @serve.deployment + class SimpleServeDeployment: + # ServeDeploymentStageUDF expects an async generator. 
+ async def add(self, request: Dict[str, Any]): + yield {"result": request["x"] + 1} + + app_name = "simple_serve_deployment_app" + deployment_name = "SimpleServeDeployment" + + serve.run(SimpleServeDeployment.bind(), name=app_name) + + config = ServeDeploymentProcessorConfig( + deployment_name=deployment_name, + app_name=app_name, + batch_size=16, + concurrency=1, + ) + + processor = ProcessorBuilder.build( + config, + preprocess=lambda row: dict( + method="add", + dtype=None, # Empty dtype since output is already dict format + request_kwargs=dict(x=row["id"]), + ), + postprocess=lambda row: dict( + resp=row["result"], + id=row["id"], + ), + ) + + ds = ray.data.range(60) + ds = ds.map(lambda x: {"id": x["id"]}) + ds = processor(ds) + + outs = ds.take_all() + assert len(outs) == 60 + assert all("resp" in out for out in outs) + assert all(out["resp"] == out["id"] + 1 for out in outs) + + +def test_completion_model(model_opt_125m, create_model_opt_125m_deployment): + deployment_name, app_name = create_model_opt_125m_deployment + config = ServeDeploymentProcessorConfig( + deployment_name=deployment_name, + app_name=app_name, + dtype_mapping={ + "CompletionRequest": CompletionRequest, + }, + batch_size=16, + concurrency=1, + ) + + processor = ProcessorBuilder.build( + config, + preprocess=lambda row: dict( + method="completions", + dtype="CompletionRequest", + request_kwargs=dict( + model=model_opt_125m, + prompt=row["prompt"], + stream=False, + ), + ), + postprocess=lambda row: dict( + resp=row["choices"][0]["text"], + ), + ) + + ds = ray.data.range(60) + ds = ds.map(lambda x: {"prompt": f"Hello {x['id']}"}) + ds = processor(ds) + ds = ds.materialize() + outs = ds.take_all() + assert len(outs) == 60 + assert all("resp" in out for out in outs) + + +def test_multi_turn_completion_model(model_opt_125m, create_model_opt_125m_deployment): + deployment_name, app_name = create_model_opt_125m_deployment + + config1 = ServeDeploymentProcessorConfig( + 
deployment_name=deployment_name, + app_name=app_name, + dtype_mapping={ + "CompletionRequest": CompletionRequest, + }, + # Use lower batch size to reduce resource usage as there are multiple processors + batch_size=4, + concurrency=1, + ) + + processor1 = ProcessorBuilder.build( + config1, + preprocess=lambda row: dict( + dtype="CompletionRequest", + method="completions", + request_kwargs=dict( + model=model_opt_125m, + prompt=row["prompt"], + stream=False, + ), + ), + postprocess=lambda row: dict( + prompt=row["choices"][0]["text"], + ), + ) + + config2 = ServeDeploymentProcessorConfig( + deployment_name=deployment_name, + app_name=app_name, + dtype_mapping={ + "CompletionRequest": CompletionRequest, + }, + batch_size=4, + concurrency=1, + ) + + processor2 = ProcessorBuilder.build( + config2, + preprocess=lambda row: dict( + dtype="CompletionRequest", + method="completions", + request_kwargs=dict( + model=model_opt_125m, + prompt=row["prompt"], + stream=False, + ), + ), + postprocess=lambda row: dict( + resp=row["choices"][0]["text"], + ), + ) + + ds = ray.data.range(60) + ds = ds.map(lambda x: {"prompt": f"Hello {x['id']}"}) + ds = processor1(ds) + ds = processor2(ds) + + ds = ds.materialize() + outs = ds.take_all() + assert len(outs) == 60 + assert all("resp" in out for out in outs) + + +def test_chat_model(model_opt_125m, create_model_opt_125m_deployment): + deployment_name, app_name = create_model_opt_125m_deployment + config = ServeDeploymentProcessorConfig( + deployment_name=deployment_name, + app_name=app_name, + dtype_mapping={ + "ChatCompletionRequest": ChatCompletionRequest, + }, + batch_size=16, + concurrency=1, + ) + + processor = ProcessorBuilder.build( + config, + preprocess=lambda row: dict( + dtype="ChatCompletionRequest", + method="chat", + request_kwargs=dict( + model=model_opt_125m, + messages=[ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": f"Hello {row['id']}"}, + ], + stream=False, + ), + ), + 
postprocess=lambda row: dict( + resp=row["choices"][0]["message"]["content"], + ), + ) + + ds = ray.data.range(60) + ds = ds.map(lambda x: {"id": x["id"]}) + ds = processor(ds) + ds = ds.materialize() + outs = ds.take_all() + assert len(outs) == 60 + assert all("resp" in out for out in outs) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/llm/tests/batch/gpu/stages/test_serve_deployment_stage.py b/python/ray/llm/tests/batch/gpu/stages/test_serve_deployment_stage.py new file mode 100644 index 000000000000..8e07fa739d92 --- /dev/null +++ b/python/ray/llm/tests/batch/gpu/stages/test_serve_deployment_stage.py @@ -0,0 +1,177 @@ +import sys +from unittest.mock import MagicMock, patch + +import pytest + +from ray.llm._internal.batch.stages.serve_deployment_stage import ( + ServeDeploymentStageUDF, +) +from ray.serve.llm.openai_api_models import ChatCompletionRequest, CompletionRequest + + +@pytest.fixture +def mock_serve_deployment_handle(): + """Mock the serve deployment handle and its methods.""" + with patch("ray.serve.get_deployment_handle") as mock_get_handle: + mock_handle = MagicMock() + mock_handle.options.return_value = mock_handle + + # Mock the chat and completions methods + mock_handle.chat = MagicMock() + mock_handle.completions = MagicMock() + + mock_get_handle.return_value = mock_handle + yield mock_handle + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "method,test_data", + [ + ( + "completions", + [ + { + "method": "completions", + "dtype": "CompletionRequest", + "request_kwargs": {"prompt": "Hello", "temperature": 0.7}, + }, + ], + ), + ( + "chat", + [ + { + "method": "chat", + "dtype": "ChatCompletionRequest", + "request_kwargs": { + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant", + }, + {"role": "user", "content": "Hello 1"}, + ] + }, + }, + ], + ), + ], +) +async def test_serve_deployment_udf_methods( + mock_serve_deployment_handle, method, test_data +): + 
"""Test both completions and chat methods.""" + # Create a mock response that will be returned directly + mock_response = {"test": "response"} + + def mock_remote_call(*args, **kwargs): + async def mock_async_iterator(): + yield mock_response + + return mock_async_iterator() + + getattr(mock_serve_deployment_handle, method).remote.side_effect = mock_remote_call + + udf = ServeDeploymentStageUDF( + data_column="__data", + expected_input_keys=["method", "request_kwargs"], + deployment_name="test_deployment", + app_name="test_app", + dtype_mapping={ + "CompletionRequest": CompletionRequest, + "ChatCompletionRequest": ChatCompletionRequest, + }, + ) + + batch = {"__data": test_data} + + responses = [] + async for response in udf(batch): + responses.append(response) + + assert len(responses) == 1 + assert "__data" in responses[0] + assert len(responses[0]["__data"]) == len(test_data) + + for i, item in enumerate(responses[0]["__data"]): + assert "batch_uuid" in item + assert "time_taken" in item + assert item["request_id"] == str(i) + assert "test" in item # From the mock response + + assert getattr(mock_serve_deployment_handle, method).remote.call_count == len( + test_data + ) + + +@pytest.mark.asyncio +async def test_serve_deployment_invalid_method(mock_serve_deployment_handle): + """Test that invalid method raises error at runtime.""" + # Set up the mock to simulate a method that doesn't exist + mock_serve_deployment_handle.invalid_method = None + + udf = ServeDeploymentStageUDF( + data_column="__data", + expected_input_keys=["method", "request_kwargs"], + deployment_name="test_deployment", + app_name="test_app", + dtype_mapping={ + "CompletionRequest": CompletionRequest, + }, + ) + + batch = { + "__data": [ + { + "method": "invalid_method", + "dtype": "CompletionRequest", + "request_kwargs": {"prompt": "Hello", "temperature": 0.7}, + } + ] + } + + with pytest.raises( + ValueError, match="Method invalid_method not found in the serve deployment." 
+ ): + async for _ in udf(batch): + pass + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "dtype_mapping", [None, {"ChatCompletionRequest": ChatCompletionRequest}] +) +async def test_serve_deployment_missing_dtype( + mock_serve_deployment_handle, dtype_mapping +): + """Test that missing dtype raises error at runtime.""" + + udf = ServeDeploymentStageUDF( + data_column="__data", + expected_input_keys=["method", "request_kwargs"], + deployment_name="test_deployment", + app_name="test_app", + dtype_mapping=dtype_mapping, + ) + + batch = { + "__data": [ + { + "method": "completions", + "dtype": "CompletionRequest", + "request_kwargs": {"prompt": "Hello", "temperature": 0.7}, + } + ] + } + + with pytest.raises( + ValueError, + match="CompletionRequest must be provided in ServeDeploymentProcessorConfig's dtype_mapping.", + ): + async for _ in udf(batch): + pass + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/llm/tests/batch/gpu/stages/test_sglang_engine_stage.py b/python/ray/llm/tests/batch/gpu/stages/test_sglang_engine_stage.py index 9f0d3a453f0f..f23dcd98c40c 100644 --- a/python/ray/llm/tests/batch/gpu/stages/test_sglang_engine_stage.py +++ b/python/ray/llm/tests/batch/gpu/stages/test_sglang_engine_stage.py @@ -42,6 +42,7 @@ async def mock_generate(row): "generated_text": f"Response to: {row['prompt']}", "num_generated_tokens": 3, }, + 0.1, # time_taken_llm ) mock_instance.generate_async.side_effect = mock_generate @@ -226,9 +227,10 @@ async def test_sglang_wrapper( assert mock_generate_async.call_count == batch_size # Verify the outputs match expected values - for i, (request, output) in enumerate(results): + for i, (request, output, time_taken_llm) in enumerate(results): assert output["prompt"] == f"Test {i}" assert output["num_generated_tokens"] == i + 5 # max_new_tokens we set + assert time_taken_llm > 0 @pytest.mark.asyncio diff --git a/python/ray/llm/tests/batch/gpu/stages/test_vllm_engine_stage.py 
b/python/ray/llm/tests/batch/gpu/stages/test_vllm_engine_stage.py index 8e165fadbb7e..71971caf62e1 100644 --- a/python/ray/llm/tests/batch/gpu/stages/test_vllm_engine_stage.py +++ b/python/ray/llm/tests/batch/gpu/stages/test_vllm_engine_stage.py @@ -46,6 +46,7 @@ async def mock_generate(row): "num_generated_tokens": 3, "time_per_token": 0.1, }, + 0.1, # time_taken_llm ) mock_instance.generate_async.side_effect = mock_generate @@ -170,7 +171,7 @@ async def test_vllm_engine_udf_basic(mock_vllm_wrapper, model_llama_3_2_216M): task=vLLMTaskType.GENERATE, max_num_seqs=100, dynamic_lora_loading_path=None, - disable_log_requests=True, + enable_log_requests=False, ) @@ -298,10 +299,11 @@ async def test_vllm_wrapper_generate(model_llama_3_2_216M): tasks = [asyncio.create_task(wrapper.generate_async(row)) for row in batch] for resp in asyncio.as_completed(tasks): - request, output = await resp + request, output, time_taken_llm = await resp params = request.params max_tokens = params.max_tokens assert max_tokens == output["num_generated_tokens"] + assert time_taken_llm > 0 # Clean up GPU memory wrapper.shutdown() @@ -332,8 +334,9 @@ async def test_vllm_wrapper_embed(model_opt_125m): tasks = [asyncio.create_task(wrapper.generate_async(row)) for row in batch] for resp in asyncio.as_completed(tasks): - _, output = await resp + _, output, time_taken_llm = await resp assert output["embeddings"].shape == (768,) + assert time_taken_llm > 0 # Clean up GPU memory wrapper.shutdown() @@ -380,10 +383,11 @@ async def test_vllm_wrapper_lora(model_llama_3_2_216M, model_llama_3_2_216M_lora tasks = [asyncio.create_task(wrapper.generate_async(row)) for row in batch] for resp in asyncio.as_completed(tasks): - request, output = await resp + request, output, time_taken_llm = await resp params = request.params max_tokens = params.max_tokens assert max_tokens == output["num_generated_tokens"] + assert time_taken_llm > 0 # Clean up GPU memory wrapper.shutdown() @@ -430,12 +434,13 @@ class 
AnswerModel(BaseModel): tasks = [asyncio.create_task(wrapper.generate_async(row)) for row in batch] for resp in asyncio.as_completed(tasks): - _, output = await resp + _, output, time_taken_llm = await resp json_obj = json.loads(output["generated_text"]) assert "answer" in json_obj assert isinstance(json_obj["answer"], int) assert "explain" in json_obj assert isinstance(json_obj["explain"], str) + assert time_taken_llm > 0 # Clean up GPU memory wrapper.shutdown() diff --git a/python/ray/llm/tests/conftest.py b/python/ray/llm/tests/conftest.py index 65291b778714..778c32131de5 100644 --- a/python/ray/llm/tests/conftest.py +++ b/python/ray/llm/tests/conftest.py @@ -5,6 +5,9 @@ import pytest import requests +from ray import serve +from ray.serve.llm import LLMConfig, ModelLoadingConfig, build_llm_deployment + S3_ARTIFACT_URL = "https://air-example-data.s3.amazonaws.com/" S3_ARTIFACT_LLM_OSSCI_URL = S3_ARTIFACT_URL + "rayllm-ossci/" @@ -167,3 +170,66 @@ def gpu_type(): print("Failed to import torch to get GPU type", flush=True) except ValueError as err: print(f"Failed to get the GPU type: {err}", flush=True) + + +@pytest.fixture +def serve_cleanup(): + yield + serve.shutdown() + + +@pytest.fixture +def create_model_opt_125m_deployment(gpu_type, model_opt_125m, serve_cleanup): + """Create a serve deployment for testing.""" + app_name = "test_serve_deployment_processor_app" + deployment_name = "test_deployment_name" + + chat_template = """ +{% if messages[0]['role'] == 'system' %} + {% set offset = 1 %} +{% else %} + {% set offset = 0 %} +{% endif %} + +{{ bos_token }} +{% for message in messages %} + {% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %} + {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} + {% endif %} + + {{ '<|im_start|>' + message['role'] + '\n' + message['content'] | trim + '<|im_end|>\n' }} +{% endfor %} + +{% if add_generation_prompt %} + {{ '<|im_start|>assistant\n' }} +{% endif %} + """ + + 
# Create a vLLM serve deployment + llm_config = LLMConfig( + model_loading_config=ModelLoadingConfig( + model_id=model_opt_125m, + model_source=model_opt_125m, + ), + accelerator_type=gpu_type, + deployment_config=dict( + name="test_deployment_name", # This is not necessarily the final deployment name + autoscaling_config=dict( + min_replicas=1, + max_replicas=1, + ), + ), + engine_kwargs=dict( + enable_prefix_caching=True, + enable_chunked_prefill=True, + max_num_batched_tokens=4096, + # Add chat template for OPT model to enable chat API + chat_template=chat_template, + ), + ) + + llm_app = build_llm_deployment( + llm_config, override_serve_options=dict(name=deployment_name) + ) + serve.run(llm_app, name=app_name) + yield deployment_name, app_name diff --git a/python/ray/llm/tests/serve/conftest.py b/python/ray/llm/tests/serve/conftest.py index 55d94387f88c..f39a4bfc8483 100644 --- a/python/ray/llm/tests/serve/conftest.py +++ b/python/ray/llm/tests/serve/conftest.py @@ -15,6 +15,7 @@ ChatCompletionRequest, CompletionRequest, EmbeddingCompletionRequest, + ScoreRequest, ) from ray.llm._internal.serve.deployments.llm.vllm.vllm_models import ( VLLMEngineConfig, @@ -112,6 +113,16 @@ def mock_embedding_request(dimensions): return request +@pytest.fixture +def mock_score_request(): + """Fixture for creating score requests for mock testing.""" + return ScoreRequest( + model=MOCK_MODEL_ID, + text_1="What is the capital of France?", + text_2="The capital of France is Paris.", + ) + + def get_test_model_path(yaml_file: str) -> pathlib.Path: current_file_dir = pathlib.Path(__file__).absolute().parent test_model_path = current_file_dir / yaml_file diff --git a/python/ray/llm/tests/serve/cpu/builders/test_application_builders.py b/python/ray/llm/tests/serve/cpu/builders/test_application_builders.py index a64dbd803c47..0bb3bbd48da4 100644 --- a/python/ray/llm/tests/serve/cpu/builders/test_application_builders.py +++ 
b/python/ray/llm/tests/serve/cpu/builders/test_application_builders.py @@ -196,7 +196,7 @@ def test_build_llm_deployment( app = build_llm_deployment(llm_config_with_mock_engine) assert isinstance(app, serve.Application) handle = serve.run(app) - assert handle.deployment_name.startswith("LLMDeployment") + assert handle.deployment_name.startswith("LLMServer") def test_build_llm_deployment_with_name_prefix( self, diff --git a/python/ray/llm/tests/serve/cpu/configs/test_models.py b/python/ray/llm/tests/serve/cpu/configs/test_models.py index 886cf7430e31..2ae92508c9e9 100644 --- a/python/ray/llm/tests/serve/cpu/configs/test_models.py +++ b/python/ray/llm/tests/serve/cpu/configs/test_models.py @@ -1,10 +1,15 @@ import sys +from copy import deepcopy from pathlib import Path import pydantic import pytest -from ray.llm._internal.serve.configs.server_models import LLMConfig, ModelLoadingConfig +from ray.llm._internal.serve.configs.server_models import ( + LLMConfig, + LoraConfig, + ModelLoadingConfig, +) CONFIG_DIRS_PATH = str(Path(__file__).parent / "configs") @@ -68,6 +73,22 @@ def test_invalid_accelerator_type(self): accelerator_type="A100_40G", # Should use A100-40G instead ) + def test_model_loading_config_forbids_extra_fields(self): + """Test that ModelLoadingConfig rejects extra fields.""" + + with pytest.raises(pydantic.ValidationError, match="engine_kwargs"): + ModelLoadingConfig( + model_id="test_model", + model_source="test_source", + engine_kwargs={"max_model_len": 8000}, # This should be rejected + ) + + valid_config = ModelLoadingConfig( + model_id="test_model", model_source="test_source" + ) + assert valid_config.model_id == "test_model" + assert valid_config.model_source == "test_source" + def test_invalid_generation_config(self, disable_placement_bundles): """Test that passing an invalid generation_config raises an error.""" with pytest.raises( @@ -158,8 +179,7 @@ def test_get_serve_options_with_accelerator_type(self): "max_replicas": 10, } assert 
serve_options["placement_group_bundles"] == [ - {"CPU": 1, "GPU": 0}, - {"GPU": 1, "accelerator_type:A100-40G": 0.001}, + {"CPU": 1, "GPU": 1, "accelerator_type:A100-40G": 0.001}, ] assert serve_options["placement_group_strategy"] == "STRICT_PACK" assert serve_options["name"] == "Test:test_model" @@ -194,10 +214,7 @@ def test_get_serve_options_without_accelerator_type(self): "initial_replicas": 1, "max_replicas": 10, } - assert serve_options["placement_group_bundles"] == [ - {"CPU": 1, "GPU": 0}, - {"GPU": 1}, - ] + assert serve_options["placement_group_bundles"] == [{"CPU": 1, "GPU": 1}] assert serve_options["placement_group_strategy"] == "STRICT_PACK" assert serve_options["name"] == "Test:test_model" @@ -219,8 +236,9 @@ def test_resources_per_bundle(self): model_loading_config=dict(model_id="test_model"), engine_kwargs=dict(tensor_parallel_size=3, pipeline_parallel_size=2), ).get_serve_options(name_prefix="Test:") - assert serve_options["placement_group_bundles"] == [{"CPU": 1, "GPU": 0}] + [ - {"GPU": 1} for _ in range(6) + + assert serve_options["placement_group_bundles"] == [{"CPU": 1, "GPU": 1}] + [ + {"GPU": 1} for _ in range(5) ] # Test the custom resource bundle @@ -229,9 +247,9 @@ def test_resources_per_bundle(self): engine_kwargs=dict(tensor_parallel_size=3, pipeline_parallel_size=2), resources_per_bundle={"XPU": 1}, ).get_serve_options(name_prefix="Test:") - assert serve_options["placement_group_bundles"] == [{"CPU": 1, "GPU": 0}] + [ - {"XPU": 1} for _ in range(6) - ] + assert serve_options["placement_group_bundles"] == [ + {"CPU": 1, "GPU": 0, "XPU": 1} + ] + [{"XPU": 1} for _ in range(5)] def test_engine_config_cached(self): """Test that the engine config is cached and not recreated when calling @@ -285,6 +303,114 @@ def test_log_engine_metrics_disable_log_stats_validation(self): engine_kwargs={"disable_log_stats": True}, ) + @pytest.mark.parametrize( + "data_parallel_size,num_replica,allowed", + [ + (None, 1, True), + (None, 2, True), + (None, 3, 
True), + (1, 1, True), + (1, 2, True), + (1, 3, True), + (2, 2, False), + (2, 3, False), + (4, 2, False), + (2, None, True), + (None, None, True), + ], + ) + def test_multi_replica_dp_validation( + self, data_parallel_size, num_replica, allowed + ): + """Test that multi-replica and DP size are mutually exclusive. + + Ray.llm's implementation does not yet support multi-replica + deployment along with DP. + """ + engine_kwargs = ( + {} + if data_parallel_size is None + else {"data_parallel_size": data_parallel_size} + ) + deployment_config = {} if num_replica is None else {"num_replicas": num_replica} + + def get_serve_options_with_num_replica(): + return LLMConfig( + model_loading_config=dict(model_id="test_model"), + engine_kwargs=deepcopy(engine_kwargs), + deployment_config=deepcopy(deployment_config), + ).get_serve_options(name_prefix="Test:") + + if allowed: + serve_options = get_serve_options_with_num_replica() + actual_num_replicas = serve_options.get("num_replicas", 1) + expected_num_replicas = (data_parallel_size or 1) * (num_replica or 1) + assert actual_num_replicas == expected_num_replicas + else: + with pytest.raises( + ValueError, + match="use engine_kwargs.data_parallel_size", + ): + get_serve_options_with_num_replica() + + +class TestFieldValidators: + """Test the field validators for dict validation.""" + + def test_model_loading_config_dict_validation(self): + """Test that model_loading_config accepts and validates dict input.""" + config_dict = {"model_id": "microsoft/DialoGPT-medium"} + + llm_config = LLMConfig(model_loading_config=config_dict, llm_engine="vLLM") + + assert isinstance(llm_config.model_loading_config, ModelLoadingConfig) + assert llm_config.model_loading_config.model_id == "microsoft/DialoGPT-medium" + + def test_model_loading_config_validation_error(self): + """Test that invalid dict raises proper validation error.""" + with pytest.raises(pydantic.ValidationError) as exc_info: + LLMConfig( + model_loading_config={"invalid_field": 
"value"}, llm_engine="vLLM" + ) + + assert "Invalid model_loading_config" in str(exc_info.value) + + def test_lora_config_dict_validation(self): + """Test that lora_config accepts and validates dict input.""" + llm_config = LLMConfig( + model_loading_config={"model_id": "test"}, + lora_config=None, + llm_engine="vLLM", + ) + + assert llm_config.lora_config is None + + lora_dict = { + "dynamic_lora_loading_path": "s3://bucket/lora", + "max_num_adapters_per_replica": 8, + } + + llm_config2 = LLMConfig( + model_loading_config={"model_id": "test"}, + lora_config=lora_dict, + llm_engine="vLLM", + ) + + assert isinstance(llm_config2.lora_config, LoraConfig) + assert llm_config2.lora_config.max_num_adapters_per_replica == 8 + assert llm_config2.lora_config.dynamic_lora_loading_path == "s3://bucket/lora" + + def test_lora_config_validation_error(self): + """Test that invalid lora config dict raises proper validation error.""" + with pytest.raises(pydantic.ValidationError) as exc_info: + LLMConfig( + model_loading_config={"model_id": "test"}, + lora_config={"max_num_adapters_per_replica": "invalid_string"}, + llm_engine="vLLM", + ) + + assert "Invalid lora_config" in str(exc_info.value) + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py index ef3603cfde2a..e473d6f4e14f 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py @@ -81,3 +81,18 @@ async def test_embedding_mock_engine( async for response in engine.embeddings(request): LLMResponseValidator.validate_embedding_response(response, dimensions) + + @pytest.mark.asyncio + async def test_score_mock_engine(self, mock_llm_config, mock_score_request): + """Test score API for text similarity.""" + # Create and start the engine + engine = MockVLLMEngine(mock_llm_config) + 
await engine.start() + + # Create score request + request = mock_score_request + + print("\n\n_____ SCORE _____\n\n") + + async for response in engine.score(request): + LLMResponseValidator.validate_score_response(response) diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py index 65dcd74cf38e..44404585003b 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py @@ -152,6 +152,34 @@ async def test_embedding_llm_server( # Validate embedding response LLMResponseValidator.validate_embedding_response(chunks[0], dimensions) + @pytest.mark.asyncio + async def test_score_llm_server( + self, + serve_handle, + mock_llm_config, + mock_score_request, + ): + """Test score API from LLMServer perspective.""" + + # Create score request + request = mock_score_request + + print("\n\n_____ SCORE SERVER _____\n\n") + + # Get the response + batched_chunks = serve_handle.score.remote(request) + + # Collect responses (should be just one) + chunks = [] + async for batch in batched_chunks: + chunks.append(batch) + + # Check that we got one response + assert len(chunks) == 1 + + # Validate score response + LLMResponseValidator.validate_score_response(chunks[0]) + @pytest.mark.asyncio async def test_check_health(self, mock_llm_config): """Test health check functionality.""" @@ -175,6 +203,75 @@ async def check_health(self): # Check that the health check method was called assert server.engine.check_health_called + @pytest.mark.asyncio + async def test_reset_prefix_cache(self, mock_llm_config): + """Test reset prefix cache functionality.""" + + # Mock the engine's reset_prefix_cache method + class LocalMockEngine(MockVLLMEngine): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.reset_prefix_cache_called = False + + async def reset_prefix_cache(self): + 
self.reset_prefix_cache_called = True + + # Create a server with a mocked engine + server = LLMServer.sync_init(mock_llm_config, engine_cls=LocalMockEngine) + await server.start() + + # Reset prefix cache, no exceptions should be raised + await server.reset_prefix_cache() + + # Check that the reset prefix cache method was called + assert server.engine.reset_prefix_cache_called + + @pytest.mark.asyncio + async def test_start_profile(self, mock_llm_config): + """Test start profile functionality.""" + + # Mock the engine's start_profile method + class LocalMockEngine(MockVLLMEngine): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.start_profile_called = False + + async def start_profile(self): + self.start_profile_called = True + + # Create a server with a mocked engine + server = LLMServer.sync_init(mock_llm_config, engine_cls=LocalMockEngine) + await server.start() + + # Start profile, no exceptions should be raised + await server.start_profile() + + # Check that the start profile method was called + assert server.engine.start_profile_called + + @pytest.mark.asyncio + async def test_stop_profile(self, mock_llm_config): + """Test stop profile functionality.""" + + # Mock the engine's stop_profile method + class LocalMockEngine(MockVLLMEngine): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.stop_profile_called = False + + async def stop_profile(self): + self.stop_profile_called = True + + # Create a server with a mocked engine + server = LLMServer.sync_init(mock_llm_config, engine_cls=LocalMockEngine) + await server.start() + + # Stop profile, no exceptions should be raised + await server.stop_profile() + + # Check that the stop profile method was called + assert server.engine.stop_profile_called + @pytest.mark.asyncio async def test_llm_config_property(self, mock_llm_config): """Test the llm_config property.""" diff --git 
a/python/ray/llm/tests/serve/cpu/deployments/llm/vllm/kv_transfer_backends/__init__.py b/python/ray/llm/tests/serve/cpu/deployments/llm/vllm/kv_transfer_backends/__init__.py new file mode 100644 index 000000000000..0a39e777cc97 --- /dev/null +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/vllm/kv_transfer_backends/__init__.py @@ -0,0 +1 @@ +# Test package for KV transfer backends diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/vllm/kv_transfer_backends/test_lmcache_connector_v1.py b/python/ray/llm/tests/serve/cpu/deployments/llm/vllm/kv_transfer_backends/test_lmcache_connector_v1.py new file mode 100644 index 000000000000..e6da3303343f --- /dev/null +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/vllm/kv_transfer_backends/test_lmcache_connector_v1.py @@ -0,0 +1,86 @@ +import pytest + +from ray.llm._internal.serve.deployments.llm.vllm.kv_transfer_backends.lmcache_connector_v1 import ( + LMCacheConnectorV1Backend, +) + + +class TestLMCacheConnectorV1Backend: + @pytest.fixture + def lmcache_backend_basic(self): + """Fixture for basic LMCacheConnectorV1Backend.""" + return LMCacheConnectorV1Backend( + { + "kv_connector": "LMCacheConnectorV1", + "kv_role": "kv_both", + } + ) + + @pytest.fixture + def lmcache_backend_with_extra(self): + """Fixture for LMCacheConnectorV1Backend with extra config.""" + return LMCacheConnectorV1Backend( + { + "kv_connector": "LMCacheConnectorV1", + "kv_role": "kv_both", + "kv_connector_extra_config": {}, + } + ) + + @pytest.fixture + def lmcache_backend_with_port(self): + """Fixture for LMCacheConnectorV1Backend with port config.""" + return LMCacheConnectorV1Backend( + { + "kv_connector": "LMCacheConnectorV1", + "kv_role": "kv_both", + "kv_connector_extra_config": { + "lmcache_rpc_port": LMCacheConnectorV1Backend.DEFAULT_LMCACHE_RPC_PORT_NAME, + }, + } + ) + + def test_setup_basic_config(self, lmcache_backend_basic): + """Test setup with basic configuration (no kv_connector_extra_config).""" + 
lmcache_backend_basic.setup() + + # Configuration should remain unchanged + assert ( + "kv_connector_extra_config" not in lmcache_backend_basic.kv_transfer_config + ) + + def test_setup_with_extra_config_no_port(self, lmcache_backend_with_extra): + """Test setup with extra config but no lmcache_rpc_port.""" + lmcache_backend_with_extra.setup() + + # Should add lmcache_rpc_port with default DEFAULT_LMCACHE_RPC_PORT_NAME + random string + assert ( + "lmcache_rpc_port" + in lmcache_backend_with_extra.kv_transfer_config[ + "kv_connector_extra_config" + ] + ) + port_value = lmcache_backend_with_extra.kv_transfer_config[ + "kv_connector_extra_config" + ]["lmcache_rpc_port"] + assert port_value.startswith( + LMCacheConnectorV1Backend.DEFAULT_LMCACHE_RPC_PORT_NAME + ) + assert len(port_value) > len( + LMCacheConnectorV1Backend.DEFAULT_LMCACHE_RPC_PORT_NAME + ) # Should have random string appended + + def test_setup_with_existing_port(self, lmcache_backend_with_port): + """Test setup with existing lmcache_rpc_port configuration.""" + original_port = lmcache_backend_with_port.kv_transfer_config[ + "kv_connector_extra_config" + ]["lmcache_rpc_port"] + + lmcache_backend_with_port.setup() + + # Should modify the existing port by appending random string + new_port = lmcache_backend_with_port.kv_transfer_config[ + "kv_connector_extra_config" + ]["lmcache_rpc_port"] + assert new_port.startswith(original_port) + assert len(new_port) > len(original_port) # Should have random string appended diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/vllm/kv_transfer_backends/test_nixl_connector.py b/python/ray/llm/tests/serve/cpu/deployments/llm/vllm/kv_transfer_backends/test_nixl_connector.py new file mode 100644 index 000000000000..aabe766c4a5a --- /dev/null +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/vllm/kv_transfer_backends/test_nixl_connector.py @@ -0,0 +1,48 @@ +import os +import uuid +from unittest.mock import patch + +import pytest + +from 
ray.llm._internal.serve.deployments.llm.vllm.kv_transfer_backends.nixl_connector import ( + NixlConnectorBackend, +) + + +@pytest.fixture +def engine_id(): + """Fixture for the engine ID.""" + return str(uuid.uuid4()) + + +class TestNixlConnectorBackend: + @pytest.fixture + def nixl_backend(self, engine_id: str): + """Fixture for the NixlConnectorBackend.""" + return NixlConnectorBackend( + dict( + kv_connector="NixlConnector", + kv_role="kv_both", + engine_id=engine_id, + ) + ) + + @pytest.mark.parametrize( + "env_vars", + [ + {}, + {"VLLM_NIXL_SIDE_CHANNEL_PORT": "8080"}, + {"VLLM_NIXL_SIDE_CHANNEL_HOST": "127.0.0.1"}, + { + "VLLM_NIXL_SIDE_CHANNEL_PORT": "8080", + "VLLM_NIXL_SIDE_CHANNEL_HOST": "127.0.0.1", + }, + ], + ) + def test_setup_environment_variables(self, nixl_backend, env_vars, engine_id: str): + """Test that setup configures environment variables and overrides engine_id correctly.""" + with patch.dict("os.environ", env_vars, clear=True): + nixl_backend.setup() + assert "VLLM_NIXL_SIDE_CHANNEL_PORT" in os.environ + assert "VLLM_NIXL_SIDE_CHANNEL_HOST" in os.environ + assert engine_id in nixl_backend.kv_transfer_config["engine_id"] diff --git a/python/ray/llm/tests/serve/cpu/deployments/prefill_decode_disagg/test_prefill_decode_disagg.py b/python/ray/llm/tests/serve/cpu/deployments/prefill_decode_disagg/test_prefill_decode_disagg.py index c6444a8b7152..75c50a0ba443 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/prefill_decode_disagg/test_prefill_decode_disagg.py +++ b/python/ray/llm/tests/serve/cpu/deployments/prefill_decode_disagg/test_prefill_decode_disagg.py @@ -9,7 +9,8 @@ class TestServingArgsParsing: - def test_parse_dict(self): + @pytest.mark.parametrize("kv_connector", ["NixlConnector", "LMCacheConnectorV1"]) + def test_parse_dict(self, kv_connector: str): prefill_config = LLMConfig( model_loading_config=dict( model_id="qwen-0.5b", diff --git a/python/ray/llm/tests/serve/cpu/deployments/routers/test_router.py 
b/python/ray/llm/tests/serve/cpu/deployments/routers/test_router.py index 4204231fd069..032e1e655c00 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/routers/test_router.py +++ b/python/ray/llm/tests/serve/cpu/deployments/routers/test_router.py @@ -115,6 +115,54 @@ async def test_completion(self, stream_batching_interval_ms, client, stream): expected_text = " ".join([f"test_{i}" for i in range(n_tokens)]) assert text.strip() == expected_text + @pytest.mark.asyncio + @pytest.mark.parametrize("stream", [True, False]) + async def test_tool_call(self, client, stream): + response = client.chat.completions.create( + model="llm_model_id", + messages=[ + { + "role": "user", + "content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?", + }, + { + "content": None, + "role": "assistant", + "tool_calls": [ + { + "id": "RBS92VTjJ", + "function": { + "arguments": '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}', + "name": "get_current_weather", + }, + "type": "function", + } + ], + }, + { + "role": "tool", + "content": "The weather in Dallas, TX is 85 degrees fahrenheit. It is partly cloudly, with highs in the 90's.", + "tool_call_id": "n3OMUpydP", + }, + ], + stream=stream, + max_tokens=200, + ) + + if stream: + text = "" + role = None + for chunk in response: + if chunk.choices[0].delta.role is not None and role is None: + role = chunk.choices[0].delta.role + if chunk.choices[0].delta.content: + text += chunk.choices[0].delta.content + else: + text = response.choices[0].message.content + role = response.choices[0].message.role + + assert text + def test_router_with_num_router_replicas_config(self): """Test the router with num_router_replicas config.""" # Test with no num_router_replicas config. 
diff --git a/python/ray/llm/tests/serve/cpu/deployments/test_prefix_aware_request_router.py b/python/ray/llm/tests/serve/cpu/deployments/test_prefix_aware_request_router.py index a3ac2ced6185..5d28ef464b06 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/test_prefix_aware_request_router.py +++ b/python/ray/llm/tests/serve/cpu/deployments/test_prefix_aware_request_router.py @@ -284,6 +284,74 @@ async def test_eviction_task_creation(self, prefix_request_router): ray.get(prefix_request_router._tree_actor.stop_eviction_loop.remote()) await asyncio.sleep(0.1) + +class TestPromptNormalization: + """Tests for input normalization in the prefix-aware router.""" + + def test_normalize_prompt_string(self, prefix_request_router): + req = fake_pending_request(prompt="Hello world") + normalized = prefix_request_router._extract_text_from_request(req) + assert normalized == "Hello world" + + def test_normalize_messages_list_of_strings(self, prefix_request_router): + req = fake_pending_request(messages=["Hello", " ", "world"]) + normalized = prefix_request_router._extract_text_from_request(req) + assert normalized == "Hello world" + + def test_normalize_messages_dict_content_string(self, prefix_request_router): + req = fake_pending_request( + messages=[ + {"content": "Hello"}, + {"content": " world"}, + ] + ) + normalized = prefix_request_router._extract_text_from_request(req) + assert normalized == "Hello world" + + def test_normalize_messages_dict_content_list_of_dicts_text( + self, prefix_request_router + ): + req = fake_pending_request( + messages=[ + { + "content": [ + {"type": "text", "text": "Hello"}, + {"type": "text", "text": " world"}, + ] + } + ] + ) + normalized = prefix_request_router._extract_text_from_request(req) + assert normalized == "Hello world" + + def test_normalize_messages_dict_content_list_of_strings( + self, prefix_request_router + ): + req = fake_pending_request(messages=[{"content": ["Hello", " ", "world"]}]) + normalized = 
prefix_request_router._extract_text_from_request(req) + assert normalized == "Hello world" + + def test_normalize_unsupported_returns_empty(self, prefix_request_router): + # For now, unsupported multimodal parts should be ignored, resulting in empty string + req = fake_pending_request( + messages=[ + { + "content": [ + { + "type": "image_url", + "image_url": {"url": "http://example.com"}, + }, + ] + } + ] + ) + normalized = prefix_request_router._extract_text_from_request(req) + assert normalized == "" + + def test_extract_raises_when_no_prompt_or_messages(self, prefix_request_router): + with pytest.raises(ValueError): + _ = prefix_request_router._extract_text_from_request(fake_pending_request()) + @pytest.mark.asyncio @pytest.mark.parametrize( "prefix_request_router", @@ -331,3 +399,10 @@ async def test_eviction_threshold_behavior(self, prefix_request_router): ray.get(prefix_request_router._tree_actor.stop_eviction_loop.remote()) await asyncio.sleep(0.1) + + +if __name__ == "__main__": + import sys + + exit_code = pytest.main(["-vs", __file__]) + sys.exit(exit_code) diff --git a/python/ray/llm/tests/serve/cpu/observability/usage_telemetry/test_usage.py b/python/ray/llm/tests/serve/cpu/observability/usage_telemetry/test_usage.py index 3b561d8c8312..022314082619 100644 --- a/python/ray/llm/tests/serve/cpu/observability/usage_telemetry/test_usage.py +++ b/python/ray/llm/tests/serve/cpu/observability/usage_telemetry/test_usage.py @@ -13,6 +13,7 @@ from ray.llm._internal.serve.observability.usage_telemetry.usage import ( HardwareUsage, _get_or_create_telemetry_agent, + _retry_get_telemetry_agent, push_telemetry_report_for_all_models, ) @@ -136,6 +137,35 @@ def fake_get_gpu_type(*args, **kwargs): } +@ray.remote(num_cpus=0) +class Replica: + def wait_for_init(self): + """ + When this method returns, the actor initialization is guaranteed + to be complete. 
+ + This is used for synchronization between multiple replicas, + increasing the chance for get_telemetry_agent() to be called + at the same time. + """ + pass + + def get_telemetry_agent(self): + return _retry_get_telemetry_agent() + + +def test_telemetry_race_condition(): + replicas = [Replica.remote() for _ in range(30)] + init_refs = [replica.wait_for_init.remote() for replica in replicas] + ray.get(init_refs) + + get_refs = [replica.get_telemetry_agent.remote() for replica in replicas] + telemetry_agents = ray.get(get_refs) + for telemetry_agent in telemetry_agents: + assert telemetry_agent is not None + assert len(set(telemetry_agents)) == 1 + + def test_infer_gpu_from_hardware(): # Test with a valid GPU type def fake_get_gpu_type(*args, **kwargs): diff --git a/python/ray/llm/tests/serve/gpu/deployments/llm/prefill_decode_disagg/test_prefill_decode_disagg_gpu.py b/python/ray/llm/tests/serve/gpu/deployments/llm/prefill_decode_disagg/test_prefill_decode_disagg_gpu.py index 6083ae772ea2..b4da68d79087 100644 --- a/python/ray/llm/tests/serve/gpu/deployments/llm/prefill_decode_disagg/test_prefill_decode_disagg_gpu.py +++ b/python/ray/llm/tests/serve/gpu/deployments/llm/prefill_decode_disagg/test_prefill_decode_disagg_gpu.py @@ -1,7 +1,7 @@ import sys +from unittest.mock import MagicMock import pytest -from vllm.config import KVTransferConfig from ray.llm._internal.serve.configs.server_models import ( LLMConfig, @@ -15,17 +15,23 @@ class TestPDDisaggVLLMEngine: """Test vLLM engine under PD disagg.""" @pytest.mark.asyncio + @pytest.mark.parametrize("kv_connector", ["NixlConnector", "LMCacheConnectorV1"]) async def test_pd_disagg_vllm_engine( self, # llm_config is a fixture defined in serve.tests.conftest.py llm_config: LLMConfig, + kv_connector: str, + monkeypatch, ): """Test vLLM engine under PD disagg.""" + if kv_connector == "LMCacheConnectorV1": + lmcache_mock = MagicMock() + monkeypatch.setitem(sys.modules, "lmcache", lmcache_mock) llm_config = 
llm_config.model_copy(deep=True) llm_config.engine_kwargs.update( { - "kv_transfer_config": KVTransferConfig( - kv_connector="NixlConnector", + "kv_transfer_config": dict( + kv_connector=kv_connector, kv_role="kv_both", ), } diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index 5db89b1f3c0c..068621fbfa71 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -13,6 +13,8 @@ EmbeddingRequest, EmbeddingResponse, ErrorResponse, + ScoreRequest, + ScoreResponse, ) from ray.llm._internal.serve.configs.server_models import ( DiskMultiplexConfig, @@ -51,6 +53,21 @@ async def check_health(self) -> None: if not self.started: raise RuntimeError("Engine not started") + async def reset_prefix_cache(self) -> None: + """Reset the prefix cache of the mock engine.""" + if not self.started: + raise RuntimeError("Engine not started") + + async def start_profile(self) -> None: + """Start profiling of the mock engine.""" + if not self.started: + raise RuntimeError("Engine not started") + + async def stop_profile(self) -> None: + """Stop profiling of the mock engine.""" + if not self.started: + raise RuntimeError("Engine not started") + async def chat( self, request: ChatCompletionRequest ) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: @@ -120,6 +137,41 @@ async def embeddings( ) yield response + async def score( + self, request: ScoreRequest + ) -> AsyncGenerator[Union[str, ScoreResponse, ErrorResponse], None]: + """Mock score generation for text pairs.""" + if not self.started: + raise RuntimeError("Engine not started") + + # Extract text_1 and text_2 from the request + text_1 = getattr(request, "text_1", "") + text_2 = getattr(request, "text_2", "") + + # Convert to lists if they aren't already + text_1_list = text_1 if isinstance(text_1, list) else [text_1] + text_2_list = text_2 if isinstance(text_2, list) 
else [text_2] + + # Generate mock scores for each pair + score_data = [] + for i, (t1, t2) in enumerate(zip(text_1_list, text_2_list)): + # Generate a random score (can be any float value) + score = random.uniform(-10.0, 10.0) + + score_data.append({"object": "score", "score": score, "index": i}) + + # Create the response + response = ScoreResponse( + object="list", + data=score_data, + model=getattr(request, "model", "mock-model"), + usage={ + "prompt_tokens": len(str(text_1).split()) + len(str(text_2).split()), + "total_tokens": len(str(text_1).split()) + len(str(text_2).split()), + }, + ) + yield response + async def _generate_chat_response( self, request: ChatCompletionRequest, prompt_text: str, max_tokens: int ) -> AsyncGenerator[Union[str, ChatCompletionResponse], None]: diff --git a/python/ray/llm/tests/serve/utils/testing_utils.py b/python/ray/llm/tests/serve/utils/testing_utils.py index fd62a3034deb..496dd2b50b0b 100644 --- a/python/ray/llm/tests/serve/utils/testing_utils.py +++ b/python/ray/llm/tests/serve/utils/testing_utils.py @@ -11,6 +11,7 @@ ChatCompletionResponse, CompletionResponse, EmbeddingResponse, + ScoreResponse, ) @@ -94,3 +95,16 @@ def validate_embedding_response( # Check dimensions if specified if expected_dimensions: assert len(response.data[0].embedding) == expected_dimensions + + @staticmethod + def validate_score_response(response: ScoreResponse): + """Validate score responses.""" + assert isinstance(response, ScoreResponse) + assert response.object == "list" + assert len(response.data) >= 1 + + # Validate each score data element + for i, score_data in enumerate(response.data): + assert score_data.object == "score" + assert isinstance(score_data.score, float) + assert score_data.index == i # Index should match position in list diff --git a/python/ray/remote_function.py b/python/ray/remote_function.py index ac661dde1a23..0d6d20e44579 100644 --- a/python/ray/remote_function.py +++ b/python/ray/remote_function.py @@ -9,12 +9,12 @@ import 
ray._common.signature from ray import Language, cross_language from ray._common import ray_option_utils +from ray._common.ray_option_utils import _warn_if_using_deprecated_placement_group from ray._private.auto_init_hook import wrap_auto_init from ray._private.client_mode_hook import ( client_mode_convert_function, client_mode_should_convert, ) -from ray._common.ray_option_utils import _warn_if_using_deprecated_placement_group from ray._private.serialization import pickle_dumps from ray._private.utils import get_runtime_env_info, parse_runtime_env_for_task_or_actor from ray._raylet import ( diff --git a/python/ray/runtime_context.py b/python/ray/runtime_context.py index 567b7ac2c7e7..df669f3447f5 100644 --- a/python/ray/runtime_context.py +++ b/python/ray/runtime_context.py @@ -1,6 +1,6 @@ import logging -from typing import Any, Dict, List, Optional import threading +from typing import Any, Dict, List, Optional import ray._private.worker from ray._private.client_mode_hook import client_mode_hook @@ -501,6 +501,7 @@ def current_actor(self): @property def gcs_address(self): """Get the GCS address of the ray cluster. + Returns: The GCS address of the cluster. 
""" diff --git a/python/ray/runtime_env/BUILD b/python/ray/runtime_env/BUILD.bazel similarity index 100% rename from python/ray/runtime_env/BUILD rename to python/ray/runtime_env/BUILD.bazel diff --git a/python/ray/runtime_env/runtime_env.py b/python/ray/runtime_env/runtime_env.py index 0682c48539fb..b1ef6c87a743 100644 --- a/python/ray/runtime_env/runtime_env.py +++ b/python/ray/runtime_env/runtime_env.py @@ -13,11 +13,11 @@ from ray._private.runtime_env.plugin_schema_manager import RuntimeEnvPluginSchemaManager from ray._private.runtime_env.uv import get_uri as get_uv_uri from ray._private.runtime_env.validation import ( - OPTION_TO_VALIDATION_FN, OPTION_TO_NO_PATH_VALIDATION_FN, + OPTION_TO_VALIDATION_FN, ) from ray._private.thirdparty.dacite import from_dict -from ray.core.generated.runtime_env_common_pb2 import ( +from ray.core.generated.runtime_environment_pb2 import ( RuntimeEnvConfig as ProtoRuntimeEnvConfig, ) from ray.util.annotations import PublicAPI diff --git a/python/ray/scripts/BUILD b/python/ray/scripts/BUILD.bazel similarity index 100% rename from python/ray/scripts/BUILD rename to python/ray/scripts/BUILD.bazel diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index 1e28c74efd0b..b1aa3e6440bb 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -3,6 +3,7 @@ import logging import os import platform +import shutil import signal import subprocess import sys @@ -10,35 +11,32 @@ import urllib import urllib.parse import warnings -import shutil from datetime import datetime -from typing import Optional, Set, List, Tuple -from ray._common.utils import load_class -from ray.dashboard.modules.metrics import install_and_start_prometheus -from ray.util.check_open_ports import check_open_ports -import requests +from typing import List, Optional, Set, Tuple import click import colorama -import psutil +import requests import yaml import ray +import ray._common.usage.usage_constants as usage_constant import 
ray._private.ray_constants as ray_constants import ray._private.services as services +from ray._common.network_utils import build_address, parse_address +from ray._common.usage import usage_lib +from ray._common.utils import load_class +from ray._private.internal_api import memory_summary from ray._private.label_utils import ( - parse_node_labels_json, parse_node_labels_from_yaml_file, + parse_node_labels_json, parse_node_labels_string, ) +from ray._private.resource_isolation_config import ResourceIsolationConfig from ray._private.utils import ( get_ray_client_dependency_error, parse_resources_json, ) -from ray._common.network_utils import parse_address, build_address -from ray._private.internal_api import memory_summary -from ray._common.usage import usage_lib -import ray._common.usage.usage_constants as usage_constant from ray.autoscaler._private.cli_logger import add_click_logging_options, cf, cli_logger from ray.autoscaler._private.commands import ( RUN_ENV_TYPES, @@ -57,10 +55,12 @@ ) from ray.autoscaler._private.constants import RAY_PROCESSES from ray.autoscaler._private.fake_multi_node.node_provider import FAKE_HEAD_NODE_ID -from ray.util.annotations import PublicAPI from ray.core.generated import autoscaler_pb2 -from ray._private.resource_isolation_config import ResourceIsolationConfig +from ray.dashboard.modules.metrics import install_and_start_prometheus +from ray.util.annotations import PublicAPI +from ray.util.check_open_ports import check_open_ports +import psutil logger = logging.getLogger(__name__) @@ -983,7 +983,7 @@ def start( cli_logger.print("To submit a Ray job using the Ray Jobs CLI:") cli_logger.print( cf.bold( - " RAY_ADDRESS='http://{}' ray job submit " + " RAY_API_SERVER_ADDRESS='http://{}' ray job submit " "--working-dir . 
" "-- python my_script.py" ), @@ -2619,33 +2619,29 @@ def cpp(show_library_path, generate_bazel_project_template_to): cli_logger.print("Ray C++ include path {} ", cf.bold(f"{include_dir}")) cli_logger.print("Ray C++ library path {} ", cf.bold(f"{lib_dir}")) if generate_bazel_project_template_to: + out_dir = generate_bazel_project_template_to # copytree expects that the dst dir doesn't exist # so we manually delete it if it exists. - if os.path.exists(generate_bazel_project_template_to): - shutil.rmtree(generate_bazel_project_template_to) - shutil.copytree(cpp_templete_dir, generate_bazel_project_template_to) - out_include_dir = os.path.join( - generate_bazel_project_template_to, "thirdparty/include" - ) - if os.path.exists(out_include_dir): - shutil.rmtree(out_include_dir) + if os.path.exists(out_dir): + shutil.rmtree(out_dir) + + shutil.copytree(cpp_templete_dir, out_dir) + for filename in ["_WORKSPACE", "_BUILD.bazel", "_.bazelrc"]: + # Renames the bazel related files by removing the leading underscore. 
+ dest_name = os.path.join(out_dir, filename[1:]) + shutil.move(os.path.join(out_dir, filename), dest_name) + + out_include_dir = os.path.join(out_dir, "thirdparty/include") shutil.copytree(include_dir, out_include_dir) - out_lib_dir = os.path.join(generate_bazel_project_template_to, "thirdparty/lib") - if os.path.exists(out_lib_dir): - shutil.rmtree(out_lib_dir) + out_lib_dir = os.path.join(out_dir, "thirdparty/lib") shutil.copytree(lib_dir, out_lib_dir) cli_logger.print( "Project template generated to {}", - cf.bold(f"{os.path.abspath(generate_bazel_project_template_to)}"), + cf.bold(f"{os.path.abspath(out_dir)}"), ) cli_logger.print("To build and run this template, run") - cli_logger.print( - cf.bold( - f" cd {os.path.abspath(generate_bazel_project_template_to)}" - " && bash run.sh" - ) - ) + cli_logger.print(cf.bold(f" cd {os.path.abspath(out_dir)} && bash run.sh")) @cli.command(hidden=True) @@ -2725,9 +2721,9 @@ def add_command_alias(command, name, hidden): try: from ray.util.state.state_cli import ( + logs_state_cli_group, ray_get, ray_list, - logs_state_cli_group, summary_state_cli_group, ) diff --git a/python/ray/scripts/symmetric_run.py b/python/ray/scripts/symmetric_run.py new file mode 100644 index 000000000000..22f43e58ad57 --- /dev/null +++ b/python/ray/scripts/symmetric_run.py @@ -0,0 +1,262 @@ +"""Symmetric Run for Ray.""" + +import socket +import subprocess +import sys +import time +from typing import List + +import click + +import ray +from ray._private.ray_constants import env_integer +from ray._raylet import GcsClient + +import psutil + +CLUSTER_WAIT_TIMEOUT = env_integer("RAY_SYMMETRIC_RUN_CLUSTER_WAIT_TIMEOUT", 30) + + +def check_ray_already_started() -> bool: + import ray._private.services as services + + # Try auto-detecting the Ray instance. + running_gcs_addresses = services.find_gcs_addresses() + return len(running_gcs_addresses) > 0 + + +def check_cluster_ready(nnodes, timeout=CLUSTER_WAIT_TIMEOUT): + """Wait for all nodes to start. 
+ + Raises an exception if the nodes don't start in time. + """ + start_time = time.time() + current_nodes = 1 + ray.init(ignore_reinit_error=True) + + while time.time() - start_time < timeout: + time.sleep(5) + current_nodes = len(ray.nodes()) + if current_nodes == nnodes: + return True + else: + click.echo( + f"Waiting for nodes to start... {current_nodes}/{nnodes} nodes started" + ) + return False + + +def check_head_node_ready(address: str, timeout=CLUSTER_WAIT_TIMEOUT): + start_time = time.time() + gcs_client = GcsClient(address=address) + while time.time() - start_time < timeout: + if gcs_client.check_alive([], timeout=1): + click.echo("Ray cluster is ready!") + return True + time.sleep(5) + return False + + +def curate_and_validate_ray_start_args(run_and_start_args: List[str]) -> List[str]: + # Reparse the arguments to remove symmetric_run arguments. + ctx = symmetric_run.make_context("_", run_and_start_args, resilient_parsing=True) + cleaned_args = list(ctx.params["ray_args_and_entrypoint"]) + + for arg in cleaned_args: + if arg == "--head": + raise click.ClickException("Cannot use --head option in symmetric_run.") + if arg == "--node-ip-address": + raise click.ClickException( + "Cannot use --node-ip-address option in symmetric_run." + ) + if arg == "--port": + raise click.ClickException("Cannot use --port option in symmetric_run.") + if arg == "--block": + raise click.ClickException("Cannot use --block option in symmetric_run.") + + return cleaned_args + + +@click.command( + name="symmetric_run", + context_settings={"ignore_unknown_options": True, "allow_extra_args": True}, + help="""Command to start Ray across all nodes and execute an entrypoint command. + +USAGE: + + python -m ray.scripts.symmetric_run --address ADDRESS +[--min-nodes NUM_NODES] [RAY_START_OPTIONS] -- [ENTRYPOINT_COMMAND] + +DESCRIPTION: + + This command (1) starts a Ray cluster across all nodes, +(2) runs a command on the head node, and (3) stops the Ray cluster. 
+ + The '--' separator is required to distinguish between Ray start arguments +and the entrypoint command. The --min-nodes option is optional and +can be used to wait for a specific number of nodes to start. + +EXAMPLES: + + # Start Ray with default settings and run a Python script + + python -m ray.scripts.symmetric_run --address 127.0.0.1:6379 -- python my_script.py + + # Start Ray with specific head node and run a command + + python -m ray.scripts.symmetric_run --address 127.0.0.1:6379 --min-nodes 4 -- python train_model.py --epochs=100 + + # Start Ray and run a multi-word command + + python -m ray.scripts.symmetric_run --address 127.0.0.1:6379 --min-nodes 4 --num-cpus=4 -- python -m my_module --config=prod + +RAY START OPTIONS: + + Most ray start command options are supported. Arguments that are not +supported are: --head, --node-ip-address, --port, --block. + +SEPARATOR REQUIREMENT: + + The '--' separator is mandatory and must appear between Ray start + arguments and the entrypoint command. This ensures clear separation + between the two sets of arguments. +""", +) +@click.option( + "--address", required=True, type=str, help="The address of the Ray cluster." 
+) +@click.option( + "--min-nodes", + type=int, + help="If provided, wait for this number of nodes to start.", +) +@click.argument("ray_args_and_entrypoint", nargs=-1, type=click.UNPROCESSED) +def symmetric_run(address, min_nodes, ray_args_and_entrypoint): + all_args = sys.argv[1:] + separator = all_args.index("--") + + if separator == -1: + raise click.ClickException("No separator '--' found in arguments.") + + run_and_start_args, entrypoint_on_head = ( + all_args[:separator], + all_args[separator + 1 :], + ) + + ray_start_args = curate_and_validate_ray_start_args(run_and_start_args) + + min_nodes = 1 if min_nodes is None else min_nodes + + if not entrypoint_on_head: + raise click.ClickException("No entrypoint command provided.") + + if check_ray_already_started(): + raise click.ClickException("Ray is already started on this node.") + + # 1. Parse address and check if we are on the head node. + gcs_host_port = ray._common.network_utils.parse_address(address) + if gcs_host_port is None: + raise click.ClickException( + f"Invalid address format: {address}, should be `host:port`" + ) + gcs_host, gcs_port = gcs_host_port + + try: + # AF_UNSPEC allows resolving both IPv4 and IPv6 + addrinfo = socket.getaddrinfo( + gcs_host, gcs_port, socket.AF_UNSPEC, socket.SOCK_STREAM + ) + resolved_gcs_host = addrinfo[0][4][0] + except socket.gaierror: + raise click.ClickException(f"Could not resolve hostname: {gcs_host}") + + my_ips = [] + for iface, addrs in psutil.net_if_addrs().items(): + for addr in addrs: + # Look for AF_INET (IPv4) or AF_INET6 (IPv6) + if addr.family in [ + socket.AddressFamily.AF_INET, + socket.AddressFamily.AF_INET6, + ]: + my_ips.append(addr.address) + + if min_nodes > 1: + # Ban localhost ips if we are not running on a single node + # to avoid starting N head nodes + my_ips = [ip for ip in my_ips if ip != "127.0.0.1" and ip != "::1"] + + is_head = resolved_gcs_host in my_ips + + result = None + # 2. Start Ray and run commands. 
+ try: + if is_head: + # On the head node, start Ray, run the command, then stop Ray. + click.echo("On head node. Starting Ray cluster head...") + + # Build the ray start command with all parameters + ray_start_cmd = [ + "ray", + "start", + "--head", + f"--node-ip-address={resolved_gcs_host}", + f"--port={gcs_port}", + *ray_start_args, + ] + + # Start Ray head. This runs in the background and hides output. + subprocess.run(ray_start_cmd, check=True, capture_output=True) + click.echo("Head node started.") + click.echo("=======================") + if min_nodes > 1 and not check_cluster_ready(min_nodes): + raise click.ClickException( + "Timed out waiting for other nodes to start." + ) + + click.echo( + f"Running command on head node: {entrypoint_on_head}", + ) + click.echo("=======================") + result = subprocess.run(entrypoint_on_head) + click.echo("=======================") + else: + # On a worker node, start Ray and connect to the head. + click.echo(f"On worker node. Connecting to Ray cluster at {address}...") + + if not check_head_node_ready(address): + raise click.ClickException("Timed out waiting for head node to start.") + + # Build the ray start command for worker nodes with all parameters + ray_start_cmd = [ + "ray", + "start", + "--address", + address, + "--block", + *ray_start_args, + ] + + # This command will block until the Ray cluster is stopped. + subprocess.run(ray_start_cmd, check=True) + + except subprocess.CalledProcessError as e: + click.echo(f"Failed to start Ray: {e}", err=True) + if e.stdout: + click.echo(f"stdout:\n{e.stdout.decode()}", err=True) + if e.stderr: + click.echo(f"stderr:\n{e.stderr.decode()}", err=True) + except KeyboardInterrupt: + # This can be triggered by ctrl-c on the user's side. + click.echo("Interrupted by user.", err=True) + finally: + # Stop Ray cluster. + subprocess.run(["ray", "stop"]) + + # Propagate the exit code of the user script. 
+ if result is not None and result.returncode != 0: + click.echo(f"Command failed with return code {result.returncode}", err=True) + sys.exit(result.returncode) + + +if __name__ == "__main__": + symmetric_run() diff --git a/python/ray/serve/BUILD b/python/ray/serve/BUILD.bazel similarity index 100% rename from python/ray/serve/BUILD rename to python/ray/serve/BUILD.bazel diff --git a/python/ray/serve/__init__.py b/python/ray/serve/__init__.py index 0d5b38cf84fe..379b118ac1f5 100644 --- a/python/ray/serve/__init__.py +++ b/python/ray/serve/__init__.py @@ -19,6 +19,7 @@ run, run_many, shutdown, + shutdown_async, start, status, ) @@ -47,6 +48,7 @@ "HTTPOptions", "get_replica_context", "shutdown", + "shutdown_async", "ingress", "deployment", "run", diff --git a/python/ray/serve/_private/application_state.py b/python/ray/serve/_private/application_state.py index 41108f62edd4..6f5a36bb3d59 100644 --- a/python/ray/serve/_private/application_state.py +++ b/python/ray/serve/_private/application_state.py @@ -400,10 +400,18 @@ def _clear_target_state_and_store_config( deleting=False, ) - def _delete_deployment(self, name): + def _delete_deployment(self, name: str) -> bool: + """Delete a deployment in the application. + + Args: + name: The name of the deployment to delete. + + Returns: + Whether the target state has changed. + """ id = DeploymentID(name=name, app_name=self._name) self._endpoint_state.delete_endpoint(id) - self._deployment_state_manager.delete_deployment(id) + return self._deployment_state_manager.delete_deployment(id) def delete(self): """Delete the application""" @@ -426,8 +434,16 @@ def apply_deployment_info( self, deployment_name: str, deployment_info: DeploymentInfo, - ) -> None: - """Deploys a deployment in the application.""" + ) -> bool: + """Deploys a deployment in the application. + + Args: + deployment_name: The name of the deployment to apply. + deployment_info: The deployment info to apply. + + Returns: + Whether the target state has changed. 
+ """ route_prefix = deployment_info.route_prefix if route_prefix is not None and not route_prefix.startswith("/"): raise RayServeException( @@ -436,7 +452,9 @@ def apply_deployment_info( deployment_id = DeploymentID(name=deployment_name, app_name=self._name) - self._deployment_state_manager.deploy(deployment_id, deployment_info) + target_state_changed = self._deployment_state_manager.deploy( + deployment_id, deployment_info + ) if deployment_info.route_prefix is not None: config = deployment_info.deployment_config @@ -457,6 +475,8 @@ def apply_deployment_info( else: self._endpoint_state.delete_endpoint(deployment_id) + return target_state_changed + def deploy_app(self, deployment_infos: Dict[str, DeploymentInfo]): """(Re-)deploy the application from list of deployment infos. @@ -761,6 +781,7 @@ def _reconcile_target_deployments(self) -> None: Ensure each deployment is running on up-to-date info, and remove outdated deployments from the application. """ + target_state_changed = False # Set target state for each deployment for deployment_name, info in self._target_state.deployment_infos.items(): @@ -784,26 +805,34 @@ def _reconcile_target_deployments(self) -> None: deploy_info.deployment_config.logging_config = ( self._target_state.config.logging_config ) - self.apply_deployment_info(deployment_name, deploy_info) + target_state_changed = ( + self.apply_deployment_info(deployment_name, deploy_info) + or target_state_changed + ) # Delete outdated deployments for deployment_name in self._get_live_deployments(): if deployment_name not in self.target_deployments: - self._delete_deployment(deployment_name) + target_state_changed = ( + self._delete_deployment(deployment_name) or target_state_changed + ) + + return target_state_changed - def update(self) -> bool: + def update(self) -> Tuple[bool, bool]: """Attempts to reconcile this application to match its target state. Updates the application status and status message based on the current state of the system. 
Returns: - A boolean indicating whether the application is ready to be - deleted. + Whether the target state has changed. """ infos, task_status, msg = self._reconcile_build_app_task() + target_state_changed = False if task_status == BuildAppStatus.SUCCEEDED: + target_state_changed = True self._set_target_state( deployment_infos=infos, code_version=self._build_app_task_info.code_version, @@ -821,14 +850,16 @@ def update(self) -> bool: # it's not finished, we don't know what the target list of deployments # is, so we don't perform any reconciliation. if self._target_state.deployment_infos is not None: - self._reconcile_target_deployments() + target_state_changed = ( + self._reconcile_target_deployments() or target_state_changed + ) status, status_msg = self._determine_app_status() self._update_status(status, status_msg) # Check if app is ready to be deleted if self._target_state.deleting: - return self.is_deleted() - return False + return self.is_deleted(), target_state_changed + return False, target_state_changed def get_checkpoint_data(self) -> ApplicationTargetState: return self._target_state @@ -1073,12 +1104,30 @@ def get_ingress_deployment_name(self, name: str) -> Optional[str]: def get_app_source(self, name: str) -> APIType: return self._application_states[name].api_type - def list_app_statuses(self) -> Dict[str, ApplicationStatusInfo]: - """Return a dictionary with {app name: application info}""" - return { - name: self._application_states[name].get_application_status_info() - for name in self._application_states - } + def list_app_statuses( + self, source: Optional[APIType] = None + ) -> Dict[str, ApplicationStatusInfo]: + """Return a dictionary with {app name: application info} + + Args: + source: Optional API type filter. If provided, only returns apps + deployed via the specified API type. + + Returns: + Dict[str, ApplicationStatusInfo]: A dictionary mapping application names + to their corresponding status information. 
+ """ + if source is None: + return { + name: self._application_states[name].get_application_status_info() + for name in self._application_states + } + else: + return { + name: self._application_states[name].get_application_status_info() + for name in self._application_states + if self.get_app_source(name) is source + } def list_deployment_details(self, name: str) -> Dict[str, DeploymentDetails]: """Gets detailed info on all deployments in specified application.""" @@ -1087,10 +1136,14 @@ def list_deployment_details(self, name: str) -> Dict[str, DeploymentDetails]: return self._application_states[name].list_deployment_details() def update(self): - """Update each application state""" + """Update each application state.""" apps_to_be_deleted = [] + any_target_state_changed = False for name, app in self._application_states.items(): - ready_to_be_deleted = app.update() + ready_to_be_deleted, app_target_state_changed = app.update() + any_target_state_changed = ( + any_target_state_changed or app_target_state_changed + ) if ready_to_be_deleted: apps_to_be_deleted.append(name) logger.debug(f"Application '{name}' deleted successfully.") @@ -1100,6 +1153,10 @@ def update(self): del self._application_states[app_name] ServeUsageTag.NUM_APPS.record(str(len(self._application_states))) + if any_target_state_changed: + self.save_checkpoint() + self._deployment_state_manager.save_checkpoint() + def shutdown(self) -> None: self._shutting_down = True diff --git a/python/ray/serve/_private/autoscaling_state.py b/python/ray/serve/_private/autoscaling_state.py index e8adb9562998..e7eace67b2d2 100644 --- a/python/ray/serve/_private/autoscaling_state.py +++ b/python/ray/serve/_private/autoscaling_state.py @@ -1,12 +1,14 @@ import logging import time from dataclasses import dataclass -from typing import Dict, List, Optional, Set +from typing import Any, Dict, List, Optional, Set from ray.serve._private.common import ( - DeploymentHandleSource, + RUNNING_REQUESTS_KEY, DeploymentID, + 
HandleMetricReport, ReplicaID, + ReplicaMetricReport, TargetCapacityDirection, ) from ray.serve._private.constants import ( @@ -20,62 +22,42 @@ @dataclass -class HandleMetricReport: - """Report from a deployment handle on queued and ongoing requests. - - Args: - actor_id: If the deployment handle (from which this metric was - sent) lives on an actor, the actor ID of that actor. - handle_source: Describes what kind of entity holds this - deployment handle: a Serve proxy, a Serve replica, or - unknown. - queued_requests: The current number of queued requests at the - handle, i.e. requests that haven't been assigned to any - replica yet. - running_requests: A map of replica ID to the average number of - requests, assigned through the handle, running at that - replica. - timestamp: The time at which this report was received. - """ +class AutoscalingContext: + """Rich context provided to custom autoscaling policies.""" - actor_id: Optional[str] - handle_source: DeploymentHandleSource - queued_requests: float - running_requests: Dict[ReplicaID, float] - timestamp: float - - @property - def total_requests(self) -> float: - """Total number of queued and running requests.""" - return self.queued_requests + sum(self.running_requests.values()) - - @property - def is_serve_component_source(self) -> bool: - """Whether the handle source is a Serve actor. - - More specifically, this returns whether a Serve actor tracked - by the controller holds the deployment handle that sent this - report. If the deployment handle lives on a driver, a Ray task, - or an actor that's not a Serve replica, then this returns False. 
- """ - return self.handle_source in [ - DeploymentHandleSource.PROXY, - DeploymentHandleSource.REPLICA, - ] + # Deployment information + deployment_id: DeploymentID + deployment_name: str + app_name: Optional[str] + # Current state + current_num_replicas: int + target_num_replicas: int + running_replicas: List[ReplicaID] -@dataclass -class ReplicaMetricReport: - """Report from a replica on ongoing requests. + # Built-in metrics + total_num_requests: float + queued_requests: Optional[float] + requests_per_replica: Dict[ReplicaID, float] - Args: - running_requests: Average number of running requests at the - replica. - timestamp: The time at which this report was received. - """ + # Custom metrics + aggregated_metrics: Dict[str, Dict[ReplicaID, float]] + raw_metrics: Dict[str, Dict[ReplicaID, List[float]]] - running_requests: float - timestamp: float + # Capacity and bounds + capacity_adjusted_min_replicas: int + capacity_adjusted_max_replicas: int + + # Policy state + policy_state: Dict[str, Any] + + # Timing + last_scale_up_time: Optional[float] + last_scale_down_time: Optional[float] + current_time: Optional[float] + + # Config + config: Optional[Any] class AutoscalingState: @@ -175,47 +157,32 @@ def apply_bounds(self, num_replicas: int) -> int: ) def record_request_metrics_for_replica( - self, replica_id: ReplicaID, window_avg: Optional[float], send_timestamp: float + self, replica_metric_report: ReplicaMetricReport ) -> None: """Records average number of ongoing requests at a replica.""" - if window_avg is None: - return - + replica_id = replica_metric_report.replica_id + send_timestamp = replica_metric_report.timestamp if ( replica_id not in self._replica_requests or send_timestamp > self._replica_requests[replica_id].timestamp ): - self._replica_requests[replica_id] = ReplicaMetricReport( - running_requests=window_avg, - timestamp=send_timestamp, - ) + self._replica_requests[replica_id] = replica_metric_report def record_request_metrics_for_handle( self, - *, 
- handle_id: str, - actor_id: Optional[str], - handle_source: DeploymentHandleSource, - queued_requests: float, - running_requests: Dict[ReplicaID, float], - send_timestamp: float, + handle_metric_report: HandleMetricReport, ) -> None: """Records average number of queued and running requests at a handle for this deployment. """ - + handle_id = handle_metric_report.handle_id + send_timestamp = handle_metric_report.timestamp if ( handle_id not in self._handle_requests or send_timestamp > self._handle_requests[handle_id].timestamp ): - self._handle_requests[handle_id] = HandleMetricReport( - actor_id=actor_id, - handle_source=handle_source, - queued_requests=queued_requests, - running_requests=running_requests, - timestamp=send_timestamp, - ) + self._handle_requests[handle_id] = handle_metric_report def drop_stale_handle_metrics(self, alive_serve_actor_ids: Set[str]) -> None: """Drops handle metrics that are no longer valid. @@ -270,16 +237,29 @@ def get_decision_num_replicas( `_skip_bound_check` is True, then the bounds are not applied. 
""" - decision_num_replicas = self._policy( - curr_target_num_replicas=curr_target_num_replicas, + autoscaling_context: AutoscalingContext = AutoscalingContext( + deployment_id=self._deployment_id, + deployment_name=self._deployment_id.name, + app_name=self._deployment_id.app_name, + current_num_replicas=len(self._running_replicas), + target_num_replicas=curr_target_num_replicas, + running_replicas=self._running_replicas, total_num_requests=self.get_total_num_requests(), - num_running_replicas=len(self._running_replicas), - config=self._config, capacity_adjusted_min_replicas=self.get_num_replicas_lower_bound(), capacity_adjusted_max_replicas=self.get_num_replicas_upper_bound(), - policy_state=self._policy_state, + policy_state=self._policy_state.copy(), + current_time=time.time(), + config=self._config, + queued_requests=None, + requests_per_replica=None, + aggregated_metrics=None, + raw_metrics=None, + last_scale_up_time=None, + last_scale_down_time=None, ) + decision_num_replicas, self._policy_state = self._policy(autoscaling_context) + if _skip_bound_check: return decision_num_replicas @@ -301,16 +281,22 @@ def get_total_num_requests(self) -> float: for id in self._running_replicas: if id in self._replica_requests: - total_requests += self._replica_requests[id].running_requests + total_requests += self._replica_requests[id].aggregated_metrics.get( + RUNNING_REQUESTS_KEY + ) metrics_collected_on_replicas = total_requests > 0 for handle_metric in self._handle_requests.values(): total_requests += handle_metric.queued_requests if not metrics_collected_on_replicas: - for id in self._running_replicas: - if id in handle_metric.running_requests: - total_requests += handle_metric.running_requests[id] + for replica_id in self._running_replicas: + if replica_id in handle_metric.aggregated_metrics.get( + RUNNING_REQUESTS_KEY + ): + total_requests += handle_metric.aggregated_metrics.get( + RUNNING_REQUESTS_KEY + ).get(replica_id) return total_requests @@ -379,39 +365,26 @@ 
def is_within_bounds( ) def record_request_metrics_for_replica( - self, replica_id: ReplicaID, window_avg: Optional[float], send_timestamp: float + self, replica_metric_report: ReplicaMetricReport ) -> None: - deployment_id = replica_id.deployment_id + deployment_id = replica_metric_report.replica_id.deployment_id # Defensively guard against delayed replica metrics arriving # after the deployment's been deleted if deployment_id in self._autoscaling_states: self._autoscaling_states[deployment_id].record_request_metrics_for_replica( - replica_id=replica_id, - window_avg=window_avg, - send_timestamp=send_timestamp, + replica_metric_report ) def record_request_metrics_for_handle( self, - *, - deployment_id: str, - handle_id: str, - actor_id: Optional[str], - handle_source: DeploymentHandleSource, - queued_requests: float, - running_requests: Dict[ReplicaID, float], - send_timestamp: float, + handle_metric_report: HandleMetricReport, ) -> None: """Update request metric for a specific handle.""" + deployment_id = handle_metric_report.deployment_id if deployment_id in self._autoscaling_states: self._autoscaling_states[deployment_id].record_request_metrics_for_handle( - handle_id=handle_id, - actor_id=actor_id, - handle_source=handle_source, - queued_requests=queued_requests, - running_requests=running_requests, - send_timestamp=send_timestamp, + handle_metric_report ) def drop_stale_handle_metrics(self, alive_serve_actor_ids: Set[str]) -> None: diff --git a/python/ray/serve/_private/benchmarks/common.py b/python/ray/serve/_private/benchmarks/common.py index f5daad3d493f..1c51801662f7 100644 --- a/python/ray/serve/_private/benchmarks/common.py +++ b/python/ray/serve/_private/benchmarks/common.py @@ -170,6 +170,43 @@ def __call__(self, *args, **kwargs): return b"" +@serve.deployment +class ModelComp: + def __init__(self, child): + logging.getLogger("ray.serve").setLevel(logging.WARNING) + self._child = child + + async def __call__(self, *args, **kwargs): + return await 
self._child.remote() + + +@serve.deployment +class GrpcDeployment: + def __init__(self): + logging.getLogger("ray.serve").setLevel(logging.WARNING) + + async def grpc_call(self, user_message): + return serve_pb2.ModelOutput(output=9) + + async def call_with_string(self, user_message): + return serve_pb2.ModelOutput(output=9) + + +@serve.deployment +class GrpcModelComp: + def __init__(self, child): + logging.getLogger("ray.serve").setLevel(logging.WARNING) + self._child = child + + async def grpc_call(self, user_message): + await self._child.remote() + return serve_pb2.ModelOutput(output=9) + + async def call_with_string(self, user_message): + await self._child.remote() + return serve_pb2.ModelOutput(output=9) + + @serve.deployment class Streamer: def __init__(self, tokens_per_request: int, inter_token_delay_ms: int = 10): diff --git a/python/ray/serve/_private/benchmarks/locust_utils.py b/python/ray/serve/_private/benchmarks/locust_utils.py new file mode 100644 index 000000000000..7949b69ea52e --- /dev/null +++ b/python/ray/serve/_private/benchmarks/locust_utils.py @@ -0,0 +1,279 @@ +import argparse +import logging +import time +from dataclasses import asdict, dataclass +from typing import Any, Dict, List + +from ray.serve._private.utils import generate_request_id + +logger = logging.getLogger(__file__) +logging.basicConfig(level=logging.INFO) + +MASTER_PORT = 5557 + + +@dataclass +class LocustStage: + duration_s: int + users: int + spawn_rate: float + + +@dataclass +class PerformanceStats: + p50_latency: float + p90_latency: float + p99_latency: float + rps: float + + +@dataclass +class LocustTestResults: + history: List[Dict] + total_requests: int + num_failures: int + avg_latency: float + p50_latency: float + p90_latency: float + p99_latency: float + avg_rps: float + stats_in_stages: List[PerformanceStats] + + +@dataclass +class FailedRequest: + request_id: str + status_code: int + exception: str + response_time_ms: float + start_time_s: float + + +class 
LocustClient: + def __init__( + self, + host_url: str, + token: str, + data: Dict[str, Any] = None, + ): + from locust import FastHttpUser, constant, events, task + from locust.contrib.fasthttp import FastResponse + + self.errors = [] + self.stats_in_stages: List[PerformanceStats] = [] + + class EndpointUser(FastHttpUser): + wait_time = constant(0) + failed_requests = [] + host = host_url + + @task + def test(self): + request_id = generate_request_id() + headers = ( + {"Authorization": f"Bearer {token}", "X-Request-ID": request_id} + if token + else None + ) + with self.client.get( + "", headers=headers, json=data, catch_response=True + ) as r: + r.request_meta["context"]["request_id"] = request_id + + @events.request.add_listener + def on_request( + response: FastResponse, + exception, + context, + start_time: float, + response_time: float, + **kwargs, + ): + if exception and response.status_code != 0: + request_id = context["request_id"] + print( + f"Request '{request_id}' failed with exception:\n" + f"{exception}\n{response.text}" + ) + + if response.status_code != 0: + response.encoding = "utf-8" + err = FailedRequest( + request_id=request_id, + status_code=response.status_code, + exception=response.text, + response_time_ms=response_time, + start_time_s=start_time, + ) + self.errors.append(err) + print( + f"Request '{request_id}' failed with exception:\n" + f"{exception}\n{response.text}" + ) + + self.user_class = EndpointUser + + +def on_stage_finished(master_runner, stats_in_stages): + stats_entry_key = ("", "GET") + stats_entry = master_runner.stats.entries.get(stats_entry_key) + + stats_in_stages.append( + PerformanceStats( + p50_latency=stats_entry.get_current_response_time_percentile(0.5), + p90_latency=stats_entry.get_current_response_time_percentile(0.9), + p99_latency=stats_entry.get_current_response_time_percentile(0.99), + rps=stats_entry.current_rps, + ) + ) + + +def run_locust_worker( + master_address: str, host_url: str, token: str, data: 
Dict[str, Any] +): + import locust + from locust.env import Environment + from locust.log import setup_logging + + setup_logging("INFO") + client = LocustClient(host_url=host_url, token=token, data=data) + env = Environment(user_classes=[client.user_class], events=locust.events) + + runner = env.create_worker_runner( + master_host=master_address, master_port=MASTER_PORT + ) + runner.greenlet.join() + + if client.errors: + raise RuntimeError(f"There were {len(client.errors)} errors: {client.errors}") + + +def run_locust_master( + host_url: str, + token: str, + expected_num_workers: int, + stages: List[LocustStage], + wait_for_workers_timeout_s: float, +): + import gevent + import locust + from locust import LoadTestShape + from locust.env import Environment + from locust.stats import ( + get_error_report_summary, + get_percentile_stats_summary, + get_stats_summary, + stats_history, + stats_printer, + ) + + client = LocustClient(host_url, token) + + class StagesShape(LoadTestShape): + curr_stage_ix = 0 + + def tick(cls): + run_time = cls.get_run_time() + prefix_time = 0 + for i, stage in enumerate(stages): + prefix_time += stage.duration_s + + if run_time < prefix_time: + if i != cls.curr_stage_ix: + on_stage_finished(master_runner, client.stats_in_stages) + cls.curr_stage_ix = i + + current_stage = stages[cls.curr_stage_ix] + return current_stage.users, current_stage.spawn_rate + + # End of stage test + on_stage_finished(master_runner, client.stats_in_stages) + + master_env = Environment( + user_classes=[client.user_class], + shape_class=StagesShape(), + events=locust.events, + ) + master_runner = master_env.create_master_runner("*", MASTER_PORT) + + start = time.time() + while len(master_runner.clients.ready) < expected_num_workers: + if time.time() - start > wait_for_workers_timeout_s: + raise RuntimeError( + f"Timed out waiting for {expected_num_workers} workers to " + "connect to Locust master." 
+ ) + + print( + f"Waiting for workers to be ready, " + f"{len(master_runner.clients.ready)} " + f"of {expected_num_workers} ready." + ) + time.sleep(1) + + # Periodically output current stats (each entry is aggregated + # stats over the past 10 seconds, by default) + gevent.spawn(stats_printer(master_env.stats)) + gevent.spawn(stats_history, master_runner) + + # Start test & wait for the shape test to finish + master_runner.start_shape() + master_runner.shape_greenlet.join() + # Send quit signal to all locust workers + master_runner.quit() + + # Print stats + for line in get_stats_summary(master_runner.stats, current=False): + print(line) + # Print percentile stats + for line in get_percentile_stats_summary(master_runner.stats): + print(line) + # Print error report + if master_runner.stats.errors: + for line in get_error_report_summary(master_runner.stats): + print(line) + + stats_entry_key = ("", "GET") + stats_entry = master_runner.stats.entries.get(stats_entry_key) + results = LocustTestResults( + history=master_runner.stats.history, + total_requests=master_runner.stats.num_requests, + num_failures=master_runner.stats.num_failures, + avg_latency=stats_entry.avg_response_time, + p50_latency=stats_entry.get_response_time_percentile(0.5), + p90_latency=stats_entry.get_response_time_percentile(0.9), + p99_latency=stats_entry.get_response_time_percentile(0.99), + avg_rps=stats_entry.total_rps, + stats_in_stages=client.stats_in_stages, + ) + return asdict(results) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--worker-type", type=str, required=True) + parser.add_argument("--host-url", type=str, required=True) + parser.add_argument("--token", type=str, required=True) + parser.add_argument("--master-address", type=str, required=False) + parser.add_argument("--expected-num-workers", type=int, required=False) + parser.add_argument("--stages", type=str, required=False) + parser.add_argument("--wait-for-workers-timeout-s", type=float, 
required=False) + args = parser.parse_args() + host_url = args.host_url + token = args.token + if args.worker_type == "master": + results = run_locust_master( + host_url, + token, + args.expected_num_workers, + args.stages, + args.wait_for_workers_timeout_s, + ) + else: + results = run_locust_worker(args.master_address, host_url, token, args.data) + + print(results) + + +if __name__ == "__main__": + main() diff --git a/python/ray/serve/_private/client.py b/python/ray/serve/_private/client.py index ffec15fa89c2..b871553abda0 100644 --- a/python/ray/serve/_private/client.py +++ b/python/ray/serve/_private/client.py @@ -1,3 +1,4 @@ +import asyncio import logging import random import time @@ -80,18 +81,31 @@ def http_config(self): def __reduce__(self): raise RayServeException(("Ray Serve client cannot be serialized.")) - def shutdown_cached_handles(self, _skip_asyncio_check: bool = False): + def shutdown_cached_handles(self): """Shuts down all cached handles. Remove the reference to the cached handles so that they can be garbage collected. """ for cache_key in list(self.handle_cache): - self.handle_cache[cache_key].shutdown( - _skip_asyncio_check=_skip_asyncio_check - ) + self.handle_cache[cache_key].shutdown() + del self.handle_cache[cache_key] + + async def shutdown_cached_handles_async(self): + """Shuts down all cached handles asynchronously. + + Remove the reference to the cached handles so that they can be + garbage collected. + """ + + async def shutdown_task(cache_key): + await self.handle_cache[cache_key].shutdown_async() del self.handle_cache[cache_key] + await asyncio.gather( + *[shutdown_task(cache_key) for cache_key in list(self.handle_cache)] + ) + def shutdown(self, timeout_s: float = 30.0) -> None: """Completely shut down the connected Serve instance. 
@@ -113,6 +127,29 @@ def shutdown(self, timeout_s: float = 30.0) -> None: ) self._shutdown = True + async def shutdown_async(self, timeout_s: float = 30.0) -> None: + """Completely shut down the connected Serve instance. + + Shuts down all processes and deletes all state associated with the + instance. + """ + await self.shutdown_cached_handles_async() + + if ray.is_initialized() and not self._shutdown: + try: + await asyncio.wait_for( + self._controller.graceful_shutdown.remote(), timeout=timeout_s + ) + except ray.exceptions.RayActorError: + # Controller has been shut down. + pass + except TimeoutError: + logger.warning( + f"Controller failed to shut down within {timeout_s}s. " + "Check controller logs for more details." + ) + self._shutdown = True + def _wait_for_deployment_healthy(self, name: str, timeout_s: int = -1): """Waits for the named deployment to enter "HEALTHY" status. diff --git a/python/ray/serve/_private/common.py b/python/ray/serve/_private/common.py index 5746e26a56ae..5493560ac988 100644 --- a/python/ray/serve/_private/common.py +++ b/python/ray/serve/_private/common.py @@ -746,3 +746,93 @@ class CreatePlacementGroupRequest: target_node_id: str name: str runtime_env: Optional[str] = None + + +# This error is used to raise when a by-value DeploymentResponse is converted to an +# ObjectRef. +OBJ_REF_NOT_SUPPORTED_ERROR = RuntimeError( + "Converting by-value DeploymentResponses to ObjectRefs is not supported. " + "Use handle.options(_by_reference=True) to enable it." +) + +RUNNING_REQUESTS_KEY = "running_requests" + + +@dataclass(order=True) +class TimeStampedValue: + timestamp: float + value: float = field(compare=False) + + +@dataclass +class HandleMetricReport: + """Report from a deployment handle on queued and ongoing requests. + + Args: + deployment_id: The deployment ID of the deployment handle. + handle_id: The handle ID of the deployment handle. 
+ actor_id: If the deployment handle (from which this metric was + sent) lives on an actor, the ID of that actor. + handle_source: Describes what kind of entity holds this + deployment handle: a Serve proxy, a Serve replica, or + unknown. + queued_requests: The current number of queued requests at the + handle, i.e. requests that haven't been assigned to any + replica yet. + aggregated_metrics: A map of metric name to the aggregated value over the past + look_back_period_s seconds at the handle for each replica. + metrics: A map of metric name to the list of values running at that handle for each replica + over the past look_back_period_s seconds. This is a list because + we take multiple measurements over time. + timestamp: The time at which this report was created. + """ + + deployment_id: DeploymentID + handle_id: str + actor_id: str + handle_source: DeploymentHandleSource + queued_requests: float + aggregated_metrics: Dict[str, Dict[ReplicaID, float]] + metrics: Dict[str, Dict[ReplicaID, List[float]]] + timestamp: float + + @property + def total_requests(self) -> float: + """Total number of queued and running requests.""" + return self.queued_requests + sum( + self.aggregated_metrics.get(RUNNING_REQUESTS_KEY, {}).values() + ) + + @property + def is_serve_component_source(self) -> bool: + """Whether the handle source is a Serve actor. + + More specifically, this returns whether a Serve actor tracked + by the controller holds the deployment handle that sent this + report. If the deployment handle lives on a driver, a Ray task, + or an actor that's not a Serve replica, then this returns False. + """ + return self.handle_source in [ + DeploymentHandleSource.PROXY, + DeploymentHandleSource.REPLICA, + ] + + +@dataclass +class ReplicaMetricReport: + """Report from a replica on ongoing requests. + + Args: + replica_id: The replica ID of the replica. 
+ aggregated_metrics: A map of metric name to the aggregated value over the past + look_back_period_s seconds at the replica. + metrics: A map of metric name to the list of values running at that replica + over the past look_back_period_s seconds. This is a list because + we take multiple measurements over time. + timestamp: The time at which this report was created. + """ + + replica_id: ReplicaID + aggregated_metrics: Dict[str, float] + metrics: Dict[str, List[float]] + timestamp: float diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py index 166abcae2246..a670fdd53616 100644 --- a/python/ray/serve/_private/constants.py +++ b/python/ray/serve/_private/constants.py @@ -4,7 +4,9 @@ get_env_bool, get_env_float, get_env_float_non_negative, + get_env_float_positive, get_env_int, + get_env_int_non_negative, get_env_int_positive, get_env_str, parse_latency_buckets, @@ -51,15 +53,20 @@ #: Max retry count for allowing failures in replica constructor. #: If no replicas at target version is running by the time we're at -#: max construtor retry count, deploy() is considered failed. +#: max constructor retry count, deploy() is considered failed. 
#: By default we set threshold as min(num_replicas * 3, this value) MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT = get_env_int( - "MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT", 20 + "RAY_SERVE_MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT", + get_env_int("MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT", 20), ) # Max retry on deployment constructor is # min(num_replicas * MAX_PER_REPLICA_RETRY_COUNT, MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT) -MAX_PER_REPLICA_RETRY_COUNT = get_env_int("MAX_PER_REPLICA_RETRY_COUNT", 3) +MAX_PER_REPLICA_RETRY_COUNT = get_env_int( + "RAY_SERVE_MAX_PER_REPLICA_RETRY_COUNT", + get_env_int("MAX_PER_REPLICA_RETRY_COUNT", 3), +) + # If you are wondering why we are using histogram buckets, please refer to # https://prometheus.io/docs/practices/histograms/ @@ -100,11 +107,19 @@ # RAY_SERVE_MODEL_LOAD_LATENCY_BUCKET_MS="1,2,3,4" #: Histogram buckets for request latency. REQUEST_LATENCY_BUCKETS_MS = parse_latency_buckets( - get_env_str("REQUEST_LATENCY_BUCKETS_MS", ""), DEFAULT_LATENCY_BUCKET_MS + get_env_str( + "RAY_SERVE_REQUEST_LATENCY_BUCKETS_MS", + get_env_str("REQUEST_LATENCY_BUCKETS_MS", ""), + ), + DEFAULT_LATENCY_BUCKET_MS, ) #: Histogram buckets for model load/unload latency. MODEL_LOAD_LATENCY_BUCKETS_MS = parse_latency_buckets( - get_env_str("MODEL_LOAD_LATENCY_BUCKETS_MS", ""), DEFAULT_LATENCY_BUCKET_MS + get_env_str( + "RAY_SERVE_MODEL_LOAD_LATENCY_BUCKETS_MS", + get_env_str("MODEL_LOAD_LATENCY_BUCKETS_MS", ""), + ), + DEFAULT_LATENCY_BUCKET_MS, ) #: Name of deployment health check method implemented by user. @@ -117,11 +132,16 @@ #: Limit the number of cached handles because each handle has long poll #: overhead. 
See https://github.com/ray-project/ray/issues/18980 -MAX_CACHED_HANDLES = get_env_int_positive("MAX_CACHED_HANDLES", 100) +MAX_CACHED_HANDLES = get_env_int_positive( + "RAY_SERVE_MAX_CACHED_HANDLES", get_env_int_positive("MAX_CACHED_HANDLES", 100) +) #: Because ServeController will accept one long poll request per handle, its #: concurrency needs to scale as O(num_handles) -CONTROLLER_MAX_CONCURRENCY = get_env_int_positive("CONTROLLER_MAX_CONCURRENCY", 15_000) +CONTROLLER_MAX_CONCURRENCY = get_env_int_positive( + "RAY_SERVE_CONTROLLER_MAX_CONCURRENCY", + get_env_int_positive("CONTROLLER_MAX_CONCURRENCY", 15_000), +) DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT_S = 20 DEFAULT_GRACEFUL_SHUTDOWN_WAIT_LOOP_S = 2 @@ -131,15 +151,15 @@ DEFAULT_TARGET_ONGOING_REQUESTS = 2 # HTTP Proxy health check configs -PROXY_HEALTH_CHECK_TIMEOUT_S = ( - get_env_float("RAY_SERVE_PROXY_HEALTH_CHECK_TIMEOUT_S", 10.0) or 10.0 +PROXY_HEALTH_CHECK_TIMEOUT_S = get_env_float_positive( + "RAY_SERVE_PROXY_HEALTH_CHECK_TIMEOUT_S", 10.0 ) -PROXY_HEALTH_CHECK_PERIOD_S = ( - get_env_float("RAY_SERVE_PROXY_HEALTH_CHECK_PERIOD_S", 10.0) or 10.0 +PROXY_HEALTH_CHECK_PERIOD_S = get_env_float_positive( + "RAY_SERVE_PROXY_HEALTH_CHECK_PERIOD_S", 10.0 ) -PROXY_READY_CHECK_TIMEOUT_S = ( - get_env_float("RAY_SERVE_PROXY_READY_CHECK_TIMEOUT_S", 5.0) or 5.0 +PROXY_READY_CHECK_TIMEOUT_S = get_env_float_positive( + "RAY_SERVE_PROXY_READY_CHECK_TIMEOUT_S", 5.0 ) # Number of times in a row that a HTTP proxy must fail the health check before @@ -147,8 +167,8 @@ PROXY_HEALTH_CHECK_UNHEALTHY_THRESHOLD = 3 # The minimum drain period for a HTTP proxy. -PROXY_MIN_DRAINING_PERIOD_S = ( - get_env_float("RAY_SERVE_PROXY_MIN_DRAINING_PERIOD_S", 30.0) or 30.0 +PROXY_MIN_DRAINING_PERIOD_S = get_env_float_positive( + "RAY_SERVE_PROXY_MIN_DRAINING_PERIOD_S", 30.0 ) # The time in seconds that the http proxy state waits before # rechecking whether the proxy actor is drained or not. 
@@ -166,7 +186,7 @@ CLIENT_CHECK_CREATION_POLLING_INTERVAL_S = 0.1 # Timeout for GCS internal KV service -RAY_SERVE_KV_TIMEOUT_S = get_env_float("RAY_SERVE_KV_TIMEOUT_S", 0.0) or None +RAY_SERVE_KV_TIMEOUT_S = get_env_float_positive("RAY_SERVE_KV_TIMEOUT_S", None) # Timeout for GCS RPC request RAY_GCS_RPC_TIMEOUT_S = 3.0 @@ -225,13 +245,15 @@ "skip_context_filter", } -RAY_SERVE_HTTP_KEEP_ALIVE_TIMEOUT_S = get_env_int( +RAY_SERVE_HTTP_KEEP_ALIVE_TIMEOUT_S = get_env_int_non_negative( "RAY_SERVE_HTTP_KEEP_ALIVE_TIMEOUT_S", 0 ) RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S = ( - get_env_float("RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S", 0.0) - or get_env_float("SERVE_REQUEST_PROCESSING_TIMEOUT_S", 0.0) + get_env_float_non_negative( + "RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S", + get_env_float_non_negative("SERVE_REQUEST_PROCESSING_TIMEOUT_S", 0.0), + ) or None ) @@ -286,11 +308,11 @@ # Serve multiplexed matching timeout. # This is the timeout for the matching process of multiplexed requests. To avoid -# thundering herd problem, the timeout value will be randomed between this value +# thundering herd problem, the timeout value will be randomized between this value # and this value * 2. The unit is second. # If the matching process takes longer than the timeout, the request will be # fallen to the default routing strategy. -RAY_SERVE_MULTIPLEXED_MODEL_ID_MATCHING_TIMEOUT_S = get_env_float( +RAY_SERVE_MULTIPLEXED_MODEL_ID_MATCHING_TIMEOUT_S = get_env_float_non_negative( "RAY_SERVE_MULTIPLEXED_MODEL_ID_MATCHING_TIMEOUT_S", 1.0 ) @@ -325,7 +347,7 @@ ) # Length of time to respect entries in the queue length cache when routing requests. -RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S = get_env_float( +RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S = get_env_float_non_negative( "RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S", 10.0 ) @@ -343,7 +365,9 @@ ) # The default autoscaling policy to use if none is specified. 
-DEFAULT_AUTOSCALING_POLICY = "ray.serve.autoscaling_policy:default_autoscaling_policy" +DEFAULT_AUTOSCALING_POLICY_NAME = ( + "ray.serve.autoscaling_policy:default_autoscaling_policy" +) # Feature flag to enable collecting all queued and ongoing request # metrics at handles instead of replicas. ON by default. @@ -351,7 +375,7 @@ "RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE", "1" ) -RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S = get_env_float( +RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S = get_env_float_non_negative( "RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S", 10.0 ) @@ -413,7 +437,7 @@ ) # Used for gc.set_threshold() when proxy GC optimizations are enabled. -RAY_SERVE_PROXY_GC_THRESHOLD = get_env_int("RAY_SERVE_PROXY_GC_THRESHOLD", 10_000) +RAY_SERVE_PROXY_GC_THRESHOLD = get_env_int("RAY_SERVE_PROXY_GC_THRESHOLD", 700) # Interval at which cached metrics will be exported using the Ray metric API. # Set to `0` to disable caching entirely. @@ -457,5 +481,23 @@ "RAY_SERVE_REQUEST_PATH_LOG_BUFFER_SIZE", 1 ) +# Feature flag to fail the deployment if the rank is not set. +# TODO (abrar): Remove this flag after the feature is stable. +RAY_SERVE_FAIL_ON_RANK_ERROR = get_env_bool("RAY_SERVE_FAIL_ON_RANK_ERROR", "0") + # The message to return when the replica is healthy. HEALTHY_MESSAGE = "success" + +# If throughput optimized Ray Serve is enabled, set the following constants. +# This should be at the end. +RAY_SERVE_THROUGHPUT_OPTIMIZED = get_env_bool("RAY_SERVE_THROUGHPUT_OPTIMIZED", "0") +if RAY_SERVE_THROUGHPUT_OPTIMIZED: + RAY_SERVE_RUN_USER_CODE_IN_SEPARATE_THREAD = False + RAY_SERVE_REQUEST_PATH_LOG_BUFFER_SIZE = 1000 + RAY_SERVE_RUN_ROUTER_IN_SEPARATE_LOOP = False + RAY_SERVE_LOG_TO_STDERR = False + +# The maximum allowed RPC latency in milliseconds. +# This is used to detect and warn about long RPC latencies +# between the controller and the replicas. 
+RAY_SERVE_RPC_LATENCY_WARNING_THRESHOLD_MS = 2000 diff --git a/python/ray/serve/_private/constants_utils.py b/python/ray/serve/_private/constants_utils.py index d06790701930..019df87d86b8 100644 --- a/python/ray/serve/_private/constants_utils.py +++ b/python/ray/serve/_private/constants_utils.py @@ -1,4 +1,5 @@ import os +import warnings from typing import Callable, List, Optional, Type, TypeVar @@ -46,30 +47,75 @@ def parse_latency_buckets(bucket_str: str, default_buckets: List[float]) -> List T = TypeVar("T") +# todo: remove for the '3.0.0' release. +_wrong_names_white_list = { + "MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT", + "MAX_PER_REPLICA_RETRY_COUNT", + "REQUEST_LATENCY_BUCKETS_MS", + "MODEL_LOAD_LATENCY_BUCKETS_MS", + "MAX_CACHED_HANDLES", + "CONTROLLER_MAX_CONCURRENCY", + "SERVE_REQUEST_PROCESSING_TIMEOUT_S", +} + + +def _validate_name(name: str) -> None: + """Validate Ray Serve environment variable name.""" + required_prefix = "RAY_SERVE_" + + if not name.startswith(required_prefix): + if name in _wrong_names_white_list: + return + + raise ValueError( + f"Got unexpected environment variable name `{name}`! " + f"Ray Serve environment variables require prefix `{required_prefix}`. " + ) + def _get_env_value( name: str, - default: T, + default: Optional[T], value_type: Type[T], - value_requirement: Optional[Callable[[T], bool]] = None, - error_message: str = None, -) -> T: + validation_func: Optional[Callable[[T], bool]] = None, + expected_value_description: Optional[str] = None, +) -> Optional[T]: """Get environment variable with type conversion and validation. + This function retrieves an environment variable, converts it to the specified type, + and optionally validates the converted value. + Args: name: The name of the environment variable. default: Default value to use if the environment variable is not set. - value_type: Type to convert the environment variable value to. - value_requirement: Optional function to validate the converted value. 
- error_message: Error message description for validation failures. + If None, the function will return None without validation. + value_type: Type to convert the environment variable value to (e.g., int, float, str). + validation_func: Optional function that takes the converted value and returns + a boolean indicating whether the value is valid. + expected_value_description: Description of the expected value characteristics + (e.g., "positive", "non-negative") used in error messages. + Optional, expected only if validation_func is provided. Returns: - The converted and validated environment variable value. + The environment variable value converted to the specified type and validated, + or the default value if the environment variable is not set. Raises: - ValueError: If type conversion fails or validation fails. + ValueError: If the environment variable value cannot be converted to the specified + type, or if it fails the optional validation check. Also, if name validation fails. """ - raw = os.environ.get(name, default) + _validate_name(name) + + explicitly_defined_value = os.environ.get(name) + if explicitly_defined_value is None: + if default is None: + return None + else: + raw = default + else: + _deprecation_warning(name) + raw = explicitly_defined_value + try: value = value_type(raw) except ValueError as e: @@ -77,15 +123,16 @@ def _get_env_value( f"Environment variable `{name}` value `{raw}` cannot be converted to `{value_type.__name__}`!" ) from e - if value_requirement and not value_requirement(value): + if validation_func and not validation_func(value): raise ValueError( - f"Got unexpected value `{value}` for `{name}` environment variable! Expected {error_message} `{value_type.__name__}`." + f"Got unexpected value `{value}` for `{name}` environment variable! " + f"Expected {expected_value_description} `{value_type.__name__}`." 
) return value -def get_env_int(name: str, default: int) -> int: +def get_env_int(name: str, default: Optional[int]) -> Optional[int]: """Get environment variable as an integer. Args: @@ -101,7 +148,7 @@ def get_env_int(name: str, default: int) -> int: return _get_env_value(name, default, int) -def get_env_int_positive(name: str, default: int) -> int: +def get_env_int_positive(name: str, default: Optional[int]) -> Optional[int]: """Get environment variable as a positive integer. Args: @@ -117,7 +164,7 @@ def get_env_int_positive(name: str, default: int) -> int: return _get_env_value(name, default, int, lambda x: x > 0, "positive") -def get_env_int_non_negative(name: str, default: int) -> int: +def get_env_int_non_negative(name: str, default: Optional[int]) -> Optional[int]: """Get environment variable as a non-negative integer. Args: @@ -133,7 +180,7 @@ def get_env_int_non_negative(name: str, default: int) -> int: return _get_env_value(name, default, int, lambda x: x >= 0, "non negative") -def get_env_float(name: str, default: float) -> float: +def get_env_float(name: str, default: Optional[float]) -> Optional[float]: """Get environment variable as a float. Args: @@ -149,7 +196,7 @@ def get_env_float(name: str, default: float) -> float: return _get_env_value(name, default, float) -def get_env_float_positive(name: str, default: float) -> float: +def get_env_float_positive(name: str, default: Optional[float]) -> Optional[float]: """Get environment variable as a positive float. Args: @@ -165,7 +212,7 @@ def get_env_float_positive(name: str, default: float) -> float: return _get_env_value(name, default, float, lambda x: x > 0, "positive") -def get_env_float_non_negative(name: str, default: float) -> float: +def get_env_float_non_negative(name: str, default: Optional[float]) -> Optional[float]: """Get environment variable as a non-negative float. 
Args: @@ -181,7 +228,7 @@ def get_env_float_non_negative(name: str, default: float) -> float: return _get_env_value(name, default, float, lambda x: x >= 0, "non negative") -def get_env_str(name: str, default: Optional[str]) -> str: +def get_env_str(name: str, default: Optional[str]) -> Optional[str]: """Get environment variable as a string. Args: @@ -190,11 +237,12 @@ def get_env_str(name: str, default: Optional[str]) -> str: Returns: The environment variable value as a string. + Returns `None` if default is `None` and value not found. """ - return os.environ.get(name, default) + return _get_env_value(name, default, str) -def get_env_bool(name: str, default: Optional[str]) -> bool: +def get_env_bool(name: str, default: str) -> bool: """Get environment variable as a boolean. Environment variable values of "1" are interpreted as True, all others as False. @@ -202,8 +250,42 @@ def get_env_bool(name: str, default: Optional[str]) -> bool: Args: name: The name of the environment variable. default: Default value to use if the environment variable is not set. + Expects "0" or "1". Returns: True if the environment variable value is "1", False otherwise. """ - return os.environ.get(name, default) == "1" + env_value_str = _get_env_value(name, default, str) + return env_value_str == "1" + + +def _deprecation_warning(name: str) -> None: + """Log replacement warning for wrong or legacy environment variables. + + TODO: remove this function for the '3.0.0' release. 
+ + :param name: environment variable name + """ + + def get_new_name(name: str) -> str: + if name == "RAY_SERVE_HANDLE_METRIC_PUSH_INTERVAL_S": + return "RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S" + elif name == "SERVE_REQUEST_PROCESSING_TIMEOUT_S": + return "RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S" + else: + return f"{required_prefix}{name}" + + change_version = "3.0.0" + required_prefix = "RAY_SERVE_" + + if ( + name in _wrong_names_white_list + or name == "RAY_SERVE_HANDLE_METRIC_PUSH_INTERVAL_S" + ): + new_name = get_new_name(name) + warnings.warn( + f"Starting from version `{change_version}` environment variable " + f"`{name}` will be deprecated. Please use `{new_name}` instead.", + FutureWarning, + stacklevel=4, + ) diff --git a/python/ray/serve/_private/controller.py b/python/ray/serve/_private/controller.py index cc0724f20565..7e28d0d3f1ff 100644 --- a/python/ray/serve/_private/controller.py +++ b/python/ray/serve/_private/controller.py @@ -13,9 +13,11 @@ from ray.serve._private.application_state import ApplicationStateManager, StatusOverview from ray.serve._private.autoscaling_state import AutoscalingStateManager from ray.serve._private.common import ( - DeploymentHandleSource, + RUNNING_REQUESTS_KEY, DeploymentID, + HandleMetricReport, NodeId, + ReplicaMetricReport, RequestProtocol, RequestRoutingInfo, RunningReplicaInfo, @@ -25,6 +27,7 @@ from ray.serve._private.constants import ( CONTROL_LOOP_INTERVAL_S, RAY_SERVE_CONTROLLER_CALLBACK_IMPORT_PATH, + RAY_SERVE_RPC_LATENCY_WARNING_THRESHOLD_MS, RECOVERING_LONG_POLL_BROADCAST_TIMEOUT_S, SERVE_CONTROLLER_NAME, SERVE_DEFAULT_APP_NAME, @@ -64,6 +67,7 @@ EndpointSet, ) from ray.serve.schema import ( + APIType, ApplicationDetails, DeploymentDetails, HTTPOptionsSchema, @@ -259,38 +263,43 @@ def check_alive(self) -> None: def get_pid(self) -> int: return os.getpid() - def record_autoscaling_metrics( - self, replica_id: str, window_avg: Optional[float], send_timestamp: float + def 
record_autoscaling_metrics_from_replica( + self, replica_metric_report: ReplicaMetricReport ): logger.debug( - f"Received metrics from replica {replica_id}: {window_avg} running requests" + f"Received metrics from replica {replica_metric_report.replica_id}: {replica_metric_report.aggregated_metrics.get(RUNNING_REQUESTS_KEY)} running requests" ) + latency = time.time() - replica_metric_report.timestamp + latency_ms = latency * 1000 + if latency_ms > RAY_SERVE_RPC_LATENCY_WARNING_THRESHOLD_MS: + logger.warning( + f"Received autoscaling metrics from replica {replica_metric_report.replica_id} with timestamp {replica_metric_report.timestamp} " + f"which is {latency_ms}ms ago. " + f"This is greater than the warning threshold RPC latency of {RAY_SERVE_RPC_LATENCY_WARNING_THRESHOLD_MS}ms. " + "This may indicate a performance issue with the controller try increasing the RAY_SERVE_RPC_LATENCY_WARNING_THRESHOLD_MS environment variable." + ) self.autoscaling_state_manager.record_request_metrics_for_replica( - replica_id, window_avg, send_timestamp + replica_metric_report ) - def record_handle_metrics( - self, - deployment_id: str, - handle_id: str, - actor_id: Optional[str], - handle_source: DeploymentHandleSource, - queued_requests: float, - running_requests: Dict[str, float], - send_timestamp: float, + def record_autoscaling_metrics_from_handle( + self, handle_metric_report: HandleMetricReport ): logger.debug( - f"Received metrics from handle {handle_id} for deployment {deployment_id}: " - f"{queued_requests} queued requests and {running_requests} running requests" + f"Received metrics from handle {handle_metric_report.handle_id} for deployment {handle_metric_report.deployment_id}: " + f"{handle_metric_report.queued_requests} queued requests and {handle_metric_report.aggregated_metrics[RUNNING_REQUESTS_KEY]} running requests" ) + latency = time.time() - handle_metric_report.timestamp + latency_ms = latency * 1000 + if latency_ms > RAY_SERVE_RPC_LATENCY_WARNING_THRESHOLD_MS: 
+ logger.warning( + f"Received autoscaling metrics from handle {handle_metric_report.handle_id} for deployment {handle_metric_report.deployment_id} with timestamp {handle_metric_report.timestamp} " + f"which is {latency_ms}ms ago. " + f"This is greater than the warning threshold RPC latency of {RAY_SERVE_RPC_LATENCY_WARNING_THRESHOLD_MS}ms. " + "This may indicate a performance issue with the controller try increasing the RAY_SERVE_RPC_LATENCY_WARNING_THRESHOLD_MS environment variable." + ) self.autoscaling_state_manager.record_request_metrics_for_handle( - deployment_id=deployment_id, - handle_id=handle_id, - actor_id=actor_id, - handle_source=handle_source, - queued_requests=queued_requests, - running_requests=running_requests, - send_timestamp=send_timestamp, + handle_metric_report ) def _dump_autoscaling_metrics_for_testing(self): @@ -442,8 +451,6 @@ async def run_control_loop_step( dsm_update_start_time = time.time() any_recovering = self.deployment_state_manager.update() - self.deployment_state_manager.save_checkpoint() - self.dsm_update_duration_gauge_s.set(time.time() - dsm_update_start_time) if not self.done_recovering_event.is_set() and not any_recovering: self.done_recovering_event.set() @@ -461,11 +468,6 @@ async def run_control_loop_step( asm_update_start_time = time.time() self.application_state_manager.update() - self.application_state_manager.save_checkpoint() - # ApplicationStateManager.update() can also mutate the - # DeploymentStateManager so we need to checkpoint that as well - self.deployment_state_manager.save_checkpoint() - self.asm_update_duration_gauge_s.set(time.time() - asm_update_start_time) except Exception: logger.exception("Exception updating application state.") @@ -659,6 +661,9 @@ def get_root_url(self): if SERVE_ROOT_URL_ENV_KEY in os.environ: return os.environ[SERVE_ROOT_URL_ENV_KEY] else: + # HTTP is disabled + if http_config.host is None: + return "" return ( f"http://{build_address(http_config.host, http_config.port)}" 
f"{http_config.root_path}" @@ -916,12 +921,17 @@ def list_deployment_ids(self) -> List[DeploymentID]: """Gets the current list of all deployments' identifiers.""" return self.deployment_state_manager._deployment_states.keys() - def get_serve_instance_details(self) -> Dict: + def get_serve_instance_details(self, source: Optional[APIType] = None) -> Dict: """Gets details on all applications on the cluster and system-level info. The information includes application and deployment statuses, config options, error messages, etc. + Args: + source: If provided, returns application + statuses for applications matching this API type. + Defaults to None, which means all applications are returned. + Returns: Dict that follows the format of the schema ServeInstanceDetails. """ @@ -930,7 +940,7 @@ def get_serve_instance_details(self) -> Dict: grpc_config = self.get_grpc_config() applications = {} - app_statuses = self.application_state_manager.list_app_statuses() + app_statuses = self.application_state_manager.list_app_statuses(source=source) # If there are no app statuses, there's no point getting the app configs. # Moreover, there might be no app statuses because the GCS is down, @@ -1107,6 +1117,15 @@ def record_request_routing_info(self, info: RequestRoutingInfo): """ self.deployment_state_manager.record_request_routing_info(info) + def _get_replica_ranks_mapping(self, deployment_id: DeploymentID) -> Dict[str, int]: + """Get the current rank mapping for all replicas in a deployment. + Args: + deployment_id: The deployment ID to get ranks for. + Returns: + Dictionary mapping replica_id to rank. + """ + return self.deployment_state_manager._get_replica_ranks_mapping(deployment_id) + async def graceful_shutdown(self, wait: bool = True): """Set the shutting down flag on controller to signal shutdown in run_control_loop(). 
diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py index 3f5cd5f2717d..dd9a136632b4 100644 --- a/python/ray/serve/_private/deployment_state.py +++ b/python/ray/serve/_private/deployment_state.py @@ -38,6 +38,7 @@ MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT, MAX_PER_REPLICA_RETRY_COUNT, RAY_SERVE_ENABLE_TASK_EVENTS, + RAY_SERVE_FAIL_ON_RANK_ERROR, RAY_SERVE_FORCE_STOP_UNHEALTHY_REPLICAS, RAY_SERVE_USE_COMPACT_SCHEDULING_STRATEGY, REPLICA_HEALTH_CHECK_UNHEALTHY_THRESHOLD, @@ -131,6 +132,7 @@ def create( placement_group_bundles=info.replica_config.placement_group_bundles, placement_group_strategy=info.replica_config.placement_group_strategy, max_replicas_per_node=info.replica_config.max_replicas_per_node, + route_prefix=info.route_prefix, ) return cls(info, target_num_replicas, version, deleting) @@ -246,8 +248,10 @@ def __init__( self._last_health_check_time: float = 0.0 self._consecutive_health_check_failures = 0 self._initialization_latency_s: Optional[float] = None - self._port: Optional[int] = None + self._internal_grpc_port: Optional[int] = None self._docs_path: Optional[str] = None + # Rank assigned to the replica. + self._rank: Optional[int] = None # Populated in `on_scheduled` or `recover`. self._actor_handle: ActorHandle = None self._placement_group: PlacementGroup = None @@ -260,6 +264,8 @@ def __init__( self._node_ip: str = None self._node_instance_id: str = None self._log_file_path: str = None + self._http_port: int = None + self._grpc_port: int = None # Populated in self.stop(). 
self._graceful_shutdown_ref: ObjectRef = None @@ -280,6 +286,10 @@ def replica_id(self) -> str: def deployment_name(self) -> str: return self._deployment_id.name + @property + def rank(self) -> Optional[int]: + return self._rank + @property def app_name(self) -> str: return self._deployment_id.app_name @@ -357,6 +367,14 @@ def health_check_period_s(self) -> float: def health_check_timeout_s(self) -> float: return self.deployment_config.health_check_timeout_s + @property + def http_port(self) -> Optional[int]: + return self._http_port + + @property + def grpc_port(self) -> Optional[int]: + return self._grpc_port + @property def request_routing_stats_period_s(self) -> float: return ( @@ -416,12 +434,15 @@ def initialization_latency_s(self) -> Optional[float]: return self._initialization_latency_s - def start(self, deployment_info: DeploymentInfo) -> ReplicaSchedulingRequest: + def start( + self, deployment_info: DeploymentInfo, rank: int + ) -> ReplicaSchedulingRequest: """Start the current DeploymentReplica instance. The replica will be in the STARTING and PENDING_ALLOCATION states until the deployment scheduler schedules the underlying actor. """ + self._rank = rank # Store the rank assigned to this replica self._actor_resources = deployment_info.replica_config.resource_dict self._ingress = deployment_info.ingress # it is currently not possible to create a placement group @@ -465,6 +486,7 @@ def start(self, deployment_info: DeploymentInfo) -> ReplicaSchedulingRequest: self._version, deployment_info.ingress, deployment_info.route_prefix, + rank, ) # TODO(simon): unify the constructor arguments across language elif ( @@ -580,7 +602,11 @@ def _format_user_config(self, user_config: Any): temp = msgpack_deserialize(temp) return temp - def reconfigure(self, version: DeploymentVersion) -> bool: + def reconfigure( + self, + version: DeploymentVersion, + rank: int, + ) -> bool: """ Update replica version. 
Also, updates the deployment config on the actor behind this DeploymentReplica instance if necessary. @@ -588,19 +614,30 @@ def reconfigure(self, version: DeploymentVersion) -> bool: Returns: whether the actor is being updated. """ updating = False - if self._version.requires_actor_reconfigure(version): + + # Determine if we need heavyweight reconfiguration + # vs lightweight updates + needs_actor_reconfigure = self._version.requires_actor_reconfigure(version) + has_rank_changes = self._rank != rank + + if needs_actor_reconfigure or has_rank_changes: # Call into replica actor reconfigure() with updated user config and # graceful_shutdown_wait_loop_s + # Setting updating=True because we want to transition to UPDATING state + # when rank is updated or deployment config changes. updating = True deployment_config = copy(version.deployment_config) deployment_config.user_config = self._format_user_config( deployment_config.user_config ) self._ready_obj_ref = self._actor_handle.reconfigure.remote( - deployment_config + deployment_config, + rank, + version.route_prefix, ) self._version = version + self._rank = rank return updating def recover(self) -> bool: @@ -725,8 +762,11 @@ def check_ready(self) -> Tuple[ReplicaStartupStatus, Optional[str]]: _, self._version, self._initialization_latency_s, - self._port, + self._internal_grpc_port, self._docs_path, + self._http_port, + self._grpc_port, + self._rank, ) = ray.get(self._ready_obj_ref) except RayTaskError as e: logger.exception( @@ -1037,7 +1077,7 @@ def get_running_replica_info( is_cross_language=self._actor.is_cross_language, multiplexed_model_ids=self.multiplexed_model_ids, routing_stats=self.routing_stats, - port=self._actor._port, + port=self._actor._internal_grpc_port, ) def record_multiplexed_model_ids(self, multiplexed_model_ids: List[str]): @@ -1098,6 +1138,14 @@ def actor_node_id(self) -> Optional[str]: """Returns the node id of the actor, None if not placed.""" return self._actor.node_id + @property + def 
actor_http_port(self) -> Optional[int]: + return self._actor.http_port + + @property + def actor_grpc_port(self) -> Optional[int]: + return self._actor.grpc_port + @property def actor_pid(self) -> Optional[int]: """Returns the node id of the actor, None if not placed.""" @@ -1109,24 +1157,30 @@ def initialization_latency_s(self) -> Optional[float]: return self._actor.initialization_latency_s - def start(self, deployment_info: DeploymentInfo) -> ReplicaSchedulingRequest: + def start( + self, deployment_info: DeploymentInfo, rank: int + ) -> ReplicaSchedulingRequest: """ Start a new actor for current DeploymentReplica instance. """ - replica_scheduling_request = self._actor.start(deployment_info) + replica_scheduling_request = self._actor.start(deployment_info, rank=rank) self._start_time = time.time() self._logged_shutdown_message = False self.update_actor_details(start_time_s=self._start_time) return replica_scheduling_request - def reconfigure(self, version: DeploymentVersion) -> bool: + def reconfigure( + self, + version: DeploymentVersion, + rank: int, + ) -> bool: """ Update replica version. Also, updates the deployment config on the actor behind this DeploymentReplica instance if necessary. Returns: whether the actor is being updated. """ - return self._actor.reconfigure(version) + return self._actor.reconfigure(version, rank=rank) def recover(self) -> bool: """ @@ -1144,6 +1198,11 @@ def recover(self) -> bool: self.update_actor_details(start_time_s=self._start_time) return True + @property + def rank(self) -> Optional[int]: + """Get the rank assigned to the replica.""" + return self._actor.rank + def check_started( self, ) -> Tuple[ReplicaStartupStatus, Optional[str], Optional[float]]: @@ -1388,6 +1447,268 @@ def __repr__(self): return repr(self._replicas) +class DeploymentRankManager: + """Manages replica ranks for a deployment. + This class handles rank assignment, release, consistency checking, and reassignment. 
+ It maintains the rank system invariants and provides a clean interface for rank operations. + """ + + def __init__(self, _fail_on_error: Optional[bool] = None): + # Maps replica_id to assigned rank + self._replica_ranks: Dict[str, int] = {} + # Set of available ranks (initially empty, grows as target replicas change) + self._released_ranks: Set[int] = set() + # Next rank to assign (increments as new replicas are created) + self._next_rank: int = 0 + # Whether to fail on rank errors (for testing control) + self._fail_on_error = ( + _fail_on_error + if _fail_on_error is not None + else RAY_SERVE_FAIL_ON_RANK_ERROR + ) + + def assign_rank(self, replica_id: str) -> int: + """Assign a rank to a new replica. + Args: + replica_id: The unique ID of the replica + Returns: + The assigned rank + Raises: + RuntimeError: If the replica already has a rank assigned + """ + if replica_id in self._replica_ranks: + raise RuntimeError( + f"Replica {replica_id} already has a rank assigned: {self._replica_ranks[replica_id]}" + ) + + # First try to reuse an available rank + if self._released_ranks: + rank = min(self._released_ranks) + self._released_ranks.remove(rank) + else: + # Otherwise use the next available rank + rank = self._next_rank + self._next_rank += 1 + + self._replica_ranks[replica_id] = rank + return rank + + def release_rank(self, replica_id: str) -> None: + """Release a rank when a replica is stopped. + Args: + replica_id: The unique ID of the replica whose rank should be released + """ + if replica_id not in self._replica_ranks: + raise RuntimeError(f"Replica {replica_id} has no rank assigned") + + rank = self._replica_ranks.pop(replica_id) + self._released_ranks.add(rank) + + def recover_rank(self, replica_id: str, rank: int) -> None: + """Recover a rank from a live replica during controller restart. 
+ Args: + replica_id: The unique ID of the replica + rank: The rank to recover + Raises: + RuntimeError: If the replica already has a rank or the rank is invalid + ValueError: If the rank is invalid (negative) + """ + if replica_id in self._replica_ranks: + raise RuntimeError(f"Replica {replica_id} already has a rank assigned") + + self._replica_ranks[replica_id] = rank + + # Update available ranks tracking + if rank in self._released_ranks: + self._released_ranks.remove(rank) + + # Update next_rank to ensure we don't assign duplicates + if rank >= self._next_rank: + self._next_rank = rank + 1 + + def get_replica_rank(self, replica_id: str) -> Optional[int]: + """Get the rank assigned to a replica. + Args: + replica_id: The unique ID of the replica + Returns: + The assigned rank, or None if no rank is assigned + """ + if replica_id not in self._replica_ranks: + raise RuntimeError(f"Replica {replica_id} has no rank assigned") + return self._replica_ranks.get(replica_id) + + def get_replica_ranks_mapping(self) -> Dict[str, int]: + """Get a copy of the current replica ranks mapping. + Returns: + A copy of the replica_id to rank mapping + """ + return self._replica_ranks.copy() + + def check_rank_consistency_and_reassign_minimally( + self, + active_replicas: List["DeploymentReplica"], + ) -> List["DeploymentReplica"]: + """Verify rank system invariants and reassign ranks when needed. + This method ensures: + 1. All active replicas have ranks + 2. No duplicate ranks exist + 3. 
Ranks are contiguous when at target replica count + Args: + active_replicas: List of currently active replicas + Returns: + List of replicas that need to be reconfigured with new ranks + Raises: + RuntimeError: If rank system invariants are violated + """ + if not active_replicas: + return [] + + active_replica_ids = { + replica.replica_id.unique_id for replica in active_replicas + } + replica_ids_needs_reconfiguration = set() + + # Check for stale ranks - this should never happen + stale_replica_ids = set(self._replica_ranks.keys()) - active_replica_ids + if stale_replica_ids: + logger.error( + f"Found stale ranks for replicas: {stale_replica_ids}. " + "This should never happen. Please report this as a bug." + ) + if self._fail_on_error: + raise RuntimeError("Controller rank system is in an invalid state.") + # TODO (abrar): handle this case by removing the stale ranks, but remove this when + # RAY_SERVE_FAIL_ON_RANK_ERROR is set to 1 in the future + for replica_id in stale_replica_ids: + self.release_rank(replica_id) + replica_ids_needs_reconfiguration.add(replica_id) + + # Verify system invariants - all active replicas must have ranks + unranked_replica_ids = active_replica_ids - set(self._replica_ranks.keys()) + if unranked_replica_ids: + logger.error( + f"Found active replicas without ranks: {unranked_replica_ids}. " + "This should never happen. Please report this as a bug." 
+ ) + if self._fail_on_error: + raise RuntimeError("Controller rank system is in an invalid state.") + # TODO (abrar): handle this case by assigning new ranks to the unranked replicas + # but remove this when RAY_SERVE_FAIL_ON_RANK_ERROR is set to 1 in the future + for replica_id in unranked_replica_ids: + self.assign_rank(replica_id) + replica_ids_needs_reconfiguration.add(replica_id) + + # Check for duplicate ranks - this should never happen + rank_counts = {} + for replica_id, rank in self._replica_ranks.copy().items(): + if replica_id in active_replica_ids: # Only check active replicas + rank_counts[rank] = rank_counts.get(rank, 0) + 1 + if rank_counts[rank] > 1: + logger.error( + f"Found duplicate rank {rank} assigned to multiple replicas. " + "This should never happen. Please report this as a bug." + ) + if self._fail_on_error: + raise RuntimeError( + "Controller rank system is in an invalid state." + ) + # TODO (abrar): handle this case by releasing the rank of the replica with the duplicate rank + # and assigning a new rank to the replica with the duplicate rank + # but remove this when RAY_SERVE_FAIL_ON_RANK_ERROR is set to 1 in the future + self._replica_ranks.pop(replica_id) + self.assign_rank(replica_id) + replica_ids_needs_reconfiguration.add(replica_id) + + # Check if we need to reassign ranks for contiguity + # Only force contiguity when at target replica count (e.g., after autoscaling down) + current_ranks = sorted(self._replica_ranks.values()) + expected_ranks = list(range(len(active_replicas))) + + replicas_needing_reconfiguration = [] + + if current_ranks != expected_ranks: + logger.info( + f"Deployment at target replica count but ranks are not contiguous. " + f"Current: {current_ranks}, Expected: {expected_ranks}. " + "Performing minimal reassignment." 
+ ) + replicas_needing_reconfiguration.extend( + self._perform_minimal_rank_reassignment(active_replicas) + ) + + # TODO (abrar): remove this when RAY_SERVE_FAIL_ON_RANK_ERROR is set to 1 in the future + for replica in active_replicas: + if replica.replica_id.unique_id in replica_ids_needs_reconfiguration: + replicas_needing_reconfiguration.append(replica) + + return replicas_needing_reconfiguration + + def _perform_minimal_rank_reassignment( + self, active_replicas: List["DeploymentReplica"] + ) -> List["DeploymentReplica"]: + """Perform minimal rank reassignment to achieve contiguity. + This method reassigns ranks while minimizing the number of replicas that need + to be reconfigured. It prioritizes keeping existing ranks when possible. + Args: + active_replicas: List of currently active replicas + Returns: + List of replicas that need to be reconfigured with new ranks + """ + target_ranks_set = set(range(len(active_replicas))) + + # Find which replicas need new ranks + replicas_needing_ranks = [] + replicas_keeping_ranks = [] + + for replica in active_replicas: + replica_id = replica.replica_id.unique_id + current_rank = self.get_replica_rank(replica_id) + + if current_rank in target_ranks_set: + # This replica can keep its rank + target_ranks_set.remove(current_rank) # O(1) operation + replicas_keeping_ranks.append(replica) + else: + # This replica needs a new rank + replicas_needing_ranks.append(replica) + + # Convert remaining target ranks to sorted list for deterministic assignment + available_ranks = sorted(target_ranks_set) + + # Assign new ranks to replicas that need them + for i, replica in enumerate(replicas_needing_ranks): + replica_id = replica.replica_id.unique_id + new_rank = available_ranks[i] # O(1) operation + + # Store the old rank before updating + old_rank = self._replica_ranks[replica_id] + + logger.info( + f"Reassigning replica {replica_id}: rank {old_rank} -> {new_rank}" + ) + + # Update the rank mapping + self._replica_ranks[replica_id] = 
new_rank + # Remove the newly assigned rank from available ranks + self._released_ranks.discard(new_rank) + # Add the old rank back to available ranks for reuse + self._released_ranks.add(old_rank) + + # Log the reassignment summary + logger.info( + f"Minimal reassignment complete: {len(replicas_keeping_ranks)} replicas kept ranks, " + f"{len(replicas_needing_ranks)} replicas reassigned" + ) + + return replicas_needing_ranks + + def clear(self) -> None: + """Clear all rank data. Used for testing and reset.""" + self._replica_ranks.clear() + self._released_ranks.clear() + self._next_rank = 0 + + class DeploymentState: """Manages the target state and replicas for a single deployment.""" @@ -1428,6 +1749,8 @@ def __init__( DeploymentStatusTrigger.CONFIG_UPDATE_STARTED, ) + self._rank_manager = DeploymentRankManager() + self.replica_average_ongoing_requests: Dict[str, float] = {} self.health_check_gauge = metrics.Gauge( @@ -1730,7 +2053,7 @@ def deploy(self, deployment_info: DeploymentInfo) -> bool: this method returns False. Returns: - bool: Whether or not the deployment is being updated. + bool: Whether the target state has changed. """ curr_deployment_info = self._target_state.info @@ -1745,6 +2068,7 @@ def deploy(self, deployment_info: DeploymentInfo) -> bool: != deployment_info.deployment_config or curr_deployment_info.replica_config.ray_actor_options != deployment_info.replica_config.ray_actor_options + or curr_deployment_info.route_prefix != deployment_info.route_prefix or deployment_info.version is None or curr_deployment_info.version != deployment_info.version ) @@ -1813,10 +2137,14 @@ def deploy(self, deployment_info: DeploymentInfo) -> bool: return True def autoscale(self) -> int: - """Autoscale the deployment based on metrics.""" + """Autoscale the deployment based on metrics. + + Returns: + Whether the target state has changed. 
+ """ if self._target_state.deleting: - return + return False decision_num_replicas = self._autoscaling_state_manager.get_target_num_replicas( deployment_id=self._id, @@ -1827,7 +2155,7 @@ def autoscale(self) -> int: decision_num_replicas is None or decision_num_replicas == self._target_state.target_num_replicas ): - return + return False new_info = copy(self._target_state.info) new_info.version = self._target_state.version.code_version @@ -1843,7 +2171,7 @@ def autoscale(self) -> int: states=[ReplicaState.RUNNING], version=self._target_state.version ), ): - return + return True curr_stats_str = ( f"Current ongoing requests: " @@ -1870,10 +2198,14 @@ def autoscale(self) -> int: trigger=DeploymentStatusInternalTrigger.AUTOSCALE_DOWN, message=f"Downscaling from {old_num} to {new_num} replicas.", ) + return True - def delete(self) -> None: + def delete(self) -> bool: if not self._target_state.deleting: self._set_target_state_deleting() + return True + + return False def _stop_or_update_outdated_version_replicas(self, max_to_stop=math.inf) -> bool: """Stop or update replicas with outdated versions. 
@@ -1917,7 +2249,13 @@ def _stop_or_update_outdated_version_replicas(self, max_to_stop=math.inf) -> boo self._target_state.version ): replicas_changed = True - actor_updating = replica.reconfigure(self._target_state.version) + # Get current rank for the replica + current_rank = self._rank_manager.get_replica_rank( + replica.replica_id.unique_id + ) + actor_updating = replica.reconfigure( + self._target_state.version, rank=current_rank + ) if actor_updating: self._replicas.add(ReplicaState.UPDATING, replica) else: @@ -2032,14 +2370,23 @@ def scale_deployment_replicas( logger.info(f"Adding {to_add} replica{'s' * (to_add>1)} to {self._id}.") for _ in range(to_add): replica_id = ReplicaID(get_random_string(), deployment_id=self._id) + + # Assign rank during replica creation (startup process) + assigned_rank = self._rank_manager.assign_rank(replica_id.unique_id) + + logger.info( + f"Assigned rank {assigned_rank} to new replica {replica_id.unique_id} during startup" + ) new_deployment_replica = DeploymentReplica( replica_id, self._target_state.version, ) - upscale.append( - new_deployment_replica.start(self._target_state.info) + scheduling_request = new_deployment_replica.start( + self._target_state.info, rank=assigned_rank ) + upscale.append(scheduling_request) + self._replicas.add(ReplicaState.STARTING, new_deployment_replica) elif delta_replicas < 0: @@ -2145,6 +2492,16 @@ def _check_startup_replicas( for replica in self._replicas.pop(states=[original_state]): start_status, error_msg = replica.check_started() if start_status == ReplicaStartupStatus.SUCCEEDED: + if original_state == ReplicaState.RECOVERING: + # If the previous state was RECOVERING, that mean the replica + # crashed and is now starting up again. We need to recover the rank + # from the replica actor. The invariant is that the rank is assigned + # during startup and before the replica is added to the replicas + # data structure with RUNNING state. 
+ # Recover rank from the replica actor during controller restart + replica_id = replica.replica_id.unique_id + recovered_rank = replica.rank + self._rank_manager.recover_rank(replica_id, recovered_rank) # This replica should be now be added to handle's replica # set. self._replicas.add(ReplicaState.RUNNING, replica) @@ -2374,8 +2731,73 @@ def check_and_update_replicas(self): self._replicas.add(ReplicaState.STOPPING, replica) else: logger.info(f"{replica.replica_id} is stopped.") + # Release rank only after replica is successfully stopped + # This ensures rank is available during draining/graceful shutdown + replica_id = replica.replica_id.unique_id + self._rank_manager.release_rank(replica_id) + logger.info( + f"Released rank from replica {replica_id} in deployment {self._id}" + ) self._autoscaling_state_manager.on_replica_stopped(replica.replica_id) + # After replica state updates, check rank consistency and perform minimal reassignment if needed + # This ensures ranks are continuous after lifecycle events + # Only do consistency check when deployment is stable (not during active updates) + # maybe this constraint need to be relaxed in the future. The implication is that + # if we delay the rank reassignment, the rank system will be in an invalid state + # for a longer period of time. Abrar made this decision because he is not confident + # about how rollouts work in the deployment state machine. + active_replicas = self._replicas.get() + if ( + active_replicas + and self._curr_status_info.status == DeploymentStatus.HEALTHY + ): + replicas_to_reconfigure = ( + self._rank_manager.check_rank_consistency_and_reassign_minimally( + active_replicas, + ) + ) + + # Reconfigure replicas that had their ranks reassigned + self._reconfigure_replicas_with_new_ranks(replicas_to_reconfigure) + + def _reconfigure_replicas_with_new_ranks( + self, replicas_to_reconfigure: List["DeploymentReplica"] + ): + """Reconfigure replicas with their new ranks after reassignment. 
+ This uses the reconfigure() mechanism to update replicas with their new ranks. + """ + if not replicas_to_reconfigure: + return + + logger.info( + f"Reconfiguring {len(replicas_to_reconfigure)} replicas with rank changes in deployment {self._id}" + ) + + updated_count = 0 + for replica in replicas_to_reconfigure: + replica_id = replica.replica_id.unique_id + new_rank = self._rank_manager.get_replica_rank(replica_id) + + # Use reconfigure() to update rank + # World size is calculated automatically from deployment config + _ = replica.reconfigure( + self._target_state.version, + rank=new_rank, + ) + updated_count += 1 + + logger.info( + f"Successfully reconfigured {updated_count} replicas with new ranks in deployment {self._id}" + ) + + def _get_replica_ranks_mapping(self) -> Dict[str, int]: + """Get the current mapping of replica IDs to ranks. + Returns: + Dictionary mapping replica_id to rank. + """ + return self._rank_manager.get_replica_ranks_mapping() + def _choose_pending_migration_replicas_to_stop( self, replicas: List[DeploymentReplica], @@ -2500,6 +2922,9 @@ def _stop_one_running_replica_for_testing(self): for replica in running_replicas: self._replicas.add(ReplicaState.RUNNING, replica) + def is_ingress(self) -> bool: + return self._target_state.info.ingress + class DeploymentStateManager: """Manages all state for deployments in the system. @@ -2803,7 +3228,7 @@ def deploy( this is a no-op and returns False. Returns: - bool: Whether or not the deployment is being updated. + bool: Whether the target state has changed. """ if deployment_id not in self._deployment_states: self._deployment_states[deployment_id] = self._create_deployment_state( @@ -2822,7 +3247,9 @@ def delete_deployment(self, id: DeploymentID): # This method must be idempotent. We should validate that the # specified deployment exists on the client. 
if id in self._deployment_states: - self._deployment_states[id].delete() + return self._deployment_states[id].delete() + + return False def update(self) -> bool: """Updates the state of all deployments to match their goal state. @@ -2834,11 +3261,14 @@ def update(self) -> bool: any_recovering = False upscales: Dict[DeploymentID, List[ReplicaSchedulingRequest]] = {} downscales: Dict[DeploymentID, DeploymentDownscaleRequest] = {} + target_state_changed = False # STEP 1: Update current state for deployment_state in self._deployment_states.values(): if deployment_state.should_autoscale(): - deployment_state.autoscale() + target_state_changed = ( + deployment_state.autoscale() or target_state_changed + ) deployment_state.check_and_update_replicas() @@ -2890,10 +3320,6 @@ def update(self) -> bool: deleted_ids.append(deployment_id) any_recovering |= any_replicas_recovering - # Take a checkpoint before actually affecting the state of the cluster - # by starting/stopping replicas. - self.save_checkpoint() - # STEP 6: Schedule all STARTING replicas and stop all STOPPING replicas deployment_to_replicas_to_stop = self._deployment_scheduler.schedule( upscales, downscales @@ -2933,6 +3359,9 @@ def update(self) -> bool: if len(deleted_ids): self._record_deployment_usage() + if target_state_changed: + self.save_checkpoint() + return any_recovering def _handle_scheduling_request_failures( @@ -3014,3 +3443,37 @@ def get_active_node_ids(self) -> Set[str]: for deployment_state in self._deployment_states.values(): node_ids.update(deployment_state.get_active_node_ids()) return node_ids + + def get_ingress_replicas_info(self) -> List[Tuple[str, str, int, int]]: + """Get all ingress replicas info for all deployments.""" + ingress_replicas_list = [ + deployment_state._replicas.get() + for deployment_state in self._deployment_states.values() + if deployment_state.is_ingress() + ] + + ingress_replicas_info = [] + for replicas in ingress_replicas_list: + for replica in replicas: + 
ingress_replicas_info.append( + ( + replica.actor_node_id, + replica.replica_id.unique_id, + replica.actor_http_port, + replica.actor_grpc_port, + ) + ) + return ingress_replicas_info + + def _get_replica_ranks_mapping(self, deployment_id: DeploymentID) -> Dict[str, int]: + """Get the current rank mapping for all replicas in a deployment. + Args: + deployment_id: The deployment ID to get ranks for. + Returns: + Dictionary mapping replica_id to rank. + """ + deployment_state = self._deployment_states.get(deployment_id) + if deployment_state is None: + return {} + + return deployment_state._get_replica_ranks_mapping() diff --git a/python/ray/serve/_private/handle_options.py b/python/ray/serve/_private/handle_options.py index ce2c624fce22..86ac4bc78ad2 100644 --- a/python/ray/serve/_private/handle_options.py +++ b/python/ray/serve/_private/handle_options.py @@ -62,6 +62,8 @@ def copy_and_update(self, **kwargs) -> "DynamicHandleOptionsBase": @dataclass(frozen=True) class DynamicHandleOptions(DynamicHandleOptionsBase): + _by_reference: bool = True + def copy_and_update(self, **kwargs) -> "DynamicHandleOptions": new_kwargs = {} diff --git a/python/ray/serve/_private/http_util.py b/python/ray/serve/_private/http_util.py index ee233d7c354d..93880b65f77b 100644 --- a/python/ray/serve/_private/http_util.py +++ b/python/ray/serve/_private/http_util.py @@ -432,7 +432,7 @@ def make_fastapi_class_based_view(fastapi_app, cls: Type) -> None: from fastapi import APIRouter, Depends from fastapi.routing import APIRoute, APIWebSocketRoute - def get_current_servable_instance(): + async def get_current_servable_instance(): from ray import serve return serve.get_replica_context().servable_object @@ -714,6 +714,23 @@ async def start_asgi_http_server( # has no use to us. 
logging.getLogger("uvicorn.error").level = logging.CRITICAL + # Configure SSL if certificates are provided + ssl_kwargs = {} + if http_options.ssl_keyfile and http_options.ssl_certfile: + ssl_kwargs = { + "ssl_keyfile": http_options.ssl_keyfile, + "ssl_certfile": http_options.ssl_certfile, + } + if http_options.ssl_keyfile_password: + ssl_kwargs["ssl_keyfile_password"] = http_options.ssl_keyfile_password + if http_options.ssl_ca_certs: + ssl_kwargs["ssl_ca_certs"] = http_options.ssl_ca_certs + + logger.info( + f"Starting HTTPS server on {http_options.host}:{http_options.port} " + f"with SSL certificate: {http_options.ssl_certfile}" + ) + # NOTE: We have to use lower level uvicorn Config and Server # class because we want to run the server as a coroutine. The only # alternative is to call uvicorn.run which is blocking. @@ -730,6 +747,7 @@ async def start_asgi_http_server( access_log=False, log_level=None, log_config=None, + **ssl_kwargs, ) ) diff --git a/python/ray/serve/_private/local_testing_mode.py b/python/ray/serve/_private/local_testing_mode.py index e09ad3ff2097..c43eb4667a75 100644 --- a/python/ray/serve/_private/local_testing_mode.py +++ b/python/ray/serve/_private/local_testing_mode.py @@ -104,6 +104,9 @@ class LocalReplicaResult(ReplicaResult): "Converting DeploymentResponses to ObjectRefs is not supported " "in local testing mode." ) + REJECTION_NOT_SUPPORTED_ERROR = RuntimeError( + "Request rejection is not supported in local testing mode." 
+ ) def __init__( self, @@ -153,6 +156,10 @@ async def async_wrapper(self, *args, **kwargs): else: return wrapper + @_process_response + async def get_rejection_response(self): + raise self.REJECTION_NOT_SUPPORTED_ERROR + @_process_response def get(self, timeout_s: Optional[float]): assert ( diff --git a/python/ray/serve/_private/logging_utils.py b/python/ray/serve/_private/logging_utils.py index 4688e30e4089..521b675610a2 100644 --- a/python/ray/serve/_private/logging_utils.py +++ b/python/ray/serve/_private/logging_utils.py @@ -6,9 +6,9 @@ from typing import Any, Optional import ray +from ray._common.filters import CoreContextFilter +from ray._common.formatters import JSONFormatter, TextFormatter from ray._common.ray_constants import LOGGING_ROTATE_BACKUP_COUNT, LOGGING_ROTATE_BYTES -from ray._private.ray_logging.filters import CoreContextFilter -from ray._private.ray_logging.formatters import JSONFormatter, TextFormatter from ray.serve._private.common import ServeComponentType from ray.serve._private.constants import ( RAY_SERVE_ENABLE_JSON_LOGGING, @@ -93,6 +93,7 @@ class ServeContextFilter(logging.Filter): def filter(self, record): if should_skip_context_filter(record): return True + request_context = ray.serve.context._get_serve_request_context() if request_context.route: setattr(record, SERVE_LOG_ROUTE, request_context.route) @@ -369,15 +370,24 @@ def configure_component_logger( maxBytes=max_bytes, backupCount=backup_count, ) + # Create a memory handler that buffers log records and flushes to file handler + # Buffer capacity: buffer_size records + # Flush triggers: buffer full, ERROR messages, or explicit flush + memory_handler = logging.handlers.MemoryHandler( + capacity=buffer_size, + target=file_handler, + flushLevel=logging.ERROR, # Auto-flush on ERROR/CRITICAL + ) if RAY_SERVE_ENABLE_JSON_LOGGING: logger.warning( "'RAY_SERVE_ENABLE_JSON_LOGGING' is deprecated, please use " "'LoggingConfig' to enable json format." 
) + # Add filters directly to the memory handler effective for both buffered and non buffered cases if RAY_SERVE_ENABLE_JSON_LOGGING or logging_config.encoding == EncodingType.JSON: - file_handler.addFilter(ServeCoreContextFilter()) - file_handler.addFilter(ServeContextFilter()) - file_handler.addFilter( + memory_handler.addFilter(ServeCoreContextFilter()) + memory_handler.addFilter(ServeContextFilter()) + memory_handler.addFilter( ServeComponentFilter(component_name, component_id, component_type) ) file_handler.setFormatter(json_formatter) @@ -385,12 +395,12 @@ def configure_component_logger( file_handler.setFormatter(serve_formatter) if logging_config.enable_access_log is False: - file_handler.addFilter(log_access_log_filter) + memory_handler.addFilter(log_access_log_filter) else: - file_handler.addFilter(ServeContextFilter()) + memory_handler.addFilter(ServeContextFilter()) # Remove unwanted attributes from the log record. - file_handler.addFilter(ServeLogAttributeRemovalFilter()) + memory_handler.addFilter(ServeLogAttributeRemovalFilter()) # Redirect print, stdout, and stderr to Serve logger, only when it's on the replica. 
if not RAY_SERVE_LOG_TO_STDERR and component_type == ServeComponentType.REPLICA: @@ -398,15 +408,6 @@ def configure_component_logger( sys.stdout = StreamToLogger(logger, logging.INFO, sys.stdout) sys.stderr = StreamToLogger(logger, logging.INFO, sys.stderr) - # Create a memory handler that buffers log records and flushes to file handler - # Buffer capacity: buffer_size records - # Flush triggers: buffer full, ERROR messages, or explicit flush - memory_handler = logging.handlers.MemoryHandler( - capacity=buffer_size, - target=file_handler, - flushLevel=logging.ERROR, # Auto-flush on ERROR/CRITICAL - ) - # Add the memory handler instead of the file handler directly logger.addHandler(memory_handler) diff --git a/python/ray/serve/_private/metrics_utils.py b/python/ray/serve/_private/metrics_utils.py index 14efb553ca09..10d493979c61 100644 --- a/python/ray/serve/_private/metrics_utils.py +++ b/python/ray/serve/_private/metrics_utils.py @@ -1,15 +1,29 @@ import asyncio import bisect import logging +import statistics from collections import defaultdict -from dataclasses import dataclass, field -from typing import Callable, DefaultDict, Dict, Hashable, List, Optional +from dataclasses import dataclass +from itertools import chain +from typing import ( + Callable, + DefaultDict, + Dict, + Hashable, + Iterable, + List, + Optional, + Tuple, +) +from ray.serve._private.common import TimeStampedValue from ray.serve._private.constants import ( METRICS_PUSHER_GRACEFUL_SHUTDOWN_TIMEOUT_S, SERVE_LOGGER_NAME, ) +QUEUED_REQUESTS_KEY = "queued" + logger = logging.getLogger(SERVE_LOGGER_NAME) @@ -110,12 +124,6 @@ async def graceful_shutdown(self): self._async_tasks.clear() -@dataclass(order=True) -class TimeStampedValue: - timestamp: float - value: float = field(compare=False) - - class InMemoryMetricsStore: """A very simple, in memory time series database""" @@ -152,7 +160,7 @@ def prune_keys_and_compact_data(self, start_timestamp_s: float): def _get_datapoints( self, key: Hashable, 
window_start_timestamp_s: float - ) -> List[float]: + ) -> List[TimeStampedValue]: """Get all data points given key after window_start_timestamp_s""" datapoints = self.data[key] @@ -165,52 +173,205 @@ def _get_datapoints( ) return datapoints[idx:] - def window_average( - self, key: Hashable, window_start_timestamp_s: float, do_compact: bool = True + def _aggregate_reduce( + self, + keys: Iterable[Hashable], + aggregate_fn: Callable[[Iterable[float]], float], + ) -> Tuple[Optional[float], int]: + """Reduce the entire set of timeseries values across the specified keys. + + Args: + keys: Iterable of keys to aggregate across. + aggregate_fn: Function to apply across all float values, e.g., sum, max. + + Returns: + A tuple of (float, int) where the first element is the aggregated value + and the second element is the number of valid keys used. + Returns (None, 0) if no valid keys have data. + + Example: + Suppose the store contains: + >>> store = InMemoryMetricsStore() + >>> store.data.update({ + ... "a": [TimeStampedValue(0, 1.0), TimeStampedValue(1, 2.0)], + ... "b": [], + ... "c": [TimeStampedValue(0, 10.0)], + ... 
}) + + Using sum across keys: + + >>> store._aggregate_reduce(keys=["a", "b", "c"], aggregate_fn=sum) + (13.0, 2) + + Here: + - The aggregated value is 1.0 + 2.0 + 10.0 = 13.0 + - Only keys "a" and "c" contribute values, so the valid key count is 2 + """ + valid_key_count = 0 + + def _values_generator(): + """Generator that yields values from valid keys without storing them all in memory.""" + nonlocal valid_key_count + for key in keys: + series = self.data.get(key, []) + if not series: + continue + + valid_key_count += 1 + for timestamp_value in series: + yield timestamp_value.value + + # Create the generator and check if it has any values + values_gen = _values_generator() + try: + first_value = next(values_gen) + except StopIteration: + # No valid data found + return None, 0 + + # Apply aggregation to the generator (memory efficient) + aggregated_result = aggregate_fn(chain([first_value], values_gen)) + return aggregated_result, valid_key_count + + def get_latest( + self, + key: Hashable, ) -> Optional[float]: - """Perform a window average operation for metric `key` + """Get the latest value for a given key.""" + if not self.data.get(key, None): + return None + return self.data[key][-1].value + + def aggregate_min( + self, + keys: Iterable[Hashable], + ) -> Tuple[Optional[float], int]: + """Find the min value across all timeseries values at the specified keys. Args: - key: the metric name. - window_start_timestamp_s: the unix epoch timestamp for the - start of the window. The computed average will use all datapoints - from this timestamp until now. - do_compact: whether or not to delete the datapoints that's - before `window_start_timestamp_s` to save memory. Default is - true. + keys: Iterable of keys to aggregate across. Returns: - The average of all the datapoints for the key on and after time - window_start_timestamp_s, or None if there are no such points. 
+ A tuple of (float, int) where the first element is the min across + all values found at `keys`, and the second is the number of valid + keys used to compute the min. + Returns (None, 0) if no valid keys have data. """ - points_after_idx = self._get_datapoints(key, window_start_timestamp_s) + return self._aggregate_reduce(keys, min) - if do_compact: - self.data[key] = points_after_idx + def aggregate_max( + self, + keys: Iterable[Hashable], + ) -> Tuple[Optional[float], int]: + """Find the max value across all timeseries values at the specified keys. - if len(points_after_idx) == 0: - return - return sum(point.value for point in points_after_idx) / len(points_after_idx) + Args: + keys: Iterable of keys to aggregate across. + Returns: + A tuple of (float, int) where the first element is the max across + all values found at `keys`, and the second is the number of valid + keys used to compute the max. + Returns (None, 0) if no valid keys have data. + """ + return self._aggregate_reduce(keys, max) - def max( - self, key: Hashable, window_start_timestamp_s: float, do_compact: bool = True - ): - """Perform a max operation for metric `key`. + def aggregate_sum( + self, + keys: Iterable[Hashable], + ) -> Tuple[Optional[float], int]: + """Sum the entire set of timeseries values across the specified keys. Args: - key: the metric name. - window_start_timestamp_s: the unix epoch timestamp for the - start of the window. The computed average will use all datapoints - from this timestamp until now. - do_compact: whether or not to delete the datapoints that's - before `window_start_timestamp_s` to save memory. Default is - true. + keys: Iterable of keys to aggregate across. Returns: - Max value of the data points for the key on and after time - window_start_timestamp_s, or None if there are no such points. + A tuple of (float, int) where the first element is the sum across + all values found at `keys`, and the second is the number of valid + keys used to compute the sum. 
+ Returns (None, 0) if no valid keys have data. """ - points_after_idx = self._get_datapoints(key, window_start_timestamp_s) + return self._aggregate_reduce(keys, sum) - if do_compact: - self.data[key] = points_after_idx + def aggregate_avg( + self, + keys: Iterable[Hashable], + ) -> Tuple[Optional[float], int]: + """Average the entire set of timeseries values across the specified keys. - return max((point.value for point in points_after_idx), default=None) + Args: + keys: Iterable of keys to aggregate across. + Returns: + A tuple of (float, int) where the first element is the mean across + all values found at `keys`, and the second is the number of valid + keys used to compute the mean. + Returns (None, 0) if no valid keys have data. + """ + return self._aggregate_reduce(keys, statistics.mean) + + +def _bucket_latest_by_window( + series: List[TimeStampedValue], + start: float, + window_s: float, +) -> Dict[int, float]: + """ + Map each window index -> latest value seen in that window. + Assumes series is sorted by timestamp ascending. + """ + buckets: Dict[int, float] = {} + for p in series: + w = int((p.timestamp - start) // window_s) + buckets[w] = p.value # overwrite keeps the latest within the window + return buckets + + +def _merge_two_timeseries( + t1: List[TimeStampedValue], t2: List[TimeStampedValue], window_s: float +) -> List[TimeStampedValue]: + """ + Merge two ascending time series by summing values within a specified time window. + If multiple values fall within the same window in a series, the latest value is used. + The output contains one point per window that had at least one value, timestamped + at the window start. + """ + if window_s <= 0: + raise ValueError(f"window_s must be positive, got {window_s}") + + if not t1 and not t2: + return [] + + # Align windows so each output timestamp sits at the start of its window. 
+ # start is snapped to window_s boundary for binning stability + earliest = min(x[0].timestamp for x in (t1, t2) if x) + start = earliest // window_s * window_s + + b1 = _bucket_latest_by_window(t1, start, window_s) + b2 = _bucket_latest_by_window(t2, start, window_s) + + windows = sorted(set(b1.keys()) | set(b2.keys())) + + merged: List[TimeStampedValue] = [] + for w in windows: + v = b1.get(w, 0.0) + b2.get(w, 0.0) + ts_start = start + w * window_s + merged.append(TimeStampedValue(timestamp=ts_start, value=v)) + return merged + + +def merge_timeseries_dicts( + *timeseries_dicts: DefaultDict[Hashable, List[TimeStampedValue]], + window_s: float, +) -> DefaultDict[Hashable, List[TimeStampedValue]]: + """ + Merge multiple time-series dictionaries, typically contained within + InMemoryMetricsStore().data. For the same key across stores, time series + are merged with a windowed sum, where each series keeps only its latest + value per window before summing. + """ + merged: DefaultDict[Hashable, List[TimeStampedValue]] = defaultdict(list) + for timeseries_dict in timeseries_dicts: + for key, ts in timeseries_dict.items(): + if key in merged: + merged[key] = _merge_two_timeseries(merged[key], ts, window_s) + else: + # Window the data, even if the key is unique. 
+ merged[key] = _merge_two_timeseries(ts, [], window_s) + return merged diff --git a/python/ray/serve/_private/proxy.py b/python/ray/serve/_private/proxy.py index c9d65fe4afbf..66ad787a2146 100644 --- a/python/ray/serve/_private/proxy.py +++ b/python/ray/serve/_private/proxy.py @@ -15,8 +15,8 @@ from starlette.types import Receive import ray +from ray._common.filters import CoreContextFilter from ray._common.utils import get_or_create_event_loop -from ray._private.ray_logging.filters import CoreContextFilter from ray.serve._private.common import ( DeploymentID, EndpointInfo, @@ -1016,8 +1016,113 @@ async def send_request_to_replica( yield status +class ProxyActorInterface(ABC): + """Abstract interface for proxy actors in Ray Serve. + + This interface defines the contract that all proxy actor implementations must follow, + allowing for different proxy backends (Ray HTTP/gRPC proxies, HAProxy, etc.). + """ + + def __init__( + self, + *, + node_id: NodeId, + node_ip_address: str, + logging_config: LoggingConfig, + ): + """Initialize the proxy actor. + + Args: + node_id: ID of the node this proxy is running on + node_ip_address: IP address of the node + logging_config: Logging configuration + """ + self._node_id = node_id + self._node_ip_address = node_ip_address + self._logging_config = logging_config + + @abstractmethod + async def ready(self) -> str: + """Blocks until the proxy is ready to serve requests. + + Returns: + JSON-serialized metadata containing proxy information (worker ID, log file path, etc.) + """ + pass + + @abstractmethod + async def update_draining( + self, draining: bool, _after: Optional[Any] = None + ) -> None: + """Update the draining status of the proxy. + + Args: + draining: Whether the proxy should be draining + _after: Optional ObjectRef for scheduling dependency + """ + pass + + @abstractmethod + async def is_drained(self, _after: Optional[Any] = None) -> bool: + """Check whether the proxy is drained. 
+ + Args: + _after: Optional ObjectRef for scheduling dependency + + Returns: + True if the proxy is drained, False otherwise + """ + pass + + @abstractmethod + async def check_health(self) -> None: + """Check the health of the proxy. + + Raises: + Exception: if the proxy is unhealthy + """ + pass + + @abstractmethod + def pong(self) -> str: + """Respond to ping from replicas. + + Returns: + A response string + """ + pass + + @abstractmethod + async def receive_asgi_messages(self, request_metadata: RequestMetadata) -> bytes: + """Handle ASGI messages for HTTP requests. + + Args: + request_metadata: Metadata about the request + + Returns: + Serialized ASGI messages + """ + pass + + # Testing and debugging methods + @abstractmethod + def _get_http_options(self) -> HTTPOptions: + """Get HTTP options used by the proxy.""" + pass + + @abstractmethod + def _get_logging_config(self) -> Optional[str]: + """Get the file path for the logger (for testing purposes).""" + pass + + @abstractmethod + def _dump_ingress_replicas_for_testing(self, route: str) -> Set: + """Get replicas for a route (for testing).""" + pass + + @ray.remote(num_cpus=0) -class ProxyActor: +class ProxyActor(ProxyActorInterface): def __init__( self, http_options: HTTPOptions, @@ -1028,12 +1133,15 @@ def __init__( logging_config: LoggingConfig, long_poll_client: Optional[LongPollClient] = None, ): # noqa: F821 - self._node_id = node_id - self._node_ip_address = node_ip_address - self._http_options = configure_http_middlewares(http_options) + super().__init__( + node_id=node_id, + node_ip_address=node_ip_address, + logging_config=logging_config, + ) + self._grpc_options = grpc_options + self._http_options = configure_http_middlewares(http_options) grpc_enabled = is_grpc_enabled(self._grpc_options) - event_loop = get_or_create_event_loop() self.long_poll_client = long_poll_client or LongPollClient( ray.get_actor(SERVE_CONTROLLER_NAME, namespace=SERVE_NAMESPACE), diff --git 
a/python/ray/serve/_private/replica.py b/python/ray/serve/_private/replica.py index d1831ea1c7fe..ff1b8425c888 100644 --- a/python/ray/serve/_private/replica.py +++ b/python/ray/serve/_private/replica.py @@ -33,14 +33,16 @@ import ray from ray import cloudpickle +from ray._common.filters import CoreContextFilter from ray._common.utils import get_or_create_event_loop -from ray._private.ray_logging.filters import CoreContextFilter from ray.actor import ActorClass, ActorHandle from ray.remote_function import RemoteFunction from ray.serve import metrics from ray.serve._private.common import ( + RUNNING_REQUESTS_KEY, DeploymentID, ReplicaID, + ReplicaMetricReport, ReplicaQueueLengthInfo, RequestMetadata, ServeComponentType, @@ -115,6 +117,7 @@ Optional[float], Optional[int], Optional[str], + int, ] @@ -327,12 +330,22 @@ def record_request_metrics(self, *, route: str, latency_ms: float, was_error: bo def _push_autoscaling_metrics(self) -> Dict[str, Any]: look_back_period = self._autoscaling_config.look_back_period_s - self._controller_handle.record_autoscaling_metrics.remote( + self._metrics_store.prune_keys_and_compact_data(time.time() - look_back_period) + replica_metric_report = ReplicaMetricReport( replica_id=self._replica_id, - window_avg=self._metrics_store.window_average( - self._replica_id, time.time() - look_back_period - ), - send_timestamp=time.time(), + timestamp=time.time(), + aggregated_metrics={ + RUNNING_REQUESTS_KEY: self._metrics_store.aggregate_avg( + [self._replica_id] + )[0] + or 0.0 + }, + metrics={ + RUNNING_REQUESTS_KEY: self._metrics_store.data.get(self._replica_id, []) + }, + ) + self._controller_handle.record_autoscaling_metrics_from_replica.remote( + replica_metric_report ) def _add_autoscaling_metrics_point(self) -> None: @@ -356,6 +369,7 @@ def __init__( version: DeploymentVersion, ingress: bool, route_prefix: str, + rank: int, ): self._version = version self._replica_id = replica_id @@ -402,7 +416,7 @@ def __init__( # Set metadata for logs 
and metrics. # servable_object will be populated in `initialize_and_get_metadata`. - self._set_internal_replica_context(servable_object=None) + self._set_internal_replica_context(servable_object=None, rank=rank) self._metrics_manager = create_replica_metrics_manager( replica_id=replica_id, @@ -411,8 +425,10 @@ def __init__( ingress=ingress, ) - self._port: Optional[int] = None + self._internal_grpc_port: Optional[int] = None self._docs_path: Optional[str] = None + self._http_port: Optional[int] = None + self._grpc_port: Optional[int] = None @property def max_ongoing_requests(self) -> int: @@ -422,19 +438,29 @@ def get_num_ongoing_requests(self) -> int: return self._metrics_manager.get_num_ongoing_requests() def get_metadata(self) -> ReplicaMetadata: + current_rank = ray.serve.context._get_internal_replica_context().rank return ( self._version.deployment_config, self._version, self._initialization_latency, - self._port, + self._internal_grpc_port, self._docs_path, + self._http_port, + self._grpc_port, + current_rank, ) - def _set_internal_replica_context(self, *, servable_object: Callable = None): + def _set_internal_replica_context( + self, *, servable_object: Callable = None, rank: int = None + ): + # Calculate world_size from deployment config instead of storing it + world_size = self._deployment_config.num_replicas ray.serve.context._set_internal_replica_context( replica_id=self._replica_id, servable_object=servable_object, _deployment_config=self._deployment_config, + rank=rank, + world_size=world_size, ) def _configure_logger_and_profilers( @@ -751,7 +777,12 @@ async def initialize(self, deployment_config: DeploymentConfig): except Exception: raise RuntimeError(traceback.format_exc()) from None - async def reconfigure(self, deployment_config: DeploymentConfig): + async def reconfigure( + self, + deployment_config: DeploymentConfig, + rank: int, + route_prefix: Optional[str] = None, + ): try: user_config_changed = ( deployment_config.user_config != 
self._deployment_config.user_config @@ -762,7 +793,7 @@ async def reconfigure(self, deployment_config: DeploymentConfig): ) self._deployment_config = deployment_config self._version = DeploymentVersion.from_deployment_version( - self._version, deployment_config + self._version, deployment_config, route_prefix ) self._metrics_manager.set_autoscaling_config( @@ -780,10 +811,14 @@ async def reconfigure(self, deployment_config: DeploymentConfig): ) # We need to update internal replica context to reflect the new - # deployment_config. + # deployment_config and rank. self._set_internal_replica_context( - servable_object=self._user_callable_wrapper.user_callable + servable_object=self._user_callable_wrapper.user_callable, + rank=rank, ) + + self._route_prefix = self._version.route_prefix + except Exception: raise RuntimeError(traceback.format_exc()) from None @@ -889,8 +924,11 @@ async def record_routing_stats(self) -> Dict[str, Any]: class Replica(ReplicaBase): async def _on_initialized(self): + # Get current rank from replica context during initialization + current_rank = ray.serve.context._get_internal_replica_context().rank self._set_internal_replica_context( - servable_object=self._user_callable_wrapper.user_callable + servable_object=self._user_callable_wrapper.user_callable, + rank=current_rank, ) # Save the initialization latency if the replica is initializing @@ -964,6 +1002,7 @@ async def __init__( version: DeploymentVersion, ingress: bool, route_prefix: str, + rank: int, ): deployment_config = DeploymentConfig.from_proto_bytes( deployment_config_proto_bytes @@ -980,6 +1019,7 @@ async def __init__( version=version, ingress=ingress, route_prefix=route_prefix, + rank=rank, ) def push_proxy_handle(self, handle: ActorHandle): @@ -1041,8 +1081,10 @@ async def check_health(self): async def record_routing_stats(self) -> Dict[str, Any]: return await self._replica_impl.record_routing_stats() - async def reconfigure(self, deployment_config) -> ReplicaMetadata: - await 
self._replica_impl.reconfigure(deployment_config) + async def reconfigure( + self, deployment_config, rank: int, route_prefix: Optional[str] = None + ) -> ReplicaMetadata: + await self._replica_impl.reconfigure(deployment_config, rank, route_prefix) return self._replica_impl.get_metadata() def _preprocess_request_args( diff --git a/python/ray/serve/_private/replica_result.py b/python/ray/serve/_private/replica_result.py index 780b4a44bd45..9deaf00ef774 100644 --- a/python/ray/serve/_private/replica_result.py +++ b/python/ray/serve/_private/replica_result.py @@ -1,5 +1,7 @@ import asyncio import inspect +import logging +import pickle import threading import time from abc import ABC, abstractmethod @@ -7,12 +9,20 @@ from typing import Callable, Coroutine, Optional, Union import ray -from ray.serve._private.common import RequestMetadata +from ray.exceptions import TaskCancelledError +from ray.serve._private.common import ReplicaQueueLengthInfo, RequestMetadata +from ray.serve._private.constants import SERVE_LOGGER_NAME from ray.serve._private.utils import calculate_remaining_timeout, generate_request_id from ray.serve.exceptions import RequestCancelledError +logger = logging.getLogger(SERVE_LOGGER_NAME) + class ReplicaResult(ABC): + @abstractmethod + async def get_rejection_response(self) -> Optional[ReplicaQueueLengthInfo]: + raise NotImplementedError + @abstractmethod def get(self, timeout_s: Optional[float]): raise NotImplementedError @@ -57,6 +67,8 @@ def __init__( self, obj_ref_or_gen: Union[ray.ObjectRef, ray.ObjectRefGenerator], metadata: RequestMetadata, + *, + with_rejection: bool = False, ): self._obj_ref: Optional[ray.ObjectRef] = None self._obj_ref_gen: Optional[ray.ObjectRefGenerator] = None @@ -64,6 +76,8 @@ def __init__( self._request_id: str = metadata.request_id self._object_ref_or_gen_sync_lock = threading.Lock() self._lazy_object_ref_or_gen_asyncio_lock = None + self._with_rejection = with_rejection + self._rejection_response = None if 
isinstance(obj_ref_or_gen, ray.ObjectRefGenerator): self._obj_ref_gen = obj_ref_or_gen @@ -116,6 +130,29 @@ async def async_wrapper(self, *args, **kwargs): else: return wrapper + @_process_response + async def get_rejection_response(self) -> Optional[ReplicaQueueLengthInfo]: + """Get the queue length info from the replica to handle rejection.""" + assert ( + self._with_rejection and self._obj_ref_gen is not None + ), "get_rejection_response() can only be called when request rejection is enabled." + + try: + if self._rejection_response is None: + response = await (await self._obj_ref_gen.__anext__()) + self._rejection_response = pickle.loads(response) + + return self._rejection_response + except asyncio.CancelledError as e: + # HTTP client disconnected or request was explicitly canceled. + logger.info( + "Cancelling request that has already been assigned to a replica." + ) + self.cancel() + raise e from None + except TaskCancelledError: + raise asyncio.CancelledError() + @_process_response def get(self, timeout_s: Optional[float]): assert ( diff --git a/python/ray/serve/_private/request_router/common.py b/python/ray/serve/_private/request_router/common.py index a58659d11ad2..4653daca2185 100644 --- a/python/ray/serve/_private/request_router/common.py +++ b/python/ray/serve/_private/request_router/common.py @@ -49,6 +49,9 @@ class PendingRequest: ) """Context for request routing, used to track routing attempts and backoff.""" + resolved: bool = False + """Whether the arguments have been resolved.""" + def reset_future(self): """Reset the `asyncio.Future`, must be called if this request is re-used.""" self.future = asyncio.Future() diff --git a/python/ray/serve/_private/request_router/replica_wrapper.py b/python/ray/serve/_private/request_router/replica_wrapper.py index af4fcbe05c7f..d42266c312cf 100644 --- a/python/ray/serve/_private/request_router/replica_wrapper.py +++ b/python/ray/serve/_private/request_router/replica_wrapper.py @@ -1,27 +1,20 @@ import asyncio 
-import logging import pickle from abc import ABC, abstractmethod -from typing import Any, Dict, Optional, Set, Tuple, Union +from typing import Any, Dict, Optional, Set import ray -from ray import ObjectRef, ObjectRefGenerator from ray.actor import ActorHandle -from ray.exceptions import TaskCancelledError from ray.serve._private.common import ( ReplicaID, - ReplicaQueueLengthInfo, RunningReplicaInfo, ) -from ray.serve._private.constants import SERVE_LOGGER_NAME from ray.serve._private.replica_result import ActorReplicaResult, ReplicaResult from ray.serve._private.request_router.common import PendingRequest from ray.serve._private.utils import JavaActorHandleProxy from ray.serve.generated.serve_pb2 import RequestMetadata as RequestMetadataProto from ray.util.annotations import PublicAPI -logger = logging.getLogger(SERVE_LOGGER_NAME) - class ReplicaWrapper(ABC): """This is used to abstract away details of the transport layer @@ -36,7 +29,7 @@ def send_request_java(self, pr: PendingRequest) -> ReplicaResult: @abstractmethod def send_request_python( self, pr: PendingRequest, *, with_rejection: bool - ) -> Tuple[ReplicaResult, Optional[ReplicaQueueLengthInfo]]: + ) -> ReplicaResult: """Send request to Python replica. If sending request with rejection, the replica will yield a @@ -77,9 +70,9 @@ def send_request_java(self, pr: PendingRequest) -> ActorReplicaResult: pr.metadata, ) - def _send_request_python( + def send_request_python( self, pr: PendingRequest, *, with_rejection: bool - ) -> Union[ObjectRef, ObjectRefGenerator]: + ) -> ActorReplicaResult: """Send the request to a Python replica.""" if with_rejection: # Call a separate handler that may reject the request. 
@@ -95,29 +88,10 @@ def _send_request_python( else: method = self._actor_handle.handle_request - return method.remote(pickle.dumps(pr.metadata), *pr.args, **pr.kwargs) - - async def send_request_python( - self, pr: PendingRequest, with_rejection: bool - ) -> Tuple[ActorReplicaResult, Optional[ReplicaQueueLengthInfo]]: - obj_ref_gen = self._send_request_python(pr, with_rejection=with_rejection) - - if not with_rejection: - return ActorReplicaResult(obj_ref_gen, pr.metadata), None - - try: - first_ref = await obj_ref_gen.__anext__() - queue_len_info: ReplicaQueueLengthInfo = pickle.loads(await first_ref) - return ActorReplicaResult(obj_ref_gen, pr.metadata), queue_len_info - except asyncio.CancelledError as e: - # HTTP client disconnected or request was explicitly canceled. - logger.info( - "Cancelling request that has already been assigned to a replica." - ) - ray.cancel(obj_ref_gen) - raise e from None - except TaskCancelledError: - raise asyncio.CancelledError() + obj_ref_gen = method.remote(pickle.dumps(pr.metadata), *pr.args, **pr.kwargs) + return ActorReplicaResult( + obj_ref_gen, pr.metadata, with_rejection=with_rejection + ) @PublicAPI(stability="alpha") @@ -196,17 +170,13 @@ async def get_queue_len(self, *, deadline_s: float) -> int: ray.cancel(obj_ref) raise - async def send_request( + def try_send_request( self, pr: PendingRequest, with_rejection: bool - ) -> Tuple[Optional[ReplicaResult], Optional[ReplicaQueueLengthInfo]]: - """Send request to this replica.""" + ) -> ReplicaResult: + """Try to send the request to this replica. It may be rejected.""" wrapper = self._get_replica_wrapper(pr) if self._replica_info.is_cross_language: assert not with_rejection, "Request rejection not supported for Java." 
- return wrapper.send_request_java(pr), None - - result, queue_len_info = await wrapper.send_request_python(pr, with_rejection) - if queue_len_info and not queue_len_info.accepted: - return None, queue_len_info + return wrapper.send_request_java(pr) - return result, queue_len_info + return wrapper.send_request_python(pr, with_rejection=with_rejection) diff --git a/python/ray/serve/_private/request_router/request_router.py b/python/ray/serve/_private/request_router/request_router.py index 11153f58c59a..4002ee4540c0 100644 --- a/python/ray/serve/_private/request_router/request_router.py +++ b/python/ray/serve/_private/request_router/request_router.py @@ -624,6 +624,12 @@ def on_new_queue_len_info( replica_id, queue_len_info.num_ongoing_requests ) + def on_send_request(self, replica_id: ReplicaID): + """Increment queue length cache when a request is sent to a replica.""" + if self._use_replica_queue_len_cache: + num_ongoing_requests = self._replica_queue_len_cache.get(replica_id) or 0 + self._replica_queue_len_cache.update(replica_id, num_ongoing_requests + 1) + def update_replicas(self, replicas: List[RunningReplica]): """Update the set of available replicas to be considered for routing. @@ -973,46 +979,52 @@ async def _fulfill_pending_requests(self): backoff_index = 0 pending_request = self._get_next_pending_request_to_route() request_metadata = pending_request.metadata if pending_request else None - async for candidates in self._choose_replicas_with_backoff( + gen_choose_replicas_with_backoff = self._choose_replicas_with_backoff( pending_request - ): - # Clear out pending requests at the front of the - # queue that have been cancelled, then reevaluate - # if we need to continue this routing task. 
- while ( - len(self._pending_requests_to_fulfill) > 0 - and self._pending_requests_to_fulfill[0].future.done() - ): - self._pending_requests_to_fulfill.popleft() - - if len(self._routing_tasks) > self.target_num_routing_tasks: - break - - replica = await self._select_from_candidate_replicas( - candidates, backoff_index - ) - if replica is not None: - self._fulfill_next_pending_request(replica, request_metadata) - break - - backoff_index += 1 - if backoff_index >= 50 and backoff_index % 50 == 0: - routing_time_elapsed = time.time() - start_time - warning_log = ( - "Failed to route request after " - f"{backoff_index} attempts over " - f"{routing_time_elapsed:.2f}s. Retrying." + ) + try: + async for candidates in gen_choose_replicas_with_backoff: + # Clear out pending requests at the front of the + # queue that have been cancelled, then reevaluate + # if we need to continue this routing task. + while ( + len(self._pending_requests_to_fulfill) > 0 + and self._pending_requests_to_fulfill[0].future.done() + ): + self._pending_requests_to_fulfill.popleft() + + if len(self._routing_tasks) > self.target_num_routing_tasks: + break + + replica = await self._select_from_candidate_replicas( + candidates, backoff_index ) - if request_metadata is not None: - warning_log += ( - f" Request ID: {request_metadata.request_id}." + if replica is not None: + self._fulfill_next_pending_request( + replica, request_metadata + ) + break + + backoff_index += 1 + if backoff_index >= 50 and backoff_index % 50 == 0: + routing_time_elapsed = time.time() - start_time + warning_log = ( + "Failed to route request after " + f"{backoff_index} attempts over " + f"{routing_time_elapsed:.2f}s. Retrying." ) - if request_metadata.multiplexed_model_id: + if request_metadata is not None: warning_log += ( - " Multiplexed model ID: " - f"{request_metadata.multiplexed_model_id}." + f" Request ID: {request_metadata.request_id}." 
) - logger.warning(warning_log) + if request_metadata.multiplexed_model_id: + warning_log += ( + " Multiplexed model ID: " + f"{request_metadata.multiplexed_model_id}." + ) + logger.warning(warning_log) + finally: + await gen_choose_replicas_with_backoff.aclose() except Exception: logger.exception("Unexpected error in _fulfill_pending_requests.") diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index ad4f67a117b7..02af2ce0a53c 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -5,7 +5,7 @@ import time import weakref from abc import ABC, abstractmethod -from asyncio import AbstractEventLoop +from asyncio import AbstractEventLoop, ensure_future, futures from collections import defaultdict from collections.abc import MutableMapping from contextlib import contextmanager @@ -18,7 +18,6 @@ Dict, List, Optional, - Tuple, Union, ) @@ -26,9 +25,11 @@ from ray.actor import ActorHandle from ray.exceptions import ActorDiedError, ActorUnavailableError, RayError from ray.serve._private.common import ( + RUNNING_REQUESTS_KEY, DeploymentHandleSource, DeploymentID, DeploymentTargetInfo, + HandleMetricReport, ReplicaID, RequestMetadata, RunningReplicaInfo, @@ -38,11 +39,17 @@ RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE, RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S, RAY_SERVE_HANDLE_AUTOSCALING_METRIC_RECORD_INTERVAL_S, + RAY_SERVE_METRICS_EXPORT_INTERVAL_MS, RAY_SERVE_PROXY_PREFER_LOCAL_AZ_ROUTING, SERVE_LOGGER_NAME, ) from ray.serve._private.long_poll import LongPollClient, LongPollNamespace -from ray.serve._private.metrics_utils import InMemoryMetricsStore, MetricsPusher +from ray.serve._private.metrics_utils import ( + QUEUED_REQUESTS_KEY, + InMemoryMetricsStore, + MetricsPusher, + TimeStampedValue, +) from ray.serve._private.replica_result import ReplicaResult from ray.serve._private.request_router import PendingRequest, RequestRouter from ray.serve._private.request_router.pow_2_router 
import ( @@ -53,7 +60,6 @@ from ray.serve._private.utils import ( generate_request_id, resolve_deployment_response, - run_coroutine_or_future_threadsafe, ) from ray.serve.config import AutoscalingConfig from ray.serve.exceptions import BackPressureError, DeploymentUnavailableError @@ -62,9 +68,6 @@ logger = logging.getLogger(SERVE_LOGGER_NAME) -QUEUED_REQUESTS_KEY = "queued" - - class RouterMetricsManager: """Manages metrics for the router.""" @@ -81,6 +84,7 @@ def __init__( router_requests_counter: metrics.Counter, queued_requests_gauge: metrics.Gauge, running_requests_gauge: metrics.Gauge, + event_loop: asyncio.BaseEventLoop, ): self._handle_id = handle_id self._deployment_id = deployment_id @@ -140,6 +144,21 @@ def __init__( # Track whether the metrics manager has been shutdown self._shutdown: bool = False + # If the interval is set to 0, eagerly sets all metrics. + self._cached_metrics_enabled = RAY_SERVE_METRICS_EXPORT_INTERVAL_MS != 0 + self._cached_metrics_interval_s = RAY_SERVE_METRICS_EXPORT_INTERVAL_MS / 1000 + + if self._cached_metrics_enabled: + self._cached_num_router_requests = defaultdict(int) + + def create_metrics_task(): + event_loop.create_task(self._report_cached_metrics_forever()) + + # the constructor is called in the user thread, but its trying to create a task on the event loop + # which is running in the router thread. This is not thread safe, so we need to use call_soon_threadsafe + # to create the task on the event loop thread safely. 
+ event_loop.call_soon_threadsafe(create_metrics_task) + @contextmanager def wrap_request_assignment(self, request_meta: RequestMetadata): max_queued_requests = ( @@ -174,9 +193,21 @@ def wrap_request_assignment(self, request_meta: RequestMetadata): logger.warning(e.message) raise e + self.inc_num_total_requests(request_meta.route) + yield + + @contextmanager + def wrap_queued_request(self, is_retry: bool, num_curr_replicas: int): + """Increment queued requests gauge and maybe push autoscaling metrics to controller.""" try: - self.inc_num_total_requests(request_meta.route) self.inc_num_queued_requests() + # Optimization: if there are currently zero replicas for a deployment, + # push handle metric to controller to allow for fast cold start time. + # Only do this on the first attempt to route the request. + if not is_retry and self.should_send_scaled_to_zero_optimized_push( + curr_num_replicas=num_curr_replicas + ): + self.push_autoscaling_metrics_to_controller() yield finally: @@ -260,30 +291,65 @@ def update_deployment_config( if self.metrics_pusher: self.metrics_pusher.stop_tasks() + def _report_cached_metrics(self): + for route, count in self._cached_num_router_requests.items(): + self.num_router_requests.inc(count, tags={"route": route}) + self._cached_num_router_requests.clear() + + self.num_queued_requests_gauge.set(self.num_queued_requests) + + self.num_running_requests_gauge.set( + sum(self.num_requests_sent_to_replicas.values()) + ) + + async def _report_cached_metrics_forever(self): + assert self._cached_metrics_interval_s > 0 + + consecutive_errors = 0 + while True: + try: + await asyncio.sleep(self._cached_metrics_interval_s) + self._report_cached_metrics() + consecutive_errors = 0 + except Exception: + logger.exception("Unexpected error reporting metrics.") + + # Exponential backoff starting at 1s and capping at 10s. 
+ backoff_time_s = min(10, 2**consecutive_errors) + consecutive_errors += 1 + await asyncio.sleep(backoff_time_s) + def inc_num_total_requests(self, route: str): - self.num_router_requests.inc(tags={"route": route}) + if self._cached_metrics_enabled: + self._cached_num_router_requests[route] += 1 + else: + self.num_router_requests.inc(tags={"route": route}) def inc_num_queued_requests(self): self.num_queued_requests += 1 - self.num_queued_requests_gauge.set(self.num_queued_requests) + if not self._cached_metrics_enabled: + self.num_queued_requests_gauge.set(self.num_queued_requests) def dec_num_queued_requests(self): self.num_queued_requests -= 1 - self.num_queued_requests_gauge.set(self.num_queued_requests) + if not self._cached_metrics_enabled: + self.num_queued_requests_gauge.set(self.num_queued_requests) def inc_num_running_requests_for_replica(self, replica_id: ReplicaID): with self._queries_lock: self.num_requests_sent_to_replicas[replica_id] += 1 - self.num_running_requests_gauge.set( - sum(self.num_requests_sent_to_replicas.values()) - ) + if not self._cached_metrics_enabled: + self.num_running_requests_gauge.set( + sum(self.num_requests_sent_to_replicas.values()) + ) def dec_num_running_requests_for_replica(self, replica_id: ReplicaID): with self._queries_lock: self.num_requests_sent_to_replicas[replica_id] -= 1 - self.num_running_requests_gauge.set( - sum(self.num_requests_sent_to_replicas.values()) - ) + if not self._cached_metrics_enabled: + self.num_running_requests_gauge.set( + sum(self.num_requests_sent_to_replicas.values()) + ) def should_send_scaled_to_zero_optimized_push(self, curr_num_replicas: int) -> bool: return ( @@ -297,14 +363,8 @@ def push_autoscaling_metrics_to_controller(self): These metrics are used by the controller for autoscaling. 
""" - - self._controller_handle.record_handle_metrics.remote( - send_timestamp=time.time(), - deployment_id=self._deployment_id, - handle_id=self._handle_id, - actor_id=self._self_actor_id, - handle_source=self._handle_source, - **self._get_aggregated_requests(), + self._controller_handle.record_autoscaling_metrics_from_handle.remote( + self._get_metrics_report() ) def _add_autoscaling_metrics_point(self): @@ -326,24 +386,40 @@ def _add_autoscaling_metrics_point(self): start_timestamp = time.time() - self.autoscaling_config.look_back_period_s self.metrics_store.prune_keys_and_compact_data(start_timestamp) - def _get_aggregated_requests(self): + def _get_metrics_report(self) -> HandleMetricReport: running_requests = dict() + avg_running_requests = dict() + timestamp = time.time() if RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE and self.autoscaling_config: look_back_period = self.autoscaling_config.look_back_period_s - running_requests = { - replica_id: self.metrics_store.window_average( - replica_id, time.time() - look_back_period + self.metrics_store.prune_keys_and_compact_data( + time.time() - look_back_period + ) + for replica_id, num_requests in self.num_requests_sent_to_replicas.items(): + # Calculate avg running requests + avg_running_requests[replica_id] = ( + self.metrics_store.aggregate_avg([replica_id])[0] + # If data hasn't been recorded yet, return current + # number of queued and ongoing requests. + or num_requests + ) + # Get running requests data + running_requests[replica_id] = self.metrics_store.data.get( + replica_id, [TimeStampedValue(timestamp, num_requests)] ) - # If data hasn't been recorded yet, return current - # number of queued and ongoing requests. 
- or num_requests - for replica_id, num_requests in self.num_requests_sent_to_replicas.items() # noqa: E501 - } - return { - "queued_requests": self.num_queued_requests, - "running_requests": running_requests, - } + handle_metric_report = HandleMetricReport( + deployment_id=self._deployment_id, + handle_id=self._handle_id, + actor_id=self._self_actor_id, + handle_source=self._handle_source, + queued_requests=self.num_queued_requests, + aggregated_metrics={RUNNING_REQUESTS_KEY: avg_running_requests}, + metrics={RUNNING_REQUESTS_KEY: running_requests}, + timestamp=timestamp, + ) + + return handle_metric_report async def shutdown(self): """Shutdown metrics manager gracefully.""" @@ -467,6 +543,7 @@ def __init__( ), tag_keys=("deployment", "application", "handle", "actor_id"), ), + event_loop, ) # The Router needs to stay informed about changes to the target deployment's @@ -571,25 +648,26 @@ def update_deployment_config(self, deployment_config: DeploymentConfig): async def _resolve_request_arguments( self, - request_metadata: RequestMetadata, - request_args: Tuple[Any], - request_kwargs: Dict[str, Any], - ) -> Tuple[Tuple[Any], Dict[str, Any]]: + pr: PendingRequest, + ) -> None: """Asynchronously resolve and replace top-level request args and kwargs.""" - new_args = list(request_args) - new_kwargs = request_kwargs.copy() + if pr.resolved: + return + + new_args = list(pr.args) + new_kwargs = pr.kwargs.copy() # Map from index -> task for resolving positional arg resolve_arg_tasks = {} - for i, obj in enumerate(request_args): - task = await self._resolve_request_arg_func(obj, request_metadata) + for i, obj in enumerate(pr.args): + task = await self._resolve_request_arg_func(obj, pr.metadata) if task is not None: resolve_arg_tasks[i] = task # Map from key -> task for resolving key-word arg resolve_kwarg_tasks = {} - for k, obj in request_kwargs.items(): - task = await self._resolve_request_arg_func(obj, request_metadata) + for k, obj in pr.kwargs.items(): + task = await 
self._resolve_request_arg_func(obj, pr.metadata) if task is not None: resolve_kwarg_tasks[k] = task @@ -606,8 +684,9 @@ async def _resolve_request_arguments( for key, task in resolve_kwarg_tasks.items(): new_kwargs[key] = task.result() - # Return new args and new kwargs - return new_args, new_kwargs + pr.args = new_args + pr.kwargs = new_kwargs + pr.resolved = True def _process_finished_request( self, @@ -639,9 +718,99 @@ def _process_finished_request( f"Request failed because {replica_id} is temporarily unavailable." ) + async def _route_and_send_request_once( + self, + pr: PendingRequest, + response_id: str, + is_retry: bool, + ) -> Optional[ReplicaResult]: + result: Optional[ReplicaResult] = None + replica: Optional[RunningReplica] = None + try: + num_curr_replicas = len(self.request_router.curr_replicas) + with self._metrics_manager.wrap_queued_request(is_retry, num_curr_replicas): + # If the pending request is uninitialized, we do so by resolving the + # request arguments. This should only be done once per request, and + # should happen after incrementing `num_queued_requests`, so that Serve + # can upscale the downstream deployment while arguments are resolving. + if not pr.resolved: + await self._resolve_request_arguments(pr) + + replica = await self.request_router._choose_replica_for_request( + pr, is_retry=is_retry + ) + + # If the queue len cache is disabled or we're sending a request to Java, + # then directly send the query and hand the response back. The replica will + # never reject requests in this code path. + with_rejection = ( + self._enable_strict_max_ongoing_requests + and not replica.is_cross_language + ) + result = replica.try_send_request(pr, with_rejection=with_rejection) + + # Proactively update the queue length cache. 
+ self.request_router.on_send_request(replica.replica_id) + + # Keep track of requests that have been sent out to replicas + if RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE: + _request_context = ray.serve.context._get_serve_request_context() + request_id: str = _request_context.request_id + self._metrics_manager.inc_num_running_requests_for_replica( + replica.replica_id + ) + callback = partial( + self._process_finished_request, + replica.replica_id, + request_id, + response_id, + ) + result.add_done_callback(callback) + + if not with_rejection: + return result + + queue_info = await result.get_rejection_response() + self.request_router.on_new_queue_len_info(replica.replica_id, queue_info) + if queue_info.accepted: + self.request_router.on_request_routed(pr, replica.replica_id, result) + return result + + except asyncio.CancelledError: + # NOTE(edoakes): this is not strictly necessary because there are + # currently no `await` statements between getting the ref and returning, + # but I'm adding it defensively. + if result is not None: + result.cancel() + + raise + except ActorDiedError: + # Replica has died but controller hasn't notified the router yet. + # Don't consider this replica for requests in the future, and retry + # routing request. + if replica is not None: + self.request_router.on_replica_actor_died(replica.replica_id) + logger.warning( + f"{replica.replica_id} will not be considered for future " + "requests because it has died." + ) + except ActorUnavailableError: + # There are network issues, or replica has died but GCS is down so + # ActorUnavailableError will be raised until GCS recovers. For the + # time being, invalidate the cache entry so that we don't try to + # send requests to this replica without actively probing, and retry + # routing request. 
+ if replica is not None: + self.request_router.on_replica_actor_unavailable(replica.replica_id) + logger.warning(f"{replica.replica_id} is temporarily unavailable.") + + return None + async def route_and_send_request( - self, pr: PendingRequest - ) -> Tuple[ReplicaResult, ReplicaID]: + self, + pr: PendingRequest, + response_id: str, + ) -> ReplicaResult: """Choose a replica for the request and send it. This will block indefinitely if no replicas are available to handle the @@ -650,54 +819,21 @@ async def route_and_send_request( # Wait for the router to be initialized before sending the request. await self._request_router_initialized.wait() - r = await self.request_router._choose_replica_for_request(pr) - - # If the queue len cache is disabled or we're sending a request to Java, - # then directly send the query and hand the response back. The replica will - # never reject requests in this code path. - if not self._enable_strict_max_ongoing_requests or r.is_cross_language: - result, _ = await r.send_request(pr, with_rejection=False) - return result, r.replica_id - + is_retry = False while True: - result = None - try: - result, queue_info = await r.send_request(pr, with_rejection=True) - self.request_router.on_new_queue_len_info(r.replica_id, queue_info) - if queue_info.accepted: - self.request_router.on_request_routed(pr, r.replica_id, result) - return result, r.replica_id - except asyncio.CancelledError: - # NOTE(edoakes): this is not strictly necessary because there are - # currently no `await` statements between getting the ref and returning, - # but I'm adding it defensively. - if result is not None: - result.cancel() - - raise - except ActorDiedError: - # Replica has died but controller hasn't notified the router yet. - # Don't consider this replica for requests in the future, and retry - # routing request. 
- self.request_router.on_replica_actor_died(r.replica_id) - logger.warning( - f"{r.replica_id} will not be considered for future " - "requests because it has died." - ) - except ActorUnavailableError: - # There are network issues, or replica has died but GCS is down so - # ActorUnavailableError will be raised until GCS recovers. For the - # time being, invalidate the cache entry so that we don't try to - # send requests to this replica without actively probing, and retry - # routing request. - self.request_router.on_replica_actor_unavailable(r.replica_id) - logger.warning(f"{r.replica_id} is temporarily unavailable.") + result = await self._route_and_send_request_once( + pr, + response_id, + is_retry, + ) + if result is not None: + return result # If the replica rejects the request, retry the routing process. The # request will be placed on the front of the queue to avoid tail latencies. # TODO(edoakes): this retry procedure is not perfect because it'll reset the # process of choosing candidates replicas (i.e., for locality-awareness). - r = await self.request_router._choose_replica_for_request(pr, is_retry=True) + is_retry = True async def assign_request( self, @@ -725,41 +861,16 @@ async def assign_request( await self._request_router_initialized.wait() with self._metrics_manager.wrap_request_assignment(request_meta): - # Optimization: if there are currently zero replicas for a deployment, - # push handle metric to controller to allow for fast cold start time. 
- if self._metrics_manager.should_send_scaled_to_zero_optimized_push( - curr_num_replicas=len(self.request_router.curr_replicas) - ): - self._metrics_manager.push_autoscaling_metrics_to_controller() - replica_result = None try: - request_args, request_kwargs = await self._resolve_request_arguments( - request_meta, request_args, request_kwargs - ) - replica_result, replica_id = await self.route_and_send_request( + replica_result = await self.route_and_send_request( PendingRequest( args=list(request_args), kwargs=request_kwargs, metadata=request_meta, ), + response_id, ) - - # Keep track of requests that have been sent out to replicas - if RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE: - _request_context = ray.serve.context._get_serve_request_context() - request_id: str = _request_context.request_id - self._metrics_manager.inc_num_running_requests_for_replica( - replica_id - ) - callback = partial( - self._process_finished_request, - replica_id, - request_id, - response_id, - ) - replica_result.add_done_callback(callback) - return replica_result except asyncio.CancelledError: # NOTE(edoakes): this is not strictly necessary because @@ -858,17 +969,34 @@ def asyncio_future_callback( ) result.cancel() - task = self._asyncio_loop.create_task( - self._asyncio_router.assign_request( - request_meta, *request_args, **request_kwargs + concurrent_future = concurrent.futures.Future() + + def create_task_and_setup(): + task = self._asyncio_loop.create_task( + self._asyncio_router.assign_request( + request_meta, *request_args, **request_kwargs + ) ) - ) - # Route the actual request assignment coroutine on the asyncio loop thread. 
- concurrent_future = run_coroutine_or_future_threadsafe( - task, - loop=self._asyncio_loop, - ) - task.add_done_callback(lambda _: asyncio_future_callback(_, concurrent_future)) + + # Set up your cancellation callback + task.add_done_callback( + lambda _: asyncio_future_callback(_, concurrent_future) + ) + + try: + # chain the two futures to handle direction channel of cancellation + futures._chain_future( + ensure_future(task, loop=self._asyncio_loop), concurrent_future + ) + except (SystemExit, KeyboardInterrupt): + raise + except BaseException as exc: + if concurrent_future.set_running_or_notify_cancel(): + concurrent_future.set_exception(exc) + raise + + # Schedule on the event loop thread + self._asyncio_loop.call_soon_threadsafe(create_task_and_setup) return concurrent_future def shutdown(self) -> concurrent.futures.Future: diff --git a/python/ray/serve/_private/test_utils.py b/python/ray/serve/_private/test_utils.py index 1d95c41a4893..acdff7669a51 100644 --- a/python/ray/serve/_private/test_utils.py +++ b/python/ray/serve/_private/test_utils.py @@ -9,6 +9,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import grpc +import httpx import requests from starlette.requests import Request @@ -16,6 +17,7 @@ import ray.util.state as state_api from ray import serve from ray._common.network_utils import build_address +from ray._common.test_utils import wait_for_condition from ray.actor import ActorHandle from ray.serve._private.client import ServeControllerClient from ray.serve._private.common import ( @@ -715,6 +717,29 @@ def tlog(s: str, level: str = "INFO"): print(f"[{level}] {now} {s}") +def check_target_groups_ready( + client: ServeControllerClient, + app_name: str, + protocol: Union[str, RequestProtocol] = RequestProtocol.HTTP, +): + """Wait for target groups to be ready for the given app and protocol. + + Target groups are ready when there are at least one target for the given protocol. 
And it's + possible that target groups are not ready immediately. An example is when the controller + is recovering from a crash. + """ + target_groups = ray.get(client._controller.get_target_groups.remote(app_name)) + target_groups = [ + target_group + for target_group in target_groups + if target_group.protocol == protocol + ] + all_targets = [ + target for target_group in target_groups for target in target_group.targets + ] + return len(all_targets) > 0 + + def get_application_urls( protocol: Union[str, RequestProtocol] = RequestProtocol.HTTP, app_name: str = SERVE_DEFAULT_APP_NAME, @@ -741,7 +766,9 @@ def get_application_urls( app_name in serve_details["applications"] ), f"App {app_name} not found in serve details. Use this method only when the app is known to be running." route_prefix = serve_details["applications"][app_name]["route_prefix"] - if exclude_route_prefix: + # route_prefix is set to None when route_prefix value is specifically set to None + # in the config used to deploy the app. 
+ if exclude_route_prefix or route_prefix is None: route_prefix = "" if isinstance(protocol, str): protocol = RequestProtocol(protocol) @@ -753,7 +780,6 @@ def get_application_urls( for target_group in target_groups if target_group.protocol == protocol ] - if len(target_groups) == 0: raise ValueError( f"No target group found for app {app_name} with protocol {protocol} and route prefix {route_prefix}" @@ -812,3 +838,22 @@ def get_application_url( def check_running(app_name: str = SERVE_DEFAULT_APP_NAME): assert serve.status().applications[app_name].status == ApplicationStatus.RUNNING return True + + +def request_with_retries(timeout=30, app_name=SERVE_DEFAULT_APP_NAME): + result_holder = {"resp": None} + + def _attempt() -> bool: + try: + url = get_application_url("HTTP", app_name=app_name) + result_holder["resp"] = httpx.get(url, timeout=timeout) + return True + except (httpx.RequestError, IndexError): + return False + + try: + wait_for_condition(_attempt, timeout=timeout) + return result_holder["resp"] + except RuntimeError as e: + # Preserve previous API by raising TimeoutError on expiry + raise TimeoutError from e diff --git a/python/ray/serve/_private/utils.py b/python/ray/serve/_private/utils.py index 3cfad95ff0cd..065f6a8a301d 100644 --- a/python/ray/serve/_private/utils.py +++ b/python/ray/serve/_private/utils.py @@ -1,6 +1,5 @@ import asyncio import collections -import concurrent.futures import copy import importlib import inspect @@ -10,7 +9,6 @@ import time import uuid from abc import ABC, abstractmethod -from asyncio import coroutines, ensure_future, futures from decimal import ROUND_HALF_UP, Decimal from enum import Enum from functools import wraps @@ -27,7 +25,6 @@ from ray.actor import ActorHandle from ray.serve._private.common import RequestMetadata, ServeComponentType from ray.serve._private.constants import HTTP_PROXY_TIMEOUT, SERVE_LOGGER_NAME -from ray.serve.config import gRPCOptions from ray.types import ObjectRef from ray.util.serialization 
import StandaloneSerializationContext @@ -44,6 +41,27 @@ FILE_NAME_REGEX = r"[^\x20-\x7E]|[<>:\"/\\|?*]" MESSAGE_PACK_OFFSET = 9 + + +def validate_ssl_config( + ssl_certfile: Optional[str], ssl_keyfile: Optional[str] +) -> None: + """Validate SSL configuration for HTTPS support. + + Args: + ssl_certfile: Path to SSL certificate file + ssl_keyfile: Path to SSL private key file + + Raises: + ValueError: If only one of ssl_certfile or ssl_keyfile is provided + """ + if (ssl_certfile and not ssl_keyfile) or (ssl_keyfile and not ssl_certfile): + raise ValueError( + "Both ssl_keyfile and ssl_certfile must be provided together " + "to enable HTTPS." + ) + + GENERATOR_COMPOSITION_NOT_SUPPORTED_ERROR = RuntimeError( "Streaming deployment handle results cannot be passed to " "downstream handle calls. If you have a use case requiring " @@ -612,41 +630,10 @@ def wait_for_interrupt() -> None: raise -def is_grpc_enabled(grpc_config: gRPCOptions) -> bool: +def is_grpc_enabled(grpc_config) -> bool: return grpc_config.port > 0 and len(grpc_config.grpc_servicer_functions) > 0 -def run_coroutine_or_future_threadsafe(coro_or_future, loop): - """Submit a coroutine object or future to a given event loop. - - Ref: https://github.com/python/cpython/blob/eef49c359505eaf109d519d39e53dfd3c78d066a/Lib/asyncio/tasks.py#L991 - - Return a concurrent.futures.Future to access the result. 
- """ - if not coroutines.iscoroutine(coro_or_future) and not futures.isfuture( - coro_or_future - ): - raise TypeError("A coroutine object or future is required") - - if futures.isfuture(coro_or_future): - assert loop == coro_or_future.get_loop() - - future = concurrent.futures.Future() - - def callback(): - try: - futures._chain_future(ensure_future(coro_or_future, loop=loop), future) - except (SystemExit, KeyboardInterrupt): - raise - except BaseException as exc: - if future.set_running_or_notify_cancel(): - future.set_exception(exc) - raise - - loop.call_soon_threadsafe(callback) - return future - - class Semaphore: """Based on asyncio.Semaphore. diff --git a/python/ray/serve/_private/version.py b/python/ray/serve/_private/version.py index 9242dfc928e9..1c064a9a9dc7 100644 --- a/python/ray/serve/_private/version.py +++ b/python/ray/serve/_private/version.py @@ -22,6 +22,7 @@ def __init__( placement_group_bundles: Optional[List[Dict[str, float]]] = None, placement_group_strategy: Optional[str] = None, max_replicas_per_node: Optional[int] = None, + route_prefix: Optional[str] = None, ): if code_version is not None and not isinstance(code_version, str): raise TypeError(f"code_version must be str, got {type(code_version)}.") @@ -37,12 +38,16 @@ def __init__( self.placement_group_bundles = placement_group_bundles self.placement_group_strategy = placement_group_strategy self.max_replicas_per_node = max_replicas_per_node + self.route_prefix = route_prefix self.compute_hashes() @classmethod - def from_deployment_version(cls, deployment_version, deployment_config): + def from_deployment_version( + cls, deployment_version, deployment_config, route_prefix: Optional[str] = None + ): version_copy = deepcopy(deployment_version) version_copy.deployment_config = deployment_config + version_copy.route_prefix = route_prefix version_copy.compute_hashes() return version_copy @@ -95,11 +100,15 @@ def compute_hashes(self): combined_placement_group_options ) 
self.placement_group_options_hash = crc32(serialized_placement_group_options) + # Include app-level route prefix in the version hashes so changing + # it triggers an in-place reconfigure of running replicas. + serialized_route_prefix = _serialize(self.route_prefix) # If this changes, DeploymentReplica.reconfigure() will call reconfigure on the # actual replica actor self.reconfigure_actor_hash = crc32( - self._get_serialized_options( + serialized_route_prefix + + self._get_serialized_options( [DeploymentOptionUpdateType.NeedsActorReconfigure] ) ) @@ -111,6 +120,7 @@ def compute_hashes(self): + serialized_ray_actor_options + serialized_placement_group_options + str(self.max_replicas_per_node).encode("utf-8") + + serialized_route_prefix + self._get_serialized_options( [ DeploymentOptionUpdateType.NeedsReconfigure, diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py index 8a852d6a1dc9..5ae14f3c8afd 100644 --- a/python/ray/serve/api.py +++ b/python/ray/serve/api.py @@ -138,6 +138,26 @@ def shutdown(): _set_global_client(None) +@PublicAPI(stability="alpha") +async def shutdown_async(): + """Completely shut down Serve on the cluster asynchronously. + + Deletes all applications and shuts down Serve system actors. + """ + + try: + client = _get_global_client() + except RayServeException: + logger.info( + "Nothing to shut down. There's no Serve application " + "running on this Ray cluster." + ) + return + + await client.shutdown_async() + _set_global_client(None) + + @DeveloperAPI def get_replica_context() -> ReplicaContext: """Returns the deployment and replica tag from within a replica at runtime. 
diff --git a/python/ray/serve/autoscaling_policy.py b/python/ray/serve/autoscaling_policy.py index 2cabe736a870..eb443a46c03a 100644 --- a/python/ray/serve/autoscaling_policy.py +++ b/python/ray/serve/autoscaling_policy.py @@ -1,7 +1,8 @@ import logging import math -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Tuple +from ray.serve._private.autoscaling_state import AutoscalingContext from ray.serve._private.constants import CONTROL_LOOP_INTERVAL_S, SERVE_LOGGER_NAME from ray.serve.config import AutoscalingConfig from ray.util.annotations import PublicAPI @@ -83,14 +84,8 @@ def _calculate_desired_num_replicas( @PublicAPI(stability="alpha") def replica_queue_length_autoscaling_policy( - curr_target_num_replicas: int, - total_num_requests: int, - num_running_replicas: int, - config: Optional[AutoscalingConfig], - capacity_adjusted_min_replicas: int, - capacity_adjusted_max_replicas: int, - policy_state: Dict[str, Any], -) -> int: + ctx: AutoscalingContext, +) -> Tuple[int, Dict[str, Any]]: """The default autoscaling policy based on basic thresholds for scaling. There is a minimum threshold for the average queue length in the cluster to scale up and a maximum threshold to scale down. Each period, a 'scale @@ -100,15 +95,26 @@ def replica_queue_length_autoscaling_policy( `get_decision_num_replicas` is called once every CONTROL_LOOP_PERIOD_S seconds. 
""" + + curr_target_num_replicas: int = ctx.target_num_replicas + total_num_requests: int = ctx.total_num_requests + num_running_replicas: int = ctx.current_num_replicas + config: Optional[AutoscalingConfig] = ctx.config + capacity_adjusted_min_replicas: int = ctx.capacity_adjusted_min_replicas + capacity_adjusted_max_replicas: int = ctx.capacity_adjusted_max_replicas + policy_state: Dict[str, Any] = ctx.policy_state decision_counter = policy_state.get("decision_counter", 0) if num_running_replicas == 0: # When 0 replicas and queries are queued, scale up the replicas if total_num_requests > 0: - return max( - math.ceil(1 * config.get_upscaling_factor()), - curr_target_num_replicas, + return ( + max( + math.ceil(1 * config.get_upscaling_factor()), + curr_target_num_replicas, + ), + policy_state, ) - return curr_target_num_replicas + return curr_target_num_replicas, policy_state decision_num_replicas = curr_target_num_replicas @@ -153,7 +159,7 @@ def replica_queue_length_autoscaling_policy( decision_counter = 0 policy_state["decision_counter"] = decision_counter - return decision_num_replicas + return decision_num_replicas, policy_state default_autoscaling_policy = replica_queue_length_autoscaling_policy diff --git a/python/ray/serve/batching.py b/python/ray/serve/batching.py index 41c1b64e516a..ab16fe47e962 100644 --- a/python/ray/serve/batching.py +++ b/python/ray/serve/batching.py @@ -45,6 +45,7 @@ class _SingleRequest: self_arg: Any flattened_args: List[Any] future: asyncio.Future + request_context: serve.context._RequestContext @dataclass @@ -306,6 +307,9 @@ async def _assign_func_results( async def _process_batches(self, func: Callable) -> None: """Loops infinitely and processes queued request batches.""" + # When asyncio task is created, the task will inherit the request context from the current context. + # So we unset the request context so the current context is not inherited by the task, _process_batch. 
+ serve.context._unset_request_context() while not self._loop.is_closed(): batch = await self.wait_for_batch() promise = self._process_batch(func, batch) @@ -343,6 +347,11 @@ async def _process_batch(self, func: Callable, batch: List[_SingleRequest]) -> N else: func_future_or_generator = func(*args, **kwargs) + # Add individual request context to the batch request context + serve.context._set_batch_request_context( + [req.request_context for req in batch] + ) + if isasyncgenfunction(func): func_generator = func_future_or_generator await self._consume_func_generator( @@ -352,6 +361,8 @@ async def _process_batch(self, func: Callable, batch: List[_SingleRequest]) -> N func_future = func_future_or_generator await self._assign_func_results(func_future, futures, len(batch)) + # Reset the batch request context after the batch is processed + serve.context._set_batch_request_context([]) except Exception as e: logger.exception("_process_batch ran into an unexpected exception.") @@ -690,7 +701,10 @@ def enqueue_request(args, kwargs) -> asyncio.Future: batch_queue = lazy_batch_queue_wrapper.queue future = get_or_create_event_loop().create_future() - batch_queue.put(_SingleRequest(self, flattened_args, future)) + request_context = serve.context._get_serve_request_context() + batch_queue.put( + _SingleRequest(self, flattened_args, future, request_context) + ) return future @wraps(_func) diff --git a/python/ray/serve/config.py b/python/ray/serve/config.py index d06c8f8cddf4..650f62f71b44 100644 --- a/python/ray/serve/config.py +++ b/python/ray/serve/config.py @@ -17,7 +17,7 @@ ) from ray._common.utils import import_attr from ray.serve._private.constants import ( - DEFAULT_AUTOSCALING_POLICY, + DEFAULT_AUTOSCALING_POLICY_NAME, DEFAULT_GRPC_PORT, DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT, @@ -28,6 +28,7 @@ DEFAULT_UVICORN_KEEP_ALIVE_TIMEOUT_S, SERVE_LOGGER_NAME, ) +from ray.serve._private.utils import validate_ssl_config from ray.util.annotations import Deprecated, PublicAPI logger = 
logging.getLogger(SERVE_LOGGER_NAME) @@ -162,6 +163,15 @@ def get_request_router_class(self) -> Callable: DEFAULT_METRICS_INTERVAL_S = 10.0 +@PublicAPI(stability="alpha") +class AutoscalingPolicy(BaseModel): + name: Union[str, Callable] = Field( + default=DEFAULT_AUTOSCALING_POLICY_NAME, + description="Name of the policy function or the import path of the policy. " + "Will be the concatenation of the policy module and the policy name if user passed a callable.", + ) + + @PublicAPI(stability="stable") class AutoscalingConfig(BaseModel): """Config for the Serve Autoscaler.""" @@ -174,7 +184,7 @@ class AutoscalingConfig(BaseModel): initial_replicas: Optional[NonNegativeInt] = None max_replicas: PositiveInt = 1 - target_ongoing_requests: PositiveFloat = DEFAULT_TARGET_ONGOING_REQUESTS + target_ongoing_requests: Optional[PositiveFloat] = DEFAULT_TARGET_ONGOING_REQUESTS metrics_interval_s: PositiveFloat = Field( default=DEFAULT_METRICS_INTERVAL_S, @@ -222,8 +232,11 @@ class AutoscalingConfig(BaseModel): # Cloudpickled policy definition. _serialized_policy_def: bytes = PrivateAttr(default=b"") - # Custom autoscaling config. Defaults to the request-based autoscaler. - _policy: Union[str, Callable] = PrivateAttr(default=DEFAULT_AUTOSCALING_POLICY) + # Autoscaling policy. This policy is deployment scoped. Defaults to the request-based autoscaler. + policy: AutoscalingPolicy = Field( + default_factory=AutoscalingPolicy, + description="The autoscaling policy for the deployment. This option is experimental.", + ) @validator("max_replicas", always=True) def replicas_settings_valid(cls, max_replicas, values): @@ -271,20 +284,16 @@ def serialize_policy(self) -> None: Import the policy if it's passed in as a string import path. Then cloudpickle the policy and set `serialized_policy_def` if it's empty. 
""" - values = self.dict() - policy = values.get("_policy") - if isinstance(policy, Callable): - policy = f"{policy.__module__}.{policy.__name__}" + policy = self.policy + policy_name = policy.name - if not policy: - policy = DEFAULT_AUTOSCALING_POLICY + if isinstance(policy_name, Callable): + policy_name = f"{policy_name.__module__}.{policy_name.__name__}" - policy_path = policy - policy = import_attr(policy) + if not self._serialized_policy_def: + self._serialized_policy_def = cloudpickle.dumps(import_attr(policy_name)) - if not values.get("_serialized_policy_def"): - self._serialized_policy_def = cloudpickle.dumps(policy) - self._policy = policy_path + self.policy = AutoscalingPolicy(name=policy_name) @classmethod def default(cls): @@ -394,6 +403,13 @@ class HTTPOptions(BaseModel): - request_timeout_s: End-to-end timeout for HTTP requests. - keep_alive_timeout_s: Duration to keep idle connections alive when no requests are ongoing. + - ssl_keyfile: Path to the SSL key file for HTTPS. If provided with + ssl_certfile, the HTTP server will use HTTPS. + - ssl_certfile: Path to the SSL certificate file for HTTPS. If provided + with ssl_keyfile, the HTTP server will use HTTPS. + - ssl_keyfile_password: Optional password for the SSL key file. + - ssl_ca_certs: Optional path to CA certificate file for client certificate + verification. 
- location: [DEPRECATED: use `proxy_location` field instead] The deployment location of HTTP servers: @@ -417,6 +433,10 @@ class HTTPOptions(BaseModel): root_path: str = "" request_timeout_s: Optional[float] = None keep_alive_timeout_s: int = DEFAULT_UVICORN_KEEP_ALIVE_TIMEOUT_S + ssl_keyfile: Optional[str] = None + ssl_certfile: Optional[str] = None + ssl_keyfile_password: Optional[str] = None + ssl_ca_certs: Optional[str] = None @validator("location", always=True) def location_backfill_no_server(cls, v, values): @@ -425,6 +445,12 @@ def location_backfill_no_server(cls, v, values): return v + @validator("ssl_certfile") + def validate_ssl_certfile(cls, v, values): + ssl_keyfile = values.get("ssl_keyfile") + validate_ssl_config(v, ssl_keyfile) + return v + @validator("middlewares", always=True) def warn_for_middlewares(cls, v, values): if v: diff --git a/python/ray/serve/context.py b/python/ray/serve/context.py index ecb412aab37b..b6ad7bb2a685 100644 --- a/python/ray/serve/context.py +++ b/python/ray/serve/context.py @@ -8,7 +8,7 @@ import logging from collections import defaultdict from dataclasses import dataclass -from typing import Callable, Dict, Optional +from typing import Callable, Dict, List, Optional import ray from ray.exceptions import RayActorError @@ -41,11 +41,15 @@ class ReplicaContext: - deployment: name of the deployment the replica is a part of. - replica_tag: unique ID for the replica. - servable_object: instance of the user class/function this replica is running. + - rank: the rank of the replica. + - world_size: the number of replicas in the deployment. 
""" replica_id: ReplicaID servable_object: Callable _deployment_config: DeploymentConfig + rank: int + world_size: int @property def app_name(self) -> str: @@ -108,12 +112,16 @@ def _set_internal_replica_context( replica_id: ReplicaID, servable_object: Callable, _deployment_config: DeploymentConfig, + rank: int, + world_size: int, ): global _INTERNAL_REPLICA_CONTEXT _INTERNAL_REPLICA_CONTEXT = ReplicaContext( replica_id=replica_id, servable_object=servable_object, _deployment_config=_deployment_config, + rank=rank, + world_size=world_size, ) @@ -187,6 +195,10 @@ class _RequestContext: "Serve internal request context variable", default=None ) +_serve_batch_request_context = contextvars.ContextVar( + "Serve internal batching request context variable", default=None +) + def _get_serve_request_context(): """Get the current request context. @@ -200,6 +212,13 @@ def _get_serve_request_context(): return _serve_request_context.get() +def _get_serve_batch_request_context(): + """Get the list of request contexts for the current batch.""" + if _serve_batch_request_context.get() is None: + _serve_batch_request_context.set([]) + return _serve_batch_request_context.get() + + def _set_request_context( route: str = "", request_id: str = "", @@ -225,6 +244,16 @@ def _set_request_context( ) +def _unset_request_context(): + """Unset the request context.""" + _serve_request_context.set(_RequestContext()) + + +def _set_batch_request_context(request_contexts: List[_RequestContext]): + """Add the request context to the batch request context.""" + _serve_batch_request_context.set(request_contexts) + + # `_requests_pending_assignment` is a map from request ID to a # dictionary of asyncio tasks. 
# The request ID points to an ongoing request that is executing on the diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index f95e593655e9..f5ecf3661d83 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -9,6 +9,7 @@ from ray import serve from ray._raylet import ObjectRefGenerator from ray.serve._private.common import ( + OBJ_REF_NOT_SUPPORTED_ERROR, DeploymentHandleSource, DeploymentID, RequestMetadata, @@ -211,17 +212,17 @@ def _remote( def __getattr__(self, name): return self.options(method_name=name) - def shutdown(self, _skip_asyncio_check: bool = False): + def shutdown(self): if self._router: shutdown_future = self._router.shutdown() if self._is_router_running_in_separate_loop(): shutdown_future.result() else: - if not _skip_asyncio_check: - raise RuntimeError( - "Sync methods should not be called from within an `asyncio` event " - "loop. Use `await handle.shutdown_async()` instead." - ) + logger.warning( + "Synchronously shutting down a router that's running in the same " + "event loop can only be done best effort. Please use " + "`shutdown_async` instead." 
+ ) async def shutdown_async(self): if self._router: @@ -269,6 +270,10 @@ def __init__( def request_id(self) -> str: return self._request_metadata.request_id + @property + def by_reference(self) -> bool: + return self._request_metadata._by_reference + def _fetch_future_result_sync( self, _timeout_s: Optional[float] = None ) -> ReplicaResult: @@ -499,6 +504,9 @@ async def _to_object_ref(self) -> ray.ObjectRef: ServeUsageTag.DEPLOYMENT_HANDLE_TO_OBJECT_REF_API_USED.record("1") + if not self._request_metadata._by_reference: + raise OBJ_REF_NOT_SUPPORTED_ERROR + replica_result = await self._fetch_future_result_async() return await replica_result.to_object_ref_async() @@ -523,6 +531,9 @@ def _to_object_ref_sync( ServeUsageTag.DEPLOYMENT_HANDLE_TO_OBJECT_REF_API_USED.record("1") + if not self._request_metadata._by_reference: + raise OBJ_REF_NOT_SUPPORTED_ERROR + if not _allow_running_in_asyncio_loop and is_running_in_asyncio_loop(): raise RuntimeError( "Sync methods should not be called from within an `asyncio` event " @@ -640,6 +651,9 @@ async def _to_object_ref_gen(self) -> ObjectRefGenerator: ServeUsageTag.DEPLOYMENT_HANDLE_TO_OBJECT_REF_API_USED.record("1") + if not self._request_metadata._by_reference: + raise OBJ_REF_NOT_SUPPORTED_ERROR + replica_result = await self._fetch_future_result_async() return replica_result.to_object_ref_gen() @@ -661,6 +675,9 @@ def _to_object_ref_gen_sync( ServeUsageTag.DEPLOYMENT_HANDLE_TO_OBJECT_REF_API_USED.record("1") + if not self._request_metadata._by_reference: + raise OBJ_REF_NOT_SUPPORTED_ERROR + if not _allow_running_in_asyncio_loop and is_running_in_asyncio_loop(): raise RuntimeError( "Sync methods should not be called from within an `asyncio` event " diff --git a/python/ray/serve/llm/__init__.py b/python/ray/serve/llm/__init__.py index 6bee92952088..178268965db5 100644 --- a/python/ray/serve/llm/__init__.py +++ b/python/ray/serve/llm/__init__.py @@ -1,5 +1,6 @@ -from typing import TYPE_CHECKING, Optional +from typing import 
TYPE_CHECKING, Any, Dict, Optional, Union +from ray._common.deprecation import Deprecated from ray.llm._internal.serve.configs.server_models import ( CloudMirrorConfig as _CloudMirrorConfig, LLMConfig as _LLMConfig, @@ -15,10 +16,6 @@ from ray.llm._internal.serve.deployments.routers.router import ( LLMRouter as _LLMRouter, ) - -# Using Deprecated from rllib since they are retuning better messages. -# TODO: Ray core should inherit that. -from ray.rllib.utils.deprecation import Deprecated from ray.util.annotations import PublicAPI if TYPE_CHECKING: @@ -93,7 +90,10 @@ class LLMRouter(_LLMRouter): @PublicAPI(stability="alpha") def build_llm_deployment( - llm_config: "LLMConfig", *, name_prefix: Optional[str] = None + llm_config: "LLMConfig", + *, + name_prefix: Optional[str] = None, + override_serve_options: Optional[dict] = None, ) -> "Application": """Helper to build a single vllm deployment from the given llm config. @@ -150,17 +150,24 @@ async def query_model(model_handle): Args: llm_config: The llm config to build vllm deployment. name_prefix: Optional prefix to be used for the deployment name. + override_serve_options: Optional serve options to override the original serve options based on the llm_config. Returns: The configured Ray Serve Application for vllm deployment. """ from ray.llm._internal.serve.builders import build_llm_deployment - return build_llm_deployment(llm_config=llm_config, name_prefix=name_prefix) + return build_llm_deployment( + llm_config=llm_config, + name_prefix=name_prefix, + override_serve_options=override_serve_options, + ) @PublicAPI(stability="alpha") -def build_openai_app(llm_serving_args: "LLMServingArgs") -> "Application": +def build_openai_app( + llm_serving_args: Union["LLMServingArgs", Dict[str, Any]] +) -> "Application": """Helper to build an OpenAI compatible app with the llm deployment setup from the given llm serving args. This is the main entry point for users to create a Serve application serving LLMs. 
@@ -252,8 +259,8 @@ def build_openai_app(llm_serving_args: "LLMServingArgs") -> "Application": Args: - llm_serving_args: The list of llm configs or the paths to the llm config to - build the app. + llm_serving_args: Either a dict with "llm_configs" key containing a list of + LLMConfig objects, or an LLMServingArgs object. Returns: The configured Ray Serve Application router. diff --git a/python/ray/serve/schema.py b/python/ray/serve/schema.py index 607097fee8a7..055420eff9a4 100644 --- a/python/ray/serve/schema.py +++ b/python/ray/serve/schema.py @@ -2,7 +2,7 @@ from collections import Counter from dataclasses import dataclass, field from enum import Enum -from typing import Any, Dict, List, Optional, Set, Union +from typing import Any, Callable, Dict, List, Optional, Set, Union from zlib import crc32 from ray._common.pydantic_compat import ( @@ -32,7 +32,7 @@ SERVE_DEFAULT_APP_NAME, ) from ray.serve._private.deployment_info import DeploymentInfo -from ray.serve._private.utils import DEFAULT +from ray.serve._private.utils import DEFAULT, validate_ssl_config from ray.serve.config import ProxyLocation, RequestRouterConfig from ray.util.annotations import PublicAPI @@ -713,6 +713,31 @@ class HTTPOptionsSchema(BaseModel): "before closing them when no requests are ongoing. Defaults to " f"{DEFAULT_UVICORN_KEEP_ALIVE_TIMEOUT_S} seconds.", ) + ssl_keyfile: Optional[str] = Field( + default=None, + description="Path to the SSL key file for HTTPS. If provided with ssl_certfile, " + "the HTTP server will use HTTPS. Cannot be updated once Serve has started.", + ) + ssl_certfile: Optional[str] = Field( + default=None, + description="Path to the SSL certificate file for HTTPS. If provided with " + "ssl_keyfile, the HTTP server will use HTTPS. 
Cannot be updated once Serve " + "has started.", + ) + ssl_keyfile_password: Optional[str] = Field( + default=None, + description="Password for the SSL key file, if encrypted.", + ) + ssl_ca_certs: Optional[str] = Field( + default=None, + description="Path to the CA certificate file for verifying client certificates.", + ) + + @validator("ssl_certfile") + def validate_ssl_certfile(cls, v, values): + ssl_keyfile = values.get("ssl_keyfile") + validate_ssl_config(v, ssl_keyfile) + return v @PublicAPI(stability="stable") @@ -953,6 +978,63 @@ class ReplicaDetails(ServeActorDetails, frozen=True): ) +@PublicAPI(stability="alpha") +class AutoscalingMetricsHealth(str, Enum): + HEALTHY = "healthy" + DELAYED = "delayed" + UNAVAILABLE = "unavailable" + + +@PublicAPI(stability="alpha") +class AutoscalingStatus(str, Enum): + UPSCALING = "UPSCALING" + DOWNSCALING = "DOWNSCALING" + STABLE = "STABLE" + + +@PublicAPI(stability="alpha") +class ScalingDecision(BaseModel): + """One autoscaling decision with minimal provenance.""" + + timestamp_s: float = Field( + ..., description="Unix time (seconds) when the decision was made." + ) + reason: str = Field( + ..., description="Short, human-readable reason for the decision." + ) + prev_num_replicas: int = Field( + ..., ge=0, description="Replica count before the decision." + ) + curr_num_replicas: int = Field( + ..., ge=0, description="Replica count after the decision." + ) + policy: Optional[str] = Field( + None, description="Policy name or identifier (if applicable)." + ) + + +@PublicAPI(stability="alpha") +class DeploymentAutoscalingDetail(BaseModel): + """Deployment-level autoscaler observability.""" + + scaling_status: AutoscalingStatus = Field( + ..., description="Current scaling direction or stability." + ) + decisions: List[ScalingDecision] = Field( + default_factory=list, description="Recent scaling decisions." + ) + metrics: Optional[Dict[str, Any]] = Field( + None, description="Aggregated metrics for this deployment." 
+ ) + metrics_health: AutoscalingMetricsHealth = Field( + AutoscalingMetricsHealth.HEALTHY, + description="Health of metrics collection pipeline.", + ) + errors: List[str] = Field( + default_factory=list, description="Recent errors/abnormal events." + ) + + @PublicAPI(stability="stable") class DeploymentDetails(BaseModel, extra=Extra.forbid, frozen=True): """ @@ -993,6 +1075,11 @@ class DeploymentDetails(BaseModel, extra=Extra.forbid, frozen=True): description="Details about the live replicas of this deployment." ) + autoscaling_detail: Optional[DeploymentAutoscalingDetail] = Field( + default=None, + description="[EXPERIMENTAL] Deployment-level autoscaler observability for this deployment.", + ) + @PublicAPI(stability="alpha") class APIType(str, Enum): @@ -1002,6 +1089,14 @@ class APIType(str, Enum): IMPERATIVE = "imperative" DECLARATIVE = "declarative" + @classmethod + def get_valid_user_values(cls): + """Get list of valid APIType values that users can explicitly pass. + + Excludes 'unknown' which is for internal use only. + """ + return [cls.IMPERATIVE.value, cls.DECLARATIVE.value] + @PublicAPI(stability="stable") class ApplicationDetails(BaseModel, extra=Extra.forbid, frozen=True): @@ -1202,3 +1297,64 @@ def _get_user_facing_json_serializable_dict( ) return values + + +@PublicAPI(stability="alpha") +class CeleryAdapterConfig(BaseModel): + """ + Celery adapter config. You can use it to configure the Celery task processor for your Serve application. + """ + + broker_url: str = Field(..., description="The URL of the broker to use for Celery.") + backend_url: str = Field( + ..., description="The URL of the backend to use for Celery." + ) + broker_transport_options: Optional[Dict[str, Any]] = Field( + default=None, description="The broker transport options to use for Celery." 
+ ) + worker_concurrency: Optional[int] = Field( + default=10, + description="The number of concurrent worker threads for the task processor.", + ) + + +@PublicAPI(stability="alpha") +class TaskProcessorConfig(BaseModel): + """ + Task processor config. You can use it to configure the task processor for your Serve application. + """ + + queue_name: str = Field( + ..., description="The name of the queue to use for task processing." + ) + adapter: Union[str, Callable] = Field( + default="ray.serve.task_processor.CeleryTaskProcessorAdapter", + description="The adapter to use for task processing. By default, Celery is used.", + ) + adapter_config: Any = Field(..., description="The adapter config.") + max_retries: Optional[int] = Field( + default=3, + description="The maximum number of times to retry a task before marking it as failed.", + ) + failed_task_queue_name: Optional[str] = Field( + default=None, + description="The name of the failed task queue. This is used to move failed tasks to a dead-letter queue after max retries.", + ) + unprocessable_task_queue_name: Optional[str] = Field( + default=None, + description="The name of the unprocessable task queue. This is used to move unprocessable tasks(like tasks with serialization issue, or missing handler) to a dead-letter queue.", + ) + + +@PublicAPI(stability="alpha") +class TaskResult(BaseModel): + """ + Task result Model. + """ + + id: str = Field(..., description="The ID of the task.") + status: str = Field(..., description="The status of the task.") + created_at: Optional[float] = Field( + default=None, description="The timestamp of the task creation." 
+ ) + result: Any = Field(..., description="The result of the task.") diff --git a/python/ray/serve/task_consumer.py b/python/ray/serve/task_consumer.py new file mode 100644 index 000000000000..da1266969980 --- /dev/null +++ b/python/ray/serve/task_consumer.py @@ -0,0 +1,206 @@ +import inspect +import logging +from functools import wraps +from typing import Callable, Optional + +from ray._common.utils import import_attr +from ray.serve._private.constants import SERVE_LOGGER_NAME +from ray.serve.schema import TaskProcessorConfig +from ray.serve.task_processor import TaskProcessorAdapter +from ray.util.annotations import PublicAPI + +logger = logging.getLogger(SERVE_LOGGER_NAME) + + +@PublicAPI(stability="alpha") +def instantiate_adapter_from_config( + task_processor_config: TaskProcessorConfig, +) -> TaskProcessorAdapter: + """ + Create a TaskProcessorAdapter instance from the provided configuration and call .initialize(). This function supports two ways to specify an adapter: + + 1. String path: A fully qualified module path to an adapter class + Example: "ray.serve.task_processor.CeleryTaskProcessorAdapter" + + 2. Class reference: A direct reference to an adapter class + Example: CeleryTaskProcessorAdapter + + Args: + task_processor_config: Configuration object containing adapter specification. + + Returns: + An initialized TaskProcessorAdapter instance ready for use. + + Raises: + ValueError: If the adapter string path is malformed or cannot be imported. + TypeError: If the adapter is not a string or callable class. + + Example: + .. 
code-block:: python + + config = TaskProcessorConfig( + adapter="my.module.CustomAdapter", + adapter_config={"param": "value"}, + queue_name="my_queue" + ) + adapter = instantiate_adapter_from_config(config) + """ + + adapter = task_processor_config.adapter + + # Handle string-based adapter specification (module path) + if isinstance(adapter, str): + adapter_class = import_attr(adapter) + + elif callable(adapter): + adapter_class = adapter + + else: + raise TypeError( + f"Adapter must be either a string path or a callable class, got {type(adapter).__name__}: {adapter}" + ) + + try: + adapter_instance = adapter_class(config=task_processor_config) + except Exception as e: + raise RuntimeError(f"Failed to instantiate {adapter_class.__name__}: {e}") + + if not isinstance(adapter_instance, TaskProcessorAdapter): + raise TypeError( + f"{adapter_class.__name__} must inherit from TaskProcessorAdapter, got {type(adapter_instance).__name__}" + ) + + try: + adapter_instance.initialize() + except Exception as e: + raise RuntimeError(f"Failed to initialize {adapter_class.__name__}: {e}") + + return adapter_instance + + +@PublicAPI(stability="alpha") +def task_consumer(*, task_processor_config: TaskProcessorConfig): + """ + Decorator to mark a class as a TaskConsumer. + + Args: + task_processor_config: Configuration for the task processor (required) + + Note: + This decorator must be used with parentheses: + @task_consumer(task_processor_config=config) + + Returns: + A wrapper class that inherits from the target class and implements the task consumer functionality. + + Example: + .. 
code-block:: python + + from ray import serve + from ray.serve.task_consumer import task_consumer, task_handler + + @serve.deployment + @task_consumer(task_processor_config=config) + class MyTaskConsumer: + + @task_handler(name="my_task") + def my_task(self, *args, **kwargs): + pass + + """ + + def decorator(target_cls): + class TaskConsumerWrapper(target_cls): + _adapter: TaskProcessorAdapter + + def __init__(self, *args, **kwargs): + target_cls.__init__(self, *args, **kwargs) + + self._adapter = instantiate_adapter_from_config(task_processor_config) + + for name, method in inspect.getmembers( + target_cls, predicate=inspect.isfunction + ): + if getattr(method, "_is_task_handler", False): + task_name = getattr(method, "_task_name", name) + + # Create a callable that properly binds the method to this instance + bound_method = getattr(self, name) + + self._adapter.register_task_handle(bound_method, task_name) + + try: + self._adapter.start_consumer() + logger.info("task consumer started successfully") + except Exception as e: + logger.error(f"Failed to start task consumer: {e}") + raise + + def __del__(self): + self._adapter.stop_consumer() + self._adapter.shutdown() + + if hasattr(target_cls, "__del__"): + target_cls.__del__(self) + + return TaskConsumerWrapper + + return decorator + + +@PublicAPI(stability="alpha") +def task_handler( + _func: Optional[Callable] = None, *, name: Optional[str] = None +) -> Callable: + """ + Decorator to mark a method as a task handler. + Optionally specify a task name. Default is the method name. + + Arguments: + _func: The function to decorate. + name: The name of the task. Default is the method name. + + Returns: + A wrapper function that is marked as a task handler. + + Example: + .. 
code-block:: python + + from ray import serve + from ray.serve.task_consumer import task_consumer, task_handler + + @serve.deployment + @task_consumer(task_processor_config=config) + class MyTaskConsumer: + + @task_handler(name="my_task") + def my_task(self, *args, **kwargs): + pass + + """ + + # Validate name parameter if provided + if name is not None and (not isinstance(name, str) or not name.strip()): + raise ValueError(f"Task name must be a non-empty string, got {name}") + + def decorator(f): + # async functions are not supported yet in celery `threads` worker pool + if not inspect.iscoroutinefunction(f): + + @wraps(f) + def wrapper(*args, **kwargs): + return f(*args, **kwargs) + + wrapper._is_task_handler = True # type: ignore + wrapper._task_name = name or f.__name__ # type: ignore + return wrapper + + else: + raise NotImplementedError("Async task handlers are not supported yet") + + if _func is not None: + # Used without arguments: @task_handler + return decorator(_func) + else: + # Used with arguments: @task_handler(name="...") + return decorator diff --git a/python/ray/serve/task_processor.py b/python/ray/serve/task_processor.py new file mode 100644 index 000000000000..cd66d83ff354 --- /dev/null +++ b/python/ray/serve/task_processor.py @@ -0,0 +1,625 @@ +import json +import logging +import threading +import time +from abc import ABC, abstractmethod +from enum import Enum, auto +from typing import Any, Callable, Dict, List, Optional, Set + +from celery import Celery +from celery.signals import task_failure, task_unknown + +from ray.serve import get_replica_context +from ray.serve._private.constants import SERVE_LOGGER_NAME +from ray.serve.schema import ( + CeleryAdapterConfig, + TaskProcessorConfig, + TaskResult, +) +from ray.util.annotations import PublicAPI + +logger = logging.getLogger(SERVE_LOGGER_NAME) + + +@PublicAPI(stability="alpha") +class AsyncCapability(Enum): + """ + Enum defining different async capabilities a TaskProcessor can support. 
+ + Each capability represents an async operation that an adapter may or may not + support. Use TaskProcessorAdapter.supports_async_capability() to check if + a specific capability is available before using the corresponding async method. + """ + + ENQUEUE_TASK = auto() # Ability to enqueue tasks asynchronously + GET_TASK_STATUS = auto() # Ability to retrieve task status asynchronously + CANCEL_TASK = auto() # Ability to cancel tasks asynchronously + GET_METRICS = auto() # Ability to retrieve metrics asynchronously + HEALTH_CHECK = auto() # Ability to perform health checks asynchronously + + +def _json_dump(obj: Any) -> Any: + """Recursively make an object JSON serializable.""" + if isinstance(obj, dict): + return {k: _json_dump(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_json_dump(i) for i in obj] + try: + return json.dumps(obj) + except (TypeError, ValueError): + return str(obj) + + +@PublicAPI(stability="alpha") +class TaskProcessorAdapter(ABC): + """ + Abstract base class for task processing adapters. + + Subclasses can support different combinations of sync and async operations. + Use supports_async_capability() to check if a specific async operation is supported. + """ + + def __init__(self): + """ + Initialize the TaskProcessorAdapter. + + Sets up an empty set of async capabilities. Subclasses should add their + supported async capabilities to self._async_capabilities in their __init__ + method. + """ + self._async_capabilities: Set[AsyncCapability] = set() + + @property + def async_capabilities(self) -> Set[AsyncCapability]: + """ + Get the set of async capabilities supported by this adapter. + + Returns: + Set[AsyncCapability]: A copy of the set containing all async capabilities + supported by this adapter. Modifying the returned set will not affect + the adapter's capabilities. 
+ """ + return self._async_capabilities.copy() + + def supports_async_capability(self, capability: AsyncCapability) -> bool: + """ + Check if this adapter supports a specific async capability. + + Args: + capability: The AsyncCapability enum value to check for. + + Returns: + bool: True if the capability is supported, False otherwise. + """ + return capability in self._async_capabilities + + def supports_any_async(self) -> bool: + """ + Check if this adapter supports any async operations. + + Returns: + bool: True if at least one async capability is supported, False if this is a sync-only adapter. + """ + return len(self._async_capabilities) > 0 + + @abstractmethod + def initialize(self): + """ + Initialize the task processor. + """ + pass + + @abstractmethod + def register_task_handle(self, func: Callable, name: Optional[str] = None): + """ + Register a function as a task handler. + + Args: + func: The function to register as a task handler. + name: Custom name for the task. + """ + pass + + @abstractmethod + def enqueue_task_sync( + self, + task_name: str, + args: Optional[Any] = None, + kwargs: Optional[Any] = None, + **options, + ) -> TaskResult: + """ + Enqueue a task for execution synchronously. + + Args: + task_name: Name of the registered task to execute. + args: Positional arguments to pass to the task function. + kwargs: Keyword arguments to pass to the task function. + **options: Additional adapter-specific options for task execution. + + Returns: + TaskResult: Object containing task ID, status, and other metadata. + """ + pass + + @abstractmethod + def get_task_status_sync(self, task_id: str) -> TaskResult: + """ + Retrieve the current status of a task synchronously. + + Args: + task_id: Unique identifier of the task to query. + + Returns: + TaskResult: Object containing current task status, result, and other metadata. + """ + pass + + @abstractmethod + def start_consumer(self, **kwargs): + """ + Start the task consumer/worker process. 
+ """ + pass + + @abstractmethod + def stop_consumer(self, timeout: float = 10.0): + """ + Stop the task consumer gracefully. + + Args: + timeout: Maximum time in seconds to wait for the consumer to stop. + """ + pass + + @abstractmethod + def shutdown(self): + """ + Shutdown the task processor and clean up resources. + """ + pass + + @abstractmethod + def cancel_task_sync(self, task_id: str): + """ + Cancel a task synchronously. + + Args: + task_id: Unique identifier of the task to cancel. + """ + pass + + @abstractmethod + def get_metrics_sync(self) -> Dict[str, Any]: + """ + Get metrics synchronously. + + Returns: + Dict[str, Any]: Adapter-specific metrics data. + """ + pass + + @abstractmethod + def health_check_sync(self) -> List[Dict]: + """ + Perform health check synchronously. + + Returns: + List[Dict]: Health status information for workers/components. + """ + pass + + async def enqueue_task_async( + self, + task_name: str, + args: Optional[Any] = None, + kwargs: Optional[Any] = None, + **options, + ) -> TaskResult: + """ + Enqueue a task asynchronously. + + Args: + task_name: Name of the registered task to execute. + args: Positional arguments to pass to the task function. + kwargs: Keyword arguments to pass to the task function. + **options: Additional adapter-specific options for task execution. + + Returns: + TaskResult: Object containing task ID, status, and other metadata. + + Raises: + NotImplementedError: If async enqueue is not supported by this adapter. + """ + if not self.supports_async_capability(AsyncCapability.ENQUEUE_TASK): + raise NotImplementedError( + f"{self.__class__.__name__} does not support async task enqueueing. " + f"Use enqueue_task_sync() instead or check supports_async_capability() first." + ) + + raise NotImplementedError("Subclass must implement enqueue_task_async function") + + async def get_task_status_async(self, task_id: str) -> TaskResult: + """ + Get task status asynchronously. 
+ + Args: + task_id: Unique identifier of the task to query. + + Returns: + TaskResult: Object containing current task status, result, and other metadata. + + Raises: + NotImplementedError: If async status retrieval is not supported by this adapter. + """ + if not self.supports_async_capability(AsyncCapability.GET_TASK_STATUS): + raise NotImplementedError( + f"{self.__class__.__name__} does not support async task status retrieval. " + f"Use get_task_status_sync() instead or check supports_async_capability() first." + ) + + raise NotImplementedError( + "Subclass must implement get_task_status_async function" + ) + + async def cancel_task_async(self, task_id: str): + """ + Cancel a task. + + Args: + task_id: Unique identifier of the task to cancel. + + Raises: + NotImplementedError: If async task cancellation is not supported by this adapter. + """ + if not self.supports_async_capability(AsyncCapability.CANCEL_TASK): + raise NotImplementedError( + f"{self.__class__.__name__} does not support async task cancellation. " + f"Check supports_async_capability() first." + ) + + raise NotImplementedError("Subclass must implement cancel_task_async function") + + async def get_metrics_async(self) -> Dict[str, Any]: + """ + Get metrics asynchronously. + + Returns: + Dict[str, Any]: Adapter-specific metrics data. + + Raises: + NotImplementedError: If async metrics retrieval is not supported by this adapter. + """ + if not self.supports_async_capability(AsyncCapability.GET_METRICS): + raise NotImplementedError( + f"{self.__class__.__name__} does not support async metrics retrieval. " + f"Check supports_async_capability() first." + ) + + raise NotImplementedError("Subclass must implement get_metrics_async function") + + async def health_check_async(self) -> List[Dict]: + """ + Perform health check asynchronously. + + Returns: + List[Dict]: Health status information for workers/components. + + Raises: + NotImplementedError: If async health check is not supported by this adapter. 
+ """ + if not self.supports_async_capability(AsyncCapability.HEALTH_CHECK): + raise NotImplementedError( + f"{self.__class__.__name__} does not support async health check. " + f"Check supports_async_capability() first." + ) + + raise NotImplementedError("Subclass must implement health_check_async function") + + +@PublicAPI(stability="alpha") +class CeleryTaskProcessorAdapter(TaskProcessorAdapter): + """ + Celery-based task processor adapter. + This adapter does NOT support any async operations. + All operations must be performed synchronously. + """ + + _app: Celery + _config: TaskProcessorConfig + _worker_thread: Optional[threading.Thread] = None + _worker_hostname: Optional[str] = None + + def __init__(self, config: TaskProcessorConfig): + super().__init__() + + if not isinstance(config.adapter_config, CeleryAdapterConfig): + raise TypeError( + "TaskProcessorConfig.adapter_config must be an instance of CeleryAdapterConfig" + ) + + self._config = config + + # Celery adapter does not support any async capabilities + # self._async_capabilities is already an empty set from parent class + + def initialize(self): + self._app = Celery( + self._config.queue_name, + backend=self._config.adapter_config.backend_url, + broker=self._config.adapter_config.broker_url, + ) + + self._app.conf.update( + loglevel="info", + worker_pool="threads", + worker_concurrency=self._config.adapter_config.worker_concurrency, + max_retries=self._config.max_retries, + task_default_queue=self._config.queue_name, + # Store task results so they can be retrieved after completion + task_ignore_result=False, + # Acknowledge tasks only after completion (not when received) for better reliability + task_acks_late=True, + # Reject and requeue tasks when worker is lost to prevent data loss + task_reject_on_worker_lost=True, + # Only prefetch 1 task at a time to match concurrency and prevent task hoarding + worker_prefetch_multiplier=1, + ) + + queue_config = { + self._config.queue_name: { + "exchange": 
self._config.queue_name, + "exchange_type": "direct", + "routing_key": self._config.queue_name, + }, + } + + if self._config.failed_task_queue_name: + queue_config[self._config.failed_task_queue_name] = { + "exchange": self._config.failed_task_queue_name, + "exchange_type": "direct", + "routing_key": self._config.failed_task_queue_name, + } + + if self._config.unprocessable_task_queue_name: + queue_config[self._config.unprocessable_task_queue_name] = { + "exchange": self._config.unprocessable_task_queue_name, + "exchange_type": "direct", + "routing_key": self._config.unprocessable_task_queue_name, + } + + self._app.conf.update( + task_queues=queue_config, + task_routes={ + # Default tasks go to main queue + "*": {"queue": self._config.queue_name}, + }, + ) + + if self._config.adapter_config.broker_transport_options is not None: + self._app.conf.update( + broker_transport_options=self._config.adapter_config.broker_transport_options, + ) + + if self._config.failed_task_queue_name: + task_failure.connect(self._handle_task_failure) + + if self._config.unprocessable_task_queue_name: + task_unknown.connect(self._handle_unknown_task) + + def register_task_handle(self, func, name=None): + task_options = { + "autoretry_for": (Exception,), + "retry_kwargs": {"max_retries": self._config.max_retries}, + "retry_backoff": True, + "retry_backoff_max": 60, # Max backoff of 60 seconds + "retry_jitter": False, # Disable jitter for predictable testing + } + + if name: + self._app.task(name=name, **task_options)(func) + else: + self._app.task(**task_options)(func) + + def enqueue_task_sync( + self, task_name, args=None, kwargs=None, **options + ) -> TaskResult: + task_response = self._app.send_task( + task_name, + args=args, + kwargs=kwargs, + queue=self._config.queue_name, + **options, + ) + + return TaskResult( + id=task_response.id, + status=task_response.status, + created_at=time.time(), + result=task_response.result, + ) + + def get_task_status_sync(self, task_id) -> TaskResult: 
+ task_details = self._app.AsyncResult(task_id) + return TaskResult( + id=task_details.id, + result=task_details.result, + status=task_details.status, + ) + + def start_consumer(self, **kwargs): + """Starts the Celery worker thread.""" + if self._worker_thread is not None and self._worker_thread.is_alive(): + logger.info("Celery worker thread is already running.") + return + + unique_id = get_replica_context().replica_tag + self._worker_hostname = f"{self._app.main}_{unique_id}" + + worker_args = [ + "worker", + f"--hostname={self._worker_hostname}", + "-Q", + self._config.queue_name, + ] + + self._worker_thread = threading.Thread( + target=self._app.worker_main, + args=(worker_args,), + ) + self._worker_thread.start() + + logger.info( + f"Celery worker thread started with hostname: {self._worker_hostname}" + ) + + def stop_consumer(self, timeout: float = 10.0): + """Signals the Celery worker to shut down and waits for it to terminate.""" + if self._worker_thread is None or not self._worker_thread.is_alive(): + logger.info("Celery worker thread is not running.") + return + + logger.info("Sending shutdown signal to Celery worker...") + + # Use the worker's hostname for targeted shutdown + self._app.control.broadcast( + "shutdown", destination=[f"celery@{self._worker_hostname}"] + ) + self._worker_thread.join(timeout=timeout) + + if self._worker_thread.is_alive(): + logger.warning(f"Worker thread did not terminate after {timeout} seconds.") + else: + logger.info("Celery worker thread has stopped.") + + self._worker_thread = None + + def shutdown(self): + logger.info("Shutting down Celery worker...") + self._app.control.shutdown() + logger.info("Celery worker shutdown complete...") + + def cancel_task_sync(self, task_id): + """ + Cancels a task synchronously. Only supported for Redis and RabbitMQ brokers by Celery. 
+ More details can be found here: https://docs.celeryq.dev/en/stable/userguide/workers.html#revoke-revoking-tasks + """ + self._app.control.revoke(task_id) + + def get_metrics_sync(self) -> Dict[str, Any]: + """ + Returns the metrics of the Celery worker synchronously. + More details can be found here: https://docs.celeryq.dev/en/stable/reference/celery.app.control.html#celery.app.control.Inspect.stats + """ + return self._app.control.inspect().stats() + + def health_check_sync(self) -> List[Dict]: + """ + Checks the health of the Celery worker synchronously. + Returns a list of dictionaries, each containing the worker name and a dictionary with the health status. + Example: [{'celery@192.168.1.100': {'ok': 'pong'}}] + More details can be found here: https://docs.celeryq.dev/en/stable/reference/celery.app.control.html#celery.app.control.Control.ping + """ + return self._app.control.ping() + + def _handle_task_failure( + self, + sender: Any = None, + task_id: str = None, + args: Any = None, + kwargs: Any = None, + einfo: Any = None, + **kw, + ): + """Handle task failures and route them to appropriate dead letter queues. + + This method is called when a task fails after all retry attempts have been + exhausted. 
It logs the failure and moves the task to failed_task_queue + + Args: + sender: The task object that failed + task_id: Unique identifier of the failed task + args: Positional arguments passed to the task + kwargs: Keyword arguments passed to the task + einfo: Exception info object containing exception details and traceback + **kw: Additional keyword arguments passed by Celery + """ + logger.info( + f"Task failure detected for task_id: {task_id}, args: {args}, kwargs: {kwargs}, einfo: {einfo}" + ) + + dlq_args = [ + task_id, + str(einfo.exception), + _json_dump(args), + _json_dump(kwargs), + str(einfo), + ] + + if self._config.failed_task_queue_name: + self._move_task_to_queue( + self._config.failed_task_queue_name, + sender.name, + dlq_args, + ) + + logger.error( + f"Task {task_id} failed after max retries. Exception: {einfo}. Moved it to the {self._config.failed_task_queue_name} queue." + ) + + def _handle_unknown_task( + self, + sender: Any = None, + name: str = None, + id: str = None, + message: Any = None, + exc: Any = None, + **kwargs, + ): + """Handle unknown or unregistered tasks received by Celery. + + This method is called when Celery receives a task that it doesn't recognize + (i.e., a task that hasn't been registered with the Celery app). These tasks + are moved to the unprocessable task queue if configured. + + Args: + sender: The Celery app or worker that detected the unknown task + name: Name of the unknown task + id: Task ID of the unknown task + message: The raw message received for the unknown task + exc: The exception raised when trying to process the unknown task + **kwargs: Additional context information from Celery + """ + logger.info( + f"Unknown task detected by Celery. 
Name: {name}, ID: {id}, Message: {message}" + ) + + if self._config.unprocessable_task_queue_name: + self._move_task_to_queue( + self._config.unprocessable_task_queue_name, + name, + [ + name, + id, + _json_dump(message), + str(exc), + _json_dump(kwargs), + ], + ) + + def _move_task_to_queue(self, queue_name: str, task_name: str, args: list): + """Helper function to move a task to a specified queue.""" + try: + logger.info( + f"Moving task: {task_name} to queue: {queue_name}, args: {args}" + ) + self._app.send_task( + name=task_name, + queue=queue_name, + args=args, + ) + except Exception as e: + logger.error( + f"Failed to move task: {task_name} to queue: {queue_name}, error: {e}" + ) + raise e diff --git a/python/ray/serve/tests/BUILD b/python/ray/serve/tests/BUILD.bazel similarity index 98% rename from python/ray/serve/tests/BUILD rename to python/ray/serve/tests/BUILD.bazel index 3db7c5d6a0e8..3a1cd3fde236 100644 --- a/python/ray/serve/tests/BUILD +++ b/python/ray/serve/tests/BUILD.bazel @@ -81,6 +81,7 @@ py_test_module_list( "test_healthcheck.py", "test_http_headers.py", "test_http_routes.py", + "test_https_proxy.py", "test_max_replicas_per_node.py", "test_multiplex.py", "test_proxy.py", @@ -92,6 +93,7 @@ py_test_module_list( "test_request_timeout.py", "test_streaming_response.py", "test_target_capacity.py", + "test_task_processor.py", "test_telemetry.py", ], tags = [ @@ -108,10 +110,14 @@ py_test_module_list( # Medium tests, don't run on windows. 
py_test_module_list( size = "medium", + env = { + "RAY_SERVE_FAIL_ON_RANK_ERROR": "1", + }, files = [ "test_fastapi.py", "test_gcs_failure.py", "test_gradio.py", + "test_replica_ranks.py", ], tags = [ "exclusive", diff --git a/python/ray/serve/tests/conftest.py b/python/ray/serve/tests/conftest.py index 3ccfb9e45dc5..69e7392e4b91 100644 --- a/python/ray/serve/tests/conftest.py +++ b/python/ray/serve/tests/conftest.py @@ -7,6 +7,7 @@ import httpx import pytest +import pytest_asyncio import ray from ray import serve @@ -22,7 +23,11 @@ ) from ray.serve.config import HTTPOptions, gRPCOptions from ray.serve.context import _get_global_client -from ray.tests.conftest import propagate_logs, pytest_runtest_makereport # noqa +from ray.tests.conftest import ( # noqa + external_redis, + propagate_logs, + pytest_runtest_makereport, +) # https://tools.ietf.org/html/rfc6335#section-6 MIN_DYNAMIC_PORT = 49152 @@ -152,13 +157,22 @@ def _shared_serve_instance(): yield _get_global_client() +@pytest_asyncio.fixture +async def serve_instance_async(_shared_serve_instance): + yield _shared_serve_instance + # Clear all state for 2.x applications and deployments. + _shared_serve_instance.delete_all_apps() + # Clear the ServeHandle cache between tests to avoid them piling up. + await _shared_serve_instance.shutdown_cached_handles_async() + + @pytest.fixture def serve_instance(_shared_serve_instance): yield _shared_serve_instance # Clear all state for 2.x applications and deployments. _shared_serve_instance.delete_all_apps() # Clear the ServeHandle cache between tests to avoid them piling up. 
- _shared_serve_instance.shutdown_cached_handles(_skip_asyncio_check=True) + _shared_serve_instance.shutdown_cached_handles() @pytest.fixture diff --git a/python/ray/serve/tests/test_actor_replica_wrapper.py b/python/ray/serve/tests/test_actor_replica_wrapper.py index f0bd3041e275..19b6957cf35a 100644 --- a/python/ray/serve/tests/test_actor_replica_wrapper.py +++ b/python/ray/serve/tests/test_actor_replica_wrapper.py @@ -69,6 +69,8 @@ async def handle_request_with_rejection( async with send_signal_on_cancellation(cancelled_signal_actor): await executing_signal_actor.send.remote() + return + # Special case: if "raise_task_cancelled_error" is in kwargs, raise TaskCancelledError # This simulates the scenario where the underlying Ray task gets cancelled if kwargs.pop("raise_task_cancelled_error", False): @@ -116,7 +118,7 @@ async def test_send_request_without_rejection(setup_fake_replica, is_streaming: is_streaming=is_streaming, ), ) - replica_result, _ = await replica.send_request(pr, with_rejection=False) + replica_result = replica.try_send_request(pr, with_rejection=False) if is_streaming: assert isinstance(replica_result.to_object_ref_gen(), ObjectRefGenerator) for i in range(5): @@ -150,11 +152,12 @@ async def test_send_request_with_rejection( is_streaming=is_streaming, ), ) - replica_result, info = await replica.send_request(pr, with_rejection=True) + replica_result = replica.try_send_request(pr, with_rejection=True) + info = await replica_result.get_rejection_response() assert info.accepted == accepted assert info.num_ongoing_requests == 10 if not accepted: - assert replica_result is None + pass elif is_streaming: assert isinstance(replica_result.to_object_ref_gen(), ObjectRefGenerator) for i in range(5): @@ -190,21 +193,22 @@ async def test_send_request_with_rejection_cancellation(setup_fake_replica): # Send request should hang because the downstream actor method call blocks # before sending the system message. 
- send_request_task = get_or_create_event_loop().create_task( - replica.send_request(pr, with_rejection=True) + replica_result = replica.try_send_request(pr, with_rejection=True) + request_task = get_or_create_event_loop().create_task( + replica_result.get_rejection_response() ) # Check that the downstream actor method call has started. await executing_signal_actor.wait.remote() - _, pending = await asyncio.wait([send_request_task], timeout=0.001) + _, pending = await asyncio.wait([request_task], timeout=0.001) assert len(pending) == 1 # Cancel the task. This should cause the downstream actor method call to # be cancelled (verified via signal actor). - send_request_task.cancel() + request_task.cancel() with pytest.raises(asyncio.CancelledError): - await send_request_task + await request_task await cancelled_signal_actor.wait.remote() @@ -237,8 +241,9 @@ async def test_send_request_with_rejection_task_cancelled_error(setup_fake_repli ) # The TaskCancelledError should be caught and converted to asyncio.CancelledError + replica_result = replica.try_send_request(pr, with_rejection=True) with pytest.raises(asyncio.CancelledError): - await replica.send_request(pr, with_rejection=True) + await replica_result.get_rejection_response() if __name__ == "__main__": diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py index 75b76b9fb7a2..b3d5419a79d3 100644 --- a/python/ray/serve/tests/test_api.py +++ b/python/ray/serve/tests/test_api.py @@ -1,7 +1,7 @@ import asyncio import os import sys -from typing import Dict, List, Optional +from typing import Dict, List, Optional, overload import httpx import pytest @@ -1136,6 +1136,55 @@ def test_custom_request_router_kwargs(serve_instance): assert handle.remote().result() == "Hello, world!" 
+def test_overloaded_app_builder_signatures(): + """Test that call_user_app_builder_with_args_if_necessary validates the base + function signature with a pydantic basemodel, rather than the overload that + accepts a dict (for the sake of lint permissiveness). + """ + + class Config(BaseModel): + name: str + value: int = 42 + + @serve.deployment + class MockDeployment: + def __call__(self): + return "mock" + + mock_app = MockDeployment.bind() + + # Overloaded function where the implementation has a pydantic annotation + @overload + def overloaded_builder(args: dict) -> Application: + ... + + def overloaded_builder(args: Config) -> Application: + """Implementation with pydantic BaseModel annotation.""" + + assert isinstance(args, Config), f"Expected Config but got {type(args)}" + return mock_app + + # Test 1: Valid input should work and convert to Config model + result = call_user_app_builder_with_args_if_necessary( + overloaded_builder, {"name": "test", "value": 123} + ) + assert isinstance(result, Application) + + # Test 2: Invalid dict input should raise validation error + # Missing required field 'name' + with pytest.raises(ValidationError): + call_user_app_builder_with_args_if_necessary( + overloaded_builder, {"value": 123} # Missing required 'name' field + ) + + # Test 3: Wrong type should also raise validation error + with pytest.raises(ValidationError): + call_user_app_builder_with_args_if_necessary( + overloaded_builder, + {"name": "test", "value": "not_an_int"}, # 'value' should be int + ) + + if __name__ == "__main__": import sys diff --git a/python/ray/serve/tests/test_autoscaling_policy.py b/python/ray/serve/tests/test_autoscaling_policy.py index 32b66f6c305b..707cc5179b14 100644 --- a/python/ray/serve/tests/test_autoscaling_policy.py +++ b/python/ray/serve/tests/test_autoscaling_policy.py @@ -15,6 +15,7 @@ import ray.util.state as state_api from ray import serve from ray._common.test_utils import SignalActor, wait_for_condition +from 
ray.serve._private.autoscaling_state import AutoscalingContext from ray.serve._private.common import ( DeploymentID, DeploymentStatus, @@ -36,7 +37,7 @@ get_num_alive_replicas, tlog, ) -from ray.serve.config import AutoscalingConfig +from ray.serve.config import AutoscalingConfig, AutoscalingPolicy from ray.serve.handle import DeploymentHandle from ray.serve.schema import ApplicationStatus, ServeDeploySchema from ray.util.state import list_actors @@ -377,7 +378,7 @@ async def call(self): # Wait for deployment A to scale up wait_for_condition(check_num_requests_eq, client=client, id=dep_id, expected=20) - wait_for_condition(check_num_replicas_eq, name="A", target=5) + wait_for_condition(check_num_replicas_eq, name="A", target=5, timeout=20) print("Confirmed deployment scaled to 5 replicas.") # Kill CallerActor @@ -411,8 +412,8 @@ def test_e2e_scale_up_down_basic(min_replicas, serve_instance_with_signal): max_ongoing_requests=1000, ) class A: - def __call__(self): - ray.get(signal.wait.remote()) + async def __call__(self): + await signal.wait.remote() handle = serve.run(A.bind()) wait_for_condition( @@ -598,8 +599,8 @@ class A: def __init__(self): logging.getLogger("ray.serve").setLevel(logging.ERROR) - def __call__(self): - ray.get(signal.wait.remote()) + async def __call__(self): + await signal.wait.remote() handle = serve.run(A.bind()) wait_for_condition( @@ -662,8 +663,8 @@ def test_e2e_intermediate_downscaling(serve_instance_with_signal): max_ongoing_requests=1000, ) class A: - def __call__(self): - ray.get(signal.wait.remote()) + async def __call__(self): + await signal.wait.remote() handle = serve.run(A.bind()) wait_for_condition( @@ -1047,9 +1048,9 @@ def test_e2e_preserve_prev_replicas_rest_api(serve_instance_with_signal): import os @serve.deployment -def g(): +async def g(): signal = ray.get_actor("signal123") - ray.get(signal.wait.remote()) + await signal.wait.remote() return os.getpid() @@ -1520,6 +1521,64 @@ def check_expected_statuses( print("Statuses 
are as expected.") +def custom_autoscaling_policy(ctx: AutoscalingContext): + if ctx.total_num_requests > 50: + return 3, {} + else: + return 2, {} + + +@pytest.mark.parametrize( + "policy", + [ + {"name": "ray.serve.tests.test_autoscaling_policy.custom_autoscaling_policy"}, + AutoscalingPolicy( + name="ray.serve.tests.test_autoscaling_policy.custom_autoscaling_policy" + ), + AutoscalingPolicy(name=custom_autoscaling_policy), + ], +) +def test_e2e_scale_up_down_basic_with_custom_policy(serve_instance_with_signal, policy): + """Send 100 requests and check that we autoscale up, and then back down.""" + + _, signal = serve_instance_with_signal + + @serve.deployment( + autoscaling_config={ + "min_replicas": 1, + "max_replicas": 4, + "downscale_delay_s": 0.5, + "upscale_delay_s": 0, + "policy": policy, + "metrics_interval_s": 0.1, + }, + # We will send over a lot of queries. This will make sure replicas are + # killed quickly during cleanup. + graceful_shutdown_timeout_s=1, + max_ongoing_requests=1000, + ) + class A: + async def __call__(self): + await signal.wait.remote() + + handle = serve.run(A.bind()) + wait_for_condition( + check_deployment_status, name="A", expected_status=DeploymentStatus.HEALTHY + ) + + [handle.remote() for _ in range(40)] + + # scale up one more replica from min_replicas + wait_for_condition(check_num_replicas_eq, name="A", target=2) + print("Scaled up to 2 replicas.") + + ray.get(signal.send.remote(clear=True)) + wait_for_condition(lambda: ray.get(signal.cur_num_waiters.remote()) == 0) + [handle.remote() for _ in range(70)] + wait_for_condition(check_num_replicas_eq, name="A", target=3) + ray.get(signal.send.remote(clear=True)) + + if __name__ == "__main__": import sys diff --git a/python/ray/serve/tests/test_batching.py b/python/ray/serve/tests/test_batching.py index 80f9f35eb10c..ac149e79ad13 100644 --- a/python/ray/serve/tests/test_batching.py +++ b/python/ray/serve/tests/test_batching.py @@ -3,6 +3,7 @@ from collections.abc import Callable 
from concurrent.futures.thread import ThreadPoolExecutor from functools import partial +from threading import Thread from typing import List, Optional import httpx @@ -13,6 +14,10 @@ from ray._common.test_utils import SignalActor, async_wait_for_condition from ray.serve._private.test_utils import get_application_url from ray.serve.batching import _RuntimeSummaryStatistics +from ray.serve.context import ( + _get_serve_batch_request_context, + _get_serve_request_context, +) def test_batching(serve_instance): @@ -214,7 +219,7 @@ async def __call__(self, request): @pytest.mark.parametrize("max_batch_size", [1, 10]) @pytest.mark.parametrize("n_requests", [1, 10]) async def test_observability_helpers( - n_requests: int, max_batch_size: int, max_concurrent_batches: int + serve_instance, n_requests: int, max_batch_size: int, max_concurrent_batches: int ) -> None: """Checks observability helper methods that are used for batching. @@ -310,6 +315,93 @@ async def poll() -> bool: return await async_wait_for_condition(poll) +def test_batching_request_context(serve_instance): + """Test that _get_serve_batch_request_context() works correctly with batching. + + With 6 requests and max_batch_size=3, Serve should create 2 batches processed in parallel. + Each batch should have access to the request contexts of all requests in that batch, + and context should be properly unset after processing. 
+ """ + + @serve.deployment(max_ongoing_requests=10) + class BatchContextTester: + def __init__(self): + self.batch_results = [] + + @serve.batch( + max_batch_size=3, batch_wait_timeout_s=1.0, max_concurrent_batches=2 + ) + async def handle_batch(self, batch): + # Store results for verification + batch_result = { + "batch_size": len(batch), + "batch_request_contexts": _get_serve_batch_request_context(), + "current_request_context": _get_serve_request_context(), + } + self.batch_results.append(batch_result) + + return ["ok" for _ in range(len(batch))] + + async def __call__(self, request): + return await self.handle_batch(1) + + async def get_results(self): + return self.batch_results + + handle = serve.run(BatchContextTester.bind()) + + def do_request(): + """Make a request with a specific request ID.""" + url = get_application_url() + r = httpx.post(f"{url}/") + r.raise_for_status() + + # Launch 6 requests. Expect 2 batches of 3 requests each. + threads = [Thread(target=do_request) for _ in range(6)] + + for t in threads: + t.start() + for t in threads: + t.join() + + # Get results from the deployment + batch_results = handle.get_results.remote().result() + + # Verify each batch has correct size and context + total_requests_processed = 0 + request_ids_in_batch_context = set() + + for result in batch_results: + # Batch context should contain all 3 request contexts + assert ( + len(result["batch_request_contexts"]) == 3 + ), f"Expected 3 contexts in batch, got {result['batch_request_contexts']}" + req_ids_in_batch_context = [ + ctx.request_id for ctx in result["batch_request_contexts"] + ] + assert ( + len(req_ids_in_batch_context) == 3 + ), f"Expected 3 batch request IDs, got {len(req_ids_in_batch_context)}" + request_ids_in_batch_context.update(req_ids_in_batch_context) + + # Current request context read within the batcher should be a default empty context. 
+ current_request_context = result["current_request_context"] + assert current_request_context.request_id == "" + assert current_request_context.route == "" + assert current_request_context.app_name == "" + assert current_request_context.multiplexed_model_id == "" + + total_requests_processed += result["batch_size"] + + # Verify all 6 requests were processed + assert ( + total_requests_processed == 6 + ), f"Expected 6 total requests processed, got {total_requests_processed}" + assert ( + len(request_ids_in_batch_context) == 6 + ), f"Expected 6 unique request IDs, got {len(request_ids_in_batch_context)}" + + if __name__ == "__main__": import sys diff --git a/python/ray/serve/tests/test_certs/ca.crt b/python/ray/serve/tests/test_certs/ca.crt new file mode 100644 index 000000000000..5b0a5e11bf42 --- /dev/null +++ b/python/ray/serve/tests/test_certs/ca.crt @@ -0,0 +1,21 @@ +-----BEGIN CERTIFICATE----- +MIIDfTCCAmWgAwIBAgIUYcUOt0aN1Ml/1WnFPB9gveNNniQwDQYJKoZIhvcNAQEL +BQAwZzELMAkGA1UEBhMCVVMxEzARBgNVBAgMCkNhbGlmb3JuaWExFjAUBgNVBAcM +DVNhbiBGcmFuY2lzY28xFzAVBgNVBAoMDlJheSBTZXJ2ZSBUZXN0MRIwEAYDVQQD +DAlsb2NhbGhvc3QwHhcNMjUwODIwMTgxODUzWhcNMjYwODIwMTgxODUzWjBnMQsw +CQYDVQQGEwJVUzETMBEGA1UECAwKQ2FsaWZvcm5pYTEWMBQGA1UEBwwNU2FuIEZy +YW5jaXNjbzEXMBUGA1UECgwOUmF5IFNlcnZlIFRlc3QxEjAQBgNVBAMMCWxvY2Fs +aG9zdDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAKYXcIirTR5AHb5V +T6yijOR8mvc6AXSKkmIKu7n2vaJ3Jrt7d6mPz/ScXlLYxq+mgt4avX/VozES0ARM +NcbqlHOcahgfyyN+/02q/Aimwbaf/FwiS5qyQfMXzFg70kydqlDlUsyE49qdFHEv +xx4ostLnTeyIpS7AS14qJXGeg5NE9Pm+XSs0HVBPZBaM6VCJl8/Pjog0qqffovGo +/qN8gVxnydg4ayTZ9nl+NNMivFJ/f5MUXmJiuFYAoZnwMiCy2QAU9TmdA5mCOGNZ +pv/KSSdqkVh7X6JNGB6OLgikCsObWxAJqq7WZgiHoc2WlXuN+U2SLuA0JLZZZr+t +zpw1DH0CAwEAAaMhMB8wHQYDVR0OBBYEFIey4ZBoVICZ7kAJv7K5kY/SHP6wMA0G +CSqGSIb3DQEBCwUAA4IBAQAg47MfYFykzDdynJnKf/Aqlp4bnT3GVEW3lRk8AMv9 +yrjwQeVKihiQLgC6b7ChyLUQWxcxJPqhzAIe/+sn9bAxz448oGMtU6ghHtxt13T2 +9VKsyyrjgZ3fbiFT5AFMYxwYlcaf1hJPE+PKKU3oUhYxUlEBKweDjTw7+7xym/Ix 
+hNYv36lDst/zwA1HKmvorDhCVOT3Y90deVA31NxFQbqNpeCjG6uiURAtO3jMan50 +m9U60cHjJBkSxCKCw4SQXOan9VKePIsHnZgIiDPmO25KYSJxeat92sHVtI3FZfrh +pN3cjQaXhMbJFO9ySv5tqr0KxUbymN56ynWkScMGbI0W +-----END CERTIFICATE----- diff --git a/python/ray/serve/tests/test_certs/server.crt b/python/ray/serve/tests/test_certs/server.crt new file mode 100644 index 000000000000..5b0a5e11bf42 --- /dev/null +++ b/python/ray/serve/tests/test_certs/server.crt @@ -0,0 +1,21 @@ +-----BEGIN CERTIFICATE----- +MIIDfTCCAmWgAwIBAgIUYcUOt0aN1Ml/1WnFPB9gveNNniQwDQYJKoZIhvcNAQEL +BQAwZzELMAkGA1UEBhMCVVMxEzARBgNVBAgMCkNhbGlmb3JuaWExFjAUBgNVBAcM +DVNhbiBGcmFuY2lzY28xFzAVBgNVBAoMDlJheSBTZXJ2ZSBUZXN0MRIwEAYDVQQD +DAlsb2NhbGhvc3QwHhcNMjUwODIwMTgxODUzWhcNMjYwODIwMTgxODUzWjBnMQsw +CQYDVQQGEwJVUzETMBEGA1UECAwKQ2FsaWZvcm5pYTEWMBQGA1UEBwwNU2FuIEZy +YW5jaXNjbzEXMBUGA1UECgwOUmF5IFNlcnZlIFRlc3QxEjAQBgNVBAMMCWxvY2Fs +aG9zdDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAKYXcIirTR5AHb5V +T6yijOR8mvc6AXSKkmIKu7n2vaJ3Jrt7d6mPz/ScXlLYxq+mgt4avX/VozES0ARM +NcbqlHOcahgfyyN+/02q/Aimwbaf/FwiS5qyQfMXzFg70kydqlDlUsyE49qdFHEv +xx4ostLnTeyIpS7AS14qJXGeg5NE9Pm+XSs0HVBPZBaM6VCJl8/Pjog0qqffovGo +/qN8gVxnydg4ayTZ9nl+NNMivFJ/f5MUXmJiuFYAoZnwMiCy2QAU9TmdA5mCOGNZ +pv/KSSdqkVh7X6JNGB6OLgikCsObWxAJqq7WZgiHoc2WlXuN+U2SLuA0JLZZZr+t +zpw1DH0CAwEAAaMhMB8wHQYDVR0OBBYEFIey4ZBoVICZ7kAJv7K5kY/SHP6wMA0G +CSqGSIb3DQEBCwUAA4IBAQAg47MfYFykzDdynJnKf/Aqlp4bnT3GVEW3lRk8AMv9 +yrjwQeVKihiQLgC6b7ChyLUQWxcxJPqhzAIe/+sn9bAxz448oGMtU6ghHtxt13T2 +9VKsyyrjgZ3fbiFT5AFMYxwYlcaf1hJPE+PKKU3oUhYxUlEBKweDjTw7+7xym/Ix +hNYv36lDst/zwA1HKmvorDhCVOT3Y90deVA31NxFQbqNpeCjG6uiURAtO3jMan50 +m9U60cHjJBkSxCKCw4SQXOan9VKePIsHnZgIiDPmO25KYSJxeat92sHVtI3FZfrh +pN3cjQaXhMbJFO9ySv5tqr0KxUbymN56ynWkScMGbI0W +-----END CERTIFICATE----- diff --git a/python/ray/serve/tests/test_certs/server.csr b/python/ray/serve/tests/test_certs/server.csr new file mode 100644 index 000000000000..3d26126664ef --- /dev/null +++ b/python/ray/serve/tests/test_certs/server.csr @@ -0,0 +1,17 @@ +-----BEGIN 
CERTIFICATE REQUEST----- +MIICrDCCAZQCAQAwZzELMAkGA1UEBhMCVVMxEzARBgNVBAgMCkNhbGlmb3JuaWEx +FjAUBgNVBAcMDVNhbiBGcmFuY2lzY28xFzAVBgNVBAoMDlJheSBTZXJ2ZSBUZXN0 +MRIwEAYDVQQDDAlsb2NhbGhvc3QwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEK +AoIBAQCmF3CIq00eQB2+VU+soozkfJr3OgF0ipJiCru59r2idya7e3epj8/0nF5S +2MavpoLeGr1/1aMxEtAETDXG6pRznGoYH8sjfv9NqvwIpsG2n/xcIkuaskHzF8xY +O9JMnapQ5VLMhOPanRRxL8ceKLLS503siKUuwEteKiVxnoOTRPT5vl0rNB1QT2QW +jOlQiZfPz46INKqn36LxqP6jfIFcZ8nYOGsk2fZ5fjTTIrxSf3+TFF5iYrhWAKGZ +8DIgstkAFPU5nQOZgjhjWab/ykknapFYe1+iTRgeji4IpArDm1sQCaqu1mYIh6HN +lpV7jflNki7gNCS2WWa/rc6cNQx9AgMBAAGgADANBgkqhkiG9w0BAQsFAAOCAQEA +igYR2ZQ4fmp339T/BGvXSDIjQQkecd9MeifdcXuN/2FZ7dhyfDWHjQadtohgXSZw +LwfUx43L+JcebMY8GyN/4JIAKA5hVqqvAiaMb+vRUItgku5M2WIpnPLVKQJHTUGC +aaDq6u7aS4eFcvuYGaFTUD7tNMOfRP8SfQL/sk2UqZVOCIxCFX9gLS/p4IyorUsb +VjdQBHRvOZnZCFMwmisquXXeGxtAPabUWMPLvSqcP/93WdjFwtrcscyY68s+AC6o +9sx1x3qjnTxnx+a8ho5f0p/JSUqye+G/gzqzB5WMZK5U7oiYgP0rEajU9odGIPSK +AqzWpVDtZBSr8FFamw4uqQ== +-----END CERTIFICATE REQUEST----- diff --git a/python/ray/serve/tests/test_certs/server.key b/python/ray/serve/tests/test_certs/server.key new file mode 100644 index 000000000000..de16d5454e9d --- /dev/null +++ b/python/ray/serve/tests/test_certs/server.key @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQCmF3CIq00eQB2+ +VU+soozkfJr3OgF0ipJiCru59r2idya7e3epj8/0nF5S2MavpoLeGr1/1aMxEtAE +TDXG6pRznGoYH8sjfv9NqvwIpsG2n/xcIkuaskHzF8xYO9JMnapQ5VLMhOPanRRx +L8ceKLLS503siKUuwEteKiVxnoOTRPT5vl0rNB1QT2QWjOlQiZfPz46INKqn36Lx +qP6jfIFcZ8nYOGsk2fZ5fjTTIrxSf3+TFF5iYrhWAKGZ8DIgstkAFPU5nQOZgjhj +Wab/ykknapFYe1+iTRgeji4IpArDm1sQCaqu1mYIh6HNlpV7jflNki7gNCS2WWa/ +rc6cNQx9AgMBAAECggEAFj7SHLaiJ+i7KHCcBj7ok1Bjyl8OLizCebfUTY5QTH/x +mRoVUd7oIcFbxMmHUE6t/STPDV3GHgmAq5gFeonqrigHRwnjFvL91h+OOB5q7ZSJ ++VEX7TVDg1VEUkEDjq1t+qhsVDuBmm3VfL9tx4qjQNTSvq536UYUvMefp5MX2P54 +/7IDM9osP5VgeFIUx/d7QYymhgmVaSv+xcxxlZCwT3ib/wW7eU964FjkuRG8eein 
+zlyOwRufmg+eEvOUHN/4Fth0AUUirCMpflgRdcQtKs77FARiG8LybMGyDDsE7YBt +5f/UBZea2TQG9q4aGNUIHA869CCNKg2R27AtBpTtBQKBgQDd95GDIZMlEmR3GzpJ +6rqRHtfXj7BHUlzew+RCI1KhWkjRZqv2bavmeqRLpRdKd36aC+l+QallSW/MME+7 +JSgRMqqdQK2myLJnZOIcONjMlOn9xzEQGYUsKL4IiPkdP0lWdzJ6iqAHm/Xq7GxE +BJF5XkYD1NP2+y3dlZYNrmUGHwKBgQC/jrOCV7Y34IriUPHSQA1JaPZQDBBxwiNo +ifPYkRc5C0zwskiELnJGF34Y/fK06n73JyBh6WqMdu7+V/QKNCEgcKU45n+pnlAL +vx+xflfMknWEOhLdT31ca0kvxtGEomOD1MNV+b1cRYBlL/oMC2IpIKd0N/HFa3Nc +pDmLcBWB4wKBgAIHXD4dlXG2TFLGXe8FBTWEWaavuoW8W/rxQWnVVtEAuT+ot5Om +BvcxUcUbOi5FD1QrHbQ4t2qklDAClQf52/bkRqjvSWcH2JGXW3W0k06zYbwfEPS7 +tvrjWHFNhzFcPbhbmIuELthC9alzBb5NaGL6mJs6W8GbJB0tW9S+LlAzAoGBAIlB +h/B6Rs+s7fcSBuQfDyYttmhO7K2GbPan+niQJfKy3TOOm5VS7oC4rprbw7/MUqNn +frWJmdYCFmdawDtbdO0Yqdqmlo0EKdjw3pXAsMqdmuTe88tt/KZvHWbFcDU4YlQA +7OI662slRcW7ZdChi3lqs3H78BoETwnvhmgaLN7/AoGBAIVtEVcieOsasQ3Cje4L +mZxo9WFwtX4llH/CTZZeyek6VZBEWP8b3i1uh0uOzeiR7nDiwGEbHfXdvIvWrZqf +IC9Lo1D24uzE14XcKypFsYL5GAwtNhTAuP52tfV9V7DlS2QmxQt6hzx0/MhtdM3X +1XCsMrmi/WleIy611H2j0gUj +-----END PRIVATE KEY----- diff --git a/python/ray/serve/tests/test_controller.py b/python/ray/serve/tests/test_controller.py index d3696ca521fb..f9765663eda0 100644 --- a/python/ray/serve/tests/test_controller.py +++ b/python/ray/serve/tests/test_controller.py @@ -9,7 +9,7 @@ from ray.serve._private.common import DeploymentID from ray.serve._private.config import DeploymentConfig from ray.serve._private.constants import ( - DEFAULT_AUTOSCALING_POLICY, + DEFAULT_AUTOSCALING_POLICY_NAME, SERVE_DEFAULT_APP_NAME, ) from ray.serve._private.deployment_info import DeploymentInfo @@ -79,9 +79,9 @@ def check_custom_exception() -> bool: @pytest.mark.parametrize( - "policy", [None, DEFAULT_AUTOSCALING_POLICY, default_autoscaling_policy] + "policy_name", [None, DEFAULT_AUTOSCALING_POLICY_NAME, default_autoscaling_policy] ) -def test_get_serve_instance_details_json_serializable(serve_instance, policy): +def test_get_serve_instance_details_json_serializable(serve_instance, 
policy_name): """Test the result from get_serve_instance_details is json serializable.""" controller = _get_global_client()._controller @@ -89,9 +89,9 @@ def test_get_serve_instance_details_json_serializable(serve_instance, policy): autoscaling_config = { "min_replicas": 1, "max_replicas": 10, - "_policy": policy, + "_policy": {"name": policy_name}, } - if policy is None: + if policy_name is None: autoscaling_config.pop("_policy") @serve.deployment(autoscaling_config=autoscaling_config) @@ -177,6 +177,9 @@ def autoscaling_app(): "downscaling_factor": None, "downscale_delay_s": 600.0, "upscale_delay_s": 30.0, + "policy": { + "name": "ray.serve.autoscaling_policy:default_autoscaling_policy" + }, }, "graceful_shutdown_wait_loop_s": 2.0, "graceful_shutdown_timeout_s": 20.0, diff --git a/python/ray/serve/tests/test_controller_recovery.py b/python/ray/serve/tests/test_controller_recovery.py index b5cbf0af1637..535483411e71 100644 --- a/python/ray/serve/tests/test_controller_recovery.py +++ b/python/ray/serve/tests/test_controller_recovery.py @@ -19,9 +19,12 @@ SERVE_NAMESPACE, SERVE_PROXY_NAME, ) -from ray.serve._private.test_utils import check_replica_counts, get_application_url +from ray.serve._private.test_utils import ( + check_replica_counts, + get_application_url, + request_with_retries, +) from ray.serve.schema import LoggingConfig, ServeDeploySchema -from ray.serve.tests.test_failure import request_with_retries from ray.util.state import list_actors @@ -51,9 +54,7 @@ def __call__(self, *args): serve.run(TransientConstructorFailureDeployment.bind(), name="app") for _ in range(10): - response = request_with_retries( - "/recover_start_from_replica_actor_names/", timeout=30, app_name="app" - ) + response = request_with_retries(timeout=30, app_name="app") assert response.text == "hii" # Assert 2 replicas are running in deployment deployment after partially # successful deploy() call with transient error @@ -64,7 +65,7 @@ def __call__(self, *args): replica_version_hash 
= None for replica in deployment_dict[id]: ref = replica.actor_handle.initialize_and_get_metadata.remote() - _, version, _, _, _ = ray.get(ref) + _, version, _, _, _, _, _, _ = ray.get(ref) if replica_version_hash is None: replica_version_hash = hash(version) assert replica_version_hash == hash(version), ( @@ -96,9 +97,7 @@ def __call__(self, *args): lambda: get_application_url("HTTP", "app", use_localhost=True) is not None ) for _ in range(10): - response = request_with_retries( - "/recover_start_from_replica_actor_names/", timeout=30, app_name="app" - ) + response = request_with_retries(timeout=30, app_name="app") assert response.text == "hii" # Ensure recovered replica names are the same @@ -119,7 +118,7 @@ def __call__(self, *args): for replica_name in recovered_replica_names: actor_handle = ray.get_actor(replica_name, namespace=SERVE_NAMESPACE) ref = actor_handle.initialize_and_get_metadata.remote() - _, version, _, _, _ = ray.get(ref) + _, version, _, _, _, _, _, _ = ray.get(ref) assert replica_version_hash == hash( version ), "Replica version hash should be the same after recover from actor names" diff --git a/python/ray/serve/tests/test_deploy_2.py b/python/ray/serve/tests/test_deploy_2.py index bf809d8d3ceb..1dc32c7a23a0 100644 --- a/python/ray/serve/tests/test_deploy_2.py +++ b/python/ray/serve/tests/test_deploy_2.py @@ -266,13 +266,20 @@ def __call__(self): url = get_application_url("HTTP", app_name="app1") assert httpx.get(f"{url}").text == "hello alice" - proxy_url = "http://localhost:8000/-/routes" - routes = httpx.get(proxy_url).json() + url_without_route_prefix = get_application_url( + "HTTP", app_name="app1", exclude_route_prefix=True + ) + routes_url = f"{url_without_route_prefix}/-/routes" + routes = httpx.get(routes_url).json() assert routes["/app1"] == "app1" url = get_application_url("HTTP", app_name="app2") assert httpx.get(f"{url}").text == "hello bob" - routes = httpx.get(proxy_url).json() + url_without_route_prefix = get_application_url( + 
"HTTP", app_name="app2", exclude_route_prefix=True + ) + routes_url = f"{url_without_route_prefix}/-/routes" + routes = httpx.get(routes_url).json() assert routes["/app2"] == "app2" app1_status = serve.status().applications["app1"] @@ -324,6 +331,7 @@ async def __call__(self): "downscaling_factor": None, "smoothing_factor": 1.0, "initial_replicas": None, + "policy": {"name": "ray.serve.autoscaling_policy:default_autoscaling_policy"}, } @@ -377,6 +385,7 @@ async def __call__(self): "downscaling_factor": None, "smoothing_factor": 1.0, "initial_replicas": None, + "policy": {"name": "ray.serve.autoscaling_policy:default_autoscaling_policy"}, } for i in range(3): diff --git a/python/ray/serve/tests/test_deploy_app_2.py b/python/ray/serve/tests/test_deploy_app_2.py index 0c379082e77b..28ce0a1adb92 100644 --- a/python/ray/serve/tests/test_deploy_app_2.py +++ b/python/ray/serve/tests/test_deploy_app_2.py @@ -14,9 +14,15 @@ from ray import serve from ray._common.test_utils import SignalActor, wait_for_condition from ray.serve._private.common import DeploymentID, ReplicaID -from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME, SERVE_NAMESPACE +from ray.serve._private.constants import ( + SERVE_DEFAULT_APP_NAME, + SERVE_NAMESPACE, +) from ray.serve._private.test_utils import ( check_num_replicas_eq, + check_running, + check_target_groups_ready, + get_application_url, ) from ray.serve.schema import ( ApplicationStatus, @@ -24,7 +30,6 @@ ServeDeploySchema, ServeInstanceDetails, ) -from ray.serve.tests.test_deploy_app import check_running from ray.tests.conftest import call_ray_stop_only # noqa: F401 from ray.util.state import list_actors @@ -404,18 +409,9 @@ def test_deploy_does_not_affect_dynamic_apps(serve_instance): ], ) client.deploy_apps(config) - - def check_application_running( - name: str, route_prefix: str, *, msg: str = "wonderful world" - ): - status = serve.status().applications[name] - assert status.status == "RUNNING" - assert 
httpx.post(f"http://localhost:8000{route_prefix}/").text == msg - return True - - wait_for_condition( - check_application_running, name="declarative-app-1", route_prefix="/app-1" - ) + wait_for_condition(check_running, app_name="declarative-app-1") + url = get_application_url(app_name="declarative-app-1") + assert httpx.post(url).text == "wonderful world" # Now `serve.run` a dynamic app. @serve.deployment @@ -424,12 +420,9 @@ def __call__(self, *args) -> str: return "Hello!" serve.run(D.bind(), name="dynamic-app", route_prefix="/dynamic") - wait_for_condition( - check_application_running, - name="dynamic-app", - route_prefix="/dynamic", - msg="Hello!", - ) + wait_for_condition(check_running, app_name="dynamic-app") + url = get_application_url(app_name="dynamic-app") + assert httpx.post(url).text == "Hello!" # Add a new app via declarative API. # Existing declarative app and dynamic app should not be affected. @@ -441,46 +434,35 @@ def __call__(self, *args) -> str: ), ) client.deploy_apps(config) + wait_for_condition(check_running, app_name="declarative-app-2") + url = get_application_url(app_name="declarative-app-2") + assert httpx.post(url).text == "wonderful world" - wait_for_condition( - check_application_running, name="declarative-app-2", route_prefix="/app-2" - ) - wait_for_condition( - check_application_running, name="declarative-app-1", route_prefix="/app-1" - ) - wait_for_condition( - check_application_running, - name="dynamic-app", - route_prefix="/dynamic", - msg="Hello!", - ) + url = get_application_url(app_name="declarative-app-1") + assert httpx.post(url).text == "wonderful world" + + url = get_application_url(app_name="dynamic-app") + assert httpx.post(url).text == "Hello!" # Delete one of the apps via declarative API. # Other declarative app and dynamic app should not be affected. 
config.applications.pop(0) client.deploy_apps(config) + wait_for_condition(check_running, app_name="declarative-app-2") + url = get_application_url(app_name="declarative-app-2") + assert httpx.post(url).text == "wonderful world" - wait_for_condition( - check_application_running, name="declarative-app-2", route_prefix="/app-2" - ) - wait_for_condition( - check_application_running, - name="dynamic-app", - route_prefix="/dynamic", - msg="Hello!", - ) + url = get_application_url(app_name="dynamic-app") + assert httpx.post(url).text == "Hello!" wait_for_condition(lambda: "declarative-app-1" not in serve.status().applications) # Now overwrite the declarative app with a dynamic app with the same name. # On subsequent declarative apply, that app should not be affected. serve.run(D.bind(), name="declarative-app-2", route_prefix="/app-2") - wait_for_condition( - check_application_running, - name="declarative-app-2", - route_prefix="/app-2", - msg="Hello!", - ) + wait_for_condition(check_running, app_name="declarative-app-2") + url = get_application_url(app_name="declarative-app-2") + assert httpx.post(url).text == "Hello!" config.applications = [ ServeApplicationSchema( @@ -490,39 +472,41 @@ def __call__(self, *args) -> str: ), ] client.deploy_apps(config) + wait_for_condition(check_running, app_name="declarative-app-1") + url = get_application_url(app_name="declarative-app-1") + assert httpx.post(url).text == "wonderful world" - wait_for_condition( - check_application_running, - name="declarative-app-1", - route_prefix="/app-1", - ) - wait_for_condition( - check_application_running, - name="dynamic-app", - route_prefix="/dynamic", - msg="Hello!", - ) - wait_for_condition( - check_application_running, - name="declarative-app-2", - route_prefix="/app-2", - msg="Hello!", - ) + wait_for_condition(check_running, app_name="dynamic-app") + url = get_application_url(app_name="dynamic-app") + assert httpx.post(url).text == "Hello!" 
+ + wait_for_condition(check_running, app_name="declarative-app-2") + url = get_application_url(app_name="declarative-app-2") + assert httpx.post(url).text == "Hello!" # Verify that the controller does not delete the dynamic apps on recovery. ray.kill(client._controller, no_restart=False) + + wait_for_condition(check_running, app_name="declarative-app-1") + # It takes some time for the target groups to be ready after controller recovery. + # So we make sure the target groups are ready before obtaining the URL. wait_for_condition( - check_application_running, - name="dynamic-app", - route_prefix="/dynamic", - msg="Hello!", + check_target_groups_ready, client=client, app_name="declarative-app-1" ) + url = get_application_url(app_name="declarative-app-1") + assert httpx.post(url).text == "wonderful world" + + wait_for_condition(check_running, app_name="dynamic-app") + wait_for_condition(check_target_groups_ready, client=client, app_name="dynamic-app") + url = get_application_url(app_name="dynamic-app") + assert httpx.post(url).text == "Hello!" + + wait_for_condition(check_running, app_name="declarative-app-2") wait_for_condition( - check_application_running, - name="declarative-app-2", - route_prefix="/app-2", - msg="Hello!", + check_target_groups_ready, client=client, app_name="declarative-app-2" ) + url = get_application_url(app_name="declarative-app-2") + assert httpx.post(url).text == "Hello!" # Now overwrite the dynamic app with a declarative one and check that it gets # deleted upon another apply that doesn't include it. 
@@ -534,11 +518,9 @@ def __call__(self, *args) -> str: ), ] client.deploy_apps(config) - wait_for_condition( - check_application_running, - name="declarative-app-2", - route_prefix="/app-2", - ) + wait_for_condition(check_running, app_name="declarative-app-2") + url = get_application_url(app_name="declarative-app-2") + assert httpx.post(url).text == "wonderful world" config.applications = [] client.deploy_apps(config) @@ -555,23 +537,24 @@ def test_change_route_prefix(serve_instance): "import_path": "ray.serve.tests.test_config_files.pid.node", } client.deploy_apps(ServeDeploySchema(**{"applications": [app_config]})) - wait_for_condition(check_running) - pid1 = httpx.get("http://localhost:8000/old").json()[0] - + url = get_application_url() + pid1 = httpx.get(url).json()[0] # Redeploy application with route prefix /new. app_config["route_prefix"] = "/new" client.deploy_apps(ServeDeploySchema(**{"applications": [app_config]})) - + wait_for_condition(check_running) # Check that the old route is gone and the response from the new route # has the same PID (replica wasn't restarted). def check_switched(): # Old route should be gone - resp = httpx.get("http://localhost:8000/old") + url = get_application_url(exclude_route_prefix=True) + resp = httpx.get(f"{url}/old") assert "Path '/old' not found." 
in resp.text # Response from new route should be same PID - pid2 = httpx.get("http://localhost:8000/new").json()[0] + url = get_application_url(exclude_route_prefix=True) + pid2 = httpx.get(f"{url}/new").json()[0] assert pid2 == pid1 return True @@ -611,6 +594,7 @@ def test_num_replicas_auto_api(serve_instance): "downscaling_factor": None, "smoothing_factor": 1.0, "initial_replicas": None, + "policy": {"name": "ray.serve.autoscaling_policy:default_autoscaling_policy"}, } @@ -663,6 +647,7 @@ def test_num_replicas_auto_basic(serve_instance): "downscaling_factor": None, "smoothing_factor": 1.0, "initial_replicas": None, + "policy": {"name": "ray.serve.autoscaling_policy:default_autoscaling_policy"}, } h = serve.get_app_handle(SERVE_DEFAULT_APP_NAME) diff --git a/python/ray/serve/tests/test_deployment_version.py b/python/ray/serve/tests/test_deployment_version.py index ce37a9100e74..b1202aee3134 100644 --- a/python/ray/serve/tests/test_deployment_version.py +++ b/python/ray/serve/tests/test_deployment_version.py @@ -17,6 +17,27 @@ def get_version(): assert len(set(ray.get([get_version.remote() for _ in range(100)]))) == 1 +def test_route_prefix_changes_trigger_reconfigure_hash(): + """Test that route prefix changes trigger a reconfigure hash change.""" + cfg = DeploymentConfig() + v1 = DeploymentVersion( + code_version="same version", + deployment_config=cfg, + ray_actor_options={}, + route_prefix="/a", + ) + v2 = DeploymentVersion( + code_version="same version", + deployment_config=cfg, + ray_actor_options={}, + route_prefix="/b", + ) + assert v1.reconfigure_actor_hash != v2.reconfigure_actor_hash + # Should not require a full actor restart if nothing else changed + assert not v1.requires_actor_restart(v2) + assert v1.requires_actor_reconfigure(v2) + + if __name__ == "__main__": import sys diff --git a/python/ray/serve/tests/test_failure.py b/python/ray/serve/tests/test_failure.py index 2df70aef6a1f..c2e1bc548495 100644 --- a/python/ray/serve/tests/test_failure.py +++ 
b/python/ray/serve/tests/test_failure.py @@ -16,26 +16,12 @@ from ray.serve._private.test_utils import ( Counter, check_num_replicas_eq, - get_application_url, get_deployment_details, + request_with_retries, tlog, ) -def request_with_retries(endpoint, timeout=30, app_name=SERVE_DEFAULT_APP_NAME): - start = time.time() - while True: - try: - return httpx.get( - get_application_url("HTTP", app_name=app_name) + endpoint, - timeout=timeout, - ) - except (httpx.RequestError, IndexError): - if time.time() - start > timeout: - raise TimeoutError - time.sleep(0.1) - - @pytest.mark.skip(reason="Consistently failing.") def test_controller_failure(serve_instance): @serve.deployment(name="controller_failure") @@ -44,16 +30,16 @@ def function(_): serve.run(function.bind()) - assert request_with_retries("/controller_failure/", timeout=1).text == "hello1" + assert request_with_retries(timeout=1).text == "hello1" for _ in range(10): - response = request_with_retries("/controller_failure/", timeout=30) + response = request_with_retries(timeout=30) assert response.text == "hello1" ray.kill(serve.context._global_client._controller, no_restart=False) for _ in range(10): - response = request_with_retries("/controller_failure/", timeout=30) + response = request_with_retries(timeout=30) assert response.text == "hello1" def function2(_): @@ -64,7 +50,7 @@ def function2(_): serve.run(function.options(func_or_class=function2).bind()) def check_controller_failure(): - response = request_with_retries("/controller_failure/", timeout=30) + response = request_with_retries(timeout=30) return response.text == "hello2" wait_for_condition(check_controller_failure) @@ -78,50 +64,12 @@ def function3(_): ray.kill(serve.context._global_client._controller, no_restart=False) for _ in range(10): - response = request_with_retries("/controller_failure/", timeout=30) + response = request_with_retries(timeout=30) assert response.text == "hello2" - response = request_with_retries("/controller_failure_2/", 
timeout=30) + response = request_with_retries(timeout=30) assert response.text == "hello3" -def _kill_http_proxies(): - http_proxies = ray.get( - serve.context._global_client._controller.get_proxies.remote() - ) - for http_proxy in http_proxies.values(): - ray.kill(http_proxy, no_restart=False) - - -def test_http_proxy_failure(serve_instance): - @serve.deployment(name="proxy_failure") - def function(_): - return "hello1" - - serve.run(function.bind()) - - assert request_with_retries("/proxy_failure/", timeout=1.0).text == "hello1" - - for _ in range(10): - response = request_with_retries("/proxy_failure/", timeout=30) - assert response.text == "hello1" - - _kill_http_proxies() - - def function2(_): - return "hello2" - - serve.run(function.options(func_or_class=function2).bind()) - - def check_new(): - for _ in range(10): - response = request_with_retries("/proxy_failure/", timeout=30) - if response.text != "hello2": - return False - return True - - wait_for_condition(check_new) - - def _get_worker_handles(deployment_name: str, app_name: str = SERVE_DEFAULT_APP_NAME): id = DeploymentID(name=deployment_name, app_name=app_name) controller = serve.context._global_client._controller @@ -141,7 +89,7 @@ def __call__(self, *args): serve.run(Worker1.bind()) # Get the PID of the worker. - old_pid = request_with_retries("/worker_failure/", timeout=1).text + old_pid = request_with_retries(timeout=1).text # Kill the worker. handles = _get_worker_handles("worker_failure") @@ -151,7 +99,7 @@ def __call__(self, *args): # Wait until the worker is killed and a one is started. 
start = time.time() while time.time() - start < 30: - response = request_with_retries("/worker_failure/", timeout=30) + response = request_with_retries(timeout=30) if response.text != old_pid: break else: @@ -192,7 +140,7 @@ def __call__(self, *args): start = time.time() while time.time() - start < 30: time.sleep(0.1) - response = request_with_retries("/replica_failure/", timeout=1).text + response = request_with_retries(timeout=1).text assert response in ["1", "2"] responses.add(response) if len(responses) > 1: @@ -211,7 +159,7 @@ def __call__(self, *args): try: # The timeout needs to be small here because the request to # the restarting worker will hang. - request_with_retries("/replica_failure/", timeout=0.1) + request_with_retries(timeout=0.1) break except TimeoutError: time.sleep(0.1) diff --git a/python/ray/serve/tests/test_handle_same_loop.py b/python/ray/serve/tests/test_handle_same_loop.py index 78dcde1963cb..c3086f71ae86 100644 --- a/python/ray/serve/tests/test_handle_same_loop.py +++ b/python/ray/serve/tests/test_handle_same_loop.py @@ -24,7 +24,7 @@ def _skip_test_if_router_running_in_separate_loop(): @pytest.mark.asyncio async def test_deployment_handle_works_with_await_when_router_in_same_loop( - serve_instance, _skip_test_if_router_running_in_separate_loop + serve_instance_async, _skip_test_if_router_running_in_separate_loop ): @serve.deployment class F: @@ -50,7 +50,7 @@ def __call__(self): @pytest.mark.asyncio async def test_deployment_handle_result_fails_in_async_context_but_await_succeeds( - serve_instance, _skip_test_if_router_running_in_separate_loop + serve_instance_async, _skip_test_if_router_running_in_separate_loop ): @serve.deployment class F: @@ -81,7 +81,9 @@ def __call__(self): @pytest.mark.asyncio -async def test_deployment_handle_configured_for_same_loop_via_init(serve_instance): +async def test_deployment_handle_configured_for_same_loop_via_init( + serve_instance_async, +): @serve.deployment class F: def __call__(self): @@ -119,7 
+121,7 @@ async def __call__(self): @pytest.mark.asyncio async def test_deployment_handle_exception_propagation_in_same_loop( - serve_instance, _skip_test_if_router_running_in_separate_loop + serve_instance_async, _skip_test_if_router_running_in_separate_loop ): """Test that exceptions are properly propagated when router runs in same loop.""" @@ -136,7 +138,7 @@ def __call__(self): @pytest.mark.asyncio async def test_streaming_response_generator_in_same_loop( - serve_instance, _skip_test_if_router_running_in_separate_loop + serve_instance_async, _skip_test_if_router_running_in_separate_loop ): """Test that streaming responses work correctly when router runs in same loop.""" @@ -159,7 +161,7 @@ def generate_numbers(self, limit: int): @pytest.mark.asyncio async def test_concurrent_requests_in_same_loop( - serve_instance, _skip_test_if_router_running_in_separate_loop + serve_instance_async, _skip_test_if_router_running_in_separate_loop ): """Test that multiple concurrent requests work correctly in same loop mode.""" @@ -185,7 +187,7 @@ async def slow_operation(self, delay: float, value: str): @pytest.mark.asyncio async def test_request_cancellation_in_same_loop( - serve_instance, _skip_test_if_router_running_in_separate_loop + serve_instance_async, _skip_test_if_router_running_in_separate_loop ): """Test that request cancellation works correctly when router runs in same loop.""" signal_actor = SignalActor.remote() @@ -218,5 +220,27 @@ async def check_num_waiters(): await signal_actor.send.remote(clear=True) +@pytest.mark.asyncio +async def test_multiple_awaits(serve_instance_async): + """Test that multiple awaits doesn't call replica multiple times.""" + a = 0 + + @serve.deployment + async def foo(): + nonlocal a + a += 1 + return a + + app = serve.run(foo.bind()) + + response = app.remote() + assert await response == 1 + assert await response == 1 + + response = app.remote() + assert await response == 2 + assert await response == 2 + + if __name__ == "__main__": 
sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/test_http_routes.py b/python/ray/serve/tests/test_http_routes.py index 8d2cbe4410ae..4fafc1360e73 100644 --- a/python/ray/serve/tests/test_http_routes.py +++ b/python/ray/serve/tests/test_http_routes.py @@ -8,6 +8,7 @@ import ray from ray import serve from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME +from ray.serve._private.test_utils import get_application_url def test_path_validation(serve_instance): @@ -67,15 +68,22 @@ def __call__(self, *args): serve.run(D2.bind(), name="app2", route_prefix="/hello/world") routes = httpx.get("http://localhost:8000/-/routes").json() - assert len(routes) == 2, routes - assert httpx.get("http://localhost:8000/D1").text == "D1" - assert httpx.get("http://localhost:8000/D1").status_code == 200 - assert httpx.get("http://localhost:8000/hello/world").text == "D2" - assert httpx.get("http://localhost:8000/hello/world").status_code == 200 + app1_url = get_application_url(app_name="app1") + app2_url = get_application_url(app_name="app2") + + assert httpx.get(app1_url).text == "D1" + assert httpx.get(app1_url).status_code == 200 + assert httpx.get(app2_url).text == "D2" + assert httpx.get(app2_url).status_code == 200 assert httpx.get("http://localhost:8000/not_exist").status_code == 404 - assert httpx.get("http://localhost:8000/").status_code == 404 + + app1_url = get_application_url(app_name="app1", exclude_route_prefix=True) + app2_url = get_application_url(app_name="app2", exclude_route_prefix=True) + + assert httpx.get(f"{app1_url}/").status_code == 404 + assert httpx.get(f"{app2_url}/").status_code == 404 def test_deployment_without_route(serve_instance): @@ -85,8 +93,8 @@ def __call__(self, *args): return "1" serve.run(D.bind(), route_prefix=None) - routes = httpx.get("http://localhost:8000/-/routes").json() - assert len(routes) == 0 + routes = httpx.get("http://localhost:8000/-/routes") + assert len(routes.json()) == 0 # make sure the 
deployment is not exposed under the default route r = httpx.get("http://localhost:8000/") @@ -99,16 +107,17 @@ class D1: pass serve.run(D1.bind()) - - routes = httpx.get("http://localhost:8000/-/routes").json() + url = get_application_url(exclude_route_prefix=True) + routes = httpx.get(f"{url}/-/routes").json() assert len(routes) == 1 assert "/" in routes, routes assert routes["/"] == SERVE_DEFAULT_APP_NAME def test_path_prefixing_1(serve_instance): - def check_req(subpath, text=None, status=None): - r = httpx.get(f"http://localhost:8000{subpath}") + def check_req(subpath, app_name, text=None, status=None): + url = get_application_url(app_name=app_name, exclude_route_prefix=True) + r = httpx.get(f"{url}{subpath}") if text is not None: assert r.text == text, f"{r.text} != {text}" if status is not None: @@ -122,10 +131,10 @@ def __call__(self, *args): return "1" serve.run(D1.bind(), route_prefix="/hello", name="app1") - check_req("/", status=404) - check_req("/hello", text="1") - check_req("/hello/", text="1") - check_req("/hello/a", text="1") + check_req("/", "app1", status=404) + check_req("/hello", "app1", text="1") + check_req("/hello/", "app1", text="1") + check_req("/hello/a", "app1", text="1") @serve.deployment class D2: @@ -133,10 +142,10 @@ def __call__(self, *args): return "2" serve.run(D2.bind(), route_prefix="/", name="app2") - check_req("/hello/", text="1") - check_req("/hello/a", text="1") - check_req("/", text="2") - check_req("/a", text="2") + check_req("/hello/", "app1", text="1") + check_req("/hello/a", "app1", text="1") + check_req("/", "app2", text="2") + check_req("/a", "app2", text="2") @serve.deployment class D3: @@ -144,9 +153,9 @@ def __call__(self, *args): return "3" serve.run(D3.bind(), route_prefix="/hello/world", name="app3") - check_req("/hello/", text="1") - check_req("/", text="2") - check_req("/hello/world/", text="3") + check_req("/hello/", "app1", text="1") + check_req("/", "app2", text="2") + check_req("/hello/world/", "app3", 
text="3") app = FastAPI() @@ -162,11 +171,11 @@ def subpath(self, p: str): return p serve.run(D4.bind(), route_prefix="/hello/world/again", name="app4") - check_req("/hello/") == "1" - check_req("/") == "2" - check_req("/hello/world/") == "3" - check_req("/hello/world/again/") == "4" - check_req("/hello/world/again/hi") == '"hi"' + check_req("/hello/", "app1") == "1" + check_req("/", "app2") == "2" + check_req("/hello/world/", "app3") == "3" + check_req("/hello/world/again/", "app4") == "4" + check_req("/hello/world/again/hi", "app4") == '"hi"' @pytest.mark.parametrize("base_path", ["", "subpath"]) @@ -201,14 +210,13 @@ def redirect_twice(self, request: Request): if route_prefix != "/": route_prefix += "/" - r = httpx.get(f"http://localhost:8000{route_prefix}redirect", follow_redirects=True) + url = get_application_url(exclude_route_prefix=True) + r = httpx.get(f"{url}{route_prefix}redirect", follow_redirects=True) assert r.status_code == 200 assert len(r.history) == 1 assert r.json() == "hello from /" - r = httpx.get( - f"http://localhost:8000{route_prefix}redirect2", follow_redirects=True - ) + r = httpx.get(f"{url}{route_prefix}redirect2", follow_redirects=True) assert r.status_code == 200 assert len(r.history) == 2 assert r.json() == "hello from /" @@ -220,7 +228,9 @@ def f(): _ = 1 / 0 serve.run(f.bind()) - r = httpx.get("http://localhost:8000/f") + url = get_application_url(exclude_route_prefix=True) + # Error is raised when the request reaches the deployed replica. + r = httpx.get(f"{url}/f") assert r.status_code == 500 assert r.text == "Internal Server Error" @@ -234,6 +244,7 @@ def h(): time.sleep(100) # Don't return here to leave time for actor exit. serve.run(h.bind()) + # Error is raised before the request reaches the deployed replica as the replica does not exist. 
r = httpx.get("http://localhost:8000/h") assert r.status_code == 500 diff --git a/python/ray/serve/tests/test_https_proxy.py b/python/ray/serve/tests/test_https_proxy.py new file mode 100644 index 000000000000..051960eafd1d --- /dev/null +++ b/python/ray/serve/tests/test_https_proxy.py @@ -0,0 +1,495 @@ +import asyncio +import json +import os +import ssl +import tempfile + +import pytest +import requests +import websockets + +import ray +from ray import serve +from ray._private.tls_utils import generate_self_signed_tls_certs +from ray.serve.config import HTTPOptions + + +@pytest.fixture(scope="session") +def ssl_cert_and_key(): + """Generate SSL certificates using Ray's built-in utilities for testing.""" + # Generate certificate and key using Ray's utility + cert_contents, key_contents = generate_self_signed_tls_certs() + + # Create temp directory that persists for the session + temp_dir = tempfile.mkdtemp(prefix="ray_serve_https_test_") + + # Write server certificate and key + cert_path = os.path.join(temp_dir, "server.crt") + key_path = os.path.join(temp_dir, "server.key") + + with open(cert_path, "w") as f: + f.write(cert_contents) + with open(key_path, "w") as f: + f.write(key_contents) + + yield { + "key_path": key_path, + "cert_path": cert_path, + "temp_dir": temp_dir, + } + + # Cleanup + import shutil + + try: + shutil.rmtree(temp_dir) + except Exception: + pass # Ignore cleanup errors + + +@pytest.fixture +def https_serve_instance(ssl_cert_and_key): + """Start Ray Serve with HTTPS enabled.""" + # Ensure Ray is shutdown before starting + try: + ray.shutdown() + except Exception: + pass + + # Disable runtime env upload (dashboard should work now that it's built) + ray.init(runtime_env={"working_dir": None}) + serve.start( + http_options=HTTPOptions( + ssl_keyfile=ssl_cert_and_key["key_path"], + ssl_certfile=ssl_cert_and_key["cert_path"], + ) + ) + yield serve + serve.shutdown() + ray.shutdown() + + +class TestHTTPSProxy: + def 
test_https_basic_deployment(self, https_serve_instance): + """Test basic HTTPS deployment functionality.""" + + @serve.deployment + def hello(): + return "Hello HTTPS!" + + serve.run(hello.bind()) + + # Test HTTPS request with certificate verification disabled for self-signed cert + response = requests.get( + "https://localhost:8000/hello", + verify=False, # Skip cert verification for self-signed + ) + assert response.status_code == 200 + assert response.text == "Hello HTTPS!" + + def test_https_vs_http_requests(self, https_serve_instance): + """Test that HTTP requests fail when HTTPS is enabled.""" + + @serve.deployment + def echo(): + return "echo" + + serve.run(echo.bind()) + + # HTTPS request should succeed + https_response = requests.get("https://localhost:8000/echo", verify=False) + assert https_response.status_code == 200 + + # HTTP request should fail with connection error + with pytest.raises(requests.exceptions.ConnectionError): + requests.get("http://localhost:8000/echo", timeout=5) + + def test_https_with_fastapi_deployment(self, https_serve_instance): + """Test HTTPS with FastAPI-based deployment.""" + from fastapi import FastAPI + + app = FastAPI() + + @app.get("/items/{item_id}") + async def read_item(item_id: int): + return {"item_id": item_id, "secure": True} + + @serve.deployment + @serve.ingress(app) + class FastAPIDeployment: + pass + + serve.run(FastAPIDeployment.bind()) + + response = requests.get("https://localhost:8000/items/42", verify=False) + assert response.status_code == 200 + assert response.json() == {"item_id": 42, "secure": True} + + def test_https_concurrent_requests(self, https_serve_instance): + """Test HTTPS with concurrent requests.""" + import concurrent.futures + + @serve.deployment + def concurrent_handler(): + import time + + time.sleep(0.1) # Small delay to test concurrency + return "concurrent" + + serve.run(concurrent_handler.bind()) + + def make_request(): + return requests.get( + 
"https://localhost:8000/concurrent_handler", verify=False + ) + + # Send 10 concurrent requests + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: + futures = [executor.submit(make_request) for _ in range(10)] + responses = [f.result() for f in futures] + + # All requests should succeed + for response in responses: + assert response.status_code == 200 + assert response.text == "concurrent" + + def test_https_large_payload(self, https_serve_instance): + """Test HTTPS with large payloads.""" + + @serve.deployment + class LargePayloadHandler: + def __call__(self, request): + # Return a large response (1MB) + large_data = "x" * (1024 * 1024) # 1MB string + return {"data": large_data, "size": len(large_data)} + + serve.run(LargePayloadHandler.bind()) + + response = requests.get( + "https://localhost:8000/LargePayloadHandler", verify=False + ) + assert response.status_code == 200 + data = response.json() + assert data["size"] == 1024 * 1024 + assert len(data["data"]) == 1024 * 1024 + + def test_https_websocket_with_fastapi(self, https_serve_instance): + """Test WebSocket functionality with FastAPI over HTTPS.""" + from fastapi import FastAPI, WebSocket, WebSocketDisconnect + + app = FastAPI() + + @app.websocket("/ws") + async def websocket_endpoint(websocket: WebSocket): + await websocket.accept() + try: + while True: + # Receive message from client + data = await websocket.receive_text() + message = json.loads(data) + + # Echo back with modification + response = { + "echo": message.get("message", ""), + "secure": True, + "protocol": "wss", + } + await websocket.send_text(json.dumps(response)) + except WebSocketDisconnect: + pass + + @serve.deployment + @serve.ingress(app) + class WebSocketDeployment: + pass + + serve.run(WebSocketDeployment.bind()) + + # Test WebSocket connection over HTTPS (wss://) + async def test_websocket(): + # Create SSL context that doesn't verify certificates (for self-signed certs) + ssl_context = 
ssl.create_default_context() + ssl_context.check_hostname = False + ssl_context.verify_mode = ssl.CERT_NONE + + uri = "wss://localhost:8000/ws" + async with websockets.connect(uri, ssl=ssl_context) as websocket: + # Send test message + test_message = {"message": "Hello WebSocket over HTTPS!"} + await websocket.send(json.dumps(test_message)) + + # Receive response + response = await websocket.recv() + data = json.loads(response) + + # Verify response + assert data["echo"] == "Hello WebSocket over HTTPS!" + assert data["secure"] is True + assert data["protocol"] == "wss" + + # Send another message to test bidirectional communication + test_message2 = {"message": "Second message"} + await websocket.send(json.dumps(test_message2)) + + response2 = await websocket.recv() + data2 = json.loads(response2) + assert data2["echo"] == "Second message" + + # Run the async test + asyncio.run(test_websocket()) + + def test_https_websocket_multiple_connections(self, https_serve_instance): + """Test multiple WebSocket connections over HTTPS.""" + from fastapi import FastAPI, WebSocket, WebSocketDisconnect + + app = FastAPI() + + # Store active connections + connections = [] + + @app.websocket("/ws/broadcast") + async def websocket_broadcast(websocket: WebSocket): + await websocket.accept() + connections.append(websocket) + try: + while True: + data = await websocket.receive_text() + message = json.loads(data) + + # Broadcast to all connections + broadcast_message = { + "type": "broadcast", + "message": message.get("message", ""), + "connections": len(connections), + "secure": True, + } + + # Send to all connected clients + disconnected = [] + for conn in connections: + try: + await conn.send_text(json.dumps(broadcast_message)) + except Exception: + disconnected.append(conn) + + # Remove disconnected clients + for conn in disconnected: + connections.remove(conn) + + except WebSocketDisconnect: + if websocket in connections: + connections.remove(websocket) + + @serve.deployment + 
@serve.ingress(app) + class WebSocketBroadcastDeployment: + pass + + serve.run(WebSocketBroadcastDeployment.bind()) + + async def test_multiple_websockets(): + ssl_context = ssl.create_default_context() + ssl_context.check_hostname = False + ssl_context.verify_mode = ssl.CERT_NONE + + uri = "wss://localhost:8000/ws/broadcast" + + # Connect multiple clients + websocket1 = await websockets.connect(uri, ssl=ssl_context) + websocket2 = await websockets.connect(uri, ssl=ssl_context) + + try: + # Send message from client 1 + test_message = {"message": "Hello from client 1"} + await websocket1.send(json.dumps(test_message)) + + # Both clients should receive the broadcast + response1 = await websocket1.recv() + response2 = await websocket2.recv() + + data1 = json.loads(response1) + data2 = json.loads(response2) + + # Verify both received the same broadcast + assert data1["type"] == "broadcast" + assert data1["message"] == "Hello from client 1" + assert data1["connections"] == 2 + assert data1["secure"] is True + + assert data2["type"] == "broadcast" + assert data2["message"] == "Hello from client 1" + assert data2["connections"] == 2 + assert data2["secure"] is True + + finally: + await websocket1.close() + await websocket2.close() + + # Run the async test + asyncio.run(test_multiple_websockets()) + + +class TestSSLConfiguration: + def test_ssl_config_validation_success(self, ssl_cert_and_key): + """Test successful SSL configuration validation.""" + key_path = ssl_cert_and_key["key_path"] + cert_path = ssl_cert_and_key["cert_path"] + + # Should not raise exception + options = HTTPOptions(ssl_keyfile=key_path, ssl_certfile=cert_path) + assert options.ssl_keyfile == key_path + assert options.ssl_certfile == cert_path + + def test_ssl_config_validation_missing_key(self): + """Test SSL configuration validation with missing key file.""" + with tempfile.TemporaryDirectory() as temp_dir: + cert_path = os.path.join(temp_dir, "test.crt") + with open(cert_path, "w") as f: + 
f.write("dummy cert") + + with pytest.raises(ValueError) as exc_info: + HTTPOptions(ssl_keyfile=None, ssl_certfile=cert_path) + + assert "Both ssl_keyfile and ssl_certfile must be provided together" in str( + exc_info.value + ) + + def test_ssl_config_validation_missing_cert(self): + """Test SSL configuration validation with missing cert file.""" + with tempfile.TemporaryDirectory() as temp_dir: + key_path = os.path.join(temp_dir, "test.key") + with open(key_path, "w") as f: + f.write("dummy key") + + with pytest.raises(ValueError) as exc_info: + HTTPOptions(ssl_keyfile=key_path, ssl_certfile=None) + + assert "Both ssl_keyfile and ssl_certfile must be provided together" in str( + exc_info.value + ) + + def test_ssl_config_with_password(self, ssl_cert_and_key): + """Test SSL configuration with key file password.""" + key_path = ssl_cert_and_key["key_path"] + cert_path = ssl_cert_and_key["cert_path"] + + options = HTTPOptions( + ssl_keyfile=key_path, ssl_certfile=cert_path, ssl_keyfile_password="secret" + ) + assert options.ssl_keyfile_password == "secret" + + def test_ssl_config_with_ca_certs(self, ssl_cert_and_key): + """Test SSL configuration with CA certificates.""" + key_path = ssl_cert_and_key["key_path"] + cert_path = ssl_cert_and_key["cert_path"] + # Use cert as CA for testing purposes + ca_path = cert_path + + options = HTTPOptions( + ssl_keyfile=key_path, ssl_certfile=cert_path, ssl_ca_certs=ca_path + ) + assert options.ssl_ca_certs == ca_path + + +class TestHTTPSErrorHandling: + def test_ssl_file_paths_validation(self): + """Test that SSL file paths are properly configured in HTTPOptions.""" + with tempfile.TemporaryDirectory() as temp_dir: + key_path = os.path.join(temp_dir, "test.key") + cert_path = os.path.join(temp_dir, "test.crt") + + # Create dummy files (content doesn't matter for this test) + with open(key_path, "w") as f: + f.write("dummy key") + with open(cert_path, "w") as f: + f.write("dummy cert") + + # Test that HTTPOptions accepts valid file 
paths + options = HTTPOptions(ssl_keyfile=key_path, ssl_certfile=cert_path) + assert options.ssl_keyfile == key_path + assert options.ssl_certfile == cert_path + + def test_https_requires_both_cert_and_key_files(self): + """Test that HTTPS configuration requires both certificate and key files.""" + # This test validates our SSL validation logic works correctly + + # Should work with both files + options = HTTPOptions(ssl_keyfile="key.pem", ssl_certfile="cert.pem") + assert options.ssl_keyfile == "key.pem" + assert options.ssl_certfile == "cert.pem" + + # Should work with neither file + options = HTTPOptions() + assert options.ssl_keyfile is None + assert options.ssl_certfile is None + + +class TestHTTPSIntegration: + def test_https_with_custom_port(self, ssl_cert_and_key): + """Test HTTPS on custom port.""" + # Ensure Ray is shutdown before starting + try: + ray.shutdown() + except Exception: + pass + + # Disable dashboard to prevent SSL conflicts and disable runtime env upload + ray.init(include_dashboard=False, runtime_env={"working_dir": None}) + + try: + serve.start( + http_options=HTTPOptions( + host="127.0.0.1", + port=8443, + ssl_keyfile=ssl_cert_and_key["key_path"], + ssl_certfile=ssl_cert_and_key["cert_path"], + ) + ) + + @serve.deployment + def custom_port_handler(): + return "custom port" + + serve.run(custom_port_handler.bind()) + + response = requests.get( + "https://127.0.0.1:8443/custom_port_handler", verify=False + ) + assert response.status_code == 200 + assert response.text == "custom port" + finally: + try: + serve.shutdown() + except Exception: + pass + ray.shutdown() + + def test_https_deployment_update(self, https_serve_instance): + """Test deployment updates work correctly with HTTPS.""" + + @serve.deployment + def updatable(): + return "version 1" + + serve.run(updatable.bind()) + + # Test initial version + response = requests.get("https://localhost:8000/updatable", verify=False) + assert response.text == "version 1" + + # Update deployment 
+ @serve.deployment + def updatable(): + return "version 2" + + serve.run(updatable.bind()) + + # Test updated version + response = requests.get("https://localhost:8000/updatable", verify=False) + assert response.text == "version 2" + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/test_logging.py b/python/ray/serve/tests/test_logging.py index 01a551613a25..c1c1b612f338 100644 --- a/python/ray/serve/tests/test_logging.py +++ b/python/ray/serve/tests/test_logging.py @@ -7,6 +7,7 @@ import sys import time import uuid +from collections import Counter from contextlib import redirect_stderr from pathlib import Path from typing import List, Tuple @@ -21,8 +22,8 @@ import ray import ray.util.state as state_api from ray import serve +from ray._common.formatters import JSONFormatter from ray._common.test_utils import wait_for_condition -from ray._private.ray_logging.formatters import JSONFormatter from ray.serve._private.common import DeploymentID, ReplicaID, ServeComponentType from ray.serve._private.constants import SERVE_LOG_EXTRA_FIELDS, SERVE_LOGGER_NAME from ray.serve._private.logging_utils import ( @@ -1172,39 +1173,44 @@ def disable_stdout(): httpx.get(url, timeout=None) # Check if each of the logs exist in Serve's log files. - from_serve_logger_check = False - from_print_check = False - from_error_check = False - direct_from_stdout = False - direct_from_stderr = False - multiline_log = False - for log_file in os.listdir(logs_dir): - if log_file.startswith("replica_default_disable_stdout"): - with open(logs_dir / log_file) as f: - for line in f: - structured_log = json.loads(line) - message = structured_log["message"] - exc_text = structured_log.get("exc_text", "") - if "from_serve_logger" in message: - from_serve_logger_check = True - elif "from_print" in message: - from_print_check = True - - # Error was logged from replica directly. 
- elif "from_error" in exc_text: - from_error_check = True - elif "direct_from_stdout" in message: - direct_from_stdout = True - elif "direct_from_stderr" in message: - direct_from_stderr = True - elif "this\nis\nmultiline\nlog\n" in message: - multiline_log = True - assert from_serve_logger_check - assert from_print_check - assert from_error_check - assert direct_from_stdout - assert direct_from_stderr - assert multiline_log + def _all_expected_logs_exist(): + from_serve_logger_check = False + from_print_check = False + from_error_check = False + direct_from_stdout = False + direct_from_stderr = False + multiline_log = False + + for log_file in os.listdir(logs_dir): + if log_file.startswith("replica_default_disable_stdout"): + with open(logs_dir / log_file) as f: + for line in f: + structured_log = json.loads(line) + message = structured_log["message"] + exc_text = structured_log.get("exc_text", "") + + if "from_serve_logger" in message: + from_serve_logger_check = True + elif "from_print" in message: + from_print_check = True + elif "from_error" in exc_text: + from_error_check = True + elif "direct_from_stdout" in message: + direct_from_stdout = True + elif "direct_from_stderr" in message: + direct_from_stderr = True + elif "this\nis\nmultiline\nlog\n" in message: + multiline_log = True + + assert from_serve_logger_check + assert from_print_check + assert from_error_check + assert direct_from_stdout + assert direct_from_stderr + assert multiline_log + return True + + wait_for_condition(_all_expected_logs_exist) @pytest.mark.skipif(sys.platform == "win32", reason="Fail to look for temp dir.") @@ -1355,5 +1361,52 @@ def test_configure_default_serve_logger_with_stderr_redirect( assert not isinstance(sys.stderr, StreamToLogger) +@pytest.mark.parametrize( + "ray_instance", + [ + {"RAY_SERVE_REQUEST_PATH_LOG_BUFFER_SIZE": "1"}, + {"RAY_SERVE_REQUEST_PATH_LOG_BUFFER_SIZE": "100"}, + ], + indirect=True, +) +def 
test_request_id_uniqueness_with_buffering(serve_and_ray_shutdown, ray_instance): + """Test request IDs are unique when buffering is enabled.""" + + logger = logging.getLogger("ray.serve") + + @serve.deployment(logging_config={"encoding": "JSON"}) + class TestApp: + async def __call__(self): + logger.info("Processing request") + logger.info("Additional log entry") + return "OK" + + serve.run(TestApp.bind()) + for _ in range(200): + httpx.get("http://127.0.0.1:8000/") + + logs_dir = get_serve_logs_dir() + + def check_logs(): + for log_file in os.listdir(logs_dir): + if log_file.startswith("replica"): + with open(os.path.join(logs_dir, log_file)) as f: + log_request_ids = [] + for line in f: + log_entry = json.loads(line) + request_id = log_entry.get("request_id", None) + message = log_entry.get("message", None) + if request_id: + # Append the (request_id, message) pairs to the list + log_request_ids.append((request_id, message)) + # Check that there are no duplicate (request_id, message) pairs + request_id_counts = Counter(log_request_ids) + for _, count in request_id_counts.items(): + assert count == 1, "Request ID duplicates when buffering" + return True + + wait_for_condition(check_logs) + + if __name__ == "__main__": sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/test_multiplex.py b/python/ray/serve/tests/test_multiplex.py index 1ebc29066181..b93857f12c03 100644 --- a/python/ray/serve/tests/test_multiplex.py +++ b/python/ray/serve/tests/test_multiplex.py @@ -34,6 +34,8 @@ def start_serve_with_context(): ), servable_object=None, _deployment_config=DeploymentConfig(), + rank=0, + world_size=1, ) try: yield diff --git a/python/ray/serve/tests/test_persistence.py b/python/ray/serve/tests/test_persistence.py index f2a79348791b..0c924ccb817d 100644 --- a/python/ray/serve/tests/test_persistence.py +++ b/python/ray/serve/tests/test_persistence.py @@ -4,7 +4,8 @@ def test_new_driver(serve_instance): - script = """ + run_string_as_driver( 
+ """ import ray ray.init(address="{}", namespace="default_test_namespace") @@ -16,9 +17,9 @@ def driver(): serve.run(driver.bind(), name="app") """.format( - ray._private.worker._global_node.address + ray.get_runtime_context().gcs_address, + ) ) - run_string_as_driver(script) handle = serve.get_app_handle("app") assert handle.remote().result() == "OK!" diff --git a/python/ray/serve/tests/test_proxy.py b/python/ray/serve/tests/test_proxy.py index 6393194fc53b..28337cfbda96 100644 --- a/python/ray/serve/tests/test_proxy.py +++ b/python/ray/serve/tests/test_proxy.py @@ -9,6 +9,7 @@ from ray._common.network_utils import build_address from ray._common.test_utils import wait_for_condition from ray.actor import ActorHandle +from ray.cluster_utils import Cluster from ray.serve._private.constants import ( DEFAULT_UVICORN_KEEP_ALIVE_TIMEOUT_S, SERVE_NAMESPACE, @@ -16,12 +17,25 @@ from ray.serve._private.test_utils import ( ping_grpc_healthz, ping_grpc_list_applications, + request_with_retries, ) from ray.serve.config import gRPCOptions +from ray.serve.context import _get_global_client from ray.serve.generated import serve_pb2 +from ray.serve.schema import ProxyStatus, ServeInstanceDetails +from ray.tests.conftest import call_ray_stop_only # noqa: F401 from ray.util.state import list_actors +@pytest.fixture +def shutdown_ray(): + if ray.is_initialized(): + ray.shutdown() + yield + if ray.is_initialized(): + ray.shutdown() + + class TestTimeoutKeepAliveConfig: """Test setting keep_alive_timeout_s in config and env.""" @@ -224,5 +238,123 @@ def check_replicas_on_worker_nodes(): ping_grpc_healthz(worker_node_channel, test_draining=True) +def test_drain_and_undrain_http_proxy_actors( + monkeypatch, shutdown_ray, call_ray_stop_only # noqa: F811 +): + """Test the state transtion of the proxy actor between + HEALTHY, DRAINING and DRAINED + """ + monkeypatch.setenv("RAY_SERVE_PROXY_MIN_DRAINING_PERIOD_S", "10") + + cluster = Cluster() + head_node = cluster.add_node(num_cpus=0) + 
cluster.add_node(num_cpus=1) + cluster.add_node(num_cpus=1) + cluster.wait_for_nodes() + ray.init(address=head_node.address) + serve.start(http_options={"location": "EveryNode"}) + + @serve.deployment + class HelloModel: + def __call__(self): + return "hello" + + serve.run(HelloModel.options(num_replicas=2).bind()) + + # 3 proxies, 1 controller, 2 replicas. + wait_for_condition(lambda: len(list_actors()) == 6) + assert len(ray.nodes()) == 3 + + client = _get_global_client() + serve_details = ServeInstanceDetails( + **ray.get(client._controller.get_serve_instance_details.remote()) + ) + proxy_actor_ids = {proxy.actor_id for _, proxy in serve_details.proxies.items()} + + assert len(proxy_actor_ids) == 3 + + serve.run(HelloModel.options(num_replicas=1).bind()) + # 1 proxy should be draining + + def check_proxy_status(proxy_status_to_count): + serve_details = ServeInstanceDetails( + **ray.get(client._controller.get_serve_instance_details.remote()) + ) + proxy_status_list = [proxy.status for _, proxy in serve_details.proxies.items()] + print("all proxies!!!", [proxy for _, proxy in serve_details.proxies.items()]) + current_status = { + status: proxy_status_list.count(status) for status in proxy_status_list + } + return current_status == proxy_status_to_count, current_status + + wait_for_condition( + condition_predictor=check_proxy_status, + proxy_status_to_count={ProxyStatus.HEALTHY: 2, ProxyStatus.DRAINING: 1}, + ) + + serve.run(HelloModel.options(num_replicas=2).bind()) + # The draining proxy should become healthy. + wait_for_condition( + condition_predictor=check_proxy_status, + proxy_status_to_count={ProxyStatus.HEALTHY: 3}, + ) + serve_details = ServeInstanceDetails( + **ray.get(client._controller.get_serve_instance_details.remote()) + ) + + assert { + proxy.actor_id for _, proxy in serve_details.proxies.items() + } == proxy_actor_ids + + serve.run(HelloModel.options(num_replicas=1).bind()) + # 1 proxy should be draining and eventually be drained. 
+ wait_for_condition( + condition_predictor=check_proxy_status, + timeout=40, + proxy_status_to_count={ProxyStatus.HEALTHY: 2}, + ) + + # Clean up serve. + serve.shutdown() + + +def _kill_http_proxies(): + http_proxies = ray.get( + serve.context._global_client._controller.get_proxies.remote() + ) + for http_proxy in http_proxies.values(): + ray.kill(http_proxy, no_restart=False) + + +def test_http_proxy_failure(serve_instance): + @serve.deployment(name="proxy_failure") + def function(_): + return "hello1" + + serve.run(function.bind()) + + assert request_with_retries(timeout=1.0).text == "hello1" + + for _ in range(10): + response = request_with_retries(timeout=30) + assert response.text == "hello1" + + _kill_http_proxies() + + def function2(_): + return "hello2" + + serve.run(function.options(func_or_class=function2).bind()) + + def check_new(): + for _ in range(10): + response = request_with_retries(timeout=30) + if response.text != "hello2": + return False + return True + + wait_for_condition(check_new) + + if __name__ == "__main__": sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/test_replica_ranks.py b/python/ray/serve/tests/test_replica_ranks.py new file mode 100644 index 000000000000..c2f26b0754c7 --- /dev/null +++ b/python/ray/serve/tests/test_replica_ranks.py @@ -0,0 +1,399 @@ +import random +import sys +from typing import Dict, List + +import pytest + +import ray +from ray import serve +from ray._common.test_utils import SignalActor, wait_for_condition +from ray.serve._private.common import ( + DeploymentID, + DeploymentStatus, + ReplicaState, +) +from ray.serve._private.constants import ( + SERVE_CONTROLLER_NAME, + SERVE_DEFAULT_APP_NAME, + SERVE_NAMESPACE, +) +from ray.serve._private.controller import ServeController +from ray.serve._private.test_utils import ( + check_deployment_status, + check_num_replicas_eq, +) + + +def get_controller() -> ServeController: + """Get the current ServeController actor.""" + return 
ray.get_actor(SERVE_CONTROLLER_NAME, namespace=SERVE_NAMESPACE) + + +def get_replica_ranks(deployment_name: str) -> Dict[str, int]: + """Get the current rank mapping for all replicas in a deployment.""" + controller = get_controller() + deployment_id = DeploymentID(name=deployment_name, app_name=SERVE_DEFAULT_APP_NAME) + + # Use the public API method on the controller + return ray.get(controller._get_replica_ranks_mapping.remote(deployment_id)) + + +def get_running_replica_ids(deployment_name: str) -> List[str]: + """Get the replica IDs of running replicas for given deployment.""" + controller = get_controller() + deployment_id = DeploymentID(name=deployment_name, app_name=SERVE_DEFAULT_APP_NAME) + + replicas = ray.get( + controller._dump_replica_states_for_testing.remote(deployment_id) + ) + running_replicas = replicas.get([ReplicaState.RUNNING]) + return [replica.replica_id.unique_id for replica in running_replicas] + + +def check_rank_contiguity(ranks: Dict[str, int]) -> bool: + """Check that ranks form a contiguous sequence from 0 to N-1.""" + if not ranks: + return True + + rank_values = sorted(ranks.values()) + expected = list(range(len(rank_values))) + assert rank_values == expected, f"Expected {expected}, got {rank_values}" + return True + + +def check_rank_assignment_complete(deployment_name: str, expected_count: int) -> bool: + """Check that all replicas have been assigned ranks and they are contiguous.""" + try: + replica_ids = get_running_replica_ids(deployment_name) + ranks = get_replica_ranks(deployment_name) + + # Check all running replicas have ranks + for replica_id in replica_ids: + if replica_id not in ranks: + print(f"Replica {replica_id} not found in ranks: {ranks}") + return False + + # Check we have expected number of ranks + if len(ranks) != expected_count: + print(f"Expected {expected_count} ranks, got {len(ranks)}: {ranks}") + return False + + # Check ranks are contiguous + return check_rank_contiguity(ranks) + except Exception as e: + 
print(f"Error checking rank assignment: {e}") + return False + + +@pytest.mark.parametrize("num_replicas", [1, 3, 5]) +def test_basic_rank_assignment(serve_instance, num_replicas): + """Test basic rank assignment for different numbers of replicas.""" + + @serve.deployment(num_replicas=num_replicas) + class RankTracker: + def __init__(self): + self.replica_rank = None + self.world_size = None + + def __call__(self): + context = serve.get_replica_context() + self.replica_rank = context.rank + self.world_size = context.world_size + return { + "rank": self.replica_rank, + "world_size": self.world_size, + } + + handle = serve.run(RankTracker.bind()) + + # Wait for all replicas to be running and have ranks assigned + wait_for_condition( + lambda: check_rank_assignment_complete("RankTracker", num_replicas), + ) + + # Verify ranks are correctly assigned + ranks = get_replica_ranks("RankTracker") + assert len(ranks) == num_replicas + assert check_rank_contiguity(ranks) + + # Verify replicas can access their ranks via API + responses = [] + for _ in range(10): # Make multiple requests to hit different replicas + response = handle.remote().result() + responses.append(response) + + # Check that we got responses from all replicas + seen_ranks = set() + for response in responses: + assert response["world_size"] == num_replicas + if response["rank"] is not None: + seen_ranks.add(response["rank"]) + + # We should eventually see all ranks (though it might take multiple requests) + assert len(seen_ranks) <= num_replicas + for rank in seen_ranks: + assert 0 <= rank < num_replicas + + +def test_rank_assignment_with_autoscaling(serve_instance): + """Test rank assignment and reassignment during autoscaling.""" + signal_actor = SignalActor.remote() + + @serve.deployment( + autoscaling_config={ + "target_ongoing_requests": 1, + "metrics_interval_s": 0.1, + "min_replicas": 2, + "max_replicas": 4, + "upscale_delay_s": 1, + "downscale_delay_s": 1, + "look_back_period_s": 10, + }, + 
max_ongoing_requests=10, + ) + class AutoscalingRankTracker: + async def __call__(self): + await signal_actor.wait.remote() + context = serve.get_replica_context() + return { + "rank": context.rank, + "world_size": context.world_size, + } + + handle = serve.run(AutoscalingRankTracker.bind()) + + # Wait for initial replicas + wait_for_condition( + lambda: check_rank_assignment_complete("AutoscalingRankTracker", 2), + ) + + initial_ranks = get_replica_ranks("AutoscalingRankTracker") + assert len(initial_ranks) == 2 + assert check_rank_contiguity(initial_ranks) + + # Send concurrent requests to trigger autoscaling + _ = [handle.remote() for _ in range(10)] + + # Wait for scale-up to happen and ranks to be reassigned + wait_for_condition( + lambda: check_num_replicas_eq("AutoscalingRankTracker", 4, use_controller=True), + timeout=20, + ) + + # Check that ranks are still contiguous after scale-up + wait_for_condition( + lambda: check_rank_assignment_complete("AutoscalingRankTracker", 4), + ) + + scaled_ranks = get_replica_ranks("AutoscalingRankTracker") + assert len(scaled_ranks) == 4 + assert check_rank_contiguity(scaled_ranks) + + signal_actor.send.remote() + + # Wait for scale-down (no more load) + wait_for_condition( + lambda: check_num_replicas_eq("AutoscalingRankTracker", 2, use_controller=True), + ) + + # Check that ranks are reassigned and contiguous after scale-down + wait_for_condition( + lambda: check_rank_assignment_complete("AutoscalingRankTracker", 2), + ) + + final_ranks = get_replica_ranks("AutoscalingRankTracker") + assert len(final_ranks) == 2 + assert check_rank_contiguity(final_ranks) + + +def test_rank_persistence_across_controller_restart(serve_instance): + """Test that ranks are preserved across controller failures.""" + + @serve.deployment(num_replicas=3) + class PersistentRankTracker: + def __call__(self): + context = serve.get_replica_context() + return { + "rank": context.rank, + "world_size": context.world_size, + } + + 
serve.run(PersistentRankTracker.bind()) + + # Wait for all replicas to be running + wait_for_condition( + lambda: check_rank_assignment_complete("PersistentRankTracker", 3), + ) + + # Record initial ranks + initial_ranks = get_replica_ranks("PersistentRankTracker") + + assert len(initial_ranks) == 3 + assert check_rank_contiguity(initial_ranks) + + # Kill the controller to simulate failure + controller = get_controller() + ray.kill(controller, no_restart=False) + + # Wait for controller to be restarted and deployment to be recovered + wait_for_condition( + lambda: check_deployment_status( + "PersistentRankTracker", DeploymentStatus.HEALTHY + ), + ) + + # Wait for rank assignment to be restored + wait_for_condition( + lambda: check_rank_assignment_complete("PersistentRankTracker", 3), + ) + + # Check that ranks are preserved for surviving replicas + recovered_ranks = get_replica_ranks("PersistentRankTracker") + + assert len(recovered_ranks) == 3 + assert check_rank_contiguity(recovered_ranks) + + # Check that the recovered ranks are the same as the initial ranks + assert recovered_ranks == initial_ranks + + +def test_single_replica_deployment(serve_instance): + """Test rank assignment for single replica deployment.""" + + @serve.deployment(num_replicas=1) + class SingleReplicaTracker: + def __call__(self): + context = serve.get_replica_context() + return { + "rank": context.rank, + "world_size": context.world_size, + } + + handle = serve.run(SingleReplicaTracker.bind()) + + # Wait for deployment + wait_for_condition( + lambda: check_rank_assignment_complete("SingleReplicaTracker", 1), + ) + + # Verify single replica has rank 0 + ranks = get_replica_ranks("SingleReplicaTracker") + assert len(ranks) == 1 + assert 0 in ranks.values() + + # Verify API returns correct values + response = handle.remote().result() + assert response["rank"] == 0 + assert response["world_size"] == 1 + + +def test_multiple_deployments_independent_ranks(serve_instance): + """Test that 
different deployments have independent rank spaces.""" + + @serve.deployment(name="deployment1", num_replicas=2) + class RankTracker1: + def __call__(self): + context = serve.get_replica_context() + return { + "deployment": "deployment1", + "rank": context.rank, + "world_size": context.world_size, + } + + @serve.deployment(name="deployment2", num_replicas=3) + class RankTracker2: + def __init__(self, rank_tracker1): + self.rank_tracker1 = rank_tracker1 + + def __call__(self): + context = serve.get_replica_context() + return { + "deployment": "deployment2", + "rank": context.rank, + "world_size": context.world_size, + } + + serve.run(RankTracker2.bind(RankTracker1.bind())) + # Wait for both deployments + wait_for_condition( + lambda: check_rank_assignment_complete("deployment1", 2), + ) + wait_for_condition( + lambda: check_rank_assignment_complete("deployment2", 3), + ) + + # Check ranks are independent + ranks1 = get_replica_ranks("deployment1") + ranks2 = get_replica_ranks("deployment2") + + assert len(ranks1) == 2 + assert len(ranks2) == 3 + assert check_rank_contiguity(ranks1) + assert check_rank_contiguity(ranks2) + + # Both should have rank 0 (in their own space) + assert 0 in ranks1.values() + assert 0 in ranks2.values() + assert 1 in ranks1.values() + assert 1 in ranks2.values() + assert 2 in ranks2.values() # Only deployment2 should have rank 2 + + handle1 = serve.get_deployment_handle("deployment1", SERVE_DEFAULT_APP_NAME) + handle2 = serve.get_deployment_handle("deployment2", SERVE_DEFAULT_APP_NAME) + + response1 = handle1.remote().result() + response2 = handle2.remote().result() + assert response1["world_size"] == 2 + assert response2["world_size"] == 3 + + +def test_rank_stability_on_replica_death(serve_instance): + """Test that when one replica dies, other replicas keep their ranks.""" + + @serve.deployment(num_replicas=4) + class StableRankTracker: + def __call__(self): + return "hello" + + serve.run(StableRankTracker.bind()) + + # Wait for all 
replicas to be running and have ranks + wait_for_condition( + lambda: check_rank_assignment_complete("StableRankTracker", 4), + ) + + # get_replica_ranks + initial_ranks = get_replica_ranks("StableRankTracker") + initial_replica_ids = get_running_replica_ids("StableRankTracker") + assert len(initial_ranks) == 4 + assert check_rank_contiguity(initial_ranks) + + # kill the replica with rank 1 + random_replica_id_idx = random.choice(range(len(initial_replica_ids))) + killed_replica_id = initial_replica_ids[random_replica_id_idx] + replica_handle = ray.get_actor( + f"SERVE_REPLICA::default#StableRankTracker#{killed_replica_id}", + namespace=SERVE_NAMESPACE, + ) + ray.kill(replica_handle, no_restart=False) + + def _check(): + new_running_replica_ids = get_running_replica_ids("StableRankTracker") + assert len(new_running_replica_ids) == 4 + assert new_running_replica_ids != initial_replica_ids + return True + + wait_for_condition(_check, timeout=20) + + # get_replica_ranks + final_ranks = get_replica_ranks("StableRankTracker") + assert len(final_ranks) == 4 + assert check_rank_contiguity(final_ranks) + # for all replicas that is not killed, their ranks should be the same as before + for replica_id in initial_replica_ids: + if replica_id != killed_replica_id: + assert final_ranks[replica_id] == initial_ranks[replica_id] + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/test_serve_ha.py b/python/ray/serve/tests/test_serve_ha.py index a15cff0a3a47..6af3593b50a4 100644 --- a/python/ray/serve/tests/test_serve_ha.py +++ b/python/ray/serve/tests/test_serve_ha.py @@ -109,7 +109,7 @@ def check_for_head_node_come_back_up(): import ray import requests from ray.serve.schema import ServeInstanceDetails -from ray._private.resource_and_label_spec import HEAD_NODE_RESOURCE_NAME +from ray._common.constants import HEAD_NODE_RESOURCE_NAME ray.init(address="auto") head_node_id = ray.get_runtime_context().get_node_id() 
serve_details = ServeInstanceDetails( diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index ba4b506ded42..2ddb36190acc 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -125,6 +125,53 @@ def check_dead(): wait_for_condition(check_dead) +@pytest.mark.asyncio +async def test_shutdown_async(ray_shutdown): + ray.init(num_cpus=8) + serve.start(http_options=dict(port=8003)) + gcs_client = GcsClient(address=ray.get_runtime_context().gcs_address) + cluster_node_info_cache = create_cluster_node_info_cache(gcs_client) + cluster_node_info_cache.update() + + @serve.deployment + def f(): + pass + + serve.run(f.bind()) + + actor_names = [ + SERVE_CONTROLLER_NAME, + format_actor_name( + SERVE_PROXY_NAME, + cluster_node_info_cache.get_alive_nodes()[0][0], + ), + ] + + def check_alive(): + alive = True + for actor_name in actor_names: + try: + ray.get_actor(actor_name, namespace=SERVE_NAMESPACE) + except ValueError: + alive = False + return alive + + wait_for_condition(check_alive) + + await serve.shutdown_async() + + def check_dead(): + for actor_name in actor_names: + try: + ray.get_actor(actor_name, namespace=SERVE_NAMESPACE) + return False + except ValueError: + pass + return True + + wait_for_condition(check_dead) + + def test_single_app_shutdown_actors(ray_shutdown): """Tests serve.shutdown() works correctly in single-app case @@ -165,6 +212,47 @@ def check_dead(): wait_for_condition(check_dead) +@pytest.mark.asyncio +async def test_single_app_shutdown_actors_async(ray_shutdown): + """Tests serve.shutdown_async() works correctly in single-app case + + Ensures that after deploying a (nameless) app using serve.run(), serve.shutdown_async() + deletes all actors (controller, http proxy, all replicas) in the "serve" namespace. 
+ """ + address = ray.init(num_cpus=8)["address"] + serve.start(http_options=dict(port=8003)) + + @serve.deployment + def f(): + pass + + serve.run(f.bind(), name="app") + + actor_names = { + "ServeController", + "ProxyActor", + "ServeReplica:app:f", + } + + def check_alive(): + actors = list_actors( + address=address, + filters=[("ray_namespace", "=", SERVE_NAMESPACE), ("state", "=", "ALIVE")], + ) + return {actor["class_name"] for actor in actors} == actor_names + + def check_dead(): + actors = list_actors( + address=address, + filters=[("ray_namespace", "=", SERVE_NAMESPACE), ("state", "=", "ALIVE")], + ) + return len(actors) == 0 + + wait_for_condition(check_alive) + await serve.shutdown_async() + wait_for_condition(check_dead) + + def test_multi_app_shutdown_actors(ray_shutdown): """Tests serve.shutdown() works correctly in multi-app case. @@ -207,6 +295,49 @@ def check_dead(): wait_for_condition(check_dead) +@pytest.mark.asyncio +async def test_multi_app_shutdown_actors_async(ray_shutdown): + """Tests serve.shutdown_async() works correctly in multi-app case. + + Ensures that after deploying multiple distinct applications, serve.shutdown_async() + deletes all actors (controller, http proxy, all replicas) in the "serve" namespace. 
+ """ + address = ray.init(num_cpus=8)["address"] + serve.start(http_options=dict(port=8003)) + + @serve.deployment + def f(): + pass + + serve.run(f.bind(), name="app1", route_prefix="/app1") + serve.run(f.bind(), name="app2", route_prefix="/app2") + + actor_names = { + "ServeController", + "ProxyActor", + "ServeReplica:app1:f", + "ServeReplica:app2:f", + } + + def check_alive(): + actors = list_actors( + address=address, + filters=[("ray_namespace", "=", SERVE_NAMESPACE), ("state", "=", "ALIVE")], + ) + return {actor["class_name"] for actor in actors} == actor_names + + def check_dead(): + actors = list_actors( + address=address, + filters=[("ray_namespace", "=", SERVE_NAMESPACE), ("state", "=", "ALIVE")], + ) + return len(actors) == 0 + + wait_for_condition(check_alive) + await serve.shutdown_async() + wait_for_condition(check_dead) + + def test_deployment(ray_cluster): # https://github.com/ray-project/ray/issues/11437 @@ -518,6 +649,30 @@ def __call__(self, *args): assert len(serve.status().applications) == 1 +@pytest.mark.asyncio +async def test_serve_shutdown_async(ray_shutdown): + ray.init(namespace="serve") + serve.start() + + @serve.deployment + class A: + def __call__(self, *args): + return "hi" + + serve.run(A.bind()) + + assert len(serve.status().applications) == 1 + + await serve.shutdown_async() + serve.start() + + assert len(serve.status().applications) == 0 + + serve.run(A.bind()) + + assert len(serve.status().applications) == 1 + + def test_instance_in_non_anonymous_namespace(ray_shutdown): # Can start instance in non-anonymous namespace. ray.init(namespace="foo") diff --git a/python/ray/serve/tests/test_standalone_3.py b/python/ray/serve/tests/test_standalone_3.py index cbf5686dc5bc..da3372f8c810 100644 --- a/python/ray/serve/tests/test_standalone_3.py +++ b/python/ray/serve/tests/test_standalone_3.py @@ -22,6 +22,13 @@ from ray.util.state import list_actors +# Some tests are not possible to run if proxy is not available on every node. 
+# We skip them if proxy is not available. +def is_proxy_on_every_node() -> bool: + client = _get_global_client() + return client._http_config.location == "EveryNode" + + @pytest.fixture def shutdown_ray(): if ray.is_initialized(): @@ -286,8 +293,10 @@ def __call__(self, *args): serve.run(A.bind(), name="app_f") - # 2 proxies, 1 controller, 2 replicas. - wait_for_condition(lambda: len(list_actors()) == 5) + # If proxy is on every node, total actors are 2 proxies, 1 controller, 2 replicas. + # Otherwise, total actors are 1 proxy, 1 controller, 2 replicas. + expected_actors = 5 if is_proxy_on_every_node() else 4 + wait_for_condition(lambda: len(list_actors()) == expected_actors) assert len(ray.nodes()) == 2 # Stop all deployment replicas. @@ -324,82 +333,6 @@ def serve_details_proxy_count(): ray.shutdown() -def test_drain_and_undrain_http_proxy_actors( - monkeypatch, shutdown_ray, call_ray_stop_only # noqa: F811 -): - """Test the state transtion of the proxy actor between - HEALTHY, DRAINING and DRAINED - """ - monkeypatch.setenv("RAY_SERVE_PROXY_MIN_DRAINING_PERIOD_S", "10") - - cluster = Cluster() - head_node = cluster.add_node(num_cpus=0) - cluster.add_node(num_cpus=1) - cluster.add_node(num_cpus=1) - cluster.wait_for_nodes() - ray.init(address=head_node.address) - serve.start(http_options={"location": "EveryNode"}) - - @serve.deployment - class HelloModel: - def __call__(self): - return "hello" - - serve.run(HelloModel.options(num_replicas=2).bind()) - - # 3 proxies, 1 controller, 2 replicas. 
- wait_for_condition(lambda: len(list_actors()) == 6) - assert len(ray.nodes()) == 3 - - client = _get_global_client() - serve_details = ServeInstanceDetails( - **ray.get(client._controller.get_serve_instance_details.remote()) - ) - proxy_actor_ids = {proxy.actor_id for _, proxy in serve_details.proxies.items()} - assert len(proxy_actor_ids) == 3 - - serve.run(HelloModel.options(num_replicas=1).bind()) - # 1 proxy should be draining - - def check_proxy_status(proxy_status_to_count): - serve_details = ServeInstanceDetails( - **ray.get(client._controller.get_serve_instance_details.remote()) - ) - proxy_status_list = [proxy.status for _, proxy in serve_details.proxies.items()] - print("all proxies!!!", [proxy for _, proxy in serve_details.proxies.items()]) - current_status = { - status: proxy_status_list.count(status) for status in proxy_status_list - } - return current_status == proxy_status_to_count, current_status - - wait_for_condition( - condition_predictor=check_proxy_status, - proxy_status_to_count={ProxyStatus.HEALTHY: 2, ProxyStatus.DRAINING: 1}, - ) - - serve.run(HelloModel.options(num_replicas=2).bind()) - # The draining proxy should become healthy. - wait_for_condition( - condition_predictor=check_proxy_status, - proxy_status_to_count={ProxyStatus.HEALTHY: 3}, - ) - serve_details = ServeInstanceDetails( - **ray.get(client._controller.get_serve_instance_details.remote()) - ) - {proxy.actor_id for _, proxy in serve_details.proxies.items()} == proxy_actor_ids - - serve.run(HelloModel.options(num_replicas=1).bind()) - # 1 proxy should be draining and eventually be drained. - wait_for_condition( - condition_predictor=check_proxy_status, - timeout=40, - proxy_status_to_count={ProxyStatus.HEALTHY: 2}, - ) - - # Clean up serve. 
- serve.shutdown() - - @pytest.mark.parametrize("wait_for_controller_shutdown", (True, False)) def test_controller_shutdown_gracefully( shutdown_ray, call_ray_stop_only, wait_for_controller_shutdown # noqa: F811 @@ -426,8 +359,10 @@ def __call__(self): model = HelloModel.bind() serve.run(target=model) - # Ensure total actors of 2 proxies, 1 controller, and 2 replicas - wait_for_condition(lambda: len(list_actors()) == 5) + # If proxy is on every node, total actors are 2 proxies, 1 controller, and 2 replicas + # Otherwise, total actors are 1 proxy, 1 controller, and 2 replicas + expected_actors = 5 if is_proxy_on_every_node() else 4 + wait_for_condition(lambda: len(list_actors()) == expected_actors) assert len(ray.nodes()) == 2 # Call `graceful_shutdown()` on the controller, so it will start shutdown. @@ -485,8 +420,11 @@ def __call__(self): model = HelloModel.bind() serve.run(target=model) - # Ensure total actors of 2 proxies, 1 controller, and 2 replicas - wait_for_condition(lambda: len(list_actors()) == 5) + # Check expected actors based on mode + # If proxy is on every node, total actors are 2 proxies, 1 controller, and 2 replicas + # Otherwise, total actors are 1 proxy, 1 controller, and 2 replicas + expected_actors = 5 if is_proxy_on_every_node() else 4 + wait_for_condition(lambda: len(list_actors()) == expected_actors) assert len(ray.nodes()) == 2 # Ensure client times out if the controller does not shutdown within timeout. 
diff --git a/python/ray/serve/tests/test_task_processor.py b/python/ray/serve/tests/test_task_processor.py new file mode 100644 index 000000000000..b8892992712e --- /dev/null +++ b/python/ray/serve/tests/test_task_processor.py @@ -0,0 +1,624 @@ +import json +import os +import sys +import tempfile +from collections import defaultdict +from pathlib import Path + +import pytest + +import ray +from ray import serve +from ray._common.test_utils import SignalActor, wait_for_condition +from ray.serve.schema import CeleryAdapterConfig, TaskProcessorConfig +from ray.serve.task_consumer import ( + instantiate_adapter_from_config, + task_consumer, + task_handler, +) +from ray.tests.conftest import external_redis # noqa: F401 + + +@ray.remote +class ProcessedTasksTracker: + def __init__(self): + self.processed_tasks = set() + + def add_task(self, task_data): + self.processed_tasks.add(task_data) + + def get_processed_tasks(self): + return self.processed_tasks + + def get_count(self): + return len(self.processed_tasks) + + +@ray.remote +def send_request_to_queue( + processor_config: TaskProcessorConfig, data, task_name="process_request" +): + adapter_instance_global = instantiate_adapter_from_config( + task_processor_config=processor_config + ) + result = adapter_instance_global.enqueue_task_sync(task_name, args=[data]) + assert result.id is not None + return result.id + + +@pytest.fixture(scope="function") +def temp_queue_directory(): + """Creates a temporary directory with 'queue', 'results', and 'control' subdirectories for task consumer tests.""" + + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir_path = Path(tmpdir) + + data_folder_queue = tmpdir_path / "queue" + data_folder_queue.mkdir() + + results_path = tmpdir_path / "results" + results_path.mkdir() + + control_path = tmpdir_path / "control" + control_path.mkdir() + + yield { + "queue_path": data_folder_queue, + "results_path": results_path, + "control_path": control_path, + } + + 
+@pytest.fixture(scope="function") +def transport_options(temp_queue_directory): + """Create standard transport options for filesystem broker.""" + + queue_path = temp_queue_directory["queue_path"] + control_path = temp_queue_directory["control_path"] + + return { + # Incoming message queue - where new task messages are written when sent to broker + "data_folder_in": str(queue_path), + # Outgoing message storage - where task results and responses are written after completion + "data_folder_out": str(queue_path), + # Processed message archive - where messages are moved after successful processing + "data_folder_processed": str(queue_path), + # Control message storage - where Celery management and control commands are stored + "control_folder": str(control_path), + } + + +@pytest.fixture(scope="function") +def create_processor_config(temp_queue_directory, transport_options): + """Create a TaskProcessorConfig with common defaults.""" + + def _create( + failed_task_queue_name=None, unprocessable_task_queue_name=None, **kwargs + ): + results_path = temp_queue_directory["results_path"] + + config_params = { + "queue_name": "my_default_app_queue", + "adapter_config": CeleryAdapterConfig( + broker_url="filesystem://", + backend_url=f"file://{results_path}", + broker_transport_options=transport_options, + worker_concurrency=1, + ), + } + + # Add dead letter queue names if provided + if failed_task_queue_name is not None: + config_params["failed_task_queue_name"] = failed_task_queue_name + if unprocessable_task_queue_name is not None: + config_params[ + "unprocessable_task_queue_name" + ] = unprocessable_task_queue_name + + config_params.update(kwargs) + + return TaskProcessorConfig(**config_params) + + return _create + + +def _get_task_counts_by_routing_key(queue_path): + """Counts tasks in a queue directory by reading the routing key from each message.""" + counts = defaultdict(int) + if not queue_path.exists(): + return counts + + # Celery doesn't provide a way to get the 
queue size. + # so we have to leverage the broker's API to get the queue size. + # Since we are using the filesystem broker in tests, we can read the files in the queue directory to get the queue size. + for msg_file in queue_path.iterdir(): + if msg_file.is_file(): + try: + with open(msg_file, "r") as f: + data = json.load(f) + routing_key = ( + data.get("properties", {}) + .get("delivery_info", {}) + .get("routing_key") + ) + if routing_key: + counts[routing_key] += 1 + except (json.JSONDecodeError, IOError): + # Ignore files that aren't valid JSON or are otherwise unreadable + continue + return counts + + +@pytest.mark.skipif(sys.platform == "win32", reason="Flaky on Windows.") +class TestTaskConsumerWithRayServe: + """Test task consumer integration with Ray Serve.""" + + def test_task_consumer_as_serve_deployment( + self, temp_queue_directory, serve_instance, create_processor_config + ): + """Test that task consumers can be used as Ray Serve deployments.""" + processor_config = create_processor_config() + + @serve.deployment + @task_consumer(task_processor_config=processor_config) + class ServeTaskConsumer: + def __init__(self): + self.data_received = None + self.task_received = False + + @task_handler(name="process_request") + def process_request(self, data): + self.task_received = True + self.data_received = data + + def assert_task_received(self): + assert self.task_received is True + assert self.data_received is not None + assert self.data_received == "test_data_1" + + # Deploy the consumer as a Serve deployment + handle = serve.run(ServeTaskConsumer.bind()) + send_request_to_queue.remote(processor_config, "test_data_1") + + def assert_result(): + try: + # `assert_task_received` will throw AssertionError if the task was not received or data is not as expected + handle.assert_task_received.remote().result() + return True + except Exception: + return False + + wait_for_condition(assert_result) + + def test_task_consumer_as_serve_deployment_with_failed_task( + 
self, temp_queue_directory, serve_instance, create_processor_config + ): + """Test that task consumers can be used as Ray Serve deployments.""" + processor_config = create_processor_config( + failed_task_queue_name="my_failed_task_queue" + ) + + @serve.deployment + @task_consumer(task_processor_config=processor_config) + class ServeTaskConsumer: + def __init__(self): + self.num_calls = 0 + + @task_handler(name="process_request") + def process_request(self, data): + self.num_calls += 1 + raise ValueError("Task failed as expected") + + def get_num_calls(self): + return self.num_calls + + handle = serve.run(ServeTaskConsumer.bind()) + task_id_ref = send_request_to_queue.remote(processor_config, "test_data_1") + task_id = ray.get(task_id_ref) + + adapter_instance = instantiate_adapter_from_config( + task_processor_config=processor_config + ) + + def assert_result(): + result = adapter_instance.get_task_status_sync(task_id) + + if ( + result.status == "FAILURE" + and result.result is not None + and isinstance(result.result, ValueError) + and str(result.result) == "Task failed as expected" + and handle.get_num_calls.remote().result() + == 1 + processor_config.max_retries + ): + return True + else: + return False + + wait_for_condition(assert_result, timeout=20) + + def test_task_consumer_persistence_across_restarts( + self, temp_queue_directory, serve_instance, create_processor_config + ): + """Test that tasks persist in queue and get executed after deployment restart.""" + # Setup + config = create_processor_config() + tracker = ProcessedTasksTracker.remote() + signal1 = SignalActor.remote() + + @serve.deployment(num_replicas=1, graceful_shutdown_timeout_s=60) + @task_consumer(task_processor_config=config) + class TaskConsumer: + def __init__(self, tracker_ref, signal_ref): + self.tracker, self.signal = tracker_ref, signal_ref + self.local_processed = [] + + @task_handler(name="process_request") + def process_request(self, data): + ray.get(self.signal.wait.remote()) # 
Block until signal + self.local_processed.append(data) + ray.get(self.tracker.add_task.remote(data)) + return f"Processed: {data}" + + def get_local_processed(self): + return self.local_processed + + # Deploy first version and send tasks + serve.run(TaskConsumer.bind(tracker, signal1), name="app_v1") + + num_tasks = 20 + for i in range(num_tasks): + ray.get(send_request_to_queue.remote(config, f"task_{i}")) + + # Process exactly 1 task, then restart deployment + wait_for_condition( + lambda: ray.get(signal1.cur_num_waiters.remote()) == 1, timeout=10 + ) + ray.get(signal1.send.remote(clear=True)) # Allow 1 task to complete + wait_for_condition(lambda: ray.get(tracker.get_count.remote()) == 1, timeout=10) + + # Shutdown first deployment + serve.delete("app_v1", _blocking=False) + ray.get(signal1.send.remote()) # Release any stuck tasks + wait_for_condition( + lambda: "app_v1" not in serve.status().applications, timeout=100 + ) + + tasks_before_restart = ray.get(tracker.get_count.remote()) + assert ( + tasks_before_restart >= 2 and tasks_before_restart < num_tasks + ), f"Expected at least 2 tasks processed and atleast one less than num_tasks, got {tasks_before_restart}" + + # Deploy second version and process remaining tasks + signal2 = SignalActor.remote() + handle = serve.run(TaskConsumer.bind(tracker, signal2), name="app_v2") + + wait_for_condition( + lambda: ray.get(signal2.cur_num_waiters.remote()) == 1, timeout=10 + ) + ray.get(signal2.send.remote()) # Process all remaining tasks + wait_for_condition( + lambda: ray.get(tracker.get_count.remote()) == num_tasks, timeout=100 + ) + + # Verify all tasks were processed and distributed correctly + expected_tasks = {f"task_{i}" for i in range(num_tasks)} + final_tasks = ray.get(tracker.get_processed_tasks.remote()) + second_deployment_tasks = handle.get_local_processed.remote().result() + + assert ( + final_tasks == expected_tasks + ), f"Missing tasks: {expected_tasks - final_tasks}" + assert ( + 
len(second_deployment_tasks) == num_tasks - tasks_before_restart + ), f"Second deployment processed {len(second_deployment_tasks)} tasks, expected {num_tasks - tasks_before_restart}" + + def test_task_consumer_as_serve_deployment_with_async_task_handler( + self, temp_queue_directory, serve_instance, create_processor_config + ): + """Test that task consumers properly raise NotImplementedError for async task handlers.""" + processor_config = create_processor_config() + + # Test that async task handlers raise NotImplementedError during decoration + with pytest.raises( + NotImplementedError, + match="Async task handlers are not supported yet", + ): + + @serve.deployment + @task_consumer(task_processor_config=processor_config) + class ServeTaskConsumer: + def __init__(self): + self.data_received = None + self.task_received = False + + # This async task handler should raise NotImplementedError during decoration + @task_handler(name="process_request") + async def process_request(self, data): + self.task_received = True + self.data_received = data + + def test_task_consumer_metrics( + self, temp_queue_directory, serve_instance, create_processor_config + ): + """Test that task processor metrics are collected and exposed correctly.""" + processor_config = create_processor_config() + + @serve.deployment + @task_consumer(task_processor_config=processor_config) + class ServeTaskConsumer: + def __init__(self): + self.task_received = False + + @task_handler(name="process_request") + def process_request(self, data): + self.task_received = True + + def get_task_received(self) -> bool: + return self.task_received + + handle = serve.run(ServeTaskConsumer.bind()) + send_request_to_queue.remote(processor_config, "test_data_1") + + def assert_task_received(): + return handle.get_task_received.remote().result() + + wait_for_condition(assert_task_received, timeout=20) + + adapter_instance = instantiate_adapter_from_config( + task_processor_config=processor_config + ) + metrics = 
adapter_instance.get_metrics_sync() + + assert len(metrics) == 1 + worker_name = next(iter(metrics)) + worker_stats = metrics[worker_name] + + # Check that the total number of processed tasks is correct. + assert worker_stats["pool"]["threads"] == 1 + assert worker_stats["pool"]["max-concurrency"] == 1 + assert worker_stats["total"]["process_request"] == 1 + assert worker_stats["broker"]["transport"] == "filesystem" + + def test_task_consumer_health_check( + self, temp_queue_directory, serve_instance, create_processor_config + ): + """Test that the health check for the task processor works correctly.""" + processor_config = create_processor_config() + + @serve.deployment + @task_consumer(task_processor_config=processor_config) + class ServeTaskConsumer: + pass + + serve.run(ServeTaskConsumer.bind()) + + adapter_instance = instantiate_adapter_from_config( + task_processor_config=processor_config + ) + + def check_health(): + health_status = adapter_instance.health_check_sync() + return len(health_status) > 0 + + # Wait for the worker to be ready + wait_for_condition(check_health, timeout=20) + + health_status = adapter_instance.health_check_sync() + assert len(health_status) == 1 + + worker_reply = health_status[0] + assert len(worker_reply) == 1 + worker_name = next(iter(worker_reply)) + assert worker_reply[worker_name] == {"ok": "pong"} + + def test_task_processor_with_cancel_tasks( + self, external_redis, serve_instance # noqa: F811 + ): + """Test the cancel task functionality with celery broker.""" + redis_address = os.environ.get("RAY_REDIS_ADDRESS") + + processor_config = TaskProcessorConfig( + queue_name="my_app_queue", + adapter_config=CeleryAdapterConfig( + broker_url=f"redis://{redis_address}/0", + backend_url=f"redis://{redis_address}/1", + worker_concurrency=1, + ), + ) + + signal = SignalActor.remote() + + @serve.deployment + @task_consumer(task_processor_config=processor_config) + class MyTaskConsumer: + def __init__(self, signal_actor): + self._signal 
= signal_actor + self.message_received = [] + + @task_handler(name="process") + def process(self, data): + ray.get(self._signal.wait.remote()) + self.message_received.append(data) + + def get_message_received(self): + return self.message_received + + handle = serve.run(MyTaskConsumer.bind(signal), name="app_v1") + + task_ids = [] + for i in range(2): + task_id_ref = send_request_to_queue.remote( + processor_config, f"test_data_{i}", task_name="process" + ) + task_ids.append(ray.get(task_id_ref)) + + wait_for_condition( + lambda: ray.get(signal.cur_num_waiters.remote()) == 1, timeout=10 + ) + + adapter_instance = instantiate_adapter_from_config( + task_processor_config=processor_config + ) + adapter_instance.cancel_task_sync(task_ids[1]) + + ray.get(signal.send.remote()) + + def check_revoked(): + status = adapter_instance.get_task_status_sync(task_ids[1]) + return status.status == "REVOKED" + + wait_for_condition(check_revoked, timeout=20) + + assert "test_data_0" in handle.get_message_received.remote().result() + assert "test_data_1" not in handle.get_message_received.remote().result() + + serve.delete("app_v1") + + +@pytest.mark.skipif(sys.platform == "win32", reason="Flaky on Windows.") +class TestTaskConsumerWithDLQsConfiguration: + """Test task consumer with dead letter queues.""" + + def _assert_queue_counts( + self, + temp_queue_directory, + processor_config, + expected_main=0, + expected_unprocessable=0, + expected_failed=0, + timeout=15, + ): + """Helper to assert expected task counts in different queues.""" + + def check_counts(): + queue_path = Path(temp_queue_directory["queue_path"]) + counts = _get_task_counts_by_routing_key(queue_path) + + main_count = counts.get(processor_config.queue_name, 0) + unprocessable_count = counts.get( + getattr(processor_config, "unprocessable_task_queue_name", ""), 0 + ) + failed_count = counts.get( + getattr(processor_config, "failed_task_queue_name", ""), 0 + ) + + return ( + main_count == expected_main + and 
unprocessable_count == expected_unprocessable + and failed_count == expected_failed + ) + + wait_for_condition(check_counts, timeout=timeout) + + def test_task_consumer_as_serve_deployment_with_unknown_task( + self, temp_queue_directory, serve_instance, create_processor_config + ): + """Test that unknown tasks are sent to the unprocessable task queue.""" + processor_config = create_processor_config( + unprocessable_task_queue_name="unprocessable_task_queue" + ) + + @serve.deployment + @task_consumer(task_processor_config=processor_config) + class ServeTaskConsumer: + @task_handler(name="process_request") + def process_request(self, data): + pass + + serve.run(ServeTaskConsumer.bind()) + + # Send a task with an unknown name + send_request_to_queue.remote( + processor_config, "test_data_1", task_name="unregistered_task" + ) + + self._assert_queue_counts( + temp_queue_directory, + processor_config, + expected_main=0, + expected_unprocessable=1, + timeout=10, + ) + + def test_task_consumer_as_serve_deployment_with_failed_task_and_dead_letter_queue( + self, temp_queue_directory, serve_instance, create_processor_config + ): + """Test that failed tasks are sent to the failed task queue.""" + processor_config = create_processor_config( + failed_task_queue_name="failed_task_queue" + ) + + @serve.deployment + @task_consumer(task_processor_config=processor_config) + class ServeTaskConsumer: + @task_handler(name="process_request") + def process_request(self, data): + raise ValueError("Task failed as expected") + + serve.run(ServeTaskConsumer.bind()) + send_request_to_queue.remote(processor_config, "test_data_1") + + self._assert_queue_counts( + temp_queue_directory, processor_config, expected_main=0, expected_failed=1 + ) + + def test_task_consumer_with_mismatched_arguments( + self, temp_queue_directory, serve_instance, create_processor_config + ): + """Test that tasks with mismatched arguments are sent to the failed task queue.""" + processor_config = 
create_processor_config( + unprocessable_task_queue_name="unprocessable_task_queue", + failed_task_queue_name="failed_task_queue", + ) + + @serve.deployment + @task_consumer(task_processor_config=processor_config) + class ServeTaskConsumer: + @task_handler(name="process_request") + def process_request(self, arg1, arg2): # Expects two arguments + pass + + serve.run(ServeTaskConsumer.bind()) + + # Send a task with only one argument, which should cause a TypeError + send_request_to_queue.remote(processor_config, ["test_data_1"]) + + self._assert_queue_counts( + temp_queue_directory, + processor_config, + expected_main=0, + expected_failed=1, + ) + + def test_task_consumer_with_argument_type_mismatch( + self, temp_queue_directory, serve_instance, create_processor_config + ): + """Test that tasks with argument type mismatches are sent to the failed task queue.""" + processor_config = create_processor_config( + unprocessable_task_queue_name="unprocessable_task_queue", + failed_task_queue_name="failed_task_queue", + ) + + @serve.deployment + @task_consumer(task_processor_config=processor_config) + class ServeTaskConsumer: + @task_handler(name="process_request") + def process_request(self, data: str): + return len(data) # This will fail if data is not a sequence + + serve.run(ServeTaskConsumer.bind()) + + # Send an integer, for which len() is undefined, causing a TypeError + send_request_to_queue.remote(processor_config, 12345) + + self._assert_queue_counts( + temp_queue_directory, + processor_config, + expected_main=0, + expected_failed=1, + ) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/unit/BUILD b/python/ray/serve/tests/unit/BUILD.bazel similarity index 83% rename from python/ray/serve/tests/unit/BUILD rename to python/ray/serve/tests/unit/BUILD.bazel index 032ce7de5c5b..7e3510045199 100644 --- a/python/ray/serve/tests/unit/BUILD +++ b/python/ray/serve/tests/unit/BUILD.bazel @@ -21,7 +21,10 
@@ py_test_run_all_subdirectory( py_test_module_list( size = "medium", - env = {"RAY_SERVE_USE_COMPACT_SCHEDULING_STRATEGY": "1"}, + env = { + "RAY_SERVE_USE_COMPACT_SCHEDULING_STRATEGY": "1", + "RAY_SERVE_FAIL_ON_RANK_ERROR": "1", + }, files = [ "test_deployment_scheduler.py", "test_deployment_state.py", @@ -40,7 +43,10 @@ py_test_module_list( py_test_module_list( size = "medium", - env = {"RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE": "0"}, + env = { + "RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE": "0", + "RAY_SERVE_FAIL_ON_RANK_ERROR": "1", + }, files = [ "test_autoscaling_policy.py", "test_deployment_state.py", diff --git a/python/ray/serve/tests/unit/test_application_state.py b/python/ray/serve/tests/unit/test_application_state.py index 1147e3c04f23..4fb9a47b373c 100644 --- a/python/ray/serve/tests/unit/test_application_state.py +++ b/python/ray/serve/tests/unit/test_application_state.py @@ -551,7 +551,7 @@ def test_deploy_and_delete_app(mocked_application_state): app_state.update() deployment_state_manager.set_deployment_deleted(d1_id) - ready_to_be_deleted = app_state.update() + ready_to_be_deleted, _ = app_state.update() assert not ready_to_be_deleted assert app_state.status == ApplicationStatus.DELETING diff --git a/python/ray/serve/tests/unit/test_autoscaling_policy.py b/python/ray/serve/tests/unit/test_autoscaling_policy.py index ac3960103f9d..93678aa00f29 100644 --- a/python/ray/serve/tests/unit/test_autoscaling_policy.py +++ b/python/ray/serve/tests/unit/test_autoscaling_policy.py @@ -2,6 +2,7 @@ import pytest +from ray.serve._private.autoscaling_state import AutoscalingContext from ray.serve._private.constants import CONTROL_LOOP_INTERVAL_S from ray.serve.autoscaling_policy import ( _calculate_desired_num_replicas, @@ -218,15 +219,27 @@ def test_scaling_factor_scale_up_from_0_replicas( upscale_smoothing_factor=10 if use_upscale_smoothing_factor else None, upscaling_factor=10 if use_upscaling_factor else None, ) - new_num_replicas = 
replica_queue_length_autoscaling_policy( - curr_target_num_replicas=0, + ctx = AutoscalingContext( + target_num_replicas=0, total_num_requests=1, - num_running_replicas=0, + current_num_replicas=0, config=config, capacity_adjusted_min_replicas=min_replicas, capacity_adjusted_max_replicas=max_replicas, policy_state={}, - ) + deployment_id=None, + deployment_name=None, + app_name=None, + running_replicas=None, + current_time=None, + queued_requests=None, + requests_per_replica=None, + aggregated_metrics=None, + raw_metrics=None, + last_scale_up_time=None, + last_scale_down_time=None, + ) + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) # 1 * 10 assert new_num_replicas == 10 @@ -236,15 +249,7 @@ def test_scaling_factor_scale_up_from_0_replicas( if use_upscaling_factor: config.upscaling_factor = 0.5 - new_num_replicas = replica_queue_length_autoscaling_policy( - curr_target_num_replicas=0, - total_num_requests=1, - num_running_replicas=0, - config=config, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state={}, - ) + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) # math.ceil(1 * 0.5) assert new_num_replicas == 1 @@ -271,15 +276,27 @@ def test_scaling_factor_scale_down_to_0_replicas( upscale_delay_s=0, downscale_delay_s=0, ) - new_num_replicas = replica_queue_length_autoscaling_policy( + ctx = AutoscalingContext( config=config, total_num_requests=0, - num_running_replicas=5, - curr_target_num_replicas=5, + current_num_replicas=5, + target_num_replicas=5, capacity_adjusted_min_replicas=min_replicas, capacity_adjusted_max_replicas=max_replicas, policy_state=policy_state, - ) + deployment_id=None, + deployment_name=None, + app_name=None, + running_replicas=None, + current_time=None, + queued_requests=None, + requests_per_replica=None, + aggregated_metrics=None, + raw_metrics=None, + last_scale_up_time=None, + last_scale_down_time=None, + ) + new_num_replicas, _ = 
replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 0 @@ -292,17 +309,12 @@ def test_scaling_factor_scale_down_to_0_replicas( config.downscaling_factor = 0.2 # policy_manager = AutoscalingPolicyManager(config) + ctx.total_num_requests = 0 num_replicas = 5 for _ in range(5): - num_replicas = replica_queue_length_autoscaling_policy( - config=config, - total_num_requests=0, - num_running_replicas=num_replicas, - curr_target_num_replicas=num_replicas, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + ctx.current_num_replicas = num_replicas + ctx.target_num_replicas = num_replicas + num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert num_replicas == 0 @@ -328,164 +340,113 @@ def test_upscale_downscale_delay(self): overload_requests = 100 - # Scale up when there are 0 replicas and current_handle_queued_queries > 0 - new_num_replicas = replica_queue_length_autoscaling_policy( + ctx = AutoscalingContext( config=config, total_num_requests=1, - num_running_replicas=0, - curr_target_num_replicas=0, + current_num_replicas=0, + target_num_replicas=0, capacity_adjusted_min_replicas=min_replicas, capacity_adjusted_max_replicas=max_replicas, policy_state=policy_state, + deployment_id=None, + deployment_name=None, + app_name=None, + running_replicas=None, + current_time=None, + queued_requests=None, + requests_per_replica=None, + aggregated_metrics=None, + raw_metrics=None, + last_scale_up_time=None, + last_scale_down_time=None, ) + + # Scale up when there are 0 replicas and current_handle_queued_queries > 0 + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 1 + ctx.total_num_requests = overload_requests + ctx.current_num_replicas = 1 + ctx.target_num_replicas = 1 + # We should scale up only after enough consecutive scale-up decisions. 
for i in range(upscale_wait_periods): - new_num_replicas = replica_queue_length_autoscaling_policy( - config=config, - total_num_requests=overload_requests, - num_running_replicas=1, - curr_target_num_replicas=1, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 1, i - new_num_replicas = replica_queue_length_autoscaling_policy( - config=config, - total_num_requests=overload_requests, - num_running_replicas=1, - curr_target_num_replicas=1, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 2 no_requests = 0 + ctx.total_num_requests = no_requests + ctx.current_num_replicas = 2 + ctx.target_num_replicas = 2 + # We should scale down only after enough consecutive scale-down decisions. for i in range(downscale_wait_periods): - new_num_replicas = replica_queue_length_autoscaling_policy( - config=config, - total_num_requests=no_requests, - num_running_replicas=2, - curr_target_num_replicas=2, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 2, i - new_num_replicas = replica_queue_length_autoscaling_policy( - config=config, - total_num_requests=no_requests, - num_running_replicas=2, - curr_target_num_replicas=2, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 0 # Get some scale-up decisions, but not enough to trigger a scale up. 
+ ctx.total_num_requests = overload_requests + ctx.current_num_replicas = 1 + ctx.target_num_replicas = 1 + for i in range(int(upscale_wait_periods / 2)): - new_num_replicas = replica_queue_length_autoscaling_policy( - config=config, - total_num_requests=overload_requests, - num_running_replicas=1, - curr_target_num_replicas=1, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 1, i + ctx.total_num_requests = 0 + ctx.current_num_replicas = 1 + ctx.target_num_replicas = 1 + # Interrupt with a scale-down decision. - replica_queue_length_autoscaling_policy( - config=config, - total_num_requests=0, - num_running_replicas=1, - curr_target_num_replicas=1, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + replica_queue_length_autoscaling_policy(ctx=ctx) # The counter should be reset, so it should require `upscale_wait_periods` # more periods before we actually scale up. 
+ + ctx.total_num_requests = overload_requests + ctx.current_num_replicas = 1 + ctx.target_num_replicas = 1 + for i in range(upscale_wait_periods): - new_num_replicas = replica_queue_length_autoscaling_policy( - config=config, - total_num_requests=overload_requests, - num_running_replicas=1, - curr_target_num_replicas=1, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 1, i - new_num_replicas = replica_queue_length_autoscaling_policy( - config=config, - total_num_requests=overload_requests, - num_running_replicas=1, - curr_target_num_replicas=1, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 2 + ctx.total_num_requests = no_requests + ctx.current_num_replicas = 2 + ctx.target_num_replicas = 2 + # Get some scale-down decisions, but not enough to trigger a scale down. for i in range(int(downscale_wait_periods / 2)): - new_num_replicas = replica_queue_length_autoscaling_policy( - config=config, - total_num_requests=no_requests, - num_running_replicas=2, - curr_target_num_replicas=2, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 2, i + ctx.total_num_requests = 200 + ctx.current_num_replicas = 2 + ctx.target_num_replicas = 2 + # Interrupt with a scale-up decision. 
- replica_queue_length_autoscaling_policy( - config=config, - total_num_requests=200, - num_running_replicas=2, - curr_target_num_replicas=2, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + replica_queue_length_autoscaling_policy(ctx=ctx) # The counter should be reset so it should require `downscale_wait_periods` # more periods before we actually scale down. + ctx.total_num_requests = no_requests + ctx.current_num_replicas = 2 + ctx.target_num_replicas = 2 for i in range(downscale_wait_periods): - new_num_replicas = replica_queue_length_autoscaling_policy( - config=config, - total_num_requests=no_requests, - num_running_replicas=2, - curr_target_num_replicas=2, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 2, i - new_num_replicas = replica_queue_length_autoscaling_policy( - config=config, - total_num_requests=no_requests, - num_running_replicas=2, - curr_target_num_replicas=2, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 0 def test_replicas_delayed_startup(self): @@ -502,54 +463,53 @@ def test_replicas_delayed_startup(self): } config = AutoscalingConfig(**config) - # new_num_replicas = policy_manager.get_decision_num_replicas(1, 100, 1) - new_num_replicas = replica_queue_length_autoscaling_policy( + ctx = AutoscalingContext( config=config, - curr_target_num_replicas=1, + target_num_replicas=1, total_num_requests=100, - num_running_replicas=1, + current_num_replicas=1, capacity_adjusted_min_replicas=min_replicas, capacity_adjusted_max_replicas=max_replicas, policy_state=policy_state, + deployment_id=None, + deployment_name=None, + 
app_name=None, + running_replicas=None, + current_time=None, + queued_requests=None, + requests_per_replica=None, + aggregated_metrics=None, + raw_metrics=None, + last_scale_up_time=None, + last_scale_down_time=None, ) + + # new_num_replicas = policy_manager.get_decision_num_replicas(1, 100, 1) + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 100 # New target is 100, but no new replicas finished spinning up during this # timestep. - new_num_replicas = replica_queue_length_autoscaling_policy( - config=config, - curr_target_num_replicas=100, - total_num_requests=100, - num_running_replicas=1, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + ctx.total_num_requests = 100 + ctx.current_num_replicas = 1 + ctx.target_num_replicas = 100 + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 100 # Two new replicas spun up during this timestep. - new_num_replicas = replica_queue_length_autoscaling_policy( - config=config, - curr_target_num_replicas=100, - total_num_requests=123, - num_running_replicas=3, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + ctx.total_num_requests = 123 + ctx.current_num_replicas = 3 + ctx.target_num_replicas = 100 + + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 123 # A lot of queries got drained and a lot of replicas started up, but # new_num_replicas should not decrease, because of the downscale delay. 
- new_num_replicas = replica_queue_length_autoscaling_policy( - config=config, - curr_target_num_replicas=123, - total_num_requests=10, - num_running_replicas=4, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + ctx.total_num_requests = 10 + ctx.current_num_replicas = 4 + ctx.target_num_replicas = 123 + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == 123 @pytest.mark.parametrize("delay_s", [30.0, 0.0]) @@ -578,32 +538,43 @@ def test_fluctuating_ongoing_requests(self, delay_s): underload_requests, overload_requests = 2 * 20, 100 trials = 1000 + ctx = AutoscalingContext( + config=config, + capacity_adjusted_min_replicas=min_replicas, + capacity_adjusted_max_replicas=max_replicas, + policy_state=policy_state, + target_num_replicas=None, + total_num_requests=None, + current_num_replicas=None, + deployment_id=None, + deployment_name=None, + app_name=None, + running_replicas=None, + current_time=None, + queued_requests=None, + requests_per_replica=None, + aggregated_metrics=None, + raw_metrics=None, + last_scale_up_time=None, + last_scale_down_time=None, + ) + new_num_replicas = None for trial in range(trials): if trial % 2 == 0: - new_num_replicas = replica_queue_length_autoscaling_policy( - config=config, - total_num_requests=overload_requests, - num_running_replicas=1, - curr_target_num_replicas=1, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + ctx.target_num_replicas = 1 + ctx.total_num_requests = overload_requests + ctx.current_num_replicas = 1 + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) if delay_s > 0: assert new_num_replicas == 1, trial else: assert new_num_replicas == 2, trial else: - new_num_replicas = replica_queue_length_autoscaling_policy( - config=config, - total_num_requests=underload_requests, - num_running_replicas=2, - 
curr_target_num_replicas=2, - capacity_adjusted_min_replicas=min_replicas, - capacity_adjusted_max_replicas=max_replicas, - policy_state=policy_state, - ) + ctx.target_num_replicas = 2 + ctx.total_num_requests = underload_requests + ctx.current_num_replicas = 2 + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) if delay_s > 0: assert new_num_replicas == 2, trial else: @@ -624,15 +595,28 @@ def test_single_replica_receives_all_requests(self, ongoing_requests): downscale_delay_s=0.0, ) - new_num_replicas = replica_queue_length_autoscaling_policy( + ctx = AutoscalingContext( config=config, total_num_requests=ongoing_requests, - num_running_replicas=4, - curr_target_num_replicas=4, + current_num_replicas=4, + target_num_replicas=4, capacity_adjusted_min_replicas=min_replicas, capacity_adjusted_max_replicas=max_replicas, policy_state=policy_state, - ) + deployment_id=None, + deployment_name=None, + app_name=None, + running_replicas=None, + current_time=None, + queued_requests=None, + requests_per_replica=None, + aggregated_metrics=None, + raw_metrics=None, + last_scale_up_time=None, + last_scale_down_time=None, + ) + + new_num_replicas, _ = replica_queue_length_autoscaling_policy(ctx=ctx) assert new_num_replicas == ongoing_requests / target_requests diff --git a/python/ray/serve/tests/unit/test_batching.py b/python/ray/serve/tests/unit/test_batching.py index fa83a018c85a..9c08ffdf6813 100644 --- a/python/ray/serve/tests/unit/test_batching.py +++ b/python/ray/serve/tests/unit/test_batching.py @@ -20,6 +20,8 @@ replica_id=ReplicaID(unique_id="test", deployment_id=DeploymentID(name="test")), servable_object=None, _deployment_config=default_deployment_config, + rank=0, + world_size=1, ) diff --git a/python/ray/serve/tests/unit/test_config.py b/python/ray/serve/tests/unit/test_config.py index ee462a42ba63..abf9eb51bc3d 100644 --- a/python/ray/serve/tests/unit/test_config.py +++ b/python/ray/serve/tests/unit/test_config.py @@ -7,7 +7,10 @@ from 
ray._common.pydantic_compat import ValidationError from ray._common.utils import import_attr from ray.serve._private.config import DeploymentConfig, ReplicaConfig, _proto_to_dict -from ray.serve._private.constants import DEFAULT_AUTOSCALING_POLICY, DEFAULT_GRPC_PORT +from ray.serve._private.constants import ( + DEFAULT_AUTOSCALING_POLICY_NAME, + DEFAULT_GRPC_PORT, +) from ray.serve._private.request_router import PowerOfTwoChoicesRequestRouter from ray.serve._private.utils import DEFAULT from ray.serve.autoscaling_policy import default_autoscaling_policy @@ -168,7 +171,9 @@ def test_setting_and_getting_request_router_class(self): "python.ray.serve.tests.unit.test_config.FakeRequestRouter" ) if sys.platform == "win32": - request_router_path = "com_github_ray_project_ray.python.ray.serve.tests.unit.test_config.FakeRequestRouter" + request_router_path = ( + "io_ray.python.ray.serve.tests.unit.test_config.FakeRequestRouter" + ) # Passing request_router_class as a class. deployment_config = DeploymentConfig.from_default( @@ -799,7 +804,7 @@ def test_autoscaling_policy_import_fails_for_non_existing_policy(): def test_default_autoscaling_policy_import_path(): """Test that default autoscaling policy can be imported.""" - policy = import_attr(DEFAULT_AUTOSCALING_POLICY) + policy = import_attr(DEFAULT_AUTOSCALING_POLICY_NAME) assert policy == default_autoscaling_policy diff --git a/python/ray/serve/tests/unit/test_constants_utils.py b/python/ray/serve/tests/unit/test_constants_utils.py index 3051d29b3c6b..e5e2754b5190 100644 --- a/python/ray/serve/tests/unit/test_constants_utils.py +++ b/python/ray/serve/tests/unit/test_constants_utils.py @@ -4,6 +4,7 @@ import pytest from ray.serve._private.constants_utils import ( + _validate_name, get_env_bool, get_env_float, get_env_float_non_negative, @@ -95,100 +96,142 @@ def mock_environ(): class TestEnvValueFunctions: def test_get_env_int(self, mock_environ): - assert 0 == get_env_int("TEST_VAR", 0) + assert 
get_env_int("RAY_SERVE_TEST_VAR", 0) == 0 - mock_environ["TEST_VAR"] = "42" - assert 42 == get_env_int("TEST_VAR", 0) + mock_environ["RAY_SERVE_TEST_VAR"] = "42" + assert get_env_int("RAY_SERVE_TEST_VAR", 0) == 42 - mock_environ["TEST_VAR"] = "-1" - assert -1 == get_env_int("TEST_VAR", 0) + mock_environ["RAY_SERVE_TEST_VAR"] = "-1" + assert get_env_int("RAY_SERVE_TEST_VAR", 0) == -1 - mock_environ["TEST_VAR"] = "0.1" + mock_environ["RAY_SERVE_TEST_VAR"] = "0.1" with pytest.raises(ValueError, match=".*`0.1` cannot be converted to `int`!*"): - get_env_int_positive("TEST_VAR", 5) + get_env_int_positive("RAY_SERVE_TEST_VAR", 5) - mock_environ["TEST_VAR"] = "abc" + mock_environ["RAY_SERVE_TEST_VAR"] = "abc" with pytest.raises(ValueError, match=".*`abc` cannot be converted to `int`!*"): - get_env_int_positive("TEST_VAR", 5) + get_env_int_positive("RAY_SERVE_TEST_VAR", 5) + + with pytest.raises(ValueError, match=".*require prefix `RAY_SERVE_`*"): + get_env_int_positive("NO_PREFIX", 5) def test_get_env_int_positive(self, mock_environ): - assert 1 == get_env_int_positive("TEST_VAR", 1) + assert get_env_int_positive("RAY_SERVE_TEST_VAR", 1) == 1 - mock_environ["TEST_VAR"] = "42" - assert 42 == get_env_int_positive("TEST_VAR", 0) + mock_environ["RAY_SERVE_TEST_VAR"] = "42" + assert get_env_int_positive("RAY_SERVE_TEST_VAR", 1) == 42 - mock_environ["TEST_VAR"] = "-1" + mock_environ["RAY_SERVE_TEST_VAR"] = "-1" with pytest.raises(ValueError, match=".*Expected positive `int`.*"): - get_env_int_positive("TEST_VAR", 5) + get_env_int_positive("RAY_SERVE_TEST_VAR", 5) def test_get_env_int_non_negative(self, mock_environ): - assert 0 == get_env_int_non_negative("TEST_VAR", 0) - assert 1 == get_env_int_non_negative("TEST_VAR", 1) + assert get_env_int_non_negative("RAY_SERVE_TEST_VAR", 0) == 0 + assert get_env_int_non_negative("RAY_SERVE_TEST_VAR", 1) == 1 + + mock_environ["RAY_SERVE_TEST_VAR"] = "42" + assert get_env_int_non_negative("RAY_SERVE_TEST_VAR", 0) == 42 - 
mock_environ["TEST_VAR"] = "42" - assert 42 == get_env_int_non_negative("TEST_VAR", 0) + mock_environ["RAY_SERVE_TEST_VAR"] = "-1" + with pytest.raises(ValueError, match=".*Expected non negative `int`.*"): + get_env_int_non_negative("RAY_SERVE_TEST_VAR", 5) - mock_environ["TEST_VAR"] = "-1" with pytest.raises(ValueError, match=".*Expected non negative `int`.*"): - get_env_int_non_negative("TEST_VAR", 5) + get_env_int_non_negative("RAY_SERVE_TEST_VAR_FROM_DEFAULT", -1) def test_get_env_float(self, mock_environ): - assert 0.0 == get_env_float("TEST_VAR", 0.0) + assert get_env_float("RAY_SERVE_TEST_VAR", 0.0) == 0.0 - mock_environ["TEST_VAR"] = "3.14" - assert 3.14 == get_env_float("TEST_VAR", 0.0) + mock_environ["RAY_SERVE_TEST_VAR"] = "3.14" + assert get_env_float("RAY_SERVE_TEST_VAR", 0.0) == 3.14 - mock_environ["TEST_VAR"] = "-2.5" - assert -2.5 == get_env_float("TEST_VAR", 0.0) + mock_environ["RAY_SERVE_TEST_VAR"] = "-2.5" + assert get_env_float("RAY_SERVE_TEST_VAR", 0.0) == -2.5 - mock_environ["TEST_VAR"] = "abc" + mock_environ["RAY_SERVE_TEST_VAR"] = "abc" with pytest.raises( ValueError, match=".*`abc` cannot be converted to `float`!*" ): - get_env_float("TEST_VAR", 0.0) + get_env_float("RAY_SERVE_TEST_VAR", 0.0) def test_get_env_float_positive(self, mock_environ): - assert 1.5 == get_env_float_positive("TEST_VAR", 1.5) + assert get_env_float_positive("RAY_SERVE_TEST_VAR", 1.5) == 1.5 + assert get_env_float_positive("RAY_SERVE_TEST_VAR", None) is None + + mock_environ["RAY_SERVE_TEST_VAR"] = "42.5" + assert get_env_float_positive("RAY_SERVE_TEST_VAR", 1.0) == 42.5 - mock_environ["TEST_VAR"] = "42.5" - assert 42.5 == get_env_float_positive("TEST_VAR", 0.0) + mock_environ["RAY_SERVE_TEST_VAR"] = "-1.2" + with pytest.raises(ValueError, match=".*Expected positive `float`.*"): + get_env_float_positive("RAY_SERVE_TEST_VAR", 5.0) + + with pytest.raises(ValueError, match=".*Expected positive `float`.*"): + get_env_float_positive("RAY_SERVE_TEST_VAR_FROM_DEFAULT", 0.0) 
- mock_environ["TEST_VAR"] = "-1.2" with pytest.raises(ValueError, match=".*Expected positive `float`.*"): - get_env_float_positive("TEST_VAR", 5.0) + get_env_float_positive("RAY_SERVE_TEST_VAR_FROM_DEFAULT", -1) def test_get_env_float_non_negative(self, mock_environ): - assert 0.0 == get_env_float_non_negative("TEST_VAR", 0.0) - assert 1.5 == get_env_float_non_negative("TEST_VAR", 1.5) + assert get_env_float_non_negative("RAY_SERVE_TEST_VAR", 0.0) == 0.0 + assert get_env_float_non_negative("RAY_SERVE_TEST_VAR", 1.5) == 1.5 - mock_environ["TEST_VAR"] = "42.5" - assert 42.5 == get_env_float_non_negative("TEST_VAR", 0.0) + mock_environ["RAY_SERVE_TEST_VAR"] = "42.5" + assert get_env_float_non_negative("RAY_SERVE_TEST_VAR", 0.0) == 42.5 - mock_environ["TEST_VAR"] = "-1.2" + mock_environ["RAY_SERVE_TEST_VAR"] = "-1.2" with pytest.raises(ValueError, match=".*Expected non negative `float`.*"): - get_env_float_non_negative("TEST_VAR", 5.0) + get_env_float_non_negative("RAY_SERVE_TEST_VAR", 5.0) def test_get_env_str(self, mock_environ): - mock_environ["TEST_STR"] = "hello" - assert get_env_str("TEST_STR", "default") == "hello" + mock_environ["RAY_SERVE_TEST_STR"] = "hello" + assert get_env_str("RAY_SERVE_TEST_STR", "default") == "hello" - assert get_env_str("NONEXISTENT_VAR", "default_str") == "default_str" + assert get_env_str("RAY_SERVE_NONEXISTENT_VAR", "default_str") == "default_str" - assert get_env_str("NONEXISTENT_VAR", None) is None + assert get_env_str("RAY_SERVE_NONEXISTENT_VAR", None) is None def test_get_env_bool(self, mock_environ): - mock_environ["TEST_BOOL_TRUE"] = "1" - assert get_env_bool("TEST_BOOL_TRUE", "0") is True + mock_environ["RAY_SERVE_TEST_BOOL_TRUE"] = "1" + assert get_env_bool("RAY_SERVE_TEST_BOOL_TRUE", "0") is True # Test with any other value (False) - mock_environ["TEST_BOOL_FALSE"] = "true" - assert get_env_bool("TEST_BOOL_FALSE", "0") is False - mock_environ["TEST_BOOL_FALSE2"] = "yes" - assert get_env_bool("TEST_BOOL_FALSE2", "0") is 
False + mock_environ["RAY_SERVE_TEST_BOOL_FALSE"] = "true" + assert get_env_bool("RAY_SERVE_TEST_BOOL_FALSE", "0") is False + mock_environ["RAY_SERVE_TEST_BOOL_FALSE2"] = "yes" + assert get_env_bool("RAY_SERVE_TEST_BOOL_FALSE2", "0") is False # Test with default when environment variable not set - assert get_env_bool("NONEXISTENT_VAR", "1") is True - assert get_env_bool("NONEXISTENT_VAR", "0") is False + assert get_env_bool("RAY_SERVE_NONEXISTENT_VAR", "1") is True + assert get_env_bool("RAY_SERVE_NONEXISTENT_VAR", "0") is False + + +class TestValidation: + @pytest.mark.parametrize( + "name", + [ + "RAY_SERVE_FOO", + "RAY_SERVE__DOUBLE_UNDERSCORE", + "RAY_SERVE_123", + "RAY_SERVE_VAR_NAME", + ], + ) + def test_validate_name_accepts_valid_prefix(self, name): + # Should not raise + assert _validate_name(name) is None + + @pytest.mark.parametrize( + "name", + [ + "", + "RAY_SERVE", # missing trailing underscore and name + "SERVE_VAR", + "ray_SERVE_BAR", + "RAY_service_VAR", + ], + ) + def test_validate_name_rejects_invalid_prefix(self, name): + with pytest.raises(ValueError, match=".*require prefix `RAY_SERVE_`*"): + _validate_name(name) if __name__ == "__main__": diff --git a/python/ray/serve/tests/unit/test_deployment_rank_manager.py b/python/ray/serve/tests/unit/test_deployment_rank_manager.py new file mode 100644 index 000000000000..211ce31b9471 --- /dev/null +++ b/python/ray/serve/tests/unit/test_deployment_rank_manager.py @@ -0,0 +1,343 @@ +import pytest + +from ray.serve._private.common import DeploymentID, ReplicaID +from ray.serve._private.deployment_state import DeploymentRankManager + + +@pytest.fixture +def rank_manager(): + """Fixture providing a fresh DeploymentRankManager instance for each test.""" + return DeploymentRankManager() + + +class MockDeploymentReplica: + """Mock replica for testing without heavy dependencies.""" + + def __init__( + self, + replica_id: str, + deployment_name: str = "test_deployment", + app_name: str = "test_app", + ): + 
self.replica_id = ReplicaID( + unique_id=replica_id, + deployment_id=DeploymentID(name=deployment_name, app_name=app_name), + ) + + def __str__(self): + return f"MockDeploymentReplica(replica_id={self.replica_id})" + + +class TestDeploymentRankManager: + """Test cases for DeploymentRankManager.""" + + def test_init(self, rank_manager): + """Test initialization creates empty state.""" + assert rank_manager._replica_ranks == {} + assert rank_manager._released_ranks == set() + assert rank_manager._next_rank == 0 + + def test_assign_rank_first_replica(self, rank_manager): + """Test assigning rank to first replica.""" + rank = rank_manager.assign_rank("replica_1") + assert rank == 0 + assert rank_manager._replica_ranks["replica_1"] == 0 + assert rank_manager._next_rank == 1 + assert rank_manager._released_ranks == set() + + def test_assign_rank_multiple_replicas(self, rank_manager): + """Test assigning ranks to multiple replicas.""" + rank1 = rank_manager.assign_rank("replica_1") + rank2 = rank_manager.assign_rank("replica_2") + rank3 = rank_manager.assign_rank("replica_3") + + assert rank1 == 0 + assert rank2 == 1 + assert rank3 == 2 + assert rank_manager._next_rank == 3 + assert len(rank_manager._replica_ranks) == 3 + + def test_assign_rank_reuses_released_ranks(self, rank_manager): + """Test that released ranks are reused before assigning new ones.""" + # Assign ranks to 3 replicas + rank_manager.assign_rank("replica_1") + rank_manager.assign_rank("replica_2") + rank_manager.assign_rank("replica_3") + + # Release middle rank + rank_manager.release_rank("replica_2") + assert 1 in rank_manager._released_ranks + + # New replica should get the released rank + rank = rank_manager.assign_rank("replica_4") + assert rank == 1 + assert 1 not in rank_manager._released_ranks + + def test_assign_rank_duplicate_fails(self): + """Test assigning rank to replica that already has one fails when flag is enabled.""" + rank_manager = DeploymentRankManager() + 
rank_manager.assign_rank("replica_1") + + with pytest.raises(RuntimeError, match="already has a rank assigned"): + rank_manager.assign_rank("replica_1") + + def test_release_rank(self, rank_manager): + """Test releasing a rank makes it available for reuse.""" + rank_manager.assign_rank("replica_1") + rank_manager.assign_rank("replica_2") + + rank_manager.release_rank("replica_1") + + assert "replica_1" not in rank_manager._replica_ranks + assert 0 in rank_manager._released_ranks + assert "replica_2" in rank_manager._replica_ranks + + def test_release_rank_nonexistent_replica(self): + """Test releasing rank for non-existent replica is safe.""" + rank_manager = DeploymentRankManager() + with pytest.raises(RuntimeError, match="has no rank assigned"): + rank_manager.release_rank("nonexistent") + + def test_recover_rank_basic(self, rank_manager): + """Test basic rank recovery.""" + rank_manager.recover_rank("replica_1", 5) + + assert rank_manager._replica_ranks["replica_1"] == 5 + assert rank_manager._next_rank == 6 + + def test_recover_rank_updates_next_rank(self, rank_manager): + """Test that recovering a high rank updates next_rank appropriately.""" + rank_manager.assign_rank("replica_1") # Gets rank 0 + rank_manager.recover_rank("replica_2", 10) + + assert rank_manager._next_rank == 11 + + # New replica should get rank 11 + rank = rank_manager.assign_rank("replica_3") + assert rank == 11 + + def test_recover_rank_removes_from_available(self, rank_manager): + """Test that recovering a rank removes it from available ranks.""" + rank_manager.assign_rank("replica_1") + rank_manager.assign_rank("replica_2") + rank_manager.release_rank("replica_1") # Rank 0 becomes available + + assert 0 in rank_manager._released_ranks + + # Recover rank 0 + rank_manager.recover_rank("replica_3", 0) + + assert 0 not in rank_manager._released_ranks + assert rank_manager._replica_ranks["replica_3"] == 0 + + def test_recover_rank_duplicate_fails(self): + """Test recovering rank for replica 
that already has one fails when flag is enabled.""" + rank_manager = DeploymentRankManager() + rank_manager.assign_rank("replica_1") + + with pytest.raises(RuntimeError, match="already has a rank assigned"): + rank_manager.recover_rank("replica_1", 5) + + def test_get_replica_rank_existing(self, rank_manager): + """Test getting rank for existing replica.""" + rank_manager.assign_rank("replica_1") + rank = rank_manager.get_replica_rank("replica_1") + assert rank == 0 + + def test_get_replica_rank_nonexistent_fails(self): + """Test getting rank for non-existent replica fails when flag is enabled.""" + rank_manager = DeploymentRankManager() + with pytest.raises(RuntimeError, match="has no rank assigned"): + rank_manager.get_replica_rank("nonexistent") + + def test_get_replica_ranks_mapping(self, rank_manager): + """Test getting copy of replica ranks mapping.""" + rank_manager.assign_rank("replica_1") + rank_manager.assign_rank("replica_2") + + mapping = rank_manager.get_replica_ranks_mapping() + expected = {"replica_1": 0, "replica_2": 1} + + assert mapping == expected + + # Verify it's a copy + mapping["replica_3"] = 2 + assert "replica_3" not in rank_manager._replica_ranks + + def test_clear(self, rank_manager): + """Test clearing all rank data.""" + rank_manager.assign_rank("replica_1") + rank_manager.assign_rank("replica_2") + rank_manager.release_rank("replica_1") + + rank_manager.clear() + + assert rank_manager._replica_ranks == {} + assert rank_manager._released_ranks == set() + assert rank_manager._next_rank == 0 + + def test_check_rank_consistency_empty_replicas(self, rank_manager): + """Test consistency check with no active replicas.""" + result = rank_manager.check_rank_consistency_and_reassign_minimally([]) + assert result == [] + + def test_check_rank_consistency_contiguous_ranks(self, rank_manager): + """Test consistency check with contiguous ranks (no reassignment needed).""" + # Set up contiguous ranks + replica1 = MockDeploymentReplica("replica_1") + 
replica2 = MockDeploymentReplica("replica_2") + replica3 = MockDeploymentReplica("replica_3") + + rank_manager.assign_rank("replica_1") # rank 0 + rank_manager.assign_rank("replica_2") # rank 1 + rank_manager.assign_rank("replica_3") # rank 2 + + result = rank_manager.check_rank_consistency_and_reassign_minimally( + [replica1, replica2, replica3] + ) + + assert result == [] + + def test_check_rank_consistency_non_contiguous_ranks(self, rank_manager): + """Test consistency check with non-contiguous ranks (reassignment needed).""" + # Set up non-contiguous ranks (simulate a replica being removed) + replica1 = MockDeploymentReplica("replica_1") + replica2 = MockDeploymentReplica("replica_2") + replica3 = MockDeploymentReplica("replica_3") + + # Manually set up non-contiguous ranks + rank_manager._replica_ranks = { + "replica_1": 0, + "replica_2": 2, # Gap at rank 1 + "replica_3": 3, + } + + result = rank_manager.check_rank_consistency_and_reassign_minimally( + [replica1, replica2, replica3] + ) + + # Should reassign some replicas to make ranks contiguous + assert len(result) > 0 + + # After reassignment, ranks should be contiguous + final_ranks = sorted(rank_manager._replica_ranks.values()) + expected_ranks = [0, 1, 2] + assert final_ranks == expected_ranks + + def test_minimal_reassignment_keeps_existing_when_possible(self, rank_manager): + """Test that minimal reassignment keeps existing ranks when possible.""" + replica1 = MockDeploymentReplica("replica_1") + replica2 = MockDeploymentReplica("replica_2") + replica3 = MockDeploymentReplica("replica_3") + replica4 = MockDeploymentReplica("replica_4") + + # Set up ranks: 0, 2, 5, 7 (non-contiguous) + rank_manager._replica_ranks = { + "replica_1": 0, # Should keep this + "replica_2": 2, # Should keep this + "replica_3": 5, # Should be reassigned to 1 + "replica_4": 7, # Should be reassigned to 3 + } + + result = rank_manager.check_rank_consistency_and_reassign_minimally( + [replica1, replica2, replica3, replica4] + ) + 
+ # Verify minimal reassignment + assert len(result) == 2 # Only 2 replicas should be reassigned + reassigned_ids = {r.replica_id.unique_id for r in result} + assert reassigned_ids == {"replica_3", "replica_4"} + + # Verify final ranks are contiguous + final_ranks = sorted(rank_manager._replica_ranks.values()) + assert final_ranks == [0, 1, 2, 3] + + # Verify that replica_1 and replica_2 kept their original ranks + assert rank_manager._replica_ranks["replica_1"] == 0 + assert rank_manager._replica_ranks["replica_2"] == 2 + + def test_check_rank_consistency_unranked_replicas_fails_when_flag_enabled(self): + """Test consistency check fails when active replicas have no ranks and flag is enabled.""" + rank_manager = DeploymentRankManager(_fail_on_error=True) + replica1 = MockDeploymentReplica("replica_1") + + with pytest.raises( + RuntimeError, match="Controller rank system is in an invalid state" + ): + rank_manager.check_rank_consistency_and_reassign_minimally([replica1]) + + def test_check_rank_consistency_unranked_replicas_logs_when_flag_disabled(self): + """Test consistency check only logs when active replicas have no ranks and flag is disabled.""" + rank_manager = DeploymentRankManager(_fail_on_error=False) + replica1 = MockDeploymentReplica("replica_1") + + # When flag is disabled, it logs error but still tries to proceed with reassignment + # However, the reassignment will fail when trying to access ranks that don't exist + result = rank_manager.check_rank_consistency_and_reassign_minimally([replica1]) + assert result == [replica1] + + def test_check_rank_consistency_stale_ranks_fails_when_flag_enabled(self): + """Test consistency check fails when there are stale ranks and flag is enabled.""" + rank_manager = DeploymentRankManager(_fail_on_error=True) + replica1 = MockDeploymentReplica("replica_1") + + # Set up stale rank (replica not in active list) + rank_manager.assign_rank("replica_1") + rank_manager.assign_rank("stale_replica") + + with pytest.raises( + 
RuntimeError, match="Controller rank system is in an invalid state" + ): + rank_manager.check_rank_consistency_and_reassign_minimally([replica1]) + + def test_check_rank_consistency_stale_ranks_logs_when_flag_disabled(self): + """Test consistency check only logs when there are stale ranks and flag is disabled.""" + rank_manager = DeploymentRankManager(_fail_on_error=False) + replica1 = MockDeploymentReplica("replica_1") + + # Set up stale rank (replica not in active list) + rank_manager.assign_rank("replica_1") + rank_manager.assign_rank("stale_replica") + + # When flag is disabled, it logs error but continues with reassignment + # Since only replica_1 is active and has rank 0, no reassignment needed + result = rank_manager.check_rank_consistency_and_reassign_minimally([replica1]) + assert result == [] + + def test_check_rank_consistency_duplicate_ranks_fails_when_flag_enabled(self): + """Test consistency check fails when there are duplicate ranks and flag is enabled.""" + rank_manager = DeploymentRankManager(_fail_on_error=True) + replica1 = MockDeploymentReplica("replica_1") + replica2 = MockDeploymentReplica("replica_2") + + # Manually create duplicate ranks (this should never happen in normal operation) + rank_manager._replica_ranks = {"replica_1": 0, "replica_2": 0} # Duplicate! + + with pytest.raises( + RuntimeError, match="Controller rank system is in an invalid state" + ): + rank_manager.check_rank_consistency_and_reassign_minimally( + [replica1, replica2] + ) + + def test_check_rank_consistency_duplicate_ranks_logs_when_flag_disabled(self): + """Test consistency check only logs when there are duplicate ranks and flag is disabled.""" + rank_manager = DeploymentRankManager(_fail_on_error=False) + replica1 = MockDeploymentReplica("replica_1") + replica2 = MockDeploymentReplica("replica_2") + + # Manually create duplicate ranks (this should never happen in normal operation) + rank_manager._replica_ranks = {"replica_1": 0, "replica_2": 0} # Duplicate! 
+ rank_manager._next_rank = 1 + + # When flag is disabled, it logs error but still performs reassignment to fix the issue + result = rank_manager.check_rank_consistency_and_reassign_minimally( + [replica1, replica2] + ) + assert result == [replica2] or result == [replica1] + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/unit/test_deployment_state.py b/python/ray/serve/tests/unit/test_deployment_state.py index 553a12bff588..9bafe7b4f9b3 100644 --- a/python/ray/serve/tests/unit/test_deployment_state.py +++ b/python/ray/serve/tests/unit/test_deployment_state.py @@ -8,13 +8,17 @@ from ray._common.ray_constants import DEFAULT_MAX_CONCURRENCY_ASYNC from ray.serve._private.autoscaling_state import AutoscalingStateManager from ray.serve._private.common import ( + RUNNING_REQUESTS_KEY, DeploymentHandleSource, DeploymentID, DeploymentStatus, DeploymentStatusTrigger, + HandleMetricReport, ReplicaID, + ReplicaMetricReport, ReplicaState, TargetCapacityDirection, + TimeStampedValue, ) from ray.serve._private.config import DeploymentConfig, ReplicaConfig from ray.serve._private.constants import ( @@ -58,6 +62,7 @@ # loop, so we can't "mark" a replica dead through a method. This global # state is cleared after each test that uses the fixtures in this file. 
dead_replicas_context = set() +replica_rank_context = {} TEST_DEPLOYMENT_ID = DeploymentID(name="test_deployment", app_name="test_app") TEST_DEPLOYMENT_ID_2 = DeploymentID(name="test_deployment_2", app_name="test_app") @@ -95,10 +100,11 @@ def __init__( self._node_instance_id = None self._node_id_is_set = False self._actor_id = None - self._port = None + self._internal_grpc_port = None self._pg_bundles = None self._initialization_latency_s = -1 self._docs_path = None + self._rank = replica_rank_context.get(replica_id.unique_id, None) @property def is_cross_language(self) -> bool: @@ -217,8 +223,10 @@ def set_node_id(self, node_id: str): def set_actor_id(self, actor_id: str): self._actor_id = actor_id - def start(self, deployment_info: DeploymentInfo): + def start(self, deployment_info: DeploymentInfo, rank: int): self.started = True + self._rank = rank + replica_rank_context[self._replica_id.unique_id] = rank def _on_scheduled_stub(*args, **kwargs): pass @@ -235,10 +243,20 @@ def _on_scheduled_stub(*args, **kwargs): on_scheduled=_on_scheduled_stub, ) - def reconfigure(self, version: DeploymentVersion): + @property + def rank(self) -> Optional[int]: + return self._rank + + def reconfigure( + self, + version: DeploymentVersion, + rank: int = None, + ): self.started = True updating = self.version.requires_actor_reconfigure(version) self.version = version + self._rank = rank + replica_rank_context[self._replica_id.unique_id] = rank return updating def recover(self): @@ -247,6 +265,7 @@ def recover(self): self.recovering = True self.started = False + self._rank = replica_rank_context.get(self._replica_id.unique_id, None) return True def check_ready(self) -> ReplicaStartupStatus: @@ -379,6 +398,7 @@ def create_deployment_state_manager( ) dead_replicas_context.clear() + replica_rank_context.clear() @pytest.fixture @@ -2388,7 +2408,9 @@ def test_recover_state_from_replica_names(mock_deployment_state_manager): # Deploy deployment with version "1" and one replica info1, v1 = 
deployment_info(version="1") - assert dsm.deploy(TEST_DEPLOYMENT_ID, info1) + target_state_changed = dsm.deploy(TEST_DEPLOYMENT_ID, info1) + assert target_state_changed + dsm.save_checkpoint() ds = dsm._deployment_states[TEST_DEPLOYMENT_ID] # Single replica of version `version1` should be created and in STARTING state @@ -2437,7 +2459,9 @@ def test_recover_during_rolling_update(mock_deployment_state_manager): # Step 1: Create some deployment info with actors in running state info1, v1 = deployment_info(version="1") - assert dsm.deploy(TEST_DEPLOYMENT_ID, info1) + target_state_changed = dsm.deploy(TEST_DEPLOYMENT_ID, info1) + assert target_state_changed + dsm.save_checkpoint() ds = dsm._deployment_states[TEST_DEPLOYMENT_ID] # Single replica of version `version1` should be created and in STARTING state @@ -2452,8 +2476,8 @@ def test_recover_during_rolling_update(mock_deployment_state_manager): # Now execute a rollout: upgrade the version to "2". info2, v2 = deployment_info(version="2") - assert dsm.deploy(TEST_DEPLOYMENT_ID, info2) - + target_state_changed = dsm.deploy(TEST_DEPLOYMENT_ID, info2) + assert target_state_changed # In real code this checkpoint would be done by the caller of .deploy() dsm.save_checkpoint() @@ -2518,7 +2542,9 @@ def test_actor_died_before_recover(mock_deployment_state_manager): # Create some deployment info with actors in running state info1, v1 = deployment_info(version="1") - assert dsm.deploy(TEST_DEPLOYMENT_ID, info1) + target_state_changed = dsm.deploy(TEST_DEPLOYMENT_ID, info1) + assert target_state_changed + dsm.save_checkpoint() ds = dsm._deployment_states[TEST_DEPLOYMENT_ID] # Single replica of version `version1` should be created and in STARTING state @@ -2649,7 +2675,7 @@ def test_max_concurrency_override(self): ) max_ongoing_requests = DEFAULT_MAX_CONCURRENCY_ASYNC + 1 d_info, _ = deployment_info(max_ongoing_requests=max_ongoing_requests) - replica_scheduling_request = actor_replica.start(d_info) + replica_scheduling_request = 
actor_replica.start(d_info, rank=0) assert ( "max_concurrency" in replica_scheduling_request.actor_options and replica_scheduling_request.actor_options["max_concurrency"] @@ -2808,24 +2834,42 @@ def test_basic_autoscaling( req_per_replica = 2 if target_capacity_direction == "up" else 0 replicas = ds._replicas.get() if RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE: - asm.record_request_metrics_for_handle( + handle_metric_report = HandleMetricReport( deployment_id=TEST_DEPLOYMENT_ID, handle_id="random", - actor_id=None, + actor_id="actor_id", handle_source=DeploymentHandleSource.UNKNOWN, queued_requests=0, - running_requests={ - replica._actor.replica_id: req_per_replica for replica in replicas + aggregated_metrics={ + RUNNING_REQUESTS_KEY: { + replica._actor.replica_id: req_per_replica + for replica in replicas + } + }, + metrics={ + RUNNING_REQUESTS_KEY: { + replica._actor.replica_id: [ + TimeStampedValue(timer.time(), req_per_replica) + ] + for replica in replicas + } }, - send_timestamp=timer.time(), + timestamp=timer.time(), ) + asm.record_request_metrics_for_handle(handle_metric_report) else: for replica in replicas: - asm.record_request_metrics_for_replica( + replica_metric_report = ReplicaMetricReport( replica_id=replica._actor.replica_id, - window_avg=req_per_replica, - send_timestamp=timer.time(), + aggregated_metrics={RUNNING_REQUESTS_KEY: req_per_replica}, + metrics={ + RUNNING_REQUESTS_KEY: [ + TimeStampedValue(timer.time(), req_per_replica) + ] + }, + timestamp=timer.time(), ) + asm.record_request_metrics_for_replica(replica_metric_report) # status=UPSCALING/DOWNSCALING, status_trigger=AUTOSCALE dsm.update() @@ -2966,20 +3010,35 @@ def test_downscaling_reclaiming_starting_replicas_first( running_replicas = ds._replicas.get(states=[ReplicaState.RUNNING]) replicas = ds._replicas.get() if RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE: - asm.record_request_metrics_for_handle( + handle_metric_report = HandleMetricReport( deployment_id=TEST_DEPLOYMENT_ID, 
handle_id="random", - actor_id=None, + actor_id="actor_id", handle_source=DeploymentHandleSource.UNKNOWN, queued_requests=0, - running_requests={replica._actor.replica_id: 2 for replica in replicas}, - send_timestamp=timer.time(), + aggregated_metrics={ + RUNNING_REQUESTS_KEY: { + replica._actor.replica_id: 2 for replica in replicas + } + }, + metrics={ + RUNNING_REQUESTS_KEY: { + replica._actor.replica_id: [TimeStampedValue(timer.time(), 2)] + for replica in replicas + } + }, + timestamp=timer.time(), ) + asm.record_request_metrics_for_handle(handle_metric_report) else: for replica in replicas: - asm.record_request_metrics_for_replica( - replica._actor.replica_id, 2, timer.time() + replica_metric_report = ReplicaMetricReport( + replica_id=replica._actor.replica_id, + aggregated_metrics={RUNNING_REQUESTS_KEY: 2}, + metrics={RUNNING_REQUESTS_KEY: [TimeStampedValue(timer.time(), 2)]}, + timestamp=timer.time(), ) + asm.record_request_metrics_for_replica(replica_metric_report) # status=UPSCALING, status_trigger=AUTOSCALE dsm.update() @@ -3043,20 +3102,35 @@ def test_downscaling_reclaiming_starting_replicas_first( # Now, trigger downscaling attempting to reclaim half (3) of the replicas replicas = ds._replicas.get(states=[ReplicaState.RUNNING]) if RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE: - asm.record_request_metrics_for_handle( + handle_metric_report = HandleMetricReport( deployment_id=TEST_DEPLOYMENT_ID, handle_id="random", - actor_id=None, + actor_id="actor_id", handle_source=DeploymentHandleSource.UNKNOWN, queued_requests=0, - running_requests={replica._actor.replica_id: 1 for replica in replicas}, - send_timestamp=timer.time(), + aggregated_metrics={ + RUNNING_REQUESTS_KEY: { + replica._actor.replica_id: 1 for replica in replicas + } + }, + metrics={ + RUNNING_REQUESTS_KEY: { + replica._actor.replica_id: [TimeStampedValue(timer.time(), 1)] + for replica in replicas + } + }, + timestamp=timer.time(), ) + 
asm.record_request_metrics_for_handle(handle_metric_report) else: for replica in replicas: - asm.record_request_metrics_for_replica( - replica._actor.replica_id, 1, timer.time() + replica_metric_report = ReplicaMetricReport( + replica_id=replica._actor.replica_id, + aggregated_metrics={RUNNING_REQUESTS_KEY: 1}, + metrics={RUNNING_REQUESTS_KEY: [TimeStampedValue(timer.time(), 1)]}, + timestamp=timer.time(), ) + asm.record_request_metrics_for_replica(replica_metric_report) # status=DOWNSCALING, status_trigger=AUTOSCALE dsm.update() @@ -3133,20 +3207,35 @@ def test_update_autoscaling_config(self, mock_deployment_state_manager): # Num ongoing requests = 1, status should remain HEALTHY replicas = ds._replicas.get() if RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE: - asm.record_request_metrics_for_handle( + handle_metric_report = HandleMetricReport( deployment_id=TEST_DEPLOYMENT_ID, handle_id="random", - actor_id=None, + actor_id="actor_id", handle_source=DeploymentHandleSource.UNKNOWN, queued_requests=0, - running_requests={replica._actor.replica_id: 1 for replica in replicas}, - send_timestamp=timer.time(), + aggregated_metrics={ + RUNNING_REQUESTS_KEY: { + replica._actor.replica_id: 1 for replica in replicas + } + }, + metrics={ + RUNNING_REQUESTS_KEY: { + replica._actor.replica_id: [TimeStampedValue(timer.time(), 1)] + for replica in replicas + } + }, + timestamp=timer.time(), ) + asm.record_request_metrics_for_handle(handle_metric_report) else: for replica in replicas: - asm.record_request_metrics_for_replica( - replica._actor.replica_id, 1, timer.time() + replica_metric_report = ReplicaMetricReport( + replica_id=replica._actor.replica_id, + aggregated_metrics={RUNNING_REQUESTS_KEY: 1}, + metrics={RUNNING_REQUESTS_KEY: [TimeStampedValue(timer.time(), 1)]}, + timestamp=timer.time(), ) + asm.record_request_metrics_for_replica(replica_metric_report) check_counts(ds, total=3, by_state=[(ReplicaState.RUNNING, 3, None)]) assert ds.curr_status_info.status == 
DeploymentStatus.HEALTHY @@ -3231,15 +3320,17 @@ def test_replicas_fail_during_initial_scale_from_zero( ds: DeploymentState = dsm._deployment_states[TEST_DEPLOYMENT_ID] # Send request metrics to controller to make the deployment upscale - asm.record_request_metrics_for_handle( + handle_metric_report = HandleMetricReport( deployment_id=TEST_DEPLOYMENT_ID, handle_id="random", - actor_id=None, + actor_id="actor_id", handle_source=DeploymentHandleSource.UNKNOWN, queued_requests=1, - running_requests={}, - send_timestamp=timer.time(), + aggregated_metrics={}, + metrics={}, + timestamp=timer.time(), ) + asm.record_request_metrics_for_handle(handle_metric_report) # The controller should try to start a new replica. If that replica repeatedly # fails to start, the deployment should transition to UNHEALTHY and NOT retry @@ -3345,15 +3436,17 @@ def test_replicas_fail_during_subsequent_scale_from_zero( check_counts(ds, total=0) # Send request metrics to controller to make the deployment upscale - asm.record_request_metrics_for_handle( + handle_metric_report = HandleMetricReport( deployment_id=TEST_DEPLOYMENT_ID, handle_id="random", - actor_id=None, + actor_id="actor_id", handle_source=DeploymentHandleSource.UNKNOWN, queued_requests=1, - running_requests={}, - send_timestamp=timer.time(), + aggregated_metrics={}, + metrics={}, + timestamp=timer.time(), ) + asm.record_request_metrics_for_handle(handle_metric_report) # The controller should try to start a new replica. If that replica repeatedly # fails to start, the deployment should transition to UNHEALTHY. 
Meanwhile @@ -3419,15 +3512,25 @@ def test_handle_metrics_timeout(self, mock_deployment_state_manager): check_counts(ds, total=1, by_state=[(ReplicaState.RUNNING, 1, None)]) # Record 2 requests/replica -> trigger upscale - asm.record_request_metrics_for_handle( + handle_metric_report = HandleMetricReport( deployment_id=TEST_DEPLOYMENT_ID, handle_id="random", - actor_id=None, + actor_id="actor_id", handle_source=DeploymentHandleSource.UNKNOWN, queued_requests=0, - running_requests={ds._replicas.get()[0]._actor.replica_id: 2}, - send_timestamp=timer.time(), + aggregated_metrics={ + RUNNING_REQUESTS_KEY: {ds._replicas.get()[0]._actor.replica_id: 2} + }, + metrics={ + RUNNING_REQUESTS_KEY: { + ds._replicas.get()[0]._actor.replica_id: [ + TimeStampedValue(timer.time(), 2) + ] + } + }, + timestamp=timer.time(), ) + asm.record_request_metrics_for_handle(handle_metric_report) asm.drop_stale_handle_metrics(dsm.get_alive_replica_actor_ids()) dsm.update() check_counts( @@ -3505,15 +3608,25 @@ def test_handle_metrics_on_dead_serve_actor(self, mock_deployment_state_manager) check_counts(ds2, total=1, by_state=[(ReplicaState.RUNNING, 1, None)]) # Record 2 requests/replica (sent from d2 replica) -> trigger upscale - asm.record_request_metrics_for_handle( + handle_metric_report = HandleMetricReport( deployment_id=d_id1, handle_id="random", actor_id="d2_replica_actor_id", handle_source=DeploymentHandleSource.REPLICA, queued_requests=0, - running_requests={ds1._replicas.get()[0]._actor.replica_id: 2}, - send_timestamp=timer.time(), + aggregated_metrics={ + RUNNING_REQUESTS_KEY: {ds1._replicas.get()[0]._actor.replica_id: 2} + }, + metrics={ + RUNNING_REQUESTS_KEY: { + ds1._replicas.get()[0]._actor.replica_id: [ + TimeStampedValue(timer.time(), 2) + ] + } + }, + timestamp=timer.time(), ) + asm.record_request_metrics_for_handle(handle_metric_report) asm.drop_stale_handle_metrics(dsm.get_alive_replica_actor_ids()) dsm.update() check_counts( @@ -4884,5 +4997,350 @@ def 
test_docs_path_not_updated_for_different_version(mock_deployment_state_manag assert ds.docs_path is None +class TestDeploymentRankManagerIntegrationE2E: + """End-to-end integration tests for rank functionality through deployment state manager.""" + + def _set_replicas_ready( + self, ds: DeploymentState, replica_states: List[ReplicaState] + ): + """Helper to set replicas in given states to ready.""" + for replica in ds._replicas.get(replica_states): + replica._actor.set_ready() + + def _set_replicas_done_stopping(self, ds: DeploymentState): + """Helper to set stopping replicas as done stopping.""" + for replica in ds._replicas.get([ReplicaState.STOPPING]): + replica._actor.set_done_stopping() + + def test_scaling_up_and_down_scenario(self, mock_deployment_state_manager): + """Test a realistic scaling scenario through deployment state manager.""" + create_dsm, _, _, _ = mock_deployment_state_manager + dsm: DeploymentStateManager = create_dsm() + + # Start with 3 replicas + info_1, v1 = deployment_info(num_replicas=3, version="1") + dsm.deploy(TEST_DEPLOYMENT_ID, info_1) + ds: DeploymentState = dsm._deployment_states[TEST_DEPLOYMENT_ID] + + # Create initial replicas + dsm.update() + check_counts(ds, total=3, by_state=[(ReplicaState.STARTING, 3, v1)]) + + # Set replicas ready + self._set_replicas_ready(ds, [ReplicaState.STARTING]) + dsm.update() + check_counts(ds, total=3, by_state=[(ReplicaState.RUNNING, 3, v1)]) + assert ds.curr_status_info.status == DeploymentStatus.HEALTHY + + # Check initial ranks are 0, 1, 2 + ranks_mapping = ds._get_replica_ranks_mapping() + ranks = sorted(ranks_mapping.values()) + assert ranks == [0, 1, 2], f"Expected ranks [0, 1, 2], got {ranks}" + + # Scale down to 2 replicas - this should trigger rank reassignment + info_2, _ = deployment_info(num_replicas=2, version="1") + dsm.deploy(TEST_DEPLOYMENT_ID, info_2) + dsm.update() + + # One replica should be stopping + check_counts( + ds, + total=3, + by_state=[(ReplicaState.RUNNING, 2, v1), 
(ReplicaState.STOPPING, 1, v1)], + ) + + # Complete the scale down + self._set_replicas_done_stopping(ds) + dsm.update() + check_counts(ds, total=2, by_state=[(ReplicaState.RUNNING, 2, v1)]) + assert ds.curr_status_info.status == DeploymentStatus.HEALTHY + + # Trigger rank consistency check with one more update + dsm.update() + + # After scaling down and reaching healthy status, ranks should be contiguous [0, 1] + ranks_mapping = ds._get_replica_ranks_mapping() + ranks = sorted(ranks_mapping.values()) + assert ranks == [0, 1], f"Expected ranks [0, 1] after scale down, got {ranks}" + + # Scale back up to 3 replicas - new replica should reuse available rank + info_3, _ = deployment_info(num_replicas=3, version="1") + dsm.deploy(TEST_DEPLOYMENT_ID, info_3) + dsm.update() + + # Should have one new starting replica + check_counts( + ds, + total=3, + by_state=[(ReplicaState.RUNNING, 2, v1), (ReplicaState.STARTING, 1, v1)], + ) + + # Set new replica ready + self._set_replicas_ready(ds, [ReplicaState.STARTING]) + dsm.update() + check_counts(ds, total=3, by_state=[(ReplicaState.RUNNING, 3, v1)]) + assert ds.curr_status_info.status == DeploymentStatus.HEALTHY + + # Trigger rank consistency check with one more update + dsm.update() + + # Final ranks should be contiguous [0, 1, 2] + ranks_mapping = ds._get_replica_ranks_mapping() + ranks = sorted(ranks_mapping.values()) + assert ranks == [0, 1, 2], f"Expected final ranks [0, 1, 2], got {ranks}" + + def test_controller_recovery_with_scattered_ranks( + self, mock_deployment_state_manager + ): + """Test controller recovery with existing replica ranks through deployment state manager.""" + create_dsm, _, _, _ = mock_deployment_state_manager + dsm: DeploymentStateManager = create_dsm() + + # Deploy with 3 replicas + info_1, v1 = deployment_info(num_replicas=3, version="1") + target_state_changed = dsm.deploy(TEST_DEPLOYMENT_ID, info_1) + assert target_state_changed + dsm.save_checkpoint() + ds: DeploymentState = 
dsm._deployment_states[TEST_DEPLOYMENT_ID] + + # Create replicas and get them running + dsm.update() + check_counts(ds, total=3, by_state=[(ReplicaState.STARTING, 3, v1)]) + self._set_replicas_ready(ds, [ReplicaState.STARTING]) + dsm.update() + check_counts(ds, total=3, by_state=[(ReplicaState.RUNNING, 3, v1)]) + + # Get the actual replica objects (not just IDs) + replicas = ds._replicas.get([ReplicaState.RUNNING]) + replica_ids = [replica.replica_id for replica in replicas] + + # Simulate controller crashed! Create a new deployment state manager + # with the existing replica IDs to trigger recovery + new_dsm: DeploymentStateManager = create_dsm( + [replica_id.to_full_id_str() for replica_id in replica_ids] + ) + + # New deployment state should be created and replicas should be RECOVERING + new_ds = new_dsm._deployment_states[TEST_DEPLOYMENT_ID] + check_counts(new_ds, total=3, by_state=[(ReplicaState.RECOVERING, 3, v1)]) + + # Complete recovery - set replicas ready + self._set_replicas_ready(new_ds, [ReplicaState.RECOVERING]) + new_dsm.update() + check_counts(new_ds, total=3, by_state=[(ReplicaState.RUNNING, 3, v1)]) + assert new_ds.curr_status_info.status == DeploymentStatus.HEALTHY + + # At this point ranks should be scattered but all values [0, 1, 2] should be present + ranks_mapping = new_ds._get_replica_ranks_mapping() + ranks = sorted(ranks_mapping.values()) + assert ranks == [0, 1, 2], "Should have recovered scattered ranks" + + # Trigger rank consistency check with one more update - this should reorder if needed + new_dsm.update() + + # After rank consistency check, ranks should still be [0, 1, 2] + final_ranks_mapping = new_ds._get_replica_ranks_mapping() + final_ranks = sorted(final_ranks_mapping.values()) + assert final_ranks == [ + 0, + 1, + 2, + ], f"Expected contiguous ranks [0, 1, 2] after consistency check, got {final_ranks}" + + # Clean up + replica_rank_context.clear() + + def test_complex_reassignment_scenario(self, 
mock_deployment_state_manager): + """Test complex reassignment with many gaps through deployment state manager.""" + create_dsm, _, _, _ = mock_deployment_state_manager + dsm: DeploymentStateManager = create_dsm() + + # Deploy with 4 replicas + info_1, v1 = deployment_info(num_replicas=4, version="1") + target_state_changed = dsm.deploy(TEST_DEPLOYMENT_ID, info_1) + assert target_state_changed + dsm.save_checkpoint() + ds: DeploymentState = dsm._deployment_states[TEST_DEPLOYMENT_ID] + + # Create replicas and get them running + dsm.update() + check_counts(ds, total=4, by_state=[(ReplicaState.STARTING, 4, v1)]) + self._set_replicas_ready(ds, [ReplicaState.STARTING]) + dsm.update() + check_counts(ds, total=4, by_state=[(ReplicaState.RUNNING, 4, v1)]) + + # Get the actual replica objects + replicas = ds._replicas.get([ReplicaState.RUNNING]) + replica_ids = [replica.replica_id for replica in replicas] + + # Simulate very scattered ranks in global context: 0, 3, 7, 10 + global replica_rank_context + replica_rank_context.clear() + replica_rank_context[replica_ids[0].unique_id] = 0 + replica_rank_context[replica_ids[1].unique_id] = 3 + replica_rank_context[replica_ids[2].unique_id] = 7 + replica_rank_context[replica_ids[3].unique_id] = 10 + + # Simulate controller crashed! 
Create a new deployment state manager + # with the existing replica IDs to trigger recovery + new_dsm: DeploymentStateManager = create_dsm( + [replica_id.to_full_id_str() for replica_id in replica_ids] + ) + + # New deployment state should be created and replicas should be RECOVERING + new_ds = new_dsm._deployment_states[TEST_DEPLOYMENT_ID] + check_counts(new_ds, total=4, by_state=[(ReplicaState.RECOVERING, 4, v1)]) + + # Complete recovery - set replicas ready + self._set_replicas_ready(new_ds, [ReplicaState.RECOVERING]) + new_dsm.update() + check_counts(new_ds, total=4, by_state=[(ReplicaState.RUNNING, 4, v1)]) + assert new_ds.curr_status_info.status == DeploymentStatus.HEALTHY + + # Trigger rank consistency check with one more update + new_dsm.update() + + # After reassignment, ranks should be contiguous [0, 1, 2, 3] + ranks_mapping = new_ds._get_replica_ranks_mapping() + ranks = sorted(ranks_mapping.values()) + assert ranks == [ + 0, + 1, + 2, + 3, + ], f"Expected reassigned ranks [0, 1, 2, 3], got {ranks}" + + def test_rank_consistency_during_version_rollout( + self, mock_deployment_state_manager + ): + """Test that rank consistency is maintained during version rollouts.""" + create_dsm, _, _, _ = mock_deployment_state_manager + dsm: DeploymentStateManager = create_dsm() + + # Start with 3 replicas of version 1 + info_1, v1 = deployment_info(num_replicas=3, version="1") + dsm.deploy(TEST_DEPLOYMENT_ID, info_1) + ds: DeploymentState = dsm._deployment_states[TEST_DEPLOYMENT_ID] + + # Create and ready initial replicas + dsm.update() + check_counts(ds, total=3, by_state=[(ReplicaState.STARTING, 3, v1)]) + self._set_replicas_ready(ds, [ReplicaState.STARTING]) + dsm.update() + check_counts(ds, total=3, by_state=[(ReplicaState.RUNNING, 3, v1)]) + assert ds.curr_status_info.status == DeploymentStatus.HEALTHY + + # Verify initial ranks are contiguous + ranks_mapping = ds._get_replica_ranks_mapping() + initial_ranks = sorted(ranks_mapping.values()) + assert initial_ranks 
== [0, 1, 2] + + # Deploy version 2 - this should trigger rolling update + info_2, v2 = deployment_info(num_replicas=3, version="2") + dsm.deploy(TEST_DEPLOYMENT_ID, info_2) + dsm.update() + + # Complete the rolling update step by step + while True: + # Set any new starting replicas ready + starting_replicas = ds._replicas.get([ReplicaState.STARTING]) + if starting_replicas: + self._set_replicas_ready(ds, [ReplicaState.STARTING]) + + # Complete any stopping replicas + stopping_replicas = ds._replicas.get([ReplicaState.STOPPING]) + if stopping_replicas: + self._set_replicas_done_stopping(ds) + + dsm.update() + + # Check if rolling update is complete + running_replicas = ds._replicas.get([ReplicaState.RUNNING]) + if len(running_replicas) == 3 and all( + r.version == v2 for r in running_replicas + ): + break + + # After rolling update is complete, deployment should be healthy + assert ds.curr_status_info.status == DeploymentStatus.HEALTHY + + # Trigger rank consistency check with one more update + dsm.update() + + # After rolling update, verify ranks are still contiguous + final_ranks_mapping = ds._get_replica_ranks_mapping() + final_ranks = sorted(final_ranks_mapping.values()) + assert final_ranks == [ + 0, + 1, + 2, + ], f"Expected contiguous ranks [0, 1, 2] after rollout, got {final_ranks}" + + def test_rank_assignment_with_replica_failures(self, mock_deployment_state_manager): + """Test rank handling when replicas fail during startup.""" + create_dsm, _, _, _ = mock_deployment_state_manager + dsm: DeploymentStateManager = create_dsm() + + # Deploy with 3 replicas + info_1, v1 = deployment_info(num_replicas=3, version="1") + dsm.deploy(TEST_DEPLOYMENT_ID, info_1) + ds: DeploymentState = dsm._deployment_states[TEST_DEPLOYMENT_ID] + + # Create initial replicas + dsm.update() + check_counts(ds, total=3, by_state=[(ReplicaState.STARTING, 3, v1)]) + + # Make first two replicas ready, but let the third fail + starting_replicas = ds._replicas.get([ReplicaState.STARTING]) 
+ starting_replicas[0]._actor.set_ready() + starting_replicas[1]._actor.set_ready() + starting_replicas[2]._actor.set_failed_to_start() + + dsm.update() + + running_count = ds._replicas.count(states=[ReplicaState.RUNNING]) + stopping_count = ds._replicas.count(states=[ReplicaState.STOPPING]) + assert running_count == 2, "Should have 2 running replicas" + assert stopping_count == 1, "Should have 1 stopping replica" + + self._set_replicas_done_stopping(ds) + dsm.update() + + starting_count = ds._replicas.count(states=[ReplicaState.STARTING]) + assert starting_count == 1, "Should have 1 starting replica" + + self._set_replicas_ready(ds, [ReplicaState.STARTING]) + + dsm.update() + # second update to reassign ranks + dsm.update() + + # Final verification - should have 3 running replicas (ignore failed/stopping replicas) + running_replicas = ds._replicas.get([ReplicaState.RUNNING]) + assert ( + len(running_replicas) == 3 + ), f"Expected 3 running replicas, got {len(running_replicas)}" + + # Verify that ranks are properly assigned and unique for running replicas + ranks_mapping = ds._get_replica_ranks_mapping() + + # Filter ranks to only include those for running replicas + running_replica_ids = [ + replica.replica_id.unique_id for replica in running_replicas + ] + running_replica_ranks = [ + ranks_mapping[replica_id] + for replica_id in running_replica_ids + if replica_id in ranks_mapping + ] + + # The ranks should be assigned to all running replicas + assert set(running_replica_ranks) == { + 0, + 1, + 2, + }, f"Expected ranks [0, 1, 2], got {ranks_mapping.values()}" + + if __name__ == "__main__": sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/unit/test_local_testing_mode.py b/python/ray/serve/tests/unit/test_local_testing_mode.py index bc5eae880864..d0ac157693fb 100644 --- a/python/ray/serve/tests/unit/test_local_testing_mode.py +++ b/python/ray/serve/tests/unit/test_local_testing_mode.py @@ -1,5 +1,6 @@ import logging import os 
+import re import sys import pytest @@ -57,6 +58,18 @@ def __init__(self, h: DeploymentHandle, should_raise: bool): def test_to_object_ref_error_message(): + def _get_error_match(by_reference: bool) -> str: + if by_reference: + return ( + "Converting DeploymentResponses to ObjectRefs " + "is not supported in local testing mode." + ) + else: + return re.escape( + "Converting by-value DeploymentResponses to ObjectRefs is not supported. " + "Use handle.options(_by_reference=True) to enable it." + ) + @serve.deployment class Inner: pass @@ -67,22 +80,18 @@ def __init__(self, h: DeploymentHandle): self._h = h async def __call__(self): + match = _get_error_match(self._h.handle_options._by_reference) with pytest.raises( RuntimeError, - match=( - "Converting DeploymentResponses to ObjectRefs " - "is not supported in local testing mode." - ), + match=match, ): await self._h.remote()._to_object_ref() h = serve.run(Outer.bind(Inner.bind()), _local_testing_mode=True) + match = _get_error_match(h.handle_options._by_reference) with pytest.raises( RuntimeError, - match=( - "Converting DeploymentResponses to ObjectRefs " - "is not supported in local testing mode." 
- ), + match=match, ): h.remote()._to_object_ref_sync() diff --git a/python/ray/serve/tests/unit/test_metrics_utils.py b/python/ray/serve/tests/unit/test_metrics_utils.py index 52f8c84166e2..dd5e68c25d81 100644 --- a/python/ray/serve/tests/unit/test_metrics_utils.py +++ b/python/ray/serve/tests/unit/test_metrics_utils.py @@ -1,10 +1,19 @@ import asyncio import sys +from collections import defaultdict import pytest from ray._common.test_utils import async_wait_for_condition -from ray.serve._private.metrics_utils import InMemoryMetricsStore, MetricsPusher +from ray.serve._private.metrics_utils import ( + QUEUED_REQUESTS_KEY, + InMemoryMetricsStore, + MetricsPusher, + TimeStampedValue, + _bucket_latest_by_window, + _merge_two_timeseries, + merge_timeseries_dicts, +) from ray.serve._private.test_utils import MockAsyncTimer @@ -136,13 +145,26 @@ def new_f(s): await metrics_pusher.graceful_shutdown() +def assert_timeseries_equal(actual, expected): + assert len(actual) == len( + expected + ), f"Length mismatch: {len(actual)} vs {len(expected)}" + for i, (a, e) in enumerate(zip(actual, expected)): + assert ( + a.timestamp == e.timestamp + ), f"Timestamp mismatch at {i}: {a.timestamp} vs {e.timestamp}" + assert a.value == e.value, f"Value mismatch at {i}: {a.value} vs {e.value}" + + class TestInMemoryMetricsStore: def test_basics(self): s = InMemoryMetricsStore() s.add_metrics_point({"m1": 1}, timestamp=1) s.add_metrics_point({"m1": 2}, timestamp=2) - assert s.window_average("m1", window_start_timestamp_s=0) == 1.5 - assert s.max("m1", window_start_timestamp_s=0) == 2 + assert s.aggregate_avg(["m1"]) == (1.5, 1) + assert s.aggregate_max(["m1"]) == (2, 1) + assert s.aggregate_min(["m1"]) == (1, 1) + assert s.get_latest("m1") == 2 def test_out_of_order_insert(self): s = InMemoryMetricsStore() @@ -151,53 +173,42 @@ def test_out_of_order_insert(self): s.add_metrics_point({"m1": 3}, timestamp=3) s.add_metrics_point({"m1": 2}, timestamp=2) s.add_metrics_point({"m1": 4}, 
timestamp=4) - assert s.window_average("m1", window_start_timestamp_s=0) == 3 - assert s.max("m1", window_start_timestamp_s=0) == 5 + assert s.aggregate_avg(["m1"]) == (3, 1) + assert s.aggregate_max(["m1"]) == (5, 1) + assert s.aggregate_min(["m1"]) == (1, 1) def test_window_start_timestamp(self): s = InMemoryMetricsStore() - assert s.window_average("m1", window_start_timestamp_s=0) is None - assert s.max("m1", window_start_timestamp_s=0) is None + assert s.aggregate_avg(["m1"]) == (None, 0) + assert s.aggregate_max(["m1"]) == (None, 0) + assert s.aggregate_min(["m1"]) == (None, 0) s.add_metrics_point({"m1": 1}, timestamp=2) - assert s.window_average("m1", window_start_timestamp_s=0) == 1 - assert ( - s.window_average("m1", window_start_timestamp_s=10, do_compact=False) - is None - ) - - def test_compaction_window(self): - s = InMemoryMetricsStore() - - s.add_metrics_point({"m1": 1}, timestamp=1) - s.add_metrics_point({"m1": 2}, timestamp=2) - - assert ( - s.window_average("m1", window_start_timestamp_s=0, do_compact=False) == 1.5 - ) - s.window_average("m1", window_start_timestamp_s=1.1, do_compact=True) - # First record should be removed. 
- assert s.window_average("m1", window_start_timestamp_s=0, do_compact=False) == 2 - - def test_compaction_max(self): - s = InMemoryMetricsStore() - - s.add_metrics_point({"m1": 1}, timestamp=2) - s.add_metrics_point({"m1": 2}, timestamp=1) - - assert s.max("m1", window_start_timestamp_s=0, do_compact=False) == 2 - - s.window_average("m1", window_start_timestamp_s=1.1, do_compact=True) - - assert s.window_average("m1", window_start_timestamp_s=0, do_compact=False) == 1 + assert s.aggregate_avg(["m1"]) == (1, 1) + s.prune_keys_and_compact_data(10) + assert s.aggregate_avg(["m1"]) == (None, 0) def test_multiple_metrics(self): s = InMemoryMetricsStore() s.add_metrics_point({"m1": 1, "m2": -1}, timestamp=1) s.add_metrics_point({"m1": 2, "m2": -2}, timestamp=2) - assert s.window_average("m1", window_start_timestamp_s=0) == 1.5 - assert s.max("m1", window_start_timestamp_s=0) == 2 - assert s.max("m2", window_start_timestamp_s=0) == -1 + assert s.aggregate_avg(["m1"]) == (1.5, 1) + assert s.aggregate_avg(["m2"]) == (-1.5, 1) + assert s.aggregate_avg(["m1", "m2"]) == (0, 2) + assert s.aggregate_max(["m1"]) == (2, 1) + assert s.aggregate_max(["m2"]) == (-1, 1) + assert s.aggregate_max(["m1", "m2"]) == (2, 2) + assert s.aggregate_min(["m1"]) == (1, 1) + assert s.aggregate_min(["m2"]) == (-2, 1) + assert s.aggregate_min(["m1", "m2"]) == (-2, 2) + + def test_empty_key_mix(self): + s = InMemoryMetricsStore() + s.add_metrics_point({"m1": 1}, timestamp=1) + assert s.aggregate_avg(["m1", "m2"]) == (1, 1) + assert s.aggregate_max(["m1", "m2"]) == (1, 1) + assert s.aggregate_min(["m1", "m2"]) == (1, 1) + assert s.aggregate_avg(["m2"]) == (None, 0) def test_prune_keys_and_compact_data(self): s = InMemoryMetricsStore() @@ -210,6 +221,364 @@ def test_prune_keys_and_compact_data(self): assert len(s.data["m2"]) == 2 and s.data["m2"] == s._get_datapoints("m2", 1.1) assert len(s.data["m3"]) == 1 and s.data["m3"] == s._get_datapoints("m3", 1.1) + def test_merge_metrics_stores(self): + s1 = 
InMemoryMetricsStore() + s2 = InMemoryMetricsStore() + s3 = InMemoryMetricsStore() + s1.add_metrics_point( + {"m1": 1, "m2": 2, "m3": 3, QUEUED_REQUESTS_KEY: 1}, timestamp=1 + ) + s2.add_metrics_point({"m1": 2, "m2": 2, QUEUED_REQUESTS_KEY: 1}, timestamp=2) + s3.add_metrics_point({"m2": 10, QUEUED_REQUESTS_KEY: 10}, timestamp=2) + merged = merge_timeseries_dicts(s1.data, s2.data, s3.data, window_s=1) + + assert_timeseries_equal( + merged["m1"], [TimeStampedValue(1, 1), TimeStampedValue(2, 2)] + ) + assert_timeseries_equal( + merged["m2"], [TimeStampedValue(1, 2), TimeStampedValue(2, 12)] + ) + assert_timeseries_equal(merged["m3"], [TimeStampedValue(1, 3)]) + assert_timeseries_equal( + merged[QUEUED_REQUESTS_KEY], + [TimeStampedValue(1, 1), TimeStampedValue(2, 11)], + ) + + s4 = InMemoryMetricsStore() + s4.add_metrics_point( + {"m1": 100, "m2": 100, "m3": 100, QUEUED_REQUESTS_KEY: 10}, timestamp=0 + ) + + merged = merge_timeseries_dicts(s1.data, s2.data, s3.data, s4.data, window_s=2) + + # With window_s=2 and window start alignment: + # Window boundaries: [0,2), [2,4), etc. 
+ # timestamp=0 (s4) and timestamp=1 (s1) -> window 0 + # timestamp=2 (s2, s3) -> window 1 + assert_timeseries_equal( + merged["m1"], + [TimeStampedValue(0, 101), TimeStampedValue(2, 2)], # 100+1=101, then 2 + ) + assert_timeseries_equal( + merged["m2"], + [ + TimeStampedValue(0, 102), + TimeStampedValue(2, 12), + ], # 100+2=102, then 2+10=12 + ) + assert_timeseries_equal( + merged["m3"], [TimeStampedValue(0, 103)] # 100+3=103, no data in window 1 + ) + assert_timeseries_equal( + merged[QUEUED_REQUESTS_KEY], + [TimeStampedValue(0, 11), TimeStampedValue(2, 11)], # 10+1=11, then 1+10=11 + ) + + s1_s2 = merge_timeseries_dicts(s1.data, s2.data, window_s=1) + s2_s1 = merge_timeseries_dicts(s2.data, s1.data, window_s=1) + s1_s2_s3_s4 = merge_timeseries_dicts( + s1.data, s2.data, s3.data, s4.data, window_s=1 + ) + s4_s1_s3_s2 = merge_timeseries_dicts( + s4.data, s1.data, s3.data, s2.data, window_s=1 + ) + + # dict equality -> compare per-key time series + for k in s1_s2: + assert_timeseries_equal(s1_s2[k], s2_s1[k]) + for k in s1_s2_s3_s4: + assert_timeseries_equal(s1_s2_s3_s4[k], s4_s1_s3_s2[k]) + + a1_none = merge_timeseries_dicts(s1.data, defaultdict(list), window_s=1) + for k in a1_none: + assert_timeseries_equal(a1_none[k], s1.data[k]) + + def test_bucket_latest_by_window_basic(self): + """Test basic functionality of _bucket_latest_by_window.""" + series = [ + TimeStampedValue(1.0, 10.0), + TimeStampedValue(1.5, 15.0), # Same window as 1.0, should overwrite + TimeStampedValue(3.0, 30.0), + ] + + # With window_s=1.0, start=0.0 + buckets = _bucket_latest_by_window(series, start=0.0, window_s=1.0) + + # Window 1: timestamps 1.0-2.0, latest value should be 15.0 + # Window 3: timestamp 3.0-4.0, value should be 30.0 + expected = {1: 15.0, 3: 30.0} + assert buckets == expected + + def test_bucket_latest_by_window_empty(self): + """Test _bucket_latest_by_window with empty series.""" + buckets = _bucket_latest_by_window([], start=0.0, window_s=1.0) + assert buckets == {} + + 
def test_bucket_latest_by_window_single_value(self): + """Test _bucket_latest_by_window with single value.""" + series = [TimeStampedValue(2.5, 25.0)] + buckets = _bucket_latest_by_window(series, start=0.0, window_s=1.0) + assert buckets == {2: 25.0} + + def test_bucket_latest_by_window_negative_timestamps(self): + """Test _bucket_latest_by_window with negative timestamps.""" + series = [ + TimeStampedValue(-1.5, 10.0), + TimeStampedValue(-0.5, 20.0), + TimeStampedValue(0.5, 30.0), + ] + buckets = _bucket_latest_by_window(series, start=-2.0, window_s=1.0) + # Window 0: -1.5 (index = (-1.5 - (-2.0)) // 1.0 = 0.5 // 1.0 = 0) + # Window 1: -0.5 (index = (-0.5 - (-2.0)) // 1.0 = 1.5 // 1.0 = 1) + # Window 2: 0.5 (index = (0.5 - (-2.0)) // 1.0 = 2.5 // 1.0 = 2) + expected = {0: 10.0, 1: 20.0, 2: 30.0} + assert buckets == expected + + def test_bucket_latest_by_window_very_small_window(self): + """Test _bucket_latest_by_window with very small windows.""" + series = [ + TimeStampedValue(1.001, 10.0), + TimeStampedValue(1.002, 20.0), # Different window + ] + buckets = _bucket_latest_by_window(series, start=1.0, window_s=0.001) + # With window_s=0.001: + # 1.001: (1.001 - 1.0) // 0.001 = 1.0 => window 1, but floor division gives 0 + # 1.002: (1.002 - 1.0) // 0.001 = 2.0 => window 2 + expected = { + 0: 10.0, + 2: 20.0, + } # Corrected based on actual floor division behavior + assert buckets == expected + + def test_merge_two_timeseries_both_empty(self): + """Test _merge_two_timeseries with both series empty.""" + result = _merge_two_timeseries([], [], window_s=1.0) + assert result == [] + + def test_merge_two_timeseries_one_empty(self): + """Test _merge_two_timeseries with one series empty.""" + t1 = [TimeStampedValue(1.0, 10.0), TimeStampedValue(2.0, 20.0)] + + result1 = _merge_two_timeseries(t1, [], window_s=1.0) + result2 = _merge_two_timeseries([], t1, window_s=1.0) + + # Results should be the same regardless of order + assert len(result1) == len(result2) == 2 + 
assert_timeseries_equal(result1, result2) + + def test_merge_two_timeseries_overlapping_windows(self): + """Test _merge_two_timeseries with values in overlapping time windows.""" + t1 = [TimeStampedValue(1.0, 10.0), TimeStampedValue(1.5, 15.0)] + t2 = [TimeStampedValue(1.3, 13.0), TimeStampedValue(1.8, 18.0)] + + result = _merge_two_timeseries(t1, t2, window_s=1.0) + + # With window_s=1.0 and earliest=1.0: + # start = 1.0 // 1.0 * 1.0 = 1.0 + # Window boundaries are [1.0, 2.0), [2.0, 3.0), etc. + # All values (1.0, 1.3, 1.5, 1.8) fall in window [1.0, 2.0) + # So we get 1 window + assert len(result) == 1 + + # Window 0: latest from t1 is 15.0 (1.5 > 1.0), latest from t2 is 18.0 (1.8 > 1.3), sum: 33.0 + assert result[0].value == 33.0 + + def test_merge_two_timeseries_zero_window(self): + """Test _merge_two_timeseries with zero window size.""" + t1 = [TimeStampedValue(1.0, 10.0)] + t2 = [TimeStampedValue(1.0, 20.0)] + + # Zero window should raise ValueError + with pytest.raises(ValueError, match="window_s must be positive, got 0"): + _merge_two_timeseries(t1, t2, window_s=0.0) + + def test_merge_two_timeseries_negative_window(self): + """Test _merge_two_timeseries with negative window size.""" + t1 = [TimeStampedValue(1.0, 10.0)] + t2 = [TimeStampedValue(1.0, 20.0)] + + # Negative window should raise ValueError + with pytest.raises(ValueError, match="window_s must be positive, got -1"): + _merge_two_timeseries(t1, t2, window_s=-1.0) + + def test_merge_two_timeseries_very_small_window(self): + """Test _merge_two_timeseries with very small window.""" + t1 = [TimeStampedValue(1.0, 10.0)] + t2 = [TimeStampedValue(1.0001, 20.0)] + + result = _merge_two_timeseries(t1, t2, window_s=0.0001) + + # With very small window, these should be in different buckets + assert len(result) == 2 + + def test_merge_two_timeseries_large_window(self): + """Test _merge_two_timeseries with very large window.""" + t1 = [TimeStampedValue(1.0, 10.0), TimeStampedValue(100.0, 15.0)] + t2 = 
[TimeStampedValue(50.0, 20.0), TimeStampedValue(200.0, 25.0)] + + result = _merge_two_timeseries(t1, t2, window_s=1000.0) + + # All values should be in the same window + assert len(result) == 1 + # Latest from t1: 15.0, latest from t2: 25.0, sum: 40.0 + assert result[0].value == 40.0 + + def test_merge_two_timeseries_duplicate_timestamps(self): + """Test _merge_two_timeseries with duplicate timestamps in same series.""" + t1 = [ + TimeStampedValue(1.0, 10.0), + TimeStampedValue(1.0, 15.0), # Duplicate timestamp + ] + t2 = [TimeStampedValue(1.0, 20.0)] + + result = _merge_two_timeseries(t1, t2, window_s=1.0) + + # Latest from t1 should be 15.0, t2 should be 20.0, sum: 35.0 + assert len(result) == 1 + assert result[0].value == 35.0 + + def test_merge_two_timeseries_floating_point_precision(self): + """Test _merge_two_timeseries with floating point precision edge cases.""" + # Test with timestamps that might have precision issues + t1 = [TimeStampedValue(0.1 + 0.2, 10.0)] # 0.30000000000000004 + t2 = [TimeStampedValue(0.3, 20.0)] + + result = _merge_two_timeseries(t1, t2, window_s=0.01) + + # These should be in the same window due to floating point precision + # but let's verify the behavior + assert len(result) >= 1 + + def test_merge_timeseries_dicts_empty_dicts(self): + """Test merge_timeseries_dicts with empty dictionaries.""" + result = merge_timeseries_dicts( + defaultdict(list), defaultdict(list), window_s=1.0 + ) + assert dict(result) == {} + + def test_merge_timeseries_dicts_single_dict(self): + """Test merge_timeseries_dicts with single dictionary.""" + data = defaultdict(list) + data["key1"] = [TimeStampedValue(1.0, 10.0)] + + result = merge_timeseries_dicts(data, window_s=1.0) + # With windowing applied, the result should have the same values but potentially different timestamps + expected = defaultdict(list) + expected["key1"] = [TimeStampedValue(1.0, 10.0)] # Window [1,2) starts at 1.0 + assert_timeseries_equal(result["key1"], expected["key1"]) + + def 
test_merge_timeseries_dicts_no_common_keys(self): + """Test merge_timeseries_dicts with dictionaries having no common keys.""" + d1 = defaultdict(list) + d1["key1"] = [TimeStampedValue(1.0, 10.0)] + + d2 = defaultdict(list) + d2["key2"] = [TimeStampedValue(2.0, 20.0)] + + result = merge_timeseries_dicts(d1, d2, window_s=1.0) + + assert "key1" in result + assert "key2" in result + assert len(result["key1"]) == 1 + assert len(result["key2"]) == 1 + + def test_merge_timeseries_dicts_many_stores(self): + """Test merge_timeseries_dicts with many stores.""" + stores = [] + for i in range(10): + store = defaultdict(list) + store["common_key"] = [TimeStampedValue(float(i), float(i * 10))] + stores.append(store) + + result = merge_timeseries_dicts(*stores, window_s=1.0) + + # Each value should be in its own window, sum should be 0+10+20+...+90 = 450 + assert "common_key" in result + total_value = sum(point.value for point in result["common_key"]) + assert total_value == 450.0 + + def test_merge_timeseries_dicts_zero_window(self): + """Test merge_timeseries_dicts with zero window size.""" + d1 = defaultdict(list) + d1["key1"] = [TimeStampedValue(1.0, 10.0)] + + d2 = defaultdict(list) + d2["key1"] = [TimeStampedValue(1.0, 20.0)] + + # Zero window should raise ValueError + with pytest.raises(ValueError, match="window_s must be positive, got 0"): + merge_timeseries_dicts(d1, d2, window_s=0.0) + + def test_merge_timeseries_dicts_negative_window(self): + """Test merge_timeseries_dicts with negative window size.""" + d1 = defaultdict(list) + d1["key1"] = [TimeStampedValue(1.0, 10.0)] + + # Negative window should raise ValueError + with pytest.raises(ValueError, match="window_s must be positive, got -1"): + merge_timeseries_dicts(d1, window_s=-1.0) + + def test_merge_timeseries_dicts_window_alignment_consistency(self): + """Test that window alignment is consistent regardless of input order.""" + # Create data that might expose window alignment issues + d1 = defaultdict(list) + 
d1["key1"] = [TimeStampedValue(1.1, 10.0)] + + d2 = defaultdict(list) + d2["key1"] = [TimeStampedValue(1.9, 20.0)] + + d3 = defaultdict(list) + d3["key1"] = [TimeStampedValue(2.1, 30.0)] + + # Test different orderings + result1 = merge_timeseries_dicts(d1, d2, d3, window_s=1.0) + result2 = merge_timeseries_dicts(d3, d1, d2, window_s=1.0) + result3 = merge_timeseries_dicts(d2, d3, d1, window_s=1.0) + + # Results should be the same regardless of order + assert_timeseries_equal(result1["key1"], result2["key1"]) + assert_timeseries_equal(result1["key1"], result3["key1"]) + + def test_merge_stores_bug_fix_window_center_calculation(self): + """Test for potential bug in window center calculation.""" + # This test checks if the window center calculation is correct + d1 = defaultdict(list) + d1["key1"] = [ + TimeStampedValue(0.0, 10.0), + TimeStampedValue(1.0, 15.0), + TimeStampedValue(2.0, 20.0), + TimeStampedValue(4.0, 30.0), + TimeStampedValue(5.0, 40.0), + ] + + result = merge_timeseries_dicts(d1, window_s=2.0) + + # With window_s=2.0 and window start alignment: + # Window [0,2): timestamps 0.0, 1.0 -> latest value 15.0 at window start 0.0 + # Window [2,4): timestamp 2.0 -> value 20.0 at window start 2.0 + # Window [4,6): timestamps 4.0, 5.0 -> latest value 40.0 at window start 4.0 + assert len(result["key1"]) == 3 + expected = [ + TimeStampedValue(timestamp=0.0, value=15.0), # Latest in window [0,2) + TimeStampedValue(timestamp=2.0, value=20.0), # Value in window [2,4) + TimeStampedValue(timestamp=4.0, value=40.0), # Latest in window [4,6) + ] + assert_timeseries_equal(result["key1"], expected) + + def test_merge_stores_preserves_value_precision(self): + """Test that merging preserves floating point precision of values.""" + d1 = defaultdict(list) + d1["key1"] = [TimeStampedValue(1.0, 0.1)] + + d2 = defaultdict(list) + d2["key1"] = [TimeStampedValue(1.0, 0.2)] + + result = merge_timeseries_dicts(d1, d2, window_s=1.0) + + # 0.1 + 0.2 should equal 0.3 exactly + assert 
len(result["key1"]) == 1 + assert abs(result["key1"][0].value - 0.3) < 1e-10 + if __name__ == "__main__": sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/unit/test_pow_2_request_router.py b/python/ray/serve/tests/unit/test_pow_2_request_router.py index cef2801833f8..a594b94045b7 100644 --- a/python/ray/serve/tests/unit/test_pow_2_request_router.py +++ b/python/ray/serve/tests/unit/test_pow_2_request_router.py @@ -119,7 +119,9 @@ async def get_queue_len(self, *, deadline_s: float) -> int: self.get_queue_len_was_cancelled = True raise - def send_request(self, pr: PendingRequest) -> ReplicaResult: + def try_send_request( + self, pr: PendingRequest, with_rejection: bool + ) -> ReplicaResult: raise NotImplementedError() def send_request_with_rejection(self, pr: PendingRequest) -> ReplicaResult: @@ -1916,7 +1918,7 @@ def fake_sample(seq, k): assert done.pop().result() == r3 # assert that we tried local node, followed by local AZ, followed by all replicas - assert len(chosen_replicas) == 3 + assert len(chosen_replicas) in (3, 4) assert set(chosen_replicas[0]) == {r1.replica_id} assert set(chosen_replicas[1]) == {r1.replica_id, r2.replica_id} # assert intersection of chosen_replicas[2] and {r1.replica_id, r2.replica_id, r3.replica_id} is not empty diff --git a/python/ray/serve/tests/unit/test_router.py b/python/ray/serve/tests/unit/test_router.py index 9017efd18c73..e08ffe3a5e65 100644 --- a/python/ray/serve/tests/unit/test_router.py +++ b/python/ray/serve/tests/unit/test_router.py @@ -22,7 +22,10 @@ RunningReplicaInfo, ) from ray.serve._private.config import DeploymentConfig -from ray.serve._private.constants import RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE +from ray.serve._private.constants import ( + RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE, + RAY_SERVE_METRICS_EXPORT_INTERVAL_MS, +) from ray.serve._private.replica_result import ReplicaResult from ray.serve._private.request_router import ( PendingRequest, @@ -43,11 +46,20 
@@ class FakeReplicaResult(ReplicaResult): - def __init__(self, replica_id, is_generator_object: bool): + def __init__( + self, + replica_id, + is_generator_object: bool, + queue_len_info: Optional[ReplicaQueueLengthInfo] = None, + ): self._replica_id = replica_id self._is_generator_object = is_generator_object + self._queue_len_info = queue_len_info self.cancelled = False + async def get_rejection_response(self): + return self._queue_len_info + def get(self, timeout_s: Optional[float]): raise NotImplementedError @@ -101,9 +113,9 @@ def is_cross_language(self) -> bool: def get_queue_len(self, *, deadline_s: float) -> int: raise NotImplementedError - async def send_request( + def try_send_request( self, pr: PendingRequest, with_rejection: bool - ) -> Tuple[Optional[FakeReplicaResult], Optional[ReplicaQueueLengthInfo]]: + ) -> FakeReplicaResult: if with_rejection: if self._error: raise self._error @@ -115,21 +127,16 @@ async def send_request( self._queue_len_info is not None ), "Must set queue_len_info to use `send_request_with_rejection`." 
- return ( - FakeReplicaResult(self._replica_id, is_generator_object=True), - self._queue_len_info, + return FakeReplicaResult( + self._replica_id, + is_generator_object=True, + queue_len_info=self._queue_len_info, ) else: if pr.metadata.is_streaming: - return ( - FakeReplicaResult(self._replica_id, is_generator_object=True), - None, - ) + return FakeReplicaResult(self._replica_id, is_generator_object=True) else: - return ( - FakeReplicaResult(self._replica_id, is_generator_object=False), - None, - ) + return FakeReplicaResult(self._replica_id, is_generator_object=False) class FakeRequestRouter(RequestRouter): @@ -180,6 +187,11 @@ def on_new_queue_len_info( replica_id, queue_len_info.num_ongoing_requests ) + def on_send_request(self, replica_id: ReplicaID): + if self._use_queue_len_cache: + num_ongoing_requests = self._replica_queue_len_cache.get(replica_id) or 0 + self._replica_queue_len_cache.update(replica_id, num_ongoing_requests + 1) + def on_replica_actor_unavailable(self, replica_id: ReplicaID): self._replica_queue_len_cache.invalidate_key(replica_id) @@ -760,7 +772,8 @@ def running_replica_info(replica_id: ReplicaID) -> RunningReplicaInfo: class TestRouterMetricsManager: - def test_num_router_requests(self): + @pytest.mark.asyncio + async def test_num_router_requests(self): tags = { "deployment": "a", "application": "b", @@ -779,15 +792,19 @@ def test_num_router_requests(self): ), FakeGauge(tag_keys=("deployment", "application", "handle", "actor_id")), FakeGauge(tag_keys=("deployment", "application", "handle", "actor_id")), + event_loop=asyncio.get_event_loop(), ) assert metrics_manager.num_router_requests.get_count(tags) is None n = random.randint(1, 10) for _ in range(n): metrics_manager.inc_num_total_requests(route="/alice") + + await asyncio.sleep(RAY_SERVE_METRICS_EXPORT_INTERVAL_MS * 2 / 1000) assert metrics_manager.num_router_requests.get_count(tags) == n - def test_num_queued_requests_gauge(self): + @pytest.mark.asyncio + async def 
test_num_queued_requests_gauge(self): tags = { "deployment": "a", "application": "b", @@ -805,18 +822,23 @@ def test_num_queued_requests_gauge(self): ), FakeGauge(tag_keys=("deployment", "application", "handle", "actor_id")), FakeGauge(tag_keys=("deployment", "application", "handle", "actor_id")), + event_loop=asyncio.get_event_loop(), ) assert metrics_manager.num_queued_requests_gauge.get_value(tags) == 0 n, m = random.randint(0, 10), random.randint(0, 5) for _ in range(n): metrics_manager.inc_num_queued_requests() + await asyncio.sleep(RAY_SERVE_METRICS_EXPORT_INTERVAL_MS * 2 / 1000) assert metrics_manager.num_queued_requests_gauge.get_value(tags) == n for _ in range(m): metrics_manager.dec_num_queued_requests() + + await asyncio.sleep(RAY_SERVE_METRICS_EXPORT_INTERVAL_MS * 2 / 1000) assert metrics_manager.num_queued_requests_gauge.get_value(tags) == n - m - def test_track_requests_sent_to_replicas(self): + @pytest.mark.asyncio + async def test_track_requests_sent_to_replicas(self): d_id = DeploymentID(name="a", app_name="b") metrics_manager = RouterMetricsManager( d_id, @@ -829,6 +851,7 @@ def test_track_requests_sent_to_replicas(self): ), FakeGauge(tag_keys=("deployment", "application", "handle", "actor_id")), FakeGauge(tag_keys=("deployment", "application", "handle", "actor_id")), + event_loop=asyncio.get_event_loop(), ) # r1: number requests -> 0, removed from list of running replicas -> prune @@ -845,6 +868,7 @@ def test_track_requests_sent_to_replicas(self): for i in range(4): for _ in range(i + 1): metrics_manager.inc_num_running_requests_for_replica(replica_ids[i]) + await asyncio.sleep(RAY_SERVE_METRICS_EXPORT_INTERVAL_MS * 2 / 1000) # All 4 replicas should have a positive number of requests for i, r in enumerate(replica_ids): @@ -866,6 +890,7 @@ def test_track_requests_sent_to_replicas(self): metrics_manager.dec_num_running_requests_for_replica(r1) for _ in range(2): metrics_manager.dec_num_running_requests_for_replica(r2) + await 
asyncio.sleep(RAY_SERVE_METRICS_EXPORT_INTERVAL_MS * 2 / 1000) assert metrics_manager.num_requests_sent_to_replicas[r1] == 0 assert metrics_manager.num_requests_sent_to_replicas[r2] == 0 @@ -889,6 +914,7 @@ def test_track_requests_sent_to_replicas(self): running_replica_info(r4), ] ) + await asyncio.sleep(RAY_SERVE_METRICS_EXPORT_INTERVAL_MS * 2 / 1000) # Only r1 should be pruned, the rest should still be tracked. assert r1 not in metrics_manager.num_requests_sent_to_replicas @@ -896,7 +922,8 @@ def test_track_requests_sent_to_replicas(self): assert r3 in metrics_manager.num_requests_sent_to_replicas assert r4 in metrics_manager.num_requests_sent_to_replicas - def test_should_send_scaled_to_zero_optimized_push(self): + @pytest.mark.asyncio + async def test_should_send_scaled_to_zero_optimized_push(self): metrics_manager = RouterMetricsManager( DeploymentID(name="a", app_name="b"), "random", @@ -908,6 +935,7 @@ def test_should_send_scaled_to_zero_optimized_push(self): ), FakeGauge(tag_keys=("deployment", "application", "handle", "actor_id")), FakeGauge(tag_keys=("deployment", "application", "handle", "actor_id")), + event_loop=asyncio.get_event_loop(), ) # Not an autoscaling deployment, should not push metrics @@ -926,10 +954,11 @@ def test_should_send_scaled_to_zero_optimized_push(self): # All 3 conditions satisfied, should push metrics assert metrics_manager.should_send_scaled_to_zero_optimized_push(0) + @pytest.mark.asyncio @patch( "ray.serve._private.router.RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE", "1" ) - def test_push_autoscaling_metrics_to_controller(self): + async def test_push_autoscaling_metrics_to_controller(self): timer = MockTimer() start = random.randint(50, 100) timer.reset(start) @@ -956,6 +985,7 @@ def test_push_autoscaling_metrics_to_controller(self): ), FakeGauge(tag_keys=("deployment", "application", "handle", "actor_id")), FakeGauge(tag_keys=("deployment", "application", "handle", "actor_id")), + event_loop=asyncio.get_event_loop(), ) 
metrics_manager._deployment_config = DeploymentConfig( autoscaling_config=AutoscalingConfig() @@ -976,14 +1006,9 @@ def test_push_autoscaling_metrics_to_controller(self): # Check metrics are pushed correctly metrics_manager.push_autoscaling_metrics_to_controller() - mock_controller_handle.record_handle_metrics.remote.assert_called_with( - deployment_id=deployment_id, - handle_id=handle_id, - actor_id=self_actor_id, - handle_source=DeploymentHandleSource.PROXY, - queued_requests=n, - running_requests=running_requests, - send_timestamp=start, + handle_metric_report = metrics_manager._get_metrics_report() + mock_controller_handle.record_autoscaling_metrics_from_handle.remote.assert_called_with( + handle_metric_report ) @pytest.mark.skipif( @@ -1014,6 +1039,7 @@ async def test_memory_cleared(self): ), FakeGauge(tag_keys=("deployment", "application", "handle", "actor_id")), FakeGauge(tag_keys=("deployment", "application", "handle", "actor_id")), + event_loop=asyncio.get_event_loop(), ) metrics_manager.update_deployment_config( deployment_config=DeploymentConfig( @@ -1056,11 +1082,12 @@ def check_database(expected: Set[ReplicaID]): check_database, expected={r1, r2, QUEUED_REQUESTS_KEY} ) + @pytest.mark.asyncio @patch( "ray.serve._private.router.RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE", "1" ) @patch("ray.serve._private.router.MetricsPusher") - def test_update_deployment_config(self, metrics_pusher_mock): + async def test_update_deployment_config(self, metrics_pusher_mock): metrics_manager = RouterMetricsManager( DeploymentID(name="a", app_name="b"), "random", @@ -1072,6 +1099,7 @@ def test_update_deployment_config(self, metrics_pusher_mock): ), FakeGauge(tag_keys=("deployment", "application", "handle", "actor_id")), FakeGauge(tag_keys=("deployment", "application", "handle", "actor_id")), + event_loop=asyncio.get_event_loop(), ) # Without autoscaling config, do nothing diff --git a/python/ray/serve/tests/unit/test_run_coroutine_threadsafe.py 
b/python/ray/serve/tests/unit/test_run_coroutine_threadsafe.py deleted file mode 100644 index ffc8a0fe7c01..000000000000 --- a/python/ray/serve/tests/unit/test_run_coroutine_threadsafe.py +++ /dev/null @@ -1,78 +0,0 @@ -import asyncio -import concurrent.futures -import sys -import threading - -import pytest - -from ray._common.test_utils import wait_for_condition -from ray.serve._private.utils import run_coroutine_or_future_threadsafe - - -@pytest.fixture -def separate_loop(): - loop = asyncio.new_event_loop() - thread = threading.Thread(target=loop.run_forever) - thread.start() - yield loop - loop.call_soon_threadsafe(loop.stop) - thread.join() - loop.close() - - -@pytest.mark.asyncio -async def test_run_coroutine_threadsafe_with_basic_coroutine(separate_loop): - async def sample_coro(): - await asyncio.sleep(0.01) - return "ok" - - future = run_coroutine_or_future_threadsafe(sample_coro(), separate_loop) - result = future.result(timeout=1) - - assert isinstance(future, concurrent.futures.Future) - assert result == "ok" - - -@pytest.mark.asyncio -async def test_run_coroutine_threadsafe_with_future(separate_loop): - async_future = asyncio.Future(loop=separate_loop) - async_future.set_result("ok2") - future = run_coroutine_or_future_threadsafe(async_future, separate_loop) - result = future.result(timeout=1) - assert result == "ok2" - - -@pytest.mark.asyncio -async def test_run_coroutine_threadsafe_with_task(separate_loop): - async def sample_coro(): - await asyncio.sleep(0.01) - return "ok" - - async_future = separate_loop.create_task(sample_coro()) - future = run_coroutine_or_future_threadsafe(async_future, separate_loop) - result = future.result(timeout=1) - assert result == "ok" - - -@pytest.mark.asyncio -async def test_run_coroutine_threadsafe_cancellation(separate_loop): - async def cancelled_coro(): - await asyncio.sleep(5) - - async_future = separate_loop.create_task(cancelled_coro()) - future = run_coroutine_or_future_threadsafe(async_future, separate_loop) 
- future.cancel() - assert future.cancelled() - wait_for_condition(lambda: async_future.cancelled()) - - -@pytest.mark.asyncio -async def test_run_coroutine_threadsafe_with_future_from_other_loop(separate_loop): - future = asyncio.Future(loop=asyncio.get_running_loop()) - future.set_result("ok") - with pytest.raises(AssertionError): - run_coroutine_or_future_threadsafe(future, separate_loop) - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/unit/test_task_consumer.py b/python/ray/serve/tests/unit/test_task_consumer.py new file mode 100644 index 000000000000..f7e6435a0254 --- /dev/null +++ b/python/ray/serve/tests/unit/test_task_consumer.py @@ -0,0 +1,284 @@ +import sys +import uuid +from typing import Any, Dict, List +from unittest.mock import MagicMock, call + +import pytest + +from ray.serve.schema import CeleryAdapterConfig, TaskProcessorConfig, TaskResult +from ray.serve.task_consumer import task_consumer, task_handler +from ray.serve.task_processor import TaskProcessorAdapter + + +class MockTaskProcessorAdapter(TaskProcessorAdapter): + """Mock adapter for testing task processor functionality.""" + + _start_consumer_received: bool = False + _stop_consumer_received: bool = False + _shutdown_received: bool = False + + def __init__(self, config: TaskProcessorConfig): + self._config = config + self.register_task_handle_mock = MagicMock() + + def initialize(self): + pass + + def register_task_handle(self, func, name=None): + self.register_task_handle_mock(func, name=name) + + def enqueue_task_sync( + self, task_name, args=None, kwargs=None, **options + ) -> TaskResult: + pass + + def get_task_status_sync(self, task_id) -> TaskResult: + pass + + def start_consumer(self, **kwargs): + self._start_consumer_received = True + + def stop_consumer(self, timeout: float = 10.0): + self._stop_consumer_received = True + + def shutdown(self): + self._shutdown_received = True + + def cancel_task_sync(self, task_id) 
-> bool: + pass + + def get_metrics_sync(self) -> Dict[str, Any]: + pass + + def health_check_sync(self) -> List[Dict]: + pass + + +@pytest.fixture +def config(): + """Provides a mock TaskProcessorConfig.""" + queue_name = f"test_queue_{uuid.uuid4().hex}" + return TaskProcessorConfig( + queue_name=queue_name, + adapter_config=CeleryAdapterConfig( + broker_url="fake://", + backend_url="fake://", + ), + adapter=MockTaskProcessorAdapter, + ) + + +class TestTaskHandlerDecorator: + """Test the task_handler decorator.""" + + def _create_and_test_handler(self, decorator_args=None, expected_name=None): + """Helper to create and test a task handler.""" + mock = MagicMock() + + if decorator_args is None: + + @task_handler + def test_handler(): + mock() + + else: + + @task_handler(**decorator_args) + def test_handler(): + mock() + + test_handler() + + assert mock.call_count == 1 + assert test_handler._task_name == expected_name + + def test_task_handler_decorator_with_name(self): + self._create_and_test_handler( + decorator_args={"name": "my_task"}, expected_name="my_task" + ) + + def test_task_handler_decorator_without_name(self): + self._create_and_test_handler(expected_name="test_handler") + + @pytest.mark.parametrize("invalid_name", ["", " ", 123]) + def test_task_handler_decorator_invalid_name(self, invalid_name): + """Test various invalid task names.""" + with pytest.raises( + ValueError, + match=f"Task name must be a non-empty string, got {invalid_name}", + ): + + @task_handler(name=invalid_name) + def my_task_handler(): + pass + + def test_task_handler_on_callable_object_without_name_attr(self): + """Test that AttributeError is raised for callables with no __name__.""" + + class MyCallable: + """A simple callable class without a __name__ attribute on instances.""" + + def __call__(self): + pass + + with pytest.raises(AttributeError): + task_handler(MyCallable()) + + +class TestTaskConsumerDecorator: + """Test the task_consumer decorator.""" + + def 
_verify_and_cleanup(self, instance, expected_calls=None): + """Verify consumer and cleanup instance.""" + adapter = instance._adapter + assert adapter._start_consumer_received + + if expected_calls is not None: + if expected_calls: + calls = [call(method, name=name) for method, name in expected_calls] + adapter.register_task_handle_mock.assert_has_calls( + calls, any_order=False + ) + assert adapter.register_task_handle_mock.call_count == len( + expected_calls + ) + else: + adapter.register_task_handle_mock.assert_not_called() + + del instance + + def _run_consumer_test( + self, config, consumer_class_factory, expected_calls_factory=None + ): + """Run a consumer test with factory functions.""" + consumer_class = consumer_class_factory(config) + instance = consumer_class() + + expected_calls = ( + expected_calls_factory(instance) if expected_calls_factory else None + ) + + self._verify_and_cleanup(instance, expected_calls) + + def test_task_consumer_basic(self, config): + """Test basic functionality of the task_consumer decorator.""" + + def make_consumer(cfg): + @task_consumer(task_processor_config=cfg) + class MyConsumer: + @task_handler + def my_task(self): + pass + + return MyConsumer + + self._run_consumer_test( + config, make_consumer, lambda inst: [(inst.my_task, "my_task")] + ) + + def test_task_consumer_multiple_handlers(self, config): + """Test with multiple task handlers.""" + + def make_consumer(cfg): + @task_consumer(task_processor_config=cfg) + class MyConsumer: + @task_handler + def task1(self): + pass + + @task_handler + def task2(self): + pass + + return MyConsumer + + self._run_consumer_test( + config, + make_consumer, + lambda inst: [(inst.task1, "task1"), (inst.task2, "task2")], + ) + + def test_task_consumer_custom_names(self, config): + """Test task handlers with and without custom names.""" + + def make_consumer(cfg): + @task_consumer(task_processor_config=cfg) + class MyConsumer: + @task_handler(name="custom_task") + def task1(self): + pass + 
+ @task_handler + def task2(self): + pass + + return MyConsumer + + self._run_consumer_test( + config, + make_consumer, + lambda inst: [(inst.task1, "custom_task"), (inst.task2, "task2")], + ) + + def test_task_consumer_init_args(self, config): + """Test that __init__ arguments are passed correctly.""" + + @task_consumer(task_processor_config=config) + class MyConsumer: + def __init__(self, value): + self.value = value + + instance = MyConsumer(value=42) + assert instance.value == 42 + self._verify_and_cleanup(instance) + + def test_task_consumer_no_handlers(self, config): + """Test with a class that has no task handlers.""" + + def make_consumer(cfg): + @task_consumer(task_processor_config=cfg) + class MyConsumer: + def some_method(self): + pass + + return MyConsumer + + self._run_consumer_test(config, make_consumer, lambda inst: []) + + def test_task_consumer_inheritance(self, config): + """Test that inherited task handlers are registered.""" + + def make_consumer(cfg): + class BaseConsumer: + @task_handler + def base_task(self): + pass + + @task_consumer(task_processor_config=cfg) + class DerivedConsumer(BaseConsumer): + @task_handler + def derived_task(self): + pass + + return DerivedConsumer + + self._run_consumer_test( + config, + make_consumer, + lambda inst: [ + (inst.base_task, "base_task"), + (inst.derived_task, "derived_task"), + ], + ) + + def test_task_consumer_no_args_decorator(self): + """Test using @task_consumer without arguments raises TypeError.""" + with pytest.raises(TypeError): + + @task_consumer + class MyConsumer: + pass + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/setup-dev.py b/python/ray/setup-dev.py index 2585da4e4f81..8f65e4e716ff 100755 --- a/python/ray/setup-dev.py +++ b/python/ray/setup-dev.py @@ -16,10 +16,11 @@ sys.path.append(this_dir) import argparse -import click import shutil import subprocess +import click + import ray @@ -165,6 +166,7 @@ def do_link(package, 
force=False, skip_list=None, allow_list=None, local_path=No "widgets": None, "cluster_utils.py": None, "_private": None, + "_common": None, "dashboard": None, } diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD.bazel similarity index 98% rename from python/ray/tests/BUILD rename to python/ray/tests/BUILD.bazel index a2b2f3b002a9..075863d7df4f 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD.bazel @@ -67,6 +67,7 @@ py_test_module_list( "test_reference_counting_2.py", "test_reference_counting_standalone.py", "test_runtime_env_agent.py", + "test_util_helpers.py", ], tags = [ "exclusive", @@ -82,8 +83,7 @@ py_test_module_list( py_test_module_list( size = "medium", env = { - "RAY_experimental_enable_open_telemetry_on_agent": "1", - "RAY_experimental_enable_open_telemetry_on_core": "1", + "RAY_enable_open_telemetry": "1", }, files = [ "test_metric_cardinality.py", @@ -510,6 +510,7 @@ py_test_module_list( "test_async_compat.py", "test_asyncio_cluster.py", "test_autoscaling_policy.py", + "test_baseexceptionandgroup.py", "test_bounded_unix_sockets.py", "test_component_failures.py", "test_concurrency_group.py", @@ -519,7 +520,6 @@ py_test_module_list( "test_distributed_sort.py", "test_environ.py", "test_error_ray_not_initialized.py", - "test_exceptiongroup.py", "test_gcs_pubsub.py", "test_get_or_create_actor.py", "test_grpc_client_credentials.py", @@ -609,9 +609,9 @@ py_test_module_list( ) py_test_module_list( - size = "medium", + size = "large", files = [ - "test_gpu_objects_gloo.py", + "gpu_objects/test_gpu_objects_gloo.py", ], tags = [ "exclusive", @@ -629,7 +629,8 @@ py_test_module_list( size = "medium", env = {"RAY_PYTEST_USE_GPU": "1"}, files = [ - "test_gpu_objects_nccl.py", + "gpu_objects/test_gpu_objects_nccl.py", + "gpu_objects/test_gpu_objects_nixl.py", ], tags = [ "exclusive", @@ -730,6 +731,7 @@ py_test_module_list( size = "medium", files = [ "test_autoscaler.py", + "test_symmetric_run.py", ], tags = [ "exclusive", @@ -881,6 +883,7 @@ 
py_test_module_list( "test_multi_node.py", "test_placement_group_3.py", "test_placement_group_5.py", + "test_raylet_fault_tolerance.py", "test_reconstruction.py", "test_reconstruction_2.py", "test_runtime_env_working_dir_uri.py", @@ -1089,8 +1092,8 @@ py_test( size = "large", srcs = ["test_runtime_env_container.py"], tags = [ - "container", "exclusive", + "runtime_env_container", "team:core", ], deps = [ diff --git a/python/ray/tests/accelerators/mock_pynvml.py b/python/ray/tests/accelerators/mock_pynvml.py index 6240961aea4b..24079a456d52 100644 --- a/python/ray/tests/accelerators/mock_pynvml.py +++ b/python/ray/tests/accelerators/mock_pynvml.py @@ -1,7 +1,8 @@ -import pytest from typing import List from unittest.mock import patch +import pytest + import ray._private.thirdparty.pynvml as pynvml diff --git a/python/ray/tests/accelerators/test_accelerators.py b/python/ray/tests/accelerators/test_accelerators.py index 80c1ef6ebf57..ac79765e88a7 100644 --- a/python/ray/tests/accelerators/test_accelerators.py +++ b/python/ray/tests/accelerators/test_accelerators.py @@ -1,4 +1,5 @@ import sys + import pytest from ray.util import accelerators diff --git a/python/ray/tests/accelerators/test_amd_gpu.py b/python/ray/tests/accelerators/test_amd_gpu.py index 1449d392b5b3..a1b13e575713 100644 --- a/python/ray/tests/accelerators/test_amd_gpu.py +++ b/python/ray/tests/accelerators/test_amd_gpu.py @@ -1,11 +1,14 @@ import os import sys -import pytest from unittest.mock import patch +import pytest + import ray -from ray._private.accelerators import AMDGPUAcceleratorManager -from ray._private.accelerators import get_accelerator_manager_for_resource +from ray._private.accelerators import ( + AMDGPUAcceleratorManager, + get_accelerator_manager_for_resource, +) @pytest.mark.parametrize( diff --git a/python/ray/tests/accelerators/test_hpu.py b/python/ray/tests/accelerators/test_hpu.py index e1a359051409..f6665c3001ed 100644 --- a/python/ray/tests/accelerators/test_hpu.py +++ 
b/python/ray/tests/accelerators/test_hpu.py @@ -1,10 +1,11 @@ import os import sys -import pytest from unittest.mock import patch +import pytest + import ray -from ray._private.accelerators import hpu, HPUAcceleratorManager +from ray._private.accelerators import HPUAcceleratorManager, hpu from ray.util.placement_group import placement_group from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy diff --git a/python/ray/tests/accelerators/test_intel_gpu.py b/python/ray/tests/accelerators/test_intel_gpu.py index 93dc8843bbdf..b74dd5296265 100644 --- a/python/ray/tests/accelerators/test_intel_gpu.py +++ b/python/ray/tests/accelerators/test_intel_gpu.py @@ -1,12 +1,15 @@ import os import sys -import pytest from unittest.mock import patch +import pytest + import ray -from ray._private.accelerators import IntelGPUAcceleratorManager as Accelerator -from ray._private.accelerators import get_accelerator_manager_for_resource -from ray.util.accelerators import INTEL_MAX_1550, INTEL_MAX_1100 +from ray._private.accelerators import ( + IntelGPUAcceleratorManager as Accelerator, + get_accelerator_manager_for_resource, +) +from ray.util.accelerators import INTEL_MAX_1100, INTEL_MAX_1550 def test_visible_intel_gpu_ids(shutdown_only): diff --git a/python/ray/tests/accelerators/test_neuron.py b/python/ray/tests/accelerators/test_neuron.py index 75443ec4ae11..19ba76d3d3e3 100644 --- a/python/ray/tests/accelerators/test_neuron.py +++ b/python/ray/tests/accelerators/test_neuron.py @@ -1,8 +1,9 @@ -import sys import subprocess -import pytest +import sys from unittest.mock import patch +import pytest + import ray from ray._private.accelerators import NeuronAcceleratorManager diff --git a/python/ray/tests/accelerators/test_npu.py b/python/ray/tests/accelerators/test_npu.py index 5c79d4d9c185..51cae14422b5 100644 --- a/python/ray/tests/accelerators/test_npu.py +++ b/python/ray/tests/accelerators/test_npu.py @@ -1,8 +1,9 @@ import os import sys -import pytest from 
unittest.mock import patch +import pytest + import ray from ray._private.accelerators import NPUAcceleratorManager as Accelerator diff --git a/python/ray/tests/accelerators/test_nvidia_gpu.py b/python/ray/tests/accelerators/test_nvidia_gpu.py index 035a866bfcbf..10c2065d3066 100644 --- a/python/ray/tests/accelerators/test_nvidia_gpu.py +++ b/python/ray/tests/accelerators/test_nvidia_gpu.py @@ -1,4 +1,5 @@ import sys + import pytest from ray._private.accelerators import NvidiaGPUAcceleratorManager diff --git a/python/ray/tests/accelerators/test_rbln.py b/python/ray/tests/accelerators/test_rbln.py index fa98927c346f..37865bff1392 100644 --- a/python/ray/tests/accelerators/test_rbln.py +++ b/python/ray/tests/accelerators/test_rbln.py @@ -1,11 +1,12 @@ -import pytest import os import sys +import pytest + from ray._private.accelerators.rbln import ( - RBLNAcceleratorManager, - RBLN_RT_VISIBLE_DEVICES_ENV_VAR, NOSET_RBLN_RT_VISIBLE_DEVICES_ENV_VAR, + RBLN_RT_VISIBLE_DEVICES_ENV_VAR, + RBLNAcceleratorManager, ) diff --git a/python/ray/tests/accelerators/test_tpu.py b/python/ray/tests/accelerators/test_tpu.py index e0d405f60efd..3f2c53286996 100644 --- a/python/ray/tests/accelerators/test_tpu.py +++ b/python/ray/tests/accelerators/test_tpu.py @@ -1,13 +1,14 @@ import os import sys from unittest import mock +from unittest.mock import patch + import pytest import requests -from unittest.mock import patch import ray -from ray._private.accelerators import TPUAcceleratorManager -from ray._private.accelerators import tpu +from ray._private.accelerators import TPUAcceleratorManager, tpu +from ray.tests.conftest import _ray_start_cluster @patch("glob.glob") @@ -353,5 +354,76 @@ def test_get_current_node_tpu_topology_from_metadata(): assert topology == "2x2x4" +@pytest.mark.parametrize( + "topology, accelerator_type, expected_pod_type", + [ + ("2x4", "TPU-V6E", "v6e-8"), + ("2x2x2", "TPU-V4", "v4-8"), + ("2x4x4", "TPU-V3", "v3-32"), + ("4x4", "TPU-V5P", "v5p-16"), + ("8x16", 
"TPU-V6E", "v6e-128"), + ("", "TPU-V3", None), + ("4x", "TPU-V3", None), + ], +) +def test_infer_tpu_pod_type_from_topology( + topology, accelerator_type, expected_pod_type +): + assert ( + tpu.infer_tpu_pod_type_from_topology(topology, accelerator_type) + == expected_pod_type + ) + + +@pytest.fixture +def ray_start_cpu(): + address_info = ray.init(num_cpus=1) + yield address_info + ray.shutdown() + + +@pytest.fixture +def ray_tpu_cluster(monkeypatch): + """Start a mock TPU Ray cluster.""" + with _ray_start_cluster() as cluster: + monkeypatch.setenv("TPU_NAME", "test-slice-0") + monkeypatch.setenv("TPU_WORKER_ID", "0") + monkeypatch.setenv("TPU_ACCELERATOR_TYPE", "v4-8") + monkeypatch.setenv("TPU_TOPOLOGY", "2x2x2") + + cluster.add_node( + num_cpus=2, + resources={"TPU": 4, "TPU-v4-8-head": 1}, + ) + monkeypatch.setenv("TPU_WORKER_ID", "1") + cluster.add_node( + num_cpus=2, + resources={"TPU": 4}, + ) + ray.init(address=cluster.address) + + yield cluster + ray.shutdown() + + +def test_fetch_tpu_slice_name_from_pg(ray_tpu_cluster): + """Tests that the slice name can be fetched from a PG.""" + tpu_head_pg = ray.util.placement_group(bundles=[{"TPU-v4-8-head": 1}]) + ray.get(tpu_head_pg.ready()) + + tpu_slice_name = "test-slice-0" + slice_name = tpu.fetch_tpu_slice_name_from_pg(tpu_head_pg) + assert slice_name == tpu_slice_name + + ray.util.remove_placement_group(tpu_head_pg) + + +def test_reserve_tpu_slice(ray_tpu_cluster): + """Tests that a TPU slice can be successfully reserved.""" + tpu_slice_name = "test-slice-0" + reserved_name = tpu.reserve_tpu_slice(topology="2x2x2", accelerator_type="TPU-V4") + assert reserved_name == tpu_slice_name + + if __name__ == "__main__": sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/tests/autoscaler/test_providers.py b/python/ray/tests/autoscaler/test_providers.py index b4e0a8c87676..85dd8b68171a 100644 --- a/python/ray/tests/autoscaler/test_providers.py +++ b/python/ray/tests/autoscaler/test_providers.py @@ -1,10 
+1,12 @@ +import unittest + +import yaml + from ray.autoscaler._private.providers import ( + _DEFAULT_CONFIGS, _NODE_PROVIDERS, _PROVIDER_PRETTY_NAMES, - _DEFAULT_CONFIGS, ) -import unittest -import yaml class TestProviders(unittest.TestCase): diff --git a/python/ray/tests/autoscaler/util.py b/python/ray/tests/autoscaler/util.py index 426ff3662402..a6631f3d2a31 100644 --- a/python/ray/tests/autoscaler/util.py +++ b/python/ray/tests/autoscaler/util.py @@ -1,5 +1,6 @@ import unittest from unittest.mock import Mock + from ray.autoscaler._private.util import get_per_node_breakdown_as_dict diff --git a/python/ray/tests/autoscaler_test_utils.py b/python/ray/tests/autoscaler_test_utils.py index 8cbcebd6ac2a..d0a0d39e567a 100644 --- a/python/ray/tests/autoscaler_test_utils.py +++ b/python/ray/tests/autoscaler_test_utils.py @@ -1,8 +1,8 @@ import re import threading - from subprocess import CalledProcessError from typing import Any, Dict, List, Optional + from ray.autoscaler.node_provider import NodeProvider diff --git a/python/ray/tests/aws/conftest.py b/python/ray/tests/aws/conftest.py index ed3a6a4b71ad..8f63e619cea9 100644 --- a/python/ray/tests/aws/conftest.py +++ b/python/ray/tests/aws/conftest.py @@ -1,9 +1,8 @@ import pytest +from botocore.stub import Stubber +from ray.autoscaler._private.aws.utils import client_cache, resource_cache from ray.autoscaler._private.constants import BOTO_MAX_RETRIES -from ray.autoscaler._private.aws.utils import resource_cache, client_cache - -from botocore.stub import Stubber @pytest.fixture() diff --git a/python/ray/tests/aws/test_aws_batch_tag_update.py b/python/ray/tests/aws/test_aws_batch_tag_update.py index a9bcd45ffab5..3bd39c0a4ef0 100644 --- a/python/ray/tests/aws/test_aws_batch_tag_update.py +++ b/python/ray/tests/aws/test_aws_batch_tag_update.py @@ -6,8 +6,7 @@ import pytest -from ray.autoscaler._private.aws.node_provider import AWSNodeProvider -from ray.autoscaler._private.aws.node_provider import TAG_BATCH_DELAY +from 
ray.autoscaler._private.aws.node_provider import TAG_BATCH_DELAY, AWSNodeProvider def mock_create_tags(provider, batch_updates): diff --git a/python/ray/tests/aws/utils/constants.py b/python/ray/tests/aws/utils/constants.py index b92fc5e45ea0..7b0ee9eca340 100644 --- a/python/ray/tests/aws/utils/constants.py +++ b/python/ray/tests/aws/utils/constants.py @@ -1,11 +1,11 @@ import copy -import ray from datetime import datetime +import ray from ray.autoscaler.tags import ( + NODE_KIND_HEAD, TAG_RAY_LAUNCH_CONFIG, TAG_RAY_NODE_KIND, - NODE_KIND_HEAD, TAG_RAY_USER_NODE_TYPE, ) diff --git a/python/ray/tests/aws/utils/helpers.py b/python/ray/tests/aws/utils/helpers.py index 12476cd6649c..9a3825634896 100644 --- a/python/ray/tests/aws/utils/helpers.py +++ b/python/ray/tests/aws/utils/helpers.py @@ -1,23 +1,24 @@ +import copy import os +from typing import Any, Dict + import yaml -import ray -import copy -from typing import Dict, Any +import ray +from ray.autoscaler._private.aws.cloudwatch.cloudwatch_helper import CloudwatchHelper from ray.autoscaler._private.aws.node_provider import AWSNodeProvider +from ray.autoscaler._private.commands import prepare_config, validate_config from ray.autoscaler.tags import ( - TAG_RAY_NODE_KIND, NODE_KIND_HEAD, NODE_KIND_WORKER, - TAG_RAY_USER_NODE_TYPE, TAG_RAY_CLUSTER_NAME, + TAG_RAY_NODE_KIND, + TAG_RAY_USER_NODE_TYPE, ) -from ray.autoscaler._private.commands import prepare_config, validate_config from ray.tests.aws.utils.constants import ( DEFAULT_CLUSTER_NAME, DEFAULT_NODE_PROVIDER_INSTANCE_TAGS, ) -from ray.autoscaler._private.aws.cloudwatch.cloudwatch_helper import CloudwatchHelper def get_aws_example_config_file_path(file_name): diff --git a/python/ray/tests/aws/utils/stubs.py b/python/ray/tests/aws/utils/stubs.py index a95b65cd9fdb..11c625c5b588 100644 --- a/python/ray/tests/aws/utils/stubs.py +++ b/python/ray/tests/aws/utils/stubs.py @@ -1,33 +1,32 @@ -from typing import Dict, List -import ray import copy import json - +from typing 
import Dict, List +from unittest import mock from uuid import uuid4 + +from botocore.stub import ANY + +import ray +from ray.autoscaler._private.aws.cloudwatch.cloudwatch_helper import ( + CLOUDWATCH_AGENT_INSTALLED_TAG, + CLOUDWATCH_CONFIG_HASH_TAG_BASE, +) +from ray.autoscaler._private.aws.config import key_pair +from ray.autoscaler.tags import NODE_KIND_HEAD, TAG_RAY_NODE_KIND from ray.tests.aws.utils import helpers from ray.tests.aws.utils.constants import ( + A_THOUSAND_SUBNETS_IN_DIFFERENT_VPCS, + DEFAULT_CLUSTER_NAME, DEFAULT_INSTANCE_PROFILE, DEFAULT_KEY_PAIR, - DEFAULT_SUBNET, - A_THOUSAND_SUBNETS_IN_DIFFERENT_VPCS, DEFAULT_LT, + DEFAULT_SUBNET, TWENTY_SUBNETS_IN_DIFFERENT_AZS, - DEFAULT_CLUSTER_NAME, ) -from ray.autoscaler._private.aws.config import key_pair from ray.tests.aws.utils.helpers import ( - get_cloudwatch_dashboard_config_file_path, get_cloudwatch_alarm_config_file_path, + get_cloudwatch_dashboard_config_file_path, ) -from ray.autoscaler._private.aws.cloudwatch.cloudwatch_helper import ( - CLOUDWATCH_AGENT_INSTALLED_TAG, - CLOUDWATCH_CONFIG_HASH_TAG_BASE, -) -from ray.autoscaler.tags import NODE_KIND_HEAD, TAG_RAY_NODE_KIND - -from unittest import mock - -from botocore.stub import ANY def configure_iam_role_default(iam_client_stub): diff --git a/python/ray/tests/chaos/potato_passer.py b/python/ray/tests/chaos/potato_passer.py index 4f84693fa647..25e5b912ef96 100644 --- a/python/ray/tests/chaos/potato_passer.py +++ b/python/ray/tests/chaos/potato_passer.py @@ -1,5 +1,6 @@ -import asyncio import argparse +import asyncio + import ray ray.init() diff --git a/python/ray/tests/chaos/streaming_llm.py b/python/ray/tests/chaos/streaming_llm.py index bbe5075d8c47..4b0536687dba 100644 --- a/python/ray/tests/chaos/streaming_llm.py +++ b/python/ray/tests/chaos/streaming_llm.py @@ -1,8 +1,8 @@ +import argparse import asyncio import logging -import requests -import argparse +import requests from fastapi import FastAPI from starlette.responses import 
StreamingResponse diff --git a/python/ray/tests/conftest.py b/python/ray/tests/conftest.py index a3738328233c..386f9f63baa0 100644 --- a/python/ray/tests/conftest.py +++ b/python/ray/tests/conftest.py @@ -2,6 +2,7 @@ This file defines the common pytest fixtures used in current directory. """ +import copy import json import logging import os @@ -16,35 +17,35 @@ from tempfile import gettempdir from typing import List, Optional from unittest import mock -import psutil + import pytest -import copy import ray -from ray._common.test_utils import wait_for_condition import ray._private.ray_constants as ray_constants -from ray._private.conftest_utils import set_override_dashboard_url # noqa: F401 from ray._common.network_utils import build_address +from ray._common.test_utils import wait_for_condition +from ray._private.conftest_utils import set_override_dashboard_url # noqa: F401 from ray._private.runtime_env import virtualenv_utils - from ray._private.test_utils import ( + RayletKiller, + external_redis_test_enabled, + find_free_port, get_and_run_resource_killer, + get_redis_cli, init_error_pubsub, init_log_pubsub, - setup_tls, - teardown_tls, - external_redis_test_enabled, redis_replicas, - get_redis_cli, - start_redis_instance, - start_redis_sentinel_instance, redis_sentinel_replicas, - find_free_port, reset_autoscaler_v2_enabled_cache, - RayletKiller, + setup_tls, + start_redis_instance, + start_redis_sentinel_instance, + teardown_tls, ) from ray.cluster_utils import AutoscalingCluster, Cluster, cluster_not_supported +import psutil + # TODO (mengjin) Improve the logging in the conftest files so that the logger can log # information in stdout as well as stderr and replace the print statements in the test # files @@ -1468,6 +1469,15 @@ def random_ascii_file(request): yield fp +# Clean up Ray address file before the test run starts, since sometimes bazel test times out +# and kill the test process, without cleaning up the Ray address file. 
+def pytest_sessionstart(session): + """Called after the Session object has been created and before performing collection and entering the run test loop.""" + + # Delete the cluster address file just in case. + ray._common.utils.reset_ray_address() + + """ pytest httpserver related test fixtures """ diff --git a/python/ray/tests/conftest_docker.py b/python/ray/tests/conftest_docker.py index ffe5d5c72ac4..02f4598484b1 100644 --- a/python/ray/tests/conftest_docker.py +++ b/python/ray/tests/conftest_docker.py @@ -1,10 +1,12 @@ +import subprocess import time +from typing import List + import pytest -from pytest_docker_tools import container, fetch, network, volume -from pytest_docker_tools import wrappers -import subprocess +from pytest_docker_tools import container, fetch, network, volume, wrappers + import docker -from typing import List + from ray._common.network_utils import build_address # If you need to debug tests using fixtures in this file, @@ -221,7 +223,16 @@ def podman_docker_cluster(): "-f", "/dev/null", ] - container_id = subprocess.check_output(start_container_command).decode("utf-8") + try: + container_id = subprocess.check_output( + start_container_command, stderr=subprocess.STDOUT + ).decode("utf-8") + except subprocess.CalledProcessError as e: + error_output = e.output.decode("utf-8") if e.output else "No output" + print(f"Command failed with return code {e.returncode}") + print(f"Full error output:\n{error_output}") + raise + container_id = container_id.strip() # Get group id that owns the docker socket file. 
Add user `ray` to diff --git a/python/ray/tests/gcp/test_gcp_node_provider.py b/python/ray/tests/gcp/test_gcp_node_provider.py index 13623ad41e04..1826b6781d6f 100644 --- a/python/ray/tests/gcp/test_gcp_node_provider.py +++ b/python/ray/tests/gcp/test_gcp_node_provider.py @@ -1,33 +1,32 @@ import logging import sys -from typing import Dict from threading import RLock -from unittest.mock import MagicMock, patch, call +from typing import Dict +from unittest.mock import MagicMock, call, patch import pytest +from ray.autoscaler._private.command_runner import DockerCommandRunner, SSHCommandRunner +from ray.autoscaler._private.gcp.config import ( + _get_num_tpu_chips, + _has_tpus_in_node_configs, + _is_single_host_tpu, + get_node_type, + tpu_accelerator_config_to_type, +) from ray.autoscaler._private.gcp.node import ( GCPCompute, GCPNode, GCPNodeType, GCPResource, ) - -from ray.tests.test_autoscaler import MockProcessRunner from ray.autoscaler._private.gcp.node_provider import GCPNodeProvider -from ray.autoscaler._private.gcp.config import ( - get_node_type, - _get_num_tpu_chips, - _is_single_host_tpu, - _has_tpus_in_node_configs, - tpu_accelerator_config_to_type, -) from ray.autoscaler._private.gcp.tpu_command_runner import ( TPUCommandRunner, - TPUVMSSHCommandRunner, TPUVMDockerCommandRunner, + TPUVMSSHCommandRunner, ) -from ray.autoscaler._private.command_runner import SSHCommandRunner, DockerCommandRunner +from ray.tests.test_autoscaler import MockProcessRunner _PROJECT_NAME = "project-one" _AZ = "us-west1-b" diff --git a/python/ray/tests/gcp/test_gcp_tpu_command_runner.py b/python/ray/tests/gcp/test_gcp_tpu_command_runner.py index df908f58cf8d..4c8a88e9149e 100644 --- a/python/ray/tests/gcp/test_gcp_tpu_command_runner.py +++ b/python/ray/tests/gcp/test_gcp_tpu_command_runner.py @@ -6,10 +6,10 @@ import pytest -from ray.tests.test_autoscaler import MockProvider, MockProcessRunner -from ray.autoscaler._private.gcp.tpu_command_runner import TPUCommandRunner -from 
ray.autoscaler._private.command_runner import SSHCommandRunner from ray._private import ray_constants +from ray.autoscaler._private.command_runner import SSHCommandRunner +from ray.autoscaler._private.gcp.tpu_command_runner import TPUCommandRunner +from ray.tests.test_autoscaler import MockProcessRunner, MockProvider _MOCK_TPU_NAME = "my-tpu" _MOCK_ACCELERATOR_TYPE = "v4-16" diff --git a/python/ray/tests/test_gpu_objects_gloo.py b/python/ray/tests/gpu_objects/test_gpu_objects_gloo.py similarity index 51% rename from python/ray/tests/test_gpu_objects_gloo.py rename to python/ray/tests/gpu_objects/test_gpu_objects_gloo.py index a1b10fc4cb47..ce723104736a 100644 --- a/python/ray/tests/test_gpu_objects_gloo.py +++ b/python/ray/tests/gpu_objects/test_gpu_objects_gloo.py @@ -1,11 +1,15 @@ -import sys import random -import torch +import sys +import threading +import time + import pytest +import torch + import ray -from ray.experimental.collective import create_collective_group +from ray._common.test_utils import SignalActor, wait_for_condition from ray._private.custom_types import TensorTransportEnum -from ray._common.test_utils import wait_for_condition +from ray.experimental.collective import create_collective_group # tensordict is not supported on macos ci, so we skip the tests support_tensordict = sys.platform != "darwin" @@ -14,7 +18,11 @@ from tensordict import TensorDict -@ray.remote(enable_tensor_transport=True) +# TODO: check whether concurrency groups are created correctly if +# enable_tensor_transport is True or if any methods are decorated with +# @ray.method(tensor_transport=...). Check that specifying +# .options(tensor_transport=...) fails if enable_tensor_transport is False. 
+@ray.remote class GPUTestActor: @ray.method(tensor_transport="gloo") def echo(self, data): @@ -30,19 +38,25 @@ def double(self, data): return data.apply(lambda x: x * 2) return data * 2 + def increment(self, data): + data += 1 + return data + def get_out_of_band_tensors(self, obj_id: str, timeout=None): gpu_object_store = ( ray._private.worker.global_worker.gpu_object_manager.gpu_object_store ) if timeout is None: timeout = 0 - gpu_object = gpu_object_store.wait_and_get_object(obj_id, timeout) - return gpu_object.data + return gpu_object_store.wait_and_get_object(obj_id, timeout) def get_num_gpu_objects(self): gpu_object_manager = ray._private.worker.global_worker.gpu_object_manager return gpu_object_manager.gpu_object_store.get_num_objects() + def fail(self, error_message): + raise Exception(error_message) + @pytest.mark.parametrize("data_size_bytes", [100]) def test_gc_gpu_object(ray_start_regular, data_size_bytes): @@ -203,6 +217,78 @@ def test_p2p(ray_start_regular): assert ray.get(result) == pytest.approx(medium_tensor * 2) +def test_p2p_errors_before_group_creation(ray_start_regular): + world_size = 2 + actors = [GPUTestActor.remote() for _ in range(world_size)] + + small_tensor = torch.randn((1,)) + sender = actors[0] + + with pytest.raises( + ValueError, + match="Actor.* does not have tensor transport GLOO available.*", + ): + sender.echo.remote(small_tensor) + + +@pytest.mark.parametrize("has_tensor_transport_method", [True, False]) +def test_p2p_blocking(ray_start_regular, has_tensor_transport_method): + """Test that p2p transfers still work when sender is blocked in another + task. 
This should work whether the actor has (a) a tensor transport method + (a method decorated with @ray.method(tensor_transport=...)) or (b) an actor-level decorator + @ray.remote(enable_tensor_transport=True).""" + + class _GPUTestActor: + def double(self, data): + if isinstance(data, list): + return [self.double(d) for d in data] + if support_tensordict and isinstance(data, TensorDict): + return data.apply(lambda x: x * 2) + return data * 2 + + def infinite_sleep(self, signal): + signal.send.remote() + while True: + time.sleep(0.1) + + if has_tensor_transport_method: + # Test tensor transport annotation via ray.method. + @ray.remote + class GPUTestActor(_GPUTestActor): + @ray.method(tensor_transport="gloo") + def echo(self, data): + return data + + else: + # Test tensor transport annotation via ray.remote. + @ray.remote(enable_tensor_transport=True) + class GPUTestActor(_GPUTestActor): + def echo(self, data): + return data + + sender, receiver = GPUTestActor.remote(), GPUTestActor.remote() + signal = SignalActor.remote() + create_collective_group([sender, receiver], backend="torch_gloo") + tensor = torch.randn((500, 500)) + # If the actor does not have a tensor transport method declared, declare it + # dynamically using .options(). + sender_fn = ( + sender.echo + if has_tensor_transport_method + else sender.echo.options(tensor_transport="gloo") + ) + ref = sender_fn.remote(tensor) + + # Start a blocking task on the sender actor. + sender.infinite_sleep.remote(signal) + ray.get(signal.wait.remote(), timeout=10) + + # Ensure that others can still receive the object. 
+ result = receiver.double.remote(ref) + result = ray.get(result, timeout=10) + assert result == pytest.approx(tensor * 2) + + def test_p2p_with_cpu_data(ray_start_regular): world_size = 2 actors = [GPUTestActor.remote() for _ in range(world_size)] @@ -303,6 +389,29 @@ def test_mix_cpu_gpu_data(ray_start_regular): tensor = torch.randn((1,)) cpu_data = random.randint(0, 100) + + data = [tensor, cpu_data] + + sender, receiver = actors[0], actors[1] + ref = sender.echo.remote(data) + ref = receiver.double.remote(ref) + result = ray.get(ref) + + assert result[0] == pytest.approx(tensor * 2) + assert result[1] == cpu_data * 2 + + +def test_object_in_plasma(ray_start_regular): + """ + This test uses a CPU object that is large enough to be stored + in plasma instead of being inlined in the gRPC message. + """ + world_size = 2 + actors = [GPUTestActor.remote() for _ in range(world_size)] + create_collective_group(actors, backend="torch_gloo") + + tensor = torch.randn((1,)) + cpu_data = b"1" * 1000 * 1000 data = [tensor, cpu_data] sender, receiver = actors[0], actors[1] @@ -408,10 +517,6 @@ def test_fetch_gpu_object_to_driver(ray_start_regular): assert result[2] == 7 -@pytest.mark.skipif( - not support_tensordict, - reason="tensordict is not supported on this platform", -) def test_invalid_tensor_transport(ray_start_regular): with pytest.raises(ValueError, match="Invalid tensor transport"): @@ -490,5 +595,295 @@ def test_tensor_extracted_from_tensordict_in_gpu_object_store(ray_start_regular) assert torch.equal(ret_val_src[1], td["reward"]) +@pytest.mark.parametrize("enable_tensor_transport", [True, False]) +def test_dynamic_tensor_transport_via_options( + ray_start_regular, enable_tensor_transport +): + """Test that tensor_transport can be set dynamically via .options() at call + time, if enable_tensor_transport is set to True in @ray.remote.""" + + class TestActor: + def __init__(self): + pass + + def normal_method(self): + return "normal" + + def tensor_method(self): + 
return torch.randn(5, 5) + + def double(self, data): + return data * 2 + + if enable_tensor_transport: + TestActor = ray.remote(enable_tensor_transport=True)(TestActor) + else: + TestActor = ray.remote(TestActor) + + # Create actor without any tensor_transport decorators + sender = TestActor.remote() + receiver = TestActor.remote() + create_collective_group([sender, receiver], backend="torch_gloo") + + # Test normal method call + result = ray.get(sender.normal_method.remote()) + assert result == "normal" + + # Test method call with tensor_transport specified via .options() + if enable_tensor_transport: + # If enable_tensor_transport is set to True, then it's okay to use + # dynamic tensor_transport. + ref = sender.tensor_method.options(tensor_transport="gloo").remote() + tensor = ray.get(ref) + result = ray.get(receiver.double.remote(ref)) + assert result == pytest.approx(tensor * 2) + else: + # If enable_tensor_transport is not set, then user cannot use + # dynamic tensor_transport. + with pytest.raises( + ValueError, + match='Currently, methods with .options\\(tensor_transport="GLOO"\\) are not supported when enable_tensor_transport=False. Please set @ray.remote\\(enable_tensor_transport=True\\) on the actor class definition.', + ): + ref = sender.tensor_method.options(tensor_transport="gloo").remote() + + +def test_app_error_inter_actor(ray_start_regular): + world_size = 2 + actors = [GPUTestActor.remote() for _ in range(world_size)] + create_collective_group(actors, backend="torch_gloo") + + src_actor, dst_actor = actors[0], actors[1] + + # Make sure the receiver can receive an exception from the sender. + ref = src_actor.fail.options(tensor_transport="gloo").remote("test_app_error") + with pytest.raises(Exception, match="test_app_error"): + ray.get(dst_actor.double.remote(ref)) + + # Make sure the sender and receiver do not hang. 
+ small_tensor = torch.randn((1,)) + ref = src_actor.echo.remote(small_tensor) + result = dst_actor.double.remote(ref) + assert ray.get(result) == pytest.approx(small_tensor * 2) + + +def test_app_error_intra_actor(ray_start_regular): + actor = GPUTestActor.remote() + create_collective_group([actor], backend="torch_gloo") + + # Make sure the receiver can receive an exception from the sender. + ref = actor.fail.options(tensor_transport="gloo").remote("test_app_error") + with pytest.raises(Exception, match="test_app_error"): + ray.get(actor.double.remote(ref)) + + # Make sure the sender and receiver do not hang. + small_tensor = torch.randn((1,)) + ref = actor.echo.remote(small_tensor) + result = actor.double.remote(ref) + assert ray.get(result) == pytest.approx(small_tensor * 2) + + +def test_app_error_fetch_to_driver(ray_start_regular): + actor = GPUTestActor.remote() + create_collective_group([actor], backend="torch_gloo") + + ref = actor.fail.options(tensor_transport="gloo").remote("test_app_error") + with pytest.raises(Exception, match="test_app_error"): + ray.get(ref) + + # Make sure the driver can receive an exception from the actor. + small_tensor = torch.tensor([1, 2, 3]) + ref = actor.echo.remote(small_tensor) + assert torch.equal(ray.get(ref), small_tensor) + + +def test_write_after_save(ray_start_regular): + """Check that an actor can safely write to a tensor after saving it to its + local state by calling `ray.experimental.wait_tensor_freed`.""" + + @ray.remote(enable_tensor_transport=True) + class GPUTestActor: + @ray.method(tensor_transport="gloo") + def save(self, data: torch.Tensor): + # Save the tensor to the actor's local state. + self.data = data + return data + + def receive(self, data: torch.Tensor): + return data + + def increment_saved(self): + ray.experimental.wait_tensor_freed(self.data) + # Write to the saved tensor. 
+ self.data += 1 + return self.data + + world_size = 2 + actors = [GPUTestActor.remote() for _ in range(world_size)] + create_collective_group(actors, backend="torch_gloo") + + medium_tensor = torch.randn((500, 500)) + sender, receiver = actors + ref = sender.save.remote(medium_tensor) + # Sender writes to the GPU object while Ray sends the object to a receiver + # task in the background. + tensor1 = sender.increment_saved.remote() + tensor2 = receiver.receive.remote(ref) + + # The sender task should not have returned yet because the ObjectRef is + # still in scope. + with pytest.raises(ray.exceptions.GetTimeoutError): + ray.get(tensor1, timeout=1) + + del ref + # Check that Ray completed the transfer of the original tensor before the + # sender writes to it. + assert torch.allclose(ray.get(tensor1), medium_tensor + 1) + assert torch.allclose(ray.get(tensor2), medium_tensor) + + +def test_wait_tensor_freed(ray_start_regular): + """Unit test for ray.experimental.wait_tensor_freed. Check that the call + returns when the tensor has been freed from the GPU object store.""" + gpu_object_store = ray.worker.global_worker.gpu_object_manager.gpu_object_store + obj_id = "random_id" + tensor = torch.randn((1,)) + gpu_object_store.add_object(obj_id, [tensor], is_primary=True) + + assert gpu_object_store.has_object(obj_id) + with pytest.raises(TimeoutError): + ray.experimental.wait_tensor_freed(tensor, timeout=1) + assert gpu_object_store.has_object(obj_id) + + # Simulate garbage collection in a background thread. + def gc(): + time.sleep(0.1) + gpu_object_store.pop_object(obj_id) + + gc_thread = threading.Thread(target=gc) + gc_thread.start() + # Now the wait_tensor_freed call should be able to return. 
+ ray.experimental.wait_tensor_freed(tensor) + gc_thread.join() + assert not gpu_object_store.has_object(obj_id) + + +def test_wait_tensor_freed_double_tensor(ray_start_regular): + """Unit test for ray.experimental.wait_tensor_freed when multiple objects + contain the same tensor.""" + gpu_object_store = ray.worker.global_worker.gpu_object_manager.gpu_object_store + obj_id1 = "random_id1" + obj_id2 = "random_id2" + tensor = torch.randn((1,)) + gpu_object_store.add_object(obj_id1, [tensor], is_primary=True) + gpu_object_store.add_object(obj_id2, [tensor], is_primary=True) + + assert gpu_object_store.has_object(obj_id1) + assert gpu_object_store.has_object(obj_id2) + with pytest.raises(TimeoutError): + ray.experimental.wait_tensor_freed(tensor, timeout=1) + assert gpu_object_store.has_object(obj_id1) + assert gpu_object_store.has_object(obj_id2) + + # Simulate garbage collection in a background thread. + def gc(obj_id): + time.sleep(0.1) + gpu_object_store.pop_object(obj_id) + + # Free one object. Tensor should still be stored. + gc_thread = threading.Thread(target=gc, args=(obj_id1,)) + gc_thread.start() + with pytest.raises(TimeoutError): + ray.experimental.wait_tensor_freed(tensor, timeout=1) + gc_thread.join() + assert not gpu_object_store.has_object(obj_id1) + + # Free the other object. Now the wait_tensor_freed call should be able to + # return. 
+ gc_thread = threading.Thread(target=gc, args=(obj_id2,)) + gc_thread.start() + ray.experimental.wait_tensor_freed(tensor) + gc_thread.join() + assert not gpu_object_store.has_object(obj_id2) + + +def test_send_back_and_dst_warning(ray_start_regular): + # Test warning when object is sent back to the src actor and to dst actors + world_size = 2 + actors = [GPUTestActor.remote() for _ in range(world_size)] + create_collective_group(actors, backend="torch_gloo") + + src_actor, dst_actor = actors[0], actors[1] + + tensor = torch.tensor([1, 2, 3]) + + warning_message = r"GPU ObjectRef\(.+\)" + + with pytest.warns(UserWarning, match=warning_message): + t = src_actor.echo.remote(tensor) + t1 = src_actor.echo.remote(t) # Sent back to the source actor + t2 = dst_actor.echo.remote(t) # Also sent to another actor + ray.get([t1, t2]) + + # Second transmission of ObjectRef `t` to `dst_actor` should not trigger a warning + # Verify no `pytest.warns` context is used here because no warning should be raised + t3 = dst_actor.echo.remote(t) + ray.get(t3) + + +def test_duplicate_objectref_transfer(ray_start_regular): + world_size = 2 + actors = [GPUTestActor.remote() for _ in range(world_size)] + create_collective_group(actors, backend="torch_gloo") + actor0, actor1 = actors[0], actors[1] + + small_tensor = torch.randn((1,)) + + # Store the original value for comparison + original_value = small_tensor + + ref = actor0.echo.remote(small_tensor) + + # Pass the same ref to actor1 twice + result1 = actor1.increment.remote(ref) + result2 = actor1.increment.remote(ref) + + # Both should return original_value + 1 because each increment task should receive the same object value. 
+ val1 = ray.get(result1) + val2 = ray.get(result2) + + # Check for correctness + assert val1 == pytest.approx( + original_value + 1 + ), f"Result1 incorrect: got {val1}, expected {original_value + 1}" + assert val2 == pytest.approx( + original_value + 1 + ), f"Result2 incorrect: got {val2}, expected {original_value + 1}" + + # Additional check: results should be equal (both got clean copies) + assert val1 == pytest.approx( + val2 + ), f"Results differ: result1={val1}, result2={val2}" + + +def test_transfer_from_not_actor_creator(ray_start_regular): + @ray.remote + class Actor: + @ray.method(tensor_transport="gloo") + def create(self): + return torch.tensor([1, 2, 3]) + + def consume(self, obj): + return obj + + def do_transfer(self, a1, a2): + create_collective_group([a1, a2], backend="torch_gloo") + return ray.get(a1.consume.remote(a2.create.remote())) + + actor = [Actor.remote() for _ in range(3)] + assert ray.get(actor[2].do_transfer.remote(actor[0], actor[1])) == pytest.approx( + torch.tensor([1, 2, 3]) + ) + + if __name__ == "__main__": sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/tests/test_gpu_objects_nccl.py b/python/ray/tests/gpu_objects/test_gpu_objects_nccl.py similarity index 99% rename from python/ray/tests/test_gpu_objects_nccl.py rename to python/ray/tests/gpu_objects/test_gpu_objects_nccl.py index f3e8bc409f20..7c2307871733 100644 --- a/python/ray/tests/test_gpu_objects_nccl.py +++ b/python/ray/tests/gpu_objects/test_gpu_objects_nccl.py @@ -1,6 +1,8 @@ import sys -import torch + import pytest +import torch + import ray from ray.experimental.collective import create_collective_group diff --git a/python/ray/tests/gpu_objects/test_gpu_objects_nixl.py b/python/ray/tests/gpu_objects/test_gpu_objects_nixl.py new file mode 100644 index 000000000000..1d57b8c675a9 --- /dev/null +++ b/python/ray/tests/gpu_objects/test_gpu_objects_nixl.py @@ -0,0 +1,125 @@ +import sys + +import pytest +import torch + +import ray + + 
+@ray.remote(num_gpus=1, num_cpus=0, enable_tensor_transport=True) +class GPUTestActor: + @ray.method(tensor_transport="nixl") + def echo(self, data, device): + return data.to(device) + + def sum(self, data, device): + assert data.device.type == device + return data.sum().item() + + def produce(self, tensors): + refs = [] + for t in tensors: + refs.append(ray.put(t, _tensor_transport="nixl")) + return refs + + def consume_with_nixl(self, refs): + tensors = [ray.get(ref) for ref in refs] + sum = 0 + for t in tensors: + assert t.device.type == "cuda" + sum += t.sum().item() + return sum + + def consume_with_object_store(self, refs): + tensors = [ray.get(ref, _tensor_transport="object_store") for ref in refs] + sum = 0 + for t in tensors: + assert t.device.type == "cuda" + sum += t.sum().item() + return sum + + def gc(self): + tensor = torch.tensor([1, 2, 3]).to("cuda") + ref = ray.put(tensor, _tensor_transport="nixl") + gpu_manager = ray._private.worker.global_worker.gpu_object_manager + assert gpu_manager.gpu_object_store.has_tensor(tensor) + del ref + gpu_manager.gpu_object_store.wait_tensor_freed(tensor, timeout=10) + assert not gpu_manager.gpu_object_store.has_tensor(tensor) + return "Success" + + +@pytest.mark.parametrize("ray_start_regular", [{"num_gpus": 2}], indirect=True) +def test_p2p(ray_start_regular): + num_actors = 2 + actors = [GPUTestActor.remote() for _ in range(num_actors)] + + src_actor, dst_actor = actors[0], actors[1] + + # Create test tensor + tensor = torch.tensor([1, 2, 3]) + + tensor1 = torch.tensor([4, 5, 6]) + + # Test GPU to GPU transfer + ref = src_actor.echo.remote(tensor, "cuda") + + # Trigger tensor transfer from src to dst actor + result = dst_actor.sum.remote(ref, "cuda") + assert tensor.sum().item() == ray.get(result, _tensor_transport="object_store") + + # Test CPU to CPU transfer + ref1 = src_actor.echo.remote(tensor1, "cpu") + result1 = dst_actor.sum.remote(ref1, "cpu") + assert tensor1.sum().item() == ray.get(result1, 
_tensor_transport="object_store") + + +@pytest.mark.parametrize("ray_start_regular", [{"num_gpus": 1}], indirect=True) +def test_intra_gpu_tensor_transfer(ray_start_regular): + actor = GPUTestActor.remote() + + tensor = torch.tensor([1, 2, 3]) + + # Intra-actor communication for pure GPU tensors + ref = actor.echo.remote(tensor, "cuda") + result = actor.sum.remote(ref, "cuda") + assert tensor.sum().item() == ray.get(result, _tensor_transport="object_store") + + +@pytest.mark.parametrize("ray_start_regular", [{"num_gpus": 2}], indirect=True) +def test_put_and_get_object_with_nixl(ray_start_regular): + actors = [GPUTestActor.remote() for _ in range(2)] + src_actor, dst_actor = actors[0], actors[1] + tensor1 = torch.tensor([1, 2, 3]).to("cuda") + tensor2 = torch.tensor([4, 5, 6, 0]).to("cuda") + tensor3 = torch.tensor([7, 8, 9, 0, 0]).to("cuda") + tensors = [tensor1, tensor2, tensor3] + ref = src_actor.produce.remote(tensors) + ref1 = dst_actor.consume_with_nixl.remote(ref) + result1 = ray.get(ref1) + assert result1 == 45 + + +@pytest.mark.parametrize("ray_start_regular", [{"num_gpus": 2}], indirect=True) +def test_put_and_get_object_with_object_store(ray_start_regular): + actors = [GPUTestActor.remote() for _ in range(2)] + src_actor, dst_actor = actors[0], actors[1] + tensor1 = torch.tensor([1, 2, 3]).to("cuda") + tensor2 = torch.tensor([4, 5, 6, 0]).to("cuda") + tensor3 = torch.tensor([7, 8, 9, 0, 0]).to("cuda") + tensors = [tensor1, tensor2, tensor3] + ref = src_actor.produce.remote(tensors) + ref1 = dst_actor.consume_with_object_store.remote(ref) + result1 = ray.get(ref1) + assert result1 == 45 + + +@pytest.mark.parametrize("ray_start_regular", [{"num_gpus": 1}], indirect=True) +def test_put_gc(ray_start_regular): + actor = GPUTestActor.remote() + ref = actor.gc.remote() + assert ray.get(ref) == "Success" + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/tests/horovod/BUILD b/python/ray/tests/horovod/BUILD.bazel 
similarity index 100% rename from python/ray/tests/horovod/BUILD rename to python/ray/tests/horovod/BUILD.bazel diff --git a/python/ray/tests/horovod/horovod_example.py b/python/ray/tests/horovod/horovod_example.py index 92b4cc1a67f6..d53a93868b82 100644 --- a/python/ray/tests/horovod/horovod_example.py +++ b/python/ray/tests/horovod/horovod_example.py @@ -1,16 +1,15 @@ # This file is duplicated in release/ml_user_tests/horovod import argparse import os -from filelock import FileLock +import horovod.torch as hvd import torch.nn as nn import torch.nn.functional as F import torch.optim as optim -from torchvision import datasets, transforms import torch.utils.data.distributed - -import horovod.torch as hvd +from filelock import FileLock from horovod.ray import RayExecutor +from torchvision import datasets, transforms def metric_average(val, name): diff --git a/python/ray/tests/horovod/test_horovod.py b/python/ray/tests/horovod/test_horovod.py index 19103f399a6d..93aceaae278b 100644 --- a/python/ray/tests/horovod/test_horovod.py +++ b/python/ray/tests/horovod/test_horovod.py @@ -9,8 +9,8 @@ pytest.importorskip("horovod") try: - from horovod.ray.runner import RayExecutor from horovod.common.util import gloo_built + from horovod.ray.runner import RayExecutor except ImportError: pass # This shouldn't be reached - the test should be skipped. 
@@ -30,11 +30,12 @@ def ray_start_4_cpus(request): def _train(batch_size=32, batch_per_iter=10): + import timeit + + import horovod.torch as hvd import torch.nn.functional as F import torch.optim as optim import torch.utils.data.distributed - import horovod.torch as hvd - import timeit hvd.init() diff --git a/python/ray/tests/kuberay/scripts/non_terminated_nodes_count.py b/python/ray/tests/kuberay/scripts/non_terminated_nodes_count.py index 86f55f67a8ff..7d64678cf679 100644 --- a/python/ray/tests/kuberay/scripts/non_terminated_nodes_count.py +++ b/python/ray/tests/kuberay/scripts/non_terminated_nodes_count.py @@ -1,6 +1,6 @@ import ray -from ray.autoscaler._private.providers import _get_node_provider from ray.autoscaler._private.kuberay.autoscaling_config import _generate_provider_config +from ray.autoscaler._private.providers import _get_node_provider @ray.remote diff --git a/python/ray/tests/kuberay/scripts/scale_up_custom.py b/python/ray/tests/kuberay/scripts/scale_up_custom.py index ada4c9eb757e..3810c635e3be 100644 --- a/python/ray/tests/kuberay/scripts/scale_up_custom.py +++ b/python/ray/tests/kuberay/scripts/scale_up_custom.py @@ -1,6 +1,7 @@ -import ray import time +import ray + def main(): """Submits custom resource request. 
diff --git a/python/ray/tests/kuberay/test_autoscaling_config.py b/python/ray/tests/kuberay/test_autoscaling_config.py index 29597fc86f8b..61a886d96758 100644 --- a/python/ray/tests/kuberay/test_autoscaling_config.py +++ b/python/ray/tests/kuberay/test_autoscaling_config.py @@ -1,23 +1,23 @@ import copy -from pathlib import Path import platform -import requests import sys +from pathlib import Path from typing import Any, Dict, Optional, Type from unittest import mock -import yaml import pytest +import requests +import yaml from ray.autoscaler._private.kuberay.autoscaling_config import ( GKE_TPU_ACCELERATOR_LABEL, GKE_TPU_TOPOLOGY_LABEL, - _derive_autoscaling_config_from_ray_cr, AutoscalingConfigProducer, - _round_up_k8s_quantity, - _get_num_tpus, + _derive_autoscaling_config_from_ray_cr, _get_custom_resources, + _get_num_tpus, _get_ray_resources_from_group_spec, + _round_up_k8s_quantity, ) from ray.autoscaler._private.kuberay.utils import tpu_node_selectors_to_type diff --git a/python/ray/tests/kuberay/test_autoscaling_e2e.py b/python/ray/tests/kuberay/test_autoscaling_e2e.py index 22d8cd811fb1..5ecc1410cb3f 100644 --- a/python/ray/tests/kuberay/test_autoscaling_e2e.py +++ b/python/ray/tests/kuberay/test_autoscaling_e2e.py @@ -7,20 +7,20 @@ import tempfile import unittest from typing import Any, Dict -import yaml import pytest +import yaml from ray.tests.kuberay.utils import ( get_pod, get_pod_names, get_raycluster, - switch_to_ray_parent_dir, + kubectl_delete, kubectl_exec_python_script, kubectl_logs, - kubectl_delete, - wait_for_pods, + switch_to_ray_parent_dir, wait_for_pod_to_start, + wait_for_pods, wait_for_ray_health, ) diff --git a/python/ray/tests/kuberay/test_kuberay_node_provider.py b/python/ray/tests/kuberay/test_kuberay_node_provider.py index 96ca4863e865..189de40b5521 100644 --- a/python/ray/tests/kuberay/test_kuberay_node_provider.py +++ b/python/ray/tests/kuberay/test_kuberay_node_provider.py @@ -1,25 +1,24 @@ import copy -from unittest import mock 
import sys +from collections import defaultdict +from pathlib import Path +from typing import List, Set +from unittest import mock import jsonpatch import pytest +import yaml -from collections import defaultdict -from ray.autoscaler.batching_node_provider import NodeData from ray.autoscaler._private.kuberay.node_provider import ( + KubeRayNodeProvider, + ScaleRequest, _worker_group_index, _worker_group_max_replicas, _worker_group_replicas, - KubeRayNodeProvider, - ScaleRequest, ) from ray.autoscaler._private.util import NodeID -from pathlib import Path -import yaml - +from ray.autoscaler.batching_node_provider import NodeData from ray.tests.kuberay.test_autoscaling_config import get_basic_ray_cr -from typing import Set, List def _get_basic_ray_cr_workers_to_delete( diff --git a/python/ray/tests/kuberay/utils.py b/python/ray/tests/kuberay/utils.py index f11f0b24b4c4..63ddd9b3578f 100644 --- a/python/ray/tests/kuberay/utils.py +++ b/python/ray/tests/kuberay/utils.py @@ -4,14 +4,14 @@ import atexit import contextlib import logging +import os import pathlib import subprocess import tempfile import time from typing import Any, Dict, Generator, List, Optional -import yaml -import os +import yaml logger = logging.getLogger(__name__) diff --git a/python/ray/tests/ludwig/BUILD b/python/ray/tests/ludwig/BUILD.bazel similarity index 100% rename from python/ray/tests/ludwig/BUILD rename to python/ray/tests/ludwig/BUILD.bazel diff --git a/python/ray/tests/ludwig/ludwig_test_utils.py b/python/ray/tests/ludwig/ludwig_test_utils.py index 069d431655ad..3b567bce129b 100644 --- a/python/ray/tests/ludwig/ludwig_test_utils.py +++ b/python/ray/tests/ludwig/ludwig_test_utils.py @@ -30,12 +30,10 @@ import cloudpickle import numpy as np import pandas as pd - from ludwig.api import LudwigModel from ludwig.backend import LocalBackend -from ludwig.constants import VECTOR, COLUMN, NAME, PROC_COLUMN -from ludwig.data.dataset_synthesizer import DATETIME_FORMATS -from 
ludwig.data.dataset_synthesizer import build_synthetic_dataset +from ludwig.constants import COLUMN, NAME, PROC_COLUMN, VECTOR +from ludwig.data.dataset_synthesizer import DATETIME_FORMATS, build_synthetic_dataset from ludwig.experiment import experiment_cli from ludwig.features.feature_utils import compute_feature_hash from ludwig.utils.data_utils import read_csv, replace_file_extension diff --git a/python/ray/tests/ludwig/test_ludwig.py b/python/ray/tests/ludwig/test_ludwig.py index a19ec33520a4..6978234a394f 100644 --- a/python/ray/tests/ludwig/test_ludwig.py +++ b/python/ray/tests/ludwig/test_ludwig.py @@ -19,8 +19,8 @@ import contextlib import os -import tempfile import sys +import tempfile import pytest @@ -47,18 +47,21 @@ if not skip: from ludwig.backend.ray import RayBackend, get_horovod_kwargs - from ray.tests.ludwig.ludwig_test_utils import create_data_set_to_use, spawn - from ray.tests.ludwig.ludwig_test_utils import bag_feature - from ray.tests.ludwig.ludwig_test_utils import binary_feature - from ray.tests.ludwig.ludwig_test_utils import category_feature - from ray.tests.ludwig.ludwig_test_utils import date_feature - from ray.tests.ludwig.ludwig_test_utils import generate_data - from ray.tests.ludwig.ludwig_test_utils import h3_feature - from ray.tests.ludwig.ludwig_test_utils import numerical_feature - from ray.tests.ludwig.ludwig_test_utils import sequence_feature - from ray.tests.ludwig.ludwig_test_utils import set_feature - from ray.tests.ludwig.ludwig_test_utils import train_with_backend - from ray.tests.ludwig.ludwig_test_utils import vector_feature + from ray.tests.ludwig.ludwig_test_utils import ( + bag_feature, + binary_feature, + category_feature, + create_data_set_to_use, + date_feature, + generate_data, + h3_feature, + numerical_feature, + sequence_feature, + set_feature, + spawn, + train_with_backend, + vector_feature, + ) else: diff --git a/python/ray/tests/mock_s3_server.py b/python/ray/tests/mock_s3_server.py index 
b935e7865c1a..f5bd792be488 100644 --- a/python/ray/tests/mock_s3_server.py +++ b/python/ray/tests/mock_s3_server.py @@ -1,12 +1,13 @@ # extracted from aioboto3 # https://github.com/terrycain/aioboto3/blob/16a1a1085191ebe6d40ee45d9588b2173738af0c/tests/mock_server.py -import pytest -import requests import shutil import signal import subprocess as sp import time +import pytest +import requests + from ray._common.network_utils import build_address _proxy_bypass = { diff --git a/python/ray/tests/modin/BUILD b/python/ray/tests/modin/BUILD.bazel similarity index 100% rename from python/ray/tests/modin/BUILD rename to python/ray/tests/modin/BUILD.bazel diff --git a/python/ray/tests/modin/modin_test_utils.py b/python/ray/tests/modin/modin_test_utils.py index 4071b536104f..5c7ec28aaaf9 100644 --- a/python/ray/tests/modin/modin_test_utils.py +++ b/python/ray/tests/modin/modin_test_utils.py @@ -16,16 +16,16 @@ # This file is copied and adapted from # http://github.com/modin-project/modin/master/modin/pandas/test/utils.py -import pandas import modin.pandas as pd +import numpy as np +import pandas from modin.utils import to_pandas from pandas.testing import ( - assert_series_equal, - assert_frame_equal, assert_extension_array_equal, + assert_frame_equal, assert_index_equal, + assert_series_equal, ) -import numpy as np def categories_equals(left, right): diff --git a/python/ray/tests/modin/test_modin.py b/python/ray/tests/modin/test_modin.py index 9f50fe465a2c..e379c2026df7 100644 --- a/python/ray/tests/modin/test_modin.py +++ b/python/ray/tests/modin/test_modin.py @@ -17,10 +17,12 @@ # http://github.com/modin-project/modin/master/modin/pandas/test/test_general.py import sys -import pytest -import pandas + import numpy as np +import pandas +import pytest from numpy.testing import assert_array_equal + from ray.tests.conftest import ray_start_regular_shared # noqa F401 modin_installed = True @@ -36,9 +38,10 @@ pytestmark = pytest.mark.skipif(skip, reason="Outdated or missing Modin 
dependency") if not skip: - from ray.tests.modin.modin_test_utils import df_equals import modin.pandas as pd + from ray.tests.modin.modin_test_utils import df_equals + @pytest.fixture(autouse=True) def connect_to_ray_cluster(ray_start_regular_shared): # noqa F811 diff --git a/python/ray/tests/runtime_env_container/test_job.py b/python/ray/tests/runtime_env_container/test_job.py index 9d20c29d8e5a..e0a04d169cc7 100644 --- a/python/ray/tests/runtime_env_container/test_job.py +++ b/python/ray/tests/runtime_env_container/test_job.py @@ -1,8 +1,8 @@ import argparse import ray -from ray.job_submission import JobStatus, JobSubmissionClient from ray._common.test_utils import wait_for_condition +from ray.job_submission import JobStatus, JobSubmissionClient parser = argparse.ArgumentParser() parser.add_argument("--image", type=str, help="The docker image to use for Ray worker") diff --git a/python/ray/tests/runtime_env_container/test_log_file_exists.py b/python/ray/tests/runtime_env_container/test_log_file_exists.py index 8d1afff7eefe..a3dcec682c01 100644 --- a/python/ray/tests/runtime_env_container/test_log_file_exists.py +++ b/python/ray/tests/runtime_env_container/test_log_file_exists.py @@ -1,9 +1,10 @@ -import ray -from pathlib import Path +import argparse import re -from ray.util.state import list_tasks +from pathlib import Path + +import ray from ray._common.test_utils import wait_for_condition -import argparse +from ray.util.state import list_tasks parser = argparse.ArgumentParser() parser.add_argument("--image", type=str, help="The docker image to use for Ray worker") diff --git a/python/ray/tests/runtime_env_container/test_put_get.py b/python/ray/tests/runtime_env_container/test_put_get.py index cc79edf58d29..048b3b863804 100644 --- a/python/ray/tests/runtime_env_container/test_put_get.py +++ b/python/ray/tests/runtime_env_container/test_put_get.py @@ -1,7 +1,9 @@ -import ray -import numpy as np import argparse +import numpy as np + +import ray + parser = 
argparse.ArgumentParser() parser.add_argument("--image", type=str, help="The docker image to use for Ray worker") parser.add_argument( diff --git a/python/ray/tests/runtime_env_container/test_serve_basic.py b/python/ray/tests/runtime_env_container/test_serve_basic.py index c68597600dbd..8175441eebed 100644 --- a/python/ray/tests/runtime_env_container/test_serve_basic.py +++ b/python/ray/tests/runtime_env_container/test_serve_basic.py @@ -1,4 +1,5 @@ import argparse + from ray import serve from ray._common.test_utils import wait_for_condition from ray.serve.handle import DeploymentHandle diff --git a/python/ray/tests/runtime_env_container/test_serve_telemetry.py b/python/ray/tests/runtime_env_container/test_serve_telemetry.py index d20f0d4c48a3..bd24b23318b2 100644 --- a/python/ray/tests/runtime_env_container/test_serve_telemetry.py +++ b/python/ray/tests/runtime_env_container/test_serve_telemetry.py @@ -5,14 +5,14 @@ import ray from ray import serve from ray._common.test_utils import wait_for_condition -from ray.serve._private.usage import ServeUsageTag -from ray.serve.context import _get_global_client -from ray.serve.schema import ServeDeploySchema from ray.serve._private.test_utils import ( TelemetryStorage, check_ray_started, check_ray_stopped, ) +from ray.serve._private.usage import ServeUsageTag +from ray.serve.context import _get_global_client +from ray.serve.schema import ServeDeploySchema parser = argparse.ArgumentParser( description="Example Python script taking command line arguments." 
diff --git a/python/ray/tests/runtime_env_container/test_shared_memory.py b/python/ray/tests/runtime_env_container/test_shared_memory.py index 622b6813fbb1..d501a41709a9 100644 --- a/python/ray/tests/runtime_env_container/test_shared_memory.py +++ b/python/ray/tests/runtime_env_container/test_shared_memory.py @@ -1,8 +1,9 @@ -import ray -import numpy as np -import sys import argparse +import sys +import numpy as np + +import ray parser = argparse.ArgumentParser() parser.add_argument("--image", type=str, help="The docker image to use for Ray worker") diff --git a/python/ray/tests/runtime_env_container/test_worker_exit_intended_system_exit_and_user_error.py b/python/ray/tests/runtime_env_container/test_worker_exit_intended_system_exit_and_user_error.py index e8280a2b9214..80db3a460459 100644 --- a/python/ray/tests/runtime_env_container/test_worker_exit_intended_system_exit_and_user_error.py +++ b/python/ray/tests/runtime_env_container/test_worker_exit_intended_system_exit_and_user_error.py @@ -1,12 +1,12 @@ +import argparse import asyncio import os -import argparse import ray -from ray._private.state_api_test_utils import verify_failed_task -from ray.util.state import list_workers from ray._common.test_utils import wait_for_condition +from ray._private.state_api_test_utils import verify_failed_task from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +from ray.util.state import list_workers parser = argparse.ArgumentParser() parser.add_argument("--image", type=str, help="The docker image to use for Ray worker") diff --git a/python/ray/tests/spark/test_GPU.py b/python/ray/tests/spark/test_GPU.py index 0d8641b75f24..d1d65ef345bf 100644 --- a/python/ray/tests/spark/test_GPU.py +++ b/python/ray/tests/spark/test_GPU.py @@ -1,21 +1,22 @@ -import sys -import pytest +import functools import os +import sys import time -import functools from abc import ABC + +import pytest from pyspark.sql import SparkSession + +import ray +from ray._common.test_utils 
import wait_for_condition from ray.tests.spark.test_basic import ( + _RAY_ON_SPARK_WORKER_PHYSICAL_MEMORY_BYTES, + _RAY_ON_SPARK_WORKER_SHARED_MEMORY_BYTES, RayOnSparkCPUClusterTestBase, _setup_ray_cluster, _setup_ray_on_spark_envs, - _RAY_ON_SPARK_WORKER_PHYSICAL_MEMORY_BYTES, - _RAY_ON_SPARK_WORKER_SHARED_MEMORY_BYTES, ) from ray.util.spark.utils import _calc_mem_per_ray_worker_node -from ray._common.test_utils import wait_for_condition - -import ray pytestmark = [ pytest.mark.skipif( diff --git a/python/ray/tests/spark/test_basic.py b/python/ray/tests/spark/test_basic.py index 3b1003e591df..bce74852f72f 100644 --- a/python/ray/tests/spark/test_basic.py +++ b/python/ray/tests/spark/test_basic.py @@ -1,32 +1,31 @@ +import logging import os +import re import shutil -import tempfile import socket -import threading -import re -import pytest import sys -from unittest import mock +import tempfile +import threading +import time from abc import ABC +from contextlib import contextmanager +from unittest import mock -import ray +import pytest +from pyspark.sql import SparkSession +import ray import ray.util.spark.cluster_init +from ray._common.test_utils import wait_for_condition from ray.util.spark import ( + MAX_NUM_WORKER_NODES, + setup_global_ray_cluster, setup_ray_cluster, shutdown_ray_cluster, - setup_global_ray_cluster, - MAX_NUM_WORKER_NODES, ) from ray.util.spark.utils import ( _calc_mem_per_ray_worker_node, ) -from pyspark.sql import SparkSession -import time -import logging -from contextlib import contextmanager -from ray._common.test_utils import wait_for_condition - pytestmark = [ pytest.mark.skipif( diff --git a/python/ray/tests/spark/test_databricks_hook.py b/python/ray/tests/spark/test_databricks_hook.py index e0e2d5ed12d8..5995e1103a68 100644 --- a/python/ray/tests/spark/test_databricks_hook.py +++ b/python/ray/tests/spark/test_databricks_hook.py @@ -1,14 +1,14 @@ +import os import sys +import time import pytest -import os -import time -import ray from 
pyspark.sql import SparkSession -from ray.util.spark import setup_ray_cluster + +import ray import ray.util.spark.databricks_hook from ray._common.test_utils import wait_for_condition - +from ray.util.spark import setup_ray_cluster pytestmark = pytest.mark.skipif( not sys.platform.startswith("linux"), diff --git a/python/ray/tests/spark/test_multicores_per_task.py b/python/ray/tests/spark/test_multicores_per_task.py index b34d93ec3616..3fb693d7466d 100644 --- a/python/ray/tests/spark/test_multicores_per_task.py +++ b/python/ray/tests/spark/test_multicores_per_task.py @@ -1,7 +1,9 @@ +import os import sys + import pytest -import os from pyspark.sql import SparkSession + from ray.tests.spark.test_basic import _setup_ray_on_spark_envs from ray.tests.spark.test_GPU import RayOnSparkGPUClusterTestBase diff --git a/python/ray/tests/spark/test_utils.py b/python/ray/tests/spark/test_utils.py index 35a516c9d4e3..a8efd615f81d 100644 --- a/python/ray/tests/spark/test_utils.py +++ b/python/ray/tests/spark/test_utils.py @@ -1,18 +1,19 @@ -from unittest.mock import patch import os import re import sys +from unittest.mock import patch import pytest -from ray.util.spark.utils import ( - get_spark_task_assigned_physical_gpus, - _calc_mem_per_ray_worker_node, - _get_avail_mem_per_ray_worker_node, -) + from ray.util.spark.cluster_init import ( + _append_default_spilling_dir_config, _convert_ray_node_options, _verify_node_options, - _append_default_spilling_dir_config, +) +from ray.util.spark.utils import ( + _calc_mem_per_ray_worker_node, + _get_avail_mem_per_ray_worker_node, + get_spark_task_assigned_physical_gpus, ) pytestmark = pytest.mark.skipif( diff --git a/python/ray/tests/test_actor.py b/python/ray/tests/test_actor.py index b1ff97c2a671..7055b78c46e9 100644 --- a/python/ray/tests/test_actor.py +++ b/python/ray/tests/test_actor.py @@ -5,23 +5,24 @@ import numpy as np import pytest -import psutil import ray from ray import cloudpickle as pickle +from ray._common.test_utils 
import SignalActor, wait_for_condition +from ray._common.utils import hex_to_binary from ray._private import ray_constants +from ray._private.state_api_test_utils import invoke_state_api, invoke_state_api_n from ray._private.test_utils import ( client_test_enabled, wait_for_pid_to_exit, ) from ray.actor import ActorClassInheritanceException -from ray.tests.client_test_utils import create_remote_signal_actor -from ray._common.test_utils import SignalActor, wait_for_condition from ray.core.generated import gcs_pb2 -from ray._common.utils import hex_to_binary -from ray._private.state_api_test_utils import invoke_state_api, invoke_state_api_n +from ray.tests.client_test_utils import create_remote_signal_actor from ray.util.state import list_actors +import psutil + @pytest.mark.parametrize("set_enable_auto_connect", [True, False], indirect=True) def test_caching_actors(shutdown_only, set_enable_auto_connect): @@ -1676,5 +1677,67 @@ def method(self): assert result == "ok" +@pytest.mark.skipif( + client_test_enabled(), + reason="Out of scope actor cleanup doesn't work with Ray client.", +) +def test_get_actor_after_same_name_actor_dead(shutdown_only): + ACTOR_NAME = "test_actor" + NAMESPACE_NAME = "test_namespace" + + ray.init(namespace=NAMESPACE_NAME) + + @ray.remote + class Actor: + def get_pid(self): + return os.getpid() + + a = Actor.options(name=ACTOR_NAME, max_restarts=0, max_task_retries=-1).remote() + + pid = ray.get(a.get_pid.remote()) + psutil.Process(pid).kill() + a_actor_id = a._actor_id.hex() + + wait_for_condition(lambda: ray.state.actors(a_actor_id)["State"] == "DEAD") + + # When a reference is held, the name cannot be reused. 
+ with pytest.raises(ValueError): + Actor.options(name=ACTOR_NAME).remote() + + # Deleting the remaining reference so the name can be reused + del a + + b = None + + def wait_new_actor_ready(): + nonlocal b + b = Actor.options(name=ACTOR_NAME).remote() + return True + + wait_for_condition(wait_new_actor_ready) + + ray.get(b.__ray_ready__.remote()) + _ = ray.get_actor(ACTOR_NAME, namespace=NAMESPACE_NAME) + + # ray.kill can proactively release the name. + ray.kill(b) + wait_for_condition(lambda: ray.state.actors(b._actor_id.hex())["State"] == "DEAD") + + c = Actor.options(name=ACTOR_NAME, lifetime="detached").remote() + ray.get(c.__ray_ready__.remote()) + _ = ray.get_actor(ACTOR_NAME, namespace=NAMESPACE_NAME) + + pid = ray.get(c.get_pid.remote()) + psutil.Process(pid).kill() + + wait_for_condition(lambda: ray.state.actors(c._actor_id.hex())["State"] == "DEAD") + + # Detached actors do not subscribe to reference counting, so + # they release the actor name when the actor is dead, without waiting for the reference count + # to be released or the execution of ray.kill. 
+ d = Actor.options(name=ACTOR_NAME).remote() + ray.get(d.__ray_ready__.remote()) + + if __name__ == "__main__": sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/tests/test_actor_advanced.py b/python/ray/tests/test_actor_advanced.py index 79b4b1c0a541..812cfd7f9557 100644 --- a/python/ray/tests/test_actor_advanced.py +++ b/python/ray/tests/test_actor_advanced.py @@ -1,16 +1,16 @@ import os import sys import time -from typing import Optional from concurrent.futures import ThreadPoolExecutor +from typing import Optional import pytest import ray import ray._private.gcs_utils as gcs_utils -from ray.util.state import list_actors import ray.cluster_utils from ray._common.test_utils import SignalActor, wait_for_condition +from ray._private.ray_constants import gcs_actor_scheduling_enabled from ray._private.test_utils import ( convert_actor_state, kill_actor_and_wait_for_failure, @@ -18,8 +18,8 @@ run_string_as_driver, wait_for_pid_to_exit, ) -from ray._private.ray_constants import gcs_actor_scheduling_enabled from ray.experimental.internal_kv import _internal_kv_get, _internal_kv_put +from ray.util.state import list_actors def test_actors_on_nodes_with_no_cpus(ray_start_no_cpu): diff --git a/python/ray/tests/test_actor_bounded_threads.py b/python/ray/tests/test_actor_bounded_threads.py index ba3d536a5851..f2fc7bf24857 100644 --- a/python/ray/tests/test_actor_bounded_threads.py +++ b/python/ray/tests/test_actor_bounded_threads.py @@ -1,13 +1,13 @@ -import sys -import os - -import ray import logging -from typing import Dict +import os +import sys from collections import Counter +from typing import Dict import pytest +import ray + logger = logging.getLogger(__name__) diff --git a/python/ray/tests/test_actor_cancel.py b/python/ray/tests/test_actor_cancel.py index 7f7042a039f2..201ec2a86eb1 100644 --- a/python/ray/tests/test_actor_cancel.py +++ b/python/ray/tests/test_actor_cancel.py @@ -1,14 +1,13 @@ import asyncio +import concurrent.futures import sys import 
time -import concurrent.futures from collections import defaultdict import pytest import ray -from ray._common.test_utils import SignalActor -from ray._common.test_utils import wait_for_condition +from ray._common.test_utils import SignalActor, wait_for_condition from ray.exceptions import TaskCancelledError from ray.util.state import list_tasks diff --git a/python/ray/tests/test_actor_failures.py b/python/ray/tests/test_actor_failures.py index 5043784d3b34..ed5b2b8a8db4 100644 --- a/python/ray/tests/test_actor_failures.py +++ b/python/ray/tests/test_actor_failures.py @@ -1,23 +1,25 @@ -import atexit import asyncio +import atexit import collections import os import signal import sys +import tempfile import time +from typing import Callable, Generator -import pytest import numpy as np +import pytest import ray -from ray.actor import exit_actor -from ray.exceptions import AsyncioActorExit import ray.cluster_utils from ray._common.test_utils import SignalActor, wait_for_condition from ray._private.test_utils import ( - wait_for_pid_to_exit, generate_system_config_map, + wait_for_pid_to_exit, ) +from ray.actor import exit_actor +from ray.exceptions import AsyncioActorExit SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM @@ -29,6 +31,32 @@ def ray_init_with_task_retry_delay(): ray.shutdown() +@pytest.fixture +def tempfile_factory() -> Generator[Callable[[], str], None, None]: + """Yields a factory function to generate tempfiles that will be deleted after the test run.""" + files = [] + + def create_temp_file(): + temp_file = tempfile.NamedTemporaryFile(delete=False) + temp_file.close() + files.append(temp_file.name) + return temp_file.name + + yield create_temp_file + + # Cleanup all created files + for file_path in files: + try: + os.unlink(file_path) + except Exception: + pass + + +def check_file_exists_and_not_empty(file_path): + """Helper to check if file exists and has content.""" + return os.path.exists(file_path) and 
os.path.getsize(file_path) > 0 + + @pytest.mark.parametrize( "ray_start_regular", [ @@ -1248,5 +1276,130 @@ def get_pid(self): assert ray.get(refs) == [3, 4, 5] +def test_actor_user_shutdown_method(ray_start_regular_shared, tempfile_factory): + """Test that __ray_shutdown__ method is called during actor termination.""" + shutdown_file = tempfile_factory() + + @ray.remote + class UserShutdownActor: + def __init__(self): + pass + + def __ray_shutdown__(self): + with open(shutdown_file, "w") as f: + f.write("ray_shutdown_called") + f.flush() + + def get_ready(self): + return "ready" + + actor = UserShutdownActor.remote() + ray.get(actor.get_ready.remote()) + actor.__ray_terminate__.remote() + + wait_for_condition(lambda: check_file_exists_and_not_empty(shutdown_file)) + + with open(shutdown_file, "r") as f: + assert f.read() == "ray_shutdown_called" + + +def test_actor_ray_shutdown_handles_exceptions( + ray_start_regular_shared, tempfile_factory +): + """Test that Ray handles unhandled exceptions in __ray_shutdown__ gracefully.""" + shutdown_file = tempfile_factory() + + @ray.remote + class ExceptionActor: + def __ray_shutdown__(self): + # Write to file before raising exception + with open(shutdown_file, "w") as f: + f.write("cleanup_started") + f.flush() + + # Let exception propagate to Ray's machinery + raise ValueError("Unhandled exception in __ray_shutdown__") + + def get_ready(self): + return "ready" + + actor = ExceptionActor.remote() + ray.get(actor.get_ready.remote()) + actor.__ray_terminate__.remote() + + # Verify that despite the exception: + # 1. File was written (cleanup started) + # 2. 
Actor shuts down properly (no system crash) + wait_for_condition(lambda: check_file_exists_and_not_empty(shutdown_file)) + + with open(shutdown_file, "r") as f: + assert f.read() == "cleanup_started" + + +def test_actor_atexit_handler_dont_conflict_with_ray_shutdown( + ray_start_regular_shared, tempfile_factory +): + """Test that atexit handler methods don't conflict with __ray_shutdown__ and both run.""" + shutdown_file = tempfile_factory() + atexit_file = tempfile_factory() + + @ray.remote + class CleanupActor: + def __init__(self): + atexit.register(self.cleanup) + + def __ray_shutdown__(self): + with open(shutdown_file, "w") as f: + f.write("ray_shutdown_called") + f.flush() + + def cleanup(self): + with open(atexit_file, "w") as f: + f.write("atexit_cleanup_called") + f.flush() + + def get_ready(self): + return "ready" + + actor = CleanupActor.remote() + ray.get(actor.get_ready.remote()) + actor.__ray_terminate__.remote() + + wait_for_condition(lambda: check_file_exists_and_not_empty(shutdown_file)) + + with open(shutdown_file, "r") as f: + assert f.read() == "ray_shutdown_called" + wait_for_condition(lambda: check_file_exists_and_not_empty(atexit_file)) + with open(atexit_file, "r") as f: + assert f.read() == "atexit_cleanup_called" + + +def test_actor_ray_shutdown_dont_interfere_with_kill( + ray_start_regular_shared, tempfile_factory +): + """Test __ray_shutdown__ is not called when actor is killed with ray.kill().""" + shutdown_file = tempfile_factory() + + @ray.remote + class KillableActor: + def __ray_shutdown__(self): + with open(shutdown_file, "w") as f: + f.write("shutdown_called_kill") + f.flush() + + def get_ready(self): + return "ready" + + def sleep_forever(self): + time.sleep(3600) + + actor = KillableActor.remote() + ray.get(actor.get_ready.remote()) + _ = actor.sleep_forever.remote() + ray.kill(actor) + + wait_for_condition(lambda: not check_file_exists_and_not_empty(shutdown_file)) + + if __name__ == "__main__": sys.exit(pytest.main(["-sv", 
__file__])) diff --git a/python/ray/tests/test_actor_lifetime.py b/python/ray/tests/test_actor_lifetime.py index 09ff2a16a13c..7d4f118f98a7 100644 --- a/python/ray/tests/test_actor_lifetime.py +++ b/python/ray/tests/test_actor_lifetime.py @@ -1,17 +1,17 @@ import os -import time import signal import sys +import time import pytest import ray from ray._common.test_utils import wait_for_condition -from ray.exceptions import RayActorError -from ray.job_config import JobConfig from ray._private.test_utils import ( wait_for_pid_to_exit, ) +from ray.exceptions import RayActorError +from ray.job_config import JobConfig SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM diff --git a/python/ray/tests/test_actor_lineage_reconstruction.py b/python/ray/tests/test_actor_lineage_reconstruction.py index 3e3fac33a507..eb06d0bd77e3 100644 --- a/python/ray/tests/test_actor_lineage_reconstruction.py +++ b/python/ray/tests/test_actor_lineage_reconstruction.py @@ -1,14 +1,13 @@ import gc import os -import sys import signal +import sys import pytest import ray from ray._common.test_utils import wait_for_condition -from ray.core.generated import gcs_pb2 -from ray.core.generated import common_pb2 +from ray.core.generated import common_pb2, gcs_pb2 @pytest.mark.parametrize("deterministic_failure", ["request", "response"]) diff --git a/python/ray/tests/test_actor_pool.py b/python/ray/tests/test_actor_pool.py index f7677deccfbf..b969933531cd 100644 --- a/python/ray/tests/test_actor_pool.py +++ b/python/ray/tests/test_actor_pool.py @@ -2,6 +2,7 @@ import sys import time from unittest.mock import MagicMock + import pytest import ray diff --git a/python/ray/tests/test_actor_retry_2.py b/python/ray/tests/test_actor_retry_2.py index fb5fde7f503a..0f06ba3d940e 100644 --- a/python/ray/tests/test_actor_retry_2.py +++ b/python/ray/tests/test_actor_retry_2.py @@ -2,11 +2,11 @@ import sys from collections import defaultdict from typing import Optional -from ray._common.test_utils 
import SignalActor import pytest import ray +from ray._common.test_utils import SignalActor class MyError(Exception): diff --git a/python/ray/tests/test_actor_state_metrics.py b/python/ray/tests/test_actor_state_metrics.py index f14af6fd8cea..c03ffe5560a2 100644 --- a/python/ray/tests/test_actor_state_metrics.py +++ b/python/ray/tests/test_actor_state_metrics.py @@ -1,6 +1,6 @@ import asyncio -import time import sys +import time from collections import defaultdict from typing import Dict @@ -9,13 +9,12 @@ import ray from ray._common.test_utils import wait_for_condition from ray._common.utils import hex_to_binary - -from ray.util.state import list_actors from ray._private.test_utils import ( raw_metrics, run_string_as_driver, ) from ray._private.worker import RayContext +from ray.util.state import list_actors _SYSTEM_CONFIG = { "metrics_report_interval_ms": 200, diff --git a/python/ray/tests/test_advanced.py b/python/ray/tests/test_advanced.py index 047e71d33c1f..ce5bb7f1e08e 100644 --- a/python/ray/tests/test_advanced.py +++ b/python/ray/tests/test_advanced.py @@ -8,18 +8,18 @@ import numpy as np import pytest -from ray._common.test_utils import wait_for_condition import ray._private.profiling as profiling import ray.cluster_utils +from ray._common.test_utils import wait_for_condition from ray._private.internal_api import ( - memory_summary, get_local_ongoing_lineage_reconstruction_tasks, + memory_summary, ) from ray._private.test_utils import ( client_test_enabled, ) -from ray.exceptions import ObjectFreedError from ray.core.generated import common_pb2 +from ray.exceptions import ObjectFreedError if client_test_enabled(): from ray.util.client import ray diff --git a/python/ray/tests/test_advanced_2.py b/python/ray/tests/test_advanced_2.py index 20647e54169b..20e7715a461a 100644 --- a/python/ray/tests/test_advanced_2.py +++ b/python/ray/tests/test_advanced_2.py @@ -10,8 +10,8 @@ import ray import ray.cluster_utils from ray._common.test_utils import 
wait_for_condition -from ray.util.placement_group import placement_group from ray.util.accelerators import AWS_NEURON_CORE +from ray.util.placement_group import placement_group from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy logger = logging.getLogger(__name__) @@ -380,9 +380,9 @@ def h(): return ray._private.worker.global_worker.node.unique_id # The g tasks should be scheduled only on the second raylet. - raylet_ids = set(ray.get([g.remote() for _ in range(50)])) - assert len(raylet_ids) == 1 - assert list(raylet_ids)[0] == custom_resource_node.unique_id + node_ids = set(ray.get([g.remote() for _ in range(50)])) + assert len(node_ids) == 1 + assert list(node_ids)[0] == custom_resource_node.unique_id # Make sure that resource bookkeeping works when a task that uses a # custom resources gets blocked. @@ -460,9 +460,9 @@ def k(): assert len(set(ray.get([g.remote() for _ in range(500)]))) == 2 # The h tasks should be scheduled only on the second raylet. - raylet_ids = set(ray.get([h.remote() for _ in range(50)])) - assert len(raylet_ids) == 1 - assert list(raylet_ids)[0] == custom_resource_node.unique_id + node_ids = set(ray.get([h.remote() for _ in range(50)])) + assert len(node_ids) == 1 + assert list(node_ids)[0] == custom_resource_node.unique_id # Make sure that tasks with unsatisfied custom resource requirements do # not get scheduled. 
diff --git a/python/ray/tests/test_advanced_3.py b/python/ray/tests/test_advanced_3.py index 55a5afc79d8e..09d0743b6a0d 100644 --- a/python/ray/tests/test_advanced_3.py +++ b/python/ray/tests/test_advanced_3.py @@ -1,21 +1,22 @@ # coding: utf-8 +import importlib import logging import os import pickle import socket import sys import time -import importlib import numpy as np import pytest -import psutil import ray import ray._private.ray_constants import ray._private.utils from ray._private.test_utils import check_call_ray, wait_for_num_actors +import psutil + logger = logging.getLogger(__name__) diff --git a/python/ray/tests/test_advanced_4.py b/python/ray/tests/test_advanced_4.py index f000ad4d7088..699757aa5592 100644 --- a/python/ray/tests/test_advanced_4.py +++ b/python/ray/tests/test_advanced_4.py @@ -6,11 +6,11 @@ import pytest import ray +from ray._common.test_utils import Semaphore, wait_for_condition from ray._private.test_utils import ( client_test_enabled, get_gcs_memory_used, ) -from ray._common.test_utils import Semaphore, wait_for_condition from ray.experimental.internal_kv import _internal_kv_list diff --git a/python/ray/tests/test_advanced_6.py b/python/ray/tests/test_advanced_6.py index 98e6edfec377..3f54d79fa95f 100644 --- a/python/ray/tests/test_advanced_6.py +++ b/python/ray/tests/test_advanced_6.py @@ -6,17 +6,18 @@ import sys import time -import psutil import pytest import ray -from ray._common.test_utils import wait_for_condition import ray.cluster_utils +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( run_string_as_driver_nonblocking, wait_for_pid_to_exit, ) +import psutil + logger = logging.getLogger(__name__) diff --git a/python/ray/tests/test_advanced_7.py b/python/ray/tests/test_advanced_7.py index 339b90fc89d3..7eb7e23fd0a9 100644 --- a/python/ray/tests/test_advanced_7.py +++ b/python/ray/tests/test_advanced_7.py @@ -6,8 +6,8 @@ import time from concurrent.futures import ThreadPoolExecutor 
-import pytest import numpy as np +import pytest import ray.cluster_utils from ray._private.test_utils import client_test_enabled diff --git a/python/ray/tests/test_advanced_8.py b/python/ray/tests/test_advanced_8.py index 6c0c32d4f1c2..0b24bac5b435 100644 --- a/python/ray/tests/test_advanced_8.py +++ b/python/ray/tests/test_advanced_8.py @@ -9,10 +9,8 @@ from unittest import mock import numpy as np -import psutil import pytest -from ray._common.utils import RESOURCE_CONSTRAINT_PREFIX import ray import ray._private.gcs_utils as gcs_utils import ray._private.ray_constants as ray_constants @@ -20,9 +18,12 @@ import ray.cluster_utils import ray.util.accelerators from ray._common.test_utils import wait_for_condition +from ray._common.utils import RESOURCE_CONSTRAINT_PREFIX from ray.dashboard import k8s_utils from ray.runtime_env import RuntimeEnv +import psutil + logger = logging.getLogger(__name__) diff --git a/python/ray/tests/test_advanced_9.py b/python/ray/tests/test_advanced_9.py index 39160849b148..dd139ebed0a3 100644 --- a/python/ray/tests/test_advanced_9.py +++ b/python/ray/tests/test_advanced_9.py @@ -1,25 +1,26 @@ import os -import psutil import subprocess import sys import pytest import ray -import ray.util.state import ray._private.ray_constants as ray_constants +import ray.util.state +from ray._common.network_utils import parse_address +from ray._common.test_utils import Semaphore, wait_for_condition from ray._private.test_utils import ( - external_redis_test_enabled, client_test_enabled, - run_string_as_driver, + external_redis_test_enabled, get_gcs_memory_used, + run_string_as_driver, run_string_as_driver_nonblocking, ) -from ray._common.network_utils import parse_address -from ray._common.test_utils import Semaphore, wait_for_condition +from ray._raylet import GCS_PID_KEY, GcsClient from ray.experimental.internal_kv import _internal_kv_list from ray.tests.conftest import call_ray_start -from ray._raylet import GcsClient, GCS_PID_KEY + +import psutil 
@pytest.fixture diff --git a/python/ray/tests/test_annotations.py b/python/ray/tests/test_annotations.py index 44569cf32f95..cc8a39ba4561 100644 --- a/python/ray/tests/test_annotations.py +++ b/python/ray/tests/test_annotations.py @@ -3,10 +3,10 @@ import pytest -from ray.util.annotations import Deprecated from ray._private.test_utils import ( run_string_as_driver, ) +from ray.util.annotations import Deprecated # Use default filterwarnings behavior for this test diff --git a/python/ray/tests/test_async.py b/python/ray/tests/test_async.py index 0b556a1cd08e..e98b6971fd77 100644 --- a/python/ray/tests/test_async.py +++ b/python/ray/tests/test_async.py @@ -4,14 +4,13 @@ import time import numpy as np - import pytest import ray +from ray._common.test_utils import wait_for_condition from ray._common.utils import ( get_or_create_event_loop, ) -from ray._common.test_utils import wait_for_condition @pytest.fixture diff --git a/python/ray/tests/test_asyncio.py b/python/ray/tests/test_asyncio.py index 904ff1bb403a..5223ffd98701 100644 --- a/python/ray/tests/test_asyncio.py +++ b/python/ray/tests/test_asyncio.py @@ -8,8 +8,8 @@ import pytest import ray -from ray._private.client_mode_hook import client_mode_should_convert from ray._common.test_utils import SignalActor, wait_for_condition +from ray._private.client_mode_hook import client_mode_should_convert from ray._private.test_utils import ( kill_actor_and_wait_for_failure, wait_for_pid_to_exit, diff --git a/python/ray/tests/test_asyncio_cluster.py b/python/ray/tests/test_asyncio_cluster.py index 280bbbae698d..52165bdbb353 100644 --- a/python/ray/tests/test_asyncio_cluster.py +++ b/python/ray/tests/test_asyncio_cluster.py @@ -2,8 +2,8 @@ import asyncio import sys -import pytest import numpy as np +import pytest import ray from ray.cluster_utils import Cluster, cluster_not_supported diff --git a/python/ray/tests/test_autoscaler.py b/python/ray/tests/test_autoscaler.py index 312c7fdcb17f..a4a6a0216b71 100644 --- 
a/python/ray/tests/test_autoscaler.py +++ b/python/ray/tests/test_autoscaler.py @@ -1,10 +1,10 @@ import copy -import logging -import sys import json +import logging import os import re import shutil +import sys import tempfile import time import unittest @@ -20,20 +20,15 @@ from jsonschema.exceptions import ValidationError import ray -from ray.tests.autoscaler_test_utils import ( - MockNode, - MockProcessRunner, - MockProvider, -) from ray.autoscaler._private import commands from ray.autoscaler._private.autoscaler import NonTerminatedNodes, StandardAutoscaler from ray.autoscaler._private.commands import get_or_create_head_node from ray.autoscaler._private.constants import ( + AUTOSCALER_HEARTBEAT_TIMEOUT_S, DISABLE_LAUNCH_CONFIG_CHECK_KEY, DISABLE_NODE_UPDATERS_KEY, FOREGROUND_NODE_LAUNCH_KEY, WORKER_LIVENESS_CHECK_KEY, - AUTOSCALER_HEARTBEAT_TIMEOUT_S, ) from ray.autoscaler._private.load_metrics import LoadMetrics from ray.autoscaler._private.monitor import Monitor @@ -62,13 +57,16 @@ TAG_RAY_NODE_STATUS, TAG_RAY_USER_NODE_TYPE, ) +from ray.core.generated import common_pb2, gcs_pb2 +from ray.exceptions import RpcError +from ray.tests.autoscaler_test_utils import ( + MockNode, + MockProcessRunner, + MockProvider, +) from ray.tests.test_batch_node_provider_unit import ( MockBatchingNodeProvider, ) -from ray.exceptions import RpcError - -from ray.core.generated import gcs_pb2, common_pb2 - WORKER_FILTER = {TAG_RAY_NODE_KIND: NODE_KIND_WORKER} @@ -107,7 +105,7 @@ def __init__(self, drain_node_outcome=DrainNodeOutcome.Succeeded): # Tracks how many times DrainNode returned a successful RPC response. self.drain_node_reply_success = 0 - def drain_nodes(self, raylet_ids_to_drain, timeout: int): + def drain_nodes(self, node_ids_to_drain, timeout: int): """Simulate NodeInfo stub's DrainNode call. Outcome determined by self.drain_outcome. 
@@ -132,28 +130,28 @@ def drain_nodes(self, raylet_ids_to_drain, timeout: int): DrainNodeOutcome.Succeeded, DrainNodeOutcome.FailedToFindIp, ]: - return raylet_ids_to_drain + return node_ids_to_drain elif self.drain_node_outcome == DrainNodeOutcome.NotAllDrained: # All but the last. - return raylet_ids_to_drain[:-1] + return node_ids_to_drain[:-1] else: # Shouldn't land here. assert False, "Possible drain node outcomes exhausted." -def mock_raylet_id() -> bytes: - """Random raylet id to pass to load_metrics.update.""" +def mock_node_id() -> bytes: + """Random node id to pass to load_metrics.update.""" return os.urandom(10) -def fill_in_raylet_ids(provider, load_metrics) -> None: - """Raylet ids for each ip are usually obtained by polling the GCS +def fill_in_node_ids(provider, load_metrics) -> None: + """Node ids for each ip are usually obtained by polling the GCS in monitor.py. For test purposes, we sometimes need to manually fill these fields with mocks. """ for node in provider.non_terminated_nodes({}): ip = provider.internal_ip(node) - load_metrics.raylet_id_by_ip[ip] = mock_raylet_id() + load_metrics.node_id_by_ip[ip] = mock_node_id() class MockAutoscaler(StandardAutoscaler): @@ -336,7 +334,7 @@ def update_nodes(self): class LoadMetricsTest(unittest.TestCase): def testHeartbeat(self): lm = LoadMetrics() - lm.update("1.1.1.1", mock_raylet_id(), {"CPU": 2}, {"CPU": 1}, 0) + lm.update("1.1.1.1", mock_node_id(), {"CPU": 2}, {"CPU": 1}, 0) lm.mark_active("2.2.2.2") assert "1.1.1.1" in lm.last_heartbeat_time_by_ip assert "2.2.2.2" in lm.last_heartbeat_time_by_ip @@ -344,13 +342,13 @@ def testHeartbeat(self): def testDebugString(self): lm = LoadMetrics() - lm.update("1.1.1.1", mock_raylet_id(), {"CPU": 2}, {"CPU": 0}, 0) + lm.update("1.1.1.1", mock_node_id(), {"CPU": 2}, {"CPU": 0}, 0) lm.update( - "2.2.2.2", mock_raylet_id(), {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2}, 0 + "2.2.2.2", mock_node_id(), {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2}, 0 ) lm.update( 
"3.3.3.3", - mock_raylet_id(), + mock_node_id(), { "memory": 1.05 * 1024 * 1024 * 1024, "object_store_memory": 2.1 * 1024 * 1024 * 1024, @@ -695,7 +693,7 @@ def testNodeTypeNameChange(self): == "ray.worker.old" ) - fill_in_raylet_ids(self.provider, lm) + fill_in_node_ids(self.provider, lm) autoscaler.update() self.waitForNodes(2) events = autoscaler.event_summarizer.summary() @@ -1383,7 +1381,7 @@ def testTerminateOutdatedNodesGracefully(self): ) self.waitForNodes(10, tag_filters=WORKER_FILTER) - fill_in_raylet_ids(self.provider, lm) + fill_in_node_ids(self.provider, lm) # Gradually scales down to meet target size, never going too low for _ in range(10): autoscaler.update() @@ -1546,7 +1544,7 @@ def _helperDynamicScaling( }, 1, ) - lm.update("172.0.0.0", mock_raylet_id(), {"CPU": 1}, {"CPU": 0}, 0) + lm.update("172.0.0.0", mock_node_id(), {"CPU": 1}, {"CPU": 0}, 0) autoscaler = MockAutoscaler( config_path, lm, @@ -1586,7 +1584,7 @@ def _helperDynamicScaling( new_config["available_node_types"]["worker"]["max_workers"] = 1 new_config["available_node_types"]["worker"]["min_workers"] = 1 self.write_config(new_config) - fill_in_raylet_ids(self.provider, lm) + fill_in_node_ids(self.provider, lm) autoscaler.update() self.waitForNodes(1, tag_filters={TAG_RAY_NODE_KIND: NODE_KIND_WORKER}) @@ -1610,7 +1608,7 @@ def _helperDynamicScaling( tag_filters={TAG_RAY_NODE_KIND: NODE_KIND_WORKER}, )[0] lm.update( - worker_ip, mock_raylet_id(), {"CPU": 1}, {"CPU": 1}, DUMMY_IDLE_DURATION_S + worker_ip, mock_node_id(), {"CPU": 1}, {"CPU": 1}, DUMMY_IDLE_DURATION_S ) autoscaler.update() @@ -1682,7 +1680,7 @@ def _helperDynamicScaling( # self.waitForNodes(1) # lm.update( # head_ip, - # mock_raylet_id(), + # mock_node_id(), # {"CPU": 1}, # {"CPU": 0}, # waiting_bundles=[{"CPU": 1}] * 7, @@ -1708,7 +1706,7 @@ def _helperDynamicScaling( # # for being idle and instantly re-created due to resource demand! 
# lm.update( # head_ip, - # mock_raylet_id(), + # mock_node_id(), # {}, # {}, # waiting_bundles=[], @@ -1772,10 +1770,10 @@ def testUnmanagedNodes(self): autoscaler.update() self.waitForNodes(2) # This node has num_cpus=0 - lm.update(head_ip, mock_raylet_id(), {"CPU": 1}, {"CPU": 0}, 0) + lm.update(head_ip, mock_node_id(), {"CPU": 1}, {"CPU": 0}, 0) lm.update( unmanaged_ip, - mock_raylet_id(), + mock_node_id(), {"CPU": 0}, {"CPU": 0}, DUMMY_IDLE_DURATION_S, @@ -1785,7 +1783,7 @@ def testUnmanagedNodes(self): # 1 CPU task cannot be scheduled. lm.update( unmanaged_ip, - mock_raylet_id(), + mock_node_id(), {"CPU": 0}, {"CPU": 0}, DUMMY_IDLE_DURATION_S, @@ -1838,10 +1836,10 @@ def testUnmanagedNodes2(self): update_interval_s=0, ) - lm.update(head_ip, mock_raylet_id(), {"CPU": 1}, {"CPU": 0}, 0) + lm.update(head_ip, mock_node_id(), {"CPU": 1}, {"CPU": 0}, 0) lm.update( unmanaged_ip, - mock_raylet_id(), + mock_node_id(), {"CPU": 0}, {"CPU": 0}, DUMMY_IDLE_DURATION_S, @@ -1896,7 +1894,7 @@ def testDelayedLaunch(self): self.provider.ready_to_create.clear() lm.update( head_ip, - mock_raylet_id(), + mock_node_id(), {"CPU": 1}, {"CPU": 0}, 0, @@ -1922,7 +1920,7 @@ def testDelayedLaunch(self): new_config = copy.deepcopy(SMALL_CLUSTER) new_config["available_node_types"]["worker"]["max_workers"] = 1 self.write_config(new_config) - fill_in_raylet_ids(self.provider, lm) + fill_in_node_ids(self.provider, lm) autoscaler.update() assert ( len( @@ -2076,7 +2074,7 @@ def testLaunchConfigChange(self): ] = "updated" self.write_config(new_config) self.provider.ready_to_create.clear() - fill_in_raylet_ids(self.provider, lm) + fill_in_node_ids(self.provider, lm) for _ in range(5): autoscaler.update() self.waitForNodes(0, tag_filters=WORKER_FILTER) @@ -2100,7 +2098,7 @@ def testIgnoresCorruptedConfig(self): 1, ) lm = LoadMetrics() - lm.update("172.0.0.0", mock_raylet_id(), {"CPU": 1}, {"CPU": 0}, 0) + lm.update("172.0.0.0", mock_node_id(), {"CPU": 1}, {"CPU": 0}, 0) mock_metrics = 
Mock(spec=AutoscalerPrometheusMetrics()) autoscaler = MockAutoscaler( config_path, @@ -2146,7 +2144,7 @@ def testIgnoresCorruptedConfig(self): # Because one worker already started, the scheduler waits for its # resources to be updated before it launches the remaining min_workers. lm.update( - worker_ip, mock_raylet_id(), {"CPU": 1}, {"CPU": 1}, DUMMY_IDLE_DURATION_S + worker_ip, mock_node_id(), {"CPU": 1}, {"CPU": 1}, DUMMY_IDLE_DURATION_S ) autoscaler.update() self.waitForNodes(10, tag_filters={TAG_RAY_NODE_KIND: NODE_KIND_WORKER}) @@ -2279,7 +2277,7 @@ def testReportsConfigFailures(self): autoscaler.update() self.waitForNodes(2, tag_filters=WORKER_FILTER) self.provider.finish_starting_nodes() - fill_in_raylet_ids(self.provider, lm) + fill_in_node_ids(self.provider, lm) autoscaler.update() try: self.waitForNodes( @@ -2412,7 +2410,7 @@ def testScaleDownMaxWorkers(self): config["available_node_types"]["p2.xlarge"]["min_workers"] = 6 # 5 config["available_node_types"]["p2.xlarge"]["max_workers"] = 6 self.write_config(config) - fill_in_raylet_ids(self.provider, lm) + fill_in_node_ids(self.provider, lm) autoscaler.update() events = autoscaler.event_summarizer.summary() self.waitFor(lambda: autoscaler.pending_launches.value == 0) @@ -2437,7 +2435,7 @@ def testScaleDownMaxWorkers(self): def testFalseyLoadMetrics(self): lm = LoadMetrics() assert not lm - lm.update("172.0.0.0", mock_raylet_id(), {"CPU": 1}, {"CPU": 0}, 0) + lm.update("172.0.0.0", mock_node_id(), {"CPU": 1}, {"CPU": 0}, 0) assert lm def testRecoverUnhealthyWorkers(self): @@ -2571,7 +2569,7 @@ def unhealthyWorkerHelper(self, disable_liveness_check: bool): autoscaler.disable_node_updaters = True # Reduce min_workers to 1 autoscaler.config["available_node_types"]["worker"]["min_workers"] = 1 - fill_in_raylet_ids(self.provider, lm) + fill_in_node_ids(self.provider, lm) if disable_liveness_check: # We've disabled the liveness check, so the unhealthy node should stick @@ -2673,7 +2671,7 @@ def 
testTerminateUnhealthyWorkers2(self): # Mark nodes unhealthy. for ip in ips: lm.last_heartbeat_time_by_ip[ip] = 0 - fill_in_raylet_ids(self.provider, lm) + fill_in_node_ids(self.provider, lm) autoscaler.update() # Unhealthy nodes are gone. self.waitForNodes(0, tag_filters=WORKER_FILTER) @@ -3408,7 +3406,7 @@ def terminate_worker_zero(): ), "Node zero still non-terminated." assert not self.provider.is_terminated("1"), "Node one terminated prematurely." - fill_in_raylet_ids(self.provider, lm) + fill_in_node_ids(self.provider, lm) autoscaler.update() # Failed updates processed are now processed. assert ( @@ -3436,7 +3434,7 @@ def terminate_worker_zero(): ), events # Should get two new nodes after the next update. - fill_in_raylet_ids(self.provider, lm) + fill_in_node_ids(self.provider, lm) autoscaler.update() self.waitForNodes(2) assert set(NonTerminatedNodes(self.provider).worker_ids) == { @@ -3619,7 +3617,7 @@ def testScaleDownIdleTimeOut(self): worker_ip = self.provider.non_terminated_node_ips(WORKER_FILTER)[0] # Mark the node as idle - lm.update(worker_ip, mock_raylet_id(), {"CPU": 1}, {"CPU": 1}, 20) + lm.update(worker_ip, mock_node_id(), {"CPU": 1}, {"CPU": 1}, 20) autoscaler.update() assert self.provider.internal_ip("1") == worker_ip events = autoscaler.event_summarizer.summary() @@ -3691,7 +3689,7 @@ def testDontScaleDownIdleTimeOutForPlacementGroups(self): worker_ip = self.provider.non_terminated_node_ips(WORKER_FILTER)[0] lm.update( worker_ip, - mock_raylet_id(), + mock_node_id(), {"CPU": 1}, {"CPU": 1}, 20, # idle for 20 seconds, which is longer than the idle_timeout_minutes. 
diff --git a/python/ray/tests/test_autoscaler_drain_node_api.py b/python/ray/tests/test_autoscaler_drain_node_api.py index 207d9f4f1dfe..abe5e97d1ce0 100644 --- a/python/ray/tests/test_autoscaler_drain_node_api.py +++ b/python/ray/tests/test_autoscaler_drain_node_api.py @@ -1,13 +1,13 @@ import logging import platform -import time import sys +import time import pytest import ray -from ray._common.test_utils import wait_for_condition import ray._private.ray_constants as ray_constants +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( get_error_message, init_error_pubsub, diff --git a/python/ray/tests/test_autoscaler_e2e.py b/python/ray/tests/test_autoscaler_e2e.py index b491a824a14a..5585413e86b9 100644 --- a/python/ray/tests/test_autoscaler_e2e.py +++ b/python/ray/tests/test_autoscaler_e2e.py @@ -4,13 +4,13 @@ import pytest import ray -from ray.autoscaler._private.constants import AUTOSCALER_METRIC_PORT from ray._common.network_utils import build_address +from ray._common.test_utils import SignalActor, wait_for_condition from ray._private.test_utils import ( - get_metric_check_condition, MetricSamplePattern, + get_metric_check_condition, ) -from ray._common.test_utils import SignalActor, wait_for_condition +from ray.autoscaler._private.constants import AUTOSCALER_METRIC_PORT from ray.autoscaler.node_launch_exception import NodeLaunchException @@ -124,12 +124,15 @@ def ping(self): actor = Actor.remote() ray.get(actor.ping.remote()) - assert "Total Demands" in subprocess.check_output("ray status", shell=True).decode() assert ( - "Total Demands" in subprocess.check_output("ray status -v", shell=True).decode() + "Pending Demands" in subprocess.check_output("ray status", shell=True).decode() + ) + assert ( + "Pending Demands" + in subprocess.check_output("ray status -v", shell=True).decode() ) assert ( - "Total Demands" + "Pending Demands" in subprocess.check_output("ray status --verbose", shell=True).decode() ) diff --git 
a/python/ray/tests/test_autoscaler_fake_multinode.py b/python/ray/tests/test_autoscaler_fake_multinode.py index 767edb2596a7..a0a1772f447d 100644 --- a/python/ray/tests/test_autoscaler_fake_multinode.py +++ b/python/ray/tests/test_autoscaler_fake_multinode.py @@ -1,7 +1,8 @@ -import time -import pytest import platform import sys +import time + +import pytest import ray from ray.cluster_utils import AutoscalingCluster diff --git a/python/ray/tests/test_autoscaler_util.py b/python/ray/tests/test_autoscaler_util.py index d4b7a1b27e73..eee85e20334c 100644 --- a/python/ray/tests/test_autoscaler_util.py +++ b/python/ray/tests/test_autoscaler_util.py @@ -1,6 +1,6 @@ import sys -import pytest +import pytest from ray.autoscaler._private.util import with_envs, with_head_node_ip diff --git a/python/ray/tests/test_autoscaler_yaml.py b/python/ray/tests/test_autoscaler_yaml.py index 7e302e47a2f8..320461cebc6a 100644 --- a/python/ray/tests/test_autoscaler_yaml.py +++ b/python/ray/tests/test_autoscaler_yaml.py @@ -5,7 +5,7 @@ import tempfile import unittest import urllib -from typing import Dict, Any +from typing import Any, Dict from unittest import mock from unittest.mock import MagicMock, Mock, patch diff --git a/python/ray/tests/test_autoscaling_policy.py b/python/ray/tests/test_autoscaling_policy.py index 34a7a5944e2b..fa7235250f28 100644 --- a/python/ray/tests/test_autoscaling_policy.py +++ b/python/ray/tests/test_autoscaling_policy.py @@ -1,39 +1,40 @@ import collections import copy import logging -import yaml -import tempfile -import sys -from typing import Dict, Callable, List import shutil -from queue import PriorityQueue +import sys +import tempfile import unittest +from queue import PriorityQueue +from typing import Callable, Dict, List + import pytest +import yaml import ray -from ray.tests.test_autoscaler import ( - MockProvider, - MockProcessRunner, - MockGcsClient, - mock_raylet_id, - MockAutoscaler, -) -from ray.tests.test_resource_demand_scheduler import 
MULTI_WORKER_CLUSTER +from ray._private.gcs_utils import PlacementGroupTableData +from ray.autoscaler._private.cli_logger import cli_logger +from ray.autoscaler._private.constants import AUTOSCALER_UPDATE_INTERVAL_S +from ray.autoscaler._private.load_metrics import LoadMetrics +from ray.autoscaler._private.node_launcher import NodeLauncher from ray.autoscaler._private.providers import ( _NODE_PROVIDERS, _clear_provider_cache, ) -from ray.autoscaler._private.load_metrics import LoadMetrics -from ray.autoscaler._private.node_launcher import NodeLauncher from ray.autoscaler.tags import ( - TAG_RAY_USER_NODE_TYPE, - TAG_RAY_NODE_KIND, NODE_KIND_HEAD, + TAG_RAY_NODE_KIND, + TAG_RAY_USER_NODE_TYPE, ) -from ray.autoscaler._private.constants import AUTOSCALER_UPDATE_INTERVAL_S -from ray.autoscaler._private.cli_logger import cli_logger from ray.core.generated.common_pb2 import Bundle, PlacementStrategy -from ray._private.gcs_utils import PlacementGroupTableData +from ray.tests.test_autoscaler import ( + MockAutoscaler, + MockGcsClient, + MockProcessRunner, + MockProvider, + mock_node_id, +) +from ray.tests.test_resource_demand_scheduler import MULTI_WORKER_CLUSTER class Task: @@ -83,7 +84,7 @@ def __init__(self, resources, in_cluster, node_type, start_time): self.in_cluster = in_cluster self.node_type = node_type self.start_time = start_time - self.raylet_id = mock_raylet_id() + self.node_id = mock_node_id() def bundle_fits(self, bundle): if not self.in_cluster: @@ -370,7 +371,7 @@ def run_autoscaler(self): continue self.load_metrics.update( ip=ip, - raylet_id=node.raylet_id, + node_id=node.node_id, static_resources=node.total_resources, dynamic_resources=node.available_resources, node_idle_duration_s=0, diff --git a/python/ray/tests/test_exceptiongroup.py b/python/ray/tests/test_baseexceptionandgroup.py similarity index 59% rename from python/ray/tests/test_exceptiongroup.py rename to python/ray/tests/test_baseexceptionandgroup.py index 88012d507355..208a4b673e9e 100644 
--- a/python/ray/tests/test_exceptiongroup.py +++ b/python/ray/tests/test_baseexceptionandgroup.py @@ -4,15 +4,127 @@ import pytest import ray -from ray.exceptions import RayTaskError +from ray.exceptions import ( + ActorDiedError, + RayTaskError, + TaskCancelledError, + WorkerCrashedError, +) + + +def test_baseexception_task(ray_start_regular_shared): + class MyBaseException(BaseException): + pass + + @ray.remote + def task(): + raise MyBaseException("abc") + + with pytest.raises(MyBaseException): + ray.get(task.remote()) + + +def test_baseexception_actor_task(ray_start_regular_shared): + class MyBaseException(BaseException): + pass + + @ray.remote + class Actor: + def f(self): + raise MyBaseException("abc") + + async def async_f(self): + raise MyBaseException("abc") + + a = Actor.remote() + with pytest.raises(MyBaseException): + ray.get(a.f.remote()) + + with pytest.raises(MyBaseException): + ray.get(a.async_f.remote()) + + +def test_baseexception_actor_creation(ray_start_regular_shared): + class MyBaseException(BaseException): + pass + + @ray.remote + class Actor: + def __init__(self): + raise MyBaseException("abc") + + with pytest.raises(ActorDiedError) as e: + a = Actor.remote() + ray.get(a.__ray_ready__.remote()) + assert "MyBaseException" in str(e.value) + + +def test_baseexception_streaming_generator(ray_start_regular_shared): + class MyBaseException(BaseException): + pass + + @ray.remote + def raise_at_beginning(): + raise MyBaseException("rip") + yield 1 + + raise_at_beginning_ref = raise_at_beginning.remote() + with pytest.raises(MyBaseException): + ray.get(next(raise_at_beginning_ref)) + + @ray.remote + def raise_at_middle(): + for i in range(1, 10): + if i == 5: + raise MyBaseException("rip") + yield i + + raise_at_middle_ref = raise_at_middle.remote() + for i in range(1, 5): + assert i == ray.get(next(raise_at_middle_ref)) + with pytest.raises(MyBaseException): + ray.get(next(raise_at_middle_ref)) + + @ray.remote(_generator_backpressure_num_objects=1) 
+ def raise_after_backpressure(): + for i in range(1, 10): + if i == 5: + raise MyBaseException("rip") + yield i + + raise_after_backpressure_ref = raise_after_backpressure.remote() + for i in range(1, 5): + assert i == ray.get(next(raise_after_backpressure_ref)) + with pytest.raises(MyBaseException): + ray.get(next(raise_after_backpressure_ref)) + + +def test_raise_system_exit(ray_start_regular_shared): + @ray.remote + def task(): + raise SystemExit("abc") + + with pytest.raises(WorkerCrashedError): + ray.get(task.remote()) + + +def test_raise_keyboard_interrupt(ray_start_regular_shared): + @ray.remote + def task(): + raise KeyboardInterrupt("abc") + + with pytest.raises(TaskCancelledError): + ray.get(task.remote()) + -pytestmark = pytest.mark.skipif( +skip_if_python_less_than_3_11 = pytest.mark.skipif( sys.version_info < (3, 11), reason="ExceptionGroup is only available in Python 3.11+", ) -def test_baseexceptiongroup_task(ray_start_regular): +@skip_if_python_less_than_3_11 +def test_baseexceptiongroup_task(ray_start_regular_shared): baseexceptiongroup = BaseExceptionGroup( # noqa: F821 "test baseexceptiongroup", [BaseException("abc")] ) @@ -21,11 +133,12 @@ def test_baseexceptiongroup_task(ray_start_regular): def task(): raise baseexceptiongroup - with pytest.raises(ray.exceptions.WorkerCrashedError): + with pytest.raises(ray.exceptions.RayTaskError): # noqa: F821 ray.get(task.remote()) -def test_baseexceptiongroup_actor(ray_start_regular): +@skip_if_python_less_than_3_11 +def test_baseexceptiongroup_actor(ray_start_regular_shared): baseexceptiongroup = BaseExceptionGroup( # noqa: F821 "test baseexceptiongroup", [BaseException("abc")] ) @@ -35,12 +148,13 @@ class Actor: def f(self): raise baseexceptiongroup - with pytest.raises(ray.exceptions.ActorDiedError): + with pytest.raises(ray.exceptions.RayTaskError): # noqa: F821 a = Actor.remote() ray.get(a.f.remote()) -def test_except_exceptiongroup(ray_start_regular): +@skip_if_python_less_than_3_11 +def 
test_except_exceptiongroup(ray_start_regular_shared): exceptiongroup = ExceptionGroup( # noqa: F821 "test exceptiongroup", [ValueError(), TypeError()] ) @@ -74,7 +188,8 @@ def f(self): assert isinstance(ex.exceptions[1], TypeError) -def test_except_star_exception(ray_start_regular): +@skip_if_python_less_than_3_11 +def test_except_star_exception(ray_start_regular_shared): @ray.remote def task(): raise ValueError @@ -126,7 +241,8 @@ def f(self): exec(python_code) -def test_except_star_exceptiongroup(ray_start_regular): +@skip_if_python_less_than_3_11 +def test_except_star_exceptiongroup(ray_start_regular_shared): exceptiongroup = ExceptionGroup( # noqa: F821 "test exceptiongroup", [ValueError(), TypeError()] ) diff --git a/python/ray/tests/test_basic.py b/python/ray/tests/test_basic.py index 68d1773adff0..968005cf4980 100644 --- a/python/ray/tests/test_basic.py +++ b/python/ray/tests/test_basic.py @@ -10,7 +10,6 @@ import pytest import ray -import psutil import ray.cluster_utils from ray._common.test_utils import SignalActor from ray._private.test_utils import ( @@ -19,6 +18,8 @@ ) from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +import psutil + logger = logging.getLogger(__name__) @@ -266,7 +267,7 @@ def get_thread_count(self): ray.get(actor.get_thread_count.remote()) # Lowering these numbers in this assert should be celebrated, # increasing these numbers should be scrutinized - assert ray.get(actor.get_thread_count.remote()) in {24, 25} + assert ray.get(actor.get_thread_count.remote()) in {24, 25, 26} # https://github.com/ray-project/ray/issues/7287 @@ -657,6 +658,59 @@ def check(): ) +# https://github.com/ray-project/ray/issues/54868 +def test_not_override_accelerator_ids_when_num_accelerators_is_zero(): + not_override_check_script = """ +import ray +ray.init() + + +@ray.remote(num_gpus=0) +def check(): + import os + assert "CUDA_VISIBLE_DEVICES" not in os.environ + +@ray.remote(num_gpus=0) +class Actor: + def check(self): + import os 
+ assert "CUDA_VISIBLE_DEVICES" not in os.environ + +print("task check", ray.get(check.remote())) +print("actor check", ray.get(Actor.options(num_gpus=0).remote().check.remote())) +""" + + run_string_as_driver( + not_override_check_script, + dict( + os.environ, + **{"RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO": "0"}, + ), + ) + + override_check_script = """ +import ray +ray.init() + + +@ray.remote(num_gpus=0) +def check(): + import os + assert os.environ.get("CUDA_VISIBLE_DEVICES") == "" + +@ray.remote(num_gpus=0) +class Actor: + def check(self): + import os + assert os.environ.get("CUDA_VISIBLE_DEVICES") == "" + +print("task check", ray.get(check.remote())) +print("actor check", ray.get(Actor.options(num_gpus=0).remote().check.remote())) +""" + + run_string_as_driver(override_check_script) + + def test_put_get(shutdown_only): ray.init(num_cpus=0) @@ -1192,6 +1246,16 @@ def f(): assert False +def test_base_exception_raised(ray_start_shared_local_modes): + @ray.remote + def f(): + raise BaseException("rip") + return 1 + + with pytest.raises(BaseException): + ray.get(f.remote()) + + def test_import_ray_does_not_import_grpc(): # First unload grpc and ray if "grpc" in sys.modules: diff --git a/python/ray/tests/test_basic_5.py b/python/ray/tests/test_basic_5.py index 765bb4d72ee8..2a1568026ebd 100644 --- a/python/ray/tests/test_basic_5.py +++ b/python/ray/tests/test_basic_5.py @@ -2,22 +2,22 @@ import gc import logging import os +import subprocess import sys import time -import subprocess -from unittest.mock import Mock, patch import unittest +from unittest.mock import Mock, patch import pytest import ray import ray.cluster_utils +from ray._common.constants import HEAD_NODE_RESOURCE_NAME from ray._private.test_utils import ( + client_test_enabled, run_string_as_driver, wait_for_pid_to_exit, - client_test_enabled, ) -from ray._common.constants import HEAD_NODE_RESOURCE_NAME logger = logging.getLogger(__name__) diff --git a/python/ray/tests/test_batch_node_provider_integration.py 
b/python/ray/tests/test_batch_node_provider_integration.py index 7a7290a3f021..19423cc2e323 100644 --- a/python/ray/tests/test_batch_node_provider_integration.py +++ b/python/ray/tests/test_batch_node_provider_integration.py @@ -1,21 +1,21 @@ """Integration/e2e test for BatchingNodeProvider. Adapts FakeMultiNodeProvider tests. """ -from copy import deepcopy +import logging import sys +from copy import deepcopy import pytest - import ray from ray._common.test_utils import wait_for_condition +from ray.autoscaler._private.constants import FOREGROUND_NODE_LAUNCH_KEY +from ray.autoscaler._private.fake_multi_node.node_provider import FakeMultiNodeProvider from ray.autoscaler.batching_node_provider import ( BatchingNodeProvider, NodeData, ScaleRequest, ) -from ray.autoscaler._private.fake_multi_node.node_provider import FakeMultiNodeProvider -from ray.autoscaler._private.constants import FOREGROUND_NODE_LAUNCH_KEY from ray.autoscaler.tags import ( NODE_KIND_WORKER, STATUS_UP_TO_DATE, @@ -25,9 +25,6 @@ ) from ray.cluster_utils import AutoscalingCluster - -import logging - logger = logging.getLogger(__name__) diff --git a/python/ray/tests/test_batch_node_provider_unit.py b/python/ray/tests/test_batch_node_provider_unit.py index a2adc8a7b0a8..7f8fceb86637 100644 --- a/python/ray/tests/test_batch_node_provider_unit.py +++ b/python/ray/tests/test_batch_node_provider_unit.py @@ -1,34 +1,34 @@ """Unit test for BatchingNodeProvider. Validates BatchingNodeProvider's book-keeping logic. 
""" -from copy import copy -from uuid import uuid4 import random import sys -from typing import Any, Dict from collections import defaultdict +from copy import copy +from typing import Any, Dict +from uuid import uuid4 import pytest +from ray.autoscaler._private.constants import ( + DISABLE_LAUNCH_CONFIG_CHECK_KEY, + DISABLE_NODE_UPDATERS_KEY, + FOREGROUND_NODE_LAUNCH_KEY, +) +from ray.autoscaler._private.util import NodeID, NodeType from ray.autoscaler.batching_node_provider import ( BatchingNodeProvider, NodeData, ScaleRequest, ) -from ray.autoscaler._private.util import NodeID, NodeType from ray.autoscaler.tags import ( + NODE_KIND_HEAD, + NODE_KIND_WORKER, STATUS_UP_TO_DATE, - TAG_RAY_USER_NODE_TYPE, TAG_RAY_NODE_KIND, TAG_RAY_NODE_STATUS, TAG_RAY_REPLICA_INDEX, - NODE_KIND_HEAD, - NODE_KIND_WORKER, -) -from ray.autoscaler._private.constants import ( - DISABLE_LAUNCH_CONFIG_CHECK_KEY, - DISABLE_NODE_UPDATERS_KEY, - FOREGROUND_NODE_LAUNCH_KEY, + TAG_RAY_USER_NODE_TYPE, ) diff --git a/python/ray/tests/test_bounded_unix_sockets.py b/python/ray/tests/test_bounded_unix_sockets.py index 0f13c2e4a08f..9bbb5e9b09a5 100644 --- a/python/ray/tests/test_bounded_unix_sockets.py +++ b/python/ray/tests/test_bounded_unix_sockets.py @@ -1,10 +1,11 @@ +import logging import sys +import pytest + import ray -import logging -import psutil -import pytest +import psutil logger = logging.getLogger(__name__) diff --git a/python/ray/tests/test_bundle_label_selector.py b/python/ray/tests/test_bundle_label_selector.py index 890fa2845dcc..c27b12eef787 100644 --- a/python/ray/tests/test_bundle_label_selector.py +++ b/python/ray/tests/test_bundle_label_selector.py @@ -1,32 +1,35 @@ -import sys import os +import sys import pytest import ray - -from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from ray._private.test_utils import placement_group_assert_no_leak +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy def 
test_bundle_label_selector_with_repeated_labels(ray_start_cluster): cluster = ray_start_cluster - num_nodes = 2 - for _ in range(num_nodes): - cluster.add_node(num_cpus=4, labels={"ray.io/accelerator-type": "A100"}) + cluster.add_node(num_cpus=4, labels={"ray.io/accelerator-type": "A100"}) + node = cluster.add_node(num_cpus=4, labels={"ray.io/accelerator-type": "TPU"}) ray.init(address=cluster.address) bundles = [{"CPU": 1}, {"CPU": 1}] - label_selector = [{"ray.io/accelerator-type": "A100"}] * 2 + label_selector = [{"ray.io/accelerator-type": "TPU"}] * 2 placement_group = ray.util.placement_group( name="repeated_labels_pg", - strategy="PACK", bundles=bundles, bundle_label_selector=label_selector, ) ray.get(placement_group.ready()) + bundles_to_node_id = ray.util.placement_group_table()[placement_group.id.hex()][ + "bundles_to_node_id" + ] + assert bundles_to_node_id[0] == node.node_id + assert bundles_to_node_id[1] == node.node_id + placement_group_assert_no_leak([placement_group]) @@ -42,7 +45,6 @@ def test_unschedulable_bundle_label_selector(ray_start_cluster): placement_group = ray.util.placement_group( name="unschedulable_labels_pg", - strategy="STRICT_PACK", bundles=bundles, bundle_label_selector=label_selector, ) @@ -53,7 +55,7 @@ def test_unschedulable_bundle_label_selector(ray_start_cluster): state = ray.util.placement_group_table()[placement_group.id.hex()]["stats"][ "scheduling_state" ] - assert state == "INFEASIBLE" + assert state == "NO_RESOURCES" def test_bundle_label_selectors_match_bundle_resources(ray_start_cluster): @@ -89,7 +91,6 @@ def test_bundle_label_selectors_match_bundle_resources(ray_start_cluster): pg = ray.util.placement_group( name="label_selectors_match_resources", - strategy="SPREAD", bundles=bundles, bundle_label_selector=bundle_label_selectors, ) diff --git a/python/ray/tests/test_cancel.py b/python/ray/tests/test_cancel.py index 5878853fcf08..54c27641247c 100644 --- a/python/ray/tests/test_cancel.py +++ 
b/python/ray/tests/test_cancel.py @@ -1,25 +1,24 @@ +import _thread import random import signal import sys import threading -import _thread import time -import numpy as np from typing import List +import numpy as np import pytest import ray +from ray._common.test_utils import SignalActor, wait_for_condition +from ray._private.utils import DeferSigint from ray.exceptions import ( - TaskCancelledError, - RayTaskError, GetTimeoutError, + RayTaskError, + TaskCancelledError, WorkerCrashedError, ) from ray.types import ObjectRef -from ray._private.utils import DeferSigint -from ray._common.test_utils import SignalActor -from ray._common.test_utils import wait_for_condition from ray.util.state import list_tasks diff --git a/python/ray/tests/test_channel.py b/python/ray/tests/test_channel.py index f2c692b1d114..ad08512276fa 100644 --- a/python/ray/tests/test_channel.py +++ b/python/ray/tests/test_channel.py @@ -1,6 +1,6 @@ # coding: utf-8 -import pickle import logging +import pickle import sys import time import traceback @@ -13,11 +13,11 @@ import ray.cluster_utils import ray.exceptions import ray.experimental.channel as ray_channel -from ray.experimental.channel.torch_tensor_type import TorchTensorType +from ray._private.test_utils import get_actor_node_id +from ray.dag.compiled_dag_node import CompiledDAG from ray.exceptions import RayChannelError, RayChannelTimeoutError +from ray.experimental.channel.torch_tensor_type import TorchTensorType from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy -from ray.dag.compiled_dag_node import CompiledDAG -from ray._private.test_utils import get_actor_node_id logger = logging.getLogger(__name__) diff --git a/python/ray/tests/test_channel_serialization.py b/python/ray/tests/test_channel_serialization.py index 36ef7eebacca..732681feb1fc 100644 --- a/python/ray/tests/test_channel_serialization.py +++ b/python/ray/tests/test_channel_serialization.py @@ -2,11 +2,12 @@ import logging import os import sys + import 
pytest -from ray.experimental.util.types import Device -from ray.experimental.channel.serialization_context import _SerializationContext import torch +from ray.experimental.channel.serialization_context import _SerializationContext +from ray.experimental.util.types import Device logger = logging.getLogger(__name__) diff --git a/python/ray/tests/test_chaos.py b/python/ray/tests/test_chaos.py index 4805e001e2ec..19458773c2f4 100644 --- a/python/ray/tests/test_chaos.py +++ b/python/ray/tests/test_chaos.py @@ -1,24 +1,24 @@ +import random import sys import time -import random import pytest import ray from ray._common.test_utils import wait_for_condition -from ray.experimental import shuffle -from ray.tests.conftest import _ray_start_chaos_cluster -from ray.util.placement_group import placement_group from ray._private.test_utils import ( RayletKiller, - get_log_message, - get_and_run_resource_killer, WorkerKillerActor, + get_and_run_resource_killer, + get_log_message, ) -from ray.exceptions import RayTaskError, ObjectLostError -from ray.util.state.common import ListApiOptions, StateResource -from ray.util.state.api import StateApiClient, list_nodes from ray.cluster_utils import AutoscalingCluster +from ray.exceptions import ObjectLostError, RayTaskError +from ray.experimental import shuffle +from ray.tests.conftest import _ray_start_chaos_cluster +from ray.util.placement_group import placement_group +from ray.util.state.api import StateApiClient, list_nodes +from ray.util.state.common import ListApiOptions, StateResource def assert_no_system_failure(p, timeout): diff --git a/python/ray/tests/test_cli.py b/python/ray/tests/test_cli.py index 782d8792b879..577c54308015 100644 --- a/python/ray/tests/test_cli.py +++ b/python/ray/tests/test_cli.py @@ -18,6 +18,7 @@ randomized each time. 
""" import glob +import json import multiprocessing as mp import multiprocessing.connection import os @@ -25,10 +26,10 @@ import sys import tempfile import threading -import json import time import uuid from contextlib import contextmanager +from http.server import BaseHTTPRequestHandler, HTTPServer from pathlib import Path from typing import Optional from unittest import mock @@ -42,17 +43,16 @@ from testfixtures.popen import MockPopen, PopenBehaviour import ray +import ray._private.ray_constants as ray_constants +import ray._private.utils as utils import ray.autoscaler._private.aws.config as aws_config import ray.autoscaler._private.constants as autoscaler_constants -import ray._private.ray_constants as ray_constants import ray.scripts.scripts as scripts -import ray._private.utils as utils -from ray.util.check_open_ports import check_open_ports from ray._common.network_utils import build_address, parse_address from ray._common.test_utils import wait_for_condition from ray.cluster_utils import cluster_not_supported +from ray.util.check_open_ports import check_open_ports from ray.util.state import list_nodes -from http.server import BaseHTTPRequestHandler, HTTPServer import psutil diff --git a/python/ray/tests/test_cli_patterns/test_ray_start.txt b/python/ray/tests/test_cli_patterns/test_ray_start.txt index 55d250d62f21..6d6df437a1ca 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_start.txt +++ b/python/ray/tests/test_cli_patterns/test_ray_start.txt @@ -14,7 +14,7 @@ Next steps ray\.init\(\) To submit a Ray job using the Ray Jobs CLI: - RAY_ADDRESS='http://.+:8265' ray job submit --working-dir \. -- python my_script\.py + RAY_API_SERVER_ADDRESS='http://.+:8265' ray job submit --working-dir \. -- python my_script\.py See https://docs\.ray\.io/en/latest/cluster/running-applications/job-submission/index\.html for more information on submitting Ray jobs to the Ray cluster. 
diff --git a/python/ray/tests/test_cli_patterns/test_ray_start_windows_osx.txt b/python/ray/tests/test_cli_patterns/test_ray_start_windows_osx.txt index b6ea1348f10f..b11b51a275e0 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_start_windows_osx.txt +++ b/python/ray/tests/test_cli_patterns/test_ray_start_windows_osx.txt @@ -15,7 +15,7 @@ Next steps ray\.init\(\) To submit a Ray job using the Ray Jobs CLI: - RAY_ADDRESS='http://.+:8265' ray job submit --working-dir \. -- python my_script\.py + RAY_API_SERVER_ADDRESS='http://.+:8265' ray job submit --working-dir \. -- python my_script\.py See https://docs\.ray\.io/en/latest/cluster/running-applications/job-submission/index\.html for more information on submitting Ray jobs to the Ray cluster. diff --git a/python/ray/tests/test_cli_patterns/test_ray_status.txt b/python/ray/tests/test_cli_patterns/test_ray_status.txt index 5cdf2e0a220a..998eacc9c3f4 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_status.txt +++ b/python/ray/tests/test_cli_patterns/test_ray_status.txt @@ -17,7 +17,7 @@ Total Usage: 0.+ 0.+ -Total Constraints: - \(no request_resources\(\) constraints\) -Total Demands: +From request_resources: + \(none\) +Pending Demands: \(no resource demands\) diff --git a/python/ray/tests/test_cli_patterns/test_ray_status_multinode.txt b/python/ray/tests/test_cli_patterns/test_ray_status_multinode.txt index c86f8cf00c89..b0ada8cd82c3 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_status_multinode.txt +++ b/python/ray/tests/test_cli_patterns/test_ray_status_multinode.txt @@ -20,7 +20,7 @@ Total Usage: 0.+ 0.+ -Total Constraints: - \(no request_resources\(\) constraints\) -Total Demands: +From request_resources: + \(none\) +Pending Demands: \(no resource demands\) diff --git a/python/ray/tests/test_cli_patterns/test_ray_status_multinode_v1.txt b/python/ray/tests/test_cli_patterns/test_ray_status_multinode_v1.txt index cd228fbc591d..537cab7f8abc 100644 --- 
a/python/ray/tests/test_cli_patterns/test_ray_status_multinode_v1.txt +++ b/python/ray/tests/test_cli_patterns/test_ray_status_multinode_v1.txt @@ -18,7 +18,7 @@ Total Usage: 0.+ 0.+ -Total Constraints: - \(no request_resources\(\) constraints\) -Total Demands: +From request_resources: + \(none\) +Pending Demands: \(no resource demands\) diff --git a/python/ray/tests/test_cli_patterns/test_ray_status_v1.txt b/python/ray/tests/test_cli_patterns/test_ray_status_v1.txt index ec5125f5eb0e..8eac046f8444 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_status_v1.txt +++ b/python/ray/tests/test_cli_patterns/test_ray_status_v1.txt @@ -15,7 +15,7 @@ Total Usage: 0.+ 0.+ -Total Constraints: - \(no request_resources\(\) constraints\) -Total Demands: +From request_resources: + \(none\) +Pending Demands: \(no resource demands\) diff --git a/python/ray/tests/test_cli_patterns/test_ray_up.txt b/python/ray/tests/test_cli_patterns/test_ray_up.txt index 30a9f52d28e9..0da266aaed4a 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_up.txt +++ b/python/ray/tests/test_cli_patterns/test_ray_up.txt @@ -19,6 +19,7 @@ Acquiring an up-to-date head node <1/1> Setting up head node Prepared bootstrap config + Autoscaler v2 is now enabled by default.+ New status: waiting-for-ssh \[1/7\] Waiting for SSH to become available Running `uptime` as a test\. diff --git a/python/ray/tests/test_cli_patterns/test_ray_up_docker.txt b/python/ray/tests/test_cli_patterns/test_ray_up_docker.txt index 30a9f52d28e9..0da266aaed4a 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_up_docker.txt +++ b/python/ray/tests/test_cli_patterns/test_ray_up_docker.txt @@ -19,6 +19,7 @@ Acquiring an up-to-date head node <1/1> Setting up head node Prepared bootstrap config + Autoscaler v2 is now enabled by default.+ New status: waiting-for-ssh \[1/7\] Waiting for SSH to become available Running `uptime` as a test\. 
diff --git a/python/ray/tests/test_cli_patterns/test_ray_up_record.txt b/python/ray/tests/test_cli_patterns/test_ray_up_record.txt index 1f6ce5e93ce3..3bbbc3b98a13 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_up_record.txt +++ b/python/ray/tests/test_cli_patterns/test_ray_up_record.txt @@ -18,6 +18,7 @@ .+\.py.*Fetching the new head node .+\.py.*<1/1> Setting up head node .+\.py.*Prepared bootstrap config +.+\.py.*Autoscaler v2 is now enabled by default.+ .+\.py.*AWSNodeProvider: Set tag ray-node-status=waiting-for-ssh on \['.+'\] \[LogTimer=.+\] .+\.py.*New status: waiting-for-ssh .+\.py.*\[1/7\] Waiting for SSH to become available @@ -73,9 +74,9 @@ .+\.py.*Full command is `ssh.+` .+\.py.*NodeUpdater: i-.+: Setup commands succeeded \[LogTimer=.+\] .+\.py.*\[7/7\] Starting the Ray runtime -.+\.py.*Running `export RAY_USAGE_STATS_ENABLED=1;export RAY_OVERRIDE_RESOURCES='{"CPU":1}';export RAY_OVERRIDE_LABELS='{"key1":"value1"}';ray stop` +.+\.py.*Running `export RAY_USAGE_STATS_ENABLED=1;export RAY_OVERRIDE_RESOURCES='{"CPU":1}';export RAY_OVERRIDE_LABELS='{"key1":"value1"}';export RAY_enable_autoscaler_v2=1; export RAY_CLOUD_INSTANCE_ID=i-.+; export RAY_NODE_TYPE_NAME=head_node; ray stop` .+\.py.*Full command is `ssh.+` -.+\.py.*Running `export RAY_USAGE_STATS_ENABLED=1;export RAY_OVERRIDE_RESOURCES='{"CPU":1}';export RAY_OVERRIDE_LABELS='{"key1":"value1"}';ray start --head --autoscaling-config=~/ray_bootstrap_config\.yaml` +.+\.py.*Running `export RAY_USAGE_STATS_ENABLED=1;export RAY_OVERRIDE_RESOURCES='{"CPU":1}';export RAY_OVERRIDE_LABELS='{"key1":"value1"}';export RAY_enable_autoscaler_v2=1; export RAY_CLOUD_INSTANCE_ID=i-.+; export RAY_NODE_TYPE_NAME=head_node; ray start --head --autoscaling-config=~/ray_bootstrap_config\.yaml` .+\.py.*Full command is `ssh.+` .+\.py.*NodeUpdater: i-.+: Ray start commands succeeded \[LogTimer=.+\] .+\.py.*NodeUpdater: i-.+: Applied config .+ \[LogTimer=.+\] diff --git a/python/ray/tests/test_client.py 
b/python/ray/tests/test_client.py index d9c52088bd97..4ff31cdfba05 100644 --- a/python/ray/tests/test_client.py +++ b/python/ray/tests/test_client.py @@ -6,8 +6,8 @@ import sys import threading import time -from unittest.mock import Mock from typing import Type +from unittest.mock import Mock import numpy as np import pytest @@ -17,12 +17,12 @@ import ray import ray.cloudpickle as cloudpickle import ray.util.client.server.server as ray_client_server +from ray._common.network_utils import build_address from ray._private.client_mode_hook import ( client_mode_should_convert, disable_client_hook, enable_client_mode, ) -from ray._common.network_utils import build_address from ray._private.test_utils import run_string_as_driver from ray.tests.client_test_utils import ( create_remote_signal_actor, diff --git a/python/ray/tests/test_client_builder.py b/python/ray/tests/test_client_builder.py index 5bcad34ca8a5..b98a565f9121 100644 --- a/python/ray/tests/test_client_builder.py +++ b/python/ray/tests/test_client_builder.py @@ -7,9 +7,9 @@ import pytest import ray -from ray._common.test_utils import wait_for_condition import ray.client_builder as client_builder import ray.util.client.server.server as ray_client_server +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( run_string_as_driver, run_string_as_driver_nonblocking, diff --git a/python/ray/tests/test_client_init.py b/python/ray/tests/test_client_init.py index a217245eed28..bcba6a5d5ce6 100644 --- a/python/ray/tests/test_client_init.py +++ b/python/ray/tests/test_client_init.py @@ -1,20 +1,18 @@ """Client tests that run their own init (as with init_and_serve) live here""" -import time import random -import sys import subprocess +import sys +import time from unittest.mock import patch import pytest -import ray.util.client.server.server as ray_client_server +import ray import ray.core.generated.ray_client_pb2 as ray_client_pb2 - -from ray.util.client import _ClientContext +import 
ray.util.client.server.server as ray_client_server from ray.cluster_utils import cluster_not_supported - -import ray +from ray.util.client import _ClientContext @ray.remote diff --git a/python/ray/tests/test_client_metadata.py b/python/ray/tests/test_client_metadata.py index 7b96588cad29..fa88466a9fab 100644 --- a/python/ray/tests/test_client_metadata.py +++ b/python/ray/tests/test_client_metadata.py @@ -2,10 +2,9 @@ import pytest -from ray.util.client.ray_client_helpers import ray_start_client_server from ray._raylet import NodeID - from ray.runtime_context import RuntimeContext +from ray.util.client.ray_client_helpers import ray_start_client_server def test_get_ray_metadata(ray_start_regular_shared): diff --git a/python/ray/tests/test_client_multi.py b/python/ray/tests/test_client_multi.py index e6c4f8ec6ca4..69e8b7254691 100644 --- a/python/ray/tests/test_client_multi.py +++ b/python/ray/tests/test_client_multi.py @@ -1,5 +1,7 @@ import sys + import pytest + import ray diff --git a/python/ray/tests/test_client_proxy.py b/python/ray/tests/test_client_proxy.py index a653db604bb7..8501cccf2c1e 100644 --- a/python/ray/tests/test_client_proxy.py +++ b/python/ray/tests/test_client_proxy.py @@ -4,17 +4,17 @@ import sys import time from glob import glob -from unittest.mock import patch, MagicMock from itertools import chain +from unittest.mock import MagicMock, patch import grpc import pytest import ray -from ray._common.test_utils import wait_for_condition import ray.core.generated.ray_client_pb2 as ray_client_pb2 -from ray._common.network_utils import parse_address import ray.util.client.server.proxier as proxier +from ray._common.network_utils import parse_address +from ray._common.test_utils import wait_for_condition from ray._private.ray_constants import REDIS_DEFAULT_PASSWORD from ray._private.test_utils import run_string_as_driver from ray.cloudpickle.compat import pickle diff --git a/python/ray/tests/test_client_reconnect.py 
b/python/ray/tests/test_client_reconnect.py index 47ec8aba8812..28361abbc9a1 100644 --- a/python/ray/tests/test_client_reconnect.py +++ b/python/ray/tests/test_client_reconnect.py @@ -1,21 +1,21 @@ -from concurrent import futures import contextlib import os -import threading +import random import sys +import threading +import time +from concurrent import futures +from typing import Any, Callable, Optional +from unittest.mock import Mock, patch + import grpc import numpy as np - -import time -import random import pytest -from typing import Any, Callable, Optional -from unittest.mock import patch, Mock import ray -from ray._common.utils import get_or_create_event_loop import ray.core.generated.ray_client_pb2 as ray_client_pb2 import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc +from ray._common.utils import get_or_create_event_loop from ray.tests.conftest import call_ray_start_context from ray.util.client.common import CLIENT_SERVER_MAX_THREADS, GRPC_OPTIONS diff --git a/python/ray/tests/test_client_warnings.py b/python/ray/tests/test_client_warnings.py index cf463eae1289..7e276c8bfe2c 100644 --- a/python/ray/tests/test_client_warnings.py +++ b/python/ray/tests/test_client_warnings.py @@ -1,8 +1,8 @@ import sys import unittest -import pytest import numpy as np +import pytest from ray.util.client.ray_client_helpers import ray_start_client_server from ray.util.debug import _logged diff --git a/python/ray/tests/test_command_runner.py b/python/ray/tests/test_command_runner.py index af6c609cd502..9832698b0d3c 100644 --- a/python/ray/tests/test_command_runner.py +++ b/python/ray/tests/test_command_runner.py @@ -4,14 +4,14 @@ import pytest -from ray.tests.test_autoscaler import MockProvider, MockProcessRunner -from ray.autoscaler.command_runner import CommandRunnerInterface from ray.autoscaler._private.command_runner import ( - SSHCommandRunner, DockerCommandRunner, + SSHCommandRunner, _with_environment_variables, ) +from ray.autoscaler.command_runner 
import CommandRunnerInterface from ray.autoscaler.sdk import get_docker_host_mount_location +from ray.tests.test_autoscaler import MockProcessRunner, MockProvider auth_config = { "ssh_user": "ray", diff --git a/python/ray/tests/test_component_failures_2.py b/python/ray/tests/test_component_failures_2.py index 59b82315f919..6ea489606386 100644 --- a/python/ray/tests/test_component_failures_2.py +++ b/python/ray/tests/test_component_failures_2.py @@ -5,8 +5,8 @@ import pytest import ray -from ray._common.test_utils import wait_for_condition import ray._private.ray_constants as ray_constants +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import get_other_nodes from ray.cluster_utils import Cluster, cluster_not_supported diff --git a/python/ray/tests/test_component_failures_3.py b/python/ray/tests/test_component_failures_3.py index d8f1cf737b9e..2a3c10d3a048 100644 --- a/python/ray/tests/test_component_failures_3.py +++ b/python/ray/tests/test_component_failures_3.py @@ -1,8 +1,8 @@ import sys import time -import pytest import numpy as np +import pytest import ray import ray._private.ray_constants as ray_constants diff --git a/python/ray/tests/test_concurrency_group.py b/python/ray/tests/test_concurrency_group.py index 2b31d5b0d0f2..4ae7eff96478 100644 --- a/python/ray/tests/test_concurrency_group.py +++ b/python/ray/tests/test_concurrency_group.py @@ -8,9 +8,9 @@ import pytest import ray +from ray._common.test_utils import SignalActor from ray._common.utils import get_or_create_event_loop from ray._private.test_utils import run_string_as_driver -from ray._common.test_utils import SignalActor # This tests the methods are executed in the correct eventloop. 
diff --git a/python/ray/tests/test_coordinator_server.py b/python/ray/tests/test_coordinator_server.py index 645b2d91c4e0..6d1fae41bc81 100644 --- a/python/ray/tests/test_coordinator_server.py +++ b/python/ray/tests/test_coordinator_server.py @@ -7,28 +7,28 @@ import pytest -from ray.autoscaler.local.coordinator_server import OnPremCoordinatorServer -from ray.autoscaler._private.providers import _NODE_PROVIDERS, _get_node_provider +from ray._common.network_utils import build_address +from ray._common.utils import get_ray_temp_dir from ray.autoscaler._private.local import config as local_config -from ray.autoscaler._private.local.node_provider import LocalNodeProvider -from ray.autoscaler._private.local.node_provider import ( - record_local_head_state_if_needed, -) from ray.autoscaler._private.local.coordinator_node_provider import ( CoordinatorSenderNodeProvider, ) -from ray._common.network_utils import build_address +from ray.autoscaler._private.local.node_provider import ( + LocalNodeProvider, + record_local_head_state_if_needed, +) +from ray.autoscaler._private.providers import _NODE_PROVIDERS, _get_node_provider +from ray.autoscaler.local.coordinator_server import OnPremCoordinatorServer from ray.autoscaler.tags import ( - TAG_RAY_NODE_KIND, + NODE_KIND_HEAD, + NODE_KIND_WORKER, + STATUS_UP_TO_DATE, TAG_RAY_CLUSTER_NAME, + TAG_RAY_NODE_KIND, TAG_RAY_NODE_NAME, - NODE_KIND_WORKER, - NODE_KIND_HEAD, - TAG_RAY_USER_NODE_TYPE, TAG_RAY_NODE_STATUS, - STATUS_UP_TO_DATE, + TAG_RAY_USER_NODE_TYPE, ) -from ray._common.utils import get_ray_temp_dir class OnPremCoordinatorServerTest(unittest.TestCase): diff --git a/python/ray/tests/test_core_worker_fault_tolerance.py b/python/ray/tests/test_core_worker_fault_tolerance.py index 578feab855b6..e66d90b2180a 100644 --- a/python/ray/tests/test_core_worker_fault_tolerance.py +++ b/python/ray/tests/test_core_worker_fault_tolerance.py @@ -1,6 +1,7 @@ -import ray import pytest +import ray + 
@pytest.mark.parametrize("deterministic_failure", ["request", "response"]) def test_get_object_status_rpc_retry_and_idempotency( diff --git a/python/ray/tests/test_cross_language.py b/python/ray/tests/test_cross_language.py index ca87cca5b3de..99cbf7dec7ce 100644 --- a/python/ray/tests/test_cross_language.py +++ b/python/ray/tests/test_cross_language.py @@ -1,6 +1,7 @@ -import pytest import sys +import pytest + import ray import ray.cluster_utils diff --git a/python/ray/tests/test_dashboard.py b/python/ray/tests/test_dashboard.py index f7d23799304e..4f6d0e958a29 100644 --- a/python/ray/tests/test_dashboard.py +++ b/python/ray/tests/test_dashboard.py @@ -4,15 +4,16 @@ import sys import time -import psutil import pytest -from ray._common.test_utils import wait_for_condition import requests import ray +from ray._common.test_utils import wait_for_condition from ray._private import ray_constants from ray._private.test_utils import run_string_as_driver +import psutil + def search_agents(cluster): all_processes = cluster.head_node.all_processes diff --git a/python/ray/tests/test_dashboard_profiler.py b/python/ray/tests/test_dashboard_profiler.py index 721dffff7fc2..7beee86cfec2 100644 --- a/python/ray/tests/test_dashboard_profiler.py +++ b/python/ray/tests/test_dashboard_profiler.py @@ -1,9 +1,10 @@ -import pytest -import subprocess import os -import requests +import subprocess import sys +import pytest +import requests + import ray from ray._private.test_utils import ( format_web_url, @@ -23,7 +24,8 @@ reason="Fails on OSX: https://github.com/ray-project/ray/issues/30114", ) @pytest.mark.parametrize("native", ["0", "1"]) -def test_profiler_endpoints(ray_start_with_dashboard, native): +@pytest.mark.parametrize("node_info", ["node_id", "ip"]) +def test_profiler_endpoints(ray_start_with_dashboard, native, node_info): # Sanity check py-spy are installed. 
subprocess.check_call(["py-spy", "--version"]) @@ -45,10 +47,19 @@ def do_stuff_infinite(self): pid = ray.get(a.getpid.remote()) a.do_stuff_infinite.remote() + node_id = ray_start_with_dashboard.address_info["node_id"] node_ip = ray_start_with_dashboard.address_info["node_ip_address"] + def get_node_info(): + if node_info == "node_id": + return f"node_id={node_id}" + else: + return f"ip={node_ip}" + def get_actor_stack(): - url = f"{webui_url}/worker/traceback?pid={pid}&ip={node_ip}&native={native}" + url = ( + f"{webui_url}/worker/traceback?pid={pid}&{get_node_info()}&native={native}" + ) print("GET URL", url) response = requests.get(url) print("STATUS CODE", response.status_code) @@ -73,7 +84,7 @@ def get_actor_stack(): def get_actor_flamegraph(): response = requests.get( - f"{webui_url}/worker/cpu_profile?pid={pid}&ip={node_ip}&native={native}" + f"{webui_url}/worker/cpu_profile?pid={pid}&{get_node_info()}&native={native}" ) response.raise_for_status() assert response.headers["Content-Type"] == "image/svg+xml", response.headers @@ -106,7 +117,8 @@ def get_actor_flamegraph(): reason="Fails on OSX, requires memray & lldb installed in osx image", ) @pytest.mark.parametrize("leaks", ["0", "1"]) -def test_memory_profiler_endpoint(ray_start_with_dashboard, leaks): +@pytest.mark.parametrize("node_info", ["node_id", "ip"]) +def test_memory_profiler_endpoint(ray_start_with_dashboard, leaks, node_info): # Sanity check memray are installed. 
subprocess.check_call(["memray", "--version"]) @@ -128,11 +140,18 @@ def do_stuff_infinite(self): pid = ray.get(a.getpid.remote()) a.do_stuff_infinite.remote() + node_id = ray_start_with_dashboard.address_info["node_id"] node_ip = ray_start_with_dashboard.address_info["node_ip_address"] + def get_node_info(): + if node_info == "node_id": + return f"node_id={node_id}" + else: + return f"ip={node_ip}" + def get_actor_memory_flamegraph(): response = requests.get( - f"{webui_url}/memory_profile?pid={pid}&ip={node_ip}&leaks={leaks}&duration=5" + f"{webui_url}/memory_profile?pid={pid}&{get_node_info()}&leaks={leaks}&duration=5" ) response.raise_for_status() @@ -156,7 +175,7 @@ def get_actor_memory_flamegraph(): def get_actor_memory_multiple_flamegraphs(): response = requests.get( - f"{webui_url}/memory_profile?pid={pid}&ip={node_ip}&leaks={leaks}&duration=5" + f"{webui_url}/memory_profile?pid={pid}&{get_node_info()}&leaks={leaks}&duration=5" ) response.raise_for_status() @@ -189,7 +208,8 @@ def get_actor_memory_multiple_flamegraphs(): sys.platform == "darwin", reason="Fails on OSX, requires memray & lldb installed in osx image", ) -def test_profiler_failure_message(ray_start_with_dashboard): +@pytest.mark.parametrize("node_info", ["node_id", "ip"]) +def test_profiler_failure_message(ray_start_with_dashboard, node_info): # Sanity check py-spy and memray is installed. 
subprocess.check_call(["py-spy", "--version"]) subprocess.check_call(["memray", "--version"]) @@ -212,10 +232,19 @@ def do_stuff_infinite(self): pid = ray.get(a.getpid.remote()) a.do_stuff_infinite.remote() + node_id = ray_start_with_dashboard.address_info["node_id"] node_ip = ray_start_with_dashboard.address_info["node_ip_address"] + def get_node_info(): + if node_info == "node_id": + return f"node_id={node_id}" + else: + return f"ip={node_ip}" + def get_actor_stack(): - response = requests.get(f"{webui_url}/worker/traceback?pid={pid}&ip={node_ip}") + response = requests.get( + f"{webui_url}/worker/traceback?pid={pid}&{get_node_info()}" + ) response.raise_for_status() content = response.content.decode("utf-8") print("CONTENT", content) @@ -230,33 +259,42 @@ def get_actor_stack(): ) # Check we return the right status code and error message on failure. - response = requests.get(f"{webui_url}/worker/traceback?pid=1234567&ip={node_ip}") + response = requests.get( + f"{webui_url}/worker/traceback?pid=1234567&{get_node_info()}" + ) content = response.content.decode("utf-8") print(content) assert "text/plain" in response.headers["Content-Type"], response.headers assert "Failed to execute" in content, content # Check we return the right status code and error message on failure. - response = requests.get(f"{webui_url}/worker/cpu_profile?pid=1234567&ip={node_ip}") + response = requests.get( + f"{webui_url}/worker/cpu_profile?pid=1234567&{get_node_info()}" + ) content = response.content.decode("utf-8") print(content) assert "text/plain" in response.headers["Content-Type"], response.headers assert "Failed to execute" in content, content # Check we return the right status code and error message on failure. 
- response = requests.get(f"{webui_url}/memory_profile?pid=1234567&ip={node_ip}") + response = requests.get(f"{webui_url}/memory_profile?pid=1234567&{get_node_info()}") content = response.content.decode("utf-8") print(content) assert "text/plain" in response.headers["Content-Type"], response.headers assert "Failed to execute" in content, content - # Check wrong ip failure - response = requests.get(f"{webui_url}/memory_profile?pid=1234567&ip=1.2.3.4") + # Check wrong ID/ip failure + if node_info == "node_id": + wrong_param = "node_id=DUMMY_ID" + expect_msg = "Failed to execute: no agent address found for node DUMMY_ID" + else: + wrong_param = "ip=1.2.3.4" + expect_msg = "Failed to execute: no agent address found for node IP 1.2.3.4" + + response = requests.get(f"{webui_url}/memory_profile?pid=1234567&{wrong_param}") content = response.content.decode("utf-8") print(content) - assert ( - "Failed to execute: no agent address found for node IP 1.2.3.4" in content - ), content + assert expect_msg in content, content if __name__ == "__main__": diff --git a/python/ray/tests/test_debug_tools.py b/python/ray/tests/test_debug_tools.py index 6117d07a059d..4a3a60cbf46c 100644 --- a/python/ray/tests/test_debug_tools.py +++ b/python/ray/tests/test_debug_tools.py @@ -6,8 +6,8 @@ import pytest import ray -import ray._private.services as services import ray._private.ray_constants as ray_constants +import ray._private.services as services from ray._common.test_utils import wait_for_condition diff --git a/python/ray/tests/test_distributed_sort.py b/python/ray/tests/test_distributed_sort.py index 036970f39179..6138e469ca2e 100644 --- a/python/ray/tests/test_distributed_sort.py +++ b/python/ray/tests/test_distributed_sort.py @@ -1,6 +1,7 @@ -import pytest import sys +import pytest + from ray.experimental.raysort import main diff --git a/python/ray/tests/test_draining.py b/python/ray/tests/test_draining.py index c94eb8b5a92c..373fca4a8824 100644 --- a/python/ray/tests/test_draining.py 
+++ b/python/ray/tests/test_draining.py @@ -1,12 +1,13 @@ import sys +import time +from collections import Counter + import pytest import ray -import time -from collections import Counter +from ray._common.test_utils import SignalActor, wait_for_condition from ray._raylet import GcsClient from ray.core.generated import autoscaler_pb2, common_pb2 -from ray._common.test_utils import wait_for_condition, SignalActor from ray.util.scheduling_strategies import ( NodeAffinitySchedulingStrategy, PlacementGroupSchedulingStrategy, diff --git a/python/ray/tests/test_exit_observability.py b/python/ray/tests/test_exit_observability.py index 715b8b9ed2f3..1b74bed2035f 100644 --- a/python/ray/tests/test_exit_observability.py +++ b/python/ray/tests/test_exit_observability.py @@ -9,8 +9,8 @@ from ray._common.test_utils import wait_for_condition from ray._private.state_api_test_utils import verify_failed_task from ray._private.test_utils import run_string_as_driver -from ray.util.state import list_workers, list_nodes, list_tasks from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +from ray.util.state import list_nodes, list_tasks, list_workers def get_worker_by_pid(pid, detail=True): diff --git a/python/ray/tests/test_experimental_collective.py b/python/ray/tests/test_experimental_collective.py index 2944be7b76c9..4a13813301e1 100644 --- a/python/ray/tests/test_experimental_collective.py +++ b/python/ray/tests/test_experimental_collective.py @@ -1,11 +1,11 @@ -import pytest import sys + +import pytest import torch import ray import ray.experimental.collective - SHAPE = (2, 2) DTYPE = torch.float16 diff --git a/python/ray/tests/test_failure.py b/python/ray/tests/test_failure.py index d2d2f650138a..a63553b4539f 100644 --- a/python/ray/tests/test_failure.py +++ b/python/ray/tests/test_failure.py @@ -1,9 +1,9 @@ +import logging import os import signal import sys -import time -import logging import threading +import time import numpy as np import pytest @@ -18,7 
+18,7 @@ get_error_message, init_error_pubsub, ) -from ray.exceptions import GetTimeoutError, RayActorError, RayTaskError, ActorDiedError +from ray.exceptions import ActorDiedError, GetTimeoutError, RayActorError, RayTaskError from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy @@ -380,26 +380,6 @@ def foo(): assert isinstance(ex, RayTaskError) -def test_baseexception_task(ray_start_regular): - @ray.remote - def task(): - raise BaseException("abc") - - with pytest.raises(ray.exceptions.WorkerCrashedError): - ray.get(task.remote()) - - -def test_baseexception_actor(ray_start_regular): - @ray.remote - class Actor: - def f(self): - raise BaseException("abc") - - with pytest.raises(ActorDiedError): - a = Actor.remote() - ray.get(a.f.remote()) - - @pytest.mark.skip("This test does not work yet.") @pytest.mark.parametrize("ray_start_object_store_memory", [10**6], indirect=True) def test_put_error1(ray_start_object_store_memory, error_pubsub): diff --git a/python/ray/tests/test_failure_2.py b/python/ray/tests/test_failure_2.py index 83231a5bd0e4..2543e56b9602 100644 --- a/python/ray/tests/test_failure_2.py +++ b/python/ray/tests/test_failure_2.py @@ -10,15 +10,15 @@ import ray import ray._private.ray_constants as ray_constants import ray._private.utils -from ray._private.ray_constants import DEBUG_AUTOSCALING_ERROR from ray._common.network_utils import parse_address +from ray._common.test_utils import Semaphore, wait_for_condition +from ray._private.ray_constants import DEBUG_AUTOSCALING_ERROR from ray._private.test_utils import ( get_error_message, get_log_batch, init_error_pubsub, run_string_as_driver_nonblocking, ) -from ray._common.test_utils import Semaphore, wait_for_condition from ray.cluster_utils import cluster_not_supported from ray.experimental.internal_kv import _internal_kv_get diff --git a/python/ray/tests/test_failure_3.py b/python/ray/tests/test_failure_3.py index 759c8c38ada6..428d2f5b00cc 100644 --- 
a/python/ray/tests/test_failure_3.py +++ b/python/ray/tests/test_failure_3.py @@ -1,21 +1,22 @@ +import json import os -import sys import signal -import time +import sys import threading -import json +import time from pathlib import Path -import ray import numpy as np import pytest -import psutil +import ray +from ray._common.test_utils import SignalActor, wait_for_condition from ray._private.test_utils import ( - wait_for_pid_to_exit, run_string_as_driver_nonblocking, + wait_for_pid_to_exit, ) -from ray._common.test_utils import SignalActor, wait_for_condition + +import psutil SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM @@ -54,7 +55,19 @@ def test_plasma_store_operation_after_raylet_dies(ray_start_cluster): (RayletDiedError). """ cluster = ray_start_cluster - cluster.add_node(num_cpus=1) + # Required for reducing the retry time of RequestWorkerLease. The call to kill the raylet will also kill the plasma store on the raylet + # meaning the call to put will fail. This will trigger worker death, and the driver will try to queue the task again and request a new worker lease + # from the now dead raylet. 
+ system_configs = { + "raylet_rpc_server_reconnect_timeout_s": 0, + "health_check_initial_delay_ms": 0, + "health_check_timeout_ms": 10, + "health_check_failure_threshold": 1, + } + cluster.add_node( + num_cpus=1, + _system_config=system_configs, + ) cluster.wait_for_nodes() ray.init(address=cluster.address) diff --git a/python/ray/tests/test_failure_4.py b/python/ray/tests/test_failure_4.py index 0c820b623751..ed68031ff614 100644 --- a/python/ray/tests/test_failure_4.py +++ b/python/ray/tests/test_failure_4.py @@ -4,24 +4,22 @@ import grpc import numpy as np -import psutil import pytest from grpc._channel import _InactiveRpcError -from ray.util.state import list_tasks -from ray._private.state_api_test_utils import verify_failed_task import ray import ray._private.ray_constants as ray_constants import ray.experimental.internal_kv as internal_kv from ray import NodeID +from ray._common.network_utils import build_address from ray._common.test_utils import SignalActor, wait_for_condition +from ray._private.state_api_test_utils import verify_failed_task from ray._private.test_utils import ( get_error_message, init_error_pubsub, - run_string_as_driver, kill_raylet, + run_string_as_driver, ) -from ray._common.network_utils import build_address from ray.cluster_utils import Cluster, cluster_not_supported from ray.core.generated import ( gcs_service_pb2, @@ -30,6 +28,9 @@ node_manager_pb2_grpc, ) from ray.exceptions import LocalRayletDiedError +from ray.util.state import list_tasks + +import psutil def search_raylet(cluster): @@ -542,7 +543,18 @@ def task(): def test_task_failure_when_driver_local_raylet_dies(ray_start_cluster): cluster = ray_start_cluster - head = cluster.add_node(num_cpus=4, resources={"foo": 1}) + # Required for reducing the retry time of RequestWorkerLease + system_configs = { + "raylet_rpc_server_reconnect_timeout_s": 0, + "health_check_initial_delay_ms": 0, + "health_check_timeout_ms": 10, + "health_check_failure_threshold": 1, + } + head = 
cluster.add_node( + num_cpus=4, + resources={"foo": 1}, + _system_config=system_configs, + ) cluster.wait_for_nodes() ray.init(address=cluster.address) diff --git a/python/ray/tests/test_gcs_fault_tolerance.py b/python/ray/tests/test_gcs_fault_tolerance.py index af14adf9a2d9..697bb51fed41 100644 --- a/python/ray/tests/test_gcs_fault_tolerance.py +++ b/python/ray/tests/test_gcs_fault_tolerance.py @@ -3,33 +3,33 @@ import signal import subprocess import sys -import time import tempfile +import time from concurrent.futures import ThreadPoolExecutor from typing import Any -from filelock import FileLock import pytest +from filelock import FileLock import ray -from ray._common.test_utils import wait_for_condition -from ray.autoscaler.v2.sdk import get_cluster_status -from ray.util.placement_group import placement_group -from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy import ray._private.gcs_utils as gcs_utils -from ray._private import ray_constants from ray._common.network_utils import parse_address +from ray._common.test_utils import wait_for_condition +from ray._private import ray_constants +from ray._private.runtime_env.plugin import RuntimeEnvPlugin from ray._private.test_utils import ( convert_actor_state, external_redis_test_enabled, generate_system_config_map, - wait_for_pid_to_exit, - run_string_as_driver, redis_sentinel_replicas, + run_string_as_driver, + wait_for_pid_to_exit, ) -from ray.job_submission import JobSubmissionClient, JobStatus from ray._raylet import GcsClient -from ray._private.runtime_env.plugin import RuntimeEnvPlugin +from ray.autoscaler.v2.sdk import get_cluster_status +from ray.job_submission import JobStatus, JobSubmissionClient +from ray.util.placement_group import placement_group +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from ray.util.state import list_placement_groups import psutil @@ -1008,6 +1008,7 @@ def test_redis_logs(external_redis): # assert "redis_context.cc" not in 
result.output finally: from click.testing import CliRunner + import ray.scripts.scripts as scripts runner = CliRunner(env={"RAY_USAGE_STATS_PROMPT_ENABLED": "0"}) diff --git a/python/ray/tests/test_gcs_ha_e2e_2.py b/python/ray/tests/test_gcs_ha_e2e_2.py index 2f8896b81dc7..e2b667913efd 100644 --- a/python/ray/tests/test_gcs_ha_e2e_2.py +++ b/python/ray/tests/test_gcs_ha_e2e_2.py @@ -1,6 +1,8 @@ -import pytest import sys from time import sleep + +import pytest + from ray._common.test_utils import wait_for_condition from ray.tests.conftest_docker import * # noqa diff --git a/python/ray/tests/test_gcs_pubsub.py b/python/ray/tests/test_gcs_pubsub.py index 4db3789cd81b..b769cb2a5da9 100644 --- a/python/ray/tests/test_gcs_pubsub.py +++ b/python/ray/tests/test_gcs_pubsub.py @@ -1,13 +1,14 @@ import asyncio +import re import sys import threading -import re + +import pytest import ray from ray._private.gcs_pubsub import ( GcsAioResourceUsageSubscriber, ) -import pytest def test_publish_and_subscribe_error_info(ray_start_regular): diff --git a/python/ray/tests/test_gcs_utils.py b/python/ray/tests/test_gcs_utils.py index 6ada3e99c4ad..7e102c42efca 100644 --- a/python/ray/tests/test_gcs_utils.py +++ b/python/ray/tests/test_gcs_utils.py @@ -6,19 +6,19 @@ import time import pytest -from ray._common.test_utils import async_wait_for_condition import redis import ray -from ray._raylet import GcsClient, NodeID import ray._private.gcs_utils as gcs_utils +import ray._private.ray_constants as ray_constants +from ray._common.network_utils import parse_address +from ray._common.test_utils import async_wait_for_condition from ray._private.test_utils import ( external_redis_test_enabled, find_free_port, generate_system_config_map, ) -from ray._common.network_utils import parse_address -import ray._private.ray_constants as ray_constants +from ray._raylet import GcsClient, NodeID # Import asyncio timeout depends on python version if sys.version_info >= (3, 11): diff --git 
a/python/ray/tests/test_generators.py b/python/ray/tests/test_generators.py index 417a491c8f30..74f14807cf43 100644 --- a/python/ray/tests/test_generators.py +++ b/python/ray/tests/test_generators.py @@ -1,18 +1,19 @@ -import pytest -import numpy as np +import gc import sys import time -import gc from unittest.mock import Mock +import numpy as np +import pytest + import ray -from ray.util.client.ray_client_helpers import ( - ray_start_client_server_for_address, +from ray._common.test_utils import ( + wait_for_condition, ) from ray._private.client_mode_hook import enable_client_mode from ray.tests.conftest import call_ray_start_context -from ray._common.test_utils import ( - wait_for_condition, +from ray.util.client.ray_client_helpers import ( + ray_start_client_server_for_address, ) diff --git a/python/ray/tests/test_get_or_create_actor.py b/python/ray/tests/test_get_or_create_actor.py index 490d9bc7a9a4..f30d66d654bc 100644 --- a/python/ray/tests/test_get_or_create_actor.py +++ b/python/ray/tests/test_get_or_create_actor.py @@ -1,5 +1,6 @@ -import sys import os +import sys + import pytest import ray diff --git a/python/ray/tests/test_global_gc.py b/python/ray/tests/test_global_gc.py index 204ecf56c601..1e6fa4cf2606 100644 --- a/python/ray/tests/test_global_gc.py +++ b/python/ray/tests/test_global_gc.py @@ -2,15 +2,18 @@ import gc import logging import sys +import time import weakref +from unittest.mock import Mock import numpy as np import pytest import ray import ray.cluster_utils -from ray._private.internal_api import global_gc from ray._common.test_utils import wait_for_condition +from ray._private.gc_collect_manager import PythonGCThread +from ray._private.internal_api import global_gc logger = logging.getLogger(__name__) @@ -216,5 +219,136 @@ def f(self): gc.enable() +def test_local_gc_called_once_per_interval(shutdown_only): + ray.init( + num_cpus=2, + _system_config={ + "local_gc_interval_s": 1, + "local_gc_min_interval_s": 0, + "global_gc_min_interval_s": 
0, + }, + ) + + class ObjectWithCyclicRef: + def __init__(self): + self.loop = self + + @ray.remote(num_cpus=1) + class GarbageHolder: + def __init__(self): + gc.disable() + self.garbage = None + + def make_garbage(self): + x = ObjectWithCyclicRef() + self.garbage = weakref.ref(x) + return True + + def has_garbage(self): + return self.garbage() is not None + + def all_garbage_collected(local_ref): + return local_ref() is None and not any( + ray.get([a.has_garbage.remote() for a in actors]) + ) + + try: + gc.disable() + + # Round 1: first batch of garbage should be collected + # Local driver. + local_ref = weakref.ref(ObjectWithCyclicRef()) + # Remote workers. + actors = [GarbageHolder.remote() for _ in range(2)] + ray.get([a.make_garbage.remote() for a in actors]) + + assert local_ref() is not None + assert all(ray.get([a.has_garbage.remote() for a in actors])) + + wait_for_condition( + lambda: all_garbage_collected(local_ref), + ) + + # Round 2: second batch should NOT be collected within min_interval + local_ref = weakref.ref(ObjectWithCyclicRef()) + ray.get([a.make_garbage.remote() for a in actors]) + + with pytest.raises(RuntimeError): + wait_for_condition( + lambda: all_garbage_collected(local_ref), + timeout=2.0, # shorter than min_interval + retry_interval_ms=50, + ) + + # Round 3: after min_interval passes, garbage should be collected + wait_for_condition( + lambda: all_garbage_collected(local_ref), + timeout=10.0, + retry_interval_ms=50, + ) + + finally: + gc.enable() + + +def test_gc_manager_thread_basic_functionality(): + mock_gc_collect = Mock(return_value=10) + + gc_thread = PythonGCThread(min_interval_s=1, gc_collect_func=mock_gc_collect) + + try: + gc_thread.start() + assert gc_thread.is_alive() + + gc_thread.trigger_gc() + + wait_for_condition(lambda: mock_gc_collect.call_count == 1, timeout=2) + + mock_gc_collect.assert_called_once() + + finally: + gc_thread.stop() + assert not gc_thread.is_alive() + + +def 
test_gc_manager_thread_min_interval_throttling(): + mock_gc_collect = Mock(return_value=5) + + gc_thread = PythonGCThread(min_interval_s=2, gc_collect_func=mock_gc_collect) + + try: + gc_thread.start() + + for _ in range(3): + gc_thread.trigger_gc() + time.sleep(1) + + wait_for_condition(lambda: mock_gc_collect.call_count == 2, timeout=2) + + assert mock_gc_collect.call_count == 2 + + finally: + gc_thread.stop() + + +def test_gc_manager_thread_exception_handling(): + mock_gc_collect = Mock(side_effect=RuntimeError("GC failed")) + + gc_thread = PythonGCThread(min_interval_s=5, gc_collect_func=mock_gc_collect) + + try: + gc_thread.start() + + for _ in range(3): + gc_thread.trigger_gc() + time.sleep(0.1) + + assert gc_thread.is_alive() + mock_gc_collect.assert_called_once() + + finally: + gc_thread.stop() + + if __name__ == "__main__": sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/tests/test_global_state.py b/python/ray/tests/test_global_state.py index d786877fb2e4..dcf9654deea5 100644 --- a/python/ray/tests/test_global_state.py +++ b/python/ray/tests/test_global_state.py @@ -1,20 +1,20 @@ import os import sys import time -from typing import Optional, Dict +from typing import Dict, Optional import pytest import ray -from ray._common.test_utils import wait_for_condition import ray._private.gcs_utils as gcs_utils import ray._private.ray_constants -from ray._raylet import GcsClient -from ray.core.generated import autoscaler_pb2 +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( convert_actor_state, make_global_state_accessor, ) +from ray._raylet import GcsClient +from ray.core.generated import autoscaler_pb2 def test_replenish_resources(ray_start_regular): diff --git a/python/ray/tests/test_grpc_client_credentials.py b/python/ray/tests/test_grpc_client_credentials.py index 7109ac9fe0dd..910ca0e8a84d 100644 --- a/python/ray/tests/test_grpc_client_credentials.py +++ 
b/python/ray/tests/test_grpc_client_credentials.py @@ -1,7 +1,7 @@ import sys -import pytest import grpc +import pytest from ray.util.client.worker import Worker diff --git a/python/ray/tests/test_healthcheck.py b/python/ray/tests/test_healthcheck.py index 86e83c40674d..f9eac95143e8 100644 --- a/python/ray/tests/test_healthcheck.py +++ b/python/ray/tests/test_healthcheck.py @@ -5,12 +5,13 @@ import sys import time -import psutil import pytest import ray from ray._private import ray_constants +import psutil + logger = logging.getLogger(__name__) diff --git a/python/ray/tests/test_ids.py b/python/ray/tests/test_ids.py index a33407e20c95..024d5677d66c 100644 --- a/python/ray/tests/test_ids.py +++ b/python/ray/tests/test_ids.py @@ -1,17 +1,19 @@ -import sys import os +import sys + +import pytest + from ray import ( + ActorClassID, ActorID, + ClusterID, + FunctionID, JobID, - TaskID, NodeID, - WorkerID, - FunctionID, - ActorClassID, - ClusterID, PlacementGroupID, + TaskID, + WorkerID, ) -import pytest @pytest.mark.parametrize( diff --git a/python/ray/tests/test_iter.py b/python/ray/tests/test_iter.py index 979be1d744ae..3c8f011baf18 100644 --- a/python/ray/tests/test_iter.py +++ b/python/ray/tests/test_iter.py @@ -1,19 +1,20 @@ +import collections import sys import time -import collections from collections import Counter + import pytest import ray +from ray._common.test_utils import Semaphore from ray.util.iter import ( + LocalIterator, + ParallelIteratorWorker, + from_actors, from_items, from_iterators, from_range, - from_actors, - ParallelIteratorWorker, - LocalIterator, ) -from ray._common.test_utils import Semaphore def test_select_shards(ray_start_regular_shared): diff --git a/python/ray/tests/test_job.py b/python/ray/tests/test_job.py index 16e5071381c9..4f2efb9494db 100644 --- a/python/ray/tests/test_job.py +++ b/python/ray/tests/test_job.py @@ -1,28 +1,27 @@ +import json import os +import re import subprocess import sys import tempfile import time -import re 
-import json - -from subprocess import Popen, PIPE, STDOUT, list2cmdline +from subprocess import PIPE, STDOUT, Popen, list2cmdline from typing import List -import pytest -from ray._common.test_utils import wait_for_condition -import ray.cloudpickle as pickle +import pytest import ray +import ray.cloudpickle as pickle +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( + format_web_url, run_string_as_driver, run_string_as_driver_nonblocking, - format_web_url, wait_for_pid_to_exit, ) +from ray.dashboard.modules.job.pydantic_models import JobDetails from ray.job_config import JobConfig, LoggingConfig from ray.job_submission import JobStatus, JobSubmissionClient -from ray.dashboard.modules.job.pydantic_models import JobDetails def execute_driver(commands: List[str], input: bytes = None): diff --git a/python/ray/tests/test_joblib.py b/python/ray/tests/test_joblib.py index 48b7fffa93d6..8d35e148ce76 100644 --- a/python/ray/tests/test_joblib.py +++ b/python/ray/tests/test_joblib.py @@ -1,30 +1,26 @@ +import os +import pickle import sys import time -import os from unittest import mock import joblib -import pickle -import pytest import numpy as np - +import pytest from sklearn.datasets import load_digits, load_iris -from sklearn.model_selection import RandomizedSearchCV -from sklearn.ensemble import ExtraTreesClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.kernel_approximation import Nystroem -from sklearn.kernel_approximation import RBFSampler -from sklearn.pipeline import make_pipeline -from sklearn.svm import LinearSVC, SVC -from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier +from sklearn.kernel_approximation import Nystroem, RBFSampler from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import RandomizedSearchCV, cross_val_score from sklearn.neural_network import MLPClassifier -from 
sklearn.model_selection import cross_val_score +from sklearn.pipeline import make_pipeline +from sklearn.svm import SVC, LinearSVC +from sklearn.tree import DecisionTreeClassifier import ray +from ray._common.test_utils import wait_for_condition from ray.util.joblib import register_ray from ray.util.joblib.ray_backend import RayBackend -from ray._common.test_utils import wait_for_condition def test_register_ray(): diff --git a/python/ray/tests/test_kill_raylet_signal_log.py b/python/ray/tests/test_kill_raylet_signal_log.py index 28670507ada8..abc01680cea8 100644 --- a/python/ray/tests/test_kill_raylet_signal_log.py +++ b/python/ray/tests/test_kill_raylet_signal_log.py @@ -1,44 +1,38 @@ import signal import sys -# Import psutil after ray so the packaged version is used. -import psutil import pytest import ray from ray._common.test_utils import wait_for_condition +import psutil + def get_pid(name): pids = psutil.process_iter() for pid in pids: if name in pid.name(): return pid.pid - return -1 -def check_result(filename, num_signal, check_key): - ray.init(num_cpus=1) +@pytest.mark.skipif(sys.platform == "win32", reason="Not support on Windows.") +def test_kill_raylet_signal_log(ray_start_regular): session_dir = ray._private.worker._global_node.get_session_dir_path() - raylet_out_path = filename.format(session_dir) + raylet_out_path = "{}/logs/raylet.err".format(session_dir) pid = get_pid("raylet") assert pid > 0 p = psutil.Process(pid) - p.send_signal(num_signal) + p.send_signal(signal.SIGABRT) p.wait(timeout=15) - def check_file(): + def check_for_sigabrt_in_log(): with open(raylet_out_path) as f: s = f.read() - return check_key in s - - wait_for_condition(check_file) + return "SIGABRT" in s - -@pytest.mark.skipif(sys.platform == "win32", reason="Not support on Windows.") -def test_kill_raylet_signal_log(shutdown_only): - check_result("{}/logs/raylet.err", signal.SIGABRT, "SIGABRT") + wait_for_condition(check_for_sigabrt_in_log) if __name__ == "__main__": diff --git 
a/python/ray/tests/test_kill_subprocesses.py b/python/ray/tests/test_kill_subprocesses.py index 9defa7fc5108..e83db14f4780 100644 --- a/python/ray/tests/test_kill_subprocesses.py +++ b/python/ray/tests/test_kill_subprocesses.py @@ -1,14 +1,17 @@ -import ray -import pytest -import multiprocessing -import subprocess -import time -import psutil import logging +import multiprocessing import os +import subprocess import sys +import time + +import pytest + +import ray from ray._common.test_utils import wait_for_condition +import psutil + logger = logging.getLogger(__name__) diff --git a/python/ray/tests/test_label_utils.py b/python/ray/tests/test_label_utils.py index 8d6c3226cd22..24c78685e6e2 100644 --- a/python/ray/tests/test_label_utils.py +++ b/python/ray/tests/test_label_utils.py @@ -1,22 +1,22 @@ -from contextlib import contextmanager import json import os import sys import tempfile +from contextlib import contextmanager from typing import ContextManager, Dict, Optional, Union import pytest from ray._private.label_utils import ( + parse_node_labels_from_yaml_file, parse_node_labels_json, parse_node_labels_string, - parse_node_labels_from_yaml_file, - validate_node_labels, validate_label_key, - validate_label_value, validate_label_selector, validate_label_selector_value, + validate_label_value, validate_node_label_syntax, + validate_node_labels, ) diff --git a/python/ray/tests/test_list_actors.py b/python/ray/tests/test_list_actors.py index d937af64fcd2..305749d703c9 100644 --- a/python/ray/tests/test_list_actors.py +++ b/python/ray/tests/test_list_actors.py @@ -1,6 +1,7 @@ -import pytest import sys +import pytest + import ray from ray._common.test_utils import wait_for_condition diff --git a/python/ray/tests/test_list_actors_2.py b/python/ray/tests/test_list_actors_2.py index dca2e27e9685..cc303ff7089a 100644 --- a/python/ray/tests/test_list_actors_2.py +++ b/python/ray/tests/test_list_actors_2.py @@ -1,7 +1,8 @@ import os -import pytest import sys +import pytest + 
import ray from ray._common.test_utils import wait_for_condition diff --git a/python/ray/tests/test_list_actors_3.py b/python/ray/tests/test_list_actors_3.py index dd3a416459d1..4e1484512882 100644 --- a/python/ray/tests/test_list_actors_3.py +++ b/python/ray/tests/test_list_actors_3.py @@ -1,6 +1,7 @@ -import pytest import sys +import pytest + import ray from ray._private.test_utils import run_string_as_driver diff --git a/python/ray/tests/test_list_actors_4.py b/python/ray/tests/test_list_actors_4.py index 0527de580b2d..e8dc604ac6c1 100644 --- a/python/ray/tests/test_list_actors_4.py +++ b/python/ray/tests/test_list_actors_4.py @@ -1,8 +1,9 @@ import asyncio -import pytest import sys import time +import pytest + import ray from ray._private.test_utils import run_string_as_driver diff --git a/python/ray/tests/test_logging.py b/python/ray/tests/test_logging.py index e3cd9160fccc..837ebd9b8e95 100644 --- a/python/ray/tests/test_logging.py +++ b/python/ray/tests/test_logging.py @@ -1,16 +1,16 @@ import io +import logging import os import re import subprocess import sys import tempfile import time -import logging from collections import Counter, defaultdict from contextlib import redirect_stderr, redirect_stdout from pathlib import Path from typing import Dict, List, Tuple -from unittest.mock import Mock, MagicMock, patch +from unittest.mock import MagicMock, Mock, patch import colorama import pytest @@ -18,6 +18,13 @@ import ray from ray._common.test_utils import wait_for_condition from ray._private import ray_constants +from ray._private.log_monitor import ( + LOG_NAME_UPDATE_INTERVAL_S, + RAY_LOG_MONITOR_MANY_FILES_THRESHOLD, + LogFileInfo, + LogMonitor, + is_proc_alive, +) from ray._private.ray_constants import ( PROCESS_TYPE_DASHBOARD, PROCESS_TYPE_DASHBOARD_AGENT, @@ -26,30 +33,23 @@ PROCESS_TYPE_MONITOR, PROCESS_TYPE_PYTHON_CORE_WORKER, PROCESS_TYPE_PYTHON_CORE_WORKER_DRIVER, - PROCESS_TYPE_RAYLET, PROCESS_TYPE_RAY_CLIENT_SERVER, + PROCESS_TYPE_RAYLET, 
PROCESS_TYPE_REAPER, PROCESS_TYPE_REDIS_SERVER, PROCESS_TYPE_RUNTIME_ENV_AGENT, PROCESS_TYPE_WORKER, ) -from ray._private.log_monitor import ( - LOG_NAME_UPDATE_INTERVAL_S, - RAY_LOG_MONITOR_MANY_FILES_THRESHOLD, - LogFileInfo, - LogMonitor, - is_proc_alive, -) from ray._private.test_utils import ( get_log_batch, - get_log_message, get_log_data, + get_log_message, init_log_pubsub, run_string_as_driver, ) -from ray.cross_language import java_actor_class -from ray.autoscaler._private.cli_logger import cli_logger from ray._private.worker import print_worker_logs +from ray.autoscaler._private.cli_logger import cli_logger +from ray.cross_language import java_actor_class def set_logging_config(monkeypatch, max_bytes, backup_count): diff --git a/python/ray/tests/test_logging_2.py b/python/ray/tests/test_logging_2.py index 69bc24c8bdd1..86c496c66532 100644 --- a/python/ray/tests/test_logging_2.py +++ b/python/ray/tests/test_logging_2.py @@ -1,243 +1,12 @@ -import logging.config -import pytest -import ray -import logging import sys -import json -from ray._private.ray_logging.filters import CoreContextFilter -from ray._private.ray_logging.formatters import JSONFormatter, TextFormatter +import pytest + +import ray from ray._private.ray_logging.logging_config import LoggingConfig from ray._private.test_utils import run_string_as_driver -class TestCoreContextFilter: - def test_driver_process(self, shutdown_only): - log_context = ["job_id", "worker_id", "node_id"] - filter = CoreContextFilter() - record = logging.makeLogRecord({}) - assert filter.filter(record) - # Ray is not initialized so no context - for attr in log_context: - assert not hasattr(record, attr) - assert hasattr(record, "_ray_timestamp_ns") - - ray.init() - record = logging.makeLogRecord({}) - assert filter.filter(record) - runtime_context = ray.get_runtime_context() - expected_values = { - "job_id": runtime_context.get_job_id(), - "worker_id": runtime_context.get_worker_id(), - "node_id": 
runtime_context.get_node_id(), - } - for attr in log_context: - assert hasattr(record, attr) - assert getattr(record, attr) == expected_values[attr] - # This is not a worker process, so actor_id and task_id should not exist. - for attr in ["actor_id", "task_id"]: - assert not hasattr(record, attr) - assert hasattr(record, "_ray_timestamp_ns") - - def test_task_process(self, shutdown_only): - @ray.remote - def f(): - filter = CoreContextFilter() - record = logging.makeLogRecord({}) - assert filter.filter(record) - should_exist = ["job_id", "worker_id", "node_id", "task_id"] - runtime_context = ray.get_runtime_context() - expected_values = { - "job_id": runtime_context.get_job_id(), - "worker_id": runtime_context.get_worker_id(), - "node_id": runtime_context.get_node_id(), - "task_id": runtime_context.get_task_id(), - "task_name": runtime_context.get_task_name(), - "task_func_name": runtime_context.get_task_function_name(), - } - for attr in should_exist: - assert hasattr(record, attr) - assert getattr(record, attr) == expected_values[attr] - assert not hasattr(record, "actor_id") - assert not hasattr(record, "actor_name") - assert hasattr(record, "_ray_timestamp_ns") - - obj_ref = f.remote() - ray.get(obj_ref) - - def test_actor_process(self, shutdown_only): - @ray.remote - class A: - def f(self): - filter = CoreContextFilter() - record = logging.makeLogRecord({}) - assert filter.filter(record) - should_exist = ["job_id", "worker_id", "node_id", "actor_id", "task_id"] - runtime_context = ray.get_runtime_context() - expected_values = { - "job_id": runtime_context.get_job_id(), - "worker_id": runtime_context.get_worker_id(), - "node_id": runtime_context.get_node_id(), - "actor_id": runtime_context.get_actor_id(), - "actor_name": runtime_context.get_actor_name(), - "task_id": runtime_context.get_task_id(), - "task_name": runtime_context.get_task_name(), - "task_func_name": runtime_context.get_task_function_name(), - } - for attr in should_exist: - assert 
hasattr(record, attr) - assert getattr(record, attr) == expected_values[attr] - assert hasattr(record, "_ray_timestamp_ns") - - actor = A.remote() - ray.get(actor.f.remote()) - - -class TestJSONFormatter: - def test_empty_record(self, shutdown_only): - formatter = JSONFormatter() - record = logging.makeLogRecord({}) - formatted = formatter.format(record) - - record_dict = json.loads(formatted) - should_exist = [ - "asctime", - "levelname", - "message", - "filename", - "lineno", - "timestamp_ns", - ] - for key in should_exist: - assert key in record_dict - assert len(record_dict) == len(should_exist) - assert "exc_text" not in record_dict - - def test_record_with_exception(self, shutdown_only): - formatter = JSONFormatter() - record = logging.makeLogRecord({}) - try: - raise ValueError("test") - except ValueError: - record.exc_info = sys.exc_info() - formatted = formatter.format(record) - record_dict = json.loads(formatted) - should_exist = [ - "asctime", - "levelname", - "message", - "filename", - "lineno", - "exc_text", - "timestamp_ns", - ] - for key in should_exist: - assert key in record_dict - assert "Traceback (most recent call last):" in record_dict["exc_text"] - assert len(record_dict) == len(should_exist) - - def test_record_with_user_provided_context(self, shutdown_only): - formatter = JSONFormatter() - record = logging.makeLogRecord({"user": "ray"}) - formatted = formatter.format(record) - record_dict = json.loads(formatted) - should_exist = [ - "asctime", - "levelname", - "message", - "filename", - "lineno", - "user", - "timestamp_ns", - ] - for key in should_exist: - assert key in record_dict - assert record_dict["user"] == "ray" - assert len(record_dict) == len(should_exist) - assert "exc_text" not in record_dict - - def test_record_with_flatten_keys_invalid_value(self, shutdown_only): - formatter = JSONFormatter() - record = logging.makeLogRecord({"ray_serve_extra_fields": "not_a_dict"}) - with pytest.raises(ValueError): - formatter.format(record) - 
- def test_record_with_flatten_keys_valid_dict(self, shutdown_only): - formatter = JSONFormatter() - record = logging.makeLogRecord( - {"ray_serve_extra_fields": {"key1": "value1", "key2": 2}} - ) - formatted = formatter.format(record) - record_dict = json.loads(formatted) - should_exist = [ - "asctime", - "levelname", - "message", - "filename", - "lineno", - "key1", - "key2", - "timestamp_ns", - ] - for key in should_exist: - assert key in record_dict - assert record_dict["key1"] == "value1", record_dict - assert record_dict["key2"] == 2 - assert "ray_serve_extra_fields" not in record_dict - assert len(record_dict) == len(should_exist) - assert "exc_text" not in record_dict - - def test_record_with_valid_additional_log_standard_attrs(self, shutdown_only): - formatter = JSONFormatter() - formatter.set_additional_log_standard_attrs(["name"]) - record = logging.makeLogRecord({}) - formatted = formatter.format(record) - - record_dict = json.loads(formatted) - should_exist = [ - "asctime", - "levelname", - "message", - "filename", - "lineno", - "timestamp_ns", - "name", - ] - for key in should_exist: - assert key in record_dict - assert len(record_dict) == len(should_exist) - - -class TestTextFormatter: - def test_record_with_user_provided_context(self): - formatter = TextFormatter() - record = logging.makeLogRecord({"user": "ray"}) - formatted = formatter.format(record) - assert "user=ray" in formatted - - def test_record_with_exception(self): - formatter = TextFormatter() - record = logging.LogRecord( - name="test_logger", - level=logging.INFO, - pathname="test.py", - lineno=1000, - msg="Test message", - args=None, - exc_info=None, - ) - formatted = formatter.format(record) - for s in ["INFO", "Test message", "test.py:1000", "--"]: - assert s in formatted - - def test_record_with_valid_additional_log_standard_attrs(self, shutdown_only): - formatter = TextFormatter() - formatter.set_additional_log_standard_attrs(["name"]) - record = logging.makeLogRecord({}) - 
formatted = formatter.format(record) - assert "name=" in formatted - - def test_invalid_encoding(): with pytest.raises(ValueError): LoggingConfig(encoding="INVALID") diff --git a/python/ray/tests/test_memory_deadlock.py b/python/ray/tests/test_memory_deadlock.py index 132455b63a34..fc94f19d32f8 100644 --- a/python/ray/tests/test_memory_deadlock.py +++ b/python/ray/tests/test_memory_deadlock.py @@ -4,13 +4,12 @@ import pytest import ray - from ray.tests.test_memory_pressure import ( - allocate_memory, Leaker, + allocate_memory, get_additional_bytes_to_reach_memory_usage_pct, - memory_usage_threshold, memory_monitor_refresh_ms, + memory_usage_threshold, ) diff --git a/python/ray/tests/test_memory_pressure.py b/python/ray/tests/test_memory_pressure.py index 70264a1ca097..864ba040a673 100644 --- a/python/ray/tests/test_memory_pressure.py +++ b/python/ray/tests/test_memory_pressure.py @@ -1,24 +1,21 @@ -from math import ceil import sys import time +from math import ceil +import numpy as np import pytest import ray from ray._common.test_utils import wait_for_condition +from ray._common.utils import get_system_memory from ray._private import ( ray_constants, ) +from ray._private.state_api_test_utils import verify_failed_task from ray._private.test_utils import raw_metrics - -import numpy as np -from ray._common.utils import get_system_memory from ray._private.utils import get_used_memory -from ray._private.state_api_test_utils import verify_failed_task - from ray.util.state.state_manager import StateDataSourceClient - memory_usage_threshold = 0.5 task_oom_retries = 1 memory_monitor_refresh_ms = 100 diff --git a/python/ray/tests/test_memory_scheduling.py b/python/ray/tests/test_memory_scheduling.py index 73876a902705..cc9eb979ca80 100644 --- a/python/ray/tests/test_memory_scheduling.py +++ b/python/ray/tests/test_memory_scheduling.py @@ -1,8 +1,8 @@ import sys import time -import pytest import numpy as np +import pytest import ray from ray._common.test_utils import 
wait_for_condition diff --git a/python/ray/tests/test_memstat.py b/python/ray/tests/test_memstat.py index 60ac96d71443..9d701441c887 100644 --- a/python/ray/tests/test_memstat.py +++ b/python/ray/tests/test_memstat.py @@ -6,9 +6,8 @@ import pytest import ray +from ray._common.test_utils import Semaphore, wait_for_condition from ray._private.internal_api import memory_summary -from ray._common.test_utils import wait_for_condition -from ray._common.test_utils import Semaphore from ray.cluster_utils import Cluster, cluster_not_supported # RayConfig to enable recording call sites during ObjectRej creations. diff --git a/python/ray/tests/test_metric_cardinality.py b/python/ray/tests/test_metric_cardinality.py index 426daaaa2181..e44b37b3e3f0 100644 --- a/python/ray/tests/test_metric_cardinality.py +++ b/python/ray/tests/test_metric_cardinality.py @@ -39,14 +39,7 @@ def _setup_cluster_for_test(request, ray_start_cluster): _system_config={ "enable_metrics_collection": True, "metric_cardinality_level": core_metric_cardinality_level, - "experimental_enable_open_telemetry_on_agent": os.getenv( - "RAY_experimental_enable_open_telemetry_on_agent" - ) - == "1", - "experimental_enable_open_telemetry_on_core": os.getenv( - "RAY_experimental_enable_open_telemetry_on_core" - ) - == "1", + "enable_open_telemetry": os.getenv("RAY_enable_open_telemetry") == "1", } ) cluster.wait_for_nodes() @@ -82,7 +75,7 @@ async def run(self): ray.get(obj_refs) -def _cardinality_level_test(_setup_cluster_for_test, cardinality_level): +def _cardinality_level_test(_setup_cluster_for_test, cardinality_level, metric): """ Test that the ray_tasks and ray_actors metric are reported with the expected cardinality level """ @@ -91,43 +84,42 @@ def _cardinality_level_test(_setup_cluster_for_test, cardinality_level): def _validate(): metric_samples = fetch_prometheus_metrics(prom_addresses) - for metric in _TO_TEST_METRICS: - samples = metric_samples.get(metric) - assert samples, f"Metric {metric} not found in 
samples" - for sample in samples: - if cardinality_level == "recommended": - # If the cardinality level is recommended, the WorkerId tag should - # be removed - assert ( - sample.labels.get(WORKER_ID_TAG_KEY) is None - ), f"Sample {sample} contains WorkerId tag" - elif cardinality_level == "legacy": - # If the cardinality level is legacy, the WorkerId tag should be - # present - assert ( - sample.labels.get(WORKER_ID_TAG_KEY) is not None - ), f"Sample {sample} does not contain WorkerId tag" - if metric == "ray_tasks" or metric == "ray_actors": - assert ( - sample.labels.get(TASK_OR_ACTOR_NAME_TAG_KEY) is not None - ), f"Sample {sample} does not contain Name tag" - elif cardinality_level == "low": - # If the cardinality level is low, the WorkerId and Name tags should - # be removed + samples = metric_samples.get(metric) + assert samples, f"Metric {metric} not found in samples" + for sample in samples: + if cardinality_level == "recommended": + # If the cardinality level is recommended, the WorkerId tag should + # be removed + assert ( + sample.labels.get(WORKER_ID_TAG_KEY) is None + ), f"Sample {sample} contains WorkerId tag" + elif cardinality_level == "legacy": + # If the cardinality level is legacy, the WorkerId tag should be + # present + assert ( + sample.labels.get(WORKER_ID_TAG_KEY) is not None + ), f"Sample {sample} does not contain WorkerId tag" + if metric == "ray_tasks" or metric == "ray_actors": assert ( - sample.labels.get(WORKER_ID_TAG_KEY) is None - ), f"Sample {sample} contains WorkerId tag" - if metric == "ray_tasks" or metric == "ray_actors": - assert ( - sample.labels.get(TASK_OR_ACTOR_NAME_TAG_KEY) is None - ), f"Sample {sample} contains Name tag" - else: - raise ValueError(f"Unknown cardinality level: {cardinality_level}") - - # The Component tag should be present on all cardinality levels + sample.labels.get(TASK_OR_ACTOR_NAME_TAG_KEY) is not None + ), f"Sample {sample} does not contain Name tag" + elif cardinality_level == "low": + # If the 
cardinality level is low, the WorkerId and Name tags should + # be removed assert ( - sample.labels.get(_COMPONENT_TAG_KEY) is not None - ), f"Sample {sample} does not contain Component tag" + sample.labels.get(WORKER_ID_TAG_KEY) is None + ), f"Sample {sample} contains WorkerId tag" + if metric == "ray_tasks" or metric == "ray_actors": + assert ( + sample.labels.get(TASK_OR_ACTOR_NAME_TAG_KEY) is None + ), f"Sample {sample} contains Name tag" + else: + raise ValueError(f"Unknown cardinality level: {cardinality_level}") + + # The Component tag should be present on all cardinality levels + assert ( + sample.labels.get(_COMPONENT_TAG_KEY) is not None + ), f"Sample {sample} does not contain Component tag" wait_for_assertion( _validate, @@ -138,30 +130,34 @@ def _validate(): @pytest.mark.skipif(prometheus_client is None, reason="Prometheus not installed") @pytest.mark.parametrize( - "_setup_cluster_for_test,cardinality_level", - [("recommended", "recommended"), ("legacy", "legacy")], + "_setup_cluster_for_test,cardinality_level,metric", + [ + (cardinality, cardinality, metric) + for cardinality in ["recommended", "legacy"] + for metric in _TO_TEST_METRICS + ], indirect=["_setup_cluster_for_test"], ) def test_cardinality_recommended_and_legacy_levels( - _setup_cluster_for_test, cardinality_level + _setup_cluster_for_test, cardinality_level, metric ): - _cardinality_level_test(_setup_cluster_for_test, cardinality_level) + _cardinality_level_test(_setup_cluster_for_test, cardinality_level, metric) # We only enable low cardinality test for open telemetry because the legacy opencensus # implementation doesn't support low cardinality. 
@pytest.mark.skipif(prometheus_client is None, reason="Prometheus not installed") @pytest.mark.skipif( - os.getenv("RAY_experimental_enable_open_telemetry_on_agent") != "1", - reason="OpenTelemetry is not enabled on agent", + os.getenv("RAY_enable_open_telemetry") != "1", + reason="OpenTelemetry is not enabled", ) @pytest.mark.parametrize( - "_setup_cluster_for_test,cardinality_level", - [("low", "low")], + "_setup_cluster_for_test,cardinality_level,metric", + [("low", "low", metric) for metric in _TO_TEST_METRICS], indirect=["_setup_cluster_for_test"], ) -def test_cardinality_low_levels(_setup_cluster_for_test, cardinality_level): - _cardinality_level_test(_setup_cluster_for_test, cardinality_level) +def test_cardinality_low_levels(_setup_cluster_for_test, cardinality_level, metric): + _cardinality_level_test(_setup_cluster_for_test, cardinality_level, metric) if __name__ == "__main__": diff --git a/python/ray/tests/test_metrics.py b/python/ray/tests/test_metrics.py index ecc937fa9453..870b24d86e85 100644 --- a/python/ray/tests/test_metrics.py +++ b/python/ray/tests/test_metrics.py @@ -2,19 +2,20 @@ import platform import sys -import psutil import pytest -from ray._common.test_utils import wait_for_condition import requests import ray +from ray._common.network_utils import build_address +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( - wait_until_succeeded_without_exception, get_node_stats, + wait_until_succeeded_without_exception, ) -from ray._common.network_utils import build_address from ray.core.generated import common_pb2 +import psutil + _WIN32 = os.name == "nt" diff --git a/python/ray/tests/test_metrics_agent.py b/python/ray/tests/test_metrics_agent.py index ad7d717619d0..21907971c3bc 100644 --- a/python/ray/tests/test_metrics_agent.py +++ b/python/ray/tests/test_metrics_agent.py @@ -1,11 +1,10 @@ -import time -import signal import json import os import pathlib -import sys import re -import requests +import 
signal +import sys +import time import warnings from collections import defaultdict from pprint import pformat @@ -13,35 +12,38 @@ import numpy as np import pytest - +import requests from google.protobuf.timestamp_pb2 import Timestamp + import ray -from ray.dashboard.modules.aggregator.tests.test_aggregator_agent import ( - get_event_aggregator_grpc_stub, -) -from ray.core.generated.common_pb2 import TaskAttempt -from ray.core.generated.events_base_event_pb2 import RayEvent -from ray.core.generated.events_event_aggregator_service_pb2 import ( - AddEventsRequest, - RayEventsData, - TaskEventsMetadata, +from ray._common.network_utils import build_address +from ray._common.test_utils import SignalActor, wait_for_condition +from ray._private.metrics_agent import ( + Gauge as MetricsAgentGauge, + PrometheusServiceDiscoveryWriter, ) -from ray.util.state import list_nodes -from ray._private.metrics_agent import PrometheusServiceDiscoveryWriter -from ray._private.metrics_agent import Gauge as MetricsAgentGauge from ray._private.ray_constants import PROMETHEUS_SERVICE_DISCOVERY_FILE -from ray._common.test_utils import SignalActor, wait_for_condition from ray._private.test_utils import ( fetch_prometheus, fetch_prometheus_metrics, + find_free_port, get_log_batch, raw_metrics, - find_free_port, ) -from ray._common.network_utils import build_address from ray.autoscaler._private.constants import AUTOSCALER_METRIC_PORT +from ray.core.generated.common_pb2 import TaskAttempt +from ray.core.generated.events_base_event_pb2 import RayEvent +from ray.core.generated.events_event_aggregator_service_pb2 import ( + AddEventsRequest, + RayEventsData, + TaskEventsMetadata, +) from ray.dashboard.consts import DASHBOARD_METRIC_PORT +from ray.dashboard.modules.aggregator.tests.test_aggregator_agent import ( + get_event_aggregator_grpc_stub, +) from ray.util.metrics import Counter, Gauge, Histogram, Metric +from ray.util.state import list_nodes os.environ["RAY_event_stats"] = "1" @@ -144,6 
+146,7 @@ "ray_event_aggregator_agent_events_failed_to_add_to_aggregator_total", "ray_event_aggregator_agent_events_dropped_at_event_aggregator_total", "ray_event_aggregator_agent_events_published_total", + "ray_event_aggregator_agent_events_filtered_out_total", ] _NODE_METRICS = [ @@ -192,17 +195,11 @@ def _setup_cluster_for_test(request, ray_start_cluster): # Add a head node. cluster.add_node( _system_config={ + "metrics_report_interval_ms": 1000, "event_stats_print_interval_ms": 500, "event_stats": True, "enable_metrics_collection": enable_metrics_collection, - "experimental_enable_open_telemetry_on_agent": os.getenv( - "RAY_experimental_enable_open_telemetry_on_agent" - ) - == "1", - "experimental_enable_open_telemetry_on_core": os.getenv( - "RAY_experimental_enable_open_telemetry_on_core" - ) - == "1", + "enable_open_telemetry": os.getenv("RAY_enable_open_telemetry") == "1", } ) # Add worker nodes. @@ -276,8 +273,7 @@ async def ping(self): @pytest.mark.skipif(prometheus_client is None, reason="Prometheus not installed") @pytest.mark.skipif( - os.environ.get("RAY_experimental_enable_open_telemetry_on_core") == "1" - and sys.platform == "darwin", + os.environ.get("RAY_enable_open_telemetry") == "1" and sys.platform == "darwin", reason="OpenTelemetry is not working on macOS yet.", ) @pytest.mark.parametrize("_setup_cluster_for_test", [True], indirect=True) @@ -318,7 +314,7 @@ def test_cases(): "test_driver_counter_total", "test_gauge", ] - if os.environ.get("RAY_experimental_enable_open_telemetry_on_core") != "1" + if os.environ.get("RAY_enable_open_telemetry") != "1" else [ "test_counter_total", "test_driver_counter_total", @@ -422,7 +418,7 @@ def test_metrics_export_node_metrics(shutdown_only): # Verify node metrics are available. 
addr = ray.init() dashboard_export_addr = build_address( - addr["raylet_ip_address"], DASHBOARD_METRIC_PORT + addr["node_ip_address"], DASHBOARD_METRIC_PORT ) def verify_node_metrics(): @@ -465,6 +461,10 @@ def verify_dashboard_metrics(): _EVENT_AGGREGATOR_AGENT_TARGET_PORT = find_free_port() +_EVENT_AGGREGATOR_AGENT_TARGET_IP = "127.0.0.1" +_EVENT_AGGREGATOR_AGENT_TARGET_ADDR = ( + f"http://{_EVENT_AGGREGATOR_AGENT_TARGET_IP}:{_EVENT_AGGREGATOR_AGENT_TARGET_PORT}" +) @pytest.fixture(scope="module") @@ -477,8 +477,11 @@ def httpserver_listen_address(): [ { "env_vars": { - "RAY_DASHBOARD_AGGREGATOR_AGENT_MAX_EVENT_BUFFER_SIZE": 1, - "RAY_DASHBOARD_AGGREGATOR_AGENT_EVENT_SEND_PORT": _EVENT_AGGREGATOR_AGENT_TARGET_PORT, + "RAY_DASHBOARD_AGGREGATOR_AGENT_MAX_EVENT_BUFFER_SIZE": 2, + "RAY_DASHBOARD_AGGREGATOR_AGENT_EVENTS_EXPORT_ADDR": _EVENT_AGGREGATOR_AGENT_TARGET_ADDR, + # Turn off task events generation to avoid the task events from the + # cluster impacting the test result + "RAY_task_events_report_interval_ms": 0, }, }, ], @@ -489,12 +492,12 @@ def test_metrics_export_event_aggregator_agent( ): cluster = ray_start_cluster_head_with_env_vars stub = get_event_aggregator_grpc_stub( - cluster.webui_url, cluster.gcs_address, cluster.head_node.node_id + cluster.gcs_address, cluster.head_node.node_id ) httpserver.expect_request("/", method="POST").respond_with_data("", status=200) metrics_export_port = cluster.head_node.metrics_export_port - addr = cluster.head_node.raylet_ip_address + addr = cluster.head_node.node_ip_address prom_addresses = [build_address(addr, metrics_export_port)] def test_case_stats_exist(): @@ -505,16 +508,18 @@ def test_case_stats_exist(): "ray_event_aggregator_agent_events_failed_to_add_to_aggregator_total", "ray_event_aggregator_agent_events_dropped_at_event_aggregator_total", "ray_event_aggregator_agent_events_published_total", + "ray_event_aggregator_agent_events_filtered_out_total", ] return all(metric in metrics_names for metric in 
event_aggregator_metrics) def test_case_value_correct(): _, _, metric_samples = fetch_prometheus(prom_addresses) expected_metrics_values = { - "ray_event_aggregator_agent_events_received_total": 2.0, + "ray_event_aggregator_agent_events_received_total": 3.0, "ray_event_aggregator_agent_events_failed_to_add_to_aggregator_total": 0.0, "ray_event_aggregator_agent_events_dropped_at_event_aggregator_total": 1.0, "ray_event_aggregator_agent_events_published_total": 1.0, + "ray_event_aggregator_agent_events_filtered_out_total": 1.0, } for descriptor, expected_value in expected_metrics_values.items(): samples = [m for m in metric_samples if m.name == descriptor] @@ -543,11 +548,19 @@ def test_case_value_correct(): RayEvent( event_id=b"2", source_type=RayEvent.SourceType.CORE_WORKER, - event_type=RayEvent.EventType.TASK_DEFINITION_EVENT, + event_type=RayEvent.EventType.TASK_PROFILE_EVENT, timestamp=timestamp, severity=RayEvent.Severity.INFO, message="hello 2", ), + RayEvent( + event_id=b"3", + source_type=RayEvent.SourceType.CORE_WORKER, + event_type=RayEvent.EventType.TASK_DEFINITION_EVENT, + timestamp=timestamp, + severity=RayEvent.Severity.INFO, + message="hello 3", + ), ], task_events_metadata=TaskEventsMetadata( dropped_task_attempts=[ @@ -560,8 +573,7 @@ def test_case_value_correct(): ) ) - reply = stub.AddEvents(request) - assert reply is not None + stub.AddEvents(request) wait_for_condition(lambda: len(httpserver.log) == 1) wait_for_condition(test_case_value_correct, timeout=30, retry_interval_ms=1000) @@ -570,133 +582,68 @@ def test_case_value_correct(): def test_operation_stats(monkeypatch, shutdown_only): # Test operation stats are available when flag is on. 
operation_metrics = [ - "ray_operation_count", - "ray_operation_run_time_ms", - "ray_operation_queue_time_ms", + "ray_operation_count_total", + "ray_operation_run_time_ms_bucket", + "ray_operation_queue_time_ms_bucket", "ray_operation_active_count", ] - with monkeypatch.context() as m: - m.setenv("RAY_event_stats_metrics", "1") - addr = ray.init() - - signal = SignalActor.remote() - - @ray.remote - class Actor: - def __init__(self, signal): - self.signal = signal - - def get_worker_id(self): - return ray.get_runtime_context().get_worker_id() - - def wait(self): - ray.get(self.signal.wait.remote()) - - actor = Actor.remote(signal) - worker_id = ray.get(actor.get_worker_id.remote()) - obj_ref = actor.wait.remote() - - def verify(): - metrics = raw_metrics(addr) - samples = metrics["ray_operation_count"] - found = False - for sample in samples: - if ( - sample.labels["Method"] == "CoreWorkerService.grpc_client.PushTask" - and sample.labels["Component"] == "core_worker" - and sample.labels["WorkerId"] == worker_id - ): - found = True - assert sample.value == 1 - if not found: - return False + addr = ray.init() + remote_signal = SignalActor.remote() - samples = metrics["ray_operation_active_count"] - found = False - for sample in samples: - if ( - sample.labels["Method"] == "CoreWorkerService.grpc_client.PushTask" - and sample.labels["Component"] == "core_worker" - and sample.labels["WorkerId"] == worker_id - ): - found = True - assert sample.value == 1 - if not found: - return False + @ray.remote + class Actor: + def __init__(self, signal): + self.signal = signal - return True + def get_worker_id(self): + return ray.get_runtime_context().get_worker_id() - wait_for_condition(verify, timeout=60) + def wait(self): + ray.get(self.signal.wait.remote()) - ray.get(signal.send.remote()) - ray.get(obj_ref) + actor = Actor.remote(remote_signal) + ray.get(actor.get_worker_id.remote()) + obj_ref = actor.wait.remote() - def verify(): - metrics = raw_metrics(addr) + 
ray.get(remote_signal.send.remote()) + ray.get(obj_ref) - samples = metrics["ray_operation_count"] - found = False - for sample in samples: - if ( - sample.labels["Method"] == "CoreWorkerService.grpc_client.PushTask" - and sample.labels["Component"] == "core_worker" - and sample.labels["WorkerId"] == worker_id - ): - found = True - assert sample.value == 1 - if not found: - return False + def verify(): + metrics = raw_metrics(addr) - found = False - for sample in samples: - if ( - sample.labels["Method"] - == "CoreWorkerService.grpc_client.PushTask.OnReplyReceived" - and sample.labels["Component"] == "core_worker" - and sample.labels["WorkerId"] == worker_id - ): - found = True - assert sample.value == 1 - if not found: - return False + samples = metrics["ray_operation_active_count"] + found = False + for sample in samples: + if ( + sample.labels["Name"] == "gcs_server_main_io_context" + and sample.labels["Component"] == "gcs_server" + ): + found = True + if not found: + return False - samples = metrics["ray_operation_active_count"] - found = False - for sample in samples: - if ( - sample.labels["Method"] == "CoreWorkerService.grpc_client.PushTask" - and sample.labels["Component"] == "core_worker" - and sample.labels["WorkerId"] == worker_id - ): - found = True - assert sample.value == 0 - if not found: - return False + found = False + for sample in samples: + if ( + sample.labels["Name"] == "raylet_main_io_context" + and sample.labels["Component"] == "raylet" + ): + found = True + if not found: + return False - found = False + metric_names = set(metrics.keys()) + for op_metric in operation_metrics: + assert op_metric in metric_names + samples = metrics[op_metric] + components = set() + print(components) for sample in samples: - if ( - sample.labels["Method"] - == "CoreWorkerService.grpc_client.PushTask.OnReplyReceived" - and sample.labels["Component"] == "core_worker" - and sample.labels["WorkerId"] == worker_id - ): - found = True - assert sample.value == 0 - if 
not found: - return False - - metric_names = set(metrics.keys()) - for op_metric in operation_metrics: - assert op_metric in metric_names - samples = metrics[op_metric] - components = set() - for sample in samples: - components.add(sample.labels["Component"]) - assert {"raylet", "gcs_server", "core_worker"} == components - return True + components.add(sample.labels["Component"]) + assert {"raylet", "gcs_server"} == components + return True - wait_for_condition(verify, timeout=60) + wait_for_condition(verify, timeout=30) @pytest.mark.skipif(prometheus_client is None, reason="Prometheus not installed") @@ -753,7 +700,7 @@ def wrap_test_case_for_retry(): @pytest.mark.skipif(sys.platform == "win32", reason="Not working in Windows.") @pytest.mark.skipif( - os.environ.get("RAY_experimental_enable_open_telemetry_on_core") == "1", + os.environ.get("RAY_enable_open_telemetry") == "1", reason="OpenTelemetry backend does not support Counter exported as gauge.", ) def test_counter_exported_as_gauge(shutdown_only): @@ -962,9 +909,10 @@ def get_metrics_export_address_from_node(nodes): ) return node_export_addrs + [autoscaler_export_addr, dashboard_export_addr] - loaded_json_data = json.loads(writer.get_file_discovery_content())[0] + loaded_json_data = json.loads(writer.get_file_discovery_content()) + assert loaded_json_data == writer.get_latest_service_discovery_content() assert set(get_metrics_export_address_from_node(nodes)) == set( - loaded_json_data["targets"] + loaded_json_data[0]["targets"] ) # Let's update nodes. @@ -972,9 +920,10 @@ def get_metrics_export_address_from_node(nodes): nodes.append(cluster.add_node()) # Make sure service discovery file content is correctly updated. 
- loaded_json_data = json.loads(writer.get_file_discovery_content())[0] + loaded_json_data = json.loads(writer.get_file_discovery_content()) + assert loaded_json_data == writer.get_latest_service_discovery_content() assert set(get_metrics_export_address_from_node(nodes)) == set( - loaded_json_data["targets"] + loaded_json_data[0]["targets"] ) diff --git a/python/ray/tests/test_metrics_agent_2.py b/python/ray/tests/test_metrics_agent_2.py index 7da1ded7a348..cbe859930e5c 100644 --- a/python/ray/tests/test_metrics_agent_2.py +++ b/python/ray/tests/test_metrics_agent_2.py @@ -1,46 +1,45 @@ import random import sys import time - -import pytest - -from ray._common.test_utils import wait_for_condition -import ray._private.prometheus_exporter as prometheus_exporter - from typing import List +import pytest from opencensus.metrics.export.metric_descriptor import MetricDescriptorType -from opencensus.stats.view_manager import ViewManager -from opencensus.stats.stats_recorder import StatsRecorder +from opencensus.metrics.export.value import ValueDouble from opencensus.stats import execution_context -from prometheus_client.core import REGISTRY - - -from ray._private.metrics_agent import Gauge, MetricsAgent, Record, RAY_WORKER_TIMEOUT_S from opencensus.stats.aggregation_data import ( - LastValueAggregationData, - SumAggregationData, CountAggregationData, DistributionAggregationData, + LastValueAggregationData, + SumAggregationData, ) -from opencensus.metrics.export.value import ValueDouble -from ray._private.telemetry.metric_cardinality import WORKER_ID_TAG_KEY +from opencensus.stats.stats_recorder import StatsRecorder +from opencensus.stats.view_manager import ViewManager +from prometheus_client.core import REGISTRY + +import ray._private.prometheus_exporter as prometheus_exporter +from ray._common.test_utils import wait_for_condition from ray._private.metrics_agent import ( + RAY_WORKER_TIMEOUT_S, + Gauge, + MetricsAgent, OpenCensusProxyCollector, OpencensusProxyMetric, + 
Record, +) +from ray._private.telemetry.metric_cardinality import WORKER_ID_TAG_KEY +from ray._private.test_utils import ( + fetch_prometheus_metrics, + fetch_raw_prometheus, ) +from ray._raylet import WorkerID from ray.core.generated.metrics_pb2 import ( + LabelKey, + LabelValue, Metric, MetricDescriptor, Point, - LabelKey, TimeSeries, - LabelValue, -) -from ray._raylet import WorkerID -from ray._private.test_utils import ( - fetch_prometheus_metrics, - fetch_raw_prometheus, ) diff --git a/python/ray/tests/test_metrics_head.py b/python/ray/tests/test_metrics_head.py index 9ac48c5bb131..88bd84b94204 100644 --- a/python/ray/tests/test_metrics_head.py +++ b/python/ray/tests/test_metrics_head.py @@ -2,20 +2,20 @@ import json import logging import os -import pytest import sys import tempfile +import pytest + +from ray._common.utils import get_ray_temp_dir +from ray._private.ray_constants import SESSION_LATEST from ray.dashboard.modules.metrics.dashboards.default_dashboard_panels import ( - DEFAULT_GRAFANA_PANELS, + DEFAULT_GRAFANA_ROWS, ) from ray.dashboard.modules.metrics.dashboards.serve_dashboard_panels import ( SERVE_GRAFANA_PANELS, ) from ray.tests.conftest import _ray_start -from ray._private.ray_constants import SESSION_LATEST -from ray._common.utils import get_ray_temp_dir - logger = logging.getLogger(__name__) @@ -132,6 +132,9 @@ def test_metrics_folder_with_dashboard_override( contents = json.loads(f.read()) assert contents["uid"] == uid for panel in contents["panels"]: + if panel["type"] == "row": + # Row panels don't have targets + continue for target in panel["targets"]: # Check for standard_global_filters assert 'SessionName=~"$SessionName"' in target["expr"] @@ -140,6 +143,9 @@ def test_metrics_folder_with_dashboard_override( for variable in contents["templating"]["list"]: if variable["name"] == "datasource": continue + if variable["name"] == "RayNodeType": + # RayNodeType uses hardcoded values instead of a query + continue assert global_filters in 
variable["definition"] assert global_filters in variable["query"]["query"] assert "supportsGlobalFilterOverride" in contents["rayMeta"] @@ -151,6 +157,9 @@ def test_metrics_folder_with_dashboard_override( found_max = False found_max_pending = False for panel in contents["panels"]: + if panel["type"] == "row": + # Row panels don't have series overrides + continue for override in panel.get("seriesOverrides", []): if override.get("alias") == "MAX": assert override["fill"] == 0 @@ -207,9 +216,10 @@ def test_metrics_folder_when_dashboard_disabled(): def test_default_dashboard_utilizes_global_filters(): - for panel in DEFAULT_GRAFANA_PANELS: - for target in panel.targets: - assert "{global_filters}" in target.expr + for row in DEFAULT_GRAFANA_ROWS: + for panel in row.panels: + for target in panel.targets: + assert "{global_filters}" in target.expr def test_serve_dashboard_utilizes_global_filters(): diff --git a/python/ray/tests/test_minimal_install.py b/python/ray/tests/test_minimal_install.py index a1da0d51648b..50940b817ffa 100644 --- a/python/ray/tests/test_minimal_install.py +++ b/python/ray/tests/test_minimal_install.py @@ -3,13 +3,13 @@ Tests that are specific to minimal installations. 
""" -import unittest.mock as mock import itertools -import packaging import os import sys +import unittest.mock as mock from typing import Dict +import packaging import pytest @@ -94,8 +94,7 @@ def test_module_import_with_various_non_minimal_deps(pydantic_version: str): mock_modules[mod] = mock.MagicMock() with mock.patch.dict("sys.modules", mock_modules): - from ray.dashboard.utils import get_all_modules - from ray.dashboard.utils import DashboardHeadModule + from ray.dashboard.utils import DashboardHeadModule, get_all_modules get_all_modules(DashboardHeadModule) diff --git a/python/ray/tests/test_mpi.py b/python/ray/tests/test_mpi.py index e806b3f3e1c5..e204da4191d4 100644 --- a/python/ray/tests/test_mpi.py +++ b/python/ray/tests/test_mpi.py @@ -1,8 +1,10 @@ -import pytest -import ray -import sys import os +import sys + import numpy +import pytest + +import ray from ray.runtime_env import mpi_init @@ -91,9 +93,10 @@ def calc_pi(self): def check_gpu_setup(): - from mpi4py import MPI import os + from mpi4py import MPI + mpi_init() comm = MPI.COMM_WORLD rank = comm.Get_rank() diff --git a/python/ray/tests/test_multi_node.py b/python/ray/tests/test_multi_node.py index 52acfa85bad5..a687c1f2290a 100644 --- a/python/ray/tests/test_multi_node.py +++ b/python/ray/tests/test_multi_node.py @@ -2,7 +2,6 @@ import sys import time -import psutil import pytest import ray @@ -16,6 +15,8 @@ run_string_as_driver_nonblocking, ) +import psutil + @pytest.mark.parametrize( "call_ray_start", diff --git a/python/ray/tests/test_multi_node_3.py b/python/ray/tests/test_multi_node_3.py index f741baa7bef0..cc10937d17e8 100644 --- a/python/ray/tests/test_multi_node_3.py +++ b/python/ray/tests/test_multi_node_3.py @@ -4,24 +4,25 @@ import sys from pathlib import Path -import psutil import pytest import ray import ray._private.ray_constants as ray_constants +from ray._common.test_utils import Semaphore from ray._private.test_utils import ( check_call_ray, check_call_subprocess, 
kill_process_by_name, - start_redis_instance, run_string_as_driver, run_string_as_driver_nonblocking, + start_redis_instance, wait_for_children_of_pid, wait_for_children_of_pid_to_exit, ) -from ray._common.test_utils import Semaphore from ray._private.utils import detect_fate_sharing_support +import psutil + def test_calling_start_ray_head(call_ray_stop_only): # Test that we can call ray start with various command line diff --git a/python/ray/tests/test_multi_tenancy.py b/python/ray/tests/test_multi_tenancy.py index b4458f8f2d59..608fd5078184 100644 --- a/python/ray/tests/test_multi_tenancy.py +++ b/python/ray/tests/test_multi_tenancy.py @@ -6,8 +6,8 @@ import time from typing import List -import pytest import numpy as np +import pytest import ray from ray._common.test_utils import wait_for_condition diff --git a/python/ray/tests/test_multinode_failures.py b/python/ray/tests/test_multinode_failures.py index 8faa98513a50..30824cae483b 100644 --- a/python/ray/tests/test_multinode_failures.py +++ b/python/ray/tests/test_multinode_failures.py @@ -7,8 +7,8 @@ import ray import ray._private.ray_constants as ray_constants -from ray._private.test_utils import get_other_nodes from ray._common.test_utils import Semaphore +from ray._private.test_utils import get_other_nodes from ray.cluster_utils import Cluster, cluster_not_supported SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM diff --git a/python/ray/tests/test_multiprocessing.py b/python/ray/tests/test_multiprocessing.py index 947cb173c230..5d1c89ef0d70 100644 --- a/python/ray/tests/test_multiprocessing.py +++ b/python/ray/tests/test_multiprocessing.py @@ -3,6 +3,7 @@ Tests that require a standalone Ray cluster (for example, testing ray.init or shutdown behavior) should go in test_multiprocessing_standalone.py. 
""" +import multiprocessing as mp import os import platform import queue @@ -10,15 +11,13 @@ import sys import tempfile import time -import multiprocessing as mp from collections import defaultdict import pytest - import ray from ray._common.test_utils import SignalActor -from ray.util.multiprocessing import Pool, TimeoutError, JoinableQueue +from ray.util.multiprocessing import JoinableQueue, Pool, TimeoutError @pytest.fixture(scope="module") diff --git a/python/ray/tests/test_multiprocessing_standalone.py b/python/ray/tests/test_multiprocessing_standalone.py index dec1d956b8f4..1b41f1c08991 100644 --- a/python/ray/tests/test_multiprocessing_standalone.py +++ b/python/ray/tests/test_multiprocessing_standalone.py @@ -3,13 +3,12 @@ Tests that can run on a shared Ray cluster fixture should go in test_multiprocessing.py """ import math +import multiprocessing as mp import os import sys -import multiprocessing as mp import pytest - import ray from ray._private.test_utils import external_redis_test_enabled from ray.util.multiprocessing import Pool diff --git a/python/ray/tests/test_nccl_channel.py b/python/ray/tests/test_nccl_channel.py index 786960999491..555d06cb83a6 100644 --- a/python/ray/tests/test_nccl_channel.py +++ b/python/ray/tests/test_nccl_channel.py @@ -1,23 +1,23 @@ # coding: utf-8 import logging import sys -import torch -from typing import List, Dict, Tuple +from typing import Dict, List, Tuple import pytest +import torch import ray import ray.cluster_utils +from ray._private.test_utils import get_actor_node_id from ray.experimental.channel.conftest import ( Barrier, - start_nccl_mock, TracedChannel, + start_nccl_mock, ) -from ray.experimental.channel.torch_tensor_type import TorchTensorType from ray.experimental.channel.torch_tensor_accelerator_channel import ( _init_communicator, ) -from ray._private.test_utils import get_actor_node_id +from ray.experimental.channel.torch_tensor_type import TorchTensorType logger = logging.getLogger(__name__) diff --git 
a/python/ray/tests/test_network_failure_e2e.py b/python/ray/tests/test_network_failure_e2e.py index 59b1403f8aeb..39c56e4bd03d 100644 --- a/python/ray/tests/test_network_failure_e2e.py +++ b/python/ray/tests/test_network_failure_e2e.py @@ -1,14 +1,14 @@ -import sys import json - +import sys +import threading from time import sleep + import pytest -import threading + from ray._common.test_utils import wait_for_condition from ray.tests.conftest_docker import * # noqa from ray.tests.conftest_docker import gen_head_node, gen_worker_node - SLEEP_TASK_SCRIPTS = """ import ray ray.init() diff --git a/python/ray/tests/test_node_death.py b/python/ray/tests/test_node_death.py index 66d3cad6ed1f..f12abc31e83c 100644 --- a/python/ray/tests/test_node_death.py +++ b/python/ray/tests/test_node_death.py @@ -1,8 +1,8 @@ import sys + import pytest import ray - from ray._common.test_utils import wait_for_condition from ray.core.generated import common_pb2 diff --git a/python/ray/tests/test_node_label_scheduling_strategy.py b/python/ray/tests/test_node_label_scheduling_strategy.py index d13fc587ac08..4b3f0dadb272 100644 --- a/python/ray/tests/test_node_label_scheduling_strategy.py +++ b/python/ray/tests/test_node_label_scheduling_strategy.py @@ -1,13 +1,14 @@ import sys + import pytest import ray from ray.util.scheduling_strategies import ( - In, - NotIn, - Exists, DoesNotExist, + Exists, + In, NodeLabelSchedulingStrategy, + NotIn, ) diff --git a/python/ray/tests/test_node_labels.py b/python/ray/tests/test_node_labels.py index 02e2b9630b85..ae94aadef6d2 100644 --- a/python/ray/tests/test_node_labels.py +++ b/python/ray/tests/test_node_labels.py @@ -1,14 +1,15 @@ import os -import sys -import pytest import subprocess +import sys import tempfile from unittest.mock import patch -from ray._private.accelerators.tpu import TPUAcceleratorManager + +import pytest import ray -from ray.cluster_utils import AutoscalingCluster from ray._common.test_utils import wait_for_condition +from 
ray._private.accelerators.tpu import TPUAcceleratorManager +from ray.cluster_utils import AutoscalingCluster def check_cmd_stderr(cmd): diff --git a/python/ray/tests/test_node_manager.py b/python/ray/tests/test_node_manager.py index d859ae135286..75fec392d2c0 100644 --- a/python/ray/tests/test_node_manager.py +++ b/python/ray/tests/test_node_manager.py @@ -12,17 +12,16 @@ import ray from ray._common.test_utils import wait_for_condition -from ray.util.state import list_workers +from ray._private.runtime_env.context import RuntimeEnvContext +from ray._private.runtime_env.plugin import RuntimeEnvPlugin from ray._private.test_utils import ( get_load_metrics_report, + get_resource_usage, run_string_as_driver, run_string_as_driver_nonblocking, - get_resource_usage, ) -from ray.util.state import list_objects from ray._private.utils import get_num_cpus -from ray._private.runtime_env.context import RuntimeEnvContext -from ray._private.runtime_env.plugin import RuntimeEnvPlugin +from ray.util.state import list_objects, list_workers # This tests the queue transitions for infeasible tasks. 
This has been an issue diff --git a/python/ray/tests/test_node_provider_availability_tracker.py b/python/ray/tests/test_node_provider_availability_tracker.py index 448c7500cd2f..9512bc7b2010 100644 --- a/python/ray/tests/test_node_provider_availability_tracker.py +++ b/python/ray/tests/test_node_provider_availability_tracker.py @@ -1,16 +1,16 @@ -import datetime import dataclasses +import datetime import sys + import pytest -from ray.autoscaler.node_launch_exception import NodeLaunchException from ray.autoscaler._private.node_provider_availability_tracker import ( - NodeProviderAvailabilityTracker, - NodeAvailabilitySummary, NodeAvailabilityRecord, + NodeAvailabilitySummary, + NodeProviderAvailabilityTracker, UnavailableNodeInformation, ) - +from ray.autoscaler.node_launch_exception import NodeLaunchException cur_time = float(0) diff --git a/python/ray/tests/test_numba.py b/python/ray/tests/test_numba.py index 182684f1f212..7f2ab2800640 100644 --- a/python/ray/tests/test_numba.py +++ b/python/ray/tests/test_numba.py @@ -1,10 +1,9 @@ -import pytest import sys import unittest - -from numba import njit import numpy as np +import pytest +from numba import njit import ray diff --git a/python/ray/tests/test_object_assign_owner.py b/python/ray/tests/test_object_assign_owner.py index de7a4ca76759..af39e5e65ae4 100644 --- a/python/ray/tests/test_object_assign_owner.py +++ b/python/ray/tests/test_object_assign_owner.py @@ -1,8 +1,8 @@ import sys import time -import pytest import numpy as np +import pytest import ray from ray.exceptions import OwnerDiedError diff --git a/python/ray/tests/test_object_manager.py b/python/ray/tests/test_object_manager.py index 81a09c0783b4..e4b99f94bd0c 100644 --- a/python/ray/tests/test_object_manager.py +++ b/python/ray/tests/test_object_manager.py @@ -539,6 +539,8 @@ def test_object_directory_failure(ray_start_cluster): "health_check_period_ms": 500, "health_check_failure_threshold": 10, "object_timeout_milliseconds": 200, + # Required for 
reducing the retry time of RequestWorkerLease + "raylet_rpc_server_reconnect_timeout_s": 0, } # Add a head node. diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index e219a564cc37..f1a2c9741d99 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -1,35 +1,35 @@ import copy import json +import os import platform import random import sys from datetime import datetime, timedelta -from unittest.mock import patch from pathlib import Path -import os +from unittest.mock import patch -import psutil import numpy as np import pytest - import ray +import ray.remote_function +from ray._common.test_utils import wait_for_condition from ray._private.external_storage import ( + ExternalStorageSmartOpenImpl, + FileSystemStorage, + _get_unique_spill_filename, create_url_with_offset, parse_url_with_offset, - _get_unique_spill_filename, - FileSystemStorage, - ExternalStorageSmartOpenImpl, ) from ray._private.internal_api import memory_summary -from ray._common.test_utils import wait_for_condition -import ray.remote_function from ray.tests.conftest import ( buffer_object_spilling_config, file_system_object_spilling_config, mock_distributed_fs_object_spilling_config, ) +import psutil + # Note: Disk write speed can be as low as 6 MiB/s in AWS Mac instances, so we have to # increase the timeout. 
pytestmark = [pytest.mark.timeout(900 if platform.system() == "Darwin" else 180)] diff --git a/python/ray/tests/test_object_spilling_2.py b/python/ray/tests/test_object_spilling_2.py index 7828edb96e51..c44404f03e5e 100644 --- a/python/ray/tests/test_object_spilling_2.py +++ b/python/ray/tests/test_object_spilling_2.py @@ -10,12 +10,11 @@ import ray from ray._common.test_utils import wait_for_condition -from ray._private.test_utils import run_string_as_driver -from ray.tests.test_object_spilling import is_dir_empty from ray._private.external_storage import ( FileSystemStorage, ) - +from ray._private.test_utils import run_string_as_driver +from ray.tests.test_object_spilling import is_dir_empty # Note: Disk write speed can be as low as 6 MiB/s in AWS Mac instances, so we have to # increase the timeout. diff --git a/python/ray/tests/test_object_spilling_3.py b/python/ray/tests/test_object_spilling_3.py index a74596088eee..7b53228e0dc4 100644 --- a/python/ray/tests/test_object_spilling_3.py +++ b/python/ray/tests/test_object_spilling_3.py @@ -330,6 +330,8 @@ def test_spill_reconstruction_errors(ray_start_cluster, object_spilling_config): "max_direct_call_object_size": 100, "task_retry_delay_ms": 100, "object_timeout_milliseconds": 200, + # Required for reducing the retry time of RequestWorkerLease + "raylet_rpc_server_reconnect_timeout_s": 0, } cluster = ray_start_cluster # Head node with no resources. 
diff --git a/python/ray/tests/test_object_store_metrics.py b/python/ray/tests/test_object_store_metrics.py index c1c5c4eebff2..a2070c4665c2 100644 --- a/python/ray/tests/test_object_store_metrics.py +++ b/python/ray/tests/test_object_store_metrics.py @@ -2,12 +2,12 @@ from collections import defaultdict from typing import Dict +import numpy as np import pytest -from ray._common.test_utils import wait_for_condition import requests -import numpy as np import ray +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( raw_metrics, ) diff --git a/python/ray/tests/test_open_telemetry_metric_recorder.py b/python/ray/tests/test_open_telemetry_metric_recorder.py index e20b24563a6a..81095041a42c 100644 --- a/python/ray/tests/test_open_telemetry_metric_recorder.py +++ b/python/ray/tests/test_open_telemetry_metric_recorder.py @@ -2,12 +2,12 @@ from unittest.mock import MagicMock, patch import pytest -from opentelemetry.metrics import NoOpCounter, NoOpUpDownCounter, NoOpHistogram +from opentelemetry.metrics import NoOpCounter, NoOpHistogram, NoOpUpDownCounter +from ray._private.metrics_agent import Gauge, Record from ray._private.telemetry.open_telemetry_metric_recorder import ( OpenTelemetryMetricRecorder, ) -from ray._private.metrics_agent import Record, Gauge @patch("opentelemetry.metrics.set_meter_provider") diff --git a/python/ray/tests/test_output.py b/python/ray/tests/test_output.py index 1ab0d893e889..a753040931bf 100644 --- a/python/ray/tests/test_output.py +++ b/python/ray/tests/test_output.py @@ -351,8 +351,9 @@ def _check_events(): @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") -def test_fail_importing_actor(): - script = """ +@pytest.mark.parametrize("async_actor", [True, False]) +def test_fail_importing_actor(async_actor): + script = f""" import os import sys import tempfile @@ -380,7 +381,7 @@ class Foo: def __init__(self): self.x = module.temporary_python_file() - def ready(self): + {"async " if 
async_actor else ""}def ready(self): pass finally: os.unlink(f.name) diff --git a/python/ray/tests/test_placement_group.py b/python/ray/tests/test_placement_group.py index 8dca2c15f135..6cbc51ca12be 100644 --- a/python/ray/tests/test_placement_group.py +++ b/python/ray/tests/test_placement_group.py @@ -1,19 +1,19 @@ +import os import sys import warnings -import os import pytest import ray -from ray._private.utils import get_ray_doc_version from ray._private.test_utils import placement_group_assert_no_leak -from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +from ray._private.utils import get_ray_doc_version from ray.util.placement_group import ( - validate_placement_group, - _validate_bundles, - _validate_bundle_label_selector, VALID_PLACEMENT_GROUP_STRATEGIES, + _validate_bundle_label_selector, + _validate_bundles, + validate_placement_group, ) +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy def are_pairwise_unique(g): diff --git a/python/ray/tests/test_placement_group_2.py b/python/ray/tests/test_placement_group_2.py index e71d2968c628..a23a3bcba118 100644 --- a/python/ray/tests/test_placement_group_2.py +++ b/python/ray/tests/test_placement_group_2.py @@ -4,9 +4,9 @@ import pytest import ray -from ray._common.test_utils import wait_for_condition import ray._private.gcs_utils as gcs_utils import ray.cluster_utils +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( convert_actor_state, generate_system_config_map, diff --git a/python/ray/tests/test_placement_group_3.py b/python/ray/tests/test_placement_group_3.py index 1b1a234eccce..a4a199a740d5 100644 --- a/python/ray/tests/test_placement_group_3.py +++ b/python/ray/tests/test_placement_group_3.py @@ -6,16 +6,15 @@ import pytest import ray -from ray import ObjectRef -from ray._common.test_utils import wait_for_condition import ray._private.gcs_utils as gcs_utils import ray.cluster_utils import 
ray.experimental.internal_kv as internal_kv +from ray import ObjectRef +from ray._common.test_utils import wait_for_condition from ray._private.ray_constants import ( DEBUG_AUTOSCALING_ERROR, DEBUG_AUTOSCALING_STATUS, ) -from ray.autoscaler._private.constants import AUTOSCALER_UPDATE_INTERVAL_S from ray._private.test_utils import ( convert_actor_state, generate_system_config_map, @@ -25,6 +24,7 @@ run_string_as_driver, ) from ray.autoscaler._private.commands import debug_status +from ray.autoscaler._private.constants import AUTOSCALER_UPDATE_INTERVAL_S from ray.exceptions import RaySystemError from ray.util.placement_group import placement_group, remove_placement_group from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy diff --git a/python/ray/tests/test_placement_group_4.py b/python/ray/tests/test_placement_group_4.py index dd0b5cbc0c12..9e9364b51056 100644 --- a/python/ray/tests/test_placement_group_4.py +++ b/python/ray/tests/test_placement_group_4.py @@ -1,11 +1,14 @@ -import pytest import os import sys import time +import pytest + import ray -from ray._common.test_utils import wait_for_condition import ray.cluster_utils +from ray._common.test_utils import wait_for_condition +from ray._private.runtime_env.context import RuntimeEnvContext +from ray._private.runtime_env.plugin import RuntimeEnvPlugin from ray._private.test_utils import ( get_other_nodes, is_placement_group_removed, @@ -14,8 +17,6 @@ from ray._raylet import PlacementGroupID from ray.util.placement_group import PlacementGroup from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy -from ray._private.runtime_env.context import RuntimeEnvContext -from ray._private.runtime_env.plugin import RuntimeEnvPlugin MOCK_WORKER_STARTUP_SLOWLY_PLUGIN_CLASS_PATH = ( "ray.tests.test_placement_group_4.MockWorkerStartupSlowlyPlugin" # noqa diff --git a/python/ray/tests/test_placement_group_5.py b/python/ray/tests/test_placement_group_5.py index 8440cee1ef37..04702b2e7426 
100644 --- a/python/ray/tests/test_placement_group_5.py +++ b/python/ray/tests/test_placement_group_5.py @@ -4,19 +4,21 @@ from functools import reduce from itertools import chain -from click.testing import CliRunner import pytest +from click.testing import CliRunner import ray +import ray.scripts.scripts as scripts +from ray._common.network_utils import build_address from ray._common.test_utils import wait_for_condition -from ray._private.test_utils import placement_group_assert_no_leak +from ray._private.runtime_env.plugin import RuntimeEnvPlugin +from ray._private.test_utils import ( + fetch_prometheus_metrics, + placement_group_assert_no_leak, +) from ray.tests.test_placement_group import are_pairwise_unique -from ray.util.state import list_actors, list_placement_groups from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy -from ray._private.runtime_env.plugin import RuntimeEnvPlugin -from ray._private.test_utils import fetch_prometheus_metrics -from ray._common.network_utils import build_address -import ray.scripts.scripts as scripts +from ray.util.state import list_actors, list_placement_groups def test_placement_group_no_resource(ray_start_cluster): @@ -510,7 +512,7 @@ def test_remove_placement_group_with_pending_worker_lease_waiting_for_pg_resourc Specific test steps: 1. Create a placement group with only 1 bundle. 2. Create two actors using the aforementioned pg. At this point, - the latter actor lease request will definitely be pending in local task manager dispatch queue due to + the latter actor lease request will definitely be pending in local lease manager leases_to_grant queue due to unavailable pg bundle resources. 3. Remove the pg while the latter actor lease request is pending. 4. 
Verify that the pending actor lease request is cancelled and the pg @@ -549,7 +551,7 @@ def wait_for_actor2_added_to_dispatch_queue(): return False for sample in samples: if sample.labels["State"] == "Dispatched" and sample.value == 1: - # actor2 is in the local task manager dispatch queue + # actor2 is in the local lease manager leases_to_grant queue return True return False diff --git a/python/ray/tests/test_placement_group_failover.py b/python/ray/tests/test_placement_group_failover.py index f50cc99dc7c8..886189f8e1bb 100755 --- a/python/ray/tests/test_placement_group_failover.py +++ b/python/ray/tests/test_placement_group_failover.py @@ -1,12 +1,14 @@ -import pytest import sys -import ray import time -from ray._common.test_utils import wait_for_condition + +import pytest + +import ray import ray.cluster_utils +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import get_other_nodes -from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from ray.util import placement_group_table +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy MB = 1024 * 1024 diff --git a/python/ray/tests/test_placement_group_mini_integration.py b/python/ray/tests/test_placement_group_mini_integration.py index 6c6dc7f9e3e3..e89e6e988eed 100644 --- a/python/ray/tests/test_placement_group_mini_integration.py +++ b/python/ray/tests/test_placement_group_mini_integration.py @@ -1,9 +1,9 @@ -import pytest import sys import time - from random import random +import pytest + import ray import ray.cluster_utils from ray._common.test_utils import wait_for_condition diff --git a/python/ray/tests/test_plasma_unlimited.py b/python/ray/tests/test_plasma_unlimited.py index 934f2e6eaf42..d5e0186f07eb 100644 --- a/python/ray/tests/test_plasma_unlimited.py +++ b/python/ray/tests/test_plasma_unlimited.py @@ -1,21 +1,22 @@ -import numpy as np import json -import random import os +import platform +import random import shutil import 
sys -import platform -import psutil +import numpy as np import pytest import ray +from ray._common.network_utils import build_address from ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( check_spilled_mb, fetch_prometheus, ) -from ray._common.network_utils import build_address + +import psutil MB = 1024 * 1024 diff --git a/python/ray/tests/test_pydantic_serialization.py b/python/ray/tests/test_pydantic_serialization.py index 63310e3f14a9..ef81b8b2c510 100644 --- a/python/ray/tests/test_pydantic_serialization.py +++ b/python/ray/tests/test_pydantic_serialization.py @@ -1,20 +1,20 @@ -from dataclasses import dataclass import logging -from typing import Any, Dict, List, Optional, Type, Tuple import sys -from packaging import version +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type +import pydantic import pytest from fastapi import FastAPI -import pydantic +from packaging import version try: # Testing with Pydantic 2 - from pydantic import BaseModel as BaseModelV2 - from pydantic.v1 import BaseModel as BaseModelV1 - - from pydantic import ValidationError as ValidationErrorV2 - from pydantic.v1 import ValidationError as ValidationErrorV1 + from pydantic import BaseModel as BaseModelV2, ValidationError as ValidationErrorV2 + from pydantic.v1 import ( + BaseModel as BaseModelV1, + ValidationError as ValidationErrorV1, + ) BASE_MODELS = [BaseModelV1, BaseModelV2] BASE_MODEL_AND_ERRORS = [ @@ -23,16 +23,14 @@ ] except ImportError: # Testing with Pydantic 1 - from pydantic import BaseModel as BaseModelV1 - from pydantic import ValidationError as ValidationErrorV1 + from pydantic import BaseModel as BaseModelV1, ValidationError as ValidationErrorV1 BaseModelV2 = None BASE_MODELS = [BaseModelV1] BASE_MODEL_AND_ERRORS = [(BaseModelV1, ValidationErrorV1)] import ray - -from ray.tests.pydantic_module import User, app, user, closure +from ray.tests.pydantic_module import User, app, closure, user 
@pytest.fixture(scope="session") diff --git a/python/ray/tests/test_queue.py b/python/ray/tests/test_queue.py index 7f6b2fd95c24..93f0b8ebfd86 100644 --- a/python/ray/tests/test_queue.py +++ b/python/ray/tests/test_queue.py @@ -1,13 +1,13 @@ -import time import sys +import time import pytest import ray from ray._common.test_utils import wait_for_condition +from ray._private.test_utils import BatchQueue from ray.exceptions import GetTimeoutError, RayActorError from ray.util.queue import Empty, Full, Queue -from ray._private.test_utils import BatchQueue # Remote helper functions for testing concurrency diff --git a/python/ray/tests/test_ray_debugger.py b/python/ray/tests/test_ray_debugger.py index 6cf1f164e71d..41eb0ed65f01 100644 --- a/python/ray/tests/test_ray_debugger.py +++ b/python/ray/tests/test_ray_debugger.py @@ -3,17 +3,17 @@ import subprocess import sys import unittest -import pexpect -from pexpect.popen_spawn import PopenSpawn from telnetlib import Telnet from typing import Union +import pexpect import pytest +from pexpect.popen_spawn import PopenSpawn import ray +from ray._common.network_utils import parse_address from ray._common.test_utils import wait_for_condition from ray._private import ray_constants, services -from ray._common.network_utils import parse_address from ray._private.test_utils import run_string_as_driver from ray.cluster_utils import Cluster, cluster_not_supported diff --git a/python/ray/tests/test_ray_init.py b/python/ray/tests/test_ray_init.py index 1d6330bedc03..73e76cb95861 100644 --- a/python/ray/tests/test_ray_init.py +++ b/python/ray/tests/test_ray_init.py @@ -1,13 +1,12 @@ -from concurrent.futures import ThreadPoolExecutor import json import os -import sys -import unittest.mock import signal import subprocess +import sys import tempfile +import unittest.mock +from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from ray._common.network_utils import parse_address, build_address import grpc import pytest @@ 
-15,14 +14,15 @@ import ray import ray._private.services import ray._private.utils as utils +from ray._common.network_utils import build_address, parse_address +from ray._private import ray_constants +from ray._private.test_utils import external_redis_test_enabled from ray.client_builder import ClientContext from ray.cluster_utils import Cluster +from ray.runtime_env.runtime_env import RuntimeEnv from ray.util.client.common import ClientObjectRef from ray.util.client.ray_client_helpers import ray_start_client_server from ray.util.client.worker import Worker -from ray._private.test_utils import external_redis_test_enabled -from ray._private import ray_constants -from ray.runtime_env.runtime_env import RuntimeEnv @pytest.mark.skipif( diff --git a/python/ray/tests/test_ray_init_2.py b/python/ray/tests/test_ray_init_2.py index 04bc16cdfd16..9fdf2b71eda6 100644 --- a/python/ray/tests/test_ray_init_2.py +++ b/python/ray/tests/test_ray_init_2.py @@ -1,24 +1,24 @@ import logging import os +import shutil import sys -import unittest.mock import tempfile -import shutil +import unittest.mock from unittest.mock import patch import pytest import ray -from ray._common.test_utils import wait_for_condition -from ray._private.ray_constants import RAY_OVERRIDE_DASHBOARD_URL, DEFAULT_RESOURCES import ray._private.services -from ray._private.services import get_node_ip_address from ray._common.network_utils import parse_address -from ray.dashboard.utils import ray_address_to_api_server_url +from ray._common.test_utils import wait_for_condition +from ray._private.ray_constants import DEFAULT_RESOURCES, RAY_OVERRIDE_DASHBOARD_URL +from ray._private.services import get_node_ip_address from ray._private.test_utils import ( get_current_unused_port, run_string_as_driver, ) +from ray.dashboard.utils import ray_address_to_api_server_url from ray.util.client.ray_client_helpers import ray_start_client_server diff --git a/python/ray/tests/test_ray_shutdown.py 
b/python/ray/tests/test_ray_shutdown.py index a39f6f38eb17..8f2f4298db15 100644 --- a/python/ray/tests/test_ray_shutdown.py +++ b/python/ray/tests/test_ray_shutdown.py @@ -1,21 +1,21 @@ -import sys -import time -import platform +import multiprocessing import os +import platform import signal -import multiprocessing +import sys +import time import pytest -import ray - -import psutil # We must import psutil after ray because we bundle it with ray. +import ray from ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( run_string_as_driver_nonblocking, ) -from ray.util.state import get_worker, list_tasks from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +from ray.util.state import get_worker, list_tasks + +import psutil # We must import psutil after ray because we bundle it with ray. WAIT_TIMEOUT = 20 diff --git a/python/ray/tests/test_raylet_fault_tolerance.py b/python/ray/tests/test_raylet_fault_tolerance.py new file mode 100644 index 000000000000..21fcd1e84a5d --- /dev/null +++ b/python/ray/tests/test_raylet_fault_tolerance.py @@ -0,0 +1,48 @@ +import sys + +import pytest + +import ray +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy + + +@pytest.mark.parametrize("deterministic_failure", ["request", "response"]) +def test_request_worker_lease_idempotent( + monkeypatch, shutdown_only, deterministic_failure, ray_start_cluster +): + monkeypatch.setenv( + "RAY_testing_rpc_failure", + "NodeManagerService.grpc_client.RequestWorkerLease=1:" + + ("100:0" if deterministic_failure == "request" else "0:100"), + ) + + @ray.remote + def simple_task_1(): + return 0 + + @ray.remote + def simple_task_2(): + return 1 + + # Spin up a two-node cluster where we're targeting scheduling on the + # remote node via NodeAffinitySchedulingStrategy to test remote RequestWorkerLease + # calls. 
+ cluster = ray_start_cluster + remote_node = cluster.add_node(num_cpus=1) + + result_ref1 = simple_task_1.options( + scheduling_strategy=NodeAffinitySchedulingStrategy( + node_id=remote_node.node_id, soft=False + ) + ).remote() + result_ref2 = simple_task_2.options( + scheduling_strategy=NodeAffinitySchedulingStrategy( + node_id=remote_node.node_id, soft=False + ) + ).remote() + + assert ray.get([result_ref1, result_ref2]) == [0, 1] + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/tests/test_reconstruction.py b/python/ray/tests/test_reconstruction.py index f68d3c15d46c..6cf194cda0cc 100644 --- a/python/ray/tests/test_reconstruction.py +++ b/python/ray/tests/test_reconstruction.py @@ -22,6 +22,8 @@ def config(request): "health_check_period_ms": 100, "health_check_failure_threshold": 20, "object_timeout_milliseconds": 200, + # Required for reducing the retry time of RequestWorkerLease + "raylet_rpc_server_reconnect_timeout_s": 0, } yield config diff --git a/python/ray/tests/test_reconstruction_2.py b/python/ray/tests/test_reconstruction_2.py index b7c821356ace..d211aafb0ed5 100644 --- a/python/ray/tests/test_reconstruction_2.py +++ b/python/ray/tests/test_reconstruction_2.py @@ -2,15 +2,14 @@ import sys import time -import pytest import numpy as np +import pytest import ray import ray._private.ray_constants as ray_constants -from ray._private.internal_api import memory_summary -from ray._common.test_utils import Semaphore, SignalActor -from ray._common.test_utils import wait_for_condition import ray.exceptions +from ray._common.test_utils import Semaphore, SignalActor, wait_for_condition +from ray._private.internal_api import memory_summary from ray.util.state import list_tasks # Task status. 
@@ -26,6 +25,8 @@ def config(request): "health_check_period_ms": 100, "health_check_failure_threshold": 20, "object_timeout_milliseconds": 200, + # Required for reducing the retry time of RequestWorkerLease + "raylet_rpc_server_reconnect_timeout_s": 0, } yield config diff --git a/python/ray/tests/test_reconstruction_stress.py b/python/ray/tests/test_reconstruction_stress.py index 1dac4d58d9c7..e22360a9de36 100644 --- a/python/ray/tests/test_reconstruction_stress.py +++ b/python/ray/tests/test_reconstruction_stress.py @@ -16,6 +16,8 @@ def config(request): "health_check_period_ms": 100, "health_check_failure_threshold": 10, "object_timeout_milliseconds": 200, + # Required for reducing the retry time of RequestWorkerLease + "raylet_rpc_server_reconnect_timeout_s": 0, } yield config diff --git a/python/ray/tests/test_reconstruction_stress_spill.py b/python/ray/tests/test_reconstruction_stress_spill.py index 72fc6307b47c..6d1322c23c08 100644 --- a/python/ray/tests/test_reconstruction_stress_spill.py +++ b/python/ray/tests/test_reconstruction_stress_spill.py @@ -1,8 +1,8 @@ import signal import sys -import pytest import numpy as np +import pytest import ray @@ -16,6 +16,8 @@ def config(request): "health_check_period_ms": 100, "health_check_failure_threshold": 10, "object_timeout_milliseconds": 200, + # Required for reducing the retry time of RequestWorkerLease + "raylet_rpc_server_reconnect_timeout_s": 0, } yield config diff --git a/python/ray/tests/test_redis_tls.py b/python/ray/tests/test_redis_tls.py index 1f9ab273b324..c5990981530f 100644 --- a/python/ray/tests/test_redis_tls.py +++ b/python/ray/tests/test_redis_tls.py @@ -1,5 +1,7 @@ -import pytest import sys + +import pytest + import ray from ray._private.test_utils import external_redis_test_enabled diff --git a/python/ray/tests/test_reference_counting.py b/python/ray/tests/test_reference_counting.py index 5999156204d9..031c42d51e73 100644 --- a/python/ray/tests/test_reference_counting.py +++ 
b/python/ray/tests/test_reference_counting.py @@ -5,10 +5,10 @@ """ # coding: utf-8 import copy +import gc import logging import os import sys -import gc import time import numpy as np diff --git a/python/ray/tests/test_reference_counting_2.py b/python/ray/tests/test_reference_counting_2.py index 4a2b55a6804e..e3658c8d488d 100644 --- a/python/ray/tests/test_reference_counting_2.py +++ b/python/ray/tests/test_reference_counting_2.py @@ -4,9 +4,9 @@ put the test in `test_reference_counting_standalone.py`. """ # coding: utf-8 +import copy import logging import os -import copy import pickle import signal import sys @@ -17,14 +17,14 @@ import pytest import ray +import ray._private.gcs_utils as gcs_utils import ray.cluster_utils -from ray._private.internal_api import memory_summary from ray._common.test_utils import SignalActor, wait_for_condition +from ray._private.internal_api import memory_summary from ray._private.test_utils import ( put_object, wait_for_num_actors, ) -import ray._private.gcs_utils as gcs_utils SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM diff --git a/python/ray/tests/test_reference_counting_standalone.py b/python/ray/tests/test_reference_counting_standalone.py index e1ab8132dc50..cb3ced7bb9e2 100644 --- a/python/ray/tests/test_reference_counting_standalone.py +++ b/python/ray/tests/test_reference_counting_standalone.py @@ -15,11 +15,11 @@ import ray import ray.cluster_utils -from ray._private.internal_api import memory_summary -from ray._common.test_utils import SignalActor from ray._common.test_utils import ( + SignalActor, wait_for_condition, ) +from ray._private.internal_api import memory_summary logger = logging.getLogger(__name__) diff --git a/python/ray/tests/test_resource_demand_scheduler.py b/python/ray/tests/test_resource_demand_scheduler.py index 69e4fc478ac0..2c8d921bf0a2 100644 --- a/python/ray/tests/test_resource_demand_scheduler.py +++ b/python/ray/tests/test_resource_demand_scheduler.py @@ -1,6 +1,6 @@ 
import copy -import os import json +import os import shutil import sys import tempfile @@ -8,21 +8,17 @@ import unittest from dataclasses import asdict from datetime import datetime +from functools import partial from time import sleep from unittest import mock -import yaml import pytest +import yaml import ray import ray._private.ray_constants from ray._private.gcs_utils import PlacementGroupTableData from ray._private.test_utils import same_elements -from ray.autoscaler._private.node_provider_availability_tracker import ( - NodeAvailabilityRecord, - NodeAvailabilitySummary, - UnavailableNodeInformation, -) from ray.autoscaler._private.autoscaler import AutoscalerSummary from ray.autoscaler._private.commands import get_or_create_head_node from ray.autoscaler._private.constants import ( @@ -30,15 +26,20 @@ AUTOSCALER_UTILIZATION_SCORER_KEY, ) from ray.autoscaler._private.load_metrics import LoadMetrics +from ray.autoscaler._private.node_provider_availability_tracker import ( + NodeAvailabilityRecord, + NodeAvailabilitySummary, + UnavailableNodeInformation, +) from ray.autoscaler._private.providers import _NODE_PROVIDERS, _clear_provider_cache from ray.autoscaler._private.resource_demand_scheduler import ( ResourceDemandScheduler, _add_min_workers_nodes, - _resource_based_utilization_scorer, _default_utilization_scorer, + _resource_based_utilization_scorer, get_bin_pack_residual, + get_nodes_for as _get, ) -from ray.autoscaler._private.resource_demand_scheduler import get_nodes_for as _get from ray.autoscaler._private.util import ( LoadMetricsSummary, format_info_string, @@ -62,10 +63,9 @@ MockGcsClient, MockProcessRunner, MockProvider, - fill_in_raylet_ids, - mock_raylet_id, + fill_in_node_ids, + mock_node_id, ) -from functools import partial GET_DEFAULT_METHOD = "ray.autoscaler._private.util._get_default_config" @@ -1775,7 +1775,7 @@ def testResourceDemandVector(self): lm = LoadMetrics() lm.update( "1.1.1.1", - mock_raylet_id(), + mock_node_id(), {"CPU": 2}, 
{"CPU": 1}, 0, @@ -1800,7 +1800,7 @@ def testPlacementGroupLoad(self): ] lm.update( "1.1.1.1", - mock_raylet_id(), + mock_node_id(), {}, {}, DUMMY_IDLE_DURATION_S, @@ -1825,7 +1825,7 @@ def testSummary(self): ] lm.update( "1.1.1.1", - mock_raylet_id(), + mock_node_id(), { "CPU": 64, "memory": 1000 * 1024 * 1024, @@ -1840,7 +1840,7 @@ def testSummary(self): ) lm.update( "1.1.1.2", - mock_raylet_id(), + mock_node_id(), { "CPU": 64, "GPU": 8, @@ -1855,14 +1855,14 @@ def testSummary(self): ) lm.update( "1.1.1.3", - mock_raylet_id(), + mock_node_id(), {"CPU": 64, "GPU": 8, "accelerator_type:V100": 1}, {"CPU": 0, "GPU": 0, "accelerator_type:V100": 0.92}, 0, ) lm.update( "1.1.1.4", - mock_raylet_id(), + mock_node_id(), {"CPU": 2}, {"CPU": 2}, DUMMY_IDLE_DURATION_S, @@ -2077,9 +2077,9 @@ def testSummary(self): self.waitForNodes(3) for ip in self.provider.non_terminated_node_ips({}): - lm.update(ip, mock_raylet_id(), {"CPU": 2}, {"CPU": 0}, 0) + lm.update(ip, mock_node_id(), {"CPU": 2}, {"CPU": 0}, 0) - lm.update(head_ip, mock_raylet_id(), {"CPU": 16}, {"CPU": 1}, 0) + lm.update(head_ip, mock_node_id(), {"CPU": 16}, {"CPU": 1}, 0) autoscaler.update() while True: @@ -2098,7 +2098,7 @@ def testSummary(self): lm.update( head_ip, - mock_raylet_id(), + mock_node_id(), {"CPU": 16}, {"CPU": 1}, 0, @@ -2281,7 +2281,7 @@ def testPlacementGroup(self): ] lm.update( head_ip, - mock_raylet_id(), + mock_node_id(), {"CPU": 16}, {"CPU": 16}, DUMMY_IDLE_DURATION_S, @@ -2366,7 +2366,7 @@ def testScaleUpMinWorkers(self): # min workers. 
for node_id in self.provider.non_terminated_nodes({}): lm.ray_nodes_last_used_time_by_ip[self.provider.internal_ip(node_id)] = -60 - fill_in_raylet_ids(self.provider, lm) + fill_in_node_ids(self.provider, lm) autoscaler.update() self.waitForNodes(3) @@ -2415,12 +2415,12 @@ def testScaleUpIgnoreUsed(self): ) autoscaler.update() self.waitForNodes(1) - lm.update(head_ip, mock_raylet_id(), {"CPU": 4, "GPU": 1}, {}, 0) + lm.update(head_ip, mock_node_id(), {"CPU": 4, "GPU": 1}, {}, 0) self.waitForNodes(1) lm.update( head_ip, - mock_raylet_id(), + mock_node_id(), {"CPU": 4, "GPU": 1}, {"GPU": 0}, 0, @@ -2600,7 +2600,7 @@ def testScaleUpLoadMetrics(self): autoscaler.update() lm.update( "1.2.3.4", - mock_raylet_id(), + mock_node_id(), {}, {}, DUMMY_IDLE_DURATION_S, @@ -2641,7 +2641,7 @@ def testCommandPassing(self): 1, ) lm = LoadMetrics() - lm.update("172.0.0.0", mock_raylet_id(), {"CPU": 0}, {"CPU": 0}, 0) + lm.update("172.0.0.0", mock_node_id(), {"CPU": 0}, {"CPU": 0}, 0) autoscaler = MockAutoscaler( config_path, lm, @@ -2821,7 +2821,7 @@ def testUpdateConfig(self): config["available_node_types"]["m4.large"]["min_workers"] = 0 config["available_node_types"]["m4.large"]["node_config"]["field_changed"] = 1 config_path = self.write_config(config) - fill_in_raylet_ids(self.provider, lm) + fill_in_node_ids(self.provider, lm) autoscaler.update() self.waitForNodes(0, tag_filters={TAG_RAY_NODE_KIND: NODE_KIND_WORKER}) @@ -2915,7 +2915,7 @@ def testRequestResourcesIdleTimeout(self): autoscaler.provider.mock_nodes[node_id].state = "unterminatable" lm.update( node_ip, - mock_raylet_id(), + mock_node_id(), config["available_node_types"]["def_worker"]["resources"], config["available_node_types"]["def_worker"]["resources"], DUMMY_IDLE_DURATION_S, @@ -2930,7 +2930,7 @@ def testRequestResourcesIdleTimeout(self): autoscaler.load_metrics.set_resource_requests([{"CPU": 0.2, "WORKER": 1.0}]) lm.update( node_ip, - mock_raylet_id(), + mock_node_id(), 
config["available_node_types"]["def_worker"]["resources"], {}, 0, @@ -2940,7 +2940,7 @@ def testRequestResourcesIdleTimeout(self): self.waitForNodes(2, tag_filters={TAG_RAY_NODE_KIND: NODE_KIND_WORKER}) lm.update( node_ip, - mock_raylet_id(), + mock_node_id(), config["available_node_types"]["def_worker"]["resources"], config["available_node_types"]["def_worker"]["resources"], DUMMY_IDLE_DURATION_S, @@ -2954,7 +2954,7 @@ def testRequestResourcesIdleTimeout(self): assert autoscaler.provider.mock_nodes[node_id].state == "unterminatable" lm.update( "172.0.0.2", - mock_raylet_id(), + mock_node_id(), config["available_node_types"]["def_worker"]["resources"], config["available_node_types"]["def_worker"]["resources"], DUMMY_IDLE_DURATION_S, @@ -3023,7 +3023,7 @@ def testRequestResourcesRaceConditionsLong(self): autoscaler.provider.mock_nodes[node_id].state = "unterminatable" lm.update( node_ip, - mock_raylet_id(), + mock_node_id(), config["available_node_types"]["def_worker"]["resources"], config["available_node_types"]["def_worker"]["resources"], DUMMY_IDLE_DURATION_S, @@ -3041,7 +3041,7 @@ def testRequestResourcesRaceConditionsLong(self): autoscaler.load_metrics.set_resource_requests([{"CPU": 0.2, "WORKER": 1.0}] * 3) lm.update( node_ip, - mock_raylet_id(), + mock_node_id(), config["available_node_types"]["def_worker"]["resources"], {}, 0, @@ -3053,21 +3053,21 @@ def testRequestResourcesRaceConditionsLong(self): lm.update( "172.0.0.2", - mock_raylet_id(), + mock_node_id(), config["available_node_types"]["def_worker"]["resources"], config["available_node_types"]["def_worker"]["resources"], DUMMY_IDLE_DURATION_S, ) lm.update( "172.0.0.3", - mock_raylet_id(), + mock_node_id(), config["available_node_types"]["def_worker"]["resources"], config["available_node_types"]["def_worker"]["resources"], DUMMY_IDLE_DURATION_S, ) lm.update( node_ip, - mock_raylet_id(), + mock_node_id(), config["available_node_types"]["def_worker"]["resources"], {}, 0, @@ -3174,7 +3174,7 @@ def 
testRequestResourcesRaceConditionWithResourceDemands(self): ) lm.update( "127.0.0.0", - mock_raylet_id(), + mock_node_id(), {"CPU": 2, "GPU": 1}, {"CPU": 2}, 0, @@ -3186,7 +3186,7 @@ def testRequestResourcesRaceConditionWithResourceDemands(self): self.waitForNodes(2) lm.update( "127.0.0.0", - mock_raylet_id(), + mock_node_id(), {"CPU": 2, "GPU": 1}, {"CPU": 2}, 0, @@ -3284,9 +3284,9 @@ def test_info_string(): 2.00GiB/8.00GiB memory 3.14GiB/16.00GiB object_store_memory -Total Constraints: +From request_resources: {'CPU': 16}: 100 from request_resources() -Total Demands: +Pending Demands: {'CPU': 1}: 150+ pending tasks/actors {'CPU': 4} * 5 (PACK): 420+ pending placement groups """.strip() @@ -3341,10 +3341,10 @@ def test_info_string_multiple_constraints(): 2.00GiB/8.00GiB memory 3.14GiB/16.00GiB object_store_memory -Total Constraints: +From request_resources: {'CPU': 16}: 100 from request_resources() {'CPU': 1, 'GPU': 16}: 10 from request_resources() -Total Demands: +Pending Demands: {'CPU': 1}: 150+ pending tasks/actors {'CPU': 4} * 5 (PACK): 420+ pending placement groups """.strip() @@ -3433,9 +3433,9 @@ def test_info_string_verbose(): 2.00GiB/8.00GiB memory 3.14GiB/16.00GiB object_store_memory -Total Constraints: +From request_resources: {'CPU': 16}: 100 from request_resources() -Total Demands: +Pending Demands: {'CPU': 1}: 150+ pending tasks/actors {'CPU': 4} * 5 (PACK): 420+ pending placement groups @@ -3548,9 +3548,9 @@ def test_info_string_verbose_node_types(): 2.00GiB/8.00GiB memory 3.14GiB/16.00GiB object_store_memory -Total Constraints: +From request_resources: {'CPU': 16}: 100 from request_resources() -Total Demands: +Pending Demands: {'CPU': 1}: 150+ pending tasks/actors {'CPU': 4} * 5 (PACK): 420+ pending placement groups @@ -3640,9 +3640,9 @@ def test_info_string_verbose_no_breakdown(): 2.00GiB/8.00GiB memory 3.14GiB/16.00GiB object_store_memory -Total Constraints: +From request_resources: {'CPU': 16}: 100 from request_resources() -Total Demands: 
+Pending Demands: {'CPU': 1}: 150+ pending tasks/actors {'CPU': 4} * 5 (PACK): 420+ pending placement groups """.strip() @@ -3735,9 +3735,9 @@ def test_info_string_with_launch_failures(): 2.00GiB/8.00GiB memory 3.14GiB/16.00GiB object_store_memory -Total Constraints: +From request_resources: {'CPU': 16}: 100 from request_resources() -Total Demands: +Pending Demands: {'CPU': 1}: 150+ pending tasks/actors {'CPU': 4} * 5 (PACK): 420+ pending placement groups """.strip() @@ -3828,9 +3828,9 @@ def test_info_string_with_launch_failures_verbose(): 2.00GiB/8.00GiB memory 3.14GiB/16.00GiB object_store_memory -Total Constraints: +From request_resources: {'CPU': 16}: 100 from request_resources() -Total Demands: +Pending Demands: {'CPU': 1}: 150+ pending tasks/actors {'CPU': 4} * 5 (PACK): 420+ pending placement groups """.strip() @@ -3917,9 +3917,9 @@ def test_info_string_failed_node_cap(): 2.00GiB/8.00GiB memory 3.14GiB/16.00GiB object_store_memory -Total Constraints: +From request_resources: {'CPU': 16}: 100 from request_resources() -Total Demands: +Pending Demands: {'CPU': 2.0}: 153+ pending tasks/actors (3+ using placement groups) {'GPU': 0.5}: 100+ pending tasks/actors (100+ using placement groups) {'CPU': 4} * 5 (PACK): 420+ pending placement groups diff --git a/python/ray/tests/test_resource_isolation_config.py b/python/ray/tests/test_resource_isolation_config.py index 0f7c49c6abae..f08b4d694b8d 100644 --- a/python/ray/tests/test_resource_isolation_config.py +++ b/python/ray/tests/test_resource_isolation_config.py @@ -1,6 +1,7 @@ -import pytest import sys +import pytest + from ray._private import utils from ray._private.resource_isolation_config import ResourceIsolationConfig diff --git a/python/ray/tests/test_resource_metrics.py b/python/ray/tests/test_resource_metrics.py index 8e5a003cc176..a3e7377229a5 100644 --- a/python/ray/tests/test_resource_metrics.py +++ b/python/ray/tests/test_resource_metrics.py @@ -4,14 +4,12 @@ import pytest import ray - +from 
ray._common.network_utils import build_address from ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( fetch_prometheus_metrics, run_string_as_driver_nonblocking, ) -from ray._common.network_utils import build_address - METRIC_CONFIG = { "_system_config": { diff --git a/python/ray/tests/test_response_cache.py b/python/ray/tests/test_response_cache.py index 21c135af8e48..1a1e9d11f6f2 100644 --- a/python/ray/tests/test_response_cache.py +++ b/python/ray/tests/test_response_cache.py @@ -5,10 +5,10 @@ import pytest from ray.util.client.common import ( - _id_is_newer, - ResponseCache, - OrderedResponseCache, INT32_MAX, + OrderedResponseCache, + ResponseCache, + _id_is_newer, ) diff --git a/python/ray/tests/test_runtime_context.py b/python/ray/tests/test_runtime_context.py index 0d44d1925784..7e358798874e 100644 --- a/python/ray/tests/test_runtime_context.py +++ b/python/ray/tests/test_runtime_context.py @@ -1,15 +1,15 @@ import os import signal -import time import sys +import time import warnings import pytest import ray +from ray._common.test_utils import wait_for_condition from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from ray.util.state import list_tasks -from ray._common.test_utils import wait_for_condition @pytest.mark.skipif(sys.platform == "win32", reason="Fails on windows") diff --git a/python/ray/tests/test_runtime_env_agent.py b/python/ray/tests/test_runtime_env_agent.py index a3f9c6b27a57..bbd877a417ff 100644 --- a/python/ray/tests/test_runtime_env_agent.py +++ b/python/ray/tests/test_runtime_env_agent.py @@ -1,20 +1,22 @@ -import sys -import pytest import logging import os +import sys import time from typing import List, Tuple +import pytest + import ray from ray._common.test_utils import wait_for_condition -from ray._private.runtime_env.agent.runtime_env_agent import UriType, ReferenceTable from ray._private import ray_constants +from ray._private.runtime_env.agent.runtime_env_agent import 
ReferenceTable, UriType from ray._private.test_utils import ( get_error_message, init_error_pubsub, ) from ray.core.generated import common_pb2 from ray.runtime_env import RuntimeEnv + import psutil logger = logging.getLogger(__name__) diff --git a/python/ray/tests/test_runtime_env_complicated.py b/python/ray/tests/test_runtime_env_complicated.py index 2113dbeb9165..4754ef102a71 100644 --- a/python/ray/tests/test_runtime_env_complicated.py +++ b/python/ray/tests/test_runtime_env_complicated.py @@ -4,38 +4,37 @@ import sys import tempfile import time -from ray._common.test_utils import wait_for_condition -import yaml from pathlib import Path from typing import List from unittest import mock import pytest +import yaml -from ray._common.utils import try_to_create_directory import ray -from ray.runtime_env import RuntimeEnv +from ray._common.test_utils import wait_for_condition +from ray._common.utils import try_to_create_directory from ray._private.runtime_env.conda import ( - inject_dependencies, + _current_py_version, _inject_ray_to_conda_site, _resolve_install_from_source_ray_dependencies, - _current_py_version, + inject_dependencies, ) - from ray._private.runtime_env.conda_utils import ( get_conda_env_list, - get_conda_info_json, get_conda_envs, + get_conda_info_json, ) from ray._private.test_utils import ( + chdir, run_string_as_driver, run_string_as_driver_nonblocking, - chdir, ) from ray._private.utils import ( - get_conda_env_dir, get_conda_bin_executable, + get_conda_env_dir, ) +from ray.runtime_env import RuntimeEnv if not os.environ.get("CI"): # This flags turns on the local development that link against current ray diff --git a/python/ray/tests/test_runtime_env_conda_and_pip.py b/python/ray/tests/test_runtime_env_conda_and_pip.py index 027a582cf833..f656a0a5366a 100644 --- a/python/ray/tests/test_runtime_env_conda_and_pip.py +++ b/python/ray/tests/test_runtime_env_conda_and_pip.py @@ -1,29 +1,29 @@ import os -import pytest -import sys import platform 
+import subprocess +import sys +import tempfile +from pathlib import Path + +import pytest +import yaml + +import ray from ray._common.test_utils import wait_for_condition -from ray._private.test_utils import ( - chdir, - check_local_files_gced, - generate_runtime_env_dict, -) from ray._private.runtime_env import dependency_utils from ray._private.runtime_env.conda import _get_conda_dict_with_ray_inserted from ray._private.runtime_env.dependency_utils import ( INTERNAL_PIP_FILENAME, MAX_INTERNAL_PIP_FILENAME_TRIES, ) +from ray._private.test_utils import ( + chdir, + check_local_files_gced, + generate_runtime_env_dict, +) from ray.runtime_env import RuntimeEnv from ray.util.state import list_tasks -import yaml -import tempfile -from pathlib import Path -import subprocess - -import ray - if not os.environ.get("CI"): # This flags turns on the local development that link against current ray # packages and fall back all the dependencies to current python's site. diff --git a/python/ray/tests/test_runtime_env_conda_and_pip_2.py b/python/ray/tests/test_runtime_env_conda_and_pip_2.py index 0674a3a65a8d..1c22681d7eb9 100644 --- a/python/ray/tests/test_runtime_env_conda_and_pip_2.py +++ b/python/ray/tests/test_runtime_env_conda_and_pip_2.py @@ -1,11 +1,12 @@ import os -import pytest import sys from unittest import mock +import pytest + import ray -from ray.exceptions import RuntimeEnvSetupError from ray._private.test_utils import generate_runtime_env_dict +from ray.exceptions import RuntimeEnvSetupError if not os.environ.get("CI"): # This flags turns on the local development that link against current ray diff --git a/python/ray/tests/test_runtime_env_conda_and_pip_3.py b/python/ray/tests/test_runtime_env_conda_and_pip_3.py index 9dee12f6bacb..d330072e3984 100644 --- a/python/ray/tests/test_runtime_env_conda_and_pip_3.py +++ b/python/ray/tests/test_runtime_env_conda_and_pip_3.py @@ -1,14 +1,14 @@ import os -import pytest import sys +import pytest + +import ray from 
ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( check_local_files_gced, generate_runtime_env_dict, ) -import ray - if not os.environ.get("CI"): # This flags turns on the local development that link against current ray diff --git a/python/ray/tests/test_runtime_env_conda_and_pip_4.py b/python/ray/tests/test_runtime_env_conda_and_pip_4.py index 69a446cf9de1..08cb20ac56f2 100644 --- a/python/ray/tests/test_runtime_env_conda_and_pip_4.py +++ b/python/ray/tests/test_runtime_env_conda_and_pip_4.py @@ -1,10 +1,10 @@ import os -import pytest import sys -from ray._private.runtime_env import virtualenv_utils -import ray +import pytest +import ray +from ray._private.runtime_env import virtualenv_utils if not os.environ.get("CI"): # This flags turns on the local development that link against current ray diff --git a/python/ray/tests/test_runtime_env_conda_and_pip_5.py b/python/ray/tests/test_runtime_env_conda_and_pip_5.py index f5d23143c4fb..889cf9922f8d 100644 --- a/python/ray/tests/test_runtime_env_conda_and_pip_5.py +++ b/python/ray/tests/test_runtime_env_conda_and_pip_5.py @@ -1,4 +1,5 @@ import sys + import pytest from packaging.version import parse diff --git a/python/ray/tests/test_runtime_env_container.py b/python/ray/tests/test_runtime_env_container.py index 933a88e2db6a..7b1ad02bbeb1 100644 --- a/python/ray/tests/test_runtime_env_container.py +++ b/python/ray/tests/test_runtime_env_container.py @@ -5,8 +5,7 @@ import ray from ray.tests.conftest import * # noqa from ray.tests.conftest_docker import * # noqa -from ray.tests.conftest_docker import run_in_container, NESTED_IMAGE_NAME - +from ray.tests.conftest_docker import NESTED_IMAGE_NAME, run_in_container # NOTE(zcin): The actual test code are in python scripts under # python/ray/tests/runtime_env_container. 
The scripts are copied over to diff --git a/python/ray/tests/test_runtime_env_failure.py b/python/ray/tests/test_runtime_env_failure.py index 7a1f904385ea..3df849d7a802 100644 --- a/python/ray/tests/test_runtime_env_failure.py +++ b/python/ray/tests/test_runtime_env_failure.py @@ -3,12 +3,13 @@ from unittest import mock import pytest + +import ray from ray._private.ray_constants import RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_DEFAULT from ray._private.runtime_env.packaging import ( RAY_RUNTIME_ENV_FAIL_DOWNLOAD_FOR_TESTING_ENV_VAR, RAY_RUNTIME_ENV_FAIL_UPLOAD_FOR_TESTING_ENV_VAR, ) -import ray from ray.exceptions import RuntimeEnvSetupError diff --git a/python/ray/tests/test_runtime_env_fork_process.py b/python/ray/tests/test_runtime_env_fork_process.py index e7e31b2768c2..35462c7fe70c 100644 --- a/python/ray/tests/test_runtime_env_fork_process.py +++ b/python/ray/tests/test_runtime_env_fork_process.py @@ -1,7 +1,7 @@ # coding: utf-8 +import json import os import sys -import json import pytest diff --git a/python/ray/tests/test_runtime_env_get_wheel_names.py b/python/ray/tests/test_runtime_env_get_wheel_names.py index e124bc9ae676..fbbc735c461c 100644 --- a/python/ray/tests/test_runtime_env_get_wheel_names.py +++ b/python/ray/tests/test_runtime_env_get_wheel_names.py @@ -3,14 +3,13 @@ import pytest import requests +import ray._private.ray_constants as ray_constants from ray._private.utils import ( get_master_wheel_url, get_release_wheel_url, get_wheel_filename, ) -import ray._private.ray_constants as ray_constants - def test_get_wheel_filename(): """Test the code that generates the filenames of the `latest` wheels.""" diff --git a/python/ray/tests/test_runtime_env_packaging.py b/python/ray/tests/test_runtime_env_packaging.py index 21f226b94965..8c015f8bf76e 100644 --- a/python/ray/tests/test_runtime_env_packaging.py +++ b/python/ray/tests/test_runtime_env_packaging.py @@ -6,14 +6,14 @@ import sys import tempfile import uuid +import zipfile from filecmp import dircmp 
from pathlib import Path from shutil import copytree, make_archive, rmtree -import zipfile -import ray import pytest +import ray from ray._private.ray_constants import ( KV_NAMESPACE_PACKAGE, RAY_RUNTIME_ENV_IGNORE_GITIGNORE, @@ -24,12 +24,13 @@ Protocol, _dir_travel, _get_excludes, + _get_gitignore, _store_package_in_gcs, download_and_unpack_package, get_local_dir_from_uri, get_top_level_dir_from_compressed_package, - get_uri_for_file, get_uri_for_directory, + get_uri_for_file, get_uri_for_package, is_whl_uri, is_zip_uri, @@ -37,7 +38,6 @@ remove_dir_from_filepaths, unzip_package, upload_package_if_needed, - _get_gitignore, upload_package_to_gcs, ) from ray.experimental.internal_kv import ( @@ -491,6 +491,11 @@ class TestParseUri: ("https://test.com/file.zip", Protocol.HTTPS, "https_test_com_file.zip"), ("gs://bucket/file.zip", Protocol.GS, "gs_bucket_file.zip"), ("azure://container/file.zip", Protocol.AZURE, "azure_container_file.zip"), + ( + "abfss://container@account.dfs.core.windows.net/file.zip", + Protocol.ABFSS, + "abfss_container_account_dfs_core_windows_net_file.zip", + ), ( "https://test.com/package-0.0.1-py2.py3-none-any.whl?param=value", Protocol.HTTPS, @@ -553,6 +558,11 @@ def test_parse_private_git_https_uris(self, parsing_tuple): Protocol.AZURE, "azure_fake_2022-10-21T13_11_35_00_00_package.zip", ), + ( + "abfss://container@account.dfs.core.windows.net/2022-10-21T13:11:35+00:00/package.zip", + Protocol.ABFSS, + "abfss_container_account_dfs_core_windows_net_2022-10-21T13_11_35_00_00_package.zip", + ), ( "file:///fake/2022-10-21T13:11:35+00:00/package.zip", Protocol.FILE, @@ -594,6 +604,11 @@ def test_parse_uris_with_disallowed_chars(self, parsing_tuple): Protocol.AZURE, "package.whl", ), + ( + "abfss://container@account.dfs.core.windows.net/2022-10-21T13:11:35+00:00/package.whl", + Protocol.ABFSS, + "package.whl", + ), ( "file:///fake/2022-10-21T13:11:35+00:00/package.whl", Protocol.FILE, @@ -618,6 +633,142 @@ def test_parse_gcs_uri(self, gcs_uri): 
assert package_name == gcs_uri.split("/")[-1] +class TestAbfssProtocol: + """Test ABFSS protocol implementation.""" + + def test_abfss_protocol_handler_with_invalid_uris(self, tmp_path): + """Test that ABFSS protocol handler raises ValueError for invalid URIs.""" + import unittest.mock as mock + + invalid_uris = [ + "abfss://@account.dfs.core.windows.net/file.zip", # Empty container name + "abfss://container@.dfs.core.windows.net/file.zip", # Empty account name + "abfss://container@account.blob.core.windows.net/file.zip", # Wrong endpoint + "abfss://container@account.core.windows.net/file.zip", # Missing .dfs + "abfss://account.dfs.core.windows.net/file.zip", # Missing container@ + "abfss://container", # Missing @ and hostname + "abfss://", # Empty netloc + ] + + dest_file = tmp_path / "test_download.zip" + + # Mock adlfs and azure.identity modules in sys.modules to avoid import errors in CI + import sys + + mock_adlfs_module = mock.MagicMock() + mock_azure_identity_module = mock.MagicMock() + + with mock.patch.dict( + sys.modules, + { + "adlfs": mock_adlfs_module, + "azure": mock.MagicMock(), + "azure.identity": mock_azure_identity_module, + }, + ): + # Setup the mocks (though they won't be called due to validation failures) + mock_filesystem = mock.Mock() + mock_adlfs_module.AzureBlobFileSystem.return_value = mock_filesystem + mock_filesystem.open.return_value = mock.Mock() + + for invalid_uri in invalid_uris: + with pytest.raises(ValueError, match="Invalid ABFSS URI format"): + Protocol.ABFSS.download_remote_uri(invalid_uri, str(dest_file)) + + +class TestS3Protocol: + """Test S3 protocol implementation with public bucket fallback.""" + + def test_s3_client_creation_with_credentials(self): + """Test S3 client creation when credentials are available.""" + import sys + import unittest.mock as mock + + # Mock boto3 and smart_open modules + mock_boto3 = mock.MagicMock() + mock_smart_open = mock.MagicMock() + + # Setup successful credential scenario + mock_session = 
mock.MagicMock() + mock_s3_client = mock.MagicMock() + mock_credentials = mock.MagicMock() # Non-None credentials + + mock_boto3.Session.return_value = mock_session + mock_session.get_credentials.return_value = mock_credentials + mock_session.client.return_value = mock_s3_client + + with mock.patch.dict( + sys.modules, + { + "boto3": mock_boto3, + "smart_open": mock_smart_open, + }, + ): + mock_smart_open.open = mock.MagicMock() + + from ray._private.runtime_env.protocol import ProtocolsProvider + + open_file, transport_params = ProtocolsProvider._handle_s3_protocol() + + # Verify that Session was created and get_credentials was called + mock_boto3.Session.assert_called_once() + mock_session.get_credentials.assert_called_once() + # Verify that session.client was called to create signed S3 client + mock_session.client.assert_called_with("s3") + # Verify that the signed client is returned + assert transport_params["client"] == mock_s3_client + + def test_s3_client_creation_without_credentials(self): + """Test S3 client creation falls back to unsigned when no credentials.""" + import sys + import unittest.mock as mock + + # Mock boto3 and botocore modules + mock_boto3 = mock.MagicMock() + mock_botocore = mock.MagicMock() + mock_smart_open = mock.MagicMock() + + # Setup no credentials scenario + mock_session = mock.MagicMock() + mock_unsigned_client = mock.MagicMock() + + mock_boto3.Session.return_value = mock_session + mock_session.get_credentials.return_value = None # No credentials found + mock_boto3.client.return_value = mock_unsigned_client + + # Mock Config and UNSIGNED + mock_config_class = mock.MagicMock() + mock_config = mock.MagicMock() + mock_config_class.return_value = mock_config + mock_botocore.config.Config = mock_config_class + mock_botocore.UNSIGNED = "UNSIGNED" + + with mock.patch.dict( + sys.modules, + { + "boto3": mock_boto3, + "botocore": mock_botocore, + "botocore.config": mock_botocore.config, + "smart_open": mock_smart_open, + }, + ): + 
mock_smart_open.open = mock.MagicMock() + + from ray._private.runtime_env.protocol import ProtocolsProvider + + open_file, transport_params = ProtocolsProvider._handle_s3_protocol() + + # Verify that Session was created and get_credentials was called + mock_boto3.Session.assert_called_once() + mock_session.get_credentials.assert_called_once() + # Verify that boto3.client was called for unsigned client with config + mock_boto3.client.assert_called_with("s3", config=mock_config) + # Verify Config was created with UNSIGNED signature + mock_config_class.assert_called_with(signature_version="UNSIGNED") + # Verify that the unsigned client is returned + assert transport_params["client"] == mock_unsigned_client + + @pytest.mark.asyncio class TestDownloadAndUnpackPackage: async def test_download_and_unpack_package_with_gcs_uri_without_gcs_client( @@ -698,8 +849,8 @@ async def test_download_and_unpack_package_with_file_uri(self): # Add a file to the zip file so we can verify the file was extracted. zip.writestr("file.txt", "Hello, world!") - from urllib.request import pathname2url from urllib.parse import urljoin + from urllib.request import pathname2url # in windows, file_path = ///C:/Users/... # in linux, file_path = /tmp/... 
diff --git a/python/ray/tests/test_runtime_env_profiler.py b/python/ray/tests/test_runtime_env_profiler.py index ce3e12735e22..39fa8514677f 100644 --- a/python/ray/tests/test_runtime_env_profiler.py +++ b/python/ray/tests/test_runtime_env_profiler.py @@ -1,13 +1,14 @@ -import os import glob +import os +import subprocess import sys from pathlib import Path + import pytest -import subprocess import ray -from ray._private.runtime_env.nsight import parse_nsight_config from ray._common.test_utils import wait_for_condition +from ray._private.runtime_env.nsight import parse_nsight_config from ray.exceptions import RuntimeEnvSetupError diff --git a/python/ray/tests/test_runtime_env_py_executable.py b/python/ray/tests/test_runtime_env_py_executable.py index b9aef67b5999..daf9445e404d 100644 --- a/python/ray/tests/test_runtime_env_py_executable.py +++ b/python/ray/tests/test_runtime_env_py_executable.py @@ -1,9 +1,10 @@ import os -import pytest import sys import tempfile from pathlib import Path +import pytest + import ray diff --git a/python/ray/tests/test_runtime_env_ray_minimal.py b/python/ray/tests/test_runtime_env_ray_minimal.py index d524ecee30a9..64687d87d5ad 100644 --- a/python/ray/tests/test_runtime_env_ray_minimal.py +++ b/python/ray/tests/test_runtime_env_ray_minimal.py @@ -11,6 +11,7 @@ import os import sys + import pytest import ray diff --git a/python/ray/tests/test_runtime_env_setup_func.py b/python/ray/tests/test_runtime_env_setup_func.py index b66e13a47973..218478ee11c8 100644 --- a/python/ray/tests/test_runtime_env_setup_func.py +++ b/python/ray/tests/test_runtime_env_setup_func.py @@ -1,16 +1,16 @@ -import threading +import logging import os +import platform import sys -import logging import tempfile -import platform +import threading import pytest import ray from ray._common.test_utils import wait_for_condition -from ray.job_submission import JobSubmissionClient, JobStatus from ray._private.test_utils import format_web_url +from ray.job_submission import 
JobStatus, JobSubmissionClient def _hook(): diff --git a/python/ray/tests/test_runtime_env_standalone.py b/python/ray/tests/test_runtime_env_standalone.py index 4cc1dee743c8..41ba317888d6 100644 --- a/python/ray/tests/test_runtime_env_standalone.py +++ b/python/ray/tests/test_runtime_env_standalone.py @@ -22,8 +22,8 @@ get_log_sources, ) from ray.exceptions import RuntimeEnvSetupError -from ray.runtime_env import RuntimeEnv from ray.job_submission import JobStatus, JobSubmissionClient +from ray.runtime_env import RuntimeEnv @pytest.mark.skipif(sys.platform == "win32", reason="Flaky on Windows.") diff --git a/python/ray/tests/test_runtime_env_strong_type.py b/python/ray/tests/test_runtime_env_strong_type.py index 4d1f77bf40da..bc61df4e1988 100644 --- a/python/ray/tests/test_runtime_env_strong_type.py +++ b/python/ray/tests/test_runtime_env_strong_type.py @@ -1,11 +1,12 @@ import sys +from dataclasses import dataclass +from typing import List + import pytest -import ray -from typing import List +import ray from ray.runtime_env import RuntimeEnv from ray.runtime_env.types.pip import Pip -from dataclasses import dataclass @dataclass diff --git a/python/ray/tests/test_runtime_env_uv.py b/python/ray/tests/test_runtime_env_uv.py index 7b3e083a4ea4..74fec59f029e 100644 --- a/python/ray/tests/test_runtime_env_uv.py +++ b/python/ray/tests/test_runtime_env_uv.py @@ -3,13 +3,14 @@ # 2. Options for `uv install`. 
import os -import pytest import sys import tempfile from pathlib import Path -from ray._private.runtime_env import virtualenv_utils +import pytest + import ray +from ray._private.runtime_env import virtualenv_utils @pytest.fixture(scope="function") diff --git a/python/ray/tests/test_runtime_env_uv_run.py b/python/ray/tests/test_runtime_env_uv_run.py index 138ff87e81bf..401d822ad058 100644 --- a/python/ray/tests/test_runtime_env_uv_run.py +++ b/python/ray/tests/test_runtime_env_uv_run.py @@ -1,15 +1,12 @@ import json import os -from pathlib import Path -import platform -import stat import subprocess import sys -import tarfile import tempfile -from urllib import request +from pathlib import Path import pytest +from uv import find_uv_bin import ray from ray._private.test_utils import ( @@ -17,22 +14,6 @@ wait_until_server_available, ) - -@pytest.fixture(scope="function") -def with_uv(): - arch = "aarch64" if platform.machine() in ["aarch64", "arm64"] else "x86_64" - system = "unknown-linux-gnu" if platform.system() == "Linux" else "apple-darwin" - name = f"uv-{arch}-{system}" - url = f"https://github.com/astral-sh/uv/releases/download/0.5.27/{name}.tar.gz" - with tempfile.TemporaryDirectory() as tmp_dir: - with request.urlopen(request.Request(url), timeout=15.0) as response: - with tarfile.open(fileobj=response, mode="r|*") as tar: - tar.extractall(tmp_dir) - uv = Path(tmp_dir) / name / "uv" - uv.chmod(uv.stat().st_mode | stat.S_IEXEC) - yield uv - - PYPROJECT_TOML = """ [project] name = "test" @@ -58,11 +39,9 @@ def tmp_working_dir(): @pytest.mark.skipif(sys.platform == "win32", reason="Not ported to Windows yet.") -def test_uv_run_simple(shutdown_only, with_uv): - uv = with_uv - +def test_uv_run_simple(shutdown_only): runtime_env = { - "py_executable": f"{uv} run --with emoji --no-project", + "py_executable": f"{find_uv_bin()} run --with emoji --no-project", } ray.init(runtime_env=runtime_env) @@ -76,15 +55,14 @@ def emojize(): @pytest.mark.skipif(sys.platform == 
"win32", reason="Not ported to Windows yet.") -def test_uv_run_pyproject(shutdown_only, with_uv, tmp_working_dir): - uv = with_uv +def test_uv_run_pyproject(shutdown_only, tmp_working_dir): tmp_dir = tmp_working_dir ray.init( runtime_env={ "working_dir": tmp_dir, # We want to run in the system environment so the current installation of Ray can be found here - "py_executable": f"env PYTHONPATH={':'.join(sys.path)} {uv} run --python-preference=only-system", + "py_executable": f"env PYTHONPATH={':'.join(sys.path)} {find_uv_bin()} run --python-preference=only-system", } ) @@ -98,8 +76,7 @@ def emojize(): @pytest.mark.skipif(sys.platform == "win32", reason="Not ported to Windows yet.") -def test_uv_run_editable(shutdown_only, with_uv, tmp_working_dir): - uv = with_uv +def test_uv_run_editable(shutdown_only, tmp_working_dir): tmp_dir = tmp_working_dir subprocess.run( @@ -113,7 +90,7 @@ def test_uv_run_editable(shutdown_only, with_uv, tmp_working_dir): ) subprocess.run( - [uv, "add", "--editable", "./emoji_copy"], + [find_uv_bin(), "add", "--editable", "./emoji_copy"], cwd=tmp_dir, ) @@ -133,7 +110,7 @@ def test_uv_run_editable(shutdown_only, with_uv, tmp_working_dir): runtime_env={ "working_dir": tmp_dir, # We want to run in the system environment so the current installation of Ray can be found here - "py_executable": f"env PYTHONPATH={':'.join(sys.path)} {uv} run --python-preference=only-system", + "py_executable": f"env PYTHONPATH={':'.join(sys.path)} {find_uv_bin()} run --python-preference=only-system", } ) @@ -147,12 +124,10 @@ def emojize(): @pytest.mark.skipif(sys.platform == "win32", reason="Not ported to Windows yet.") -def test_uv_run_runtime_env_hook(with_uv): +def test_uv_run_runtime_env_hook(): import ray._private.runtime_env.uv_runtime_env_hook - uv = with_uv - def check_uv_run( cmd, runtime_env, expected_output, subprocess_kwargs=None, expected_error=None ): @@ -171,25 +146,28 @@ def check_uv_run( script = 
ray._private.runtime_env.uv_runtime_env_hook.__file__ check_uv_run( - cmd=[uv, "run", "--no-project", script], + cmd=[find_uv_bin(), "run", "--no-project", script], runtime_env={}, expected_output={ - "py_executable": f"{uv} run --no-project", + "py_executable": f"{find_uv_bin()} run --no-project", "working_dir": os.getcwd(), }, ) check_uv_run( - cmd=[uv, "run", "--no-project", "--directory", "/tmp", script], + cmd=[find_uv_bin(), "run", "--no-project", "--directory", "/tmp", script], runtime_env={}, expected_output={ - "py_executable": f"{uv} run --no-project", + "py_executable": f"{find_uv_bin()} run --no-project", "working_dir": os.path.realpath("/tmp"), }, ) check_uv_run( - [uv, "run", "--no-project", script], + [find_uv_bin(), "run", "--no-project", script], {"working_dir": "/some/path"}, - {"py_executable": f"{uv} run --no-project", "working_dir": "/some/path"}, + { + "py_executable": f"{find_uv_bin()} run --no-project", + "working_dir": "/some/path", + }, ) with tempfile.TemporaryDirectory() as tmp_dir: @@ -200,9 +178,12 @@ def check_uv_run( file.write('version = "0.1"\n') file.write('dependencies = ["psutil"]\n') check_uv_run( - cmd=[uv, "run", script], + cmd=[find_uv_bin(), "run", script], runtime_env={}, - expected_output={"py_executable": f"{uv} run", "working_dir": f"{tmp_dir}"}, + expected_output={ + "py_executable": f"{find_uv_bin()} run", + "working_dir": f"{tmp_dir}", + }, subprocess_kwargs={"cwd": tmp_dir}, ) @@ -213,10 +194,10 @@ def check_uv_run( with open(requirements, "w") as file: file.write("psutil\n") check_uv_run( - cmd=[uv, "run", "--with-requirements", requirements, script], + cmd=[find_uv_bin(), "run", "--with-requirements", requirements, script], runtime_env={}, expected_output={ - "py_executable": f"{uv} run --with-requirements {requirements}", + "py_executable": f"{find_uv_bin()} run --with-requirements {requirements}", "working_dir": f"{tmp_dir}", }, subprocess_kwargs={"cwd": tmp_dir}, @@ -232,7 +213,7 @@ def check_uv_run( 
file.write('version = "0.1"\n') file.write('dependencies = ["psutil"]\n') check_uv_run( - cmd=[uv, "run", script], + cmd=[find_uv_bin(), "run", script], runtime_env={}, expected_output=None, subprocess_kwargs={"cwd": tmp_dir / "cwd"}, @@ -247,7 +228,7 @@ def check_uv_run( file.write("psutil\n") check_uv_run( cmd=[ - uv, + find_uv_bin(), "run", "--with-requirements", tmp_dir / "requirements.txt", @@ -263,7 +244,7 @@ def check_uv_run( # when combined with the 'pip' or 'uv' environment. for runtime_env in [{"uv": ["emoji"]}, {"pip": ["emoji"]}]: check_uv_run( - cmd=[uv, "run", "--no-project", script], + cmd=[find_uv_bin(), "run", "--no-project", script], runtime_env=runtime_env, expected_output=None, expected_error="You are using the 'pip' or 'uv' runtime environments together with 'uv run'.", @@ -275,10 +256,10 @@ def check_uv_run( # Check in the case that there is one more level of subprocess indirection between # the "uv run" process and the process that checks the environment check_uv_run( - cmd=[uv, "run", "--no-project", script], + cmd=[find_uv_bin(), "run", "--no-project", script], runtime_env={}, expected_output={ - "py_executable": f"{uv} run --no-project", + "py_executable": f"{find_uv_bin()} run --no-project", "working_dir": os.getcwd(), }, subprocess_kwargs={ @@ -288,10 +269,10 @@ def check_uv_run( # Check in the case that the script is started with multiprocessing spawn check_uv_run( - cmd=[uv, "run", "--no-project", script], + cmd=[find_uv_bin(), "run", "--no-project", script], runtime_env={}, expected_output={ - "py_executable": f"{uv} run --no-project", + "py_executable": f"{find_uv_bin()} run --no-project", "working_dir": os.getcwd(), }, subprocess_kwargs={ @@ -302,7 +283,7 @@ def check_uv_run( # Check in the case that a module is used for "uv run" (-m or --module) check_uv_run( cmd=[ - uv, + find_uv_bin(), "run", "--no-project", "-m", @@ -310,7 +291,7 @@ def check_uv_run( ], runtime_env={}, expected_output={ - "py_executable": f"{uv} run 
--no-project", + "py_executable": f"{find_uv_bin()} run --no-project", "working_dir": os.getcwd(), }, ) @@ -319,7 +300,7 @@ def check_uv_run( # an argument immediately behind it check_uv_run( cmd=[ - uv, + find_uv_bin(), "run", "--no-project", "-m", @@ -328,7 +309,7 @@ def check_uv_run( ], runtime_env={}, expected_output={ - "py_executable": f"{uv} run --no-project", + "py_executable": f"{find_uv_bin()} run --no-project", "working_dir": os.getcwd(), }, ) @@ -385,10 +366,9 @@ def test_uv_run_parser(): @pytest.mark.skipif(sys.platform == "win32", reason="Not ported to Windows yet.") -def test_uv_run_runtime_env_hook_e2e(shutdown_only, with_uv, temp_dir): +def test_uv_run_runtime_env_hook_e2e(shutdown_only, temp_dir): - uv = with_uv - tmp_out_dir = Path(temp_dir) + tmp_dir = Path(temp_dir) script = f""" import json @@ -400,39 +380,41 @@ def f(): import emoji return {{"working_dir_files": os.listdir(os.getcwd())}} -with open("{tmp_out_dir / "output.txt"}", "w") as out: +with open("{tmp_dir / "output.txt"}", "w") as out: json.dump(ray.get(f.remote()), out) """ - with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f: + working_dir = tmp_dir / "working_dir" + working_dir.mkdir(parents=True, exist_ok=True) + + script_file = working_dir / "script.py" + with open(script_file, "w") as f: f.write(script) f.close() - subprocess.run( - [ - uv, - "run", - # We want to run in the system environment so the current installation of Ray can be found here - "--python-preference=only-system", - "--with", - "emoji", - "--no-project", - f.name, - ], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env={ - "RAY_RUNTIME_ENV_HOOK": "ray._private.runtime_env.uv_runtime_env_hook.hook", - "PYTHONPATH": ":".join(sys.path), - "PATH": os.environ["PATH"], - }, - cwd=os.path.dirname(uv), - check=True, - ) - with open(tmp_out_dir / "output.txt") as f: - assert json.load(f) == { - "working_dir_files": os.listdir(os.path.dirname(uv)) - } + + 
subprocess.run( + [ + find_uv_bin(), + "run", + # We want to run in the system environment so the current installation of Ray can be found here + "--python-preference=only-system", + "--with", + "emoji", + "--no-project", + str(script_file), + ], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env={ + "PYTHONPATH": ":".join(sys.path), + "PATH": os.environ["PATH"], + }, + cwd=working_dir, + check=True, + ) + with open(tmp_dir / "output.txt") as f: + assert json.load(f) == {"working_dir_files": os.listdir(working_dir)} @pytest.mark.skipif(sys.platform == "win32", reason="Not ported to Windows yet.") @@ -440,23 +422,19 @@ def f(): "ray_start_cluster_head_with_env_vars", [ { - "env_vars": { - "RAY_RUNTIME_ENV_HOOK": "ray._private.runtime_env.uv_runtime_env_hook.hook" - }, "include_dashboard": True, } ], indirect=True, ) def test_uv_run_runtime_env_hook_e2e_job( - ray_start_cluster_head_with_env_vars, with_uv, temp_dir + ray_start_cluster_head_with_env_vars, temp_dir ): cluster = ray_start_cluster_head_with_env_vars assert wait_until_server_available(cluster.webui_url) is True webui_url = format_web_url(cluster.webui_url) - uv = with_uv - tmp_out_dir = Path(temp_dir) + tmp_dir = Path(temp_dir) script = f""" import json @@ -468,52 +446,54 @@ def f(): import emoji return {{"working_dir_files": os.listdir(os.getcwd())}} -with open("{tmp_out_dir / "output.txt"}", "w") as out: +with open("{tmp_dir / "output.txt"}", "w") as out: json.dump(ray.get(f.remote()), out) """ - with tempfile.NamedTemporaryFile( - "w", suffix=".py", delete=False - ) as f, tempfile.NamedTemporaryFile("w", delete=False) as requirements: + working_dir = tmp_dir / "working_dir" + working_dir.mkdir(parents=True, exist_ok=True) + + script_file = working_dir / "script.py" + with open(script_file, "w") as f: f.write(script) f.close() - requirements.write("emoji\n") - requirements.close() - # Test job submission - runtime_env_json = ( - '{"env_vars": {"PYTHONPATH": "' - + 
":".join(sys.path) - + '"}, "working_dir": "."}' - ) - subprocess.run( - [ - "ray", - "job", - "submit", - "--runtime-env-json", - runtime_env_json, - "--", - uv, - "run", - "--with-requirements", - requirements.name, - "--no-project", - f.name, - ], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env={ - "PATH": os.environ["PATH"], - "RAY_ADDRESS": webui_url, - }, - cwd=os.path.dirname(uv), - check=True, - ) - with open(tmp_out_dir / "output.txt") as f: - assert json.load(f) == { - "working_dir_files": os.listdir(os.path.dirname(uv)) - } + + requirements_file = working_dir / "requirements.txt" + with open(requirements_file, "w") as f: + f.write("emoji\n") + f.close() + + # Test job submission + runtime_env_json = ( + '{"env_vars": {"PYTHONPATH": "' + ":".join(sys.path) + '"}, "working_dir": "."}' + ) + subprocess.run( + [ + "ray", + "job", + "submit", + "--runtime-env-json", + runtime_env_json, + "--", + find_uv_bin(), + "run", + "--with-requirements", + str(requirements_file), + "--no-project", + str(script_file), + ], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env={ + "PATH": os.environ["PATH"], + "RAY_ADDRESS": webui_url, + }, + cwd=working_dir, + check=True, + ) + with open(tmp_dir / "output.txt") as f: + assert json.load(f) == {"working_dir_files": os.listdir(working_dir)} if __name__ == "__main__": diff --git a/python/ray/tests/test_runtime_env_working_dir.py b/python/ray/tests/test_runtime_env_working_dir.py index 00e799ba0d53..cd7a1e8f8684 100644 --- a/python/ray/tests/test_runtime_env_working_dir.py +++ b/python/ray/tests/test_runtime_env_working_dir.py @@ -193,8 +193,8 @@ def reinit(): @ray.remote def test_import(): - import test_module import file_module + import test_module assert TEST_IMPORT_DIR in os.environ.get("PYTHONPATH", "") return test_module.one(), file_module.hello() @@ -236,8 +236,8 @@ def test_read(): @ray.remote class Actor: def test_import(self): - import test_module 
import file_module + import test_module assert TEST_IMPORT_DIR in os.environ.get("PYTHONPATH", "") return test_module.one(), file_module.hello() @@ -297,8 +297,8 @@ def reinit(): # Import in the driver. sys.path.insert(0, tmp_working_dir) - import test_module import file_module + import test_module @ray.remote def test_import(): diff --git a/python/ray/tests/test_runtime_env_working_dir_2.py b/python/ray/tests/test_runtime_env_working_dir_2.py index a77e8353d0ac..b9addec4c1da 100644 --- a/python/ray/tests/test_runtime_env_working_dir_2.py +++ b/python/ray/tests/test_runtime_env_working_dir_2.py @@ -1,23 +1,22 @@ import os -from pathlib import Path import sys import tempfile +from pathlib import Path import pytest -from ray._private.test_utils import ( - chdir, - run_string_as_driver, -) - import ray -from ray._private.runtime_env.packaging import GCS_STORAGE_MAX_SIZE -from ray.exceptions import RuntimeEnvSetupError from ray._private.runtime_env.packaging import ( + GCS_STORAGE_MAX_SIZE, get_uri_for_directory, upload_package_if_needed, ) +from ray._private.test_utils import ( + chdir, + run_string_as_driver, +) from ray._private.utils import get_directory_size_bytes +from ray.exceptions import RuntimeEnvSetupError # This test requires you have AWS credentials set up (any AWS credentials will # do, this test only accesses a public bucket). 
diff --git a/python/ray/tests/test_runtime_env_working_dir_3.py b/python/ray/tests/test_runtime_env_working_dir_3.py index d90fff342a37..a53f7f015010 100644 --- a/python/ray/tests/test_runtime_env_working_dir_3.py +++ b/python/ray/tests/test_runtime_env_working_dir_3.py @@ -8,15 +8,15 @@ import pytest import ray -from ray._common.test_utils import wait_for_condition import ray.experimental.internal_kv as kv +from ray._common.test_utils import wait_for_condition from ray._private.ray_constants import RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_ENV_VAR -from ray._private.utils import get_directory_size_bytes from ray._private.test_utils import ( chdir, check_local_files_gced, find_free_port, ) +from ray._private.utils import get_directory_size_bytes # This test requires you have AWS credentials set up (any AWS credentials will # do, this test only accesses a public bucket). @@ -113,8 +113,8 @@ def test_job_level_gc( @ray.remote(num_cpus=1) class A: def test_import(self): - import test_module import pip_install_test # noqa: F401 + import test_module test_module.one() @@ -239,8 +239,8 @@ def test_detached_actor_gc( @ray.remote class A: def test_import(self): - import test_module import pip_install_test # noqa: F401 + import test_module test_module.one() diff --git a/python/ray/tests/test_runtime_env_working_dir_4.py b/python/ray/tests/test_runtime_env_working_dir_4.py index 77e2e81d6bef..f2aa7d9f04bc 100644 --- a/python/ray/tests/test_runtime_env_working_dir_4.py +++ b/python/ray/tests/test_runtime_env_working_dir_4.py @@ -1,12 +1,12 @@ import os -from pathlib import Path import sys +from pathlib import Path import pytest -from ray._common.test_utils import wait_for_condition from pytest_lazy_fixtures import lf as lazy_fixture import ray +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( check_local_files_gced, run_string_as_driver_nonblocking, diff --git a/python/ray/tests/test_scheduling.py b/python/ray/tests/test_scheduling.py 
index ab27dfe31e6b..786006b06139 100644 --- a/python/ray/tests/test_scheduling.py +++ b/python/ray/tests/test_scheduling.py @@ -12,16 +12,16 @@ import ray import ray.cluster_utils import ray.util.accelerators -from ray._private.internal_api import memory_summary -from ray.util.scheduling_strategies import ( - PlacementGroupSchedulingStrategy, - NodeAffinitySchedulingStrategy, -) from ray._common.test_utils import SignalActor, wait_for_condition +from ray._private.internal_api import memory_summary from ray._private.test_utils import ( - object_memory_usage, - get_metric_check_condition, MetricSamplePattern, + get_metric_check_condition, + object_memory_usage, +) +from ray.util.scheduling_strategies import ( + NodeAffinitySchedulingStrategy, + PlacementGroupSchedulingStrategy, ) logger = logging.getLogger(__name__) diff --git a/python/ray/tests/test_scheduling_2.py b/python/ray/tests/test_scheduling_2.py index 64f237c50892..77ee41286be4 100644 --- a/python/ray/tests/test_scheduling_2.py +++ b/python/ray/tests/test_scheduling_2.py @@ -9,17 +9,17 @@ import ray import ray._private.gcs_utils as gcs_utils import ray.experimental.internal_kv as internal_kv +from ray._common.test_utils import SignalActor, wait_for_condition from ray._private.test_utils import ( - make_global_state_accessor, - get_metric_check_condition, MetricSamplePattern, + get_metric_check_condition, + make_global_state_accessor, ) from ray.util.placement_group import placement_group from ray.util.scheduling_strategies import ( NodeAffinitySchedulingStrategy, PlacementGroupSchedulingStrategy, ) -from ray._common.test_utils import SignalActor, wait_for_condition from ray.util.state import list_tasks diff --git a/python/ray/tests/test_shuffle.py b/python/ray/tests/test_shuffle.py index 0a8bbd5be1dc..520024959d32 100644 --- a/python/ray/tests/test_shuffle.py +++ b/python/ray/tests/test_shuffle.py @@ -1,7 +1,8 @@ -import ray -import pytest import sys +import pytest + +import ray from ray.experimental import 
shuffle diff --git a/python/ray/tests/test_state_api.py b/python/ray/tests/test_state_api.py index 9d9396ac0552..56231924e5f9 100644 --- a/python/ray/tests/test_state_api.py +++ b/python/ray/tests/test_state_api.py @@ -1,91 +1,94 @@ -import os -import time import json -import sys +import os import signal +import sys +import time from collections import Counter from concurrent.futures import ThreadPoolExecutor from typing import List -from unittest.mock import MagicMock, AsyncMock, patch -import yaml +from unittest.mock import AsyncMock, MagicMock, patch -from click.testing import CliRunner import pytest import pytest_asyncio -from ray._private.state_api_test_utils import ( - get_state_api_manager, - create_api_options, - verify_schema, -) -from ray.util.state import get_job -from ray.dashboard.modules.job.pydantic_models import JobDetails -from ray.util.state.common import Humanify +import yaml +from click.testing import CliRunner -from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy import ray -import ray.dashboard.consts as dashboard_consts -import ray._private.state as global_state import ray._private.ray_constants as ray_constants -from ray._raylet import GcsClient, ActorID, JobID, TaskID +import ray._private.state as global_state +import ray.dashboard.consts as dashboard_consts from ray._common.network_utils import parse_address -from ray._private.test_utils import ( - run_string_as_driver, - find_free_port, -) from ray._common.test_utils import ( SignalActor, async_wait_for_condition, wait_for_condition, ) +from ray._private.state_api_test_utils import ( + create_api_options, + get_state_api_manager, + verify_schema, +) +from ray._private.test_utils import ( + find_free_port, + run_string_as_driver, +) +from ray._raylet import ActorID, GcsClient, JobID, NodeID, TaskID from ray.cluster_utils import cluster_not_supported -from ray._raylet import NodeID from ray.core.generated.common_pb2 import ( Address, CoreWorkerStats, ObjectRefInfo, 
TaskInfoEntry, TaskStatus, - WorkerType, TaskType, + WorkerType, ) -from ray.core.generated.gcs_service_pb2_grpc import TaskInfoGcsServiceStub from ray.core.generated.gcs_pb2 import ( - TaskEvents, - TaskStateUpdate, ActorTableData, GcsNodeInfo, PlacementGroupTableData, + TaskEvents, + TaskStateUpdate, WorkerTableData, ) from ray.core.generated.gcs_service_pb2 import ( FilterPredicate, GcsStatus, - GetTaskEventsReply, GetAllActorInfoReply, GetAllNodeInfoReply, GetAllPlacementGroupReply, GetAllWorkerInfoReply, + GetTaskEventsReply, ) +from ray.core.generated.gcs_service_pb2_grpc import TaskInfoGcsServiceStub from ray.core.generated.node_manager_pb2 import GetObjectsInfoReply from ray.core.generated.reporter_pb2 import ListLogsReply, StreamLogReply from ray.core.generated.runtime_env_agent_pb2 import GetRuntimeEnvsInfoReply from ray.core.generated.runtime_env_common_pb2 import ( RuntimeEnvState as RuntimeEnvStateProto, ) +from ray.dashboard.modules.job.pydantic_models import JobDetails from ray.dashboard.state_aggregator import ( GCS_QUERY_FAILURE_WARNING, NODE_QUERY_FAILURE_WARNING, StateAPIManager, ) from ray.dashboard.state_api_utils import convert_filters_type +from ray.dashboard.utils import ray_address_to_api_server_url +from ray.job_submission import JobSubmissionClient +from ray.runtime_env import RuntimeEnv +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from ray.util.state import ( + StateApiClient, get_actor, + get_job, get_node, get_objects, get_placement_group, get_task, get_worker, list_actors, + list_cluster_events, list_jobs, list_nodes, list_objects, @@ -96,37 +99,33 @@ summarize_actors, summarize_objects, summarize_tasks, - list_cluster_events, - StateApiClient, ) from ray.util.state.common import ( DEFAULT_RPC_TIMEOUT, ActorState, + GetApiOptions, + Humanify, ListApiOptions, - SummaryApiOptions, NodeState, ObjectState, PlacementGroupState, RuntimeEnvState, + StateSchema, + SummaryApiOptions, TaskState, WorkerState, - 
StateSchema, state_column, - GetApiOptions, ) -from ray.dashboard.utils import ray_address_to_api_server_url from ray.util.state.exception import DataSourceUnavailable, RayStateApiException from ray.util.state.state_cli import ( AvailableFormat, - format_list_api_output, _parse_filter, + format_list_api_output, + ray_get, + ray_list, summary_state_cli_group, ) -from ray.util.state.state_cli import ray_get -from ray.util.state.state_cli import ray_list from ray.util.state.state_manager import StateDataSourceClient -from ray.job_submission import JobSubmissionClient -from ray.runtime_env import RuntimeEnv """ Unit tests @@ -171,7 +170,7 @@ def generate_actor_data(id, state=ActorTableData.ActorState.ALIVE, class_name="c name="abc", pid=1234, class_name=class_name, - address=Address(raylet_id=id, ip_address="127.0.0.1", port=124, worker_id=id), + address=Address(node_id=id, ip_address="127.0.0.1", port=124, worker_id=id), job_id=b"123", node_id=None, ray_namespace="", @@ -208,7 +207,7 @@ def generate_worker_data( ): return WorkerTableData( worker_address=Address( - raylet_id=id, ip_address="127.0.0.1", port=124, worker_id=id + node_id=id, ip_address="127.0.0.1", port=124, worker_id=id ), is_alive=True, timestamp=1234, @@ -3460,9 +3459,9 @@ def verify(): def test_state_api_server_enforce_concurrent_http_requests( api_func, monkeypatch, shutdown_only ): - import time - import threading import queue + import threading + import time # Set environment with monkeypatch.context() as m: diff --git a/python/ray/tests/test_state_api_2.py b/python/ray/tests/test_state_api_2.py index c8aa4095d458..2659e6009c07 100644 --- a/python/ray/tests/test_state_api_2.py +++ b/python/ray/tests/test_state_api_2.py @@ -2,25 +2,24 @@ import json import os import sys -from pathlib import Path import tempfile - from collections import defaultdict -from ray._private.test_utils import check_call_subprocess +from pathlib import Path -import ray -import requests import pytest +import requests +import 
ray +from ray._common.test_utils import wait_for_condition from ray._private.profiling import chrome_tracing_dump +from ray._private.test_utils import check_call_subprocess from ray.util.state import ( get_actor, - list_tasks, list_actors, - list_workers, list_nodes, + list_tasks, + list_workers, ) -from ray._common.test_utils import wait_for_condition def test_timeline(shutdown_only): diff --git a/python/ray/tests/test_state_api_log.py b/python/ray/tests/test_state_api_log.py index 3a874fe7dd76..69a6835101cc 100644 --- a/python/ray/tests/test_state_api_log.py +++ b/python/ray/tests/test_state_api_log.py @@ -2,48 +2,46 @@ import json import os import sys +import urllib from pathlib import Path from typing import List -from unittest.mock import MagicMock, AsyncMock +from unittest.mock import AsyncMock, MagicMock import grpc -from ray._common.test_utils import wait_for_condition -import requests import pytest -import urllib +import requests from click.testing import CliRunner import ray +from ray._common.test_utils import wait_for_condition from ray._private.test_utils import ( format_web_url, wait_until_server_available, ) -from ray.util.state.state_cli import logs_state_cli_group -from ray.util.state import list_jobs - from ray._raylet import ActorID, NodeID, TaskID, WorkerID from ray.core.generated.common_pb2 import Address -from ray.core.generated.gcs_service_pb2 import GetTaskEventsReply -from ray.core.generated.reporter_pb2 import ListLogsReply, StreamLogReply from ray.core.generated.gcs_pb2 import ( ActorTableData, TaskEvents, - TaskStateUpdate, TaskLogInfo, + TaskStateUpdate, ) +from ray.core.generated.gcs_service_pb2 import GetTaskEventsReply +from ray.core.generated.reporter_pb2 import ListLogsReply, StreamLogReply from ray.dashboard.modules.log.log_agent import ( - find_offset_of_content_in_file, + LogAgentV1Grpc, + _stream_log_in_chunk, find_end_offset_file, find_end_offset_next_n_lines_from_offset, + find_offset_of_content_in_file, 
find_start_offset_last_n_lines_from_offset, - LogAgentV1Grpc, ) -from ray.dashboard.modules.log.log_agent import _stream_log_in_chunk from ray.dashboard.modules.log.log_manager import LogsManager from ray.dashboard.tests.conftest import * # noqa -from ray.util.state import get_log, list_logs, list_nodes, list_workers +from ray.util.state import get_log, list_jobs, list_logs, list_nodes, list_workers from ray.util.state.common import GetLogOptions from ray.util.state.exception import RayStateApiException +from ray.util.state.state_cli import logs_state_cli_group from ray.util.state.state_manager import StateDataSourceClient @@ -90,7 +88,7 @@ async def generate_actor_data(id, node_id, worker_id): pid=1234, class_name="class", address=Address( - raylet_id=node_id.binary(), + node_id=node_id.binary(), ip_address="127.0.0.1", port=1234, worker_id=worker_id, diff --git a/python/ray/tests/test_state_api_summary.py b/python/ray/tests/test_state_api_summary.py index 8ad65e8021c2..59303828e276 100644 --- a/python/ray/tests/test_state_api_summary.py +++ b/python/ray/tests/test_state_api_summary.py @@ -1,43 +1,42 @@ -import time import json -import pytest -import ray -from unittest.mock import AsyncMock import random import sys -from dataclasses import asdict +import time from concurrent.futures import ThreadPoolExecutor +from dataclasses import asdict +from unittest.mock import AsyncMock -from ray.util.state import ( - summarize_tasks, - summarize_actors, - summarize_objects, -) -from ray._common.test_utils import wait_for_condition -from ray._raylet import ActorID, TaskID, ObjectID +import pytest +from click.testing import CliRunner +import ray +from ray._common.test_utils import wait_for_condition +from ray._raylet import ActorID, ObjectID, TaskID from ray.core.generated.common_pb2 import TaskStatus, TaskType, WorkerType +from ray.core.generated.gcs_pb2 import ActorTableData, GcsNodeInfo +from ray.core.generated.gcs_service_pb2 import GetAllActorInfoReply, 
GetAllNodeInfoReply from ray.core.generated.node_manager_pb2 import GetObjectsInfoReply -from ray.core.generated.gcs_pb2 import GcsNodeInfo +from ray.dashboard.state_aggregator import StateAPIManager from ray.tests.test_state_api import ( - generate_task_data, - generate_task_event, generate_actor_data, generate_object_info, + generate_task_data, + generate_task_event, +) +from ray.util.state import ( + summarize_actors, + summarize_objects, + summarize_tasks, ) from ray.util.state.common import ( DEFAULT_RPC_TIMEOUT, - SummaryApiOptions, + DRIVER_TASK_ID_PREFIX, Link, NestedTaskSummary, + SummaryApiOptions, TaskSummaries, - DRIVER_TASK_ID_PREFIX, ) -from ray.core.generated.gcs_service_pb2 import GetAllActorInfoReply, GetAllNodeInfoReply -from ray.core.generated.gcs_pb2 import ActorTableData -from click.testing import CliRunner from ray.util.state.state_cli import summary_state_cli_group -from ray.dashboard.state_aggregator import StateAPIManager from ray.util.state.state_manager import StateDataSourceClient diff --git a/python/ray/tests/test_streaming_generator.py b/python/ray/tests/test_streaming_generator.py index 2e166e190ba5..d3d85fec0559 100644 --- a/python/ray/tests/test_streaming_generator.py +++ b/python/ray/tests/test_streaming_generator.py @@ -1,19 +1,19 @@ import asyncio -import pytest -import numpy as np +import gc import sys -import time import threading -import gc +import time +from unittest.mock import Mock, patch -from unittest.mock import patch, Mock +import numpy as np +import pytest import ray from ray._common.test_utils import wait_for_condition -from ray.experimental.state.api import list_objects from ray._raylet import ObjectRefGenerator, ObjectRefStreamEndOfStreamError from ray.cloudpickle import dumps from ray.exceptions import WorkerCrashedError +from ray.experimental.state.api import list_objects class MockedWorker: diff --git a/python/ray/tests/test_streaming_generator_2.py b/python/ray/tests/test_streaming_generator_2.py index 
f7cdc4d5d704..407dc83ddbdd 100644 --- a/python/ray/tests/test_streaming_generator_2.py +++ b/python/ray/tests/test_streaming_generator_2.py @@ -1,16 +1,17 @@ import asyncio -import pytest -import numpy as np +import gc import sys import time -import gc + +import numpy as np +import pytest import ray -from ray.experimental.state.api import list_actors -from ray._common.test_utils import SignalActor from ray._common.test_utils import ( + SignalActor, wait_for_condition, ) +from ray.experimental.state.api import list_actors RECONSTRUCTION_CONFIG = { "health_check_failure_threshold": 10, @@ -21,6 +22,8 @@ "task_retry_delay_ms": 100, "object_timeout_milliseconds": 200, "fetch_warn_timeout_milliseconds": 1000, + # Required for reducing the retry time of RequestWorkerLease + "raylet_rpc_server_reconnect_timeout_s": 0, } diff --git a/python/ray/tests/test_streaming_generator_3.py b/python/ray/tests/test_streaming_generator_3.py index c16d03c0b9c3..14c66f286f8f 100644 --- a/python/ray/tests/test_streaming_generator_3.py +++ b/python/ray/tests/test_streaming_generator_3.py @@ -1,11 +1,11 @@ import asyncio -import pytest -import numpy as np import sys import time - from collections import Counter +import numpy as np +import pytest + import ray from ray._raylet import ObjectRefGenerator from ray.exceptions import TaskCancelledError diff --git a/python/ray/tests/test_streaming_generator_4.py b/python/ray/tests/test_streaming_generator_4.py index f9b378e00a6b..fe1455e7b4f2 100644 --- a/python/ray/tests/test_streaming_generator_4.py +++ b/python/ray/tests/test_streaming_generator_4.py @@ -1,13 +1,14 @@ -import pytest -import numpy as np -import sys -import time +import asyncio import gc import os -import signal import random -import asyncio +import signal +import sys +import time from typing import Optional + +import numpy as np +import pytest from pydantic import BaseModel import ray @@ -22,6 +23,8 @@ "task_retry_delay_ms": 100, "object_timeout_milliseconds": 200, 
"fetch_warn_timeout_milliseconds": 1000, + # Required for reducing the retry time of RequestWorkerLease + "raylet_rpc_server_reconnect_timeout_s": 0, } diff --git a/python/ray/tests/test_streaming_generator_backpressure.py b/python/ray/tests/test_streaming_generator_backpressure.py index 0dc220913b82..6d2a3b3d76e5 100644 --- a/python/ray/tests/test_streaming_generator_backpressure.py +++ b/python/ray/tests/test_streaming_generator_backpressure.py @@ -1,10 +1,11 @@ import asyncio -import pytest -import numpy as np +import os +import signal import sys import time -import signal -import os + +import numpy as np +import pytest import ray from ray._common.test_utils import wait_for_condition diff --git a/python/ray/tests/test_stress.py b/python/ray/tests/test_stress.py index 0ff6559094b5..afbc6f40cbba 100644 --- a/python/ray/tests/test_stress.py +++ b/python/ray/tests/test_stress.py @@ -1,8 +1,8 @@ -import time import sys +import time -import pytest import numpy as np +import pytest import ray from ray.cluster_utils import Cluster, cluster_not_supported diff --git a/python/ray/tests/test_symmetric_run.py b/python/ray/tests/test_symmetric_run.py new file mode 100644 index 000000000000..dfbaac359eb4 --- /dev/null +++ b/python/ray/tests/test_symmetric_run.py @@ -0,0 +1,171 @@ +import sys +from contextlib import contextmanager +from unittest.mock import MagicMock, patch + +import pytest +from click.testing import CliRunner + +import ray +import ray.scripts.scripts as scripts + + +@contextmanager +def _setup_mock_network_utils(curr_ip, head_ip): + import socket + + # Mock socket.getaddrinfo to return a valid IP + with patch("socket.getaddrinfo") as mock_getaddrinfo: + mock_getaddrinfo.return_value = [("", "", "", "", (curr_ip, 6379))] + + # Mock psutil.net_if_addrs to return localhost IP + with patch("psutil.net_if_addrs") as mock_net_if_addrs: + mock_net_if_addrs.return_value = { + "lo": [ + type( + "addr", + (), + {"family": socket.AF_INET, "address": head_ip}, + )() + ] + 
} + yield + + +@pytest.fixture +def cleanup_ray(): + """Shutdown all ray instances""" + yield + runner = CliRunner() + runner.invoke(scripts.stop, ["--force"]) + ray.shutdown() + + +def test_symmetric_run_basic_interface(monkeypatch, cleanup_ray): + """Test basic symmetric_run interface with minimal arguments.""" + from ray.scripts.symmetric_run import symmetric_run + + runner = CliRunner() + + # Mock subprocess.run to avoid actually starting Ray + with patch("subprocess.run") as mock_run: + mock_run.return_value.returncode = 0 + with _setup_mock_network_utils("127.0.0.1", "127.0.0.1"): + args = ["--address", "127.0.0.1:6379", "--", "echo", "test"] + + with patch("sys.argv", ["ray.scripts.symmetric_run", *args]): + # Test basic symmetric_run call using CliRunner + result = runner.invoke(symmetric_run, args) + assert result.exit_code == 0 + + # Verify that subprocess.run was called for ray start + assert mock_run.called + calls = mock_run.call_args_list + + # Should have called ray start with --head + ray_start_calls = [ + call for call in calls if "ray" in str(call) and "start" in str(call) + ] + assert len(ray_start_calls) > 0 + + # Should have called ray stop + ray_stop_calls = [ + call for call in calls if "ray" in str(call) and "stop" in str(call) + ] + assert len(ray_stop_calls) > 0 + + +def test_symmetric_run_worker_node_behavior(monkeypatch, cleanup_ray): + """Test symmetric_run behavior when not on the head node.""" + from ray.scripts.symmetric_run import symmetric_run + + runner = CliRunner() + + with patch("subprocess.run") as mock_run: + mock_run.return_value.returncode = 0 + + with _setup_mock_network_utils("192.168.1.100", "192.168.1.101"): + # Mock socket connection check to simulate head node ready + with patch("socket.socket") as mock_socket: + mock_socket_instance = MagicMock() + mock_socket_instance.connect_ex.return_value = 0 + mock_socket.return_value.__enter__.return_value = mock_socket_instance + + # Test worker node behavior + args = 
["--address", "192.168.1.100:6379", "--", "echo", "test"] + with patch("sys.argv", ["ray.scripts.symmetric_run", *args]): + with patch( + "ray.scripts.symmetric_run.check_head_node_ready" + ) as mock_check_head_node_ready: + mock_check_head_node_ready.return_value = True + result = runner.invoke(symmetric_run, args) + assert result.exit_code == 0 + + # Verify that subprocess.run was called + assert mock_run.called + calls = mock_run.call_args_list + + # Should have called ray start with --address (worker mode) + ray_start_calls = [ + call + for call in calls + if "ray" in str(call) and "start" in str(call) + ] + assert len(ray_start_calls) > 0 + + # Check that it's in worker mode (--address instead of --head) + start_call = ray_start_calls[0] + start_args = start_call[0][0] + assert "--address" in start_args + assert "192.168.1.100:6379" in start_args + assert "--head" not in start_args + assert "--block" in start_args # Worker nodes should block + + +def test_symmetric_run_arg_validation(monkeypatch, cleanup_ray): + """Test that symmetric_run validates arguments.""" + from ray.scripts.symmetric_run import symmetric_run + + runner = CliRunner() + + # Mock subprocess.run to avoid actually starting Ray + with _setup_mock_network_utils("127.0.0.1", "127.0.0.1"): + + with patch("subprocess.run") as mock_run: + mock_run.return_value.returncode = 0 + args = ["--address", "127.0.0.1:6379", "--", "echo", "test"] + + with patch("sys.argv", ["ray.scripts.symmetric_run", *args]): + # Test basic symmetric_run call using CliRunner + result = runner.invoke(symmetric_run, args) + assert result.exit_code == 0 + + # Test that invalid arguments are rejected + with patch("subprocess.run") as mock_run: + mock_run.return_value.returncode = 0 + + args = ["--address", "127.0.0.1:6379", "--head", "--", "echo", "test"] + with patch("sys.argv", ["ray.scripts.symmetric_run", *args]): + result = runner.invoke(symmetric_run, args) + assert result.exit_code == 1 + assert "Cannot use --head 
option in symmetric_run." in result.output + + with patch("subprocess.run") as mock_run: + mock_run.return_value.returncode = 0 + + # Test args with "=" are passed to ray start + args = ["--address", "127.0.0.1:6379", "--num-cpus=4", "--", "echo", "test"] + with patch("sys.argv", ["ray.scripts.symmetric_run", *args]): + result = runner.invoke(symmetric_run, args) + assert result.exit_code == 0 + + ray_start_calls = [ + call + for call in mock_run.call_args_list + if "ray" in str(call) and "start" in str(call) + ] + assert len(ray_start_calls) > 0 + assert "--num-cpus=4" in ray_start_calls[0][0][0] + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/tests/test_task_events.py b/python/ray/tests/test_task_events.py index b2372db6117d..943bffecf264 100644 --- a/python/ray/tests/test_task_events.py +++ b/python/ray/tests/test_task_events.py @@ -1,25 +1,24 @@ +import sys +import threading +import time from collections import defaultdict from typing import Dict import pytest -import sys -import threading -import time + +import ray from ray._common.test_utils import wait_for_condition from ray._private.state_api_test_utils import ( verify_failed_task, ) -from ray.exceptions import RuntimeEnvSetupError -from ray.runtime_env import RuntimeEnv - -import ray from ray._private.test_utils import ( raw_metrics, run_string_as_driver_nonblocking, ) -from ray.util.state import list_tasks - from ray._private.worker import RayContext +from ray.exceptions import RuntimeEnvSetupError +from ray.runtime_env import RuntimeEnv +from ray.util.state import list_tasks _SYSTEM_CONFIG = { "task_events_report_interval_ms": 100, diff --git a/python/ray/tests/test_task_events_2.py b/python/ray/tests/test_task_events_2.py index 4bc5fefd57b2..5320aeef1d5a 100644 --- a/python/ray/tests/test_task_events_2.py +++ b/python/ray/tests/test_task_events_2.py @@ -1,23 +1,23 @@ import asyncio -from collections import defaultdict import os -from typing import Dict 
-import pytest import sys import time -from ray._common.test_utils import async_wait_for_condition, wait_for_condition -from ray._private import ray_constants +from collections import defaultdict from functools import reduce +from typing import Dict + +import pytest import ray +from ray._common.test_utils import async_wait_for_condition, wait_for_condition +from ray._private import ray_constants from ray._private.state_api_test_utils import ( PidActor, + _is_actor_task_running, get_state_api_manager, - verify_tasks_running_or_terminated, verify_failed_task, - _is_actor_task_running, + verify_tasks_running_or_terminated, ) -from ray.util.state.common import ListApiOptions, StateResource from ray._private.test_utils import ( run_string_as_driver, run_string_as_driver_nonblocking, @@ -28,6 +28,8 @@ list_jobs, list_tasks, ) +from ray.util.state.common import ListApiOptions, StateResource + import psutil _SYSTEM_CONFIG = { diff --git a/python/ray/tests/test_task_events_3.py b/python/ray/tests/test_task_events_3.py index 19ca96468585..0f6d3b100043 100644 --- a/python/ray/tests/test_task_events_3.py +++ b/python/ray/tests/test_task_events_3.py @@ -1,6 +1,7 @@ -import pytest import sys +import pytest + import ray from ray._common.test_utils import ( wait_for_condition, diff --git a/python/ray/tests/test_task_metrics.py b/python/ray/tests/test_task_metrics.py index 05da74f917ae..c57525914fe9 100644 --- a/python/ray/tests/test_task_metrics.py +++ b/python/ray/tests/test_task_metrics.py @@ -15,7 +15,6 @@ wait_for_assertion, ) - METRIC_CONFIG = { "_system_config": { "metrics_report_interval_ms": 100, diff --git a/python/ray/tests/test_task_metrics_reconstruction.py b/python/ray/tests/test_task_metrics_reconstruction.py index cb219075a1f3..b066f5b9a498 100644 --- a/python/ray/tests/test_task_metrics_reconstruction.py +++ b/python/ray/tests/test_task_metrics_reconstruction.py @@ -4,11 +4,10 @@ import pytest import ray - -from ray.tests.test_task_metrics import tasks_by_all, 
METRIC_CONFIG from ray._common.test_utils import ( wait_for_condition, ) +from ray.tests.test_task_metrics import METRIC_CONFIG, tasks_by_all # Copied from similar test in test_reconstruction_2.py. diff --git a/python/ray/tests/test_tls_auth.py b/python/ray/tests/test_tls_auth.py index 0ed5a1e622b6..21c230c278a4 100644 --- a/python/ray/tests/test_tls_auth.py +++ b/python/ray/tests/test_tls_auth.py @@ -1,8 +1,8 @@ # coding: utf-8 import logging import os -import sys import subprocess +import sys import pytest diff --git a/python/ray/tests/test_top_level_api.py b/python/ray/tests/test_top_level_api.py index 1b30a9f698d2..9858fcbe4f9c 100644 --- a/python/ray/tests/test_top_level_api.py +++ b/python/ray/tests/test_top_level_api.py @@ -1,5 +1,5 @@ -from inspect import getmembers, isfunction, ismodule import sys +from inspect import getmembers, isfunction, ismodule import pytest diff --git a/python/ray/tests/test_tqdm.py b/python/ray/tests/test_tqdm.py index 9e7cc5639e0a..7b0bdbf4ec5f 100644 --- a/python/ray/tests/test_tqdm.py +++ b/python/ray/tests/test_tqdm.py @@ -4,8 +4,8 @@ import pytest import ray -from ray.experimental import tqdm_ray from ray._common.test_utils import wait_for_condition +from ray.experimental import tqdm_ray def test_distributed_tqdm_remote(): diff --git a/python/ray/tests/test_traceback.py b/python/ray/tests/test_traceback.py index 93cb47b3d158..f9bef5dc74ff 100644 --- a/python/ray/tests/test_traceback.py +++ b/python/ray/tests/test_traceback.py @@ -5,7 +5,7 @@ import pytest import ray -from ray.exceptions import RayTaskError, RayActorError +from ray.exceptions import RayActorError, RayTaskError, UnserializableException """This module tests stacktrace of Ray. 
@@ -40,9 +40,9 @@ def scrub_traceback(ex): ex = re.sub(r"\x1b\[39m", "", ex) # When running bazel test with pytest 6.x, the module name becomes # "python.ray.tests.test_traceback" instead of just "test_traceback" - # Also remove the "com_github_ray_project_ray" prefix, which may appear on Windows. + # Also remove the "io_ray" prefix, which may appear on Windows. ex = re.sub( - r"(com_github_ray_project_ray.)?python\.ray\.tests\.test_traceback", + r"(io_ray.)?python\.ray\.tests\.test_traceback", "test_traceback", ex, ) @@ -54,6 +54,13 @@ def scrub_traceback(ex): ) # Clean up underscore in stack trace, which is new in python 3.12 ex = re.sub("^\\s+~*\\^+~*\n", "", ex, flags=re.MULTILINE) + # Remove internal Cython frames from ray._raylet that can appear on Windows. + ex = re.sub( + r"^\s*File \"FILE\", line ZZ, in ray\._raylet\.[^\n]+\n", + "", + ex, + flags=re.MULTILINE, + ) return ex @@ -294,24 +301,14 @@ def __repr__(self): def test_unpickleable_stacktrace(shutdown_only): - expected_output = """System error: Failed to unpickle serialized exception -traceback: Traceback (most recent call last): - File "FILE", line ZZ, in from_ray_exception - return pickle.loads(ray_exception.serialized_exception) -TypeError: __init__() missing 1 required positional argument: 'arg' - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "FILE", line ZZ, in deserialize_objects - obj = self._deserialize_object( - File "FILE", line ZZ, in _deserialize_object - return RayError.from_bytes(obj) - File "FILE", line ZZ, in from_bytes - return RayError.from_ray_exception(ray_exception) - File "FILE", line ZZ, in from_ray_exception - raise RuntimeError(msg) from e -RuntimeError: Failed to unpickle serialized exception""" + expected_output = """Failed to deserialize exception. Refer to https://docs.ray.io/en/latest/ray-core/objects/serialization.html#custom-serializers-for-exceptions for more information. 
+Original exception: +ray.exceptions.RayTaskError: ray::f() (pid=XXX, ip=YYY) + File "FILE", line ZZ, in f + return g(c) + File "FILE", line ZZ, in g + raise NoPickleError("FILE") +test_traceback.NoPickleError""" class NoPickleError(OSError): def __init__(self, arg): @@ -327,13 +324,47 @@ def f(): c = a + b return g(c) - try: + with pytest.raises(UnserializableException) as excinfo: ray.get(f.remote()) - except Exception as ex: - python310_extra_exc_msg = "test_unpickleable_stacktrace..NoPickleError." - assert clean_noqa(expected_output) == scrub_traceback(str(ex)).replace( - f"TypeError: {python310_extra_exc_msg}", "TypeError: " + + assert clean_noqa(expected_output) == scrub_traceback(str(excinfo.value)) + + +def test_exception_with_registered_serializer(shutdown_only): + class NoPickleError(OSError): + def __init__(self, msg): + self.msg = msg + + def __str__(self): + return f"message: {self.msg}" + + def _serializer(e: NoPickleError): + return {"msg": e.msg} + + def _deserializer(state): + return NoPickleError(state["msg"] + " deserialized") + + @ray.remote + def raise_custom_exception(): + ray.util.register_serializer( + NoPickleError, serializer=_serializer, deserializer=_deserializer ) + raise NoPickleError("message") + + try: + with pytest.raises(NoPickleError) as exc_info: + ray.get(raise_custom_exception.remote()) + + # Ensure dual-typed exception and message propagation + assert isinstance(exc_info.value, RayTaskError) + # if custom serializer was not registered, this would be an instance of UnserializableException() + assert isinstance(exc_info.value, NoPickleError) + assert "message" in str(exc_info.value) + # modified message should not be in the exception string, only in the cause + assert "deserialized" not in str(exc_info.value) + assert "message deserialized" in str(exc_info.value.cause) + finally: + ray.util.deregister_serializer(NoPickleError) def test_serialization_error_message(shutdown_only): diff --git a/python/ray/tests/test_typing.py 
b/python/ray/tests/test_typing.py index 46d5726b32f5..b049fdd77532 100644 --- a/python/ray/tests/test_typing.py +++ b/python/ray/tests/test_typing.py @@ -6,7 +6,6 @@ import mypy.api as mypy_api import pytest - # Paths are relative to the directory where Bazel is run in the CI TYPING_GOOD_PATH = "python/ray/tests/typing_files/check_typing_good.py" TYPING_BAD_PATH = "python/ray/tests/typing_files/check_typing_bad.py" diff --git a/python/ray/tests/test_unavailable_actors.py b/python/ray/tests/test_unavailable_actors.py index 3c19a4028c46..a2e4ebde25a4 100644 --- a/python/ray/tests/test_unavailable_actors.py +++ b/python/ray/tests/test_unavailable_actors.py @@ -1,13 +1,13 @@ import os -import pytest -import sys import signal +import sys from typing import Optional, Tuple +import pytest + import ray -from ray.exceptions import ActorUnavailableError, ActorDiedError -from ray._common.test_utils import SignalActor -from ray._common.test_utils import wait_for_condition +from ray._common.test_utils import SignalActor, wait_for_condition +from ray.exceptions import ActorDiedError, ActorUnavailableError import psutil # We must import psutil after ray because we bundle it with ray. diff --git a/python/ray/tests/test_util_helpers.py b/python/ray/tests/test_util_helpers.py new file mode 100644 index 000000000000..5d35fc9bc160 --- /dev/null +++ b/python/ray/tests/test_util_helpers.py @@ -0,0 +1,190 @@ +import sys + +import pytest + +import ray +from ray._common.test_utils import SignalActor +from ray.util import as_completed, map_unordered + + +@pytest.fixture(scope="module") +def ray_init_4_cpu_shared(): + ray.init(num_cpus=4) + yield + ray.shutdown() + + +@pytest.mark.parametrize("yield_obj_refs", [True, False]) +def test_as_completed_chunk_size_1(ray_init_4_cpu_shared, yield_obj_refs): + """Test as_completed with chunk_size=1. + + Use SignalActor to control task completion order and mimic time.sleep(x) behavior. 
+ + """ + inputs = [10, 8, 6, 4, 2] + + # Create signals for each task + signals = [SignalActor.remote() for _ in range(len(inputs))] + + # Create tasks + @ray.remote + def f(x, signal): + ray.get(signal.wait.remote()) + return x + + # Submit tasks with their corresponding signals in the original order + refs = [f.remote(x, signal) for x, signal in zip(inputs, signals)] + + # Use as_completed() lazily + it = as_completed(refs, chunk_size=1, yield_obj_refs=yield_obj_refs) + + # Send signal in reverse order to mimic time.sleep(x), i.e., + # smallest value releases first. At the same time, collect results + + results = [] + for signal in reversed(signals): + ray.get(signal.send.remote()) + results.append(next(it)) + + if yield_obj_refs: + results = ray.get(results) + + assert results == [2, 4, 6, 8, 10] + + +@pytest.mark.parametrize("yield_obj_refs", [True, False]) +def test_as_completed_chunk_size_2(ray_init_4_cpu_shared, yield_obj_refs): + """Test as_completed with chunk_size=2. + + Use SignalActor to control task completion order and mimic time.sleep(x) behavior. + + """ + inputs = [10, 8, 6, 4, 2] + + # Create signals for each task + signals = [SignalActor.remote() for _ in range(len(inputs))] + + # Create tasks + @ray.remote + def f(x, signal): + ray.get(signal.wait.remote()) + return x + + # Submit tasks with their corresponding signals in the original order + refs = [f.remote(x, signal) for x, signal in zip(inputs, signals)] + + # Use as_completed() lazily + it = as_completed(refs, chunk_size=2, yield_obj_refs=yield_obj_refs) + + # Send signal in reverse order to mimic time.sleep(x), i.e., + # smallest value releases first. 
At the same time, collect results + + results = [] + + ray.get(signals[4].send.remote()) + ray.get(signals[3].send.remote()) + results.append(next(it)) + results.append(next(it)) + + ray.get(signals[2].send.remote()) + ray.get(signals[1].send.remote()) + results.append(next(it)) + results.append(next(it)) + + ray.get(signals[0].send.remote()) + results.append(next(it)) + + if yield_obj_refs: + results = ray.get(results) + + assert results == [4, 2, 8, 6, 10] + + +@pytest.mark.parametrize("yield_obj_refs", [True, False]) +def test_map_unordered_chunk_size_1(ray_init_4_cpu_shared, yield_obj_refs): + """Test map_unordered with chunk_size=1. + + Use SignalActor to control task completion order and mimic time.sleep(x) behavior. + + """ + inputs = [10, 8, 6, 4, 2] + + # Create signals for each task + signals = [SignalActor.remote() for _ in range(len(inputs))] + + # Create tasks + @ray.remote + def f(args): + x, signal = args + ray.get(signal.wait.remote()) + return x + + # Submit tasks with their corresponding signals in the original order + it = map_unordered( + f, zip(inputs, signals), chunk_size=1, yield_obj_refs=yield_obj_refs + ) + + # Send signal in reverse order to mimic time.sleep(x), i.e., + # smallest value releases first. At the same time, collect results + + results = [] + for signal in reversed(signals): + ray.get(signal.send.remote()) + results.append(next(it)) + + if yield_obj_refs: + results = ray.get(results) + + assert results == [2, 4, 6, 8, 10] + + +@pytest.mark.parametrize("yield_obj_refs", [True, False]) +def test_map_unordered_chunk_size_2(ray_init_4_cpu_shared, yield_obj_refs): + """Test map_unordered with chunk_size=2. + + Use SignalActor to control task completion order and mimic time.sleep(x) behavior. 
+ + """ + inputs = [10, 8, 6, 4, 2] + + # Create signals for each task + signals = [SignalActor.remote() for _ in range(len(inputs))] + + # Create tasks + @ray.remote + def f(args): + x, signal = args + ray.get(signal.wait.remote()) + return x + + # Submit tasks with their corresponding signals in the original order + it = map_unordered( + f, zip(inputs, signals), chunk_size=2, yield_obj_refs=yield_obj_refs + ) + + # Send signal in reverse order to mimic time.sleep(x), i.e., + # smallest value releases first. At the same time, collect results + + results = [] + + ray.get(signals[4].send.remote()) + ray.get(signals[3].send.remote()) + results.append(next(it)) + results.append(next(it)) + + ray.get(signals[2].send.remote()) + ray.get(signals[1].send.remote()) + results.append(next(it)) + results.append(next(it)) + + ray.get(signals[0].send.remote()) + results.append(next(it)) + + if yield_obj_refs: + results = ray.get(results) + + assert results == [4, 2, 8, 6, 10] + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/tests/test_utils.py b/python/ray/tests/test_utils.py index 543a53f9c3e6..7ea8d2c02126 100644 --- a/python/ray/tests/test_utils.py +++ b/python/ray/tests/test_utils.py @@ -5,14 +5,15 @@ This currently expects to work for minimal installs. 
""" import logging -import pytest import sys -from unittest.mock import patch, mock_open +from unittest.mock import mock_open, patch + +import pytest from ray._private.utils import ( + get_current_node_cpu_model_name, parse_pg_formatted_resources_to_original, try_import_each_module, - get_current_node_cpu_model_name, ) logger = logging.getLogger(__name__) diff --git a/python/ray/tests/test_wait.py b/python/ray/tests/test_wait.py index 6040bae86841..9a37b9fdb40d 100644 --- a/python/ray/tests/test_wait.py +++ b/python/ray/tests/test_wait.py @@ -1,13 +1,13 @@ # coding: utf-8 -import pytest -import numpy as np -import time import logging import sys +import time -from ray._private.test_utils import client_test_enabled +import numpy as np +import pytest +from ray._private.test_utils import client_test_enabled if client_test_enabled(): from ray.util.client import ray diff --git a/python/ray/tests/test_widgets.py b/python/ray/tests/test_widgets.py index 1c5273aa4a32..226568f944b9 100644 --- a/python/ray/tests/test_widgets.py +++ b/python/ray/tests/test_widgets.py @@ -6,7 +6,7 @@ import pytest import ray -from ray.widgets.util import repr_with_fallback, _can_display_ipywidgets +from ray.widgets.util import _can_display_ipywidgets, repr_with_fallback @pytest.fixture diff --git a/python/ray/tests/test_worker_capping.py b/python/ray/tests/test_worker_capping.py index 58b499ee7608..8861c7204176 100644 --- a/python/ray/tests/test_worker_capping.py +++ b/python/ray/tests/test_worker_capping.py @@ -1,10 +1,11 @@ import asyncio import os -import pytest import sys import tempfile import time +import pytest + import ray from ray._common.test_utils import Semaphore diff --git a/python/ray/tests/test_worker_graceful_shutdown.py b/python/ray/tests/test_worker_graceful_shutdown.py index 7ac893beb06f..56eb5ab4d0c4 100644 --- a/python/ray/tests/test_worker_graceful_shutdown.py +++ b/python/ray/tests/test_worker_graceful_shutdown.py @@ -6,8 +6,7 @@ import pytest import ray -from 
ray._common.test_utils import SignalActor -from ray._common.test_utils import wait_for_condition +from ray._common.test_utils import SignalActor, wait_for_condition @pytest.mark.skipif( diff --git a/python/ray/tests/test_worker_state.py b/python/ray/tests/test_worker_state.py index 5a93aefb5c84..4fa4ab6d38da 100644 --- a/python/ray/tests/test_worker_state.py +++ b/python/ray/tests/test_worker_state.py @@ -1,14 +1,14 @@ -import pytest import sys import threading +import pytest + import ray from ray._common.test_utils import ( wait_for_condition, ) from ray.util.state import list_workers - _SYSTEM_CONFIG = { "task_events_report_interval_ms": 100, "metrics_report_interval_ms": 200, diff --git a/python/ray/tests/typing_files/check_typing_good.py b/python/ray/tests/typing_files/check_typing_good.py index 97d4ed116c34..3e1e96190d90 100644 --- a/python/ray/tests/typing_files/check_typing_good.py +++ b/python/ray/tests/typing_files/check_typing_good.py @@ -1,5 +1,6 @@ -import ray from typing import Generator + +import ray from ray import ObjectRef ray.init() diff --git a/python/ray/tests/unit/BUILD b/python/ray/tests/unit/BUILD.bazel similarity index 100% rename from python/ray/tests/unit/BUILD rename to python/ray/tests/unit/BUILD.bazel diff --git a/python/ray/tests/unit/test_node_affinity_validation.py b/python/ray/tests/unit/test_node_affinity_validation.py index 88d93c3c6ba4..d5cbc892f96d 100644 --- a/python/ray/tests/unit/test_node_affinity_validation.py +++ b/python/ray/tests/unit/test_node_affinity_validation.py @@ -1,6 +1,7 @@ -import pytest import sys +import pytest + from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy diff --git a/python/ray/tests/unit/test_resource_and_label_spec.py b/python/ray/tests/unit/test_resource_and_label_spec.py index 53fed3ef9cc1..b1cbf69a5603 100644 --- a/python/ray/tests/unit/test_resource_and_label_spec.py +++ b/python/ray/tests/unit/test_resource_and_label_spec.py @@ -1,9 +1,11 @@ -import sys import json 
-import pytest +import sys from unittest.mock import patch -from ray._common.constants import HEAD_NODE_RESOURCE_NAME, NODE_ID_PREFIX + +import pytest + import ray._private.ray_constants as ray_constants +from ray._common.constants import HEAD_NODE_RESOURCE_NAME, NODE_ID_PREFIX from ray._private.accelerators import AcceleratorManager from ray._private.resource_and_label_spec import ResourceAndLabelSpec diff --git a/python/ray/tests/unit/test_runtime_env.py b/python/ray/tests/unit/test_runtime_env.py index adb8b17e6bfc..494ebb4ca255 100644 --- a/python/ray/tests/unit/test_runtime_env.py +++ b/python/ray/tests/unit/test_runtime_env.py @@ -1,20 +1,17 @@ -from dataclasses import dataclass import dataclasses import json import os import subprocess import sys import tempfile +from dataclasses import dataclass from typing import Any, Dict from unittest import mock import pytest -from ray.runtime_env.runtime_env import ( - RuntimeEnvConfig, - _merge_runtime_env, -) import ray +import ray._private.ray_constants as ray_constants from ray._private.runtime_env.uri_cache import URICache from ray._private.runtime_env.utils import ( SubprocessCalledProcessError, @@ -24,8 +21,10 @@ chdir, ) from ray.runtime_env import RuntimeEnv - -import ray._private.ray_constants as ray_constants +from ray.runtime_env.runtime_env import ( + RuntimeEnvConfig, + _merge_runtime_env, +) def test_runtime_env_merge(): diff --git a/python/ray/tests/unit/test_runtime_env_uv.py b/python/ray/tests/unit/test_runtime_env_uv.py index b4e210049003..2d8aeec68d5a 100644 --- a/python/ray/tests/unit/test_runtime_env_uv.py +++ b/python/ray/tests/unit/test_runtime_env_uv.py @@ -1,9 +1,10 @@ -from ray._private.runtime_env import uv - -import pytest import sys from unittest.mock import patch +import pytest + +from ray._private.runtime_env import uv + class TestRuntimeEnv: def uv_config(self): diff --git a/python/ray/tests/unit/test_runtime_env_validation.py b/python/ray/tests/unit/test_runtime_env_validation.py index 
df848c8d4c3f..70d722754300 100644 --- a/python/ray/tests/unit/test_runtime_env_validation.py +++ b/python/ray/tests/unit/test_runtime_env_validation.py @@ -1,25 +1,25 @@ import os -from pathlib import Path import sys import tempfile -import yaml +from pathlib import Path import jsonschema import pytest +import yaml from ray import job_config from ray._private.runtime_env import validation -from ray.runtime_env import RuntimeEnv -from ray.runtime_env.runtime_env import ( - _validate_no_local_paths, -) +from ray._private.runtime_env.plugin_schema_manager import RuntimeEnvPluginSchemaManager from ray._private.runtime_env.validation import ( - parse_and_validate_excludes, - parse_and_validate_working_dir, parse_and_validate_conda, + parse_and_validate_excludes, parse_and_validate_py_modules, + parse_and_validate_working_dir, +) +from ray.runtime_env import RuntimeEnv +from ray.runtime_env.runtime_env import ( + _validate_no_local_paths, ) -from ray._private.runtime_env.plugin_schema_manager import RuntimeEnvPluginSchemaManager _CONDA_DICT = {"dependencies": ["pip", {"pip": ["pip-install-test==0.5"]}]} _PIP_LIST = ["requests==1.0.0", "pip-install-test"] diff --git a/python/ray/tests/vsphere/test_cluster_operator.py b/python/ray/tests/vsphere/test_cluster_operator.py index 7ca46872257a..cecd009d8abe 100644 --- a/python/ray/tests/vsphere/test_cluster_operator.py +++ b/python/ray/tests/vsphere/test_cluster_operator.py @@ -8,6 +8,10 @@ import pytest +from ray.autoscaler._private.vsphere.cluster_operator_client import ( + ClusterOperatorClient, + VMNodeStatus, +) from ray.autoscaler.tags import ( NODE_KIND_HEAD, NODE_KIND_WORKER, @@ -20,11 +24,6 @@ TAG_RAY_NODE_STATUS, TAG_RAY_USER_NODE_TYPE, ) -from ray.autoscaler._private.vsphere.cluster_operator_client import ( - ClusterOperatorClient, - VMNodeStatus, -) - _CLUSTER_NAME = "ray-cluster" _PROVIDER_CONFIG = { @@ -146,8 +145,8 @@ def create_random_pvt_key(): - from cryptography.hazmat.primitives.asymmetric import rsa from 
cryptography.hazmat.primitives import serialization + from cryptography.hazmat.primitives.asymmetric import rsa private_key = rsa.generate_private_key(public_exponent=65537, key_size=2048) pem_private_key = private_key.private_bytes( diff --git a/python/ray/tests/vsphere/test_vmray_node_provider.py b/python/ray/tests/vsphere/test_vmray_node_provider.py index cceb7a0d1eba..cd1e9889e6a7 100644 --- a/python/ray/tests/vsphere/test_vmray_node_provider.py +++ b/python/ray/tests/vsphere/test_vmray_node_provider.py @@ -5,14 +5,13 @@ import pytest +from ray.autoscaler._private.vsphere.node_provider import VsphereWcpNodeProvider from ray.autoscaler.tags import ( + STATUS_SETTING_UP, TAG_RAY_CLUSTER_NAME, TAG_RAY_NODE_NAME, TAG_RAY_NODE_STATUS, ) -from ray.autoscaler._private.vsphere.node_provider import VsphereWcpNodeProvider - -from ray.autoscaler.tags import STATUS_SETTING_UP _CLUSTER_NAME = "test" _PROVIDER_CONFIG = { diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD.bazel similarity index 100% rename from python/ray/train/BUILD rename to python/ray/train/BUILD.bazel diff --git a/python/ray/train/__init__.py b/python/ray/train/__init__.py index 7713ccb705af..a7cd2aad13af 100644 --- a/python/ray/train/__init__.py +++ b/python/ray/train/__init__.py @@ -34,8 +34,11 @@ RunConfig, ScalingConfig, ) + from ray.train.v2.api.report_config import CheckpointUploadMode # noqa: F811 + from ray.train.v2.api.reported_checkpoint import ReportedCheckpoint # noqa: F811 from ray.train.v2.api.result import Result # noqa: F811 from ray.train.v2.api.train_fn_utils import ( # noqa: F811 + get_all_reported_checkpoints, get_checkpoint, get_context, get_dataset_shard, @@ -76,9 +79,16 @@ SyncConfig.__module__ = "ray.train" TrainingIterator.__module__ = "ray.train" +# TODO: consider implementing these in v1 and raising ImportError instead. 
if is_v2_enabled(): __all__.append("UserCallback") UserCallback.__module__ = "ray.train" + __all__.append("CheckpointUploadMode") + CheckpointUploadMode.__module__ = "ray.train" + __all__.append("get_all_reported_checkpoints") + get_all_reported_checkpoints.__module__ = "ray.train" + __all__.append("ReportedCheckpoint") + ReportedCheckpoint.__module__ = "ray.train" # DO NOT ADD ANYTHING AFTER THIS LINE. diff --git a/python/ray/train/_internal/session.py b/python/ray/train/_internal/session.py index dec5b062ef4f..1bef98b15025 100644 --- a/python/ray/train/_internal/session.py +++ b/python/ray/train/_internal/session.py @@ -36,6 +36,7 @@ ) from ray.train.error import SessionMisuseError from ray.train.utils import _log_deprecation_warning +from ray.util import queue as ray_queue from ray.util.annotations import DeveloperAPI, PublicAPI from ray.util.debug import log_once from ray.util.placement_group import _valid_resource_shape @@ -205,6 +206,9 @@ def reset( # Queue for sending results across threads. self.result_queue = queue.Queue(1) + # Queue for sending results from training actor to main thread. + self._inter_actor_queue: Optional[ray_queue.Queue[Dict]] = None + # Queue for raising exceptions from runner thread to main thread. # The error queue has a max size of one to prevent stacking error and force # error reporting to block until finished. @@ -282,24 +286,14 @@ def get_next(self) -> Optional[_TrainingResult]: result = None # While training is still ongoing, attempt to get the result. while result is None and self.training_thread.is_alive(): - try: - result = self.result_queue.get( - block=True, timeout=_RESULT_FETCH_TIMEOUT - ) - except queue.Empty: - pass + result = self._get_result_from_queues(block=True) # If no result was found, then the runner must no longer be alive. if result is None: # Try one last time to fetch results in case results were # reported in between the time of the last check and the # termination of the thread runner. 
- try: - result = self.result_queue.get( - block=False, timeout=_RESULT_FETCH_TIMEOUT - ) - except queue.Empty: - pass + result = self._get_result_from_queues(block=False) # check if error occurred inside the thread runner. if result is None: @@ -325,6 +319,32 @@ def get_next(self) -> Optional[_TrainingResult]: # Return None if there are no more results to fetch. return result + def _get_or_create_inter_actor_queue(self): + """Get or create the inter-actor queue.""" + if self._inter_actor_queue is None: + self._inter_actor_queue = ray_queue.Queue(1, actor_options={"num_cpus": 0}) + return self._inter_actor_queue + + def _get_result_from_queues(self, block: bool) -> Optional[_TrainingResult]: + """Get result from result queue. Pass result from training actor result queue if needed.""" + result = None + if self._inter_actor_queue is not None: + try: + inter_actor_item = self._inter_actor_queue.get( + block=block, timeout=_RESULT_FETCH_TIMEOUT + ) + if inter_actor_item: + # Must release continue_lock to allow report to work. + self.continue_lock.release() + self.report(inter_actor_item) + except ray_queue.Empty: + pass + try: + result = self.result_queue.get(block=block, timeout=_RESULT_FETCH_TIMEOUT) + except queue.Empty: + pass + return result + def _auto_fill_metrics(self, result: dict) -> dict: """Add autofilled metrics and update attributes.""" current_time = time.time() diff --git a/python/ray/train/_internal/worker_group.py b/python/ray/train/_internal/worker_group.py index 853502b3512f..e64de700da39 100644 --- a/python/ray/train/_internal/worker_group.py +++ b/python/ray/train/_internal/worker_group.py @@ -269,6 +269,8 @@ def execute(self, func: Callable[..., T], *args, **kwargs) -> List[T]: worker. The order is the same as ``self.workers``. 
""" + # TODO: Add a timeout in the case of a hang, particularly + # relevant when func is TorchConfig.on_shutdown return ray.get(self.execute_async(func, *args, **kwargs)) def execute_single_async( diff --git a/python/ray/train/collective/collectives.py b/python/ray/train/collective/collectives.py index c35c43564bbe..8c3dc0e43916 100644 --- a/python/ray/train/collective/collectives.py +++ b/python/ray/train/collective/collectives.py @@ -1,15 +1,9 @@ import logging from typing import Optional, TypeVar -import ray -import ray.cloudpickle as pickle -from ray.train.v2._internal.execution.context import get_train_context +from ray.train.v2._internal.execution.train_fn_utils import get_train_fn_utils from ray.util.annotations import PublicAPI -# For reference, {1:1} is 19 bytes, {"1":"1"} is 21 bytes, -# and {"12345": "12345"} is 25 bytes. -_MAX_BROADCAST_SIZE_BYTES = 1000 - T = TypeVar("T", bound=Optional[object]) @@ -56,26 +50,7 @@ def train_func(): pickle.PicklingError: If the data is not pickleable. TypeError: If the data is not pickleable. """ - # Validate data. - if data is not None: - data_bytes = len(pickle.dumps(data)) - if data_bytes > _MAX_BROADCAST_SIZE_BYTES: - logger.warning( - f"Data size {data_bytes} bytes exceeds the maximum broadcast " - f"size of {_MAX_BROADCAST_SIZE_BYTES} bytes" - ) - - # Send data to all workers. 
- train_context = get_train_context() - sync_actor = train_context.get_synchronization_actor() - return ray.get( - sync_actor.broadcast_from_rank_zero.remote( - world_rank=train_context.get_world_rank(), - world_size=train_context.get_world_size(), - data=data, - caller_method_name="ray.train.collective.broadcast_from_rank_zero", - ) - ) + return get_train_fn_utils().broadcast_from_rank_zero(data) @PublicAPI(stability="alpha") @@ -103,13 +78,4 @@ def train_func(): trainer = TorchTrainer(train_func) trainer.fit() """ - train_context = get_train_context() - sync_actor = train_context.get_synchronization_actor() - return ray.get( - sync_actor.broadcast_from_rank_zero.remote( - world_rank=train_context.get_world_rank(), - world_size=train_context.get_world_size(), - data=None, - caller_method_name="ray.train.collective.barrier", - ) - ) + return get_train_fn_utils().barrier() diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py index d76965a10338..8e2294827dcd 100644 --- a/python/ray/train/constants.py +++ b/python/ray/train/constants.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import Any import ray from ray._private.ray_constants import env_bool @@ -43,7 +44,9 @@ def _get_ray_train_session_dir() -> str: TUNE_CHECKPOINT_ID = "_current_checkpoint_id" # Deprecated configs can use this value to detect if the user has set it. -_DEPRECATED_VALUE = "DEPRECATED" +# This has type Any to allow it to be assigned to any annotated parameter +# without causing type errors. +_DEPRECATED_VALUE: Any = "DEPRECATED" # ================================================== @@ -122,6 +125,12 @@ def _v2_migration_warnings_enabled() -> bool: "TUNE_ONLY_STORE_CHECKPOINT_SCORE_ATTRIBUTE" ) +# Seconds to wait for torch process group to shut down. +# Shutting down a healthy torch process group, which we may want to do for reasons +# like restarting a group of workers if an async checkpoint upload fails, can hang. 
+# This is a workaround until we figure out how to avoid this hang. +TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S = "TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S" +DEFAULT_TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S = 30 # NOTE: When adding a new environment variable, please track it in this list. TRAIN_ENV_VARS = { @@ -134,6 +143,7 @@ def _v2_migration_warnings_enabled() -> bool: RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE, RAY_TRAIN_ENABLE_STATE_TRACKING, TUNE_ONLY_STORE_CHECKPOINT_SCORE_ATTRIBUTE, + TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S, } # Key for AIR Checkpoint metadata in TrainingResult metadata diff --git a/python/ray/train/huggingface/transformers/_transformers_utils.py b/python/ray/train/huggingface/transformers/_transformers_utils.py index b195a869f304..7f3eaeefac4a 100644 --- a/python/ray/train/huggingface/transformers/_transformers_utils.py +++ b/python/ray/train/huggingface/transformers/_transformers_utils.py @@ -2,7 +2,7 @@ import shutil from pathlib import Path from tempfile import TemporaryDirectory -from typing import Iterator, Optional, Type +from typing import Iterator, Optional, Type, Union from torch.utils.data import DataLoader, Dataset, IterableDataset @@ -126,12 +126,19 @@ def get_train_dataloader(self) -> DataLoader: return super().get_train_dataloader() def get_eval_dataloader( - self, eval_dataset: Optional[Dataset] = None + self, eval_dataset: Optional[Union[str, Dataset]] = None ) -> DataLoader: if eval_dataset is None: eval_dataset = self.eval_dataset - if isinstance(eval_dataset, _IterableFromIterator): + if ( + isinstance(eval_dataset, str) + and isinstance(self.eval_dataset, dict) + and isinstance(self.eval_dataset[eval_dataset], _IterableFromIterator) + ): + dataset = RayTorchIterableDataset(self.eval_dataset[eval_dataset]) + return DataLoader(dataset, batch_size=1, collate_fn=lambda x: x[0]) + elif isinstance(eval_dataset, _IterableFromIterator): dataset = RayTorchIterableDataset(eval_dataset) return DataLoader(dataset, batch_size=1, collate_fn=lambda 
x: x[0]) else: diff --git a/python/ray/train/lightning/__init__.py b/python/ray/train/lightning/__init__.py index c8e413a10308..8be5886a805c 100644 --- a/python/ray/train/lightning/__init__.py +++ b/python/ray/train/lightning/__init__.py @@ -19,12 +19,6 @@ RayTrainReportCallback, prepare_trainer, ) -from ray.train.v2._internal.constants import is_v2_enabled - -if is_v2_enabled(): - from ray.train.v2.lightning.lightning_utils import ( # noqa: F811 - RayTrainReportCallback, - ) __all__ = [ "prepare_trainer", diff --git a/python/ray/train/lightning/_lightning_utils.py b/python/ray/train/lightning/_lightning_utils.py index 2157287af516..2da924a3357c 100644 --- a/python/ray/train/lightning/_lightning_utils.py +++ b/python/ray/train/lightning/_lightning_utils.py @@ -9,7 +9,7 @@ from packaging.version import Version import ray -from ray import train +import ray.train from ray._common.usage.usage_lib import TagKey, record_extra_usage_tag from ray.train import Checkpoint from ray.util import PublicAPI @@ -182,16 +182,16 @@ def __init__(self, *args, **kwargs): record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYLIGHTNINGENVIRONMENT, "1") def world_size(self) -> int: - return train.get_context().get_world_size() + return ray.train.get_context().get_world_size() def global_rank(self) -> int: - return train.get_context().get_world_rank() + return ray.train.get_context().get_world_rank() def local_rank(self) -> int: - return train.get_context().get_local_rank() + return ray.train.get_context().get_local_rank() def node_rank(self) -> int: - return train.get_context().get_node_rank() + return ray.train.get_context().get_node_rank() def set_world_size(self, size: int) -> None: # Disable it since `world_size()` directly returns data from Train context. 
@@ -259,9 +259,14 @@ class RayTrainReportCallback(pl.callbacks.Callback): def __init__(self) -> None: super().__init__() - self.trial_name = train.get_context().get_trial_name() - self.local_rank = train.get_context().get_local_rank() - self.tmpdir_prefix = Path(tempfile.gettempdir(), self.trial_name).as_posix() + job_id = ray.get_runtime_context().get_job_id() + experiment_name = ray.train.get_context().get_experiment_name() + self.local_rank = ray.train.get_context().get_local_rank() + + self.tmpdir_prefix = Path( + tempfile.gettempdir(), + f"lightning_checkpoints-job_id={job_id}-name={experiment_name}", + ).as_posix() if os.path.isdir(self.tmpdir_prefix) and self.local_rank == 0: shutil.rmtree(self.tmpdir_prefix) @@ -286,7 +291,7 @@ def on_train_epoch_end(self, trainer, pl_module) -> None: # Report to train session checkpoint = Checkpoint.from_directory(tmpdir) - train.report(metrics=metrics, checkpoint=checkpoint) + ray.train.report(metrics=metrics, checkpoint=checkpoint) # Add a barrier to ensure all workers finished reporting here trainer.strategy.barrier() diff --git a/python/ray/train/tests/test_backend.py b/python/ray/train/tests/test_backend.py index 3bf9e45a5449..7ea35b11e6bc 100644 --- a/python/ray/train/tests/test_backend.py +++ b/python/ray/train/tests/test_backend.py @@ -28,6 +28,7 @@ from ray.train.constants import ( ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV, ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV, + TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S, TRAIN_ENABLE_WORKER_SPREAD_ENV, ) from ray.train.torch import TorchConfig @@ -364,6 +365,24 @@ def check_process_group(): assert not any(e.finish_training()) +@pytest.mark.parametrize( + "init_method, timeout_s", [("env", 5), ("tcp", 5), ("env", 0), ("tcp", 0)] +) +def test_torch_process_group_shutdown_timeout( + ray_start_2_cpus, monkeypatch, init_method, timeout_s +): + monkeypatch.setenv(TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S, timeout_s) + torch_config = TorchConfig(backend="gloo", init_method=init_method) + e = 
BackendExecutor(torch_config, num_workers=2) + e.start() + + _start_training(e, lambda: 1) + assert e.finish_training() == [1, 1] + + # Verify that we do not raise an exception even if we time out + e._backend.on_shutdown(e.worker_group, e._backend_config) + + @pytest.mark.parametrize( "worker_results", [ diff --git a/python/ray/train/tests/test_session.py b/python/ray/train/tests/test_session.py index c70e4ecbbd8e..071aeeaa18af 100644 --- a/python/ray/train/tests/test_session.py +++ b/python/ray/train/tests/test_session.py @@ -17,6 +17,7 @@ ) from ray.train._internal.accelerator import Accelerator from ray.train._internal.session import ( + _TrainingResult, get_accelerator, get_session, init_session, @@ -170,6 +171,47 @@ def test_report_after_finish(session): shutdown_session() +@pytest.mark.parametrize( + "block,put_result_queue,put_actor_queue", + [ + (False, False, False), + (False, False, True), + (False, True, False), + (True, False, False), + (True, False, True), + (True, True, False), + ], +) +def test_get_result_from_queues(session, block, put_result_queue, put_actor_queue): + """Verify that we get the expected _TrainingResult from each result queue. + + `block` describes whether we wait for a result or return after a timeout. + This argument should have no impact on this unit test. + `put_result_queue` and `put_actor_queue` are mutually exclusive and describe + which queue has results to process. The returned _TrainingResult should be + from the expected queue. 
+ """ + result_queue_training_result = _TrainingResult( + checkpoint=None, + metrics={"result_queue_metric_key": "result_queue_metric_value"}, + ) + if put_result_queue: + session.result_queue.put(result_queue_training_result, block=True) + inter_actor_result = {"inter_actor_metric_key": "inter_actor_metric_value"} + if put_actor_queue: + session._get_or_create_inter_actor_queue().put(inter_actor_result, block=True) + result = session._get_result_from_queues(block=block) + if put_result_queue: + assert result == result_queue_training_result + elif put_actor_queue: + assert ( + result.metrics["inter_actor_metric_key"] + == inter_actor_result["inter_actor_metric_key"] + ) + else: + assert result is None + + def test_no_start(session): with pytest.raises(RuntimeError): session.get_next() diff --git a/python/ray/train/tests/test_torch_transformers_train.py b/python/ray/train/tests/test_torch_transformers_train.py index 94eb03715dea..70b67ec3883e 100644 --- a/python/ray/train/tests/test_torch_transformers_train.py +++ b/python/ray/train/tests/test_torch_transformers_train.py @@ -55,6 +55,7 @@ def ray_start_8_cpus(): "save_steps": None, "logging_steps": None, "no_cuda": False, + "use_dict_eval_datasets": False, }, "steps_gpu": { "evaluation_strategy": "steps", @@ -64,6 +65,7 @@ def ray_start_8_cpus(): "save_steps": STEPS_PER_EPOCH * 2, "logging_steps": 1, "no_cuda": False, + "use_dict_eval_datasets": False, }, "steps_cpu": { "evaluation_strategy": "steps", @@ -73,6 +75,7 @@ def ray_start_8_cpus(): "save_steps": STEPS_PER_EPOCH, "logging_steps": 1, "no_cuda": True, + "use_dict_eval_datasets": False, }, } @@ -81,14 +84,27 @@ def train_func(config): # Datasets if config["use_ray_data"]: train_ds_shard = ray.train.get_dataset_shard("train") - eval_ds_shard = ray.train.get_dataset_shard("eval") - train_dataset = train_ds_shard.iter_torch_batches( batch_size=BATCH_SIZE_PER_WORKER ) - eval_dataset = eval_ds_shard.iter_torch_batches( - batch_size=BATCH_SIZE_PER_WORKER - ) + if 
config["use_dict_eval_datasets"]: + eval_ds_shard_1 = ray.train.get_dataset_shard("eval_1") + eval_ds_shard_2 = ray.train.get_dataset_shard("eval_2") + + eval_dataset = { + "eval_1": eval_ds_shard_1.iter_torch_batches( + batch_size=BATCH_SIZE_PER_WORKER + ), + "eval_2": eval_ds_shard_2.iter_torch_batches( + batch_size=BATCH_SIZE_PER_WORKER + ), + } + else: + eval_ds_shard = ray.train.get_dataset_shard("eval") + + eval_dataset = eval_ds_shard.iter_torch_batches( + batch_size=BATCH_SIZE_PER_WORKER + ) else: train_df = pd.read_json(train_data) validation_df = pd.read_json(validation_data) @@ -201,6 +217,48 @@ def test_e2e_ray_data(ray_start_6_cpus_2_gpus, config_id): assert "eval_loss" in result.metrics +@pytest.mark.parametrize("config_id", ["steps_gpu", "steps_cpu"]) +def test_e2e_dict_eval_ray_data(ray_start_6_cpus_2_gpus, config_id): + train_loop_config = CONFIGURATIONS[config_id] + + # Must specify `max_steps` for Iterable Dataset + train_loop_config["use_ray_data"] = True + train_loop_config["use_dict_eval_datasets"] = True + train_loop_config["max_steps"] = MAX_STEPS + + # Calculate the num of Ray training iterations + num_iterations = MAX_STEPS // train_loop_config["save_steps"] + + train_df = pd.read_json(train_data) + validation_df = pd.read_json(validation_data) + + ray_train_ds = ray.data.from_pandas(train_df) + ray_eval_ds_1 = ray.data.from_pandas(validation_df) + ray_eval_ds_2 = ray.data.from_pandas(validation_df) + + use_gpu = not train_loop_config["no_cuda"] + + trainer = TorchTrainer( + train_func, + train_loop_config=train_loop_config, + scaling_config=ScalingConfig(num_workers=NUM_WORKERS, use_gpu=use_gpu), + datasets={ + "train": ray_train_ds, + "eval_1": ray_eval_ds_1, + "eval_2": ray_eval_ds_2, + }, + ) + result = trainer.fit() + + assert result.metrics["step"] == MAX_STEPS + assert result.metrics["training_iteration"] == num_iterations + assert result.checkpoint + assert isinstance(result.checkpoint, Checkpoint) + assert 
len(result.best_checkpoints) == num_iterations + assert "eval_eval_1_loss" in result.metrics + assert "eval_eval_2_loss" in result.metrics + + # Tests if Ray Tune works correctly. def test_tune(ray_start_8_cpus): train_loop_config = CONFIGURATIONS["steps_cpu"] diff --git a/python/ray/train/torch/__init__.py b/python/ray/train/torch/__init__.py index db989336afd1..1774b98cb18a 100644 --- a/python/ray/train/torch/__init__.py +++ b/python/ray/train/torch/__init__.py @@ -30,6 +30,8 @@ accelerate, backward, enable_reproducibility, + get_device, + get_devices, prepare_data_loader, prepare_model, prepare_optimizer, diff --git a/python/ray/train/torch/config.py b/python/ray/train/torch/config.py index 9acc0774d5a5..3e9e41c81ac9 100644 --- a/python/ray/train/torch/config.py +++ b/python/ray/train/torch/config.py @@ -10,10 +10,16 @@ import ray from ray._common.network_utils import build_address +from ray._private import ray_constants from ray.air._internal.device_manager import register_custom_torch_dist_backend +from ray.exceptions import GetTimeoutError from ray.train._internal.utils import get_address_and_port from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig +from ray.train.constants import ( + DEFAULT_TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S, + TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S, +) from ray.util import PublicAPI logger = logging.getLogger(__name__) @@ -202,11 +208,21 @@ def set_env_vars(addr, port): else: raise RuntimeError("Distributed torch is not available.") - def on_shutdown(self, worker_group: WorkerGroup, backend_config: TorchConfig): - worker_group.execute( + def on_shutdown(self, worker_group, backend_config): + futures = worker_group.execute_async( _shutdown_torch, destroy_process_group=len(worker_group) > 1, ) + timeout_s = ray_constants.env_integer( + TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S, + DEFAULT_TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S, + ) + try: + ray.get(futures, timeout=timeout_s) + except 
GetTimeoutError: + logger.warning( + f"Torch process group shutdown timed out after {timeout_s} seconds" + ) def on_training_start( self, worker_group: WorkerGroup, backend_config: BackendConfig diff --git a/python/ray/train/v2/BUILD b/python/ray/train/v2/BUILD.bazel similarity index 85% rename from python/ray/train/v2/BUILD rename to python/ray/train/v2/BUILD.bazel index 502056cb9885..c19c054dec37 100644 --- a/python/ray/train/v2/BUILD +++ b/python/ray/train/v2/BUILD.bazel @@ -21,6 +21,22 @@ py_test( ], ) +py_test( + name = "test_async_checkpointing", + size = "medium", + srcs = ["tests/test_async_checkpointing.py"], + env = {"RAY_TRAIN_V2_ENABLED": "1"}, + tags = [ + "exclusive", + "team:ml", + "train_v2", + ], + deps = [ + ":conftest", + "//:ray_lib", + ], +) + py_test( name = "test_checkpoint_manager", size = "small", @@ -71,7 +87,7 @@ py_test( py_test( name = "test_data_integration", - size = "small", + size = "medium", srcs = ["tests/test_data_integration.py"], env = {"RAY_TRAIN_V2_ENABLED": "1"}, tags = [ @@ -133,6 +149,22 @@ py_test( ], ) +py_test( + name = "test_jax_trainer", + size = "small", + srcs = ["tests/test_jax_trainer.py"], + env = {"RAY_TRAIN_V2_ENABLED": "1"}, + tags = [ + "exclusive", + "team:ml", + "train_v2", + ], + deps = [ + ":conftest", + "//:ray_lib", + ], +) + py_test( name = "test_lightgbm_trainer", size = "small", @@ -453,6 +485,22 @@ py_test( ], ) +py_test( + name = "test_worker", + size = "small", + srcs = ["tests/test_worker.py"], + env = {"RAY_TRAIN_V2_ENABLED": "1"}, + tags = [ + "exclusive", + "team:ml", + "train_v2", + ], + deps = [ + ":conftest", + "//:ray_lib", + ], +) + py_test( name = "test_worker_group", size = "medium", @@ -469,6 +517,22 @@ py_test( ], ) +py_test( + name = "test_worker_group_poll_status", + size = "small", + srcs = ["tests/test_worker_group_poll_status.py"], + env = {"RAY_TRAIN_V2_ENABLED": "1"}, + tags = [ + "exclusive", + "team:ml", + "train_v2", + ], + deps = [ + ":conftest", + "//:ray_lib", + ], +) + 
py_test( name = "test_xgboost_trainer", size = "small", @@ -484,3 +548,19 @@ py_test( "//:ray_lib", ], ) + +py_test( + name = "test_local_mode", + size = "medium", + srcs = ["tests/test_local_mode.py"], + env = {"RAY_TRAIN_V2_ENABLED": "1"}, + tags = [ + "exclusive", + "team:ml", + "train_v2", + ], + deps = [ + ":conftest", + "//:ray_lib", + ], +) diff --git a/python/ray/train/v2/_internal/callbacks/__init__.py b/python/ray/train/v2/_internal/callbacks/__init__.py index 5c5b204acdcf..3db8d835fba3 100644 --- a/python/ray/train/v2/_internal/callbacks/__init__.py +++ b/python/ray/train/v2/_internal/callbacks/__init__.py @@ -2,6 +2,7 @@ from .backend_setup import BackendSetupCallback from .datasets import DatasetsSetupCallback from .state_manager import StateManagerCallback +from .tpu_reservation_callback import TPUReservationCallback from .working_dir_setup import WorkingDirectorySetupCallback __all__ = [ @@ -9,6 +10,7 @@ "BackendSetupCallback", "DatasetsSetupCallback", "StateManagerCallback", + "TPUReservationCallback", "WorkingDirectorySetupCallback", ] diff --git a/python/ray/train/v2/_internal/callbacks/datasets.py b/python/ray/train/v2/_internal/callbacks/datasets.py index a51b633d457a..908411cab443 100644 --- a/python/ray/train/v2/_internal/callbacks/datasets.py +++ b/python/ray/train/v2/_internal/callbacks/datasets.py @@ -1,32 +1,46 @@ import copy -from typing import Any, Callable, Dict, List, Union +from typing import Dict, List import ray.train -from ray.data import Dataset +from ray.data import DataIterator from ray.data.context import DataContext +from ray.train.v2._internal.data_integration.interfaces import ( + DatasetShardMetadata, + DatasetShardProvider, +) from ray.train.v2._internal.execution.callback import WorkerGroupCallback +from ray.train.v2._internal.execution.context import TrainRunContext from ray.train.v2._internal.execution.worker_group.worker_group import ( Worker, WorkerGroup, ) -# A type representing either a ray.data.Dataset or a 
function that returns a -# ray.data.Dataset and accepts no arguments. -GenDataset = Union[Dataset, Callable[[], Dataset]] + +class RayDatasetShardProvider: + """A shard provider that Train workers use to access a DataIterator for a dataset.""" + + def __init__(self, ds_iterators: Dict[str, DataIterator]): + # Maps dataset_name to a DataIterator. + self._dataset_iterators = ds_iterators + + def get_dataset_shard(self, dataset_info: DatasetShardMetadata) -> DataIterator: + if dataset_info.dataset_name not in self._dataset_iterators: + raise KeyError( + f"Dataset shard for '{dataset_info.dataset_name}' not found. " + "Please ensure that the dataset is passed through the Trainer `datasets` " + "argument." + ) + + return self._dataset_iterators[dataset_info.dataset_name] class DatasetsSetupCallback(WorkerGroupCallback): """The callback to setup Ray Datasets for the worker group.""" - def __init__( - self, - datasets: Dict[str, GenDataset], - data_config: ray.train.DataConfig, - scaling_config: ray.train.ScalingConfig, - ): - self._datasets = datasets - self._data_config = data_config - self._scaling_config = scaling_config + def __init__(self, train_run_context: TrainRunContext): + self._datasets = train_run_context.datasets + self._data_config = copy.deepcopy(train_run_context.dataset_config) + self._scaling_config = train_run_context.scaling_config # Capture the current DataContext to propagate it to # the Train workers later. 
@@ -45,10 +59,15 @@ def get_train_total_resources( these resources logically from its available pool.""" return scaling_config.total_resources - def before_init_train_context(self, workers: List[Worker]) -> Dict[str, List[Any]]: - # Configure dataset shards - datasets = {k: v() if callable(v) else v for k, v in self._datasets.items()} - node_ids = [worker.metadata.node_id for worker in workers] + # -------------------------- + # WorkerGroupCallback + # -------------------------- + + def before_init_train_context( + self, workers: List[Worker] + ) -> Dict[str, List[DatasetShardProvider]]: + world_size = len(workers) + worker_node_ids = [worker.metadata.node_id for worker in workers] # Notify the DataConfig about the total resources reserved for training. total_train_resources = self.get_train_total_resources(self._scaling_config) @@ -56,15 +75,20 @@ def before_init_train_context(self, workers: List[Worker]) -> Dict[str, List[Any total_train_resources.get("CPU", 0), total_train_resources.get("GPU", 0) ) - dataset_shards = self._data_config.configure( - datasets, - world_size=len(workers), + datasets = {k: v() if callable(v) else v for k, v in self._datasets.items()} + ds_iterators_per_rank = self._data_config.configure( + datasets=datasets, + world_size=world_size, worker_handles=None, - worker_node_ids=node_ids, + worker_node_ids=worker_node_ids, ) - assert len(dataset_shards) == len(workers) + assert len(ds_iterators_per_rank) == world_size - return {"dataset_shards": dataset_shards} + shard_providers_per_rank = [ + RayDatasetShardProvider(ds_iterators=ds_iterators_per_rank[rank]) + for rank in range(world_size) + ] + return {"dataset_shard_provider": shard_providers_per_rank} def after_worker_group_start(self, worker_group: WorkerGroup): # Propagate DataContext diff --git a/python/ray/train/v2/_internal/callbacks/tpu_reservation_callback.py b/python/ray/train/v2/_internal/callbacks/tpu_reservation_callback.py new file mode 100644 index 000000000000..acb7b70847ea 
--- /dev/null +++ b/python/ray/train/v2/_internal/callbacks/tpu_reservation_callback.py @@ -0,0 +1,45 @@ +from typing import Dict, Optional + +import ray +from ray._private.accelerators.tpu import reserve_tpu_slice +from ray.train.v2._internal.execution.callback import ControllerCallback +from ray.train.v2.api.config import ScalingConfig + + +class TPUReservationCallback(ControllerCallback): + """A callback to handle TPU slice reservation for multi-host training.""" + + def on_controller_start_worker_group( + self, *, scaling_config: ScalingConfig, num_workers: int + ) -> Optional[Dict[str, str]]: + """Reserves a multi-host TPU slice before the worker group starts. + + This hook is called by the TrainController. It checks if multi-host + TPUs are being used and, if so, reserves a slice. + + Args: + scaling_config: The scaling configuration for the run. + num_workers: The number of workers to be started. + + Returns: + A dictionary defining a `bundle_label_selector` to gang schedule + the worker group on the reserved TPU slice. + """ + bundle_label_selector = None + + if scaling_config.use_tpu and num_workers > 1: + assert scaling_config.accelerator_type is not None + assert scaling_config.topology is not None + + slice_name = reserve_tpu_slice( + topology=scaling_config.topology, + accelerator_type=scaling_config.accelerator_type, + ) + if not slice_name: + raise RuntimeError("Failed to reserve TPU slice.") + + bundle_label_selector = { + ray._raylet.RAY_NODE_TPU_SLICE_NAME_KEY: slice_name + } + + return bundle_label_selector diff --git a/python/ray/train/v2/_internal/constants.py b/python/ray/train/v2/_internal/constants.py index 2eedeeff3593..c71e3e48b468 100644 --- a/python/ray/train/v2/_internal/constants.py +++ b/python/ray/train/v2/_internal/constants.py @@ -46,7 +46,7 @@ # Environment variable to enable the print function patching. 
ENABLE_PRINT_PATCH_ENV_VAR = "RAY_TRAIN_ENABLE_PRINT_PATCH" -DEFAULT_ENABLE_PRINT_PATCH = "1" +DEFAULT_ENABLE_PRINT_PATCH = True # V2 feature flag. V2_ENABLED_ENV_VAR = "RAY_TRAIN_V2_ENABLED" @@ -56,8 +56,8 @@ "RAY_TRAIN_ENABLE_CONTROLLER_STRUCTURED_LOGGING" ) ENABLE_WORKER_STRUCTURED_LOGGING_ENV_VAR = "RAY_TRAIN_ENABLE_WORKER_STRUCTURED_LOGGING" -DEFAULT_ENABLE_CONTROLLER_LOGGING = "1" -DEFAULT_ENABLE_WORKER_LOGGING = "1" +DEFAULT_ENABLE_CONTROLLER_LOGGING = True +DEFAULT_ENABLE_WORKER_LOGGING = True # Environment variables to configure reconciliation interval for Train state actor. # This determines how many seconds the state actor will wait between @@ -65,7 +65,7 @@ ENABLE_STATE_ACTOR_RECONCILIATION_ENV_VAR = ( "RAY_TRAIN_ENABLE_STATE_ACTOR_RECONCILIATION" ) -DEFAULT_ENABLE_STATE_ACTOR_RECONCILIATION = "1" +DEFAULT_ENABLE_STATE_ACTOR_RECONCILIATION = True STATE_ACTOR_RECONCILIATION_INTERVAL_S_ENV_VAR = ( "RAY_TRAIN_STATE_ACTOR_RECONCILIATION_INTERVAL_S" ) @@ -103,10 +103,6 @@ # The environment variable to enable the Ray Train Metrics. METRICS_ENABLED_ENV_VAR = "RAY_TRAIN_METRICS_ENABLED" -# Whether or not to run the controller as an actor. 
-RUN_CONTROLLER_AS_ACTOR_ENV_VAR = "RAY_TRAIN_RUN_CONTROLLER_AS_ACTOR" -DEFAULT_RUN_CONTROLLER_AS_ACTOR = "1" - def is_v2_enabled() -> bool: return env_bool(V2_ENABLED_ENV_VAR, False) diff --git a/src/ray/raylet/.gitkeep b/python/ray/train/v2/_internal/data_integration/__init__.py similarity index 100% rename from src/ray/raylet/.gitkeep rename to python/ray/train/v2/_internal/data_integration/__init__.py diff --git a/python/ray/train/v2/_internal/data_integration/interfaces.py b/python/ray/train/v2/_internal/data_integration/interfaces.py new file mode 100644 index 000000000000..73b37854fee6 --- /dev/null +++ b/python/ray/train/v2/_internal/data_integration/interfaces.py @@ -0,0 +1,29 @@ +from dataclasses import dataclass +from typing import Callable, Protocol, Union + +from ray.data import DataIterator, Dataset + +# A type representing either a ray.data.Dataset or a function that returns a +# ray.data.Dataset and accepts no arguments. +GenDataset = Union[Dataset, Callable[[], Dataset]] + + +@dataclass +class DatasetShardMetadata: + """Metadata about a dataset shard used for lookup and configuration.""" + + dataset_name: str + + +class DatasetShardProvider(Protocol): + def get_dataset_shard(self, dataset_info: DatasetShardMetadata) -> DataIterator: + """Get the dataset shard for the given dataset info. + Args: + dataset_info: The metadata of the shard to retrieve, + including the dataset name. + Returns: + The :class:`~ray.data.DataIterator` shard for the given dataset info. + Raises: + KeyError: If the dataset shard for the given dataset info is not found. + """ + ... 
diff --git a/python/ray/train/v2/_internal/execution/callback.py b/python/ray/train/v2/_internal/execution/callback.py index 50796a0700c8..f5cfd3584f79 100644 --- a/python/ray/train/v2/_internal/execution/callback.py +++ b/python/ray/train/v2/_internal/execution/callback.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional from ray.train.v2.api.callback import RayTrainCallback +from ray.train.v2.api.config import ScalingConfig from ray.train.v2.api.result import Result from ray.util.annotations import DeveloperAPI @@ -78,6 +79,28 @@ def after_controller_start(self, train_run_context: "TrainRunContext"): before the control loop starts executing.""" pass + # TODO(matthewdeng): Revisit this callback interface for better extensibility. + # This hook was added for the specific use case of setting a `bundle_label_selector` + # for new worker groups (e.g., for TPU reservations). The current interface is + # tightly coupled to this purpose and limits its reuse for other use-cases. + def on_controller_start_worker_group( + self, *, scaling_config: ScalingConfig, num_workers: int + ) -> Optional[Dict[str, str]]: + """Called by the TrainController before the worker group is started. + + This hook can be used to perform setup that modifies the worker group's + placement, such as reserving an accelerator slice. + + Args: + scaling_config: The scaling configuration for the run. + num_workers: The number of workers to be started. + + Returns: + An optional dictionary defining a `bundle_label_selector` + to gang schedule the worker group on the reserved TPU slice. 
+ """ + return None + def before_controller_shutdown(self): """Called before `TrainController.run` exits, after the control loop has exited.""" diff --git a/python/ray/train/v2/_internal/execution/checkpoint/checkpoint_manager.py b/python/ray/train/v2/_internal/execution/checkpoint/checkpoint_manager.py index a241ce43619d..a4a086ca7f84 100644 --- a/python/ray/train/v2/_internal/execution/checkpoint/checkpoint_manager.py +++ b/python/ray/train/v2/_internal/execution/checkpoint/checkpoint_manager.py @@ -1,3 +1,4 @@ +import asyncio import logging from typing import Any, Dict, List, Optional @@ -14,8 +15,9 @@ WorkerGroupCallback, ) from ray.train.v2._internal.execution.context import StorageContext -from ray.train.v2._internal.execution.storage import _delete_fs_path, _exists_at_fs_path +from ray.train.v2._internal.execution.storage import _exists_at_fs_path, delete_fs_path from ray.train.v2._internal.execution.worker_group import Worker +from ray.train.v2.api.reported_checkpoint import ReportedCheckpoint try: from pydantic import BaseModel @@ -81,6 +83,12 @@ def __init__( ): self._storage_context = storage_context self._checkpoint_config = checkpoint_config + + # This tracks the number of report calls that have been processed + # for the current worker group. + self._current_report_index = 0 + + self._condition = asyncio.Condition() super().__init__(checkpoint_config) # If the snapshot is found, the checkpoint manager will restore its state. 
self._maybe_load_state_from_storage() @@ -137,7 +145,15 @@ def register_checkpoint(self, checkpoint_result: _TrainingResult): for checkpoint_result in results_to_delete: checkpoint = checkpoint_result.checkpoint logger.debug("Deleting checkpoint: ", checkpoint) - _delete_fs_path(fs=checkpoint.filesystem, fs_path=checkpoint.path) + delete_fs_path(fs=checkpoint.filesystem, fs_path=checkpoint.path) + + self._current_report_index += 1 + + async def async_notify(): + async with self._condition: + self._condition.notify_all() + + asyncio.create_task(async_notify()) # -------------------------- # CheckpointManager state @@ -267,6 +283,7 @@ def after_report( self, metrics: List[Dict[str, Any]], checkpoint: Optional[Checkpoint] ): if not checkpoint: + self._current_report_index += 1 return rank_0_metrics = metrics[0] @@ -279,9 +296,31 @@ def after_report( # -------------------------- def before_init_train_context(self, workers: List[Worker]) -> Dict[str, List[Any]]: + self._current_report_index = 0 latest_checkpoint = ( self.latest_checkpoint_result.checkpoint if self.latest_checkpoint_result else None ) - return {"checkpoint": [latest_checkpoint] * len(workers)} + train_context_args = { + "checkpoint": [latest_checkpoint] * len(workers), + } + return train_context_args + + async def get_all_reported_checkpoints( + self, current_report_index: int + ) -> List[ReportedCheckpoint]: + """Once expected_num_checkpoints are reported, return the ReportedCheckpoints.""" + async with self._condition: + await self._condition.wait_for( + lambda: self._current_report_index == current_report_index + ) + # TODO: might be nice for CheckpointManager to manage ReportedCheckpoint + # instead of _TrainingResult but that is a large refactor. 
+ return [ + ReportedCheckpoint( + checkpoint=tr.checkpoint, + metrics=tr.metrics, + ) + for tr in self._checkpoint_results + ] diff --git a/python/ray/train/v2/_internal/execution/collective_impl.py b/python/ray/train/v2/_internal/execution/collective_impl.py new file mode 100644 index 000000000000..0d91567046aa --- /dev/null +++ b/python/ray/train/v2/_internal/execution/collective_impl.py @@ -0,0 +1,56 @@ +import logging +from typing import Any + +import ray +import ray.cloudpickle as pickle +from ray.train.v2._internal.execution.context import get_train_context + +# For reference, {1:1} is 19 bytes, {"1":"1"} is 21 bytes, +# and {"12345": "12345"} is 25 bytes. +_MAX_BROADCAST_SIZE_BYTES = 1000 + + +logger = logging.getLogger(__name__) + + +def barrier() -> None: + """ + Create a barrier across all training workers. + """ + train_context = get_train_context() + sync_actor = train_context.get_synchronization_actor() + return ray.get( + sync_actor.broadcast_from_rank_zero.remote( + world_rank=train_context.get_world_rank(), + world_size=train_context.get_world_size(), + data=None, + caller_method_name="ray.train.collective.barrier", + ) + ) + + +def broadcast_from_rank_zero(data: Any) -> Any: + """Broadcast data from the rank 0 worker to all other workers. + + This method is used by the public API function :func:`ray.train.collective.broadcast_from_rank_zero`. + Users should typically call ``ray.train.collective.broadcast_from_rank_zero()`` instead of calling this method directly. + """ + # Validate data. 
+ if data is not None: + data_bytes = len(pickle.dumps(data)) + if data_bytes > _MAX_BROADCAST_SIZE_BYTES: + logger.warning( + f"Data size {data_bytes} bytes exceeds the maximum broadcast " + f"size of {_MAX_BROADCAST_SIZE_BYTES} bytes" + ) + + train_context = get_train_context() + sync_actor = train_context.get_synchronization_actor() + return ray.get( + sync_actor.broadcast_from_rank_zero.remote( + world_rank=train_context.get_world_rank(), + world_size=train_context.get_world_size(), + data=data, + caller_method_name="ray.train.collective.broadcast_from_rank_zero", + ) + ) diff --git a/python/ray/train/v2/_internal/execution/context.py b/python/ray/train/v2/_internal/execution/context.py index cf76ec4f8484..b5965d97219e 100644 --- a/python/ray/train/v2/_internal/execution/context.py +++ b/python/ray/train/v2/_internal/execution/context.py @@ -2,28 +2,44 @@ import sys import threading import uuid +from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass, field from queue import Queue from typing import TYPE_CHECKING, Any, Dict, List, Optional import ray +from ray.actor import ActorHandle from ray.data import DataIterator, Dataset -from ray.train import BackendConfig, Checkpoint, DataConfig from ray.train._internal import session from ray.train._internal.session import _TrainingResult from ray.train.v2._internal.execution.checkpoint.sync_actor import SynchronizationActor -from ray.train.v2._internal.execution.storage import StorageContext -from ray.train.v2._internal.util import _copy_doc, invoke_context_managers +from ray.train.v2._internal.execution.storage import StorageContext, delete_fs_path +from ray.train.v2._internal.util import ( + _copy_doc, + construct_user_exception_with_traceback, + invoke_context_managers, +) from ray.train.v2.api.config import RunConfig, ScalingConfig +from ray.train.v2.api.report_config import CheckpointUploadMode if TYPE_CHECKING: + from ray.train import BackendConfig, Checkpoint, DataConfig + from 
ray.train.v2._internal.data_integration.interfaces import ( + DatasetShardMetadata, + DatasetShardProvider, + ) from ray.train.v2._internal.execution.callback import TrainContextCallback from ray.train.v2._internal.execution.worker_group.thread_runner import ThreadRunner + from ray.train.v2.api.reported_checkpoint import ReportedCheckpoint logger = logging.getLogger(__file__) +# TODO: make this value manually or automatically configurable. +MAX_CHECKPOINT_UPLOAD_THREADS = 1 + + @dataclass(frozen=True) class TrainRunContext: """Holds the metadata and context for the current training run.""" @@ -41,13 +57,13 @@ class TrainRunContext: scaling_config: ScalingConfig # The configuration for the training backend (e.g., PyTorch, XGBoost). - backend_config: BackendConfig + backend_config: "BackendConfig" # The datasets used in the current training run. datasets: Dict[str, Dataset] # The configuration for dataset ingestion and sharding. - dataset_config: DataConfig + dataset_config: "DataConfig" def get_run_config(self) -> RunConfig: """Returns the run config of the current training run.""" @@ -92,8 +108,18 @@ class TrainContext: distributed_context: DistributedContext execution_context: ExecutionContext storage_context: StorageContext - dataset_shards: Dict[str, DataIterator] - checkpoint: Optional[Checkpoint] = None + controller_actor: ActorHandle + + dataset_shard_provider: "DatasetShardProvider" + + # TODO: consolidate into CheckpointContext + checkpoint: Optional["Checkpoint"] = None + current_report_index: int = 0 + report_call_index: int = 0 + report_order_condition: threading.Condition = threading.Condition() + checkpoint_upload_threadpool: ThreadPoolExecutor = ThreadPoolExecutor( + max_workers=MAX_CHECKPOINT_UPLOAD_THREADS + ) @_copy_doc(session.get_experiment_name) def get_experiment_name(self) -> str: @@ -131,9 +157,17 @@ def get_synchronization_actor(self): return self.execution_context.synchronization_actor def get_checkpoint(self): - return self.checkpoint + 
with self.report_order_condition: + return self.checkpoint - def get_dataset_shard(self, dataset_name: str) -> DataIterator: + def get_all_reported_checkpoints(self) -> List["ReportedCheckpoint"]: + return ray.get( + self.controller_actor.get_all_reported_checkpoints.remote( + self.current_report_index + ) + ) + + def get_dataset_shard(self, dataset_info: "DatasetShardMetadata") -> DataIterator: """Returns the :class:`ray.data.DataIterator` shard for this worker. Call :meth:`~ray.data.DataIterator.iter_torch_batches` or @@ -141,19 +175,13 @@ def get_dataset_shard(self, dataset_name: str) -> DataIterator: appropriate framework-specific data type. Args: - dataset_name: Name of the dataset shard. + dataset_info: The shard metadata, including the dataset name. Returns: The ``DataIterator`` shard with the given name for this worker. Raises: KeyError: If the dataset shard with the given name is not found. """ - try: - return self.dataset_shards[dataset_name] - except KeyError: - raise KeyError( - f"Dataset {dataset_name} not found. Available datasets: " - f"{list(self.dataset_shards.keys())}." - ) + return self.dataset_shard_provider.get_dataset_shard(dataset_info) def get_context_callbacks(self) -> List["TrainContextCallback"]: return self.execution_context.train_context_callbacks @@ -187,14 +215,21 @@ def _sync_checkpoint_dir_name_across_ranks( ) ) - def _save_checkpoint( + def _upload_checkpoint( self, checkpoint_dir_name: str, metrics: Dict[str, Any], - checkpoint: Optional[Checkpoint] = None, + checkpoint: Optional["Checkpoint"] = None, + delete_local_checkpoint_after_upload: bool = False, ) -> _TrainingResult: """Save the checkpoint to remote storage. + Args: + checkpoint_dir_name: The checkpoint dir to persist to. + metrics: The metrics to report. + checkpoint: The checkpoint to report. + delete_local_checkpoint_after_upload: Whether to delete the checkpoint after it is uploaded.
+ Returns: The training result object containing the persisted checkpoint. """ @@ -203,32 +238,68 @@ def _save_checkpoint( return _TrainingResult(checkpoint=None, metrics=metrics) # Persist the checkpoint to the remote storage path. - persisted_checkpoint = self.storage_context.persist_current_checkpoint( - checkpoint, checkpoint_dir_name - ) - # Update latest checkpoint as the persisted checkpoint. - self.checkpoint = persisted_checkpoint + try: + persisted_checkpoint = self.storage_context.persist_current_checkpoint( + checkpoint, checkpoint_dir_name + ) + except FileNotFoundError: + logger.exception( + f"Failed to find local checkpoint {checkpoint} when attempting to upload it. " + "This could be caused by multiple workers on a node attempting to upload the " + "same directory, and then one of the workers deletes the directory before the " + "others finish." + ) + raise + # TODO: consider deleting local checkpoint as async callback instead + if delete_local_checkpoint_after_upload: + try: + delete_fs_path(checkpoint.filesystem, checkpoint.path) + except Exception: + logger.exception( + f"Failed to delete the local checkpoint after a successful upload: {checkpoint}" + ) return _TrainingResult(checkpoint=persisted_checkpoint, metrics=metrics) + def _wait_then_report( + self, training_result: _TrainingResult, report_call_index: int + ) -> None: + """Thread waits for its turn before reporting training result to result queue. + + It does this in order to guarantee the FIFO processing of checkpoints. + + The queue size is set to 1 to avoid accumulating unprocessed results. + If the queue is full, the put operation blocks until a result is consumed. + + TODO: Add a metric to track the blocking time waiting for the + training result to be consumed by the controller. 
+ """ + with self.report_order_condition: + self.report_order_condition.wait_for( + lambda: self.current_report_index == report_call_index - 1 + ) + logger.info( + f"Reporting training result {report_call_index}: {training_result}" + ) + # Update latest checkpoint as the persisted checkpoint. + if training_result.checkpoint: + self.checkpoint = training_result.checkpoint + self.get_result_queue().put(training_result) + self.current_report_index += 1 + self.report_order_condition.notify_all() + def report( self, metrics: Dict[str, Any], - checkpoint: Optional[Checkpoint] = None, + checkpoint: Optional["Checkpoint"] = None, checkpoint_dir_name: Optional[str] = None, - ): + checkpoint_upload_mode: CheckpointUploadMode = CheckpointUploadMode.SYNC, + delete_local_checkpoint_after_upload: Optional[bool] = None, + ) -> None: """ Upload checkpoint to remote storage and put a training result on the result queue of this worker process. - Args: - metrics: The metrics to report. - checkpoint: The checkpoint to report. - checkpoint_dir_name: The name of the checkpoint dir - in this iteration. Note: If not set, the checkpoint will - be stored in the default storage path. If set, make sure - this value is unique for each iteration. - TODO: the report function should be implemented in the worker instead of in the train context. The train context should only keep the train related information and not the worker related actions. This refactor @@ -252,21 +323,65 @@ def report( for callback in self.execution_context.train_context_callbacks ] ): - # Step 1: sync the checkpoint dir name across ranks. + self.report_call_index += 1 + report_call_index = self.report_call_index + + # Sync the checkpoint dir name across ranks. checkpoint_dir_name = self._sync_checkpoint_dir_name_across_ranks( checkpoint_dir_name ) - # Step 2: save the checkpoint to remote storage. 
- training_result = self._save_checkpoint( - checkpoint_dir_name, metrics, checkpoint - ) - # Step 3: Report the training result to the result queue. - # The queue size is set to 1 to avoid accumulating unprocessed results. - # If the queue is full, the put operation blocks until a result is consumed. - # TODO (hpguo): Add a metrics to track the blocking time waiting for the - # training result to be consumed by the controller. - self.get_result_queue().put(training_result) + # Upload checkpoint, wait for turn, and report. + if checkpoint_upload_mode == CheckpointUploadMode.SYNC: + training_result = self._upload_checkpoint( + checkpoint_dir_name, + metrics, + checkpoint, + delete_local_checkpoint_after_upload, + ) + self._wait_then_report(training_result, report_call_index) + + elif checkpoint_upload_mode == CheckpointUploadMode.NO_UPLOAD: + training_result = _TrainingResult( + checkpoint=checkpoint, metrics=metrics + ) + self._wait_then_report(training_result, report_call_index) + + elif checkpoint_upload_mode == CheckpointUploadMode.ASYNC: + + def _upload_checkpoint_and_report( + checkpoint_dir_name: str, + metrics: Dict[str, Any], + checkpoint: Optional["Checkpoint"], + report_call_index: int, + ) -> None: + try: + training_result = self._upload_checkpoint( + checkpoint_dir_name, + metrics, + checkpoint, + delete_local_checkpoint_after_upload, + ) + self._wait_then_report(training_result, report_call_index) + except Exception as e: + logger.exception( + "Async checkpoint upload failed - shutting down workers" + ) + self.execution_context.training_thread_runner.get_exception_queue().put( + construct_user_exception_with_traceback(e) + ) + + self.checkpoint_upload_threadpool.submit( + _upload_checkpoint_and_report, + checkpoint_dir_name, + metrics, + checkpoint, + report_call_index, + ) + else: + raise ValueError( + f"Invalid checkpoint upload mode: {checkpoint_upload_mode}" + ) # The global variable holding the current TrainContext @@ -277,6 +392,16 @@ def report( 
def get_train_context() -> TrainContext: + """Get the internal train context. + + Note: + This should not be used directly by user-facing APIs. User-facing APIs should + call :class:`~ray.train.v2._internal.execution.train_fn_utils.TrainFnUtils` + or use :class:`~ray.train.v2.api.context.TrainContext` instead. + + Returns: + The internal TrainContext for this worker. + """ with _context_lock: if _train_context is None: raise RuntimeError("TrainContext has not been initialized.") diff --git a/python/ray/train/v2/_internal/execution/controller/controller.py b/python/ray/train/v2/_internal/execution/controller/controller.py index e2f916d140a4..3d84796c4340 100644 --- a/python/ray/train/v2/_internal/execution/controller/controller.py +++ b/python/ray/train/v2/_internal/execution/controller/controller.py @@ -3,7 +3,7 @@ import os import uuid from dataclasses import dataclass -from typing import Callable, List, Optional, Union +from typing import TYPE_CHECKING, Callable, List, Optional, Union import pandas as pd @@ -50,7 +50,6 @@ ResizeDecision, ScalingPolicy, ) -from ray.train.v2._internal.execution.storage import StorageContext from ray.train.v2._internal.execution.worker_group import ( WorkerGroup, WorkerGroupPollStatus, @@ -67,6 +66,10 @@ ) from ray.train.v2.api.result import Result +if TYPE_CHECKING: + from ray.train.v2.api.reported_checkpoint import ReportedCheckpoint + + logger = logging.getLogger(__name__) @@ -122,11 +125,7 @@ def __init__( self._failure_policy = failure_policy self._run_config = self._train_run_context.run_config self._callbacks = callbacks or [] - self._storage_context = StorageContext( - storage_path=self._run_config.storage_path, - experiment_dir_name=self._run_config.name, - storage_filesystem=self._run_config.storage_filesystem, - ) + self._storage_context = self._train_run_context.run_config.storage_context self._checkpoint_manager = CheckpointManager( checkpoint_config=self._run_config.checkpoint_config, @@ -275,17 +274,37 @@ def 
_start_worker_group( ) -> Optional[ControllerError]: """Start the worker group and launch the train function. + Args: + num_workers: The number of workers to start. + resources_per_worker: The resources per worker to start. + Returns: None if the worker group was successfully started, ControllerError if the worker group failed to start. """ placement_strategy = self._scaling_policy.scaling_config.placement_strategy + scaling_config = self._train_run_context.scaling_config + + # Check for `bundle_label_selector` to influence WorkerGroup scheduling. + bundle_label_selector = None + try: + for callback in self._controller_callbacks: + selector = callback.on_controller_start_worker_group( + scaling_config=scaling_config, num_workers=num_workers + ) + if selector: + bundle_label_selector = selector + break + except Exception as e: + return ControllerError(e) + worker_group_context = WorkerGroupContext( run_attempt_id=self._get_run_attempt_id(), train_fn_ref=self._train_fn_ref, num_workers=num_workers, resources_per_worker=resources_per_worker, placement_strategy=placement_strategy, + bundle_label_selector=bundle_label_selector, ) try: self._worker_group = self.worker_group_cls.create( @@ -521,7 +540,6 @@ def get_result(self) -> Result: raise ValueError( f"Cannot get result when controller is in state {controller_state}" ) - return self._build_result() def get_training_failed_error(self) -> Optional[TrainingFailedError]: @@ -537,3 +555,10 @@ def get_training_failed_error(self) -> Optional[TrainingFailedError]: return controller_state.training_failed_error return None + + async def get_all_reported_checkpoints( + self, current_report_index: int + ) -> List["ReportedCheckpoint"]: + return await self._checkpoint_manager.get_all_reported_checkpoints( + current_report_index + ) diff --git a/python/ray/train/v2/_internal/execution/local_mode_utils.py b/python/ray/train/v2/_internal/execution/local_mode_utils.py new file mode 100644 index 000000000000..06a1d12627ac --- 
/dev/null +++ b/python/ray/train/v2/_internal/execution/local_mode_utils.py @@ -0,0 +1,40 @@ +import logging +from typing import Callable, Dict, Optional + +from ray.train import Result +from ray.train.trainer import GenDataset +from ray.train.v2._internal.execution.train_fn_utils import ( + LocalTrainFnUtils, + get_train_fn_utils, + set_train_fn_utils, +) + +logger = logging.getLogger(__name__) + + +class LocalController: + def __init__( + self, experiment_name: str, datasets: Optional[Dict[str, GenDataset]] = None + ): + if datasets is not None: + datasets = {k: v() if callable(v) else v for k, v in datasets.items()} + + self.datasets = datasets + self.experiment_name = experiment_name + + def run(self, train_func: Callable[[], None]) -> Result: + set_train_fn_utils( + LocalTrainFnUtils( + experiment_name=self.experiment_name, + dataset_shards=self.datasets, + ) + ) + train_func() + train_fn_utils = get_train_fn_utils() + assert isinstance(train_fn_utils, LocalTrainFnUtils) + return Result( + metrics=train_fn_utils._get_last_metrics(), + checkpoint=train_fn_utils.get_checkpoint(), + path=None, + error=None, + ) diff --git a/python/ray/train/v2/_internal/execution/storage.py b/python/ray/train/v2/_internal/execution/storage.py index 4ffc740c50af..abf80697da36 100644 --- a/python/ray/train/v2/_internal/execution/storage.py +++ b/python/ray/train/v2/_internal/execution/storage.py @@ -120,7 +120,8 @@ def _pyarrow_fs_copy_files( # TODO(justinvyu): Add unit tests for all these utils. 
-def _delete_fs_path(fs: pyarrow.fs.FileSystem, fs_path: str): +def delete_fs_path(fs: pyarrow.fs.FileSystem, fs_path: str): + """Deletes (fs, fs_path) or raises FileNotFoundError if it doesn't exist.""" is_dir = _is_directory(fs, fs_path) try: diff --git a/python/ray/train/v2/_internal/execution/train_fn_utils.py b/python/ray/train/v2/_internal/execution/train_fn_utils.py new file mode 100644 index 000000000000..932bc69dd973 --- /dev/null +++ b/python/ray/train/v2/_internal/execution/train_fn_utils.py @@ -0,0 +1,236 @@ +import logging +import threading +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from ray.data import DataIterator +from ray.train.v2._internal.data_integration.interfaces import DatasetShardMetadata +from ray.train.v2._internal.execution import collective_impl +from ray.train.v2._internal.execution.context import ( + get_train_context as get_internal_train_context, +) +from ray.train.v2.api.context import ( + DistributedTrainContext, + LocalTrainContext, + TrainContext as ExternalTrainContext, +) +from ray.train.v2.api.report_config import CheckpointUploadMode + +logger = logging.getLogger(__name__) + +if TYPE_CHECKING: + from ray.train import Checkpoint + from ray.train.v2.api.reported_checkpoint import ReportedCheckpoint + + +class TrainFnUtils(ABC): + """Utility class providing an abstraction layer between user-facing APIs + and :class:`~ray.train.v2.api.context.TrainContext`. + + It should be set before the users' training function is called. + This class can be patched if new user APIs behaviors is wanted. 
+ """ + + @abstractmethod + def report( + self, + metrics: Dict[str, Any], + checkpoint: Optional["Checkpoint"] = None, + checkpoint_dir_name: Optional[str] = None, + checkpoint_upload_mode: CheckpointUploadMode = CheckpointUploadMode.SYNC, + delete_local_checkpoint_after_upload: Optional[bool] = None, + ) -> None: + """Upload checkpoint to remote storage and put a training result on the result queue. + + Args: + metrics: The metrics to report. + checkpoint: The checkpoint to report. + checkpoint_dir_name: The name of the checkpoint dir + in this iteration. Note: If not set, the checkpoint will + be stored in the default storage path. If set, make sure + this value is unique for each iteration. + checkpoint_upload_mode: The manner in which we want to upload the checkpoint. + Defaults to uploading the checkpoint synchronously. + This works when no checkpoint is provided but is not useful in that case. + delete_local_checkpoint_after_upload: Whether to delete the checkpoint after it is uploaded. + """ + pass + + @abstractmethod + def get_checkpoint(self) -> Optional["Checkpoint"]: + """Get the latest checkpoint to resume training from. + + Returns: + The latest checkpoint if available, None otherwise. + """ + pass + + @abstractmethod + def get_all_reported_checkpoints(self) -> List["ReportedCheckpoint"]: + """Get all the checkpoints reported by the workers. + + Returns: + A list of ReportedCheckpoint objects that represent the checkpoints and + corresponding metrics reported by the workers. + """ + pass + + @abstractmethod + def get_dataset_shard(self, dataset_info: DatasetShardMetadata) -> DataIterator: + """Get the dataset shard for this training process. + + Args: + dataset_info: The metadata of the dataset to get the shard for. + + Returns: + The DataIterator shard for this worker. + """ + pass + + @abstractmethod + def get_context(self) -> ExternalTrainContext: + """Get the TrainContext for this training process. 
+ The specific type of TrainContext returned depends on the implementation of TrainFnUtils. + + Returns: + The train context for this training process. + """ + pass + + @abstractmethod + def is_distributed(self) -> bool: + pass + + @abstractmethod + def barrier(self) -> None: + """Create a barrier across all workers. + + All workers must call this method before the training function can continue. + + This method is used by the public API function :func:`ray.train.collective.barrier`. + Users should typically call ``ray.train.collective.barrier()`` instead of calling this method directly. + """ + pass + + @abstractmethod + def broadcast_from_rank_zero(self, data: Any) -> Any: + """Broadcast data from the rank 0 worker to all other workers. + + This method is used by the public API function :func:`ray.train.collective.broadcast_from_rank_zero`. + Users should typically call ``ray.train.collective.broadcast_from_rank_zero()`` instead of calling this method directly. + """ + pass + + +class DistributedTrainFnUtils(TrainFnUtils): + def report( + self, + metrics: Dict[str, Any], + checkpoint: Optional["Checkpoint"] = None, + checkpoint_dir_name: Optional[str] = None, + checkpoint_upload_mode: CheckpointUploadMode = CheckpointUploadMode.SYNC, + delete_local_checkpoint_after_upload: Optional[bool] = None, + ) -> None: + return get_internal_train_context().report( + metrics, + checkpoint, + checkpoint_dir_name, + checkpoint_upload_mode, + delete_local_checkpoint_after_upload, + ) + + def get_checkpoint(self): + return get_internal_train_context().get_checkpoint() + + def get_dataset_shard(self, dataset_info: DatasetShardMetadata) -> DataIterator: + return get_internal_train_context().get_dataset_shard(dataset_info) + + def get_context(self) -> DistributedTrainContext: + return DistributedTrainContext() + + def is_distributed(self) -> bool: + return True + + def barrier(self) -> None: + return collective_impl.barrier() + + def broadcast_from_rank_zero(self, data: Any) -> 
Any: + return collective_impl.broadcast_from_rank_zero(data) + + def get_all_reported_checkpoints(self) -> List["ReportedCheckpoint"]: + return get_internal_train_context().get_all_reported_checkpoints() + + +class LocalTrainFnUtils(TrainFnUtils): + def __init__( + self, + experiment_name: str, + dataset_shards: Optional[Dict[str, DataIterator]] = None, + ): + self._context = LocalTrainContext( + experiment_name=experiment_name, + ) + self._dataset_shards = dataset_shards + self._last_metrics = None + self._last_checkpoint = None + + def report( + self, + metrics: Dict[str, Any], + checkpoint: Optional["Checkpoint"] = None, + checkpoint_dir_name: Optional[str] = None, + checkpoint_upload_mode: CheckpointUploadMode = CheckpointUploadMode.SYNC, + delete_local_checkpoint_after_upload: Optional[bool] = None, + ) -> None: + self._last_metrics = metrics + self._last_checkpoint = checkpoint + logger.info(f"Reported metrics: {metrics}") + + def get_checkpoint(self) -> Optional["Checkpoint"]: + return self._last_checkpoint + + def get_dataset_shard(self, dataset_info: DatasetShardMetadata) -> DataIterator: + dataset_name = dataset_info.dataset_name + assert ( + self._dataset_shards is not None and dataset_name in self._dataset_shards + ), f"Dataset shard {dataset_name} not found." + return self._dataset_shards[dataset_name] + + def get_context(self) -> LocalTrainContext: + return self._context + + def is_distributed(self) -> bool: + return False + + def barrier(self) -> None: + pass + + def broadcast_from_rank_zero(self, data: Any) -> Any: + return data + + def _get_last_metrics(self) -> Optional[Dict[str, Any]]: + """Return the last metrics reported by the training function. 
+ This function should only be called by LocalController + """ + return self._last_metrics + + def get_all_reported_checkpoints(self) -> List["ReportedCheckpoint"]: + return [] + + +_train_fn_utils: Optional[TrainFnUtils] = None +_train_fn_utils_lock = threading.Lock() + + +def get_train_fn_utils() -> TrainFnUtils: + global _train_fn_utils + with _train_fn_utils_lock: + if _train_fn_utils is None: + raise RuntimeError("TrainFnUtils has not been initialized.") + return _train_fn_utils + + +def set_train_fn_utils(train_fn_utils) -> None: + global _train_fn_utils + with _train_fn_utils_lock: + _train_fn_utils = train_fn_utils diff --git a/python/ray/train/v2/_internal/execution/worker_group/poll.py b/python/ray/train/v2/_internal/execution/worker_group/poll.py index e7f47d68da46..dffe2eb5d892 100644 --- a/python/ray/train/v2/_internal/execution/worker_group/poll.py +++ b/python/ray/train/v2/_internal/execution/worker_group/poll.py @@ -1,10 +1,36 @@ +import re +from collections import defaultdict from dataclasses import dataclass from typing import Dict, Optional +from ray._private.ray_logging import NUMBERS from ray.train._internal.session import _TrainingResult +from ray.train.v2._internal.exceptions import WorkerHealthCheckFailedError from ray.train.v2.api.exceptions import WorkerGroupError from ray.types import ObjectRef +ERR_CHAR_LIMIT = 1000 + + +def _normalize_error_string(error_str: str) -> str: + # Replace numbers with based on NUMBERS regex + normalized = re.sub(NUMBERS, "", error_str) + return normalized + + +def _truncate_error_string(error_str: str) -> str: + """ + Truncates error strings to include the first ERR_CHAR_LIMIT // 2 + characters and the last ERR_CHAR_LIMIT // 2 characters. + """ + if len(error_str) >= ERR_CHAR_LIMIT: + return ( + error_str[: ERR_CHAR_LIMIT // 2] + + "...\n... (Output truncated. 
See individual worker logs for full details) ...\n" + + error_str[len(error_str) - ERR_CHAR_LIMIT // 2 :] + ) + return error_str + @dataclass class WorkerStatus: @@ -38,9 +64,51 @@ def finished(self) -> bool: ) def get_error_string(self) -> str: - return "\n".join( - f"[Rank {world_rank}]\n{error}" for world_rank, error in self.errors.items() - ) + """ + Returns a string representation of worker group errors. + Groups similar errors (ignoring numbers) and shows original error examples. + """ + # Group errors by normalized strings (ignoring numbers) + normalized_error_to_ranks = defaultdict(list) + normalized_error_to_original = {} + show_full_error = set() + + for world_rank, status in self.worker_statuses.items(): + if status.error: + error_str = str(status.error) + normalized_error = _normalize_error_string(error_str) + + normalized_error_to_ranks[normalized_error].append(str(world_rank)) + + # Store the first original error for this normalized group + if normalized_error not in normalized_error_to_original: + normalized_error_to_original[normalized_error] = error_str + + # Fully show errors for non-graceful worker failures or running workers + if ( + isinstance(status.error, WorkerHealthCheckFailedError) + or status.running + ): + show_full_error.add(normalized_error) + + errors = [] + for normalized_error, ranks in normalized_error_to_ranks.items(): + # Show the original error + orig_error = normalized_error_to_original[normalized_error] + + # Convert rank list to comma-separated strings + ranks_str = ",".join(ranks) + + if normalized_error in show_full_error: + errors.append(f"[Rank {ranks_str} Error Snippet]:\n{orig_error}") + else: + errors.append( + f"[Rank {ranks_str} Error Snippet]:\n{_truncate_error_string(orig_error)}" + ) + + error_str = "\n".join(errors) + + return error_str @dataclass(frozen=True) diff --git a/python/ray/train/v2/_internal/execution/worker_group/thread_runner.py b/python/ray/train/v2/_internal/execution/worker_group/thread_runner.py 
index ef19e66583d4..df0e4cb28e65 100644 --- a/python/ray/train/v2/_internal/execution/worker_group/thread_runner.py +++ b/python/ray/train/v2/_internal/execution/worker_group/thread_runner.py @@ -1,10 +1,13 @@ import logging +import queue import threading -import traceback from typing import Callable, Optional, TypeVar from ray.train.v2._internal.exceptions import UserExceptionWithTraceback -from ray.train.v2._internal.util import get_callable_name +from ray.train.v2._internal.util import ( + construct_user_exception_with_traceback, + get_callable_name, +) T = TypeVar("T") @@ -21,7 +24,9 @@ def __init__(self): self._exc: Optional[UserExceptionWithTraceback] = None self._thread: Optional[threading.Thread] = None + self._monitor_thread: Optional[threading.Thread] = None self._lock = threading.Lock() + self._exc_queue: queue.SimpleQueue[Optional[Exception]] = queue.SimpleQueue() self._is_running = False @@ -37,19 +42,14 @@ def _run_target(): result = target() with self._lock: self._ret = result + self._exc_queue.put(None) except BaseException as e: - with self._lock: - # Exclude the first 2 frames from the traceback, which are - # the `ThreadRunner._run_target` and `construct_train_func` calls. - # TODO(justinvyu): This is brittle and may break if the call stack - # changes. Figure out a more robust way to exclude these frames. - exc_traceback_str = traceback.format_exc( - limit=-(len(traceback.extract_tb(e.__traceback__)) - 2) - ) - logger.error(f"Error in training function:\n{exc_traceback_str}") - self._exc = UserExceptionWithTraceback( - e, traceback_str=exc_traceback_str - ) + # Exclude the first 3 frames from the traceback, which are + # the `ThreadRunner._run_target`, `construct_train_func`, and + # train_fn_with_final_checkpoint_flush calls. 
+ self._exc_queue.put( + construct_user_exception_with_traceback(e, exclude_frames=3) + ) with self._lock: self._is_running = False @@ -61,7 +61,20 @@ def _run_target(): ) self._thread.start() + def _monitor_target(): + exc = self._exc_queue.get() + with self._lock: + self._exc = exc + + self._monitor_thread = threading.Thread( + target=_monitor_target, + daemon=True, + name=f"MonitoringThread({get_callable_name(target)})", + ) + self._monitor_thread.start() + def is_running(self) -> bool: + """Returns whether the target function is still running.""" with self._lock: return self._is_running @@ -73,10 +86,6 @@ def get_return_value(self) -> Optional[T]: with self._lock: return self._ret - def join(self, timeout: Optional[float] = None) -> T: - if self._thread is None: - raise RuntimeError("Must call `run` before trying to `join`.") - - self._thread.join(timeout=timeout) - - return self.get_return_value() + def get_exception_queue(self) -> queue.SimpleQueue: + """Returns a queue that nested threads can add exceptions to.""" + return self._exc_queue diff --git a/python/ray/train/v2/_internal/execution/worker_group/worker.py b/python/ray/train/v2/_internal/execution/worker_group/worker.py index 1a3ea2e7a554..c890ad5c8a88 100644 --- a/python/ray/train/v2/_internal/execution/worker_group/worker.py +++ b/python/ray/train/v2/_internal/execution/worker_group/worker.py @@ -4,13 +4,12 @@ import socket from dataclasses import dataclass from functools import cached_property -from typing import Callable, Dict, List, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, TypeVar, Union import ray import ray._private.ray_constants as ray_constants from .thread_runner import ThreadRunner from ray.actor import ActorHandle -from ray.data.iterator import DataIterator from ray.train import Checkpoint from ray.train.v2._internal.constants import ( DEFAULT_ENABLE_WORKER_LOGGING, @@ -30,12 +29,19 @@ set_train_context, ) from 
ray.train.v2._internal.execution.storage import StorageContext +from ray.train.v2._internal.execution.train_fn_utils import ( + DistributedTrainFnUtils, + set_train_fn_utils, +) from ray.train.v2._internal.execution.worker_group.poll import WorkerStatus from ray.train.v2._internal.logging.logging import LoggingManager from ray.train.v2._internal.logging.patch_print import patch_print_function from ray.train.v2._internal.util import ObjectRefWrapper from ray.types import ObjectRef +if TYPE_CHECKING: + from ray.train.v2._internal.data_integration.interfaces import DatasetShardProvider + T = TypeVar("T") logger = logging.getLogger(__name__) @@ -132,8 +138,14 @@ def run_train_fn(self, train_fn_ref: ObjectRefWrapper[Callable[[], None]]): logger.error(f"Error deserializing the training function: {e}") raise + def train_fn_with_final_checkpoint_flush(): + train_fn() + get_train_context().checkpoint_upload_threadpool.shutdown() + # Create and start the training thread. - get_train_context().execution_context.training_thread_runner.run(train_fn) + get_train_context().execution_context.training_thread_runner.run( + train_fn_with_final_checkpoint_flush + ) def get_metadata(self) -> ActorMetadata: return ActorMetadata( @@ -188,7 +200,8 @@ def init_train_context( synchronization_actor: SynchronizationActor, storage_context: StorageContext, worker_callbacks: List[Union[WorkerCallback, TrainContextCallback]], - dataset_shards: Dict[str, DataIterator] = None, + controller_actor: ActorHandle, + dataset_shard_provider: Optional["DatasetShardProvider"] = None, checkpoint: Optional[Checkpoint] = None, ): self._callbacks = [c for c in worker_callbacks if isinstance(c, WorkerCallback)] @@ -207,8 +220,9 @@ def init_train_context( train_context_callbacks=context_callbacks_to_propagate, ), storage_context=storage_context, - dataset_shards=dataset_shards or {}, + controller_actor=controller_actor, checkpoint=checkpoint, + dataset_shard_provider=dataset_shard_provider, ) # Configure the 
train and root logger for the worker processes. if ray_constants.env_bool( @@ -219,5 +233,8 @@ def init_train_context( # Set the train context global variable for the worker. set_train_context(context) + # user facing train fn utils + set_train_fn_utils(DistributedTrainFnUtils()) + for callback in self._callbacks: callback.after_init_train_context() diff --git a/python/ray/train/v2/_internal/execution/worker_group/worker_group.py b/python/ray/train/v2/_internal/execution/worker_group/worker_group.py index 8931145ecdbb..b103bceac1b7 100644 --- a/python/ray/train/v2/_internal/execution/worker_group/worker_group.py +++ b/python/ray/train/v2/_internal/execution/worker_group/worker_group.py @@ -37,7 +37,6 @@ from ray.train.v2._internal.execution.checkpoint.sync_actor import SynchronizationActor from ray.train.v2._internal.execution.context import ( DistributedContext, - StorageContext, TrainRunContext, ) from ray.train.v2._internal.execution.worker_group.poll import ( @@ -89,6 +88,7 @@ class WorkerGroupContext: num_workers: The number of workers in the worker group. resources_per_worker: The resources per worker. placement_strategy: Strategy for placing workers. + bundle_label_selector: Optional label selectors to apply per-bundle for workers. 
""" run_attempt_id: str @@ -96,6 +96,7 @@ class WorkerGroupContext: num_workers: int resources_per_worker: Dict[str, float] placement_strategy: str = "PACK" + bundle_label_selector: Optional[Dict[str, str]] = None class WorkerGroup: @@ -143,11 +144,7 @@ def __init__( """ self._train_run_context = train_run_context run_config = self._train_run_context.run_config - self._storage_context = StorageContext( - storage_path=run_config.storage_path, - experiment_dir_name=run_config.name, - storage_filesystem=run_config.storage_filesystem, - ) + self._storage_context = run_config.storage_context self._worker_group_context: WorkerGroupContext = worker_group_context @@ -268,10 +265,18 @@ def _start_impl( for callback in self._callbacks: callback.before_worker_group_start(worker_group_context) + bundle_label_selector = ( + [worker_group_context.bundle_label_selector.copy()] + * worker_group_context.num_workers + if worker_group_context.bundle_label_selector + else None + ) + pg = placement_group( bundles=[worker_group_context.resources_per_worker] * worker_group_context.num_workers, strategy=worker_group_context.placement_strategy, + bundle_label_selector=bundle_label_selector, ) logger.info( f"Attempting to start training worker group of size {worker_group_context.num_workers} with " @@ -427,6 +432,7 @@ def _init_train_context_on_workers( synchronization_actor=sync_actor, storage_context=self._storage_context, worker_callbacks=self._worker_callbacks_to_propagate, + controller_actor=ray.get_runtime_context().current_actor, **{ arg: arg_values[i] for arg, arg_values in train_context_args.items() }, @@ -464,13 +470,15 @@ def _clear_state(self): def abort(self): """Abort the worker group.""" - # TODO: consider shutting down the workers in the future. - # We don't do this for now due to this risk of hanging e.g. when calling - # `destroy_process_group` on an active group. 
self._assert_active() for callback in self._callbacks: callback.before_worker_group_abort(self._worker_group_context) + # TODO: Add shutdown callback hooks + + self._worker_group_state.shutdown() + self._clear_state() + ##################################################################################### # Polling Worker Group ##################################################################################### diff --git a/python/ray/train/v2/_internal/logging/logging.py b/python/ray/train/v2/_internal/logging/logging.py index 053dd52e63fe..8fb645df3b5e 100644 --- a/python/ray/train/v2/_internal/logging/logging.py +++ b/python/ray/train/v2/_internal/logging/logging.py @@ -4,9 +4,9 @@ from typing import Optional, Union import ray +from ray._common.filters import CoreContextFilter +from ray._common.formatters import JSONFormatter from ray._private.log import PlainRayHandler -from ray._private.ray_logging.filters import CoreContextFilter -from ray._private.ray_logging.formatters import JSONFormatter from ray.train.v2._internal.execution.context import TrainContext, TrainRunContext from ray.train.v2._internal.util import get_module_name diff --git a/python/ray/train/v2/_internal/util.py b/python/ray/train/v2/_internal/util.py index 8ce512af8d90..4f8a7c32732a 100644 --- a/python/ray/train/v2/_internal/util.py +++ b/python/ray/train/v2/_internal/util.py @@ -1,6 +1,8 @@ import contextlib import functools +import logging import time +import traceback from datetime import datetime from typing import ( Any, @@ -17,8 +19,12 @@ import ray from ray.train._internal.utils import count_required_parameters +from ray.train.v2._internal.exceptions import UserExceptionWithTraceback from ray.types import ObjectRef +logger = logging.getLogger(__name__) + + T = TypeVar("T") @@ -210,3 +216,25 @@ def get_callable_name(fn: Callable) -> str: # Fallback to the class name for objects that implement __call__ return fn.__class__.__name__ + + +def construct_user_exception_with_traceback( + e: 
BaseException, exclude_frames: int = 0 +) -> UserExceptionWithTraceback: + """Construct a UserExceptionWithTraceback from a base exception. + + Args: + e: The base exception to construct a UserExceptionWithTraceback from. + exclude_frames: The number of frames to exclude from the beginning of + the traceback. + + Returns: + A UserExceptionWithTraceback object. + """ + # TODO(justinvyu): This is brittle and may break if the call stack + # changes. Figure out a more robust way to exclude these frames. + exc_traceback_str = traceback.format_exc( + limit=-(len(traceback.extract_tb(e.__traceback__)) - exclude_frames) + ) + logger.error(f"Error in training function:\n{exc_traceback_str}") + return UserExceptionWithTraceback(e, traceback_str=exc_traceback_str) diff --git a/python/ray/train/v2/api/config.py b/python/ray/train/v2/api/config.py index 4efc25a2960c..a640143d6efe 100644 --- a/python/ray/train/v2/api/config.py +++ b/python/ray/train/v2/api/config.py @@ -1,5 +1,6 @@ import logging from dataclasses import dataclass +from functools import cached_property from pathlib import Path from typing import TYPE_CHECKING, List, Optional, Union @@ -12,6 +13,7 @@ ) from ray.runtime_env import RuntimeEnv from ray.train.v2._internal.constants import _DEPRECATED +from ray.train.v2._internal.execution.storage import StorageContext from ray.train.v2._internal.migration_utils import ( FAIL_FAST_DEPRECATION_MESSAGE, TRAINER_RESOURCES_DEPRECATION_MESSAGE, @@ -22,7 +24,6 @@ if TYPE_CHECKING: from ray.train import UserCallback - logger = logging.getLogger(__name__) @@ -34,7 +35,9 @@ class ScalingConfig(ScalingConfigV1): num_workers: The number of workers (Ray actors) to launch. Each worker will reserve 1 CPU by default. The number of CPUs reserved by each worker can be overridden with the - ``resources_per_worker`` argument. 
If the number of workers is 0, + the training function will run in local mode, meaning the training + function runs in the same process. use_gpu: If True, training will be done on GPUs (1 per worker). Defaults to False. The number of GPUs reserved by each worker can be overridden with the ``resources_per_worker`` @@ -51,7 +54,17 @@ class ScalingConfig(ScalingConfigV1): of accelerators. See :ref:`the available accelerator types `. Ensure that your cluster has instances with the specified accelerator type - or is able to autoscale to fulfill the request. + or is able to autoscale to fulfill the request. This field is required + when `use_tpu` is True and `num_workers` is greater than 1. + use_tpu: [Experimental] If True, training will be done on TPUs (1 TPU VM + per worker). Defaults to False. The number of TPUs reserved by each + worker can be overridden with the ``resources_per_worker`` + argument. This arg enables SPMD execution of the training workload. + topology: [Experimental] If specified, Ray Train will launch the training + coordinator and workers on nodes with the specified topology. Topology is + auto-detected for TPUs and added as Ray node labels. This arg enables + SPMD execution of the training workload. This field is required + when `use_tpu` is True and `num_workers` is greater than 1. Example: @@ -73,17 +86,69 @@ class ScalingConfig(ScalingConfigV1): """ trainer_resources: Optional[dict] = None + use_tpu: Union[bool] = False + topology: Optional[str] = None def __post_init__(self): if self.trainer_resources is not None: raise DeprecationWarning(TRAINER_RESOURCES_DEPRECATION_MESSAGE) + if self.use_gpu and self.use_tpu: + raise ValueError("Cannot specify both `use_gpu=True` and `use_tpu=True`.") + + if not self.use_tpu and self.num_tpus_per_worker > 0: + raise ValueError( + "`use_tpu` is False but `TPU` was found in " + "`resources_per_worker`. Either set `use_tpu` to True or " + "remove `TPU` from `resources_per_worker." 
+ ) + + if self.use_tpu and self.num_tpus_per_worker == 0: + raise ValueError( + "`use_tpu` is True but `TPU` is set to 0 in " + "`resources_per_worker`. Either set `use_tpu` to False or " + "request a positive number of `TPU` in " + "`resources_per_worker." + ) + + if self.use_tpu and self.num_workers > 1: + if not self.topology: + raise ValueError( + "`topology` must be specified in ScalingConfig when `use_tpu=True` " + " and `num_workers` > 1." + ) + if not self.accelerator_type: + raise ValueError( + "`accelerator_type` must be specified in ScalingConfig when " + "`use_tpu=True` and `num_workers` > 1." + ) + + if self.num_workers == 0: + logger.info( + "Running in local mode. The training function will run in the same process. " + "If you are using it and running into issues please file a report at " + "https://github.com/ray-project/ray/issues." + ) + super().__post_init__() + @property + def _resources_per_worker_not_none(self): + if self.resources_per_worker is None: + if self.use_tpu: + return {"TPU": 1} + + return super()._resources_per_worker_not_none + @property def _trainer_resources_not_none(self): return {} + @property + def num_tpus_per_worker(self): + """The number of TPUs to set per worker.""" + return self._resources_per_worker_not_none.get("TPU", 0) + @dataclass class FailureConfig(FailureConfigV1): @@ -198,3 +263,11 @@ def __post_init__(self): "See this issue for more context: " "https://github.com/ray-project/ray/issues/49454" ) + + @cached_property + def storage_context(self) -> StorageContext: + return StorageContext( + storage_path=self.storage_path, + experiment_dir_name=self.name, + storage_filesystem=self.storage_filesystem, + ) diff --git a/python/ray/train/v2/api/context.py b/python/ray/train/v2/api/context.py index c2fce63e430f..6d8896a1364a 100644 --- a/python/ray/train/v2/api/context.py +++ b/python/ray/train/v2/api/context.py @@ -1,3 +1,4 @@ +from abc import ABC, abstractmethod from typing import Any, Dict from 
ray.train.v2._internal.execution.context import ( @@ -7,7 +8,9 @@ @PublicAPI(stability="stable") -class TrainContext: +class TrainContext(ABC): + """Abstract interface for training context.""" + @Deprecated def get_metadata(self) -> Dict[str, Any]: """[Deprecated] User metadata dict passed to the Trainer constructor.""" @@ -55,10 +58,12 @@ def get_trial_dir(self) -> str: _TUNE_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE.format("get_trial_dir") ) + @abstractmethod def get_experiment_name(self) -> str: """Experiment name for the corresponding trial.""" - return get_internal_train_context().get_experiment_name() + pass + @abstractmethod def get_world_size(self) -> int: """Get the current world size (i.e. total number of workers) for this run. @@ -85,8 +90,9 @@ def train_loop_per_worker(config): ... """ - return get_internal_train_context().get_world_size() + pass + @abstractmethod def get_world_rank(self) -> int: """Get the world rank of this worker. @@ -112,8 +118,9 @@ def train_loop_per_worker(config): ... """ - return get_internal_train_context().get_world_rank() + pass + @abstractmethod def get_local_rank(self) -> int: """Get the local rank of this worker (rank of the worker on its node). @@ -142,8 +149,9 @@ def train_loop_per_worker(config): ... """ - return get_internal_train_context().get_local_rank() + pass + @abstractmethod def get_local_world_size(self) -> int: """Get the local world size of this node (i.e. number of workers on this node). @@ -170,8 +178,9 @@ def train_loop_per_worker(): ... """ - return get_internal_train_context().get_local_world_size() + pass + @abstractmethod def get_node_rank(self) -> int: """Get the rank of this node. @@ -198,9 +207,10 @@ def train_loop_per_worker(): ... 
""" - return get_internal_train_context().get_node_rank() + pass @DeveloperAPI + @abstractmethod def get_storage(self): """Returns the :class:`~ray.train._internal.storage.StorageContext` storage context which gives advanced access to the filesystem and paths @@ -209,4 +219,61 @@ def get_storage(self): NOTE: This is a developer API, and the `StorageContext` interface may change without notice between minor versions. """ + pass + + +class DistributedTrainContext(TrainContext): + """Implementation of TrainContext for distributed mode.""" + + def get_experiment_name(self) -> str: + return get_internal_train_context().get_experiment_name() + + def get_world_size(self) -> int: + return get_internal_train_context().get_world_size() + + def get_world_rank(self) -> int: + return get_internal_train_context().get_world_rank() + + def get_local_rank(self) -> int: + return get_internal_train_context().get_local_rank() + + def get_local_world_size(self) -> int: + return get_internal_train_context().get_local_world_size() + + def get_node_rank(self) -> int: + return get_internal_train_context().get_node_rank() + + def get_storage(self): return get_internal_train_context().get_storage() + + +class LocalTrainContext(TrainContext): + """Implementation of TrainContext for local mode.""" + + def __init__( + self, + experiment_name: str, + ): + self.experiment_name = experiment_name + + def get_experiment_name(self) -> str: + return self.experiment_name + + def get_world_size(self) -> int: + return 1 + + def get_world_rank(self) -> int: + return 0 + + def get_local_rank(self) -> int: + return 0 + + def get_local_world_size(self) -> int: + return 1 + + def get_node_rank(self) -> int: + """For local mode, we only use one node.""" + return 0 + + def get_storage(self): + raise NotImplementedError("Local storage context not yet implemented. 
") diff --git a/python/ray/train/v2/api/data_parallel_trainer.py b/python/ray/train/v2/api/data_parallel_trainer.py index 40fede922b90..b9f607f3422b 100644 --- a/python/ray/train/v2/api/data_parallel_trainer.py +++ b/python/ray/train/v2/api/data_parallel_trainer.py @@ -1,7 +1,7 @@ -import asyncio import logging import signal import sys +import threading from typing import Any, Callable, Dict, List, Optional, Union import ray @@ -27,9 +27,9 @@ AcceleratorSetupCallback, BackendSetupCallback, DatasetsSetupCallback, + TPUReservationCallback, WorkingDirectorySetupCallback, ) -from ray.train.v2._internal.callbacks.datasets import GenDataset from ray.train.v2._internal.callbacks.env_callback import _initialize_env_callbacks from ray.train.v2._internal.callbacks.metrics import ( ControllerMetricsCallback, @@ -38,15 +38,15 @@ from ray.train.v2._internal.callbacks.state_manager import StateManagerCallback from ray.train.v2._internal.callbacks.user_callback import UserCallbackHandler from ray.train.v2._internal.constants import ( - DEFAULT_RUN_CONTROLLER_AS_ACTOR, METRICS_ENABLED_ENV_VAR, - RUN_CONTROLLER_AS_ACTOR_ENV_VAR, get_env_vars_to_propagate, ) +from ray.train.v2._internal.data_integration.interfaces import GenDataset from ray.train.v2._internal.execution.callback import RayTrainCallback from ray.train.v2._internal.execution.context import TrainRunContext from ray.train.v2._internal.execution.controller import TrainController from ray.train.v2._internal.execution.failure_handling import create_failure_policy +from ray.train.v2._internal.execution.local_mode_utils import LocalController from ray.train.v2._internal.execution.scaling_policy import create_scaling_policy from ray.train.v2._internal.util import ObjectRefWrapper, construct_train_func from ray.train.v2.api.callback import UserCallback @@ -87,6 +87,8 @@ def __init__( self.datasets = datasets or {} self.data_config = dataset_config or DataConfig() + self.running_in_local_mode = self.scaling_config.num_workers == 
0 + self.train_run_context = TrainRunContext( run_config=self.run_config, train_loop_config=self.train_loop_config, @@ -105,6 +107,14 @@ def __init__( usage_lib.record_library_usage("train") tag_train_v2_trainer(self) + def _get_train_func(self) -> Callable[[], None]: + return construct_train_func( + self.train_loop_per_worker, + config=self.train_loop_config, + train_func_context=self.backend_config.train_func_context, + fn_arg_name="train_loop_per_worker", + ) + def fit(self) -> Result: """Launches the Ray Train controller to run training on workers. @@ -115,31 +125,35 @@ def fit(self) -> Result: ray.train.v2.api.exceptions.ControllerError: If a non-retryable error occurs in the Ray Train controller itself, or if the number of retries configured in `FailureConfig` is exhausted. ray.train.v2.api.exceptions.WorkerGroupError: If one or more workers fail during training and the number of retries configured in `FailureConfig` is exhausted. """ - train_fn = construct_train_func( - self.train_loop_per_worker, - config=self.train_loop_config, - train_func_context=self.backend_config.train_func_context, - fn_arg_name="train_loop_per_worker", - ) - train_fn_ref = ObjectRefWrapper(train_fn) - - result = self._initialize_and_run_controller( - train_fn_ref=train_fn_ref, - scaling_policy=create_scaling_policy(self.scaling_config), - failure_policy=create_failure_policy(self.run_config.failure_config), - train_run_context=self.train_run_context, - callbacks=self._create_default_callbacks(), - ) + train_fn = self._get_train_func() + if self.running_in_local_mode: + return self._initialize_and_run_local_controller(train_fn) + else: + train_fn_ref = ObjectRefWrapper(train_fn) + + result = self._initialize_and_run_controller( + train_fn_ref=train_fn_ref, + scaling_policy=create_scaling_policy(self.scaling_config), + failure_policy=create_failure_policy(self.run_config.failure_config), + train_run_context=self.train_run_context, + callbacks=self._create_default_callbacks(), + ) - if 
result.error: - # NOTE: If the training run errored out, raise an error back to the - # user's driver script. - # For example, if the Train `FailurePolicy` runs out of retries, - # and one of the workers errors. The controller will exit, and - # the error will be raised here. - raise result.error + if result.error: + # NOTE: If the training run errored out, raise an error back to the + # user's driver script. + # For example, if the Train `FailurePolicy` runs out of retries, + # and one of the workers errors. The controller will exit, and + # the error will be raised here. + raise result.error - return result + return result + + def _get_local_controller(self) -> LocalController: + return LocalController( + experiment_name=self.run_config.name, + datasets=self.datasets, + ) def _create_default_callbacks(self) -> List[RayTrainCallback]: # Initialize callbacks from environment variable @@ -150,13 +164,13 @@ def _create_default_callbacks(self) -> List[RayTrainCallback]: ) backend_setup_callback = BackendSetupCallback(self.backend_config) datasets_setup_callback = DatasetsSetupCallback( - datasets=self.datasets, - data_config=self.data_config, - scaling_config=self.scaling_config, + train_run_context=self.train_run_context ) + tpu_reservation_setup_callback = TPUReservationCallback() callbacks.extend( [ accelerator_setup_callback, + tpu_reservation_setup_callback, backend_setup_callback, datasets_setup_callback, ] @@ -193,32 +207,32 @@ def _create_default_callbacks(self) -> List[RayTrainCallback]: ) return callbacks - def _initialize_and_run_controller(self, **controller_init_kwargs) -> Result: - run_controller_as_actor = env_bool( - RUN_CONTROLLER_AS_ACTOR_ENV_VAR, DEFAULT_RUN_CONTROLLER_AS_ACTOR - ) - if run_controller_as_actor: - # Attach the controller to the node running the driver script. 
- controller_actor_cls = ray.remote( - num_cpus=0, - scheduling_strategy=NodeAffinitySchedulingStrategy( - node_id=ray.get_runtime_context().get_node_id(), soft=False - ), - # TODO: Extract env variables that affect controller behavior - # and pass them as explicit args - runtime_env={"env_vars": get_env_vars_to_propagate()}, - )(TrainController) - - controller = controller_actor_cls.remote(**controller_init_kwargs) + def _initialize_and_run_local_controller( + self, train_func: Callable[[], None] + ) -> Result: + return self._get_local_controller().run(train_func) + def _initialize_and_run_controller(self, **controller_init_kwargs) -> Result: + # Attach the controller to the node running the driver script. + controller_actor_cls = ray.remote( + num_cpus=0, + scheduling_strategy=NodeAffinitySchedulingStrategy( + node_id=ray.get_runtime_context().get_node_id(), soft=False + ), + # TODO: Extract env variables that affect controller behavior + # and pass them as explicit args + runtime_env={"env_vars": get_env_vars_to_propagate()}, + )(TrainController) + + controller = controller_actor_cls.remote(**controller_init_kwargs) + + # If this is not the main thread - as is the case when running in Tune - + # registering the SIGINT handler raises an exception. 
+ if threading.current_thread() is threading.main_thread(): self._register_sigint_handler(controller) - ray.get(controller.run.remote()) - return ray.get(controller.get_result.remote()) - else: - controller = TrainController(**controller_init_kwargs) - asyncio.run(controller.run()) - return controller.get_result() + ray.get(controller.run.remote()) + return ray.get(controller.get_result.remote()) def _register_sigint_handler(self, controller: ActorHandle[TrainController]): """Register SIGINT handler so user Ctrl C gracefully aborts run.""" diff --git a/python/ray/train/v2/api/report_config.py b/python/ray/train/v2/api/report_config.py new file mode 100644 index 000000000000..bcd4393da287 --- /dev/null +++ b/python/ray/train/v2/api/report_config.py @@ -0,0 +1,21 @@ +from enum import Enum + +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class CheckpointUploadMode(Enum): + """The manner in which we want to upload the checkpoint. + + Args: + ASYNC: Upload checkpoint asynchronously. + SYNC: Upload checkpoint synchronously. + NO_UPLOAD: Do not upload checkpoint. + """ + + ASYNC = "ASYNC" + SYNC = "SYNC" + NO_UPLOAD = "NO_UPLOAD" + + def _default_delete_local_checkpoint_after_upload(self) -> bool: + return self == CheckpointUploadMode.ASYNC diff --git a/python/ray/train/v2/api/reported_checkpoint.py b/python/ray/train/v2/api/reported_checkpoint.py new file mode 100644 index 000000000000..2224f52280d4 --- /dev/null +++ b/python/ray/train/v2/api/reported_checkpoint.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Dict + +from ray.util.annotations import PublicAPI + +if TYPE_CHECKING: + from ray.train import Checkpoint + + +@dataclass +@PublicAPI(stability="alpha") +class ReportedCheckpoint: + """A user-reported checkpoint and its associated metrics. + + Attributes: + checkpoint: The checkpoint reported by the user. + metrics: The metrics associated with that checkpoint. 
+ """ + + checkpoint: "Checkpoint" + metrics: Dict[str, Any] diff --git a/python/ray/train/v2/api/train_fn_utils.py b/python/ray/train/v2/api/train_fn_utils.py index 6266f9441b42..eec8274367c2 100644 --- a/python/ray/train/v2/api/train_fn_utils.py +++ b/python/ray/train/v2/api/train_fn_utils.py @@ -1,19 +1,24 @@ -from typing import TYPE_CHECKING, Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional -from ray.train import Checkpoint -from ray.train.v2._internal.execution.context import get_train_context +from ray.train.v2._internal.data_integration.interfaces import DatasetShardMetadata +from ray.train.v2._internal.execution.train_fn_utils import get_train_fn_utils from ray.train.v2.api.context import TrainContext +from ray.train.v2.api.report_config import CheckpointUploadMode from ray.util.annotations import PublicAPI if TYPE_CHECKING: from ray.data import DataIterator + from ray.train import Checkpoint + from ray.train.v2.api.reported_checkpoint import ReportedCheckpoint @PublicAPI(stability="stable") def report( metrics: Dict[str, Any], - checkpoint: Optional[Checkpoint] = None, + checkpoint: Optional["Checkpoint"] = None, checkpoint_dir_name: Optional[str] = None, + checkpoint_upload_mode: CheckpointUploadMode = CheckpointUploadMode.SYNC, + delete_local_checkpoint_after_upload: Optional[bool] = None, ): """Report metrics and optionally save a checkpoint. @@ -86,10 +91,22 @@ def train_func(config): If provided, it must be unique across all checkpoints per worker to avoid naming collisions. Consider including identifiers such as the epoch or batch index in the name. + checkpoint_upload_mode: The manner in which we want to upload the checkpoint. + Defaults to uploading the checkpoint synchronously. + This works when no checkpoint is provided but is not useful in that case. + delete_local_checkpoint_after_upload: Whether to delete the checkpoint after it is uploaded. 
""" + if delete_local_checkpoint_after_upload is None: + delete_local_checkpoint_after_upload = ( + checkpoint_upload_mode._default_delete_local_checkpoint_after_upload() + ) - get_train_context().report( - metrics=metrics, checkpoint=checkpoint, checkpoint_dir_name=checkpoint_dir_name + get_train_fn_utils().report( + metrics=metrics, + checkpoint=checkpoint, + checkpoint_dir_name=checkpoint_dir_name, + checkpoint_upload_mode=checkpoint_upload_mode, + delete_local_checkpoint_after_upload=delete_local_checkpoint_after_upload, ) @@ -103,11 +120,11 @@ def get_context() -> TrainContext: """ # TODO: Return a dummy train context on the controller and driver process # instead of raising an exception if the train context does not exist. - return TrainContext() + return get_train_fn_utils().get_context() @PublicAPI(stability="stable") -def get_checkpoint() -> Optional[Checkpoint]: +def get_checkpoint() -> Optional["Checkpoint"]: """Access the latest reported checkpoint to resume from if one exists. Example: @@ -148,7 +165,52 @@ def train_func(config): Checkpoint object if the session is currently being resumed. Otherwise, return None. """ - return get_train_context().get_checkpoint() + return get_train_fn_utils().get_checkpoint() + + +@PublicAPI(stability="alpha") +def get_all_reported_checkpoints() -> List["ReportedCheckpoint"]: + """Get all the reported checkpoints so far. + + Blocks until Ray Train has finished processing every `ray.train.report` call. + + Example: + + .. testcode:: + + import tempfile + + from ray import train + from ray.train import Checkpoint + from ray.train.torch import TorchTrainer + + + def train_func(config): + start_epoch = 0 + + for epoch in range(start_epoch, config.get("num_epochs", 10)): + # Do training... + + metrics = {"loss": ...} + + with tempfile.TemporaryDirectory() as temp_checkpoint_dir: + # Save the checkpoint... 
+ + checkpoint = Checkpoint.from_directory(temp_checkpoint_dir) + train.report(metrics, checkpoint=checkpoint) + + reported_checkpoints = train.get_all_reported_checkpoints() + # Report artifacts/metrics to experiment tracking framework... + + trainer = TorchTrainer( + train_func, scaling_config=train.ScalingConfig(num_workers=2) + ) + + Returns: + List of ReportedCheckpoint objects that represent the checkpoints and + corresponding metrics reported by the workers. + """ + return get_train_fn_utils().get_all_reported_checkpoints() @PublicAPI(stability="stable") @@ -195,4 +257,6 @@ def train_loop_per_worker(config): The ``DataIterator`` shard to use for this worker. If no dataset is passed into Trainer, then return None. """ - return get_train_context().get_dataset_shard(dataset_name) + return get_train_fn_utils().get_dataset_shard( + DatasetShardMetadata(dataset_name=dataset_name) + ) diff --git a/python/ray/train/v2/jax/__init__.py b/python/ray/train/v2/jax/__init__.py new file mode 100644 index 000000000000..097ee852b783 --- /dev/null +++ b/python/ray/train/v2/jax/__init__.py @@ -0,0 +1,15 @@ +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + try: + import jax # noqa: F401 + except ModuleNotFoundError as exception: + raise ModuleNotFoundError( + "Jax isn't installed. To install Jax, please check" + " `https://github.com/google/jax#installation` for the instructions." 
+ ) from exception + +from ray.train.v2.jax.config import JaxConfig +from ray.train.v2.jax.jax_trainer import JaxTrainer + +__all__ = ["JaxConfig", "JaxTrainer"] diff --git a/python/ray/train/v2/jax/config.py b/python/ray/train/v2/jax/config.py new file mode 100644 index 000000000000..5e8dc5ba33e4 --- /dev/null +++ b/python/ray/train/v2/jax/config.py @@ -0,0 +1,59 @@ +import logging +import os +from dataclasses import dataclass + +import ray +from ray.train._internal.utils import get_address_and_port +from ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import Backend, BackendConfig +from ray.util import PublicAPI + +logger = logging.getLogger(__name__) + + +@PublicAPI(stability="alpha") +@dataclass +class JaxConfig(BackendConfig): + use_tpu: bool = False + + @property + def backend_cls(self): + return _JaxBackend + + +def _setup_jax_tpu_environment( + master_addr_with_port: str, num_workers: int, index: int +): + """Set up distributed Jax training information. + + This function should be called on each worker. + """ + import jax + + jax_platforms = os.environ.get("JAX_PLATFORMS", "").lower() + + if "tpu" in jax_platforms.split(","): + jax.distributed.initialize(master_addr_with_port, num_workers, index) + + +class _JaxBackend(Backend): + def on_start(self, worker_group: WorkerGroup, backend_config: JaxConfig): + if not backend_config.use_tpu: + return + + master_addr, master_port = worker_group.execute_single(0, get_address_and_port) + master_addr_with_port = f"{master_addr}:{master_port}" + + # Get setup tasks in order to throw errors on failure. 
+ setup_futures = [] + for i in range(len(worker_group)): + setup_futures.append( + worker_group.execute_single_async( + i, + _setup_jax_tpu_environment, + master_addr_with_port=master_addr_with_port, + num_workers=len(worker_group), + index=i, + ) + ) + ray.get(setup_futures) diff --git a/python/ray/train/v2/jax/jax_trainer.py b/python/ray/train/v2/jax/jax_trainer.py new file mode 100644 index 000000000000..f1845d8d50ff --- /dev/null +++ b/python/ray/train/v2/jax/jax_trainer.py @@ -0,0 +1,162 @@ +import logging +from typing import TYPE_CHECKING, Callable, Dict, Optional, Union + +from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated +from ray.train import Checkpoint, DataConfig +from ray.train.trainer import GenDataset +from ray.train.v2.api.config import RunConfig, ScalingConfig +from ray.train.v2.api.data_parallel_trainer import DataParallelTrainer +from ray.train.v2.jax.config import JaxConfig +from ray.util import PublicAPI + +if TYPE_CHECKING: + pass + +logger = logging.getLogger(__name__) + + +@PublicAPI(stability="alpha") +class JaxTrainer(DataParallelTrainer): + """A Trainer for Single-Program Multi-Data (SPMD) JAX training. + Currently only supports TPUs. GPUs will be supported in a future version. + + This Trainer runs the function ``train_loop_per_worker`` on multiple Ray + Actors. These actors are expected to be scheduled on TPU VMs within the same + TPU slice, connected via inter-chip interconnects (ICI). The ``train_loop_per_worker`` + function is expected to take in either 0 or 1 arguments: + + .. 
testcode:: + + import os + from absl import app + import logging + from typing import Sequence + + import ray + from ray.train.v2.api.config import ScalingConfig, RunConfig + from ray.train.v2.jax import JaxTrainer + from MaxText.train import main as maxtext_main + + def train_loop_per_worker(config): + argv = config["argv"] + maxtext_main(argv) + + def main(argv: Sequence[str]): + ray.init() + + trainer = JaxTrainer( + train_loop_per_worker=train_loop_per_worker, + train_loop_config={"argv": absolute_argv}, + scaling_config=ScalingConfig( + use_tpu=True, + num_workers=4, + topology="4x4", + accelerator_type="TPU-V6E", + resources_per_worker={"TPU": 4}, + placement_strategy="SPREAD", + ), + run_config=RunConfig( + name="maxtext_jaxtrainer", + worker_runtime_env={ + "env_vars": { + "JAX_PLATFORMS": "tpu", + "ENABLE_PJRT_COMPATIBILITY": "true", + "TPU_SLICE_BUILDER_DUMP_CHIP_FORCE": "true", + "TPU_SLICE_BUILDER_DUMP_ICI": "true", + "XLA_FLAGS": "--xla_dump_to=/tmp/xla_dump_file --xla_dump_hlo_as_proto", + } + }, + ), + ) + + result = trainer.fit() + + .. testoutput:: + :options: +ELLIPSIS + :hide: + + If ``train_loop_per_worker`` accepts an argument, then + ``train_loop_config`` will be passed in as the argument. + + If the ``datasets`` dict contains a training dataset (denoted by + the "train" key), then it will be split into multiple dataset + shards that can then be accessed by ``session.get_dataset_shard("train")``. + + Note: + * Only TPU-based distributed training is supported. + * Each worker must be assigned one TPU device via + ``resources_per_worker={"TPU": 1}``. + * Placement strategy is automatically set to ``SPREAD`` to ensure + TPU workers are placed on separate VMs. + * Importing `jax` should occur within `train_loop_per_worker` to + avoid driver-side TPU lock issues. + + Args: + train_loop_per_worker: The training function to execute on each worker. 
+ This function can either take in zero arguments or a single ``Dict`` + argument which is set by defining ``train_loop_config``. + Within this function you can use any of the + :ref:`Ray Train Loop utilities `. + train_loop_config: A configuration ``Dict`` to pass in as an argument to + ``train_loop_per_worker``. + This is typically used for specifying hyperparameters. Passing large + datasets via `train_loop_config` is not recommended and may introduce + large overhead and unknown issues with serialization and deserialization. + jax_config: The configuration for setting up the JAX backend. + If set to None, a default configuration with TPUs will be used. + scaling_config: Configuration for how to scale data parallel training + with SPMD. ``num_workers`` should be set to the number of TPU hosts + and ``topology`` should be set to the TPU topology. + See :class:`~ray.train.ScalingConfig` for more info. + dataset_config: The configuration for ingesting the input ``datasets``. + By default, all the Ray Dataset are split equally across workers. + See :class:`~ray.train.DataConfig` for more details. + run_config: The configuration for the execution of the training run. + See :class:`~ray.train.RunConfig` for more info. + datasets: The Ray Datasets to ingest for training. + Datasets are keyed by name (``{name: dataset}``). + Each dataset can be accessed from within the ``train_loop_per_worker`` + by calling ``ray.train.get_dataset_shard(name)``. + Sharding and additional configuration can be done by + passing in a ``dataset_config``. + resume_from_checkpoint: A checkpoint to resume training from. + This checkpoint can be accessed from within ``train_loop_per_worker`` + by calling ``ray.train.get_checkpoint()``. 
+ """ + + def __init__( + self, + train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]], + *, + train_loop_config: Optional[Dict] = None, + jax_config: Optional[JaxConfig] = None, + scaling_config: Optional[ScalingConfig] = None, + dataset_config: Optional[Dict[str, DataConfig]] = None, + run_config: Optional[RunConfig] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + resume_from_checkpoint: Optional[Checkpoint] = None, + ): + if not jax_config: + jax_config = JaxConfig( + use_tpu=scaling_config.use_tpu, + ) + super(JaxTrainer, self).__init__( + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + backend_config=jax_config, + scaling_config=scaling_config, + dataset_config=dataset_config, + run_config=run_config, + datasets=datasets, + resume_from_checkpoint=resume_from_checkpoint, + ) + + @classmethod + def _validate_scaling_config(cls, scaling_config: ScalingConfig) -> ScalingConfig: + """Return scaling config dataclass after validating updated keys.""" + ensure_only_allowed_dataclass_keys_updated( + dataclass=scaling_config, + allowed_keys=cls._scaling_config_allowed_keys, + ) + + return scaling_config diff --git a/python/ray/train/v2/lightning/lightning_utils.py b/python/ray/train/v2/lightning/lightning_utils.py deleted file mode 100644 index 4d417248c73d..000000000000 --- a/python/ray/train/v2/lightning/lightning_utils.py +++ /dev/null @@ -1,58 +0,0 @@ -import os -import shutil -import tempfile -from pathlib import Path - -import ray.train -from ray._common.usage.usage_lib import TagKey, record_extra_usage_tag -from ray.train.lightning._lightning_utils import ( - RayTrainReportCallback as RayTrainReportCallbackV1, - import_lightning, -) -from ray.util import PublicAPI - -pl = import_lightning() - - -@PublicAPI(stability="beta") -class RayTrainReportCallback(RayTrainReportCallbackV1): - """A simple callback that reports checkpoints to Ray on train epoch end. 
- - This callback is a subclass of `lightning.pytorch.callbacks.Callback - `_. - - It fetches the latest `trainer.callback_metrics` and reports together with - the checkpoint on each training epoch end. - - Checkpoints will be saved in the following structure: - - checkpoint_{timestamp}/ Ray Train's checkpoint folder - └─ checkpoint.ckpt Lightning's checkpoint format - - For customized reporting and checkpointing logic, implement your own - `lightning.pytorch.callbacks.Callback` following this user - guide: :ref:`Saving and Loading Checkpoints `. - """ - - def __init__(self) -> None: - # TODO: Upstream this change into ray.train.lightning. - # The difference in this version is removing the trial directory usage. - job_id = ray.get_runtime_context().get_job_id() - experiment_name = ray.train.get_context().get_experiment_name() - self.local_rank = ray.train.get_context().get_local_rank() - - # Create a root temporary directory for storing local checkpoints - # before persisting to storage. - # Lightning's checkpointing implementation requires that this directory - # is a common path across all workers. - # Construct the path prefix with the job id and experiment name, - # which are shared across workers for a Ray Train run. - # This path should not be shared across different Ray Train runs. 
- self.tmpdir_prefix = Path( - tempfile.gettempdir(), - f"lightning_checkpoints-job_id={job_id}-name={experiment_name}", - ).as_posix() - if os.path.isdir(self.tmpdir_prefix) and self.local_rank == 0: - shutil.rmtree(self.tmpdir_prefix) - - record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYTRAINREPORTCALLBACK, "1") diff --git a/python/ray/train/v2/tests/conftest.py b/python/ray/train/v2/tests/conftest.py index 6f52bcd6e438..da060d7e8dee 100644 --- a/python/ray/train/v2/tests/conftest.py +++ b/python/ray/train/v2/tests/conftest.py @@ -3,6 +3,7 @@ import pytest import ray +from ray import runtime_context from ray.train.v2._internal.constants import ( ENABLE_STATE_ACTOR_RECONCILIATION_ENV_VAR, ) @@ -34,3 +35,26 @@ def shutdown_only(): def disable_state_actor_polling(monkeypatch): monkeypatch.setenv(ENABLE_STATE_ACTOR_RECONCILIATION_ENV_VAR, "0") yield + + +@pytest.fixture +def mock_runtime_context(monkeypatch): + @ray.remote + class DummyActor: + pass + + # Must return real actor handle so it can get passed to other actors + # Cannot create actor here since ray has not been initialized yet + def mock_current_actor(self): + return DummyActor.remote() + + # In unit tests where the controller is not an actor, current_actor is + # a DummyActor, which is ok because it won't be called in those tests. + # In unit tests where the controller is an actor, current_actor is the + # controller actor because monkeypatch doesn't propagate to the actor + # process. Those tests can successfully test methods on that actor. 
+ monkeypatch.setattr( + runtime_context.RuntimeContext, "current_actor", property(mock_current_actor) + ) + + yield diff --git a/python/ray/train/v2/tests/test_accelerator_utils.py b/python/ray/train/v2/tests/test_accelerator_utils.py index 8c5cff9cd06f..d9e0149bf64c 100644 --- a/python/ray/train/v2/tests/test_accelerator_utils.py +++ b/python/ray/train/v2/tests/test_accelerator_utils.py @@ -98,7 +98,7 @@ def test_missing_accelerator(): ) -def test_accelerator_setup_callback(mock_gpu_cluster): +def test_accelerator_setup_callback(mock_gpu_cluster, mock_runtime_context): """The accelerator setup callback should set the CUDA_VISIBLE_DEVICES on each worker properly.""" diff --git a/python/ray/train/v2/tests/test_async_checkpointing.py b/python/ray/train/v2/tests/test_async_checkpointing.py new file mode 100644 index 000000000000..6e2251e8510c --- /dev/null +++ b/python/ray/train/v2/tests/test_async_checkpointing.py @@ -0,0 +1,223 @@ +import os +from unittest.mock import create_autospec + +import pytest + +import ray +import ray.cloudpickle as ray_pickle +from ray.train import Checkpoint, RunConfig, ScalingConfig +from ray.train.v2.api.data_parallel_trainer import DataParallelTrainer +from ray.train.v2.api.exceptions import WorkerGroupError +from ray.train.v2.api.report_config import CheckpointUploadMode + + +def test_report_mixed_checkpoint_upload_modes(ray_start_4_cpus, tmp_path): + """Run all 10 possible pairs (e.g. (SYNC, ASYNC)) of checkpoint upload modes between 2 workers.""" + + def get_checkpoint_iteration(checkpoint): + if not checkpoint: + return -1 + return int(checkpoint.path.split("_")[-1]) + + def train_fn(): + # When reporting with async checkpointing, write the checkpoint to + # tmp_path, which stays alive for the duration of the test, instead of + # tempfile.TemporaryDirectory(), which might get deleted before the + # async checkpoint upload completes. 
+ + # Run all 10 possible pairs of checkpoint upload modes + rank = ray.train.get_context().get_world_rank() + if rank == 0: + ASYNC_ITERATIONS = [0, 1, 2, 3] + SYNC_ITERATIONS = [4, 5, 6] + NO_UPLOAD_ITERATIONS = [7, 8] + NO_CHECKPOINT_ITERATIONS = [9] + else: + ASYNC_ITERATIONS = [0] + SYNC_ITERATIONS = [1, 4] + NO_UPLOAD_ITERATIONS = [2, 5, 7] + NO_CHECKPOINT_ITERATIONS = [3, 6, 8, 9] + + prev_latest_checkpoint_iteration = -1 + for i in range(10): + # Set variables + if i in ASYNC_ITERATIONS: + checkpoint_upload_mode = CheckpointUploadMode.ASYNC + elif i in SYNC_ITERATIONS: + checkpoint_upload_mode = CheckpointUploadMode.SYNC + else: + checkpoint_upload_mode = CheckpointUploadMode.NO_UPLOAD + metrics = {"metric": f"iteration_{i}_shard_{rank}"} + + # Create and report checkpoint + if i in NO_CHECKPOINT_ITERATIONS: + ray.train.report( + metrics=metrics, + checkpoint=None, + ) + assert prev_latest_checkpoint_iteration <= get_checkpoint_iteration( + ray.train.get_checkpoint() + ) + else: + # Create remote or local checkpoint_dir + checkpoint_dir_name = f"checkpoint_iteration_{i}" + if i in NO_UPLOAD_ITERATIONS: + checkpoint_dir = ( + ray.train.get_context() + .get_storage() + .build_checkpoint_path_from_name(checkpoint_dir_name) + ) + else: + checkpoint_dir = os.path.join( + tmp_path, checkpoint_dir_name, f"_{rank}" + ) + + # Create and report that remote or local checkpoint + os.makedirs(checkpoint_dir, exist_ok=True) + with open(os.path.join(checkpoint_dir, f"shard_{rank}"), "wb") as f: + ray_pickle.dump(f"iteration_{i}_shard_{rank}", f) + checkpoint = Checkpoint(checkpoint_dir) + ray.train.report( + metrics=metrics, + checkpoint=checkpoint, + checkpoint_upload_mode=checkpoint_upload_mode, + checkpoint_dir_name=checkpoint_dir_name, + ) + + # Check the status of latest_checkpoint + latest_checkpoint = ray.train.get_checkpoint() + if i in NO_UPLOAD_ITERATIONS: + assert latest_checkpoint == checkpoint + elif i in SYNC_ITERATIONS: + assert checkpoint_dir_name in 
latest_checkpoint.path + else: + assert prev_latest_checkpoint_iteration <= get_checkpoint_iteration( + latest_checkpoint + ) + + prev_latest_checkpoint_iteration = get_checkpoint_iteration( + latest_checkpoint + ) + + trainer = DataParallelTrainer( + train_fn, + scaling_config=ScalingConfig(num_workers=2), + run_config=RunConfig(storage_path=str(tmp_path)), + ) + result = trainer.fit() + # Note that the (checkpoint=None, checkpoint=None) pair does not produce any checkpoint + assert len(result.best_checkpoints) == 9 + for i, (checkpoint, metrics) in enumerate(result.best_checkpoints): + assert checkpoint.path.endswith(f"checkpoint_iteration_{i}") + assert metrics["metric"] == f"iteration_{i}_shard_0" + + +@pytest.mark.parametrize( + "delete_local_checkpoint_after_upload,checkpoint_upload_mode", + [ + (True, CheckpointUploadMode.ASYNC), + (False, CheckpointUploadMode.ASYNC), + (True, CheckpointUploadMode.SYNC), + (False, CheckpointUploadMode.SYNC), + (True, CheckpointUploadMode.NO_UPLOAD), + (False, CheckpointUploadMode.NO_UPLOAD), + ], +) +def test_report_delete_local_checkpoint_after_upload( + ray_start_4_cpus, + tmp_path, + delete_local_checkpoint_after_upload, + checkpoint_upload_mode, +): + """Check that the local checkpoint is deleted after upload.""" + + def train_fn(): + rank = ray.train.get_context().get_world_rank() + if rank == 0: + if checkpoint_upload_mode == CheckpointUploadMode.NO_UPLOAD: + checkpoint_dir = ( + ray.train.get_context() + .get_storage() + .build_checkpoint_path_from_name("my_checkpoint_dir") + ) + else: + checkpoint_dir = os.path.join( + tmp_path, + "my_checkpoint_dir", + ) + os.makedirs(checkpoint_dir, exist_ok=True) + with open(os.path.join(checkpoint_dir, "shard_0"), "wb") as f: + ray_pickle.dump("some_checkpoint_contents", f) + checkpoint = Checkpoint(checkpoint_dir) + ray.train.report( + {}, + checkpoint, + checkpoint_upload_mode=checkpoint_upload_mode, + delete_local_checkpoint_after_upload=delete_local_checkpoint_after_upload, + 
) + else: + ray.train.report( + {}, + None, + ) + + trainer = DataParallelTrainer( + train_fn, + scaling_config=ScalingConfig(num_workers=2), + run_config=RunConfig(storage_path=str(tmp_path)), + ) + trainer.fit() + if ( + delete_local_checkpoint_after_upload + or checkpoint_upload_mode == CheckpointUploadMode.NO_UPLOAD + ): + assert not os.path.exists(os.path.join(tmp_path, "my_checkpoint_dir")) + else: + assert os.path.exists(os.path.join(tmp_path, "my_checkpoint_dir")) + + +def test_report_checkpoint_upload_error(ray_start_4_cpus, monkeypatch, tmp_path): + """Check that the trainer shuts down when an error occurs during checkpoint upload.""" + + def train_fn(): + + if ray.train.get_context().get_world_rank() == 0: + + # Mock persist_current_checkpoint to raise an error + mock_persist_current_checkpoint = create_autospec( + ray.train.get_context().get_storage().persist_current_checkpoint + ) + mock_persist_current_checkpoint.side_effect = ValueError("error") + monkeypatch.setattr( + ray.train.get_context().get_storage(), + "persist_current_checkpoint", + mock_persist_current_checkpoint, + ) + + # Report minimal valid checkpoint + local_checkpoint_dir = os.path.join(tmp_path, "local_checkpoint_dir") + os.makedirs(local_checkpoint_dir, exist_ok=True) + ray.train.report( + {}, + Checkpoint.from_directory(local_checkpoint_dir), + checkpoint_upload_mode=CheckpointUploadMode.ASYNC, + ) + else: + ray.train.report( + {}, None, checkpoint_upload_mode=CheckpointUploadMode.ASYNC + ) + + trainer = DataParallelTrainer( + train_fn, + scaling_config=ScalingConfig(num_workers=2), + run_config=RunConfig(storage_path=str(tmp_path)), + ) + with pytest.raises(WorkerGroupError) as exc_info: + trainer.fit() + assert isinstance(exc_info.value.worker_failures[0], ValueError) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/train/v2/tests/test_checkpoint_manager.py b/python/ray/train/v2/tests/test_checkpoint_manager.py 
index e1caf79cd8f9..2d9304a112b6 100644 --- a/python/ray/train/v2/tests/test_checkpoint_manager.py +++ b/python/ray/train/v2/tests/test_checkpoint_manager.py @@ -89,13 +89,15 @@ def _training_results_equal( ), ], ) -def test_save_load_state_equivalence( +async def test_save_load_state_equivalence( monkeypatch, tmp_path, checkpoint_config: CheckpointConfig ): + # Use async here because register_checkpoint creates an async task + # Mock the delete function as we don't want report checkpoints to be deleted. monkeypatch.setattr( ray.train.v2._internal.execution.checkpoint.checkpoint_manager, - "_delete_fs_path", + "delete_fs_path", lambda *args, **kwargs: None, ) exp_name = f"checkpoint_manager_test-{uuid.uuid4().hex}" @@ -113,8 +115,9 @@ def test_save_load_state_equivalence( ) # Register the training results into checkpoint manager - for tr in training_results: + for i, tr in enumerate(training_results): checkpoint_manager.register_checkpoint(tr) + assert checkpoint_manager._current_report_index == i + 1 loaded_checkpoint_manager = CheckpointManager( storage_context=storage_context, checkpoint_config=checkpoint_config, @@ -145,7 +148,8 @@ def test_load_state_error(tmp_path, json_state): checkpoint_manager._load_state(json_state) -def test_before_init_train_context(tmp_path): +async def test_before_init_train_context(tmp_path): + storage_context = StorageContext( storage_path=tmp_path, experiment_dir_name="my_experiment_name", @@ -158,14 +162,14 @@ def test_before_init_train_context(tmp_path): # Assert without a checkpoint. 
assert checkpoint_manager.before_init_train_context(workers) == { - "checkpoint": [None] * 4 + "checkpoint": [None] * 4, } # Assert with a checkpoint latest_checkpoint_result = _create_dummy_training_results(1, storage_context)[0] - checkpoint_manager._latest_checkpoint_result = latest_checkpoint_result + checkpoint_manager.register_checkpoint(latest_checkpoint_result) assert checkpoint_manager.before_init_train_context(workers) == { - "checkpoint": [latest_checkpoint_result.checkpoint] * 4 + "checkpoint": [latest_checkpoint_result.checkpoint] * 4, } diff --git a/python/ray/train/v2/tests/test_collective.py b/python/ray/train/v2/tests/test_collective.py index d8196ea420a5..046eedf2f979 100644 --- a/python/ray/train/v2/tests/test_collective.py +++ b/python/ray/train/v2/tests/test_collective.py @@ -4,7 +4,7 @@ import ray import ray.train.collective -from ray.train.collective import collectives +from ray.train.v2._internal.execution import collective_impl from ray.train.v2.api.data_parallel_trainer import DataParallelTrainer @@ -49,12 +49,14 @@ def train_fn(): def test_broadcast_from_rank_zero_data_too_big(ray_start_4_cpus): def train_fn(): - collectives.logger = mock.create_autospec(collectives.logger, instance=True) - collectives._MAX_BROADCAST_SIZE_BYTES = 0 + collective_impl.logger = mock.create_autospec( + collective_impl.logger, instance=True + ) + collective_impl._MAX_BROADCAST_SIZE_BYTES = 0 rank = ray.train.get_context().get_world_rank() value = ray.train.collective.broadcast_from_rank_zero({"key": rank}) assert value == {"key": 0} - collectives.logger.warning.assert_called_once() + collective_impl.logger.warning.assert_called_once() trainer = DataParallelTrainer( train_fn, diff --git a/python/ray/train/v2/tests/test_controller.py b/python/ray/train/v2/tests/test_controller.py index 3fbddbc2b64e..80a45fba69a3 100644 --- a/python/ray/train/v2/tests/test_controller.py +++ b/python/ray/train/v2/tests/test_controller.py @@ -36,6 +36,8 @@ create_dummy_run_context, 
) +pytestmark = pytest.mark.usefixtures("mock_runtime_context") + @pytest.fixture(autouse=True) def patch_worker_group(monkeypatch): diff --git a/python/ray/train/v2/tests/test_data_integration.py b/python/ray/train/v2/tests/test_data_integration.py index fe8159d5190f..a542520f0001 100644 --- a/python/ray/train/v2/tests/test_data_integration.py +++ b/python/ray/train/v2/tests/test_data_integration.py @@ -1,5 +1,3 @@ -from unittest.mock import MagicMock - import pytest import ray.data @@ -7,13 +5,17 @@ from ray.data import DataContext, ExecutionResources from ray.data._internal.iterator.stream_split_iterator import StreamSplitDataIterator from ray.data.tests.conftest import restore_data_context # noqa: F401 -from ray.train.v2._internal.callbacks import DatasetsSetupCallback -from ray.train.v2._internal.execution.context import TrainRunContext +from ray.train.v2._internal.callbacks.datasets import DatasetsSetupCallback +from ray.train.v2._internal.data_integration.interfaces import DatasetShardMetadata from ray.train.v2._internal.execution.worker_group.worker_group import ( WorkerGroupContext, ) from ray.train.v2.api.data_parallel_trainer import DataParallelTrainer -from ray.train.v2.tests.util import DummyObjectRefWrapper, DummyWorkerGroup +from ray.train.v2.tests.util import ( + DummyObjectRefWrapper, + DummyWorkerGroup, + create_dummy_run_context, +) # TODO(justinvyu): Bring over more tests from ray/air/tests/test_new_dataset_config.py @@ -76,24 +78,30 @@ def test_dataset_setup_callback(ray_start_4_cpus): num_workers=scaling_config.num_workers, resources_per_worker=scaling_config.resources_per_worker, ) + train_run_context = create_dummy_run_context( + datasets={"train": train_ds, "valid": valid_ds}, + dataset_config=data_config, + scaling_config=scaling_config, + ) worker_group = DummyWorkerGroup( - train_run_context=MagicMock(spec=TrainRunContext), + train_run_context=train_run_context, worker_group_context=worker_group_context, ) worker_group._start() - 
callback = DatasetsSetupCallback( - datasets={"train": train_ds, "valid": valid_ds}, - data_config=data_config, - scaling_config=scaling_config, - ) - dataset_shards = callback.before_init_train_context(worker_group.get_workers())[ - "dataset_shards" - ] - assert len(dataset_shards) == NUM_WORKERS + callback = DatasetsSetupCallback(train_run_context) + dataset_manager_for_each_worker = callback.before_init_train_context( + worker_group.get_workers() + )["dataset_shard_provider"] + assert len(dataset_manager_for_each_worker) == NUM_WORKERS - processed_train_ds = dataset_shards[0]["train"] - processed_valid_ds = dataset_shards[0]["valid"] + dataset_manager = dataset_manager_for_each_worker[0] + processed_train_ds = dataset_manager.get_dataset_shard( + DatasetShardMetadata(dataset_name="train") + ) + processed_valid_ds = dataset_manager.get_dataset_shard( + DatasetShardMetadata(dataset_name="valid") + ) assert isinstance(processed_train_ds, StreamSplitDataIterator) assert not isinstance(processed_valid_ds, StreamSplitDataIterator) diff --git a/python/ray/train/v2/tests/test_data_parallel_trainer.py b/python/ray/train/v2/tests/test_data_parallel_trainer.py index c4ec69739ed3..007bd1226f23 100644 --- a/python/ray/train/v2/tests/test_data_parallel_trainer.py +++ b/python/ray/train/v2/tests/test_data_parallel_trainer.py @@ -141,6 +141,30 @@ def train_fn(): assert tmp_path.joinpath("validate", str(rank)).exists() +def test_report_get_all_reported_checkpoints(): + """Check that get_all_reported_checkpoints returns checkpoints depending on # report calls.""" + + def train_fn(): + if ray.train.get_context().get_world_rank() == 0: + ray.train.report(metrics={}, checkpoint=None) + with create_dict_checkpoint({}) as checkpoint: + ray.train.report(metrics={}, checkpoint=checkpoint) + assert len(ray.train.get_all_reported_checkpoints()) == 1 + with create_dict_checkpoint({}) as checkpoint: + ray.train.report(metrics={}, checkpoint=checkpoint) + else: + ray.train.report(metrics={}, 
checkpoint=None) + ray.train.report(metrics={}, checkpoint=None) + ray.train.report(metrics={}, checkpoint=None) + assert len(ray.train.get_all_reported_checkpoints()) == 2 + + trainer = DataParallelTrainer( + train_fn, + scaling_config=ScalingConfig(num_workers=2), + ) + trainer.fit() + + def test_error(tmp_path): def _error_func_rank_0(): """An example train_fun that raises an error on rank 0.""" diff --git a/python/ray/train/v2/tests/test_jax_trainer.py b/python/ray/train/v2/tests/test_jax_trainer.py new file mode 100644 index 000000000000..cc77f03b1ae6 --- /dev/null +++ b/python/ray/train/v2/tests/test_jax_trainer.py @@ -0,0 +1,137 @@ +import pytest + +import ray +from ray.tests.conftest import _ray_start_cluster +from ray.train import RunConfig, ScalingConfig +from ray.train.v2._internal.constants import HEALTH_CHECK_INTERVAL_S_ENV_VAR +from ray.train.v2.jax import JaxTrainer + + +@pytest.fixture +def ray_tpu_single_host(monkeypatch): + """Start a mock single-host TPU Ray cluster with 2x4 v6e (8 chips per host).""" + with _ray_start_cluster() as cluster: + monkeypatch.setenv("TPU_ACCELERATOR_TYPE", "v6e-8") + + # Simulate one node with 8 TPU chips. 
+ cluster.add_node( + num_cpus=4, + resources={"TPU": 8}, + ) + + ray.init(address=cluster.address) + + yield cluster + ray.shutdown() + + +@pytest.fixture +def ray_tpu_multi_host(monkeypatch): + """Start a simulated multi-host TPU Ray cluster.""" + with _ray_start_cluster() as cluster: + monkeypatch.setenv("TPU_NAME", "test-slice-1") + monkeypatch.setenv("TPU_WORKER_ID", "0") + monkeypatch.setenv("TPU_ACCELERATOR_TYPE", "v4-8") + monkeypatch.setenv("TPU_TOPOLOGY", "2x2x2") + + cluster.add_node( + num_cpus=2, + resources={"TPU": 4, "TPU-v4-8-head": 1}, + ) + monkeypatch.setenv("TPU_WORKER_ID", "1") + cluster.add_node( + num_cpus=2, + resources={"TPU": 4}, + ) + + ray.init(address=cluster.address) + + yield cluster + ray.shutdown() + + +@pytest.fixture(autouse=True) +def reduce_health_check_interval(monkeypatch): + monkeypatch.setenv(HEALTH_CHECK_INTERVAL_S_ENV_VAR, "0.2") + yield + + +def train_func(): + import jax + + from ray import train + + devices = jax.devices() + print(f"Devices on this worker: {devices}") + train.report({"result": [str(d) for d in devices]}) + + +def test_minimal_singlehost(ray_tpu_single_host, tmp_path): + trainer = JaxTrainer( + train_loop_per_worker=train_func, + # Topology can be omitted for single-host. + scaling_config=ScalingConfig( + num_workers=1, + resources_per_worker={"TPU": 8}, + use_tpu=True, + accelerator_type="TPU-V6E", + ), + run_config=RunConfig( + storage_path=str(tmp_path), + worker_runtime_env={ + "pip": ["jax"], + "env_vars": { + "JAX_PLATFORMS": "cpu", + }, + }, + ), + ) + result = trainer.fit() + assert result.error is None + + # Check that exactly 1 TPU node was used. 
+ nodes = ray.nodes() + labeled_nodes = [ + node for node in nodes if node["Alive"] and node["Resources"].get("TPU") == 8 + ] + assert len(labeled_nodes) == 1 + + +def test_minimal_multihost(ray_tpu_multi_host, tmp_path): + trainer = JaxTrainer( + train_loop_per_worker=train_func, + scaling_config=ScalingConfig( + num_workers=2, + resources_per_worker={"TPU": 4}, + use_tpu=True, + topology="2x2x2", + accelerator_type="TPU-V4", + ), + run_config=RunConfig( + storage_path=str(tmp_path), + worker_runtime_env={ + "pip": ["jax"], + "env_vars": { + "JAX_PLATFORMS": "cpu", + }, + }, + ), + ) + result = trainer.fit() + assert result.error is None + + # Check that multi-host slice was scheduled atomically. + nodes = ray.nodes() + slice_label = "test-slice-1" + labeled_nodes = [ + node + for node in nodes + if node["Alive"] and node["Labels"].get("ray.io/tpu-slice-name") == slice_label + ] + assert len(labeled_nodes) == 2 + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/train/v2/tests/test_local_mode.py b/python/ray/train/v2/tests/test_local_mode.py new file mode 100644 index 000000000000..9b22a8e6daef --- /dev/null +++ b/python/ray/train/v2/tests/test_local_mode.py @@ -0,0 +1,526 @@ +import math +import sys +from unittest.mock import MagicMock + +import lightgbm +import pandas as pd +import pytest +import xgboost +from datasets import Dataset +from sklearn.datasets import load_breast_cancer +from sklearn.model_selection import train_test_split +from transformers import AutoConfig, AutoModelForCausalLM, Trainer, TrainingArguments + +import ray +from ray.data.preprocessors import Concatenator +from ray.tests.conftest import _ray_start_cluster +from ray.train import ScalingConfig +from ray.train.constants import TRAIN_DATASET_KEY +from ray.train.examples.pytorch.torch_linear_example import ( + train_func as linear_train_func, +) +from ray.train.huggingface.transformers import ( + RayTrainReportCallback as 
HuggingFaceRayTrainReportCallback, + prepare_trainer, +) +from ray.train.lightgbm import ( + LightGBMTrainer, + RayTrainReportCallback as LightGBMRayTrainReportCallback, +) +from ray.train.lightning import ( + RayDDPStrategy, + RayFSDPStrategy, + RayLightningEnvironment, + RayTrainReportCallback as LightningRayTrainReportCallback, +) +from ray.train.lightning._lightning_utils import import_lightning +from ray.train.tests._huggingface_data import train_data, validation_data +from ray.train.tests.lightning_test_utils import DummyDataModule, LinearModule +from ray.train.tests.util import create_dict_checkpoint +from ray.train.torch import TorchTrainer +from ray.train.v2.api.data_parallel_trainer import DataParallelTrainer +from ray.train.v2.jax import JaxTrainer +from ray.train.xgboost import ( + RayTrainReportCallback as XGBoostRayTrainReportCallback, + XGBoostTrainer, +) + +if sys.version_info >= (3, 12): + # Tensorflow is not installed for Python 3.12 because of keras compatibility. + pass +else: + from ray.train.examples.tf.tensorflow_regression_example import ( + train_func as tensorflow_linear_train_func, + ) + from ray.train.tensorflow import TensorflowTrainer + +pl = import_lightning() + + +@pytest.fixture +def ray_start_6_cpus(): + address_info = ray.init(num_cpus=6) + yield address_info + # The code after the yield will run as teardown code. + ray.shutdown() + + +@pytest.fixture +def ray_tpu_single_host(monkeypatch): + """Start a mock single-host TPU Ray cluster with 2x4 v6e (8 chips per host).""" + with _ray_start_cluster() as cluster: + monkeypatch.setenv("TPU_ACCELERATOR_TYPE", "v6e-8") + + # Simulate one node with 8 TPU chips. 
+ cluster.add_node( + num_cpus=4, + resources={"TPU": 8}, + ) + + ray.init(address=cluster.address) + + yield cluster + ray.shutdown() + + +def test_data_parallel_trainer_local_mode(): + def train_fn(): + with create_dict_checkpoint({}) as checkpoint: + ray.train.report(metrics={"test": 1}, checkpoint=checkpoint) + + trainer = DataParallelTrainer(train_fn, scaling_config=ScalingConfig(num_workers=0)) + result = trainer.fit() + assert result.metrics == {"test": 1} + assert result.checkpoint + + +def test_jax_trainer_local_mode(ray_tpu_single_host, monkeypatch): + def jax_train_func(): + import jax + + devices = jax.devices() + print(f"Devices on this worker: {devices}") + ray.train.report({"result": [str(d) for d in devices]}) + + mock_jax = MagicMock() + mock_jax.devices.return_value = ["TPU:0"] + monkeypatch.setitem(sys.modules, "jax", mock_jax) + + trainer = JaxTrainer( + train_loop_per_worker=jax_train_func, + scaling_config=ScalingConfig( + num_workers=0, + ), + ) + result = trainer.fit() + assert result.error is None + assert result.metrics == {"result": ["TPU:0"]} + + +def test_lightgbm_trainer_local_mode(ray_start_6_cpus): + def lightgbm_train_fn_per_worker( + config: dict, + label_column: str, + dataset_keys: set, + num_boost_round: int = 10, + ): + remaining_iters = num_boost_round + train_ds_iter = ray.train.get_dataset_shard(TRAIN_DATASET_KEY) + train_df = train_ds_iter.materialize().to_pandas() + + eval_ds_iters = { + k: ray.train.get_dataset_shard(k) + for k in dataset_keys + if k != TRAIN_DATASET_KEY + } + eval_dfs = {k: d.materialize().to_pandas() for k, d in eval_ds_iters.items()} + + train_X, train_y = train_df.drop(label_column, axis=1), train_df[label_column] + train_set = lightgbm.Dataset(train_X, label=train_y) + + # NOTE: Include the training dataset in the evaluation datasets. + # This allows `train-*` metrics to be calculated and reported. 
+ valid_sets = [train_set] + valid_names = [TRAIN_DATASET_KEY] + + for eval_name, eval_df in eval_dfs.items(): + eval_X, eval_y = eval_df.drop(label_column, axis=1), eval_df[label_column] + valid_sets.append(lightgbm.Dataset(eval_X, label=eval_y)) + valid_names.append(eval_name) + + # Add network params of the worker group to enable distributed training. + config.update(ray.train.lightgbm.get_network_params()) + + lightgbm.train( + params=config, + train_set=train_set, + num_boost_round=remaining_iters, + valid_sets=valid_sets, + valid_names=valid_names, + init_model=None, + callbacks=[LightGBMRayTrainReportCallback()], + ) + + data_raw = load_breast_cancer() + dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"]) + dataset_df["target"] = data_raw["target"] + train_df, test_df = train_test_split(dataset_df, test_size=0.3) + + train_df_with_cat = train_df.copy() + test_df_with_cat = test_df.copy() + dataset_shard_size = 1 + train_df_with_cat["categorical_column"] = pd.Series( + (["A", "B"] * math.ceil(len(train_df_with_cat) / dataset_shard_size))[ + : len(train_df_with_cat) + ] + ).astype("category") + test_df_with_cat["categorical_column"] = pd.Series( + (["A", "B"] * math.ceil(len(test_df_with_cat) / dataset_shard_size))[ + : len(test_df_with_cat) + ] + ).astype("category") + + scale_config = ScalingConfig(num_workers=0) + train_dataset = ray.data.from_pandas(train_df_with_cat) + valid_dataset = ray.data.from_pandas(test_df_with_cat) + trainer = LightGBMTrainer( + train_loop_per_worker=lambda: lightgbm_train_fn_per_worker( + config={}, + label_column="target", + dataset_keys={TRAIN_DATASET_KEY, "valid"}, + ), + train_loop_config={ + "objective": "binary", + "metric": ["binary_logloss", "binary_error"], + }, + scaling_config=scale_config, + datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset}, + ) + result = trainer.fit() + checkpoint = result.checkpoint + assert checkpoint is not None + + 
+@pytest.mark.parametrize("datasource", ["dataloader", "datamodule"]) +def test_lightning_trainer_local_mode(ray_start_6_cpus, datasource): + + num_epochs = 1 + batch_size = 8 + dataset_size = 256 + dataset_shard_size = 1 + strategy_name = "ddp" + accelerator = "cpu" + + strategy_map = {"ddp": RayDDPStrategy(), "fsdp": RayFSDPStrategy()} + + def train_loop(): + model = LinearModule(input_dim=32, output_dim=4, strategy=strategy_name) + + strategy = strategy_map[strategy_name] + + trainer = pl.Trainer( + max_epochs=num_epochs, + devices="auto", + accelerator=accelerator, + strategy=strategy, + plugins=[RayLightningEnvironment()], + callbacks=[LightningRayTrainReportCallback()], + ) + + datamodule = DummyDataModule(batch_size, dataset_size) + + if datasource == "dataloader": + trainer.fit( + model, + train_dataloaders=datamodule.train_dataloader(), + val_dataloaders=datamodule.val_dataloader(), + ) + if datasource == "datamodule": + trainer.fit(model, datamodule=datamodule) + + trainer = TorchTrainer( + train_loop_per_worker=train_loop, + scaling_config=ScalingConfig(num_workers=0, use_gpu=(accelerator == "gpu")), + ) + + results = trainer.fit() + assert results.metrics["epoch"] == num_epochs - 1 + assert ( + results.metrics["step"] + == num_epochs * dataset_size / dataset_shard_size / batch_size + ) + assert "loss" in results.metrics + assert "val_loss" in results.metrics + + +@pytest.mark.skipif( + sys.version_info >= (3, 12), + reason="Tensorflow is not installed for Python 3.12 because of keras compatibility.", +) +def test_tensorflow_linear_local_mode(ray_start_4_cpus): + """Also tests air Keras callback.""" + epochs = 1 + + def train_func(config): + result = tensorflow_linear_train_func(config) + assert len(result) == epochs + + train_loop_config = { + "lr": 1e-3, + "batch_size": 32, + "epochs": epochs, + } + scaling_config = ScalingConfig(num_workers=0) + dataset = ray.data.read_csv("s3://anonymous@air-example-data/regression.csv") + columns_to_concatenate = 
[f"x{i:03}" for i in range(100)] + preprocessor = Concatenator(columns=columns_to_concatenate, output_column_name="x") + dataset = preprocessor.transform(dataset) + + trainer = TensorflowTrainer( + train_loop_per_worker=train_func, + train_loop_config=train_loop_config, + scaling_config=scaling_config, + datasets={TRAIN_DATASET_KEY: dataset}, + ) + result = trainer.fit() + assert not result.error + assert result.checkpoint + + +def test_torch_trainer_local_mode(ray_start_6_cpus): + def train_func(config): + result = linear_train_func(config) + assert len(result) == epochs + assert result[-1]["loss"] < result[0]["loss"] + + epochs = 3 + scaling_config = ScalingConfig(num_workers=0) + config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs} + trainer = TorchTrainer( + train_loop_per_worker=train_func, + train_loop_config=config, + scaling_config=scaling_config, + ) + result = trainer.fit() + assert result.error is None + assert result.metrics is not None + assert result.metrics["loss"] is not None + assert result.checkpoint + + +HF_BATCH_SIZE_PER_WORKER = 2 +HF_MODEL_NAME = "hf-internal-testing/tiny-random-BloomForCausalLM" +HF_MAX_EPOCHS = 1 +HF_TRAIN_DATASET_SIZE = 16 + + +@pytest.mark.parametrize("use_ray_data", [False, True]) +def test_e2e_hf_local_mode(ray_start_4_cpus, use_ray_data): + def get_transformers_configurations(): + """Get configurations with dynamic step calculations based on number of workers.""" + steps_per_epoch = HF_TRAIN_DATASET_SIZE // HF_BATCH_SIZE_PER_WORKER + return { + "epoch_gpu": { + "evaluation_strategy": "epoch", + "save_strategy": "epoch", + "logging_strategy": "epoch", + "eval_steps": None, + "save_steps": None, + "logging_steps": None, + "no_cuda": False, + }, + "steps_gpu": { + "evaluation_strategy": "steps", + "save_strategy": "steps", + "logging_strategy": "steps", + "eval_steps": steps_per_epoch, + "save_steps": steps_per_epoch * 2, + "logging_steps": 1, + "no_cuda": False, + }, + "steps_cpu": { + 
"evaluation_strategy": "steps", + "save_strategy": "steps", + "logging_strategy": "steps", + "eval_steps": steps_per_epoch, + "save_steps": steps_per_epoch, + "logging_steps": 1, + "no_cuda": True, + }, + "steps_cpu_local": { + "evaluation_strategy": "steps", + "save_strategy": "steps", + "logging_strategy": "steps", + "eval_steps": steps_per_epoch, + "save_steps": steps_per_epoch, + "logging_steps": 1, + "no_cuda": True, + }, + } + + config_id = "steps_cpu_local" + num_workers = 0 + + def train_func(config): + # Datasets + if config["use_ray_data"]: + train_ds_shard = ray.train.get_dataset_shard("train") + eval_ds_shard = ray.train.get_dataset_shard("eval") + + train_dataset = train_ds_shard.iter_torch_batches( + batch_size=HF_BATCH_SIZE_PER_WORKER + ) + eval_dataset = eval_ds_shard.iter_torch_batches( + batch_size=HF_BATCH_SIZE_PER_WORKER + ) + else: + train_df = pd.read_json(train_data) + validation_df = pd.read_json(validation_data) + + train_dataset = Dataset.from_pandas(train_df) + eval_dataset = Dataset.from_pandas(validation_df) + + # Model + model_config = AutoConfig.from_pretrained(HF_MODEL_NAME) + model = AutoModelForCausalLM.from_config(model_config) + + # HF Transformers Trainer + training_args = TrainingArguments( + f"{HF_MODEL_NAME}-wikitext2", + evaluation_strategy=config["evaluation_strategy"], + logging_strategy=config["logging_strategy"], + save_strategy=config["save_strategy"], + eval_steps=config["eval_steps"], + save_steps=config["save_steps"], + logging_steps=config["logging_steps"], + num_train_epochs=config.get("num_train_epochs", HF_MAX_EPOCHS), + max_steps=config.get("max_steps", -1), + learning_rate=config.get("learning_rate", 2e-5), + per_device_train_batch_size=HF_BATCH_SIZE_PER_WORKER, + per_device_eval_batch_size=HF_BATCH_SIZE_PER_WORKER, + weight_decay=0.01, + disable_tqdm=True, + no_cuda=config["no_cuda"], + report_to="none", + ) + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + 
eval_dataset=eval_dataset, + ) + + # Report to Ray Train + trainer.add_callback(HuggingFaceRayTrainReportCallback()) + trainer = prepare_trainer(trainer) + + # Start Training + trainer.train() + + configurations = get_transformers_configurations() + train_loop_config = configurations[config_id] + + # Calculate the num of Ray training iterations + max_steps = HF_MAX_EPOCHS * HF_TRAIN_DATASET_SIZE // HF_BATCH_SIZE_PER_WORKER + + train_loop_config["use_ray_data"] = use_ray_data + + datasets = None + if use_ray_data: + # Must specify `max_steps` for Iterable Dataset + train_loop_config["max_steps"] = max_steps + + train_df = pd.read_json(train_data) + validation_df = pd.read_json(validation_data) + + ray_train_ds = ray.data.from_pandas(train_df) + ray_eval_ds = ray.data.from_pandas(validation_df) + datasets = {"train": ray_train_ds, "eval": ray_eval_ds} + else: + # Specify `num_train_epochs` for Map-style Dataset + train_loop_config["num_train_epochs"] = HF_MAX_EPOCHS + + use_gpu = not train_loop_config["no_cuda"] + + trainer = TorchTrainer( + train_func, + train_loop_config=train_loop_config, + scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu), + datasets=datasets, + ) + result = trainer.fit() + + assert result.metrics["step"] == max_steps + assert "eval_loss" in result.metrics + if not use_ray_data: + assert result.metrics["epoch"] == HF_MAX_EPOCHS + + +def test_xgboost_trainer_local_mode(ray_start_4_cpus): + def xgboost_train_fn_per_worker(): + label_column = "target" + dataset_keys = {TRAIN_DATASET_KEY, "valid"} + checkpoint = ray.train.get_checkpoint() + starting_model = None + remaining_iters = 10 + if checkpoint: + starting_model = XGBoostRayTrainReportCallback.get_model(checkpoint) + starting_iter = starting_model.num_boosted_rounds() + remaining_iters = remaining_iters - starting_iter + + train_ds_iter = ray.train.get_dataset_shard(TRAIN_DATASET_KEY) + train_df = train_ds_iter.materialize().to_pandas() + + eval_ds_iters = { + k: 
ray.train.get_dataset_shard(k) + for k in dataset_keys + if k != TRAIN_DATASET_KEY + } + eval_dfs = {k: d.materialize().to_pandas() for k, d in eval_ds_iters.items()} + + train_X, train_y = train_df.drop(label_column, axis=1), train_df[label_column] + dtrain = xgboost.DMatrix(train_X, label=train_y) + + # NOTE: Include the training dataset in the evaluation datasets. + # This allows `train-*` metrics to be calculated and reported. + evals = [(dtrain, TRAIN_DATASET_KEY)] + + for eval_name, eval_df in eval_dfs.items(): + eval_X, eval_y = eval_df.drop(label_column, axis=1), eval_df[label_column] + evals.append((xgboost.DMatrix(eval_X, label=eval_y), eval_name)) + + evals_result = {} + xgboost.train( + {}, + dtrain=dtrain, + evals=evals, + evals_result=evals_result, + num_boost_round=remaining_iters, + xgb_model=starting_model, + ) + + data_raw = load_breast_cancer() + dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"]) + dataset_df["target"] = data_raw["target"] + train_df, test_df = train_test_split(dataset_df, test_size=0.3) + + train_dataset = ray.data.from_pandas(train_df) + valid_dataset = ray.data.from_pandas(test_df) + scale_config = ScalingConfig(num_workers=0) + trainer = XGBoostTrainer( + train_loop_per_worker=xgboost_train_fn_per_worker, + train_loop_config={ + "tree_method": "approx", + "objective": "binary:logistic", + "eval_metric": ["logloss", "error"], + }, + scaling_config=scale_config, + datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset}, + ) + result = trainer.fit() + with pytest.raises(DeprecationWarning): + XGBoostTrainer.get_model(result.checkpoint) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/train/v2/tests/test_persistence.py b/python/ray/train/v2/tests/test_persistence.py index cc231722a651..2649c5ce2ad7 100644 --- a/python/ray/train/v2/tests/test_persistence.py +++ b/python/ray/train/v2/tests/test_persistence.py @@ -14,6 +14,7 @@ import ray 
import ray.train +import ray.train.collective from ray._common.test_utils import simulate_s3_bucket from ray.air._internal.uri_utils import URI from ray.train import ( @@ -24,7 +25,6 @@ ScalingConfig, ) from ray.train.v2._internal.constants import HEALTH_CHECK_INTERVAL_S_ENV_VAR -from ray.train.v2._internal.execution.context import get_train_context from ray.train.v2._internal.execution.storage import _download_from_fs_path from ray.train.v2.api.data_parallel_trainer import DataParallelTrainer @@ -174,6 +174,10 @@ def train_fn(config): print("Loaded back state from checkpoint:", state) start = state["iter"] + 1 + assert len(ray.train.get_all_reported_checkpoints()) == min( + start, config.get("num_to_keep", float("inf")) + ) + for i in range(start, config.get("num_iterations", 5)): time.sleep(config.get("time_per_iter", 0.25)) @@ -212,21 +216,12 @@ def train_fn(config): # which will cause the test assertions to fail. # This should be fixed by forcing a queue flush on all workers before # executing the failure decisions. - # Note: this `get_train_context` is not a public API. - # TODO (hpguo): Think about expose `get_synchronization_actor` as a - # public API, which will be a useful collection of communication utils. 
- train_context = get_train_context() - sync_actor = train_context.get_synchronization_actor() - ray.get( - sync_actor.broadcast_from_rank_zero.remote( - world_rank=train_context.get_world_rank(), - world_size=train_context.get_world_size(), - data="barrier", - caller_method_name="caller_method_name", - ) - ) + ray.train.collective.barrier() if i in config.get("fail_iters", []): + assert len(ray.train.get_all_reported_checkpoints()) == min( + i + 1, config.get("num_to_keep", float("inf")) + ) raise RuntimeError(f"Failing on iter={i}!!") @@ -317,6 +312,10 @@ def test_trainer( exp_name = f"trainer_persistence_test-{uuid.uuid4().hex}" no_checkpoint_ranks = [0] + if checkpoint_config.num_to_keep: + num_to_keep = checkpoint_config.num_to_keep + else: + num_to_keep = float("inf") with _resolve_storage_type(storage_path_type, tmp_path) as ( storage_path, @@ -337,6 +336,7 @@ def test_trainer( # Test that global rank 0 is not required to checkpoint. "no_checkpoint_ranks": no_checkpoint_ranks, "time_per_iter": time_between_reports, + "num_to_keep": num_to_keep, }, scaling_config=ScalingConfig(num_workers=TestConstants.NUM_WORKERS), run_config=run_config, @@ -353,6 +353,7 @@ def test_trainer( # Test that global rank 0 is not required to checkpoint. 
"no_checkpoint_ranks": no_checkpoint_ranks, "time_per_iter": time_between_reports, + "num_to_keep": num_to_keep, }, scaling_config=ScalingConfig(num_workers=TestConstants.NUM_WORKERS), run_config=run_config, diff --git a/python/ray/train/v2/tests/test_thread_runner.py b/python/ray/train/v2/tests/test_thread_runner.py index 788150a8af14..2a5038e2b7d8 100644 --- a/python/ray/train/v2/tests/test_thread_runner.py +++ b/python/ray/train/v2/tests/test_thread_runner.py @@ -1,14 +1,32 @@ +import threading import time import pytest from ray.train.v2._internal.exceptions import UserExceptionWithTraceback from ray.train.v2._internal.execution.worker_group.thread_runner import ThreadRunner +from ray.train.v2._internal.util import construct_user_exception_with_traceback + + +class ThreadRunnerWithJoin(ThreadRunner): + def join(self): + """Join both the target thread and the monitor thread. + + Do not include this with the main ThreadRunner class because: + * It is tricky to avoid hangs when nested threads raise errors + * We don't need to join in that case since the controller will see the + error and shut down the worker + """ + if self._monitor_thread is None or self._thread is None: + raise RuntimeError("Must call `run` before trying to `join`.") + self._monitor_thread.join() + self._thread.join() + return self.get_return_value() @pytest.fixture() def thread_runner(): - return ThreadRunner() + return ThreadRunnerWithJoin() def test_successful_return(thread_runner): @@ -28,11 +46,47 @@ def target(): def test_error(thread_runner): """Checks that an exception can be captured from the target function.""" + def wrapped_train_func(): + def train_fn_with_final_checkpoint_flush(): + def train_func(): + raise ValueError + + train_func() + + train_fn_with_final_checkpoint_flush() + + thread_runner.run(wrapped_train_func) + assert not thread_runner.join() + + assert thread_runner.get_return_value() is None + assert not thread_runner.is_running() + + error = thread_runner.get_error() + 
+ assert isinstance(error, UserExceptionWithTraceback) + assert isinstance(error._base_exc, ValueError) + print(error._traceback_str) + assert "_run_target" not in error._traceback_str + assert "wrapped_train_func" not in error._traceback_str + assert "train_fn_with_final_checkpoint_flush" not in error._traceback_str + assert "train_func" in error._traceback_str + + +def test_nested_thread_error(thread_runner): + """Checks that we capture exceptions from threads kicked off by target function.""" + def target(): def nested(): - raise ValueError + try: + raise ValueError + except ValueError as e: + thread_runner.get_exception_queue().put( + construct_user_exception_with_traceback(e) + ) - nested() + thread = threading.Thread(target=nested) + thread.start() + thread.join() thread_runner.run(target) assert not thread_runner.join() @@ -44,8 +98,6 @@ def nested(): assert isinstance(error, UserExceptionWithTraceback) assert isinstance(error._base_exc, ValueError) - print(error._traceback_str) - assert "_run_target" not in error._traceback_str def test_running(thread_runner, tmp_path): diff --git a/python/ray/train/v2/tests/test_torch_transformers_train.py b/python/ray/train/v2/tests/test_torch_transformers_train.py index 1484aef66893..fad84fe4f693 100644 --- a/python/ray/train/v2/tests/test_torch_transformers_train.py +++ b/python/ray/train/v2/tests/test_torch_transformers_train.py @@ -303,6 +303,108 @@ def train_func(config): assert "eval_loss" in result.metrics +@pytest.mark.parametrize("config_id", ["steps_cpu"]) +def test_e2e_dict_eval_ray_data(ray_start_6_cpus_2_gpus, config_id): + def train_func(config): + # Datasets + if config["use_ray_data"]: + train_ds_shard = ray.train.get_dataset_shard("train") + eval_ds_shard_1 = ray.train.get_dataset_shard("eval_1") + eval_ds_shard_2 = ray.train.get_dataset_shard("eval_2") + + train_dataset = train_ds_shard.iter_torch_batches( + batch_size=BATCH_SIZE_PER_WORKER + ) + eval_dataset = { + "eval_1": 
eval_ds_shard_1.iter_torch_batches( + batch_size=BATCH_SIZE_PER_WORKER + ), + "eval_2": eval_ds_shard_2.iter_torch_batches( + batch_size=BATCH_SIZE_PER_WORKER + ), + } + else: + train_df = pd.read_json(train_data) + validation_df = pd.read_json(validation_data) + + train_dataset = Dataset.from_pandas(train_df) + eval_dataset = Dataset.from_pandas(validation_df) + + # Model + model_config = AutoConfig.from_pretrained(MODEL_NAME) + model = AutoModelForCausalLM.from_config(model_config) + + # HF Transformers Trainer + training_args = TrainingArguments( + f"{MODEL_NAME}-wikitext2", + evaluation_strategy=config["evaluation_strategy"], + logging_strategy=config["logging_strategy"], + save_strategy=config["save_strategy"], + eval_steps=config["eval_steps"], + save_steps=config["save_steps"], + logging_steps=config["logging_steps"], + num_train_epochs=config.get("num_train_epochs", MAX_EPOCHS), + max_steps=config.get("max_steps", -1), + learning_rate=config.get("learning_rate", 2e-5), + per_device_train_batch_size=BATCH_SIZE_PER_WORKER, + per_device_eval_batch_size=BATCH_SIZE_PER_WORKER, + weight_decay=0.01, + disable_tqdm=True, + no_cuda=config["no_cuda"], + report_to="none", + ) + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + ) + + # Report to Ray Train + trainer.add_callback(RayTrainReportCallback()) + trainer = prepare_trainer(trainer) + + # Start Training + trainer.train() + + train_loop_config = CONFIGURATIONS[config_id] + + # Must specify `max_steps` for Iterable Dataset + train_loop_config["use_ray_data"] = True + train_loop_config["max_steps"] = MAX_STEPS + + # Calculate the num of Ray training iterations + num_iterations = MAX_STEPS // train_loop_config["save_steps"] + + train_df = pd.read_json(train_data) + validation_df = pd.read_json(validation_data) + + ray_train_ds = ray.data.from_pandas(train_df) + ray_eval_ds_1 = ray.data.from_pandas(validation_df) + ray_eval_ds_2 = 
ray.data.from_pandas(validation_df) + + use_gpu = not train_loop_config["no_cuda"] + + trainer = TorchTrainer( + train_func, + train_loop_config=train_loop_config, + scaling_config=ScalingConfig(num_workers=NUM_WORKERS, use_gpu=use_gpu), + datasets={ + "train": ray_train_ds, + "eval_1": ray_eval_ds_1, + "eval_2": ray_eval_ds_2, + }, + ) + result = trainer.fit() + + assert result.metrics["step"] == MAX_STEPS + assert result.checkpoint + assert isinstance(result.checkpoint, Checkpoint) + assert len(result.best_checkpoints) == num_iterations + assert "eval_eval_1_loss" in result.metrics + assert "eval_eval_2_loss" in result.metrics + + if __name__ == "__main__": import sys diff --git a/python/ray/train/v2/tests/test_worker.py b/python/ray/train/v2/tests/test_worker.py new file mode 100644 index 000000000000..75fdf07c3ea0 --- /dev/null +++ b/python/ray/train/v2/tests/test_worker.py @@ -0,0 +1,76 @@ +import queue +import time +from unittest.mock import create_autospec + +import pytest + +from ray.actor import ActorHandle +from ray.train.v2._internal.constants import ENABLE_WORKER_STRUCTURED_LOGGING_ENV_VAR +from ray.train.v2._internal.execution.context import ( + DistributedContext, + TrainRunContext, + get_train_context, +) +from ray.train.v2._internal.execution.storage import StorageContext +from ray.train.v2._internal.execution.worker_group.worker import RayTrainWorker +from ray.train.v2._internal.util import ObjectRefWrapper + + +@pytest.mark.parametrize("created_nested_threads", [True, False]) +def test_worker_finished_after_all_threads_finish(monkeypatch, created_nested_threads): + # Disable this to avoid TypeError from logging MagicMock + monkeypatch.setenv(ENABLE_WORKER_STRUCTURED_LOGGING_ENV_VAR, False) + + # Initialize RayTrainWorker state + worker = RayTrainWorker() + worker.init_train_context( + train_run_context=create_autospec(TrainRunContext, instance=True), + distributed_context=DistributedContext( + world_rank=0, + world_size=1, + local_rank=0, + 
local_world_size=1, + node_rank=0, + ), + synchronization_actor=create_autospec(ActorHandle, instance=True), + storage_context=create_autospec(StorageContext, instance=True), + worker_callbacks=[], + controller_actor=create_autospec(ActorHandle, instance=True), + ) + global_queue = queue.Queue() + + def train_fn(): + tc = get_train_context() + + def target(): + # Intentionally sleep longer than poll interval to test that we wait + # for nested threads to finish + time.sleep(0.1) + global_queue.put("nested") + + if created_nested_threads: + tc.checkpoint_upload_threadpool.submit(target) + else: + global_queue.put("main") + + # Run train fn and wait for it to finish + train_fn_ref = create_autospec(ObjectRefWrapper, instance=True) + train_fn_ref.get.return_value = train_fn + worker.run_train_fn(train_fn_ref) + while worker.poll_status().running: + time.sleep(0.01) + + # Verify queue contents + queue_contents = [] + while not global_queue.empty(): + queue_contents.append(global_queue.get()) + if created_nested_threads: + assert queue_contents == ["nested"] + else: + assert queue_contents == ["main"] + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/train/v2/tests/test_worker_group.py b/python/ray/train/v2/tests/test_worker_group.py index 4146cdd763aa..8bf83334fcd7 100644 --- a/python/ray/train/v2/tests/test_worker_group.py +++ b/python/ray/train/v2/tests/test_worker_group.py @@ -28,10 +28,13 @@ Worker, WorkerGroup, WorkerGroupContext, + WorkerGroupState, ) from ray.train.v2.api.config import RunConfig from ray.train.v2.tests.util import DummyObjectRefWrapper, create_dummy_run_context +pytestmark = pytest.mark.usefixtures("mock_runtime_context") + @pytest.fixture(autouse=True, scope="module") def ray_start_4_cpus(): @@ -495,7 +498,21 @@ def after_worker_group_poll_status(self, worker_group_status): assert hooks.shutdown_hook_called -def test_worker_group_abort(): +def test_worker_log_file_paths(): + 
"""Test that log file paths are correctly assigned to workers.""" + wg = _default_inactive_worker_group() + wg._start() + + # Check that all workers have log file paths assigned + workers = wg.get_workers() + for worker in workers: + assert worker.log_file_path is not None + assert "ray-train-app-worker" in worker.log_file_path + + wg.shutdown() + + +def test_worker_group_abort(monkeypatch): class AssertCallback(WorkerGroupCallback): def __init__(self): self.abort_hook_called = False @@ -507,21 +524,26 @@ def before_worker_group_abort(self, worker_group_context): wg = _default_inactive_worker_group(callbacks=[hooks]) wg._start() - wg.abort() - assert hooks.abort_hook_called - wg.shutdown() + # Track shutdown calls without preventing actual cleanup + shutdown_call_count = 0 + original_shutdown = WorkerGroupState.shutdown -def test_worker_log_file_paths(): - """Test that log file paths are correctly assigned to workers.""" - wg = _default_inactive_worker_group() - wg._start() + def track_shutdown_calls(self): + nonlocal shutdown_call_count + shutdown_call_count += 1 + return original_shutdown(self) - # Check that all workers have log file paths assigned - workers = wg.get_workers() - for worker in workers: - assert worker.log_file_path is not None - assert "ray-train-app-worker" in worker.log_file_path + monkeypatch.setattr(WorkerGroupState, "shutdown", track_shutdown_calls) + + wg.abort() + assert ( + shutdown_call_count == 1 + ), f"Expected shutdown to be called once, but was called {shutdown_call_count} times" + assert hooks.abort_hook_called + + # Bypass _assert_active method, allowing for shutdown + monkeypatch.setattr(wg, "_assert_active", lambda: None) wg.shutdown() diff --git a/python/ray/train/v2/tests/test_worker_group_poll_status.py b/python/ray/train/v2/tests/test_worker_group_poll_status.py new file mode 100644 index 000000000000..03c394a961a6 --- /dev/null +++ b/python/ray/train/v2/tests/test_worker_group_poll_status.py @@ -0,0 +1,92 @@ +import pytest + 
+from ray.train.v2._internal.execution.worker_group.poll import ( + ERR_CHAR_LIMIT, + WorkerGroupPollStatus, + WorkerStatus, + _normalize_error_string, +) + + +def test_get_error_string_basic(): + """ + Simulate four workers, two with the same error, one with a different error, + and one without an error. + """ + + statuses = { + 0: WorkerStatus(running=False, error=ValueError("An error")), + 1: WorkerStatus(running=False, error=None), + 2: WorkerStatus(running=False, error=RuntimeError("Different error")), + 3: WorkerStatus(running=False, error=ValueError("An error")), + } + poll_status = WorkerGroupPollStatus(worker_statuses=statuses) + error_str = poll_status.get_error_string() + + expected_error_str = ( + "[Rank 0,3 Error Snippet]:\nAn error\n[Rank 2 Error Snippet]:\nDifferent error" + ) + assert error_str == expected_error_str + + +def test_get_error_string_with_numbers(): + """ + Simulate workers with similar errors that differ only by numbers. + These should be grouped together. + """ + statuses = { + 0: WorkerStatus( + running=False, error=ValueError("Error parsing object at 0x7f8b12345678") + ), + 1: WorkerStatus( + running=False, error=ValueError("Error parsing object at 0x7f8b12345679") + ), + } + poll_status = WorkerGroupPollStatus(worker_statuses=statuses) + error_str = poll_status.get_error_string() + + assert ( + error_str == "[Rank 0,1 Error Snippet]:\nError parsing object at 0x7f8b12345678" + ) + + +def test_get_error_string_long_error(): + """ + Simulate two workers with identical long error string. + """ + long_error_str = "test string" * 200 + statuses = { + 0: WorkerStatus(running=False, error=long_error_str), + 1: WorkerStatus(running=False, error=long_error_str), + } + poll_status = WorkerGroupPollStatus(worker_statuses=statuses) + error_str = poll_status.get_error_string() + + expected_error_str = ( + "[Rank 0,1 Error Snippet]:\n" + + long_error_str[: ERR_CHAR_LIMIT // 2] + + "...\n... (Output truncated. 
See individual worker logs for full details) ...\n" + + long_error_str[len(long_error_str) - ERR_CHAR_LIMIT // 2 :] + ) + assert error_str == expected_error_str + + +def test_normalize_error_string(): + """Test that _normalize_error_string properly handles all types of numbers.""" + error = """Traceback (most recent call last): +File "/home/ray/default/train_benchmark.py", line 35, in train_fn_per_worker +File "/tmp/ray/session_2025-08-07_23-49-55_617067_2585/runtime_resources/working_dir_files/_ray_pkg_5abd79ca51ba0ed4/runner.py", line 282, in run""" + result = _normalize_error_string(error) + + assert ( + result + == """Traceback (most recent call last): +File "/home/ray/default/train_benchmark.py", line , in train_fn_per_worker +File "/tmp/ray/session_--_--__/runtime_resources/working_dir_files/_ray_pkg_abdcabaed/runner.py", line , in run""" + ) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/train/v2/torch/train_loop_utils.py b/python/ray/train/v2/torch/train_loop_utils.py index af546f83014f..dfb618dc6b6a 100644 --- a/python/ray/train/v2/torch/train_loop_utils.py +++ b/python/ray/train/v2/torch/train_loop_utils.py @@ -1,7 +1,7 @@ import logging import os import random -from typing import Any, Callable, Dict, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import torch @@ -17,7 +17,11 @@ import ray.train.torch from ray._common.usage.usage_lib import TagKey, record_extra_usage_tag -from ray.train.torch.train_loop_utils import _WrappedDataLoader +from ray.train.torch.train_loop_utils import ( + _WrappedDataLoader, + get_devices as get_devices_distributed, +) +from ray.train.v2._internal.execution.train_fn_utils import get_train_fn_utils from ray.util.annotations import Deprecated, PublicAPI logger = logging.getLogger(__name__) @@ -34,6 +38,22 @@ ) +def get_device() -> torch.device: + return get_devices()[0] + + +def get_devices() -> 
List[torch.device]: + if get_train_fn_utils().is_distributed(): + return get_devices_distributed() + else: + # Local mode, we defer to torch.cuda + # TODO(xgui): Use `ScalingConfig.use_gpu` instead + if torch.cuda.is_available(): + return [torch.device(f"cuda:{torch.cuda.current_device()}")] + else: + return [torch.device("cpu")] + + def prepare_model( model: torch.nn.Module, move_to_device: Union[bool, torch.device] = True, diff --git a/python/ray/train/v2/xgboost/__init__.py b/python/ray/train/v2/xgboost/__init__.py index e69de29bb2d1..b4e10280aceb 100644 --- a/python/ray/train/v2/xgboost/__init__.py +++ b/python/ray/train/v2/xgboost/__init__.py @@ -0,0 +1,2 @@ +# This is a workaround to avoid a circular import. +import ray.train.xgboost as ray_train_xgboost # noqa: F401 diff --git a/python/ray/train/v2/xgboost/config.py b/python/ray/train/v2/xgboost/config.py new file mode 100644 index 000000000000..d2c04c99c137 --- /dev/null +++ b/python/ray/train/v2/xgboost/config.py @@ -0,0 +1,21 @@ +from contextlib import contextmanager + +from ray.train.v2._internal.execution.train_fn_utils import get_train_fn_utils +from ray.train.xgboost.config import XGBoostConfig as XGBoostConfigV1 + + +class XGBoostConfig(XGBoostConfigV1): + @property + def train_func_context(self): + distributed_context = super(XGBoostConfig, self).train_func_context + + @contextmanager + def collective_communication_context(): + # The distributed_context is only needed in distributed mode + if get_train_fn_utils().is_distributed(): + with distributed_context(): + yield + else: + yield + + return collective_communication_context diff --git a/python/ray/train/xgboost/__init__.py b/python/ray/train/xgboost/__init__.py index aa2d1c88d11b..447515b95b44 100644 --- a/python/ray/train/xgboost/__init__.py +++ b/python/ray/train/xgboost/__init__.py @@ -6,6 +6,7 @@ from ray.train.xgboost.xgboost_trainer import XGBoostTrainer if is_v2_enabled(): + from ray.train.v2.xgboost.config import XGBoostConfig # noqa: 
F811 from ray.train.v2.xgboost.xgboost_trainer import XGBoostTrainer # noqa: F811 __all__ = [ diff --git a/python/ray/tune/BUILD b/python/ray/tune/BUILD.bazel similarity index 100% rename from python/ray/tune/BUILD rename to python/ray/tune/BUILD.bazel diff --git a/python/ray/tune/integration/ray_train.py b/python/ray/tune/integration/ray_train.py index f8dfc60d3abf..89302a65869a 100644 --- a/python/ray/tune/integration/ray_train.py +++ b/python/ray/tune/integration/ray_train.py @@ -1,9 +1,10 @@ from typing import Any, Dict, List, Optional -import ray.tune from ray.train import Checkpoint as RayTrainCheckpoint +from ray.train._internal.session import get_session from ray.train.v2._internal.execution.context import TrainRunContext from ray.train.v2.api.callback import UserCallback +from ray.tune.trainable.trainable_fn_utils import _in_tune_session from ray.util.annotations import DeveloperAPI CHECKPOINT_PATH_KEY = "checkpoint_path" @@ -13,6 +14,13 @@ class TuneReportCallback(UserCallback): """Propagate metrics and checkpoint paths from Ray Train workers to Ray Tune.""" + def __init__(self): + if not _in_tune_session(): + raise RuntimeError("TuneReportCallback must be used in a Tune session.") + self._training_actor_item_queue = ( + get_session()._get_or_create_inter_actor_queue() + ) + def after_report( self, run_context: TrainRunContext, @@ -29,4 +37,4 @@ def after_report( if checkpoint: metrics[CHECKPOINT_PATH_KEY] = checkpoint.path - ray.tune.report(metrics=metrics) + self._training_actor_item_queue.put(metrics) diff --git a/python/ray/tune/tests/test_commands.py b/python/ray/tune/tests/test_commands.py index f6615454cb07..b649c65ed59c 100644 --- a/python/ray/tune/tests/test_commands.py +++ b/python/ray/tune/tests/test_commands.py @@ -73,7 +73,7 @@ def train_fn(config): times += [time.time() - start] print("Average CLI time: ", sum(times) / len(times)) - assert sum(times) / len(times) < 2, "CLI is taking too long!" 
+ assert sum(times) / len(times) < 5, "CLI is taking too long!" @mock.patch( diff --git a/python/ray/tune/trainable/function_trainable.py b/python/ray/tune/trainable/function_trainable.py index 9dc9ff02cbfd..e7110275d2c0 100644 --- a/python/ray/tune/trainable/function_trainable.py +++ b/python/ray/tune/trainable/function_trainable.py @@ -16,7 +16,6 @@ init_session, shutdown_session, ) -from ray.train.v2._internal.constants import RUN_CONTROLLER_AS_ACTOR_ENV_VAR from ray.tune.execution.placement_groups import PlacementGroupFactory from ray.tune.result import DEFAULT_METRIC, RESULT_DUPLICATE, SHOULD_CHECKPOINT from ray.tune.trainable.trainable import Trainable @@ -65,17 +64,6 @@ def setup(self, config): ) self._last_training_result: Optional[_TrainingResult] = None - # NOTE: This environment variable is used to disable the - # spawning a new actor for Ray Train drivers being launched - # within Tune functions. - # There are 2 reasons for this: - # 1. Ray Tune already spawns an actor, so we can run the Ray Train - # driver directly in the same actor. - # 2. This allows `ray.tune.report` to be called within Ray Train driver - # callbacks, since it needs to be called on the same process as the - # Tune FunctionTrainable actor. 
- os.environ[RUN_CONTROLLER_AS_ACTOR_ENV_VAR] = "0" - def _trainable_func(self, config: Dict[str, Any]): """Subclasses can override this to set the trainable func.""" diff --git a/python/ray/util/BUILD b/python/ray/util/BUILD.bazel similarity index 100% rename from python/ray/util/BUILD rename to python/ray/util/BUILD.bazel diff --git a/python/ray/util/__init__.py b/python/ray/util/__init__.py index bc8b6eae909a..19d58a0dd318 100644 --- a/python/ray/util/__init__.py +++ b/python/ray/util/__init__.py @@ -1,18 +1,16 @@ from typing import List import ray -from ray._private.client_mode_hook import client_mode_hook from ray._private.auto_init_hook import wrap_auto_init +from ray._private.client_mode_hook import client_mode_hook from ray._private.services import get_node_instance_id, get_node_ip_address -from ray.util import iter -from ray.util import rpdb as pdb -from ray.util import debugpy as ray_debugpy +from ray.util import accelerators, debugpy as ray_debugpy, iter, rpdb as pdb from ray.util.actor_pool import ActorPool -from ray.util import accelerators from ray.util.annotations import PublicAPI from ray.util.check_serialize import inspect_serializability from ray.util.client_connect import connect, disconnect from ray.util.debug import disable_log_once_globally, enable_periodic_logging, log_once +from ray.util.helpers import as_completed, map_unordered from ray.util.placement_group import ( get_current_placement_group, get_placement_group, @@ -52,6 +50,7 @@ def list_named_actors(all_namespaces: bool = False) -> List[str]: __all__ = [ "accelerators", "ActorPool", + "as_completed", "disable_log_once_globally", "enable_periodic_logging", "iter", @@ -63,6 +62,7 @@ def list_named_actors(all_namespaces: bool = False) -> List[str]: "get_current_placement_group", "get_node_instance_id", "get_node_ip_address", + "map_unordered", "remove_placement_group", "ray_debugpy", "inspect_serializability", diff --git a/python/ray/util/accelerators/__init__.py 
b/python/ray/util/accelerators/__init__.py index 62888bc9de51..6c757121207b 100644 --- a/python/ray/util/accelerators/__init__.py +++ b/python/ray/util/accelerators/__init__.py @@ -2,32 +2,32 @@ from ray.util.accelerators import tpu from ray.util.accelerators.accelerators import ( - NVIDIA_TESLA_V100, - NVIDIA_TESLA_P100, - NVIDIA_TESLA_T4, - NVIDIA_TESLA_P4, - NVIDIA_TESLA_K80, - NVIDIA_TESLA_A10G, - NVIDIA_L4, - NVIDIA_A100, - NVIDIA_H100, - INTEL_MAX_1550, - INTEL_MAX_1100, - INTEL_GAUDI, AMD_INSTINCT_MI100, AMD_INSTINCT_MI210, AMD_INSTINCT_MI250, - AMD_INSTINCT_MI250x, - AMD_INSTINCT_MI300x, - AMD_RADEON_R9_200_HD_7900, AMD_RADEON_HD_7900, + AMD_RADEON_R9_200_HD_7900, AWS_NEURON_CORE, GOOGLE_TPU_V2, GOOGLE_TPU_V3, GOOGLE_TPU_V4, - GOOGLE_TPU_V5P, GOOGLE_TPU_V5LITEPOD, + GOOGLE_TPU_V5P, GOOGLE_TPU_V6E, + INTEL_GAUDI, + INTEL_MAX_1100, + INTEL_MAX_1550, + NVIDIA_A100, + NVIDIA_H100, + NVIDIA_L4, + NVIDIA_TESLA_A10G, + NVIDIA_TESLA_K80, + NVIDIA_TESLA_P4, + NVIDIA_TESLA_P100, + NVIDIA_TESLA_T4, + NVIDIA_TESLA_V100, + AMD_INSTINCT_MI250x, + AMD_INSTINCT_MI300x, ) __all__ = [ diff --git a/python/ray/util/accelerators/accelerators.py b/python/ray/util/accelerators/accelerators.py index aaa5b8f86f81..b68d0460b538 100644 --- a/python/ray/util/accelerators/accelerators.py +++ b/python/ray/util/accelerators/accelerators.py @@ -23,6 +23,8 @@ AMD_INSTINCT_MI300x_HF = "AMD-Instinct-MI300X-HF" AMD_INSTINCT_MI308x = "AMD-Instinct-MI308X" AMD_INSTINCT_MI325x = "AMD-Instinct-MI325X-OAM" +AMD_INSTINCT_MI350x = "AMD-Instinct-MI350X-OAM" +AMD_INSTINCT_MI355x = "AMD-Instinct-MI355X-OAM" AMD_RADEON_R9_200_HD_7900 = "AMD-Radeon-R9-200-HD-7900" AMD_RADEON_HD_7900 = "AMD-Radeon-HD-7900" AWS_NEURON_CORE = "aws-neuron-core" diff --git a/python/ray/util/accelerators/tpu.py b/python/ray/util/accelerators/tpu.py index 01dfbcf4a02f..ff31581d7ca5 100644 --- a/python/ray/util/accelerators/tpu.py +++ b/python/ray/util/accelerators/tpu.py @@ -1,4 +1,5 @@ from typing import Optional + from 
ray._private.accelerators import TPUAcceleratorManager from ray.util.annotations import PublicAPI diff --git a/python/ray/util/actor_group.py b/python/ray/util/actor_group.py index 53fe83285a72..5cd343f1b17d 100644 --- a/python/ray/util/actor_group.py +++ b/python/ray/util/actor_group.py @@ -1,12 +1,12 @@ +import logging import weakref from dataclasses import dataclass -import logging -from typing import List, TypeVar, Optional, Dict, Type, Tuple +from typing import Dict, List, Optional, Tuple, Type, TypeVar import ray +from ray._private.utils import get_ray_doc_version from ray.actor import ActorHandle from ray.util.annotations import Deprecated -from ray._private.utils import get_ray_doc_version T = TypeVar("T") ActorMetadata = TypeVar("ActorMetadata") diff --git a/python/ray/util/annotations.py b/python/ray/util/annotations.py index 206c02b36d26..a2e3fc664d55 100644 --- a/python/ray/util/annotations.py +++ b/python/ray/util/annotations.py @@ -1,9 +1,9 @@ -from enum import Enum -from typing import Optional import inspect import sys import warnings +from enum import Enum from functools import wraps +from typing import Optional class AnnotationType(Enum): diff --git a/python/ray/util/check_open_ports.py b/python/ray/util/check_open_ports.py index 29c9e03e4740..67f5e1fd87a5 100644 --- a/python/ray/util/check_open_ports.py +++ b/python/ray/util/check_open_ports.py @@ -3,19 +3,21 @@ See https://www.anyscale.com/blog/update-on-ray-cve-2023-48022-new-verification-tooling-available # noqa: E501 for more details. 
""" -from typing import List, Tuple +import json import subprocess -import click -import psutil import urllib -import json +from typing import List, Tuple + +import click import ray -from ray.util.annotations import PublicAPI from ray.autoscaler._private.cli_logger import add_click_logging_options, cli_logger from ray.autoscaler._private.constants import RAY_PROCESSES +from ray.util.annotations import PublicAPI from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy +import psutil + def _get_ray_ports() -> List[int]: unique_ports = set() diff --git a/python/ray/util/check_serialize.py b/python/ray/util/check_serialize.py index a9a8377b3a77..04e1c9633f26 100644 --- a/python/ray/util/check_serialize.py +++ b/python/ray/util/check_serialize.py @@ -3,9 +3,10 @@ from contextlib import contextmanager from typing import Any, Optional, Set, Tuple +import colorama + # Import ray first to use the bundled colorama import ray # noqa: F401 -import colorama import ray.cloudpickle as cp from ray.util.annotations import DeveloperAPI diff --git a/python/ray/util/client/__init__.py b/python/ray/util/client/__init__.py index e3d009f172a5..97f4bf2802bc 100644 --- a/python/ray/util/client/__init__.py +++ b/python/ray/util/client/__init__.py @@ -9,10 +9,9 @@ _explicitly_enable_client_mode, ) from ray._private.ray_logging import setup_logger +from ray._private.utils import check_version_info from ray.job_config import JobConfig from ray.util.annotations import DeveloperAPI -from ray._private.utils import check_version_info - logger = logging.getLogger(__name__) diff --git a/python/ray/util/client/client_app.py b/python/ray/util/client/client_app.py index ec0a37021298..612700147f4f 100644 --- a/python/ray/util/client/client_app.py +++ b/python/ray/util/client/client_app.py @@ -1,6 +1,7 @@ -from ray.util.client import ray from typing import Tuple +from ray.util.client import ray + ray.connect("localhost:50051") diff --git a/python/ray/util/client/client_pickler.py 
b/python/ray/util/client/client_pickler.py index 4971c0e11f96..39a025f1efac 100644 --- a/python/ray/util/client/client_pickler.py +++ b/python/ray/util/client/client_pickler.py @@ -22,25 +22,22 @@ """ import io - -from typing import NamedTuple -from typing import Any -from typing import Dict -from typing import Optional +import pickle # noqa: F401 +from typing import Any, Dict, NamedTuple, Optional import ray.cloudpickle as cloudpickle -from ray.util.client import RayAPIStub -from ray.util.client.common import ClientObjectRef -from ray.util.client.common import ClientActorHandle -from ray.util.client.common import ClientActorRef -from ray.util.client.common import ClientActorClass -from ray.util.client.common import ClientRemoteFunc -from ray.util.client.common import ClientRemoteMethod -from ray.util.client.common import OptionWrapper -from ray.util.client.common import InProgressSentinel import ray.core.generated.ray_client_pb2 as ray_client_pb2 - -import pickle # noqa: F401 +from ray.util.client import RayAPIStub +from ray.util.client.common import ( + ClientActorClass, + ClientActorHandle, + ClientActorRef, + ClientObjectRef, + ClientRemoteFunc, + ClientRemoteMethod, + InProgressSentinel, + OptionWrapper, +) # NOTE(barakmich): These PickleStubs are really close to diff --git a/python/ray/util/client/common.py b/python/ray/util/client/common.py index 7d027d2b0386..80435bc5c4fd 100644 --- a/python/ray/util/client/common.py +++ b/python/ray/util/client/common.py @@ -14,6 +14,7 @@ import ray._raylet as raylet import ray.core.generated.ray_client_pb2 as ray_client_pb2 import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc +from ray._common.signature import extract_signature, get_signature from ray._private import ray_constants from ray._private.inspect_util import ( is_class_method, @@ -21,7 +22,6 @@ is_function_or_method, is_static_method, ) -from ray._common.signature import extract_signature, get_signature from ray._private.utils import 
check_oversized_function from ray.util.client import ray from ray.util.client.options import validate_options diff --git a/python/ray/util/client/dataclient.py b/python/ray/util/client/dataclient.py index 5ce08117087d..6ef6f29c190b 100644 --- a/python/ray/util/client/dataclient.py +++ b/python/ray/util/client/dataclient.py @@ -1,15 +1,15 @@ """This file implements a threaded stream controller to abstract a data stream back to the ray clientserver. """ -import math import logging +import math import queue import threading import warnings -import grpc - from collections import OrderedDict -from typing import Any, Callable, Dict, TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union + +import grpc import ray.core.generated.ray_client_pb2 as ray_client_pb2 import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc diff --git a/python/ray/util/client/examples/run_tune.py b/python/ray/util/client/examples/run_tune.py index d7b76b778f4c..048c7de299be 100644 --- a/python/ray/util/client/examples/run_tune.py +++ b/python/ray/util/client/examples/run_tune.py @@ -1,6 +1,5 @@ -from ray.util.client import ray - from ray.tune import tune +from ray.util.client import ray ray.connect("localhost:50051") diff --git a/python/ray/util/client/logsclient.py b/python/ray/util/client/logsclient.py index b4d9a6af9928..34ad3f9f6ce9 100644 --- a/python/ray/util/client/logsclient.py +++ b/python/ray/util/client/logsclient.py @@ -1,18 +1,17 @@ """This file implements a threaded stream controller to return logs back from the ray clientserver. 
""" -import sys import logging import queue +import sys import threading import time -import grpc - from typing import TYPE_CHECKING +import grpc + import ray.core.generated.ray_client_pb2 as ray_client_pb2 import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc - from ray.util.debug import log_once if TYPE_CHECKING: diff --git a/python/ray/util/client/options.py b/python/ray/util/client/options.py index 57f00109af5d..bd0946fa1975 100644 --- a/python/ray/util/client/options.py +++ b/python/ray/util/client/options.py @@ -1,6 +1,4 @@ -from typing import Any -from typing import Dict -from typing import Optional +from typing import Any, Dict, Optional from ray._common import ray_option_utils from ray.util.placement_group import PlacementGroup, check_placement_group_index diff --git a/python/ray/util/client/ray_client_helpers.py b/python/ray/util/client/ray_client_helpers.py index f1ff0eab01e8..1554bd5e1c23 100644 --- a/python/ray/util/client/ray_client_helpers.py +++ b/python/ray/util/client/ray_client_helpers.py @@ -1,12 +1,12 @@ -from contextlib import contextmanager import time +from contextlib import contextmanager from typing import Any, Dict import ray as real_ray -from ray.job_config import JobConfig import ray.util.client.server.server as ray_client_server -from ray.util.client import ray from ray._private.client_mode_hook import disable_client_hook +from ray.job_config import JobConfig +from ray.util.client import ray @contextmanager diff --git a/python/ray/util/client/runtime_context.py b/python/ray/util/client/runtime_context.py index 0fe9f33935cf..ea28055361d8 100644 --- a/python/ray/util/client/runtime_context.py +++ b/python/ray/util/client/runtime_context.py @@ -1,5 +1,5 @@ -from typing import TYPE_CHECKING from types import SimpleNamespace +from typing import TYPE_CHECKING if TYPE_CHECKING: from ray import JobID, NodeID diff --git a/python/ray/util/client/server/dataservicer.py b/python/ray/util/client/server/dataservicer.py index 
af06b8902785..0e9363ea3640 100644 --- a/python/ray/util/client/server/dataservicer.py +++ b/python/ray/util/client/server/dataservicer.py @@ -1,24 +1,24 @@ -from collections import defaultdict -from ray.util.client.server.server_pickler import loads_from_client -import ray import logging -import grpc -from queue import Queue import sys - -from typing import Any, Dict, Iterator, TYPE_CHECKING, Union -from threading import Event, Lock, Thread import time +from collections import defaultdict +from queue import Queue +from threading import Event, Lock, Thread +from typing import TYPE_CHECKING, Any, Dict, Iterator, Union + +import grpc +import ray import ray.core.generated.ray_client_pb2 as ray_client_pb2 import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc +from ray._private.client_mode_hook import disable_client_hook from ray.util.client.common import ( CLIENT_SERVER_MAX_THREADS, - _propagate_error_in_context, OrderedResponseCache, + _propagate_error_in_context, ) +from ray.util.client.server.server_pickler import loads_from_client from ray.util.debug import log_once -from ray._private.client_mode_hook import disable_client_hook if TYPE_CHECKING: from ray.util.client.server.server import RayletServicer diff --git a/python/ray/util/client/server/proxier.py b/python/ray/util/client/server/proxier.py index a952cffa8f58..7bc959e3df17 100644 --- a/python/ray/util/client/server/proxier.py +++ b/python/ray/util/client/server/proxier.py @@ -5,30 +5,27 @@ import sys import time import traceback +import urllib from concurrent import futures from dataclasses import dataclass from itertools import chain -import urllib from threading import Event, Lock, RLock, Thread from typing import Callable, Dict, List, Optional, Tuple import grpc -# Import psutil after ray so the packaged version is used. 
-import psutil - import ray import ray.core.generated.ray_client_pb2 as ray_client_pb2 import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc import ray.core.generated.runtime_env_agent_pb2 as runtime_env_agent_pb2 +from ray._common.network_utils import build_address, is_localhost from ray._private.client_mode_hook import disable_client_hook -from ray._raylet import GcsClient from ray._private.parameter import RayParams from ray._private.runtime_env.context import RuntimeEnvContext from ray._private.services import ProcessInfo, start_ray_client_server from ray._private.tls_utils import add_port_to_grpc_server from ray._private.utils import detect_fate_sharing_support -from ray._common.network_utils import build_address +from ray._raylet import GcsClient from ray.cloudpickle.compat import pickle from ray.job_config import JobConfig from ray.util.client.common import ( @@ -40,6 +37,9 @@ ) from ray.util.client.server.dataservicer import _get_reconnecting_from_context +# Import psutil after ray so the packaged version is used. 
+import psutil + logger = logging.getLogger(__name__) CHECK_PROCESS_INTERVAL_S = 30 @@ -860,7 +860,7 @@ def serve_proxier( ray_client_pb2_grpc.add_RayletDriverServicer_to_server(task_servicer, server) ray_client_pb2_grpc.add_RayletDataStreamerServicer_to_server(data_servicer, server) ray_client_pb2_grpc.add_RayletLogStreamerServicer_to_server(logs_servicer, server) - if host != "127.0.0.1" and host != "localhost": + if not is_localhost(host): add_port_to_grpc_server(server, f"127.0.0.1:{port}") add_port_to_grpc_server(server, f"{host}:{port}") server.start() diff --git a/python/ray/util/client/server/server.py b/python/ray/util/client/server/server.py index 768ae2c2118a..c4e5d897e09d 100644 --- a/python/ray/util/client/server/server.py +++ b/python/ray/util/client/server/server.py @@ -20,14 +20,14 @@ import ray.core.generated.ray_client_pb2 as ray_client_pb2 import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc from ray import cloudpickle +from ray._common.network_utils import build_address, is_localhost from ray._private import ray_constants from ray._private.client_mode_hook import disable_client_hook -from ray._raylet import GcsClient from ray._private.ray_constants import env_integer from ray._private.ray_logging import setup_logger from ray._private.services import canonicalize_bootstrap_address_or_die from ray._private.tls_utils import add_port_to_grpc_server -from ray._common.network_utils import build_address +from ray._raylet import GcsClient from ray.job_config import JobConfig from ray.util.client.common import ( CLIENT_SERVER_MAX_THREADS, @@ -787,7 +787,7 @@ def default_connect_handler( ray_client_pb2_grpc.add_RayletDriverServicer_to_server(task_servicer, server) ray_client_pb2_grpc.add_RayletDataStreamerServicer_to_server(data_servicer, server) ray_client_pb2_grpc.add_RayletLogStreamerServicer_to_server(logs_servicer, server) - if host != "127.0.0.1" and host != "localhost": + if not is_localhost(host): add_port_to_grpc_server(server, 
f"127.0.0.1:{port}") add_port_to_grpc_server(server, f"{host}:{port}") current_handle = ClientServerHandle( diff --git a/python/ray/util/client/server/server_pickler.py b/python/ray/util/client/server/server_pickler.py index a0d91f400baa..5211a7991a86 100644 --- a/python/ray/util/client/server/server_pickler.py +++ b/python/ray/util/client/server/server_pickler.py @@ -12,16 +12,16 @@ in the server instance. """ import io -import ray - -from typing import Any -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any -from ray._private.client_mode_hook import disable_client_hook +import ray import ray.cloudpickle as cloudpickle +from ray._private.client_mode_hook import disable_client_hook from ray.util.client.client_pickler import PickleStub -from ray.util.client.server.server_stubs import ClientReferenceActor -from ray.util.client.server.server_stubs import ClientReferenceFunction +from ray.util.client.server.server_stubs import ( + ClientReferenceActor, + ClientReferenceFunction, +) if TYPE_CHECKING: from ray.util.client.server.server import RayletServicer diff --git a/python/ray/util/client/server/server_stubs.py b/python/ray/util/client/server/server_stubs.py index e19cbb3134a4..020ebf2aeb2c 100644 --- a/python/ray/util/client/server/server_stubs.py +++ b/python/ray/util/client/server/server_stubs.py @@ -1,6 +1,5 @@ +from abc import ABC, abstractmethod from contextlib import contextmanager -from abc import ABC -from abc import abstractmethod _current_server = None diff --git a/python/ray/util/client/worker.py b/python/ray/util/client/worker.py index babd60a79012..c5a09c8b51eb 100644 --- a/python/ray/util/client/worker.py +++ b/python/ray/util/client/worker.py @@ -66,9 +66,9 @@ # Links to the Ray Design Pattern doc to use in the task overhead warning # message -DESIGN_PATTERN_FINE_GRAIN_TASKS_LINK = "https://docs.google.com/document/d/167rnnDFIVRhHhK4mznEIemOtj63IOhtIPvSYaPgI4Fg/edit#heading=h.f7ins22n6nyl" # noqa E501 
+DESIGN_PATTERN_FINE_GRAIN_TASKS_LINK = "https://docs.ray.io/en/latest/ray-core/patterns/too-fine-grained-tasks.html" # noqa E501 -DESIGN_PATTERN_LARGE_OBJECTS_LINK = "https://docs.google.com/document/d/167rnnDFIVRhHhK4mznEIemOtj63IOhtIPvSYaPgI4Fg/edit#heading=h.1afmymq455wu" # noqa E501 +DESIGN_PATTERN_LARGE_OBJECTS_LINK = "https://docs.ray.io/en/latest/ray-core/patterns/closure-capture-large-objects.html" # noqa E501 def backoff(timeout: int) -> int: diff --git a/python/ray/util/client_connect.py b/python/ray/util/client_connect.py index c88b86457b0a..8c64459a1436 100644 --- a/python/ray/util/client_connect.py +++ b/python/ray/util/client_connect.py @@ -1,14 +1,14 @@ -from typing import Any, Dict, List, Optional, Tuple import logging +from typing import Any, Dict, List, Optional, Tuple from ray._private.client_mode_hook import ( _explicitly_enable_client_mode, _set_client_hook_status, ) +from ray._private.utils import get_ray_doc_version from ray.job_config import JobConfig from ray.util.annotations import Deprecated from ray.util.client import ray -from ray._private.utils import get_ray_doc_version logger = logging.getLogger(__name__) diff --git a/python/ray/util/collective/__init__.py b/python/ray/util/collective/__init__.py index ad7bcde93e58..09423ad37c11 100644 --- a/python/ray/util/collective/__init__.py +++ b/python/ray/util/collective/__init__.py @@ -1,28 +1,28 @@ from ray.util.collective.collective import ( - nccl_available, - gloo_available, - is_group_initialized, - init_collective_group, - destroy_collective_group, - create_collective_group, - get_rank, - get_collective_group_size, + allgather, + allgather_multigpu, allreduce, allreduce_multigpu, barrier, - reduce, - reduce_multigpu, broadcast, broadcast_multigpu, - allgather, - allgather_multigpu, + create_collective_group, + destroy_collective_group, + get_collective_group_size, + get_group_handle, + get_rank, + gloo_available, + init_collective_group, + is_group_initialized, + nccl_available, + 
recv, + recv_multigpu, + reduce, + reduce_multigpu, reducescatter, reducescatter_multigpu, send, send_multigpu, - recv, - recv_multigpu, - get_group_handle, ) __all__ = [ diff --git a/python/ray/util/collective/collective.py b/python/ray/util/collective/collective.py index 9265d8b06b97..1d92b838d7f6 100644 --- a/python/ray/util/collective/collective.py +++ b/python/ray/util/collective/collective.py @@ -1,4 +1,5 @@ """APIs exposed under the namespace ray.util.collective.""" + import logging import os from typing import List @@ -36,6 +37,13 @@ except ImportError: _TORCH_DISTRIBUTED_AVAILABLE = False +try: + from ray.util.collective.collective_group.nixl_backend import NixlBackend + + _NIXL_AVAILABLE = True +except ImportError: + _NIXL_AVAILABLE = False + def nccl_available(): global _LOG_NCCL_WARNING @@ -57,6 +65,10 @@ def torch_distributed_available(): return _TORCH_DISTRIBUTED_AVAILABLE +def nixl_available(): + return _NIXL_AVAILABLE + + class GroupManager(object): """Use this class to manage the collective groups we created so far. @@ -98,6 +110,10 @@ def create_collective_group( "Creating torch.distributed GLOO group: '{}'...".format(group_name) ) g = TorchGLOOGroup(world_size, rank, group_name) + elif backend == types.Backend.NIXL: + _check_backend_availability(backend) + logger.debug("Creating NIXL Backend: '{}'...".format(group_name)) + g = NixlBackend() else: raise RuntimeError(f"Unexpected backend: {backend}") @@ -719,19 +735,24 @@ def get_group_handle(group_name: str = "default"): if not is_group_initialized(group_name): # try loading from remote info store try: - # if the information is stored in an Info object, - # get and create the group. 
- name = "info_" + group_name - mgr = ray.get_actor(name=name) - ids, world_size, rank, backend, gloo_timeout = ray.get( - mgr.get_info.remote() - ) - worker = ray._private.worker.global_worker - id_ = worker.core_worker.get_actor_id() - r = rank[ids.index(id_)] - _group_mgr.create_collective_group( - backend, world_size, r, group_name, gloo_timeout - ) + if group_name == types.NIXL_GROUP_NAME: + _group_mgr.create_collective_group( + types.Backend.NIXL, None, None, group_name, None + ) + else: + # if the information is stored in an Info object, + # get and create the group. + name = "info_" + group_name + mgr = ray.get_actor(name=name) + ids, world_size, rank, backend, gloo_timeout = ray.get( + mgr.get_info.remote() + ) + worker = ray._private.worker.global_worker + id_ = worker.core_worker.get_actor_id() + r = rank[ids.index(id_)] + _group_mgr.create_collective_group( + backend, world_size, r, group_name, gloo_timeout + ) except ValueError as exc: # check if this group is initialized using options() if ( @@ -781,6 +802,9 @@ def _check_backend_availability(backend: types.Backend): elif backend == types.Backend.TORCH_GLOO: if not torch_distributed_available(): raise RuntimeError("torch.distributed is not available.") + elif backend == types.Backend.NIXL: + if not nixl_available(): + raise RuntimeError("NIXL is not available.") def _check_inside_actor(): diff --git a/python/ray/util/collective/collective_group/base_collective_group.py b/python/ray/util/collective/collective_group/base_collective_group.py index cfb6ebfa8725..eff07fb16c67 100644 --- a/python/ray/util/collective/collective_group/base_collective_group.py +++ b/python/ray/util/collective/collective_group/base_collective_group.py @@ -1,16 +1,15 @@ """Abstract class for collective groups.""" -from abc import ABCMeta -from abc import abstractmethod +from abc import ABCMeta, abstractmethod from ray.util.collective.types import ( + AllGatherOptions, AllReduceOptions, BarrierOptions, - ReduceOptions, - 
AllGatherOptions, BroadcastOptions, + RecvOptions, + ReduceOptions, ReduceScatterOptions, SendOptions, - RecvOptions, ) diff --git a/python/ray/util/collective/collective_group/cuda_stream.py b/python/ray/util/collective/collective_group/cuda_stream.py index d5496755f82b..dbccb00c1a17 100644 --- a/python/ray/util/collective/collective_group/cuda_stream.py +++ b/python/ray/util/collective/collective_group/cuda_stream.py @@ -2,6 +2,7 @@ import threading import cupy + from ray.util.collective.collective_group import nccl_util from ray.util.collective.const import ENV diff --git a/python/ray/util/collective/collective_group/gloo_collective_group.py b/python/ray/util/collective/collective_group/gloo_collective_group.py index 6782a8e38f72..5809c12b44b3 100644 --- a/python/ray/util/collective/collective_group/gloo_collective_group.py +++ b/python/ray/util/collective/collective_group/gloo_collective_group.py @@ -8,11 +8,11 @@ import pygloo import ray +from ray._common.network_utils import parse_address from ray._private import ray_constants from ray.util.collective.collective_group import gloo_util from ray.util.collective.collective_group.base_collective_group import BaseGroup from ray.util.collective.const import get_store_name -from ray._common.network_utils import parse_address from ray.util.collective.types import ( AllGatherOptions, AllReduceOptions, diff --git a/python/ray/util/collective/collective_group/nccl_collective_group.py b/python/ray/util/collective/collective_group/nccl_collective_group.py index 9c21b936d898..f866b70a9c1e 100644 --- a/python/ray/util/collective/collective_group/nccl_collective_group.py +++ b/python/ray/util/collective/collective_group/nccl_collective_group.py @@ -1,27 +1,26 @@ -import logging import datetime +import logging import time -import ray import cupy -from ray.util.collective.const import ENV +import ray from ray.util.collective.collective_group import nccl_util from ray.util.collective.collective_group.base_collective_group 
import BaseGroup -from ray.util.collective.const import get_store_name +from ray.util.collective.collective_group.cuda_stream import get_stream_pool +from ray.util.collective.const import ENV, get_store_name from ray.util.collective.types import ( + AllGatherOptions, AllReduceOptions, - BarrierOptions, Backend, - ReduceOptions, + BarrierOptions, BroadcastOptions, - AllGatherOptions, + RecvOptions, + ReduceOptions, ReduceScatterOptions, SendOptions, - RecvOptions, torch_available, ) -from ray.util.collective.collective_group.cuda_stream import get_stream_pool logger = logging.getLogger(__name__) @@ -109,19 +108,12 @@ def get_nccl_id(self, timeout_s=180): """ if not self._store: raise ValueError("Rendezvous store is not setup.") - uid = None - timeout_delta = datetime.timedelta(seconds=timeout_s) - elapsed = datetime.timedelta(seconds=0) - start_time = datetime.datetime.now() - while elapsed < timeout_delta: - uid = ray.get(self._store.get_id.remote()) - if not uid: - time.sleep(1) - elapsed = datetime.datetime.now() - start_time - continue - break - if not uid: - raise RuntimeError("Unable to get the NCCLUniqueID from the store.") + try: + uid = ray.get(self._store.wait_and_get_id.remote(), timeout=timeout_s) + except ray.exceptions.GetTimeoutError: + raise RuntimeError( + f"Unable to get the NCCLUniqueID from the store within {timeout_s} seconds." 
+ ) from None return uid diff --git a/python/ray/util/collective/collective_group/nccl_util.py b/python/ray/util/collective/collective_group/nccl_util.py index 221d5885c411..7f68d8430208 100644 --- a/python/ray/util/collective/collective_group/nccl_util.py +++ b/python/ray/util/collective/collective_group/nccl_util.py @@ -3,13 +3,17 @@ try: import cupy - from cupy.cuda import nccl - from cupy.cuda import Device # noqa: F401 - from cupy.cuda.nccl import get_version - from cupy.cuda.nccl import get_build_version - from cupy.cuda.nccl import NcclCommunicator - from cupy.cuda.nccl import groupStart # noqa: F401 - from cupy.cuda.nccl import groupEnd # noqa: F401 + from cupy.cuda import ( + Device, # noqa: F401 + nccl, + ) + from cupy.cuda.nccl import ( + NcclCommunicator, + get_build_version, + get_version, + groupEnd, # noqa: F401 + groupStart, # noqa: F401 + ) except ImportError: raise ImportError("NCCL in Ray requires Cupy being available!") diff --git a/python/ray/util/collective/collective_group/nixl_backend.py b/python/ray/util/collective/collective_group/nixl_backend.py new file mode 100644 index 000000000000..1950f952d4ef --- /dev/null +++ b/python/ray/util/collective/collective_group/nixl_backend.py @@ -0,0 +1,104 @@ +import time +from typing import TYPE_CHECKING, List, Tuple + +from nixl._api import nixl_agent, nixl_agent_config + +import ray +from ray.util.collective.types import Backend + +if TYPE_CHECKING: + import torch + + +class NixlBackend: + """Backend implementation for NIXL tensor transport. + + This class provides functionality for transferring tensors using NIXL. It handles + initialization of the NIXL agent, receiving tensors, and managing NIXL metadata. + """ + + def __init__(self): + """Initialize the NIXL backend. + + Creates a NIXL agent with UCX backend. 
+ """ + agent_config = nixl_agent_config(backends=["UCX"]) + ctx = ray.get_runtime_context() + actor_id = ctx.get_actor_id() + self._nixl_agent = nixl_agent(actor_id, agent_config) + + @classmethod + def backend(cls): + """Get the backend type. + + Returns: + Backend.NIXL: The backend type enum value for NIXL. + """ + return Backend.NIXL + + def recv( + self, + tensors: List["torch.Tensor"], + nixl_serialized_descs: bytes, + remote_nixl_agent_meta: bytes, + ): + """Receive tensors from a remote NIXL agent. + + Args: + tensors: List of tensors to receive into. + nixl_serialized_descs: Serialized NIXL descriptors for the remote tensors. + remote_nixl_agent_meta: Metadata about the remote NIXL agent. + + Raises: + RuntimeError: If the NIXL transfer enters an error state. + """ + nixl_agent = self._nixl_agent + remote_descs = nixl_agent.deserialize_descs(nixl_serialized_descs) + local_descs = nixl_agent.register_memory(tensors) + remote_name = nixl_agent.add_remote_agent(remote_nixl_agent_meta) + + xfer_handle = nixl_agent.initialize_xfer( + # "UUID" here is just a placeholder, can be any bytes, but without it, + # nixl will fail to transfer multiple times. + "READ", + local_descs.trim(), + remote_descs, + remote_name, + "UUID", + ) + + state = nixl_agent.transfer(xfer_handle) + if state == "ERR": + raise RuntimeError("NIXL transfer got to Error state.") + # Since current nixl does not provide a better way, we need to check the state of + # the transfer continuously. + while True: + state = nixl_agent.check_xfer_state(xfer_handle) + if state == "ERR": + raise RuntimeError("NIXL transfer got to Error state.") + if state == "PROC": + time.sleep(0.001) # Avoid busy waiting + elif state == "DONE": + break + + nixl_agent.release_xfer_handle(xfer_handle) + nixl_agent.deregister_memory(local_descs) + + def get_nixl_metadata(self, tensors: List["torch.Tensor"]) -> Tuple[bytes, bytes]: + """Get NIXL metadata for a set of tensors. 
+ + Args: + tensors: List of tensors to get metadata for. + + Returns: + tuple: A tuple containing: + - Serialized NIXL descriptors for the tensors + - Metadata about this NIXL agent + """ + nixl_agent = self._nixl_agent + reg_descs = nixl_agent.register_memory(tensors) + xfer_descs = reg_descs.trim() + return ( + nixl_agent.get_serialized_descs(xfer_descs), + nixl_agent.get_agent_metadata(), + ) diff --git a/python/ray/util/collective/collective_group/torch_gloo_collective_group.py b/python/ray/util/collective/collective_group/torch_gloo_collective_group.py index 5ec743c673f6..51e7f6482b6f 100644 --- a/python/ray/util/collective/collective_group/torch_gloo_collective_group.py +++ b/python/ray/util/collective/collective_group/torch_gloo_collective_group.py @@ -1,22 +1,23 @@ -from typing import TYPE_CHECKING, List, Optional import os +from typing import TYPE_CHECKING, List, Optional + import torch import torch.distributed as dist import ray.experimental.internal_kv as internal_kv -from ray.util.collective.collective_group.base_collective_group import BaseGroup from ray._common.network_utils import parse_address +from ray.util.collective.collective_group.base_collective_group import BaseGroup from ray.util.collective.types import ( + AllGatherOptions, AllReduceOptions, - BarrierOptions, Backend, + BarrierOptions, + BroadcastOptions, + RecvOptions, ReduceOp, ReduceOptions, - BroadcastOptions, - AllGatherOptions, ReduceScatterOptions, SendOptions, - RecvOptions, ) if TYPE_CHECKING: diff --git a/python/ray/util/collective/examples/nccl_allreduce_example.py b/python/ray/util/collective/examples/nccl_allreduce_example.py index dd8a9f83d171..ec812843a3f8 100644 --- a/python/ray/util/collective/examples/nccl_allreduce_example.py +++ b/python/ray/util/collective/examples/nccl_allreduce_example.py @@ -1,6 +1,6 @@ -import ray import cupy as cp +import ray import ray.util.collective as collective diff --git 
a/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py b/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py index 276843ff6da9..df378785dffb 100644 --- a/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py +++ b/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py @@ -1,6 +1,6 @@ import cupy as cp -import ray +import ray import ray.util.collective as collective diff --git a/python/ray/util/collective/examples/nccl_allreduce_multigpu_example.py b/python/ray/util/collective/examples/nccl_allreduce_multigpu_example.py index 89282811a4e7..5a70976ae5ab 100644 --- a/python/ray/util/collective/examples/nccl_allreduce_multigpu_example.py +++ b/python/ray/util/collective/examples/nccl_allreduce_multigpu_example.py @@ -1,8 +1,8 @@ -import ray import cupy as cp +from cupy.cuda import Device +import ray import ray.util.collective as collective -from cupy.cuda import Device @ray.remote(num_gpus=2) diff --git a/python/ray/util/collective/examples/nccl_p2p_example_multigpu.py b/python/ray/util/collective/examples/nccl_p2p_example_multigpu.py index 10fe07928f67..1ef3e26ee428 100644 --- a/python/ray/util/collective/examples/nccl_p2p_example_multigpu.py +++ b/python/ray/util/collective/examples/nccl_p2p_example_multigpu.py @@ -1,8 +1,8 @@ -import ray import cupy as cp +from cupy.cuda import Device +import ray import ray.util.collective as collective -from cupy.cuda import Device @ray.remote(num_gpus=2) diff --git a/python/ray/util/collective/tests/conftest.py b/python/ray/util/collective/tests/conftest.py index 0c8fef090184..e4ec1df88675 100644 --- a/python/ray/util/collective/tests/conftest.py +++ b/python/ray/util/collective/tests/conftest.py @@ -2,6 +2,7 @@ import logging import pytest + import ray from ray.util.collective.collective_group.nccl_collective_group import ( _get_comm_key_from_devices, diff --git 
a/python/ray/util/collective/tests/cpu_util.py b/python/ray/util/collective/tests/cpu_util.py index f4951900dd20..1196afd86fad 100644 --- a/python/ray/util/collective/tests/cpu_util.py +++ b/python/ray/util/collective/tests/cpu_util.py @@ -1,12 +1,12 @@ -import numpy as np import logging +import numpy as np +import torch + import ray import ray.util.collective as col from ray.util.collective.types import Backend, ReduceOp -import torch - logger = logging.getLogger(__name__) diff --git a/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_allgather.py b/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_allgather.py index bdf32432f0ab..f48a41604405 100644 --- a/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_allgather.py +++ b/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_allgather.py @@ -1,15 +1,14 @@ """Test the allgather API on a distributed Ray cluster.""" -import pytest -import ray - import numpy as np +import pytest import torch -from ray.util.collective.types import Backend +import ray from ray.util.collective.tests.cpu_util import ( create_collective_workers, init_tensors_for_gather_scatter, ) +from ray.util.collective.types import Backend @pytest.mark.parametrize("backend", [Backend.GLOO]) @@ -137,7 +136,8 @@ def test_allgather_torch_numpy(ray_start_distributed_2_nodes, backend): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_allreduce.py b/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_allreduce.py index 43be7b620fc0..d9d6df92f68c 100644 --- a/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_allreduce.py +++ b/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_allreduce.py @@ -1,13 +1,11 @@ """Test the collective allreduice API on a distributed 
Ray cluster.""" -import pytest -import ray -from ray.util.collective.types import ReduceOp - import numpy as np +import pytest import torch -from ray.util.collective.types import Backend +import ray from ray.util.collective.tests.cpu_util import create_collective_workers +from ray.util.collective.types import Backend, ReduceOp @pytest.mark.parametrize("backend", [Backend.GLOO]) @@ -174,7 +172,8 @@ def test_allreduce_torch_numpy(ray_start_distributed_2_nodes, backend): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_basic_apis.py b/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_basic_apis.py index 1824cda807af..774a70f0a36b 100644 --- a/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_basic_apis.py +++ b/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_basic_apis.py @@ -1,10 +1,11 @@ """Test the collective group APIs.""" -import pytest -import ray from random import shuffle -from ray.util.collective.types import Backend +import pytest + +import ray from ray.util.collective.tests.cpu_util import Worker, create_collective_workers +from ray.util.collective.types import Backend @pytest.mark.parametrize("backend", [Backend.GLOO]) @@ -130,7 +131,8 @@ def test_destroy_group(ray_start_distributed_2_nodes, backend): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_broadcast.py b/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_broadcast.py index d344d1894e8f..b00b92edf3ac 100644 --- a/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_broadcast.py +++ b/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_broadcast.py @@ -1,10 +1,10 @@ 
"""Test the broadcast API.""" -import pytest import numpy as np -import ray +import pytest -from ray.util.collective.types import Backend +import ray from ray.util.collective.tests.cpu_util import create_collective_workers +from ray.util.collective.types import Backend @pytest.mark.parametrize("backend", [Backend.GLOO]) @@ -89,7 +89,8 @@ def test_broadcast_invalid_rank(ray_start_distributed_2_nodes, backend, src_rank if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_reduce.py b/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_reduce.py index 901e773ca757..2df1d27b1e2c 100644 --- a/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_reduce.py +++ b/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_reduce.py @@ -1,10 +1,10 @@ """Test the reduce API.""" -import pytest import numpy as np -import ray -from ray.util.collective.types import Backend, ReduceOp +import pytest +import ray from ray.util.collective.tests.cpu_util import create_collective_workers +from ray.util.collective.types import Backend, ReduceOp @pytest.mark.parametrize("backend", [Backend.GLOO]) @@ -140,7 +140,8 @@ def test_reduce_invalid_rank(ray_start_distributed_2_nodes, backend, dst_rank=9) if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_reducescatter.py b/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_reducescatter.py index fb5d37556fae..47d05b6965ae 100644 --- a/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_reducescatter.py +++ b/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_reducescatter.py @@ -1,15 +1,14 @@ """Test the collective reducescatter API on a 
distributed Ray cluster.""" -import pytest -import ray - import numpy as np +import pytest import torch -from ray.util.collective.types import Backend +import ray from ray.util.collective.tests.cpu_util import ( create_collective_workers, init_tensors_for_gather_scatter, ) +from ray.util.collective.types import Backend @pytest.mark.parametrize("backend", [Backend.GLOO]) @@ -125,7 +124,8 @@ def test_reducescatter_torch_numpy(ray_start_distributed_2_nodes, backend): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_sendrecv.py b/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_sendrecv.py index 4d2285fcae7e..68aadc067adf 100644 --- a/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_sendrecv.py +++ b/python/ray/util/collective/tests/distributed_cpu_tests/test_distributed_sendrecv.py @@ -1,10 +1,10 @@ """Test the send/recv API.""" import numpy as np import pytest -import ray -from ray.util.collective.types import Backend +import ray from ray.util.collective.tests.cpu_util import create_collective_workers +from ray.util.collective.types import Backend @pytest.mark.parametrize("backend", [Backend.GLOO]) @@ -45,7 +45,8 @@ def test_sendrecv( if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_allgather.py b/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_allgather.py index 6bdac60833b7..82afc324af49 100644 --- a/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_allgather.py +++ b/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_allgather.py @@ -1,10 +1,9 @@ """Test the allgather API on a distributed Ray cluster.""" -import pytest -import ray - import cupy as cp 
+import pytest import torch +import ray from ray.util.collective.tests.util import ( create_collective_workers, init_tensors_for_gather_scatter, @@ -132,7 +131,8 @@ def test_allgather_torch_cupy(ray_start_distributed_2_nodes_4_gpus): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_allreduce.py b/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_allreduce.py index 580b6436e73c..f915db200851 100644 --- a/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_allreduce.py +++ b/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_allreduce.py @@ -1,12 +1,11 @@ """Test the collective allreduice API on a distributed Ray cluster.""" -import pytest -import ray -from ray.util.collective.types import ReduceOp - import cupy as cp +import pytest import torch +import ray from ray.util.collective.tests.util import create_collective_workers +from ray.util.collective.types import ReduceOp @pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) diff --git a/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_basic_apis.py b/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_basic_apis.py index ef61d7450611..bcd7b8c3808b 100644 --- a/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_basic_apis.py +++ b/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_basic_apis.py @@ -1,8 +1,9 @@ """Test the collective group APIs.""" -import pytest -import ray from random import shuffle +import pytest + +import ray from ray.util.collective.tests.util import Worker, create_collective_workers @@ -114,7 +115,8 @@ def test_destroy_group(ray_start_distributed_2_nodes_4_gpus): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) 
diff --git a/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_broadcast.py b/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_broadcast.py index 4a8b9779d085..ad5055a7c826 100644 --- a/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_broadcast.py +++ b/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_broadcast.py @@ -1,8 +1,8 @@ """Test the broadcast API.""" -import pytest import cupy as cp -import ray +import pytest +import ray from ray.util.collective.tests.util import create_collective_workers diff --git a/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_reduce.py b/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_reduce.py index f7e68b85e1da..969647e78d7d 100644 --- a/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_reduce.py +++ b/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_reduce.py @@ -1,10 +1,10 @@ """Test the reduce API.""" -import pytest import cupy as cp -import ray -from ray.util.collective.types import ReduceOp +import pytest +import ray from ray.util.collective.tests.util import create_collective_workers +from ray.util.collective.types import ReduceOp @pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) diff --git a/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_reducescatter.py b/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_reducescatter.py index ea200f861416..99f7beb6d526 100644 --- a/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_reducescatter.py +++ b/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_reducescatter.py @@ -1,10 +1,9 @@ """Test the collective reducescatter API on a distributed Ray cluster.""" -import pytest -import ray - import cupy as cp +import pytest import torch +import ray from ray.util.collective.tests.util import ( 
create_collective_workers, init_tensors_for_gather_scatter, @@ -124,7 +123,8 @@ def test_reducescatter_torch_cupy(ray_start_distributed_2_nodes_4_gpus): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_sendrecv.py b/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_sendrecv.py index 692159d223f9..9fb20cf06287 100644 --- a/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_sendrecv.py +++ b/python/ray/util/collective/tests/distributed_gpu_tests/test_distributed_sendrecv.py @@ -1,8 +1,8 @@ """Test the send/recv API.""" import cupy as cp import pytest -import ray +import ray from ray.util.collective.tests.util import create_collective_workers diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allgather.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allgather.py index 74ea2ebc11df..dea31ff53953 100644 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allgather.py +++ b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allgather.py @@ -1,10 +1,9 @@ """Test the allgather API on a distributed Ray cluster.""" -import pytest -import ray - import cupy as cp +import pytest import torch +import ray from ray.util.collective.tests.util import ( create_collective_multigpu_workers, init_tensors_for_gather_scatter_multigpu, @@ -81,7 +80,8 @@ def test_allgather_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allreduce.py 
b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allreduce.py index 1616e1c2e9d3..aa34cc4a6efb 100644 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allreduce.py +++ b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allreduce.py @@ -1,12 +1,12 @@ """Test the collective allreduice API on a distributed Ray cluster.""" -import pytest import logging import cupy as cp +import pytest import ray -from ray.util.collective.types import ReduceOp from ray.util.collective.tests.util import create_collective_multigpu_workers +from ray.util.collective.types import ReduceOp logger = logging.getLogger(__name__) logger.setLevel("DEBUG") diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_basic_apis.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_basic_apis.py index ed6ad137d384..4b0c861f039d 100644 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_basic_apis.py +++ b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_basic_apis.py @@ -1,8 +1,9 @@ """Test the collective group APIs.""" -import pytest -import ray from random import shuffle +import pytest + +import ray from ray.util.collective.tests.util import create_collective_multigpu_workers @@ -95,7 +96,8 @@ def test_destroy_group(ray_start_distributed_multigpu_2_nodes_4_gpus): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_broadcast.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_broadcast.py index 3b90c2568cb9..8cd52a962f5f 100644 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_broadcast.py 
+++ b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_broadcast.py @@ -1,8 +1,8 @@ """Test the broadcast API.""" -import pytest import cupy as cp -import ray +import pytest +import ray from ray.util.collective.tests.util import create_collective_multigpu_workers diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reduce.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reduce.py index c584806eedc2..4a15fc4c40df 100644 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reduce.py +++ b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reduce.py @@ -1,10 +1,10 @@ """Test the reduce API.""" -import pytest import cupy as cp -import ray -from ray.util.collective.types import ReduceOp +import pytest +import ray from ray.util.collective.tests.util import create_collective_multigpu_workers +from ray.util.collective.types import ReduceOp @pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reducescatter.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reducescatter.py index 67a2b8b738a8..98cd51360ae4 100644 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reducescatter.py +++ b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reducescatter.py @@ -1,10 +1,9 @@ """Test the collective reducescatter API on a distributed Ray cluster.""" -import pytest -import ray - import cupy as cp +import pytest import torch +import ray from ray.util.collective.tests.util import ( create_collective_multigpu_workers, init_tensors_for_gather_scatter_multigpu, @@ -84,7 +83,8 @@ def test_reducescatter_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus) if 
__name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_sendrecv.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_sendrecv.py index c7371343ba56..0fa18ddaf390 100644 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_sendrecv.py +++ b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_sendrecv.py @@ -1,8 +1,8 @@ """Test the send/recv API.""" import cupy as cp import pytest -import ray +import ray from ray.util.collective.tests.util import create_collective_multigpu_workers diff --git a/python/ray/util/collective/tests/single_node_cpu_tests/test_allgather.py b/python/ray/util/collective/tests/single_node_cpu_tests/test_allgather.py index 67d9ddb01e9b..70026b88ddaa 100644 --- a/python/ray/util/collective/tests/single_node_cpu_tests/test_allgather.py +++ b/python/ray/util/collective/tests/single_node_cpu_tests/test_allgather.py @@ -1,8 +1,9 @@ """Test the collective allgather API.""" import numpy as np import pytest -import ray import torch + +import ray from ray.util.collective.tests.cpu_util import ( create_collective_workers, init_tensors_for_gather_scatter, @@ -135,7 +136,8 @@ def test_allgather_torch_numpy(ray_start_single_node, backend): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/single_node_cpu_tests/test_allreduce.py b/python/ray/util/collective/tests/single_node_cpu_tests/test_allreduce.py index 22ebcfeb6e1b..4791ed2ea388 100644 --- a/python/ray/util/collective/tests/single_node_cpu_tests/test_allreduce.py +++ b/python/ray/util/collective/tests/single_node_cpu_tests/test_allreduce.py @@ -1,8 +1,9 @@ """Test the collective allreduice API.""" import numpy as np import 
pytest -import ray import torch + +import ray from ray.util.collective.tests.cpu_util import create_collective_workers from ray.util.collective.types import Backend, ReduceOp @@ -158,7 +159,8 @@ def test_allreduce_torch_numpy(ray_start_single_node, backend): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/single_node_cpu_tests/test_basic_apis.py b/python/ray/util/collective/tests/single_node_cpu_tests/test_basic_apis.py index f8bd8dff63b3..0701f40f4eb5 100644 --- a/python/ray/util/collective/tests/single_node_cpu_tests/test_basic_apis.py +++ b/python/ray/util/collective/tests/single_node_cpu_tests/test_basic_apis.py @@ -1,9 +1,9 @@ """Test the collective group APIs.""" import pytest -import ray -from ray.util.collective.types import Backend +import ray from ray.util.collective.tests.cpu_util import Worker, create_collective_workers +from ray.util.collective.types import Backend @pytest.mark.parametrize("backend", [Backend.GLOO]) @@ -123,7 +123,8 @@ def test_destroy_group(ray_start_single_node, backend): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/single_node_cpu_tests/test_broadcast.py b/python/ray/util/collective/tests/single_node_cpu_tests/test_broadcast.py index f785c450c142..263f832ee280 100644 --- a/python/ray/util/collective/tests/single_node_cpu_tests/test_broadcast.py +++ b/python/ray/util/collective/tests/single_node_cpu_tests/test_broadcast.py @@ -1,8 +1,8 @@ """Test the broadcast API.""" -import pytest import numpy as np -import ray +import pytest +import ray from ray.util.collective.tests.cpu_util import create_collective_workers from ray.util.collective.types import Backend @@ -87,7 +87,8 @@ def test_broadcast_invalid_rank(ray_start_single_node, backend, src_rank=3): if __name__ == "__main__": - import pytest import sys 
+ import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/single_node_cpu_tests/test_gloo_group_isolation.py b/python/ray/util/collective/tests/single_node_cpu_tests/test_gloo_group_isolation.py index bc41e341bcc6..7d0d4888aca0 100644 --- a/python/ray/util/collective/tests/single_node_cpu_tests/test_gloo_group_isolation.py +++ b/python/ray/util/collective/tests/single_node_cpu_tests/test_gloo_group_isolation.py @@ -1,8 +1,10 @@ -from python.ray.util.collective.types import Backend +import time + from python.ray.util.collective.collective_group.gloo_collective_group import GLOOGroup +from python.ray.util.collective.types import Backend + import ray import ray.util.collective as col -import time @ray.remote @@ -57,7 +59,8 @@ def test_failure_when_initializing(shutdown_only): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/single_node_cpu_tests/test_reduce.py b/python/ray/util/collective/tests/single_node_cpu_tests/test_reduce.py index d7977b2c32e6..4a125b24b82a 100644 --- a/python/ray/util/collective/tests/single_node_cpu_tests/test_reduce.py +++ b/python/ray/util/collective/tests/single_node_cpu_tests/test_reduce.py @@ -1,10 +1,10 @@ """Test the reduce API.""" -import pytest import numpy as np -import ray -from ray.util.collective.types import Backend, ReduceOp +import pytest +import ray from ray.util.collective.tests.cpu_util import create_collective_workers +from ray.util.collective.types import Backend, ReduceOp @pytest.mark.parametrize("backend", [Backend.GLOO]) @@ -160,7 +160,8 @@ def test_reduce_invalid_rank(ray_start_single_node, backend, dst_rank=3): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/single_node_cpu_tests/test_reducescatter.py 
b/python/ray/util/collective/tests/single_node_cpu_tests/test_reducescatter.py index 245c84ed9e8a..22d0e56da733 100644 --- a/python/ray/util/collective/tests/single_node_cpu_tests/test_reducescatter.py +++ b/python/ray/util/collective/tests/single_node_cpu_tests/test_reducescatter.py @@ -1,15 +1,14 @@ """Test the collective reducescatter API.""" -import pytest -import ray - import numpy as np +import pytest import torch -from ray.util.collective.types import Backend +import ray from ray.util.collective.tests.cpu_util import ( create_collective_workers, init_tensors_for_gather_scatter, ) +from ray.util.collective.types import Backend @pytest.mark.parametrize("backend", [Backend.GLOO]) @@ -125,7 +124,8 @@ def test_reducescatter_torch_numpy(ray_start_single_node, backend): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/single_node_cpu_tests/test_sendrecv.py b/python/ray/util/collective/tests/single_node_cpu_tests/test_sendrecv.py index aae3440b7cde..e4bd841d7a10 100644 --- a/python/ray/util/collective/tests/single_node_cpu_tests/test_sendrecv.py +++ b/python/ray/util/collective/tests/single_node_cpu_tests/test_sendrecv.py @@ -1,10 +1,10 @@ """Test the send/recv API.""" -import pytest import numpy as np -import ray +import pytest -from ray.util.collective.types import Backend +import ray from ray.util.collective.tests.cpu_util import create_collective_workers +from ray.util.collective.types import Backend @pytest.mark.parametrize("backend", [Backend.GLOO]) @@ -85,7 +85,8 @@ def test_sendrecv_invalid_rank(ray_start_single_node, backend, dst_rank=3): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/single_node_gpu_tests/test_allgather.py b/python/ray/util/collective/tests/single_node_gpu_tests/test_allgather.py index eee8d48313f8..e7f78e6ac6a0 
100644 --- a/python/ray/util/collective/tests/single_node_gpu_tests/test_allgather.py +++ b/python/ray/util/collective/tests/single_node_gpu_tests/test_allgather.py @@ -1,10 +1,9 @@ """Test the collective allgather API.""" -import pytest -import ray - import cupy as cp +import pytest import torch +import ray from ray.util.collective.tests.util import ( create_collective_workers, init_tensors_for_gather_scatter, @@ -132,7 +131,8 @@ def test_allgather_torch_cupy(ray_start_single_node_2_gpus): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/single_node_gpu_tests/test_allreduce.py b/python/ray/util/collective/tests/single_node_gpu_tests/test_allreduce.py index 0acab8c73077..1894adfc295d 100644 --- a/python/ray/util/collective/tests/single_node_gpu_tests/test_allreduce.py +++ b/python/ray/util/collective/tests/single_node_gpu_tests/test_allreduce.py @@ -1,8 +1,9 @@ """Test the collective allreduice API.""" import cupy as cp import pytest -import ray import torch + +import ray from ray.util.collective.tests.util import create_collective_workers from ray.util.collective.types import ReduceOp @@ -162,7 +163,8 @@ def test_allreduce_torch_cupy(ray_start_single_node_2_gpus): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/single_node_gpu_tests/test_basic_apis.py b/python/ray/util/collective/tests/single_node_gpu_tests/test_basic_apis.py index 00136b7a8523..892b13288689 100644 --- a/python/ray/util/collective/tests/single_node_gpu_tests/test_basic_apis.py +++ b/python/ray/util/collective/tests/single_node_gpu_tests/test_basic_apis.py @@ -1,7 +1,7 @@ """Test the collective group APIs.""" import pytest -import ray +import ray from ray.util.collective.tests.util import Worker, create_collective_workers @@ -111,7 +111,8 @@ def 
test_destroy_group(ray_start_single_node_2_gpus): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/single_node_gpu_tests/test_broadcast.py b/python/ray/util/collective/tests/single_node_gpu_tests/test_broadcast.py index e00f355053e9..85623ebdfa34 100644 --- a/python/ray/util/collective/tests/single_node_gpu_tests/test_broadcast.py +++ b/python/ray/util/collective/tests/single_node_gpu_tests/test_broadcast.py @@ -1,8 +1,8 @@ """Test the broadcast API.""" -import pytest import cupy as cp -import ray +import pytest +import ray from ray.util.collective.tests.util import create_collective_workers diff --git a/python/ray/util/collective/tests/single_node_gpu_tests/test_reduce.py b/python/ray/util/collective/tests/single_node_gpu_tests/test_reduce.py index 17fb446c871d..2439c30726d7 100644 --- a/python/ray/util/collective/tests/single_node_gpu_tests/test_reduce.py +++ b/python/ray/util/collective/tests/single_node_gpu_tests/test_reduce.py @@ -1,10 +1,10 @@ """Test the reduce API.""" -import pytest import cupy as cp -import ray -from ray.util.collective.types import ReduceOp +import pytest +import ray from ray.util.collective.tests.util import create_collective_workers +from ray.util.collective.types import ReduceOp @pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) diff --git a/python/ray/util/collective/tests/single_node_gpu_tests/test_reducescatter.py b/python/ray/util/collective/tests/single_node_gpu_tests/test_reducescatter.py index 122ef1a1faef..83c64f948fb4 100644 --- a/python/ray/util/collective/tests/single_node_gpu_tests/test_reducescatter.py +++ b/python/ray/util/collective/tests/single_node_gpu_tests/test_reducescatter.py @@ -1,10 +1,9 @@ """Test the collective reducescatter API.""" -import pytest -import ray - import cupy as cp +import pytest import torch +import ray from ray.util.collective.tests.util import ( 
create_collective_workers, init_tensors_for_gather_scatter, @@ -124,7 +123,8 @@ def test_reducescatter_torch_cupy(ray_start_single_node_2_gpus): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/single_node_gpu_tests/test_sendrecv.py b/python/ray/util/collective/tests/single_node_gpu_tests/test_sendrecv.py index 2f79f1fb25b9..cce862ca230b 100644 --- a/python/ray/util/collective/tests/single_node_gpu_tests/test_sendrecv.py +++ b/python/ray/util/collective/tests/single_node_gpu_tests/test_sendrecv.py @@ -1,8 +1,8 @@ """Test the send/recv API.""" -import pytest import cupy as cp -import ray +import pytest +import ray from ray.util.collective.tests.util import create_collective_workers diff --git a/python/ray/util/collective/tests/util.py b/python/ray/util/collective/tests/util.py index 69eac6438224..e3dfd63adc54 100644 --- a/python/ray/util/collective/tests/util.py +++ b/python/ray/util/collective/tests/util.py @@ -1,12 +1,12 @@ -import cupy as cp import logging +import cupy as cp +import torch + import ray import ray.util.collective as col -from ray.util.collective.types import Backend, ReduceOp from ray.util.collective.collective_group.nccl_util import get_num_gpus - -import torch +from ray.util.collective.types import Backend, ReduceOp logger = logging.getLogger(__name__) diff --git a/python/ray/util/collective/types.py b/python/ray/util/collective/types.py index 7cb4babaa2bd..90ed5d6d86bd 100644 --- a/python/ray/util/collective/types.py +++ b/python/ray/util/collective/types.py @@ -1,12 +1,19 @@ """Types conversion between different backends.""" -from enum import Enum + from dataclasses import dataclass from datetime import timedelta +from enum import Enum +from typing import TYPE_CHECKING, List, Optional, Tuple + +from numpy import int32 _NUMPY_AVAILABLE = True _TORCH_AVAILABLE = True _CUPY_AVAILABLE = True +if TYPE_CHECKING: + import torch + try: 
import torch as th # noqa: F401 except ImportError: @@ -34,6 +41,7 @@ class Backend(object): GLOO = "gloo" # Use gloo through torch.distributed. TORCH_GLOO = "torch_gloo" + NIXL = "nixl" UNRECOGNIZED = "unrecognized" def __new__(cls, name: str): @@ -47,6 +55,67 @@ def __new__(cls, name: str): return backend +@dataclass +class TensorTransportMetadata: + """Metadata for tensors stored in the GPU object store. + + Args: + tensor_meta: A list of tuples, each containing the shape and dtype of a tensor. + tensor_device: The device of the tensor. Currently, we require all tensors in the + list have the same device type. + """ + + tensor_meta: List[Tuple["torch.Size", "torch.dtype"]] + tensor_device: Optional["torch.device"] = None + + +@dataclass +class NixlTransportMetadata(TensorTransportMetadata): + """Metadata for tensors stored in the GPU object store for NIXL transport. + + Args: + nixl_serialized_descs: Serialized tensor descriptors for NIXL transport. + nixl_agent_meta: The additional metadata of the remote NIXL agent. + """ + + nixl_serialized_descs: Optional[bytes] = None + nixl_agent_meta: Optional[bytes] = None + + +@dataclass +class CollectiveTransportMetadata(TensorTransportMetadata): + """Metadata for tensors stored in the GPU object store for collective transport.""" + + +@dataclass +class CommunicatorMetadata: + """Metadata for the communicator. + + Args: + communicator_name: The name of the communicator. + """ + + communicator_name: str = "" + + +@dataclass +class CollectiveCommunicatorMetadata(CommunicatorMetadata): + """Metadata for the collective communicator (e.g. NCCL, GLOO). + + Args: + src_rank: The rank of the source actor. + dst_rank: The rank of the destination actor. 
+ """ + + src_rank: Optional[int32] = None + dst_rank: Optional[int32] = None + + +@dataclass +class NixlCommunicatorMetadata(CommunicatorMetadata): + """Metadata for the NIXL communicator.""" + + class ReduceOp(Enum): SUM = 0 PRODUCT = 1 @@ -56,6 +125,9 @@ class ReduceOp(Enum): unset_timeout_ms = timedelta(milliseconds=-1) +# This is used to identify the collective group for NIXL. +NIXL_GROUP_NAME = "ray_internal_nixl_group" + @dataclass class AllReduceOptions: diff --git a/python/ray/util/collective/util.py b/python/ray/util/collective/util.py index 6acabf82de3e..02221995fd60 100644 --- a/python/ray/util/collective/util.py +++ b/python/ray/util/collective/util.py @@ -1,7 +1,9 @@ """Some utility class for Collectives.""" -import ray +import asyncio import logging +import ray + logger = logging.getLogger(__name__) @@ -20,8 +22,9 @@ class NCCLUniqueIDStore: def __init__(self, name): self.name = name self.nccl_id = None + self.event = asyncio.Event() - def set_id(self, uid): + async def set_id(self, uid): """ Initialize the NCCL unique ID for this store. @@ -29,9 +32,15 @@ def set_id(self, uid): uid: the unique ID generated via the NCCL generate_communicator_id API. Returns: - None + The NCCL unique ID set. """ self.nccl_id = uid + self.event.set() + return uid + + async def wait_and_get_id(self): + """Wait for the NCCL unique ID to be set and return it.""" + await self.event.wait() return self.nccl_id def get_id(self): diff --git a/python/ray/util/dask/BUILD b/python/ray/util/dask/BUILD.bazel similarity index 100% rename from python/ray/util/dask/BUILD rename to python/ray/util/dask/BUILD.bazel diff --git a/python/ray/util/dask/__init__.py b/python/ray/util/dask/__init__.py index 3376b5f8eaca..f9e4ac0cb1af 100644 --- a/python/ray/util/dask/__init__.py +++ b/python/ray/util/dask/__init__.py @@ -11,19 +11,19 @@ "Please upgrade your Dask installation." 
) -from .scheduler import ( - ray_dask_get, - ray_dask_get_sync, - enable_dask_on_ray, - disable_dask_on_ray, -) from .callbacks import ( + ProgressBarCallback, RayDaskCallback, local_ray_callbacks, unpack_ray_callbacks, - ProgressBarCallback, ) from .optimizations import dataframe_optimize +from .scheduler import ( + disable_dask_on_ray, + enable_dask_on_ray, + ray_dask_get, + ray_dask_get_sync, +) dask_persist = dask.persist diff --git a/python/ray/util/dask/callbacks.py b/python/ray/util/dask/callbacks.py index 82c6ca1cf717..770d2208b504 100644 --- a/python/ray/util/dask/callbacks.py +++ b/python/ray/util/dask/callbacks.py @@ -1,6 +1,5 @@ import contextlib - -from collections import namedtuple, defaultdict +from collections import defaultdict, namedtuple from datetime import datetime from typing import Any, List, Optional diff --git a/python/ray/util/dask/common.py b/python/ray/util/dask/common.py index b041b7ff4676..47ec12d79a1b 100644 --- a/python/ray/util/dask/common.py +++ b/python/ray/util/dask/common.py @@ -1,16 +1,15 @@ +import uuid from collections import OrderedDict from collections.abc import Iterator from operator import getitem -import uuid - -import ray -from dask.core import quote -from dask.core import get as get_sync +from dask.core import get as get_sync, quote from dask.utils import apply +import ray + try: - from dataclasses import is_dataclass, fields as dataclass_fields + from dataclasses import fields as dataclass_fields, is_dataclass except ImportError: # Python < 3.7 def is_dataclass(x): diff --git a/python/ray/util/dask/optimizations.py b/python/ray/util/dask/optimizations.py index 096a6096d48f..e88416774c6d 100644 --- a/python/ray/util/dask/optimizations.py +++ b/python/ray/util/dask/optimizations.py @@ -8,9 +8,8 @@ from .scheduler import MultipleReturnFunc, multiple_return_get try: - from dask.dataframe.shuffle import SimpleShuffleLayer from dask.dataframe.optimize import optimize - from dask.dataframe.shuffle import shuffle_group + 
from dask.dataframe.shuffle import SimpleShuffleLayer, shuffle_group except ImportError: # SimpleShuffleLayer doesn't exist in this version of Dask. # This is the case for dask>=2025.1.0. diff --git a/python/ray/util/dask/scheduler.py b/python/ray/util/dask/scheduler.py index daa4449e6de3..0fa94706187f 100644 --- a/python/ray/util/dask/scheduler.py +++ b/python/ray/util/dask/scheduler.py @@ -1,10 +1,8 @@ -import warnings - import atexit import threading import time -from collections import defaultdict -from collections import OrderedDict +import warnings +from collections import OrderedDict, defaultdict from collections.abc import Mapping from dataclasses import dataclass from multiprocessing.pool import ThreadPool @@ -12,22 +10,22 @@ from typing import Optional import dask -from dask.core import istask, ishashable +from dask.core import ishashable, istask try: - from dask._task_spec import Task, Alias, DataNode, TaskRef, convert_legacy_graph + from dask._task_spec import Alias, DataNode, Task, TaskRef, convert_legacy_graph except ImportError: warnings.warn( "Dask on Ray is available only on dask>=2024.11.0, " f"you are on version {dask.__version__}." 
) from dask.system import CPU_COUNT -from dask.threaded import pack_exception, _thread_get_id +from dask.threaded import _thread_get_id, pack_exception import ray from ray.util.dask.callbacks import local_ray_callbacks, unpack_ray_callbacks from ray.util.dask.common import unpack_object_refs -from ray.util.dask.scheduler_utils import get_async, apply_sync +from ray.util.dask.scheduler_utils import apply_sync, get_async main_thread = threading.current_thread() default_pool = None diff --git a/python/ray/util/dask/scheduler_utils.py b/python/ray/util/dask/scheduler_utils.py index bb7feca4ae8b..b4c840c6b896 100644 --- a/python/ray/util/dask/scheduler_utils.py +++ b/python/ray/util/dask/scheduler_utils.py @@ -5,7 +5,7 @@ import os import warnings -from queue import Queue, Empty +from queue import Empty, Queue import dask from dask import config diff --git a/python/ray/util/dask/tests/BUILD b/python/ray/util/dask/tests/BUILD.bazel similarity index 100% rename from python/ray/util/dask/tests/BUILD rename to python/ray/util/dask/tests/BUILD.bazel diff --git a/python/ray/util/dask/tests/test_dask_callback.py b/python/ray/util/dask/tests/test_dask_callback.py index 99c59d791b33..d58c7dc3c130 100644 --- a/python/ray/util/dask/tests/test_dask_callback.py +++ b/python/ray/util/dask/tests/test_dask_callback.py @@ -1,12 +1,11 @@ import sys - import dask import pytest import ray from ray.tests.conftest import * # noqa: F403, F401 -from ray.util.dask import ray_dask_get, RayDaskCallback +from ray.util.dask import RayDaskCallback, ray_dask_get @dask.delayed diff --git a/python/ray/util/dask/tests/test_dask_multi_node.py b/python/ray/util/dask/tests/test_dask_multi_node.py index a3dc5f7effa0..44e8ace38b6e 100644 --- a/python/ray/util/dask/tests/test_dask_multi_node.py +++ b/python/ray/util/dask/tests/test_dask_multi_node.py @@ -1,10 +1,10 @@ import sys import dask -import pytest import dask.dataframe as dd import numpy as np import pandas as pd +import pytest import ray from 
ray.tests.conftest import * # noqa: F403, F401 diff --git a/python/ray/util/dask/tests/test_dask_optimization.py b/python/ray/util/dask/tests/test_dask_optimization.py index 858f0009a363..b09c62fbca39 100644 --- a/python/ray/util/dask/tests/test_dask_optimization.py +++ b/python/ray/util/dask/tests/test_dask_optimization.py @@ -1,8 +1,8 @@ import sys +from unittest import mock import dask import dask.dataframe as dd -from unittest import mock import numpy as np import pandas as pd import pytest @@ -21,9 +21,10 @@ if Version(dask.__version__) < Version("2025.1") and not DASK_EXPR_INSTALLED: from dask.dataframe.shuffle import SimpleShuffleLayer + from ray.util.dask.optimizations import ( - rewrite_simple_shuffle_layer, MultipleReturnSimpleShuffleLayer, + rewrite_simple_shuffle_layer, ) pytestmark = pytest.mark.skipif( diff --git a/python/ray/util/debug.py b/python/ray/util/debug.py index e5482c7b6d8c..29d9a3e1d497 100644 --- a/python/ray/util/debug.py +++ b/python/ray/util/debug.py @@ -1,10 +1,11 @@ -from collections import defaultdict, namedtuple import gc import os import re import time import tracemalloc +from collections import defaultdict, namedtuple from typing import Callable, List, Optional + from ray.util.annotations import DeveloperAPI _logged = set() @@ -210,8 +211,8 @@ def _take_snapshot(table, suspicious=None): def _find_memory_leaks_in_table(table): - import scipy.stats import numpy as np + import scipy.stats suspects = [] diff --git a/python/ray/util/debugpy.py b/python/ray/util/debugpy.py index 3513f2100fc2..1f5a0157f2b6 100644 --- a/python/ray/util/debugpy.py +++ b/python/ray/util/debugpy.py @@ -1,8 +1,8 @@ +import importlib import logging import os import sys import threading -import importlib import ray from ray._common.network_utils import build_address diff --git a/python/ray/util/helpers.py b/python/ray/util/helpers.py new file mode 100644 index 000000000000..bfc400f2ffe2 --- /dev/null +++ b/python/ray/util/helpers.py @@ -0,0 +1,256 @@ +from 
typing import TYPE_CHECKING, Any, Iterable, Iterator, Optional, Sequence, Union + +import ray +from ray.util.annotations import PublicAPI + +if TYPE_CHECKING: + from ray import ObjectRef + from ray.remote_function import RemoteFunction + + +# ray.wait() has a default num_returns of 1. +# Using a slightly larger batch until the optimization is fully implemented, see +# https://github.com/ray-project/ray/issues/49905 +DEFAULT_CHUNK_SIZE = 10 +DEFAULT_BACKPRESSURE_SIZE = 100 + + +def _wait_and_get_single_batch( + refs: "Sequence[ObjectRef]", + *, + chunk_size: int, + yield_obj_refs: bool = False, + **kwargs, +) -> tuple[list[Union[Any, "ObjectRef"]], "list[ObjectRef]"]: + """Call ray.wait and explicitly return the ready objects/results + and remaining Ray remote refs. + + Args: + refs: A list of Ray object refs. + chunk_size: The `num_returns` parameter to pass to `ray.wait()`. + yield_obj_refs: If True, return Ray remote refs instead of results (by calling :meth:`~ray.get`). + **kwargs: Additional keyword arguments to pass to `ray.wait()`. + + Returns: + A tuple of two lists, ready and not ready. This is the same as the return value of `ray.wait()`. + """ + + if chunk_size < 1: + raise ValueError("`chunk_size` must be >= 1") + + kwargs = kwargs or {} + + # num_returns must be <= len(refs) + ready, refs = ray.wait( + refs, + num_returns=min(chunk_size, len(refs)), + **kwargs, + ) + + if not yield_obj_refs: + return ray.get(ready), refs + + return ready, refs + + +@PublicAPI(stability="alpha") +def as_completed( + refs: "Sequence[ObjectRef]", + *, + chunk_size: int = DEFAULT_CHUNK_SIZE, + yield_obj_refs: bool = False, + **kwargs, +) -> Iterator[Union[Any, "ObjectRef"]]: + """Given a list of Ray task references, yield results as they become available. + + Unlike calling :meth:`~ray.get` on a list of references (i.e., `ray.get(refs)`) which + waits for all results to be ready, this function begins to yield result as soon as + a batch of `chunk_size` results are ready. 
+ + .. note:: + Generally there is no guarantee on the order of results. For example, the first result + is not necessarily the first one completed, but rather the first one submitted in the + first available batch (See :meth:`~ray.wait` for more details about + preservation of submission order). + + .. note:: + Use this function instead of calling :meth:`~ray.get` inside a for loop. See + https://docs.ray.io/en/latest/ray-core/patterns/ray-get-loop.html for more details. + + Example: + Suppose we have a function that sleeps for x seconds depending on the input. + We expect to obtain a partially sorted list of results. + + .. testcode:: python + import ray + import time + + @ray.remote + def f(x): + time.sleep(x) + return x + + refs = [f.remote(i) for i in [10, 4, 6, 8, 2]] + for x in ray.util.as_completed(refs, chunk_size=2): + print(x) + + .. testoutput:: + :options: +MOCK + + # Output: + 4 + 2 + 6 + 8 + 10 + + Args: + refs: A list of Ray object refs. + chunk_size: The number of tasks to wait for in each iteration (default 10). + The parameter is passed as `num_returns` to :meth:`~ray.wait` internally. + yield_obj_refs: If True, return Ray remote refs instead of results (by calling :meth:`~ray.get`). + **kwargs: Additional keyword arguments to pass to :meth:`~ray.wait`, e.g., + `timeout` and `fetch_local`. + + Yields: + Union[Any, ObjectRef]: The results (or optionally their Ray references) of the Ray tasks as they complete. 
+ """ + if chunk_size < 1: + raise ValueError("`chunk_size` must be >= 1") + + if "num_returns" in kwargs: + raise ValueError("Use the `chunksize` argument instead of `num_returns`.") + + while refs: + results, refs = _wait_and_get_single_batch( + refs, + chunk_size=chunk_size, + yield_obj_refs=yield_obj_refs, + **kwargs, + ) + yield from results + + +@PublicAPI(stability="alpha") +def map_unordered( + fn: "RemoteFunction", + items: Iterable[Any], + *, + backpressure_size: Optional[int] = DEFAULT_BACKPRESSURE_SIZE, + chunk_size: int = DEFAULT_CHUNK_SIZE, + yield_obj_refs: bool = False, + **kwargs, +) -> Iterator[Union[Any, "ObjectRef"]]: + """Apply a Ray remote function to a list of items and return an iterator that yields + the completed results as they become available. + + This helper function applies backpressure to control the number of pending tasks, following the + design pattern described in + https://docs.ray.io/en/latest/ray-core/patterns/limit-pending-tasks.html. + + .. note:: + There is generally no guarantee on the order of results. + + Example: + Suppose we have a function that sleeps for x seconds depending on the input. + We expect to obtain a partially sorted list of results. + + .. testcode:: python + + import ray + import time + + @ray.remote + def f(x): + time.sleep(x) + return x + + # Example 1: chunk_size=2 + for x in ray.util.map_unordered(f, [10, 4, 6, 8, 2], chunk_size=2): + print(x) + + .. testoutput:: + :options: +MOCK + + 4 + 2 + 6 + 8 + 10 + + .. testcode:: python + + # Example 2: backpressure_size=2, chunk_size=1 + for x in ray.util.map_unordered(f, [10, 4, 6, 8, 2], backpressure_size=2, chunk_size=1): + print(x) + + .. testoutput:: + :options: +MOCK + + 4 + 10 + 6 + 8 + 2 + + Args: + fn: A remote function to apply to the list of items. For more complex use cases, use Ray Data's + :meth:`~ray.data.Dataset.map` / :meth:`~ray.data.Dataset.map_batches` instead. + items: An iterable of items to apply the function to. 
+ backpressure_size: Maximum number of in-flight tasks allowed before + calling a blocking :meth:`~ray.wait` (default 100). If None, no backpressure is applied. + chunk_size: The number of tasks to wait for when the number of in-flight tasks exceeds + `backpressure_size`. The parameter is passed as `num_returns` to :meth:`~ray.wait` internally. + yield_obj_refs: If True, return Ray remote refs instead of results (by calling :meth:`~ray.get`). + **kwargs: Additional keyword arguments to pass to :meth:`~ray.wait`, e.g., + `timeout` and `fetch_local`. + + Yields: + Union[Any, ObjectRef]: The results (or optionally their Ray references) of the Ray tasks as they complete. + + .. seealso:: + + :meth:`~ray.util.as_completed` + Call this method for an existing list of Ray object refs. + + :meth:`~ray.data.Dataset.map` + Use Ray Data APIs (e.g., :meth:`~ray.data.Dataset.map` and :meth:`~ray.data.Dataset.map_batches`) + for better control and complex use cases, e.g., functions with multiple arguments. + + .. note:: + + This is an alternative to `pool.imap_unordered()` in Ray's Actor-based `multiprocessing.Pool`. + See https://docs.ray.io/en/latest/ray-more-libs/multiprocessing.html for more details. 
+ + """ + + if backpressure_size is None: + backpressure_size: float = float("inf") + elif backpressure_size <= 0: + raise ValueError("backpressure_size must be positive.") + + if chunk_size < 1: + raise ValueError("`chunk_size` must be >= 1") + + if "num_returns" in kwargs: + raise ValueError("Use the `chunk_size` argument instead of `num_returns`.") + + refs = [] + for item in items: + refs.append(fn.remote(item)) + + if len(refs) >= backpressure_size: + results, refs = _wait_and_get_single_batch( + refs, + chunk_size=chunk_size, + yield_obj_refs=yield_obj_refs, + **kwargs, + ) + yield from results + else: + yield from as_completed( + refs, + chunk_size=chunk_size, + yield_obj_refs=yield_obj_refs, + **kwargs, + ) diff --git a/python/ray/util/metrics.py b/python/ray/util/metrics.py index ec1988044b65..7942f7f279c6 100644 --- a/python/ray/util/metrics.py +++ b/python/ray/util/metrics.py @@ -1,15 +1,14 @@ import logging +import os import re import warnings -import os - -from typing import Dict, Any, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union from ray._raylet import ( Count as CythonCount, - Sum as CythonSum, - Histogram as CythonHistogram, Gauge as CythonGauge, + Histogram as CythonHistogram, + Sum as CythonSum, ) # noqa: E402 # Sum is used for CythonCount because it allows incrementing by positive @@ -191,7 +190,7 @@ def __init__( if self._discard_metric: self._metric = None else: - if os.environ.get("RAY_experimental_enable_open_telemetry_on_core") == "1": + if os.environ.get("RAY_enable_open_telemetry") == "1": """ For the new opentelemetry implementation, we'll correctly use Counter rather than Sum. 
diff --git a/python/ray/util/multiprocessing/__init__.py b/python/ray/util/multiprocessing/__init__.py index 5b390439f5e1..75c07d911814 100644 --- a/python/ray/util/multiprocessing/__init__.py +++ b/python/ray/util/multiprocessing/__init__.py @@ -1,4 +1,4 @@ -from multiprocessing import TimeoutError, JoinableQueue +from multiprocessing import JoinableQueue, TimeoutError from .pool import Pool diff --git a/python/ray/util/placement_group.py b/python/ray/util/placement_group.py index 02c45f380484..d2e29b81c536 100644 --- a/python/ray/util/placement_group.py +++ b/python/ray/util/placement_group.py @@ -1,15 +1,15 @@ import warnings from typing import Dict, List, Optional, Union -from ray._common.utils import hex_to_binary, PLACEMENT_GROUP_BUNDLE_RESOURCE_NAME import ray +from ray._common.utils import PLACEMENT_GROUP_BUNDLE_RESOURCE_NAME, hex_to_binary from ray._private.auto_init_hook import auto_init_ray from ray._private.client_mode_hook import client_mode_should_convert, client_mode_wrap +from ray._private.label_utils import validate_label_selector from ray._private.utils import get_ray_doc_version from ray._raylet import PlacementGroupID from ray.util.annotations import DeveloperAPI, PublicAPI from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy -from ray._private.label_utils import validate_label_selector bundle_reservation_check = None @@ -148,7 +148,6 @@ def placement_group( strategy: str = "PACK", name: str = "", lifetime: Optional[str] = None, - _max_cpu_fraction_per_node: float = 1.0, _soft_target_node_id: Optional[str] = None, bundle_label_selector: List[Dict[str, str]] = None, ) -> PlacementGroup: @@ -170,14 +169,6 @@ def placement_group( will fate share with its creator and will be deleted once its creator is dead, or "detached", which means the placement group will live as a global object independent of the creator. 
- _max_cpu_fraction_per_node: (Experimental) Disallow placing bundles on nodes - if it would cause the fraction of CPUs used by bundles from *any* placement - group on the node to exceed this fraction. This effectively sets aside - CPUs that placement groups cannot occupy on nodes. when - `max_cpu_fraction_per_node < 1.0`, at least 1 CPU will be excluded from - placement group scheduling. Note: This feature is experimental and is not - recommended for use with autoscaling clusters (scale-up will not trigger - properly). _soft_target_node_id: (Private, Experimental) Soft hint where bundles of this placement group should be placed. The target node is specified by it's hex ID. @@ -202,7 +193,6 @@ def placement_group( bundles=bundles, strategy=strategy, lifetime=lifetime, - _max_cpu_fraction_per_node=_max_cpu_fraction_per_node, _soft_target_node_id=_soft_target_node_id, bundle_label_selector=bundle_label_selector, ) @@ -220,7 +210,6 @@ def placement_group( bundles, strategy, detached, - _max_cpu_fraction_per_node, _soft_target_node_id, bundle_label_selector, ) @@ -353,7 +342,6 @@ def validate_placement_group( bundles: List[Dict[str, float]], strategy: str = "PACK", lifetime: Optional[str] = None, - _max_cpu_fraction_per_node: float = 1.0, _soft_target_node_id: Optional[str] = None, bundle_label_selector: List[Dict[str, str]] = None, ) -> bool: @@ -361,22 +349,6 @@ def validate_placement_group( Raises ValueError if inputs are invalid. """ - - assert _max_cpu_fraction_per_node is not None - - if _max_cpu_fraction_per_node != 1.0: - warnings.warn( - "The experimental '_max_cpu_fraction_per_node' option for placement groups " - "is deprecated and will be removed in a future version of Ray." - ) - - if _max_cpu_fraction_per_node <= 0 or _max_cpu_fraction_per_node > 1: - raise ValueError( - "Invalid argument `_max_cpu_fraction_per_node`: " - f"{_max_cpu_fraction_per_node}. " - "_max_cpu_fraction_per_node must be a float between 0 and 1. 
" - ) - if _soft_target_node_id and strategy != "STRICT_PACK": raise ValueError( "_soft_target_node_id currently only works " diff --git a/python/ray/util/queue.py b/python/ray/util/queue.py index 8bd205f972c9..b18075c801ac 100644 --- a/python/ray/util/queue.py +++ b/python/ray/util/queue.py @@ -1,7 +1,7 @@ import asyncio import queue -from typing import Optional, Any, List, Dict from collections.abc import Iterable +from typing import Any, Dict, List, Optional import ray from ray.util.annotations import PublicAPI diff --git a/python/ray/util/rpdb.py b/python/ray/util/rpdb.py index ae102c96120b..865980ffb2ea 100644 --- a/python/ray/util/rpdb.py +++ b/python/ray/util/rpdb.py @@ -3,7 +3,6 @@ # (BSD 2-Clause "Simplified" License) import errno -from ray._common.network_utils import build_address import inspect import json import logging @@ -19,6 +18,7 @@ from typing import Callable import ray +from ray._common.network_utils import build_address from ray._private import ray_constants from ray.experimental.internal_kv import _internal_kv_del, _internal_kv_put from ray.util.annotations import DeveloperAPI diff --git a/python/ray/util/scheduling_strategies.py b/python/ray/util/scheduling_strategies.py index b9953094a9c1..6f86622a8be3 100644 --- a/python/ray/util/scheduling_strategies.py +++ b/python/ray/util/scheduling_strategies.py @@ -1,4 +1,5 @@ -from typing import Dict, Union, Optional, TYPE_CHECKING +from typing import TYPE_CHECKING, Dict, Optional, Union + from ray.util.annotations import PublicAPI if TYPE_CHECKING: diff --git a/python/ray/util/spark/__init__.py b/python/ray/util/spark/__init__.py index edded13240a1..69d68172eb19 100644 --- a/python/ray/util/spark/__init__.py +++ b/python/ray/util/spark/__init__.py @@ -1,8 +1,8 @@ from ray.util.spark.cluster_init import ( - setup_ray_cluster, - shutdown_ray_cluster, MAX_NUM_WORKER_NODES, setup_global_ray_cluster, + setup_ray_cluster, + shutdown_ray_cluster, ) __all__ = [ diff --git 
a/python/ray/util/spark/cluster_init.py b/python/ray/util/spark/cluster_init.py index 27649f278a29..fabbd51a9f76 100644 --- a/python/ray/util/spark/cluster_init.py +++ b/python/ray/util/spark/cluster_init.py @@ -1,49 +1,47 @@ import copy -import signal - -import yaml import json +import logging import os +import signal import socket import sys -import time import threading -import logging +import time import uuid import warnings +from threading import Event +from typing import Dict, Optional, Tuple, Type + import requests +import yaml from packaging.version import Version -from typing import Optional, Dict, Tuple, Type import ray import ray._private.services -from ray.autoscaler._private.spark.node_provider import HEAD_NODE_ID -from ray.util.annotations import DeveloperAPI, PublicAPI -from ray._common.utils import load_class -from ray._common.network_utils import build_address, parse_address - +from .databricks_hook import DefaultDatabricksRayOnSparkStartHook +from .start_hook_base import RayOnSparkStartHook from .utils import ( + _get_cpu_cores, + _get_local_ray_node_slots, + _get_num_physical_gpus, + _wait_service_up, + calc_mem_ray_head_node, exec_cmd, - is_port_in_use, + gen_cmd_exec_failure_msg, + get_avail_mem_per_ray_worker_node, + get_configured_spark_executor_memory_bytes, + get_max_num_concurrent_tasks, get_random_unused_port, - get_spark_session, get_spark_application_driver_host, - is_in_databricks_runtime, + get_spark_session, get_spark_task_assigned_physical_gpus, - get_avail_mem_per_ray_worker_node, - get_max_num_concurrent_tasks, - gen_cmd_exec_failure_msg, - calc_mem_ray_head_node, - _wait_service_up, - _get_local_ray_node_slots, - get_configured_spark_executor_memory_bytes, - _get_cpu_cores, - _get_num_physical_gpus, + is_in_databricks_runtime, + is_port_in_use, ) -from .start_hook_base import RayOnSparkStartHook -from .databricks_hook import DefaultDatabricksRayOnSparkStartHook -from threading import Event - +from ray._common.network_utils import 
build_address, parse_address +from ray._common.utils import load_class +from ray.autoscaler._private.spark.node_provider import HEAD_NODE_ID +from ray.util.annotations import DeveloperAPI, PublicAPI _logger = logging.getLogger("ray.util.spark") _logger.setLevel(logging.INFO) @@ -318,9 +316,10 @@ def _preallocate_ray_worker_port_range(): Returns: Allocated port range for current worker ports """ - import psutil import fcntl + import psutil + def acquire_lock(file_path): mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC try: diff --git a/python/ray/util/spark/databricks_hook.py b/python/ray/util/spark/databricks_hook.py index 8558c309f398..491f35c1419f 100644 --- a/python/ray/util/spark/databricks_hook.py +++ b/python/ray/util/spark/databricks_hook.py @@ -1,10 +1,10 @@ +import logging import os +import threading +import time from .start_hook_base import RayOnSparkStartHook from .utils import get_spark_session -import logging -import threading -import time _logger = logging.getLogger(__name__) diff --git a/python/ray/util/spark/start_ray_node.py b/python/ray/util/spark/start_ray_node.py index 76489b15b9e5..e03c99d74e55 100644 --- a/python/ray/util/spark/start_ray_node.py +++ b/python/ray/util/spark/start_ray_node.py @@ -1,20 +1,19 @@ +import fcntl +import logging import os.path -import subprocess -import sys -import time import shutil -import fcntl import signal import socket -import logging +import subprocess +import sys import threading +import time +from ray._private.ray_process_reaper import SIGTERM_GRACE_PERIOD_SECONDS from ray.util.spark.cluster_init import ( RAY_ON_SPARK_COLLECT_LOG_TO_PATH, RAY_ON_SPARK_START_RAY_PARENT_PID, ) -from ray._private.ray_process_reaper import SIGTERM_GRACE_PERIOD_SECONDS - # Spark on ray implementation does not directly invoke `ray start ...` script to create # ray node subprocess, instead, it creates a subprocess to run this diff --git a/python/ray/util/spark/utils.py b/python/ray/util/spark/utils.py index 65bfa4a52f2b..9aa3465881fe 
100644 --- a/python/ray/util/spark/utils.py +++ b/python/ray/util/spark/utils.py @@ -1,14 +1,13 @@ -import subprocess -import os -import sys -import random -import threading import collections import logging +import os +import random import shutil +import subprocess +import sys +import threading import time - _logger = logging.getLogger("ray.util.spark.utils") @@ -199,9 +198,10 @@ def _get_spark_worker_total_shared_memory(): def calc_mem_ray_head_node(configured_heap_memory_bytes, configured_object_store_bytes): - import psutil import shutil + import psutil + if RAY_ON_SPARK_DRIVER_PHYSICAL_MEMORY_BYTES in os.environ: available_physical_mem = int( os.environ[RAY_ON_SPARK_DRIVER_PHYSICAL_MEMORY_BYTES] diff --git a/python/ray/util/state/__init__.py b/python/ray/util/state/__init__.py index d74f9b650df3..b8bb885e5408 100644 --- a/python/ray/util/state/__init__.py +++ b/python/ray/util/state/__init__.py @@ -1,29 +1,28 @@ from ray.util.state.api import ( + StateApiClient, get_actor, + get_job, get_log, get_node, get_objects, get_placement_group, get_task, get_worker, - get_job, list_actors, + list_cluster_events, list_jobs, + list_logs, list_nodes, + list_objects, list_placement_groups, + list_runtime_envs, list_tasks, list_workers, - list_objects, - list_runtime_envs, - list_logs, - list_cluster_events, summarize_actors, summarize_objects, summarize_tasks, - StateApiClient, ) - __all__ = [ "get_actor", "get_log", diff --git a/python/ray/util/state/common.py b/python/ray/util/state/common.py index 18fb6eeafc75..9e4e7000eec3 100644 --- a/python/ray/util/state/common.py +++ b/python/ray/util/state/common.py @@ -9,9 +9,13 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union import ray.dashboard.utils as dashboard_utils -from ray._private.ray_constants import env_integer -from ray.core.generated.common_pb2 import TaskStatus, TaskType -from ray.core.generated.gcs_pb2 import TaskEvents + +# TODO(aguo): Instead of a version check, modify the below models +# to 
use pydantic BaseModel instead of dataclass. +# In pydantic 2, dataclass no longer needs the `init=True` kwarg to +# generate an __init__ method. Additionally, it will raise an error if +# it detects `init=True` to be set. +from ray._common.pydantic_compat import IS_PYDANTIC_2 from ray._private.custom_types import ( TypeActorStatus, TypeNodeStatus, @@ -22,15 +26,11 @@ TypeWorkerExitType, TypeWorkerType, ) -from ray.util.state.exception import RayStateApiException +from ray._private.ray_constants import env_integer +from ray.core.generated.common_pb2 import TaskStatus, TaskType +from ray.core.generated.gcs_pb2 import TaskEvents from ray.dashboard.modules.job.pydantic_models import JobDetails - -# TODO(aguo): Instead of a version check, modify the below models -# to use pydantic BaseModel instead of dataclass. -# In pydantic 2, dataclass no longer needs the `init=True` kwarg to -# generate an __init__ method. Additionally, it will raise an error if -# it detects `init=True` to be set. -from ray._common.pydantic_compat import IS_PYDANTIC_2 +from ray.util.state.exception import RayStateApiException try: from pydantic.dataclasses import dataclass diff --git a/python/ray/util/state/state_cli.py b/python/ray/util/state/state_cli.py index b76544d8d316..16ab4f34e2ea 100644 --- a/python/ray/util/state/state_cli.py +++ b/python/ray/util/state/state_cli.py @@ -8,8 +8,9 @@ import yaml import ray._private.services as services -from ray._private.thirdparty.tabulate.tabulate import tabulate from ray._common.network_utils import parse_address +from ray._private.thirdparty.tabulate.tabulate import tabulate +from ray.util.annotations import PublicAPI from ray.util.state import ( StateApiClient, get_log, @@ -31,7 +32,6 @@ resource_to_schema, ) from ray.util.state.exception import RayStateApiException -from ray.util.annotations import PublicAPI logger = logging.getLogger(__name__) diff --git a/python/ray/util/state/state_manager.py b/python/ray/util/state/state_manager.py index 
16c570865007..b22ba784e8c2 100644 --- a/python/ray/util/state/state_manager.py +++ b/python/ray/util/state/state_manager.py @@ -1,20 +1,21 @@ import dataclasses import inspect +import json import logging from functools import wraps from typing import List, Optional, Tuple -import json import aiohttp import grpc from grpc.aio._call import UnaryStreamCall import ray -import ray.dashboard.modules.log.log_consts as log_consts import ray.dashboard.consts as dashboard_consts -from ray._private import ray_constants +import ray.dashboard.modules.log.log_consts as log_consts +from ray._common.network_utils import build_address from ray._common.utils import hex_to_binary -from ray._raylet import GcsClient, ActorID, JobID, TaskID, NodeID +from ray._private import ray_constants +from ray._raylet import ActorID, GcsClient, JobID, NodeID, TaskID from ray.core.generated import gcs_service_pb2_grpc from ray.core.generated.gcs_pb2 import ActorTableData, GcsNodeInfo from ray.core.generated.gcs_service_pb2 import ( @@ -54,7 +55,6 @@ SupportedFilterType, ) from ray.util.state.exception import DataSourceUnavailable -from ray._common.network_utils import build_address logger = logging.getLogger(__name__) diff --git a/python/ray/util/state/util.py b/python/ray/util/state/util.py index dd62076bc1dc..77894289d9fb 100644 --- a/python/ray/util/state/util.py +++ b/python/ray/util/state/util.py @@ -49,6 +49,7 @@ def convert_string_to_type( def record_deprecated_state_api_import(): import warnings + from ray._common.usage.usage_lib import TagKey, record_extra_usage_tag warnings.warn( diff --git a/python/ray/util/tracing/setup_local_tmp_tracing.py b/python/ray/util/tracing/setup_local_tmp_tracing.py index f53579a9d9c6..94523ea8bff7 100644 --- a/python/ray/util/tracing/setup_local_tmp_tracing.py +++ b/python/ray/util/tracing/setup_local_tmp_tracing.py @@ -1,4 +1,5 @@ import os + from opentelemetry import trace from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export 
import ( diff --git a/python/ray/util/tracing/setup_tempo_tracing.py b/python/ray/util/tracing/setup_tempo_tracing.py index 12e310c612a0..e2bb3102b09d 100644 --- a/python/ray/util/tracing/setup_tempo_tracing.py +++ b/python/ray/util/tracing/setup_tempo_tracing.py @@ -1,9 +1,9 @@ # This file is intended for examples exporting traces to a local OTLP listener from opentelemetry import trace -from opentelemetry.sdk.trace import TracerProvider from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( OTLPSpanExporter, ) # noqa +from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import ( ConsoleSpanExporter, SimpleSpanProcessor, diff --git a/python/requirements.txt b/python/requirements.txt index 709e744bb0da..ac13f72f2d67 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -13,7 +13,7 @@ filelock jsonschema msgpack<2.0.0,>=1.0.0 packaging -protobuf!=3.19.5,>=3.15.3 +protobuf>=3.20.3 pyyaml requests watchfiles @@ -41,7 +41,7 @@ opentelemetry-api opentelemetry-exporter-prometheus opentelemetry-proto fastapi -gymnasium==1.0.0 +gymnasium==1.1.1 virtualenv!=20.21.1,>=20.0.24 opencensus aiohttp_cors @@ -61,3 +61,4 @@ py-spy>=0.2.0; python_version < '3.12' py-spy>=0.4.0; python_version >= '3.12' memray; sys_platform != "win32" # memray is not supported on Windows pyOpenSSL +celery diff --git a/python/requirements/cloud-requirements.txt b/python/requirements/cloud-requirements.txt index 2865270b4bb2..755a9bf44428 100644 --- a/python/requirements/cloud-requirements.txt +++ b/python/requirements/cloud-requirements.txt @@ -9,10 +9,11 @@ certifi pycurl azure-identity smart_open[s3,gcs,azure,http] +adlfs[abfs] # Anyscale CLI requirements -boto3>=1.26.76 -botocore>=1.19.52 +boto3==1.29.7 +botocore==1.32.7 aiohttp>=3.7.4.post0 certifi>=2024.8.30 Click>=7.0 diff --git a/python/requirements/llm/llm-requirements.txt b/python/requirements/llm/llm-requirements.txt index 9f2815ec0951..dde9a0e38905 100644 --- 
a/python/requirements/llm/llm-requirements.txt +++ b/python/requirements/llm/llm-requirements.txt @@ -2,7 +2,7 @@ # constraining to a maximum version (i.e. <=) to temporarily work around a bug. # Those pins for the sake of workarounds should not be advertised as constraints # on future releases in setup.py. -vllm>=0.10.0 +vllm>=0.10.1.1 # For json mode jsonref>=1.1.0 jsonschema @@ -13,7 +13,6 @@ typer meson pybind11 hf_transfer -transformers<4.54.0 # Due to https://github.com/vllm-project/vllm-ascend/issues/2046 # nixl version Needs to be in sync with the one in ray-llm/Dockerfile -nixl==0.3.1 +nixl==0.4.1 diff --git a/python/requirements/ml/core-requirements.txt b/python/requirements/ml/core-requirements.txt index 7f0b2caed3b0..2d6948ccef11 100644 --- a/python/requirements/ml/core-requirements.txt +++ b/python/requirements/ml/core-requirements.txt @@ -12,4 +12,4 @@ transformers==4.36.2 accelerate==0.28.0 # Cloud storage tools -s3fs==2023.5.0 +s3fs==2023.12.1 diff --git a/python/requirements/ml/data-requirements.txt b/python/requirements/ml/data-requirements.txt index 14ec80e7d00c..931dea6f9143 100644 --- a/python/requirements/ml/data-requirements.txt +++ b/python/requirements/ml/data-requirements.txt @@ -6,7 +6,7 @@ dask[complete]==2023.6.1; python_version < '3.12' distributed==2023.6.1; python_version < '3.12' dask[complete]==2025.5.0; python_version >= '3.12' distributed==2025.5.0; python_version >= '3.12' -aioboto3==11.2.0 +aioboto3==12.1.0 crc32c==2.3 flask_cors bokeh==2.4.3; python_version < '3.12' diff --git a/python/requirements/ml/dl-gpu-requirements.txt b/python/requirements/ml/dl-gpu-requirements.txt index 59d6b0fb10df..ab46a6df8157 100644 --- a/python/requirements/ml/dl-gpu-requirements.txt +++ b/python/requirements/ml/dl-gpu-requirements.txt @@ -16,3 +16,4 @@ torch-cluster==1.6.3+pt23cu121 torch-spline-conv==1.2.2+pt23cu121 cupy-cuda12x==13.1.0; sys_platform != 'darwin' +nixl==0.4.0; sys_platform != 'darwin' diff --git 
a/python/requirements/test-requirements.txt b/python/requirements/test-requirements.txt index 7932603cab7a..7fa619651a3f 100644 --- a/python/requirements/test-requirements.txt +++ b/python/requirements/test-requirements.txt @@ -10,7 +10,7 @@ azure-mgmt-network==25.4.0 azure-mgmt-resource==23.1.1 msrestazure==0.6.4 beautifulsoup4==4.11.1 -boto3==1.26.76 +boto3==1.29.7 # Todo: investigate if we can get rid of this and exchange for ray.cloudpickle cloudpickle==2.2.0 ; python_version < "3.12" cloudpickle==3.0.0 ; python_version >= "3.12" @@ -64,6 +64,7 @@ smart_open[s3]==6.2.0 tqdm==4.67.1 trustme==0.9.0 testfixtures==7.0.0 +uv==0.8.9 uvicorn==0.22.0 vsphere-automation-sdk @ git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.1.0 werkzeug==2.3.8 @@ -85,7 +86,7 @@ pytest-docker-tools==3.1.3 pytest-forked==1.4.0 # For dataset tests -polars>=1.30.0,<2.0.0 +polars>=1.32.3,<2.0.0 importlib-metadata==6.11.0 diff --git a/python/requirements_compiled.txt b/python/requirements_compiled.txt index b43938ccf1d0..4b530883bb8a 100644 --- a/python/requirements_compiled.txt +++ b/python/requirements_compiled.txt @@ -24,6 +24,8 @@ adagio==0.2.4 # qpd adal==1.2.7 # via msrestazure +adlfs==2023.8.0 + # via -r python/requirements/cloud-requirements.txt aim==3.23.0 ; python_version < "3.12" # via -r python/requirements/ml/tune-test-requirements.txt aim-ui==3.23.0 @@ -32,9 +34,9 @@ aimrecords==0.0.7 # via aim aimrocks==0.5.2 # via aim -aioboto3==11.2.0 +aioboto3==12.1.0 # via -r python/requirements/ml/data-requirements.txt -aiobotocore==2.5.0 +aiobotocore==2.8.0 # via # aioboto3 # s3fs @@ -50,6 +52,7 @@ aiohttp==3.11.16 # -r python/requirements.txt # -r python/requirements/cloud-requirements.txt # -r python/requirements/test-requirements.txt + # adlfs # aiobotocore # aiohttp-cors # delta-sharing @@ -79,6 +82,8 @@ alembic==1.12.1 # optuna altair==5.1.2 # via gradio +amqp==5.3.1 + # via kombu annotated-types==0.6.0 # via pydantic antlr4-python3-runtime==4.11.1 @@ -158,15 
+163,19 @@ azure-common==1.1.28 # smart-open azure-core==1.29.5 # via + # adlfs # azure-identity # azure-mgmt-core # azure-storage-blob # msrest # smart-open +azure-datalake-store==0.0.53 + # via adlfs azure-identity==1.17.1 # via # -r python/requirements/cloud-requirements.txt # -r python/requirements/test-requirements.txt + # adlfs azure-mgmt-compute==31.0.0 # via -r python/requirements/test-requirements.txt azure-mgmt-core==1.4.0 @@ -180,7 +189,9 @@ azure-mgmt-network==25.4.0 azure-mgmt-resource==23.1.1 # via -r python/requirements/test-requirements.txt azure-storage-blob==12.22.0 - # via smart-open + # via + # adlfs + # smart-open babel==2.13.1 # via # jupyterlab-server @@ -203,6 +214,8 @@ beautifulsoup4==4.11.1 # via # -r python/requirements/test-requirements.txt # nbconvert +billiard==4.2.1 + # via celery black==22.10.0 # via -r python/requirements/lint-requirements.txt bleach==6.1.0 @@ -218,7 +231,7 @@ boltons==21.0.0 # semgrep boto==2.49.0 # via gcs-oauth2-boto-plugin -boto3==1.26.76 +boto3==1.29.7 # via # -r python/requirements/cloud-requirements.txt # -r python/requirements/test-requirements.txt @@ -228,7 +241,7 @@ boto3==1.26.76 # moto # smart-open # snowflake-connector-python -botocore==1.29.76 +botocore==1.32.7 # via # -r python/requirements/cloud-requirements.txt # aiobotocore @@ -249,6 +262,8 @@ cachetools==5.5.2 # google-auth # mlflow-skinny # pyiceberg +celery==5.5.3 + # via -r python/requirements.txt certifi==2025.1.31 # via # -r python/requirements/cloud-requirements.txt @@ -264,6 +279,7 @@ certifi==2025.1.31 cffi==1.16.0 # via # argon2-cffi-bindings + # azure-datalake-store # cryptography # pymunk # pynacl @@ -285,7 +301,11 @@ click==8.1.7 # -r python/requirements/cloud-requirements.txt # aim # black + # celery + # click-didyoumean # click-option-group + # click-plugins + # click-repl # dask # distributed # flask @@ -298,8 +318,14 @@ click==8.1.7 # typer # uvicorn # wandb +click-didyoumean==0.3.1 + # via celery click-option-group==0.5.6 # via 
semgrep +click-plugins==1.1.1.2 + # via celery +click-repl==0.3.0 + # via celery clickhouse-connect==0.8.10 # via -r python/requirements/ml/data-test-requirements.txt cloudpickle==2.2.0 ; python_version < "3.12" @@ -547,9 +573,10 @@ frozenlist==1.4.1 # aiosignal fs==2.4.16 # via triad -fsspec==2023.5.0 +fsspec==2023.12.1 # via # -r python/requirements.txt + # adlfs # dask # datasets # delta-sharing @@ -676,7 +703,7 @@ graphviz==0.20.3 # via -r python/requirements/test-requirements.txt greenlet==3.0.1 # via sqlalchemy -grpcio==1.66.2 ; python_version >= "3.10" +grpcio==1.74.0 ; python_version >= "3.10" # via # -r python/requirements.txt # -r python/requirements/cloud-requirements.txt @@ -695,7 +722,7 @@ gsutil==5.27 # via -r python/requirements/docker/ray-docker-requirements.txt gunicorn==20.1.0 # via mlflow -gymnasium==1.0.0 +gymnasium==1.1.1 # via # -r python/requirements.txt # minigrid @@ -963,6 +990,8 @@ kiwisolver==1.4.5 # via matplotlib knack==0.11.0 # via azure-cli-core +kombu==5.5.4 + # via celery kubernetes==24.2.0 # via -r python/requirements/test-requirements.txt labmaze==1.0.6 @@ -1078,6 +1107,7 @@ mpmath==1.3.0 msal==1.28.1 # via # azure-cli-core + # azure-datalake-store # azure-identity # msal-extensions msal-extensions==1.2.0b1 @@ -1332,7 +1362,7 @@ opt-einsum==3.3.0 # tensorflow optuna==4.1.0 # via -r python/requirements/ml/tune-requirements.txt -orjson==3.9.10 +orjson==3.9.15 # via # gradio # tensordict @@ -1363,6 +1393,7 @@ packaging==23.0 # jupyterlab-server # jupytext # knack + # kombu # lazy-loader # lightning-utilities # matplotlib @@ -1466,7 +1497,7 @@ plotly==5.23.0 # via ax-platform pluggy==1.3.0 # via pytest -polars==1.30.0 +polars==1.32.3 # via -r python/requirements/test-requirements.txt portalocker==2.8.2 # via @@ -1484,7 +1515,9 @@ prometheus-client==0.19.0 promise==2.3 # via tensorflow-datasets prompt-toolkit==3.0.41 - # via ipython + # via + # click-repl + # ipython propcache==0.3.0 # via # aiohttp @@ -1586,7 +1619,7 @@ 
pycparser==2.21 # via cffi pycurl==7.45.3 # via -r python/requirements/cloud-requirements.txt -pydantic==2.10.0 +pydantic==2.11.7 # via # -r python/requirements.txt # -r python/requirements/test-requirements.txt @@ -1596,7 +1629,7 @@ pydantic==2.10.0 # gradio # mlflow-skinny # pyiceberg -pydantic-core==2.27.0 +pydantic-core==2.33.2 # via pydantic pydot==1.4.2 # via -r python/requirements/test-requirements.txt @@ -1735,6 +1768,7 @@ python-dateutil==2.8.2 # aim # arrow # botocore + # celery # freezegun # google-cloud-bigquery # graphene @@ -1839,6 +1873,7 @@ requests==2.32.3 # aim # azure-cli-core # azure-core + # azure-datalake-store # comet-ml # databricks-sdk # datasets @@ -1930,9 +1965,9 @@ ruamel-yaml==0.17.40 # yahp ruamel-yaml-clib==0.2.8 # via ruamel-yaml -s3fs==2023.5.0 +s3fs==2023.12.1 # via -r python/requirements/ml/core-requirements.txt -s3transfer==0.6.2 +s3transfer==0.8.0 # via boto3 safetensors==0.4.3 # via @@ -2381,6 +2416,11 @@ typing-extensions==4.12.2 # tensorflow # torch # typer + # typing-inspection +typing-inspection==0.4.1 + # via pydantic +tzdata==2025.2 + # via kombu tzlocal==5.3 # via -r python/requirements/cloud-requirements.txt ujson==5.10.0 @@ -2405,6 +2445,8 @@ urllib3==1.26.19 # sentry-sdk utilsforecast==0.2.0 # via statsforecast +uv==0.8.9 + # via -r python/requirements/test-requirements.txt uvicorn==0.22.0 # via # -r python/requirements.txt @@ -2433,6 +2475,11 @@ uvloop==0.21.0 # vmc-draas-client-bindings # vsphere-automation-sdk # via vsphere-automation-sdk +vine==5.1.0 + # via + # amqp + # celery + # kombu virtualenv==20.29.1 # via # -r python/requirements.txt diff --git a/python/setup.py b/python/setup.py index 7044291c4edd..60024b4b8c1f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -29,6 +29,7 @@ ROOT_DIR = os.path.dirname(__file__) BUILD_JAVA = os.getenv("RAY_INSTALL_JAVA") == "1" +BUILD_CPP = os.getenv("RAY_DISABLE_EXTRA_CPP") != "1" SKIP_BAZEL_BUILD = os.getenv("SKIP_BAZEL_BUILD") == "1" BAZEL_ARGS = 
os.getenv("BAZEL_ARGS") BAZEL_LIMIT_CPUS = os.getenv("BAZEL_LIMIT_CPUS") @@ -37,7 +38,7 @@ RUNTIME_ENV_AGENT_THIRDPARTY_SUBDIR = os.path.join( "ray", "_private", "runtime_env", "agent", "thirdparty_files" ) - +DEPS_ONLY_VERSION = "100.0.0-dev" # In automated builds, we do a few adjustments before building. For instance, # the bazel environment is set up slightly differently, and symlinks are # replaced with junctions in Windows. This variable is set in our conda-forge @@ -70,6 +71,7 @@ class BuildType(Enum): DEBUG = 2 ASAN = 3 TSAN = 4 + DEPS_ONLY = 5 class SetupSpec: @@ -86,6 +88,8 @@ def __init__( self.version: str = f"{version}+asan" elif build_type == BuildType.TSAN: self.version: str = f"{version}+tsan" + elif build_type == BuildType.DEPS_ONLY: + self.version: str = DEPS_ONLY_VERSION else: self.version = version self.description: str = description @@ -95,7 +99,7 @@ def __init__( self.extras: dict = {} def get_packages(self): - if self.type == SetupType.RAY: + if self.type == SetupType.RAY and self.build_type != BuildType.DEPS_ONLY: return setuptools.find_packages(exclude=("tests", "*.tests", "*.tests.*")) else: return [] @@ -108,6 +112,8 @@ def get_packages(self): BUILD_TYPE = BuildType.ASAN elif build_type == "tsan": BUILD_TYPE = BuildType.TSAN +elif build_type == "deps-only": + BUILD_TYPE = BuildType.DEPS_ONLY else: BUILD_TYPE = BuildType.DEFAULT @@ -130,7 +136,7 @@ def get_packages(self): ) RAY_EXTRA_CPP = True # Disable extra cpp for the development versions. - if "dev" in setup_spec.version or os.getenv("RAY_DISABLE_EXTRA_CPP") == "1": + if "dev" in setup_spec.version or not BUILD_CPP: RAY_EXTRA_CPP = False # Ideally, we could include these files by putting them in a @@ -299,12 +305,23 @@ def get_packages(self): ) ) + # This is required for supporting the asynchronous inference, allowing the ray serve applications to + # allow asynchronously execute their code, via the use of celery task processor. 
+ setup_spec.extras["serve-async-inference"] = list( + set( + setup_spec.extras["serve"] + + [ + "celery", + ] + ) + ) + if RAY_EXTRA_CPP: setup_spec.extras["cpp"] = ["ray-cpp==" + setup_spec.version] setup_spec.extras["rllib"] = setup_spec.extras["tune"] + [ "dm_tree", - "gymnasium==1.0.0", + "gymnasium==1.1.1", "lz4", "ormsgpack==1.7.0", "pyyaml", @@ -355,7 +372,7 @@ def get_packages(self): setup_spec.extras["llm"] = list( set( [ - "vllm>=0.10.0", + "vllm>=0.10.1.1", "jsonref>=1.1.0", "jsonschema", "ninja", @@ -383,7 +400,7 @@ def get_packages(self): "jsonschema", "msgpack >= 1.0.0, < 2.0.0", "packaging", - "protobuf >= 3.15.3, != 3.19.5", + "protobuf>=3.20.3", "pyyaml", "requests", ] @@ -623,13 +640,6 @@ def build(build_python, build_java, build_cpp): bazel_precmd_flags = [] if sys.platform == "win32": bazel_precmd_flags = ["--output_user_root=C:/tmp"] - # Using --incompatible_strict_action_env so that the build is more - # cache-able We cannot turn this on for Python tests yet, as Ray's - # Python bazel tests are not hermetic. - # - # And we put it here so that does not change behavior of - # conda-forge build. - bazel_flags.append("--incompatible_strict_action_env") bazel_targets = [] bazel_targets += ["//:gen_ray_pkg"] if build_python else [] @@ -696,12 +706,15 @@ def copy_file(target_dir, filename, rootdir): def pip_run(build_ext): - if SKIP_BAZEL_BUILD: + if SKIP_BAZEL_BUILD or setup_spec.build_type == BuildType.DEPS_ONLY: build(False, False, False) else: - build(True, BUILD_JAVA, True) + build(True, BUILD_JAVA, BUILD_CPP) if setup_spec.type == SetupType.RAY: + if setup_spec.build_type == BuildType.DEPS_ONLY: + setup_spec.files_to_include = [] + return setup_spec.files_to_include += ray_files thirdparty_dir = os.path.join(ROOT_DIR, THIRDPARTY_SUBDIR) @@ -792,7 +805,7 @@ def has_ext_modules(self): exclude_package_data={ # Empty string means "any package". 
# Therefore, exclude BUILD from every package: - "": ["BUILD"], + "": ["BUILD", "BUILD.bazel"], }, zip_safe=False, license="Apache 2.0", diff --git a/release/BUILD.bazel b/release/BUILD.bazel index a7f8320bd23a..705d230a9c32 100644 --- a/release/BUILD.bazel +++ b/release/BUILD.bazel @@ -309,6 +309,7 @@ py_library( ]) + [ "ray_release/buildkite/aws_instance_types.csv", "ray_release/schema.json", + "//doc:deployment_serve_llm_example_configs", "//doc:example_configs", "//doc/source/train/examples/pytorch/distributing-pytorch/ci:ci_yamls", ], @@ -438,6 +439,42 @@ py_test( ], ) +py_test( + name = "test_custom_byod_build", + size = "small", + srcs = ["ray_release/tests/test_custom_byod_build.py"], + data = [ + "ray_release/configs/oss_config.yaml", + ], + exec_compatible_with = ["//:hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("pytest"), + ], +) + +py_test( + name = "test_custom_byod_build_init_helper", + size = "small", + srcs = ["ray_release/tests/test_custom_byod_build_init_helper.py"], + data = [ + "ray_release/configs/oss_config.yaml", + ], + exec_compatible_with = ["//:hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("pytest"), + ], +) + py_test( name = "test_cluster_manager", size = "small", @@ -674,6 +711,20 @@ py_test( ], ) +py_test( + name = "test_template", + srcs = ["ray_release/tests/test_template.py"], + exec_compatible_with = ["//:hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("pytest"), + ], +) + py_binary( name = "build_pipeline", srcs = ["ray_release/scripts/build_pipeline.py"], @@ -688,3 +739,21 @@ py_binary( ":ray_release", ], ) + +py_binary( + name = "custom_byod_build", + srcs = ["ray_release/scripts/custom_byod_build.py"], + exec_compatible_with = ["//:hermetic_python"], + deps = [ + ":ray_release", + ], +) + +py_binary( + name = "custom_byod_build_init", + srcs = 
["ray_release/scripts/custom_byod_build_init.py"], + exec_compatible_with = ["//:hermetic_python"], + deps = [ + ":ray_release", + ], +) diff --git a/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml b/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml index 150990710680..c9e54e107da7 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml @@ -5,7 +5,7 @@ max_workers: 0 head_node_type: name: head_node - instance_type: g3.8xlarge + instance_type: g4dn.8xlarge worker_node_types: [] diff --git a/release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml b/release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml index 20791f9e4d9d..c81d613a6f89 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml @@ -5,11 +5,11 @@ max_workers: 1 head_node_type: name: head_node - instance_type: g3.8xlarge + instance_type: g4dn.8xlarge worker_node_types: - name: worker_node - instance_type: g3.8xlarge + instance_type: g4dn.8xlarge max_workers: 1 min_workers: 1 use_spot: false diff --git a/release/air_tests/horovod/compute_tpl_aws.yaml b/release/air_tests/horovod/compute_tpl_aws.yaml index 2ef09f059167..f94535b53bef 100644 --- a/release/air_tests/horovod/compute_tpl_aws.yaml +++ b/release/air_tests/horovod/compute_tpl_aws.yaml @@ -6,11 +6,11 @@ max_workers: 1 head_node_type: name: head_node - instance_type: g3.8xlarge + instance_type: g4dn.8xlarge worker_node_types: - name: worker_node - instance_type: g3.8xlarge + instance_type: g4dn.8xlarge max_workers: 1 min_workers: 1 use_spot: false diff --git a/release/benchmarks/distributed/many_nodes_tests/dashboard_test.py b/release/benchmarks/distributed/many_nodes_tests/dashboard_test.py index 98323ed95c45..03a1fb287d1a 100644 --- a/release/benchmarks/distributed/many_nodes_tests/dashboard_test.py +++ b/release/benchmarks/distributed/many_nodes_tests/dashboard_test.py @@ -126,7 
+126,7 @@ def get_result(self): # Get the memory usage. dashboard_export_addr = build_address( - self.addr["raylet_ip_address"], DASHBOARD_METRIC_PORT + self.addr["node_ip_address"], DASHBOARD_METRIC_PORT ) metrics = fetch_prometheus_metrics([dashboard_export_addr]) memories = [] diff --git a/release/golden_notebook_tests/gpu_tpl_aws.yaml b/release/golden_notebook_tests/gpu_tpl_aws.yaml index 12d5f1a9d9bb..b01340ffea6e 100644 --- a/release/golden_notebook_tests/gpu_tpl_aws.yaml +++ b/release/golden_notebook_tests/gpu_tpl_aws.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: - name: worker_node - instance_type: g3.8xlarge + instance_type: g4dn.12xlarge min_workers: 2 max_workers: 2 use_spot: true diff --git a/release/hello_world_tests/hello_world.py b/release/hello_world_tests/hello_world.py new file mode 100644 index 000000000000..84756ed0a2f4 --- /dev/null +++ b/release/hello_world_tests/hello_world.py @@ -0,0 +1,14 @@ +import ray + + +@ray.remote +def hello_world(): + return "Hello, world!" + + +def main(): + print(ray.get(hello_world.remote())) + + +if __name__ == "__main__": + main() diff --git a/release/hello_world_tests/hello_world_compute_config.yaml b/release/hello_world_tests/hello_world_compute_config.yaml new file mode 100644 index 000000000000..ca578bf09d6b --- /dev/null +++ b/release/hello_world_tests/hello_world_compute_config.yaml @@ -0,0 +1,8 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +head_node_type: + name: head_node + instance_type: m5.xlarge + +worker_node_types: [] diff --git a/release/llm_tests/batch/test_batch_vllm.py b/release/llm_tests/batch/test_batch_vllm.py index 231153dedeb1..dea0adca7367 100644 --- a/release/llm_tests/batch/test_batch_vllm.py +++ b/release/llm_tests/batch/test_batch_vllm.py @@ -27,9 +27,12 @@ def add_buffer_time_between_tests(): """Add buffer time after each test to avoid resource conflicts, which cause flakiness. 
""" - yield # Test runs here + # yield # test runs + # time.sleep(10) + import gc - time.sleep(10) + gc.collect() + time.sleep(15) def test_chat_template_with_vllm(): diff --git a/release/llm_tests/serve/configs/lmcache/decoder.yaml b/release/llm_tests/serve/configs/lmcache/decoder.yaml new file mode 100644 index 000000000000..34e22d421997 --- /dev/null +++ b/release/llm_tests/serve/configs/lmcache/decoder.yaml @@ -0,0 +1,12 @@ +local_cpu: False +max_local_cpu_size: 0 +max_local_disk_size: 0 +remote_serde: NULL + +enable_nixl: True +nixl_role: "receiver" +nixl_receiver_host: "localhost" +nixl_receiver_port: 55555 +nixl_buffer_size: 1073741824 # 1GB +nixl_buffer_device: "cuda" +nixl_enable_gc: True diff --git a/release/llm_tests/serve/configs/lmcache/prefiller.yaml b/release/llm_tests/serve/configs/lmcache/prefiller.yaml new file mode 100644 index 000000000000..544551b78a78 --- /dev/null +++ b/release/llm_tests/serve/configs/lmcache/prefiller.yaml @@ -0,0 +1,12 @@ +local_cpu: False +max_local_cpu_size: 0 +max_local_disk_size: 0 +remote_serde: NULL + +enable_nixl: True +nixl_role: "sender" +nixl_receiver_host: "localhost" +nixl_receiver_port: 55555 +nixl_buffer_size: 1073741824 # 1GB +nixl_buffer_device: "cuda" +nixl_enable_gc: True diff --git a/release/llm_tests/serve/configs/serve_llama_3dot1_8b_quantized_tp1_2p6d_lmcache.yaml b/release/llm_tests/serve/configs/serve_llama_3dot1_8b_quantized_tp1_2p6d_lmcache.yaml new file mode 100644 index 000000000000..87636bb790b5 --- /dev/null +++ b/release/llm_tests/serve/configs/serve_llama_3dot1_8b_quantized_tp1_2p6d_lmcache.yaml @@ -0,0 +1,52 @@ +applications: + - args: + + prefill_config: + model_loading_config: + model_id: neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 + accelerator_type: A10G + engine_kwargs: + max_model_len: 8192 + tensor_parallel_size: 1 + enforce_eager: true + kv_transfer_config: + kv_connector: LMCacheConnectorV1 + kv_role: kv_producer + kv_connector_extra_config: + discard_partial_chunks: 
false + lmcache_rpc_port: producer1 + deployment_config: + autoscaling_config: + min_replicas: 2 + max_replicas: 2 + runtime_env: + env_vars: + LMCACHE_CONFIG_FILE: configs/lmcache/prefiller.yaml + LMCACHE_USE_EXPERIMENTAL: "True" + + decode_config: + model_loading_config: + model_id: neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 + accelerator_type: A10G + engine_kwargs: + max_model_len: 8192 + tensor_parallel_size: 1 + enforce_eager: true + kv_transfer_config: + kv_connector: LMCacheConnectorV1 + kv_role: kv_consumer + kv_connector_extra_config: + discard_partial_chunks: false + lmcache_rpc_port: consumer1 + deployment_config: + autoscaling_config: + min_replicas: 6 + max_replicas: 6 + runtime_env: + env_vars: + LMCACHE_CONFIG_FILE: configs/lmcache/decoder.yaml + LMCACHE_USE_EXPERIMENTAL: "True" + + import_path: ray.serve.llm:build_pd_openai_app + name: llm-endpoint + route_prefix: / diff --git a/release/microbenchmark/experimental/gpu_object_microbenchmark.py b/release/microbenchmark/experimental/gpu_object_microbenchmark.py index 67deb9cb05c7..bded6c4acc76 100644 --- a/release/microbenchmark/experimental/gpu_object_microbenchmark.py +++ b/release/microbenchmark/experimental/gpu_object_microbenchmark.py @@ -27,7 +27,7 @@ class BackendConfig: BACKEND_CONFIG = { "gloo": BackendConfig( - init_actor_kwargs={"enable_tensor_transport": True}, + init_actor_kwargs={}, send_method_kwargs={"tensor_transport": "gloo"}, device=torch.device("cpu"), collective_group_backend="torch_gloo", @@ -51,7 +51,7 @@ class BackendConfig: } -@ray.remote +@ray.remote(enable_tensor_transport=True) class Actor: def __init__( self, diff --git a/release/ml_user_tests/tune_rllib/compute_tpl_aws.yaml b/release/ml_user_tests/tune_rllib/compute_tpl_aws.yaml index 376fd90539c7..1290ca8b6900 100644 --- a/release/ml_user_tests/tune_rllib/compute_tpl_aws.yaml +++ b/release/ml_user_tests/tune_rllib/compute_tpl_aws.yaml @@ -15,7 +15,7 @@ worker_node_types: max_workers: 6 use_spot: false - name: 
worker_node_gpu - instance_type: g3.4xlarge # 1 GPU and 16 CPU + instance_type: g4dn.4xlarge # 1 GPU and 16 CPU min_workers: 2 max_workers: 2 use_spot: false diff --git a/release/nightly_tests/dataset/autoscaling_gpu_g6e_2xl_aws.yaml b/release/nightly_tests/dataset/autoscaling_gpu_g6e_2xl_aws.yaml new file mode 100644 index 000000000000..d8ca5be2d561 --- /dev/null +++ b/release/nightly_tests/dataset/autoscaling_gpu_g6e_2xl_aws.yaml @@ -0,0 +1,18 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +head_node_type: + name: head_node + instance_type: m5.2xlarge + +worker_node_types: + - name: 1xL40S_8CPU_64GB + instance_type: g6e.2xlarge + max_workers: 15 + min_workers: 0 + use_spot: false + - name: 16CPU_64GB + instance_type: m5.4xlarge + max_workers: 20 + min_workers: 0 + use_spot: false diff --git a/release/nightly_tests/dataset/fixed_size_gpu_g6e_2xl_aws.yaml b/release/nightly_tests/dataset/fixed_size_gpu_g6e_2xl_aws.yaml new file mode 100644 index 000000000000..bcb5da42d911 --- /dev/null +++ b/release/nightly_tests/dataset/fixed_size_gpu_g6e_2xl_aws.yaml @@ -0,0 +1,18 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +head_node_type: + name: head_node + instance_type: m5.2xlarge + +worker_node_types: + - name: 1xL40S_8CPU_64GB + instance_type: g6e.2xlarge + max_workers: 15 + min_workers: 15 + use_spot: false + - name: 16CPU_64GB + instance_type: m5.4xlarge + max_workers: 20 + min_workers: 20 + use_spot: false diff --git a/release/nightly_tests/dataset/batch_inference_hetero/autoscaling_cluster_compute.yaml b/release/nightly_tests/dataset/image_embedding_from_jsonl/autoscaling_cluster_compute.yaml similarity index 100% rename from release/nightly_tests/dataset/batch_inference_hetero/autoscaling_cluster_compute.yaml rename to release/nightly_tests/dataset/image_embedding_from_jsonl/autoscaling_cluster_compute.yaml diff --git a/release/nightly_tests/dataset/batch_inference_hetero/fixed_size_cluster_compute.yaml 
b/release/nightly_tests/dataset/image_embedding_from_jsonl/fixed_size_cluster_compute.yaml similarity index 100% rename from release/nightly_tests/dataset/batch_inference_hetero/fixed_size_cluster_compute.yaml rename to release/nightly_tests/dataset/image_embedding_from_jsonl/fixed_size_cluster_compute.yaml diff --git a/release/nightly_tests/dataset/batch_inference_hetero/main.py b/release/nightly_tests/dataset/image_embedding_from_jsonl/main.py similarity index 100% rename from release/nightly_tests/dataset/batch_inference_hetero/main.py rename to release/nightly_tests/dataset/image_embedding_from_jsonl/main.py diff --git a/release/nightly_tests/dataset/image_embedding_from_uris/autoscaling_cluster_compute.yaml b/release/nightly_tests/dataset/image_embedding_from_uris/autoscaling_cluster_compute.yaml new file mode 100644 index 000000000000..ff9f39f3cc5a --- /dev/null +++ b/release/nightly_tests/dataset/image_embedding_from_uris/autoscaling_cluster_compute.yaml @@ -0,0 +1,21 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +advanced_configurations_json: + IamInstanceProfile: {"Name": "ray-autoscaler-v1"} + +head_node_type: + name: head-node + instance_type: m5.2xlarge + resources: + cpu: 0 + +worker_node_types: + - name: worker-node + instance_type: g4dn.2xlarge + min_workers: 0 + max_workers: 100 + use_spot: false + +flags: + allow-cross-zone-autoscaling: true diff --git a/release/nightly_tests/dataset/image_embedding_from_uris/fixed_size_cluster_compute.yaml b/release/nightly_tests/dataset/image_embedding_from_uris/fixed_size_cluster_compute.yaml new file mode 100644 index 000000000000..199da1873dc3 --- /dev/null +++ b/release/nightly_tests/dataset/image_embedding_from_uris/fixed_size_cluster_compute.yaml @@ -0,0 +1,21 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +advanced_configurations_json: + IamInstanceProfile: {"Name": "ray-autoscaler-v1"} + +head_node_type: + name: head-node + instance_type: m5.2xlarge + resources: + cpu: 0 
+ +worker_node_types: + - name: worker-node + instance_type: g4dn.2xlarge + min_workers: 100 + max_workers: 100 + use_spot: false + +flags: + allow-cross-zone-autoscaling: true diff --git a/release/nightly_tests/dataset/batch_inference_mock_image_pipeline.py b/release/nightly_tests/dataset/image_embedding_from_uris/main.py similarity index 71% rename from release/nightly_tests/dataset/batch_inference_mock_image_pipeline.py rename to release/nightly_tests/dataset/image_embedding_from_uris/main.py index 41aa12f10845..48fcf8fc83f3 100644 --- a/release/nightly_tests/dataset/batch_inference_mock_image_pipeline.py +++ b/release/nightly_tests/dataset/image_embedding_from_uris/main.py @@ -3,7 +3,6 @@ import uuid from typing import Any, Dict -import boto3 import numpy as np import pandas as pd import torch @@ -12,19 +11,22 @@ from torchvision.models import vit_b_16, ViT_B_16_Weights import albumentations as A import ray -from ray.data import ActorPoolStrategy, DataContext import copy import itertools from typing import List import string import random import time +from ray.data.expressions import download +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy +from ray._private.test_utils import EC2InstanceTerminatorWithGracePeriod + WRITE_PATH = f"s3://ray-data-write-benchmark/{uuid.uuid4().hex}" BUCKET = "ray-benchmark-data-internal-us-west-2" # Assumptions: homogenously shaped images, homogenous images -# Each iamge is 2048 * 2048 * 3 = 12.58 MB -> 11 images / block. 8 blocks per task, so ~88 images per task. +# Each image is 2048 * 2048 * 3 = 12.58 MB -> 11 images / block. 8 blocks per task, so ~88 images per task. 
IMAGES_PER_BLOCK = 11 BLOCKS_PER_TASK = 8 NUM_UNITS = 1380 @@ -42,6 +44,13 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser() + parser.add_argument( + "--inference-concurrency", + nargs=2, + type=int, + required=True, + help="The minimum and maximum concurrency for the inference operator.", + ) parser.add_argument( "--sf", dest="scale_factor", @@ -52,6 +61,14 @@ def parse_args() -> argparse.Namespace: "dataset." ), ) + parser.add_argument( + "--chaos", + action="store_true", + help=( + "Whether to enable chaos. If set, this script terminates one worker node " + "every minute with a grace period." + ), + ) return parser.parse_args() @@ -70,10 +87,9 @@ def create_metadata(scale_factor: int): "metadata_6": "".join(random.choices(string.ascii_letters, k=16)), "container_order_read_id": f"{i:04d}_{j:04d}", "container_id": i, - "channel_keys": [ - f"15TiB-high-resolution-images/group={i:04d}/{j:04d}_{k}.png" - for k in range(3) - ], + "channel0_uris": f"s3://{BUCKET}/15TiB-high-resolution-images/group={i:04d}/{j:04d}_{0}.png", + "channel1_uris": f"s3://{BUCKET}/15TiB-high-resolution-images/group={i:04d}/{j:04d}_{1}.png", + "channel2_uris": f"s3://{BUCKET}/15TiB-high-resolution-images/group={i:04d}/{j:04d}_{2}.png", "applied_scale": 1, } for j in range(NUM_UNITS) @@ -82,20 +98,16 @@ def create_metadata(scale_factor: int): ) -class LoadImage: - def __init__(self): - self._client = boto3.client("s3") +def combine_channels(row: Dict[str, Any]) -> Dict[str, np.ndarray]: + channels = [] + for i in range(3): + data = io.BytesIO(row.pop(f"channel{i}")) + image = Image.open(data) + channels.append(np.array(image)) - def __call__(self, row): - channels = [] - for key in row["channel_keys"]: - data = io.BytesIO() - self._client.download_fileobj(BUCKET, key, data) - image = Image.open(data) - channels.append(np.array(image)) + row["image"] = np.dstack(channels) - row["image"] = np.dstack(channels) - return row + return row def process_image(row: Dict[str, 
Any]) -> Dict[str, np.ndarray]: @@ -177,11 +189,14 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: return batch -def main(scale_factor: int): +def main(args: argparse.Namespace): benchmark = Benchmark() + if args.chaos: + start_chaos() + print("Creating metadata") - metadata = create_metadata(scale_factor=scale_factor) + metadata = create_metadata(scale_factor=args.scale_factor) def benchmark_fn(): weights = ViT_B_16_Weights.DEFAULT @@ -189,28 +204,21 @@ def benchmark_fn(): transform = weights.transforms() model_ref = ray.put(model) - # Toggle on features that are required for the pipeline to work. - ctx = DataContext.get_current() - ctx.enable_fallback_to_arrow_object_ext_type = True - ctx.execution_options.actor_locality_enabled = True - - print(f"Starting pipeline with {OVERRIDE_NUM_BLOCKS} blocks") ( - ray.data.from_pandas(metadata, override_num_blocks=OVERRIDE_NUM_BLOCKS) - .map( - LoadImage, - # TODO(mowen): When we fix the deadlocking bug we should increase this to 800. 
- compute=ActorPoolStrategy(min_size=1, max_size=700), - max_concurrency=4, # needed to prevent image loading from becoming the bottleneck - ) + ray.data.from_pandas(metadata) + .with_column("channel0", download("channel0_uris")) + .with_column("channel1", download("channel1_uris")) + .with_column("channel2", download("channel2_uris")) + .map(combine_channels) .filter(lambda row: row["image"].size != 0) .map(process_image) .flat_map(patch_image) .map_batches(ProcessPatches(transform)) .map_batches( - FakeEmbedPatches, + EmbedPatches, + num_gpus=1, batch_size=BATCH_SIZE, - compute=ActorPoolStrategy(min_size=1, max_size=100), + concurrency=tuple(args.inference_concurrency), fn_constructor_kwargs={"model": model_ref, "device": "cuda"}, ) .write_parquet(WRITE_PATH) @@ -220,7 +228,23 @@ def benchmark_fn(): benchmark.write_result() +def start_chaos(): + assert ray.is_initialized() + + head_node_id = ray.get_runtime_context().get_node_id() + scheduling_strategy = NodeAffinitySchedulingStrategy( + node_id=head_node_id, soft=False + ) + resource_killer = EC2InstanceTerminatorWithGracePeriod.options( + scheduling_strategy=scheduling_strategy + ).remote(head_node_id, max_to_kill=None) + + ray.get(resource_killer.ready.remote()) + + resource_killer.run.remote() + + if __name__ == "__main__": args = parse_args() - scale_factor = args.scale_factor - main(scale_factor) + ray.init() + main(args) diff --git a/release/nightly_tests/dataset/read_from_uris_benchmark.py b/release/nightly_tests/dataset/read_from_uris_benchmark.py index e1da9cb52142..eed591aaf502 100644 --- a/release/nightly_tests/dataset/read_from_uris_benchmark.py +++ b/release/nightly_tests/dataset/read_from_uris_benchmark.py @@ -1,11 +1,12 @@ import io -import boto3 import numpy as np +import pyarrow as pa +import pyarrow.compute as pc from PIL import Image import ray -from ray.data import ActorPoolStrategy +from ray.data.expressions import download from benchmark import Benchmark BUCKET = "anyscale-imagenet" @@ 
-21,22 +22,27 @@ def main(): def benchmark_fn(): metadata = ray.data.read_parquet(METADATA_PATH) - # Assuming there are 80 CPUs and 4 in-flight tasks per actor, we need at least 320 - # partitions to utilize all CPUs. - # TODO: This is a temporary workaround. We need to improve the default partitioning. - metadata = metadata.repartition(320) - - class LoadImage: - def __init__(self): - self._client = boto3.client("s3") - - def __call__(self, row): - data = io.BytesIO() - self._client.download_fileobj(BUCKET, row["key"], data) - image = Image.open(data).convert("RGB") - return {"image": np.array(image)} - - ds = metadata.map(LoadImage, compute=ActorPoolStrategy(min_size=1)) + + def decode_images(batch): + images = [] + for b in batch["image_bytes"]: + image = Image.open(io.BytesIO(b)).convert("RGB") + images.append(np.array(image)) + del batch["image_bytes"] + batch["image"] = np.array(images, dtype=object) + return batch + + def convert_key(table): + col = table["key"] + t = col.type + new_col = pc.binary_join_element_wise( + pa.scalar("s3://" + BUCKET, type=t), col, pa.scalar("/", type=t) + ) + return table.set_column(table.schema.get_field_index("key"), "key", new_col) + + ds = metadata.map_batches(convert_key, batch_format="pyarrow") + ds = ds.with_column("image_bytes", download("key")) + ds = ds.map_batches(decode_images) for _ in ds.iter_internal_ref_bundles(): pass diff --git a/release/nightly_tests/dataset/text_embedding/autoscaling_cluster_compute.yaml b/release/nightly_tests/dataset/text_embedding/autoscaling_cluster_compute.yaml new file mode 100644 index 000000000000..b601a66dc843 --- /dev/null +++ b/release/nightly_tests/dataset/text_embedding/autoscaling_cluster_compute.yaml @@ -0,0 +1,21 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +advanced_configurations_json: + IamInstanceProfile: {"Name": "ray-autoscaler-v1"} + +head_node_type: + name: head-node + instance_type: r6a.8xlarge + resources: + cpu: 0 + +worker_node_types: + - name: 
gpu-node + instance_type: g5.xlarge + min_workers: 1 + max_workers: 100 + use_spot: false + +flags: + allow-cross-zone-autoscaling: true diff --git a/release/nightly_tests/dataset/text_embedding/create_dataset.py b/release/nightly_tests/dataset/text_embedding/create_dataset.py new file mode 100644 index 000000000000..e8e619d88e19 --- /dev/null +++ b/release/nightly_tests/dataset/text_embedding/create_dataset.py @@ -0,0 +1,102 @@ +import pyarrow as pa +import uuid +import random +import string +import ray +import pyarrow.parquet as pq +from tqdm import tqdm + +STRING_PLACEHOLDER = "" +UUID_PLACEHOLDER = uuid.UUID(int=0) +INT_PLACEHOLDER = 0 + +TARGET_SIZE_BYTES = 4096 +NUM_FILES = 50 + +SCHEMA = pa.schema( + [ + ("metadata00", pa.string()), + ("metadata01", pa.list_(pa.binary(16))), + ("metadata02", pa.string()), + ("metadata03", pa.uint64()), + ("metadata04", pa.list_(pa.binary(16))), + ("metadata05", pa.list_(pa.binary(16))), + ("metadata06", pa.binary(16)), + ("metadata07", pa.string()), + ("metadata08", pa.binary(16)), + ("metadata09", pa.uint64()), + ("metadata10", pa.binary(16)), + ("metadata11", pa.list_(pa.binary(16))), + ("metadata12", pa.uint64()), + ("metadata13", pa.uint64()), + ("metadata14", pa.list_(pa.binary(16))), + ("span_text", pa.string()), + ("metadata15", pa.binary(16)), + ("metadata16", pa.string()), + ("metadata17", pa.list_(pa.binary(16))), + ("metadata18", pa.list_(pa.binary(16))), + ] +) + + +def random_word(min_len=3, max_len=8): + length = random.randint(min_len, max_len) + return "".join(random.choices(string.ascii_lowercase, k=length)) + + +def create_random_sentence(): + sentence = "" + while len(sentence.encode("utf-8")) < TARGET_SIZE_BYTES: + word = random_word() + sentence += word + " " # space between words + + # Trim to exact size + sentence_bytes = sentence.encode("utf-8")[:TARGET_SIZE_BYTES] + return sentence_bytes.decode("utf-8", errors="ignore") + + +def create_row(): + return { + "metadata00": STRING_PLACEHOLDER, + 
"metadata01": [UUID_PLACEHOLDER.bytes], + "metadata02": STRING_PLACEHOLDER, + "metadata03": INT_PLACEHOLDER, + "metadata04": [UUID_PLACEHOLDER.bytes], + "metadata05": [UUID_PLACEHOLDER.bytes], + "metadata06": UUID_PLACEHOLDER.bytes, + "metadata07": STRING_PLACEHOLDER, + "metadata08": UUID_PLACEHOLDER.bytes, + "metadata09": INT_PLACEHOLDER, + "metadata10": UUID_PLACEHOLDER.bytes, + "metadata11": [UUID_PLACEHOLDER.bytes], + "metadata12": INT_PLACEHOLDER, + "metadata13": None if random.random() < 0.01 else INT_PLACEHOLDER, + "metadata14": [UUID_PLACEHOLDER.bytes], + "span_text": create_random_sentence(), + "metadata15": UUID_PLACEHOLDER.bytes, + "metadata16": STRING_PLACEHOLDER, + "metadata17": [UUID_PLACEHOLDER.bytes], + "metadata18": [UUID_PLACEHOLDER.bytes], + } + + +@ray.remote +def write_table(i: int): + rows = [] + for _ in range(20_000): + rows.append(create_row()) + + table = pa.Table.from_pylist(rows, schema=SCHEMA) + pq.write_table( + table, f"s3://ray-benchmark-data-internal-us-west-2/text-spans/{i}.parquet" + ) + + +refs = [write_table.remote(i) for i in range(NUM_FILES)] + +pbar = tqdm(total=len(refs)) +while refs: + ready, refs = ray.wait(refs, num_returns=1) + pbar.update(len(ready)) + +pbar.close() diff --git a/release/nightly_tests/dataset/text_embedding/fixed_size_cluster_compute.yaml b/release/nightly_tests/dataset/text_embedding/fixed_size_cluster_compute.yaml new file mode 100644 index 000000000000..eb51bba4b5ab --- /dev/null +++ b/release/nightly_tests/dataset/text_embedding/fixed_size_cluster_compute.yaml @@ -0,0 +1,21 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +advanced_configurations_json: + IamInstanceProfile: {"Name": "ray-autoscaler-v1"} + +head_node_type: + name: head-node + instance_type: r6a.8xlarge + resources: + cpu: 0 + +worker_node_types: + - name: gpu-node + instance_type: g5.xlarge + min_workers: 100 + max_workers: 100 + use_spot: false + +flags: + allow-cross-zone-autoscaling: true diff --git 
a/release/nightly_tests/dataset/text_embedding/main.py b/release/nightly_tests/dataset/text_embedding/main.py new file mode 100644 index 000000000000..a74e02657003 --- /dev/null +++ b/release/nightly_tests/dataset/text_embedding/main.py @@ -0,0 +1,149 @@ +import argparse +from typing import Dict +import uuid +import boto3 +import json + +import numpy as np +import pyarrow as pa +from sentence_transformers import SentenceTransformer +import torch + +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy +from ray._private.test_utils import EC2InstanceTerminatorWithGracePeriod +import ray + +from benchmark import Benchmark + +BATCH_SIZE = 128 + +# This dataset has 50 files, each with 20,000 rows of <1024-token text spans. It +# includes one empty Parquet file and some nulls. See `create_dataset.py` for details. +INPUT_PREFIX = "s3://ray-benchmark-data-internal-us-west-2/text-spans" +# Add a random prefix to avoid conflicts between different runs. +OUTPUT_PREFIX = f"s3://ray-data-write-benchmark/{uuid.uuid4().hex}" + +# These are used to fetch the HF token from AWS Secrets Manager. +SECRET_REGION_NAME = "us-west-2" +SECRET_ID = ( + "arn:aws:secretsmanager:us-west-2:188439194153:secret:release_test_hf_token-p3Lcqy" +) + +# FIXME: We need to explicitly define the schema and specify lists of variable-size +# binaries because Ray Data can't handle lists of fixed-size binaries. 
+SCHEMA = pa.schema( + [ + ("metadata00", pa.string()), + ("metadata01", pa.list_(pa.binary())), + ("metadata02", pa.string()), + ("metadata03", pa.uint64()), + ("metadata04", pa.list_(pa.binary())), + ("metadata05", pa.list_(pa.binary())), + ("metadata06", pa.binary()), + ("metadata07", pa.string()), + ("metadata08", pa.binary()), + ("metadata09", pa.uint64()), + ("metadata10", pa.binary()), + ("metadata11", pa.list_(pa.binary())), + ("metadata12", pa.uint64()), + ("metadata13", pa.uint64()), + ("metadata14", pa.list_(pa.binary())), + ("span_text", pa.string()), + ("metadata15", pa.binary()), + ("metadata16", pa.string()), + ("metadata17", pa.list_(pa.binary())), + ("metadata18", pa.list_(pa.binary())), + ] +) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--inference-concurrency", + nargs=2, + type=int, + required=True, + help="The minimum and maximum concurrency for the inference operator.", + ) + parser.add_argument( + "--chaos", + action="store_true", + help=( + "Whether to enable chaos. If set, this script terminates one worker node " + "every minute with a grace period." 
+ ), + ) + return parser.parse_args() + + +def main(args: argparse.Namespace): + benchmark = Benchmark() + + if args.chaos: + start_chaos() + + def benchmark_fn(): + ( + ray.data.read_parquet(INPUT_PREFIX, schema=SCHEMA) + .repartition(target_num_rows_per_block=256) + .map_batches( + EncodingUDF, + concurrency=tuple(args.inference_concurrency), + num_gpus=1, + batch_size=BATCH_SIZE, + fn_constructor_kwargs={"model": "BAAI/bge-m3", "token": get_hf_token()}, + ) + .write_parquet(OUTPUT_PREFIX, mode="overwrite") + ) + + benchmark.run_fn("main", benchmark_fn) + benchmark.write_result() + + +def start_chaos(): + assert ray.is_initialized() + + head_node_id = ray.get_runtime_context().get_node_id() + scheduling_strategy = NodeAffinitySchedulingStrategy( + node_id=head_node_id, soft=False + ) + resource_killer = EC2InstanceTerminatorWithGracePeriod.options( + scheduling_strategy=scheduling_strategy + ).remote(head_node_id, max_to_kill=None) + + ray.get(resource_killer.ready.remote()) + + resource_killer.run.remote() + + +class EncodingUDF: + def __init__(self, model: str, token: str): + device = "cuda" if torch.cuda.is_available() else "cpu" + self._model = SentenceTransformer( + model, + device=device, + token=token, + model_kwargs={"torch_dtype": torch.bfloat16}, + ) + + def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + batch["vector"] = self._model.encode( + batch["span_text"], batch_size=BATCH_SIZE, convert_to_numpy=True + ) + return batch + + +def get_hf_token() -> str: + session = boto3.session.Session() + client = session.client( + service_name="secretsmanager", region_name=SECRET_REGION_NAME + ) + secret_string = client.get_secret_value(SecretId=SECRET_ID)["SecretString"] + return json.loads(secret_string)["HF_TOKEN"] + + +if __name__ == "__main__": + ray.init() + args = parse_args() + main(args) diff --git a/release/nightly_tests/dataset/text_embeddings_benchmark.py b/release/nightly_tests/dataset/text_embeddings_benchmark.py new file 
mode 100644 index 000000000000..e2fbccff429d --- /dev/null +++ b/release/nightly_tests/dataset/text_embeddings_benchmark.py @@ -0,0 +1,196 @@ +""" +Benchmark a text embeddings job +""" + +import argparse +import uuid +import time +from typing import Dict, List +from numpy import ndarray + +import ray +import torch +from sentence_transformers import SentenceTransformer +from langchain_text_splitters import ( + RecursiveCharacterTextSplitter, + CharacterTextSplitter, +) + +from benchmark import Benchmark, BenchmarkMetric + +# Subset of the data so that benchmark completes in ~20 minutes. +DEFAULT_SOURCE_DIRECTORY_S3 = "s3://air-example-data/common-pile-mirror/arxiv_papers/arxiv_papers-train-00001-of-00042.parquet" +# Add a random prefix to avoid conflicts between different runs. +WRITE_PATH = f"s3://ray-data-write-benchmark/{uuid.uuid4().hex}/" + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Text Embeddings Batch Inference Benchmark" + ) + parser.add_argument( + "--source-directory", + type=str, + default=DEFAULT_SOURCE_DIRECTORY_S3, + help="S3 URI of source documents", + ) + parser.add_argument( + "--chunk-concurrency", + type=int, + default=20, + help="Concurrency for Chunker stage", + ) + parser.add_argument( + "--chunk-cpus", type=int, default=None, help="Number of CPUs per Chunker" + ) + parser.add_argument( + "--chunk-method", + choices=["fixed", "recursive"], + default="recursive", + help="Chunking method", + ) + parser.add_argument( + "--chunk-size", type=int, default=1200, help="Chunk size for text splitting" + ) + parser.add_argument( + "--chunk-overlap", + type=int, + default=100, + help="Number of overlapping boundary characters between text chunks.", + ) + parser.add_argument( + "--embed-batch-size", + type=int, + default=256, + help="Batch size for embedding inference", + ) + parser.add_argument( + "--embed-concurrency", + type=int, + default=15, + help="Number of Embedder replicas", + ) + parser.add_argument( + "--num-gpus", 
type=int, default=1, help="Number of GPUs per Embedder" + ) + parser.add_argument( + "--model-name", + type=str, + default="Salesforce/SFR-Embedding-Code-400M_R", + help="Embedding model name", + ) + parser.add_argument( + "--smoke-test", + action="store_true", + help="Runs a smoke test with a small subset of the data", + ) + parser.add_argument( + "--chaos-test", + action="store_true", + default=False, + help="Enable chaos testing to simulate node failures", + ) + return parser.parse_args() + + +class Chunker: + def __init__(self, method: str, chunk_size: int, chunk_overlap: int): + if method == "fixed": + self.splitter = CharacterTextSplitter( + chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + else: + self.splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + + def __call__(self, page: Dict) -> List[Dict]: + return [ + { + "text": text, + "source": page["source"], + "chunk_id": f"{page['id']}_{str(uuid.uuid4())}", + "doc_id": page["id"], + } + for text in self.splitter.split_text(page["text"]) + ] + + +class Embedder: + def __init__(self, model_name: str): + self.model = SentenceTransformer( + model_name, + device="cuda" if torch.cuda.is_available() else "cpu", + trust_remote_code=True, + ) + + def __call__(self, batch: Dict[str, ndarray]) -> Dict[str, ndarray]: + batch["embeddings"] = self.model.encode( + batch["text"], convert_to_numpy=True, batch_size=len(batch["text"]) + ) + return batch + + +def main(args): + start_time = time.time() + ds = ray.data.read_parquet( + args.source_directory, + include_paths=True, + ) + metadata_fetch_end = time.time() + metadata_fetching_s = metadata_fetch_end - start_time + if args.smoke_test: + ds = ds.limit(100) + + ds = ds.flat_map( + Chunker( + method=args.chunk_method, + chunk_size=args.chunk_size, + chunk_overlap=args.chunk_overlap, + ), + concurrency=args.chunk_concurrency, + num_cpus=args.chunk_cpus, + ) + ds = ds.map_batches( + Embedder, + 
fn_constructor_kwargs={"model_name": args.model_name}, + batch_size=args.embed_batch_size, + concurrency=args.embed_concurrency, + num_gpus=args.num_gpus, + ) + ds.write_parquet(WRITE_PATH, num_rows_per_file=5_000) + end_time = time.time() + runtime_s = end_time - start_time + num_rows = ray.data.read_parquet(WRITE_PATH).count() + throughput_rows_s = num_rows / runtime_s + + # Compute metrics for time and throughput without metadata fetch + runtime_s_wo_metadata_fetch = end_time - metadata_fetch_end + throughput_rows_s_wo_metadata_fetch = num_rows / runtime_s_wo_metadata_fetch + + # Report chaos testing node failures + if args.chaos_test: + dead_nodes = [node["NodeID"] for node in ray.nodes() if not node["Alive"]] + assert dead_nodes, "No dead nodes during chaos test" + print(f"Total chaos killed: {dead_nodes}") + + return { + BenchmarkMetric.RUNTIME: runtime_s, + BenchmarkMetric.NUM_ROWS: num_rows, + BenchmarkMetric.THROUGHPUT: throughput_rows_s, + "source_directory": args.source_directory, + "model_name": args.model_name, + "chunk_method": args.chunk_method, + "metadata_fetching_s": metadata_fetching_s, + "runtime_s_wo_metadata_fetch": runtime_s_wo_metadata_fetch, + "throughput_rows_s_wo_metadata_fetch": throughput_rows_s_wo_metadata_fetch, + "chaos_test": args.chaos_test, + } + + +if __name__ == "__main__": + args = parse_args() + print(f"Writing to {WRITE_PATH}") + benchmark = Benchmark() + benchmark.run_fn("text-embeddings-benchmark", main, args) + benchmark.write_result() diff --git a/release/nightly_tests/dataset/wide_schema_pipeline_benchmark.py b/release/nightly_tests/dataset/wide_schema_pipeline_benchmark.py new file mode 100644 index 000000000000..373afb23e55f --- /dev/null +++ b/release/nightly_tests/dataset/wide_schema_pipeline_benchmark.py @@ -0,0 +1,57 @@ +import argparse +from typing import Dict, Any + +import ray +from benchmark import Benchmark + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Wide schema 
pipeline benchmark") + parser.add_argument( + "--data-type", + choices=["primitives", "tensors", "objects", "nested_structs"], + default="primitives", + help="Type of pre-generated dataset to benchmark", + ) + + return parser.parse_args() + + +def main(args: argparse.Namespace) -> None: + benchmark = Benchmark() + + # Each dataset contains about 500-600 MB of data, except for objects, + # which contain about 150 MB (this is because their pickle bloat is big). + # Furthermore, the schema contains 5000 fields, and each column contains + # 500 characters. + input_path = ( + f"s3://ray-benchmark-data-internal-us-west-2/wide_schema/{args.data_type}" + ) + + print(f"Using pre-generated dataset: {input_path}") + + # Run the pipeline benchmark (TIMED) + def run_pipeline() -> Dict[str, Any]: + """Run the data pipeline: read the dataset and iterate over all of its ref bundles.""" + ds = ray.data.read_parquet(input_path) + + for _ in ds.iter_internal_ref_bundles(): + pass + + # Get dataset stats for reporting + actual_num_columns = len(ds.schema().base_schema) + + return { + "num_columns": actual_num_columns, + "data_type": args.data_type, + "input_path": input_path, + } + + # Run the timed benchmark + benchmark.run_fn("wide_schema_pipeline", run_pipeline) + benchmark.write_result() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/release/perf_metrics/benchmarks/many_actors.json b/release/perf_metrics/benchmarks/many_actors.json index 3887e4d4e6fb..d41c48a42071 100644 --- a/release/perf_metrics/benchmarks/many_actors.json +++ b/release/perf_metrics/benchmarks/many_actors.json @@ -1,32 +1,32 @@ { - "_dashboard_memory_usage_mb": 110.235648, + "_dashboard_memory_usage_mb": 116.87936, "_dashboard_test_success": true, - "_peak_memory": 4.85, - "_peak_process_memory": "PID\tMEM\tCOMMAND\n1129\t7.16GiB\t/app/product/go/infra/anyscaled/anyscaled_/anyscaled startv2 
--control_plane_url=https://console.any\n3533\t2.04GiB\t/home/ray/anaconda3/lib/python3.9/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n4961\t1.06GiB\tpython distributed/test_many_actors.py\n3034\t0.46GiB\tvector --watch-config --log-format json --config-yaml /etc/vector/vector.yaml\n3734\t0.32GiB\tray-dashboard-NodeHead-0 (/home/ray/anaconda3/bin/python3.9 -c \"from multiprocessing.spawn import sp\n585\t0.2GiB\t/app/go/infra/anyscaled/anyscaled_/anyscaled_shim --cloud_provider=aws\n3649\t0.1GiB\t/home/ray/anaconda3/bin/python3.9 /home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/dash\n4254\t0.09GiB\t/home/ray/anaconda3/bin/python3.9 -u /home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/a\n3001\t0.09GiB\t/usr/bin/python3 /app/infra/dataplane/webterminal/webterminal_sidecar_image.binary.runfiles/product/\n4256\t0.08GiB\t/home/ray/anaconda3/bin/python3.9 -u /home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/ru", - "actors_per_second": 657.1702061376596, + "_peak_memory": 4.56, + "_peak_process_memory": "PID\tMEM\tCOMMAND\n3537\t2.03GiB\t/home/ray/anaconda3/lib/python3.9/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n5556\t0.87GiB\tpython distributed/test_many_actors.py\n2945\t0.39GiB\tvector --watch-config --log-format json --config-yaml /etc/vector/vector.yaml\n3735\t0.23GiB\tray-dashboard-NodeHead-0 (/home/ray/anaconda3/bin/python3.9 -c \"from multiprocessing.spawn import sp\n582\t0.19GiB\t/app/go/infra/anyscaled/anyscaled_/anyscaled_shim --cloud_provider=aws\n1143\t0.11GiB\t/app/product/go/infra/anyscaled/anyscaled_/anyscaled startv2 --control_plane_url=https://console.any\n3652\t0.1GiB\t/home/ray/anaconda3/bin/python3.9 /home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/dash\n4226\t0.1GiB\t/home/ray/anaconda3/bin/python3.9 -u /home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/a\n3108\t0.09GiB\t/usr/bin/python3 
/app/infra/dataplane/webterminal/webterminal_sidecar_image.binary.runfiles/product/\n4228\t0.09GiB\t/home/ray/anaconda3/bin/python3.9 -u /home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/ru", + "actors_per_second": 566.4200586217125, "num_actors": 10000, "perf_metrics": [ { "perf_metric_name": "actors_per_second", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 657.1702061376596 + "perf_metric_value": 566.4200586217125 }, { "perf_metric_name": "dashboard_p50_latency_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 9.35 + "perf_metric_value": 10.833 }, { "perf_metric_name": "dashboard_p95_latency_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 2197.485 + "perf_metric_value": 2612.102 }, { "perf_metric_name": "dashboard_p99_latency_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 2572.496 + "perf_metric_value": 3446.344 } ], "success": "1", - "time": 15.216758012771606 + "time": 17.654742002487183 } diff --git a/release/perf_metrics/benchmarks/many_nodes.json b/release/perf_metrics/benchmarks/many_nodes.json index 5359354d5066..a49b33405678 100644 --- a/release/perf_metrics/benchmarks/many_nodes.json +++ b/release/perf_metrics/benchmarks/many_nodes.json @@ -1,14 +1,14 @@ { - "_dashboard_memory_usage_mb": 96.54272, + "_dashboard_memory_usage_mb": 96.198656, "_dashboard_test_success": true, - "_peak_memory": 2.26, - "_peak_process_memory": "PID\tMEM\tCOMMAND\n3357\t0.51GiB\t/home/ray/anaconda3/lib/python3.9/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n2796\t0.28GiB\tvector --watch-config --log-format json --config-yaml /etc/vector/vector.yaml\n5171\t0.17GiB\tpython distributed/test_many_tasks.py --num-tasks=1000\n3555\t0.14GiB\tray-dashboard-NodeHead-0 (/home/ray/anaconda3/bin/python3.9 -c \"from multiprocessing.spawn import sp\n1083\t0.13GiB\t/app/product/go/infra/anyscaled/anyscaled_/anyscaled startv2 --control_plane_url=https://console.any\n4094\t0.1GiB\t/home/ray/anaconda3/bin/python3.9 -u 
/home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/a\n2769\t0.09GiB\t/usr/bin/python3 /app/infra/dataplane/webterminal/webterminal_sidecar_image.binary.runfiles/product/\n5398\t0.09GiB\tray::StateAPIGeneratorActor.start\n3473\t0.09GiB\t/home/ray/anaconda3/bin/python3.9 /home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/dash\n4096\t0.08GiB\t/home/ray/anaconda3/bin/python3.9 -u /home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/ru", + "_peak_memory": 2.28, + "_peak_process_memory": "PID\tMEM\tCOMMAND\n3348\t0.51GiB\t/home/ray/anaconda3/lib/python3.9/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n2901\t0.27GiB\tvector --watch-config --log-format json --config-yaml /etc/vector/vector.yaml\n4907\t0.17GiB\tpython distributed/test_many_tasks.py --num-tasks=1000\n3546\t0.13GiB\tray-dashboard-NodeHead-0 (/home/ray/anaconda3/bin/python3.9 -c \"from multiprocessing.spawn import sp\n1091\t0.13GiB\t/app/product/go/infra/anyscaled/anyscaled_/anyscaled startv2 --control_plane_url=https://console.any\n4032\t0.11GiB\t/home/ray/anaconda3/bin/python3.9 -u /home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/a\n2825\t0.09GiB\t/usr/bin/python3 /app/infra/dataplane/webterminal/webterminal_sidecar_image.binary.runfiles/product/\n5133\t0.09GiB\tray::StateAPIGeneratorActor.start\n3464\t0.09GiB\t/home/ray/anaconda3/bin/python3.9 /home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/dash\n3549\t0.08GiB\tray-dashboard-StateHead-0 (/home/ray/anaconda3/bin/python3.9 -c \"from multiprocessing.spawn import s", "num_tasks": 1000, "perf_metrics": [ { "perf_metric_name": "tasks_per_second", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 191.95909855877267 + "perf_metric_value": 179.67755143550417 }, { "perf_metric_name": "used_cpus_by_deadline", @@ -18,21 +18,21 @@ { "perf_metric_name": "dashboard_p50_latency_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 5.204 + "perf_metric_value": 6.935 }, { 
"perf_metric_name": "dashboard_p95_latency_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 13.297 + "perf_metric_value": 13.338 }, { "perf_metric_name": "dashboard_p99_latency_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 54.703 + "perf_metric_value": 35.162 } ], "success": "1", - "tasks_per_second": 191.95909855877267, - "time": 305.2094430923462, + "tasks_per_second": 179.67755143550417, + "time": 305.5655255317688, "used_cpus": 250.0 } diff --git a/release/perf_metrics/benchmarks/many_pgs.json b/release/perf_metrics/benchmarks/many_pgs.json index d6c4288e5a3d..2df384ac4bda 100644 --- a/release/perf_metrics/benchmarks/many_pgs.json +++ b/release/perf_metrics/benchmarks/many_pgs.json @@ -1,32 +1,32 @@ { - "_dashboard_memory_usage_mb": 93.515776, + "_dashboard_memory_usage_mb": 98.287616, "_dashboard_test_success": true, - "_peak_memory": 2.69, - "_peak_process_memory": "PID\tMEM\tCOMMAND\n1130\t7.9GiB\t/app/product/go/infra/anyscaled/anyscaled_/anyscaled startv2 --control_plane_url=https://console.any\n3522\t0.91GiB\t/home/ray/anaconda3/lib/python3.9/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n4967\t0.36GiB\tpython distributed/test_many_pgs.py\n2980\t0.32GiB\tvector --watch-config --log-format json --config-yaml /etc/vector/vector.yaml\n580\t0.19GiB\t/app/go/infra/anyscaled/anyscaled_/anyscaled_shim --cloud_provider=aws\n4243\t0.1GiB\t/home/ray/anaconda3/bin/python3.9 -u /home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/a\n3724\t0.09GiB\tray-dashboard-NodeHead-0 (/home/ray/anaconda3/bin/python3.9 -c \"from multiprocessing.spawn import sp\n2794\t0.09GiB\t/app/go/infra/activityprobe/activityprobe ray --port=5903 --metrics_server_port=9092 --raylet_addr=l\n3106\t0.09GiB\t/usr/bin/python3 /app/infra/dataplane/webterminal/webterminal_sidecar_image.binary.runfiles/product/\n3642\t0.08GiB\t/home/ray/anaconda3/bin/python3.9 /home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/dash", + "_peak_memory": 
2.78, + "_peak_process_memory": "PID\tMEM\tCOMMAND\n1146\t7.94GiB\t/app/product/go/infra/anyscaled/anyscaled_/anyscaled startv2 --control_plane_url=https://console.any\n3546\t0.92GiB\t/home/ray/anaconda3/lib/python3.9/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n3049\t0.47GiB\tvector --watch-config --log-format json --config-yaml /etc/vector/vector.yaml\n5004\t0.37GiB\tpython distributed/test_many_pgs.py\n581\t0.19GiB\t/app/go/infra/anyscaled/anyscaled_/anyscaled_shim --cloud_provider=aws\n3758\t0.13GiB\tray-dashboard-NodeHead-0 (/home/ray/anaconda3/bin/python3.9 -c \"from multiprocessing.spawn import sp\n2816\t0.11GiB\t/app/go/infra/activityprobe/activityprobe ray --port=5903 --metrics_server_port=9092 --raylet_addr=l\n4241\t0.1GiB\t/home/ray/anaconda3/bin/python3.9 -u /home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/a\n3665\t0.09GiB\t/home/ray/anaconda3/bin/python3.9 /home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/dash\n2941\t0.09GiB\t/usr/bin/python3 /app/infra/dataplane/webterminal/webterminal_sidecar_image.binary.runfiles/product/", "num_pgs": 1000, "perf_metrics": [ { "perf_metric_name": "pgs_per_second", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 13.215254403739163 + "perf_metric_value": 13.028153672527967 }, { "perf_metric_name": "dashboard_p50_latency_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 4.194 + "perf_metric_value": 4.26 }, { "perf_metric_name": "dashboard_p95_latency_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 7.454 + "perf_metric_value": 10.799 }, { "perf_metric_name": "dashboard_p99_latency_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 219.05 + "perf_metric_value": 188.103 } ], - "pgs_per_second": 13.215254403739163, + "pgs_per_second": 13.028153672527967, "success": "1", - "time": 75.67012858390808 + "time": 76.75684714317322 } diff --git a/release/perf_metrics/benchmarks/many_tasks.json b/release/perf_metrics/benchmarks/many_tasks.json 
index 0a045a0a839e..cfaad9568f2d 100644 --- a/release/perf_metrics/benchmarks/many_tasks.json +++ b/release/perf_metrics/benchmarks/many_tasks.json @@ -1,14 +1,14 @@ { - "_dashboard_memory_usage_mb": 95.08864, + "_dashboard_memory_usage_mb": 104.443904, "_dashboard_test_success": true, - "_peak_memory": 3.91, - "_peak_process_memory": "PID\tMEM\tCOMMAND\n3526\t1.07GiB\t/home/ray/anaconda3/lib/python3.9/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n5070\t0.75GiB\tpython distributed/test_many_tasks.py --num-tasks=10000\n3724\t0.45GiB\tray-dashboard-NodeHead-0 (/home/ray/anaconda3/bin/python3.9 -c \"from multiprocessing.spawn import sp\n3054\t0.29GiB\tvector --watch-config --log-format json --config-yaml /etc/vector/vector.yaml\n3727\t0.2GiB\tray-dashboard-StateHead-0 (/home/ray/anaconda3/bin/python3.9 -c \"from multiprocessing.spawn import s\n1120\t0.12GiB\t/app/product/go/infra/anyscaled/anyscaled_/anyscaled startv2 --control_plane_url=https://console.any\n4243\t0.11GiB\t/home/ray/anaconda3/bin/python3.9 -u /home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/a\n3642\t0.09GiB\t/home/ray/anaconda3/bin/python3.9 /home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/dash\n3021\t0.09GiB\t/usr/bin/python3 /app/infra/dataplane/webterminal/webterminal_sidecar_image.binary.runfiles/product/\n5380\t0.09GiB\tray::StateAPIGeneratorActor.start", + "_peak_memory": 3.96, + "_peak_process_memory": "PID\tMEM\tCOMMAND\n3538\t1.1GiB\t/home/ray/anaconda3/lib/python3.9/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n5120\t0.76GiB\tpython distributed/test_many_tasks.py --num-tasks=10000\n3748\t0.46GiB\tray-dashboard-NodeHead-0 (/home/ray/anaconda3/bin/python3.9 -c \"from multiprocessing.spawn import sp\n3008\t0.27GiB\tvector --watch-config --log-format json --config-yaml /etc/vector/vector.yaml\n3751\t0.19GiB\tray-dashboard-StateHead-0 (/home/ray/anaconda3/bin/python3.9 -c \"from multiprocessing.spawn import 
s\n1134\t0.11GiB\t/app/product/go/infra/anyscaled/anyscaled_/anyscaled startv2 --control_plane_url=https://console.any\n4231\t0.11GiB\t/home/ray/anaconda3/bin/python3.9 -u /home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/a\n3653\t0.1GiB\t/home/ray/anaconda3/bin/python3.9 /home/ray/anaconda3/lib/python3.9/site-packages/ray/dashboard/dash\n3029\t0.09GiB\t/usr/bin/python3 /app/infra/dataplane/webterminal/webterminal_sidecar_image.binary.runfiles/product/\n4233\t0.09GiB\t/home/ray/anaconda3/bin/python3.9 -u /home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/ru", "num_tasks": 10000, "perf_metrics": [ { "perf_metric_name": "tasks_per_second", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 364.43726497335643 + "perf_metric_value": 388.36439061844453 }, { "perf_metric_name": "used_cpus_by_deadline", @@ -18,21 +18,21 @@ { "perf_metric_name": "dashboard_p50_latency_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 5.277 + "perf_metric_value": 5.544 }, { "perf_metric_name": "dashboard_p95_latency_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 492.608 + "perf_metric_value": 652.763 }, { "perf_metric_name": "dashboard_p99_latency_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 787.075 + "perf_metric_value": 779.606 } ], "success": "1", - "tasks_per_second": 364.43726497335643, - "time": 327.4395649433136, + "tasks_per_second": 388.36439061844453, + "time": 325.74901366233826, "used_cpus": 2500.0 } diff --git a/release/perf_metrics/metadata.json b/release/perf_metrics/metadata.json index f185fe20125d..bae7d94a16a3 100644 --- a/release/perf_metrics/metadata.json +++ b/release/perf_metrics/metadata.json @@ -1 +1 @@ -{"release_version": "2.48.0"} +{"release_version": "2.49.1"} diff --git a/release/perf_metrics/microbenchmark.json b/release/perf_metrics/microbenchmark.json index abd2e5ee7bcb..30eb94e6f8a5 100644 --- a/release/perf_metrics/microbenchmark.json +++ b/release/perf_metrics/microbenchmark.json @@ -1,283 
+1,283 @@ { "1_1_actor_calls_async": [ - 8663.654839458402, - 182.98906836658583 + 7925.658042658907, + 333.79803776770194 ], "1_1_actor_calls_concurrent": [ - 5775.020315522301, - 166.03207664123752 + 4710.115509639389, + 60.79075536787328 ], "1_1_actor_calls_sync": [ - 2011.916260420167, - 34.20258828426277 + 1826.440590474467, + 30.44826694257455 ], "1_1_async_actor_calls_async": [ - 4259.771844696956, - 244.58821485834815 + 3645.3291604906276, + 145.09649825222274 ], "1_1_async_actor_calls_sync": [ - 1459.7289131365046, - 14.372300668103277 + 1374.047824125402, + 35.86321385785778 ], "1_1_async_actor_calls_with_args_async": [ - 2836.298297310687, - 165.56556787435736 + 2426.398157992012, + 38.78002524735766 ], "1_n_actor_calls_async": [ - 8038.166251679982, - 223.66382715772104 + 7563.474741840271, + 160.3419047539893 ], "1_n_async_actor_calls_async": [ - 7382.681881276498, - 130.69555045858203 + 6964.257909926722, + 53.3826400982145 ], "client__1_1_actor_calls_async": [ - 1098.863141897179, - 11.579112801667774 + 846.4118553774217, + 21.74145942353796 ], "client__1_1_actor_calls_concurrent": [ - 1085.0288964711467, - 4.148700210547401 + 862.8335019710298, + 5.40969189845287 ], "client__1_1_actor_calls_sync": [ - 537.8164788509748, - 4.282391401398279 + 488.0060975075199, + 18.43611203884743 ], "client__get_calls": [ - 1159.3513798913632, - 25.153079890432657 + 983.5607099398597, + 44.68614802894011 ], "client__put_calls": [ - 817.4136861603523, - 35.13575238987404 + 769.3648317551028, + 25.86986897843832 ], "client__put_gigabytes": [ - 0.1559990403715773, - 0.0006899703405647251 + 0.10294244610916167, + 0.00021781279103519403 ], "client__tasks_and_get_batch": [ - 1.009944931213749, - 0.0320718636380897 + 0.8049748278618892, + 0.0384792096927115 ], "client__tasks_and_put_batch": [ - 14560.030073574557, - 146.72299114824276 + 10098.586104880465, + 158.55761529403424 ], "multi_client_put_calls_Plasma_Store": [ - 16526.35985553258, - 400.3514368958908 + 
10922.349171697762, + 411.8369713180647 ], "multi_client_put_gigabytes": [ - 38.137310138893675, - 1.3860853941620797 + 27.572292929404366, + 0.301414736739597 ], "multi_client_tasks_async": [ - 21229.843138559452, - 1404.0869837056882 + 19294.747670848352, + 1531.838851224768 ], "n_n_actor_calls_async": [ - 27375.624367126635, - 674.8368191945152 + 24808.730524179864, + 580.5120779930962 ], "n_n_actor_calls_with_arg_async": [ - 2759.3212097473174, - 60.45186810112816 + 2564.489147392739, + 58.242925806948335 ], "n_n_async_actor_calls_async": [ - 23674.50106467489, - 547.7052271058876 + 21602.16598513169, + 648.5971305332962 ], "perf_metrics": [ { "perf_metric_name": "single_client_get_calls_Plasma_Store", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 10620.405550394937 + "perf_metric_value": 9176.686326011131 }, { "perf_metric_name": "single_client_put_calls_Plasma_Store", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 5173.290112206238 + "perf_metric_value": 4795.051007052156 }, { "perf_metric_name": "multi_client_put_calls_Plasma_Store", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 16526.35985553258 + "perf_metric_value": 10922.349171697762 }, { "perf_metric_name": "single_client_put_gigabytes", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 19.85639156989914 + "perf_metric_value": 20.350152593657818 }, { "perf_metric_name": "single_client_tasks_and_get_batch", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 5.800654754787365 + "perf_metric_value": 5.261194854317881 }, { "perf_metric_name": "multi_client_put_gigabytes", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 38.137310138893675 + "perf_metric_value": 27.572292929404366 }, { "perf_metric_name": "single_client_get_object_containing_10k_refs", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 13.371722002108683 + "perf_metric_value": 13.142098493341212 }, { "perf_metric_name": "single_client_wait_1k_refs", "perf_metric_type": "THROUGHPUT", 
- "perf_metric_value": 5.079952667320649 + "perf_metric_value": 4.8129125825624035 }, { "perf_metric_name": "single_client_tasks_sync", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 980.7121217208985 + "perf_metric_value": 900.96738867954 }, { "perf_metric_name": "single_client_tasks_async", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 8040.530786886751 + "perf_metric_value": 7418.67591750316 }, { "perf_metric_name": "multi_client_tasks_async", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 21229.843138559452 + "perf_metric_value": 19294.747670848352 }, { "perf_metric_name": "1_1_actor_calls_sync", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 2011.916260420167 + "perf_metric_value": 1826.440590474467 }, { "perf_metric_name": "1_1_actor_calls_async", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 8663.654839458402 + "perf_metric_value": 7925.658042658907 }, { "perf_metric_name": "1_1_actor_calls_concurrent", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 5775.020315522301 + "perf_metric_value": 4710.115509639389 }, { "perf_metric_name": "1_n_actor_calls_async", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 8038.166251679982 + "perf_metric_value": 7563.474741840271 }, { "perf_metric_name": "n_n_actor_calls_async", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 27375.624367126635 + "perf_metric_value": 24808.730524179864 }, { "perf_metric_name": "n_n_actor_calls_with_arg_async", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 2759.3212097473174 + "perf_metric_value": 2564.489147392739 }, { "perf_metric_name": "1_1_async_actor_calls_sync", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 1459.7289131365046 + "perf_metric_value": 1374.047824125402 }, { "perf_metric_name": "1_1_async_actor_calls_async", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 4259.771844696956 + "perf_metric_value": 3645.3291604906276 }, { "perf_metric_name": 
"1_1_async_actor_calls_with_args_async", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 2836.298297310687 + "perf_metric_value": 2426.398157992012 }, { "perf_metric_name": "1_n_async_actor_calls_async", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 7382.681881276498 + "perf_metric_value": 6964.257909926722 }, { "perf_metric_name": "n_n_async_actor_calls_async", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 23674.50106467489 + "perf_metric_value": 21602.16598513169 }, { "perf_metric_name": "placement_group_create/removal", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 764.5677165695956 + "perf_metric_value": 751.064903521573 }, { "perf_metric_name": "client__get_calls", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 1159.3513798913632 + "perf_metric_value": 983.5607099398597 }, { "perf_metric_name": "client__put_calls", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 817.4136861603523 + "perf_metric_value": 769.3648317551028 }, { "perf_metric_name": "client__put_gigabytes", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 0.1559990403715773 + "perf_metric_value": 0.10294244610916167 }, { "perf_metric_name": "client__tasks_and_put_batch", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 14560.030073574557 + "perf_metric_value": 10098.586104880465 }, { "perf_metric_name": "client__1_1_actor_calls_sync", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 537.8164788509748 + "perf_metric_value": 488.0060975075199 }, { "perf_metric_name": "client__1_1_actor_calls_async", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 1098.863141897179 + "perf_metric_value": 846.4118553774217 }, { "perf_metric_name": "client__1_1_actor_calls_concurrent", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 1085.0288964711467 + "perf_metric_value": 862.8335019710298 }, { "perf_metric_name": "client__tasks_and_get_batch", "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 
1.009944931213749 + "perf_metric_value": 0.8049748278618892 } ], "placement_group_create/removal": [ - 764.5677165695956, - 11.50741876717501 + 751.064903521573, + 5.332518268184338 ], "single_client_get_calls_Plasma_Store": [ - 10620.405550394937, - 95.5780186318987 + 9176.686326011131, + 202.59360315795405 ], "single_client_get_object_containing_10k_refs": [ - 13.371722002108683, - 0.2715300404352367 + 13.142098493341212, + 0.280827763090365 ], "single_client_put_calls_Plasma_Store": [ - 5173.290112206238, - 50.54867941540244 + 4795.051007052156, + 55.29886971022227 ], "single_client_put_gigabytes": [ - 19.85639156989914, - 8.982486882151242 + 20.350152593657818, + 6.284060239581299 ], "single_client_tasks_and_get_batch": [ - 5.800654754787365, - 3.260748466569974 + 5.261194854317881, + 2.8864991514393927 ], "single_client_tasks_async": [ - 8040.530786886751, - 508.5067401143829 + 7418.67591750316, + 224.65732622349898 ], "single_client_tasks_sync": [ - 980.7121217208985, - 15.070879654529714 + 900.96738867954, + 14.441231923805944 ], "single_client_wait_1k_refs": [ - 5.079952667320649, - 0.11950057107198113 + 4.8129125825624035, + 0.007111082814526685 ] } diff --git a/release/perf_metrics/scalability/object_store.json b/release/perf_metrics/scalability/object_store.json index 367cb088c4bb..7d152e5a6d66 100644 --- a/release/perf_metrics/scalability/object_store.json +++ b/release/perf_metrics/scalability/object_store.json @@ -1,12 +1,12 @@ { - "broadcast_time": 17.324763202, + "broadcast_time": 13.41017694899999, "num_nodes": 50, "object_size": 1073741824, "perf_metrics": [ { "perf_metric_name": "time_to_broadcast_1073741824_bytes_to_50_nodes", "perf_metric_type": "LATENCY", - "perf_metric_value": 17.324763202 + "perf_metric_value": 13.41017694899999 } ], "success": "1" diff --git a/release/perf_metrics/scalability/single_node.json b/release/perf_metrics/scalability/single_node.json index bc2cd08fe8df..57cdf632f646 100644 --- 
a/release/perf_metrics/scalability/single_node.json +++ b/release/perf_metrics/scalability/single_node.json @@ -1,8 +1,8 @@ { - "args_time": 18.84486551900001, - "get_time": 23.075941746000012, + "args_time": 19.077259766999987, + "get_time": 24.000713915999995, "large_object_size": 107374182400, - "large_object_time": 32.03462247800002, + "large_object_time": 31.36117459099995, "num_args": 10000, "num_get_args": 10000, "num_queued": 1000000, @@ -11,30 +11,30 @@ { "perf_metric_name": "10000_args_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 18.84486551900001 + "perf_metric_value": 19.077259766999987 }, { "perf_metric_name": "3000_returns_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 6.088559257 + "perf_metric_value": 5.790547841000006 }, { "perf_metric_name": "10000_get_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 23.075941746000012 + "perf_metric_value": 24.000713915999995 }, { "perf_metric_name": "1000000_queued_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 199.176572467 + "perf_metric_value": 179.146127773 }, { "perf_metric_name": "107374182400_large_object_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 32.03462247800002 + "perf_metric_value": 31.36117459099995 } ], - "queued_time": 199.176572467, - "returns_time": 6.088559257, + "queued_time": 179.146127773, + "returns_time": 5.790547841000006, "success": "1" } diff --git a/release/perf_metrics/stress_tests/stress_test_dead_actors.json b/release/perf_metrics/stress_tests/stress_test_dead_actors.json index 7daf8903fe7f..991e91d96aec 100644 --- a/release/perf_metrics/stress_tests/stress_test_dead_actors.json +++ b/release/perf_metrics/stress_tests/stress_test_dead_actors.json @@ -1,14 +1,14 @@ { - "avg_iteration_time": 1.1874613547325135, - "max_iteration_time": 3.250436544418335, - "min_iteration_time": 0.05550789833068848, + "avg_iteration_time": 1.2971700072288512, + "max_iteration_time": 5.189502000808716, + "min_iteration_time": 
0.06091117858886719, "perf_metrics": [ { "perf_metric_name": "avg_iteration_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 1.1874613547325135 + "perf_metric_value": 1.2971700072288512 } ], "success": 1, - "total_time": 118.7462546825409 + "total_time": 129.71714234352112 } diff --git a/release/perf_metrics/stress_tests/stress_test_many_tasks.json b/release/perf_metrics/stress_tests/stress_test_many_tasks.json index accdf6d571e9..bee0d6200cac 100644 --- a/release/perf_metrics/stress_tests/stress_test_many_tasks.json +++ b/release/perf_metrics/stress_tests/stress_test_many_tasks.json @@ -3,45 +3,45 @@ { "perf_metric_name": "stage_0_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 7.344109535217285 + "perf_metric_value": 7.735846281051636 }, { "perf_metric_name": "stage_1_avg_iteration_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 12.96969530582428 + "perf_metric_value": 12.93162693977356 }, { "perf_metric_name": "stage_2_avg_iteration_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 33.957920932769774 + "perf_metric_value": 33.983641386032104 }, { "perf_metric_name": "stage_3_creation_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 1.8526091575622559 + "perf_metric_value": 1.8725192546844482 }, { "perf_metric_name": "stage_3_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 1826.5975222587585 + "perf_metric_value": 1821.4706330299377 }, { "perf_metric_name": "stage_4_spread", "perf_metric_type": "LATENCY", - "perf_metric_value": 0.48570817077228695 + "perf_metric_value": 0.5580154959703073 } ], - "stage_0_time": 7.344109535217285, - "stage_1_avg_iteration_time": 12.96969530582428, - "stage_1_max_iteration_time": 13.717556715011597, - "stage_1_min_iteration_time": 11.527287244796753, - "stage_1_time": 129.69700860977173, - "stage_2_avg_iteration_time": 33.957920932769774, - "stage_2_max_iteration_time": 34.32049250602722, - "stage_2_min_iteration_time": 33.68821382522583, - "stage_2_time": 
169.79015111923218, - "stage_3_creation_time": 1.8526091575622559, - "stage_3_time": 1826.5975222587585, - "stage_4_spread": 0.48570817077228695, + "stage_0_time": 7.735846281051636, + "stage_1_avg_iteration_time": 12.93162693977356, + "stage_1_max_iteration_time": 13.44619870185852, + "stage_1_min_iteration_time": 11.569173812866211, + "stage_1_time": 129.31632256507874, + "stage_2_avg_iteration_time": 33.983641386032104, + "stage_2_max_iteration_time": 34.43809151649475, + "stage_2_min_iteration_time": 33.45232319831848, + "stage_2_time": 169.91874861717224, + "stage_3_creation_time": 1.8725192546844482, + "stage_3_time": 1821.4706330299377, + "stage_4_spread": 0.5580154959703073, "success": 1 } diff --git a/release/perf_metrics/stress_tests/stress_test_placement_group.json b/release/perf_metrics/stress_tests/stress_test_placement_group.json index 2ef542254e31..d70d74c39e18 100644 --- a/release/perf_metrics/stress_tests/stress_test_placement_group.json +++ b/release/perf_metrics/stress_tests/stress_test_placement_group.json @@ -1,16 +1,16 @@ { - "avg_pg_create_time_ms": 1.4493968768766456, - "avg_pg_remove_time_ms": 1.2057934429429915, + "avg_pg_create_time_ms": 1.5636188018035782, + "avg_pg_remove_time_ms": 1.396923734234321, "perf_metrics": [ { "perf_metric_name": "avg_pg_create_time_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 1.4493968768766456 + "perf_metric_value": 1.5636188018035782 }, { "perf_metric_name": "avg_pg_remove_time_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 1.2057934429429915 + "perf_metric_value": 1.396923734234321 } ], "success": 1 diff --git a/release/ray_release/bazel.py b/release/ray_release/bazel.py index 895fefc5cc7d..aa82b3a5acf7 100644 --- a/release/ray_release/bazel.py +++ b/release/ray_release/bazel.py @@ -2,7 +2,7 @@ import runfiles -REPO_NAME = "com_github_ray_project_ray" +REPO_NAME = "io_ray" _LEGACY_REPO_ROOT = os.path.abspath( os.path.join(os.path.dirname(__file__), "../.."), ) diff --git 
a/release/ray_release/buildkite/filter.py b/release/ray_release/buildkite/filter.py index 727b0711896e..c2685f124918 100644 --- a/release/ray_release/buildkite/filter.py +++ b/release/ray_release/buildkite/filter.py @@ -22,28 +22,38 @@ def _unflattened_lookup(lookup: Dict, flat_key: str, delimiter: str = "/") -> An def filter_tests( test_collection: List[Test], frequency: Frequency, - test_attr_regex_filters: Optional[Dict[str, str]] = None, + test_filters: Optional[Dict[str, str]] = None, prefer_smoke_tests: bool = False, run_jailed_tests: bool = False, run_unstable_tests: bool = False, ) -> List[Tuple[Test, bool]]: - if test_attr_regex_filters is None: - test_attr_regex_filters = {} + if test_filters is None: + test_filters = {} tests_to_run = [] for test in test_collection: + attr_mismatch = False # Skip kuberay tests for now. # TODO: (khluu) Remove this once we start running KubeRay release tests. if test.is_kuberay() and get_global_config()["kuberay_disabled"]: continue - # First, filter by string attributes - attr_mismatch = False - for attr, regex in test_attr_regex_filters.items(): - if not re.fullmatch(regex, _unflattened_lookup(test, attr) or ""): - attr_mismatch = True - break + + # Check if any test attributes match filters + if test_filters: + for attr, value in test_filters.items(): + # Only prefix filter doesn't use regex + if attr == "prefix": + if not test.get_name().startswith(value): + attr_mismatch = True + break + else: # Match filters using regex + attr_value = _unflattened_lookup(test, attr) or "" + if not re.fullmatch(value, attr_value): + attr_mismatch = True + break if attr_mismatch: continue + if not run_jailed_tests: clone_test = copy.deepcopy(test) clone_test.update_from_s3() diff --git a/release/ray_release/buildkite/settings.py b/release/ray_release/buildkite/settings.py index da8e88245443..6e6579fe4faa 100644 --- a/release/ray_release/buildkite/settings.py +++ b/release/ray_release/buildkite/settings.py @@ -11,7 +11,6 @@ class 
Frequency(enum.Enum): MANUAL = enum.auto() ANY = enum.auto() - MULTI = enum.auto() NIGHTLY = enum.auto() NIGHTLY_3x = enum.auto() WEEKLY = enum.auto() @@ -22,7 +21,6 @@ class Frequency(enum.Enum): "manual": Frequency.MANUAL, "any": Frequency.ANY, "any-smoke": Frequency.ANY, - "multi": Frequency.MULTI, "nightly": Frequency.NIGHTLY, "nightly-3x": Frequency.NIGHTLY_3x, "weekly": Frequency.WEEKLY, @@ -65,11 +63,11 @@ def get_priority(priority_str: str) -> Priority: return priority_str_to_enum[priority_str] -def get_test_attr_regex_filters(filters_str: str) -> Dict[str, str]: +def get_test_filters(filters_str: str) -> Dict[str, str]: if not filters_str: return {} - test_attr_regex_filters = {} + test_filters = {} for line in filters_str.splitlines(): line = line.strip() if not line: @@ -77,11 +75,10 @@ def get_test_attr_regex_filters(filters_str: str) -> Dict[str, str]: parts = line.split(":", maxsplit=1) if len(parts) != 2: raise ReleaseTestConfigError( - f"Invalid test attr regex filter: {line}. " - "Should be of the form attr:regex" + f"Invalid test filter: {line}. " "Should be of the form attr:value" ) - test_attr_regex_filters[parts[0]] = parts[1] - return test_attr_regex_filters + test_filters[parts[0]] = parts[1] + return test_filters def split_ray_repo_str(repo_str: str) -> Tuple[str, str]: @@ -129,7 +126,7 @@ def get_default_settings() -> Dict: settings = { "frequency": Frequency.ANY, "prefer_smoke_tests": False, - "test_attr_regex_filters": None, + "test_filters": None, "ray_test_repo": None, "ray_test_branch": None, "priority": Priority.DEFAULT, @@ -160,12 +157,13 @@ def update_settings_from_environment(settings: Dict) -> Dict: if "TEST_NAME" in os.environ: # This is for backward compatibility. 
- settings["test_attr_regex_filters"] = get_test_attr_regex_filters( - "name:" + os.environ["TEST_NAME"] - ) + settings["test_filters"] = get_test_filters("name:" + os.environ["TEST_NAME"]) + + if "TEST_FILTERS" in os.environ: + settings["test_filters"] = os.environ["TEST_FILTERS"] if "TEST_ATTR_REGEX_FILTERS" in os.environ: - settings["test_attr_regex_filters"] = get_test_attr_regex_filters( + settings["test_filters"] = get_test_filters( os.environ["TEST_ATTR_REGEX_FILTERS"] ) @@ -193,17 +191,13 @@ def update_settings_from_buildkite(settings: Dict): test_name_filter = get_buildkite_prompt_value("release-test-name") if test_name_filter: - settings["test_attr_regex_filters"] = get_test_attr_regex_filters( - "name:" + test_name_filter - ) + settings["test_filters"] = get_test_filters("name:" + test_name_filter) - test_attr_regex_filters = get_buildkite_prompt_value( - "release-test-attr-regex-filters" - ) - if test_attr_regex_filters: - settings["test_attr_regex_filters"] = get_test_attr_regex_filters( - test_attr_regex_filters - ) + test_filters = get_buildkite_prompt_value( + "release-test-filters" + ) or get_buildkite_prompt_value("release-test-attr-regex-filters") + if test_filters: + settings["test_filters"] = get_test_filters(test_filters) test_priority = get_buildkite_prompt_value("release-priority") if test_priority: diff --git a/release/ray_release/byod/build.py b/release/ray_release/byod/build.py index 62ae1d62f3b3..882a73c24859 100644 --- a/release/ray_release/byod/build.py +++ b/release/ray_release/byod/build.py @@ -1,11 +1,8 @@ from typing import List, Optional, Dict -import boto3 -import hashlib import os import subprocess import sys -import time from ray_release.config import RELEASE_PACKAGE_DIR from ray_release.logger import logger @@ -15,19 +12,11 @@ bazel_workspace_dir = os.environ.get("BUILD_WORKSPACE_DIRECTORY", "") -DATAPLANE_S3_BUCKET = "ray-release-automation-results" -DATAPLANE_FILENAME = "dataplane_20250624.tar.gz" -DATAPLANE_DIGEST = 
"3cffb55f1a56f0bc6256cbf1a38bf1e764e202a647a4272b80531760f1250059" -BASE_IMAGE_WAIT_TIMEOUT = 7200 -BASE_IMAGE_WAIT_DURATION = 30 RELEASE_BYOD_DIR = ( os.path.join(bazel_workspace_dir, "release/ray_release/byod") if bazel_workspace_dir else os.path.join(RELEASE_PACKAGE_DIR, "ray_release/byod") ) -REQUIREMENTS_BYOD = "requirements_byod" -REQUIREMENTS_LLM_BYOD = "requirements_llm_byod" -REQUIREMENTS_ML_BYOD = "requirements_ml_byod" def build_anyscale_custom_byod_image( @@ -60,85 +49,22 @@ def build_anyscale_custom_byod_image( _validate_and_push(image) -def build_anyscale_base_byod_images(tests: List[Test]) -> None: +def build_anyscale_base_byod_images(tests: List[Test]) -> List[str]: """ Builds the Anyscale BYOD images for the given tests. """ - _download_dataplane_build_file() - to_be_built = {} - built = set() + images = set() for test in tests: - to_be_built[test.get_anyscale_base_byod_image()] = test + images.add(test.get_anyscale_base_byod_image()) - env = os.environ.copy() - env["DOCKER_BUILDKIT"] = "1" - start = int(time.time()) - # ray images are built on post-merge, so we can wait for them to be available - while ( - len(built) < len(to_be_built) - and int(time.time()) - start < BASE_IMAGE_WAIT_TIMEOUT - ): - for byod_image, test in to_be_built.items(): - py_version = test.get_python_version() - if test.use_byod_ml_image(): - byod_requirements = f"{REQUIREMENTS_ML_BYOD}_{py_version}.txt" - elif test.use_byod_llm_image(): - byod_requirements = f"{REQUIREMENTS_LLM_BYOD}_{py_version}.txt" - else: - byod_requirements = f"{REQUIREMENTS_BYOD}_{py_version}.txt" - - if _image_exist(byod_image): - logger.info(f"Image {byod_image} already exists") - built.add(byod_image) - continue - ray_image = test.get_ray_image() - if not _image_exist(ray_image): - # TODO(can): instead of waiting for the base image to be built, we can - # build it ourselves - timeout = BASE_IMAGE_WAIT_TIMEOUT - (int(time.time()) - start) - logger.info( - f"Image {ray_image} does not exist yet. 
" - f"Wait for another {timeout}s..." - ) - time.sleep(BASE_IMAGE_WAIT_DURATION) - continue - logger.info(f"Building {byod_image} from {ray_image}") - with open(DATAPLANE_FILENAME, "rb") as build_file: - subprocess.check_call( - [ - "docker", - "build", - "--progress=plain", - "--build-arg", - f"BASE_IMAGE={ray_image}", - "-t", - byod_image, - "-", - ], - stdin=build_file, - stdout=sys.stderr, - env=env, - ) - subprocess.check_call( - [ - "docker", - "build", - "--progress=plain", - "--build-arg", - f"BASE_IMAGE={byod_image}", - "--build-arg", - f"PIP_REQUIREMENTS={byod_requirements}", - "-t", - byod_image, - "-f", - os.path.join(RELEASE_BYOD_DIR, "byod.Dockerfile"), - RELEASE_BYOD_DIR, - ], - stdout=sys.stderr, - env=env, - ) - _validate_and_push(byod_image) - built.add(byod_image) + image_list = list(images) + image_list.sort() + + for image in image_list: + if not _image_exist(image): + raise RuntimeError(f"Image {image} not found") + + return image_list def _validate_and_push(byod_image: str) -> None: @@ -189,21 +115,6 @@ def _get_ray_commit(envs: Optional[Dict[str, str]] = None) -> str: return "" -def _download_dataplane_build_file() -> None: - """ - Downloads the dataplane build file from S3. - """ - s3 = boto3.client("s3") - s3.download_file( - Bucket=DATAPLANE_S3_BUCKET, - Key=DATAPLANE_FILENAME, - Filename=DATAPLANE_FILENAME, - ) - with open(DATAPLANE_FILENAME, "rb") as build_context: - digest = hashlib.sha256(build_context.read()).hexdigest() - assert digest == DATAPLANE_DIGEST, "Mismatched dataplane digest found!" 
- - def _image_exist(image: str) -> bool: """ Checks if the given image exists in Docker diff --git a/release/ray_release/byod/byod.Dockerfile b/release/ray_release/byod/byod.Dockerfile index 91a5117575ed..ce8c2a8080ca 100644 --- a/release/ray_release/byod/byod.Dockerfile +++ b/release/ray_release/byod/byod.Dockerfile @@ -6,38 +6,35 @@ FROM "$BASE_IMAGE" ARG PIP_REQUIREMENTS +COPY "$PIP_REQUIREMENTS" extra-test-requirements.txt + RUN <=2.5.0 pytest +pyyaml requests>=2.31.0 semidbm s3fs @@ -38,4 +41,3 @@ typing-extensions xarray xgboost zarr -pyyaml diff --git a/release/ray_release/byod/requirements_byod_3.9.txt b/release/ray_release/byod/requirements_byod_3.9.txt index 7a28fcc4d03c..870a5236d18d 100644 --- a/release/ray_release/byod/requirements_byod_3.9.txt +++ b/release/ray_release/byod/requirements_byod_3.9.txt @@ -14,9 +14,9 @@ absl-py==1.4.0 \ # -c release/ray_release/byod/requirements_compiled.txt # tensorboard # tensorflow -aiobotocore==2.5.0 \ - --hash=sha256:6a5b397cddd4f81026aa91a14c7dd2650727425740a5af8ba75127ff663faf67 \ - --hash=sha256:9a2a022d7b78ec9a2af0de589916d2721cddbf96264401b78d7a73c1a1435f3b +aiobotocore==2.8.0 \ + --hash=sha256:32e632fea387acd45416c2bbc03828ee2c2a66a7dc4bd3a9bcb808dea249c469 \ + --hash=sha256:f160497cef21cfffc1a8d4219eeb27bb7b243389c2d021a812b9c0e3fb8e2bd1 # via # -c release/ray_release/byod/requirements_compiled.txt # s3fs @@ -167,6 +167,7 @@ anyio==3.7.1 \ --hash=sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5 # via # -c release/ray_release/byod/requirements_compiled.txt + # httpx # starlette argcomplete==3.3.0 \ --hash=sha256:c168c3723482c031df3c207d4ba8fa702717ccb9fc0bfe4117166c1f537b4a54 \ @@ -203,15 +204,15 @@ boto==2.49.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # gcs-oauth2-boto-plugin -boto3==1.26.76 \ - --hash=sha256:30c7d967ed1c6b5a05643e42cae9d4d36c3f1cb6782637ddc7007a104cfd9027 \ - --hash=sha256:b4c2969b7677762914394b8273cc1905dfe5b71f250741c1a575487ae357e729 
+boto3==1.29.7 \ + --hash=sha256:1eb4c548118b5fc5e018dee956fd33e6fb249cd1f2def85f1bba816aef4d9f3e \ + --hash=sha256:96e9890ebe7cd823b5f4976dd676e112c000c6528c28e20a2f274590589dd18b # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_byod_3.9.in -botocore==1.29.76 \ - --hash=sha256:70735b00cd529f152992231ca6757e458e5ec25db43767b3526e9a35b2f143b7 \ - --hash=sha256:c2f67b6b3f8acf2968eafca06526f07b9fb0d27bac4c68a635d51abb675134a7 +botocore==1.32.7 \ + --hash=sha256:58b33d02cafa23461c8a9d211b30e8cded992380a84de409379fd02811fa3e11 \ + --hash=sha256:c6795c731b04c8e3635588c44cfd1a4462fc5987859195522c96812cf3eceff9 # via # -c release/ray_release/byod/requirements_compiled.txt # aiobotocore @@ -314,6 +315,8 @@ certifi==2025.1.31 \ # via # -c release/ray_release/byod/requirements_compiled.txt # geventhttpclient + # httpcore + # httpx # requests cffi==1.16.0 \ --hash=sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc \ @@ -680,9 +683,9 @@ diskcache==5.6.3 \ --hash=sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc \ --hash=sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19 # via petastorm -exceptiongroup==1.2.1 \ - --hash=sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad \ - --hash=sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16 +exceptiongroup==1.3.0 \ + --hash=sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10 \ + --hash=sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88 # via # anyio # pytest @@ -811,9 +814,9 @@ frozenlist==1.4.1 \ # -c release/ray_release/byod/requirements_compiled.txt # aiohttp # aiosignal -fsspec==2023.5.0 \ - --hash=sha256:51a4ad01a5bb66fcc58036e288c0d53d3975a0df2a5dc59a93b59bade0391f2a \ - --hash=sha256:b3b56e00fb93ea321bc9e5d9cf6f8522a0198b20eb24e02774d329e9c6fb84ce +fsspec==2023.12.1 \ + 
--hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 # via # -c release/ray_release/byod/requirements_compiled.txt # gcsfs @@ -836,9 +839,9 @@ gcs-oauth2-boto-plugin==3.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # gsutil -gcsfs==2023.5.0 \ - --hash=sha256:02a815e1cf28197ab4f57335e89dc5df8744a065c7c956d42692b50a9e8f1625 \ - --hash=sha256:4f2ebc41814de3f566f85dec208704cf19823b9d04a55fd12b3142aef9046525 +gcsfs==2023.12.1 \ + --hash=sha256:c1ccfa9f84dca019cd334aaf7eb03cc1dc13c296717346927a9fd40255348f9c \ + --hash=sha256:e86cc583fdf879e5ea2f87bab61738d26ec7e8972762a1e6c6ab758b1e1af99c # via -r release/ray_release/byod/requirements_byod_3.9.in gevent==24.2.1 \ --hash=sha256:03aa5879acd6b7076f6a2a307410fb1e0d288b84b03cdfd8c74db8b4bc882fc5 \ @@ -1199,64 +1202,59 @@ greenlet==3.0.1 \ # via # -c release/ray_release/byod/requirements_compiled.txt # gevent -grpcio==1.66.2 ; sys_platform != "darwin" \ - --hash=sha256:02697eb4a5cbe5a9639f57323b4c37bcb3ab2d48cec5da3dc2f13334d72790dd \ - --hash=sha256:03b0b307ba26fae695e067b94cbb014e27390f8bc5ac7a3a39b7723fed085604 \ - --hash=sha256:05bc2ceadc2529ab0b227b1310d249d95d9001cd106aa4d31e8871ad3c428d73 \ - --hash=sha256:06de8ec0bd71be123eec15b0e0d457474931c2c407869b6c349bd9bed4adbac3 \ - --hash=sha256:0be4e0490c28da5377283861bed2941d1d20ec017ca397a5df4394d1c31a9b50 \ - --hash=sha256:12fda97ffae55e6526825daf25ad0fa37483685952b5d0f910d6405c87e3adb6 \ - --hash=sha256:1caa38fb22a8578ab8393da99d4b8641e3a80abc8fd52646f1ecc92bcb8dee34 \ - --hash=sha256:2018b053aa15782db2541ca01a7edb56a0bf18c77efed975392583725974b249 \ - --hash=sha256:20657d6b8cfed7db5e11b62ff7dfe2e12064ea78e93f1434d61888834bc86d75 \ - --hash=sha256:2335c58560a9e92ac58ff2bc5649952f9b37d0735608242973c7a8b94a6437d8 \ - --hash=sha256:31fd163105464797a72d901a06472860845ac157389e10f12631025b3e4d0453 \ - 
--hash=sha256:38b68498ff579a3b1ee8f93a05eb48dc2595795f2f62716e797dc24774c1aaa8 \ - --hash=sha256:3b00efc473b20d8bf83e0e1ae661b98951ca56111feb9b9611df8efc4fe5d55d \ - --hash=sha256:3ed71e81782966ffead60268bbda31ea3f725ebf8aa73634d5dda44f2cf3fb9c \ - --hash=sha256:45a3d462826f4868b442a6b8fdbe8b87b45eb4f5b5308168c156b21eca43f61c \ - --hash=sha256:49f0ca7ae850f59f828a723a9064cadbed90f1ece179d375966546499b8a2c9c \ - --hash=sha256:4e504572433f4e72b12394977679161d495c4c9581ba34a88d843eaf0f2fbd39 \ - --hash=sha256:4ea1d062c9230278793820146c95d038dc0f468cbdd172eec3363e42ff1c7d01 \ - --hash=sha256:563588c587b75c34b928bc428548e5b00ea38c46972181a4d8b75ba7e3f24231 \ - --hash=sha256:6001e575b8bbd89eee11960bb640b6da6ae110cf08113a075f1e2051cc596cae \ - --hash=sha256:66a0cd8ba6512b401d7ed46bb03f4ee455839957f28b8d61e7708056a806ba6a \ - --hash=sha256:6851de821249340bdb100df5eacfecfc4e6075fa85c6df7ee0eb213170ec8e5d \ - --hash=sha256:728bdf36a186e7f51da73be7f8d09457a03061be848718d0edf000e709418987 \ - --hash=sha256:73e3b425c1e155730273f73e419de3074aa5c5e936771ee0e4af0814631fb30a \ - --hash=sha256:73fc8f8b9b5c4a03e802b3cd0c18b2b06b410d3c1dcbef989fdeb943bd44aff7 \ - --hash=sha256:78fa51ebc2d9242c0fc5db0feecc57a9943303b46664ad89921f5079e2e4ada7 \ - --hash=sha256:7b2c86457145ce14c38e5bf6bdc19ef88e66c5fee2c3d83285c5aef026ba93b3 \ - --hash=sha256:7d69ce1f324dc2d71e40c9261d3fdbe7d4c9d60f332069ff9b2a4d8a257c7b2b \ - --hash=sha256:802d84fd3d50614170649853d121baaaa305de7b65b3e01759247e768d691ddf \ - --hash=sha256:80fd702ba7e432994df208f27514280b4b5c6843e12a48759c9255679ad38db8 \ - --hash=sha256:8ac475e8da31484efa25abb774674d837b343afb78bb3bcdef10f81a93e3d6bf \ - --hash=sha256:950da58d7d80abd0ea68757769c9db0a95b31163e53e5bb60438d263f4bed7b7 \ - --hash=sha256:99a641995a6bc4287a6315989ee591ff58507aa1cbe4c2e70d88411c4dcc0839 \ - --hash=sha256:9c3a99c519f4638e700e9e3f83952e27e2ea10873eecd7935823dab0c1c9250e \ - --hash=sha256:9c509a4f78114cbc5f0740eb3d7a74985fd2eff022971bc9bc31f8bc93e66a3b \ - 
--hash=sha256:a18e20d8321c6400185b4263e27982488cb5cdd62da69147087a76a24ef4e7e3 \ - --hash=sha256:a917d26e0fe980b0ac7bfcc1a3c4ad6a9a4612c911d33efb55ed7833c749b0ee \ - --hash=sha256:a9539f01cb04950fd4b5ab458e64a15f84c2acc273670072abe49a3f29bbad54 \ - --hash=sha256:ad2efdbe90c73b0434cbe64ed372e12414ad03c06262279b104a029d1889d13e \ - --hash=sha256:b672abf90a964bfde2d0ecbce30f2329a47498ba75ce6f4da35a2f4532b7acbc \ - --hash=sha256:bbd27c24a4cc5e195a7f56cfd9312e366d5d61b86e36d46bbe538457ea6eb8dd \ - --hash=sha256:c400ba5675b67025c8a9f48aa846f12a39cf0c44df5cd060e23fda5b30e9359d \ - --hash=sha256:c408f5ef75cfffa113cacd8b0c0e3611cbfd47701ca3cdc090594109b9fcbaed \ - --hash=sha256:c806852deaedee9ce8280fe98955c9103f62912a5b2d5ee7e3eaa284a6d8d8e7 \ - --hash=sha256:ce89f5876662f146d4c1f695dda29d4433a5d01c8681fbd2539afff535da14d4 \ - --hash=sha256:d25a14af966438cddf498b2e338f88d1c9706f3493b1d73b93f695c99c5f0e2a \ - --hash=sha256:d8d4732cc5052e92cea2f78b233c2e2a52998ac40cd651f40e398893ad0d06ec \ - --hash=sha256:d9a9724a156c8ec6a379869b23ba3323b7ea3600851c91489b871e375f710bc8 \ - --hash=sha256:e636ce23273683b00410f1971d209bf3689238cf5538d960adc3cdfe80dd0dbd \ - --hash=sha256:e88264caad6d8d00e7913996030bac8ad5f26b7411495848cc218bd3a9040b6c \ - --hash=sha256:f145cc21836c332c67baa6fc81099d1d27e266401565bf481948010d6ea32d46 \ - --hash=sha256:fb57870449dfcfac428afbb5a877829fcb0d6db9d9baa1148705739e9083880e \ - --hash=sha256:fb70487c95786e345af5e854ffec8cb8cc781bcc5df7930c4fbb7feaa72e1cdf \ - --hash=sha256:fe96281713168a3270878255983d2cb1a97e034325c8c2c25169a69289d3ecfa \ - --hash=sha256:ff1f7882e56c40b0d33c4922c15dfa30612f05fb785074a012f7cda74d1c3679 +grpcio==1.74.0 \ + --hash=sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f \ + --hash=sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc \ + --hash=sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7 \ + 
--hash=sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7 \ + --hash=sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a \ + --hash=sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4 \ + --hash=sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac \ + --hash=sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6 \ + --hash=sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89 \ + --hash=sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3 \ + --hash=sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49 \ + --hash=sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20 \ + --hash=sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f \ + --hash=sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc \ + --hash=sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae \ + --hash=sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82 \ + --hash=sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b \ + --hash=sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91 \ + --hash=sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9 \ + --hash=sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5 \ + --hash=sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362 \ + --hash=sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a \ + --hash=sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d \ + --hash=sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb \ + --hash=sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31 \ + --hash=sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b \ + --hash=sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854 \ + 
--hash=sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1 \ + --hash=sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176 \ + --hash=sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8 \ + --hash=sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907 \ + --hash=sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11 \ + --hash=sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c \ + --hash=sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4 \ + --hash=sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7 \ + --hash=sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707 \ + --hash=sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5 \ + --hash=sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce \ + --hash=sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa \ + --hash=sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01 \ + --hash=sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9 \ + --hash=sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182 \ + --hash=sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b \ + --hash=sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486 \ + --hash=sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 \ + --hash=sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3 \ + --hash=sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11 \ + --hash=sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa \ + --hash=sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e \ + --hash=sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24 \ + --hash=sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e # via - # 
-c release/ray_release/byod/requirements_compiled.txt # tensorboard # tensorflow gsutil==5.27 \ @@ -1264,12 +1262,18 @@ gsutil==5.27 \ # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_byod_3.9.in -gymnasium[atari]==1.0.0 \ - --hash=sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403 \ - --hash=sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad +gymnasium[atari]==1.1.1 \ + --hash=sha256:8bd9ea9bdef32c950a444ff36afc785e1d81051ec32d30435058953c20d2456d \ + --hash=sha256:9c167ec0a2b388666e37f63b2849cd2552f7f5b71938574c637bb36487eb928a # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_byod_3.9.in +h11==0.16.0 \ + --hash=sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1 \ + --hash=sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86 + # via + # -c release/ray_release/byod/requirements_compiled.txt + # httpcore h5py==3.10.0 \ --hash=sha256:012ab448590e3c4f5a8dd0f3533255bc57f80629bf7c5054cf4c87b30085063c \ --hash=sha256:212bb997a91e6a895ce5e2f365ba764debeaef5d2dca5c6fb7098d66607adf99 \ @@ -1299,6 +1303,12 @@ h5py==3.10.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # tensorflow +httpcore==1.0.9 \ + --hash=sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55 \ + --hash=sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8 + # via + # -c release/ray_release/byod/requirements_compiled.txt + # httpx httplib2==0.20.4 \ --hash=sha256:58a98e45b4b1a48273073f905d2961666ecf0fbac4250ea5b47aef259eb5c585 \ --hash=sha256:8b6a905cb1c79eefd03f8669fd993c36dc341f7c558f056cb5a33b5c2f458543 @@ -1308,12 +1318,19 @@ httplib2==0.20.4 \ # google-apitools # gsutil # oauth2client +httpx==0.27.2 \ + --hash=sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0 \ + 
--hash=sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2 + # via + # -c release/ray_release/byod/requirements_compiled.txt + # -r release/ray_release/byod/requirements_byod_3.9.in idna==3.7 \ --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via # -c release/ray_release/byod/requirements_compiled.txt # anyio + # httpx # requests # yarl importlib-metadata==6.11.0 \ @@ -1816,6 +1833,60 @@ opt-einsum==3.3.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # tensorflow +orjson==3.9.15 \ + --hash=sha256:001f4eb0ecd8e9ebd295722d0cbedf0748680fb9998d3993abaed2f40587257a \ + --hash=sha256:05a1f57fb601c426635fcae9ddbe90dfc1ed42245eb4c75e4960440cac667262 \ + --hash=sha256:10c57bc7b946cf2efa67ac55766e41764b66d40cbd9489041e637c1304400494 \ + --hash=sha256:12365576039b1a5a47df01aadb353b68223da413e2e7f98c02403061aad34bde \ + --hash=sha256:2973474811db7b35c30248d1129c64fd2bdf40d57d84beed2a9a379a6f57d0ab \ + --hash=sha256:2b5c0f532905e60cf22a511120e3719b85d9c25d0e1c2a8abb20c4dede3b05a5 \ + --hash=sha256:2c51378d4a8255b2e7c1e5cc430644f0939539deddfa77f6fac7b56a9784160a \ + --hash=sha256:2d99e3c4c13a7b0fb3792cc04c2829c9db07838fb6973e578b85c1745e7d0ce7 \ + --hash=sha256:2f256d03957075fcb5923410058982aea85455d035607486ccb847f095442bda \ + --hash=sha256:34cbcd216e7af5270f2ffa63a963346845eb71e174ea530867b7443892d77180 \ + --hash=sha256:4228aace81781cc9d05a3ec3a6d2673a1ad0d8725b4e915f1089803e9efd2b99 \ + --hash=sha256:4feeb41882e8aa17634b589533baafdceb387e01e117b1ec65534ec724023d04 \ + --hash=sha256:57d5d8cf9c27f7ef6bc56a5925c7fbc76b61288ab674eb352c26ac780caa5b10 \ + --hash=sha256:5bb399e1b49db120653a31463b4a7b27cf2fbfe60469546baf681d1b39f4edf2 \ + --hash=sha256:62482873e0289cf7313461009bf62ac8b2e54bc6f00c6fabcde785709231a5d7 \ + --hash=sha256:67384f588f7f8daf040114337d34a5188346e3fae6c38b6a19a2fe8c663a2f9b \ + 
--hash=sha256:6ae4e06be04dc00618247c4ae3f7c3e561d5bc19ab6941427f6d3722a0875ef7 \ + --hash=sha256:6f7b65bfaf69493c73423ce9db66cfe9138b2f9ef62897486417a8fcb0a92bfe \ + --hash=sha256:6fc2fe4647927070df3d93f561d7e588a38865ea0040027662e3e541d592811e \ + --hash=sha256:71c6b009d431b3839d7c14c3af86788b3cfac41e969e3e1c22f8a6ea13139404 \ + --hash=sha256:7413070a3e927e4207d00bd65f42d1b780fb0d32d7b1d951f6dc6ade318e1b5a \ + --hash=sha256:76bc6356d07c1d9f4b782813094d0caf1703b729d876ab6a676f3aaa9a47e37c \ + --hash=sha256:7f6cbd8e6e446fb7e4ed5bac4661a29e43f38aeecbf60c4b900b825a353276a1 \ + --hash=sha256:8055ec598605b0077e29652ccfe9372247474375e0e3f5775c91d9434e12d6b1 \ + --hash=sha256:809d653c155e2cc4fd39ad69c08fdff7f4016c355ae4b88905219d3579e31eb7 \ + --hash=sha256:82425dd5c7bd3adfe4e94c78e27e2fa02971750c2b7ffba648b0f5d5cc016a73 \ + --hash=sha256:87f1097acb569dde17f246faa268759a71a2cb8c96dd392cd25c668b104cad2f \ + --hash=sha256:920fa5a0c5175ab14b9c78f6f820b75804fb4984423ee4c4f1e6d748f8b22bc1 \ + --hash=sha256:92255879280ef9c3c0bcb327c5a1b8ed694c290d61a6a532458264f887f052cb \ + --hash=sha256:946c3a1ef25338e78107fba746f299f926db408d34553b4754e90a7de1d44068 \ + --hash=sha256:95cae920959d772f30ab36d3b25f83bb0f3be671e986c72ce22f8fa700dae061 \ + --hash=sha256:9cf1596680ac1f01839dba32d496136bdd5d8ffb858c280fa82bbfeb173bdd40 \ + --hash=sha256:9fe41b6f72f52d3da4db524c8653e46243c8c92df826ab5ffaece2dba9cccd58 \ + --hash=sha256:b17f0f14a9c0ba55ff6279a922d1932e24b13fc218a3e968ecdbf791b3682b25 \ + --hash=sha256:b3d336ed75d17c7b1af233a6561cf421dee41d9204aa3cfcc6c9c65cd5bb69a8 \ + --hash=sha256:b66bcc5670e8a6b78f0313bcb74774c8291f6f8aeef10fe70e910b8040f3ab75 \ + --hash=sha256:b725da33e6e58e4a5d27958568484aa766e825e93aa20c26c91168be58e08cbb \ + --hash=sha256:b72758f3ffc36ca566ba98a8e7f4f373b6c17c646ff8ad9b21ad10c29186f00d \ + --hash=sha256:bcef128f970bb63ecf9a65f7beafd9b55e3aaf0efc271a4154050fc15cdb386e \ + --hash=sha256:c8e8fe01e435005d4421f183038fc70ca85d2c1e490f51fb972db92af6e047c2 \ + 
--hash=sha256:d61f7ce4727a9fa7680cd6f3986b0e2c732639f46a5e0156e550e35258aa313a \ + --hash=sha256:d6768a327ea1ba44c9114dba5fdda4a214bdb70129065cd0807eb5f010bfcbb5 \ + --hash=sha256:e18668f1bd39e69b7fed19fa7cd1cd110a121ec25439328b5c89934e6d30d357 \ + --hash=sha256:e88b97ef13910e5f87bcbc4dd7979a7de9ba8702b54d3204ac587e83639c0c2b \ + --hash=sha256:ea0b183a5fe6b2b45f3b854b0d19c4e932d6f5934ae1f723b07cf9560edd4ec7 \ + --hash=sha256:ede0bde16cc6e9b96633df1631fbcd66491d1063667f260a4f2386a098393790 \ + --hash=sha256:f541587f5c558abd93cb0de491ce99a9ef8d1ae29dd6ab4dbb5a13281ae04cbd \ + --hash=sha256:fbbeb3c9b2edb5fd044b2a070f127a0ac456ffd079cb82746fc84af01ef021a4 \ + --hash=sha256:fdfa97090e2d6f73dced247a2f2d8004ac6449df6568f30e7fa1a045767c69a6 \ + --hash=sha256:ff0f9913d82e1d1fadbd976424c316fbc4d9c525c81d047bbdd16bd27dd98cfc + # via + # -c release/ray_release/byod/requirements_compiled.txt + # -r release/ray_release/byod/requirements_byod_3.9.in packaging==23.0 \ --hash=sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2 \ --hash=sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97 @@ -2098,114 +2169,113 @@ pycparser==2.21 \ # via # -c release/ray_release/byod/requirements_compiled.txt # cffi -pydantic==2.10.0 \ - --hash=sha256:0aca0f045ff6e2f097f1fe89521115335f15049eeb8a7bef3dafe4b19a74e289 \ - --hash=sha256:5e7807ba9201bdf61b1b58aa6eb690916c40a47acfb114b1b4fef3e7fd5b30fc +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_byod_3.9.in # fastapi -pydantic-core==2.27.0 \ - --hash=sha256:0aa4d1b2eba9a325897308b3124014a142cdccb9f3e016f31d3ebee6b5ea5e75 \ - --hash=sha256:0d06b667e53320332be2bf6f9461f4a9b78092a079b8ce8634c9afaa7e10cd9f \ - 
--hash=sha256:153017e3d6cd3ce979de06d84343ca424bb6092727375eba1968c8b4693c6ecb \ - --hash=sha256:15e350efb67b855cd014c218716feea4986a149ed1f42a539edd271ee074a196 \ - --hash=sha256:185ef205256cd8b38431205698531026979db89a79587725c1e55c59101d64e9 \ - --hash=sha256:1da0c98a85a6c6ed702d5556db3b09c91f9b0b78de37b7593e2de8d03238807a \ - --hash=sha256:225bfff5d425c34e1fd562cef52d673579d59b967d9de06178850c4802af9039 \ - --hash=sha256:24f984fc7762ed5f806d9e8c4c77ea69fdb2afd987b4fd319ef06c87595a8c55 \ - --hash=sha256:25a7fd4de38f7ff99a37e18fa0098c3140286451bc823d1746ba80cec5b433a1 \ - --hash=sha256:2883b260f7a93235488699d39cbbd94fa7b175d3a8063fbfddd3e81ad9988cb2 \ - --hash=sha256:2a51ce96224eadd1845150b204389623c8e129fde5a67a84b972bd83a85c6c40 \ - --hash=sha256:2be0ad541bb9f059954ccf8877a49ed73877f862529575ff3d54bf4223e4dd61 \ - --hash=sha256:31a2cae5f059329f9cfe3d8d266d3da1543b60b60130d186d9b6a3c20a346361 \ - --hash=sha256:333c840a1303d1474f491e7be0b718226c730a39ead0f7dab2c7e6a2f3855555 \ - --hash=sha256:33d14369739c5d07e2e7102cdb0081a1fa46ed03215e07f097b34e020b83b1ae \ - --hash=sha256:35380671c3c921fe8adf31ad349dc6f7588b7e928dbe44e1093789734f607399 \ - --hash=sha256:359e7951f04ad35111b5ddce184db3391442345d0ab073aa63a95eb8af25a5ef \ - --hash=sha256:36aa167f69d8807ba7e341d67ea93e50fcaaf6bc433bb04939430fa3dab06f31 \ - --hash=sha256:395e3e1148fa7809016231f8065f30bb0dc285a97b4dc4360cd86e17bab58af7 \ - --hash=sha256:3e8d89c276234579cd3d095d5fa2a44eb10db9a218664a17b56363cddf226ff3 \ - --hash=sha256:3eb8849445c26b41c5a474061032c53e14fe92a11a5db969f722a2716cd12206 \ - --hash=sha256:3fd8bc2690e7c39eecdf9071b6a889ce7b22b72073863940edc2a0a23750ca90 \ - --hash=sha256:400bf470e4327e920883b51e255617dfe4496d4e80c3fea0b5a5d0bf2c404dd4 \ - --hash=sha256:4148dc9184ab79e356dc00a4199dc0ee8647973332cb385fc29a7cced49b9f9c \ - --hash=sha256:433689845288f9a1ee5714444e65957be26d30915f7745091ede4a83cfb2d7bb \ - --hash=sha256:43b61989068de9ce62296cde02beffabcadb65672207fc51e7af76dca75e6636 \ - 
--hash=sha256:4523c4009c3f39d948e01962223c9f5538602e7087a628479b723c939fab262d \ - --hash=sha256:483c2213a609e7db2c592bbc015da58b6c75af7360ca3c981f178110d9787bcf \ - --hash=sha256:49633583eb7dc5cba61aaf7cdb2e9e662323ad394e543ee77af265736bcd3eaa \ - --hash=sha256:4b51f964fcbb02949fc546022e56cdb16cda457af485e9a3e8b78ac2ecf5d77e \ - --hash=sha256:4bf1340ae507f6da6360b24179c2083857c8ca7644aab65807023cf35404ea8d \ - --hash=sha256:4fb49cfdb53af5041aba909be00cccfb2c0d0a2e09281bf542371c5fd36ad04c \ - --hash=sha256:510b11e9c3b1a852876d1ccd8d5903684336d635214148637ceb27366c75a467 \ - --hash=sha256:513cb14c0cc31a4dfd849a4674b20c46d87b364f997bbcb02282306f5e187abf \ - --hash=sha256:58560828ee0951bb125c6f2862fbc37f039996d19ceb6d8ff1905abf7da0bf3d \ - --hash=sha256:58ab0d979c969983cdb97374698d847a4acffb217d543e172838864636ef10d9 \ - --hash=sha256:5982048129f40b082c2654de10c0f37c67a14f5ff9d37cf35be028ae982f26df \ - --hash=sha256:5ab325fc86fbc077284c8d7f996d904d30e97904a87d6fb303dce6b3de7ebba9 \ - --hash=sha256:5cc822ab90a70ea3a91e6aed3afac570b276b1278c6909b1d384f745bd09c714 \ - --hash=sha256:5f2b19b8d6fca432cb3acf48cf5243a7bf512988029b6e6fd27e9e8c0a204d85 \ - --hash=sha256:5fc72fbfebbf42c0856a824b8b0dc2b5cd2e4a896050281a21cfa6fed8879cb1 \ - --hash=sha256:6354e18a9be37bfa124d6b288a87fb30c673745806c92956f1a25e3ae6e76b96 \ - --hash=sha256:678f66462058dd978702db17eb6a3633d634f7aa0deaea61e0a674152766d3fc \ - --hash=sha256:68950bc08f9735306322bfc16a18391fcaac99ded2509e1cc41d03ccb6013cfe \ - --hash=sha256:68ef5377eb582fa4343c9d0b57a5b094046d447b4c73dd9fbd9ffb216f829e7d \ - --hash=sha256:6b4c19525c3538fbc0bbda6229f9682fb8199ce9ac37395880e6952798e00373 \ - --hash=sha256:6bb69bf3b6500f195c3deb69c1205ba8fc3cb21d1915f1f158a10d6b1ef29b6a \ - --hash=sha256:6e19401742ed7b69e51d8e4df3c03ad5ec65a83b36244479fd70edde2828a5d9 \ - --hash=sha256:6f4a53af9e81d757756508b57cae1cf28293f0f31b9fa2bfcb416cc7fb230f9d \ - --hash=sha256:6fda87808429c520a002a85d6e7cdadbf58231d60e96260976c5b8f9a12a8e13 \ - 
--hash=sha256:78f841523729e43e3928a364ec46e2e3f80e6625a4f62aca5c345f3f626c6e8a \ - --hash=sha256:7a6ebfac28fd51890a61df36ef202adbd77d00ee5aca4a3dadb3d9ed49cfb929 \ - --hash=sha256:7b0202ebf2268954090209a84f9897345719e46a57c5f2c9b7b250ca0a9d3e63 \ - --hash=sha256:8117839a9bdbba86e7f9df57018fe3b96cec934c3940b591b0fd3fbfb485864a \ - --hash=sha256:82e1ad4ca170e8af4c928b67cff731b6296e6a0a0981b97b2eb7c275cc4e15bd \ - --hash=sha256:836a4bfe0cc6d36dc9a9cc1a7b391265bf6ce9d1eb1eac62ac5139f5d8d9a6fa \ - --hash=sha256:84af1cf7bfdcbc6fcf5a5f70cc9896205e0350306e4dd73d54b6a18894f79386 \ - --hash=sha256:84e35afd9e10b2698e6f2f32256678cb23ca6c1568d02628033a837638b3ed12 \ - --hash=sha256:884f1806609c2c66564082540cffc96868c5571c7c3cf3a783f63f2fb49bd3cd \ - --hash=sha256:8a150392102c402c538190730fda06f3bce654fc498865579a9f2c1d2b425833 \ - --hash=sha256:8e21d927469d04b39386255bf00d0feedead16f6253dcc85e9e10ddebc334084 \ - --hash=sha256:8e96ca781e0c01e32115912ebdf7b3fb0780ce748b80d7d28a0802fa9fbaf44e \ - --hash=sha256:8ee4c2a75af9fe21269a4a0898c5425afb01af1f5d276063f57e2ae1bc64e191 \ - --hash=sha256:91bc66f878557313c2a6bcf396e7befcffe5ab4354cfe4427318968af31143c3 \ - --hash=sha256:951e71da6c89d354572098bada5ba5b5dc3a9390c933af8a614e37755d3d1840 \ - --hash=sha256:99b2863c1365f43f74199c980a3d40f18a218fbe683dd64e470199db426c4d6a \ - --hash=sha256:9a8fbf506fde1529a1e3698198fe64bfbe2e0c09557bc6a7dcf872e7c01fec40 \ - --hash=sha256:9ce048deb1e033e7a865ca384770bccc11d44179cf09e5193a535c4c2f497bdc \ - --hash=sha256:9fe94d9d2a2b4edd7a4b22adcd45814b1b59b03feb00e56deb2e89747aec7bfe \ - --hash=sha256:a291d0b4243a259c8ea7e2b84eb9ccb76370e569298875a7c5e3e71baf49057a \ - --hash=sha256:a5c022bb0d453192426221605efc865373dde43b17822a264671c53b068ac20c \ - --hash=sha256:abb4785894936d7682635726613c44578c420a096729f1978cd061a7e72d5275 \ - --hash=sha256:b872c86d8d71827235c7077461c502feb2db3f87d9d6d5a9daa64287d75e4fa0 \ - --hash=sha256:bf37b72834e7239cf84d4a0b2c050e7f9e48bced97bad9bdf98d26b8eb72e846 \ - 
--hash=sha256:c0c431e4be5c1a0c6654e0c31c661cd89e0ca956ef65305c3c3fd96f4e72ca39 \ - --hash=sha256:c5726eec789ee38f2c53b10b1821457b82274f81f4f746bb1e666d8741fcfadb \ - --hash=sha256:c6fcb3fa3855d583aa57b94cf146f7781d5d5bc06cb95cb3afece33d31aac39b \ - --hash=sha256:c86679f443e7085ea55a7376462553996c688395d18ef3f0d3dbad7838f857a2 \ - --hash=sha256:c91e3c04f5191fd3fb68764bddeaf02025492d5d9f23343b283870f6ace69708 \ - --hash=sha256:c921ad596ff1a82f9c692b0758c944355abc9f0de97a4c13ca60ffc6d8dc15d4 \ - --hash=sha256:c9ed88b398ba7e3bad7bd64d66cc01dcde9cfcb7ec629a6fd78a82fa0b559d78 \ - --hash=sha256:cd2ac6b919f7fed71b17fe0b4603c092a4c9b5bae414817c9c81d3c22d1e1bcc \ - --hash=sha256:d28ca7066d6cdd347a50d8b725dc10d9a1d6a1cce09836cf071ea6a2d4908be0 \ - --hash=sha256:d29e235ce13c91902ef3efc3d883a677655b3908b1cbc73dee816e5e1f8f7739 \ - --hash=sha256:d8b5ee4ae9170e2775d495b81f414cc20268041c42571530513496ba61e94ba3 \ - --hash=sha256:db72e40628967f6dc572020d04b5f800d71264e0531c6da35097e73bdf38b003 \ - --hash=sha256:df45c4073bed486ea2f18757057953afed8dd77add7276ff01bccb79982cf46c \ - --hash=sha256:dfa5f5c0a4c8fced1422dc2ca7eefd872d5d13eb33cf324361dbf1dbfba0a9fe \ - --hash=sha256:e015833384ca3e1a0565a79f5d953b0629d9138021c27ad37c92a9fa1af7623c \ - --hash=sha256:e15315691fe2253eb447503153acef4d7223dfe7e7702f9ed66539fcd0c43801 \ - --hash=sha256:e65466b31be1070b4a5b7dbfbd14b247884cb8e8b79c64fb0f36b472912dbaea \ - --hash=sha256:e7820bb0d65e3ce1e3e70b6708c2f66143f55912fa02f4b618d0f08b61575f12 \ - --hash=sha256:e851a051f7260e6d688267eb039c81f05f23a19431bd7dfa4bf5e3cb34c108cd \ - --hash=sha256:e9f9feee7f334b72ceae46313333d002b56f325b5f04271b4ae2aadd9e993ae4 \ - --hash=sha256:eb40f828bc2f73f777d1eb8fee2e86cd9692a4518b63b6b5aa8af915dfd3207b \ - --hash=sha256:eb704155e73b833801c247f39d562229c0303f54770ca14fb1c053acb376cf10 \ - --hash=sha256:edb1bfd45227dec8d50bc7c7d86463cd8728bcc574f9b07de7369880de4626a3 \ - --hash=sha256:ee7d9d5537daf6d5c74a83b38a638cc001b648096c1cae8ef695b0c919d9d379 \ - 
--hash=sha256:f57783fbaf648205ac50ae7d646f27582fc706be3977e87c3c124e7a92407b10 \ - --hash=sha256:ff63a92f6e249514ef35bc795de10745be0226eaea06eb48b4bbeaa0c8850a4a +pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + 
--hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + --hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + --hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + --hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + 
--hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + --hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + --hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + --hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + 
--hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + --hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + --hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + --hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + 
--hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + --hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d # via # -c release/ray_release/byod/requirements_compiled.txt # pydantic @@ -2562,15 +2632,15 @@ rsa==4.7.2 \ # gcs-oauth2-boto-plugin # google-auth # oauth2client -s3fs==2023.5.0 \ - --hash=sha256:0d82c4fa43d1214117f56b239c3e03c9a2886f41c31000c1c967ac6030d20362 \ - --hash=sha256:106b5d9a1000e6af413f918156ba4b96789ac832b7e08c99d186eb08164e6981 +s3fs==2023.12.1 \ + --hash=sha256:63e429bb6b5e814568cacd3f2a8551fc35493e8c418ddfcb44e6f86aa8696ccd \ + --hash=sha256:ed0b7df8cc20a2b5cefe607b1cf4e860d37c5ca4ac2d68f55464805d75d18710 # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_byod_3.9.in -s3transfer==0.6.2 \ - --hash=sha256:b014be3a8a2aab98cfe1abc7229cc5a9a0cf05eb9c1f2b86b230fd8df3f78084 \ - --hash=sha256:cab66d3380cca3e70939ef2255d01cd8aece6a4907a9528740f668c4b0611861 +s3transfer==0.8.0 \ + --hash=sha256:baa479dc2e63e5c2ed51611b4d46cdf0295e2070d8d0b86b22f335ee5b954986 \ + --hash=sha256:e8d6bd52ffd99841e3a57b34370a54841f12d3aab072af862cdcc50955288002 # via # -c release/ray_release/byod/requirements_compiled.txt # boto3 @@ -2670,6 +2740,7 @@ sniffio==1.3.1 \ # via # -c release/ray_release/byod/requirements_compiled.txt # anyio + # httpx starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ --hash=sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5 @@ -2839,6 +2910,7 @@ typing-extensions==4.12.2 \ # -r release/ray_release/byod/requirements_byod_3.9.in # aioitertools 
# ale-py + # exceptiongroup # fastapi # gymnasium # pydantic @@ -2848,6 +2920,13 @@ typing-extensions==4.12.2 \ # starlette # tensorflow # typer + # typing-inspection +typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c release/ray_release/byod/requirements_compiled.txt + # pydantic urllib3==1.26.19 \ --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 diff --git a/release/ray_release/byod/requirements_llm_byod_3.11.txt b/release/ray_release/byod/requirements_llm_byod_3.11.txt index 2bf87ef15bed..e2558df8f389 100644 --- a/release/ray_release/byod/requirements_llm_byod_3.11.txt +++ b/release/ray_release/byod/requirements_llm_byod_3.11.txt @@ -3,3 +3,5 @@ pytest-timeout==2.1.0 locust==2.33.0 orjson==3.10.15 backoff==2.2.1 +langchain_text_splitters==0.3.9 +sentence-transformers==5.1.0 diff --git a/release/ray_release/byod/requirements_ml_byod_3.9.in b/release/ray_release/byod/requirements_ml_byod_3.9.in index 6e93e852e7ed..6c373e1b1a2e 100644 --- a/release/ray_release/byod/requirements_ml_byod_3.9.in +++ b/release/ray_release/byod/requirements_ml_byod_3.9.in @@ -14,7 +14,7 @@ evaluate fairscale fastapi filelock -gcsfs==2023.5.0 +gcsfs==2023.12.1 gsutil ipywidgets jupytext @@ -27,6 +27,7 @@ modin numpy openai-whisper openskill +orjson petastorm protobuf pyarrow diff --git a/release/ray_release/byod/requirements_ml_byod_3.9.txt b/release/ray_release/byod/requirements_ml_byod_3.9.txt index 84b41c4c5447..abe636e92761 100644 --- a/release/ray_release/byod/requirements_ml_byod_3.9.txt +++ b/release/ray_release/byod/requirements_ml_byod_3.9.txt @@ -203,15 +203,15 @@ boto==2.49.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # gcs-oauth2-boto-plugin -boto3==1.26.76 \ - 
--hash=sha256:30c7d967ed1c6b5a05643e42cae9d4d36c3f1cb6782637ddc7007a104cfd9027 \ - --hash=sha256:b4c2969b7677762914394b8273cc1905dfe5b71f250741c1a575487ae357e729 +boto3==1.29.7 \ + --hash=sha256:1eb4c548118b5fc5e018dee956fd33e6fb249cd1f2def85f1bba816aef4d9f3e \ + --hash=sha256:96e9890ebe7cd823b5f4976dd676e112c000c6528c28e20a2f274590589dd18b # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in -botocore==1.29.76 \ - --hash=sha256:70735b00cd529f152992231ca6757e458e5ec25db43767b3526e9a35b2f143b7 \ - --hash=sha256:c2f67b6b3f8acf2968eafca06526f07b9fb0d27bac4c68a635d51abb675134a7 +botocore==1.32.7 \ + --hash=sha256:58b33d02cafa23461c8a9d211b30e8cded992380a84de409379fd02811fa3e11 \ + --hash=sha256:c6795c731b04c8e3635588c44cfd1a4462fc5987859195522c96812cf3eceff9 # via # -c release/ray_release/byod/requirements_compiled.txt # boto3 @@ -968,9 +968,9 @@ fs==2.4.16 \ # via # -c release/ray_release/byod/requirements_compiled.txt # triad -fsspec[http]==2023.5.0 \ - --hash=sha256:51a4ad01a5bb66fcc58036e288c0d53d3975a0df2a5dc59a93b59bade0391f2a \ - --hash=sha256:b3b56e00fb93ea321bc9e5d9cf6f8522a0198b20eb24e02774d329e9c6fb84ce +fsspec[http]==2023.12.1 \ + --hash=sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0 \ + --hash=sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990 # via # -c release/ray_release/byod/requirements_compiled.txt # datasets @@ -1004,9 +1004,9 @@ gcs-oauth2-boto-plugin==3.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # gsutil -gcsfs==2023.5.0 \ - --hash=sha256:02a815e1cf28197ab4f57335e89dc5df8744a065c7c956d42692b50a9e8f1625 \ - --hash=sha256:4f2ebc41814de3f566f85dec208704cf19823b9d04a55fd12b3142aef9046525 +gcsfs==2023.12.1 \ + --hash=sha256:c1ccfa9f84dca019cd334aaf7eb03cc1dc13c296717346927a9fd40255348f9c \ + --hash=sha256:e86cc583fdf879e5ea2f87bab61738d26ec7e8972762a1e6c6ab758b1e1af99c # via -r 
release/ray_release/byod/requirements_ml_byod_3.9.in gevent==24.2.1 \ --hash=sha256:03aa5879acd6b7076f6a2a307410fb1e0d288b84b03cdfd8c74db8b4bc882fc5 \ @@ -2310,6 +2310,60 @@ openskill==6.0.0 \ --hash=sha256:eee2d0b3c1648663a480cf4680654dfd12bdc749a96d611b1904e191f2632f62 \ --hash=sha256:f89b18930c2befd580407e7cf80a480bc69c3b25d2841346be6d875c8c4bc92e # via -r release/ray_release/byod/requirements_ml_byod_3.9.in +orjson==3.9.15 \ + --hash=sha256:001f4eb0ecd8e9ebd295722d0cbedf0748680fb9998d3993abaed2f40587257a \ + --hash=sha256:05a1f57fb601c426635fcae9ddbe90dfc1ed42245eb4c75e4960440cac667262 \ + --hash=sha256:10c57bc7b946cf2efa67ac55766e41764b66d40cbd9489041e637c1304400494 \ + --hash=sha256:12365576039b1a5a47df01aadb353b68223da413e2e7f98c02403061aad34bde \ + --hash=sha256:2973474811db7b35c30248d1129c64fd2bdf40d57d84beed2a9a379a6f57d0ab \ + --hash=sha256:2b5c0f532905e60cf22a511120e3719b85d9c25d0e1c2a8abb20c4dede3b05a5 \ + --hash=sha256:2c51378d4a8255b2e7c1e5cc430644f0939539deddfa77f6fac7b56a9784160a \ + --hash=sha256:2d99e3c4c13a7b0fb3792cc04c2829c9db07838fb6973e578b85c1745e7d0ce7 \ + --hash=sha256:2f256d03957075fcb5923410058982aea85455d035607486ccb847f095442bda \ + --hash=sha256:34cbcd216e7af5270f2ffa63a963346845eb71e174ea530867b7443892d77180 \ + --hash=sha256:4228aace81781cc9d05a3ec3a6d2673a1ad0d8725b4e915f1089803e9efd2b99 \ + --hash=sha256:4feeb41882e8aa17634b589533baafdceb387e01e117b1ec65534ec724023d04 \ + --hash=sha256:57d5d8cf9c27f7ef6bc56a5925c7fbc76b61288ab674eb352c26ac780caa5b10 \ + --hash=sha256:5bb399e1b49db120653a31463b4a7b27cf2fbfe60469546baf681d1b39f4edf2 \ + --hash=sha256:62482873e0289cf7313461009bf62ac8b2e54bc6f00c6fabcde785709231a5d7 \ + --hash=sha256:67384f588f7f8daf040114337d34a5188346e3fae6c38b6a19a2fe8c663a2f9b \ + --hash=sha256:6ae4e06be04dc00618247c4ae3f7c3e561d5bc19ab6941427f6d3722a0875ef7 \ + --hash=sha256:6f7b65bfaf69493c73423ce9db66cfe9138b2f9ef62897486417a8fcb0a92bfe \ + 
--hash=sha256:6fc2fe4647927070df3d93f561d7e588a38865ea0040027662e3e541d592811e \ + --hash=sha256:71c6b009d431b3839d7c14c3af86788b3cfac41e969e3e1c22f8a6ea13139404 \ + --hash=sha256:7413070a3e927e4207d00bd65f42d1b780fb0d32d7b1d951f6dc6ade318e1b5a \ + --hash=sha256:76bc6356d07c1d9f4b782813094d0caf1703b729d876ab6a676f3aaa9a47e37c \ + --hash=sha256:7f6cbd8e6e446fb7e4ed5bac4661a29e43f38aeecbf60c4b900b825a353276a1 \ + --hash=sha256:8055ec598605b0077e29652ccfe9372247474375e0e3f5775c91d9434e12d6b1 \ + --hash=sha256:809d653c155e2cc4fd39ad69c08fdff7f4016c355ae4b88905219d3579e31eb7 \ + --hash=sha256:82425dd5c7bd3adfe4e94c78e27e2fa02971750c2b7ffba648b0f5d5cc016a73 \ + --hash=sha256:87f1097acb569dde17f246faa268759a71a2cb8c96dd392cd25c668b104cad2f \ + --hash=sha256:920fa5a0c5175ab14b9c78f6f820b75804fb4984423ee4c4f1e6d748f8b22bc1 \ + --hash=sha256:92255879280ef9c3c0bcb327c5a1b8ed694c290d61a6a532458264f887f052cb \ + --hash=sha256:946c3a1ef25338e78107fba746f299f926db408d34553b4754e90a7de1d44068 \ + --hash=sha256:95cae920959d772f30ab36d3b25f83bb0f3be671e986c72ce22f8fa700dae061 \ + --hash=sha256:9cf1596680ac1f01839dba32d496136bdd5d8ffb858c280fa82bbfeb173bdd40 \ + --hash=sha256:9fe41b6f72f52d3da4db524c8653e46243c8c92df826ab5ffaece2dba9cccd58 \ + --hash=sha256:b17f0f14a9c0ba55ff6279a922d1932e24b13fc218a3e968ecdbf791b3682b25 \ + --hash=sha256:b3d336ed75d17c7b1af233a6561cf421dee41d9204aa3cfcc6c9c65cd5bb69a8 \ + --hash=sha256:b66bcc5670e8a6b78f0313bcb74774c8291f6f8aeef10fe70e910b8040f3ab75 \ + --hash=sha256:b725da33e6e58e4a5d27958568484aa766e825e93aa20c26c91168be58e08cbb \ + --hash=sha256:b72758f3ffc36ca566ba98a8e7f4f373b6c17c646ff8ad9b21ad10c29186f00d \ + --hash=sha256:bcef128f970bb63ecf9a65f7beafd9b55e3aaf0efc271a4154050fc15cdb386e \ + --hash=sha256:c8e8fe01e435005d4421f183038fc70ca85d2c1e490f51fb972db92af6e047c2 \ + --hash=sha256:d61f7ce4727a9fa7680cd6f3986b0e2c732639f46a5e0156e550e35258aa313a \ + --hash=sha256:d6768a327ea1ba44c9114dba5fdda4a214bdb70129065cd0807eb5f010bfcbb5 \ + 
--hash=sha256:e18668f1bd39e69b7fed19fa7cd1cd110a121ec25439328b5c89934e6d30d357 \ + --hash=sha256:e88b97ef13910e5f87bcbc4dd7979a7de9ba8702b54d3204ac587e83639c0c2b \ + --hash=sha256:ea0b183a5fe6b2b45f3b854b0d19c4e932d6f5934ae1f723b07cf9560edd4ec7 \ + --hash=sha256:ede0bde16cc6e9b96633df1631fbcd66491d1063667f260a4f2386a098393790 \ + --hash=sha256:f541587f5c558abd93cb0de491ce99a9ef8d1ae29dd6ab4dbb5a13281ae04cbd \ + --hash=sha256:fbbeb3c9b2edb5fd044b2a070f127a0ac456ffd079cb82746fc84af01ef021a4 \ + --hash=sha256:fdfa97090e2d6f73dced247a2f2d8004ac6449df6568f30e7fa1a045767c69a6 \ + --hash=sha256:ff0f9913d82e1d1fadbd976424c316fbc4d9c525c81d047bbdd16bd27dd98cfc + # via + # -c release/ray_release/byod/requirements_compiled.txt + # -r release/ray_release/byod/requirements_ml_byod_3.9.in packaging==23.0 \ --hash=sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2 \ --hash=sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97 @@ -2761,116 +2815,115 @@ pycparser==2.21 \ # via # -c release/ray_release/byod/requirements_compiled.txt # cffi -pydantic==2.10.0 \ - --hash=sha256:0aca0f045ff6e2f097f1fe89521115335f15049eeb8a7bef3dafe4b19a74e289 \ - --hash=sha256:5e7807ba9201bdf61b1b58aa6eb690916c40a47acfb114b1b4fef3e7fd5b30fc +pydantic==2.11.7 \ + --hash=sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db \ + --hash=sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in # albumentations # deepspeed # fastapi -pydantic-core==2.27.0 \ - --hash=sha256:0aa4d1b2eba9a325897308b3124014a142cdccb9f3e016f31d3ebee6b5ea5e75 \ - --hash=sha256:0d06b667e53320332be2bf6f9461f4a9b78092a079b8ce8634c9afaa7e10cd9f \ - --hash=sha256:153017e3d6cd3ce979de06d84343ca424bb6092727375eba1968c8b4693c6ecb \ - --hash=sha256:15e350efb67b855cd014c218716feea4986a149ed1f42a539edd271ee074a196 \ - 
--hash=sha256:185ef205256cd8b38431205698531026979db89a79587725c1e55c59101d64e9 \ - --hash=sha256:1da0c98a85a6c6ed702d5556db3b09c91f9b0b78de37b7593e2de8d03238807a \ - --hash=sha256:225bfff5d425c34e1fd562cef52d673579d59b967d9de06178850c4802af9039 \ - --hash=sha256:24f984fc7762ed5f806d9e8c4c77ea69fdb2afd987b4fd319ef06c87595a8c55 \ - --hash=sha256:25a7fd4de38f7ff99a37e18fa0098c3140286451bc823d1746ba80cec5b433a1 \ - --hash=sha256:2883b260f7a93235488699d39cbbd94fa7b175d3a8063fbfddd3e81ad9988cb2 \ - --hash=sha256:2a51ce96224eadd1845150b204389623c8e129fde5a67a84b972bd83a85c6c40 \ - --hash=sha256:2be0ad541bb9f059954ccf8877a49ed73877f862529575ff3d54bf4223e4dd61 \ - --hash=sha256:31a2cae5f059329f9cfe3d8d266d3da1543b60b60130d186d9b6a3c20a346361 \ - --hash=sha256:333c840a1303d1474f491e7be0b718226c730a39ead0f7dab2c7e6a2f3855555 \ - --hash=sha256:33d14369739c5d07e2e7102cdb0081a1fa46ed03215e07f097b34e020b83b1ae \ - --hash=sha256:35380671c3c921fe8adf31ad349dc6f7588b7e928dbe44e1093789734f607399 \ - --hash=sha256:359e7951f04ad35111b5ddce184db3391442345d0ab073aa63a95eb8af25a5ef \ - --hash=sha256:36aa167f69d8807ba7e341d67ea93e50fcaaf6bc433bb04939430fa3dab06f31 \ - --hash=sha256:395e3e1148fa7809016231f8065f30bb0dc285a97b4dc4360cd86e17bab58af7 \ - --hash=sha256:3e8d89c276234579cd3d095d5fa2a44eb10db9a218664a17b56363cddf226ff3 \ - --hash=sha256:3eb8849445c26b41c5a474061032c53e14fe92a11a5db969f722a2716cd12206 \ - --hash=sha256:3fd8bc2690e7c39eecdf9071b6a889ce7b22b72073863940edc2a0a23750ca90 \ - --hash=sha256:400bf470e4327e920883b51e255617dfe4496d4e80c3fea0b5a5d0bf2c404dd4 \ - --hash=sha256:4148dc9184ab79e356dc00a4199dc0ee8647973332cb385fc29a7cced49b9f9c \ - --hash=sha256:433689845288f9a1ee5714444e65957be26d30915f7745091ede4a83cfb2d7bb \ - --hash=sha256:43b61989068de9ce62296cde02beffabcadb65672207fc51e7af76dca75e6636 \ - --hash=sha256:4523c4009c3f39d948e01962223c9f5538602e7087a628479b723c939fab262d \ - --hash=sha256:483c2213a609e7db2c592bbc015da58b6c75af7360ca3c981f178110d9787bcf \ - 
--hash=sha256:49633583eb7dc5cba61aaf7cdb2e9e662323ad394e543ee77af265736bcd3eaa \ - --hash=sha256:4b51f964fcbb02949fc546022e56cdb16cda457af485e9a3e8b78ac2ecf5d77e \ - --hash=sha256:4bf1340ae507f6da6360b24179c2083857c8ca7644aab65807023cf35404ea8d \ - --hash=sha256:4fb49cfdb53af5041aba909be00cccfb2c0d0a2e09281bf542371c5fd36ad04c \ - --hash=sha256:510b11e9c3b1a852876d1ccd8d5903684336d635214148637ceb27366c75a467 \ - --hash=sha256:513cb14c0cc31a4dfd849a4674b20c46d87b364f997bbcb02282306f5e187abf \ - --hash=sha256:58560828ee0951bb125c6f2862fbc37f039996d19ceb6d8ff1905abf7da0bf3d \ - --hash=sha256:58ab0d979c969983cdb97374698d847a4acffb217d543e172838864636ef10d9 \ - --hash=sha256:5982048129f40b082c2654de10c0f37c67a14f5ff9d37cf35be028ae982f26df \ - --hash=sha256:5ab325fc86fbc077284c8d7f996d904d30e97904a87d6fb303dce6b3de7ebba9 \ - --hash=sha256:5cc822ab90a70ea3a91e6aed3afac570b276b1278c6909b1d384f745bd09c714 \ - --hash=sha256:5f2b19b8d6fca432cb3acf48cf5243a7bf512988029b6e6fd27e9e8c0a204d85 \ - --hash=sha256:5fc72fbfebbf42c0856a824b8b0dc2b5cd2e4a896050281a21cfa6fed8879cb1 \ - --hash=sha256:6354e18a9be37bfa124d6b288a87fb30c673745806c92956f1a25e3ae6e76b96 \ - --hash=sha256:678f66462058dd978702db17eb6a3633d634f7aa0deaea61e0a674152766d3fc \ - --hash=sha256:68950bc08f9735306322bfc16a18391fcaac99ded2509e1cc41d03ccb6013cfe \ - --hash=sha256:68ef5377eb582fa4343c9d0b57a5b094046d447b4c73dd9fbd9ffb216f829e7d \ - --hash=sha256:6b4c19525c3538fbc0bbda6229f9682fb8199ce9ac37395880e6952798e00373 \ - --hash=sha256:6bb69bf3b6500f195c3deb69c1205ba8fc3cb21d1915f1f158a10d6b1ef29b6a \ - --hash=sha256:6e19401742ed7b69e51d8e4df3c03ad5ec65a83b36244479fd70edde2828a5d9 \ - --hash=sha256:6f4a53af9e81d757756508b57cae1cf28293f0f31b9fa2bfcb416cc7fb230f9d \ - --hash=sha256:6fda87808429c520a002a85d6e7cdadbf58231d60e96260976c5b8f9a12a8e13 \ - --hash=sha256:78f841523729e43e3928a364ec46e2e3f80e6625a4f62aca5c345f3f626c6e8a \ - --hash=sha256:7a6ebfac28fd51890a61df36ef202adbd77d00ee5aca4a3dadb3d9ed49cfb929 \ - 
--hash=sha256:7b0202ebf2268954090209a84f9897345719e46a57c5f2c9b7b250ca0a9d3e63 \ - --hash=sha256:8117839a9bdbba86e7f9df57018fe3b96cec934c3940b591b0fd3fbfb485864a \ - --hash=sha256:82e1ad4ca170e8af4c928b67cff731b6296e6a0a0981b97b2eb7c275cc4e15bd \ - --hash=sha256:836a4bfe0cc6d36dc9a9cc1a7b391265bf6ce9d1eb1eac62ac5139f5d8d9a6fa \ - --hash=sha256:84af1cf7bfdcbc6fcf5a5f70cc9896205e0350306e4dd73d54b6a18894f79386 \ - --hash=sha256:84e35afd9e10b2698e6f2f32256678cb23ca6c1568d02628033a837638b3ed12 \ - --hash=sha256:884f1806609c2c66564082540cffc96868c5571c7c3cf3a783f63f2fb49bd3cd \ - --hash=sha256:8a150392102c402c538190730fda06f3bce654fc498865579a9f2c1d2b425833 \ - --hash=sha256:8e21d927469d04b39386255bf00d0feedead16f6253dcc85e9e10ddebc334084 \ - --hash=sha256:8e96ca781e0c01e32115912ebdf7b3fb0780ce748b80d7d28a0802fa9fbaf44e \ - --hash=sha256:8ee4c2a75af9fe21269a4a0898c5425afb01af1f5d276063f57e2ae1bc64e191 \ - --hash=sha256:91bc66f878557313c2a6bcf396e7befcffe5ab4354cfe4427318968af31143c3 \ - --hash=sha256:951e71da6c89d354572098bada5ba5b5dc3a9390c933af8a614e37755d3d1840 \ - --hash=sha256:99b2863c1365f43f74199c980a3d40f18a218fbe683dd64e470199db426c4d6a \ - --hash=sha256:9a8fbf506fde1529a1e3698198fe64bfbe2e0c09557bc6a7dcf872e7c01fec40 \ - --hash=sha256:9ce048deb1e033e7a865ca384770bccc11d44179cf09e5193a535c4c2f497bdc \ - --hash=sha256:9fe94d9d2a2b4edd7a4b22adcd45814b1b59b03feb00e56deb2e89747aec7bfe \ - --hash=sha256:a291d0b4243a259c8ea7e2b84eb9ccb76370e569298875a7c5e3e71baf49057a \ - --hash=sha256:a5c022bb0d453192426221605efc865373dde43b17822a264671c53b068ac20c \ - --hash=sha256:abb4785894936d7682635726613c44578c420a096729f1978cd061a7e72d5275 \ - --hash=sha256:b872c86d8d71827235c7077461c502feb2db3f87d9d6d5a9daa64287d75e4fa0 \ - --hash=sha256:bf37b72834e7239cf84d4a0b2c050e7f9e48bced97bad9bdf98d26b8eb72e846 \ - --hash=sha256:c0c431e4be5c1a0c6654e0c31c661cd89e0ca956ef65305c3c3fd96f4e72ca39 \ - --hash=sha256:c5726eec789ee38f2c53b10b1821457b82274f81f4f746bb1e666d8741fcfadb \ - 
--hash=sha256:c6fcb3fa3855d583aa57b94cf146f7781d5d5bc06cb95cb3afece33d31aac39b \ - --hash=sha256:c86679f443e7085ea55a7376462553996c688395d18ef3f0d3dbad7838f857a2 \ - --hash=sha256:c91e3c04f5191fd3fb68764bddeaf02025492d5d9f23343b283870f6ace69708 \ - --hash=sha256:c921ad596ff1a82f9c692b0758c944355abc9f0de97a4c13ca60ffc6d8dc15d4 \ - --hash=sha256:c9ed88b398ba7e3bad7bd64d66cc01dcde9cfcb7ec629a6fd78a82fa0b559d78 \ - --hash=sha256:cd2ac6b919f7fed71b17fe0b4603c092a4c9b5bae414817c9c81d3c22d1e1bcc \ - --hash=sha256:d28ca7066d6cdd347a50d8b725dc10d9a1d6a1cce09836cf071ea6a2d4908be0 \ - --hash=sha256:d29e235ce13c91902ef3efc3d883a677655b3908b1cbc73dee816e5e1f8f7739 \ - --hash=sha256:d8b5ee4ae9170e2775d495b81f414cc20268041c42571530513496ba61e94ba3 \ - --hash=sha256:db72e40628967f6dc572020d04b5f800d71264e0531c6da35097e73bdf38b003 \ - --hash=sha256:df45c4073bed486ea2f18757057953afed8dd77add7276ff01bccb79982cf46c \ - --hash=sha256:dfa5f5c0a4c8fced1422dc2ca7eefd872d5d13eb33cf324361dbf1dbfba0a9fe \ - --hash=sha256:e015833384ca3e1a0565a79f5d953b0629d9138021c27ad37c92a9fa1af7623c \ - --hash=sha256:e15315691fe2253eb447503153acef4d7223dfe7e7702f9ed66539fcd0c43801 \ - --hash=sha256:e65466b31be1070b4a5b7dbfbd14b247884cb8e8b79c64fb0f36b472912dbaea \ - --hash=sha256:e7820bb0d65e3ce1e3e70b6708c2f66143f55912fa02f4b618d0f08b61575f12 \ - --hash=sha256:e851a051f7260e6d688267eb039c81f05f23a19431bd7dfa4bf5e3cb34c108cd \ - --hash=sha256:e9f9feee7f334b72ceae46313333d002b56f325b5f04271b4ae2aadd9e993ae4 \ - --hash=sha256:eb40f828bc2f73f777d1eb8fee2e86cd9692a4518b63b6b5aa8af915dfd3207b \ - --hash=sha256:eb704155e73b833801c247f39d562229c0303f54770ca14fb1c053acb376cf10 \ - --hash=sha256:edb1bfd45227dec8d50bc7c7d86463cd8728bcc574f9b07de7369880de4626a3 \ - --hash=sha256:ee7d9d5537daf6d5c74a83b38a638cc001b648096c1cae8ef695b0c919d9d379 \ - --hash=sha256:f57783fbaf648205ac50ae7d646f27582fc706be3977e87c3c124e7a92407b10 \ - --hash=sha256:ff63a92f6e249514ef35bc795de10745be0226eaea06eb48b4bbeaa0c8850a4a 
+pydantic-core==2.33.2 \ + --hash=sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d \ + --hash=sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac \ + --hash=sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02 \ + --hash=sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56 \ + --hash=sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4 \ + --hash=sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22 \ + --hash=sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef \ + --hash=sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec \ + --hash=sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d \ + --hash=sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b \ + --hash=sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a \ + --hash=sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f \ + --hash=sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052 \ + --hash=sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab \ + --hash=sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916 \ + --hash=sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c \ + --hash=sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf \ + --hash=sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27 \ + --hash=sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a \ + --hash=sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8 \ + --hash=sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7 \ + --hash=sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612 \ + --hash=sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1 \ + 
--hash=sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039 \ + --hash=sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca \ + --hash=sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7 \ + --hash=sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a \ + --hash=sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6 \ + --hash=sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782 \ + --hash=sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b \ + --hash=sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7 \ + --hash=sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025 \ + --hash=sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849 \ + --hash=sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7 \ + --hash=sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b \ + --hash=sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa \ + --hash=sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e \ + --hash=sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea \ + --hash=sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac \ + --hash=sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51 \ + --hash=sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e \ + --hash=sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162 \ + --hash=sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65 \ + --hash=sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2 \ + --hash=sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954 \ + --hash=sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b \ + --hash=sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de \ + 
--hash=sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc \ + --hash=sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64 \ + --hash=sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb \ + --hash=sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9 \ + --hash=sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101 \ + --hash=sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d \ + --hash=sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef \ + --hash=sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3 \ + --hash=sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1 \ + --hash=sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5 \ + --hash=sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88 \ + --hash=sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d \ + --hash=sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290 \ + --hash=sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e \ + --hash=sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d \ + --hash=sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808 \ + --hash=sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc \ + --hash=sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d \ + --hash=sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc \ + --hash=sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e \ + --hash=sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640 \ + --hash=sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30 \ + --hash=sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e \ + --hash=sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9 \ + 
--hash=sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a \ + --hash=sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9 \ + --hash=sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f \ + --hash=sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb \ + --hash=sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5 \ + --hash=sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab \ + --hash=sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d \ + --hash=sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572 \ + --hash=sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593 \ + --hash=sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29 \ + --hash=sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535 \ + --hash=sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1 \ + --hash=sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f \ + --hash=sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8 \ + --hash=sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf \ + --hash=sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246 \ + --hash=sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9 \ + --hash=sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011 \ + --hash=sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9 \ + --hash=sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a \ + --hash=sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3 \ + --hash=sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6 \ + --hash=sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8 \ + --hash=sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a \ + 
--hash=sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2 \ + --hash=sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c \ + --hash=sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6 \ + --hash=sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d # via # -c release/ray_release/byod/requirements_compiled.txt # pydantic @@ -3360,9 +3413,9 @@ rsa==4.7.2 \ # gcs-oauth2-boto-plugin # google-auth # oauth2client -s3transfer==0.6.2 \ - --hash=sha256:b014be3a8a2aab98cfe1abc7229cc5a9a0cf05eb9c1f2b86b230fd8df3f78084 \ - --hash=sha256:cab66d3380cca3e70939ef2255d01cd8aece6a4907a9528740f668c4b0611861 +s3transfer==0.8.0 \ + --hash=sha256:baa479dc2e63e5c2ed51611b4d46cdf0295e2070d8d0b86b22f335ee5b954986 \ + --hash=sha256:e8d6bd52ffd99841e3a57b34370a54841f12d3aab072af862cdcc50955288002 # via # -c release/ray_release/byod/requirements_compiled.txt # boto3 @@ -4425,7 +4478,14 @@ typing-extensions==4.12.2 \ # starlette # torch # typer + # typing-inspection # wandb +typing-inspection==0.4.1 \ + --hash=sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 \ + --hash=sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28 + # via + # -c release/ray_release/byod/requirements_compiled.txt + # pydantic urllib3==1.26.19 \ --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 diff --git a/release/ray_release/configs/global_config.py b/release/ray_release/configs/global_config.py index 9de06e104318..88eb6656c73a 100644 --- a/release/ray_release/configs/global_config.py +++ b/release/ray_release/configs/global_config.py @@ -11,6 +11,7 @@ class GlobalConfig(TypedDict): byod_ray_ml_cr_repo: str byod_ray_llm_cr_repo: str byod_ecr: str + byod_ecr_region: str byod_aws_cr: str byod_gcp_cr: str state_machine_pr_aws_bucket: str @@ -20,6 +21,9 @@ class GlobalConfig(TypedDict): 
ci_pipeline_premerge: List[str] ci_pipeline_postmerge: List[str] ci_pipeline_buildkite_secret: str + release_image_step_ray: str + release_image_step_ray_ml: str + release_image_step_ray_llm: str config = None @@ -67,6 +71,10 @@ def _init_global_config(config_file: str): config_content.get("byod", {}).get("byod_ecr") or config_content.get("release_byod", {}).get("byod_ecr") ), + byod_ecr_region=( + config_content.get("byod", {}).get("byod_ecr_region") + or config_content.get("release_byod", {}).get("byod_ecr_region") + ), byod_aws_cr=( config_content.get("byod", {}).get("aws_cr") or config_content.get("release_byod", {}).get("aws_cr") @@ -101,6 +109,13 @@ def _init_global_config(config_file: str): "buildkite_secret" ), kuberay_disabled=config_content.get("kuberay", {}).get("disabled", 0) == 1, + release_image_step_ray=config_content.get("release_image_step", {}).get("ray"), + release_image_step_ray_ml=config_content.get("release_image_step", {}).get( + "ray_ml" + ), + release_image_step_ray_llm=config_content.get("release_image_step", {}).get( + "ray_llm" + ), ) # setup GCP workload identity federation os.environ[ diff --git a/release/ray_release/custom_byod_build_init_helper.py b/release/ray_release/custom_byod_build_init_helper.py new file mode 100644 index 000000000000..cbdd90b55926 --- /dev/null +++ b/release/ray_release/custom_byod_build_init_helper.py @@ -0,0 +1,67 @@ +from typing import List, Tuple +import yaml +from ray_release.configs.global_config import get_global_config +from ray_release.logger import logger +from ray_release.test import Test + + +def _generate_custom_build_step_key(image: str) -> str: + # Buildkite step key cannot contain special characters, so they need to be replaced. + # Buildkite also limits step key length to 80 characters. 
+ return ( + "custom_build_" + + image.replace("/", "_") + .replace(":", "_") + .replace(".", "_") + .replace("-", "_")[-40:] + ) + + +def get_images_from_tests(tests: List[Test]) -> List[Tuple[str, str, str]]: + """Get a list of custom BYOD images to build from a list of tests.""" + custom_byod_images = set() + for test in tests: + if not test.require_custom_byod_image(): + continue + custom_byod_image_build = ( + test.get_anyscale_byod_image(), + test.get_anyscale_base_byod_image(), + test.get_byod_post_build_script(), + ) + logger.info(f"To be built: {custom_byod_image_build[0]}") + custom_byod_images.add(custom_byod_image_build) + return list(custom_byod_images) + + +def create_custom_build_yaml(destination_file: str, tests: List[Test]) -> None: + config = get_global_config() + if not config or not config.get("byod_ecr_region") or not config.get("byod_ecr"): + raise ValueError("byod_ecr_region and byod_ecr must be set in the config") + """Create a yaml file for building custom BYOD images""" + custom_byod_images = get_images_from_tests(tests) + if not custom_byod_images: + return + build_config = {"group": "Custom images build", "steps": []} + + for image, base_image, post_build_script in custom_byod_images: + if not post_build_script: + continue + step = { + "label": f":tapioca: build custom: {image}", + "key": _generate_custom_build_step_key(image), + "instance_type": "release-medium", + "commands": [ + f"aws ecr get-login-password --region {config['byod_ecr_region']} | docker login --username AWS --password-stdin {config['byod_ecr']}", + f"bazelisk run //release:custom_byod_build -- --image-name {image} --base-image {base_image} --post-build-script {post_build_script}", + ], + } + if "ray-ml" in image: + step["depends_on"] = "anyscalemlbuild" + elif "ray-llm" in image: + step["depends_on"] = "anyscalellmbuild" + else: + step["depends_on"] = "anyscalebuild" + build_config["steps"].append(step) + + with open(destination_file, "w") as f: + 
yaml.dump(build_config, f, default_flow_style=False, sort_keys=False) diff --git a/release/ray_release/schema.json b/release/ray_release/schema.json index 7be825405dea..7ad30d9c3ce1 100644 --- a/release/ray_release/schema.json +++ b/release/ray_release/schema.json @@ -36,7 +36,6 @@ "type": "string", "enum": [ "manual", - "multi", "nightly", "nightly-3x", "weekly", @@ -201,7 +200,6 @@ "type": "string", "enum": [ "manual", - "multi", "nightly", "nightly-3x", "weekly", diff --git a/release/ray_release/scripts/build_pipeline.py b/release/ray_release/scripts/build_pipeline.py index 29e448d8f4fd..4cf21864da0a 100644 --- a/release/ray_release/scripts/build_pipeline.py +++ b/release/ray_release/scripts/build_pipeline.py @@ -21,7 +21,6 @@ from ray_release.configs.global_config import init_global_config from ray_release.exception import ReleaseTestCLIError, ReleaseTestConfigError from ray_release.logger import logger -from ray_release.wheels import get_buildkite_repo_branch PIPELINE_ARTIFACT_PATH = "/tmp/pipeline_artifacts" @@ -79,14 +78,14 @@ def main( env = {} frequency = settings["frequency"] prefer_smoke_tests = settings["prefer_smoke_tests"] - test_attr_regex_filters = settings["test_attr_regex_filters"] + test_filters = settings["test_filters"] priority = settings["priority"] logger.info( f"Found the following buildkite pipeline settings:\n\n" f" frequency = {settings['frequency']}\n" f" prefer_smoke_tests = {settings['prefer_smoke_tests']}\n" - f" test_attr_regex_filters = {settings['test_attr_regex_filters']}\n" + f" test_filters = {settings['test_filters']}\n" f" ray_test_repo = {settings['ray_test_repo']}\n" f" ray_test_branch = {settings['ray_test_branch']}\n" f" priority = {settings['priority']}\n" @@ -111,7 +110,7 @@ def main( filtered_tests = filter_tests( test_collection, frequency=frequency, - test_attr_regex_filters=test_attr_regex_filters, + test_filters=test_filters, prefer_smoke_tests=prefer_smoke_tests, run_jailed_tests=run_jailed_tests, 
run_unstable_tests=run_unstable_tests, @@ -127,7 +126,12 @@ def main( build_anyscale_base_byod_images(tests) logger.info("Build anyscale custom BYOD images") for test in tests: - build_anyscale_custom_byod_image(test) + if test.require_custom_byod_image(): + build_anyscale_custom_byod_image( + test.get_anyscale_byod_image(), + test.get_anyscale_base_byod_image(), + test.get_byod_post_build_script(), + ) grouped_tests = group_tests(filtered_tests) group_str = "" @@ -145,10 +149,16 @@ def main( if no_concurrency_limit: logger.warning("Concurrency is not limited for this run!") - _, buildkite_branch = get_buildkite_repo_branch() if os.environ.get("REPORT_TO_RAY_TEST_DB", False): env["REPORT_TO_RAY_TEST_DB"] = "1" + # Pipe through RAYCI_BUILD_ID from the forge step. + # TODO(khluu): convert the steps to rayci steps and stop passing through + # RAYCI_BUILD_ID. + build_id = os.environ.get("RAYCI_BUILD_ID") + if build_id: + env["RAYCI_BUILD_ID"] = build_id + steps = get_step_for_test_group( grouped_tests, minimum_run_per_test=run_per_test, diff --git a/release/ray_release/scripts/custom_byod_build.py b/release/ray_release/scripts/custom_byod_build.py new file mode 100644 index 000000000000..c773b46ff5ca --- /dev/null +++ b/release/ray_release/scripts/custom_byod_build.py @@ -0,0 +1,14 @@ +import click +from ray_release.byod.build import build_anyscale_custom_byod_image + + +@click.command() +@click.option("--image-name", type=str, required=True) +@click.option("--base-image", type=str, required=True) +@click.option("--post-build-script", type=str, required=True) +def main(image_name: str, base_image: str, post_build_script: str): + build_anyscale_custom_byod_image(image_name, base_image, post_build_script) + + +if __name__ == "__main__": + main() diff --git a/release/ray_release/scripts/custom_byod_build_init.py b/release/ray_release/scripts/custom_byod_build_init.py new file mode 100644 index 000000000000..ba0df452e2ce --- /dev/null +++ 
b/release/ray_release/scripts/custom_byod_build_init.py @@ -0,0 +1,112 @@ +import os +from typing import Tuple +from pathlib import Path +import sys + +import click + +from ray_release.buildkite.filter import filter_tests +from ray_release.buildkite.settings import get_pipeline_settings +from ray_release.config import ( + read_and_validate_release_test_collection, + RELEASE_TEST_CONFIG_FILES, +) +from ray_release.configs.global_config import init_global_config +from ray_release.exception import ReleaseTestConfigError, ReleaseTestCLIError +from ray_release.logger import logger +from ray_release.custom_byod_build_init_helper import create_custom_build_yaml + + +@click.command( + help="Create a rayci yaml file for building custom BYOD images based on tests." +) +@click.option( + "--test-collection-file", + type=str, + multiple=True, + help="Test collection file, relative path to ray repo.", +) +@click.option( + "--run-jailed-tests", + is_flag=True, + show_default=True, + default=False, + help=("Will run jailed tests."), +) +@click.option( + "--run-unstable-tests", + is_flag=True, + show_default=True, + default=False, + help=("Will run unstable tests."), +) +@click.option( + "--global-config", + default="oss_config.yaml", + type=click.Choice( + [x.name for x in (Path(__file__).parent.parent / "configs").glob("*.yaml")] + ), + help="Global config to use for test execution.", +) +@click.option( + "--frequency", + default=None, + type=click.Choice(["manual", "nightly", "nightly-3x", "weekly"]), + help="Run frequency of the test", +) +@click.option( + "--test-filters", + default=None, + type=str, + help="Test filters by prefix/regex.", +) +def main( + test_collection_file: Tuple[str], + run_jailed_tests: bool = False, + run_unstable_tests: bool = False, + global_config: str = "oss_config.yaml", + frequency: str = None, + test_filters: str = None, +): + global_config_file = os.path.join( + os.path.dirname(__file__), "..", "configs", global_config + ) + 
init_global_config(global_config_file) + settings = get_pipeline_settings() + + frequency = frequency or settings["frequency"] + prefer_smoke_tests = settings["prefer_smoke_tests"] + test_filters = test_filters or settings["test_filters"] + + try: + test_collection = read_and_validate_release_test_collection( + test_collection_file or RELEASE_TEST_CONFIG_FILES + ) + except ReleaseTestConfigError as e: + raise ReleaseTestConfigError( + "Cannot load test yaml file.\nHINT: If you're kicking off tests for a " + "specific commit on Buildkite to test Ray wheels, after clicking " + "'New build', leave the commit at HEAD, and only specify the commit " + "in the dialog that asks for the Ray wheels." + ) from e + + filtered_tests = filter_tests( + test_collection, + frequency=frequency, + test_filters=test_filters, + prefer_smoke_tests=prefer_smoke_tests, + run_jailed_tests=run_jailed_tests, + run_unstable_tests=run_unstable_tests, + ) + logger.info(f"Found {len(filtered_tests)} tests to run.") + if len(filtered_tests) == 0: + raise ReleaseTestCLIError( + "Empty test collection. The selected frequency or filter did " + "not return any tests to run. Adjust your filters." 
+ ) + tests = [test for test, _ in filtered_tests] + create_custom_build_yaml(".buildkite/release/custom_byod_build.rayci.yml", tests) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/release/ray_release/template.py b/release/ray_release/template.py index da060da25f93..31cca890db37 100644 --- a/release/ray_release/template.py +++ b/release/ray_release/template.py @@ -25,6 +25,7 @@ class TestEnvironment(dict): _test_env = None +_bazel_workspace_dir = os.environ.get("BUILD_WORKSPACE_DIRECTORY", "") def get_test_environment(): @@ -72,13 +73,28 @@ def render_yaml_template(template: str, env: Optional[Dict] = None): ) from e -def get_working_dir(test: "Test", test_definition_root: Optional[str] = None) -> str: +def get_working_dir( + test: "Test", + test_definition_root: Optional[str] = None, + bazel_workspace_dir: Optional[str] = None, +) -> str: + if not bazel_workspace_dir: + bazel_workspace_dir = _bazel_workspace_dir + if bazel_workspace_dir and test_definition_root: + raise ReleaseTestConfigError( + "test_definition_root should not be specified when running with Bazel." + ) working_dir = test.get("working_dir", "") - if working_dir.startswith("//"): - return bazel_runfile(working_dir.lstrip("//")) if test_definition_root: return os.path.join(test_definition_root, working_dir) - return bazel_runfile("release", working_dir) + if working_dir.startswith("//"): + working_dir = working_dir.lstrip("//") + else: + working_dir = os.path.join("release", working_dir) + if bazel_workspace_dir: + return os.path.join(bazel_workspace_dir, working_dir) + else: + return bazel_runfile(working_dir) def load_test_cluster_compute( diff --git a/release/ray_release/test.py b/release/ray_release/test.py index 03ee427338f5..c912a7b00bdb 100644 --- a/release/ray_release/test.py +++ b/release/ray_release/test.py @@ -541,23 +541,11 @@ def get_byod_base_image_tag(self) -> str: # TODO(can): this is a temporary backdoor that should be removed # once civ2 is fully rolled out. 
return byod_image_tag - commit = os.environ.get( - "COMMIT_TO_TEST", - os.environ["BUILDKITE_COMMIT"], - ) - branch = os.environ.get( - "BRANCH_TO_TEST", - os.environ["BUILDKITE_BRANCH"], - ) - pr = os.environ.get("BUILDKITE_PULL_REQUEST", "false") - ray_version = commit[:6] - if pr != "false": - ray_version = f"pr-{pr}.{ray_version}" - elif branch.startswith("releases/"): - release_name = branch[len("releases/") :] - ray_version = f"{release_name}.{ray_version}" - python_version = f"py{self.get_python_version().replace('.', '')}" - return f"{ray_version}-{python_version}-{self.get_tag_suffix()}" + build_id = os.environ.get("RAYCI_BUILD_ID", "") + if not build_id: + raise ValueError("RAYCI_BUILD_ID is not set") + python_version = "py" + self.get_python_version().replace(".", "") + return f"{build_id}-{python_version}-{self.get_tag_suffix()}" def get_byod_image_tag(self) -> str: """ diff --git a/release/ray_release/tests/test_buildkite.py b/release/ray_release/tests/test_buildkite.py index f8fbf964a8bb..08128ee3c8f0 100644 --- a/release/ray_release/tests/test_buildkite.py +++ b/release/ray_release/tests/test_buildkite.py @@ -20,7 +20,7 @@ Frequency, update_settings_from_buildkite, Priority, - get_test_attr_regex_filters, + get_test_filters, ) from ray_release.buildkite.step import ( get_step, @@ -110,23 +110,23 @@ def testSplitRayRepoStr(self): self.assertEqual(branch, DEFAULT_BRANCH) def testGetTestAttrRegexFilters(self): - test_attr_regex_filters = get_test_attr_regex_filters("") - self.assertDictEqual(test_attr_regex_filters, {}) + test_filters = get_test_filters("") + self.assertDictEqual(test_filters, {}) - test_attr_regex_filters = get_test_attr_regex_filters("name:xxx") - self.assertDictEqual(test_attr_regex_filters, {"name": "xxx"}) + test_filters = get_test_filters("name:xxx") + self.assertDictEqual(test_filters, {"name": "xxx"}) - test_attr_regex_filters = get_test_attr_regex_filters("name:xxx\n") - self.assertDictEqual(test_attr_regex_filters, {"name": 
"xxx"}) + test_filters = get_test_filters("name:xxx\n") + self.assertDictEqual(test_filters, {"name": "xxx"}) - test_attr_regex_filters = get_test_attr_regex_filters("name:xxx\n\nteam:yyy") - self.assertDictEqual(test_attr_regex_filters, {"name": "xxx", "team": "yyy"}) + test_filters = get_test_filters("name:xxx\n\nteam:yyy") + self.assertDictEqual(test_filters, {"name": "xxx", "team": "yyy"}) - test_attr_regex_filters = get_test_attr_regex_filters("name:xxx\n \nteam:yyy\n") - self.assertDictEqual(test_attr_regex_filters, {"name": "xxx", "team": "yyy"}) + test_filters = get_test_filters("name:xxx\n \nteam:yyy\n") + self.assertDictEqual(test_filters, {"name": "xxx", "team": "yyy"}) with self.assertRaises(ReleaseTestConfigError): - get_test_attr_regex_filters("xxx") + get_test_filters("xxx") def testSettingsOverrideEnv(self): settings = get_default_settings() @@ -168,8 +168,9 @@ def testSettingsOverrideEnv(self): os.environ["TEST_ATTR_REGEX_FILTERS"] = "name:xxx\nteam:yyy\n" updated_settings = settings.copy() update_settings_from_environment(updated_settings) + print(updated_settings) self.assertDictEqual( - updated_settings["test_attr_regex_filters"], + updated_settings["test_filters"], { "name": "xxx", "team": "yyy", @@ -191,7 +192,7 @@ def testSettingsOverrideEnv(self): { "frequency": Frequency.NIGHTLY, "prefer_smoke_tests": False, - "test_attr_regex_filters": {"name": "name_filter"}, + "test_filters": {"name": "name_filter"}, "ray_test_repo": "https://github.com/user/ray.git", "ray_test_branch": "sub/branch", "priority": Priority.MANUAL, @@ -206,7 +207,7 @@ def testSettingsOverrideEnv(self): { "frequency": Frequency.ANY, "prefer_smoke_tests": True, - "test_attr_regex_filters": {"name": "name_filter"}, + "test_filters": {"name": "name_filter"}, "ray_test_repo": "https://github.com/user/ray.git", "ray_test_branch": "sub/branch", "priority": Priority.MANUAL, @@ -321,18 +322,18 @@ def testSettingsOverrideBuildkite(self): # Invalid test attr regex filters 
self.buildkite.clear() self.buildkite.update(buildkite) - self.buildkite["release-test-attr-regex-filters"] = "xxxx" + self.buildkite["release-test-filters"] = "xxxx" updated_settings = settings.copy() with self.assertRaises(ReleaseTestConfigError): update_settings_from_buildkite(updated_settings) self.buildkite.clear() self.buildkite.update(buildkite) - self.buildkite["release-test-attr-regex-filters"] = "name:xxx\ngroup:yyy" + self.buildkite["release-test-filters"] = "name:xxx\ngroup:yyy" updated_settings = settings.copy() update_settings_from_buildkite(updated_settings) self.assertDictEqual( - updated_settings["test_attr_regex_filters"], + updated_settings["test_filters"], { "name": "xxx", "group": "yyy", @@ -353,7 +354,7 @@ def testSettingsOverrideBuildkite(self): { "frequency": Frequency.NIGHTLY, "prefer_smoke_tests": False, - "test_attr_regex_filters": {"name": "name_filter"}, + "test_filters": {"name": "name_filter"}, "ray_test_repo": "https://github.com/user/ray.git", "ray_test_branch": "sub/branch", "priority": Priority.MANUAL, @@ -369,7 +370,7 @@ def testSettingsOverrideBuildkite(self): { "frequency": Frequency.ANY, "prefer_smoke_tests": True, - "test_attr_regex_filters": {"name": "name_filter"}, + "test_filters": {"name": "name_filter"}, "ray_test_repo": "https://github.com/user/ray.git", "ray_test_branch": "sub/branch", "priority": Priority.MANUAL, @@ -377,7 +378,7 @@ def testSettingsOverrideBuildkite(self): }, ) - def _filter_names_smoke(self, *args, **kwargs): + def _filter_names(self, *args, **kwargs): filtered = filter_tests(*args, **kwargs) return [(t[0]["name"], t[1]) for t in filtered] @@ -411,7 +412,7 @@ def testFilterTests(self, *args): { "name": "other_2", "frequency": "nightly", - "smoke_test": {"frequency": "multi"}, + "smoke_test": {"frequency": "manual"}, "team": "team_2", "run": {"type": "job"}, } @@ -429,7 +430,32 @@ def testFilterTests(self, *args): ), ] - filtered = self._filter_names_smoke(tests, frequency=Frequency.ANY) + # Test 
filter by prefix alone + filtered = self._filter_names( + tests, frequency=Frequency.ANY, test_filters={"prefix": "test"} + ) + self.assertSequenceEqual( + filtered, + [ + ("test_1", False), + ("test_2", False), + ("test_3", False), + ("test_4.kuberay", False), + ], + ) + + # Test filter by prefix and regex together + filtered = self._filter_names( + tests, + frequency=Frequency.NIGHTLY, + test_filters={"prefix": "test", "name": "other.*"}, + ) + self.assertSequenceEqual( + filtered, + [], + ) + + filtered = self._filter_names(tests, frequency=Frequency.ANY) self.assertSequenceEqual( filtered, [ @@ -444,7 +470,7 @@ def testFilterTests(self, *args): ) assert not test.get("update_from_s3") - filtered = self._filter_names_smoke( + filtered = self._filter_names( tests, frequency=Frequency.ANY, prefer_smoke_tests=True, @@ -462,7 +488,7 @@ def testFilterTests(self, *args): ], ) - filtered = self._filter_names_smoke(tests, frequency=Frequency.NIGHTLY) + filtered = self._filter_names(tests, frequency=Frequency.NIGHTLY) self.assertSequenceEqual( filtered, [ @@ -474,7 +500,7 @@ def testFilterTests(self, *args): ], ) - filtered = self._filter_names_smoke( + filtered = self._filter_names( tests, frequency=Frequency.NIGHTLY, prefer_smoke_tests=True, @@ -490,13 +516,13 @@ def testFilterTests(self, *args): ], ) - filtered = self._filter_names_smoke(tests, frequency=Frequency.WEEKLY) + filtered = self._filter_names(tests, frequency=Frequency.WEEKLY) self.assertSequenceEqual(filtered, [("test_2", False), ("other_1", False)]) - filtered = self._filter_names_smoke( + filtered = self._filter_names( tests, frequency=Frequency.NIGHTLY, - test_attr_regex_filters={"name": "other.*"}, + test_filters={"name": "other.*"}, ) self.assertSequenceEqual( filtered, @@ -505,10 +531,10 @@ def testFilterTests(self, *args): ], ) - filtered = self._filter_names_smoke( + filtered = self._filter_names( tests, frequency=Frequency.NIGHTLY, - test_attr_regex_filters={"name": "test.*"}, + 
test_filters={"name": "test.*"}, ) self.assertSequenceEqual( filtered, @@ -520,46 +546,46 @@ def testFilterTests(self, *args): ], ) - filtered = self._filter_names_smoke( - tests, frequency=Frequency.NIGHTLY, test_attr_regex_filters={"name": "test"} + filtered = self._filter_names( + tests, frequency=Frequency.NIGHTLY, test_filters={"name": "test"} ) self.assertSequenceEqual(filtered, []) - filtered = self._filter_names_smoke( + filtered = self._filter_names( tests, frequency=Frequency.NIGHTLY, - test_attr_regex_filters={"name": "test.*", "team": "team_1"}, + test_filters={"name": "test.*", "team": "team_1"}, ) self.assertSequenceEqual(filtered, [("test_1", False)]) - filtered = self._filter_names_smoke( + filtered = self._filter_names( tests, frequency=Frequency.NIGHTLY, - test_attr_regex_filters={"name": "test_1|test_2"}, + test_filters={"name": "test_1|test_2"}, ) self.assertSequenceEqual(filtered, [("test_1", False), ("test_2", True)]) # Filter by nested properties - filtered = self._filter_names_smoke( + filtered = self._filter_names( tests, frequency=Frequency.ANY, - test_attr_regex_filters={"run/type": "job"}, + test_filters={"run/type": "job"}, ) self.assertSequenceEqual( filtered, [("test_1", False), ("other_2", False), ("test_4.kuberay", False)] ) - filtered = self._filter_names_smoke( + filtered = self._filter_names( tests, frequency=Frequency.ANY, - test_attr_regex_filters={"run/type": "client"}, + test_filters={"run/type": "client"}, ) self.assertSequenceEqual(filtered, [("test_2", False)]) - filtered = self._filter_names_smoke( + filtered = self._filter_names( tests, frequency=Frequency.ANY, - test_attr_regex_filters={"run/invalid": "xxx"}, + test_filters={"run/invalid": "xxx"}, ) self.assertSequenceEqual(filtered, []) @@ -586,7 +612,7 @@ def testGetStep(self): "name": "test", "frequency": "nightly", "run": {"script": "test_script.py"}, - "smoke_test": {"frequency": "multi"}, + "smoke_test": {"frequency": "nightly"}, } ) diff --git 
a/release/ray_release/tests/test_byod_build.py b/release/ray_release/tests/test_byod_build.py index 920b2c826e36..039d302182a0 100644 --- a/release/ray_release/tests/test_byod_build.py +++ b/release/ray_release/tests/test_byod_build.py @@ -10,7 +10,6 @@ from ray_release.byod.build import ( build_anyscale_custom_byod_image, build_anyscale_base_byod_images, - DATAPLANE_FILENAME, _get_ray_commit, ) @@ -42,10 +41,6 @@ def test_get_ray_commit() -> None: init_global_config(bazel_runfile("release/ray_release/configs/oss_config.yaml")) -# Create a mock file to simulate the S3 download -with open(DATAPLANE_FILENAME, "wb") as f: - f.write(b"abc123") - def test_build_anyscale_custom_byod_image() -> None: cmds = [] @@ -59,7 +54,10 @@ def _mock_check_call( with patch("ray_release.byod.build._image_exist", return_value=False), patch.dict( "os.environ", - {"BUILDKITE_COMMIT": "abc123", "BUILDKITE_BRANCH": "master"}, + { + "BUILDKITE_COMMIT": "abc123", + "RAYCI_BUILD_ID": "a1b2c3d4", + }, ), patch("subprocess.check_call", side_effect=_mock_check_call,), patch( "subprocess.check_output", return_value=b"abc123", @@ -74,31 +72,23 @@ def _mock_check_call( test.get_byod_post_build_script(), ) assert "docker build --build-arg BASE_IMAGE=029272617770.dkr.ecr.us-west-2." - "amazonaws.com/anyscale/ray:abc123-py37 -t 029272617770.dkr.ecr.us-west-2." - "amazonaws.com/anyscale/ray:abc123-py37-c3fc5fc6d84cea4d7ab885c6cdc966542e" + "amazonaws.com/anyscale/ray:a1b2c3d4-py37 -t 029272617770.dkr.ecr.us-west-2." 
+ "amazonaws.com/anyscale/ray:a1b2c3d4-py37-c3fc5fc6d84cea4d7ab885c6cdc966542e" "f59e4c679b8c970f2f77b956bfd8fb" in " ".join(cmds[0]) def test_build_anyscale_base_byod_images() -> None: - images = [] - - def _mock_validate_and_push(image: str) -> None: - images.append(image) - def _mock_image_exist(image: str) -> bool: - return "rayproject/ray" in image + return True with patch( - "ray_release.byod.build._download_dataplane_build_file", return_value=None - ), patch( "os.environ", - {"BUILDKITE_COMMIT": "abc123", "BUILDKITE_BRANCH": "master"}, - ), patch( - "subprocess.check_call", return_value=None - ), patch( + { + "BUILDKITE_COMMIT": "abc123", + "RAYCI_BUILD_ID": "a1b2c3d4", + }, + ), patch("subprocess.check_call", return_value=None), patch( "ray_release.byod.build._image_exist", side_effect=_mock_image_exist - ), patch( - "ray_release.byod.build._validate_and_push", side_effect=_mock_validate_and_push ): tests = [ Test(name="aws", env="aws", cluster={"byod": {}}), @@ -122,18 +112,18 @@ def _mock_image_exist(image: str) -> bool: ), Test(name="gce", env="gce", cluster={"byod": {}}), ] - build_anyscale_base_byod_images(tests) + images = build_anyscale_base_byod_images(tests) global_config = get_global_config() aws_cr = global_config["byod_aws_cr"] gcp_cr = global_config["byod_gcp_cr"] - assert images == [ - f"{aws_cr}/anyscale/ray:abc123-py39-cpu", - f"{aws_cr}/anyscale/ray-ml:abc123-py39-gpu", - f"{aws_cr}/anyscale/ray:abc123-py39-cu121", - f"{aws_cr}/anyscale/ray:abc123-py39-cu116", - f"{aws_cr}/anyscale/ray:abc123-py311-cu118", - f"{gcp_cr}/anyscale/ray:abc123-py39-cpu", - ] + assert set(images) == { + f"{aws_cr}/anyscale/ray:a1b2c3d4-py39-cpu", + f"{aws_cr}/anyscale/ray:a1b2c3d4-py39-cu116", + f"{aws_cr}/anyscale/ray:a1b2c3d4-py39-cu121", + f"{aws_cr}/anyscale/ray:a1b2c3d4-py311-cu118", + f"{aws_cr}/anyscale/ray-ml:a1b2c3d4-py39-gpu", + f"{gcp_cr}/anyscale/ray:a1b2c3d4-py39-cpu", + } if __name__ == "__main__": diff --git 
a/release/ray_release/tests/test_config.py b/release/ray_release/tests/test_config.py index b0ec1c4b1a2c..c815dfc1bba6 100644 --- a/release/ray_release/tests/test_config.py +++ b/release/ray_release/tests/test_config.py @@ -37,7 +37,7 @@ "wait_for_nodes": {"num_nodes": 2, "timeout": 100}, "type": "client", }, - "smoke_test": {"run": {"timeout": 20}, "frequency": "multi"}, + "smoke_test": {"run": {"timeout": 20}, "frequency": "nightly"}, "alert": "default", } diff --git a/release/ray_release/tests/test_custom_byod_build.py b/release/ray_release/tests/test_custom_byod_build.py new file mode 100644 index 000000000000..7e29a352096d --- /dev/null +++ b/release/ray_release/tests/test_custom_byod_build.py @@ -0,0 +1,63 @@ +import sys +import pytest +from unittest.mock import patch + +from click.testing import CliRunner +from ray_release.scripts.custom_byod_build import main + + +@patch("ray_release.scripts.custom_byod_build.build_anyscale_custom_byod_image") +def test_custom_byod_build(mock_build_anyscale_custom_byod_image): + mock_build_anyscale_custom_byod_image.return_value = None + runner = CliRunner() + result = runner.invoke( + main, + [ + "--image-name", + "test-image", + "--base-image", + "test-base-image", + "--post-build-script", + "test_post_build_script.sh", + ], + ) + assert result.exit_code == 0 + + +@patch("ray_release.scripts.custom_byod_build.build_anyscale_custom_byod_image") +def test_custom_byod_build_missing_arg(mock_build_anyscale_custom_byod_image): + mock_build_anyscale_custom_byod_image.return_value = None + runner = CliRunner() + result = runner.invoke( + main, + [ + "--base-image", + "test-base-image", + "--post-build-script", + "test_post_build_script.sh", + ], + ) + assert result.exit_code == 2 + assert "Error: Missing option '--image-name'" in result.output + + result = runner.invoke( + main, + [ + "--image-name", + "test-image", + "--post-build-script", + "test_post_build_script.sh", + ], + ) + assert result.exit_code == 2 + assert "Error: 
Missing option '--base-image'" in result.output + + result = runner.invoke( + main, ["--image-name", "test-image", "--base-image", "test-base-image"] + ) + assert result.exit_code == 2 + assert "Error: Missing option '--post-build-script'" in result.output + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/release/ray_release/tests/test_custom_byod_build_init_helper.py b/release/ray_release/tests/test_custom_byod_build_init_helper.py new file mode 100644 index 000000000000..895f192d3278 --- /dev/null +++ b/release/ray_release/tests/test_custom_byod_build_init_helper.py @@ -0,0 +1,77 @@ +import os +import tempfile +import sys +import pytest +from unittest import mock +import yaml + +from ray_release.custom_byod_build_init_helper import create_custom_build_yaml +from ray_release.configs.global_config import init_global_config +from ray_release.bazel import bazel_runfile +from ray_release.test import Test +from ray_release.configs.global_config import get_global_config + + +init_global_config(bazel_runfile("release/ray_release/configs/oss_config.yaml")) + + +@mock.patch("ray_release.custom_byod_build_init_helper.get_images_from_tests") +def test_create_custom_build_yaml(mock_get_images_from_tests): + config = get_global_config() + custom_byod_images = [ + ( + "ray-project/ray-ml:abc123-custom", + "ray-project/ray-ml:abc123-base", + "custom_script.sh", + ), + ("ray-project/ray-ml:abc123-custom", "ray-project/ray-ml:abc123-base", ""), + ( + "ray-project/ray-ml:nightly-py37-cpu-custom-abcdef123456789abc123456789", + "ray-project/ray-ml:nightly-py37-cpu-base", + "custom_script.sh", + ), # longer than 40 chars + ] + mock_get_images_from_tests.return_value = custom_byod_images + + # List of dummy tests + tests = [ + Test( + name="test_1", + frequency="manual", + group="test_group", + team="test_team", + working_dir="test_working_dir", + ), + Test( + name="test_2", + frequency="manual", + group="test_group", + team="test_team", + 
working_dir="test_working_dir", + ), + ] + with tempfile.TemporaryDirectory() as tmpdir: + create_custom_build_yaml( + os.path.join(tmpdir, "custom_byod_build.rayci.yml"), tests + ) + with open(os.path.join(tmpdir, "custom_byod_build.rayci.yml"), "r") as f: + content = yaml.safe_load(f) + assert content["group"] == "Custom images build" + assert len(content["steps"]) == 2 + assert ( + f"--region {config['byod_ecr_region']}" + in content["steps"][0]["commands"][0] + ) + assert f"{config['byod_ecr']}" in content["steps"][0]["commands"][0] + assert ( + f"--image-name {custom_byod_images[0][0]}" + in content["steps"][0]["commands"][1] + ) + assert ( + f"--image-name {custom_byod_images[2][0]}" + in content["steps"][1]["commands"][1] + ) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/release/ray_release/tests/test_global_config.py b/release/ray_release/tests/test_global_config.py index 439dc25c3b14..ce60f26d3175 100644 --- a/release/ray_release/tests/test_global_config.py +++ b/release/ray_release/tests/test_global_config.py @@ -33,6 +33,10 @@ postmerge: - hi - three +release_image_step: + ray: anyscalebuild + ray_ml: anyscalemlbuild + ray_llm: anyscalellmbuild """ @@ -56,6 +60,9 @@ def test_init_global_config() -> None: assert config["byod_ray_cr_repo"] == "ray" assert config["byod_ray_ml_cr_repo"] == "ray-ml" assert config["byod_ray_llm_cr_repo"] == "ray-llm" + assert config["release_image_step_ray"] == "anyscalebuild" + assert config["release_image_step_ray_ml"] == "anyscalemlbuild" + assert config["release_image_step_ray_llm"] == "anyscalellmbuild" if __name__ == "__main__": diff --git a/release/ray_release/tests/test_template.py b/release/ray_release/tests/test_template.py new file mode 100644 index 000000000000..4f5ef46d6426 --- /dev/null +++ b/release/ray_release/tests/test_template.py @@ -0,0 +1,56 @@ +import sys + +import pytest + +from ray_release.test import Test +from ray_release.template import get_working_dir, 
bazel_runfile +from ray_release.exception import ReleaseTestConfigError + + +def test_get_working_dir_with_path_from_root(): + test_with_path_from_root = Test( + { + "name": "test", + "working_dir": "//ray_testing/ray_release/tests", + } + ) + assert ( + get_working_dir(test_with_path_from_root, None, "/tmp/bazel_workspace") + == "/tmp/bazel_workspace/ray_testing/ray_release/tests" + ) + assert get_working_dir(test_with_path_from_root, None, None) == bazel_runfile( + "ray_testing/ray_release/tests" + ) + + +def test_get_working_dir_with_relative_path(): + test_with_relative_path = Test( + { + "name": "test", + "working_dir": "ray_release/tests", + } + ) + assert ( + get_working_dir(test_with_relative_path, None, "/tmp/bazel_workspace") + == "/tmp/bazel_workspace/release/ray_release/tests" + ) + assert get_working_dir(test_with_relative_path, None, None) == bazel_runfile( + "release/ray_release/tests" + ) + + +def test_get_working_dir_fail(): + test_with_path_from_root = Test( + { + "name": "test", + "working_dir": "//ray_testing/ray_release/tests", + } + ) + with pytest.raises(ReleaseTestConfigError): + get_working_dir( + test_with_path_from_root, "/tmp/test_definition_root", "tmp/bazel_workspace" + ) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/release/ray_release/tests/test_test.py b/release/ray_release/tests/test_test.py index e4899de2ebb7..513eba2a9878 100644 --- a/release/ray_release/tests/test_test.py +++ b/release/ray_release/tests/test_test.py @@ -90,8 +90,9 @@ def test_get_python_version(): def test_get_ray_image(): - os.environ["BUILDKITE_BRANCH"] = "master" - os.environ["BUILDKITE_COMMIT"] = "1234567890" + os.environ["RAYCI_BUILD_ID"] = "a1b2c3d4" + + # These images are NOT saved on Docker Hub, but on private ECR. 
assert ( _stub_test( { @@ -99,7 +100,7 @@ def test_get_ray_image(): "cluster": {"byod": {}}, } ).get_ray_image() - == "rayproject/ray:123456-py39-cpu" + == "rayproject/ray:a1b2c3d4-py39-cpu" ) assert ( _stub_test( @@ -112,7 +113,7 @@ def test_get_ray_image(): }, } ).get_ray_image() - == "rayproject/ray-ml:123456-py39-gpu" + == "rayproject/ray-ml:a1b2c3d4-py39-gpu" ) assert ( _stub_test( @@ -125,23 +126,34 @@ def test_get_ray_image(): }, } ).get_ray_image() - == "rayproject/ray-llm:123456-py311-cu124" - ) - os.environ["BUILDKITE_BRANCH"] = "releases/1.0.0" - assert ( - _stub_test({"cluster": {"byod": {}}}).get_ray_image() - == "rayproject/ray:1.0.0.123456-py39-cpu" + == "rayproject/ray-llm:a1b2c3d4-py311-cu124" ) - with mock.patch.dict(os.environ, {"BUILDKITE_PULL_REQUEST": "123"}): + + # When RAY_IMAGE_TAG is set, it takes precedence over the RAYCI_BUILD_ID. + with mock.patch.dict(os.environ, {"RAY_IMAGE_TAG": "my_tag"}): assert ( _stub_test({"cluster": {"byod": {}}}).get_ray_image() - == "rayproject/ray:pr-123.123456-py39-cpu" + == "rayproject/ray:my_tag" ) - with mock.patch.dict(os.environ, {"RAY_IMAGE_TAG": "my_tag"}): + + with mock.patch.dict(os.environ, {"BUILDKITE_BRANCH": "releases/1.0.0"}): + # Even on release branches, we also use the RAYCI_BUILD_ID. assert ( _stub_test({"cluster": {"byod": {}}}).get_ray_image() - == "rayproject/ray:my_tag" + == "rayproject/ray:a1b2c3d4-py39-cpu" ) + with mock.patch.dict(os.environ, {"BUILDKITE_PULL_REQUEST": "123"}): + assert ( + _stub_test({"cluster": {"byod": {}}}).get_ray_image() + == "rayproject/ray:a1b2c3d4-py39-cpu" + ) + + # Unless RAY_IMAGE_TAG is set, we use the RAYCI_BUILD_ID. 
+ with mock.patch.dict(os.environ, {"RAY_IMAGE_TAG": "my_tag"}): + assert ( + _stub_test({"cluster": {"byod": {}}}).get_ray_image() + == "rayproject/ray:my_tag" + ) def test_get_byod_runtime_env(): @@ -161,11 +173,10 @@ def test_get_byod_runtime_env(): def test_get_anyscale_byod_image(): - os.environ["BUILDKITE_BRANCH"] = "master" - os.environ["BUILDKITE_COMMIT"] = "1234567890" + os.environ["RAYCI_BUILD_ID"] = "a1b2c3d4" assert ( _stub_test({"python": "3.7", "cluster": {"byod": {}}}).get_anyscale_byod_image() - == f"{get_global_config()['byod_ecr']}/{DATAPLANE_ECR_REPO}:123456-py37-cpu" + == f"{get_global_config()['byod_ecr']}/{DATAPLANE_ECR_REPO}:a1b2c3d4-py37-cpu" ) assert _stub_test( { @@ -177,7 +188,8 @@ def test_get_anyscale_byod_image(): }, } ).get_anyscale_byod_image() == ( - f"{get_global_config()['byod_ecr']}/" f"{DATAPLANE_ECR_ML_REPO}:123456-py38-gpu" + f"{get_global_config()['byod_ecr']}/" + f"{DATAPLANE_ECR_ML_REPO}:a1b2c3d4-py38-gpu" ) assert _stub_test( { @@ -191,7 +203,7 @@ def test_get_anyscale_byod_image(): } ).get_anyscale_byod_image() == ( f"{get_global_config()['byod_ecr']}" - f"/{DATAPLANE_ECR_ML_REPO}:123456-py38-gpu-" + f"/{DATAPLANE_ECR_ML_REPO}:a1b2c3d4-py38-gpu-" "ab7ed2b7a7e8d3f855a7925b0d296b0f9c75fac91882aba47854d92d27e13e53" ) diff --git a/release/release_data_tests.yaml b/release/release_data_tests.yaml index 5a1bf0debacc..52839de05d80 100644 --- a/release/release_data_tests.yaml +++ b/release/release_data_tests.yaml @@ -35,6 +35,22 @@ s3://ray-benchmark-data-internal-us-west-2/imagenet/parquet --format parquet --iter-bundles +- name: "read_large_parquet_{{scaling}}" + + cluster: + cluster_compute: "{{scaling}}_cpu_compute.yaml" + + matrix: + setup: + scaling: [fixed_size, autoscaling] + + run: + timeout: 3600 + script: > + python read_and_consume_benchmark.py + s3://ray-benchmark-data-internal-us-west-2/large-parquet/ --format parquet + --iter-bundles + - name: "read_images_{{scaling}}" cluster: @@ -224,6 +240,25 @@ --join_type 
{{join_type}} --num_partitions 50 +############### +# Wide Schema tests +############### + +- name: wide_schema_pipeline_{{data_type}} + + cluster: + cluster_compute: fixed_size_cpu_compute.yaml + + matrix: + setup: + data_type: [primitives, tensors, objects, nested_structs] + + run: + timeout: 300 + script: > + python wide_schema_pipeline_benchmark.py + --data-type {{data_type}} + ####################### # Streaming split tests ####################### @@ -457,7 +492,8 @@ # 300 GB image classification parquet data up to 10 GPUs # 10 g4dn.12xlarge. -- name: "batch_inference_{{scaling}}" +- name: "image_classification_{{scaling}}" + group: batch-inference cluster: cluster_compute: "{{scaling}}_gpu_compute.yaml" @@ -472,11 +508,12 @@ python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet -- name: batch_inference_chaos +- name: image_classification_chaos stable: False # Don't use 'nightly_tests/dataset' as the working directory because we need to run # the 'setup_chaos.py' script. 
working_dir: nightly_tests + group: batch-inference cluster: cluster_compute: dataset/autoscaling_gpu_compute.yaml @@ -488,9 +525,10 @@ python dataset/gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet --chaos-test -- name: batch_inference_chaos_no_scale_back +- name: image_classification_chaos_no_scale_back stable: False working_dir: nightly_tests + group: batch-inference cluster: cluster_compute: dataset/autoscaling_gpu_compute.yaml @@ -502,48 +540,41 @@ python dataset/gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet --chaos-test -- name: batch_inference_mock_image_pipeline - frequency: manual - working_dir: nightly_tests - - cluster: - cluster_compute: dataset/autoscaling_100_cpu_compute.yaml - - run: - timeout: 3600 - script: > - python dataset/batch_inference_mock_image_pipeline.py - variations: - - __suffix__: regular - - __suffix__: chaos - run: - prepare: > - python setup_chaos.py --chaos TerminateEC2InstanceWithGracePeriod - --batch-size-to-kill 10 --max-to-kill 100 --kill-delay 120 +- name: image_embedding_from_uris_{{case}} + frequency: weekly + group: batch-inference -- name: batch_inference_mock_image_pipeline_fixed - frequency: manual - working_dir: nightly_tests + matrix: + setup: + case: [] + cluster_type: [] + args: [] + adjustments: + - with: + case: fixed_size + cluster_type: fixed_size + args: --inference-concurrency 100 100 + - with: + case: autoscaling + cluster_type: autoscaling + args: --inference-concurrency 1 100 + - with: + case: fixed_size_chaos + cluster_type: fixed_size + args: --inference-concurrency 100 100 --chaos cluster: - cluster_compute: dataset/fixed_size_100_cpu_compute.yaml + cluster_compute: image_embedding_from_uris/{{cluster_type}}_cluster_compute.yaml run: timeout: 3600 - script: > - python dataset/batch_inference_mock_image_pipeline.py + script: python image_embedding_from_uris/main.py {{args}} - variations: - - 
__suffix__: regular - - __suffix__: chaos - run: - prepare: > - python setup_chaos.py --chaos TerminateEC2InstanceWithGracePeriod - --batch-size-to-kill 10 --max-to-kill 100 --kill-delay 120 -- name: batch_inference_hetero_{{case}} - frequency: manual +- name: image_embedding_from_jsonl_{{case}} + frequency: weekly + group: batch-inference matrix: setup: @@ -565,14 +596,46 @@ args: --inference-concurrency 40 40 --chaos cluster: - cluster_compute: batch_inference_hetero/{{cluster_type}}_cluster_compute.yaml + cluster_compute: image_embedding_from_jsonl/{{cluster_type}}_cluster_compute.yaml byod: post_build_script: byod_install_pybase64.sh run: timeout: 3600 - script: python batch_inference_hetero/main.py {{args}} + script: python image_embedding_from_jsonl/main.py {{args}} + +- name: text_embedding_{{case}} + frequency: weekly + group: batch-inference + matrix: + setup: + case: [] + cluster_type: [] + args: [] + adjustments: + - with: + case: fixed_size + cluster_type: fixed_size + args: --inference-concurrency 100 100 + - with: + case: autoscaling + cluster_type: autoscaling + args: --inference-concurrency 1 100 + - with: + case: fixed_size_chaos + cluster_type: fixed_size + args: --inference-concurrency 100 100 --chaos + + cluster: + cluster_compute: text_embedding/{{cluster_type}}_cluster_compute.yaml + byod: + type: cu123 + post_build_script: byod_install_text_embedding.sh + + run: + timeout: 3600 + script: python text_embedding/main.py {{args}} ############## # TPCH Queries diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 9edc6f66a9ac..bf7146fb92f9 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -76,6 +76,26 @@ # # It can then let the test fail, e.g. if a metric regression is observed. 
# alert: default +####################### +# Baseline test +####################### +- name: hello_world + team: reef + group: hello_world + frequency: nightly + working_dir: hello_world_tests + + cluster: + byod: {} + cluster_compute: hello_world_compute_config.yaml + + run: + timeout: 1800 + script: python hello_world.py + + variations: + - __suffix__: aws + ####################### # Cluster scaling tests ####################### @@ -2058,7 +2078,38 @@ cluster: byod: {} - cluster_compute: compute_tpl_single_node_32_cpu.yaml + cluster_compute: compute_tpl_single_node_16_cpu.yaml + cloud_id: cld_wy5a6nhazplvu32526ams61d98 + project_id: prj_lhlrf1u5yv8qz9qg3xzw8fkiiq + + run: + timeout: 7200 + long_running: false + script: python workloads/microbenchmarks.py --run-all + + alert: default + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_compute: compute_tpl_single_node_gce.yaml + +- name: serve_throughput_optimized_microbenchmarks + group: Serve tests + working_dir: serve_tests + + frequency: nightly + team: serve + + cluster: + byod: + runtime_env: + - RAY_SERVE_THROUGHPUT_OPTIMIZED=1 + - RAY_SERVE_DISABLE_SHUTTING_DOWN_INGRESS_REPLICAS_FORCEFULLY=0 + cluster_compute: compute_tpl_single_node_16_cpu.yaml cloud_id: cld_wy5a6nhazplvu32526ams61d98 project_id: prj_lhlrf1u5yv8qz9qg3xzw8fkiiq @@ -2856,10 +2907,7 @@ team: core cluster: - byod: - runtime_env: - - RAY_experimental_enable_open_telemetry_on_agent=1 - - RAY_experimental_enable_open_telemetry_on_core=1 + byod: {} cluster_compute: stress_tests/placement_group_tests_compute.yaml run: @@ -2931,10 +2979,7 @@ working_dir: microbenchmark cluster: - byod: - runtime_env: - - RAY_experimental_enable_open_telemetry_on_agent=1 - - RAY_experimental_enable_open_telemetry_on_core=1 + byod: {} cluster_compute: tpl_64.yaml run: @@ -3213,10 +3258,7 @@ frequency: nightly team: core cluster: - byod: - runtime_env: - - RAY_experimental_enable_open_telemetry_on_agent=1 - - 
RAY_experimental_enable_open_telemetry_on_core=1 + byod: {} cluster_compute: stress_tests/stress_tests_compute.yaml run: @@ -3256,10 +3298,7 @@ frequency: nightly team: core cluster: - byod: - runtime_env: - - RAY_experimental_enable_open_telemetry_on_agent=1 - - RAY_experimental_enable_open_telemetry_on_core=1 + byod: {} cluster_compute: stress_tests/stress_tests_compute.yaml run: @@ -3391,10 +3430,7 @@ team: core env: aws_perf cluster: - byod: - runtime_env: - - RAY_experimental_enable_open_telemetry_on_agent=1 - - RAY_experimental_enable_open_telemetry_on_core=1 + byod: {} cluster_compute: stress_tests/stress_tests_single_node_oom_compute.yaml run: @@ -3558,8 +3594,6 @@ type: gpu runtime_env: - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so - - RAY_experimental_enable_open_telemetry_on_agent=1 - - RAY_experimental_enable_open_telemetry_on_core=1 cluster_compute: single_node.yaml run: @@ -3587,8 +3621,6 @@ type: gpu runtime_env: - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so - - RAY_experimental_enable_open_telemetry_on_agent=1 - - RAY_experimental_enable_open_telemetry_on_core=1 cluster_compute: object_store.yaml run: @@ -3617,8 +3649,6 @@ type: gpu runtime_env: - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so - - RAY_experimental_enable_open_telemetry_on_agent=1 - - RAY_experimental_enable_open_telemetry_on_core=1 cluster_compute: object_store/small_objects.yaml run: @@ -3642,8 +3672,6 @@ type: gpu runtime_env: - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so - - RAY_experimental_enable_open_telemetry_on_agent=1 - - RAY_experimental_enable_open_telemetry_on_core=1 cluster_compute: object_store/large_objects.yaml run: @@ -3667,8 +3695,6 @@ type: gpu runtime_env: - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so - - RAY_experimental_enable_open_telemetry_on_agent=1 - - RAY_experimental_enable_open_telemetry_on_core=1 cluster_compute: distributed.yaml run: @@ -3717,8 +3743,6 @@ type: gpu runtime_env: - 
LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so - - RAY_experimental_enable_open_telemetry_on_agent=1 - - RAY_experimental_enable_open_telemetry_on_core=1 cluster_compute: distributed.yaml run: @@ -3747,8 +3771,6 @@ type: gpu runtime_env: - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so - - RAY_experimental_enable_open_telemetry_on_agent=1 - - RAY_experimental_enable_open_telemetry_on_core=1 cluster_compute: distributed.yaml run: @@ -3798,8 +3820,6 @@ type: gpu runtime_env: - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so - - RAY_experimental_enable_open_telemetry_on_agent=1 - - RAY_experimental_enable_open_telemetry_on_core=1 cluster_compute: many_nodes.yaml run: @@ -4102,6 +4122,8 @@ variations: - __suffix__: v1 + run: + script: RAY_UP_enable_autoscaler_v2=0 python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 - __suffix__: v2 run: script: RAY_UP_enable_autoscaler_v2=1 python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 @@ -4122,6 +4144,8 @@ variations: - __suffix__: v1 + run: + script: RAY_UP_enable_autoscaler_v2=0 python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override nightly - __suffix__: v2 run: script: RAY_UP_enable_autoscaler_v2=1 python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override nightly @@ -4142,6 +4166,8 @@ variations: - __suffix__: v1 + run: + script: RAY_UP_enable_autoscaler_v2=0 python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override latest - __suffix__: v2 run: script: RAY_UP_enable_autoscaler_v2=1 python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override latest @@ -4162,6 +4188,8 @@ variations: - __suffix__: v1 + run: + script: RAY_UP_enable_autoscaler_v2=0 python launch_and_verify_cluster.py 
aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override commit - __suffix__: v2 run: script: RAY_UP_enable_autoscaler_v2=1 python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override commit @@ -4183,6 +4211,8 @@ variations: - __suffix__: v1 + run: + script: RAY_UP_enable_autoscaler_v2=0 python launch_and_verify_cluster.py aws/example-minimal.yaml - __suffix__: v2 run: script: RAY_UP_enable_autoscaler_v2=1 python launch_and_verify_cluster.py aws/example-minimal.yaml @@ -4203,6 +4233,8 @@ variations: - __suffix__: v1 + run: + script: RAY_UP_enable_autoscaler_v2=0 python launch_and_verify_cluster.py aws/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override latest - __suffix__: v2 run: script: RAY_UP_enable_autoscaler_v2=1 python launch_and_verify_cluster.py aws/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override latest @@ -4226,6 +4258,8 @@ variations: - __suffix__: v1 + run: + script: RAY_UP_enable_autoscaler_v2=0 python launch_and_verify_cluster.py gcp/example-minimal-pinned.yaml - __suffix__: v2 run: script: RAY_UP_enable_autoscaler_v2=1 python launch_and_verify_cluster.py gcp/example-minimal-pinned.yaml @@ -4249,6 +4283,8 @@ variations: - __suffix__: v1 + run: + script: RAY_UP_enable_autoscaler_v2=0 python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 30 --docker-override latest - __suffix__: v2 run: script: RAY_UP_enable_autoscaler_v2=1 python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 30 --docker-override latest @@ -4272,6 +4308,8 @@ variations: - __suffix__: v1 + run: + script: RAY_UP_enable_autoscaler_v2=0 python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override latest - __suffix__: v2 run: script: RAY_UP_enable_autoscaler_v2=1 python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 
--retries 20 --docker-override latest @@ -4295,6 +4333,8 @@ variations: - __suffix__: v1 + run: + script: RAY_UP_enable_autoscaler_v2=0 python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override nightly - __suffix__: v2 run: script: RAY_UP_enable_autoscaler_v2=1 python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override nightly @@ -4318,6 +4358,8 @@ variations: - __suffix__: v1 + run: + script: RAY_UP_enable_autoscaler_v2=0 python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override commit - __suffix__: v2 run: script: RAY_UP_enable_autoscaler_v2=1 python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override commit @@ -4341,6 +4383,8 @@ variations: - __suffix__: v1 + run: + script: RAY_UP_enable_autoscaler_v2=0 python launch_and_verify_cluster.py gcp/example-gpu-docker.yaml - __suffix__: v2 run: script: RAY_UP_enable_autoscaler_v2=1 python launch_and_verify_cluster.py gcp/example-gpu-docker.yaml @@ -4553,6 +4597,26 @@ long_running: false script: python run_llm_serve_test_and_bms.py --serve-config-file configs/serve_llama_3dot1_8b_quantized_tp1_2p6d.yaml --skip-hf-token true +- name: llm_serve_llama_3dot1_8B_quantized_tp1_2p6d_lmcache + frequency: manual # todo(ray-llm): fix this test with new/old lmcache version and new vllm version and re-enable it. 
+ python: "3.11" + group: llm-serve + team: llm + working_dir: llm_tests/serve + + cluster: + byod: + type: llm-cu128 + post_build_script: byod_llm_lmcache_test.sh + cluster_compute: llm_auto_select_worker.yaml + # NOTE: Important for getting the correct secrets + cloud_id: cld_wy5a6nhazplvu32526ams61d98 + project_id: prj_lhlrf1u5yv8qz9qg3xzw8fkiiq + + run: + timeout: 3600 + long_running: false + script: python run_llm_serve_test_and_bms.py --serve-config-file configs/serve_llama_3dot1_8b_quantized_tp1_2p6d_lmcache.yaml --skip-hf-token true ############## # LLM Batch @@ -4597,6 +4661,57 @@ pytest -sv test_batch_sglang.py +- name: text_embeddings_benchmark_{{scaling}} + frequency: nightly + python: "3.11" # necessary for the llm-cu128 image + working_dir: nightly_tests + team: data + group: data-tests + + cluster: + byod: + type: llm-cu128 + cluster_compute: dataset/{{scaling}}_gpu_g6e_2xl_aws.yaml + + matrix: + setup: + scaling: [fixed_size, autoscaling] + + run: + timeout: 3600 + script: > + python dataset/text_embeddings_benchmark.py --embed-concurrency 15 + +# Note: release tests do not support specifying both 'matrix' and 'variations' +# in a test definition, so split off preemptible tests here. +- name: text_embeddings_benchmark_{{scaling}}_preemptible + frequency: nightly + python: "3.11" + working_dir: nightly_tests + team: data + group: data-tests + + cluster: + byod: + type: llm-cu128 + cluster_compute: dataset/{{scaling}}_gpu_g6e_2xl_aws.yaml + + matrix: + setup: + scaling: [fixed_size, autoscaling] + + run: + timeout: 3600 + # Notes: + # - Not using true spot instances. We simulate spot preemption using TerminateEC2InstanceWithGracePeriod to soft-kill the workers. This is so that we can + # control the kill schedule. + # - Batch size is always fixed, so kill schedule is deterministic. 
+ prepare: > + python setup_chaos.py --chaos TerminateEC2InstanceWithGracePeriod + --batch-size-to-kill 5 --max-to-kill 15 --kill-delay 30 --kill-interval 100 + script: > + python dataset/text_embeddings_benchmark.py --chaos-test --embed-concurrency 15 + ####################### # Ray examples tests ####################### @@ -4678,6 +4793,33 @@ cluster: cluster_compute: ci/gce.yaml # relative to working_dir + +- name: deployment_serve_llm # do not use dashes (regex sensitive) + frequency: weekly + python: "3.11" + group: ray-examples + team: ml + working_dir: //doc/source/serve/tutorials/deployment-serve-llm # use // to access from repo's root + + cluster: + byod: + type: llm-cu128 # anyscale/ray-llm:-py311-cu128 + post_build_script: byod_deployment_serve_llm.sh # release/ray_release/byod/ + cluster_compute: ci/aws.yaml # relative to working_dir + + run: + timeout: 3600 + script: bash ci/tests.sh # relative to working_dir + + variations: + - __suffix__: aws # uses default specs above + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_compute: ci/gce.yaml # relative to working_dir + + - name: distributing_pytorch # do not use dashes (regex sensitive) frequency: weekly group: ray-examples diff --git a/release/rllib_tests/2gpus_32cpus.yaml b/release/rllib_tests/2gpus_32cpus.yaml index 02065ef9dc8f..06739cff5739 100644 --- a/release/rllib_tests/2gpus_32cpus.yaml +++ b/release/rllib_tests/2gpus_32cpus.yaml @@ -5,7 +5,7 @@ max_workers: 0 head_node_type: name: head_node - instance_type: g3.8xlarge + instance_type: g4dn.12xlarge worker_node_types: [] diff --git a/release/rllib_tests/2gpus_64cpus.yaml b/release/rllib_tests/2gpus_64cpus.yaml index bd7f534c1fdf..d1a1d0b54dca 100644 --- a/release/rllib_tests/2gpus_64cpus.yaml +++ b/release/rllib_tests/2gpus_64cpus.yaml @@ -5,7 +5,7 @@ max_workers: 1 head_node_type: name: head_node - instance_type: g3.8xlarge + instance_type: g4dn.12xlarge worker_node_types: - name: worker_node diff --git 
a/release/serve_tests/compute_tpl_single_node_16_cpu.yaml b/release/serve_tests/compute_tpl_single_node_16_cpu.yaml new file mode 100644 index 000000000000..d4684d799118 --- /dev/null +++ b/release/serve_tests/compute_tpl_single_node_16_cpu.yaml @@ -0,0 +1,18 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +max_workers: 0 + +head_node_type: + name: head_node + # 16 cpus, x86_64 (AMD), 64G mem, 12.5Gb NIC + instance_type: m7a.4xlarge + +worker_node_types: [] + +advanced_configurations_json: + TagSpecifications: + - ResourceType: "instance" + Tags: + - Key: ttl-hours + Value: '24' diff --git a/release/serve_tests/workloads/locust_utils.py b/release/serve_tests/workloads/locust_utils.py index 59c65e4e7e5c..6242b5e1dc0a 100644 --- a/release/serve_tests/workloads/locust_utils.py +++ b/release/serve_tests/workloads/locust_utils.py @@ -1,26 +1,23 @@ from dataclasses import asdict, dataclass -from itertools import chain +import os +import sys +import subprocess import json import logging -import time -from tqdm import tqdm -from typing import Any, Dict, List +from typing import Any, List import ray -from ray.serve._private.utils import generate_request_id +from ray.serve._private.benchmarks.locust_utils import ( + LocustStage, + LocustTestResults, + PerformanceStats, +) logger = logging.getLogger(__file__) logging.basicConfig(level=logging.INFO) -@dataclass -class LocustStage: - duration_s: int - users: int - spawn_rate: float - - @dataclass class LocustLoadTestConfig: num_workers: int @@ -31,247 +28,116 @@ class LocustLoadTestConfig: wait_for_workers_timeout_s: float = 600 -@dataclass -class PerformanceStats: - p50_latency: float - p90_latency: float - p99_latency: float - rps: float - - -@dataclass -class LocustTestResults: - history: List[Dict] - total_requests: int - num_failures: int - avg_latency: float - p50_latency: float - p90_latency: float - p99_latency: float - avg_rps: float - stats_in_stages: List[PerformanceStats] - - -@dataclass -class FailedRequest: 
- request_id: str - status_code: int - exception: str - response_time_ms: float - start_time_s: float - - -class LocustClient: - def __init__( - self, - host_url: str, - token: str, - data: Dict[str, Any] = None, - ): - from locust import task, constant, events, FastHttpUser - from locust.contrib.fasthttp import FastResponse - - self.errors = [] - - class EndpointUser(FastHttpUser): - wait_time = constant(0) - failed_requests = [] - host = host_url - - @task - def test(self): - request_id = generate_request_id() - headers = ( - {"Authorization": f"Bearer {token}", "X-Request-ID": request_id} - if token - else None - ) - with self.client.get( - "", headers=headers, json=data, catch_response=True - ) as r: - r.request_meta["context"]["request_id"] = request_id - - @events.request.add_listener - def on_request( - response: FastResponse, - exception, - context, - start_time: float, - response_time: float, - **kwargs, - ): - if exception: - request_id = context["request_id"] - response.encoding = "utf-8" - err = FailedRequest( - request_id=request_id, - status_code=response.status_code, - exception=response.text, - response_time_ms=response_time, - start_time_s=start_time, - ) - self.errors.append(err) - print( - f"Request '{request_id}' failed with exception: {response.text}" - ) - - self.user_class = EndpointUser - - -@ray.remote(num_cpus=1) -class LocustWorker(LocustClient): - def __init__( - self, - host_url: str, - token: str, - master_address: str, - data: Dict[str, Any] = None, - ): - # NOTE(zcin): We need to lazily import locust because the driver - # script won't connect to ray properly otherwise. 
- import locust - from locust.env import Environment - from locust.log import setup_logging - - super().__init__(host_url=host_url, token=token, data=data) - setup_logging("INFO") - self.env = Environment(user_classes=[self.user_class], events=locust.events) - self.master_address = master_address - - def run(self) -> List[Dict]: - runner = self.env.create_worker_runner( - master_host=self.master_address, master_port=5557 - ) - runner.greenlet.join() - return self.errors - - @ray.remote(num_cpus=1) -class LocustMaster(LocustClient): +class LocustProcess: def __init__( self, + worker_type: str, host_url: str, token: str, - expected_num_workers: int, - stages: List[LocustStage], - wait_for_workers_timeout_s: float, + expected_num_workers: int = None, + stages: List[LocustStage] = None, + wait_for_workers_timeout_s: float = None, + data: Any = None, + master_address: str = None, ): - # NOTE(zcin): We need to lazily import locust because the driver - # script won't connect to ray properly otherwise. 
- import locust - from locust import LoadTestShape - from locust.env import Environment - from locust.log import setup_logging - - super().__init__(host_url=host_url, token=token) - setup_logging("INFO") - - self.stats_in_stages: List[PerformanceStats] = [] - - class StagesShape(LoadTestShape): - curr_stage_ix = 0 - - def tick(cls): - run_time = cls.get_run_time() - prefix_time = 0 - for i, stage in enumerate(stages): - prefix_time += stage.duration_s - - if run_time < prefix_time: - if i != cls.curr_stage_ix: - self.on_stage_finished() - cls.curr_stage_ix = i - - current_stage = stages[cls.curr_stage_ix] - return current_stage.users, current_stage.spawn_rate - - # End of stage test - self.on_stage_finished() - - self.master_env = Environment( - user_classes=[self.user_class], - shape_class=StagesShape(), - events=locust.events, - ) + self.worker_type = worker_type + self.host_url = host_url + self.token = token self.expected_num_workers = expected_num_workers + self.stages = stages self.wait_for_workers_timeout_s = wait_for_workers_timeout_s - self.master_runner = None + self.data = data + self.master_address = master_address - def on_stage_finished(self): - stats_entry_key = ("", "GET") - stats_entry = self.master_runner.stats.entries.get(stats_entry_key) + def run(self): + # Create a temporary file for results + import tempfile - self.stats_in_stages.append( - PerformanceStats( - p50_latency=stats_entry.get_current_response_time_percentile(0.5), - p90_latency=stats_entry.get_current_response_time_percentile(0.9), - p99_latency=stats_entry.get_current_response_time_percentile(0.99), - rps=stats_entry.current_rps, - ) + results_file = tempfile.NamedTemporaryFile( + mode="w", delete=False, suffix=".json" ) + results_file.close() - def run(self): - import gevent - from locust.stats import ( - get_stats_summary, - get_percentile_stats_summary, - get_error_report_summary, - stats_history, - stats_printer, + # Prepare the subprocess script + if self.worker_type == 
"master": + script = f""" +import sys +import json +from ray.serve._private.benchmarks.locust_utils import run_locust_master, run_locust_worker, LocustStage + +stages = json.loads(sys.argv[1]) +stages = [LocustStage(**stage) for stage in stages] +results = run_locust_master( + host_url="{self.host_url}", + token="{self.token}", + expected_num_workers={self.expected_num_workers}, + stages=stages, + wait_for_workers_timeout_s={self.wait_for_workers_timeout_s} +) + +with open("{results_file.name}", 'w') as f: + json.dump(results, f) +""" + stages = json.dumps([asdict(stage) for stage in self.stages]) + cmd_args = [sys.executable, "-c", script, stages] + else: + script = f""" +import sys +import json +from ray.serve._private.benchmarks.locust_utils import run_locust_master, run_locust_worker, LocustStage + +data = sys.argv[1] +results = run_locust_worker( + master_address="{self.master_address}", + host_url="{self.host_url}", + token="{self.token}", + data=data, +) +""" + data = json.dumps(self.data) + cmd_args = [sys.executable, "-c", script, data] + + # Start the Locust process + self.process = subprocess.Popen( + cmd_args, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, ) - - self.master_runner = self.master_env.create_master_runner("*", 5557) - - start = time.time() - while len(self.master_runner.clients.ready) < self.expected_num_workers: - if time.time() - start > self.wait_for_workers_timeout_s: - raise RuntimeError( - f"Timed out waiting for {self.expected_num_workers} workers to " - "connect to Locust master." 
+ print(f"Started {self.worker_type} subprocess ({self.process.pid})") + + try: + # Wait for the process to complete first + for line in self.process.stdout: # yields as the child prints + sys.stdout.write(line) # stream to our stdout + + return_code = self.process.wait() + if return_code != 0: + # Clean up the results file on error + try: + os.unlink(results_file.name) + except OSError: + pass + raise RuntimeError(f"Subprocess failed with return code {return_code}.") + + # Read the result from the results file + with open(results_file.name, "r") as f: + result_data = f.read() + + if result_data: + result_data = json.loads(result_data) + stats_in_stages = [ + PerformanceStats(**stage) + for stage in result_data.pop("stats_in_stages") + ] + result = LocustTestResults( + **result_data, stats_in_stages=stats_in_stages ) - - print( - f"Waiting for workers to be ready, " - f"{len(self.master_runner.clients.ready)} " - f"of {self.expected_num_workers} ready." - ) - time.sleep(1) - - # Periodically output current stats (each entry is aggregated - # stats over the past 10 seconds, by default) - gevent.spawn(stats_printer(self.master_env.stats)) - gevent.spawn(stats_history, self.master_runner) - - # Start test & wait for the shape test to finish - self.master_runner.start_shape() - self.master_runner.shape_greenlet.join() - # Send quit signal to all locust workers - self.master_runner.quit() - - # Print stats - for line in get_stats_summary(self.master_runner.stats, current=False): - print(line) - # Print percentile stats - for line in get_percentile_stats_summary(self.master_runner.stats): - print(line) - # Print error report - if self.master_runner.stats.errors: - for line in get_error_report_summary(self.master_runner.stats): - print(line) - - stats_entry_key = ("", "GET") - stats_entry = self.master_runner.stats.entries.get(stats_entry_key) - return LocustTestResults( - history=self.master_runner.stats.history, - total_requests=self.master_runner.stats.num_requests, - 
num_failures=self.master_runner.stats.num_failures, - avg_latency=stats_entry.avg_response_time, - p50_latency=stats_entry.get_response_time_percentile(0.5), - p90_latency=stats_entry.get_response_time_percentile(0.9), - p99_latency=stats_entry.get_response_time_percentile(0.99), - avg_rps=stats_entry.total_rps, - stats_in_stages=self.stats_in_stages, - ) + return result + finally: + os.unlink(results_file.name) def run_locust_load_test(config: LocustLoadTestConfig) -> LocustTestResults: @@ -288,17 +154,20 @@ def run_locust_load_test(config: LocustLoadTestConfig) -> LocustTestResults: worker_refs = [] # Start Locust workers - for _ in tqdm(range(config.num_workers)): - locust_worker = LocustWorker.remote( + for i in range(config.num_workers): + locust_worker = LocustProcess.options(name=f"LocustWorker-{i}").remote( + worker_type="worker", host_url=config.host_url, token=config.auth_token, master_address=master_address, data=config.data, ) worker_refs.append(locust_worker.run.remote()) + print(f"Started worker {i}") # Start Locust master - master_worker = LocustMaster.remote( + master_worker = LocustProcess.options(name="LocustMaster").remote( + worker_type="master", host_url=config.host_url, token=config.auth_token, expected_num_workers=config.num_workers, @@ -309,13 +178,19 @@ def run_locust_load_test(config: LocustLoadTestConfig) -> LocustTestResults: # Collect results and metrics stats: LocustTestResults = ray.get(master_ref) - errors = sorted(chain(*ray.get(worker_refs)), key=lambda e: e.start_time_s) + ray.get(worker_refs) + return stats - # If there were any requests that failed, raise error. 
- if stats.num_failures > 0: - errors_json = [asdict(err) for err in errors] - raise RuntimeError( - f"There were failed requests: {json.dumps(errors_json, indent=4)}" - ) - return stats +if __name__ == "__main__": + ray.init(address="auto") + results = run_locust_load_test( + LocustLoadTestConfig( + num_workers=9, + host_url="https://services-canary-pinger-aws-zugs7.cld-kvedzwag2qa8i5bj.s.anyscaleuserdata.com/info", + auth_token="v9M8jb3tBbHOGoWrg7X1fCwF8wYn7gqZR5VZ1_h4t50", + data=None, + stages=[LocustStage(duration_s=10, users=10, spawn_rate=1)], + ) + ) + print(results) diff --git a/release/serve_tests/workloads/microbenchmarks.py b/release/serve_tests/workloads/microbenchmarks.py index 6ccde24344b9..d75a204f14cf 100644 --- a/release/serve_tests/workloads/microbenchmarks.py +++ b/release/serve_tests/workloads/microbenchmarks.py @@ -26,6 +26,9 @@ do_single_http_batch, generate_payload, Noop, + ModelComp, + GrpcDeployment, + GrpcModelComp, IntermediateRouter, run_latency_benchmark, run_throughput_benchmark, @@ -60,18 +63,6 @@ STREAMING_NUM_TRIALS = 10 -@serve.deployment -class GrpcDeployment: - def __init__(self): - logging.getLogger("ray.serve").setLevel(logging.WARNING) - - async def grpc_call(self, user_message): - return serve_pb2.ModelOutput(output=9) - - async def call_with_string(self, user_message): - return serve_pb2.ModelOutput(output=9) - - def convert_throughput_to_perf_metrics( name: str, mean: float, @@ -133,6 +124,7 @@ async def _main( run_throughput: bool, run_streaming: bool, throughput_max_ongoing_requests: List[int], + concurrencies: List[int], ): perf_metrics = [] payload_1mb = generate_payload(1000000) @@ -153,26 +145,39 @@ async def _main( num_requests=NUM_REQUESTS, ) perf_metrics.extend(convert_latencies_to_perf_metrics(name, latencies)) - serve.shutdown() + await serve.shutdown_async() if run_throughput: # Microbenchmark: HTTP throughput - for max_ongoing_requests in throughput_max_ongoing_requests: - serve.run( - 
Noop.options(max_ongoing_requests=max_ongoing_requests).bind() - ) - url = get_application_url(use_localhost=True) - mean, std, _ = await run_throughput_benchmark( - fn=partial(do_single_http_batch, batch_size=BATCH_SIZE, url=url), - multiplier=BATCH_SIZE, - num_trials=NUM_TRIALS, - trial_runtime=TRIAL_RUNTIME_S, - ) - test_name = get_throughput_test_name("http", max_ongoing_requests) - perf_metrics.extend( - convert_throughput_to_perf_metrics(test_name, mean, std) - ) - serve.shutdown() + for max_ongoing_requests, concurrency in zip( + throughput_max_ongoing_requests, concurrencies + ): + workloads = { + "http": Noop.options( + max_ongoing_requests=max_ongoing_requests + ).bind(), + "http_model_comp": ModelComp.options( + max_ongoing_requests=max_ongoing_requests + ).bind( + Noop.options(max_ongoing_requests=max_ongoing_requests).bind() + ), + } + for name, app in workloads.items(): + serve.run(app) + url = get_application_url(use_localhost=True) + mean, std, _ = await run_throughput_benchmark( + fn=partial( + do_single_http_batch, batch_size=concurrency, url=url + ), + multiplier=concurrency, + num_trials=NUM_TRIALS, + trial_runtime=TRIAL_RUNTIME_S, + ) + test_name = get_throughput_test_name(name, max_ongoing_requests) + perf_metrics.extend( + convert_throughput_to_perf_metrics(test_name, mean, std) + ) + await serve.shutdown_async() if run_streaming: # Direct streaming between replica @@ -209,7 +214,7 @@ async def _main( perf_metrics.extend( convert_latencies_to_perf_metrics("http_streaming", latencies) ) - serve.shutdown() + await serve.shutdown_async() # Streaming with intermediate router serve.run( @@ -243,7 +248,7 @@ async def _main( "http_intermediate_streaming", latencies ) ) - serve.shutdown() + await serve.shutdown_async() # GRPC if run_grpc: @@ -275,33 +280,42 @@ async def _main( num_requests=NUM_REQUESTS, ) perf_metrics.extend(convert_latencies_to_perf_metrics(name, latencies)) - serve.shutdown() + await serve.shutdown_async() if run_throughput: # 
Microbenchmark: GRPC throughput - for max_ongoing_requests in throughput_max_ongoing_requests: - serve.start(grpc_options=serve_grpc_options) - serve.run( - GrpcDeployment.options( + for max_ongoing_requests, concurrency in zip( + throughput_max_ongoing_requests, concurrencies + ): + workloads = { + "grpc": GrpcDeployment.options( max_ongoing_requests=max_ongoing_requests - ).bind() - ) - target = get_application_url( - protocol=RequestProtocol.GRPC, use_localhost=True - ) - mean, std, _ = await run_throughput_benchmark( - fn=partial( - do_single_grpc_batch, batch_size=BATCH_SIZE, target=target + ).bind(), + "grpc_model_comp": GrpcModelComp.options( + max_ongoing_requests=max_ongoing_requests + ).bind( + Noop.options(max_ongoing_requests=max_ongoing_requests).bind() ), - multiplier=BATCH_SIZE, - num_trials=NUM_TRIALS, - trial_runtime=TRIAL_RUNTIME_S, - ) - test_name = get_throughput_test_name("grpc", max_ongoing_requests) - perf_metrics.extend( - convert_throughput_to_perf_metrics(test_name, mean, std) - ) - serve.shutdown() + } + for name, app in workloads.items(): + serve.start(grpc_options=serve_grpc_options) + serve.run(app) + target = get_application_url( + protocol=RequestProtocol.GRPC, use_localhost=True + ) + mean, std, _ = await run_throughput_benchmark( + fn=partial( + do_single_grpc_batch, batch_size=concurrency, target=target + ), + multiplier=concurrency, + num_trials=NUM_TRIALS, + trial_runtime=TRIAL_RUNTIME_S, + ) + test_name = get_throughput_test_name(name, max_ongoing_requests) + perf_metrics.extend( + convert_throughput_to_perf_metrics(test_name, mean, std) + ) + await serve.shutdown_async() # Handle if run_handle: @@ -316,26 +330,44 @@ async def _main( num_requests=NUM_REQUESTS, payload=payload ) perf_metrics.extend(convert_latencies_to_perf_metrics(name, latencies)) - serve.shutdown() + await serve.shutdown_async() if run_throughput: # Microbenchmark: Handle throughput - for max_ongoing_requests in throughput_max_ongoing_requests: - h: 
DeploymentHandle = serve.run( - Benchmarker.options(max_ongoing_requests=max_ongoing_requests).bind( + for max_ongoing_requests, concurrency in zip( + throughput_max_ongoing_requests, concurrencies + ): + workloads = { + "handle": Benchmarker.options( + max_ongoing_requests=max_ongoing_requests + ).bind( Noop.options(max_ongoing_requests=max_ongoing_requests).bind() + ), + "handle_model_comp": Benchmarker.options( + max_ongoing_requests=max_ongoing_requests + ).bind( + ModelComp.options( + max_ongoing_requests=max_ongoing_requests + ).bind( + Noop.options( + max_ongoing_requests=max_ongoing_requests + ).bind() + ) + ), + } + for name, app in workloads.items(): + h: DeploymentHandle = serve.run(app) + + mean, std, _ = await h.run_throughput_benchmark.remote( + batch_size=concurrency, + num_trials=NUM_TRIALS, + trial_runtime=TRIAL_RUNTIME_S, ) - ) - mean, std, _ = await h.run_throughput_benchmark.remote( - batch_size=BATCH_SIZE, - num_trials=NUM_TRIALS, - trial_runtime=TRIAL_RUNTIME_S, - ) - test_name = get_throughput_test_name("handle", max_ongoing_requests) - perf_metrics.extend( - convert_throughput_to_perf_metrics(test_name, mean, std) - ) - serve.shutdown() + test_name = get_throughput_test_name(name, max_ongoing_requests) + perf_metrics.extend( + convert_throughput_to_perf_metrics(test_name, mean, std) + ) + await serve.shutdown_async() if run_streaming: h: DeploymentHandle = serve.run( @@ -362,7 +394,7 @@ async def _main( perf_metrics.extend( convert_latencies_to_perf_metrics("handle_streaming", latencies) ) - serve.shutdown() + await serve.shutdown_async() logging.info(f"Perf metrics:\n {json.dumps(perf_metrics, indent=4)}") results = {"perf_metrics": perf_metrics} @@ -383,8 +415,16 @@ async def _main( "-t", multiple=True, type=int, - default=[5, 100], - help="Max ongoing requests for throughput benchmarks. Default: [5, 100]", + default=[5, 100, 800], + help="Max ongoing requests for throughput benchmarks. Must be in the same order as --concurrencies. 
Default: [5, 100, 800]", +) +@click.option( + "--concurrencies", + "-c", + multiple=True, + type=int, + default=[100, 100, 800], + help="User concurrency for throughput benchmarks. Must be in the same order as --throughput-max-ongoing-requests. Default: [100, 100, 800]", ) def main( output_path: Optional[str], @@ -396,7 +436,12 @@ def main( run_throughput: bool, run_streaming: bool, throughput_max_ongoing_requests: List[int], + concurrencies: List[int], ): + assert len(throughput_max_ongoing_requests) == len( + concurrencies + ), "Must have the same number of --throughput-max-ongoing-requests and --concurrencies" + # If none of the flags are set, default to run all if not ( run_http @@ -426,6 +471,7 @@ def main( run_throughput, run_streaming, throughput_max_ongoing_requests, + concurrencies, ) ) diff --git a/release/train_tests/benchmark/config.py b/release/train_tests/benchmark/config.py index 03f55de0ce6d..b0686d8c4d23 100644 --- a/release/train_tests/benchmark/config.py +++ b/release/train_tests/benchmark/config.py @@ -83,6 +83,9 @@ class BenchmarkConfig(BaseModel): num_epochs: int = 1 skip_train_step: bool = False + # Checkpointing + checkpoint_every_n_steps: int = -1 + # Validation validate_every_n_steps: int = -1 skip_validation_step: bool = False @@ -109,11 +112,11 @@ def _add_field_to_parser(parser: argparse.ArgumentParser, field: str, field_info parser.add_argument(f"--{field}", type=field_type, default=field_info.default) -def cli_to_config() -> BenchmarkConfig: +def cli_to_config(benchmark_config_cls=BenchmarkConfig) -> BenchmarkConfig: parser = argparse.ArgumentParser() nested_fields = [] - for field, field_info in BenchmarkConfig.model_fields.items(): + for field, field_info in benchmark_config_cls.model_fields.items(): # Skip nested configs for now if _is_pydantic_model(field_info.annotation): nested_fields.append(field) @@ -127,24 +130,24 @@ def cli_to_config() -> BenchmarkConfig: nested_configs = {} for nested_field in nested_fields: nested_parser = 
argparse.ArgumentParser() - config_cls = BenchmarkConfig.model_fields[nested_field].annotation + nested_config_cls = benchmark_config_cls.model_fields[nested_field].annotation - if config_cls == DataLoaderConfig: + if nested_config_cls == DataLoaderConfig: if top_level_args.dataloader_type == DataloaderType.RAY_DATA: - config_cls = RayDataConfig + nested_config_cls = RayDataConfig elif top_level_args.dataloader_type == DataloaderType.TORCH: - config_cls = TorchConfig + nested_config_cls = TorchConfig - if config_cls == TaskConfig: + if nested_config_cls == TaskConfig: if top_level_args.task == ImageClassificationConfig.TASK_NAME: - config_cls = ImageClassificationConfig + nested_config_cls = ImageClassificationConfig elif top_level_args.task == RecsysConfig.TASK_NAME: - config_cls = RecsysConfig + nested_config_cls = RecsysConfig - for field, field_info in config_cls.model_fields.items(): + for field, field_info in nested_config_cls.model_fields.items(): _add_field_to_parser(nested_parser, field, field_info) args, _ = nested_parser.parse_known_args() - nested_configs[nested_field] = config_cls(**vars(args)) + nested_configs[nested_field] = nested_config_cls(**vars(args)) - return BenchmarkConfig(**vars(top_level_args), **nested_configs) + return benchmark_config_cls(**vars(top_level_args), **nested_configs) diff --git a/release/train_tests/benchmark/runner.py b/release/train_tests/benchmark/runner.py index 5842059fae6a..5c87f41674bf 100644 --- a/release/train_tests/benchmark/runner.py +++ b/release/train_tests/benchmark/runner.py @@ -32,7 +32,7 @@ def __init__(self, factory: BenchmarkFactory): # Training progress state. 
self._train_batch_idx: int = 0 self._train_epoch_idx: int = 0 - self._restored_train_batch_idx: Optional[int] = None + self._global_rows_processed_this_epoch: int = 0 # Performance metrics self._metrics = collections.defaultdict(lambda: Timer()) @@ -121,6 +121,17 @@ def dataloader_with_timers(): return dataloader_with_timers() + @property + def _num_batches_to_skip(self) -> int: + """Calculate the number of batches to skip based on the number of rows already processed in this epoch.""" + + global_batch_size = ( + self.benchmark_config.dataloader_config.train_batch_size + * ray.train.get_context().get_world_size() + ) + + return self._global_rows_processed_this_epoch // global_batch_size + def _train_epoch(self): """Subclasses can override the entrire `_train_epoch` method for more training logic customization.""" @@ -132,11 +143,11 @@ def _train_epoch(self): # Skip through batches if we restored to a middle of the epoch. # TODO: Compare this baseline to the data checkpointing approach once we have it. 
- if self._restored_train_batch_idx is not None: + if self._num_batches_to_skip: if ray.train.get_context().get_world_rank() == 0: - logger.info(f"Skipping {self._restored_train_batch_idx + 1} batches...") + logger.info(f"Skipping {self._num_batches_to_skip} batches...") - for _ in range(self._restored_train_batch_idx + 1): + for _ in range(self._num_batches_to_skip): with self._metrics["train/iter_skip_batch"].timer(): next(train_dataloader) @@ -146,18 +157,27 @@ def _train_epoch(self): self._train_step(batch) # TODO: This is slightly off if the last batch is a partial batch (if drop_last=False) - self._metrics["train/rows_processed"].add( + global_batch_size = ( self.benchmark_config.dataloader_config.train_batch_size + * ray.train.get_context().get_world_size() ) + self._metrics["train/rows_processed"].add(global_batch_size) + + self._global_rows_processed_this_epoch += global_batch_size + + if self._should_checkpoint_during_epoch(): + self._checkpoint() if self._should_validate_during_epoch(): - self._validate_and_checkpoint() + validation_metrics = self._validate() + self._checkpoint(validation_metrics) if self._should_log_metrics(): logger.info(pprint.pformat(self.get_metrics(), indent=2)) self._train_epoch_idx += 1 self._train_batch_idx = 0 + self._global_rows_processed_this_epoch = 0 def _validate_epoch(self) -> Dict[str, float]: if ray.train.get_context().get_world_rank() == 0: @@ -181,9 +201,18 @@ def _validate_epoch(self) -> Dict[str, float]: self._metrics["validation/rows_processed"].add( self.benchmark_config.dataloader_config.validation_batch_size ) + assert num_rows > 0, "Validation dataset yielded no batches." 
return {"validation/loss": total_loss.item() / num_rows} + def _should_checkpoint_during_epoch(self) -> bool: + """Handles the checkpoint_every_n_steps logic.""" + return ( + self.benchmark_config.checkpoint_every_n_steps > 0 + and self._train_batch_idx % self.benchmark_config.checkpoint_every_n_steps + == 0 + ) + def _should_validate_during_epoch(self) -> bool: """Handles the validate_every_n_steps logic.""" return ( @@ -200,10 +229,12 @@ def _should_log_metrics(self) -> bool: == 0 ) - def _validate_and_checkpoint(self): + def _validate(self) -> Dict[str, float]: with self._metrics["validation/epoch"].timer(): validation_metrics = self._validate_epoch() + return validation_metrics + def _checkpoint(self, metrics: Optional[Dict[str, float]] = None): with tempfile.TemporaryDirectory( dir="/mnt/local_storage" ) as temp_checkpoint_dir: @@ -212,7 +243,7 @@ def _validate_and_checkpoint(self): with self._metrics["checkpoint/report"].timer(): self._report_checkpoint( - metrics=validation_metrics, + metrics=metrics or {}, checkpoint=ray.train.Checkpoint.from_directory(temp_checkpoint_dir), ) @@ -221,7 +252,10 @@ def _load_checkpoint(self, local_dir: str): run_state = torch.load(os.path.join(local_dir, "run_state.pt")) self._train_epoch_idx = run_state["epoch"] - self._restored_train_batch_idx = run_state["batch_idx"] + self._train_batch_idx = run_state["batch_idx"] + self._global_rows_processed_this_epoch = run_state[ + "global_rows_processed_this_epoch" + ] with open(os.path.join(local_dir, "metrics.json"), "r") as f: metrics_json = json.load(f) @@ -232,7 +266,7 @@ def _load_checkpoint(self, local_dir: str): if ray.train.get_context().get_world_rank() == 0: logger.info( f"Restored to epoch={self._train_epoch_idx}, " - f"train_batch_idx={self._restored_train_batch_idx} from checkpoint: " + f"train_batch_idx={self._train_batch_idx} from checkpoint: " f"{ray.train.get_checkpoint()}" ) @@ -248,6 +282,7 @@ def _save_checkpoint(self, local_dir: str): run_state = { "epoch": 
self._train_epoch_idx, "batch_idx": self._train_batch_idx, + "global_rows_processed_this_epoch": self._global_rows_processed_this_epoch, } torch.save(run_state, os.path.join(local_dir, "run_state.pt")) @@ -279,7 +314,8 @@ def run(self): self._train_epoch() if not self.benchmark_config.skip_validation_at_epoch_end: - self._validate_and_checkpoint() + validation_metrics = self._validate() + self._checkpoint(validation_metrics) if ray.train.get_context().get_world_rank() == 0: logger.info(pprint.pformat(self.get_metrics(), indent=2)) @@ -304,7 +340,6 @@ def get_metrics(self, dataset_creation_time: float = 0.0) -> Dict[str, float]: # Throughput # TODO: Ray Data can provide these throughput metrics automatically. - num_workers = ray.train.get_context().get_world_size() train_time = ( metrics["train/dataset_creation_time"] + self._metrics["train/step"].get() @@ -313,11 +348,8 @@ def get_metrics(self, dataset_creation_time: float = 0.0) -> Dict[str, float]: + self._metrics["train/iter_batch"].get() ) if train_time > 0: - metrics["train/local_throughput"] = ( - self._metrics["train/rows_processed"].get() / train_time - ) metrics["train/global_throughput"] = ( - metrics["train/local_throughput"] * num_workers + self._metrics["train/rows_processed"].get() / train_time ) validation_time = ( @@ -328,11 +360,8 @@ def get_metrics(self, dataset_creation_time: float = 0.0) -> Dict[str, float]: + self._metrics["validation/iter_batch"].get() ) if validation_time > 0: - metrics["validation/local_throughput"] = ( - self._metrics["validation/rows_processed"].get() / validation_time - ) metrics["validation/global_throughput"] = ( - metrics["validation/local_throughput"] * num_workers + self._metrics["validation/rows_processed"].get() / validation_time ) # Extra time that each worker spends to restore from checkpoint, diff --git a/release/train_tests/benchmark/torch_dataloader_factory.py b/release/train_tests/benchmark/torch_dataloader_factory.py index a4fd4d9e868c..733c15d497ca 100644 
--- a/release/train_tests/benchmark/torch_dataloader_factory.py +++ b/release/train_tests/benchmark/torch_dataloader_factory.py @@ -1,7 +1,6 @@ from typing import Dict, Iterator, Tuple import logging from abc import ABC, abstractmethod -import sys import multiprocessing import torch @@ -100,26 +99,20 @@ def get_iterable_datasets(self) -> Dict[str, IterableDataset]: def _create_multiprocessing_context(self): # Importing libs in torch dataloader worker subprocesses is very slow. - # Preload all imported modules to speed up subprocess forking. - imported_modules = list(sys.modules.keys()) + # Preload some modules to speed up subprocess forking. ctx = multiprocessing.get_context("forkserver") - ctx.set_forkserver_preload(imported_modules) + modules = ["torch", "torchvision", "pandas", "numpy", "boto3", "fsspec"] + ctx.set_forkserver_preload(modules) return ctx - def get_train_dataloader(self) -> Iterator[Tuple[torch.Tensor, torch.Tensor]]: - """Create a DataLoader for training data. - - Returns: - An iterator that yields (image, label) tensors for training - """ + def _create_dataloader(self, dataset_key: DatasetKey, batch_size: int): worker_rank = ray.train.get_context().get_world_rank() - logger.info(f"Worker {worker_rank}: Creating train dataloader") - dataloader_config = self.get_dataloader_config() - device = self._get_device() # Create dataset and dataloader - train_ds = self.get_iterable_datasets()[DatasetKey.TRAIN] + ds = self.get_iterable_datasets()[dataset_key] + + device = self._get_device() # Adjust worker settings for 0 workers case num_workers = max(0, self.num_torch_workers) @@ -134,7 +127,6 @@ def get_train_dataloader(self) -> Iterator[Tuple[torch.Tensor, torch.Tensor]]: timeout = ( dataloader_config.torch_dataloader_timeout_seconds if num_workers > 0 else 0 ) - batch_size = dataloader_config.train_batch_size logger.info( f"Worker {worker_rank}: Creating train DataLoader with " @@ -143,17 +135,22 @@ def get_train_dataloader(self) -> 
Iterator[Tuple[torch.Tensor, torch.Tensor]]: f"timeout={timeout}, batch_size={batch_size}" ) + multiprocessing_args = {} + if num_workers > 0: + multiprocessing_args = dict( + multiprocessing_context=self._create_multiprocessing_context(), + worker_init_fn=self.worker_init_fn, + persistent_workers=persistent_workers, + ) dataloader = torch.utils.data.DataLoader( - dataset=train_ds, + dataset=ds, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory, - persistent_workers=persistent_workers, prefetch_factor=prefetch_factor, timeout=timeout, - drop_last=True, - worker_init_fn=self.worker_init_fn if num_workers > 0 else None, - multiprocessing_context=self._create_multiprocessing_context(), + drop_last=False, + **multiprocessing_args, ) # Add a DistributedSampler to the dataloader if possible (map-style datasets) dataloader = ray.train.torch.prepare_data_loader( @@ -162,6 +159,19 @@ def get_train_dataloader(self) -> Iterator[Tuple[torch.Tensor, torch.Tensor]]: return self.create_batch_iterator(dataloader, device) + def get_train_dataloader(self) -> Iterator[Tuple[torch.Tensor, torch.Tensor]]: + """Create a DataLoader for training data. + + Returns: + An iterator that yields (image, label) tensors for training + """ + worker_rank = ray.train.get_context().get_world_rank() + logger.info(f"Worker {worker_rank}: Creating train dataloader") + + return self._create_dataloader( + DatasetKey.TRAIN, self.get_dataloader_config().train_batch_size + ) + def get_val_dataloader(self) -> Iterator[Tuple[torch.Tensor, torch.Tensor]]: """Create a DataLoader for validation data. 
@@ -171,49 +181,6 @@ def get_val_dataloader(self) -> Iterator[Tuple[torch.Tensor, torch.Tensor]]: worker_rank = ray.train.get_context().get_world_rank() logger.info(f"Worker {worker_rank}: Creating validation dataloader") - dataloader_config = self.get_dataloader_config() - device = self._get_device() - - # Create dataset and dataloader with row limits - val_ds = self.get_iterable_datasets()[DatasetKey.VALID] - - # Adjust worker settings for 0 workers case - num_workers = max(0, self.num_torch_workers) - persistent_workers = num_workers > 0 - pin_memory = ( - dataloader_config.torch_pin_memory and torch.cuda.is_available() - ) # Use config setting - - if dataloader_config.torch_prefetch_factor >= 0: - prefetch_factor = dataloader_config.torch_prefetch_factor - else: - prefetch_factor = None - - timeout = ( - dataloader_config.torch_dataloader_timeout_seconds if num_workers > 0 else 0 - ) - batch_size = dataloader_config.validation_batch_size - - logger.info( - f"Worker {worker_rank}: Creating validation DataLoader with " - f"num_workers={num_workers}, pin_memory={pin_memory}, " - f"persistent_workers={persistent_workers}, prefetch_factor={prefetch_factor}, " - f"timeout={timeout}, batch_size={batch_size}" - ) - - dataloader = torch.utils.data.DataLoader( - dataset=val_ds, - batch_size=batch_size, - num_workers=num_workers, - pin_memory=pin_memory, - persistent_workers=persistent_workers, - prefetch_factor=prefetch_factor, - timeout=timeout, - drop_last=False, - worker_init_fn=self.worker_init_fn if num_workers > 0 else None, - multiprocessing_context=self._create_multiprocessing_context(), + return self._create_dataloader( + DatasetKey.VALID, self.get_dataloader_config().validation_batch_size ) - dataloader = ray.train.torch.prepare_data_loader( - dataloader, move_to_device=False - ) - return self.create_batch_iterator(dataloader, device) diff --git a/release/train_tests/benchmark/train_benchmark.py b/release/train_tests/benchmark/train_benchmark.py index 
b2650baf985b..f2378a19b32b 100644 --- a/release/train_tests/benchmark/train_benchmark.py +++ b/release/train_tests/benchmark/train_benchmark.py @@ -34,12 +34,26 @@ def train_fn_per_worker(config): runner.run() - metrics = runner.get_metrics(dataset_creation_time=config["dataset_creation_time"]) + metrics = runner.get_metrics( + dataset_creation_time=config.get("dataset_creation_time", 0) + ) if ray.train.get_context().get_world_rank() == 0: with open(METRICS_OUTPUT_PATH, "w") as f: json.dump(metrics, f) +def get_datasets_and_data_config(factory: BenchmarkFactory): + dataloader_factory = factory.get_dataloader_factory() + if isinstance(dataloader_factory, RayDataLoaderFactory): + datasets = dataloader_factory.get_ray_datasets() + data_config = dataloader_factory.get_ray_data_config() + else: + datasets = {} + data_config = None + + return datasets, data_config + + def main(): start_time = time.perf_counter() logging.basicConfig(level=logging.INFO) @@ -60,13 +74,7 @@ def main(): else: raise ValueError(f"Unknown task: {benchmark_config.task}") - dataloader_factory = factory.get_dataloader_factory() - if isinstance(dataloader_factory, RayDataLoaderFactory): - datasets = dataloader_factory.get_ray_datasets() - data_config = dataloader_factory.get_ray_data_config() - else: - datasets = {} - data_config = None + datasets, data_config = get_datasets_and_data_config(factory) dataset_creation_time = time.perf_counter() - start_time diff --git a/rllib/BUILD b/rllib/BUILD.bazel similarity index 96% rename from rllib/BUILD rename to rllib/BUILD.bazel index 8ed94ce32d53..4dbfccb6c865 100644 --- a/rllib/BUILD +++ b/rllib/BUILD.bazel @@ -1095,6 +1095,57 @@ py_test( # args = ["--as-test", "--num-learners=2", "--num-gpus-per-learner=1"] # ) +# IQL +# Pendulum-v1 (enormous) +py_test( + name = "learning_tests_pendulum_iql", + size = "large", + srcs = ["tuned_examples/iql/pendulum_iql.py"], + args = [ + "--as-test", + "--num-cpus=32", + ], + # Include the offline data files. 
+ data = [ + "tests/data/pendulum/pendulum-v1_enormous", + ], + main = "tuned_examples/iql/pendulum_iql.py", + tags = [ + "exclusive", + "learning_tests", + "learning_tests_continuous", + "learning_tests_pytorch_use_all_core", + "team:rllib", + "torch_only", + ], +) + +# GPU training. +py_test( + name = "learning_tests_pendulum_iql_gpu", + size = "large", + srcs = ["tuned_examples/iql/pendulum_iql.py"], + args = [ + "--as-test", + "--num-cpus=32", + "--num-gpus-per-learner=1", + ], + # Include the offline data files. + data = [ + "tests/data/pendulum/pendulum-v1_enormous", + ], + main = "tuned_examples/iql/pendulum_iql.py", + tags = [ + "exclusive", + "gpu", + "learning_tests", + "learning_tests_continuous", + "learning_tests_pytorch_use_all_core", + "team:rllib", + "torch_only", + ], +) + # MARWIL # CartPole py_test( @@ -1487,6 +1538,86 @@ py_test( ], ) +# Footsies +py_test( + name = "learning_tests_multi_agent_footsies_ppo", + size = "large", + srcs = ["tuned_examples/ppo/multi_agent_footsies_ppo.py"], + args = [ + "--as-test", + "--num-env-runners=6", + "--evaluation-num-env-runners=2", + ], + main = "tuned_examples/ppo/multi_agent_footsies_ppo.py", + tags = [ + "exclusive", + "learning_tests", + "learning_tests_discrete", + "team:rllib", + ], +) + +py_test( + name = "learning_tests_multi_agent_footsies_ppo_gpu", + size = "large", + srcs = ["tuned_examples/ppo/multi_agent_footsies_ppo.py"], + args = [ + "--as-test", + "--num-env-runners=20", + "--evaluation-num-env-runners=3", + "--num-learners=1", + "--num-gpus-per-learner=1", + ], + main = "tuned_examples/ppo/multi_agent_footsies_ppo.py", + tags = [ + "exclusive", + "learning_tests", + "learning_tests_discrete", + "multi_gpu", + "team:rllib", + ], +) + +py_test( + name = "learning_tests_multi_agent_footsies_ppo_multi_cpu", + size = "large", + srcs = ["tuned_examples/ppo/multi_agent_footsies_ppo.py"], + args = [ + "--as-test", + "--num-env-runners=6", + "--evaluation-num-env-runners=2", + "--num-learners=2", + 
], + main = "tuned_examples/ppo/multi_agent_footsies_ppo.py", + tags = [ + "exclusive", + "learning_tests", + "learning_tests_discrete", + "team:rllib", + ], +) + +py_test( + name = "learning_tests_multi_agent_footsies_ppo_multi_gpu", + size = "large", + srcs = ["tuned_examples/ppo/multi_agent_footsies_ppo.py"], + args = [ + "--as-test", + "--num-env-runners=20", + "--evaluation-num-env-runners=3", + "--num-learners=2", + "--num-gpus-per-learner=1", + ], + main = "tuned_examples/ppo/multi_agent_footsies_ppo.py", + tags = [ + "exclusive", + "learning_tests", + "learning_tests_discrete", + "multi_gpu", + "team:rllib", + ], +) + # Pendulum py_test( name = "learning_tests_pendulum_ppo", @@ -3973,6 +4104,26 @@ py_test( ], ) +py_test( + name = "examples/curriculum/pong_curriculum_learning", + size = "large", + srcs = ["examples/curriculum/pong_curriculum_learning.py"], + args = [ + "--as-test", + "--num-env-runners=10", + "--num-cpus=11", + "--num-envs-per-env-runner=5", + "--stop-iters=20", + "--stop-reward=-21.0", + ], + main = "examples/curriculum/pong_curriculum_learning.py", + tags = [ + "examples", + "exclusive", + "team:rllib", + ], +) + # subdirectory: debugging/ # .................................... py_test( @@ -4013,14 +4164,14 @@ py_test( # subdirectory: envs/ # .................................... 
py_test( - name = "examples/envs/agents_act_simultaneously", + name = "examples/envs/agents_act_in_sequence", size = "medium", - srcs = ["examples/envs/agents_act_simultaneously.py"], + srcs = ["examples/envs/agents_act_in_sequence.py"], args = [ "--num-agents=2", "--stop-iters=3", ], - main = "examples/envs/agents_act_simultaneously.py", + main = "examples/envs/agents_act_in_sequence.py", tags = [ "examples", "exclusive", @@ -4029,14 +4180,14 @@ py_test( ) py_test( - name = "examples/envs/agents_act_in_sequence", + name = "examples/envs/agents_act_simultaneously", size = "medium", - srcs = ["examples/envs/agents_act_in_sequence.py"], + srcs = ["examples/envs/agents_act_simultaneously.py"], args = [ "--num-agents=2", "--stop-iters=3", ], - main = "examples/envs/agents_act_in_sequence.py", + main = "examples/envs/agents_act_simultaneously.py", tags = [ "examples", "exclusive", @@ -4112,6 +4263,7 @@ py_test( args = [ "--as-test", "--port=12346", + "--use-dummy-client", ], main = "examples/envs/env_connecting_to_rllib_w_tcp_client.py", tags = [ @@ -4941,6 +5093,42 @@ py_test( ], ) +py_test( + name = "examples/multi_agent/self_play_footsies", + size = "large", + srcs = ["examples/multi_agent/self_play_footsies.py"], + args = [ + "--as-test", + "--num-cpus=4", + ], + main = "examples/multi_agent/self_play_footsies.py", + tags = [ + "examples", + "examples_use_all_core", + "exclusive", + "team:rllib", + ], +) + +py_test( + name = "examples/multi_agent/self_play_league_based_with_open_spiel_connect_4_ppo_torch", + size = "large", + srcs = ["examples/multi_agent/self_play_league_based_with_open_spiel.py"], + args = [ + "--framework=torch", + "--env=connect_four", + "--win-rate-threshold=0.8", + "--num-episodes-human-play=0", + "--min-league-size=8", + ], + main = "examples/multi_agent/self_play_league_based_with_open_spiel.py", + tags = [ + "examples", + "exclusive", + "team:rllib", + ], +) + # @OldAPIStack py_test( name = 
"examples/multi_agent/self_play_with_open_spiel_connect_4_ppo_tf_old_api_stack", @@ -5003,17 +5191,13 @@ py_test( ) py_test( - name = "examples/multi_agent/self_play_league_based_with_open_spiel_connect_4_ppo_torch", - size = "large", - srcs = ["examples/multi_agent/self_play_league_based_with_open_spiel.py"], + name = "examples/multi_agent/shared_encoder_cartpole", + size = "medium", + srcs = ["examples/multi_agent/shared_encoder_cartpole.py"], args = [ - "--framework=torch", - "--env=connect_four", - "--win-rate-threshold=0.8", - "--num-episodes-human-play=0", - "--min-league-size=8", + "--stop-iter=10", ], - main = "examples/multi_agent/self_play_league_based_with_open_spiel.py", + main = "examples/multi_agent/shared_encoder_cartpole.py", tags = [ "examples", "exclusive", diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 6d455da75c55..5b3f56c8d96d 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -109,7 +109,7 @@ try_import_msgpack, ) from ray.rllib.utils.debug import update_global_seed_if_necessary -from ray.rllib.utils.deprecation import ( +from ray._common.deprecation import ( DEPRECATED_VALUE, Deprecated, deprecation_warning, @@ -769,14 +769,35 @@ def setup(self, config: AlgorithmConfig) -> None: elif self.eval_env_runner_group: spaces.update(self.eval_env_runner_group.get_spaces()) else: - spaces.update( - { - DEFAULT_MODULE_ID: ( - self.config.observation_space, - self.config.action_space, - ), - } - ) + # If the algorithm is online we use the spaces from as they are + # provided. + if self.config.is_online: + spaces.update( + { + DEFAULT_MODULE_ID: ( + self.config.observation_space, + self.config.action_space, + ), + } + ) + # Otherwise, when we are offline we need to check, if the learner connector + # is transforming the spaces. + elif self.config.is_offline: + # Build the learner connector with the input spaces from the environment. 
+ learner_connector = self.config.build_learner_connector( + input_observation_space=spaces[INPUT_ENV_SPACES][0], + input_action_space=spaces[INPUT_ENV_SPACES][1], + ) + # Update the `spaces` dictionary by using the output spaces of the learner + # connector pipeline. + spaces.update( + { + DEFAULT_MODULE_ID: ( + learner_connector.observation_space, + learner_connector.action_space, + ), + } + ) module_spec: MultiRLModuleSpec = self.config.get_multi_rl_module_spec( spaces=spaces, @@ -2190,11 +2211,11 @@ def add_module( EnvRunnerGroup (with its o EnvRunners plus the local one). Returns: - The new MultiAgentRLModuleSpec (after the RLModule has been added). + The new MultiRLModuleSpec (after the RLModule has been added). """ validate_module_id(module_id, error=True) - # The to-be-returned new MultiAgentRLModuleSpec. + # The to-be-returned new MultiRLModuleSpec. multi_rl_module_spec = None if not self.config.is_multi_agent: @@ -2316,9 +2337,9 @@ def remove_module( EnvRunnerGroup (with its o EnvRunners plus the local one). Returns: - The new MultiAgentRLModuleSpec (after the RLModule has been removed). + The new MultiRLModuleSpec (after the RLModule has been removed). """ - # The to-be-returned new MultiAgentRLModuleSpec. + # The to-be-returned new MultiRLModuleSpec. multi_rl_module_spec = None # Remove RLModule from the LearnerGroup. diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 2c6120b86452..f58470da92ed 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -48,7 +48,7 @@ OldAPIStack, OverrideToImplementCustomLogic_CallToSuperRecommended, ) -from ray.rllib.utils.deprecation import ( +from ray._common.deprecation import ( DEPRECATED_VALUE, Deprecated, deprecation_warning, @@ -143,6 +143,7 @@ def DEFAULT_AGENT_TO_MODULE_MAPPING_FN(agent_id, episode): # Map any agent ID to "default_policy". return DEFAULT_MODULE_ID + # @OldAPIStack # TODO (sven): Deprecate in new API stack. 
@staticmethod def DEFAULT_POLICY_MAPPING_FN(aid, episode, worker, **kwargs): @@ -565,7 +566,7 @@ def __init__(self, algo_class: Optional[type] = None): self.min_time_s_per_iteration = None self.min_train_timesteps_per_iteration = 0 self.min_sample_timesteps_per_iteration = 0 - self.log_gradients = True + self.log_gradients = False # `self.checkpointing()` self.export_native_model_files = False @@ -2989,7 +2990,7 @@ def evaluation( if offline_evaluation_type is not NotProvided: self.offline_evaluation_type = offline_evaluation_type if offline_eval_runner_class is not NotProvided: - self.offline_eval_runner_cls = offline_eval_runner_class + self.offline_eval_runner_class = offline_eval_runner_class if offline_loss_for_module_fn is not NotProvided: self.offline_loss_for_module_fn = offline_loss_for_module_fn if offline_eval_batch_size_per_runner is not NotProvided: @@ -3653,7 +3654,7 @@ def reporting( executed. Set to 0 or None for no minimum timesteps. log_gradients: Log gradients to results. If this is `True` the global norm of the gradients dictionariy for each optimizer is logged to results. - The default is `True`. + The default is `False`. Returns: This updated AlgorithmConfig object. @@ -3794,9 +3795,11 @@ def fault_tolerance( True). restart_failed_sub_environments: If True and any sub-environment (within a vectorized env) throws any error during env stepping, the - Sampler tries to restart the faulty sub-environment. This is done + EnvRunner tries to restart the faulty sub-environment. This is done without disturbing the other (still intact) sub-environment and without - the EnvRunner crashing. + the EnvRunner crashing. You can raise + `ray.rllib.env.env_runner.StepFailedRecreateEnvError` from your + environment's `step` method to not log the error. num_consecutive_env_runner_failures_tolerance: The number of consecutive times an EnvRunner failure (also for evaluation) is tolerated before finally crashing the Algorithm. 
Only useful if either @@ -5328,8 +5331,8 @@ def _validate_offline_settings(self): from ray.rllib.offline.offline_evaluation_runner import OfflineEvaluationRunner - if self.prelearner_class and not issubclass( - self.prelearner_class, OfflineEvaluationRunner + if self.offline_eval_runner_class and not issubclass( + self.offline_eval_runner_class, OfflineEvaluationRunner ): self._value_error( "Unknown `offline_eval_runner_class`. OfflineEvaluationRunner class needs to inherit " diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index a59636df752d..c3bc4c0031bb 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -18,7 +18,7 @@ from ray.rllib.core.rl_module.rl_module import RLModuleSpec from ray.rllib.policy.policy import Policy from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning +from ray._common.deprecation import DEPRECATED_VALUE, deprecation_warning from ray.rllib.utils.metrics import ( LAST_TARGET_UPDATE_TS, NUM_AGENT_STEPS_SAMPLED, diff --git a/rllib/algorithms/appo/appo_rl_module.py b/rllib/algorithms/appo/appo_rl_module.py index 5a2f59f9f201..de9b862a92ab 100644 --- a/rllib/algorithms/appo/appo_rl_module.py +++ b/rllib/algorithms/appo/appo_rl_module.py @@ -2,7 +2,7 @@ from ray.rllib.algorithms.appo.default_appo_rl_module import ( # noqa DefaultAPPORLModule as APPORLModule, ) -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning deprecation_warning( old="ray.rllib.algorithms.appo.appo_rl_module.APPORLModule", diff --git a/rllib/algorithms/appo/torch/appo_torch_rl_module.py b/rllib/algorithms/appo/torch/appo_torch_rl_module.py index ae60657b2c95..3bb3f0ba7f40 100644 --- a/rllib/algorithms/appo/torch/appo_torch_rl_module.py +++ b/rllib/algorithms/appo/torch/appo_torch_rl_module.py @@ -2,7 +2,7 @@ from ray.rllib.algorithms.appo.torch.default_appo_torch_rl_module import ( # 
noqa DefaultAPPOTorchRLModule as APPOTorchRLModule, ) -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning deprecation_warning( diff --git a/rllib/algorithms/cql/cql.py b/rllib/algorithms/cql/cql.py index de972a119a90..6d3b95cad746 100644 --- a/rllib/algorithms/cql/cql.py +++ b/rllib/algorithms/cql/cql.py @@ -25,7 +25,7 @@ ) from ray.rllib.policy.policy import Policy from ray.rllib.utils.annotations import OldAPIStack, override -from ray.rllib.utils.deprecation import ( +from ray._common.deprecation import ( DEPRECATED_VALUE, deprecation_warning, ) @@ -302,15 +302,20 @@ def training_step(self) -> None: # Sampling from offline data. with self.metrics.log_time((TIMERS, OFFLINE_SAMPLING_TIMER)): + # If we should use an iterator in the learner(s). Note, in case of + # multiple learners we must always return a list of iterators. + return_iterator = ( + self.config.num_learners > 0 + or self.config.dataset_num_iters_per_learner != 1 + ) + # Return an iterator in case we are using remote learners. batch_or_iterator = self.offline_data.sample( num_samples=self.config.train_batch_size_per_learner, num_shards=self.config.num_learners, # Return an iterator, if a `Learner` should update # multiple times per RLlib iteration. - return_iterator=self.config.dataset_num_iters_per_learner > 1 - if self.config.dataset_num_iters_per_learner - else True, + return_iterator=return_iterator, ) # Updating the policy. 
diff --git a/rllib/algorithms/dqn/dqn.py b/rllib/algorithms/dqn/dqn.py index 02014e72554c..6bc65698a56b 100644 --- a/rllib/algorithms/dqn/dqn.py +++ b/rllib/algorithms/dqn/dqn.py @@ -59,7 +59,7 @@ TD_ERROR_KEY, TIMERS, ) -from ray.rllib.utils.deprecation import DEPRECATED_VALUE +from ray._common.deprecation import DEPRECATED_VALUE from ray.rllib.utils.replay_buffers.utils import sample_min_n_steps_from_buffer from ray.rllib.utils.typing import ( LearningRateOrSchedule, diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index e3c2abfa1f37..c183b9e2f653 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -21,7 +21,7 @@ from ray.rllib.policy.policy import Policy from ray.rllib.policy.sample_batch import concat_samples from ray.rllib.utils.annotations import OldAPIStack, override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning +from ray._common.deprecation import DEPRECATED_VALUE, deprecation_warning from ray.rllib.utils.metrics import ( AGGREGATOR_ACTOR_RESULTS, ALL_MODULES, diff --git a/rllib/algorithms/iql/__init__.py b/rllib/algorithms/iql/__init__.py new file mode 100644 index 000000000000..404fb83b6aac --- /dev/null +++ b/rllib/algorithms/iql/__init__.py @@ -0,0 +1,6 @@ +from ray.rllib.algorithms.iql.iql import IQL, IQLConfig + +__all__ = [ + "IQL", + "IQLConfig", +] diff --git a/rllib/algorithms/iql/default_iql_rl_module.py b/rllib/algorithms/iql/default_iql_rl_module.py new file mode 100644 index 000000000000..e6e3b2279ac5 --- /dev/null +++ b/rllib/algorithms/iql/default_iql_rl_module.py @@ -0,0 +1,35 @@ +from ray.rllib.algorithms.sac.default_sac_rl_module import DefaultSACRLModule +from ray.rllib.core.models.configs import MLPHeadConfig +from ray.rllib.core.rl_module.apis.value_function_api import ValueFunctionAPI +from ray.rllib.utils.annotations import ( + override, + OverrideToImplementCustomLogic_CallToSuperRecommended, +) + + +class 
DefaultIQLRLModule(DefaultSACRLModule, ValueFunctionAPI): + @override(DefaultSACRLModule) + def setup(self): + # Setup the `DefaultSACRLModule` to get the catalog. + super().setup() + + # Only, if the `RLModule` is used on a `Learner` we build the value network. + if not self.inference_only: + # Build the encoder for the value function. + self.vf_encoder = self.catalog.build_encoder(framework=self.framework) + + # Build the vf head. + self.vf = MLPHeadConfig( + input_dims=self.catalog.latent_dims, + # Note, we use the same layers as for the policy and Q-network. + hidden_layer_dims=self.catalog.pi_and_qf_head_hiddens, + hidden_layer_activation=self.catalog.pi_and_qf_head_activation, + output_layer_activation="linear", + output_layer_dim=1, + ).build(framework=self.framework) + + @override(DefaultSACRLModule) + @OverrideToImplementCustomLogic_CallToSuperRecommended + def get_non_inference_attributes(self): + # Use all of `super`'s attributes and add the value function attributes. + return super().get_non_inference_attributes() + ["vf_encoder", "vf"] diff --git a/rllib/algorithms/iql/iql.py b/rllib/algorithms/iql/iql.py new file mode 100644 index 000000000000..893555002708 --- /dev/null +++ b/rllib/algorithms/iql/iql.py @@ -0,0 +1,228 @@ +from typing import Optional, Type, Union + +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided +from ray.rllib.algorithms.marwil.marwil import MARWIL, MARWILConfig +from ray.rllib.connectors.common.add_observations_from_episodes_to_batch import ( + AddObservationsFromEpisodesToBatch, +) +from ray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batch import ( # noqa + AddNextObservationsFromEpisodesToTrainBatch, +) +from ray.rllib.core.learner.learner import Learner +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import LearningRateOrSchedule, RLModuleSpecType + + +class 
IQLConfig(MARWILConfig): + """Defines a configuration class from which a new IQL Algorithm can be built + + .. testcode:: + :skipif: True + + from ray.rllib.algorithms.iql import IQLConfig + # Run this from the ray directory root. + config = IQLConfig().training(actor_lr=0.00001, gamma=0.99) + config = config.offline_data( + input_="./rllib/tests/data/pendulum/pendulum-v1_enormous") + + # Build an Algorithm object from the config and run 1 training iteration. + algo = config.build() + algo.train() + + .. testcode:: + :skipif: True + + from ray.rllib.algorithms.iql import IQLConfig + from ray import tune + config = IQLConfig() + # Print out some default values. + print(config.beta) + # Update the config object. + config.training( + lr=tune.grid_search([0.001, 0.0001]), beta=0.75 + ) + # Set the config object's data path. + # Run this from the ray directory root. + config.offline_data( + input_="./rllib/tests/data/pendulum-v1_enormous" + ) + # Set the config object's env, used for evaluation. + config.environment(env="Pendulum-v1") + # Use to_dict() to get the old-style python config dict + # when running with tune. + tune.Tuner( + "IQL", + param_space=config.to_dict(), + ).fit() + """ + + def __init__(self, algo_class=None): + super().__init__(algo_class=algo_class or IQL) + + # fmt: off + # __sphinx_doc_begin__ + # The temperature for the actor loss. + self.beta = 0.1 + + # The expectile to use in expectile regression. + self.expectile = 0.8 + + # The learning rates for the actor, critic and value network(s). + self.actor_lr = 3e-4 + self.critic_lr = 3e-4 + self.value_lr = 3e-4 + # Set `lr` parameter to `None` and ensure it is not used. + self.lr = None + + # If a twin-Q architecture should be used (advisable). + self.twin_q = True + + # How often the target network should be updated. + self.target_network_update_freq = 0 + # The weight for Polyak averaging. 
+ self.tau = 1.0 + + # __sphinx_doc_end__ + # fmt: on + + @override(MARWILConfig) + def training( + self, + *, + twin_q: Optional[bool] = NotProvided, + expectile: Optional[float] = NotProvided, + actor_lr: Optional[LearningRateOrSchedule] = NotProvided, + critic_lr: Optional[LearningRateOrSchedule] = NotProvided, + value_lr: Optional[LearningRateOrSchedule] = NotProvided, + target_network_update_freq: Optional[int] = NotProvided, + tau: Optional[float] = NotProvided, + **kwargs, + ) -> "IQLConfig": + """Sets the training related configuration. + + Args: + beta: The temperature to scaling advantages in exponential terms. + Must be >> 0.0. The higher this parameter the less greedy + (exploitative) the policy becomes. It also means that the policy + is fitting less to the best actions in the dataset. + twin_q: If a twin-Q architecture should be used (advisable). + expectile: The expectile to use in expectile regression for the value + function. For high expectiles the value function tries to match + the upper tail of the Q-value distribution. + actor_lr: The learning rate for the actor network. Actor learning rates + greater than critic learning rates work well in experiments. + critic_lr: The learning rate for the Q-network. Critic learning rates + greater than value function learning rates work well in experiments. + value_lr: The learning rate for the value function network. + target_network_update_freq: The number of timesteps in between the target + Q-network is fixed. Note, too high values here could harm convergence. + The target network is updated via Polyak-averaging. + tau: The update parameter for Polyak-averaging of the target Q-network. + The higher this value the faster the weights move towards the actual + Q-network. + + Return: + This updated `AlgorithmConfig` object. 
+ """ + super().training(**kwargs) + + if twin_q is not NotProvided: + self.twin_q = twin_q + if expectile is not NotProvided: + self.expectile = expectile + if actor_lr is not NotProvided: + self.actor_lr = actor_lr + if critic_lr is not NotProvided: + self.critic_lr = critic_lr + if value_lr is not NotProvided: + self.value_lr = value_lr + if target_network_update_freq is not NotProvided: + self.target_network_update_freq = target_network_update_freq + if tau is not NotProvided: + self.tau = tau + + return self + + @override(MARWILConfig) + def get_default_learner_class(self) -> Union[Type["Learner"], str]: + if self.framework_str == "torch": + from ray.rllib.algorithms.iql.torch.iql_torch_learner import IQLTorchLearner + + return IQLTorchLearner + else: + raise ValueError( + f"The framework {self.framework_str} is not supported. " + "Use `'torch'` instead." + ) + + @override(MARWILConfig) + def get_default_rl_module_spec(self) -> RLModuleSpecType: + if self.framework_str == "torch": + from ray.rllib.algorithms.iql.torch.default_iql_torch_rl_module import ( + DefaultIQLTorchRLModule, + ) + + return RLModuleSpec(module_class=DefaultIQLTorchRLModule) + else: + raise ValueError( + f"The framework {self.framework_str} is not supported. " + "Use `torch` instead." + ) + + @override(MARWILConfig) + def build_learner_connector( + self, + input_observation_space, + input_action_space, + device=None, + ): + pipeline = super().build_learner_connector( + input_observation_space=input_observation_space, + input_action_space=input_action_space, + device=device, + ) + + # Remove unneeded connectors from the MARWIL connector pipeline. + pipeline.remove("AddOneTsToEpisodesAndTruncate") + pipeline.remove("GeneralAdvantageEstimation") + + # Prepend the "add-NEXT_OBS-from-episodes-to-train-batch" connector piece (right + # after the corresponding "add-OBS-..." default piece). 
+ pipeline.insert_after( + AddObservationsFromEpisodesToBatch, + AddNextObservationsFromEpisodesToTrainBatch(), + ) + + return pipeline + + @override(MARWILConfig) + def validate(self) -> None: + # Call super's validation method. + super().validate() + + # Ensure hyperparameters are meaningful. + if self.beta <= 0.0: + self._value_error( + "For meaningful results, `beta` (temperature) parameter must be >> 0.0!" + ) + if not 0.0 < self.expectile < 1.0: + self._value_error( + "For meaningful results, `expectile` parameter must be in (0, 1)." + ) + + @property + def _model_config_auto_includes(self): + return super()._model_config_auto_includes | {"twin_q": self.twin_q} + + +class IQL(MARWIL): + """Implicit Q-learning (derived from MARWIL). + + Uses MARWIL training step. + """ + + @classmethod + @override(MARWIL) + def get_default_config(cls) -> AlgorithmConfig: + return IQLConfig() diff --git a/rllib/algorithms/iql/iql_learner.py b/rllib/algorithms/iql/iql_learner.py new file mode 100644 index 000000000000..5821f2ccb5e0 --- /dev/null +++ b/rllib/algorithms/iql/iql_learner.py @@ -0,0 +1,84 @@ +from typing import Dict + +from ray.rllib.algorithms.dqn.dqn_learner import DQNLearner +from ray.rllib.utils.annotations import ( + override, + OverrideToImplementCustomLogic_CallToSuperRecommended, +) +from ray.rllib.utils.lambda_defaultdict import LambdaDefaultDict +from ray.rllib.utils.typing import ModuleID, TensorType + +QF_TARGET_PREDS = "qf_target_preds" +VF_PREDS_NEXT = "vf_preds_next" +VF_LOSS = "value_loss" + + +class IQLLearner(DQNLearner): + @OverrideToImplementCustomLogic_CallToSuperRecommended + @override(DQNLearner) + def build(self) -> None: + # Build the `DQNLearner` (builds the target network). + super().build() + + # Define the expectile parameter(s). + self.expectile: Dict[ModuleID, TensorType] = LambdaDefaultDict( + lambda module_id: self._get_tensor_variable( + # Note, we want to train with a certain expectile. 
+ [self.config.get_config_for_module(module_id).expectile], + trainable=False, + ) + ) + + # Define the temperature for the actor advantage loss. + self.temperature: Dict[ModuleID, TensorType] = LambdaDefaultDict( + lambda module_id: self._get_tensor_variable( + # Note, we want to train with a certain temperature. + [self.config.get_config_for_module(module_id).beta], + trainable=False, + ) + ) + + # Store loss tensors here temporarily inside the loss function for (exact) + # consumption later by the compute gradients function. + # Keys=(module_id, optimizer_name), values=loss tensors (in-graph). + self._temp_losses = {} + + @override(DQNLearner) + def remove_module(self, module_id: ModuleID) -> None: + """Removes the expectile and temperature for removed modules.""" + # First call `super`'s `remove_module` method. + super().remove_module(module_id) + # Remove the expectile from the mapping. + self.expectile.pop(module_id, None) + # Remove the temperature from the mapping. + self.temperature.pop(module_id, None) + + @override(DQNLearner) + def add_module( + self, + *, + module_id, + module_spec, + config_overrides=None, + new_should_module_be_updated=None + ): + """Adds the expectile and temperature for new modules.""" + # First call `super`'s `add_module` method. + super().add_module( + module_id=module_id, + module_spec=module_spec, + config_overrides=config_overrides, + new_should_module_be_updated=new_should_module_be_updated, + ) + # Add the expectile to the mapping. + self.expectile[module_id] = self._get_tensor_variable( + # Note, we want to train with a certain expectile. + [self.config.get_config_for_module(module_id).expectile], + trainable=False, + ) + # Add the temperature to the mapping. + self.temperature[module_id] = self._get_tensor_variable( + # Note, we want to train with a certain temperature.
+ [self.config.get_config_for_module(module_id).beta], + trainable=False, + ) diff --git a/rllib/algorithms/iql/torch/__init__.py b/rllib/algorithms/iql/torch/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/algorithms/iql/torch/default_iql_torch_rl_module.py b/rllib/algorithms/iql/torch/default_iql_torch_rl_module.py new file mode 100644 index 000000000000..00d7fc821e49 --- /dev/null +++ b/rllib/algorithms/iql/torch/default_iql_torch_rl_module.py @@ -0,0 +1,78 @@ +import gymnasium as gym +from typing import Any, Dict, Optional + +from ray.rllib.algorithms.iql.default_iql_rl_module import DefaultIQLRLModule +from ray.rllib.algorithms.iql.iql_learner import VF_PREDS_NEXT, QF_TARGET_PREDS +from ray.rllib.algorithms.sac.torch.default_sac_torch_rl_module import ( + DefaultSACTorchRLModule, +) +from ray.rllib.core.columns import Columns +from ray.rllib.core.models.base import ENCODER_OUT +from ray.rllib.core.rl_module.apis.value_function_api import ValueFunctionAPI +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import TensorType + +torch, nn = try_import_torch() + + +class DefaultIQLTorchRLModule(DefaultSACTorchRLModule, DefaultIQLRLModule): + + framework: str = "torch" + + @override(DefaultSACTorchRLModule) + def _forward_train(self, batch: Dict, **kwargs) -> Dict[str, Any]: + + # Right now, IQL runs only with continuous action spaces. + # TODO (simon): Implement it also for discrete action spaces. + if not isinstance(self.action_space, gym.spaces.Box): + raise ValueError( + f"Unsupported action space type: {type(self.action_space)}. " + "Only continuous action spaces are supported." + ) + + # Call the forward pass of the SAC module. + output = super()._forward_train(batch, **kwargs) + + # Create batches for the forward passes of the target Q-networks and the + # value function. 
+ batch_curr = { + Columns.OBS: batch[Columns.OBS], + Columns.ACTIONS: batch[Columns.ACTIONS], + } + batch_next = {Columns.OBS: batch[Columns.NEXT_OBS]} + + # These target q-values are needed for the value loss and actor loss. + output[QF_TARGET_PREDS] = self._qf_forward_train_helper( + batch_curr, encoder=self.target_qf_encoder, head=self.target_qf + ) + # If a twin-Q architecture is used run its target Q-network. + if self.twin_q: + output[QF_TARGET_PREDS] = torch.min( + output[QF_TARGET_PREDS], + self._qf_forward_train_helper( + batch_curr, encoder=self.target_qf_twin_encoder, head=self.target_qf_twin + ), + ) + + # Compute values for the current observations. + output[Columns.VF_PREDS] = self.compute_values(batch_curr) + # The values of the next observations are needed for the critic loss. + output[VF_PREDS_NEXT] = self.compute_values(batch_next) + + return output + + @override(ValueFunctionAPI) + def compute_values( + self, + batch: Dict[str, Any], + embeddings: Optional[Any] = None, + ) -> TensorType: + # If no embeddings are provided make a forward pass on the encoder. + if embeddings is None: + embeddings = self.vf_encoder(batch)[ENCODER_OUT] + + # Value head. + vf_out = self.vf(embeddings) + # Squeeze out last dimension (single node value head).
+ return vf_out.squeeze(-1) diff --git a/rllib/algorithms/iql/torch/iql_torch_learner.py b/rllib/algorithms/iql/torch/iql_torch_learner.py new file mode 100644 index 000000000000..85dc68e86fb2 --- /dev/null +++ b/rllib/algorithms/iql/torch/iql_torch_learner.py @@ -0,0 +1,245 @@ +from typing import Dict + +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.algorithms.dqn.dqn_learner import QF_PREDS, QF_LOSS_KEY +from ray.rllib.algorithms.iql.iql_learner import ( + IQLLearner, + QF_TARGET_PREDS, + VF_PREDS_NEXT, + VF_LOSS, +) +from ray.rllib.algorithms.sac.sac_learner import QF_TWIN_PREDS, QF_TWIN_LOSS_KEY +from ray.rllib.core import ALL_MODULES +from ray.rllib.core.columns import Columns +from ray.rllib.core.learner.learner import ( + POLICY_LOSS_KEY, +) +from ray.rllib.core.learner.torch.torch_learner import TorchLearner +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import ModuleID, ParamDict, TensorType + +torch, nn = try_import_torch() + + +class IQLTorchLearner(TorchLearner, IQLLearner): + """Implements the IQL loss on top of `IQLLearner`. + + This Learner implements configure_optimizers_for_module to define + separate optimizers for the policy, Q-, and value networks. When + using a twin-Q network architecture, each Q-network is assigned its + own optimizer—consistent with the SAC algorithm. + + The IQL loss is defined in compute_loss_for_module and consists of + three components: value loss, Q-loss (TD error), and actor (policy) + loss. + + Note that the original IQL implementation performs separate backward + passes for each network. However, due to RLlib's reliance on TorchDDP, + all backward passes must be executed within a single update step. This + constraint can lead to parameter lag and cyclical loss behavior, though + it does not hinder convergence. 
+ """ + + @override(TorchLearner) + def configure_optimizers_for_module( + self, module_id: ModuleID, config: AlgorithmConfig = None + ) -> None: + + # Note, we could have derived directly from SACTorchLearner to + # inherit the setup of optimizers, but that learner comes with + # additional parameters which we do not need. + # Receive the module. + module = self._module[module_id] + + # Define the optimizer for the critic. + # TODO (sven): Maybe we change here naming to `qf` for unification. + params_critic = self.get_parameters(module.qf_encoder) + self.get_parameters( + module.qf + ) + optim_critic = torch.optim.Adam(params_critic, eps=1e-7) + self.register_optimizer( + module_id=module_id, + optimizer_name="qf", + optimizer=optim_critic, + params=params_critic, + lr_or_lr_schedule=config.critic_lr, + ) + # If necessary register also an optimizer for a twin Q network. + if config.twin_q: + params_twin_critic = self.get_parameters( + module.qf_twin_encoder + ) + self.get_parameters(module.qf_twin) + optim_twin_critic = torch.optim.Adam(params_twin_critic, eps=1e-7) + self.register_optimizer( + module_id=module_id, + optimizer_name="qf_twin", + optimizer=optim_twin_critic, + params=params_twin_critic, + lr_or_lr_schedule=config.critic_lr, + ) + + # Define the optimizer for the actor. + params_actor = self.get_parameters(module.pi_encoder) + self.get_parameters( + module.pi + ) + optim_actor = torch.optim.Adam(params_actor, eps=1e-7) + self.register_optimizer( + module_id=module_id, + optimizer_name="policy", + optimizer=optim_actor, + params=params_actor, + lr_or_lr_schedule=config.actor_lr, + ) + + # Define the optimizer for the value function. 
+ params_value = self.get_parameters(module.vf_encoder) + self.get_parameters( + module.vf + ) + optim_value = torch.optim.Adam(params_value, eps=1e-7) + self.register_optimizer( + module_id=module_id, + optimizer_name="value", + optimizer=optim_value, + params=params_value, + lr_or_lr_schedule=config.value_lr, + ) + + @override(TorchLearner) + def compute_loss_for_module( + self, + *, + module_id: ModuleID, + config: AlgorithmConfig, + batch: Dict, + fwd_out: Dict + ): + + # Get the module and hyperparameters. + module = self._module[module_id] + expectile = self.expectile[module_id] + temperature = self.temperature[module_id] + + # Get the action distribution for the actor loss. + action_train_dist_class = module.get_train_action_dist_cls() + action_train_dist = action_train_dist_class.from_logits( + fwd_out[Columns.ACTION_DIST_INPUTS] + ) + + # First, compute the value loss via the target Q-network and current observations. + value_loss = torch.mean( + self._expectile_loss( + fwd_out[QF_TARGET_PREDS] - fwd_out[Columns.VF_PREDS], expectile + ) + ) + + # Second, compute the actor loss using the target-Q network and values. + exp_advantages = torch.minimum( + torch.exp( + temperature * (fwd_out[QF_TARGET_PREDS] - fwd_out[Columns.VF_PREDS]) + ), + torch.Tensor([100.0]).to(self.device), + ) + # Note, we are using here the actions from the data sample. + action_logps = action_train_dist.logp(batch[Columns.ACTIONS]) + # Compute the actor loss. + actor_loss = -torch.mean(exp_advantages.detach() * action_logps) + + # Third, compute the critic loss. + target_critic = ( + batch[Columns.REWARDS] + + config.gamma + * (1 - batch[Columns.TERMINATEDS].float()) + * fwd_out[VF_PREDS_NEXT].detach() + ) + + critic_loss = torch.mean( + torch.nn.MSELoss(reduction="none")(target_critic, fwd_out[QF_PREDS]) + ) + + # If we have a twin-Q architecture, calculate the its loss, too. 
+ if config.twin_q: + critic_twin_loss = ( + torch.mean( + torch.nn.MSELoss(reduction="none")( + target_critic, fwd_out[QF_TWIN_PREDS] + ) + ) + * 0.5 + ) + critic_loss *= 0.5 + + # Compute the total loss. + total_loss = value_loss + actor_loss + critic_loss + + # If we have a twin-Q architecture, add its loss. + if config.twin_q: + total_loss += critic_twin_loss + + # Log metrics. + self.metrics.log_dict( + { + POLICY_LOSS_KEY: actor_loss, + QF_LOSS_KEY: critic_loss, + }, + key=module_id, + window=1, # <- single items (should not be mean/ema-reduced over time). + ) + + # Log the losses also in the temporary containers for gradient computation. + self._temp_losses[(module_id, POLICY_LOSS_KEY)] = actor_loss + self._temp_losses[(module_id, QF_LOSS_KEY)] = critic_loss + self._temp_losses[(module_id, VF_LOSS)] = value_loss + + # If a twin-Q architecture is used add metrics and loss. + if config.twin_q: + self.metrics.log_value( + key=(module_id, QF_TWIN_LOSS_KEY), + value=critic_twin_loss, + window=1, # <- single items (should not be mean/ema-reduced over time). + ) + self._temp_losses[(module_id, QF_TWIN_LOSS_KEY)] = critic_twin_loss + + return total_loss + + @override(TorchLearner) + def compute_gradients( + self, loss_per_module: Dict[ModuleID, TensorType], **kwargs + ) -> ParamDict: + grads = {} + for module_id in set(loss_per_module.keys()) - {ALL_MODULES}: + # Loop through optimizers registered for this module. + for optim_name, optim in self.get_optimizers_for_module(module_id): + # Zero the gradients. Note, we need to reset the gradients b/c + # each component for a module operates on the same graph. + optim.zero_grad(set_to_none=True) + + # Compute the gradients for the component and module. + loss_tensor = self._temp_losses.pop((module_id, optim_name + "_loss")) + loss_tensor.backward(retain_graph=True) + # Store the gradients for the component and module. 
+ grads.update( + { + pid: p.grad + for pid, p in self.filter_param_dict_for_optimizer( + self._params, optim + ).items() + } + ) + + # Make sure we updated on all loss terms. + assert not self._temp_losses + return grads + + def _expectile_loss(self, diff: TensorType, expectile: TensorType) -> TensorType: + """Computes the expectile loss. + + Args: + diff: A tensor containing a difference loss. + expectile: The expectile to use for the expectile loss. + + Returns: + The expectile loss of `diff` using `expectile`. + """ + weight = torch.where(diff > 0, expectile, 1 - expectile) + return weight * torch.pow(diff, 2) diff --git a/rllib/algorithms/marwil/marwil.py b/rllib/algorithms/marwil/marwil.py index b0a06ae6d2d8..7dfb1e1dcde6 100644 --- a/rllib/algorithms/marwil/marwil.py +++ b/rllib/algorithms/marwil/marwil.py @@ -20,7 +20,7 @@ ) from ray.rllib.policy.policy import Policy from ray.rllib.utils.annotations import OldAPIStack, override -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.rllib.utils.metrics import ( LEARNER_RESULTS, LEARNER_UPDATE_TIMER, @@ -457,11 +457,13 @@ class (multi-/single-learner setup) and evaluation on # the user that sth. is not right, although it is as # we do not step the env. with self.metrics.log_time((TIMERS, OFFLINE_SAMPLING_TIMER)): + # If we should use an iterator in the learner(s). Note, in case of + # multiple learners we must always return a list of iterators. return_iterator = ( - self.config.dataset_num_iters_per_learner > 1 - if self.config.dataset_num_iters_per_learner - else True + self.config.num_learners > 0 + or self.config.dataset_num_iters_per_learner != 1 ) + # Sampling from offline data. 
batch_or_iterator = self.offline_data.sample( num_samples=self.config.train_batch_size_per_learner, diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 2b28c5bd91c8..7ffa74477928 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -25,7 +25,7 @@ ) from ray.rllib.policy.policy import Policy from ray.rllib.utils.annotations import OldAPIStack, override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE +from ray._common.deprecation import DEPRECATED_VALUE from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, ENV_RUNNER_SAMPLING_TIMER, diff --git a/rllib/algorithms/ppo/ppo_rl_module.py b/rllib/algorithms/ppo/ppo_rl_module.py index 78f1ccef9fbd..631bf29fdd62 100644 --- a/rllib/algorithms/ppo/ppo_rl_module.py +++ b/rllib/algorithms/ppo/ppo_rl_module.py @@ -2,7 +2,7 @@ from ray.rllib.algorithms.ppo.default_ppo_rl_module import ( # noqa DefaultPPORLModule as PPORLModule, ) -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning deprecation_warning( old="ray.rllib.algorithms.ppo.ppo_rl_module.PPORLModule", diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py index 60370a150497..66acb9e5fb5a 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py @@ -2,7 +2,7 @@ from ray.rllib.algorithms.ppo.torch.default_ppo_torch_rl_module import ( # noqa DefaultPPOTorchRLModule as PPOTorchRLModule, ) -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning deprecation_warning( diff --git a/rllib/algorithms/registry.py b/rllib/algorithms/registry.py index 77f0581a69dc..c349489d165c 100644 --- a/rllib/algorithms/registry.py +++ b/rllib/algorithms/registry.py @@ -40,6 +40,12 @@ def _import_impala(): return impala.IMPALA, impala.IMPALA.get_default_config() +def _import_iql(): + import 
ray.rllib.algorithms.iql as iql + + return iql.IQL, iql.IQL.get_default_config() + + def _import_marwil(): import ray.rllib.algorithms.marwil as marwil @@ -65,6 +71,7 @@ def _import_sac(): "DQN": _import_dqn, "DreamerV3": _import_dreamerv3, "IMPALA": _import_impala, + "IQL": _import_iql, "MARWIL": _import_marwil, "PPO": _import_ppo, "SAC": _import_sac, @@ -78,6 +85,7 @@ def _import_sac(): "DQN": "DQN", "DreamerV3": "DreamerV3", "Impala": "IMPALA", + "IQL": "IQL", "IMPALA": "IMPALA", "MARWIL": "MARWIL", "PPO": "PPO", diff --git a/rllib/algorithms/sac/sac.py b/rllib/algorithms/sac/sac.py index 581434e03ed9..6a0c2375153a 100644 --- a/rllib/algorithms/sac/sac.py +++ b/rllib/algorithms/sac/sac.py @@ -15,7 +15,7 @@ from ray.rllib.policy.policy import Policy from ray.rllib.utils import deep_update from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning +from ray._common.deprecation import DEPRECATED_VALUE, deprecation_warning from ray.rllib.utils.framework import try_import_tf, try_import_tfp from ray.rllib.utils.replay_buffers.episode_replay_buffer import EpisodeReplayBuffer from ray.rllib.utils.typing import LearningRateOrSchedule, RLModuleSpecType diff --git a/rllib/algorithms/sac/sac_learner.py b/rllib/algorithms/sac/sac_learner.py index 8046c4c07892..f4108943ad04 100644 --- a/rllib/algorithms/sac/sac_learner.py +++ b/rllib/algorithms/sac/sac_learner.py @@ -49,25 +49,10 @@ def build(self) -> None: # for the alpha already defined. super().build() - def get_target_entropy(module_id): - """Returns the target entropy to use for the loss. - - Args: - module_id: Module ID for which the target entropy should be - returned. - - Returns: - Target entropy. 
- """ - target_entropy = self.config.get_config_for_module(module_id).target_entropy - if target_entropy is None or target_entropy == "auto": - target_entropy = -np.prod( - self._module_spec.module_specs[module_id].action_space.shape - ) - return target_entropy - self.target_entropy: Dict[ModuleID, TensorType] = LambdaDefaultDict( - lambda module_id: self._get_tensor_variable(get_target_entropy(module_id)) + lambda module_id: self._get_tensor_variable( + self._get_target_entropy(module_id) + ) ) @override(Learner) @@ -80,3 +65,51 @@ def remove_module(self, module_id: ModuleID) -> None: super().remove_module(module_id) self.curr_log_alpha.pop(module_id, None) self.target_entropy.pop(module_id, None) + + @override(Learner) + def add_module( + self, + *, + module_id, + module_spec, + config_overrides=None, + new_should_module_be_updated=None + ): + # First call `super`'s `add_module` method. + super().add_module( + module_id=module_id, + module_spec=module_spec, + config_overrides=config_overrides, + new_should_module_be_updated=new_should_module_be_updated, + ) + # Now add the log alpha. + self.curr_log_alpha[module_id] = self._get_tensor_variable( + # Note, we want to train the temperature parameter. + [ + np.log( + self.config.get_config_for_module(module_id).initial_alpha + ).astype(np.float32) + ], + trainable=True, + ) + # Add also the target entropy for the new module. + self.target_entropy[module_id] = self._get_tensor_variable( + self._get_target_entropy(module_id) + ) + + def _get_target_entropy(self, module_id): + """Returns the target entropy to use for the loss. + + Args: + module_id: Module ID for which the target entropy should be + returned. + + Returns: + Target entropy. 
+ """ + target_entropy = self.config.get_config_for_module(module_id).target_entropy + if target_entropy is None or target_entropy == "auto": + target_entropy = -np.prod( + self._module_spec.module_specs[module_id].action_space.shape + ) + return target_entropy diff --git a/rllib/algorithms/sac/torch/default_sac_torch_rl_module.py b/rllib/algorithms/sac/torch/default_sac_torch_rl_module.py index 3b62e949a9cf..0612dce7c391 100644 --- a/rllib/algorithms/sac/torch/default_sac_torch_rl_module.py +++ b/rllib/algorithms/sac/torch/default_sac_torch_rl_module.py @@ -54,11 +54,8 @@ def _forward_exploration(self, batch: Dict, **kwargs) -> Dict[str, Any]: @override(RLModule) def _forward_train(self, batch: Dict) -> Dict[str, Any]: - if self.inference_only: - raise RuntimeError( - "Trying to train a module that is not a learner module. Set the " - "flag `inference_only=False` when building the module." - ) + # Call the `super`'s `forward_train` + super()._forward_train(batch) if isinstance(self.action_space, gym.spaces.Discrete): return self._forward_train_discrete(batch) elif isinstance(self.action_space, gym.spaces.Box): diff --git a/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py b/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py index 5cb37f805e35..78840a3fe4be 100644 --- a/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py +++ b/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py @@ -95,7 +95,7 @@ def setUpClass(cls) -> None: ray.init() @classmethod - def tearDowClass(cls) -> None: + def tearDownClass(cls) -> None: ray.shutdown() def test_save_and_restore(self): diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 3eab6072ff6f..e2c6578c9173 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -47,7 +47,7 @@ ) from ray.rllib.utils.checkpoints import Checkpointable from ray.rllib.utils.debug import update_global_seed_if_necessary -from 
ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.metrics import ( ALL_MODULES, diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index e1d816ff9e19..b1010950e746 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -34,7 +34,7 @@ ) from ray.rllib.utils.annotations import override from ray.rllib.utils.checkpoints import Checkpointable -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.typing import ( EpisodeType, ModuleID, diff --git a/rllib/core/models/catalog.py b/rllib/core/models/catalog.py index f1bf8c6a3ea5..e4f9abe53b88 100644 --- a/rllib/core/models/catalog.py +++ b/rllib/core/models/catalog.py @@ -19,7 +19,7 @@ from ray.rllib.core.distribution.distribution import Distribution from ray.rllib.models.preprocessors import get_preprocessor, Preprocessor from ray.rllib.models.utils import get_filter_config -from ray.rllib.utils.deprecation import deprecation_warning, DEPRECATED_VALUE +from ray._common.deprecation import deprecation_warning, DEPRECATED_VALUE from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.spaces.simplex import Simplex from ray.rllib.utils.spaces.space_utils import flatten_space diff --git a/rllib/core/models/specs/specs_base.py b/rllib/core/models/specs/specs_base.py index 722267b3dc6d..2274fdd73641 100644 --- a/rllib/core/models/specs/specs_base.py +++ b/rllib/core/models/specs/specs_base.py @@ -1,4 +1,4 @@ -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated @Deprecated( diff --git a/rllib/core/models/specs/specs_dict.py b/rllib/core/models/specs/specs_dict.py index 7d944688eb0e..9c60b46fe67d 100644 --- a/rllib/core/models/specs/specs_dict.py +++ b/rllib/core/models/specs/specs_dict.py @@ -1,4 +1,4 @@ 
-from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated @Deprecated( diff --git a/rllib/core/rl_module/multi_rl_module.py b/rllib/core/rl_module/multi_rl_module.py index c444f411b45a..5ba41d10931c 100644 --- a/rllib/core/rl_module/multi_rl_module.py +++ b/rllib/core/rl_module/multi_rl_module.py @@ -27,7 +27,7 @@ OverrideToImplementCustomLogic, ) from ray.rllib.utils.checkpoints import Checkpointable -from ray.rllib.utils.deprecation import ( +from ray._common.deprecation import ( Deprecated, DEPRECATED_VALUE, deprecation_warning, diff --git a/rllib/core/rl_module/rl_module.py b/rllib/core/rl_module/rl_module.py index 644ecae02a46..eeb75a1cd680 100644 --- a/rllib/core/rl_module/rl_module.py +++ b/rllib/core/rl_module/rl_module.py @@ -14,7 +14,7 @@ OverrideToImplementCustomLogic, ) from ray.rllib.utils.checkpoints import Checkpointable -from ray.rllib.utils.deprecation import ( +from ray._common.deprecation import ( Deprecated, DEPRECATED_VALUE, deprecation_warning, @@ -269,6 +269,8 @@ class RLModule(Checkpointable, abc.ABC): DefaultPPOTorchRLModule ) from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog + from ray.rllib.core.rl_module.rl_module import RLModuleSpec + from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig import gymnasium as gym import torch @@ -300,6 +302,12 @@ class RLModule(Checkpointable, abc.ABC): .. testcode:: + from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import ( + PPOTorchRLModule + ) + from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog + from ray.rllib.core.rl_module.rl_module import RLModuleSpec + from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig import gymnasium as gym import torch @@ -327,6 +335,12 @@ class RLModule(Checkpointable, abc.ABC): .. 
testcode:: + from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import ( + PPOTorchRLModule + ) + from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog + from ray.rllib.core.rl_module.rl_module import RLModuleSpec + from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig import gymnasium as gym import torch diff --git a/rllib/env/__init__.py b/rllib/env/__init__.py index 2e48374d784c..ca9de7949565 100644 --- a/rllib/env/__init__.py +++ b/rllib/env/__init__.py @@ -4,7 +4,6 @@ from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.env.policy_client import PolicyClient -from ray.rllib.env.policy_server_input import PolicyServerInput from ray.rllib.env.remote_base_env import RemoteBaseEnv from ray.rllib.env.vector_env import VectorEnv @@ -31,7 +30,6 @@ "PettingZooEnv", "ParallelPettingZooEnv", "PolicyClient", - "PolicyServerInput", "RemoteBaseEnv", "Unity3DEnv", "VectorEnv", diff --git a/rllib/env/env_errors.py b/rllib/env/env_errors.py new file mode 100644 index 000000000000..cb52892db1da --- /dev/null +++ b/rllib/env/env_errors.py @@ -0,0 +1,18 @@ +"""Error classes for RLlib environment operations.""" + +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class StepFailedRecreateEnvError(Exception): + """An exception that signals that the environment step failed and the environment needs to be reset. + + This exception may be raised by the environment's `step` method. + It is then caught by the `EnvRunner` and the environment is reset. + This can be useful if your environment is unstable, regularely crashing in a certain way. + For example, if you connect to an external simulator that you have little control over. + You can detect such crashes in your step method and throw this error to not log the error. + Use this with caution, as it may lead to infinite loops of resetting the environment. 
+ """ + + pass diff --git a/rllib/env/env_runner.py b/rllib/env/env_runner.py index 7d3a7a9d488e..6da4fcaf68e0 100644 --- a/rllib/env/env_runner.py +++ b/rllib/env/env_runner.py @@ -7,6 +7,7 @@ import ray from ray.rllib.core import COMPONENT_RL_MODULE +from ray.rllib.env.env_errors import StepFailedRecreateEnvError from ray.rllib.utils.actor_manager import FaultAwareApply from ray.rllib.utils.debug import update_global_seed_if_necessary from ray.rllib.utils.framework import try_import_tf @@ -25,6 +26,7 @@ ENV_RESET_FAILURE = "env_reset_failure" ENV_STEP_FAILURE = "env_step_failure" +NUM_ENV_STEP_FAILURES_LIFETIME = "num_env_step_failures" # TODO (sven): As soon as RolloutWorker is no longer supported, make this base class @@ -232,11 +234,11 @@ def _try_env_step(self, actions): results = self.env.step(actions) return results except Exception as e: + self.metrics.log_value(NUM_ENV_STEP_FAILURES_LIFETIME, 1, reduce="sum") + if self.config.restart_failed_sub_environments: - logger.exception( - "Stepping the env resulted in an error! The original error " - f"is: {e.args[0]}" - ) + if not isinstance(e, StepFailedRecreateEnvError): + logger.exception("Stepping the env resulted in an error!") # Recreate the env. self.make_env() # And return that the stepping failed. The caller will then handle @@ -244,6 +246,10 @@ def _try_env_step(self, actions): # data and repeating the step attempt). return ENV_STEP_FAILURE else: + if isinstance(e, StepFailedRecreateEnvError): + raise ValueError( + "Environment raised StepFailedRecreateEnvError but config.restart_failed_sub_environments is False." 
+ ) from e raise e def _convert_to_tensor(self, struct) -> TensorType: diff --git a/rllib/env/env_runner_group.py b/rllib/env/env_runner_group.py index 7d49910598dc..6974c1d30187 100644 --- a/rllib/env/env_runner_group.py +++ b/rllib/env/env_runner_group.py @@ -37,7 +37,7 @@ from ray.rllib.policy.policy import Policy, PolicyState from ray.rllib.utils.actor_manager import FaultTolerantActorManager from ray.rllib.utils.annotations import OldAPIStack -from ray.rllib.utils.deprecation import ( +from ray._common.deprecation import ( Deprecated, deprecation_warning, DEPRECATED_VALUE, diff --git a/rllib/env/external/__init__.py b/rllib/env/external/__init__.py new file mode 100644 index 000000000000..343adb18b3c5 --- /dev/null +++ b/rllib/env/external/__init__.py @@ -0,0 +1,12 @@ +from ray.rllib.env.external.rllink import ( + get_rllink_message, + send_rllink_message, + RLlink, +) + + +__all__ = [ + "get_rllink_message", + "send_rllink_message", + "RLlink", +] diff --git a/rllib/env/external/env_runner_server_for_external_inference.py b/rllib/env/external/env_runner_server_for_external_inference.py new file mode 100644 index 000000000000..36bb2723c27b --- /dev/null +++ b/rllib/env/external/env_runner_server_for_external_inference.py @@ -0,0 +1,368 @@ +from collections import defaultdict +import pickle +import socket +import threading +import time +from typing import Collection, DefaultDict, List, Optional, Union + +from ray.rllib.core import ( + COMPONENT_RL_MODULE, + DEFAULT_AGENT_ID, + DEFAULT_MODULE_ID, +) +from ray.rllib.env import INPUT_ENV_SPACES +from ray.rllib.env.env_runner import EnvRunner +from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner +from ray.rllib.env.single_agent_episode import SingleAgentEpisode +from ray.rllib.env.external.rllink import ( + get_rllink_message, + send_rllink_message, + RLlink, +) +from ray.rllib.utils.annotations import override +from ray.rllib.utils.checkpoints import Checkpointable +from ray.rllib.utils.framework 
import try_import_torch +from ray.rllib.utils.metrics import ( + EPISODE_DURATION_SEC_MEAN, + EPISODE_LEN_MAX, + EPISODE_LEN_MEAN, + EPISODE_LEN_MIN, + EPISODE_RETURN_MAX, + EPISODE_RETURN_MEAN, + EPISODE_RETURN_MIN, + WEIGHTS_SEQ_NO, +) +from ray.rllib.utils.metrics.metrics_logger import MetricsLogger +from ray.rllib.utils.typing import EpisodeID, StateDict +from ray.util.annotations import DeveloperAPI + +torch, _ = try_import_torch() + + +@DeveloperAPI +class EnvRunnerServerForExternalInference(EnvRunner, Checkpointable): + """An EnvRunner communicating with an external env through a TCP socket. + + This implementation assumes: + - Only one external client ever connects to this env runner. + - The external client owns the connector pipelines (env-to-module and module-to-env) + as well as the RLModule and thus performs inference locally. Samples are sent in + bulk as lists of RLlib episodes once a certain number of timesteps has been executed + on the client's side. + - A copy of the RLModule is kept at all times on this EnvRunner, but is never used + for inference, only as a weights container. + TODO (sven): The above might be inefficient as we have to store basically two + models, one in this EnvRunner, one in the env (as ONNX). + - As a consequence, there are no environment and no connectors on this env runner. + The external env is responsible for generating all the data to create episodes. + """ + + @override(EnvRunner) + def __init__(self, *, config, **kwargs): + """ + Initializes an EnvRunnerServerForExternalInference instance. + + Args: + config: The AlgorithmConfig to use for setup. + + Keyword Args: + port: The base port number. The server socket is then actually bound to + `port` + self.worker_index. + """ + super().__init__(config=config, **kwargs) + + self.worker_index: int = kwargs.get("worker_index", 0) + + self._weights_seq_no = 0 + + # Build the module from its spec. 
+ module_spec = self.config.get_rl_module_spec( + spaces=self.get_spaces(), inference_only=True + ) + self.module = module_spec.build() + + self.host = "localhost" + self.port = int(self.config.env_config.get("port", 5555)) + self.worker_index + self.server_socket = None + self.client_socket = None + self.address = None + + self.metrics = MetricsLogger() + + self._episode_chunks_to_return: Optional[List[SingleAgentEpisode]] = None + self._done_episodes_for_metrics: List[SingleAgentEpisode] = [] + self._ongoing_episodes_for_metrics: DefaultDict[ + EpisodeID, List[SingleAgentEpisode] + ] = defaultdict(list) + + self._sample_lock = threading.Lock() + self._on_policy_lock = threading.Lock() + self._blocked_on_state = False + + # Start a background thread for client communication. + self.thread = threading.Thread( + target=self._client_message_listener, daemon=True + ) + self.thread.start() + + @override(EnvRunner) + def assert_healthy(self): + """Checks that the server socket is open and listening.""" + assert ( + self.server_socket is not None + ), "Server socket is None (not connected, not listening)." 
+ + @override(EnvRunner) + def sample(self, **kwargs): + """Waits for the client to send episodes.""" + while True: + with self._sample_lock: + if self._episode_chunks_to_return is not None: + num_env_steps = 0 + num_episodes_completed = 0 + for eps in self._episode_chunks_to_return: + if eps.is_done: + self._done_episodes_for_metrics.append(eps) + num_episodes_completed += 1 + else: + self._ongoing_episodes_for_metrics[eps.id_].append(eps) + num_env_steps += len(eps) + + ret = self._episode_chunks_to_return + self._episode_chunks_to_return = None + + SingleAgentEnvRunner._increase_sampled_metrics( + self, num_env_steps, num_episodes_completed + ) + + return ret + time.sleep(0.01) + + @override(EnvRunner) + def get_metrics(self): + # TODO (sven): We should probably make this a utility function to be called + # from within Single/MultiAgentEnvRunner and other EnvRunner subclasses, as + # needed. + # Compute per-episode metrics (only on already completed episodes). + for eps in self._done_episodes_for_metrics: + assert eps.is_done + episode_length = len(eps) + episode_return = eps.get_return() + episode_duration_s = eps.get_duration_s() + # Don't forget about the already returned chunks of this episode. + if eps.id_ in self._ongoing_episodes_for_metrics: + for eps2 in self._ongoing_episodes_for_metrics[eps.id_]: + episode_length += len(eps2) + episode_return += eps2.get_return() + episode_duration_s += eps2.get_duration_s() + del self._ongoing_episodes_for_metrics[eps.id_] + + self._log_episode_metrics( + episode_length, episode_return, episode_duration_s + ) + + # Now that we have logged everything, clear cache of done episodes. + self._done_episodes_for_metrics.clear() + + # Return reduced metrics. 
+ return self.metrics.reduce() + + def get_spaces(self): + return { + INPUT_ENV_SPACES: (self.config.observation_space, self.config.action_space), + DEFAULT_MODULE_ID: ( + self.config.observation_space, + self.config.action_space, + ), + } + + @override(EnvRunner) + def stop(self): + """Closes the client and server sockets.""" + self._close_sockets_if_necessary() + + @override(Checkpointable) + def get_ctor_args_and_kwargs(self): + return ( + (), # *args + {"config": self.config}, # **kwargs + ) + + @override(Checkpointable) + def get_checkpointable_components(self): + return [ + (COMPONENT_RL_MODULE, self.module), + ] + + @override(Checkpointable) + def get_state( + self, + components: Optional[Union[str, Collection[str]]] = None, + *, + not_components: Optional[Union[str, Collection[str]]] = None, + **kwargs, + ) -> StateDict: + return { + COMPONENT_RL_MODULE: self.module.get_state(), + WEIGHTS_SEQ_NO: self._weights_seq_no, + } + + @override(Checkpointable) + def set_state(self, state: StateDict) -> None: + # Update the RLModule state. + if COMPONENT_RL_MODULE in state: + # A missing value for WEIGHTS_SEQ_NO or a value of 0 means: Force the + # update. + weights_seq_no = state.get(WEIGHTS_SEQ_NO, 0) + + # Only update the weigths, if this is the first synchronization or + # if the weights of this `EnvRunner` lacks behind the actual ones. + if weights_seq_no == 0 or self._weights_seq_no < weights_seq_no: + rl_module_state = state[COMPONENT_RL_MODULE] + if ( + isinstance(rl_module_state, dict) + and DEFAULT_MODULE_ID in rl_module_state + ): + rl_module_state = rl_module_state[DEFAULT_MODULE_ID] + self.module.set_state(rl_module_state) + + # Update our weights_seq_no, if the new one is > 0. 
+ if weights_seq_no > 0: + self._weights_seq_no = weights_seq_no + + if self._blocked_on_state is True: + self._send_set_state_message() + self._blocked_on_state = False + + def _client_message_listener(self): + """Entry point for the listener thread.""" + + # Set up the server socket and bind to the specified host and port. + self._recycle_sockets() + + # Enter an endless message receival- and processing loop. + while True: + # As long as we are blocked on a new state, sleep a bit and continue. + # Do NOT process any incoming messages (until we send out the new state + # back to the client). + if self._blocked_on_state is True: + time.sleep(0.01) + continue + + try: + # Blocking call to get next message. + msg_type, msg_body = get_rllink_message(self.client_socket) + + # Process the message received based on its type. + # Initial handshake. + if msg_type == RLlink.PING: + self._send_pong_message() + + # Episode data from the client. + elif msg_type in [ + RLlink.EPISODES, + RLlink.EPISODES_AND_GET_STATE, + ]: + self._process_episodes_message(msg_type, msg_body) + + # Client requests the state (model weights). + elif msg_type == RLlink.GET_STATE: + self._send_set_state_message() + + # Clients requests config information. + elif msg_type == RLlink.GET_CONFIG: + self._send_set_config_message() + + except ConnectionError as e: + print(f"Messaging/connection error {e}! Recycling sockets ...") + self._recycle_sockets(5.0) + continue + + def _recycle_sockets(self, sleep: float = 0.0): + # Close all old sockets, if they exist. + self._close_sockets_if_necessary() + + time.sleep(sleep) + + # Start listening on the configured port. + self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + # Allow reuse of the address. + self.server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + self.server_socket.bind((self.host, self.port)) + # Listen for a single connection. 
+ self.server_socket.listen(1) + print(f"Waiting for client to connect to port {self.port}...") + + self.client_socket, self.address = self.server_socket.accept() + print(f"Connected to client at {self.address}") + + def _close_sockets_if_necessary(self): + if self.client_socket: + self.client_socket.close() + if self.server_socket: + self.server_socket.close() + + def _send_pong_message(self): + send_rllink_message(self.client_socket, {"type": RLlink.PONG.name}) + + def _process_episodes_message(self, msg_type, msg_body): + # On-policy training -> we have to block until we get a new `set_state` call + # (b/c the learning step is done and we can send new weights back to all + # clients). + if msg_type == RLlink.EPISODES_AND_GET_STATE: + self._blocked_on_state = True + + episodes = [] + for episode_state in msg_body["episodes"]: + episode = SingleAgentEpisode.from_state(episode_state) + episodes.append(episode.to_numpy()) + + # Push episodes into the to-be-returned list (for `sample()` requests). + with self._sample_lock: + if isinstance(self._episode_chunks_to_return, list): + self._episode_chunks_to_return.extend(episodes) + else: + self._episode_chunks_to_return = episodes + + def _send_set_state_message(self): + send_rllink_message( + self.client_socket, + { + "type": RLlink.SET_STATE.name, + "state": self.get_state(inference_only=True), + }, + ) + + def _send_set_config_message(self): + send_rllink_message( + self.client_socket, + { + "type": RLlink.SET_CONFIG.name, + # TODO (sven): We need AlgorithmConfig to be a `Checkpointable` with a + # msgpack'able state. + "config": pickle.dumps(self.config), + }, + ) + + def _log_episode_metrics(self, length, ret, sec): + # Log general episode metrics. + # To mimic the old API stack behavior, we'll use `window` here for + # these particular stats (instead of the default EMA). 
+ win = self.config.metrics_num_episodes_for_smoothing + self.metrics.log_value(EPISODE_LEN_MEAN, length, window=win) + self.metrics.log_value(EPISODE_RETURN_MEAN, ret, window=win) + self.metrics.log_value(EPISODE_DURATION_SEC_MEAN, sec, window=win) + # Per-agent returns. + self.metrics.log_value( + ("agent_episode_returns_mean", DEFAULT_AGENT_ID), ret, window=win + ) + # Per-RLModule returns. + self.metrics.log_value( + ("module_episode_returns_mean", DEFAULT_MODULE_ID), ret, window=win + ) + + # For some metrics, log min/max as well. + self.metrics.log_value(EPISODE_LEN_MIN, length, reduce="min", window=win) + self.metrics.log_value(EPISODE_RETURN_MIN, ret, reduce="min", window=win) + self.metrics.log_value(EPISODE_LEN_MAX, length, reduce="max", window=win) + self.metrics.log_value(EPISODE_RETURN_MAX, ret, reduce="max", window=win) diff --git a/rllib/env/external/rllink.py b/rllib/env/external/rllink.py new file mode 100644 index 000000000000..dfb72bda97b6 --- /dev/null +++ b/rllib/env/external/rllink.py @@ -0,0 +1,109 @@ +from enum import Enum +from packaging.version import Version + +from ray.rllib.utils.checkpoints import try_import_msgpack +from ray.util.annotations import DeveloperAPI + + +msgpack = None + + +@DeveloperAPI +class RLlink(Enum): + PROTOCOL_VERSION = Version("0.0.1") + + # Requests: Client (external env) -> Server (RLlib). + # ---- + # Ping command (initial handshake). + PING = "PING" + # List of episodes (similar to what an EnvRunner.sample() call would return). + EPISODES = "EPISODES" + # Request state (e.g. model weights). + GET_STATE = "GET_STATE" + # Request Algorithm config. + GET_CONFIG = "GET_CONFIG" + # Send episodes and request the next state update right after that. + # Clients sending this message should wait for a SET_STATE message as an immediate + # response. Useful for external samplers that must collect on-policy data. + EPISODES_AND_GET_STATE = "EPISODES_AND_GET_STATE" + + # Responses: Server (RLlib) -> Client (external env). 
+ # ---- + # Pong response (initial handshake). + PONG = "PONG" + # Set state (e.g. model weights). + SET_STATE = "SET_STATE" + # Set Algorithm config. + SET_CONFIG = "SET_CONFIG" + + # @OldAPIStack (to be deprecated soon). + ACTION_SPACE = "ACTION_SPACE" + OBSERVATION_SPACE = "OBSERVATION_SPACE" + GET_WORKER_ARGS = "GET_WORKER_ARGS" + GET_WEIGHTS = "GET_WEIGHTS" + REPORT_SAMPLES = "REPORT_SAMPLES" + START_EPISODE = "START_EPISODE" + GET_ACTION = "GET_ACTION" + LOG_ACTION = "LOG_ACTION" + LOG_RETURNS = "LOG_RETURNS" + END_EPISODE = "END_EPISODE" + + def __str__(self): + return self.name + + +@DeveloperAPI +def send_rllink_message(sock_, message: dict): + """Sends a message to the client with a length header.""" + global msgpack + if msgpack is None: + msgpack = try_import_msgpack(error=True) + + body = msgpack.packb(message, use_bin_type=True) # .encode("utf-8") + header = str(len(body)).zfill(8).encode("utf-8") + try: + sock_.sendall(header + body) + except Exception as e: + raise ConnectionError( + f"Error sending message {message} to server on socket {sock_}! " + f"Original error was: {e}" + ) + + +@DeveloperAPI +def get_rllink_message(sock_): + """Receives a message from the client following the length-header protocol.""" + global msgpack + if msgpack is None: + msgpack = try_import_msgpack(error=True) + + try: + # Read the length header (8 bytes) + header = _get_num_bytes(sock_, 8) + msg_length = int(header.decode("utf-8")) + # Read the message body + body = _get_num_bytes(sock_, msg_length) + # Decode JSON. + message = msgpack.unpackb(body, raw=False) # .loads(body.decode("utf-8")) + # Check for proper protocol. + if "type" not in message: + raise ConnectionError( + "Protocol Error! Message from peer does not contain `type` field." + ) + return RLlink(message.pop("type")), message + except Exception as e: + raise ConnectionError( + f"Error receiving message from peer on socket {sock_}! 
" + f"Original error was: {e}" + ) + + +def _get_num_bytes(sock_, num_bytes): + """Helper function to receive a specific number of bytes.""" + data = b"" + while len(data) < num_bytes: + packet = sock_.recv(num_bytes - len(data)) + if not packet: + raise ConnectionError(f"No data received from socket {sock_}!") + data += packet + return data diff --git a/rllib/env/external_env.py b/rllib/env/external_env.py index 41eb89d6c471..783ae256cb99 100644 --- a/rllib/env/external_env.py +++ b/rllib/env/external_env.py @@ -13,7 +13,7 @@ EnvType, MultiEnvDict, ) -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning if TYPE_CHECKING: from ray.rllib.models.preprocessors import Preprocessor diff --git a/rllib/env/multi_agent_env.py b/rllib/env/multi_agent_env.py index 843169306dce..a09f2cd93f97 100644 --- a/rllib/env/multi_agent_env.py +++ b/rllib/env/multi_agent_env.py @@ -8,7 +8,7 @@ from ray.rllib.env.base_env import BaseEnv from ray.rllib.env.env_context import EnvContext from ray.rllib.utils.annotations import OldAPIStack, override -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.typing import ( AgentID, EnvCreator, diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py index 2610d1f1ba2d..48d7e1a6a9af 100644 --- a/rllib/env/multi_agent_env_runner.py +++ b/rllib/env/multi_agent_env_runner.py @@ -28,7 +28,7 @@ from ray.rllib.utils import force_list from ray.rllib.utils.annotations import override from ray.rllib.utils.checkpoints import Checkpointable -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.framework import get_device, try_import_torch from ray.rllib.utils.metrics import ( ENV_TO_MODULE_CONNECTOR, diff --git a/rllib/env/multi_agent_episode.py b/rllib/env/multi_agent_episode.py index 76b078ef69ff..3e21bac0cb4e 100644 --- 
a/rllib/env/multi_agent_episode.py +++ b/rllib/env/multi_agent_episode.py @@ -20,7 +20,7 @@ from ray.rllib.env.utils.infinite_lookback_buffer import InfiniteLookbackBuffer from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils import force_list -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.error import MultiAgentEnvError from ray.rllib.utils.spaces.space_utils import batch from ray.rllib.utils.typing import AgentID, ModuleID, MultiAgentDict diff --git a/rllib/env/policy_client.py b/rllib/env/policy_client.py index 2f3791226077..e4e2e9ad8a62 100644 --- a/rllib/env/policy_client.py +++ b/rllib/env/policy_client.py @@ -1,9 +1,3 @@ -"""REST client to interact with a policy server. - -This client supports both local and remote policy inference modes. Local -inference is faster but causes more compute to be done on the client. -""" - import logging import threading import time @@ -23,7 +17,7 @@ ) # Backward compatibility. -from ray.rllib.env.utils.external_env_protocol import RLlink as Commands +from ray.rllib.env.external.rllink import RLlink as Commands logger = logging.getLogger(__name__) @@ -48,20 +42,6 @@ def __init__( update_interval: float = 10.0, session: Optional[requests.Session] = None, ): - """Create a PolicyClient instance. - - Args: - address: Server to connect to (e.g., "localhost:9090"). - inference_mode: Whether to use 'local' or 'remote' policy - inference for computing actions. - update_interval (float or None): If using 'local' inference mode, - the policy is refreshed after this many seconds have passed, - or None for manual control via client. - session (requests.Session or None): If available the session object - is used to communicate with the policy server. Using a session - can lead to speedups as connections are reused. It is the - responsibility of the creator of the session to close it. 
- """ self.address = address self.session = session self.env: ExternalEnv = None @@ -76,18 +56,6 @@ def __init__( def start_episode( self, episode_id: Optional[str] = None, training_enabled: bool = True ) -> str: - """Record the start of one or more episode(s). - - Args: - episode_id (Optional[str]): Unique string id for the episode or - None for it to be auto-assigned. - training_enabled: Whether to use experiences for this - episode to improve the policy. - - Returns: - episode_id: Unique string id for the episode. - """ - if self.local: self._update_local_policy() return self.env.start_episode(episode_id, training_enabled) @@ -103,16 +71,6 @@ def start_episode( def get_action( self, episode_id: str, observation: Union[EnvObsType, MultiAgentDict] ) -> Union[EnvActionType, MultiAgentDict]: - """Record an observation and get the on-policy action. - - Args: - episode_id: Episode id returned from start_episode(). - observation: Current environment observation. - - Returns: - action: Action from the env action space. - """ - if self.local: self._update_local_policy() if isinstance(episode_id, (list, tuple)): @@ -138,14 +96,6 @@ def log_action( observation: Union[EnvObsType, MultiAgentDict], action: Union[EnvActionType, MultiAgentDict], ) -> None: - """Record an observation and (off-policy) action taken. - - Args: - episode_id: Episode id returned from start_episode(). - observation: Current environment observation. - action: Action for the observation. - """ - if self.local: self._update_local_policy() return self.env.log_action(episode_id, observation, action) @@ -166,19 +116,6 @@ def log_returns( info: Union[EnvInfoDict, MultiAgentDict] = None, multiagent_done_dict: Optional[MultiAgentDict] = None, ) -> None: - """Record returns from the environment. - - The reward will be attributed to the previous action taken by the - episode. Rewards accumulate until the next action. If no reward is - logged before the next action, a reward of 0.0 is assumed. 
- - Args: - episode_id: Episode id returned from start_episode(). - reward: Reward from the environment. - info: Extra info dict. - multiagent_done_dict: Multi-agent done information. - """ - if self.local: self._update_local_policy() if multiagent_done_dict is not None: @@ -201,13 +138,6 @@ def log_returns( def end_episode( self, episode_id: str, observation: Union[EnvObsType, MultiAgentDict] ) -> None: - """Record the end of an episode. - - Args: - episode_id: Episode id returned from start_episode(). - observation: Current environment observation. - """ - if self.local: self._update_local_policy() return self.env.end_episode(episode_id, observation) @@ -276,9 +206,8 @@ def _update_local_policy(self, force=False): self.last_updated = time.time() +@OldAPIStack class _LocalInferenceThread(threading.Thread): - """Thread that handles experience generation (worker.sample() loop).""" - def __init__(self, rollout_worker, send_fn): super().__init__() self.daemon = True @@ -313,13 +242,8 @@ def run(self): logger.error("Error: inference worker thread died!", e) +@OldAPIStack def _auto_wrap_external(real_env_creator): - """Wrap an environment in the ExternalEnv interface if needed. - - Args: - real_env_creator: Create an env given the env_config. - """ - def wrapped_creator(env_config): real_env = real_env_creator(env_config) if not isinstance(real_env, (ExternalEnv, ExternalMultiAgentEnv)): @@ -352,14 +276,8 @@ def run(self): return wrapped_creator +@OldAPIStack def _create_embedded_rollout_worker(kwargs, send_fn): - """Create a local rollout worker and a thread that samples from it. - - Args: - kwargs: Args for the RolloutWorker constructor. - send_fn: Function to send a JSON request to the server. - """ - # Since the server acts as an input datasource, we have to reset the # input config to the default, which runs env rollouts. 
kwargs = kwargs.copy() diff --git a/rllib/env/policy_server_input.py b/rllib/env/policy_server_input.py deleted file mode 100644 index 70bc2d130757..000000000000 --- a/rllib/env/policy_server_input.py +++ /dev/null @@ -1,343 +0,0 @@ -from collections import deque -from http.server import HTTPServer, SimpleHTTPRequestHandler -import logging -import queue -from socketserver import ThreadingMixIn -import threading -import time -import traceback - -from typing import List -import ray.cloudpickle as pickle -from ray.rllib.env.policy_client import ( - _create_embedded_rollout_worker, - Commands, -) -from ray.rllib.offline.input_reader import InputReader -from ray.rllib.offline.io_context import IOContext -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.annotations import override, PublicAPI -from ray.rllib.evaluation.metrics import RolloutMetrics -from ray.rllib.evaluation.sampler import SamplerInput -from ray.rllib.utils.typing import SampleBatchType -from ray._common.network_utils import build_address - -logger = logging.getLogger(__name__) - - -@PublicAPI -class PolicyServerInput(ThreadingMixIn, HTTPServer, InputReader): - """REST policy server that acts as an offline data source. - - This launches a multi-threaded server that listens on the specified host - and port to serve policy requests and forward experiences to RLlib. For - high performance experience collection, it implements InputReader. - - For an example, run `examples/envs/external_envs/cartpole_server.py` along - with `examples/envs/external_envs/cartpole_client.py --inference-mode=local|remote`. - - WARNING: This class is not meant to be publicly exposed. Anyone that can - communicate with this server can execute arbitary code on the machine. Use - this with caution, in isolated environments, and at your own risk. - - .. 
testcode:: - :skipif: True - - import gymnasium as gym - from ray.rllib.algorithms.ppo import PPOConfig - from ray.rllib.env.policy_client import PolicyClient - from ray.rllib.env.policy_server_input import PolicyServerInput - addr, port = ... - config = ( - PPOConfig() - .api_stack( - enable_rl_module_and_learner=False, - enable_env_runner_and_connector_v2=False, - ) - .environment("CartPole-v1") - .offline_data( - input_=lambda ioctx: PolicyServerInput(ioctx, addr, port) - ) - # Run just 1 server (in the Algorithm's EnvRunnerGroup). - .env_runners(num_env_runners=0) - ) - algo = config.build() - while True: - algo.train() - client = PolicyClient( - "localhost:9900", inference_mode="local") - eps_id = client.start_episode() - env = gym.make("CartPole-v1") - obs, info = env.reset() - action = client.get_action(eps_id, obs) - _, reward, _, _, _ = env.step(action) - client.log_returns(eps_id, reward) - client.log_returns(eps_id, reward) - algo.stop() - """ - - @PublicAPI - def __init__( - self, - ioctx: IOContext, - address: str, - port: int, - idle_timeout: float = 3.0, - max_sample_queue_size: int = 20, - ): - """Create a PolicyServerInput. - - This class implements rllib.offline.InputReader, and can be used with - any Algorithm by configuring - - [AlgorithmConfig object] - .env_runners(num_env_runners=0) - .offline_data(input_=lambda ioctx: PolicyServerInput(ioctx, addr, port)) - - Note that by setting num_env_runners: 0, the algorithm will only create one - rollout worker / PolicyServerInput. Clients can connect to the launched - server using rllib.env.PolicyClient. You can increase the number of available - connections (ports) by setting num_env_runners to a larger number. The ports - used will then be `port` + the worker's index. - - Args: - ioctx: IOContext provided by RLlib. - address: Server addr (e.g., "localhost"). - port: Server port (e.g., 9900). - max_queue_size: The maximum size for the sample queue. 
Once full, will - purge (throw away) 50% of all samples, oldest first, and continue. - """ - - self.rollout_worker = ioctx.worker - # Protect ourselves from having a bottleneck on the server (learning) side. - # Once the queue (deque) is full, we throw away 50% (oldest - # samples first) of the samples, warn, and continue. - self.samples_queue = deque(maxlen=max_sample_queue_size) - self.metrics_queue = queue.Queue() - self.idle_timeout = idle_timeout - - # Forwards client-reported metrics directly into the local rollout - # worker. - if self.rollout_worker.sampler is not None: - # This is a bit of a hack since it is patching the get_metrics - # function of the sampler. - - def get_metrics(): - completed = [] - while True: - try: - completed.append(self.metrics_queue.get_nowait()) - except queue.Empty: - break - - return completed - - self.rollout_worker.sampler.get_metrics = get_metrics - else: - # If there is no sampler, act like if there would be one to collect - # metrics from - class MetricsDummySampler(SamplerInput): - """This sampler only maintains a queue to get metrics from.""" - - def __init__(self, metrics_queue): - """Initializes a MetricsDummySampler instance. - - Args: - metrics_queue: A queue of metrics - """ - self.metrics_queue = metrics_queue - - def get_data(self) -> SampleBatchType: - raise NotImplementedError - - def get_extra_batches(self) -> List[SampleBatchType]: - raise NotImplementedError - - def get_metrics(self) -> List[RolloutMetrics]: - """Returns metrics computed on a policy client rollout worker.""" - completed = [] - while True: - try: - completed.append(self.metrics_queue.get_nowait()) - except queue.Empty: - break - return completed - - self.rollout_worker.sampler = MetricsDummySampler(self.metrics_queue) - - # Create a request handler that receives commands from the clients - # and sends data and metrics into the queues. 
- handler = _make_handler( - self.rollout_worker, self.samples_queue, self.metrics_queue - ) - try: - import time - - time.sleep(1) - HTTPServer.__init__(self, (address, port), handler) - except OSError: - print(f"Creating a PolicyServer on {build_address(address, port)} failed!") - import time - - time.sleep(1) - raise - - logger.info( - "Starting connector server at " - f"{build_address(self.server_name, self.server_port)}" - ) - - # Start the serving thread, listening on socket and handling commands. - serving_thread = threading.Thread(name="server", target=self.serve_forever) - serving_thread.daemon = True - serving_thread.start() - - # Start a dummy thread that puts empty SampleBatches on the queue, just - # in case we don't receive anything from clients (or there aren't - # any). The latter would block sample collection entirely otherwise, - # even if other workers' PolicyServerInput receive incoming data from - # actual clients. - heart_beat_thread = threading.Thread( - name="heart-beat", target=self._put_empty_sample_batch_every_n_sec - ) - heart_beat_thread.daemon = True - heart_beat_thread.start() - - @override(InputReader) - def next(self): - # Blocking wait until there is something in the deque. - while len(self.samples_queue) == 0: - time.sleep(0.1) - # Utilize last items first in order to remain as closely as possible - # to operating on-policy. - return self.samples_queue.pop() - - def _put_empty_sample_batch_every_n_sec(self): - # Places an empty SampleBatch every `idle_timeout` seconds onto the - # `samples_queue`. This avoids hanging of all RolloutWorkers parallel - # to this one in case this PolicyServerInput does not have incoming - # data (e.g. no client connected) and the driver algorithm uses parallel - # synchronous sampling (e.g. PPO). - while True: - time.sleep(self.idle_timeout) - self.samples_queue.append(SampleBatch()) - - -def _make_handler(rollout_worker, samples_queue, metrics_queue): - # Only used in remote inference mode. 
We must create a new rollout worker - # then since the original worker doesn't have the env properly wrapped in - # an ExternalEnv interface. - child_rollout_worker = None - inference_thread = None - lock = threading.Lock() - - def setup_child_rollout_worker(): - nonlocal lock - - with lock: - nonlocal child_rollout_worker - nonlocal inference_thread - - if child_rollout_worker is None: - ( - child_rollout_worker, - inference_thread, - ) = _create_embedded_rollout_worker( - rollout_worker.creation_args(), report_data - ) - child_rollout_worker.set_weights(rollout_worker.get_weights()) - - def report_data(data): - nonlocal child_rollout_worker - - batch = data["samples"] - batch.decompress_if_needed() - samples_queue.append(batch) - # Deque is full -> purge 50% (oldest samples) - if len(samples_queue) == samples_queue.maxlen: - logger.warning( - "PolicyServerInput queue is full! Purging half of the samples (oldest)." - ) - for _ in range(samples_queue.maxlen // 2): - samples_queue.popleft() - for rollout_metric in data["metrics"]: - metrics_queue.put(rollout_metric) - - if child_rollout_worker is not None: - child_rollout_worker.set_weights( - rollout_worker.get_weights(), rollout_worker.get_global_vars() - ) - - class Handler(SimpleHTTPRequestHandler): - def __init__(self, *a, **kw): - super().__init__(*a, **kw) - - def do_POST(self): - content_len = int(self.headers.get("Content-Length"), 0) - raw_body = self.rfile.read(content_len) - parsed_input = pickle.loads(raw_body) - try: - response = self.execute_command(parsed_input) - self.send_response(200) - self.end_headers() - self.wfile.write(pickle.dumps(response)) - except Exception: - self.send_error(500, traceback.format_exc()) - - def execute_command(self, args): - command = args["command"] - response = {} - - # Local inference commands: - if command == Commands.GET_WORKER_ARGS: - logger.info("Sending worker creation args to client.") - response["worker_args"] = rollout_worker.creation_args() - elif command == 
Commands.GET_WEIGHTS: - logger.info("Sending worker weights to client.") - response["weights"] = rollout_worker.get_weights() - response["global_vars"] = rollout_worker.get_global_vars() - elif command == Commands.REPORT_SAMPLES: - logger.info( - "Got sample batch of size {} from client.".format( - args["samples"].count - ) - ) - report_data(args) - - # Remote inference commands: - elif command == Commands.START_EPISODE: - setup_child_rollout_worker() - assert inference_thread.is_alive() - response["episode_id"] = child_rollout_worker.env.start_episode( - args["episode_id"], args["training_enabled"] - ) - elif command == Commands.GET_ACTION: - assert inference_thread.is_alive() - response["action"] = child_rollout_worker.env.get_action( - args["episode_id"], args["observation"] - ) - elif command == Commands.LOG_ACTION: - assert inference_thread.is_alive() - child_rollout_worker.env.log_action( - args["episode_id"], args["observation"], args["action"] - ) - elif command == Commands.LOG_RETURNS: - assert inference_thread.is_alive() - if args["done"]: - child_rollout_worker.env.log_returns( - args["episode_id"], args["reward"], args["info"], args["done"] - ) - else: - child_rollout_worker.env.log_returns( - args["episode_id"], args["reward"], args["info"] - ) - elif command == Commands.END_EPISODE: - assert inference_thread.is_alive() - child_rollout_worker.env.end_episode( - args["episode_id"], args["observation"] - ) - else: - raise ValueError("Unknown command: {}".format(command)) - return response - - return Handler diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py index 05db62932cc3..d032f3a8d245 100644 --- a/rllib/env/single_agent_env_runner.py +++ b/rllib/env/single_agent_env_runner.py @@ -29,7 +29,7 @@ from ray.rllib.utils import force_list from ray.rllib.utils.annotations import override from ray.rllib.utils.checkpoints import Checkpointable -from ray.rllib.utils.deprecation import Deprecated +from 
ray._common.deprecation import Deprecated from ray.rllib.utils.framework import get_device from ray.rllib.utils.metrics import ( ENV_TO_MODULE_CONNECTOR, diff --git a/rllib/env/single_agent_episode.py b/rllib/env/single_agent_episode.py index 7056ff5c43b3..03906ff3d692 100644 --- a/rllib/env/single_agent_episode.py +++ b/rllib/env/single_agent_episode.py @@ -13,7 +13,7 @@ from ray.rllib.env.utils.infinite_lookback_buffer import InfiniteLookbackBuffer from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.serialization import gym_space_from_dict, gym_space_to_dict -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.typing import AgentID, ModuleID from ray.util.annotations import PublicAPI diff --git a/rllib/env/tcp_client_inference_env_runner.py b/rllib/env/tcp_client_inference_env_runner.py index 8aaf29749a28..09f8f4a2e715 100644 --- a/rllib/env/tcp_client_inference_env_runner.py +++ b/rllib/env/tcp_client_inference_env_runner.py @@ -1,589 +1,6 @@ -import base64 -from collections import defaultdict -import gzip -import json -import pathlib -import socket -import tempfile -import threading -import time -from typing import Collection, DefaultDict, List, Optional, Union - -import gymnasium as gym -import numpy as np -import onnxruntime - -from ray.rllib.core import ( - Columns, - COMPONENT_RL_MODULE, - DEFAULT_AGENT_ID, - DEFAULT_MODULE_ID, -) -from ray.rllib.env import INPUT_ENV_SPACES -from ray.rllib.env.env_runner import EnvRunner -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner -from ray.rllib.env.single_agent_episode import SingleAgentEpisode -from ray.rllib.env.utils.external_env_protocol import RLlink as rllink -from ray.rllib.utils.annotations import ExperimentalAPI, override -from ray.rllib.utils.checkpoints import Checkpointable -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.metrics import ( - 
EPISODE_DURATION_SEC_MEAN, - EPISODE_LEN_MAX, - EPISODE_LEN_MEAN, - EPISODE_LEN_MIN, - EPISODE_RETURN_MAX, - EPISODE_RETURN_MEAN, - EPISODE_RETURN_MIN, - WEIGHTS_SEQ_NO, +from ray.rllib.env.external.env_runner_server_for_external_inference import ( + EnvRunnerServerForExternalInference, ) -from ray.rllib.utils.metrics.metrics_logger import MetricsLogger -from ray.rllib.utils.numpy import softmax -from ray.rllib.utils.typing import EpisodeID, StateDict - -torch, _ = try_import_torch() - - -@ExperimentalAPI -class TcpClientInferenceEnvRunner(EnvRunner, Checkpointable): - """An EnvRunner communicating with an external env through a TCP socket. - - This implementation assumes: - - Only one external client ever connects to this env runner. - - The external client performs inference locally through an ONNX model. Thus, - samples are sent in bulk once a certain number of timesteps has been executed on the - client's side (no individual action requests). - - A copy of the RLModule is kept at all times on the env runner, but never used - for inference, only as a data (weights) container. - TODO (sven): The above might be inefficient as we have to store basically two - models, one in this EnvRunner, one in the env (as ONNX). - - There is no environment and no connectors on this env runner. The external env - is responsible for generating all the data to create episodes. - """ - - @override(EnvRunner) - def __init__(self, *, config, **kwargs): - """ - Initializes a TcpClientInferenceEnvRunner instance. - - Args: - config: The AlgorithmConfig to use for setup. - - Keyword Args: - port: The base port number. The server socket is then actually bound to - `port` + self.worker_index. - """ - super().__init__(config=config, **kwargs) - - self.worker_index: int = kwargs.get("worker_index", 0) - - self._weights_seq_no = 0 - - # Build the module from its spec. 
- module_spec = self.config.get_rl_module_spec( - spaces=self.get_spaces(), inference_only=True - ) - self.module = module_spec.build() - - self.host = "localhost" - self.port = int(self.config.env_config.get("port", 5555)) + self.worker_index - self.server_socket = None - self.client_socket = None - self.address = None - - self.metrics = MetricsLogger() - - self._episode_chunks_to_return: Optional[List[SingleAgentEpisode]] = None - self._done_episodes_for_metrics: List[SingleAgentEpisode] = [] - self._ongoing_episodes_for_metrics: DefaultDict[ - EpisodeID, List[SingleAgentEpisode] - ] = defaultdict(list) - - self._sample_lock = threading.Lock() - self._on_policy_lock = threading.Lock() - self._blocked_on_state = False - - # Start a background thread for client communication. - self.thread = threading.Thread( - target=self._client_message_listener, daemon=True - ) - self.thread.start() - - @override(EnvRunner) - def assert_healthy(self): - """Checks that the server socket is open and listening.""" - assert ( - self.server_socket is not None - ), "Server socket is None (not connected, not listening)." 
- - @override(EnvRunner) - def sample(self, **kwargs): - """Waits for the client to send episodes.""" - while True: - with self._sample_lock: - if self._episode_chunks_to_return is not None: - num_env_steps = 0 - num_episodes_completed = 0 - for eps in self._episode_chunks_to_return: - if eps.is_done: - self._done_episodes_for_metrics.append(eps) - num_episodes_completed += 1 - else: - self._ongoing_episodes_for_metrics[eps.id_].append(eps) - num_env_steps += len(eps) - - ret = self._episode_chunks_to_return - self._episode_chunks_to_return = None - - SingleAgentEnvRunner._increase_sampled_metrics( - self, num_env_steps, num_episodes_completed - ) - - return ret - time.sleep(0.01) - - @override(EnvRunner) - def get_metrics(self): - # TODO (sven): We should probably make this a utility function to be called - # from within Single/MultiAgentEnvRunner and other EnvRunner subclasses, as - # needed. - # Compute per-episode metrics (only on already completed episodes). - for eps in self._done_episodes_for_metrics: - assert eps.is_done - episode_length = len(eps) - episode_return = eps.get_return() - episode_duration_s = eps.get_duration_s() - # Don't forget about the already returned chunks of this episode. - if eps.id_ in self._ongoing_episodes_for_metrics: - for eps2 in self._ongoing_episodes_for_metrics[eps.id_]: - episode_length += len(eps2) - episode_return += eps2.get_return() - episode_duration_s += eps2.get_duration_s() - del self._ongoing_episodes_for_metrics[eps.id_] - - self._log_episode_metrics( - episode_length, episode_return, episode_duration_s - ) - - # Now that we have logged everything, clear cache of done episodes. - self._done_episodes_for_metrics.clear() - - # Return reduced metrics. 
- return self.metrics.reduce() - - def get_spaces(self): - return { - INPUT_ENV_SPACES: (self.config.observation_space, self.config.action_space), - DEFAULT_MODULE_ID: ( - self.config.observation_space, - self.config.action_space, - ), - } - - @override(EnvRunner) - def stop(self): - """Closes the client and server sockets.""" - self._close_sockets_if_necessary() - - @override(Checkpointable) - def get_ctor_args_and_kwargs(self): - return ( - (), # *args - {"config": self.config}, # **kwargs - ) - - @override(Checkpointable) - def get_checkpointable_components(self): - return [ - (COMPONENT_RL_MODULE, self.module), - ] - - @override(Checkpointable) - def get_state( - self, - components: Optional[Union[str, Collection[str]]] = None, - *, - not_components: Optional[Union[str, Collection[str]]] = None, - **kwargs, - ) -> StateDict: - return {} - - @override(Checkpointable) - def set_state(self, state: StateDict) -> None: - # Update the RLModule state. - if COMPONENT_RL_MODULE in state: - # A missing value for WEIGHTS_SEQ_NO or a value of 0 means: Force the - # update. - weights_seq_no = state.get(WEIGHTS_SEQ_NO, 0) - - # Only update the weigths, if this is the first synchronization or - # if the weights of this `EnvRunner` lacks behind the actual ones. - if weights_seq_no == 0 or self._weights_seq_no < weights_seq_no: - rl_module_state = state[COMPONENT_RL_MODULE] - if ( - isinstance(rl_module_state, dict) - and DEFAULT_MODULE_ID in rl_module_state - ): - rl_module_state = rl_module_state[DEFAULT_MODULE_ID] - self.module.set_state(rl_module_state) - - # Update our weights_seq_no, if the new one is > 0. - if weights_seq_no > 0: - self._weights_seq_no = weights_seq_no - - if self._blocked_on_state is True: - self._send_set_state_message() - self._blocked_on_state = False - - def _client_message_listener(self): - """Entry point for the listener thread.""" - - # Set up the server socket and bind to the specified host and port. 
- self._recycle_sockets() - - # Enter an endless message receival- and processing loop. - while True: - # As long as we are blocked on a new state, sleep a bit and continue. - # Do NOT process any incoming messages (until we send out the new state - # back to the client). - if self._blocked_on_state is True: - time.sleep(0.01) - continue - - try: - # Blocking call to get next message. - msg_type, msg_body = _get_message(self.client_socket) - - # Process the message received based on its type. - # Initial handshake. - if msg_type == rllink.PING: - self._send_pong_message() - - # Episode data from the client. - elif msg_type in [ - rllink.EPISODES, - rllink.EPISODES_AND_GET_STATE, - ]: - self._process_episodes_message(msg_type, msg_body) - - # Client requests the state (model weights). - elif msg_type == rllink.GET_STATE: - self._send_set_state_message() - - # Clients requests some (relevant) config information. - elif msg_type == rllink.GET_CONFIG: - self._send_set_config_message() - - except ConnectionError as e: - print(f"Messaging/connection error {e}! Recycling sockets ...") - self._recycle_sockets(5.0) - continue - - def _recycle_sockets(self, sleep: float = 0.0): - # Close all old sockets, if they exist. - self._close_sockets_if_necessary() - - time.sleep(sleep) - - # Start listening on the configured port. - self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - # Allow reuse of the address. - self.server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - self.server_socket.bind((self.host, self.port)) - # Listen for a single connection. 
- self.server_socket.listen(1) - print(f"Waiting for client to connect to port {self.port}...") - - self.client_socket, self.address = self.server_socket.accept() - print(f"Connected to client at {self.address}") - - def _close_sockets_if_necessary(self): - if self.client_socket: - self.client_socket.close() - if self.server_socket: - self.server_socket.close() - - def _send_pong_message(self): - _send_message(self.client_socket, {"type": rllink.PONG.name}) - - def _process_episodes_message(self, msg_type, msg_body): - # On-policy training -> we have to block until we get a new `set_state` call - # (b/c the learning step is done and we can sent new weights back to all - # clients). - if msg_type == rllink.EPISODES_AND_GET_STATE: - self._blocked_on_state = True - - episodes = [] - for episode_data in msg_body["episodes"]: - episode = SingleAgentEpisode( - observation_space=self.config.observation_space, - observations=[np.array(o) for o in episode_data[Columns.OBS]], - action_space=self.config.action_space, - actions=episode_data[Columns.ACTIONS], - rewards=episode_data[Columns.REWARDS], - extra_model_outputs={ - Columns.ACTION_DIST_INPUTS: [ - np.array(a) for a in episode_data[Columns.ACTION_DIST_INPUTS] - ], - Columns.ACTION_LOGP: episode_data[Columns.ACTION_LOGP], - }, - terminated=episode_data["is_terminated"], - truncated=episode_data["is_truncated"], - len_lookback_buffer=0, - ) - episodes.append(episode.to_numpy()) - - # Push episodes into the to-be-returned list (for `sample()` requests). 
- with self._sample_lock: - if isinstance(self._episode_chunks_to_return, list): - self._episode_chunks_to_return.extend(episodes) - else: - self._episode_chunks_to_return = episodes - - def _send_set_state_message(self): - with tempfile.TemporaryDirectory() as dir: - onnx_file = pathlib.Path(dir) / "_temp_model.onnx" - torch.onnx.export( - self.module, - { - "batch": { - "obs": torch.randn(1, *self.config.observation_space.shape) - } - }, - onnx_file, - export_params=True, - ) - with open(onnx_file, "rb") as f: - compressed = gzip.compress(f.read()) - onnx_binary = base64.b64encode(compressed).decode("utf-8") - _send_message( - self.client_socket, - { - "type": rllink.SET_STATE.name, - "onnx_file": onnx_binary, - WEIGHTS_SEQ_NO: self._weights_seq_no, - }, - ) - - def _send_set_config_message(self): - _send_message( - self.client_socket, - { - "type": rllink.SET_CONFIG.name, - "env_steps_per_sample": self.config.get_rollout_fragment_length( - worker_index=self.worker_index - ), - "force_on_policy": True, - }, - ) - - def _log_episode_metrics(self, length, ret, sec): - # Log general episode metrics. - # To mimic the old API stack behavior, we'll use `window` here for - # these particular stats (instead of the default EMA). - win = self.config.metrics_num_episodes_for_smoothing - self.metrics.log_value(EPISODE_LEN_MEAN, length, window=win) - self.metrics.log_value(EPISODE_RETURN_MEAN, ret, window=win) - self.metrics.log_value(EPISODE_DURATION_SEC_MEAN, sec, window=win) - # Per-agent returns. - self.metrics.log_value( - ("agent_episode_returns_mean", DEFAULT_AGENT_ID), ret, window=win - ) - # Per-RLModule returns. - self.metrics.log_value( - ("module_episode_returns_mean", DEFAULT_MODULE_ID), ret, window=win - ) - - # For some metrics, log min/max as well. 
- self.metrics.log_value(EPISODE_LEN_MIN, length, reduce="min", window=win) - self.metrics.log_value(EPISODE_RETURN_MIN, ret, reduce="min", window=win) - self.metrics.log_value(EPISODE_LEN_MAX, length, reduce="max", window=win) - self.metrics.log_value(EPISODE_RETURN_MAX, ret, reduce="max", window=win) - - -def _send_message(sock_, message: dict): - """Sends a message to the client with a length header.""" - body = json.dumps(message).encode("utf-8") - header = str(len(body)).zfill(8).encode("utf-8") - try: - sock_.sendall(header + body) - except Exception as e: - raise ConnectionError( - f"Error sending message {message} to server on socket {sock_}! " - f"Original error was: {e}" - ) - - -def _get_message(sock_): - """Receives a message from the client following the length-header protocol.""" - try: - # Read the length header (8 bytes) - header = _get_num_bytes(sock_, 8) - msg_length = int(header.decode("utf-8")) - # Read the message body - body = _get_num_bytes(sock_, msg_length) - # Decode JSON. - message = json.loads(body.decode("utf-8")) - # Check for proper protocol. - if "type" not in message: - raise ConnectionError( - "Protocol Error! Message from peer does not contain `type` field." - ) - return rllink(message.pop("type")), message - except Exception as e: - raise ConnectionError( - f"Error receiving message from peer on socket {sock_}! 
" - f"Original error was: {e}" - ) - - -def _get_num_bytes(sock_, num_bytes): - """Helper function to receive a specific number of bytes.""" - data = b"" - while len(data) < num_bytes: - packet = sock_.recv(num_bytes - len(data)) - if not packet: - raise ConnectionError(f"No data received from socket {sock_}!") - data += packet - return data - - -def _dummy_client(port: int = 5556): - """A dummy client that runs CartPole and acts as a testing external env.""" - - def _set_state(msg_body): - with tempfile.TemporaryDirectory(): - with open("_temp_onnx", "wb") as f: - f.write( - gzip.decompress( - base64.b64decode(msg_body["onnx_file"].encode("utf-8")) - ) - ) - onnx_session = onnxruntime.InferenceSession("_temp_onnx") - output_names = [o.name for o in onnx_session.get_outputs()] - return onnx_session, output_names - - # Connect to server. - while True: - try: - print(f"Trying to connect to localhost:{port} ...") - sock_ = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock_.connect(("localhost", port)) - break - except ConnectionRefusedError: - time.sleep(5) - - # Send ping-pong. - _send_message(sock_, {"type": rllink.PING.name}) - msg_type, msg_body = _get_message(sock_) - assert msg_type == rllink.PONG - - # Request config. - _send_message(sock_, {"type": rllink.GET_CONFIG.name}) - msg_type, msg_body = _get_message(sock_) - assert msg_type == rllink.SET_CONFIG - env_steps_per_sample = msg_body["env_steps_per_sample"] - force_on_policy = msg_body["force_on_policy"] - - # Request ONNX weights. - _send_message(sock_, {"type": rllink.GET_STATE.name}) - msg_type, msg_body = _get_message(sock_) - assert msg_type == rllink.SET_STATE - onnx_session, output_names = _set_state(msg_body) - - # Episode collection buckets. - episodes = [] - observations = [] - actions = [] - action_dist_inputs = [] - action_logps = [] - rewards = [] - - timesteps = 0 - episode_return = 0.0 - - # Start actual env loop. 
- env = gym.make("CartPole-v1") - obs, info = env.reset() - observations.append(obs.tolist()) - - while True: - timesteps += 1 - # Perform action inference using the ONNX model. - logits = onnx_session.run( - output_names, - {"onnx::Gemm_0": np.array([obs], np.float32)}, - )[0][ - 0 - ] # [0]=first return item, [0]=batch size 1 - - # Stochastic sample. - action_probs = softmax(logits) - action = int(np.random.choice(list(range(env.action_space.n)), p=action_probs)) - logp = float(np.log(action_probs[action])) - - # Perform the env step. - obs, reward, terminated, truncated, info = env.step(action) - - # Collect step data. - observations.append(obs.tolist()) - actions.append(action) - action_dist_inputs.append(logits.tolist()) - action_logps.append(logp) - rewards.append(reward) - episode_return += reward - - # We have to create a new episode record. - if timesteps == env_steps_per_sample or terminated or truncated: - episodes.append( - { - Columns.OBS: observations, - Columns.ACTIONS: actions, - Columns.ACTION_DIST_INPUTS: action_dist_inputs, - Columns.ACTION_LOGP: action_logps, - Columns.REWARDS: rewards, - "is_terminated": terminated, - "is_truncated": truncated, - } - ) - # We collected enough samples -> Send them to server. - if timesteps == env_steps_per_sample: - # Make sure the amount of data we collected is correct. - assert sum(len(e["actions"]) for e in episodes) == env_steps_per_sample - - # Send the data to the server. - if force_on_policy: - _send_message( - sock_, - { - "type": rllink.EPISODES_AND_GET_STATE.name, - "episodes": episodes, - "timesteps": timesteps, - }, - ) - # We are forced to sample on-policy. Have to wait for a response - # with the state (weights) in it. - msg_type, msg_body = _get_message(sock_) - assert msg_type == rllink.SET_STATE - onnx_session, output_names = _set_state(msg_body) - - # Sampling doesn't have to be on-policy -> continue collecting - # samples. 
- else: - raise NotImplementedError - - episodes = [] - timesteps = 0 - - # Set new buckets to empty lists (for next episode). - observations = [observations[-1]] - actions = [] - action_dist_inputs = [] - action_logps = [] - rewards = [] - # The episode is done -> Reset. - if terminated or truncated: - obs, _ = env.reset() - observations = [obs.tolist()] - episode_return = 0.0 +# @Deprecated +TcpClientInferenceEnvRunner = EnvRunnerServerForExternalInference diff --git a/rllib/env/tests/test_single_agent_env_runner.py b/rllib/env/tests/test_single_agent_env_runner.py index 4d5f8808aa84..0aac37bb3f83 100644 --- a/rllib/env/tests/test_single_agent_env_runner.py +++ b/rllib/env/tests/test_single_agent_env_runner.py @@ -1,12 +1,14 @@ from functools import partial -import unittest +from unittest.mock import patch +import unittest import gymnasium as gym import ray from ray import tune from ray.rllib.algorithms.algorithm_config import AlgorithmConfig from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner +from ray.rllib.env.env_runner import StepFailedRecreateEnvError from ray.rllib.env.utils import _gym_env_creator from ray.rllib.examples.envs.classes.simple_corridor import SimpleCorridor from ray.rllib.utils.test_utils import check @@ -75,7 +77,7 @@ def test_sample(self): self.assertTrue(sum_ in [128, 129]) def test_async_vector_env(self): - """Tests, whether SingleAgentGymEnvRunner can run with vector envs.""" + """Tests, whether SingleAgentEnvRunner can run with vector envs.""" for env in ["CartPole-v1", SimpleCorridor, "tune-registered"]: config = ( @@ -101,7 +103,7 @@ def test_async_vector_env(self): env_runner.stop() def test_distributed_env_runner(self): - """Tests, whether SingleAgentGymEnvRunner can be distributed.""" + """Tests, whether SingleAgentEnvRunner can be distributed.""" remote_class = ray.remote(num_cpus=1, num_gpus=0)(SingleAgentEnvRunner) @@ -142,6 +144,54 @@ def test_distributed_env_runner(self): ], ) + 
@patch("ray.rllib.env.env_runner.logger") + def test_step_failed_reset_required(self, mock_logger): + """Tests, whether SingleAgentEnvRunner can handle StepFailedResetRequired.""" + # Define an env that raises StepFailedResetRequired + + class ErrorRaisingEnv(gym.Env): + def __init__(self, config=None): + # As per gymnasium standard, provide observation and action spaces in your + # constructor. + self.observation_space = gym.spaces.Discrete(2) + self.action_space = gym.spaces.Discrete(2) + self.exception_type = config["exception_type"] + + def reset(self, *, seed=None, options=None): + return self.observation_space.sample(), {} + + def step(self, action): + raise self.exception_type() + + config = ( + AlgorithmConfig() + .environment( + ErrorRaisingEnv, + env_config={"exception_type": StepFailedRecreateEnvError}, + ) + .env_runners(num_envs_per_env_runner=1, rollout_fragment_length=10) + .fault_tolerance(restart_failed_sub_environments=True) + ) + env_runner = SingleAgentEnvRunner(config=config) + + # Check that we don't log the error on the first step (because we don't raise StepFailedResetRequired) + # We need two steps because the first one naturally raises ResetNeeded because we try to step before the env is reset. + env_runner._try_env_reset() + env_runner._try_env_step(actions=[None]) + + assert mock_logger.exception.call_count == 0 + + config.environment(ErrorRaisingEnv, env_config={"exception_type": ValueError}) + + env_runner = SingleAgentEnvRunner(config=config) + + # Check that we don't log the error on the first step (because we don't raise StepFailedResetRequired) + # We need two steps because the first one naturally raises ResetNeeded because we try to step before the env is reset. 
+ env_runner._try_env_reset() + env_runner._try_env_step(actions=[None]) + + assert mock_logger.exception.call_count == 1 + if __name__ == "__main__": import pytest diff --git a/rllib/env/utils/external_env_protocol.py b/rllib/env/utils/external_env_protocol.py index 0234d273470f..3356a87da30a 100644 --- a/rllib/env/utils/external_env_protocol.py +++ b/rllib/env/utils/external_env_protocol.py @@ -1,45 +1,8 @@ -from enum import Enum - -from ray.util.annotations import PublicAPI - - -@PublicAPI(stability="alpha") -class RLlink(Enum): - # Requests: Client (external env) -> Server (RLlib). - # ---- - # Ping command (initial handshake). - PING = "PING" - # List of episodes (similar to what an EnvRunner.sample() call would return). - EPISODES = "EPISODES" - # Request state (e.g. model weights). - GET_STATE = "GET_STATE" - # Request (relevant) config. - GET_CONFIG = "GET_CONFIG" - # Send episodes and request the next state update right after that. - # Clients sending this message should wait for a SET_STATE message as an immediate - # response. Useful for external samplers that must collect on-policy data. - EPISODES_AND_GET_STATE = "EPISODES_AND_GET_STATE" - - # Responses: Server (RLlib) -> Client (external env). - # ---- - # Pong response (initial handshake). - PONG = "PONG" - # Set state (e.g. model weights). - SET_STATE = "SET_STATE" - # Set (relevant) config. - SET_CONFIG = "SET_CONFIG" - - # @OldAPIStack (to be deprecated soon). 
- ACTION_SPACE = "ACTION_SPACE" - OBSERVATION_SPACE = "OBSERVATION_SPACE" - GET_WORKER_ARGS = "GET_WORKER_ARGS" - GET_WEIGHTS = "GET_WEIGHTS" - REPORT_SAMPLES = "REPORT_SAMPLES" - START_EPISODE = "START_EPISODE" - GET_ACTION = "GET_ACTION" - LOG_ACTION = "LOG_ACTION" - LOG_RETURNS = "LOG_RETURNS" - END_EPISODE = "END_EPISODE" - - def __str__(self): - return self.name +from ray.rllib.env.external.rllink import RLlink # noqa +from ray._common.deprecation import deprecation_warning + +deprecation_warning( + old="ray.rllib.env.utils.external_env_protocol", + new="ray.rllib.env.external.rllink", + error=False, +) diff --git a/rllib/env/utils/infinite_lookback_buffer.py b/rllib/env/utils/infinite_lookback_buffer.py index 26f76fbc31ae..76004f0200fa 100644 --- a/rllib/env/utils/infinite_lookback_buffer.py +++ b/rllib/env/utils/infinite_lookback_buffer.py @@ -13,8 +13,10 @@ get_base_struct_from_space, to_jsonable_if_needed, ) +from ray.util.annotations import DeveloperAPI +@DeveloperAPI class InfiniteLookbackBuffer: @property def space(self): diff --git a/rllib/env/wrappers/unity3d_env.py b/rllib/env/wrappers/unity3d_env.py index 45f0f910af92..82babd666741 100644 --- a/rllib/env/wrappers/unity3d_env.py +++ b/rllib/env/wrappers/unity3d_env.py @@ -7,28 +7,14 @@ from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.policy.policy import PolicySpec -from ray.rllib.utils.annotations import PublicAPI +from ray.rllib.utils.annotations import OldAPIStack from ray.rllib.utils.typing import MultiAgentDict, PolicyID, AgentID logger = logging.getLogger(__name__) -@PublicAPI +@OldAPIStack class Unity3DEnv(MultiAgentEnv): - """A MultiAgentEnv representing a single Unity3D game instance. 
- - For an example on how to use this Env with a running Unity3D editor - or with a compiled game, see: - `rllib/examples/unity3d_env_local.py` - For an example on how to use it inside a Unity game client, which - connects to an RLlib Policy server, see: - `rllib/examples/envs/external_envs/unity3d_[client|server].py` - - Supports all Unity3D (MLAgents) examples, multi- or single-agent and - gets converted automatically into an ExternalMultiAgentEnv, when used - inside an RLlib PolicyClient for cloud/distributed training of Unity games. - """ - # Default base port when connecting directly to the Editor _BASE_PORT_EDITOR = 5004 # Default base port when connecting to a compiled environment @@ -45,25 +31,6 @@ def __init__( timeout_wait: int = 300, episode_horizon: int = 1000, ): - """Initializes a Unity3DEnv object. - - Args: - file_name (Optional[str]): Name of the Unity game binary. - If None, will assume a locally running Unity3D editor - to be used, instead. - port (Optional[int]): Port number to connect to Unity environment. - seed: A random seed value to use for the Unity3D game. - no_graphics: Whether to run the Unity3D simulator in - no-graphics mode. Default: False. - timeout_wait: Time (in seconds) to wait for connection from - the Unity3D instance. - episode_horizon: A hard horizon to abide to. After at most - this many steps (per-agent episode `step()` calls), the - Unity3D game is reset and will start again (finishing the - multi-agent episode that the game represents). - Note: The game itself may contain its own episode length - limits, which are always obeyed (on top of this value here). - """ super().__init__() if file_name is None: @@ -120,24 +87,6 @@ def step( ) -> Tuple[ MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict ]: - """Performs one multi-agent step through the game. - - Args: - action_dict: Multi-agent action dict with: - keys=agent identifier consisting of - [MLagents behavior name, e.g. 
"Goalie?team=1"] + "_" + - [Agent index, a unique MLAgent-assigned index per single agent] - - Returns: - tuple: - - obs: Multi-agent observation dict. - Only those observations for which to get new actions are - returned. - - rewards: Rewards dict matching `obs`. - - dones: Done dict with only an __all__ multi-agent entry in - it. __all__=True, if episode is done for all agents. - - infos: An (empty) info dict. - """ from mlagents_envs.base_env import ActionTuple # Set only the required actions (from the DecisionSteps) in Unity3D. @@ -199,18 +148,6 @@ def reset( return obs, infos def _get_step_results(self): - """Collects those agents' obs/rewards that have to act in next `step`. - - Returns: - Tuple: - obs: Multi-agent observation dict. - Only those observations for which to get new actions are - returned. - rewards: Rewards dict matching `obs`. - dones: Done dict with only an __all__ multi-agent entry in it. - __all__=True, if episode is done for all agents. - infos: An (empty) info dict. 
- """ obs = {} rewards = {} infos = {} diff --git a/rllib/evaluation/sample_batch_builder.py b/rllib/evaluation/sample_batch_builder.py index c4c748fe3bce..e42242e375a2 100644 --- a/rllib/evaluation/sample_batch_builder.py +++ b/rllib/evaluation/sample_batch_builder.py @@ -8,7 +8,7 @@ from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch from ray.rllib.utils.annotations import OldAPIStack from ray.rllib.utils.debug import summarize -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.rllib.utils.typing import PolicyID, AgentID from ray.util.debug import log_once diff --git a/rllib/evaluation/sampler.py b/rllib/evaluation/sampler.py index c6b4ce937e6b..9fb2a3700029 100644 --- a/rllib/evaluation/sampler.py +++ b/rllib/evaluation/sampler.py @@ -19,7 +19,7 @@ from ray.rllib.offline import InputReader from ray.rllib.policy.sample_batch import concat_samples from ray.rllib.utils.annotations import OldAPIStack, override -from ray.rllib.utils.deprecation import deprecation_warning, DEPRECATED_VALUE +from ray._common.deprecation import deprecation_warning, DEPRECATED_VALUE from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import SampleBatchType from ray.util.debug import log_once diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index 0eeea1ea2c8f..1f0beba433c2 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -1,4 +1,4 @@ -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated @Deprecated( diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index 252a468698be..a3a71de6789f 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -148,6 +148,15 @@ def _remote_fn(env_runner, new_task: int): class EnvTaskCallback(RLlibCallback): 
"""Custom callback implementing `on_train_result()` for changing the envs' maps.""" + def on_algorithm_init( + self, + *, + algorithm: "Algorithm", + **kwargs, + ) -> None: + # Set the initial task to 0. + algorithm._counters["current_env_task"] = 0 + def on_train_result( self, *, diff --git a/rllib/examples/curriculum/pong_curriculum_learning.py b/rllib/examples/curriculum/pong_curriculum_learning.py new file mode 100644 index 000000000000..07881d3275d5 --- /dev/null +++ b/rllib/examples/curriculum/pong_curriculum_learning.py @@ -0,0 +1,281 @@ +"""Example of using curriculum learning for Atari Pong by implementing a custom callback. + +This example: + - demonstrates how to define a curriculum for an agent playing gymnasium's Atari + Pong. + - defines a custom callback that gets called once per iteration and - if the agent + performs well enough - increases the task difficulty, i.e. the `frameskip` for all + environments on all EnvRunners (the agent must act now faster). + - also demonstrates how to provide the callback with varying curriculum parameters + (like threshold maps, returns at which the curriculum ends, etc.). + - uses Ray Tune and RLlib to curriculum-learn Atari Pong with a high frameskip. + +We use Atari Pong with a framestack of 4 images (i.e. observation dimensions of 64x64x4) +and start with a frameskip of 1. At a return of 15.0 we increase the frameskip to 2, at +a return of 17.0 to 3, at 19.0 to 4, and the task is solved at a frameskip of 21.0. + +How to run this script +---------------------- +`python [script file name].py` + +Use the `--solved-return` flag to define the threshold at which curriculum learning ends. +Note that a PPO agent on Atari Pong will need a long time to learn. + +To ensure the agent has not collapsed, but rather made had a bad seed, we only decrease +the frameskip when the agent performed worse than the next lower threshold. 
The margin by +which the agent has to be worse is defined by the `--demotion-margin` argument and defaults +to 2.0. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +""" + +import functools +import gymnasium as gym +from typing import Callable + +from ray import tune +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.callbacks.callbacks import RLlibCallback +from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule +from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack +from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN +from ray.rllib.utils.test_utils import add_rllib_example_script_args + + +parser = add_rllib_example_script_args( + default_reward=float("inf"), + default_timesteps=3000000, + default_iters=100000000000, +) +parser.set_defaults( + env="ale_py:ALE/Pong-v5", +) +parser.add_argument( + "--solved-return", + type=float, + default=21.0, + help=("The mean episode return at which we consider the task to be fully solved."), +) +parser.add_argument( + "--demotion-margin", + type=float, + default=2.0, + help=( + "The margin below the next lower task threshold, beneath which the agent " + " is considered to have collapsed, prompting a downgrade of the task." 
+ ), +) +# Use `parser` to add your own custom command line options to this script +# and (if needed) use their values to set up `config` below. +args = parser.parse_args() + +NUM_LEARNERS = args.num_learners or 1 +ENV = args.env + + +class PongEnvTaskCallback(RLlibCallback): + """Custom callback changing the frameskip in Atari Pong dependent on return.""" + + def __init__( + self, + task_threshold_map: dict, + remote_fn: Callable, + demotion_margin: float = 0.0, + solved_return: float = float("inf"), + ): + self.task_threshold_map = task_threshold_map + self.remote_fn = remote_fn + self.demotion_margin = demotion_margin + self.solved_return = solved_return + + def on_algorithm_init( + self, + *, + algorithm: "Algorithm", + **kwargs, + ) -> None: + # Set the initial task to 1, which corresponds to a frameskip of 1. + algorithm.metrics.log_value("current_env_task", 1, reduce="sum") + + def on_train_result( + self, + *, + algorithm: Algorithm, + metrics_logger=None, + result: dict, + **kwargs, + ) -> None: + # Store the current task inside the metrics logger in our Algorithm. + current_task = metrics_logger.peek("current_env_task") + + # If episode return is consistently above `task_threshold_map[current_task]`, + # we switch to a more difficult task (i.e. higher `frameskip`` if possible). + # If we already mastered the most difficult task, we publish our victory in + # the result dict. + result["task_solved"] = 0.0 + + # Note, in the first callback executions there may be no completed episode + # (and therefore no episode return) reported. In this case we will skip the + # the logic to manage task difficulty. + if EPISODE_RETURN_MEAN in result[ENV_RUNNER_RESULTS]: + current_return = result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] + else: + return + + # Get the threshold of the current task from the threshold map. + threshold = self.task_threshold_map.get(current_task, float("inf")) + + # Check, if curriculum is solved. 
+ final_task = max(self.task_threshold_map.keys()) + if current_task == final_task and current_return >= self.solved_return: + # Hardest task was solved -> report this in the results dict. + result["task_solved"] = 1.0 + + # Check promotion (increasing task). Note, we could use here also a promotion_patience + # that ensures that the return is collected in a stable manner instead of a lucky shot. + if ( + current_return >= threshold + ): # & result[ENV_RUNNER_RESULTS][NUM_EPISODES] > promotion_patience. + next_task = current_task + 1 + if next_task in self.task_threshold_map: + print( + f"Switching task on all EnvRunners up to #{next_task} (1=easiest, " + f"4=hardest), b/c R={current_return} on current task." + ) + # Increase task. + algorithm.env_runner_group.foreach_env_runner( + func=functools.partial(self.remote_fn, new_task=next_task) + ) + metrics_logger.log_value("current_env_task", next_task, window=1) + + # Check demotion (decreasing task). The demotion is used to avoid decreasing the task + # in case of an unlucky episode run. Only if the return is singificantly lower we + # decrease the task. + previous_task = current_task - 1 + if previous_task in self.task_threshold_map: + previous_threshold = self.task_threshold_map[previous_task] + if current_return < previous_threshold - self.demotion_margin: + print( + f"Switching task on all EnvRunners back to #{previous_task} (1=easiest, " + f"4=hardest), b/c R={current_return} on current task." + ) + # Decrease to previous level. + algorithm.env_runner_group.foreach_env_runner( + func=functools.partial(self.remote_fn, new_task=previous_task) + ) + metrics_logger.log_value("current_env_task", previous_task, window=1) + + +# These tags allow extracting portions of this script on Anyscale. 
+# ws-template-code-start +def _make_env_to_module_connector(env, spaces, device): + return FrameStackingEnvToModule(num_frames=4) + + +def _make_learner_connector(input_observation_space, input_action_space): + return FrameStackingLearner(num_frames=4) + + +# Create a custom Atari setup (w/o the usual RLlib-hard-coded framestacking in it). +# We would like our frame stacking connector to do this job. +def _env_creator(cfg): + return wrap_atari_for_new_api_stack( + gym.make(ENV, **cfg, render_mode="rgb_array"), + # Perform frame-stacking through ConnectorV2 API. + framestack=None, + ) + + +# Simple function sent to an EnvRunner to change the map of all its gym. Envs from +# the current one to a new (tougher) one, in which the frameskip is higher +# and the agent must therefore act faster. +def _remote_fn(env_runner, new_task: int): + # Override the env_config with the new setting. + env_runner.config.env_config.update( + { + "frameskip": new_task, + } + ) + # We recreate the entire env object by changing the env_config on the worker, + # then calling its `make_env()` method. + env_runner.make_env() + + +# Task threshold map keeps track of thresholds for each task. If the threshold has +# been surpassed the task difficulty is increased. +task_threshold_map = { + # Frameskip: Return. + 1: 15.0, + 2: 17.0, + 3: 19.0, + 4: float("inf"), +} + +tune.register_env("env", _env_creator) + +config = ( + PPOConfig() + .environment( + "env", + env_config={ + # Make analogous to old v4 + NoFrameskip. 
+ "frameskip": 1, + "full_action_space": False, + "repeat_action_probability": 0.0, + }, + clip_rewards=True, + ) + .env_runners( + env_to_module_connector=_make_env_to_module_connector, + ) + .training( + learner_connector=_make_learner_connector, + train_batch_size_per_learner=4000, + minibatch_size=128, + lambda_=0.95, + kl_coeff=0.5, + clip_param=0.1, + vf_clip_param=10.0, + entropy_coeff=0.01, + num_epochs=10, + lr=0.00015 * NUM_LEARNERS, + grad_clip=100.0, + grad_clip_by="global_norm", + ) + .rl_module( + model_config=DefaultModelConfig( + conv_filters=[[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]], + conv_activation="relu", + head_fcnet_hiddens=[256], + vf_share_layers=True, + ), + ) + .callbacks( + functools.partial( + PongEnvTaskCallback, + task_threshold_map=task_threshold_map, + remote_fn=_remote_fn, + # Avoids downgrading the task to early when the agent had an unlucky run. + demotion_margin=args.demotion_margin, + # The return at which the task is learned. + solved_return=args.solved_return, + ) + ) +) + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + run_rllib_example_script_experiment(config, args=args) diff --git a/rllib/examples/debugging/deterministic_sampling_and_training.py b/rllib/examples/debugging/deterministic_sampling_and_training.py index 11319c4da112..219b5db07168 100644 --- a/rllib/examples/debugging/deterministic_sampling_and_training.py +++ b/rllib/examples/debugging/deterministic_sampling_and_training.py @@ -93,6 +93,8 @@ .environment("env" if args.num_agents > 0 else "CartPole-v1") # Make sure every environment gets a fixed seed. .debugging(seed=args.seed) + # Log gradients and check them in the test. + .reporting(log_gradients=True) ) # Add a simple multi-agent setup. 
diff --git a/rllib/examples/envs/classes/multi_agent/footsies/README.md b/rllib/examples/envs/classes/multi_agent/footsies/README.md new file mode 100644 index 000000000000..6c9bec11c453 --- /dev/null +++ b/rllib/examples/envs/classes/multi_agent/footsies/README.md @@ -0,0 +1,10 @@ +# Footsies Environment + +This environment implementation is based on the [FootsiesGym project](https://github.com/chasemcd/FootsiesGym), +specifically the version as of **July 28, 2025**. + +## Notes + +All examples in the RLlib documentation that use the Footsies environment are self-contained. +This means that you do not need to install anything from the FootsiesGym repository or other places. +Examples handle binary automatically (downloading, extracting, starting, stopping, etc.). diff --git a/rllib/examples/envs/classes/multi_agent/footsies/__init__.py b/rllib/examples/envs/classes/multi_agent/footsies/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/envs/classes/multi_agent/footsies/encoder.py b/rllib/examples/envs/classes/multi_agent/footsies/encoder.py new file mode 100644 index 000000000000..475e1574891e --- /dev/null +++ b/rllib/examples/envs/classes/multi_agent/footsies/encoder.py @@ -0,0 +1,225 @@ +import collections +import copy +from typing import Any, Optional, Union + +import numpy as np +from ray.rllib.examples.envs.classes.multi_agent.footsies.game import constants +from ray.rllib.examples.envs.classes.multi_agent.footsies.game.proto import ( + footsies_service_pb2 as footsies_pb2, +) + + +class FootsiesEncoder: + """Encoder class to generate observations from the game state""" + + def __init__(self, observation_delay: int): + self._encoding_history = { + agent_id: collections.deque(maxlen=int(observation_delay)) + for agent_id in ["p1", "p2"] + } + self.observation_delay = observation_delay + self._last_common_state: Optional[np.ndarray] = None + self._action_id_values = list(constants.FOOTSIES_ACTION_IDS.values()) + + 
@staticmethod + def encode_common_state(game_state: footsies_pb2.GameState) -> np.ndarray: + p1_state, p2_state = game_state.player1, game_state.player2 + + dist_x = np.abs(p1_state.player_position_x - p2_state.player_position_x) / 8.0 + + return np.array( + [ + dist_x, + ], + dtype=np.float32, + ) + + @staticmethod + def _encode_input_buffer( + input_buffer: list[int], last_n: Optional[int] = None + ) -> np.ndarray: + """Encodes the input buffer into a one-hot vector. + + :param input_buffer: The input buffer to encode + :type input_buffer: list[int] + :return: The encoded one-hot vector + :rtype: np.ndarray + """ + + if last_n is not None: + input_buffer = input_buffer[last_n:] + + ib_encoding = [] + for action_id in input_buffer: + arr = [0] * (len(constants.ACTION_TO_BITS) + 1) + arr[action_id] = 1 + ib_encoding.extend(arr) + + input_buffer_vector = np.asarray(ib_encoding, dtype=np.float32) + + return input_buffer_vector + + def encode( + self, + game_state: footsies_pb2.GameState, + ) -> dict[str, Any]: + """Encodes the game state into observations for all agents. + + :param game_state: The game state to encode + :type game_state: footsies_pb2.GameState + :return: The encoded observations for all agents. 
+ :rtype: dict[str, Any] + """ + common_state = self.encode_common_state(game_state) + p1_encoding = self.encode_player_state(game_state.player1) + p2_encoding = self.encode_player_state(game_state.player2) + + observation_delay = min( + self.observation_delay, len(self._encoding_history["p1"]) + ) + + if observation_delay > 0: + p1_delayed_encoding = self._encoding_history["p1"][-observation_delay] + p2_delayed_encoding = self._encoding_history["p2"][-observation_delay] + else: + p1_delayed_encoding = copy.deepcopy(p1_encoding) + p2_delayed_encoding = copy.deepcopy(p2_encoding) + + self._encoding_history["p1"].append(p1_encoding) + self._encoding_history["p2"].append(p2_encoding) + self._last_common_state = common_state + + # Create features dictionary + features = {} + current_index = 0 + + # Common state + features["common_state"] = { + "start": current_index, + "length": len(common_state), + } + current_index += len(common_state) + + # Concatenate the observations for the undelayed encoding + p1_encoding = np.hstack(list(p1_encoding.values()), dtype=np.float32) + p2_encoding = np.hstack(list(p2_encoding.values()), dtype=np.float32) + + # Concatenate the observations for the delayed encoding + p1_delayed_encoding = np.hstack( + list(p1_delayed_encoding.values()), dtype=np.float32 + ) + p2_delayed_encoding = np.hstack( + list(p2_delayed_encoding.values()), dtype=np.float32 + ) + + p1_centric_observation = np.hstack( + [common_state, p1_encoding, p2_delayed_encoding] + ) + + p2_centric_observation = np.hstack( + [common_state, p2_encoding, p1_delayed_encoding] + ) + + return {"p1": p1_centric_observation, "p2": p2_centric_observation} + + def encode_player_state( + self, + player_state: footsies_pb2.PlayerState, + ) -> dict[str, Union[int, float, list, np.ndarray]]: + """Encodes the player state into observations. 
+ + :param player_state: The player state to encode + :type player_state: footsies_pb2.PlayerState + :return: The encoded observations for the player + :rtype: dict[str, Any] + """ + feature_dict = { + "player_position_x": player_state.player_position_x + / constants.FeatureDictNormalizers.PLAYER_POSITION_X, + "velocity_x": player_state.velocity_x + / constants.FeatureDictNormalizers.VELOCITY_X, + "is_dead": int(player_state.is_dead), + "vital_health": player_state.vital_health, + "guard_health": one_hot_encoder(player_state.guard_health, [0, 1, 2, 3]), + "current_action_id": self._encode_action_id(player_state.current_action_id), + "current_action_frame": player_state.current_action_frame + / constants.FeatureDictNormalizers.CURRENT_ACTION_FRAME, + "current_action_frame_count": player_state.current_action_frame_count + / constants.FeatureDictNormalizers.CURRENT_ACTION_FRAME_COUNT, + "current_action_remaining_frames": ( + player_state.current_action_frame_count + - player_state.current_action_frame + ) + / constants.FeatureDictNormalizers.CURRENT_ACTION_REMAINING_FRAMES, + "is_action_end": int(player_state.is_action_end), + "is_always_cancelable": int(player_state.is_always_cancelable), + "current_action_hit_count": player_state.current_action_hit_count, + "current_hit_stun_frame": player_state.current_hit_stun_frame + / constants.FeatureDictNormalizers.CURRENT_HIT_STUN_FRAME, + "is_in_hit_stun": int(player_state.is_in_hit_stun), + "sprite_shake_position": player_state.sprite_shake_position, + "max_sprite_shake_frame": player_state.max_sprite_shake_frame + / constants.FeatureDictNormalizers.MAX_SPRITE_SHAKE_FRAME, + "is_face_right": int(player_state.is_face_right), + "current_frame_advantage": player_state.current_frame_advantage + / constants.FeatureDictNormalizers.CURRENT_FRAME_ADVANTAGE, + # The below features leak some information about the opponent! 
+ "would_next_forward_input_dash": int( + player_state.would_next_forward_input_dash + ), + "would_next_backward_input_dash": int( + player_state.would_next_backward_input_dash + ), + "special_attack_progress": min(player_state.special_attack_progress, 1.0), + } + + return feature_dict + + def get_last_encoding(self) -> Optional[dict[str, np.ndarray]]: + if self._last_common_state is None: + return None + + return { + "common_state": self._last_common_state.reshape(-1), + "p1": np.hstack( + list(self._encoding_history["p1"][-1].values()), + dtype=np.float32, + ), + "p2": np.hstack( + list(self._encoding_history["p2"][-1].values()), + dtype=np.float32, + ), + } + + def reset(self): + self._encoding_history = { + agent_id: collections.deque(maxlen=int(self.observation_delay)) + for agent_id in ["p1", "p2"] + } + + def _encode_action_id(self, action_id: int) -> np.ndarray: + """Encodes the action id into a one-hot vector. + + :param action_id: The action id to encode + :type action_id: int + :return: The encoded one-hot vector + :rtype: np.ndarray + """ + + action_vector = np.zeros(len(self._action_id_values), dtype=np.float32) + + # Get the index of the action id in constants.ActionID + action_index = self._action_id_values.index(action_id) + action_vector[action_index] = 1 + + assert action_vector.max() == 1 and action_vector.min() == 0 + + return action_vector + + +def one_hot_encoder( + value: Union[int, float, str], collection: list[Union[int, float, str]] +) -> np.ndarray: + vector = np.zeros(len(collection), dtype=np.float32) + vector[collection.index(value)] = 1 + return vector diff --git a/rllib/examples/envs/classes/multi_agent/footsies/fixed_rlmodules.py b/rllib/examples/envs/classes/multi_agent/footsies/fixed_rlmodules.py new file mode 100644 index 000000000000..cf9030a96ad8 --- /dev/null +++ b/rllib/examples/envs/classes/multi_agent/footsies/fixed_rlmodules.py @@ -0,0 +1,39 @@ +import tree # pip install dm_tree +from ray.rllib.core.rl_module import 
RLModule +from ray.rllib.examples.envs.classes.multi_agent.footsies.game import constants +from ray.rllib.policy import sample_batch +from ray.rllib.utils.spaces.space_utils import batch as batch_func + + +class FixedRLModule(RLModule): + def _forward_inference(self, batch, **kwargs): + return self._fixed_forward(batch, **kwargs) + + def _forward_exploration(self, batch, **kwargs): + return self._fixed_forward(batch, **kwargs) + + def _forward_train(self, *args, **kwargs): + raise NotImplementedError( + f"RLlib: {self.__class__.__name__} should not be trained. " + f"It is a fixed RLModule, returning a fixed action for all observations." + ) + + def _fixed_forward(self, batch, **kwargs): + """Implements a fixed that always returns the same action.""" + raise NotImplementedError( + "FixedRLModule: This method should be overridden by subclasses to implement a specific action." + ) + + +class NoopFixedRLModule(FixedRLModule): + def _fixed_forward(self, batch, **kwargs): + obs_batch_size = len(tree.flatten(batch[sample_batch.SampleBatch.OBS])[0]) + actions = batch_func([constants.EnvActions.NONE for _ in range(obs_batch_size)]) + return {sample_batch.SampleBatch.ACTIONS: actions} + + +class BackFixedRLModule(FixedRLModule): + def _fixed_forward(self, batch, **kwargs): + obs_batch_size = len(tree.flatten(batch[sample_batch.SampleBatch.OBS])[0]) + actions = batch_func([constants.EnvActions.BACK for _ in range(obs_batch_size)]) + return {sample_batch.SampleBatch.ACTIONS: actions} diff --git a/rllib/examples/envs/classes/multi_agent/footsies/footsies_env.py b/rllib/examples/envs/classes/multi_agent/footsies/footsies_env.py new file mode 100644 index 000000000000..c08c0bce570e --- /dev/null +++ b/rllib/examples/envs/classes/multi_agent/footsies/footsies_env.py @@ -0,0 +1,282 @@ +import logging +from typing import Any, Optional + +import numpy as np +import psutil +from gymnasium import spaces +from pettingzoo.utils.env import ( + AgentID, + ActionType, + ObsType, +) +from 
ray.rllib.env import EnvContext +from ray.rllib.env.multi_agent_env import MultiAgentEnv +from ray.rllib.examples.envs.classes.multi_agent.footsies.encoder import FootsiesEncoder +from ray.rllib.examples.envs.classes.multi_agent.footsies.game import constants +from ray.rllib.examples.envs.classes.multi_agent.footsies.game.footsies_binary import ( + FootsiesBinary, +) +from ray.rllib.examples.envs.classes.multi_agent.footsies.game.footsies_game import ( + FootsiesGame, +) + +logger = logging.getLogger("ray.rllib") + + +class FootsiesEnv(MultiAgentEnv): + metadata = {"render.modes": ["human"]} + SPECIAL_CHARGE_FRAMES = 60 + GUARD_BREAK_REWARD = 0.3 + + observation_space = spaces.Dict( + { + agent: spaces.Box( + low=-np.inf, + high=np.inf, + shape=(constants.OBSERVATION_SPACE_SIZE,), + ) + for agent in ["p1", "p2"] + } + ) + + action_space = spaces.Dict( + { + agent: spaces.Discrete( + len( + [ + constants.EnvActions.NONE, + constants.EnvActions.BACK, + constants.EnvActions.FORWARD, + constants.EnvActions.ATTACK, + constants.EnvActions.BACK_ATTACK, + constants.EnvActions.FORWARD_ATTACK, + # This is a special input that holds down + # attack for 60 frames. It's just too long of a sequence + # to easily learn by holding ATTACK for so long. 
+ constants.EnvActions.SPECIAL_CHARGE, + ] + ) + ) + for agent in ["p1", "p2"] + } + ) + + def __init__(self, config: EnvContext, port: int): + super().__init__() + + if config is None: + config = {} + self.config = config + self.port = port + self.footsies_process_pid = ( + None # Store PID of the running footsies process (we assume one per env) + ) + self.agents: list[AgentID] = ["p1", "p2"] + self.possible_agents: list[AgentID] = self.agents.copy() + self._agent_ids: set[AgentID] = set(self.agents) + + self.t: int = 0 + self.max_t: int = config.get("max_t", 1000) + self.frame_skip = config.get("frame_skip", 4) + observation_delay = config.get("observation_delay", 16) + + assert ( + observation_delay % self.frame_skip == 0 + ), "observation_delay must be divisible by frame_skip" + + self.encoder = FootsiesEncoder( + observation_delay=observation_delay // self.frame_skip + ) + + # start the game server before initializing the communication between the + # game server and the Python harness via gRPC + self._prepare_and_start_game_server() + self.game = FootsiesGame( + host=config["host"], + port=self.port, + ) + + self.last_game_state = None + self.special_charge_queue = { + "p1": -1, + "p2": -1, + } + + @staticmethod + def _convert_to_charge_action(action: int) -> int: + if action == constants.EnvActions.BACK: + return constants.EnvActions.BACK_ATTACK + elif action == constants.EnvActions.FORWARD: + return constants.EnvActions.FORWARD_ATTACK + else: + return constants.EnvActions.ATTACK + + def close(self): + """Terminate Footsies game server process. + + Run to ensure no game servers are left running. + """ + timeout = 2 + try: + logger.info( + f"RLlib {self.__class__.__name__}: Terminating Footsies " + f"game server process with PID: {self.footsies_process_pid}..." 
+ ) + p = psutil.Process(self.footsies_process_pid) + p.terminate() + p.wait(timeout=timeout) + except psutil.NoSuchProcess: + logger.info( + f"RLlib {self.__class__.__name__}: Process with PID {self.footsies_process_pid} not found, " + f"it might have been already terminated." + ) + except psutil.TimeoutExpired: + logger.warning( + f"RLlib {self.__class__.__name__}: Process with PID {self.footsies_process_pid} did not terminate " + f"within {timeout} seconds. " + f"Sending SIGKILL signal instead.", + ) + p.kill() + p.wait(timeout=timeout) + + def get_infos(self): + return {agent: {} for agent in self.agents} + + def get_obs(self, game_state): + return self.encoder.encode(game_state) + + def reset( + self, + *, + seed: Optional[int] = None, + options: Optional[dict] = None, + ) -> tuple[dict[AgentID, ObsType], dict[AgentID, Any]]: + """Resets the environment to the starting state + and returns the initial observations for all agents. + + :return: Tuple of observations and infos for each agent. + :rtype: tuple[dict[AgentID, ObsType], dict[AgentID, Any]] + """ + self.t = 0 + self.game.reset_game() + self.game.start_game() + + self.encoder.reset() + self.last_game_state = self.game.get_state() + + observations = self.get_obs(self.last_game_state) + + return observations, {agent: {} for agent in self.agents} + + def step( + self, actions: dict[AgentID, ActionType] + ) -> tuple[ + dict[AgentID, ObsType], + dict[AgentID, float], + dict[AgentID, bool], + dict[AgentID, bool], + dict[AgentID, dict[str, Any]], + ]: + """Step the environment with the provided actions for all agents. + + :param actions: Dictionary mapping agent ids to their actions for this step. + :type actions: dict[AgentID, ActionType] + :return: Tuple of observations, rewards, terminates, truncateds and infos for all agents. 
+ :rtype: tuple[ dict[AgentID, ObsType], dict[AgentID, float], dict[AgentID, bool], dict[AgentID, bool], dict[AgentID, dict[str, Any]], ] + """ + self.t += 1 + + for agent_id in self.agents: + empty_queue = self.special_charge_queue[agent_id] < 0 + action_is_special_charge = ( + actions[agent_id] == constants.EnvActions.SPECIAL_CHARGE + ) + + # Refill the charge queue only if we're not already in a special charge. + if action_is_special_charge and empty_queue: + self.special_charge_queue[ + agent_id + ] = self._build_charged_special_queue() + + if self.special_charge_queue[agent_id] >= 0: + self.special_charge_queue[agent_id] -= 1 + actions[agent_id] = self._convert_to_charge_action(actions[agent_id]) + + p1_action = self.game.action_to_bits(actions["p1"], is_player_1=True) + p2_action = self.game.action_to_bits(actions["p2"], is_player_1=False) + + game_state = self.game.step_n_frames( + p1_action=p1_action, p2_action=p2_action, n_frames=self.frame_skip + ) + observations = self.get_obs(game_state) + + terminated = game_state.player1.is_dead or game_state.player2.is_dead + + # Zero-sum game: 1 if other player is dead, -1 if you're dead: + rewards = { + "p1": int(game_state.player2.is_dead) - int(game_state.player1.is_dead), + "p2": int(game_state.player1.is_dead) - int(game_state.player2.is_dead), + } + + if self.config.get("reward_guard_break", False): + p1_prev_guard_health = self.last_game_state.player1.guard_health + p2_prev_guard_health = self.last_game_state.player2.guard_health + p1_guard_health = game_state.player1.guard_health + p2_guard_health = game_state.player2.guard_health + + if p2_guard_health < p2_prev_guard_health: + rewards["p1"] += self.GUARD_BREAK_REWARD + rewards["p2"] -= self.GUARD_BREAK_REWARD + if p1_guard_health < p1_prev_guard_health: + rewards["p2"] += self.GUARD_BREAK_REWARD + rewards["p1"] -= self.GUARD_BREAK_REWARD + + terminateds = { + "p1": terminated, + "p2": terminated, + "__all__": terminated, + } + + truncated = self.t >= 
self.max_t + truncateds = { + "p1": truncated, + "p2": truncated, + "__all__": truncated, + } + + self.last_game_state = game_state + + return observations, rewards, terminateds, truncateds, self.get_infos() + + def _build_charged_special_queue(self): + assert self.SPECIAL_CHARGE_FRAMES % self.frame_skip == 0 + steps_to_apply_attack = int(self.SPECIAL_CHARGE_FRAMES // self.frame_skip) + return steps_to_apply_attack + + def _prepare_and_start_game_server(self): + fb = FootsiesBinary(config=self.config, port=self.port) + self.footsies_process_pid = fb.start_game_server() + + +def env_creator(env_config: EnvContext) -> FootsiesEnv: + """Creates the Footsies environment + + Ensure that each game server runs on a unique port. Training and evaluation env runners have separate port ranges. + + Helper function to create the FootsiesEnv with a unique port based on the worker index and vector index. + It's usually passed to the `register_env()`, like this: register_env(name="FootsiesEnv", env_creator=env_creator). 
+ """ + if env_config.get("env-for-evaluation", False): + port = ( + env_config["eval_start_port"] + - 1 # "-1" to start with eval_start_port as the first port (eval worker index starts at 1) + + int(env_config.worker_index) * env_config.get("num_envs_per_worker", 1) + + env_config.get("vector_index", 0) + ) + else: + port = ( + env_config["train_start_port"] + + int(env_config.worker_index) * env_config.get("num_envs_per_worker", 1) + + env_config.get("vector_index", 0) + ) + return FootsiesEnv(config=env_config, port=port) diff --git a/rllib/examples/envs/classes/multi_agent/footsies/game/__init__.py b/rllib/examples/envs/classes/multi_agent/footsies/game/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/envs/classes/multi_agent/footsies/game/constants.py b/rllib/examples/envs/classes/multi_agent/footsies/game/constants.py new file mode 100644 index 000000000000..9a5c86065128 --- /dev/null +++ b/rllib/examples/envs/classes/multi_agent/footsies/game/constants.py @@ -0,0 +1,151 @@ +from dataclasses import dataclass + +OBSERVATION_SPACE_SIZE: int = 81 + + +@dataclass +class EnvActions: + NONE = 0 + BACK = 1 + FORWARD = 2 + ATTACK = 3 + BACK_ATTACK = 4 + FORWARD_ATTACK = 5 + SPECIAL_CHARGE = 6 + + +@dataclass +class GameActions: + NONE = 0 + LEFT = 1 + RIGHT = 2 + ATTACK = 3 + LEFT_ATTACK = 4 + RIGHT_ATTACK = 5 + + +@dataclass +class ActionBits: + NONE: int = 0 + LEFT: int = 1 << 0 + RIGHT: int = 1 << 1 + ATTACK: int = 1 << 2 + LEFT_ATTACK: int = LEFT | ATTACK + RIGHT_ATTACK: int = RIGHT | ATTACK + + +@dataclass +class ActionID: + STAND = 0 + FORWARD = 1 + BACKWARD = 2 + DASH_FORWARD = 10 + DASH_BACKWARD = 11 + N_ATTACK = 100 + B_ATTACK = 105 + N_SPECIAL = 110 + B_SPECIAL = 115 + DAMAGE = 200 + GUARD_M = 301 + GUARD_STAND = 305 + GUARD_CROUCH = 306 + GUARD_BREAK = 310 + GUARD_PROXIMITY = 350 + DEAD = 500 + WIN = 510 + + +@dataclass +class FeatureDictNormalizers: + PLAYER_POSITION_X = 4.0 + VELOCITY_X = 5.0 + 
CURRENT_ACTION_FRAME = 25 + CURRENT_ACTION_FRAME_COUNT = 25 + CURRENT_ACTION_REMAINING_FRAMES = 25 + CURRENT_HIT_STUN_FRAME = 10 + MAX_SPRITE_SHAKE_FRAME = 10 + CURRENT_FRAME_ADVANTAGE = 10 + + +ACTION_TO_BITS = { + GameActions.NONE: ActionBits.NONE, + GameActions.LEFT: ActionBits.LEFT, + GameActions.RIGHT: ActionBits.RIGHT, + GameActions.ATTACK: ActionBits.ATTACK, + GameActions.LEFT_ATTACK: ActionBits.LEFT_ATTACK, + GameActions.RIGHT_ATTACK: ActionBits.RIGHT_ATTACK, +} + +FOOTSIES_ACTION_IDS = { + "STAND": ActionID.STAND, + "FORWARD": ActionID.FORWARD, + "BACKWARD": ActionID.BACKWARD, + "DASH_FORWARD": ActionID.DASH_FORWARD, + "DASH_BACKWARD": ActionID.DASH_BACKWARD, + "N_ATTACK": ActionID.N_ATTACK, + "B_ATTACK": ActionID.B_ATTACK, + "N_SPECIAL": ActionID.N_SPECIAL, + "B_SPECIAL": ActionID.B_SPECIAL, + "DAMAGE": ActionID.DAMAGE, + "GUARD_M": ActionID.GUARD_M, + "GUARD_STAND": ActionID.GUARD_STAND, + "GUARD_CROUCH": ActionID.GUARD_CROUCH, + "GUARD_BREAK": ActionID.GUARD_BREAK, + "GUARD_PROXIMITY": ActionID.GUARD_PROXIMITY, + "DEAD": ActionID.DEAD, + "WIN": ActionID.WIN, +} + +# backup file location (uploaded July 29th, 2025): +# https://ray-example-data.s3.us-west-2.amazonaws.com/rllib/env-footsies/feature_indices.json +# Dictionary mapping feature names to their index ranges within a flat observation vector. +# Each key is a feature name, and its value is a dictionary with keys: +# "start": the starting index in the observation array. 
+# "length": it's length in bytes +feature_indices = { + "common_state": {"start": 0, "length": 1}, + "frame_count": {"start": 1, "length": 1}, + "player_position_x": {"start": 2, "length": 1}, + "velocity_x": {"start": 3, "length": 1}, + "is_dead": {"start": 4, "length": 1}, + "vital_health": {"start": 5, "length": 1}, + "guard_health": {"start": 6, "length": 4}, + "current_action_id": {"start": 10, "length": 17}, + "current_action_frame": {"start": 27, "length": 1}, + "current_action_frame_count": {"start": 28, "length": 1}, + "current_action_remaining_frames": {"start": 29, "length": 1}, + "is_action_end": {"start": 30, "length": 1}, + "is_always_cancelable": {"start": 31, "length": 1}, + "current_action_hit_count": {"start": 32, "length": 1}, + "current_hit_stun_frame": {"start": 33, "length": 1}, + "is_in_hit_stun": {"start": 34, "length": 1}, + "sprite_shake_position": {"start": 35, "length": 1}, + "max_sprite_shake_frame": {"start": 36, "length": 1}, + "is_face_right": {"start": 37, "length": 1}, + "current_frame_advantage": {"start": 38, "length": 1}, + "would_next_forward_input_dash": {"start": 39, "length": 1}, + "would_next_backward_input_dash": {"start": 40, "length": 1}, + "special_attack_progress": {"start": 41, "length": 1}, + "opponent_frame_count": {"start": 42, "length": 1}, + "opponent_player_position_x": {"start": 43, "length": 1}, + "opponent_velocity_x": {"start": 44, "length": 1}, + "opponent_is_dead": {"start": 45, "length": 1}, + "opponent_vital_health": {"start": 46, "length": 1}, + "opponent_guard_health": {"start": 47, "length": 4}, + "opponent_current_action_id": {"start": 51, "length": 17}, + "opponent_current_action_frame": {"start": 68, "length": 1}, + "opponent_current_action_frame_count": {"start": 69, "length": 1}, + "opponent_current_action_remaining_frames": {"start": 70, "length": 1}, + "opponent_is_action_end": {"start": 71, "length": 1}, + "opponent_is_always_cancelable": {"start": 72, "length": 1}, + 
"opponent_current_action_hit_count": {"start": 73, "length": 1}, + "opponent_current_hit_stun_frame": {"start": 74, "length": 1}, + "opponent_is_in_hit_stun": {"start": 75, "length": 1}, + "opponent_sprite_shake_position": {"start": 76, "length": 1}, + "opponent_max_sprite_shake_frame": {"start": 77, "length": 1}, + "opponent_is_face_right": {"start": 78, "length": 1}, + "opponent_current_frame_advantage": {"start": 79, "length": 1}, + "opponent_would_next_forward_input_dash": {"start": 80, "length": 1}, + "opponent_would_next_backward_input_dash": {"start": 81, "length": 1}, + "opponent_special_attack_progress": {"start": 82, "length": 1}, +} diff --git a/rllib/examples/envs/classes/multi_agent/footsies/game/footsies_binary.py b/rllib/examples/envs/classes/multi_agent/footsies/game/footsies_binary.py new file mode 100644 index 000000000000..2b9e3bcbc5b9 --- /dev/null +++ b/rllib/examples/envs/classes/multi_agent/footsies/game/footsies_binary.py @@ -0,0 +1,195 @@ +import logging +import os +import stat +import subprocess +import time +import zipfile +from dataclasses import dataclass +from pathlib import Path + +import grpc +import requests +from ray.rllib.env import EnvContext +from ray.rllib.examples.envs.classes.multi_agent.footsies.game.proto import ( + footsies_service_pb2 as footsies_pb2, +) +from ray.rllib.examples.envs.classes.multi_agent.footsies.game.proto import ( + footsies_service_pb2_grpc as footsies_pb2_grpc, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class BinaryUrls: + # Uploaded 07.28.2025 + S3_ROOT = "https://ray-example-data.s3.us-west-2.amazonaws.com/rllib/env-footsies/binaries/" + + # Zip file names + ZIP_LINUX_SERVER = "footsies_linux_server_021725.zip" + ZIP_LINUX_WINDOWED = "footsies_linux_windowed_021725.zip" + ZIP_MAC_HEADLESS = "footsies_mac_headless_5709b6d.zip" + ZIP_MAC_WINDOWED = "footsies_mac_windowed_5709b6d.zip" + + # Full URLs + URL_LINUX_SERVER_BINARIES = S3_ROOT + ZIP_LINUX_SERVER + URL_LINUX_WINDOWED_BINARIES 
= S3_ROOT + ZIP_LINUX_WINDOWED + URL_MAC_HEADLESS_BINARIES = S3_ROOT + ZIP_MAC_HEADLESS + URL_MAC_WINDOWED_BINARIES = S3_ROOT + ZIP_MAC_WINDOWED + + +class FootsiesBinary: + def __init__(self, config: EnvContext, port: int): + self._urls = BinaryUrls() + self.config = config + self.port = port + self.binary_to_download = config["binary_to_download"] + if self.binary_to_download == "linux_server": + self.url = self._urls.URL_LINUX_SERVER_BINARIES + elif self.binary_to_download == "linux_windowed": + self.url = self._urls.URL_LINUX_WINDOWED_BINARIES + elif self.binary_to_download == "mac_headless": + self.url = self._urls.URL_MAC_HEADLESS_BINARIES + elif self.binary_to_download == "mac_windowed": + self.url = self._urls.URL_MAC_WINDOWED_BINARIES + else: + raise ValueError(f"Invalid target binary: {self.binary_to_download}") + + self.full_download_dir = Path(config["binary_download_dir"]).resolve() + self.full_download_path = ( + self.full_download_dir / str.split(self.url, sep="/")[-1] + ) + self.full_extract_dir = Path(config["binary_extract_dir"]).resolve() + self.renamed_path = self.full_extract_dir / "footsies_binaries" + + @staticmethod + def _add_executable_permission(binary_path: Path) -> None: + binary_path.chmod(binary_path.stat().st_mode | stat.S_IXUSR) + + def start_game_server(self) -> int: + """Downloads, unzips, and starts the Footsies game server binary. + + Returns footsies process PID. 
+ """ + self._download_game_binary() + self._unzip_game_binary() + + if self.binary_to_download == "mac_windowed": + game_binary_path = ( + Path(self.renamed_path) / "Contents" / "MacOS" / "FOOTSIES" + ) + elif self.binary_to_download == "mac_headless": + game_binary_path = Path(self.renamed_path) / "FOOTSIES" + else: + game_binary_path = Path(self.renamed_path) / "footsies.x86_64" + + if os.access(game_binary_path, os.X_OK): + logger.info( + f"Game binary has an 'executable' permission: {game_binary_path}" + ) + else: + self._add_executable_permission(game_binary_path) + logger.info(f"Game binary path: {game_binary_path}") + + if ( + self.binary_to_download == "linux_server" + or self.binary_to_download == "linux_windowed" + ): + process = subprocess.Popen([game_binary_path, "--port", str(self.port)]) + else: + process = subprocess.Popen( + [ + "arch", + "-x86_64", + game_binary_path, + "--port", + str(self.port), + ], + ) + + # check if the game server is running correctly + timeout = 2 + channel = grpc.insecure_channel(f"localhost:{self.port}") + stub = footsies_pb2_grpc.FootsiesGameServiceStub(channel) + + # step 1: try to start the game + while True: + try: + stub.StartGame(footsies_pb2.Empty()) + logger.info("Game ready!") + break + except grpc.RpcError as e: + code = e.code() + if code in ( + grpc.StatusCode.UNAVAILABLE, + grpc.StatusCode.DEADLINE_EXCEEDED, + ): + logger.info(f"RLlib {self.__class__.__name__}: Game not ready...") + time.sleep(timeout) + continue + raise + + # step 2: check if the game is ready + ready = False + while not ready: + try: + ready = stub.IsReady(footsies_pb2.Empty()).value + if not ready: + logger.info(f"RLlib {self.__class__.__name__}: Game not ready...") + time.sleep(timeout) + continue + else: + logger.info("Game ready!") + break + except grpc.RpcError as e: + if e.code() in ( + grpc.StatusCode.UNAVAILABLE, + grpc.StatusCode.DEADLINE_EXCEEDED, + ): + time.sleep(timeout) + logger.info(f"RLlib {self.__class__.__name__}: Game not 
ready...") + continue + raise + + channel.close() + return process.pid + + def _download_game_binary(self): + chunk_size = 1024 * 1024 # 1MB + + if Path(self.full_download_path).exists(): + logger.info( + f"Game binary already exists at {self.full_download_path}, skipping download." + ) + + else: + try: + with requests.get(self.url, stream=True) as response: + response.raise_for_status() + self.full_download_dir.mkdir(parents=True, exist_ok=True) + with open(self.full_download_path, "wb") as f: + for chunk in response.iter_content(chunk_size=chunk_size): + if chunk: + f.write(chunk) + logger.info( + f"Downloaded game binary to {self.full_download_path}\n" + f"Binary size: {self.full_download_path.stat().st_size / 1024 / 1024:.1f} MB\n" + ) + except requests.exceptions.RequestException as e: + logger.error(f"Failed to download binary from {self.url}: {e}") + + def _unzip_game_binary(self): + if Path(self.renamed_path).exists(): + logger.info( + f"Game binary already extracted at {self.renamed_path}, skipping extraction." 
+ ) + else: + self.full_extract_dir.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(self.full_download_path, mode="r") as zip_ref: + zip_ref.extractall(self.full_extract_dir) + + if self.binary_to_download == "mac_windowed": + self.full_download_path.with_suffix(".app").rename(self.renamed_path) + else: + self.full_download_path.with_suffix("").rename(self.renamed_path) + logger.info(f"Extracted game binary to {self.renamed_path}") diff --git a/rllib/examples/envs/classes/multi_agent/footsies/game/footsies_game.py b/rllib/examples/envs/classes/multi_agent/footsies/game/footsies_game.py new file mode 100644 index 000000000000..5f4252412958 --- /dev/null +++ b/rllib/examples/envs/classes/multi_agent/footsies/game/footsies_game.py @@ -0,0 +1,121 @@ +import logging +import time + +import grpc +import numpy as np + +import ray.rllib.examples.envs.classes.multi_agent.footsies.game.proto.footsies_service_pb2 as footsies_pb2 +import ray.rllib.examples.envs.classes.multi_agent.footsies.game.proto.footsies_service_pb2_grpc as footsies_pb2_grpc +from ray.rllib.examples.envs.classes.multi_agent.footsies.game import constants + +logger = logging.getLogger(__name__) + + +class FootsiesGame: + """Handles gRPC communication with game the server. + + This class establishes communication between the + game server and the Python harness via gRPC. It provides methods + to start the game, reset it, get the current state, and step the + game by a certain number of frames. 
+ """ + + def __init__(self, host: str, port: int): + self.host = host + self.port = port + self.stub = self._initialize_stub() + + @staticmethod + def action_to_bits(action: int, is_player_1: bool) -> int: + """Converts an action to its corresponding bit representation.""" + + if isinstance(action, np.ndarray): + action = action.item() + + if is_player_1: + if action == constants.EnvActions.BACK: + action = constants.GameActions.LEFT + elif action == constants.EnvActions.FORWARD: + action = constants.GameActions.RIGHT + elif action == constants.EnvActions.BACK_ATTACK: + action = constants.GameActions.LEFT_ATTACK + elif action == constants.EnvActions.FORWARD_ATTACK: + action = constants.GameActions.RIGHT_ATTACK + else: + if action == constants.EnvActions.BACK: + action = constants.GameActions.RIGHT + elif action == constants.EnvActions.FORWARD: + action = constants.GameActions.LEFT + elif action == constants.EnvActions.BACK_ATTACK: + action = constants.GameActions.RIGHT_ATTACK + elif action == constants.EnvActions.FORWARD_ATTACK: + action = constants.GameActions.LEFT_ATTACK + + return constants.ACTION_TO_BITS[action] + + def get_encoded_state(self) -> footsies_pb2.EncodedGameState: + """Gets the current encoded game state by calling the GetEncodedState RPC.""" + try: + return self.stub.GetEncodedState(footsies_pb2.Empty()) + except Exception as e: + logger.error(f"Error calling GetEncodedState with exception: {e}") + raise e + + def get_state(self) -> footsies_pb2.GameState: + """Gets the current game state by calling the GetState RPC.""" + try: + return self.stub.GetState(footsies_pb2.Empty()) + except Exception as e: + logger.error(f"Error calling GetState with exception: {e}") + raise e + + def is_ready(self) -> bool: + """Checks if the game is ready by calling the IsReady RPC.""" + try: + return self.stub.IsReady(footsies_pb2.Empty()).value + except Exception as e: + logger.error(f"Error calling IsReady with exception: {e}") + raise e + + def reset_game(self) 
-> None: + """Resets the game by calling the ResetGame RPC.""" + try: + self.stub.ResetGame(footsies_pb2.Empty()) + except Exception as e: + logger.error(f"Error calling ResetGame with exception: {e}") + raise e + + def start_game(self) -> None: + """Starts the game by calling the StartGame RPC.""" + try: + self.stub.StartGame(footsies_pb2.Empty()) + + while not self.is_ready(): + logger.info("Game not ready...") + time.sleep(0.5) + logger.info("StartGame called successfully") + + except Exception as e: + logger.error(f"Error calling StartGame with exception: {e}") + raise e + + def step_n_frames( + self, p1_action: int, p2_action: int, n_frames: int + ) -> footsies_pb2.GameState: + """Steps the game by n_frames with the given player actions. The provided actions will be repeated for all n_frames.""" + try: + step_input = footsies_pb2.StepInput( + p1_action=p1_action, p2_action=p2_action, nFrames=n_frames + ) + return self.stub.StepNFrames(step_input) + except Exception as e: + logger.error(f"Error calling StepNFrames with exception: {e}") + raise e + + def _initialize_stub(self) -> footsies_pb2_grpc.FootsiesGameServiceStub: + try: + channel = grpc.insecure_channel(f"{self.host}:{self.port}") + return footsies_pb2_grpc.FootsiesGameServiceStub(channel) + except grpc.RpcError as e: + logger.error(f"Error connecting to gRPC stub with exception: {e}") + raise e diff --git a/rllib/examples/envs/classes/multi_agent/footsies/game/proto/footsies_service.proto b/rllib/examples/envs/classes/multi_agent/footsies/game/proto/footsies_service.proto new file mode 100644 index 000000000000..5edbd7bda692 --- /dev/null +++ b/rllib/examples/envs/classes/multi_agent/footsies/game/proto/footsies_service.proto @@ -0,0 +1,63 @@ +syntax = "proto3"; + + +service FootsiesGameService { + rpc StartGame(Empty) returns (Empty) {} + rpc ResetGame(Empty) returns (Empty) {} + rpc StepNFrames(StepInput) returns (GameState) {} + rpc GetState(Empty) returns (GameState) {} + rpc GetEncodedState(Empty) 
returns (EncodedGameState) {} + rpc IsReady(Empty) returns (BoolValue) {} +} + + +message StepInput { + int64 p1_action = 1; + int64 p2_action = 2; + int64 nFrames = 3; +} + +message PlayerState { + float player_position_x = 1; + bool is_dead = 2; + int64 vital_health = 3; + int64 guard_health = 4; + int64 current_action_id = 5; + int64 current_action_frame = 6; + int64 current_action_frame_count = 7; + bool is_action_end = 8; + bool is_always_cancelable = 9; + int64 current_action_hit_count = 10; + int64 current_hit_stun_frame = 11; + bool is_in_hit_stun = 12; + int64 sprite_shake_position = 13; + int64 max_sprite_shake_frame = 14; + float velocity_x = 15; + bool is_face_right = 16; + repeated int64 input_buffer = 17; + int64 current_frame_advantage = 18; + bool would_next_forward_input_dash = 19; + bool would_next_backward_input_dash = 20; + float special_attack_progress = 21; +} + +message GameState { + PlayerState player1 = 1; + PlayerState player2 = 2; + int64 round_state = 3; + int64 frame_count = 4; +} + +message EncodedGameState { + repeated float player1_encoding = 1; + repeated float player2_encoding = 2; +} + +message BoolValue { + bool value = 1; +} + + + + +message Empty {} diff --git a/rllib/examples/envs/classes/multi_agent/footsies/game/proto/footsies_service_pb2.py b/rllib/examples/envs/classes/multi_agent/footsies/game/proto/footsies_service_pb2.py new file mode 100644 index 000000000000..8dc26277dff8 --- /dev/null +++ b/rllib/examples/envs/classes/multi_agent/footsies/game/proto/footsies_service_pb2.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: footsies_service.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder + +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x16\x66ootsies_service.proto"B\n\tStepInput\x12\x11\n\tp1_action\x18\x01 \x01(\x03\x12\x11\n\tp2_action\x18\x02 \x01(\x03\x12\x0f\n\x07nFrames\x18\x03 \x01(\x03"\xe2\x04\n\x0bPlayerState\x12\x19\n\x11player_position_x\x18\x01 \x01(\x02\x12\x0f\n\x07is_dead\x18\x02 \x01(\x08\x12\x14\n\x0cvital_health\x18\x03 \x01(\x03\x12\x14\n\x0cguard_health\x18\x04 \x01(\x03\x12\x19\n\x11\x63urrent_action_id\x18\x05 \x01(\x03\x12\x1c\n\x14\x63urrent_action_frame\x18\x06 \x01(\x03\x12"\n\x1a\x63urrent_action_frame_count\x18\x07 \x01(\x03\x12\x15\n\ris_action_end\x18\x08 \x01(\x08\x12\x1c\n\x14is_always_cancelable\x18\t \x01(\x08\x12 \n\x18\x63urrent_action_hit_count\x18\n \x01(\x03\x12\x1e\n\x16\x63urrent_hit_stun_frame\x18\x0b \x01(\x03\x12\x16\n\x0eis_in_hit_stun\x18\x0c \x01(\x08\x12\x1d\n\x15sprite_shake_position\x18\r \x01(\x03\x12\x1e\n\x16max_sprite_shake_frame\x18\x0e \x01(\x03\x12\x12\n\nvelocity_x\x18\x0f \x01(\x02\x12\x15\n\ris_face_right\x18\x10 \x01(\x08\x12\x14\n\x0cinput_buffer\x18\x11 \x03(\x03\x12\x1f\n\x17\x63urrent_frame_advantage\x18\x12 \x01(\x03\x12%\n\x1dwould_next_forward_input_dash\x18\x13 \x01(\x08\x12&\n\x1ewould_next_backward_input_dash\x18\x14 \x01(\x08\x12\x1f\n\x17special_attack_progress\x18\x15 \x01(\x02"s\n\tGameState\x12\x1d\n\x07player1\x18\x01 \x01(\x0b\x32\x0c.PlayerState\x12\x1d\n\x07player2\x18\x02 \x01(\x0b\x32\x0c.PlayerState\x12\x13\n\x0bround_state\x18\x03 \x01(\x03\x12\x13\n\x0b\x66rame_count\x18\x04 \x01(\x03"F\n\x10\x45ncodedGameState\x12\x18\n\x10player1_encoding\x18\x01 
\x03(\x02\x12\x18\n\x10player2_encoding\x18\x02 \x03(\x02"\x1a\n\tBoolValue\x12\r\n\x05value\x18\x01 \x01(\x08"\x07\n\x05\x45mpty2\xef\x01\n\x13\x46ootsiesGameService\x12\x1d\n\tStartGame\x12\x06.Empty\x1a\x06.Empty"\x00\x12\x1d\n\tResetGame\x12\x06.Empty\x1a\x06.Empty"\x00\x12\'\n\x0bStepNFrames\x12\n.StepInput\x1a\n.GameState"\x00\x12 \n\x08GetState\x12\x06.Empty\x1a\n.GameState"\x00\x12.\n\x0fGetEncodedState\x12\x06.Empty\x1a\x11.EncodedGameState"\x00\x12\x1f\n\x07IsReady\x12\x06.Empty\x1a\n.BoolValue"\x00\x62\x06proto3' +) + +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "footsies_service_pb2", globals()) +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + _STEPINPUT._serialized_start = 26 + _STEPINPUT._serialized_end = 92 + _PLAYERSTATE._serialized_start = 95 + _PLAYERSTATE._serialized_end = 705 + _GAMESTATE._serialized_start = 707 + _GAMESTATE._serialized_end = 822 + _ENCODEDGAMESTATE._serialized_start = 824 + _ENCODEDGAMESTATE._serialized_end = 894 + _BOOLVALUE._serialized_start = 896 + _BOOLVALUE._serialized_end = 922 + _EMPTY._serialized_start = 924 + _EMPTY._serialized_end = 931 + _FOOTSIESGAMESERVICE._serialized_start = 934 + _FOOTSIESGAMESERVICE._serialized_end = 1173 +# @@protoc_insertion_point(module_scope) diff --git a/rllib/examples/envs/classes/multi_agent/footsies/game/proto/footsies_service_pb2_grpc.py b/rllib/examples/envs/classes/multi_agent/footsies/game/proto/footsies_service_pb2_grpc.py new file mode 100644 index 000000000000..b39a76d7bf5a --- /dev/null +++ b/rllib/examples/envs/classes/multi_agent/footsies/game/proto/footsies_service_pb2_grpc.py @@ -0,0 +1,307 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! 
+"""Client and server classes corresponding to protobuf-defined services.""" +import grpc + +import ray.rllib.examples.envs.classes.multi_agent.footsies.game.proto.footsies_service_pb2 as footsies__service__pb2 + + +# import footsies_service_pb2 as footsies__service__pb2 + + +class FootsiesGameServiceStub(object): + """Missing associated documentation comment in .proto file.""" + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. + """ + self.StartGame = channel.unary_unary( + "/FootsiesGameService/StartGame", + request_serializer=footsies__service__pb2.Empty.SerializeToString, + response_deserializer=footsies__service__pb2.Empty.FromString, + ) + self.ResetGame = channel.unary_unary( + "/FootsiesGameService/ResetGame", + request_serializer=footsies__service__pb2.Empty.SerializeToString, + response_deserializer=footsies__service__pb2.Empty.FromString, + ) + self.StepNFrames = channel.unary_unary( + "/FootsiesGameService/StepNFrames", + request_serializer=footsies__service__pb2.StepInput.SerializeToString, + response_deserializer=footsies__service__pb2.GameState.FromString, + ) + self.GetState = channel.unary_unary( + "/FootsiesGameService/GetState", + request_serializer=footsies__service__pb2.Empty.SerializeToString, + response_deserializer=footsies__service__pb2.GameState.FromString, + ) + self.GetEncodedState = channel.unary_unary( + "/FootsiesGameService/GetEncodedState", + request_serializer=footsies__service__pb2.Empty.SerializeToString, + response_deserializer=footsies__service__pb2.EncodedGameState.FromString, + ) + self.IsReady = channel.unary_unary( + "/FootsiesGameService/IsReady", + request_serializer=footsies__service__pb2.Empty.SerializeToString, + response_deserializer=footsies__service__pb2.BoolValue.FromString, + ) + + +class FootsiesGameServiceServicer(object): + """Missing associated documentation comment in .proto file.""" + + def StartGame(self, request, context): + """Missing associated documentation comment 
in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def ResetGame(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def StepNFrames(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def GetState(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def GetEncodedState(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def IsReady(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + +def add_FootsiesGameServiceServicer_to_server(servicer, server): + rpc_method_handlers = { + "StartGame": grpc.unary_unary_rpc_method_handler( + servicer.StartGame, + request_deserializer=footsies__service__pb2.Empty.FromString, + response_serializer=footsies__service__pb2.Empty.SerializeToString, + ), + "ResetGame": grpc.unary_unary_rpc_method_handler( + servicer.ResetGame, + request_deserializer=footsies__service__pb2.Empty.FromString, + 
response_serializer=footsies__service__pb2.Empty.SerializeToString, + ), + "StepNFrames": grpc.unary_unary_rpc_method_handler( + servicer.StepNFrames, + request_deserializer=footsies__service__pb2.StepInput.FromString, + response_serializer=footsies__service__pb2.GameState.SerializeToString, + ), + "GetState": grpc.unary_unary_rpc_method_handler( + servicer.GetState, + request_deserializer=footsies__service__pb2.Empty.FromString, + response_serializer=footsies__service__pb2.GameState.SerializeToString, + ), + "GetEncodedState": grpc.unary_unary_rpc_method_handler( + servicer.GetEncodedState, + request_deserializer=footsies__service__pb2.Empty.FromString, + response_serializer=footsies__service__pb2.EncodedGameState.SerializeToString, + ), + "IsReady": grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=footsies__service__pb2.Empty.FromString, + response_serializer=footsies__service__pb2.BoolValue.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + "FootsiesGameService", rpc_method_handlers + ) + server.add_generic_rpc_handlers((generic_handler,)) + + +# This class is part of an EXPERIMENTAL API. 
+class FootsiesGameService(object): + """Missing associated documentation comment in .proto file.""" + + @staticmethod + def StartGame( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/FootsiesGameService/StartGame", + footsies__service__pb2.Empty.SerializeToString, + footsies__service__pb2.Empty.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def ResetGame( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/FootsiesGameService/ResetGame", + footsies__service__pb2.Empty.SerializeToString, + footsies__service__pb2.Empty.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def StepNFrames( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/FootsiesGameService/StepNFrames", + footsies__service__pb2.StepInput.SerializeToString, + footsies__service__pb2.GameState.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def GetState( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, 
+ target, + "/FootsiesGameService/GetState", + footsies__service__pb2.Empty.SerializeToString, + footsies__service__pb2.GameState.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def GetEncodedState( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/FootsiesGameService/GetEncodedState", + footsies__service__pb2.Empty.SerializeToString, + footsies__service__pb2.EncodedGameState.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def IsReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/FootsiesGameService/IsReady", + footsies__service__pb2.Empty.SerializeToString, + footsies__service__pb2.BoolValue.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) diff --git a/rllib/examples/envs/classes/multi_agent/footsies/utils.py b/rllib/examples/envs/classes/multi_agent/footsies/utils.py new file mode 100644 index 000000000000..3321f32058c6 --- /dev/null +++ b/rllib/examples/envs/classes/multi_agent/footsies/utils.py @@ -0,0 +1,331 @@ +import collections +import logging +from dataclasses import dataclass +from typing import Dict, Optional + +import gymnasium as gym +import numpy as np +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.callbacks import RLlibCallback +from ray.rllib.core.rl_module import RLModuleSpec +from ray.rllib.env.env_runner 
import EnvRunner +from ray.rllib.env.multi_agent_episode import MultiAgentEpisode +from ray.rllib.examples.envs.classes.multi_agent.footsies.game.constants import ( + FOOTSIES_ACTION_IDS, +) +from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS +from ray.rllib.utils.metrics.metrics_logger import MetricsLogger +from ray.rllib.utils.typing import EpisodeType + +logger = logging.getLogger("ray.rllib") + + +@dataclass +class Matchup: + p1: str + p2: str + prob: float + + +class Matchmaker: + def __init__(self, matchups: list[Matchup]): + self.matchups = matchups + self.probs = [matchup.prob for matchup in matchups] + self.current_matchups = collections.defaultdict(dict) + + def agent_to_module_mapping_fn( + self, agent_id: str, episode: EpisodeType, **kwargs + ) -> str: + """Mapping function that retrieves policy_id from the sampled matchup""" + id_ = episode.id_ + if self.current_matchups.get(id_) is None: + # step 1: sample a matchup according to the specified probabilities + sampled_matchup = np.random.choice(a=self.matchups, p=self.probs) + + # step 2: Randomize who is player 1 and player 2 + policies = [sampled_matchup.p1, sampled_matchup.p2] + p1, p2 = np.random.choice(policies, size=2, replace=False) + + # step 3: Set as the current matchup for the episode in question (id_) + self.current_matchups[id_]["p1"] = p1 + self.current_matchups[id_]["p2"] = p2 + + policy_id = self.current_matchups[id_].pop(agent_id) + + # remove (an empty dict) for the current episode with id_ + if not self.current_matchups[id_]: + del self.current_matchups[id_] + + return policy_id + + +class MetricsLoggerCallback(RLlibCallback): + def __init__(self, main_policy: str) -> None: + """Log experiment metrics + + Logs metrics after each episode step and at the end of each (train or eval) episode. + Metrics logged at the end of each episode will be later used by MixManagerCallback + to decide whether to add a new opponent to the mix. 
+ """ + super().__init__() + self.main_policy = main_policy + self.action_id_to_str = { + action_id: action_str + for action_str, action_id in FOOTSIES_ACTION_IDS.items() + } + + def on_episode_step( + self, + *, + episode: MultiAgentEpisode, + env_runner: Optional[EnvRunner] = None, + metrics_logger: Optional[MetricsLogger] = None, + env: Optional[gym.Env] = None, + env_index: int, + **kwargs, + ) -> None: + """Log action usage frequency + + Log actions performed by both players at each step of the (training or evaluation) episode. + """ + stage = "eval" if env_runner.config.in_evaluation else "train" + + # get the ModuleID for each agent + p1_module = episode.module_for("p1") + p2_module = episode.module_for("p2") + + # get action string for each agent + p1_action_id = env.envs[ + env_index + ].unwrapped.last_game_state.player1.current_action_id + p2_action_id = env.envs[ + env_index + ].unwrapped.last_game_state.player2.current_action_id + p1_action_str = self.action_id_to_str[p1_action_id] + p2_action_str = self.action_id_to_str[p2_action_id] + + metrics_logger.log_value( + key=f"footsies/{stage}/actions/{p1_module}/{p1_action_str}", + value=1, + reduce="sum", + window=100, + clear_on_reduce=True, + ) + metrics_logger.log_value( + key=f"footsies/{stage}/actions/{p2_module}/{p2_action_str}", + value=1, + reduce="sum", + window=100, + clear_on_reduce=True, + ) + + def on_episode_end( + self, + *, + episode: MultiAgentEpisode, + env_runner: Optional[EnvRunner] = None, + metrics_logger: Optional[MetricsLogger] = None, + env: Optional[gym.Env] = None, + env_index: int, + **kwargs, + ) -> None: + """Log win rates + + Log win rates of the main policy against its opponent at the end of the (training or evaluation) episode. 
+ """ + stage = "eval" if env_runner.config.in_evaluation else "train" + + # check status of "p1" and "p2" + last_game_state = env.envs[env_index].unwrapped.last_game_state + p1_dead = last_game_state.player1.is_dead + p2_dead = last_game_state.player2.is_dead + + # get the ModuleID for each agent + p1_module = episode.module_for("p1") + p2_module = episode.module_for("p2") + + if self.main_policy == p1_module: + opponent_id = p2_module + main_policy_win = p2_dead + elif self.main_policy == p2_module: + opponent_id = p1_module + main_policy_win = p1_dead + else: + logger.info( + f"RLlib {self.__class__.__name__}: Main policy: '{self.main_policy}' not found in this episode. " + f"Policies in this episode are: '{p1_module}' and '{p2_module}'. " + f"Check your multi_agent 'policy_mapping_fn'. " + f"Metrics logging for this episode will be skipped." + ) + return + + if p1_dead and p2_dead: + metrics_logger.log_value( + key=f"footsies/{stage}/both_dead/{self.main_policy}/vs_{opponent_id}", + value=1, + reduce="mean", + window=100, + clear_on_reduce=True, + ) + elif not p1_dead and not p2_dead: + metrics_logger.log_value( + key=f"footsies/{stage}/both_alive/{self.main_policy}/vs_{opponent_id}", + value=1, + reduce="mean", + window=100, + clear_on_reduce=True, + ) + else: + # log the win rate against the opponent with an 'opponent_id' + metrics_logger.log_value( + key=f"footsies/{stage}/win_rates/{self.main_policy}/vs_{opponent_id}", + value=int(main_policy_win), + reduce="mean", + window=100, + clear_on_reduce=True, + ) + + # log the win rate, without specifying the opponent + # this metric collected from the eval env runner + # will be used to decide whether to add + # a new opponent at the current level. 
+ metrics_logger.log_value( + key=f"footsies/{stage}/win_rates/{self.main_policy}/vs_any", + value=int(main_policy_win), + reduce="mean", + window=100, + clear_on_reduce=True, + ) + + +class MixManagerCallback(RLlibCallback): + def __init__( + self, + win_rate_threshold: float, + main_policy: str, + target_mix_size: int, + starting_modules=list[str], # default is ["lstm", "noop"] + fixed_modules_progression_sequence=tuple[str], # default is ("noop", "back") + ) -> None: + """Track win rates and manage mix of opponents""" + super().__init__() + self.win_rate_threshold = win_rate_threshold + self.main_policy = main_policy + self.target_mix_size = target_mix_size + self.fixed_modules_progression_sequence = tuple( + fixed_modules_progression_sequence + ) # Order of RL modules to be added to the mix + self.modules_in_mix = list( + starting_modules + ) # RLModules that are currently in the mix + self._trained_policy_idx = ( + 0 # We will use this to create new opponents of the main policy + ) + + def on_evaluate_end( + self, + *, + algorithm: Algorithm, + metrics_logger: Optional[MetricsLogger] = None, + evaluation_metrics: dict, + **kwargs, + ) -> None: + """Check win rates and add new opponent if necessary. + + Check the win rate of the main policy against its current opponent. + If the win rate exceeds the specified threshold, add a new opponent to the mix, by modifying: + 1. update the policy_mapping_fn for (training and evaluation) env runners + 2. 
if the new policy is a trained one (not a fixed RL module), modify Algorithm's state (initialize the state of the newly added RLModule by using the main policy) + """ + _main_module = algorithm.get_module(self.main_policy) + new_module_id = None + new_module_spec = None + + win_rate = evaluation_metrics[ENV_RUNNER_RESULTS][ + f"footsies/eval/win_rates/{self.main_policy}/vs_any" + ] + + if win_rate > self.win_rate_threshold: + logger.info( + f"RLlib {self.__class__.__name__}: Win rate for main policy '{self.main_policy}' " + f"exceeded threshold ({win_rate} > {self.win_rate_threshold})." + f" Adding new RL Module to the mix..." + ) + + # check if fixed RL module should be added to the mix, + # and if so, create new_module_id and new_module_spec for it + for module_id in self.fixed_modules_progression_sequence: + if module_id not in self.modules_in_mix: + new_module_id = module_id + break + + # in case that all fixed RL Modules are already in the mix (together with the main policy), + # we will add a new RL Module by taking main policy and adding an instance of it to the mix + if new_module_id is None: + new_module_id = f"{self.main_policy}_v{self._trained_policy_idx}" + new_module_spec = RLModuleSpec.from_module(_main_module) + self._trained_policy_idx += 1 + + # create new policy mapping function, to ensure that the main policy plays against newly added policy + new_mapping_fn = Matchmaker( + [ + Matchup( + p1=self.main_policy, + p2=new_module_id, + prob=1.0, + ) + ] + ).agent_to_module_mapping_fn + + # update (training) env runners with the new mapping function + algorithm.env_runner_group.foreach_env_runner( + lambda er: er.config.multi_agent(policy_mapping_fn=new_mapping_fn), + local_env_runner=True, + ) + + # update (eval) env runners with the new mapping function + algorithm.eval_env_runner_group.foreach_env_runner( + lambda er: er.config.multi_agent(policy_mapping_fn=new_mapping_fn), + local_env_runner=True, + ) + + if new_module_id not in 
self.fixed_modules_progression_sequence: + algorithm.add_module( + module_id=new_module_id, + module_spec=new_module_spec, + new_agent_to_module_mapping_fn=new_mapping_fn, + ) + # newly added trained policy should be initialized with the state of the main policy + algorithm.set_state( + { + "learner_group": { + "learner": { + "rl_module": { + new_module_id: _main_module.get_state(), + } + } + }, + } + ) + # we added a new RL Module, so we need to update the current mix list. + self.modules_in_mix.append(new_module_id) + + else: + logger.info( + f"RLlib {self.__class__.__name__}: Win rate for main policy '{self.main_policy}' " + f"did not exceed threshold ({win_rate} <= {self.win_rate_threshold})." + ) + + def on_train_result( + self, + *, + algorithm: Algorithm, + metrics_logger: Optional[MetricsLogger] = None, + result: Dict, + **kwargs, + ) -> None: + """Report the current mix size at the end of training iteration. + + That will tell Ray Tune, whether to stop training (once the 'target_mix_size' has been reached). 
+ """ + result["mix_size"] = len(self.modules_in_mix) diff --git a/rllib/examples/envs/classes/utils/dummy_external_client.py b/rllib/examples/envs/classes/utils/dummy_external_client.py new file mode 100644 index 000000000000..8cc1bf0af6f7 --- /dev/null +++ b/rllib/examples/envs/classes/utils/dummy_external_client.py @@ -0,0 +1,126 @@ +import pickle +import socket +import time + +import gymnasium as gym +import numpy as np + +from ray.rllib.core import ( + Columns, + COMPONENT_RL_MODULE, +) +from ray.rllib.env.external.rllink import ( + get_rllink_message, + send_rllink_message, + RLlink, +) +from ray.rllib.env.single_agent_episode import SingleAgentEpisode +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.numpy import softmax + +torch, _ = try_import_torch() + + +def _dummy_external_client(port: int = 5556): + """A dummy client that runs CartPole and acts as a testing external env.""" + + def _set_state(msg_body, rl_module): + rl_module.set_state(msg_body[COMPONENT_RL_MODULE]) + # return msg_body[WEIGHTS_SEQ_NO] + + # Connect to server. + while True: + try: + print(f"Trying to connect to localhost:{port} ...") + sock_ = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock_.connect(("localhost", port)) + break + except ConnectionRefusedError: + time.sleep(5) + + # Send ping-pong. + send_rllink_message(sock_, {"type": RLlink.PING.name}) + msg_type, msg_body = get_rllink_message(sock_) + assert msg_type == RLlink.PONG + + # Request config. + send_rllink_message(sock_, {"type": RLlink.GET_CONFIG.name}) + msg_type, msg_body = get_rllink_message(sock_) + assert msg_type == RLlink.SET_CONFIG + + config = pickle.loads(msg_body["config"]) + # Create the RLModule. + rl_module = config.get_rl_module_spec().build() + + # Request state/weights. 
+ send_rllink_message(sock_, {"type": RLlink.GET_STATE.name}) + msg_type, msg_body = get_rllink_message(sock_) + assert msg_type == RLlink.SET_STATE + _set_state(msg_body["state"], rl_module) + + env_steps_per_sample = config.get_rollout_fragment_length() + + # Start actual env loop. + env = gym.make("CartPole-v1") + obs, _ = env.reset() + episode = SingleAgentEpisode(observations=[obs]) + episodes = [episode] + + while True: + # Perform action inference using the RLModule. + logits = rl_module.forward_exploration( + batch={ + Columns.OBS: torch.tensor(np.array([obs], np.float32)), + } + )[Columns.ACTION_DIST_INPUTS][ + 0 + ].numpy() # [0]=batch size 1 + + # Stochastic sample. + action_probs = softmax(logits) + action = int(np.random.choice(list(range(env.action_space.n)), p=action_probs)) + logp = float(np.log(action_probs[action])) + + # Perform the env step. + obs, reward, terminated, truncated, _ = env.step(action) + + # Collect step data. + episode.add_env_step( + action=action, + reward=reward, + observation=obs, + terminated=terminated, + truncated=truncated, + extra_model_outputs={ + Columns.ACTION_DIST_INPUTS: logits, + Columns.ACTION_LOGP: logp, + }, + ) + + # We collected enough samples -> Send them to server. + if sum(map(len, episodes)) == env_steps_per_sample: + # Send the data to the server. + send_rllink_message( + sock_, + { + "type": RLlink.EPISODES_AND_GET_STATE.name, + "episodes": [e.get_state() for e in episodes], + "timesteps": env_steps_per_sample, + }, + ) + # We are forced to sample on-policy. Have to wait for a response + # with the state (weights) in it. + msg_type, msg_body = get_rllink_message(sock_) + assert msg_type == RLlink.SET_STATE + _set_state(msg_body["state"], rl_module) + + episodes = [] + if not episode.is_done: + episode = episode.cut() + episodes.append(episode) + + # If episode is done, reset env and create a new episode. 
+ if episode.is_done: + obs, _ = env.reset() + episode = SingleAgentEpisode(observations=[obs]) + episodes.append(episode) diff --git a/rllib/examples/envs/env_connecting_to_rllib_w_tcp_client.py b/rllib/examples/envs/env_connecting_to_rllib_w_tcp_client.py index 3a757bab5993..aedd97237ce0 100644 --- a/rllib/examples/envs/env_connecting_to_rllib_w_tcp_client.py +++ b/rllib/examples/envs/env_connecting_to_rllib_w_tcp_client.py @@ -1,6 +1,6 @@ """Example of running against a TCP-connected external env performing its own inference. -The example uses a custom EnvRunner (TcpClientInferenceEnvRunner) to allow +The example uses a custom EnvRunner (EnvRunnerServerForExternalInference) to allow connections from one or more TCP clients to RLlib's EnvRunner actors, which act as RL servers. In this example, action inference for stepping the env is performed on the client's @@ -60,16 +60,17 @@ ConnectionError: Error receiving message from peer on socket ... ``` """ -from functools import partial import threading import gymnasium as gym import numpy as np from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig -from ray.rllib.env.tcp_client_inference_env_runner import ( - _dummy_client, - TcpClientInferenceEnvRunner, +from ray.rllib.env.external.env_runner_server_for_external_inference import ( + EnvRunnerServerForExternalInference, +) +from ray.rllib.examples.envs.classes.utils.dummy_external_client import ( + _dummy_external_client, ) from ray.rllib.utils.test_utils import ( add_rllib_example_script_args, @@ -90,34 +91,44 @@ help="The port for RLlib's EnvRunner to listen to for incoming UE5 connections. " "You need to specify the same port inside your UE5 `RLlibClient` plugin.", ) +parser.add_argument( + "--use-dummy-client", + action="store_true", + help="If set, the script runs with its own external client acting as a " + "simulator. 
Otherwise connect on your own from your C++ application.", +) if __name__ == "__main__": args = parser.parse_args() - # Start the dummy CartPole client in a thread (and do its thing in parallel). - client_thread = threading.Thread( - target=partial( - _dummy_client, - port=args.port - + (args.num_env_runners if args.num_env_runners is not None else 1), - ), - ) - client_thread.start() + # Start the dummy CartPole "simulation". + if args.use_dummy_client: + threading.Thread( + target=_dummy_external_client, + args=( + # Connect to the first remote EnvRunner, of - if there is no remote one - + # to the local EnvRunner. + args.port + + (args.num_env_runners if args.num_env_runners is not None else 1), + ), + ).start() # Define the RLlib (server) config. base_config = ( get_trainable_cls(args.algo) .get_default_config() .environment( - observation_space=gym.spaces.Box(-1.0, 1.0, (4,), np.float32), + observation_space=gym.spaces.Box( + float("-inf"), float("-inf"), (4,), np.float32 + ), action_space=gym.spaces.Discrete(2), # EnvRunners listen on `port` + their worker index. env_config={"port": args.port}, ) .env_runners( # Point RLlib to the custom EnvRunner to be used here. - env_runner_cls=TcpClientInferenceEnvRunner, + env_runner_cls=EnvRunnerServerForExternalInference, ) .training( num_epochs=10, diff --git a/rllib/examples/envs/external_envs/cartpole_client.py b/rllib/examples/envs/external_envs/cartpole_client.py deleted file mode 100755 index d1ed0345f0a5..000000000000 --- a/rllib/examples/envs/external_envs/cartpole_client.py +++ /dev/null @@ -1,133 +0,0 @@ -#!/usr/bin/env python - -# TODO (sven): Move this example script into the new API stack. - -""" -Example of running an external simulator (a simple CartPole env -in this case) against an RLlib policy server listening on one or more -HTTP-speaking port(s). See `cartpole_server.py` in this same directory for -how to start this server. 
- -This script will only create one single env altogether to illustrate -that RLlib can run w/o needing an internalized environment. - -Setup: -1) Start the policy server: - See `cartpole_server.py` on how to do this. -2) Run this client: - $ python cartpole_client.py --inference-mode=local|remote --[other options] - Use --help for help. - -In "local" inference-mode, the action computations are performed -inside the PolicyClient used in this script w/o sending an HTTP request -to the server. This reduces network communication overhead, but requires -the PolicyClient to create its own RolloutWorker (+Policy) based on -the server's config. The PolicyClient will retrieve this config automatically. -You do not need to define the RLlib config dict here! - -In "remote" inference mode, the PolicyClient will send action requests to the -server and not compute its own actions locally. The server then performs the -inference forward pass and returns the action to the client. - -In either case, the user of PolicyClient must: -- Declare new episodes and finished episodes to the PolicyClient. -- Log rewards to the PolicyClient. -- Call `get_action` to receive an action from the PolicyClient (whether it'd be - computed locally or remotely). -- Besides `get_action`, the user may let the PolicyClient know about - off-policy actions having been taken via `log_action`. This can be used in - combination with `get_action`, but will only work, if the connected server - runs an off-policy RL algorithm (such as DQN, SAC, or DDPG). -""" - -import argparse -import gymnasium as gym - -from ray.rllib.env.policy_client import PolicyClient -from ray._common.network_utils import build_address - -parser = argparse.ArgumentParser() -parser.add_argument( - "--no-train", action="store_true", help="Whether to disable training." 
-) -parser.add_argument( - "--inference-mode", type=str, default="local", choices=["local", "remote"] -) -parser.add_argument( - "--off-policy", - action="store_true", - help="Whether to compute random actions instead of on-policy " - "(Policy-computed) ones.", -) -parser.add_argument( - "--stop-reward", - type=float, - default=9999, - help="Stop once the specified reward is reached.", -) -parser.add_argument( - "--port", type=int, default=9900, help="The port to use (on localhost)." -) - -if __name__ == "__main__": - args = parser.parse_args() - - # The following line is the only instance, where an actual env will - # be created in this entire example (including the server side!). - # This is to demonstrate that RLlib does not require you to create - # unnecessary env objects within the PolicyClient/Server objects, but - # that only this following env and the loop below runs the entire - # training process. - env = gym.make("CartPole-v1") - - # If server has n workers, all ports between 9900 and 990[n-1] should - # be listened on. E.g. if server has num_env_runners=2, try 9900 or 9901. - # Note that no config is needed in this script as it will be defined - # on and sent from the server. - client = PolicyClient( - f"http://{build_address('localhost', args.port)}", - inference_mode=args.inference_mode, - ) - - # In the following, we will use our external environment (the CartPole - # env we created above) in connection with the PolicyClient to query - # actions (from the server if "remote"; if "local" we'll compute them - # on this client side), and send back observations and rewards. - - # Start a new episode. - obs, info = env.reset() - eid = client.start_episode(training_enabled=not args.no_train) - - rewards = 0.0 - while True: - # Compute an action randomly (off-policy) and log it. - if args.off_policy: - action = env.action_space.sample() - client.log_action(eid, obs, action) - # Compute an action locally or remotely (on server). 
- # No need to log it here as the action - else: - action = client.get_action(eid, obs) - - # Perform a step in the external simulator (env). - obs, reward, terminated, truncated, info = env.step(action) - rewards += reward - - # Log next-obs, rewards, and infos. - client.log_returns(eid, reward, info=info) - - # Reset the episode if done. - if terminated or truncated: - print("Total reward:", rewards) - if rewards >= args.stop_reward: - print("Target reward achieved, exiting") - exit(0) - - rewards = 0.0 - - # End the old episode. - client.end_episode(eid, obs) - - # Start a new episode. - obs, info = env.reset() - eid = client.start_episode(training_enabled=not args.no_train) diff --git a/rllib/examples/envs/external_envs/cartpole_server.py b/rllib/examples/envs/external_envs/cartpole_server.py deleted file mode 100755 index 65d86b14ef3e..000000000000 --- a/rllib/examples/envs/external_envs/cartpole_server.py +++ /dev/null @@ -1,278 +0,0 @@ -#!/usr/bin/env python - -# TODO (sven): Move this example script into the new API stack. - -""" -Example of running an RLlib policy server, allowing connections from -external environment running clients. The server listens on -(a simple CartPole env -in this case) against an RLlib policy server listening on one or more -HTTP-speaking ports. See `cartpole_client.py` in this same directory for how -to start any number of clients (after this server has been started). - -This script will not create any actual env to illustrate that RLlib can -run w/o needing an internalized environment. - -Setup: -1) Start this server: - $ python cartpole_server.py --num-workers --[other options] - Use --help for help. -2) Run n policy clients: - See `cartpole_client.py` on how to do this. - -The `num-workers` setting will allow you to distribute the incoming feed over n -listen sockets (in this example, between 9900 and 990n with n=worker_idx-1). -You may connect more than one policy client to any open listen port. 
-""" - -import argparse -import gymnasium as gym -import os - -import ray -from ray import tune -from ray.rllib.env.policy_server_input import PolicyServerInput -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) -from ray.tune.logger import pretty_print -from ray.tune.registry import get_trainable_cls -from ray.tune.result import TRAINING_ITERATION - -SERVER_ADDRESS = "localhost" -# In this example, the user can run the policy server with -# n workers, opening up listen ports 9900 - 990n (n = num_env_runners - 1) -# to each of which different clients may connect. -SERVER_BASE_PORT = 9900 # + worker-idx - 1 - -CHECKPOINT_FILE = "last_checkpoint_{}.out" - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - - # Example-specific args. - parser.add_argument( - "--port", - type=int, - default=SERVER_BASE_PORT, - help="The base-port to use (on localhost). " f"Default is {SERVER_BASE_PORT}.", - ) - parser.add_argument( - "--callbacks-verbose", - action="store_true", - help="Activates info-messages for different events on " - "server/client (episode steps, postprocessing, etc..).", - ) - parser.add_argument( - "--num-workers", - type=int, - default=2, - help="The number of workers to use. Each worker will create " - "its own listening socket for incoming experiences.", - ) - parser.add_argument( - "--no-restore", - action="store_true", - help="Do not restore from a previously saved checkpoint (location of " - "which is saved in `last_checkpoint_[algo-name].out`).", - ) - - # General args. 
- parser.add_argument( - "--run", - default="PPO", - choices=["APEX", "DQN", "IMPALA", "PPO", "R2D2"], - help="The RLlib-registered algorithm to use.", - ) - parser.add_argument("--num-cpus", type=int, default=3) - parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", - ) - parser.add_argument( - "--use-lstm", - action="store_true", - help="Whether to auto-wrap the model with an LSTM. Only valid option for " - "--run=[IMPALA|PPO|R2D2]", - ) - parser.add_argument( - "--stop-iters", type=int, default=200, help="Number of iterations to train." - ) - parser.add_argument( - "--stop-timesteps", - type=int, - default=500000, - help="Number of timesteps to train.", - ) - parser.add_argument( - "--stop-reward", - type=float, - default=80.0, - help="Reward at which we stop training.", - ) - parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", - ) - parser.add_argument( - "--no-tune", - action="store_true", - help="Run without Tune using a manual train loop instead. Here," - "there is no TensorBoard support.", - ) - parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", - ) - - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - ray.init() - - # `InputReader` generator (returns None if no input reader is needed on - # the respective worker). - def _input(ioctx): - # We are remote worker or we are local worker with num_env_runners=0: - # Create a PolicyServerInput. - if ioctx.worker_index > 0 or ioctx.worker.num_workers == 0: - return PolicyServerInput( - ioctx, - SERVER_ADDRESS, - args.port + ioctx.worker_index - (1 if ioctx.worker_index > 0 else 0), - ) - # No InputReader (PolicyServerInput) needed. 
- else: - return None - - # Algorithm config. Note that this config is sent to the client only in case - # the client needs to create its own policy copy for local inference. - config = ( - get_trainable_cls(args.run).get_default_config() - # Indicate that the Algorithm we setup here doesn't need an actual env. - # Allow spaces to be determined by user (see below). - .environment( - env=None, - # TODO: (sven) make these settings unnecessary and get the information - # about the env spaces from the client. - observation_space=gym.spaces.Box(float("-inf"), float("inf"), (4,)), - action_space=gym.spaces.Discrete(2), - ) - # DL framework to use. - .framework(args.framework) - # Use the `PolicyServerInput` to generate experiences. - .offline_data(input_=_input) - # Use n worker processes to listen on different ports. - .env_runners( - num_env_runners=args.num_workers, - # Connectors are not compatible with the external env. - enable_connectors=False, - ) - # Disable OPE, since the rollouts are coming from online clients. - .evaluation(off_policy_estimation_methods={}) - # Set to INFO so we'll see the server's actual address:port. - .debugging(log_level="INFO") - ) - # Disable RLModules because they need connectors - - # DQN. - if args.run == "DQN" or args.run == "APEX" or args.run == "R2D2": - # Example of using DQN (supports off-policy actions). - config.update_from_dict( - { - "num_steps_sampled_before_learning_starts": 100, - "min_sample_timesteps_per_iteration": 200, - "n_step": 3, - "rollout_fragment_length": 4, - "train_batch_size": 8, - } - ) - config.model.update( - { - "fcnet_hiddens": [64], - "fcnet_activation": "linear", - } - ) - if args.run == "R2D2": - config.model["use_lstm"] = args.use_lstm - - elif args.run == "IMPALA": - config.update_from_dict( - { - "num_gpus": 0, - "model": {"use_lstm": args.use_lstm}, - } - ) - - # PPO. - else: - # Example of using PPO (does NOT support off-policy actions). 
- config.update_from_dict( - { - "rollout_fragment_length": 1000, - "train_batch_size": 4000, - "model": {"use_lstm": args.use_lstm}, - } - ) - - checkpoint_path = CHECKPOINT_FILE.format(args.run) - # Attempt to restore from checkpoint, if possible. - if not args.no_restore and os.path.exists(checkpoint_path): - checkpoint_path = open(checkpoint_path).read() - else: - checkpoint_path = None - - # Manual training loop (no Ray tune). - if args.no_tune: - algo = config.build() - - if checkpoint_path: - print("Restoring from checkpoint path", checkpoint_path) - algo.restore(checkpoint_path) - - # Serving and training loop. - ts = 0 - for _ in range(args.stop_iters): - results = algo.train() - print(pretty_print(results)) - checkpoint = algo.save().checkpoint - print("Last checkpoint", checkpoint) - with open(checkpoint_path, "w") as f: - f.write(checkpoint.path) - if ( - results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= args.stop_reward - or ts >= args.stop_timesteps - ): - break - ts += results[f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}"] - - algo.stop() - - # Run with Tune for auto env and algo creation and TensorBoard. - else: - print("Ignoring restore even if previous checkpoint is provided...") - - stop = { - TRAINING_ITERATION: args.stop_iters, - NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, - } - - tune.Tuner( - args.run, - param_space=config, - run_config=tune.RunConfig(stop=stop, verbose=2), - ).fit() diff --git a/rllib/examples/envs/external_envs/dummy_client_with_two_episodes.py b/rllib/examples/envs/external_envs/dummy_client_with_two_episodes.py deleted file mode 100644 index 8f201d5e01c0..000000000000 --- a/rllib/examples/envs/external_envs/dummy_client_with_two_episodes.py +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/env python - -# TODO (sven): Move this example script into the new API stack. - -""" -For testing purposes only. 
-Runs a policy client that starts two episodes, uses one for calculating actions -("action episode") and the other for logging those actions ("logging episode"). -Terminates the "logging episode" before computing a few more actions -from the "action episode". -The action episode is also started with the training_enabled=False flag so no -batches should be produced by this episode for training inside the -SampleCollector's `postprocess_trajectory` method. -""" - -import argparse -import gymnasium as gym -import ray - -from ray.rllib.env.policy_client import PolicyClient -from ray._common.network_utils import build_address - -parser = argparse.ArgumentParser() -parser.add_argument( - "--inference-mode", type=str, default="local", choices=["local", "remote"] -) -parser.add_argument( - "--off-policy", - action="store_true", - help="Whether to compute random actions instead of on-policy " - "(Policy-computed) ones.", -) -parser.add_argument( - "--port", type=int, default=9900, help="The port to use (on localhost)." -) -parser.add_argument("--dummy-arg", type=str, default="") - - -if __name__ == "__main__": - args = parser.parse_args() - - ray.init() - - # Use a CartPole-v1 env so this plays nicely with our cartpole server script. - env = gym.make("CartPole-v1") - - # Note that the RolloutWorker that is generated inside the client (in case - # of local inference) will contain only a RandomEnv dummy env to step through. - # The actual env we care about is the above generated CartPole one. - client = PolicyClient( - f"http://{build_address('localhost', args.port)}", - inference_mode=args.inference_mode, - ) - - # Get a dummy obs - dummy_obs, dummy_infos = env.reset() - dummy_reward = 1.3 - - # Start an episode to only compute actions (do NOT record this episode's - # trajectories in any returned SampleBatches sent to the server for learning). 
- action_eid = client.start_episode(training_enabled=False) - print(f"Starting action episode: {action_eid}.") - # Get some actions using the action episode - dummy_action = client.get_action(action_eid, dummy_obs) - print(f"Computing action 1 in action episode: {dummy_action}.") - dummy_action = client.get_action(action_eid, dummy_obs) - print(f"Computing action 2 in action episode: {dummy_action}.") - - # Start a log episode to log action and log rewards for learning. - log_eid = client.start_episode(training_enabled=True) - print(f"Starting logging episode: {log_eid}.") - # Produce an action, just for testing. - garbage_action = client.get_action(log_eid, dummy_obs) - # Log 1 action and 1 reward. - client.log_action(log_eid, dummy_obs, dummy_action) - client.log_returns(log_eid, dummy_reward) - print(f".. logged action + reward: {dummy_action} + {dummy_reward}") - - # Log 2 actions (w/o reward in the middle) and then one reward. - # The reward after the 1st of these actions should be considered 0.0. - client.log_action(log_eid, dummy_obs, dummy_action) - client.log_action(log_eid, dummy_obs, dummy_action) - client.log_returns(log_eid, dummy_reward) - print(f".. logged actions + reward: 2x {dummy_action} + {dummy_reward}") - - # End the log episode - client.end_episode(log_eid, dummy_obs) - print(".. 
ended logging episode") - - # Continue getting actions using the action episode - # The bug happens when executing the following line - dummy_action = client.get_action(action_eid, dummy_obs) - print(f"Computing action 3 in action episode: {dummy_action}.") - dummy_action = client.get_action(action_eid, dummy_obs) - print(f"Computing action 4 in action episode: {dummy_action}.") diff --git a/rllib/examples/envs/external_envs/unity3d_client.py b/rllib/examples/envs/external_envs/unity3d_client.py deleted file mode 100644 index 4160836d8f2c..000000000000 --- a/rllib/examples/envs/external_envs/unity3d_client.py +++ /dev/null @@ -1,133 +0,0 @@ -# TODO (sven): Move this example script into the new API stack. - -""" -Example of running a Unity3D client instance against an RLlib Policy server. -Unity3D clients can be run in distributed fashion on n nodes in the cloud -and all connect to the same RLlib server for faster sample collection. -For a locally running Unity3D example, see: -`examples/unity3d_env_local.py` - -To run this script on possibly different machines -against a central Policy server: -1) Install Unity3D and `pip install mlagents`. - -2) Compile a Unity3D example game with MLAgents support (e.g. 3DBall or any - other one that you created yourself) and place the compiled binary - somewhere, where your RLlib client script (see below) can access it. - -2.1) To find Unity3D MLAgent examples, first `pip install mlagents`, - then check out the `.../ml-agents/Project/Assets/ML-Agents/Examples/` - folder. - -3) Change your RLlib Policy server code so it knows the observation- and - action Spaces, the different Policies (called "behaviors" in Unity3D - MLAgents), and Agent-to-Policy mappings for your particular game. - Alternatively, use one of the two already existing setups (3DBall or - SoccerStrikersVsGoalie). 
- -4) Then run (two separate shells/machines): -$ python unity3d_server.py --env 3DBall -$ python unity3d_client.py --inference-mode=local --game [path to game binary] -""" - -import argparse - -from ray.rllib.env.policy_client import PolicyClient -from ray.rllib.env.wrappers.unity3d_env import Unity3DEnv -from ray._common.network_utils import build_address - -SERVER_ADDRESS = "localhost" -SERVER_PORT = 9900 - -parser = argparse.ArgumentParser() -parser.add_argument( - "--game", - type=str, - default=None, - help="The game executable to run as RL env. If not provided, uses local " - "Unity3D editor instance.", -) -parser.add_argument( - "--horizon", - type=int, - default=200, - help="The max. number of `step()`s for any episode (per agent) before " - "it'll be reset again automatically.", -) -parser.add_argument( - "--server", - type=str, - default=SERVER_ADDRESS, - help="The Policy server's address to connect to from this client.", -) -parser.add_argument( - "--port", type=int, default=SERVER_PORT, help="The port to use (on --server)." -) -parser.add_argument( - "--no-train", - action="store_true", - help="Whether to disable training (on the server side).", -) -parser.add_argument( - "--inference-mode", - type=str, - default="local", - choices=["local", "remote"], - help="Whether to compute actions `local`ly or `remote`ly. Note that " - "`local` is much faster b/c observations/actions do not have to be " - "sent via the network.", -) -parser.add_argument( - "--update-interval-local-mode", - type=float, - default=10.0, - help="For `inference-mode=local`, every how many seconds do we update " - "learnt policy weights from the server?", -) -parser.add_argument( - "--stop-reward", - type=float, - default=9999, - help="Stop once the specified reward is reached.", -) - -if __name__ == "__main__": - args = parser.parse_args() - - # Start the client for sending environment information (e.g. observations, - # actions) to a policy server (listening on port 9900). 
- client = PolicyClient( - f"http://{build_address(args.server, args.port)}", - inference_mode=args.inference_mode, - update_interval=args.update_interval_local_mode, - ) - - # Start and reset the actual Unity3DEnv (either already running Unity3D - # editor or a binary (game) to be started automatically). - env = Unity3DEnv(file_name=args.game, episode_horizon=args.horizon) - obs, info = env.reset() - eid = client.start_episode(training_enabled=not args.no_train) - - # Keep track of the total reward per episode. - total_rewards_this_episode = 0.0 - - # Loop infinitely through the env. - while True: - # Get actions from the Policy server given our current obs. - actions = client.get_action(eid, obs) - # Apply actions to our env. - obs, rewards, terminateds, truncateds, infos = env.step(actions) - total_rewards_this_episode += sum(rewards.values()) - # Log rewards and single-agent terminateds. - client.log_returns(eid, rewards, infos, multiagent_done_dict=terminateds) - # Check whether all agents are done and end the episode, if necessary. - if terminateds["__all__"] or truncateds["__all__"]: - print("Episode done: Reward={}".format(total_rewards_this_episode)) - if total_rewards_this_episode >= args.stop_reward: - quit(0) - # End the episode and reset Unity Env. - total_rewards_this_episode = 0.0 - client.end_episode(eid, obs) - obs, info = env.reset() - # Start a new episode. - eid = client.start_episode(training_enabled=not args.no_train) diff --git a/rllib/examples/envs/external_envs/unity3d_dummy_client.py b/rllib/examples/envs/external_envs/unity3d_dummy_client.py deleted file mode 100644 index 58b723c61349..000000000000 --- a/rllib/examples/envs/external_envs/unity3d_dummy_client.py +++ /dev/null @@ -1,160 +0,0 @@ -# TODO (sven): Move this example script into the new API stack. 
- -""" -Dummy in-place replacement for the unity3d_client.py script -in case you don't have an actual Unity3D engine installed or just want -to test client/server connectivity with the unity3d_server.py script. - -This client script simply uses RLlib's RandomMultiAgentEnv to mimic -one of the ML Agents (Unity3D) example games (e.g. "3DBall"). - -To run this script on possibly different machines -against a central Policy server: - -1) Run (two separate shells/machines): -$ python unity3d_server.py --env 3DBall -$ python unity3d_dummy_client.py --env 3DBall --inference-mode=local -""" - -import argparse - -from ray.rllib.env.policy_client import PolicyClient -from ray.rllib.env.wrappers.unity3d_env import Unity3DEnv -from ray.rllib.examples.envs.classes.random_env import RandomMultiAgentEnv -from ray._common.network_utils import build_address - -SERVER_ADDRESS = "localhost" -SERVER_PORT = 9900 - -parser = argparse.ArgumentParser() -parser.add_argument( - "--env", - type=str, - default="3DBall", - choices=[ - "3DBall", - "3DBallHard", - "FoodCollector", - "GridFoodCollector", - "Pyramids", - "Sorter", - "Tennis", - "VisualHallway", - "Walker", - ], - help="The name of the Env to mimic. Only those examples supported so " - "far for which all agents have the same " - "observation- and action spaces (feel free to add more to this script!)", -) -parser.add_argument( - "--horizon", - type=int, - default=200, - help="The max. number of `step()`s for any episode (per agent) before " - "it'll be reset again automatically.", -) -parser.add_argument( - "--server", - type=str, - default=SERVER_ADDRESS, - help="The Policy server's address to connect to from this client.", -) -parser.add_argument( - "--port", type=int, default=SERVER_PORT, help="The port to use (on --server)." 
-) -parser.add_argument( - "--no-train", - action="store_true", - help="Whether to disable training (on the server side).", -) -parser.add_argument( - "--inference-mode", - type=str, - default="local", - choices=["local", "remote"], - help="Whether to compute actions `local`ly or `remote`ly. Note that " - "`local` is much faster b/c observations/actions do not have to be " - "sent via the network.", -) -parser.add_argument( - "--update-interval-local-mode", - type=float, - default=10.0, - help="For `inference-mode=local`, every how many seconds do we update " - "learnt policy weights from the server?", -) -parser.add_argument( - "--num-episodes", - type=int, - default=10, - help="Stop once the specified number of episodes have been played.", -) - -if __name__ == "__main__": - args = parser.parse_args() - - # Start the client for sending environment information (e.g. observations, - # actions) to a policy server (listening on port 9900). - client = PolicyClient( - f"http://{build_address(args.server, args.port)}", - inference_mode=args.inference_mode, - update_interval=args.update_interval_local_mode, - ) - - # Get the multi-agent policies dict and agent->policy - # mapping-fn. - policies, policy_mapping_fn = Unity3DEnv.get_policy_configs_for_game(args.env) - - # Make sure all policies' obs- and action spaces are the same. - # If not, we won't be able to mimic the Unity3D env using RLlib's - # RandomMultiAgentEnv. - first_policy_spec = next(iter(policies.values())) - for pid, policy_spec in policies.items(): - assert policy_spec.observation_space == first_policy_spec.observation_space - assert policy_spec.action_space == first_policy_spec.action_space - - # Start and reset the actual Unity3DEnv (either already running Unity3D - # editor or a binary (game) to be started automatically). - env = RandomMultiAgentEnv( - { - # Same number of agents as the actual Unity3D game would have. 
- "num_agents": len(policies), - # Make sure we stick to the user given horizons using our - # RandomMultiAgentEnv options. - "max_episode_len": args.horizon, - "p_terminated": 0.0, - # Same obs- action spaces as the actual Unity3D game would have. - "observation_space": first_policy_spec.observation_space, - "action_space": first_policy_spec.action_space, - } - ) - obs, info = env.reset() - eid = client.start_episode(training_enabled=not args.no_train) - - # Keep track of the total reward per episode. - total_rewards_this_episode = 0.0 - - # Loop through the env until n episodes completed. - num_episodes = 0 - while True: - # Get actions from the Policy server given our current obs. - actions = client.get_action(eid, obs) - # Apply actions to our env. - obs, rewards, terminateds, truncateds, infos = env.step(actions) - total_rewards_this_episode += sum(rewards.values()) - # Log rewards and single-agent terminateds. - client.log_returns(eid, rewards, infos, multiagent_done_dict=terminateds) - # Check whether all agents are done and end the episode, if necessary. - if terminateds["__all__"] or truncateds["__all__"]: - print("Episode done: Reward={}".format(total_rewards_this_episode)) - - num_episodes += 1 - if num_episodes >= args.num_episodes: - quit(0) - - # End the episode and reset dummy Env. - total_rewards_this_episode = 0.0 - client.end_episode(eid, obs) - obs, info = env.reset() - # Start a new episode. - eid = client.start_episode(training_enabled=not args.no_train) diff --git a/rllib/examples/envs/external_envs/unity3d_server.py b/rllib/examples/envs/external_envs/unity3d_server.py deleted file mode 100755 index 4457102877e1..000000000000 --- a/rllib/examples/envs/external_envs/unity3d_server.py +++ /dev/null @@ -1,178 +0,0 @@ -# TODO (sven): Move this example script into the new API stack. 
- -""" -Example of running a Unity3D (MLAgents) Policy server that can learn -Policies via sampling inside many connected Unity game clients (possibly -running in the cloud on n nodes). -For a locally running Unity3D example, see: -`examples/unity3d_env_local.py` - -To run this script against one or more possibly cloud-based clients: -1) Install Unity3D and `pip install mlagents`. - -2) Compile a Unity3D example game with MLAgents support (e.g. 3DBall or any - other one that you created yourself) and place the compiled binary - somewhere, where your RLlib client script (see below) can access it. - -2.1) To find Unity3D MLAgent examples, first `pip install mlagents`, - then check out the `.../ml-agents/Project/Assets/ML-Agents/Examples/` - folder. - -3) Change this RLlib Policy server code so it knows the observation- and - action Spaces, the different Policies (called "behaviors" in Unity3D - MLAgents), and Agent-to-Policy mappings for your particular game. - Alternatively, use one of the two already existing setups (3DBall or - SoccerStrikersVsGoalie). - -4) Then run (two separate shells/machines): -$ python unity3d_server.py --env 3DBall -$ python unity3d_client.py --inference-mode=local --game [path to game binary] -""" - -import argparse -import gymnasium as gym -import os - -import ray -from ray.rllib.env.policy_server_input import PolicyServerInput -from ray.rllib.env.wrappers.unity3d_env import Unity3DEnv -from ray.tune.registry import get_trainable_cls - -SERVER_ADDRESS = "localhost" -SERVER_PORT = 9900 -CHECKPOINT_FILE = "last_checkpoint_{}.out" - -parser = argparse.ArgumentParser() -parser.add_argument( - "--run", - default="PPO", - choices=["DQN", "PPO"], - help="The RLlib-registered algorithm to use.", -) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument( - "--num-workers", - type=int, - default=2, - help="The number of workers to use. 
Each worker will create " - "its own listening socket for incoming experiences.", -) -parser.add_argument( - "--env", - type=str, - default="3DBall", - choices=[ - "3DBall", - "3DBallHard", - "FoodCollector", - "GridFoodCollector", - "Pyramids", - "SoccerStrikersVsGoalie", - "Sorter", - "Tennis", - "VisualHallway", - "Walker", - ], - help="The name of the Env to run in the Unity3D editor " - "(feel free to add more to this script!)", -) -parser.add_argument( - "--port", - type=int, - default=SERVER_PORT, - help="The Policy server's port to listen on for ExternalEnv client conections.", -) -parser.add_argument( - "--checkpoint-freq", - type=int, - default=10, - help="The frequency with which to create checkpoint files of the learnt " - "Policies.", -) -parser.add_argument( - "--no-restore", - action="store_true", - help="Whether to load the Policy weights from a previous checkpoint", -) - -if __name__ == "__main__": - args = parser.parse_args() - ray.init() - - # `InputReader` generator (returns None if no input reader is needed on - # the respective worker). - def _input(ioctx): - # We are remote worker or we are local worker with num_env_runners=0: - # Create a PolicyServerInput. - if ioctx.worker_index > 0 or ioctx.worker.num_workers == 0: - return PolicyServerInput( - ioctx, - SERVER_ADDRESS, - args.port + ioctx.worker_index - (1 if ioctx.worker_index > 0 else 0), - ) - # No InputReader (PolicyServerInput) needed. - else: - return None - - # Get the multi-agent policies dict and agent->policy - # mapping-fn. - policies, policy_mapping_fn = Unity3DEnv.get_policy_configs_for_game(args.env) - - # The entire config will be sent to connecting clients so they can - # build their own samplers (and also Policy objects iff - # `inference_mode=local` on clients' command line). - config = ( - get_trainable_cls(args.run) - .get_default_config() - # DL framework to use. - .framework(args.framework) - # Use n worker processes to listen on different ports. 
- .env_runners( - num_env_runners=args.num_workers, - rollout_fragment_length=20, - ) - .environment( - env=None, - # TODO: (sven) make these settings unnecessary and get the information - # about the env spaces from the client. - observation_space=gym.spaces.Box(float("-inf"), float("inf"), (8,)), - action_space=gym.spaces.Box(-1.0, 1.0, (2,)), - ) - .training(train_batch_size=256) - # Multi-agent setup for the given env. - .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) - # Use the `PolicyServerInput` to generate experiences. - .offline_data(input_=_input) - # Disable OPE, since the rollouts are coming from online clients. - .evaluation(off_policy_estimation_methods={}) - ) - - # Create the Algorithm used for Policy serving. - algo = config.build() - - # Attempt to restore from checkpoint if possible. - checkpoint_path = CHECKPOINT_FILE.format(args.env) - if not args.no_restore and os.path.exists(checkpoint_path): - checkpoint_path = open(checkpoint_path).read() - print("Restoring from checkpoint path", checkpoint_path) - algo.restore(checkpoint_path) - - # Serving and training loop. - count = 0 - while True: - # Calls to train() will block on the configured `input` in the Algorithm - # config above (PolicyServerInput). - print(algo.train()) - if count % args.checkpoint_freq == 0: - print("Saving learning progress to checkpoint file.") - checkpoint = algo.save().checkpoint - # Write the latest checkpoint location to CHECKPOINT_FILE, - # so we can pick up from the latest one after a server re-start. 
- with open(checkpoint_path, "w") as f: - f.write(checkpoint.path) - count += 1 diff --git a/rllib/examples/learners/classes/vpg_torch_learner_shared_optimizer.py b/rllib/examples/learners/classes/vpg_torch_learner_shared_optimizer.py new file mode 100644 index 000000000000..4594cf1f2f28 --- /dev/null +++ b/rllib/examples/learners/classes/vpg_torch_learner_shared_optimizer.py @@ -0,0 +1,32 @@ +from ray.rllib.examples.learners.classes.vpg_torch_learner import VPGTorchLearner +from ray.rllib.core.learner.torch.torch_learner import TorchLearner +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch + +torch, _ = try_import_torch() + + +class VPGTorchLearnerSharedOptimizer(VPGTorchLearner): + """ + In order for a shared module to learn properly, a special, multi-agent Learner + has been set up. There is only one optimizer (used to train all submodules, e.g. + a shared encoder and n policy nets), in order to not destabilize learning. The + latter may happen if more than one optimizer would try to alternatingly optimize + the same shared submodule. + """ + + @override(TorchLearner) + def configure_optimizers(self) -> None: + # Get and aggregate parameters for every module + param_list = [] + for m in self.module.values(): + if self.rl_module_is_compatible(m): + param_list.extend(m.parameters()) + + self.register_optimizer( + optimizer_name="shared_optimizer", + optimizer=torch.optim.Adam(params=param_list), + params=param_list, + # For the policy learning rate, we use the "main" lr in the AlgorithmConfig. 
+ lr_or_lr_schedule=self.config.lr, + ) diff --git a/rllib/examples/multi_agent/self_play_footsies.py b/rllib/examples/multi_agent/self_play_footsies.py new file mode 100644 index 000000000000..2cc5213eced2 --- /dev/null +++ b/rllib/examples/multi_agent/self_play_footsies.py @@ -0,0 +1,112 @@ +""" +Multi-agent RLlib Footsies Simplified Example (PPO) + +About: + - This example is a simplified version of "rllib/tuned_examples/ppo/multi_agent_footsies_ppo.py", + which has more detailed comments and instructions. Please refer to that example for more information. + - This example is created to test the self-play training progression with footsies. + - Simplified version runs with single learner (cpu), single env runner, and single eval env runner. +""" +from pathlib import Path + +from ray.rllib.tuned_examples.ppo.multi_agent_footsies_ppo import ( + config, + env_creator, + stop, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, +) +from ray.tune.registry import register_env + +parser = add_rllib_example_script_args( + default_iters=500, + default_timesteps=5_000_000, +) +parser.add_argument( + "--train-start-port", + type=int, + default=45001, + help="First port number for the Footsies training environment server (default: 45001). 
Each server gets its own port.", +) +parser.add_argument( + "--eval-start-port", + type=int, + default=55001, + help="First port number for the Footsies evaluation environment server (default: 55001) Each server gets its own port.", +) +parser.add_argument( + "--binary-download-dir", + type=Path, + default="/tmp/ray/binaries/footsies", + help="Directory to download Footsies binaries (default: /tmp/ray/binaries/footsies)", +) +parser.add_argument( + "--binary-extract-dir", + type=Path, + default="/tmp/ray/binaries/footsies", + help="Directory to extract Footsies binaries (default: /tmp/ray/binaries/footsies)", +) +parser.add_argument( + "--binary-to-download", + type=str, + choices=["linux_server", "linux_windowed", "mac_headless", "mac_windowed"], + default="linux_server", + help="Target binary for Footsies environment (default: linux_server). Linux and Mac machines are supported. " + "'linux_server' and 'mac_headless' choices are the default options for the training. Game will run in the batchmode, without initializing the graphics. " + "'linux_windowed' and 'mac_windowed' choices are for the local run only, because " + "game will be rendered in the OS window. To use this option effectively, set up: " + "--no-tune --num-env-runners 0 --evaluation-num-env-runners 0", +) +parser.add_argument( + "--win-rate-threshold", + type=float, + default=0.55, + help="The main policy should have at least 'win-rate-threshold' win rate against the " + "other policy to advance to the next level. Moving to the next level " + "means adding a new policy to the mix.", +) +parser.add_argument( + "--target-mix-size", + type=int, + default=4, + help="Target number of policies (RLModules) in the mix to consider the test passed. " + "The initial mix size is 2: 'main policy' vs. 'other'. " + "`--target-mix-size=4` means that 2 new policies will be added to the mix. " + "Whether to add new policy is decided by checking the '--win-rate-threshold' condition. 
", +) +parser.add_argument( + "--rollout-fragment-length", + type=int, + default=256, + help="The length of each rollout fragment to be collected by the EnvRunners when sampling.", +) + +args = parser.parse_args() +register_env(name="FootsiesEnv", env_creator=env_creator) +stop["mix_size"] = args.target_mix_size + +config.environment( + env="FootsiesEnv", + env_config={ + "train_start_port": args.train_start_port, + "eval_start_port": args.eval_start_port, + "binary_download_dir": args.binary_download_dir, + "binary_extract_dir": args.binary_extract_dir, + "binary_to_download": args.binary_to_download, + }, +).training( + train_batch_size_per_learner=args.rollout_fragment_length + * (args.num_env_runners or 1), +) + + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + results = run_rllib_example_script_experiment( + base_config=config, + args=args, + stop=stop, + success_metric={"mix_size": args.target_mix_size}, + ) diff --git a/rllib/examples/multi_agent/shared_encoder_cartpole.py b/rllib/examples/multi_agent/shared_encoder_cartpole.py new file mode 100644 index 000000000000..caea04adef8c --- /dev/null +++ b/rllib/examples/multi_agent/shared_encoder_cartpole.py @@ -0,0 +1,164 @@ +"""A runnable example involving the use of a shared encoder module. + +How to run this script +---------------------- +`python [script file name].py --num-agents=2` + +Control the number of agents and policies (RLModules) via --num-agents. +--encoder-emb-dim sets the encoder output dimension, and --no-shared-encoder +runs the experiment with independent encoders. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +Under the shared encoder architecture, the target reward of 600 will typically be reached well before 100,000 timesteps. A trial concludes as below: + ++---------------------+------------+-----------------+--------+------------------+-------+-------------------+-------------+-------------+ +| Trial name | status | loc | iter | total time (s) | ts | combined return | return p1 | return p0 | +|---------------------+------------+-----------------+--------+------------------+-------+-------------------+-------------+-------------| +| VPG_env_ab318_00000 | TERMINATED | 127.0.0.1:37375 | 33 | 44.2689 | 74197 | 611.35 | 191.71 | 419.64 | ++---------------------+------------+-----------------+--------+------------------+-------+-------------------+-------------+-------------+ + +Without a shared encoder, a lower reward is typically achieved after training for the full 100,000 timesteps: + ++---------------------+------------+-----------------+--------+------------------+--------+-------------------+-------------+-------------+ +| Trial name | status | loc | iter | total time (s) | ts | combined return | return p0 | return p1 | +|---------------------+------------+-----------------+--------+------------------+--------+-------------------+-------------+-------------| +| VPG_env_2e79e_00000 | TERMINATED | 127.0.0.1:39076 | 37 | 52.127 | 103894 | 526.66 | 85.78 | 440.88 | ++---------------------+------------+-----------------+--------+------------------+--------+-------------------+-------------+-------------+ + + +""" + +import gymnasium as gym +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec + +from ray.rllib.examples.algorithms.classes.vpg import VPGConfig +from 
ray.rllib.examples.learners.classes.vpg_torch_learner_shared_optimizer import ( + VPGTorchLearnerSharedOptimizer, +) +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole +from ray.rllib.examples.rl_modules.classes.vpg_using_shared_encoder_rlm import ( + SHARED_ENCODER_ID, + SharedEncoder, + VPGPolicyAfterSharedEncoder, + VPGMultiRLModuleWithSharedEncoder, + VPGPolicyNoSharedEncoder, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import register_env + +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=100000, + default_reward=600.0, +) +parser.set_defaults( + algo="VPG", + num_agents=2, +) +parser.add_argument("--encoder-emb-dim", type=int, default=64) +parser.add_argument("--no-shared-encoder", action="store_true") + +if __name__ == "__main__": + args = parser.parse_args() + assert args.algo == "VPG", "The shared encoder example is meant for VPG agents." + assert args.num_agents == 2, "This example makes use of two agents." + + single_agent_env = gym.make( + "CartPole-v1" + ) # To allow instantiation of shared encoder + + EMBEDDING_DIM = args.encoder_emb_dim # encoder output dim + + if args.no_shared_encoder: + print("Running experiment without shared encoder") + specs = MultiRLModuleSpec( + rl_module_specs={ + # First policy net (no shared encoder). + "p0": RLModuleSpec( + module_class=VPGPolicyNoSharedEncoder, + model_config={ + "embedding_dim": EMBEDDING_DIM, + "hidden_dim": 64, + }, + ), + # Second policy net (same architecture as p0). + "p1": RLModuleSpec( + module_class=VPGPolicyNoSharedEncoder, + model_config={ + "embedding_dim": EMBEDDING_DIM, + "hidden_dim": 64, + }, + ), + } + ) + else: + specs = MultiRLModuleSpec( + multi_rl_module_class=VPGMultiRLModuleWithSharedEncoder, + rl_module_specs={ + # Shared encoder. 
+ SHARED_ENCODER_ID: RLModuleSpec( + module_class=SharedEncoder, + model_config={"embedding_dim": EMBEDDING_DIM}, + observation_space=single_agent_env.observation_space, + action_space=single_agent_env.action_space, + ), + # First policy net (consumes the shared encoder's output). + "p0": RLModuleSpec( + module_class=VPGPolicyAfterSharedEncoder, + model_config={ + "embedding_dim": EMBEDDING_DIM, + "hidden_dim": 64, + }, + ), + # Second policy net (same architecture as p0). + "p1": RLModuleSpec( + module_class=VPGPolicyAfterSharedEncoder, + model_config={ + "embedding_dim": EMBEDDING_DIM, + "hidden_dim": 64, + }, + ), + }, + ) + + # Register our environment with tune. + register_env( + "env", + lambda _: MultiAgentCartPole(config={"num_agents": args.num_agents}), + ) + + base_config = ( + VPGConfig() + .environment("env" if args.num_agents > 0 else "CartPole-v1") + .training( + learner_class=VPGTorchLearnerSharedOptimizer + if not args.no_shared_encoder + else None, + train_batch_size=2048, + lr=1e-2, + ) + .multi_agent( + policies={"p0", "p1"}, + policy_mapping_fn=lambda agent_id, episode, **kw: f"p{agent_id}", + ) + .rl_module( + rl_module_spec=specs, + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/multi_agent/utils/self_play_callback_old_api_stack.py b/rllib/examples/multi_agent/utils/self_play_callback_old_api_stack.py index 42b05b945017..eb19e57b02e4 100644 --- a/rllib/examples/multi_agent/utils/self_play_callback_old_api_stack.py +++ b/rllib/examples/multi_agent/utils/self_play_callback_old_api_stack.py @@ -1,7 +1,7 @@ import numpy as np from ray.rllib.callbacks.callbacks import RLlibCallback -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS diff --git a/rllib/examples/multi_agent/utils/self_play_league_based_callback_old_api_stack.py b/rllib/examples/multi_agent/utils/self_play_league_based_callback_old_api_stack.py index dc39fa8fac9a..e33a208be488 100644 --- 
a/rllib/examples/multi_agent/utils/self_play_league_based_callback_old_api_stack.py +++ b/rllib/examples/multi_agent/utils/self_play_league_based_callback_old_api_stack.py @@ -3,7 +3,7 @@ import numpy as np from ray.rllib.callbacks.callbacks import RLlibCallback -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS diff --git a/rllib/examples/rl_modules/classes/vpg_using_shared_encoder_rlm.py b/rllib/examples/rl_modules/classes/vpg_using_shared_encoder_rlm.py index 05a350de5ff0..2cd2809a6206 100644 --- a/rllib/examples/rl_modules/classes/vpg_using_shared_encoder_rlm.py +++ b/rllib/examples/rl_modules/classes/vpg_using_shared_encoder_rlm.py @@ -3,7 +3,15 @@ from ray.rllib.core import Columns from ray.rllib.core.rl_module.multi_rl_module import MultiRLModule from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule +from ray.rllib.core.models.base import ENCODER_OUT +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import ModuleID +from typing import ( + Any, + Dict, + Union, +) SHARED_ENCODER_ID = "shared_encoder" @@ -34,8 +42,7 @@ def setup(self): ) def _forward(self, batch, **kwargs): - # Embeddings can be found in the batch under the "encoder_embeddings" key. - embeddings = batch["encoder_embeddings"] + embeddings = batch[ENCODER_OUT] # Get the output of the encoder logits = self._pi_head(embeddings) return {Columns.ACTION_DIST_INPUTS: logits} @@ -48,23 +55,35 @@ class VPGMultiRLModuleWithSharedEncoder(MultiRLModule): """VPG (vanilla pol. gradient)-style MultiRLModule handling a shared encoder. # __sphinx_doc_mrlm_end__ - This MultiRLModule needs to be configured appropriately as follows: + This MultiRLModule needs to be configured appropriately as below. .. 
testcode:: # __sphinx_doc_how_to_run_begin__ import gymnasium as gym - from ray.rllib.algorithms.ppo import PPOConfig - from ray.rllib.core import MultiRLModuleSpec, RLModuleSpec + from ray.rllib.core.rl_module.rl_module import RLModuleSpec + from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec + + from ray.rllib.examples.algorithms.classes.vpg import VPGConfig + from ray.rllib.examples.learners.classes.vpg_torch_learner_shared_optimizer import VPGTorchLearnerSharedOptimizer from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole + from ray.rllib.examples.rl_modules.classes.vpg_using_shared_encoder_rlm import ( + SHARED_ENCODER_ID, + SharedEncoder, + VPGPolicyAfterSharedEncoder, + VPGMultiRLModuleWithSharedEncoder, + ) single_agent_env = gym.make("CartPole-v1") EMBEDDING_DIM = 64 # encoder output dim config = ( - PPOConfig() + VPGConfig() .environment(MultiAgentCartPole, env_config={"num_agents": 2}) + .training( + learner_class=VPGTorchLearnerSharedOptimizer, + ) .multi_agent( # Declare the two policies trained. policies={"p0", "p1"}, @@ -74,6 +93,7 @@ class VPGMultiRLModuleWithSharedEncoder(MultiRLModule): ) .rl_module( rl_module_spec=MultiRLModuleSpec( + multi_rl_module_class=VPGMultiRLModuleWithSharedEncoder, rl_module_specs={ # Shared encoder. SHARED_ENCODER_ID: RLModuleSpec( @@ -102,47 +122,52 @@ class VPGMultiRLModuleWithSharedEncoder(MultiRLModule): ), ) ) - algo = config.build() - print(algo.get_module("p0")) + algo = config.build_algo() + print(algo.train()) # __sphinx_doc_how_to_run_end__ - - Also note that in order to learn properly, a special, multi-agent Learner - accounting for the shared encoder must be setup. This Learner should have only - one optimizer (used to train all submodules: encoder and the n policy nets) in - order to not destabilize learning. The latter would happen if more than one - optimizer would try to alternatingly optimize the same shared encoder submodule. 
# __sphinx_doc_mrlm_2_begin__ """ def setup(self): # Call the super's setup(). super().setup() - # Assert, we have the shared encoder submodule. - assert ( - SHARED_ENCODER_ID in self._rl_modules - and isinstance(self._rl_modules[SHARED_ENCODER_ID], SharedEncoder) - and len(self._rl_modules) > 1 - ) + assert SHARED_ENCODER_ID in self._rl_modules and len(self._rl_modules) > 1 # Assign the encoder to a convenience attribute. self.encoder = self._rl_modules[SHARED_ENCODER_ID] - def _forward(self, batch, **kwargs): + def _forward(self, batch, forward_type, **kwargs): # Collect our policies' outputs in this dict. - outputs = {} - + fwd_out = {} # Loop through the policy nets (through the given batch's keys). for policy_id, policy_batch in batch.items(): - rl_module = self._rl_modules[policy_id] + # Feed this policy's observation into the shared encoder + encoder_output = self.encoder._forward(batch[policy_id]) + policy_batch[ENCODER_OUT] = encoder_output[ENCODER_OUT] + # Get the desired module + m = getattr(self._rl_modules[policy_id], forward_type) + # Pass the policy's embeddings through the policy net. + fwd_out[policy_id] = m(batch[policy_id], **kwargs) + return fwd_out - # Pass policy's observations through shared encoder to get the features for - # this policy. - policy_batch["encoder_embeddings"] = self.encoder._forward(batch[policy_id]) + # These methods could probably stand to be adjusted in MultiRLModule using something like this, so that subclasses that tweak _forward don't need to rewrite all of them. The prior implementation errored out because of this issue. + @override(MultiRLModule) + def _forward_inference( + self, batch: Dict[str, Any], **kwargs + ) -> Union[Dict[str, Any], Dict[ModuleID, Dict[str, Any]]]: + return self._forward(batch, "_forward_inference", **kwargs) - # Pass the policy's embeddings through the policy net. 
- outputs[policy_id] = rl_module._forward(batch[policy_id], **kwargs) + @override(MultiRLModule) + def _forward_exploration( + self, batch: Dict[str, Any], **kwargs + ) -> Union[Dict[str, Any], Dict[ModuleID, Dict[str, Any]]]: + return self._forward(batch, "_forward_exploration", **kwargs) - return outputs + @override(MultiRLModule) + def _forward_train( + self, batch: Dict[str, Any], **kwargs + ) -> Union[Dict[str, Any], Dict[ModuleID, Dict[str, Any]]]: + return self._forward(batch, "_forward_train", **kwargs) # __sphinx_doc_mrlm_2_end__ @@ -165,7 +190,63 @@ def setup(self): def _forward(self, batch, **kwargs): # Pass observations through the net and return outputs. - return {"encoder_embeddings": self._net(batch[Columns.OBS])} + return {ENCODER_OUT: self._net(batch[Columns.OBS])} # __sphinx_doc_encoder_end__ + + +# __sphinx_doc_ns_encoder_begin__ +class VPGIndividualEncoder(torch.nn.Module): + def __init__(self, observation_space, embedding_dim): + """ + An individual version of SharedEncoder, supporting direct comparison between + the two architectures. + """ + super().__init__() + + input_dim = observation_space.shape[0] + + # A very simple encoder network. + self._net = torch.nn.Sequential( + torch.nn.Linear(input_dim, embedding_dim), + ) + + def forward(self, batch, **kwargs): + # Pass observations through the net and return outputs. + return {ENCODER_OUT: self._net(batch[Columns.OBS])} + + +# __sphinx_doc_ns_encoder_end__ + + +# __sphinx_doc_ns_policy_begin__ +class VPGPolicyNoSharedEncoder(TorchRLModule): + """ + A VPG (vanilla pol. gradient)-style RLModule that doesn't use a shared encoder. + Facilitates experiments comparing shared and individual encoder architectures. + """ + + def setup(self): + super().setup() + + # Incoming feature dim from the encoder. 
+ embedding_dim = self.model_config["embedding_dim"] + hidden_dim = self.model_config["hidden_dim"] + + self._pi_head = torch.nn.Sequential( + torch.nn.Linear(embedding_dim, hidden_dim), + torch.nn.ReLU(), + torch.nn.Linear(hidden_dim, self.action_space.n), + ) + self.encoder = VPGIndividualEncoder(self.observation_space, embedding_dim) + + def _forward(self, batch, **kwargs): + if ENCODER_OUT not in batch: + batch = self.encoder(batch) + embeddings = batch[ENCODER_OUT] + logits = self._pi_head(embeddings) + return {Columns.ACTION_DIST_INPUTS: logits} + + +# __sphinx_doc_ns_policy_end__ diff --git a/rllib/execution/multi_gpu_learner_thread.py b/rllib/execution/multi_gpu_learner_thread.py index aacf797b32b8..556586e88f58 100644 --- a/rllib/execution/multi_gpu_learner_thread.py +++ b/rllib/execution/multi_gpu_learner_thread.py @@ -7,7 +7,7 @@ from ray.rllib.execution.minibatch_buffer import MinibatchBuffer from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import OldAPIStack, override -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder from ray.rllib.evaluation.rollout_worker import RolloutWorker diff --git a/rllib/execution/train_ops.py b/rllib/execution/train_ops.py index 732beb92e7c4..ebed28078b51 100644 --- a/rllib/execution/train_ops.py +++ b/rllib/execution/train_ops.py @@ -6,7 +6,7 @@ from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.annotations import OldAPIStack from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.rllib.utils.metrics import ( NUM_ENV_STEPS_TRAINED, NUM_AGENT_STEPS_TRAINED, diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 0b2d393c0d11..59180bcc8691 
100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -31,7 +31,7 @@ TorchMultiCategorical, ) from ray.rllib.utils.annotations import DeveloperAPI, PublicAPI -from ray.rllib.utils.deprecation import ( +from ray._common.deprecation import ( DEPRECATED_VALUE, deprecation_warning, ) diff --git a/rllib/models/distributions.py b/rllib/models/distributions.py index dac7b108d610..05d9670a8c7f 100644 --- a/rllib/models/distributions.py +++ b/rllib/models/distributions.py @@ -1,4 +1,4 @@ -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.rllib.core.distribution.distribution import Distribution # noqa deprecation_warning( diff --git a/rllib/models/modelv2.py b/rllib/models/modelv2.py index df07150e57ba..c3eda53c171d 100644 --- a/rllib/models/modelv2.py +++ b/rllib/models/modelv2.py @@ -11,7 +11,7 @@ from ray.rllib.policy.view_requirement import ViewRequirement from ray.rllib.utils import NullContextManager from ray.rllib.utils.annotations import OldAPIStack -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.framework import try_import_tf, try_import_torch, TensorType from ray.rllib.utils.spaces.repeated import Repeated from ray.rllib.utils.typing import ModelConfigDict, ModelInputDict, TensorStructType diff --git a/rllib/models/tf/attention_net.py b/rllib/models/tf/attention_net.py index 886580fce177..3a250bf897c1 100644 --- a/rllib/models/tf/attention_net.py +++ b/rllib/models/tf/attention_net.py @@ -29,7 +29,7 @@ from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space from ray.rllib.utils.tf_utils import flatten_inputs_to_1d_tensor, one_hot from ray.rllib.utils.typing import ModelConfigDict, TensorType, List -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.util import log_once tf1, tf, tfv = try_import_tf() diff --git 
a/rllib/models/tf/layers/gru_gate.py b/rllib/models/tf/layers/gru_gate.py index a41b23bbf534..4a3fc0ad5303 100644 --- a/rllib/models/tf/layers/gru_gate.py +++ b/rllib/models/tf/layers/gru_gate.py @@ -1,6 +1,6 @@ from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import TensorType, TensorShape -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.util import log_once tf1, tf, tfv = try_import_tf() diff --git a/rllib/models/tf/layers/multi_head_attention.py b/rllib/models/tf/layers/multi_head_attention.py index 595608989f0b..d1372c59903e 100644 --- a/rllib/models/tf/layers/multi_head_attention.py +++ b/rllib/models/tf/layers/multi_head_attention.py @@ -5,7 +5,7 @@ """ from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import TensorType -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.util import log_once tf1, tf, tfv = try_import_tf() diff --git a/rllib/models/tf/layers/noisy_layer.py b/rllib/models/tf/layers/noisy_layer.py index 5bc149d5de13..b1ade2acf1fc 100644 --- a/rllib/models/tf/layers/noisy_layer.py +++ b/rllib/models/tf/layers/noisy_layer.py @@ -7,7 +7,7 @@ TensorType, TensorShape, ) -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.util import log_once tf1, tf, tfv = try_import_tf() diff --git a/rllib/models/tf/layers/relative_multi_head_attention.py b/rllib/models/tf/layers/relative_multi_head_attention.py index f88486ff2051..d0dfd3a20e40 100644 --- a/rllib/models/tf/layers/relative_multi_head_attention.py +++ b/rllib/models/tf/layers/relative_multi_head_attention.py @@ -2,7 +2,7 @@ from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import TensorType -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import 
deprecation_warning from ray.util import log_once tf1, tf, tfv = try_import_tf() diff --git a/rllib/models/tf/layers/skip_connection.py b/rllib/models/tf/layers/skip_connection.py index 3ee1751caf36..1ae2525e997b 100644 --- a/rllib/models/tf/layers/skip_connection.py +++ b/rllib/models/tf/layers/skip_connection.py @@ -2,7 +2,7 @@ from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import TensorType -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.util import log_once tf1, tf, tfv = try_import_tf() diff --git a/rllib/models/tf/recurrent_net.py b/rllib/models/tf/recurrent_net.py index 2010d4a90118..cd4d721a2967 100644 --- a/rllib/models/tf/recurrent_net.py +++ b/rllib/models/tf/recurrent_net.py @@ -15,7 +15,7 @@ from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space from ray.rllib.utils.tf_utils import flatten_inputs_to_1d_tensor, one_hot from ray.rllib.utils.typing import ModelConfigDict, TensorType -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.util.debug import log_once tf1, tf, tfv = try_import_tf() diff --git a/rllib/models/tf/tf_modelv2.py b/rllib/models/tf/tf_modelv2.py index 743879694424..f1ad20c3b65e 100644 --- a/rllib/models/tf/tf_modelv2.py +++ b/rllib/models/tf/tf_modelv2.py @@ -6,7 +6,7 @@ from ray.util import log_once from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.utils.annotations import OldAPIStack, override -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import ModelConfigDict, TensorType diff --git a/rllib/models/torch/attention_net.py b/rllib/models/torch/attention_net.py index 2382a4da1381..d2624da0a5a2 100644 --- a/rllib/models/torch/attention_net.py +++ 
b/rllib/models/torch/attention_net.py @@ -30,7 +30,7 @@ from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space from ray.rllib.utils.torch_utils import flatten_inputs_to_1d_tensor, one_hot from ray.rllib.utils.typing import ModelConfigDict, TensorType, List -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.util import log_once torch, nn = try_import_torch() diff --git a/rllib/models/torch/mingpt.py b/rllib/models/torch/mingpt.py index 7e24cfdc730a..f64ea12419b8 100644 --- a/rllib/models/torch/mingpt.py +++ b/rllib/models/torch/mingpt.py @@ -20,7 +20,7 @@ from torch.nn import functional as F from ray.rllib.utils.annotations import DeveloperAPI -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated @DeveloperAPI diff --git a/rllib/models/torch/recurrent_net.py b/rllib/models/torch/recurrent_net.py index 01fbab223e29..d4afc688ea8e 100644 --- a/rllib/models/torch/recurrent_net.py +++ b/rllib/models/torch/recurrent_net.py @@ -15,7 +15,7 @@ from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space from ray.rllib.utils.torch_utils import flatten_inputs_to_1d_tensor, one_hot from ray.rllib.utils.typing import ModelConfigDict, TensorType -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.util.debug import log_once torch, nn = try_import_torch() diff --git a/rllib/models/torch/torch_distributions.py b/rllib/models/torch/torch_distributions.py index afba9a9a16a6..d9f77b975f4a 100644 --- a/rllib/models/torch/torch_distributions.py +++ b/rllib/models/torch/torch_distributions.py @@ -1,4 +1,4 @@ -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.rllib.core.distribution.torch.torch_distribution import ( # noqa TorchDistribution, TorchCategorical, diff --git 
a/rllib/offline/estimators/feature_importance.py b/rllib/offline/estimators/feature_importance.py index a5d4d1718932..148426aefb9b 100644 --- a/rllib/offline/estimators/feature_importance.py +++ b/rllib/offline/estimators/feature_importance.py @@ -2,7 +2,7 @@ __all__ = ["FeatureImportance"] -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning deprecation_warning( "ray.rllib.offline.estimators.feature_importance.FeatureImportance", diff --git a/rllib/offline/estimators/off_policy_estimator.py b/rllib/offline/estimators/off_policy_estimator.py index 7c6ef95eb78b..9abee46c1a12 100644 --- a/rllib/offline/estimators/off_policy_estimator.py +++ b/rllib/offline/estimators/off_policy_estimator.py @@ -13,7 +13,7 @@ ExperimentalAPI, OverrideToImplementCustomLogic, ) -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.typing import TensorType, SampleBatchType from ray.rllib.offline.offline_evaluator import OfflineEvaluator diff --git a/rllib/offline/is_estimator.py b/rllib/offline/is_estimator.py index 58c8da3e0c72..d395e3f9a356 100644 --- a/rllib/offline/is_estimator.py +++ b/rllib/offline/is_estimator.py @@ -1,5 +1,5 @@ from ray.rllib.offline.estimators.importance_sampling import ImportanceSampling -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated @Deprecated( diff --git a/rllib/offline/off_policy_estimator.py b/rllib/offline/off_policy_estimator.py index c8a08fb4a1df..9d2b90195a57 100644 --- a/rllib/offline/off_policy_estimator.py +++ b/rllib/offline/off_policy_estimator.py @@ -1,7 +1,7 @@ from ray.rllib.offline.estimators.off_policy_estimator import ( # noqa: F401 OffPolicyEstimator, ) -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning deprecation_warning( 
old="ray.rllib.offline.off_policy_estimator", diff --git a/rllib/offline/wis_estimator.py b/rllib/offline/wis_estimator.py index 128b50e24b2a..95c7e3bcec09 100644 --- a/rllib/offline/wis_estimator.py +++ b/rllib/offline/wis_estimator.py @@ -1,7 +1,7 @@ from ray.rllib.offline.estimators.weighted_importance_sampling import ( WeightedImportanceSampling, ) -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated @Deprecated( diff --git a/rllib/policy/dynamic_tf_policy.py b/rllib/policy/dynamic_tf_policy.py index 9645faf6e08f..180f3059e6db 100644 --- a/rllib/policy/dynamic_tf_policy.py +++ b/rllib/policy/dynamic_tf_policy.py @@ -16,7 +16,7 @@ from ray.rllib.utils import force_list from ray.rllib.utils.annotations import OldAPIStack, override from ray.rllib.utils.debug import summarize -from ray.rllib.utils.deprecation import ( +from ray._common.deprecation import ( deprecation_warning, DEPRECATED_VALUE, ) diff --git a/rllib/policy/eager_tf_policy.py b/rllib/policy/eager_tf_policy.py index c2e4fa33f159..c4f43c6d4ee0 100644 --- a/rllib/policy/eager_tf_policy.py +++ b/rllib/policy/eager_tf_policy.py @@ -17,7 +17,7 @@ from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils import add_mixins, force_list from ray.rllib.utils.annotations import OldAPIStack, override -from ray.rllib.utils.deprecation import ( +from ray._common.deprecation import ( DEPRECATED_VALUE, deprecation_warning, ) diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index 0b1db3653a8c..859e1d5847ac 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -40,7 +40,7 @@ get_checkpoint_info, try_import_msgpack, ) -from ray.rllib.utils.deprecation import ( +from ray._common.deprecation import ( DEPRECATED_VALUE, deprecation_warning, ) diff --git a/rllib/policy/policy_map.py b/rllib/policy/policy_map.py index b14b2a27056e..1627d3788939 100644 --- a/rllib/policy/policy_map.py +++ b/rllib/policy/policy_map.py @@ -6,7 +6,7 @@ 
import ray from ray.rllib.policy.policy import Policy from ray.rllib.utils.annotations import OldAPIStack, override -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.threading import with_lock from ray.rllib.utils.typing import PolicyID diff --git a/rllib/policy/sample_batch.py b/rllib/policy/sample_batch.py index 558140da8239..25bc6f313b09 100644 --- a/rllib/policy/sample_batch.py +++ b/rllib/policy/sample_batch.py @@ -12,7 +12,7 @@ from ray.rllib.core.columns import Columns from ray.rllib.utils.annotations import DeveloperAPI, ExperimentalAPI, PublicAPI from ray.rllib.utils.compression import pack, unpack, is_compressed -from ray.rllib.utils.deprecation import Deprecated, deprecation_warning +from ray._common.deprecation import Deprecated, deprecation_warning from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.torch_utils import convert_to_torch_tensor from ray.rllib.utils.typing import ( diff --git a/rllib/policy/tf_policy.py b/rllib/policy/tf_policy.py index ff68aeed8a46..377ce00727ff 100644 --- a/rllib/policy/tf_policy.py +++ b/rllib/policy/tf_policy.py @@ -14,7 +14,7 @@ from ray.rllib.utils import force_list from ray.rllib.utils.annotations import OldAPIStack, override from ray.rllib.utils.debug import summarize -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.error import ERR_MSG_TF_POLICY_CANNOT_SAVE_KERAS_MODEL from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.metrics import ( diff --git a/rllib/policy/tf_policy_template.py b/rllib/policy/tf_policy_template.py index d82e0691b362..dea3687f5526 100644 --- a/rllib/policy/tf_policy_template.py +++ b/rllib/policy/tf_policy_template.py @@ -10,7 +10,7 @@ from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.utils import add_mixins, 
force_list from ray.rllib.utils.annotations import OldAPIStack, override -from ray.rllib.utils.deprecation import ( +from ray._common.deprecation import ( deprecation_warning, DEPRECATED_VALUE, ) diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py index e0a82f00499a..8fc62da78c23 100644 --- a/rllib/tests/run_regression_tests.py +++ b/rllib/tests/run_regression_tests.py @@ -16,7 +16,7 @@ from ray import air from ray.air.integrations.wandb import WandbLoggerCallback from ray.rllib import _register_all -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, diff --git a/rllib/tuned_examples/bc/cartpole_bc.py b/rllib/tuned_examples/bc/cartpole_bc.py index 327dbb32fb44..0c0b3630a642 100644 --- a/rllib/tuned_examples/bc/cartpole_bc.py +++ b/rllib/tuned_examples/bc/cartpole_bc.py @@ -51,7 +51,7 @@ # The number of iterations to be run per learner when in multi-learner # mode in a single RLlib training iteration. Leave this to `None` to # run an entire epoch on the dataset during a single RLlib training - # iteration. For single-learner mode, 1 is the only option. + # iteration. dataset_num_iters_per_learner=5, ) .training( diff --git a/rllib/tuned_examples/bc/cartpole_bc_with_offline_evaluation.py b/rllib/tuned_examples/bc/cartpole_bc_with_offline_evaluation.py index 326f7712936b..25a6eec32b1e 100644 --- a/rllib/tuned_examples/bc/cartpole_bc_with_offline_evaluation.py +++ b/rllib/tuned_examples/bc/cartpole_bc_with_offline_evaluation.py @@ -79,7 +79,7 @@ # The number of iterations to be run per learner when in multi-learner # mode in a single RLlib training iteration. Leave this to `None` to # run an entire epoch on the dataset during a single RLlib training - # iteration. For single-learner mode, 1 is the only option. + # iteration. 
dataset_num_iters_per_learner=5, ) .training( diff --git a/rllib/tuned_examples/cql/pendulum_cql.py b/rllib/tuned_examples/cql/pendulum_cql.py index 74526cd23153..391e7a7376d0 100644 --- a/rllib/tuned_examples/cql/pendulum_cql.py +++ b/rllib/tuned_examples/cql/pendulum_cql.py @@ -44,7 +44,7 @@ # The number of iterations to be run per learner when in multi-learner # mode in a single RLlib training iteration. Leave this to `None` to # run an entire epoch on the dataset during a single RLlib training - # iteration. For single-learner mode 1 is the only option. + # iteration. dataset_num_iters_per_learner=5, # TODO (sven): Has this any influence in the connectors? actions_in_input_normalized=True, diff --git a/rllib/tuned_examples/iql/pendulum_iql.py b/rllib/tuned_examples/iql/pendulum_iql.py new file mode 100644 index 000000000000..cb56bb6faed5 --- /dev/null +++ b/rllib/tuned_examples/iql/pendulum_iql.py @@ -0,0 +1,90 @@ +from pathlib import Path + +from ray.tune.result import TRAINING_ITERATION +from ray.rllib.algorithms.iql.iql import IQLConfig +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + EVALUATION_RESULTS, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) + + +parser = add_rllib_example_script_args() +# Use `parser` to add your own custom command line options to this script +# and (if needed) use their values to set up `config` below. +args = parser.parse_args() + +assert ( + args.env == "Pendulum-v1" or args.env is None +), "This tuned example works only with `Pendulum-v1`." + +# Define the data paths. +data_path = "tests/data/pendulum/pendulum-v1_enormous" +base_path = Path(__file__).parents[2] +print(f"base_path={base_path}") +data_path = "local://" / base_path / data_path +print(f"data_path={data_path}") + +# Define the IQL config. 
+config = ( + IQLConfig() + .environment(env="Pendulum-v1") + .evaluation( + evaluation_interval=3, + evaluation_num_env_runners=1, + evaluation_duration=5, + evaluation_parallel_to_training=True, + ) + # Note, the `input_` argument is the major argument for the + # new offline API. Via the `input_read_method_kwargs` the + # arguments for the `ray.data.Dataset` read method can be + # configured. The read method needs at least as many blocks + # as remote learners. + .offline_data( + input_=[data_path.as_posix()], + # Concurrency defines the number of processes that run the + # `map_batches` transformations. This should be aligned with the + # 'prefetch_batches' argument in 'iter_batches_kwargs'. + map_batches_kwargs={"concurrency": 2, "num_cpus": 2}, + # This data set is small so do not prefetch too many batches and use no + # local shuffle. + iter_batches_kwargs={ + "prefetch_batches": 1, + }, + # The number of iterations to be run per learner when in multi-learner + # mode in a single RLlib training iteration. Leave this to `None` to + # run an entire epoch on the dataset during a single RLlib training + # iteration. + dataset_num_iters_per_learner=5, + ) + .training( + # To increase learning speed with multiple learners, + # increase the learning rates correspondingly. + actor_lr=2.59e-4 * (args.num_learners or 1) ** 0.5, + critic_lr=2.14e-4 * (args.num_learners or 1) ** 0.5, + value_lr=3.7e-5 * (args.num_learners or 1) ** 0.5, + # Smooth Polyak-averaging for the target network. + tau=6e-4, + # Update the target network each training iteration. 
+ target_network_update_freq=1, + train_batch_size_per_learner=1024, + ) + .rl_module( + model_config=DefaultModelConfig( + fcnet_activation="relu", + ), + ) +) + +stop = { + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -200.0, + TRAINING_ITERATION: 1250, +} + +if __name__ == "__main__": + run_rllib_example_script_experiment(config, args, stop=stop) diff --git a/rllib/tuned_examples/marwil/cartpole_marwil.py b/rllib/tuned_examples/marwil/cartpole_marwil.py index d31836b93960..c758bae0f238 100644 --- a/rllib/tuned_examples/marwil/cartpole_marwil.py +++ b/rllib/tuned_examples/marwil/cartpole_marwil.py @@ -57,7 +57,7 @@ # The number of iterations to be run per learner when in multi-learner # mode in a single RLlib training iteration. Leave this to `None` to # run an entire epoch on the dataset during a single RLlib training - # iteration. For single-learner mode 1 is the only option. + # iteration. dataset_num_iters_per_learner=5, ) .training( diff --git a/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py b/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py index ea88cca34180..72f020f3664d 100644 --- a/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py @@ -44,7 +44,7 @@ ) stop = { - NUM_ENV_STEPS_SAMPLED_LIFETIME: 300000, + NUM_ENV_STEPS_SAMPLED_LIFETIME: 400000, # Divide by num_agents to get actual return per agent. f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 450.0 * (args.num_agents or 1), } diff --git a/rllib/tuned_examples/ppo/multi_agent_footsies_ppo.py b/rllib/tuned_examples/ppo/multi_agent_footsies_ppo.py new file mode 100644 index 000000000000..e046c380933b --- /dev/null +++ b/rllib/tuned_examples/ppo/multi_agent_footsies_ppo.py @@ -0,0 +1,259 @@ +""" +Multi-agent RLlib Footsies Example (PPO) + +About: + - Example is based on the Footsies environment (https://github.com/chasemcd/FootsiesGym). 
+ - Footsies is a two-player fighting game where each player controls a character and tries to hit the opponent while avoiding being hit. + - Footsies is a zero-sum game: when one player wins (+1 reward) the other loses (-1 reward). + +Summary: + - Main policy is an LSTM-based policy. + - Training algorithm is PPO. + +Training: + - Training is governed by adding new, more complex opponents to the mix as the main policy reaches a certain win rate threshold against the current opponent. + - Current opponent is always the newest opponent added to the mix. + - Training starts with a very simple opponent: "noop" (does nothing), then progresses to "back" (only moves backwards). These are the fixed (very simple) policies that are used to kick off the training. + - After "back", new opponents are frozen copies of the main policy at different training stages. They will be added to the mix as "lstm_v0", "lstm_v1", etc. + - In this way - after kick-starting the training with fixed simple opponents - the main policy will play against a version of itself from an earlier training stage. + - The main policy has to achieve the win rate threshold against the current opponent to add a new opponent to the mix. + - Training concludes when the target mix size is reached. + +Evaluation: + - Evaluation is performed against the current (newest) opponent. + - Evaluation runs for a fixed number of episodes at the end of each training iteration.
+ +""" +import functools +from pathlib import Path + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.rl_module import RLModuleSpec, MultiRLModuleSpec +from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner +from ray.rllib.examples.envs.classes.multi_agent.footsies.fixed_rlmodules import ( + NoopFixedRLModule, + BackFixedRLModule, +) +from ray.rllib.examples.envs.classes.multi_agent.footsies.footsies_env import ( + env_creator, +) +from ray.rllib.examples.envs.classes.multi_agent.footsies.utils import ( + Matchup, + Matchmaker, + MetricsLoggerCallback, + MixManagerCallback, +) +from ray.rllib.examples.rl_modules.classes.lstm_containing_rlm import ( + LSTMContainingRLModule, +) +from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, +) +from ray.tune.registry import register_env +from ray.tune.result import TRAINING_ITERATION + +# setting two default stopping criteria: +# 1. training_iteration (via "stop_iters") +# 2. num_env_steps_sampled_lifetime (via "default_timesteps") +# ...values very high to make sure that the test passes by adding +# all required policies to the mix, not by hitting the iteration limit. +# Our main stopping criterion is "target_mix_size" (see an argument below). +parser = add_rllib_example_script_args( + default_iters=500, + default_timesteps=5_000_000, +) + +parser.add_argument( + "--train-start-port", + type=int, + default=45001, + help="First port number for the Footsies training environment server (default: 45001). 
Each server gets its own port.", +) +parser.add_argument( + "--eval-start-port", + type=int, + default=55001, + help="First port number for the Footsies evaluation environment server (default: 55001) Each server gets its own port.", +) +parser.add_argument( + "--binary-download-dir", + type=Path, + default="/tmp/ray/binaries/footsies", + help="Directory to download Footsies binaries (default: /tmp/ray/binaries/footsies)", +) +parser.add_argument( + "--binary-extract-dir", + type=Path, + default="/tmp/ray/binaries/footsies", + help="Directory to extract Footsies binaries (default: /tmp/ray/binaries/footsies)", +) +parser.add_argument( + "--binary-to-download", + type=str, + choices=["linux_server", "linux_windowed", "mac_headless", "mac_windowed"], + default="linux_server", + help="Target binary for Footsies environment (default: linux_server). Linux and Mac machines are supported. " + "'linux_server' and 'mac_headless' choices are the default options for the training. Game will run in the batchmode, without initializing the graphics. " + "'linux_windowed' and 'mac_windowed' choices are for the local run only, because " + "game will be rendered in the OS window. To use this option effectively, set up: " + "--no-tune --num-env-runners 0 --evaluation-num-env-runners 0", +) +parser.add_argument( + "--win-rate-threshold", + type=float, + default=0.8, + help="The main policy should have at least 'win-rate-threshold' win rate against the " + "other policy to advance to the next level. Moving to the next level " + "means adding a new policy to the mix.", +) +parser.add_argument( + "--target-mix-size", + type=int, + default=5, + help="Target number of policies (RLModules) in the mix to consider the test passed. " + "The initial mix size is 2: 'main policy' vs. 'other'. " + "`--target-mix-size=5` means that 3 new policies will be added to the mix. " + "Whether to add new policy is decided by checking the '--win-rate-threshold' condition. 
", +) +parser.add_argument( + "--rollout-fragment-length", + type=int, + default=256, + help="The length of each rollout fragment to be collected by the EnvRunners when sampling.", +) + +main_policy = "lstm" +args = parser.parse_args() +register_env(name="FootsiesEnv", env_creator=env_creator) + +config = ( + PPOConfig() + .reporting( + min_time_s_per_iteration=30, + ) + .environment( + env="FootsiesEnv", + env_config={ + "max_t": 1000, + "frame_skip": 4, + "observation_delay": 16, + "train_start_port": args.train_start_port, + "eval_start_port": args.eval_start_port, + "host": "localhost", + "binary_download_dir": args.binary_download_dir, + "binary_extract_dir": args.binary_extract_dir, + "binary_to_download": args.binary_to_download, + }, + ) + .learners( + num_learners=1, + num_cpus_per_learner=1, + num_gpus_per_learner=0, + num_aggregator_actors_per_learner=0, + ) + .env_runners( + env_runner_cls=MultiAgentEnvRunner, + num_env_runners=args.num_env_runners or 1, + num_cpus_per_env_runner=0.5, + num_envs_per_env_runner=1, + batch_mode="truncate_episodes", + rollout_fragment_length=args.rollout_fragment_length, + episodes_to_numpy=False, + create_env_on_local_worker=True, + ) + .training( + train_batch_size_per_learner=args.rollout_fragment_length + * (args.num_env_runners or 1), + lr=1e-4, + entropy_coeff=0.01, + num_epochs=10, + minibatch_size=128, + ) + .multi_agent( + policies={ + main_policy, + "noop", + "back", + }, + # this is a starting policy_mapping_fn + # It will be updated by the MixManagerCallback during training. + policy_mapping_fn=Matchmaker( + [Matchup(main_policy, "noop", 1.0)] + ).agent_to_module_mapping_fn, + # we only train the main policy, this doesn't change during training. 
+ policies_to_train=[main_policy], + ) + .rl_module( + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={ + main_policy: RLModuleSpec( + module_class=LSTMContainingRLModule, + model_config={ + "lstm_cell_size": 128, + "dense_layers": [128, 128], + "max_seq_len": 64, + }, + ), + # for simplicity, all fixed RLModules are added to the config at the start. + # However, only "noop" is used at the start of training, + # the others are added to the mix later by the MixManagerCallback. + "noop": RLModuleSpec(module_class=NoopFixedRLModule), + "back": RLModuleSpec(module_class=BackFixedRLModule), + }, + ) + ) + .evaluation( + evaluation_num_env_runners=args.evaluation_num_env_runners or 1, + evaluation_sample_timeout_s=120, + evaluation_interval=1, + evaluation_duration=10, # 10 episodes is enough to get a good win rate estimate + evaluation_duration_unit="episodes", + evaluation_parallel_to_training=False, + # we may add new RLModules to the mix at the end of the evaluation stage. + # Running evaluation in parallel may result in training for one more iteration on the old mix. + evaluation_force_reset_envs_before_iteration=True, + evaluation_config={ + "env_config": {"env-for-evaluation": True}, + }, # evaluation_config is used to add an argument to the env creator. + ) + .callbacks( + [ + functools.partial( + MetricsLoggerCallback, + main_policy=main_policy, + ), + functools.partial( + MixManagerCallback, + win_rate_threshold=args.win_rate_threshold, + main_policy=main_policy, + target_mix_size=args.target_mix_size, + starting_modules=[main_policy, "noop"], + fixed_modules_progression_sequence=( + "noop", + "back", + ), + ), + ] + ) +) + +# stopping criteria to be passed to Ray Tune. The main stopping criterion is "mix_size". +# "mix_size" is reported at the end of each training iteration by the MixManagerCallback. 
+stop = { + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + TRAINING_ITERATION: args.stop_iters, + "mix_size": args.target_mix_size, +} + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + results = run_rllib_example_script_experiment( + base_config=config, + args=args, + stop=stop, + success_metric={ + "mix_size": args.target_mix_size + }, # pass the success metric for RLlib's testing framework + ) diff --git a/rllib/utils/__init__.py b/rllib/utils/__init__.py index 7adcf6f7ca51..ff95e19a155f 100644 --- a/rllib/utils/__init__.py +++ b/rllib/utils/__init__.py @@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union from ray.rllib.utils.annotations import override, PublicAPI, DeveloperAPI -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.rllib.utils.filter import Filter from ray.rllib.utils.filter_manager import FilterManager from ray.rllib.utils.framework import ( diff --git a/rllib/utils/annotations.py b/rllib/utils/annotations.py index 6824412b354f..286c541e0f12 100644 --- a/rllib/utils/annotations.py +++ b/rllib/utils/annotations.py @@ -1,4 +1,4 @@ -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.util.annotations import _mark_annotated diff --git a/rllib/utils/checkpoints.py b/rllib/utils/checkpoints.py index 43c522cfc565..99cd5ef3846f 100644 --- a/rllib/utils/checkpoints.py +++ b/rllib/utils/checkpoints.py @@ -1036,7 +1036,8 @@ def try_import_msgpack(error: bool = False): error: Whether to raise an error if msgpack/msgpack_numpy cannot be imported. Returns: - The `msgpack` module. + The `msgpack` module, with the msgpack_numpy module already patched in. This + means you can already encode and decode numpy arrays with the returned module. Raises: ImportError: If error=True and msgpack/msgpack_numpy is not installed.
diff --git a/rllib/utils/filter.py b/rllib/utils/filter.py index 5f1418cfd2d1..8b4e6ffcd827 100644 --- a/rllib/utils/filter.py +++ b/rllib/utils/filter.py @@ -5,13 +5,13 @@ import tree # pip install dm_tree from ray.rllib.utils.annotations import OldAPIStack -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.numpy import ( SMALL_NUMBER, ) # Assuming SMALL_NUMBER is a small float like 1e-8 from ray.rllib.utils.typing import TensorStructType from ray.rllib.utils.serialization import _serialize_ndarray, _deserialize_ndarray -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning logger = logging.getLogger(__name__) diff --git a/rllib/utils/framework.py b/rllib/utils/framework.py index c0b9a28fa472..ba2280488d69 100644 --- a/rllib/utils/framework.py +++ b/rllib/utils/framework.py @@ -8,7 +8,7 @@ import ray from ray.rllib.utils.annotations import DeveloperAPI, PublicAPI -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.typing import ( TensorShape, TensorStructType, diff --git a/rllib/utils/memory.py b/rllib/utils/memory.py index fe739cc0f99b..323bec70c50f 100644 --- a/rllib/utils/memory.py +++ b/rllib/utils/memory.py @@ -1,4 +1,4 @@ -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.rllib.utils.numpy import aligned_array, concat_aligned # noqa deprecation_warning( diff --git a/rllib/utils/metrics/metrics_logger.py b/rllib/utils/metrics/metrics_logger.py index 627ec5b9d579..d941db5599c3 100644 --- a/rllib/utils/metrics/metrics_logger.py +++ b/rllib/utils/metrics/metrics_logger.py @@ -4,7 +4,7 @@ from ray.rllib.utils import force_tuple, deep_update from ray.rllib.utils.metrics.stats import Stats, merge_stats -from ray.rllib.utils.deprecation import Deprecated, deprecation_warning +from 
ray._common.deprecation import Deprecated, deprecation_warning from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.util.annotations import PublicAPI from ray.util import log_once @@ -761,7 +761,6 @@ def aggregate( key: Optional top-level key under which to log all keys/key sequences found in the n `stats_dicts`. """ - assert isinstance(stats_dicts, list), "stats_dicts must be a list" all_keys = set() def traverse_and_add_paths(d, path=()): @@ -1168,6 +1167,8 @@ def set_state(self, state: Dict[str, Any]) -> None: state: The state to set `self` to. """ with self._threading_lock: + # Reset all existing stats to ensure a clean state transition + self.stats = {} for flat_key, stats_state in state["stats"].items(): self._set_key(flat_key.split("--"), Stats.from_state(stats_state)) diff --git a/rllib/utils/metrics/stats.py b/rllib/utils/metrics/stats.py index a764639effdd..5ec957580c34 100644 --- a/rllib/utils/metrics/stats.py +++ b/rllib/utils/metrics/stats.py @@ -1,58 +1,21 @@ from collections import defaultdict, deque -import time import copy -import threading import heapq +import threading +import time from typing import Any, Dict, List, Union, Optional, Tuple +import uuid import numpy as np from ray.rllib.utils import force_list -from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.numpy import convert_to_numpy from ray.util.annotations import DeveloperAPI -_, tf, _ = try_import_tf() torch, _ = try_import_torch() -@DeveloperAPI -def compute_percentiles(sorted_list, percentiles): - """Compute percentiles from an already sorted list. - - Note that this will not raise an error if the list is not sorted to avoid overhead. 
- - Args: - sorted_list: A list of numbers sorted in ascending order - percentiles: A list of percentile values (0-100) - - Returns: - A dictionary mapping percentile values to their corresponding data values - """ - n = len(sorted_list) - - if n == 0: - return {p: None for p in percentiles} - - results = {} - - for p in percentiles: - index = (p / 100) * (n - 1) - - if index.is_integer(): - results[p] = sorted_list[int(index)] - else: - lower_index = int(index) - upper_index = lower_index + 1 - weight = index - lower_index - results[p] = ( - sorted_list[lower_index] * (1 - weight) - + sorted_list[upper_index] * weight - ) - - return results - - @DeveloperAPI class Stats: """A container class holding a number of values and executing reductions over them. @@ -172,9 +135,9 @@ def __init__( "A window must be specified when reduce is 'percentiles'!" ) if reduce_per_index_on_aggregate is not False: - print(reduce_per_index_on_aggregate) raise ValueError( - "`reduce_per_index_on_aggregate` must be `False` when `percentiles` is not `False`!" + f"`reduce_per_index_on_aggregate` ({reduce_per_index_on_aggregate})" + f" must be `False` when `percentiles` is not `False`!" ) if percentiles is True: @@ -234,16 +197,18 @@ def __init__( self._has_returned_zero = False # On each `.reduce()` call, we store the result of this call in - # reduce_history[0] and the previous `reduce()` result in reduce_history[1]. - self._reduce_history: deque[List[Any]] = deque( - [[np.nan], [np.nan], [np.nan]], maxlen=3 - ) + # self._last_reduced. + self._last_reduced = [np.nan] + # The ID of this Stats instance. + self.id_ = str(uuid.uuid4()) + self._prev_merge_values = defaultdict(int) self._throughput_ema_coeff = throughput_ema_coeff self._throughput_stats = None if throughput is not False: self._throughput_stats = Stats( - # We have to check for bool here because in Python, bool is a subclass of int + # We have to check for bool here because in Python, bool is a subclass + # of int.
init_values=[throughput] if ( isinstance(throughput, (int, float)) @@ -258,9 +223,9 @@ def __init__( throughput_ema_coeff=None, ) if init_values is not None: - self._last_push_time = time.perf_counter() + self._last_throughput_measure_time = time.perf_counter() else: - self._last_push_time = ( + self._last_throughput_measure_time = ( -1 ) # Track last push time for throughput calculation @@ -282,7 +247,7 @@ def check_value(self, value: Any) -> None: if self._reduce_method is not None: if isinstance(value, np.ndarray) and value.shape == (): return - elif (torch and torch.is_tensor(value)) or (tf and tf.is_tensor(value)): + elif torch and torch.is_tensor(value): self._is_tensor = True if tuple(value.shape) == (): return @@ -302,14 +267,7 @@ def push(self, value: Any) -> None: self.check_value(value) # If throughput tracking is enabled, calculate it based on time between pushes if self.has_throughput: - current_time = time.perf_counter() - if self._last_push_time >= 0: - time_diff = current_time - self._last_push_time - if time_diff > 0: # Avoid division by zero - current_throughput = value / time_diff - self._throughput_stats.push(current_throughput) - self._last_push_time = current_time - + self._recompute_throughput(value) # Handle different reduction methods if self._window is not None: # For windowed operations, append to values and trim if needed @@ -377,27 +335,13 @@ def peek(self, compile: bool = True) -> Union[Any, List[Any]]: return compute_percentiles(reduced_values, self._percentiles) return reduced_value else: - return_value = self.get_reduce_history()[-1].copy() + return_value = self._last_reduced if compile: # We don't need to check for self._reduce_method or percentiles here # because we only store the reduced value if there is a reduce method. return_value = return_value[0] return return_value - def get_reduce_history(self) -> List[Any]: - """Returns the history of reduced values as a list. 
- - The history contains the most recent reduced values, with the most recent value - at the end of the list. The length of the history is limited by the maxlen of - the internal history deque. - - Returns: - A list containing the history of reduced values. - """ - # Turning the reduce history into a deque avoids mutating the original reduce - # history's elements. - return list(self._reduce_history) - @property def throughput(self) -> float: """Returns the current throughput estimate per second. @@ -447,15 +391,11 @@ class for details on the reduction logic applied to the values list, based on # `clear_on_reduce` -> Clear the values list. if self._clear_on_reduce: self._set_values([]) - # If we clear on reduce, following reduce calls should not return the - # old values. - self._has_new_values = True else: - self._has_new_values = False self._set_values(reduced_internal_values_list) else: reduced_internal_values_list = None - reduced = self.get_reduce_history()[-1] + reduced = self._last_reduced reduced = self._numpy_if_necessary(reduced) @@ -464,7 +404,7 @@ class for details on the reduction logic applied to the values list, based on # It only makes sense to extend the history if we are reducing to a single # value. We need to make a copy here because the new_values_list is a # reference to the internal values list - self._reduce_history.append(force_list(reduced.copy())) + self._last_reduced = force_list(reduced.copy()) else: # If there is a window and no reduce method, we don't want to use the reduce # history to return reduced values in other methods @@ -510,10 +450,6 @@ def merge_on_time_axis(self, other: "Stats") -> None: """ self.values.extend(other.values) - # Adopt `other`'s current throughput estimate (it's the newer one). 
- if self.has_throughput: - self._throughput_stats.merge_on_time_axis(other._throughput_stats) - # Mark that we have new values since we modified the values list self._has_new_values = True @@ -587,9 +523,8 @@ def merge_in_parallel(self, *others: "Stats") -> None: continue tmp_values.append(stats.values[-i]) - # Now reduce across `tmp_values` based on the reduce-settings of this Stats. - # TODO (sven) : explain why all this - + # Now reduce across `tmp_values` based on the reduce-settings of this + # Stats. if self._reduce_per_index_on_aggregate: n_values = 1 else: @@ -603,10 +538,10 @@ def merge_in_parallel(self, *others: "Stats") -> None: # We add [sum(tmp_values) / n_values] * n_values to the new values # list instead of tmp_values, because every incoming element should # have the same weight. - reduced_value = ( - self._reduced_values(values=tmp_values)[0][0] / n_values - ) - new_values.extend([reduced_value] * n_values) + added_sum = self._reduced_values(values=tmp_values)[0][0] + new_values.extend([added_sum / n_values] * n_values) + if self.has_throughput: + self._recompute_throughput(added_sum) else: new_values.extend( self._reduced_values(values=tmp_values)[0] * n_values @@ -619,16 +554,37 @@ def merge_in_parallel(self, *others: "Stats") -> None: self._set_values(list(reversed(new_values))) - # Adopt `other`'s current throughput estimate (it's the newer one). - if self.has_throughput: - other_throughput_stats = [ - other._throughput_stats for other in others if other.has_throughput - ] - self._throughput_stats.merge_in_parallel(*other_throughput_stats) - # Mark that we have new values since we modified the values list self._has_new_values = True + def clear_throughput(self) -> None: + """Clears the throughput Stats, if applicable and `self` has throughput. + + Also resets `self._last_throughput_measure_time` to -1 such that the Stats + object has to create a new timestamp first, before measuring any new throughput + values. 
+ """ + if self.has_throughput: + self._throughput_stats._set_values([]) + self._last_throughput_measure_time = -1 + + def _recompute_throughput(self, value) -> None: + """Recomputes the current throughput value of this Stats instance.""" + # Make sure this Stats object does measure throughput. + assert self.has_throughput + # Take the current time stamp. + current_time = time.perf_counter() + # Check, whether we have a previous timestamp (non -1). + if self._last_throughput_measure_time >= 0: + # Compute the time delta. + time_diff = current_time - self._last_throughput_measure_time + # Avoid divisions by zero. + if time_diff > 0: + # Push new throughput value into our throughput stats object. + self._throughput_stats.push(value / time_diff) + # Update the time stamp of the most recent throughput computation (this one). + self._last_throughput_measure_time = current_time + @staticmethod def _numpy_if_necessary(values): # Torch tensor handling. Convert to CPU/numpy first. @@ -745,7 +701,7 @@ def get_state(self) -> Dict[str, Any]: "window": self._window, "ema_coeff": self._ema_coeff, "clear_on_reduce": self._clear_on_reduce, - "_hist": list(self.get_reduce_history()), + "_last_reduced": self._last_reduced, "_is_tensor": self._is_tensor, } if self._throughput_stats is not None: @@ -804,13 +760,13 @@ def from_state(state: Dict[str, Any]) -> "Stats": # Compatibility to old checkpoints where a reduce sometimes resulted in a single # values instead of a list such that the history would be a list of integers # instead of a list of lists. - # TODO(Artur): Remove this after a few Ray releases. - if not isinstance(state["_hist"][0], list): - state["_hist"] = list(map(lambda x: [x], state["_hist"])) - - stats._reduce_history = deque( - state["_hist"], maxlen=stats._reduce_history.maxlen - ) + if "_hist" in state: + # TODO(Artur): Remove this after a few Ray releases. 
+ if not isinstance(state["_hist"][0], list): + state["_hist"] = list(map(lambda x: [x], state["_hist"])) + stats._last_reduced = state["_hist"][-1] + else: + stats._last_reduced = state.get("_last_reduced", [np.nan]) return stats @staticmethod @@ -845,7 +801,8 @@ def similar_to( else False, throughput_ema_coeff=other._throughput_ema_coeff, ) - stats._reduce_history = other._reduce_history + stats.id_ = other.id_ + stats._last_reduced = other._last_reduced return stats def _set_values(self, new_values): @@ -930,8 +887,6 @@ def _reduced_values(self, values=None) -> Tuple[Any, Any]: def safe_isnan(value): if torch and isinstance(value, torch.Tensor): return torch.isnan(value) - if tf and tf.is_tensor(value): - return tf.math.is_nan(value) return np.isnan(value) # Convert from numpy to primitive python types, if original `values` are @@ -960,6 +915,43 @@ def safe_isnan(value): return [reduced], values +@DeveloperAPI +def compute_percentiles(sorted_list, percentiles): + """Compute percentiles from an already sorted list. + + Note that this will not raise an error if the list is not sorted to avoid overhead. + + Args: + sorted_list: A list of numbers sorted in ascending order + percentiles: A list of percentile values (0-100) + + Returns: + A dictionary mapping percentile values to their corresponding data values + """ + n = len(sorted_list) + + if n == 0: + return {p: None for p in percentiles} + + results = {} + + for p in percentiles: + index = (p / 100) * (n - 1) + + if index.is_integer(): + results[p] = sorted_list[int(index)] + else: + lower_index = int(index) + upper_index = lower_index + 1 + weight = index - lower_index + results[p] = ( + sorted_list[lower_index] * (1 - weight) + + sorted_list[upper_index] * weight + ) + + return results + + @DeveloperAPI def merge_stats(base_stats: Optional[Stats], incoming_stats: List[Stats]) -> Stats: """Merges Stats objects. 
@@ -978,47 +970,70 @@ def merge_stats(base_stats: Optional[Stats], incoming_stats: List[Stats]) -> Sta new_root_stats = True else: new_root_stats = False + # Nothing to be merged + if len(incoming_stats) == 0: + return base_stats if new_root_stats: # We need to deepcopy here first because stats from incoming_stats may be altered in the future base_stats = copy.deepcopy(incoming_stats[0]) + base_stats.clear_throughput() + # Note that we may take a mean of means here, which is not the same as a + # mean of all values. In the future, we could implement a weighted mean + # of means here by introducing a new Stats object that counts samples + # for each mean Stats object. + if len(incoming_stats) > 1: + base_stats.merge_in_parallel(*incoming_stats[1:]) + if ( + base_stats._reduce_method == "sum" + and base_stats._inf_window + and base_stats._clear_on_reduce is False + ): + for stat in incoming_stats: + base_stats._prev_merge_values[stat.id_] = stat.peek() + elif len(incoming_stats) > 0: # Special case: `base_stats` is a lifetime sum (reduce=sum, # clear_on_reduce=False) -> We subtract the previous value (from 2 # `reduce()` calls ago) from all to-be-merged stats, so we don't count # twice the older sum from before. + + # Also, for the merged, new throughput value, we need to find out what the + # actual value-delta is between before the last reduce and the current one. + + added_sum = 0.0 # Used in `base_stats._recompute_throughput` if applicable. if ( base_stats._reduce_method == "sum" and base_stats._inf_window and base_stats._clear_on_reduce is False ): for stat in incoming_stats: - reduce_by = stat.get_reduce_history()[-2][0] - base_stats.values[-1] -= reduce_by - else: - # Nothing to be merged - return base_stats - - if new_root_stats: - # Note that we may take a mean of means here, which is not the same as a - # mean of all values. 
In the future, we could implement a weighted mean - # of means here by introducing a new Stats object that counts samples - # for each mean Stats object. - if len(incoming_stats) > 1: - base_stats.merge_in_parallel(*incoming_stats[1:]) - elif len(incoming_stats) > 0: + # Subtract "lifetime counts" from the Stat's values to not count + # older "lifetime counts" more than once. + prev_reduction = base_stats._prev_merge_values[stat.id_] + new_reduction = stat.peek(compile=True) + base_stats.values[-1] -= prev_reduction + # Keep track of how many counts we actually gained (for throughput + # recomputation). + added_sum += new_reduction - prev_reduction + base_stats._prev_merge_values[stat.id_] = new_reduction + + parallel_merged_stat = copy.deepcopy(incoming_stats[0]) if len(incoming_stats) > 1: # There are more than one incoming parallel others -> Merge all of # them in parallel (equal importance). - incoming_stats[0].merge_in_parallel(*incoming_stats[1:]) + parallel_merged_stat.merge_in_parallel(*incoming_stats[1:]) # Merge incoming Stats object into base Stats object on time axis # (giving incoming ones priority). if base_stats._reduce_method == "mean" and not base_stats._clear_on_reduce: # If we don't clear values, values that are not cleared would contribute # to the mean multiple times. - base_stats._set_values(incoming_stats[0].values.copy()) + base_stats._set_values(parallel_merged_stat.values.copy()) else: - base_stats.merge_on_time_axis(incoming_stats[0]) + base_stats.merge_on_time_axis(parallel_merged_stat) + # Keep track of throughput through the sum of added counts. 
+ if base_stats.has_throughput: + base_stats._recompute_throughput(added_sum) return base_stats diff --git a/rllib/utils/metrics/tests/test_metrics_logger.py b/rllib/utils/metrics/tests/test_metrics_logger.py index dbd9e0096b04..485af1e86dfc 100644 --- a/rllib/utils/metrics/tests/test_metrics_logger.py +++ b/rllib/utils/metrics/tests/test_metrics_logger.py @@ -3,6 +3,7 @@ import numpy as np import torch +import ray from ray.rllib.utils.metrics.metrics_logger import MetricsLogger from ray.rllib.utils.test_utils import check @@ -235,8 +236,8 @@ def test_throughput_tracking(logger): check(logger.peek("count"), num_iters * 2 + 1) approx_throughput = (num_iters * 2 + 1) / (end_time - start_time) check( - logger.peek("count", throughput=True), approx_throughput, rtol=0.1 - ) # 10% tolerance in throughput + logger.peek("count", throughput=True), approx_throughput, rtol=0.15 + ) # 15% tolerance in throughput # Test _get_throughputs() method without key (returns all throughputs) throughputs = logger.peek(throughput=True) @@ -274,6 +275,79 @@ def test_throughput_tracking(logger): check("count_throughput" in all_throughputs["nested"], True) +def test_throughput_aggregation(): + """Test aggregation of throughput metrics from different (remote) sources.""" + + @ray.remote + class EnvRunner: + def __init__(self): + self.metrics = MetricsLogger() + + def increase(self, count=1): + self.metrics.log_value( + "counter", + count, + reduce="sum", + clear_on_reduce=False, # lifetime counter + with_throughput=True, + ) + + def get_metrics(self): + return self.metrics.reduce() + + env_runners = [EnvRunner.remote() for _ in range(3)] + + # Main logger. + main_metrics = MetricsLogger() + + env_runners[0].increase.remote(count=0) + env_runners[1].increase.remote(count=0) + _ = [ray.get(act.get_metrics.remote()) for act in env_runners] + + # Add 1 count for actor0 and 5 counts for actor1 to the lifetime counters + # in each of the 5 iterations. 
+ # 5 iterations -> expect final count of 5 * 6 = 30 + for _ in range(5): + time.sleep(0.1) + env_runners[0].increase.remote(count=1) + env_runners[1].increase.remote(count=5) + + # Pull metrics from both actors. + results = [ray.get(act.get_metrics.remote()) for act in env_runners] + main_metrics.aggregate(results) + # The first aggregate (before the key even exists in `main_metrics`, throughput + # should be NaN. + check(main_metrics.peek("counter"), 30) + # After first aggregation, throughput should be NaN, b/c the Stats did not exist + # within the `MetricsLogger`. + assert np.isnan(main_metrics.stats["counter"].throughput) + + # Add 1 count for actor0 and 2 counts for actor1 to the lifetime counters + # in each of the 5 iterations. + # 5 iterations each 1 sec -> expect throughput of 3/0.2sec = 5/sec. + for _ in range(5): + time.sleep(0.2) + env_runners[0].increase.remote(count=1) + env_runners[1].increase.remote(count=2) + results = [ray.get(act.get_metrics.remote()) for act in env_runners] + main_metrics.aggregate(results) + + check(main_metrics.peek("counter"), 30 + 15) + tp = main_metrics.stats["counter"].throughput + check(tp, 15, atol=2) + + time.sleep(1.0) + env_runners[2].increase.remote(count=50) + results = ray.get(env_runners[2].get_metrics.remote()) + main_metrics.aggregate([results]) + + check(main_metrics.peek("counter"), 30 + 15 + 50) + tp = main_metrics.stats["counter"].throughput + # Expect throughput - due to the EMA - to be only slightly higher than + # the original value of 15. 
+ check(tp, 16, atol=2) + + def test_reset_and_delete(logger): """Test reset and delete functionality.""" # Log some values diff --git a/rllib/utils/metrics/tests/test_stats.py b/rllib/utils/metrics/tests/test_stats.py index b43a0a15559a..ec4f36700533 100644 --- a/rllib/utils/metrics/tests/test_stats.py +++ b/rllib/utils/metrics/tests/test_stats.py @@ -1,8 +1,10 @@ import pytest import time import numpy as np +import re from ray.rllib.utils.metrics.stats import Stats, merge_stats +from ray.rllib.utils.metrics.metrics_logger import MetricsLogger from ray.rllib.utils.test_utils import check # Default values used throughout the tests @@ -342,7 +344,7 @@ def test_similar_to(): # Test that adding to the similar stats does not affect the original stats similar.push(10) check(original.peek(), 3) - check(original.get_reduce_history(), [[np.nan], [np.nan], [3]]) + check(original._last_reduced, [3]) def test_reduce_history(): @@ -358,19 +360,19 @@ def test_reduce_history(): ) # Initially history should contain NaN values - check(stats.get_reduce_history(), [[np.nan], [np.nan], [np.nan]]) + check(stats._last_reduced, [np.nan]) # Push values and reduce stats.push(1) stats.push(2) check(stats.reduce(), 3) - check(stats.get_reduce_history(), [[np.nan], [np.nan], [3]]) + check(stats._last_reduced, [3]) # Push more values and reduce stats.push(3) stats.push(4) check(stats.reduce(), 10) - check(stats.get_reduce_history(), [[np.nan], [3], [10]]) + check(stats._last_reduced, [10]) def test_reduce_history_with_clear(): @@ -389,13 +391,13 @@ def test_reduce_history_with_clear(): stats.push(1) stats.push(2) check(stats.reduce(), 3) - check(stats.get_reduce_history(), [[np.nan], [np.nan], [3]]) + check(stats._last_reduced, [3]) check(len(stats), 0) # Values should be cleared stats.push(3) stats.push(4) check(stats.reduce(), 7) - check(stats.get_reduce_history(), [[np.nan], [3], [7]]) + check(stats._last_reduced, [7]) check(len(stats), 0) @@ -1178,18 +1180,110 @@ def test_percentiles(): # 
Test validation - percentiles must be None for other reduce methods with pytest.raises( - ValueError, match="`reduce` must be `None` when `percentiles` is not `False" + ValueError, match="`reduce` must be `None` when `percentiles` is not `False`" ): Stats(reduce="mean", window=5, percentiles=[50]) with pytest.raises( - ValueError, match="`reduce_per_index_on_aggregate` must be `False`" + ValueError, + match=re.escape( + "`reduce_per_index_on_aggregate` (True) must be `False` " + "when `percentiles` is not `False`!" + ), ): Stats( reduce=None, reduce_per_index_on_aggregate=True, percentiles=True, window=5 ) +def test_set_state_complete_replacement(): + """Test that set_state() completely replaces the logger's state. + + This test verifies the fix for the issue where set_state() would only update + keys present in the new state but leave old keys intact, causing stale data + to persist after checkpoint restoration. + """ + # Test case 1: Basic replacement with fewer keys + logger1 = MetricsLogger() + logger1.log_value("solo", 0) + logger1.log_value("duo", 0) + + logger2 = MetricsLogger() + logger2.log_value("duo", 1) + + # Before fix: {'solo': 0, 'duo': 1} - 'solo' would persist + # After fix: {'duo': 1} - only new state keys remain + logger1.set_state(logger2.get_state()) + result = logger1.peek() + expected = {"duo": 1} + + check(result, expected) + + # Test case 2: Complete replacement with different keys + logger3 = MetricsLogger() + logger3.log_value("old_key1", 10) + logger3.log_value("old_key2", 20) + logger3.log_value("shared_key", 30) + + logger4 = MetricsLogger() + logger4.log_value("shared_key", 100) + logger4.log_value("new_key", 200) + + logger3.set_state(logger4.get_state()) + result = logger3.peek() + expected = {"shared_key": 100, "new_key": 200} + + check(result, expected) + + # Test case 3: Setting to empty state + logger5 = MetricsLogger() + logger5.log_value("key1", 1) + logger5.log_value("key2", 2) + + empty_logger = MetricsLogger() + 
logger5.set_state(empty_logger.get_state()) + result = logger5.peek() + + check(result, {}) + + # Test case 4: Nested keys + logger6 = MetricsLogger() + logger6.log_value(("nested", "old_key"), 1) + logger6.log_value(("nested", "shared_key"), 2) + logger6.log_value("top_level", 3) + + logger7 = MetricsLogger() + logger7.log_value(("nested", "shared_key"), 20) + logger7.log_value(("nested", "new_key"), 30) + + logger6.set_state(logger7.get_state()) + result = logger6.peek() + expected = {"nested": {"shared_key": 20, "new_key": 30}} + + check(result, expected) + + # Test case 5: Multiple set_state calls (simulating multiple restore_from_path calls) + logger8 = MetricsLogger() + logger8.log_value("initial", 0) + + # First set_state + temp1 = MetricsLogger() + temp1.log_value("first", 1) + temp1.log_value("shared", 100) + logger8.set_state(temp1.get_state()) + + # Second set_state - should completely replace first state + temp2 = MetricsLogger() + temp2.log_value("second", 2) + temp2.log_value("shared", 20) + logger8.set_state(temp2.get_state()) + + result = logger8.peek() + expected = {"second": 2, "shared": 20} + + check(result, expected) + + if __name__ == "__main__": import sys diff --git a/rllib/utils/numpy.py b/rllib/utils/numpy.py index b0970ad51427..f1bb8f2ff32b 100644 --- a/rllib/utils/numpy.py +++ b/rllib/utils/numpy.py @@ -7,7 +7,7 @@ from ray.rllib.utils.annotations import PublicAPI -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.typing import SpaceStruct, TensorType, TensorStructType, Union diff --git a/rllib/utils/policy.py b/rllib/utils/policy.py index a5b6b2ccfda6..0cb149ced5bc 100644 --- a/rllib/utils/policy.py +++ b/rllib/utils/policy.py @@ -20,7 +20,7 @@ from ray.rllib.policy.policy import PolicySpec from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import OldAPIStack -from 
ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import ( ActionConnectorDataType, diff --git a/rllib/utils/replay_buffers/multi_agent_replay_buffer.py b/rllib/utils/replay_buffers/multi_agent_replay_buffer.py index ac3af0125b27..5fcfd75365c6 100644 --- a/rllib/utils/replay_buffers/multi_agent_replay_buffer.py +++ b/rllib/utils/replay_buffers/multi_agent_replay_buffer.py @@ -7,7 +7,7 @@ from ray.rllib.policy.rnn_sequencing import timeslice_along_seq_lens_with_overlap from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import Deprecated +from ray._common.deprecation import Deprecated from ray.rllib.utils.from_config import from_config from ray.rllib.utils.replay_buffers.replay_buffer import ( _ALL_POLICIES, diff --git a/rllib/utils/replay_buffers/utils.py b/rllib/utils/replay_buffers/utils.py index 16fa37d0626f..baf24b6874c7 100644 --- a/rllib/utils/replay_buffers/utils.py +++ b/rllib/utils/replay_buffers/utils.py @@ -4,9 +4,9 @@ import numpy as np -from ray.rllib.utils import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.rllib.utils.annotations import OldAPIStack -from ray.rllib.utils.deprecation import DEPRECATED_VALUE +from ray._common.deprecation import DEPRECATED_VALUE from ray.rllib.utils.from_config import from_config from ray.rllib.utils.metrics import ALL_MODULES, TD_ERROR_KEY from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY diff --git a/rllib/utils/tests/run_memory_leak_tests.py b/rllib/utils/tests/run_memory_leak_tests.py index 4fc509fd7c88..36d5c96c2b93 100644 --- a/rllib/utils/tests/run_memory_leak_tests.py +++ b/rllib/utils/tests/run_memory_leak_tests.py @@ -26,7 +26,7 @@ from ray.rllib.common import SupportedFileType from ray.rllib.train import load_experiments_from_file 
from ray.rllib.utils.debug.memory import check_memory_leaks -from ray.rllib.utils.deprecation import deprecation_warning +from ray._common.deprecation import deprecation_warning from ray.tune.registry import get_trainable_cls parser = argparse.ArgumentParser() diff --git a/rllib/utils/typing.py b/rllib/utils/typing.py index 81116fbcacaf..3f7f559b6e2c 100644 --- a/rllib/utils/typing.py +++ b/rllib/utils/typing.py @@ -39,8 +39,8 @@ jnp = jax.numpy # Represents a generic tensor type. -# This could be an np.ndarray, tf.Tensor, or a torch.Tensor. -TensorType = Union[np.array, "jnp.ndarray", "tf.Tensor", "torch.Tensor"] +# This could be an np.ndarray, jnp.ndarray, tf.Tensor, or a torch.Tensor. +TensorType = Union[np.ndarray, "jnp.ndarray", "tf.Tensor", "torch.Tensor"] # Either a plain tensor, or a dict or tuple of tensors (or StructTensors). TensorStructType = Union[TensorType, dict, tuple] diff --git a/src/fakes/ray/object_manager/plasma/BUILD.bazel b/src/fakes/ray/object_manager/plasma/BUILD.bazel new file mode 100644 index 000000000000..86e70b439f83 --- /dev/null +++ b/src/fakes/ray/object_manager/plasma/BUILD.bazel @@ -0,0 +1,12 @@ +load("//bazel:ray.bzl", "ray_cc_library") + +ray_cc_library( + name = "fake_plasma_client", + hdrs = ["fake_plasma_client.h"], + deps = [ + "//src/ray/common:buffer", + "//src/ray/common:id", + "//src/ray/common:status", + "//src/ray/object_manager/plasma:plasma_client_interface", + ], +) diff --git a/src/fakes/ray/object_manager/plasma/fake_plasma_client.h b/src/fakes/ray/object_manager/plasma/fake_plasma_client.h new file mode 100644 index 000000000000..42030c1ab68a --- /dev/null +++ b/src/fakes/ray/object_manager/plasma/fake_plasma_client.h @@ -0,0 +1,126 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "ray/common/buffer.h" +#include "ray/common/id.h" +#include "ray/common/status.h" +#include "ray/object_manager/plasma/client.h" + +namespace plasma { + +class FakePlasmaClient : public PlasmaClientInterface { + public: + Status Connect(const std::string &store_socket_name, + const std::string &manager_socket_name = "", + int num_retries = -1) override { + return Status::OK(); + }; + + Status CreateAndSpillIfNeeded(const ObjectID &object_id, + const ray::rpc::Address &owner_address, + bool is_mutable, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + std::shared_ptr *data, + plasma::flatbuf::ObjectSource source, + int device_num = 0) override { + return Status::OK(); + } + + Status TryCreateImmediately(const ObjectID &object_id, + const ray::rpc::Address &owner_address, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + std::shared_ptr *data, + plasma::flatbuf::ObjectSource source, + int device_num = 0) override { + std::vector data_vec(data_size); + if (data != nullptr && data_size > 0) { + data_vec.assign(data->get()->Data(), data->get()->Data() + data_size); + } + std::vector metadata_vec; + if (metadata != nullptr && metadata_size > 0) { + metadata_vec.assign(metadata, metadata + metadata_size); + } + objects_in_plasma_.emplace( + object_id, std::make_pair(std::move(data_vec), std::move(metadata_vec))); + return Status::OK(); + } + + Status Get(const std::vector &object_ids, + int64_t 
timeout_ms, + std::vector *object_buffers) override { + for (const auto &id : object_ids) { + auto &buffers = objects_in_plasma_[id]; + plasma::ObjectBuffer shm_buffer{std::make_shared( + buffers.first.data(), buffers.first.size()), + std::make_shared( + buffers.second.data(), buffers.second.size())}; + object_buffers->emplace_back(shm_buffer); + } + return Status::OK(); + } + + Status GetExperimentalMutableObject( + const ObjectID &object_id, + std::unique_ptr *mutable_object) override { + return Status::OK(); + } + + Status Release(const ObjectID &object_id) override { + objects_in_plasma_.erase(object_id); + return Status::OK(); + } + + Status Contains(const ObjectID &object_id, bool *has_object) override { + *has_object = objects_in_plasma_.contains(object_id); + return Status::OK(); + } + + Status Abort(const ObjectID &object_id) override { return Status::OK(); } + + Status Seal(const ObjectID &object_id) override { return Status::OK(); } + + Status Delete(const std::vector &object_ids) override { + num_free_objects_requests++; + for (const auto &id : object_ids) { + objects_in_plasma_.erase(id); + } + return Status::OK(); + } + + Status Disconnect() override { return Status::OK(); }; + + std::string DebugString() { return ""; } + + int64_t store_capacity() { return 0; } + + StatusOr GetMemoryUsage() override { return std::string("fake"); } + + absl::flat_hash_map, std::vector>> + objects_in_plasma_; + uint32_t num_free_objects_requests = 0; +}; + +} // namespace plasma diff --git a/src/fakes/ray/pubsub/publisher.h b/src/fakes/ray/pubsub/publisher.h index b8daaf958f31..fe10bc02550c 100644 --- a/src/fakes/ray/pubsub/publisher.h +++ b/src/fakes/ray/pubsub/publisher.h @@ -21,22 +21,22 @@ namespace pubsub { class FakePublisher : public Publisher { public: - bool RegisterSubscription(const rpc::ChannelType channel_type, - const SubscriberID &subscriber_id, - const std::optional &key_id) override { - return true; - } + void RegisterSubscription(const 
rpc::ChannelType channel_type, + const UniqueID &subscriber_id, + const std::optional &key_id) override {} void Publish(rpc::PubMessage pub_message) override {} void PublishFailure(const rpc::ChannelType channel_type, const std::string &key_id) override {} - bool UnregisterSubscription(const rpc::ChannelType channel_type, - const SubscriberID &subscriber_id, - const std::optional &key_id) override { - return true; - } + void UnregisterSubscription(const rpc::ChannelType channel_type, + const UniqueID &subscriber_id, + const std::optional &key_id) override {} + + void UnregisterSubscriber(const UniqueID &subscriber_id) override {} + + std::string DebugString() const override { return "FakePublisher"; } }; } // namespace pubsub diff --git a/src/fakes/ray/pubsub/subscriber.h b/src/fakes/ray/pubsub/subscriber.h index 5abd3d33ba2d..b0afd5dd03fc 100644 --- a/src/fakes/ray/pubsub/subscriber.h +++ b/src/fakes/ray/pubsub/subscriber.h @@ -14,7 +14,7 @@ #pragma once -#include "ray/pubsub/subscriber.h" +#include "ray/pubsub/subscriber_interface.h" namespace ray { namespace pubsub { @@ -32,39 +32,22 @@ class FakeSubscriberClient : public SubscriberClientInterface { class FakeSubscriber : public SubscriberInterface { public: - bool Subscribe( + void Subscribe( std::unique_ptr sub_message, - const rpc::ChannelType channel_type, + rpc::ChannelType channel_type, const rpc::Address &owner_address, - const std::string &key_id, + const std::optional &key_id, pubsub::SubscribeDoneCallback subscribe_done_callback, pubsub::SubscriptionItemCallback subscription_callback, - pubsub::SubscriptionFailureCallback subscription_failure_callback) override { - return true; - } - - bool SubscribeChannel( - std::unique_ptr sub_message, - const rpc::ChannelType channel_type, - const rpc::Address &owner_address, - pubsub::SubscribeDoneCallback subscribe_done_callback, - pubsub::SubscriptionItemCallback subscription_callback, - pubsub::SubscriptionFailureCallback subscription_failure_callback) override 
{ - return true; - } + pubsub::SubscriptionFailureCallback subscription_failure_callback) override {} - bool Unsubscribe(const rpc::ChannelType channel_type, + bool Unsubscribe(rpc::ChannelType channel_type, const rpc::Address &publisher_address, - const std::string &key_id) override { - return true; - } - - bool UnsubscribeChannel(const rpc::ChannelType channel_type, - const rpc::Address &publisher_address) override { + const std::optional &key_id) override { return true; } - bool IsSubscribed(const rpc::ChannelType channel_type, + bool IsSubscribed(rpc::ChannelType channel_type, const rpc::Address &publisher_address, const std::string &key_id) const override { return false; diff --git a/src/fakes/ray/rpc/raylet/BUILD.bazel b/src/fakes/ray/rpc/raylet/BUILD.bazel new file mode 100644 index 000000000000..fc6b5a141289 --- /dev/null +++ b/src/fakes/ray/rpc/raylet/BUILD.bazel @@ -0,0 +1,12 @@ +load("//bazel:ray.bzl", "ray_cc_library") + +ray_cc_library( + name = "fake_raylet_client", + hdrs = ["raylet_client.h"], + deps = [ + "//src/ray/common:id", + "//src/ray/common:status", + "//src/ray/common/scheduling:scheduling_ids", + "//src/ray/rpc:raylet_client_interface", + ], +) diff --git a/src/fakes/ray/rpc/raylet/raylet_client.h b/src/fakes/ray/rpc/raylet/raylet_client.h index 3c2bcb70a8cd..3680819c04fe 100644 --- a/src/fakes/ray/rpc/raylet/raylet_client.h +++ b/src/fakes/ray/rpc/raylet/raylet_client.h @@ -14,7 +14,10 @@ #pragma once -#include "ray/raylet_client/raylet_client.h" +#include "ray/common/scheduling/scheduling_ids.h" +#include "ray/rpc/raylet/raylet_client_interface.h" +#include "src/ray/common/id.h" +#include "src/ray/common/status.h" namespace ray { @@ -27,18 +30,25 @@ class FakeRayletClient : public RayletClientInterface { const ray::rpc::ClientCallback &callback) override {} void RequestWorkerLease( - const rpc::TaskSpec &task_spec, + const rpc::LeaseSpec &lease_spec, bool grant_or_reject, const ray::rpc::ClientCallback &callback, const int64_t 
backlog_size = -1, - const bool is_selected_based_on_locality = false) override {} + const bool is_selected_based_on_locality = false) override { + num_workers_requested += 1; + callbacks.push_back(callback); + } - ray::Status ReturnWorker(int worker_port, - const WorkerID &worker_id, - bool disconnect_worker, - const std::string &disconnect_worker_error_detail, - bool worker_exiting) override { - return Status::OK(); + void ReturnWorkerLease(int worker_port, + const LeaseID &lease_id, + bool disconnect_worker, + const std::string &disconnect_worker_error_detail, + bool worker_exiting) override { + if (disconnect_worker) { + num_workers_disconnected++; + } else { + num_workers_returned++; + } } void PrestartWorkers( @@ -48,30 +58,166 @@ class FakeRayletClient : public RayletClientInterface { void ReleaseUnusedActorWorkers( const std::vector &workers_in_use, const rpc::ClientCallback &callback) override { + num_release_unused_workers += 1; + release_callbacks.push_back(callback); } void CancelWorkerLease( - const TaskID &task_id, - const rpc::ClientCallback &callback) override {} + const LeaseID &lease_id, + const rpc::ClientCallback &callback) override { + num_leases_canceled += 1; + cancel_callbacks.push_back(callback); + } + + bool GrantWorkerLease() { + return GrantWorkerLease("", 0, WorkerID::FromRandom(), node_id_, NodeID::Nil()); + } + + bool GrantWorkerLease(const std::string &address, + int port, + const WorkerID &worker_id, + const NodeID &node_id, + const NodeID &retry_at_node_id, + Status status = Status::OK(), + bool rejected = false) { + rpc::RequestWorkerLeaseReply reply; + if (!retry_at_node_id.IsNil()) { + reply.mutable_retry_at_raylet_address()->set_ip_address(address); + reply.mutable_retry_at_raylet_address()->set_port(port); + reply.mutable_retry_at_raylet_address()->set_node_id(retry_at_node_id.Binary()); + } else { + reply.mutable_worker_address()->set_ip_address(address); + reply.mutable_worker_address()->set_port(port); + 
reply.mutable_worker_address()->set_node_id(node_id.Binary()); + reply.mutable_worker_address()->set_worker_id(worker_id.Binary()); + } + if (rejected) { + reply.set_rejected(true); + auto resources_data = reply.mutable_resources_data(); + resources_data->set_node_id(node_id.Binary()); + resources_data->set_resources_normal_task_changed(true); + auto &normal_task_map = *(resources_data->mutable_resources_normal_task()); + normal_task_map[kMemory_ResourceLabel] = + static_cast(std::numeric_limits::max()); + resources_data->set_resources_normal_task_timestamp(absl::GetCurrentTimeNanos()); + } + + if (callbacks.size() == 0) { + return false; + } else { + auto callback = callbacks.front(); + callback(status, std::move(reply)); + callbacks.pop_front(); + return true; + } + } + + bool ReplyCancelWorkerLease(bool success = true) { + rpc::CancelWorkerLeaseReply reply; + reply.set_success(success); + if (cancel_callbacks.size() == 0) { + return false; + } else { + auto callback = cancel_callbacks.front(); + callback(Status::OK(), std::move(reply)); + cancel_callbacks.pop_front(); + return true; + } + } + + bool ReplyReleaseUnusedActorWorkers() { + rpc::ReleaseUnusedActorWorkersReply reply; + if (release_callbacks.size() == 0) { + return false; + } else { + auto callback = release_callbacks.front(); + callback(Status::OK(), std::move(reply)); + release_callbacks.pop_front(); + return true; + } + } + + bool ReplyDrainRaylet() { + if (drain_raylet_callbacks.size() == 0) { + return false; + } else { + rpc::DrainRayletReply reply; + reply.set_is_accepted(true); + auto callback = drain_raylet_callbacks.front(); + callback(Status::OK(), std::move(reply)); + drain_raylet_callbacks.pop_front(); + return true; + } + } void PrepareBundleResources( const std::vector> &bundle_specs, const ray::rpc::ClientCallback &callback) - override {} + override { + num_lease_requested += 1; + lease_callbacks.push_back(callback); + } void CommitBundleResources( const std::vector> &bundle_specs, const 
ray::rpc::ClientCallback &callback) - override {} + override { + num_commit_requested += 1; + commit_callbacks.push_back(callback); + } void CancelResourceReserve( const BundleSpecification &bundle_spec, const ray::rpc::ClientCallback &callback) - override {} + override { + num_return_requested += 1; + return_callbacks.push_back(callback); + } void ReleaseUnusedBundles( const std::vector &bundles_in_use, - const rpc::ClientCallback &callback) override {} + const rpc::ClientCallback &callback) override { + ++num_release_unused_bundles_requested; + } + + bool GrantPrepareBundleResources(bool success = true, + const Status &status = Status::OK()) { + rpc::PrepareBundleResourcesReply reply; + reply.set_success(success); + if (lease_callbacks.size() == 0) { + return false; + } else { + auto callback = lease_callbacks.front(); + callback(status, std::move(reply)); + lease_callbacks.pop_front(); + return true; + } + } + + bool GrantCommitBundleResources(const Status &status = Status::OK()) { + rpc::CommitBundleResourcesReply reply; + if (commit_callbacks.size() == 0) { + return false; + } else { + auto callback = commit_callbacks.front(); + callback(status, std::move(reply)); + commit_callbacks.pop_front(); + return true; + } + } + + bool GrantCancelResourceReserve(bool success = true) { + Status status = Status::OK(); + rpc::CancelResourceReserveReply reply; + if (return_callbacks.size() == 0) { + return false; + } else { + auto callback = return_callbacks.front(); + callback(status, std::move(reply)); + return_callbacks.pop_front(); + return true; + } + } void ReportWorkerBacklog( const WorkerID &worker_id, @@ -94,9 +240,13 @@ class FakeRayletClient : public RayletClientInterface { void *metadata, const rpc::ClientCallback &callback) override {} - void GetTaskFailureCause( - const TaskID &task_id, - const rpc::ClientCallback &callback) override {} + void GetWorkerFailureCause( + const LeaseID &lease_id, + const rpc::ClientCallback &callback) override { + 
ray::rpc::GetWorkerFailureCauseReply reply; + callback(Status::OK(), std::move(reply)); + num_get_task_failure_causes += 1; + } void GetSystemConfig( const rpc::ClientCallback &callback) override {} @@ -112,11 +262,15 @@ class FakeRayletClient : public RayletClientInterface { void DrainRaylet(const rpc::autoscaler::DrainNodeReason &reason, const std::string &reason_message, int64_t deadline_timestamp_ms, - const rpc::ClientCallback &callback) override {} + const rpc::ClientCallback &callback) override { + rpc::DrainRayletReply reply; + reply.set_is_accepted(true); + drain_raylet_callbacks.push_back(callback); + } - void CancelTasksWithResourceShapes( + void CancelLeasesWithResourceShapes( const std::vector> &resource_shapes, - const rpc::ClientCallback &callback) + const rpc::ClientCallback &callback) override {} void IsLocalWorkerDead( @@ -132,6 +286,27 @@ class FakeRayletClient : public RayletClientInterface { void GlobalGC(const rpc::ClientCallback &callback) override {} int64_t GetPinsInFlight() const override { return 0; } + + int num_workers_requested = 0; + int num_workers_returned = 0; + int num_workers_disconnected = 0; + int num_leases_canceled = 0; + int num_release_unused_workers = 0; + int num_get_task_failure_causes = 0; + NodeID node_id_ = NodeID::FromRandom(); + std::list> drain_raylet_callbacks = {}; + std::list> callbacks = {}; + std::list> cancel_callbacks = {}; + std::list> release_callbacks = + {}; + int num_lease_requested = 0; + int num_return_requested = 0; + int num_commit_requested = 0; + + int num_release_unused_bundles_requested = 0; + std::list> lease_callbacks = {}; + std::list> commit_callbacks = {}; + std::list> return_callbacks = {}; }; } // namespace ray diff --git a/src/fakes/ray/rpc/worker/BUILD.bazel b/src/fakes/ray/rpc/worker/BUILD.bazel new file mode 100644 index 000000000000..b5bd45196c99 --- /dev/null +++ b/src/fakes/ray/rpc/worker/BUILD.bazel @@ -0,0 +1,10 @@ +load("//bazel:ray.bzl", "ray_cc_library") + +ray_cc_library( + 
name = "fake_core_worker_client", + hdrs = ["core_worker_client.h"], + deps = [ + "//src/ray/rpc:core_worker_client", + "@com_google_absl//absl/synchronization", + ], +) diff --git a/src/fakes/ray/rpc/worker/core_worker_client.h b/src/fakes/ray/rpc/worker/core_worker_client.h new file mode 100644 index 000000000000..e4f5f27f1aad --- /dev/null +++ b/src/fakes/ray/rpc/worker/core_worker_client.h @@ -0,0 +1,58 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "absl/synchronization/mutex.h" +#include "ray/rpc/worker/core_worker_client.h" + +namespace ray { + +class FakeCoreWorkerClient : public rpc::CoreWorkerClientInterface { + public: + void PushNormalTask(std::unique_ptr request, + const rpc::ClientCallback &callback) override { + absl::MutexLock lock(&mutex_); + callbacks_.push_back(callback); + } + + bool ReplyPushTask(Status status = Status::OK(), bool exit = false) { + rpc::ClientCallback callback = nullptr; + { + absl::MutexLock lock(&mutex_); + if (callbacks_.size() == 0) { + return false; + } + callback = callbacks_.front(); + callbacks_.pop_front(); + } + // call the callback without the lock to avoid deadlock. 
+ auto reply = rpc::PushTaskReply(); + if (exit) { + reply.set_worker_exiting(true); + } + callback(status, std::move(reply)); + return true; + } + + size_t GetNumCallbacks() { + absl::MutexLock lock(&mutex_); + return callbacks_.size(); + } + + std::list> callbacks_ ABSL_GUARDED_BY(mutex_); + absl::Mutex mutex_; +}; + +} // namespace ray diff --git a/src/mock/ray/core_worker/actor_creator.h b/src/mock/ray/core_worker/actor_creator.h deleted file mode 100644 index 95deb2808a5a..000000000000 --- a/src/mock/ray/core_worker/actor_creator.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2021 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#pragma once - -#include "gmock/gmock.h" -namespace ray { -namespace core { - -class MockActorCreatorInterface : public ActorCreatorInterface { - public: - MOCK_METHOD(Status, - RegisterActor, - (const TaskSpecification &task_spec), - (const, override)); - MOCK_METHOD(void, - AsyncRegisterActor, - (const TaskSpecification &task_spec, gcs::StatusCallback callback), - (override)); - MOCK_METHOD(void, - AsyncCreateActor, - (const TaskSpecification &task_spec, - const rpc::ClientCallback &callback), - (override)); - MOCK_METHOD(void, - AsyncRestartActorForLineageReconstruction, - (const ActorID &actor_id, - uint64_t num_restarts, - gcs::StatusCallback callback), - (override)); - MOCK_METHOD(void, - AsyncReportActorOutOfScope, - (const ActorID &actor_id, - uint64_t num_restarts_due_to_lineage_reconstruction, - gcs::StatusCallback callback), - (override)); - MOCK_METHOD(void, - AsyncWaitForActorRegisterFinish, - (const ActorID &actor_id, gcs::StatusCallback callback), - (override)); - MOCK_METHOD(bool, IsActorInRegistering, (const ActorID &actor_id), (const, override)); -}; - -} // namespace core -} // namespace ray diff --git a/src/mock/ray/core_worker/core_worker.h b/src/mock/ray/core_worker/core_worker.h index 905ecceddec9..403d97de0db0 100644 --- a/src/mock/ray/core_worker/core_worker.h +++ b/src/mock/ray/core_worker/core_worker.h @@ -13,7 +13,7 @@ // limitations under the License. 
#pragma once #include "gmock/gmock.h" -#include "mock/ray/gcs/gcs_client/gcs_client.h" +#include "mock/ray/gcs_client/gcs_client.h" namespace ray::core { diff --git a/src/mock/ray/core_worker/reference_count.h b/src/mock/ray/core_worker/reference_count.h index 9efc65afc25d..bf02c1bc987a 100644 --- a/src/mock/ray/core_worker/reference_count.h +++ b/src/mock/ray/core_worker/reference_count.h @@ -39,7 +39,7 @@ class MockReferenceCounter : public ReferenceCounterInterface { const int64_t object_size, bool is_reconstructable, bool add_local_ref, - const std::optional &pinned_at_raylet_id, + const std::optional &pinned_at_node_id, rpc::TensorTransport tensor_transport)); MOCK_METHOD2(AddObjectOutOfScopeOrFreedCallback, diff --git a/src/mock/ray/gcs/gcs_server/gcs_actor_manager.h b/src/mock/ray/gcs/gcs_actor_manager.h similarity index 96% rename from src/mock/ray/gcs/gcs_server/gcs_actor_manager.h rename to src/mock/ray/gcs/gcs_actor_manager.h index b960f753ac3d..fd5a5f4a8769 100644 --- a/src/mock/ray/gcs/gcs_server/gcs_actor_manager.h +++ b/src/mock/ray/gcs/gcs_actor_manager.h @@ -12,15 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -namespace ray { -namespace gcs { +#pragma once -class MockGcsActor : public GcsActor { - public: -}; +#include -} // namespace gcs -} // namespace ray +#include "ray/gcs/gcs_actor_manager.h" namespace ray { namespace gcs { diff --git a/src/mock/ray/gcs/gcs_server/gcs_actor_scheduler.h b/src/mock/ray/gcs/gcs_actor_scheduler.h similarity index 96% rename from src/mock/ray/gcs/gcs_server/gcs_actor_scheduler.h rename to src/mock/ray/gcs/gcs_actor_scheduler.h index 2715c57849eb..7ada39f420d6 100644 --- a/src/mock/ray/gcs/gcs_server/gcs_actor_scheduler.h +++ b/src/mock/ray/gcs/gcs_actor_scheduler.h @@ -12,6 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#pragma once + +#include + +#include "ray/gcs/gcs_actor_scheduler.h" + namespace ray { namespace gcs { @@ -22,7 +28,7 @@ class MockGcsActorSchedulerInterface : public GcsActorSchedulerInterface { MOCK_METHOD(std::vector, CancelOnNode, (const NodeID &node_id), (override)); MOCK_METHOD(void, CancelOnLeasing, - (const NodeID &node_id, const ActorID &actor_id, const TaskID &task_id), + (const NodeID &node_id, const ActorID &actor_id, const LeaseID &lease_id), (override)); MOCK_METHOD(ActorID, CancelOnWorker, @@ -62,7 +68,7 @@ class MockGcsActorScheduler : public GcsActorScheduler { MOCK_METHOD(std::vector, CancelOnNode, (const NodeID &node_id), (override)); MOCK_METHOD(void, CancelOnLeasing, - (const NodeID &node_id, const ActorID &actor_id, const TaskID &task_id), + (const NodeID &node_id, const ActorID &actor_id, const LeaseID &lease_id), (override)); MOCK_METHOD(ActorID, CancelOnWorker, diff --git a/src/mock/ray/gcs/gcs_server/gcs_job_manager.h b/src/mock/ray/gcs/gcs_job_manager.h similarity index 96% rename from src/mock/ray/gcs/gcs_server/gcs_job_manager.h rename to src/mock/ray/gcs/gcs_job_manager.h index 9b3b2ca2d1f2..2a04a8e2b87a 100644 --- a/src/mock/ray/gcs/gcs_server/gcs_job_manager.h +++ b/src/mock/ray/gcs/gcs_job_manager.h @@ -12,6 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once + +#include + +#include "ray/gcs/gcs_job_manager.h" + namespace ray { namespace gcs { diff --git a/src/mock/ray/gcs/gcs_server/gcs_kv_manager.h b/src/mock/ray/gcs/gcs_kv_manager.h similarity index 97% rename from src/mock/ray/gcs/gcs_server/gcs_kv_manager.h rename to src/mock/ray/gcs/gcs_kv_manager.h index 9004ffe59785..87df51b573db 100644 --- a/src/mock/ray/gcs/gcs_server/gcs_kv_manager.h +++ b/src/mock/ray/gcs/gcs_kv_manager.h @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "gmock/gmock.h" -#include "ray/gcs/gcs_server/gcs_kv_manager.h" +#pragma once + +#include + +#include "ray/gcs/gcs_kv_manager.h" namespace ray { namespace gcs { -class MockInternalKVInterface : public ray::gcs::InternalKVInterface { +class MockInternalKVInterface : public InternalKVInterface { public: MockInternalKVInterface() {} diff --git a/src/mock/ray/gcs/gcs_server/gcs_node_manager.h b/src/mock/ray/gcs/gcs_node_manager.h similarity index 95% rename from src/mock/ray/gcs/gcs_server/gcs_node_manager.h rename to src/mock/ray/gcs/gcs_node_manager.h index 5d1851b867a8..ef81ef8a6d71 100644 --- a/src/mock/ray/gcs/gcs_server/gcs_node_manager.h +++ b/src/mock/ray/gcs/gcs_node_manager.h @@ -11,7 +11,12 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include "gmock/gmock.h" + +#pragma once + +#include + +#include "ray/gcs/gcs_node_manager.h" namespace ray { namespace gcs { diff --git a/src/mock/ray/gcs/gcs_server/gcs_placement_group_mgr.h b/src/mock/ray/gcs/gcs_placement_group_manager.h similarity index 93% rename from src/mock/ray/gcs/gcs_server/gcs_placement_group_mgr.h rename to src/mock/ray/gcs/gcs_placement_group_manager.h index 97d02a932d94..ffd4ceee0cb0 100644 --- a/src/mock/ray/gcs/gcs_server/gcs_placement_group_mgr.h +++ b/src/mock/ray/gcs/gcs_placement_group_manager.h @@ -11,17 +11,12 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/gcs_server/gcs_placement_group_mgr.h" -namespace ray { -namespace gcs { +#pragma once -class MockGcsPlacementGroup : public GcsPlacementGroup { - public: -}; +#include -} // namespace gcs -} // namespace ray +#include "ray/gcs/gcs_placement_group_manager.h" namespace ray { namespace gcs { diff --git a/src/mock/ray/gcs/gcs_server/gcs_placement_group_scheduler.h b/src/mock/ray/gcs/gcs_placement_group_scheduler.h similarity index 97% rename from src/mock/ray/gcs/gcs_server/gcs_placement_group_scheduler.h rename to src/mock/ray/gcs/gcs_placement_group_scheduler.h index a0d6f84d1663..f6fb6ac3ff14 100644 --- a/src/mock/ray/gcs/gcs_server/gcs_placement_group_scheduler.h +++ b/src/mock/ray/gcs/gcs_placement_group_scheduler.h @@ -12,6 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once + +#include + +#include "ray/gcs/gcs_placement_group_scheduler.h" + namespace ray { namespace gcs { diff --git a/src/mock/ray/gcs/gcs_server/gcs_resource_manager.h b/src/mock/ray/gcs/gcs_resource_manager.h similarity index 96% rename from src/mock/ray/gcs/gcs_server/gcs_resource_manager.h rename to src/mock/ray/gcs/gcs_resource_manager.h index eba879e1ad00..0d5c83531cb3 100644 --- a/src/mock/ray/gcs/gcs_server/gcs_resource_manager.h +++ b/src/mock/ray/gcs/gcs_resource_manager.h @@ -12,10 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#pragma once + +#include + #include "ray/common/asio/instrumented_io_context.h" +#include "ray/gcs/gcs_resource_manager.h" namespace ray { namespace gcs { + static instrumented_io_context __mock_io_context_; static ClusterResourceManager __mock_cluster_resource_manager_(__mock_io_context_); static GcsNodeManager __mock_gcs_node_manager_( diff --git a/src/mock/ray/gcs/gcs_server/gcs_init_data.h b/src/mock/ray/gcs/gcs_server/gcs_init_data.h deleted file mode 100644 index e784243ca5af..000000000000 --- a/src/mock/ray/gcs/gcs_server/gcs_init_data.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -namespace ray { -namespace gcs { - -class MockGcsInitData : public GcsInitData { - public: -}; - -} // namespace gcs -} // namespace ray diff --git a/src/mock/ray/gcs/gcs_server/gcs_redis_failure_detector.h b/src/mock/ray/gcs/gcs_server/gcs_redis_failure_detector.h deleted file mode 100644 index d3b5948df8e7..000000000000 --- a/src/mock/ray/gcs/gcs_server/gcs_redis_failure_detector.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -namespace ray { -namespace gcs { - -class MockGcsRedisFailureDetector : public GcsRedisFailureDetector { - public: -}; - -} // namespace gcs -} // namespace ray diff --git a/src/mock/ray/gcs/gcs_server/gcs_table_storage.h b/src/mock/ray/gcs/gcs_server/gcs_table_storage.h deleted file mode 100644 index 4b229784b8cb..000000000000 --- a/src/mock/ray/gcs/gcs_server/gcs_table_storage.h +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -namespace ray { -namespace gcs { - -template -class MockGcsTable : public GcsTable { - public: - MOCK_METHOD(Status, - Put, - (const Key &key, const Data &value, const StatusCallback &callback), - (override)); - MOCK_METHOD(Status, - Delete, - (const Key &key, const StatusCallback &callback), - (override)); - MOCK_METHOD(Status, - BatchDelete, - (const std::vector &keys, const StatusCallback &callback), - (override)); -}; - -} // namespace gcs -} // namespace ray - -namespace ray { -namespace gcs { - -template -class MockGcsTableWithJobId : public GcsTableWithJobId { - public: - MOCK_METHOD(Status, - Put, - (const Key &key, const Data &value, const StatusCallback &callback), - (override)); - MOCK_METHOD(Status, - Delete, - (const Key &key, const StatusCallback &callback), - (override)); - MOCK_METHOD(Status, - BatchDelete, - (const std::vector &keys, const StatusCallback &callback), - (override)); - MOCK_METHOD(JobID, GetJobIdFromKey, (const Key &key), (override)); -}; - -} // namespace gcs -} // namespace ray - -namespace ray { -namespace gcs { - -class MockGcsJobTable : public GcsJobTable { - public: -}; - -} // namespace gcs -} // namespace ray - -namespace ray { -namespace gcs { - -class MockGcsActorTable : public GcsActorTable { - public: - MockGcsActorTable() : GcsActorTable(nullptr) {} - - MOCK_METHOD(JobID, GetJobIdFromKey, (const ActorID &key), (override)); -}; - -} // namespace gcs -} // namespace ray - -namespace ray { -namespace gcs { - -class MockGcsPlacementGroupTable : public GcsPlacementGroupTable { - public: -}; - -} // namespace gcs -} // namespace ray - -namespace ray { -namespace gcs { - -class MockGcsNodeTable : public GcsNodeTable { - public: - MockGcsNodeTable() : GcsNodeTable(nullptr){}; - - MOCK_METHOD(Status, - Put, - (const NodeID &key, - const GcsNodeInfo &value, - const StatusCallback &callback), - (override)); -}; - -} // namespace gcs -} // namespace ray - -namespace ray { -namespace gcs { - -class MockGcsWorkerTable : public 
GcsWorkerTable { - public: -}; - -} // namespace gcs -} // namespace ray - -namespace ray { -namespace gcs { - -class MockGcsTableStorage : public GcsTableStorage { - public: - MockGcsTableStorage() : GcsTableStorage(nullptr) {} - - MOCK_METHOD((GcsNodeTable &), NodeTable, (), (override)); -}; - -} // namespace gcs -} // namespace ray - -namespace ray { -namespace gcs { - -class MockRedisGcsTableStorage : public RedisGcsTableStorage { - public: -}; - -} // namespace gcs -} // namespace ray - -namespace ray { -namespace gcs { - -class MockInMemoryGcsTableStorage : public InMemoryGcsTableStorage { - public: -}; - -} // namespace gcs -} // namespace ray diff --git a/src/mock/ray/gcs/gcs_server/gcs_task_manager.h b/src/mock/ray/gcs/gcs_task_manager.h similarity index 94% rename from src/mock/ray/gcs/gcs_server/gcs_task_manager.h rename to src/mock/ray/gcs/gcs_task_manager.h index 67601dfd56a7..db633ba6e6b8 100644 --- a/src/mock/ray/gcs/gcs_server/gcs_task_manager.h +++ b/src/mock/ray/gcs/gcs_task_manager.h @@ -12,6 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once + +#include + +#include "ray/gcs/gcs_task_manager.h" + namespace ray { namespace gcs { diff --git a/src/mock/ray/gcs/gcs_server/gcs_worker_manager.h b/src/mock/ray/gcs/gcs_worker_manager.h similarity index 96% rename from src/mock/ray/gcs/gcs_server/gcs_worker_manager.h rename to src/mock/ray/gcs/gcs_worker_manager.h index 7e993fc4814a..e44259ed523f 100644 --- a/src/mock/ray/gcs/gcs_server/gcs_worker_manager.h +++ b/src/mock/ray/gcs/gcs_worker_manager.h @@ -12,6 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#pragma once + +#include + +#include "ray/gcs/gcs_worker_manager.h" + namespace ray { namespace gcs { diff --git a/src/mock/ray/gcs/pubsub/gcs_pub_sub.h b/src/mock/ray/gcs/pubsub/gcs_pub_sub.h deleted file mode 100644 index 14252da567cc..000000000000 --- a/src/mock/ray/gcs/pubsub/gcs_pub_sub.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2021 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -namespace ray { -namespace gcs { - -class MockGcsPubSub : public GcsPubSub { - public: - MOCK_METHOD(Status, - Publish, - (const std::string &channel, - const std::string &id, - const std::string &data, - const StatusCallback &done), - (override)); -}; - -} // namespace gcs -} // namespace ray diff --git a/src/mock/ray/gcs/store_client/in_memory_store_client.h b/src/mock/ray/gcs/store_client/in_memory_store_client.h index 51bebc607e02..16a7a5cab895 100644 --- a/src/mock/ray/gcs/store_client/in_memory_store_client.h +++ b/src/mock/ray/gcs/store_client/in_memory_store_client.h @@ -17,64 +17,64 @@ namespace gcs { class MockInMemoryStoreClient : public InMemoryStoreClient { public: - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncPut, (const std::string &table_name, const std::string &key, - const std::string &data, + std::string data, bool overwrite, Postable callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncGet, (const std::string &table_name, const std::string &key, ToPostable> callback), (override)); - MOCK_METHOD(Status, + 
MOCK_METHOD(void, AsyncGetAll, (const std::string &table_name, Postable)> callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncMultiGet, (const std::string &table_name, const std::vector &keys, Postable)> callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncDelete, (const std::string &table_name, const std::string &key, Postable callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncBatchDelete, (const std::string &table_name, const std::vector &keys, Postable callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncGetKeys, (const std::string &table_name, const std::string &prefix, Postable)> callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncExists, (const std::string &table_name, const std::string &key, Postable callback), (override)); - MOCK_METHOD(Status, AsyncGetNextJobID, (Postable callback), (override)); + MOCK_METHOD(void, AsyncGetNextJobID, (Postable callback), (override)); }; } // namespace gcs diff --git a/src/mock/ray/gcs/store_client/redis_store_client.h b/src/mock/ray/gcs/store_client/redis_store_client.h index a0fc20272f9c..7a73e5b045dd 100644 --- a/src/mock/ray/gcs/store_client/redis_store_client.h +++ b/src/mock/ray/gcs/store_client/redis_store_client.h @@ -17,52 +17,52 @@ namespace gcs { class MockStoreClient : public StoreClient { public: - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncPut, (const std::string &table_name, const std::string &key, - const std::string &data, + std::string data, bool overwrite, Postable callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncGet, (const std::string &table_name, const std::string &key, ToPostable> callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncGetAll, (const std::string &table_name, Postable)> callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncMultiGet, (const std::string &table_name, - const std::vector &key, + const std::vector &keys, Postable)> callback), 
(override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncDelete, (const std::string &table_name, const std::string &key, Postable callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncBatchDelete, (const std::string &table_name, const std::vector &keys, Postable callback), (override)); - MOCK_METHOD(Status, AsyncGetNextJobID, (Postable callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncGetNextJobID, (Postable callback), (override)); + MOCK_METHOD(void, AsyncGetKeys, (const std::string &table_name, const std::string &prefix, Postable)> callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncExists, (const std::string &table_name, const std::string &key, diff --git a/src/mock/ray/gcs/store_client/store_client.h b/src/mock/ray/gcs/store_client/store_client.h index 9094588f5e37..7a73e5b045dd 100644 --- a/src/mock/ray/gcs/store_client/store_client.h +++ b/src/mock/ray/gcs/store_client/store_client.h @@ -17,7 +17,7 @@ namespace gcs { class MockStoreClient : public StoreClient { public: - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncPut, (const std::string &table_name, const std::string &key, @@ -25,44 +25,44 @@ class MockStoreClient : public StoreClient { bool overwrite, Postable callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncGet, (const std::string &table_name, const std::string &key, ToPostable> callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncGetAll, (const std::string &table_name, Postable)> callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncMultiGet, (const std::string &table_name, - const std::vector &key, + const std::vector &keys, Postable)> callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncDelete, (const std::string &table_name, const std::string &key, Postable callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncBatchDelete, (const std::string &table_name, const std::vector &keys, Postable callback), 
(override)); - MOCK_METHOD(Status, AsyncGetNextJobID, (Postable callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncGetNextJobID, (Postable callback), (override)); + MOCK_METHOD(void, AsyncGetKeys, (const std::string &table_name, const std::string &prefix, Postable)> callback), (override)); - MOCK_METHOD(Status, + MOCK_METHOD(void, AsyncExists, (const std::string &table_name, const std::string &key, diff --git a/src/mock/ray/gcs/gcs_client/accessor.h b/src/mock/ray/gcs_client/accessor.h similarity index 97% rename from src/mock/ray/gcs/gcs_client/accessor.h rename to src/mock/ray/gcs_client/accessor.h index 47d920125293..ce66405ed34a 100644 --- a/src/mock/ray/gcs/gcs_client/accessor.h +++ b/src/mock/ray/gcs_client/accessor.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once #include "gmock/gmock.h" -#include "ray/gcs/gcs_client/accessor.h" +#include "ray/gcs_client/accessor.h" namespace ray { namespace gcs { @@ -192,11 +192,7 @@ namespace gcs { class MockErrorInfoAccessor : public ErrorInfoAccessor { public: - MOCK_METHOD(void, - AsyncReportJobError, - (const std::shared_ptr &data_ptr, - const StatusCallback &callback), - (override)); + MOCK_METHOD(void, AsyncReportJobError, (rpc::ErrorTableData data), (override)); }; } // namespace gcs diff --git a/src/mock/ray/gcs/gcs_client/gcs_client.h b/src/mock/ray/gcs_client/gcs_client.h similarity index 94% rename from src/mock/ray/gcs/gcs_client/gcs_client.h rename to src/mock/ray/gcs_client/gcs_client.h index a798ef77760d..1ad7d85b3ffc 100644 --- a/src/mock/ray/gcs/gcs_client/gcs_client.h +++ b/src/mock/ray/gcs_client/gcs_client.h @@ -14,8 +14,8 @@ #pragma once -#include "mock/ray/gcs/gcs_client/accessor.h" -#include "ray/gcs/gcs_client/gcs_client.h" +#include "mock/ray/gcs_client/accessor.h" +#include "ray/gcs_client/gcs_client.h" namespace ray { namespace gcs { @@ -58,6 +58,7 @@ class MockGcsClient : public GcsClient { GcsClient::error_accessor_.reset(mock_error_accessor); 
GcsClient::worker_accessor_.reset(mock_worker_accessor); GcsClient::placement_group_accessor_.reset(mock_placement_group_accessor); + GcsClient::internal_kv_accessor_.reset(mock_internal_kv_accessor); GcsClient::task_accessor_.reset(mock_task_accessor); } MockActorInfoAccessor *mock_actor_accessor; diff --git a/src/mock/ray/object_manager/object_manager.h b/src/mock/ray/object_manager/object_manager.h index 67813e247ca2..3f16bb85b3f5 100644 --- a/src/mock/ray/object_manager/object_manager.h +++ b/src/mock/ray/object_manager/object_manager.h @@ -54,6 +54,8 @@ class MockObjectManager : public ObjectManagerInterface { MOCK_METHOD(void, Stop, (), (override)); MOCK_METHOD(void, RecordMetrics, (), (override)); MOCK_METHOD(void, HandleNodeRemoved, (const NodeID &node_id), (override)); + MOCK_METHOD(void, HandleObjectAdded, (const ObjectInfo &object_info), (override)); + MOCK_METHOD(void, HandleObjectDeleted, (const ObjectID &object_id), (override)); }; } // namespace ray diff --git a/src/mock/ray/object_manager/plasma/client.h b/src/mock/ray/object_manager/plasma/client.h index 8e5905c73463..dd7617b58641 100644 --- a/src/mock/ray/object_manager/plasma/client.h +++ b/src/mock/ray/object_manager/plasma/client.h @@ -79,6 +79,8 @@ class MockPlasmaClient : public PlasmaClientInterface { (override)); MOCK_METHOD(Status, Delete, (const std::vector &object_ids), (override)); + + MOCK_METHOD(StatusOr, GetMemoryUsage, (), (override)); }; } // namespace plasma diff --git a/src/mock/ray/pubsub/BUILD.bazel b/src/mock/ray/pubsub/BUILD.bazel new file mode 100644 index 000000000000..23bfce50a7f4 --- /dev/null +++ b/src/mock/ray/pubsub/BUILD.bazel @@ -0,0 +1,9 @@ +load("//bazel:ray.bzl", "ray_cc_library") + +ray_cc_library( + name = "mock_publisher", + hdrs = ["publisher.h"], + deps = [ + "//src/ray/pubsub:publisher_interface", + ], +) diff --git a/src/mock/ray/pubsub/publisher.h b/src/mock/ray/pubsub/publisher.h index 899f34fd140b..9a1d7c33635f 100644 --- 
a/src/mock/ray/pubsub/publisher.h +++ b/src/mock/ray/pubsub/publisher.h @@ -12,15 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once + +#include "gmock/gmock.h" +#include "ray/pubsub/publisher_interface.h" + namespace ray { namespace pubsub { -class MockPublisher : public Publisher { +class MockPublisher : public PublisherInterface { public: - MOCK_METHOD(bool, + MOCK_METHOD(void, + ConnectToSubscriber, + (const rpc::PubsubLongPollingRequest &request, + std::string *publisher_id, + google::protobuf::RepeatedPtrField *pub_messages, + rpc::SendReplyCallback send_reply_callback), + (override)); + MOCK_METHOD(void, RegisterSubscription, (const rpc::ChannelType channel_type, - const SubscriberID &subscriber_id, + const UniqueID &subscriber_id, const std::optional &key_id), (override)); MOCK_METHOD(void, Publish, (rpc::PubMessage pub_message), (override)); @@ -28,12 +40,14 @@ class MockPublisher : public Publisher { PublishFailure, (const rpc::ChannelType channel_type, const std::string &key_id), (override)); - MOCK_METHOD(bool, + MOCK_METHOD(void, UnregisterSubscription, (const rpc::ChannelType channel_type, - const SubscriberID &subscriber_id, + const UniqueID &subscriber_id, const std::optional &key_id), (override)); + MOCK_METHOD(void, UnregisterSubscriber, (const UniqueID &subscriber_id), (override)); + MOCK_METHOD(std::string, DebugString, (), (const, override)); }; } // namespace pubsub diff --git a/src/mock/ray/pubsub/subscriber.h b/src/mock/ray/pubsub/subscriber.h deleted file mode 100644 index 2aa671795ee9..000000000000 --- a/src/mock/ray/pubsub/subscriber.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright 2021 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "gmock/gmock.h" -#include "ray/pubsub/subscriber.h" - -namespace ray { -namespace pubsub { - -class MockSubscriberClientInterface : public SubscriberClientInterface { - public: - MOCK_METHOD(void, - PubsubLongPolling, - (const rpc::PubsubLongPollingRequest &request, - const rpc::ClientCallback &callback), - (override)); - MOCK_METHOD(void, - PubsubCommandBatch, - (const rpc::PubsubCommandBatchRequest &request, - const rpc::ClientCallback &callback), - (override)); -}; - -class MockSubscriber : public SubscriberInterface { - public: - MOCK_METHOD(bool, - Subscribe, - (std::unique_ptr sub_message, - const rpc::ChannelType channel_type, - const rpc::Address &owner_address, - const std::string &key_id, - pubsub::SubscribeDoneCallback subscribe_done_callback, - pubsub::SubscriptionItemCallback subscription_callback, - pubsub::SubscriptionFailureCallback subscription_failure_callback), - (override)); - - MOCK_METHOD(bool, - SubscribeChannel, - (std::unique_ptr sub_message, - const rpc::ChannelType channel_type, - const rpc::Address &owner_address, - pubsub::SubscribeDoneCallback subscribe_done_callback, - pubsub::SubscriptionItemCallback subscription_callback, - pubsub::SubscriptionFailureCallback subscription_failure_callback), - (override)); - - MOCK_METHOD(bool, - Unsubscribe, - (const rpc::ChannelType channel_type, - const rpc::Address &publisher_address, - const std::string &key_id), - (override)); - - MOCK_METHOD(bool, - UnsubscribeChannel, - (const rpc::ChannelType channel_type, - const rpc::Address &publisher_address), 
- (override)); - - MOCK_METHOD(bool, - IsSubscribed, - (const rpc::ChannelType channel_type, - const rpc::Address &publisher_address, - const std::string &key_id), - (const, override)); - - MOCK_METHOD(std::string, DebugString, (), (const, override)); -}; - -} // namespace pubsub -} // namespace ray diff --git a/src/mock/ray/raylet/local_task_manager.h b/src/mock/ray/raylet/local_lease_manager.h similarity index 76% rename from src/mock/ray/raylet/local_task_manager.h rename to src/mock/ray/raylet/local_lease_manager.h index 1dbbb8aea9ef..825dae47dde7 100644 --- a/src/mock/ray/raylet/local_task_manager.h +++ b/src/mock/ray/raylet/local_lease_manager.h @@ -15,25 +15,25 @@ #pragma once #include "gmock/gmock.h" -#include "ray/raylet/scheduling/local_task_manager_interface.h" +#include "ray/raylet/scheduling/local_lease_manager_interface.h" namespace ray::raylet { -class MockLocalTaskManager : public ILocalTaskManager { +class MockLocalLeaseManager : public LocalLeaseManagerInterface { public: MOCK_METHOD(void, - QueueAndScheduleTask, + QueueAndScheduleLease, (std::shared_ptr work), (override)); - MOCK_METHOD(void, ScheduleAndDispatchTasks, (), (override)); + MOCK_METHOD(void, ScheduleAndGrantLeases, (), (override)); MOCK_METHOD(bool, - CancelTasks, + CancelLeases, (std::function &)> predicate, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, const std::string &scheduling_failure_message), (override)); MOCK_METHOD((const absl::flat_hash_map>> &), - GetTaskToDispatch, + GetLeasesToGrant, (), (const, override)); MOCK_METHOD((const absl::flat_hash_map worker, RayTask *task), + CleanupLease, + (std::shared_ptr worker, RayLease *lease), (override)); - MOCK_METHOD(void, TasksUnblocked, (const std::vector &ready_ids), (override)); + MOCK_METHOD(void, LeasesUnblocked, (const std::vector &ready_ids), (override)); MOCK_METHOD(void, ReleaseWorkerResources, (std::shared_ptr worker), @@ -72,9 +72,9 @@ class MockLocalTaskManager : public ILocalTaskManager { 
MOCK_METHOD(ResourceSet, CalcNormalTaskResources, (), (const, override)); MOCK_METHOD(void, RecordMetrics, (), (const, override)); MOCK_METHOD(void, DebugStr, (std::stringstream & buffer), (const, override)); - MOCK_METHOD(size_t, GetNumTaskSpilled, (), (const, override)); - MOCK_METHOD(size_t, GetNumWaitingTaskSpilled, (), (const, override)); - MOCK_METHOD(size_t, GetNumUnschedulableTaskSpilled, (), (const, override)); + MOCK_METHOD(size_t, GetNumLeaseSpilled, (), (const, override)); + MOCK_METHOD(size_t, GetNumWaitingLeaseSpilled, (), (const, override)); + MOCK_METHOD(size_t, GetNumUnschedulableLeaseSpilled, (), (const, override)); }; } // namespace ray::raylet diff --git a/src/mock/ray/raylet/worker_pool.h b/src/mock/ray/raylet/worker_pool.h index 731e59abd424..6e8337aef2d4 100644 --- a/src/mock/ray/raylet/worker_pool.h +++ b/src/mock/ray/raylet/worker_pool.h @@ -22,7 +22,7 @@ class MockWorkerPool : public WorkerPoolInterface { public: MOCK_METHOD(void, PopWorker, - (const TaskSpecification &task_spec, const PopWorkerCallback &callback), + (const LeaseSpecification &lease_spec, const PopWorkerCallback &callback), (override)); MOCK_METHOD(void, PushWorker, @@ -100,7 +100,7 @@ class MockWorkerPool : public WorkerPoolInterface { (override)); MOCK_METHOD(void, PrestartWorkers, - (const TaskSpecification &task_spec, int64_t backlog_size), + (const LeaseSpecification &lease_spec, int64_t backlog_size), (override)); MOCK_METHOD(void, StartNewWorker, diff --git a/src/mock/ray/raylet_client/raylet_client.h b/src/mock/ray/raylet_client/raylet_client.h index dc804ff16207..9a2c2d06b8b9 100644 --- a/src/mock/ray/raylet_client/raylet_client.h +++ b/src/mock/ray/raylet_client/raylet_client.h @@ -25,24 +25,24 @@ class MockRayletClientInterface : public RayletClientInterface { MOCK_METHOD( void, RequestWorkerLease, - (const rpc::TaskSpec &resource_spec, + (const rpc::LeaseSpec &lease_spec, bool grant_or_reject, const ray::rpc::ClientCallback &callback, const int64_t 
backlog_size, const bool is_selected_based_on_locality), (override)); - MOCK_METHOD(ray::Status, - ReturnWorker, + MOCK_METHOD(void, + ReturnWorkerLease, (int worker_port, - const WorkerID &worker_id, + const LeaseID &lease_id, bool disconnect_worker, const std::string &disconnect_worker_error_detail, bool worker_exiting), (override)); MOCK_METHOD(void, - GetTaskFailureCause, - (const TaskID &task_id, - const rpc::ClientCallback &callback), + GetWorkerFailureCause, + (const LeaseID &lease_id, + const rpc::ClientCallback &callback), (override)); MOCK_METHOD(void, PrestartWorkers, @@ -56,7 +56,7 @@ class MockRayletClientInterface : public RayletClientInterface { (override)); MOCK_METHOD(void, CancelWorkerLease, - (const TaskID &task_id, + (const LeaseID &lease_id, const rpc::ClientCallback &callback), (override)); MOCK_METHOD( @@ -132,9 +132,9 @@ class MockRayletClientInterface : public RayletClientInterface { (override)); MOCK_METHOD( void, - CancelTasksWithResourceShapes, + CancelLeasesWithResourceShapes, ((const std::vector>)&resource_shapes, - const rpc::ClientCallback &callback), + const rpc::ClientCallback &callback), (override)); MOCK_METHOD(void, IsLocalWorkerDead, diff --git a/src/mock/ray/rpc/worker/core_worker_client.h b/src/mock/ray/rpc/worker/core_worker_client.h index 3e7e4d734c4c..26aed0495833 100644 --- a/src/mock/ray/rpc/worker/core_worker_client.h +++ b/src/mock/ray/rpc/worker/core_worker_client.h @@ -15,13 +15,13 @@ #pragma once #include "gmock/gmock.h" +#include "ray/pubsub/subscriber_interface.h" #include "ray/rpc/worker/core_worker_client.h" namespace ray { namespace rpc { -class MockCoreWorkerClientInterface : public ray::pubsub::MockSubscriberClientInterface, - public CoreWorkerClientInterface { +class MockCoreWorkerClientInterface : public CoreWorkerClientInterface { public: MOCK_METHOD(void, PushActorTask, @@ -47,7 +47,7 @@ class MockCoreWorkerClientInterface : public ray::pubsub::MockSubscriberClientIn (override)); MOCK_METHOD(void, 
GetObjectStatus, - (const GetObjectStatusRequest &request, + (GetObjectStatusRequest && request, const ClientCallback &callback), (override)); MOCK_METHOD(void, @@ -67,7 +67,7 @@ class MockCoreWorkerClientInterface : public ray::pubsub::MockSubscriberClientIn (override)); MOCK_METHOD(void, UpdateObjectLocationBatch, - (const UpdateObjectLocationBatchRequest &request, + (UpdateObjectLocationBatchRequest && request, const ClientCallback &callback), (override)); MOCK_METHOD(void, @@ -129,6 +129,7 @@ class MockCoreWorkerClientInterface : public ray::pubsub::MockSubscriberClientIn (const AssignObjectOwnerRequest &request, const ClientCallback &callback), (override)); + MOCK_METHOD(std::string, DebugString, (), (const, override)); }; class MockCoreWorkerClientConfigurableRunningTasks diff --git a/src/ray/common/BUILD.bazel b/src/ray/common/BUILD.bazel index aaa5dc7f6109..f23c1e24ad6b 100644 --- a/src/ray/common/BUILD.bazel +++ b/src/ray/common/BUILD.bazel @@ -1,4 +1,4 @@ -load("//bazel:ray.bzl", "ray_cc_library", "ray_cc_test") +load("//bazel:ray.bzl", "ray_cc_library") ray_cc_library( name = "compat", @@ -11,18 +11,24 @@ ray_cc_library( ) ray_cc_library( - name = "test_util", - srcs = ["test_util.cc"], - hdrs = ["test_util.h"], + name = "test_utils", + srcs = ["test_utils.cc"], + hdrs = ["test_utils.h"], deps = [ ":asio", ":id", + ":placement_group", ":ray_object", + ":task_common", + "//src/ray/protobuf:autoscaler_cc_grpc", "//src/ray/protobuf:common_cc_proto", - "//src/ray/util", + "//src/ray/protobuf:gcs_cc_proto", + "//src/ray/protobuf:gcs_service_cc_grpc", "//src/ray/util:cmd_line_utils", "//src/ray/util:network_util", "//src/ray/util:path_utils", + "//src/ray/util:process", + "//src/ray/util:time", "@boost//:optional", "@com_google_googletest//:gtest", ], @@ -61,7 +67,6 @@ ray_cc_library( deps = [ ":ray_config", ":status", - "//src/ray/util", "//src/ray/util:logging", "//src/ray/util:type_traits", "@com_github_grpc_grpc//:grpc++", @@ -80,7 +85,7 @@ ray_cc_library( 
deps = [ ":asio", ":ray_config", - "//src/ray/util", + "//src/ray/util:process", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", "@com_google_googletest//:gtest_prod", @@ -94,7 +99,6 @@ ray_cc_library( hdrs = ["file_system_monitor.h"], deps = [ ":asio", - "//src/ray/util", "//src/ray/util:event", "@com_google_googletest//:gtest_prod", ], @@ -120,73 +124,141 @@ ray_cc_library( ray_cc_library( name = "id", srcs = [ - "common_protocol.cc", "id.cc", ], hdrs = [ - "common_protocol.h", "id.h", "id_def.h", ], deps = [ ":constants", - ":status", "//src/ray/protobuf:common_cc_proto", - "//src/ray/protobuf:gcs_cc_proto", - "//src/ray/util", + "//src/ray/thirdparty:sha256", + "//src/ray/util:logging", "//src/ray/util:random", - "@com_github_google_flatbuffers//:flatbuffers", + "//src/ray/util:visibility", "@msgpack", ], ) ray_cc_library( - name = "task_common", + name = "flatbuf_utils", + hdrs = [ + "flatbuf_utils.h", + ], + deps = [ + "@com_github_google_flatbuffers//:flatbuffers", + ], +) + +ray_cc_library( + name = "bundle_spec", srcs = [ - "bundle_location_index.cc", "bundle_spec.cc", - "function_descriptor.cc", - "placement_group.cc", - "scheduling/cluster_resource_data.cc", - "scheduling/fixed_point.cc", - "scheduling/label_selector.cc", - "scheduling/resource_instance_set.cc", - "scheduling/resource_set.cc", - "scheduling/scheduling_ids.cc", - "task/task.cc", - "task/task_spec.cc", ], hdrs = [ - "bundle_location_index.h", "bundle_spec.h", - "function_descriptor.h", + ], + deps = [ + ":grpc_util", + ":id", + "//src/ray/common/scheduling:cluster_resource_data", + "//src/ray/common/scheduling:label_selector", + "//src/ray/common/scheduling:placement_group_util", + "//src/ray/common/scheduling:scheduling_ids", + "//src/ray/protobuf:common_cc_proto", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_protobuf//:protobuf", + ], +) + +ray_cc_library( + name = "placement_group", + srcs = [ + "placement_group.cc", + ], + hdrs = [ 
"placement_group.h", - "scheduling/cluster_resource_data.h", - "scheduling/fixed_point.h", - "scheduling/label_selector.h", - "scheduling/resource_instance_set.h", - "scheduling/resource_set.h", - "scheduling/scheduling_ids.h", - "task/task.h", + ], + deps = [ + ":bundle_spec", + ":id", + "//src/ray/protobuf:common_cc_proto", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_protobuf//:protobuf", + ], +) + +ray_cc_library( + name = "function_descriptor", + srcs = ["function_descriptor.cc"], + hdrs = ["function_descriptor.h"], + deps = [ + ":grpc_util", + "//src/ray/protobuf:common_cc_proto", + "//src/ray/util:logging", + "@com_google_absl//absl/strings:str_format", + ], +) + +ray_cc_library( + name = "bundle_location_index", + srcs = ["bundle_location_index.cc"], + hdrs = ["bundle_location_index.h"], + deps = [ + ":id", + ":placement_group", + "//src/ray/protobuf:gcs_cc_proto", + "@com_google_absl//absl/container:flat_hash_map", + ], +) + +ray_cc_library( + name = "task_common", + srcs = [ + "task/task_spec.cc", + ], + hdrs = [ "task/task_common.h", "task/task_spec.h", "task/task_util.h", ], deps = [ ":event_stats", + ":function_descriptor", ":grpc_util", - ":id", ":ray_config", ":ray_object", ":runtime_env", + "//src/ray/common/scheduling:label_selector", + "//src/ray/common/scheduling:resource_set", + "//src/ray/common/scheduling:scheduling_class_util", "//src/ray/flatbuffers:node_manager_generated", - "//src/ray/util", "//src/ray/util:container_util", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/synchronization", + ], +) + +ray_cc_library( + name = "lease", + srcs = [ + "lease/lease_spec.cc", + ], + hdrs = [ + "lease/lease.h", + "lease/lease_spec.h", + ], + deps = [ + ":function_descriptor", + ":id", + ":runtime_env", + "//src/ray/common/scheduling:label_selector", + 
"//src/ray/common/scheduling:resource_set", + "//src/ray/common/scheduling:scheduling_class_util", + "//src/ray/protobuf:common_cc_proto", ], ) @@ -209,7 +281,6 @@ ray_cc_library( deps = [ ":event_stats", ":ray_config", - "//src/ray/util", "//src/ray/util:array", "//src/ray/util:function_traits", "@boost//:asio", @@ -229,7 +300,6 @@ ray_cc_library( deps = [ ":ray_config", "//src/ray/stats:stats_metric", - "//src/ray/util", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/synchronization", ], @@ -244,8 +314,8 @@ ray_cc_library( "ray_internal_flag_def.h", ], deps = [ - "//src/ray/util", - "@com_google_absl//absl/algorithm", + "//src/ray/util:logging", + "@boost//:algorithm", "@com_google_absl//absl/strings", "@nlohmann_json", ], @@ -282,7 +352,9 @@ ray_cc_library( srcs = ["status.cc"], hdrs = ["status.h"], deps = [ + ":macros", ":source_location", + "//src/ray/util:logging", "//src/ray/util:macros", "//src/ray/util:visibility", "@boost//:system", @@ -313,13 +385,27 @@ ray_cc_library( hdrs = ["source_location.h"], ) -ray_cc_test( - name = "source_location_test", - size = "small", - srcs = ["source_location_test.cc"], - tags = ["team:core"], +ray_cc_library( + name = "protobuf_utils", + srcs = ["protobuf_utils.cc"], + hdrs = ["protobuf_utils.h"], + deps = [ + ":constants", + ":id", + ":ray_config", + ":task_common", + "//src/ray/protobuf:autoscaler_cc_proto", + "//src/ray/protobuf:export_task_event_cc_proto", + "//src/ray/protobuf:gcs_cc_proto", + "//src/ray/util:time", + "@com_google_absl//absl/time", + ], +) + +ray_cc_library( + name = "gcs_callbacks", + hdrs = ["gcs_callbacks.h"], deps = [ - ":source_location", - "@com_google_googletest//:gtest_main", + "//src/ray/common:status", ], ) diff --git a/src/ray/common/asio/asio_util.h b/src/ray/common/asio/asio_util.h index 38564c8ac0e3..f360b058aa8d 100644 --- a/src/ray/common/asio/asio_util.h +++ b/src/ray/common/asio/asio_util.h @@ -25,7 +25,6 @@ #include 
"ray/common/asio/instrumented_io_context.h" #include "ray/util/array.h" #include "ray/util/thread_utils.h" -#include "ray/util/util.h" template std::shared_ptr execute_after( @@ -61,7 +60,7 @@ class InstrumentedIOContextWithThread { */ explicit InstrumentedIOContextWithThread(const std::string &thread_name, bool enable_lag_probe = false) - : io_service_(enable_lag_probe, /*running_on_single_thread=*/true), + : io_service_(enable_lag_probe, /*running_on_single_thread=*/true, thread_name), work_(io_service_.get_executor()), thread_name_(thread_name) { io_thread_ = std::thread([this] { @@ -91,7 +90,7 @@ class InstrumentedIOContextWithThread { } private: - instrumented_io_context io_service_{/*enable_lag_probe=*/false, + instrumented_io_context io_service_{/*enable_metrics=*/false, /*running_on_single_thread=*/true}; boost::asio::executor_work_guard work_; // to keep io_service_ running diff --git a/src/ray/common/asio/instrumented_io_context.cc b/src/ray/common/asio/instrumented_io_context.cc index 05a398a131db..9147452b6c7f 100644 --- a/src/ray/common/asio/instrumented_io_context.cc +++ b/src/ray/common/asio/instrumented_io_context.cc @@ -26,17 +26,19 @@ namespace { // Post a probe. Records the lag and schedule another probe. // Requires: `interval_ms` > 0. -void LagProbeLoop(instrumented_io_context &io_context, int64_t interval_ms) { +void LagProbeLoop(instrumented_io_context &io_context, + int64_t interval_ms, + const std::optional &context_name) { auto begin = std::chrono::steady_clock::now(); io_context.post( - [&io_context, begin, interval_ms]() { + [&io_context, begin, interval_ms, context_name]() { auto end = std::chrono::steady_clock::now(); auto duration = std::chrono::duration_cast(end - begin); ray::stats::STATS_io_context_event_loop_lag_ms.Record( duration.count(), { - {"Name", GetThreadName()}, + {"Name", context_name.value_or(GetThreadName())}, }); // Schedule the next probe. 
If `duration` is larger than `interval_ms`, we @@ -44,42 +46,50 @@ void LagProbeLoop(instrumented_io_context &io_context, int64_t interval_ms) { // for `interval_ms - duration`. auto delay = interval_ms - duration.count(); if (delay <= 0) { - LagProbeLoop(io_context, interval_ms); + LagProbeLoop(io_context, interval_ms, context_name); } else { execute_after( io_context, - [&io_context, interval_ms]() { LagProbeLoop(io_context, interval_ms); }, + [&io_context, interval_ms, context_name]() { + LagProbeLoop(io_context, interval_ms, context_name); + }, std::chrono::milliseconds(delay)); } }, "event_loop_lag_probe"); } -void ScheduleLagProbe(instrumented_io_context &io_context) { - if (!RayConfig::instance().enable_metrics_collection()) { - return; - } +void ScheduleLagProbe(instrumented_io_context &io_context, + const std::optional &context_name) { auto interval = RayConfig::instance().io_context_event_loop_lag_collection_interval_ms(); if (interval <= 0) { return; } RAY_LOG(DEBUG) << "Scheduling lag probe for the io_context on thread " - << GetThreadName() << " every " << interval << "ms"; + << context_name.value_or(GetThreadName()) << " every " << interval + << "ms"; // At this time, the `io_context` may not be running yet, so we need to post the // first probe. - io_context.post([&io_context, interval]() { LagProbeLoop(io_context, interval); }, - "event_loop_lag_probe"); + io_context.post( + [&io_context, interval, context_name]() { + LagProbeLoop(io_context, interval, context_name); + }, + "event_loop_lag_probe"); } } // namespace -instrumented_io_context::instrumented_io_context(bool enable_lag_probe, - bool running_on_single_thread) +instrumented_io_context::instrumented_io_context( + const bool emit_metrics, + const bool running_on_single_thread, + const std::optional context_name) : boost::asio::io_context( running_on_single_thread ? 
1 : BOOST_ASIO_CONCURRENCY_HINT_DEFAULT), - event_stats_(std::make_shared()) { - if (enable_lag_probe) { - ScheduleLagProbe(*this); + event_stats_(std::make_shared()), + emit_metrics_(emit_metrics), + context_name_(context_name) { + if (emit_metrics) { + ScheduleLagProbe(*this, context_name_); } } @@ -93,7 +103,8 @@ void instrumented_io_context::post(std::function handler, // GuardedHandlerStats synchronizes internal access, we can concurrently write to the // handler stats it->second from multiple threads without acquiring a table-level // readers lock in the callback. - auto stats_handle = event_stats_->RecordStart(std::move(name)); + auto stats_handle = + event_stats_->RecordStart(std::move(name), emit_metrics_, 0, context_name_); handler = [handler = std::move(handler), stats_handle = std::move(stats_handle)]() mutable { EventTracker::RecordExecution(handler, std::move(stats_handle)); @@ -111,7 +122,8 @@ void instrumented_io_context::dispatch(std::function handler, std::strin if (!RayConfig::instance().event_stats()) { return boost::asio::post(*this, std::move(handler)); } - auto stats_handle = event_stats_->RecordStart(std::move(name)); + auto stats_handle = + event_stats_->RecordStart(std::move(name), emit_metrics_, 0, context_name_); // References are only invalidated upon deletion of the corresponding item from the // table, which we won't do until this io_context is deleted. 
Provided that // GuardedHandlerStats synchronizes internal access, we can concurrently write to the diff --git a/src/ray/common/asio/instrumented_io_context.h b/src/ray/common/asio/instrumented_io_context.h index 120023233a0b..33778bffc80a 100644 --- a/src/ray/common/asio/instrumented_io_context.h +++ b/src/ray/common/asio/instrumented_io_context.h @@ -15,12 +15,9 @@ #pragma once #include -#include #include #include -#include "absl/container/flat_hash_map.h" -#include "absl/synchronization/mutex.h" #include "ray/common/event_stats.h" #include "ray/common/ray_config.h" #include "ray/util/logging.h" @@ -31,11 +28,16 @@ class instrumented_io_context : public boost::asio::io_context { /// Initializes the global stats struct after calling the base contructor. /// TODO(ekl) allow taking an externally defined event tracker. /// - /// \param enable_lag_probe If true, and if related Ray configs are set, schedule a - /// probe to measure the event loop lag. After a probe is done, it schedules another one - /// so a io_context.run() call will never return. - explicit instrumented_io_context(bool enable_lag_probe = false, - bool running_on_single_thread = false); + /// \param emit_metrics enables or disables metric emission on this io_context + /// \param running_on_single_thread hints to the underlying io_context if locking should + /// be enabled or not (that is, if running on multiple threads is true, then concurrency + /// controls will engage) + /// \param context_name optional name assigned to this io_context used for metric + /// emission + explicit instrumented_io_context( + bool emit_metrics = false, + bool running_on_single_thread = false, + std::optional context_name = std::nullopt); /// A proxy post function that collects count, queueing, and execution statistics for /// the given handler. @@ -59,4 +61,6 @@ class instrumented_io_context : public boost::asio::io_context { private: /// The event stats tracker to use to record asio handler stats to. 
std::shared_ptr event_stats_; + bool emit_metrics_; + std::optional context_name_; }; diff --git a/src/ray/common/asio/io_service_pool.cc b/src/ray/common/asio/io_service_pool.cc index 4603266ed64c..9f3c9f8d2a1e 100644 --- a/src/ray/common/asio/io_service_pool.cc +++ b/src/ray/common/asio/io_service_pool.cc @@ -25,7 +25,7 @@ IOServicePool::~IOServicePool() {} void IOServicePool::Run() { for (size_t i = 0; i < io_service_num_; ++i) { io_services_.emplace_back(std::make_unique( - /*enable_lag_probe=*/false, /*running_on_single_thread=*/true)); + /*enable_metrics=*/false, /*running_on_single_thread=*/true)); instrumented_io_context &io_service = *io_services_[i]; threads_.emplace_back([&io_service] { boost::asio::executor_work_guard work( diff --git a/src/ray/common/asio/periodical_runner.cc b/src/ray/common/asio/periodical_runner.cc index b4f7307c7101..9da73cc39596 100644 --- a/src/ray/common/asio/periodical_runner.cc +++ b/src/ray/common/asio/periodical_runner.cc @@ -106,7 +106,8 @@ void PeriodicalRunner::DoRunFnPeriodicallyInstrumented( // NOTE: We add the timer period to the enqueue time in order only measure the time in // which the handler was elgible to execute on the event loop but was queued by the // event loop. 
- auto stats_handle = io_service_.stats().RecordStart(name, period.total_nanoseconds()); + auto stats_handle = + io_service_.stats().RecordStart(name, false, period.total_nanoseconds()); timer->async_wait( [weak_self = weak_from_this(), fn = std::move(fn), diff --git a/src/ray/common/bundle_spec.cc b/src/ray/common/bundle_spec.cc index 111765363b63..336f8906ab11 100644 --- a/src/ray/common/bundle_spec.cc +++ b/src/ray/common/bundle_spec.cc @@ -14,6 +14,10 @@ #include "ray/common/bundle_spec.h" +#include "ray/common/scheduling/label_selector.h" +#include "ray/common/scheduling/placement_group_util.h" +#include "ray/common/scheduling/scheduling_ids.h" + namespace ray { void BundleSpecification::ComputeResources() { @@ -142,59 +146,6 @@ std::string GetOriginalResourceNameFromWildcardResource(const std::string &resou } } -bool IsCPUOrPlacementGroupCPUResource(ResourceID resource_id) { - // Check whether the resource is CPU resource or CPU resource inside PG. - if (resource_id == ResourceID::CPU()) { - return true; - } - - auto possible_pg_resource = ParsePgFormattedResource(resource_id.Binary(), - /*for_wildcard_resource*/ true, - /*for_indexed_resource*/ true); - if (possible_pg_resource.has_value() && - possible_pg_resource->original_resource == ResourceID::CPU().Binary()) { - return true; - } - - return false; -} - -std::optional ParsePgFormattedResource( - const std::string &resource, bool for_wildcard_resource, bool for_indexed_resource) { - // Check if it is a wildcard pg resource. 
- PgFormattedResourceData data; - std::smatch match_groups; - RAY_CHECK(for_wildcard_resource || for_indexed_resource) - << "Either one of for_wildcard_resource or for_indexed_resource must be true"; - - if (for_wildcard_resource) { - static const std::regex wild_card_resource_pattern("^(.*)_group_([0-9a-f]+)$"); - - if (std::regex_match(resource, match_groups, wild_card_resource_pattern) && - match_groups.size() == 3) { - data.original_resource = match_groups[1].str(); - data.bundle_index = -1; - data.group_id = match_groups[2].str(); - return data; - } - } - - // Check if it is a regular pg resource. - if (for_indexed_resource) { - static const std::regex pg_resource_pattern("^(.+)_group_(\\d+)_([0-9a-zA-Z]+)"); - if (std::regex_match(resource, match_groups, pg_resource_pattern) && - match_groups.size() == 4) { - data.original_resource = match_groups[1].str(); - data.bundle_index = stoi(match_groups[2].str()); - data.group_id = match_groups[3].str(); - return data; - } - } - - // If it is not a wildcard or pg formatted resource, return nullopt. - return {}; -} - std::string GetDebugStringForBundles( const std::vector> &bundles) { std::ostringstream debug_info; diff --git a/src/ray/common/bundle_spec.h b/src/ray/common/bundle_spec.h index 5f77cbb7650d..4b59d1895fe3 100644 --- a/src/ray/common/bundle_spec.h +++ b/src/ray/common/bundle_spec.h @@ -14,23 +14,22 @@ #pragma once -#include -#include +#include #include +#include #include -#include "absl/synchronization/mutex.h" -#include "ray/common/function_descriptor.h" +#include "absl/container/flat_hash_map.h" #include "ray/common/grpc_util.h" #include "ray/common/id.h" #include "ray/common/scheduling/cluster_resource_data.h" -#include "ray/common/task/task_common.h" +#include "src/ray/protobuf/common.pb.h" namespace ray { -/// Arguments are the raylet ID to spill back to, the raylet's +/// Arguments are the node ID to spill back to, the raylet's /// address and the raylet's port. 
-typedef std::function SpillbackBundleCallback; +using SpillbackBundleCallback = std::function; const std::string kGroupKeyword = "_group_"; const size_t kGroupKeywordSize = kGroupKeyword.size(); @@ -93,13 +92,6 @@ class BundleSpecification : public MessageWrapper { absl::flat_hash_map bundle_resource_labels_; }; -struct PgFormattedResourceData { - std::string original_resource; - /// -1 if it is a wildcard resource. - int64_t bundle_index; - std::string group_id; -}; - /// Format a placement group resource with provided parameters. /// /// \param original_resource_name The original resource name of the pg resource. @@ -126,23 +118,6 @@ std::string GetOriginalResourceName(const std::string &resource); // Returns "" if the resource is not a wildcard resource. std::string GetOriginalResourceNameFromWildcardResource(const std::string &resource); -/// Return whether the resource specified by the resource_id is a CPU resource -/// or CPU resource inside a placement group. -bool IsCPUOrPlacementGroupCPUResource(ResourceID resource_id); - -/// Parse the given resource and get the pg related information. -/// -/// \param resource name of the resource. -/// \param for_wildcard_resource if true, it parses wildcard pg resources. -/// E.g., [resource]_group_[pg_id] -/// \param for_indexed_resource if true, it parses indexed pg resources. -/// E.g., [resource]_group_[index]_[pg_id] -/// \return nullopt if it is not a pg resource. Otherwise, it returns the -/// struct with pg information parsed from the resource. -/// If a returned bundle index is -1, it means the resource is the wildcard resource. -std::optional ParsePgFormattedResource( - const std::string &resource, bool for_wildcard_resource, bool for_indexed_resource); - /// Generate debug information of given bundles. 
std::string GetDebugStringForBundles( const std::vector> &bundles); diff --git a/src/ray/common/cgroup/BUILD.bazel b/src/ray/common/cgroup/BUILD.bazel index a67086c14e75..1487689d8985 100644 --- a/src/ray/common/cgroup/BUILD.bazel +++ b/src/ray/common/cgroup/BUILD.bazel @@ -14,8 +14,9 @@ ray_cc_library( ":cgroup_utils", ":constants", "//src/ray/common:macros", - "//src/ray/util", + "//src/ray/util:filesystem", "//src/ray/util:invoke_once_token", + "//src/ray/util:logging", "//src/ray/util:path_utils", "@com_google_absl//absl/strings:str_format", ], diff --git a/src/ray/common/cgroup/cgroup_setup.cc b/src/ray/common/cgroup/cgroup_setup.cc index 3087e172d457..5a3903768b7f 100644 --- a/src/ray/common/cgroup/cgroup_setup.cc +++ b/src/ray/common/cgroup/cgroup_setup.cc @@ -85,7 +85,6 @@ Status CheckCgroupV2MountedRW(const std::string &directory) { #include "ray/util/invoke_once_token.h" #include "ray/util/logging.h" #include "ray/util/path_utils.h" -#include "ray/util/util.h" namespace ray { diff --git a/src/ray/common/cgroup/test/BUILD.bazel b/src/ray/common/cgroup/tests/BUILD.bazel similarity index 89% rename from src/ray/common/cgroup/test/BUILD.bazel rename to src/ray/common/cgroup/tests/BUILD.bazel index e0777ca2bec1..5ad80e0fe6aa 100644 --- a/src/ray/common/cgroup/test/BUILD.bazel +++ b/src/ray/common/cgroup/tests/BUILD.bazel @@ -12,7 +12,7 @@ ray_cc_test( ], deps = [ "//src/ray/common/cgroup:cgroup_setup", - "//src/ray/common/test:testing", + "//src/ray/common/tests:testing", "@com_google_googletest//:gtest_main", ], ) @@ -28,7 +28,7 @@ ray_cc_test( ], deps = [ "//src/ray/common/cgroup:cgroup_setup", - "//src/ray/common/test:testing", + "//src/ray/common/tests:testing", "@com_google_googletest//:gtest_main", ], ) @@ -42,7 +42,7 @@ ray_cc_test( ], deps = [ "//src/ray/common/cgroup:fake_cgroup_setup", - "//src/ray/common/test:testing", + "//src/ray/common/tests:testing", "@com_google_googletest//:gtest_main", ], ) @@ -59,7 +59,7 @@ ray_cc_test( ":cgroup_test_utils", 
"//src/ray/common/cgroup:cgroup_setup", "//src/ray/common/cgroup:cgroup_utils", - "//src/ray/common/test:testing", + "//src/ray/common/tests:testing", "@com_google_googletest//:gtest_main", ], ) @@ -70,7 +70,7 @@ ray_cc_library( srcs = ["cgroup_test_utils.cc"], hdrs = ["cgroup_test_utils.h"], deps = [ - "//src/ray/common/test:testing", + "//src/ray/common/tests:testing", "//src/ray/util:compat", "//src/ray/util:container_util", "//src/ray/util:filesystem", diff --git a/src/ray/common/cgroup/test/cgroup_test_utils.cc b/src/ray/common/cgroup/tests/cgroup_test_utils.cc similarity index 93% rename from src/ray/common/cgroup/test/cgroup_test_utils.cc rename to src/ray/common/cgroup/tests/cgroup_test_utils.cc index 3303c4270b75..bdc373fd69d6 100644 --- a/src/ray/common/cgroup/test/cgroup_test_utils.cc +++ b/src/ray/common/cgroup/tests/cgroup_test_utils.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/common/cgroup/test/cgroup_test_utils.h" +#include "ray/common/cgroup/tests/cgroup_test_utils.h" #include @@ -21,7 +21,7 @@ #include "absl/strings/str_split.h" #include "absl/strings/strip.h" -#include "ray/common/test/testing.h" +#include "ray/common/tests/testing.h" #include "ray/util/container_util.h" #include "ray/util/filesystem.h" diff --git a/src/ray/common/cgroup/test/cgroup_test_utils.h b/src/ray/common/cgroup/tests/cgroup_test_utils.h similarity index 100% rename from src/ray/common/cgroup/test/cgroup_test_utils.h rename to src/ray/common/cgroup/tests/cgroup_test_utils.h diff --git a/src/ray/common/cgroup/test/cgroup_v2_setup_test.cc b/src/ray/common/cgroup/tests/cgroup_v2_setup_test.cc similarity index 96% rename from src/ray/common/cgroup/test/cgroup_v2_setup_test.cc rename to src/ray/common/cgroup/tests/cgroup_v2_setup_test.cc index 3feff7298d8a..1cc57433c9f3 100644 --- a/src/ray/common/cgroup/test/cgroup_v2_setup_test.cc +++ 
b/src/ray/common/cgroup/tests/cgroup_v2_setup_test.cc @@ -20,7 +20,7 @@ // https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/8/html/managing_monitoring_and_updating_the_kernel/using-cgroups-v2-to-control-distribution-of-cpu-time-for-applications_managing-monitoring-and-updating-the-kernel#mounting-cgroups-v2_using-cgroups-v2-to-control-distribution-of-cpu-time-for-applications // // Execution command: -// sudo bazel-bin/src/ray/common/cgroup/test/cgroup_v2_setup_test +// sudo bazel-bin/src/ray/common/cgroup/tests/cgroup_v2_setup_test #include #include @@ -35,8 +35,8 @@ #include "ray/common/cgroup/cgroup_setup.h" #include "ray/common/cgroup/cgroup_utils.h" -#include "ray/common/cgroup/test/cgroup_test_utils.h" -#include "ray/common/test/testing.h" +#include "ray/common/cgroup/tests/cgroup_test_utils.h" +#include "ray/common/tests/testing.h" namespace ray { diff --git a/src/ray/common/cgroup/test/cgroup_v2_utils_privileged_test.cc b/src/ray/common/cgroup/tests/cgroup_v2_utils_privileged_test.cc similarity index 97% rename from src/ray/common/cgroup/test/cgroup_v2_utils_privileged_test.cc rename to src/ray/common/cgroup/tests/cgroup_v2_utils_privileged_test.cc index ec1c12e4f8d5..14b0bf9182e7 100644 --- a/src/ray/common/cgroup/test/cgroup_v2_utils_privileged_test.cc +++ b/src/ray/common/cgroup/tests/cgroup_v2_utils_privileged_test.cc @@ -15,7 +15,7 @@ #include #include "ray/common/cgroup/cgroup_setup.h" -#include "ray/common/test/testing.h" +#include "ray/common/tests/testing.h" namespace ray::internal { diff --git a/src/ray/common/cgroup/test/cgroup_v2_utils_unprivileged_test.cc b/src/ray/common/cgroup/tests/cgroup_v2_utils_unprivileged_test.cc similarity index 97% rename from src/ray/common/cgroup/test/cgroup_v2_utils_unprivileged_test.cc rename to src/ray/common/cgroup/tests/cgroup_v2_utils_unprivileged_test.cc index 723f38bc4dfc..38626b4ca313 100644 --- a/src/ray/common/cgroup/test/cgroup_v2_utils_unprivileged_test.cc +++ 
b/src/ray/common/cgroup/tests/cgroup_v2_utils_unprivileged_test.cc @@ -20,7 +20,7 @@ #include #include "ray/common/cgroup/cgroup_setup.h" -#include "ray/common/test/testing.h" +#include "ray/common/tests/testing.h" namespace ray::internal { diff --git a/src/ray/common/cgroup/test/fake_cgroup_setup_test.cc b/src/ray/common/cgroup/tests/fake_cgroup_setup_test.cc similarity index 98% rename from src/ray/common/cgroup/test/fake_cgroup_setup_test.cc rename to src/ray/common/cgroup/tests/fake_cgroup_setup_test.cc index 59c15dabb9ab..fd29f13391ec 100644 --- a/src/ray/common/cgroup/test/fake_cgroup_setup_test.cc +++ b/src/ray/common/cgroup/tests/fake_cgroup_setup_test.cc @@ -19,7 +19,7 @@ #include #include -#include "ray/common/test/testing.h" +#include "ray/common/tests/testing.h" namespace ray { diff --git a/src/ray/common/cgroup2/BUILD.bazel b/src/ray/common/cgroup2/BUILD.bazel index b74e0505428c..b2becaa0575b 100644 --- a/src/ray/common/cgroup2/BUILD.bazel +++ b/src/ray/common/cgroup2/BUILD.bazel @@ -1,13 +1,54 @@ load("//bazel:ray.bzl", "ray_cc_library") +config_setting( + name = "is_linux", + constraint_values = ["@platforms//os:linux"], +) + +# Public targets. 
+ray_cc_library( + name = "cgroup_manager", + srcs = select({ + ":is_linux": ["cgroup_manager.cc"], + "//conditions:default": ["noop_cgroup_manager.cc"], + }), + hdrs = [ + "cgroup_manager.h", + "scoped_cgroup_operation.h", + ], + visibility = ["//visibility:public"], + deps = [ + ":cgroup_driver_interface", + ":cgroup_manager_interface", + "//src/ray/common:status", + "//src/ray/common:status_or", + ] + select({ + ":is_linux": [ + "//src/ray/util:logging", + "@com_google_absl//absl/strings", + ], + "//conditions:default": [], + }), +) + ray_cc_library( name = "cgroup_driver_interface", hdrs = [ "cgroup_driver_interface.h", ], - tags = [ - "no_windows", + visibility = ["//visibility:public"], + deps = [ + "//src/ray/common:status", + "//src/ray/common:status_or", ], +) + +ray_cc_library( + name = "cgroup_manager_interface", + hdrs = [ + "cgroup_manager_interface.h", + ], + visibility = ["//visibility:public"], deps = [ "//src/ray/common:status", "//src/ray/common:status_or", @@ -16,18 +57,55 @@ ray_cc_library( ray_cc_library( name = "sysfs_cgroup_driver", - srcs = ["sysfs_cgroup_driver.cc"], + srcs = select({ + ":is_linux": ["sysfs_cgroup_driver.cc"], + "//conditions:default": ["noop_sysfs_cgroup_driver.cc"], + }), hdrs = [ "sysfs_cgroup_driver.h", ], - tags = [ - "no_windows", + visibility = ["//visibility:public"], + deps = [ + ":cgroup_driver_interface", + "//src/ray/common:status", + "//src/ray/common:status_or", + ] + select({ + ":is_linux": [ + "//src/ray/util:logging", + "@com_google_absl//absl/strings", + ], + "//conditions:default": [], + }), +) + +# Private Targets. 
+ray_cc_library( + name = "fake_cgroup_driver", + hdrs = [ + "fake_cgroup_driver.h", ], + target_compatible_with = [ + "@platforms//os:linux", + ], + visibility = [":__subpackages__"], deps = [ ":cgroup_driver_interface", "//src/ray/common:status", + ], +) + +ray_cc_library( + name = "cgroup_test_utils", + srcs = ["cgroup_test_utils.cc"], + hdrs = ["cgroup_test_utils.h"], + target_compatible_with = [ + "@platforms//os:linux", + ], + visibility = [":__subpackages__"], + deps = [ + "//src/ray/common:id", + "//src/ray/common:status", "//src/ray/common:status_or", - "//src/ray/util:logging", - "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) diff --git a/src/ray/common/cgroup2/cgroup_driver_interface.h b/src/ray/common/cgroup2/cgroup_driver_interface.h index 132000c79ff5..01f61c68e0e7 100644 --- a/src/ray/common/cgroup2/cgroup_driver_interface.h +++ b/src/ray/common/cgroup2/cgroup_driver_interface.h @@ -68,19 +68,34 @@ class CgroupDriverInterface { /** Creates a new cgroup at the specified path. + Expects all cgroups on the path from root -> the new cgroup to already exist. Expects the user to have read, write, and execute privileges to parent cgroup. @param cgroup is an absolute path to the cgroup - @return Status::OK if no errors are encounted. Otherwise, one of the following errors + @return Status::OK if no errors are encounted. @return Status::NotFound if an ancestor cgroup does not exist. - @return Status::PermissionDenied if current user doesn't have read, write, and execute - permissions. + @return Status::PermissionDenied if the process doesn't have sufficient permissions. @return Status::AlreadyExists if the cgroup already exists. */ virtual Status CreateCgroup(const std::string &cgroup) = 0; + /** + Deletes the specified cgroup. + + Expects all cgroups from the root -> the specified cgroup to exist. + Expects the cgroup to have no children. + Expects the process to have adequate permissions for the parent cgroup. 
+ + @param cgroup is an absolute path to the cgroup + + @return Status::OK if no errors are encounted. + @return Status::NotFound if an ancestor cgroup does not exist. + @return Status::PermissionDenied if the process doesn't have sufficient permissions. + */ + virtual Status DeleteCgroup(const std::string &cgroup) = 0; + /** Move all processes from one cgroup to another. The process must have read, write, and execute permissions for both cgroups and their lowest common ancestor. @@ -157,6 +172,7 @@ class CgroupDriverInterface { supported or the value not correct. */ virtual Status AddConstraint(const std::string &cgroup, + const std::string &controller, const std::string &constraint, const std::string &value) = 0; /** @@ -190,17 +206,5 @@ class CgroupDriverInterface { */ virtual StatusOr> GetEnabledControllers( const std::string &cgroup) = 0; - - struct Constraint { - std::pair range; - std::string controller; - }; - - protected: - const std::unordered_map supported_constraints_ = { - {"cpu.weight", {{1, 10000}, "cpu"}}, - {"memory.min", {{0, std::numeric_limits::max()}, "memory"}}, - }; - const std::unordered_set supported_controllers_ = {"cpu", "memory"}; }; } // namespace ray diff --git a/src/ray/common/cgroup2/cgroup_manager.cc b/src/ray/common/cgroup2/cgroup_manager.cc new file mode 100644 index 000000000000..e210191565d2 --- /dev/null +++ b/src/ray/common/cgroup2/cgroup_manager.cc @@ -0,0 +1,262 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/common/cgroup2/cgroup_manager.h" + +#include +#include +#include +#include +#include + +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "ray/common/cgroup2/cgroup_driver_interface.h" +#include "ray/common/cgroup2/scoped_cgroup_operation.h" +#include "ray/common/status_or.h" + +namespace ray { + +CgroupManager::CgroupManager(std::string base_cgroup_path, + const std::string &node_id, + std::unique_ptr cgroup_driver) + : base_cgroup_path_(std::move(base_cgroup_path)), + cgroup_driver_(std::move(cgroup_driver)) { + node_cgroup_path_ = base_cgroup_path_ + std::filesystem::path::preferred_separator + + absl::StrFormat("%s_%s", kNodeCgroupName, node_id); + system_cgroup_path_ = + node_cgroup_path_ + std::filesystem::path::preferred_separator + kSystemCgroupName; + + application_cgroup_path_ = node_cgroup_path_ + + std::filesystem::path::preferred_separator + + kApplicationCgroupName; +} + +CgroupManager::~CgroupManager() { + while (!cleanup_operations_.empty()) { + cleanup_operations_.pop_back(); + } +} + +StatusOr> CgroupManager::Create( + std::string base_cgroup_path, + const std::string &node_id, + const int64_t system_reserved_cpu_weight, + const int64_t system_reserved_memory_bytes, + std::unique_ptr cgroup_driver) { + if (!cpu_weight_constraint_.IsValid(system_reserved_cpu_weight)) { + return Status::InvalidArgument( + absl::StrFormat("Invalid constraint %s=%d. %s must be in the range [%d, %d].", + cpu_weight_constraint_.name_, + system_reserved_cpu_weight, + cpu_weight_constraint_.name_, + cpu_weight_constraint_.Min(), + cpu_weight_constraint_.Max())); + } + if (!memory_min_constraint_.IsValid(system_reserved_memory_bytes)) { + return Status::InvalidArgument( + absl::StrFormat("Invalid constraint %s=%d. 
%s must be in the range [%d, %d].", + memory_min_constraint_.name_, + system_reserved_memory_bytes, + memory_min_constraint_.name_, + memory_min_constraint_.Min(), + memory_min_constraint_.Max())); + } + RAY_RETURN_NOT_OK(cgroup_driver->CheckCgroupv2Enabled()); + RAY_RETURN_NOT_OK(cgroup_driver->CheckCgroup(base_cgroup_path)); + StatusOr> available_controllers = + cgroup_driver->GetAvailableControllers(base_cgroup_path); + + if (!available_controllers.ok()) { + return available_controllers.status(); + } + + std::string supported_controllers_str = + absl::StrCat("[", absl::StrJoin(supported_controllers_, ", "), "]"); + + for (const auto &ctrl : supported_controllers_) { + if (available_controllers->find(ctrl) == available_controllers->end()) { + std::string available_controllers_str = + absl::StrCat("[", absl::StrJoin(*available_controllers, ", "), "]"); + return Status::Invalid(absl::StrFormat( + "Failed to initialize resource isolation " + "because required controllers are not available in the cgroup %s. " + "To make controllers available in %s, you need to enable controllers for its " + "ancestor cgroups. See " + "https://docs.kernel.org/admin-guide/cgroup-v2.html#controlling-controllers " + "for more details. Available controllers: %s. 
Required controllers: " + "%s.", + base_cgroup_path, + base_cgroup_path, + available_controllers_str, + supported_controllers_str)); + } + } + + std::unique_ptr cgroup_manager = std::unique_ptr( + new CgroupManager(std::move(base_cgroup_path), node_id, std::move(cgroup_driver))); + + RAY_RETURN_NOT_OK(cgroup_manager->Initialize(system_reserved_cpu_weight, + system_reserved_memory_bytes)); + + return cgroup_manager; +} + +void CgroupManager::RegisterDeleteCgroup(const std::string &cgroup_path) { + cleanup_operations_.emplace_back([this, cgroup = cgroup_path]() { + Status s = this->cgroup_driver_->DeleteCgroup(cgroup); + if (!s.ok()) { + RAY_LOG(WARNING) << absl::StrFormat( + "Failed to delete cgroup %s with error %s.", cgroup, s.ToString()); + } + }); +} + +void CgroupManager::RegisterMoveAllProcesses(const std::string &from, + const std::string &to) { + cleanup_operations_.emplace_back([this, from_cgroup = from, to_cgroup = to]() { + Status s = this->cgroup_driver_->MoveAllProcesses(from_cgroup, to_cgroup); + if (!s.ok()) { + RAY_LOG(WARNING) << absl::StrFormat( + "Failed to move all processes from %s to %s with error %s", + from_cgroup, + to_cgroup, + s.ToString()); + } + }); +} + +template +void CgroupManager::RegisterRemoveConstraint(const std::string &cgroup, + const Constraint &constraint) { + cleanup_operations_.emplace_back( + [this, constrained_cgroup = cgroup, constraint_to_remove = constraint]() { + std::string default_value = std::to_string(constraint_to_remove.default_value_); + Status s = this->cgroup_driver_->AddConstraint(constrained_cgroup, + constraint_to_remove.controller_, + constraint_to_remove.name_, + default_value); + if (!s.ok()) { + RAY_LOG(WARNING) << absl::StrFormat( + "Failed to set constraint %s=%s to default value for cgroup %s with error " + "%s.", + constraint_to_remove.name_, + default_value, + constrained_cgroup, + s.ToString()); + } + }); +} + +void CgroupManager::RegisterDisableController(const std::string &cgroup_path, + const 
std::string &controller) { + cleanup_operations_.emplace_back( + [this, cgroup = cgroup_path, controller_to_disable = controller]() { + Status s = this->cgroup_driver_->DisableController(cgroup, controller_to_disable); + if (!s.ok()) { + RAY_LOG(WARNING) << absl::StrFormat( + "Failed to disable controller %s for cgroup %s with error %s", + controller_to_disable, + cgroup, + s.ToString()); + } + }); +} + +Status CgroupManager::Initialize(int64_t system_reserved_cpu_weight, + int64_t system_reserved_memory_bytes) { + std::string supported_controllers = + absl::StrCat("[", absl::StrJoin(supported_controllers_, ", "), "]"); + + // The cpu.weight is distributed between the system and application cgroups. + // The application cgroup gets whatever is leftover from the system cgroup. + int64_t application_cgroup_cpu_weight = + cpu_weight_constraint_.Max() - system_reserved_cpu_weight; + + RAY_LOG(INFO) << absl::StrFormat( + "Initializing CgroupManager at base cgroup at '%s'. Ray's cgroup " + "hierarchy will under the node cgroup at '%s'. The %s controllers will be " + "enabled. " + "The system cgroup at '%s' will have constraints [%s=%lld, %s=%lld]. " + "The application cgroup '%s' will have constraints [%s=%lld].", + base_cgroup_path_, + node_cgroup_path_, + supported_controllers, + system_cgroup_path_, + cpu_weight_constraint_.name_, + system_reserved_cpu_weight, + memory_min_constraint_.name_, + system_reserved_memory_bytes, + application_cgroup_path_, + cpu_weight_constraint_.name_, + application_cgroup_cpu_weight); + + // Create the cgroup heirarchy: + // base_cgroup_path (e.g. 
/sys/fs/cgroup) + // | + // ray_node_ + // | | + // system application + RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(node_cgroup_path_)); + RegisterDeleteCgroup(node_cgroup_path_); + + RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(system_cgroup_path_)); + RegisterDeleteCgroup(system_cgroup_path_); + + RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(application_cgroup_path_)); + RegisterDeleteCgroup(application_cgroup_path_); + + // Move all processes from the base_cgroup into the system_cgroup to make sure + // that the no internal process constraint is not violated. This is relevant + // when the base_cgroup_path is not a root cgroup for the system. This is likely + // the case if Ray is running inside a container. + RAY_RETURN_NOT_OK( + cgroup_driver_->MoveAllProcesses(base_cgroup_path_, system_cgroup_path_)); + RegisterMoveAllProcesses(system_cgroup_path_, base_cgroup_path_); + + for (const auto &ctrl : supported_controllers_) { + RAY_RETURN_NOT_OK(cgroup_driver_->EnableController(base_cgroup_path_, ctrl)); + RegisterDisableController(base_cgroup_path_, ctrl); + RAY_RETURN_NOT_OK(cgroup_driver_->EnableController(node_cgroup_path_, ctrl)); + RegisterDisableController(node_cgroup_path_, ctrl); + RAY_RETURN_NOT_OK(cgroup_driver_->EnableController(system_cgroup_path_, ctrl)); + RegisterDisableController(system_cgroup_path_, ctrl); + RAY_RETURN_NOT_OK(cgroup_driver_->EnableController(application_cgroup_path_, ctrl)); + RegisterDisableController(application_cgroup_path_, ctrl); + } + + RAY_RETURN_NOT_OK( + cgroup_driver_->AddConstraint(system_cgroup_path_, + cpu_weight_constraint_.controller_, + cpu_weight_constraint_.name_, + std::to_string(system_reserved_cpu_weight))); + RegisterRemoveConstraint(system_cgroup_path_, cpu_weight_constraint_); + + RAY_RETURN_NOT_OK( + cgroup_driver_->AddConstraint(system_cgroup_path_, + memory_min_constraint_.controller_, + memory_min_constraint_.name_, + std::to_string(system_reserved_memory_bytes))); + 
RegisterRemoveConstraint(system_cgroup_path_, memory_min_constraint_); + + RAY_RETURN_NOT_OK( + cgroup_driver_->AddConstraint(application_cgroup_path_, + cpu_weight_constraint_.controller_, + cpu_weight_constraint_.name_, + std::to_string(application_cgroup_cpu_weight))); + RegisterRemoveConstraint(application_cgroup_path_, cpu_weight_constraint_); + + return Status::OK(); +} +} // namespace ray diff --git a/src/ray/common/cgroup2/cgroup_manager.h b/src/ray/common/cgroup2/cgroup_manager.h new file mode 100644 index 000000000000..466abe6e1257 --- /dev/null +++ b/src/ray/common/cgroup2/cgroup_manager.h @@ -0,0 +1,136 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include + +#include "ray/common/cgroup2/cgroup_driver_interface.h" +#include "ray/common/cgroup2/cgroup_manager_interface.h" +#include "ray/common/cgroup2/scoped_cgroup_operation.h" +#include "ray/common/status.h" +#include "ray/common/status_or.h" + +namespace ray { +class CgroupManager : public CgroupManagerInterface { + public: + /** + Creates a CgroupManager after checking for the following invariants: + + 1. cgroupv2 is mounted correctly in unified mode. For more details (@see + CgroupDriverInterface::CheckCgroupv2Enabled). + 2. the current process has permissions to read and write to the base_cgroup. + 3. supported cgroup controllers are available (@see supported_controllers_). 
+ + The CgroupManager will be used to + 1. construct the cgroup hierarchy. + 2. move processes into the appropriate cgroups. + 3. enable controllers and resource constraints. + + @param base_cgroup the cgroup that the process will take ownership of. + @param node_id used to create a ray node cgroup. + @param system_reserved_cpu_weight a value between [1,10000] to assign to the cgroup + for system processes. The cgroup for application processes gets 10000 - + system_reserved_cpu_weight. + @param system_reserved_memory_bytes used to reserve memory for the system cgroup. + @param cgroup_driver used to perform cgroup operations. + + @return Status::OK with an instance of CgroupManager if everything succeeds. + @return Status::Invalid if cgroupv2 is not enabled correctly. + @return Status::InvalidArgument if base_cgroup is not a cgroup. + @return Status::NotFound if the base_cgroupd does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and + execute permissions. + */ + static StatusOr> Create( + std::string base_cgroup_path, + const std::string &node_id, + const int64_t system_reserved_cpu_weight, + const int64_t system_reserved_memory_bytes, + std::unique_ptr cgroup_driver); + + // Unmovable and uncopyable type. + CgroupManager(const CgroupManager &) = delete; + CgroupManager &operator=(const CgroupManager &) = delete; + CgroupManager(CgroupManager &&) = default; + CgroupManager &operator=(CgroupManager &&) = default; + + /** + Performs cleanup in reverse order from the Initialize function: + 1. remove resource constraints to the system and application cgroups. + 2. disable controllers on the base, system, and application cgroups respectively. + 3. move all processes from the system cgroup into the base cgroup. + 4. delete the node, system, and application cgroups respectively. + + Cleanup is best-effort. If any step fails, it will log a warning. 
+ */ + ~CgroupManager() override; + + private: + CgroupManager(std::string base_cgroup_path, + const std::string &node_id, + std::unique_ptr cgroup_driver); + + /** + Performs the following operations: + + 1. create the node, system, and application cgroups respectively. + 2. move all processes from the base_cgroup into the system cgroup. + 3. enable controllers the base, node, system, and application cgroups respectively. + 4. add resource constraints to the system and application cgroups. + + @param system_reserved_cpu_weight a value between [1,10000] to assign to the cgroup + for system processes. The cgroup for application processes gets 10000 - + system_reserved_cpu_weight. + @param system_reserved_memory_bytes used to reserve memory for the system cgroup. + + @return Status::OK if no errors encountered. + @return Status::NotFound if base_cgroup does not exist. + @return Status::PermissionDenied if the process does not have enough permissions + to create a cgroup or write to it. + @return Status::Invalid if processes could not be moved between cgroups. + @return Status::InvalidArgument if base_cgroup_path_ is not a valid cgroup, + supported_controllers_ cannot be enabled, or a constraint is not supported. + @return Status::AlreadyExists if the the node, application, or system cgroup already + exists. + + */ + Status Initialize(const int64_t system_reserved_cpu_weight, + const int64_t system_reserved_memory_bytes); + + // The Register* methods register a callback that will execute in the destructor + // in FILO order. All callbacks required the cgroup_driver_ to be available to + // remove the cgroup hierarchy. 
+ void RegisterDeleteCgroup(const std::string &cgroup); + void RegisterMoveAllProcesses(const std::string &from, const std::string &to); + template + void RegisterRemoveConstraint(const std::string &cgroup, + const Constraint &constraint); + void RegisterDisableController(const std::string &cgroup, + const std::string &controller); + + std::string base_cgroup_path_; + std::string node_cgroup_path_; + std::string system_cgroup_path_; + std::string application_cgroup_path_; + + // This will be popped in reverse order to clean up all side-effects performed + // during setup. + std::vector cleanup_operations_; + + std::unique_ptr cgroup_driver_; +}; +} // namespace ray diff --git a/src/ray/common/cgroup2/cgroup_manager_interface.h b/src/ray/common/cgroup2/cgroup_manager_interface.h new file mode 100644 index 000000000000..28b6f936932f --- /dev/null +++ b/src/ray/common/cgroup2/cgroup_manager_interface.h @@ -0,0 +1,89 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "ray/common/cgroup2/cgroup_driver_interface.h" +#include "ray/common/status_or.h" + +namespace ray { + +/** + Sets up resource isolation for a Ray node using cgroup2 using the following + cgroup hierachy: + + base_cgroup_path (e.g. 
/sys/fs/cgroup) + | + ray_node_ + | | + system application +*/ +class CgroupManagerInterface { + public: + // TODO(#54703): These will be implemented in a later PR to move processes + // into a cgroup. + // virtual Status AddProcessToApplicationCgroup(int) = 0; + // virtual Status AddProcessToSystemCgroup(int) = 0; + + /** + Cleans up the cgroup hierarchy, disables all controllers and removes all + constraints. + */ + virtual ~CgroupManagerInterface() = default; + + protected: + inline static const std::string kNodeCgroupName = "ray_node"; + inline static const std::string kSystemCgroupName = "system"; + inline static const std::string kApplicationCgroupName = "application"; + + // Controllers that can be enabled in Ray. + inline static const std::unordered_set supported_controllers_ = {"cpu", + "memory"}; + /** + Metadata about constraints that can be used. + @tparam the type of value that the constraint can take. + */ + template + struct Constraint { + std::string name_; + std::string controller_; + std::pair range_; + T default_value_; + T Max() const { return range_.second; } + T Min() const { return range_.first; } + bool IsValid(T value) const { return value <= Max() && value >= Min(); } + }; + + // cpu.weight distributes a cgroup's cpu cycles between it's children. + // See https://docs.kernel.org/admin-guide/cgroup-v2.html#cpu-interface-files + inline static const Constraint cpu_weight_constraint_{ + "cpu.weight", "cpu", {1, 10000}, 100}; + + // memory.min guarantees hard memory protection. If the memory usage of a cgroup + // is within its effective min boundary, the cgroup’s memory won’t be reclaimed under + // any conditions. 
+ // See https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files + inline static const Constraint memory_min_constraint_{ + "memory.min", "memory", {0, std::numeric_limits::max()}, 0}; +}; +} // namespace ray diff --git a/src/ray/common/cgroup2/cgroup_test_utils.cc b/src/ray/common/cgroup2/cgroup_test_utils.cc new file mode 100644 index 000000000000..49939b576153 --- /dev/null +++ b/src/ray/common/cgroup2/cgroup_test_utils.cc @@ -0,0 +1,293 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/common/cgroup2/cgroup_test_utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/strings/str_format.h" +#include "ray/common/id.h" +#include "ray/common/status.h" +#include "ray/common/status_or.h" +#include "ray/util/logging.h" + +ray::StatusOr> TempCgroupDirectory::Create( + const std::string &base_path, mode_t mode) { + std::string random_name = ray::UniqueID::FromRandom().Hex(); + std::string name = random_name.substr(0, std::min(6, random_name.size())); + std::string path = base_path + std::filesystem::path::preferred_separator + name; + if (mkdir(path.c_str(), mode) == -1) { + return ray::Status::IOError( + absl::StrFormat("Failed to create cgroup directory at path %s.\n" + "Cgroup tests expect tmpfs and cgroupv2 to be mounted " + "and only run on Linux.\n" + "Error: %s", + path, + strerror(errno))); + } + auto output = std::make_unique(std::move(name), std::move(path)); + return output; +} + +TempCgroupDirectory::~TempCgroupDirectory() noexcept(false) { + // TODO(#54703): This can be refactored to disarm the destructor so that when you delete + // a cgroup created with TempCgroupDirectory and delete it outside the handler, this + // will not attempt to delete it. + if (rmdir(path_.c_str()) == -1) { + if (errno != ENOENT) { + RAY_LOG(WARNING) << absl::StrFormat( + "Failed to delete a cgroup directory at %s with error %s. Please manually " + "delete it with rmdir.", + path_, + strerror(errno)); + } + } +} + +ray::StatusOr> TempDirectory::Create() { + std::string path = "/tmp/XXXXXX"; + char *ret = mkdtemp(path.data()); + if (ret == nullptr) { + return ray::Status::Invalid( + absl::StrFormat("Failed to create a temp directory on tmpfs with error %s." 
+ "Cgroup tests expect tmpfs to be mounted and only run on Linux.", + strerror(errno))); + } + std::unique_ptr temp_dir = + std::make_unique(std::move(path)); + return ray::StatusOr>(std::move(temp_dir)); +} + +TempDirectory::~TempDirectory() { + std::error_code error_code; + RAY_CHECK(std::filesystem::remove_all(path_, error_code)) << absl::StrFormat( + "Failed to delete temp directory at %s with error %s. Please manually " + "delete it with rmdir.", + path_, + error_code.message()); +} + +/** + Note: clone3 supports creating a process inside a cgroup instead of creating + and then moving. However, clone3 does not have a glibc wrapper and + must be called directly using syscall syscall (see man 2 syscall). + This function needs linux kernel >= 5.7 to use the CLONE_INTO_CGROUP flag. +*/ +#ifdef CLONE_INTO_CGROUP +ray::StatusOr> StartChildProcessInCgroup( + const std::string &cgroup_path) { + int cgroup_fd = open(cgroup_path.c_str(), O_RDONLY); + if (cgroup_fd == -1) { + return ray::Status::InvalidArgument( + absl::StrFormat("Unable to open fd for cgroup at %s with error %s.", + cgroup_path, + strerror(errno))); + } + + // Will be set by clone3 if a child process is successfully created. + pid_t child_pidfd = -1; + + clone_args cl_args = {}; + cl_args.flags = CLONE_PIDFD | CLONE_INTO_CGROUP; + cl_args.cgroup = cgroup_fd; + + // Can be used both as a pid and as a fd. + cl_args.pidfd = ((__u64)((uintptr_t)(&child_pidfd))); + + int child_pid = -1; + + if ((child_pid = syscall(__NR_clone3, &cl_args, sizeof(struct clone_args))) == -1) { + close(cgroup_fd); + return ray::Status::Invalid( + absl::StrFormat("Failed to clone process into cgroup %s with error %s.", + cgroup_path, + strerror(errno))); + } + + if (child_pid == 0) { + // Child process will wait for parent to unblock it. + pause(); + _exit(0); + } + + // Parent process will continue here. + close(cgroup_fd); + return std::make_pair(child_pid, static_cast(child_pidfd)); +} +#else +// Fallback for older kernels. 
Uses fork/exec instead. +ray::StatusOr> StartChildProcessInCgroup( + const std::string &cgroup_path) { + int new_pid = fork(); + if (new_pid == -1) { + return ray::Status::Invalid( + absl::StrFormat("Failed to fork process with error %s.", strerror(errno))); + } + + if (new_pid == 0) { + // Child process will pause and wait for parent to terminate and reap it. + pause(); + _exit(0); + } + + std::string cgroup_proc_file_path = cgroup_path + "/cgroup.procs"; + + // Parent process has to move the process into a cgroup. + int cgroup_fd = open(cgroup_proc_file_path.c_str(), O_RDWR); + + if (cgroup_fd == -1) { + return ray::Status::Invalid( + absl::StrFormat("Failed to open cgroup procs file at path %s with error %s.", + cgroup_proc_file_path, + strerror(errno))); + } + + std::string pid_to_write = std::to_string(new_pid); + + if (write(cgroup_fd, pid_to_write.c_str(), pid_to_write.size()) == -1) { + // Best effort killing of the child process because we couldn't move it + // into the cgroup. + kill(SIGKILL, new_pid); + close(cgroup_fd); + return ray::Status::Invalid( + absl::StrFormat("Failed to write pid %i to cgroup procs file %s with error %s.", + new_pid, + cgroup_proc_file_path, + strerror(errno))); + } + + close(cgroup_fd); + + int child_pidfd = static_cast(syscall(SYS_pidfd_open, new_pid, 0)); + if (child_pidfd == -1) { + // Best effort killing of the child process because we couldn't create + // a pidfd from the process. 
+ kill(SIGKILL, new_pid); + close(cgroup_fd); + return ray::Status::Invalid( + absl::StrFormat("Failed to create process fd for pid %i with error %s.", + new_pid, + strerror(errno))); + } + return std::make_pair(new_pid, child_pidfd); +} +#endif + +ray::Status TerminateChildProcessAndWaitForTimeout(pid_t pid, int fd, int timeout_ms) { + if (kill(pid, SIGKILL) == -1) { + return ray::Status::InvalidArgument(absl::StrFormat( + "Failed to send SIGTERM to pid: %i with error %s.", pid, strerror(errno))); + } + struct pollfd poll_fd = { + .fd = fd, + .events = POLLIN, + }; + + int poll_status = poll(&poll_fd, 1, timeout_ms); + if (poll_status == -1) { + return ray::Status::InvalidArgument( + absl::StrFormat("Failed to poll process pid: %i, fd: %i with error %s. Process " + "was not killed. Kill it manually to prevent a leak.", + pid, + fd, + strerror(errno))); + } + if (poll_status == 0) { + return ray::Status::Invalid( + absl::StrFormat("Process pid: %i, fd: %i was not killed within the timeout of " + "%ims. Kill it manually to prevent a leak.", + pid, + fd, + timeout_ms)); + } + siginfo_t dummy = {0}; + int wait_id_status = waitid(P_PID, static_cast(fd), &dummy, WEXITED); + if (wait_id_status == -1) { + if (errno != ECHILD) + return ray::Status::Invalid( + absl::StrFormat("Failed to wait for process pid: %i, fd: %i with error %s. " + "Process was not reaped, but " + "it will be reaped by init after program exits.", + pid, + fd, + strerror(errno))); + }; + return ray::Status::OK(); +} + +TempFile::TempFile(std::string path) { + path_ = path; + fd_ = open(path_.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); // NOLINT + RAY_CHECK(fd_ != -1) << absl::StrFormat( + "Failed to create a temp file at path %s with error %s. 
Cgroup tests expect " + "tmpfs to be mounted and only run on Linux.", + path_, + strerror(errno)); + file_output_stream_ = std::ofstream(path_, std::ios::trunc); + RAY_CHECK(file_output_stream_.is_open()) << absl::StrFormat( + "Failed to open file %s on tmpfs with error %s", path_, strerror(errno)); +} + +TempFile::TempFile() { + fd_ = mkstemp(path_.data()); // NOLINT + if (fd_ == -1) { + throw std::runtime_error( + "Failed to create a temp file. Cgroup tests expect tmpfs to be " + "mounted " + "and only run on Linux"); + } + file_output_stream_ = std::ofstream(path_, std::ios::trunc); + RAY_CHECK(file_output_stream_.is_open()) + << absl::StrFormat("Could not open temporary file at path %s.", path_); +} + +TempFile::~TempFile() { + RAY_CHECK(close(fd_) != -1) << absl::StrFormat( + "Failed to close file descriptor with error %s.", strerror(errno)); + file_output_stream_.close(); + RAY_CHECK(unlink(path_.c_str()) != -1) + << absl::StrFormat("Failed to unlink temporary file at path %s with error %s.", + path_, + strerror(errno)); +} + +void TempFile::AppendLine(const std::string &line) { + file_output_stream_ << line; + file_output_stream_.flush(); + // All current callers treat this is as a fatal error so this is a RAY_CHECK + // instead of returning a Status. + RAY_CHECK(file_output_stream_.good()) + << absl::StrFormat("Failed to write to temporary file at path %s.", path_); +} diff --git a/src/ray/common/cgroup2/cgroup_test_utils.h b/src/ray/common/cgroup2/cgroup_test_utils.h new file mode 100644 index 000000000000..beaa58c7de91 --- /dev/null +++ b/src/ray/common/cgroup2/cgroup_test_utils.h @@ -0,0 +1,133 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include +#include +#include +#include + +#include "ray/common/status.h" +#include "ray/common/status_or.h" + +class TempCgroupDirectory { + public: + static ray::StatusOr> Create( + const std::string &base_path, mode_t mode = 0777); + + TempCgroupDirectory() = default; + explicit TempCgroupDirectory(std::string &&name, std::string &&path) + : name_(name), path_(path) {} + + TempCgroupDirectory(const TempCgroupDirectory &) = delete; + TempCgroupDirectory(TempCgroupDirectory &&) = delete; + TempCgroupDirectory &operator=(const TempCgroupDirectory &) = delete; + TempCgroupDirectory &operator=(TempCgroupDirectory &&) = delete; + + const std::string &GetPath() const { return path_; } + const std::string &GetName() const { return name_; } + + ~TempCgroupDirectory() noexcept(false); + + private: + std::string name_; + std::string path_; +}; + +class TempDirectory { + public: + static ray::StatusOr> Create(); + explicit TempDirectory(std::string &&path) : path_(path) {} + + TempDirectory(const TempDirectory &) = delete; + TempDirectory(TempDirectory &&) = delete; + TempDirectory &operator=(const TempDirectory &) = delete; + TempDirectory &operator=(TempDirectory &&) = delete; + + const std::string &GetPath() const { return path_; } + + ~TempDirectory(); + + private: + const std::string path_; +}; + +class TempFile { + public: + explicit TempFile(std::string path); + TempFile(); + + TempFile(TempFile &other) = delete; + TempFile(TempFile &&other) = delete; + TempFile operator=(TempFile &other) = delete; + TempFile 
&operator=(TempFile &&other) = delete; + + ~TempFile(); + void AppendLine(const std::string &line); + + const std::string &GetPath() const { return path_; } + + private: + std::string path_ = "/tmp/XXXXXX"; + std::ofstream file_output_stream_; + int fd_; +}; + +/** + Starts a process in the given cgroup. Assumes the cgroup already exists and + that the caller has read-write the lowest-common ancestor of the cgroup + the current process is running in and the target cgroup. + + The spawned process will wait forever for the parent to unblock it and then + reap it. + + @param target_cgroup_path target cgroup to create a process in. + @return Status::OK with a pair of the processfd and pid if successful + @return Status::InvalidArgument if target cgroup does exist or current process + has insufficient permissions. + @return Status::Invalid if process cannot be forked/cloned or processfd cannot + be obtained. +*/ +ray::StatusOr> StartChildProcessInCgroup( + const std::string &target_cgroup_path); + +/** + Kills the specified process and polls its processfd to reap it with a timeout. + + @param pid + @param process_fd can be used as a fd and as a pid. It can be created using + clone or pidfd_open or clone. + @param timeout_ms + + @return Status::OK if successfully terminated the process and reaped it. + @return Status::InvalidArgument if could not send SIGKILL to the process or poll its fd. + @return Status::Invalid if could not reap the process within the timeout. +*/ +ray::Status TerminateChildProcessAndWaitForTimeout(pid_t pid, int fd, int timeout_ms); + +// Convenience methods so you can print the TempCgroupDirectory's path directly +// instead of calling temp_cgroup_dir.GetPath() everytime. 
+std::ostream &operator<<(std::ostream &os, const TempCgroupDirectory &temp_cgroup_dir) { + return os << temp_cgroup_dir.GetPath(); +} + +std::ostream &operator<<(std::ostream &os, + const std::unique_ptr &ptr) { + if (ptr == nullptr) { + return os << ""; + } + return os << *ptr; +} diff --git a/src/ray/common/cgroup2/fake_cgroup_driver.h b/src/ray/common/cgroup2/fake_cgroup_driver.h new file mode 100644 index 000000000000..e49e63429670 --- /dev/null +++ b/src/ray/common/cgroup2/fake_cgroup_driver.h @@ -0,0 +1,227 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include +#include +#include +#include +#include + +#include "ray/common/cgroup2/cgroup_driver_interface.h" +#include "ray/common/cgroup2/cgroup_manager.h" +#include "ray/common/status.h" + +namespace ray { + +struct FakeCgroup { + std::string path_; + std::vector processes_; + std::unordered_map constraints_; + std::unordered_set available_controllers_; + std::unordered_set enabled_controllers_; + bool operator==(const FakeCgroup &other) const { + return path_ == other.path_ && processes_ == other.processes_ && + constraints_ == other.constraints_ && + available_controllers_ == other.available_controllers_ && + enabled_controllers_ == other.enabled_controllers_; + } +}; + +struct FakeConstraint { + std::string cgroup_; + std::string name_; +}; + +struct FakeController { + std::string cgroup_; + std::string name_; +}; + +struct FakeMoveProcesses { + std::string from_; + std::string to_; +}; + +// Intended to be used only in unit tests. This class is not thread-safe. 
+class FakeCgroupDriver : public CgroupDriverInterface { + public: + static std::unique_ptr Create( + std::shared_ptr> cgroups = nullptr, + std::shared_ptr>> deleted_cgroups = nullptr, + std::shared_ptr>> constraints_disabled = + nullptr, + std::shared_ptr>> controllers_disabled = + nullptr, + std::shared_ptr>> processes_moved = + nullptr) { + if (!cgroups) { + cgroups = std::make_shared>(); + } + if (!deleted_cgroups) { + deleted_cgroups = std::make_shared>>(); + } + if (!constraints_disabled) { + constraints_disabled = + std::make_shared>>(); + } + if (!controllers_disabled) { + controllers_disabled = + std::make_shared>>(); + } + if (!processes_moved) { + processes_moved = + std::make_shared>>(); + } + return std::unique_ptr(new FakeCgroupDriver(cgroups, + deleted_cgroups, + constraints_disabled, + controllers_disabled, + processes_moved)); + } + + FakeCgroupDriver( + std::shared_ptr> cgroups, + std::shared_ptr>> deleted_cgroups, + std::shared_ptr>> constraints_disabled, + std::shared_ptr>> controllers_disabled, + std::shared_ptr>> processes_moved) + : cgroups_(cgroups), + deleted_cgroups_(deleted_cgroups), + constraints_disabled_(constraints_disabled), + controllers_disabled_(controllers_disabled), + processes_moved_(processes_moved) {} + + std::shared_ptr> cgroups_; + + // Cgroup cleanup order can be recorded by setting cleanup_mode_ to true. + bool cleanup_mode_ = false; + // cleanup_counter_ is incremented with each cleanup operation to capture + // the order of operations. 
+ int cleanup_counter_ = 0; + std::shared_ptr>> deleted_cgroups_; + std::shared_ptr>> constraints_disabled_; + std::shared_ptr>> controllers_disabled_; + std::shared_ptr>> processes_moved_; + + Status check_cgroup_enabled_s_ = Status::OK(); + Status check_cgroup_s_ = Status::OK(); + Status create_cgroup_s_ = Status::OK(); + Status delete_cgroup_s_ = Status::OK(); + Status move_all_processes_s_ = Status::OK(); + Status enable_controller_s_ = Status::OK(); + Status disable_controller_s_ = Status::OK(); + Status add_constraint_s_ = Status::OK(); + Status available_controllers_s_ = Status::OK(); + Status enabled_controllers_s_ = Status::OK(); + + // These have no side-effects. + Status CheckCgroupv2Enabled() override { return check_cgroup_enabled_s_; } + Status CheckCgroup(const std::string &cgroup) override { return check_cgroup_s_; } + + // These have side-effects made visible through the cgroups_ map. + // All of them can be short-circuited by setting the corresponding + // status to not ok. 
+ Status CreateCgroup(const std::string &cgroup) override { + if (!create_cgroup_s_.ok()) { + return create_cgroup_s_; + } + cgroups_->emplace(cgroup, FakeCgroup{cgroup}); + return create_cgroup_s_; + } + + Status DeleteCgroup(const std::string &cgroup) override { + if (!delete_cgroup_s_.ok()) { + return delete_cgroup_s_; + } + cgroups_->erase(cgroup); + if (cleanup_mode_) { + deleted_cgroups_->emplace_back(std::make_pair(++cleanup_counter_, cgroup)); + } + return delete_cgroup_s_; + } + + Status MoveAllProcesses(const std::string &from, const std::string &to) override { + if (!move_all_processes_s_.ok()) { + return move_all_processes_s_; + } + FakeCgroup &from_cgroup = (*cgroups_)[from]; + FakeCgroup &to_cgroup = (*cgroups_)[to]; + while (!from_cgroup.processes_.empty()) { + to_cgroup.processes_.emplace_back(from_cgroup.processes_.back()); + from_cgroup.processes_.pop_back(); + } + if (cleanup_mode_) { + processes_moved_->emplace_back( + std::make_pair(++cleanup_counter_, FakeMoveProcesses{from, to})); + } + return move_all_processes_s_; + } + + Status EnableController(const std::string &cgroup, + const std::string &controller) override { + if (!enable_controller_s_.ok()) { + return enable_controller_s_; + } + (*cgroups_)[cgroup].enabled_controllers_.emplace(controller); + return enable_controller_s_; + } + + Status DisableController(const std::string &cgroup, + const std::string &controller) override { + if (!disable_controller_s_.ok()) { + return disable_controller_s_; + } + if (cleanup_mode_) { + controllers_disabled_->emplace_back( + std::make_pair(++cleanup_counter_, FakeController{cgroup, controller})); + } + (*cgroups_)[cgroup].enabled_controllers_.erase(controller); + return disable_controller_s_; + } + + Status AddConstraint(const std::string &cgroup, + const std::string &controller, + const std::string &constraint, + const std::string &value) override { + if (!add_constraint_s_.ok()) { + return add_constraint_s_; + } + 
(*cgroups_)[cgroup].constraints_.emplace(constraint, value); + if (cleanup_mode_) { + constraints_disabled_->emplace_back( + std::make_pair(++cleanup_counter_, FakeConstraint{cgroup, constraint})); + } + return add_constraint_s_; + } + + StatusOr> GetAvailableControllers( + const std::string &cgroup) override { + if (!available_controllers_s_.ok()) { + return available_controllers_s_; + } + return (*cgroups_)[cgroup].available_controllers_; + } + + StatusOr> GetEnabledControllers( + const std::string &cgroup) override { + if (!enabled_controllers_s_.ok()) { + return enabled_controllers_s_; + } + return (*cgroups_)[cgroup].enabled_controllers_; + } +}; + +} // namespace ray diff --git a/src/ray/common/cgroup2/integration_tests/BUILD.bazel b/src/ray/common/cgroup2/integration_tests/BUILD.bazel new file mode 100644 index 000000000000..fda28fe9a638 --- /dev/null +++ b/src/ray/common/cgroup2/integration_tests/BUILD.bazel @@ -0,0 +1,24 @@ +load("//bazel:ray.bzl", "ray_cc_test") + +# This test is run through sysfs_cgroup_driver_integration_test_entrypoint.sh +# See sysfs_cgroup_driver_integration_test_entrypoint.sh for instructions +# for how to run locally. 
+ray_cc_test( + name = "sysfs_cgroup_driver_integration_test", + srcs = ["sysfs_cgroup_driver_integration_test.cc"], + tags = [ + "cgroup", + "team:core", + ], + target_compatible_with = [ + "@platforms//os:linux", + ], + deps = [ + "//src/ray/common:status", + "//src/ray/common:status_or", + "//src/ray/common/cgroup2:cgroup_test_utils", + "//src/ray/common/cgroup2:sysfs_cgroup_driver", + "@com_google_absl//absl/strings:str_format", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/src/ray/common/cgroup2/integration_tests/sysfs_cgroup_driver_integration_test.cc b/src/ray/common/cgroup2/integration_tests/sysfs_cgroup_driver_integration_test.cc new file mode 100644 index 000000000000..3be47faaf5bd --- /dev/null +++ b/src/ray/common/cgroup2/integration_tests/sysfs_cgroup_driver_integration_test.cc @@ -0,0 +1,625 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "ray/common/cgroup2/cgroup_test_utils.h" +#include "ray/common/cgroup2/sysfs_cgroup_driver.h" +#include "ray/common/status.h" + +constexpr const char *ENV_VAR_TEST_CGROUP_PATH = "CGROUP_PATH"; + +namespace ray { + +class SysFsCgroupDriverIntegrationTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + const char *cgroup_env = std::getenv(ENV_VAR_TEST_CGROUP_PATH); + if (!cgroup_env || std::string(cgroup_env).empty()) { + throw std::runtime_error("Environment variable CGROUP_PATH not set or empty"); + } + test_cgroup_path_ = cgroup_env; + } + + static const std::string &GetTestCgroupPath() { return test_cgroup_path_; } + + inline static std::string test_cgroup_path_; +}; + +TEST_F(SysFsCgroupDriverIntegrationTest, + SysFsCgroupDriverIntegrationTestFailsIfNoCgroupTestPathSpecified) { + ASSERT_FALSE(test_cgroup_path_.empty()) + << "These integration tests cannot be run without the " + "environment variable CGROUP_TEST_PATH"; +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + CheckCgroupFailsIfCgroupv2PathButNoReadPermissions) { + auto cgroup_dir_or_status = TempCgroupDirectory::Create(test_cgroup_path_, 0000); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.CheckCgroup(cgroup_dir->GetPath()); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + CheckCgroupFailsIfCgroupv2PathButNoWritePermissions) { + auto cgroup_dir_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.CheckCgroup(cgroup_dir->GetPath()); + EXPECT_TRUE(s.IsPermissionDenied()) << 
s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + CheckCgroupFailsIfCgroupv2PathButNoExecPermissions) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(test_cgroup_path_, S_IRUSR | S_IWUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.CheckCgroup(cgroup_dir->GetPath()); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + CheckCgroupSucceedsIfCgroupv2PathAndReadWriteExecPermissions) { + auto cgroup_dir_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.CheckCgroup(cgroup_dir->GetPath()); + EXPECT_TRUE(s.ok()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, CreateCgroupFailsIfAlreadyExists) { + auto cgroup_dir_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.CreateCgroup(cgroup_dir->GetPath()); + ASSERT_TRUE(s.IsAlreadyExists()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, CreateCgroupFailsIfAncestorCgroupDoesNotExist) { + auto cgroup_dir_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + std::string non_existent_path = cgroup_dir->GetPath() + + std::filesystem::path::preferred_separator + "no" + + std::filesystem::path::preferred_separator + "bueno"; + Status s = driver.CreateCgroup(non_existent_path); + EXPECT_TRUE(s.IsNotFound()) 
<< s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, CreateCgroupFailsIfOnlyReadPermissions) { + auto cgroup_dir_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + std::string child_cgroup_path = + cgroup_dir->GetPath() + std::filesystem::path::preferred_separator + "child"; + Status s = driver.CreateCgroup(child_cgroup_path); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, CreateCgroupFailsIfOnlyReadWritePermissions) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(test_cgroup_path_, S_IRUSR | S_IWUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + std::string child_cgroup_path = + cgroup_dir->GetPath() + std::filesystem::path::preferred_separator + "child"; + Status s = driver.CreateCgroup(child_cgroup_path); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + CreateCgroupSucceedsIfParentExistsAndReadWriteExecPermissions) { + auto cgroup_dir_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + std::string child_cgroup_path = + cgroup_dir->GetPath() + std::filesystem::path::preferred_separator + "child"; + Status s = driver.CreateCgroup(child_cgroup_path); + EXPECT_TRUE(s.ok()) << s.ToString(); + Status check_status = driver.CheckCgroup(child_cgroup_path); + EXPECT_TRUE(check_status.ok()) << check_status.ToString(); + ASSERT_EQ(rmdir(child_cgroup_path.c_str()), 0) + << "Failed to cleanup test cgroup at path " << child_cgroup_path << ".\n" + << "Error: " 
<< strerror(errno); +} + +// Tests for DeleteCgroup +TEST_F(SysFsCgroupDriverIntegrationTest, DeleteCgroupFailsIfDoesNotExist) { + auto cgroup_dir_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup = std::move(cgroup_dir_or_status.value()); + std::string cgroup_to_delete = + cgroup->GetPath() + std::filesystem::path::preferred_separator + "cool_group"; + SysFsCgroupDriver driver; + Status s = driver.DeleteCgroup(cgroup_to_delete); + ASSERT_TRUE(s.IsNotFound()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, DeleteCgroupFailsIfAncestorCgroupDoesNotExist) { + auto cgroup_dir_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + std::string non_existent_path = cgroup_dir->GetPath() + + std::filesystem::path::preferred_separator + "no" + + std::filesystem::path::preferred_separator + "bueno"; + Status s = driver.DeleteCgroup(non_existent_path); + EXPECT_TRUE(s.IsNotFound()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, DeleteCgroupFailsIfOnlyReadPermissions) { + auto cgroup_dir_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + std::string child_cgroup_path = + cgroup_dir->GetPath() + std::filesystem::path::preferred_separator + "child"; + Status s = driver.DeleteCgroup(child_cgroup_path); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, DeleteCgroupFailsIfOnlyReadWritePermissions) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(test_cgroup_path_, S_IRUSR | S_IWUSR); + 
ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + std::string child_cgroup_path = + cgroup_dir->GetPath() + std::filesystem::path::preferred_separator + "child"; + Status s = driver.DeleteCgroup(child_cgroup_path); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, DeleteCgroupFailsIfCgroupHasChildren) { + auto parent_cgroup_dir_or_status = + TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(parent_cgroup_dir_or_status.ok()) << parent_cgroup_dir_or_status.ToString(); + std::unique_ptr parent_cgroup = + std::move(parent_cgroup_dir_or_status.value()); + auto child_cgroup_dir_or_status = + TempCgroupDirectory::Create(parent_cgroup->GetPath(), S_IRWXU); + ASSERT_TRUE(child_cgroup_dir_or_status.ok()) << child_cgroup_dir_or_status.ToString(); + SysFsCgroupDriver driver; + Status s = driver.DeleteCgroup(parent_cgroup->GetPath()); + EXPECT_TRUE(s.IsInvalidArgument()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, DeleteCgroupFailsIfCgroupHasProcesses) { + auto cgroup_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); + auto cgroup = std::move(cgroup_or_status.value()); + StatusOr> child_process = + StartChildProcessInCgroup(cgroup->GetPath()); + ASSERT_TRUE(child_process.ok()) << child_process.ToString(); + auto [child_pid, child_pidfd] = *child_process; + SysFsCgroupDriver driver; + // Delete fails while process is alive. + Status failed_s = driver.DeleteCgroup(cgroup->GetPath()); + EXPECT_TRUE(failed_s.IsInvalidArgument()) << failed_s.ToString(); + Status terminate_child = + TerminateChildProcessAndWaitForTimeout(child_pid, child_pidfd, 5000); + ASSERT_TRUE(terminate_child.ok()) << terminate_child.ToString(); + // Delete succeeds after child process terminates. 
+ Status succeeded_s = driver.DeleteCgroup(cgroup->GetPath()); + EXPECT_TRUE(succeeded_s.ok()) << succeeded_s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + DeleteCgroupSucceedsIfLeafCgroupExistsWithNoProcessesAndCorrectPermissions) { + auto cgroup_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); + auto cgroup = std::move(cgroup_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.DeleteCgroup(cgroup->GetPath()); + EXPECT_TRUE(s.ok()) << s.ToString(); +} + +// RemoveController tests + +TEST_F(SysFsCgroupDriverIntegrationTest, + GetAvailableControllersFailsIfCgroupDoesNotExist) { + std::string non_existent_path = test_cgroup_path_ + + std::filesystem::path::preferred_separator + "no" + + std::filesystem::path::preferred_separator + "bueno"; + SysFsCgroupDriver driver; + StatusOr> s = + driver.GetAvailableControllers(non_existent_path); + EXPECT_TRUE(s.IsNotFound()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + GetAvailableControllersFailsIfReadWriteButNotExecutePermissions) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(test_cgroup_path_, S_IRUSR | S_IWUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + std::unique_ptr cgroup_dir = + std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + StatusOr> s = + driver.GetAvailableControllers(cgroup_dir->GetPath()); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + GetAvailableControllersSucceedsWithCPUAndMemoryControllersOnBaseCgroup) { + SysFsCgroupDriver driver; + StatusOr> s = + driver.GetAvailableControllers(test_cgroup_path_); + EXPECT_TRUE(s.ok()) << s.ToString(); + std::unordered_set controllers = std::move(s.value()); + EXPECT_TRUE(controllers.find("cpu") != controllers.end()) + << "Cgroup integration tests expect the base cgroup at " << test_cgroup_path_ + << " 
has the cpu controller available"; +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + GetAvailableControllersSucceedsWithNoAvailableControllers) { + auto parent_cgroup_dir_or_status = + TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(parent_cgroup_dir_or_status.ok()) << parent_cgroup_dir_or_status.ToString(); + std::unique_ptr parent_cgroup = + std::move(parent_cgroup_dir_or_status.value()); + auto child_cgroup_dir_or_status = + TempCgroupDirectory::Create(parent_cgroup->GetPath(), S_IRWXU); + ASSERT_TRUE(child_cgroup_dir_or_status.ok()) << child_cgroup_dir_or_status.ToString(); + std::unique_ptr child_cgroup = + std::move(child_cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + StatusOr> s = + driver.GetAvailableControllers(child_cgroup->GetPath()); + EXPECT_TRUE(s.ok()) << s.ToString(); + std::unordered_set controllers = std::move(s.value()); + EXPECT_EQ(controllers.size(), 0); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, MoveAllProcessesFailsIfSourceDoesntExist) { + auto ancestor_cgroup_or_status = + TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); + auto ancestor_cgroup = std::move(ancestor_cgroup_or_status.value()); + auto dest_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); + ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); + auto dest_cgroup = std::move(dest_cgroup_or_status.value()); + // Do not create the source cgroup + std::string non_existent_path = + ancestor_cgroup->GetPath() + std::filesystem::path::preferred_separator + "nope"; + SysFsCgroupDriver driver; + Status s = driver.MoveAllProcesses(non_existent_path, dest_cgroup->GetPath()); + EXPECT_TRUE(s.IsNotFound()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, MoveAllProcessesFailsIfDestDoesntExist) { + auto ancestor_cgroup_or_status = + TempCgroupDirectory::Create(test_cgroup_path_, 
S_IRWXU); + ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); + auto ancestor_cgroup = std::move(ancestor_cgroup_or_status.value()); + auto source_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); + ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); + auto source_cgroup = std::move(source_cgroup_or_status.value()); + // Do not create the dest cgroup. + std::string non_existent_path = + ancestor_cgroup->GetPath() + std::filesystem::path::preferred_separator + "nope"; + SysFsCgroupDriver driver; + Status s = driver.MoveAllProcesses(source_cgroup->GetPath(), non_existent_path); + EXPECT_TRUE(s.IsNotFound()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + MoveAllProcessesFailsIfNotReadWriteExecPermissionsForSource) { + auto ancestor_cgroup_or_status = + TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); + auto ancestor_cgroup = std::move(ancestor_cgroup_or_status.value()); + auto source_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRUSR | S_IWUSR); + ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); + auto source_cgroup = std::move(source_cgroup_or_status.value()); + auto dest_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); + ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); + auto dest_cgroup = std::move(dest_cgroup_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.MoveAllProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + MoveAllProcessesFailsIfNotReadWriteExecPermissionsForDest) { + auto ancestor_cgroup_or_status = + TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + 
ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); + auto ancestor_cgroup = std::move(ancestor_cgroup_or_status.value()); + auto source_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); + ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); + auto source_cgroup = std::move(source_cgroup_or_status.value()); + auto dest_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRUSR | S_IWUSR); + ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); + auto dest_cgroup = std::move(dest_cgroup_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.MoveAllProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + MoveAllProcessesFailsIfNotReadWriteExecPermissionsForAncestor) { + auto ancestor_cgroup_or_status = + TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); + auto ancestor_cgroup = std::move(ancestor_cgroup_or_status.value()); + auto source_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); + ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); + auto source_cgroup = std::move(source_cgroup_or_status.value()); + auto dest_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); + ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); + auto dest_cgroup = std::move(dest_cgroup_or_status.value()); + ASSERT_EQ(chmod(ancestor_cgroup->GetPath().c_str(), S_IRUSR), 0) + << "Failed to chmod cgroup directory " << ancestor_cgroup->GetPath() + << "\n Error: " << strerror(errno); + SysFsCgroupDriver driver; + Status s = driver.MoveAllProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); + 
EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); + // Change the permissions back read, write, and execute so cgroup can be deleted. + ASSERT_EQ(chmod(ancestor_cgroup->GetPath().c_str(), S_IRWXU), 0) + << "Failed to chmod cgroup directory " << ancestor_cgroup->GetPath() + << "\n Error: " << strerror(errno); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + MoveAllProcessesSucceedsWithCorrectPermissionsAndValidCgroups) { + auto source_cgroup_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); + auto source_cgroup = std::move(source_cgroup_or_status.value()); + auto dest_cgroup_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); + auto dest_cgroup = std::move(dest_cgroup_or_status.value()); + StatusOr> child_process_s = + StartChildProcessInCgroup(source_cgroup->GetPath()); + ASSERT_TRUE(child_process_s.ok()) << child_process_s.ToString(); + auto [child_pid, child_pidfd] = child_process_s.value(); + SysFsCgroupDriver driver; + Status s = driver.MoveAllProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); + ASSERT_TRUE(s.ok()) << s.ToString(); + // Assert that the child's pid is actually in the new file. 
+ std::string dest_cgroup_procs_file_path = dest_cgroup->GetPath() + + std::filesystem::path::preferred_separator + + "cgroup.procs"; + std::ifstream dest_cgroup_procs_file(dest_cgroup_procs_file_path); + ASSERT_TRUE(dest_cgroup_procs_file.is_open()) + << "Could not open file " << dest_cgroup_procs_file_path << "."; + std::unordered_set dest_cgroup_pids; + int pid = -1; + while (dest_cgroup_procs_file >> pid) { + ASSERT_FALSE(dest_cgroup_procs_file.fail()) + << "Unable to read pid from file " << dest_cgroup_procs_file_path; + dest_cgroup_pids.emplace(pid); + } + EXPECT_EQ(dest_cgroup_pids.size(), 1); + EXPECT_TRUE(dest_cgroup_pids.find(child_pid) != dest_cgroup_pids.end()); + Status terminate_s = + TerminateChildProcessAndWaitForTimeout(child_pid, child_pidfd, 5000); + ASSERT_TRUE(terminate_s.ok()) << terminate_s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + EnableControllerFailsIfReadOnlyPermissionsForCgroup) { + auto cgroup_dir_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.EnableController(cgroup_dir->GetPath(), "memory"); + ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + EnableControllerFailsIfReadWriteOnlyPermissionsForCgroup) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(test_cgroup_path_, S_IRUSR | S_IWUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.EnableController(cgroup_dir->GetPath(), "memory"); + ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, EnableControllerFailsIfCgroupDoesNotExist) { + std::string non_existent_path = + test_cgroup_path_ + 
std::filesystem::path::preferred_separator + "nope"; + SysFsCgroupDriver driver; + Status s = driver.EnableController(non_existent_path, "memory"); + ASSERT_TRUE(s.IsNotFound()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + EnableControllerFailsIfControllerNotAvailableForCgroup) { + // This will inherit controllers available because testing_cgroup_ has + // CPU and Memory controllers available. + auto cgroup_dir_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + auto nested_cgroup_dir_or_status = + TempCgroupDirectory::Create(cgroup_dir->GetPath(), S_IRWXU); + ASSERT_TRUE(nested_cgroup_dir_or_status.ok()) << nested_cgroup_dir_or_status.ToString(); + auto nested_cgroup_dir = std::move(nested_cgroup_dir_or_status.value()); + // Make sure that the cgroup has 0 available controllers. + SysFsCgroupDriver driver; + auto available_controllers_s = + driver.GetAvailableControllers(nested_cgroup_dir->GetPath()); + ASSERT_TRUE(available_controllers_s.ok()) << available_controllers_s.ToString(); + auto available_controllers = std::move(available_controllers_s.value()); + ASSERT_EQ(available_controllers.size(), 0); + Status s = driver.EnableController(nested_cgroup_dir->GetPath(), "memory"); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, DisableControllerFailsIfControllerNotEnabled) { + auto cgroup_dir_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + auto enabled_controllers_s = driver.GetEnabledControllers(cgroup_dir->GetPath()); + ASSERT_TRUE(enabled_controllers_s.ok()) << enabled_controllers_s.ToString(); + auto enabled_controllers = 
std::move(enabled_controllers_s.value()); + ASSERT_EQ(enabled_controllers.size(), 0); + Status s = driver.DisableController(cgroup_dir->GetPath(), "memory"); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + DisableControllerFailsIfReadOnlyPermissionsForCgroup) { + auto cgroup_dir_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.DisableController(cgroup_dir->GetPath(), "memory"); + ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + DisableControllerFailsIfReadWriteOnlyPermissionsForCgroup) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(test_cgroup_path_, S_IRUSR | S_IWUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.DisableController(cgroup_dir->GetPath(), "memory"); + ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, DisableControllerFailsIfCgroupDoesNotExist) { + std::string non_existent_path = + test_cgroup_path_ + std::filesystem::path::preferred_separator + "nope"; + SysFsCgroupDriver driver; + Status s = driver.DisableController(non_existent_path, "memory"); + ASSERT_TRUE(s.IsNotFound()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + EnableAndDisableControllerSucceedWithCorrectInputAndPermissions) { + auto parent_cgroup_dir_or_status = + TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(parent_cgroup_dir_or_status.ok()) << parent_cgroup_dir_or_status.ToString(); + auto parent_cgroup_dir = std::move(parent_cgroup_dir_or_status.value()); + auto child_cgroup_dir_or_status = + 
TempCgroupDirectory::Create(parent_cgroup_dir->GetPath(), S_IRWXU); + ASSERT_TRUE(child_cgroup_dir_or_status.ok()) << child_cgroup_dir_or_status.ToString(); + auto child_cgroup_dir = std::move(child_cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + + // The parent cgroup has no enabled controllers yet, so enabling the cpu + // controller on the child cgroup should fail. + Status invalid_argument_s = driver.EnableController(child_cgroup_dir->GetPath(), "cpu"); + ASSERT_TRUE(invalid_argument_s.IsInvalidArgument()) << invalid_argument_s.ToString(); + + // Enable the controller on the parent cgroup to make it available on the child + Status enable_parent_s = driver.EnableController(parent_cgroup_dir->GetPath(), "cpu"); + ASSERT_TRUE(enable_parent_s.ok()) << enable_parent_s.ToString(); + + // Enable the controller on the child cgroup. + Status enable_child_s = driver.EnableController(child_cgroup_dir->GetPath(), "cpu"); + ASSERT_TRUE(enable_child_s.ok()) << enable_child_s.ToString(); + + // Cannot disable the controller on the parent cgroup while the child cgroup + // still has it enabled. + Status disable_parent_failure_s = + driver.DisableController(parent_cgroup_dir->GetPath(), "cpu"); + ASSERT_FALSE(disable_parent_failure_s.ok()) << disable_parent_failure_s.ToString(); + // Disable the controller on the child cgroup. + Status disable_child_s = driver.DisableController(child_cgroup_dir->GetPath(), "cpu"); + ASSERT_TRUE(disable_child_s.ok()) << disable_child_s.ToString(); + // Can now disable the controller on the parent cgroup. 
+ Status disable_parent_success_s = + driver.DisableController(parent_cgroup_dir->GetPath(), "cpu"); + ASSERT_TRUE(disable_parent_success_s.ok()) << disable_parent_success_s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, AddResourceConstraintFailsIfCgroupDoesntExist) { + std::string non_existent_path = + test_cgroup_path_ + std::filesystem::path::preferred_separator + "nope"; + SysFsCgroupDriver driver; + Status s = driver.AddConstraint(non_existent_path, "memory", "memory.min", "1"); + ASSERT_TRUE(s.IsNotFound()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + AddResourceConstraintFailsIfReadOnlyPermissions) { + auto cgroup_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRUSR); + ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); + auto cgroup = std::move(cgroup_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.AddConstraint(cgroup->GetPath(), "memory", "memory.min", "1"); + ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + AddResourceConstraintFailsIfReadWriteOnlyPermissions) { + auto cgroup_or_status = + TempCgroupDirectory::Create(test_cgroup_path_, S_IRUSR | S_IWUSR); + ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); + auto cgroup = std::move(cgroup_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.AddConstraint(cgroup->GetPath(), "memory", "memory.min", "1"); + ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + AddResourceConstraintFailsIfControllerNotEnabled) { + auto cgroup_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); + auto cgroup = std::move(cgroup_or_status.value()); + SysFsCgroupDriver driver; + // Memory controller is not enabled. 
+ Status s = driver.AddConstraint(cgroup->GetPath(), "memory", "memory.min", "1"); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, AddResourceConstraintSucceeds) { + auto cgroup_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU); + ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); + auto cgroup = std::move(cgroup_or_status.value()); + SysFsCgroupDriver driver; + // Enable the cpu controller first. + Status enable_controller_s = driver.EnableController(cgroup->GetPath(), "cpu"); + ASSERT_TRUE(enable_controller_s.ok()) << enable_controller_s.ToString(); + // cpu.weight must be between [1,10000] + Status s = driver.AddConstraint(cgroup->GetPath(), "cpu", "cpu.weight", "500"); + ASSERT_TRUE(s.ok()) << s.ToString(); +} +} // namespace ray diff --git a/src/ray/common/cgroup2/integration_tests/sysfs_cgroup_driver_integration_test_entrypoint.sh b/src/ray/common/cgroup2/integration_tests/sysfs_cgroup_driver_integration_test_entrypoint.sh new file mode 100755 index 000000000000..ee4f8d3fa3de --- /dev/null +++ b/src/ray/common/cgroup2/integration_tests/sysfs_cgroup_driver_integration_test_entrypoint.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +set -euo pipefail + +# To run this test locally, you will need to run it as the root user to be able +# to create cgroups, add users etc. It is recommended to first create a cgroup for testing +# so the tests do not interfere with your root cgroup. +# +# 1) Create a cgroup +# sudo mkdir -p /sys/fs/cgroup/testing +# +# 2) Enable rwx permissions for files in the cgroup +# sudo chmod u+rwx /sys/fs/cgroup/testing +# +# 2) Move the current process into the cgroup +# echo $$ | sudo tee /sys/fs/cgroup/testing/cgroup.procs +# +# 3) Execute the tests with sudo passing your ROOT_CGROUP +# NOTE: the "env PATH=${PATH}" is for the root user to find the bazel executable +# since it may not already be in its path. 
+# sudo env PATH="${PATH}" ./sysfs_cgroup_driver_integration_test_entrypoint.sh /sys/fs/cgroup/testing +# +# If cleanup fails during local testing, you can run to remove all created cgroups. +# sudo find /sys/fs/cgroup/testing -type d -depth 10 -exec rmdir {} + +if [[ "$(uname -s)" != "Linux" ]]; then + echo "ERROR: Cgroup integration tests can only be run on Linux." + echo " The current OS is $(uname)" + exit 0 +fi + +BAZEL=$(which bazel) +# Defaults to /sys/fs/cgroup if not passed in as an argument. +ROOT_CGROUP="${1:-/sys/fs/cgroup}" +CURR_USER=$(whoami) + +echo "Starting Cgroupv2 Integration Tests as user ${CURR_USER}" +echo "ROOT_CGROUP is ${ROOT_CGROUP}." + +if ! grep -qE 'cgroup2\srw' /etc/mtab; then + echo "Failed because cgroupv2 is not mounted on the system in read-write mode." + echo "See the following documentation for how to enable cgroupv2 properly:" + echo "https://kubernetes.io/docs/concepts/architecture/cgroups/#linux-distribution-cgroup-v2-support" + exit 1 +fi +if grep -qE "cgroup\sr" /etc/mtab; then + echo "Failed because cgroupv2 and cgroupv1 is mounted on this system." + echo "See the following documentation for how to enable cgroupv2 in properly in unified mode:" + echo "https://kubernetes.io/docs/concepts/architecture/cgroups/#linux-distribution-cgroup-v2-support" + exit 1 +fi +if [[ ! -w ${ROOT_CGROUP} ]]; then + echo "$(whoami) needs read and write access to ${ROOT_CGROUP} to run integration tests." + echo "Run 'sudo chown -R ${CURR_USER} ${ROOT_CGROUP}' to fix this." + exit 1 +fi +if ! grep -qE '\scpu\s' "${ROOT_CGROUP}"/cgroup.controllers; then + echo "Failed because the cpu controller is not available in the ${ROOT_CGROUP}/cgroup.controllers." + echo "To enable the cpu controller, you need to add it to the parent cgroup of ${ROOT_CGROUP}." + echo "See: https://docs.kernel.org/admin-guide/cgroup-v2.html#enabling-and-disabling." + exit 1 +fi +if ! 
grep -qE '\smemory\s' "${ROOT_CGROUP}"/cgroup.controllers; then + echo "Failed because the memory controller is not available in the ${ROOT_CGROUP}/cgroup.controllers." + echo "To enable the memory controller, you need to add it to the parent cgroup of ${ROOT_CGROUP}." + echo "See: https://docs.kernel.org/admin-guide/cgroup-v2.html#enabling-and-disabling." + exit 1 +fi + + +TEST_FIXTURE_SCRIPT=src/ray/common/cgroup2/integration_tests/sysfs_cgroup_driver_integration_test_fixture.sh +BASE_CGROUP="$(mktemp -d -p "${ROOT_CGROUP}" testing.XXXXX)" +TEST_CGROUP=${BASE_CGROUP}/test +LEAF_CGROUP=${BASE_CGROUP}/leaf +UNPRIV_USER=cgroup-tester + +trap 'echo "ERROR on line ${LINENO}"; cleanup' ERR INT TERM + +cleanup() { + echo "Running teardown because of an error." + "${TEST_FIXTURE_SCRIPT}" teardown "${ROOT_CGROUP}" "${BASE_CGROUP}" "${UNPRIV_USER}" +} + +# The integration tests assume that the ROOT_CGROUP exists and has read and write access. +# +# This test suite will create the following cgroup hierarchy for the tests +# starting with BASE_CGROUP. +# +# ROOT_CGROUP +# | +# BASE_CGROUP +# / \ +# TEST_CGROUP LEAF_CGROUP +# +# NOTE: The test suite does not assume that ROOT_CGROUP is an actual ROOT_CGROUP. Therefore, +# 1. setup will migrate all processes from the ROOT_CGROUP -> LEAF_CGROUP +# 2. teardown will migrate all processes from the LEAF_CGROUP -> ROOT_CGROUP +# +# NOTE: BASE_CGROUP will have a randomly generated name to isolate tests from each other. +# +# The test suite assumes that +# 1. cpu, memory controllers are available on ROOT_CGROUP i.e. in the ROOT_CGROUP/cgroup.controllers file. +# 2. All processes inside the base_cgroup can be migrated into the leaf_cgroup to avoid not violating +# the no internal processes contstraint. +# +# All C++ tests should only have access to the TEST_CGROUP and nothing outside of it. +# The C++ tests will be executed as a non-root user. Setup/teardown will need root permissions. +echo "ROOT_CGROUP is ${ROOT_CGROUP}." 
+echo "BASE_CGROUP for the test suite is ${BASE_CGROUP}." +echo "TEST_CGROUP for the test suite is ${TEST_CGROUP}." +echo "LEAF_CGROUP for the test suite is ${LEAF_CGROUP}." + +"${TEST_FIXTURE_SCRIPT}" setup "${ROOT_CGROUP}" "${BASE_CGROUP}" "${UNPRIV_USER}" + +sudo -u "${UNPRIV_USER}" CGROUP_PATH="${TEST_CGROUP}" \ + "${BAZEL}" run //src/ray/common/cgroup2/integration_tests:sysfs_cgroup_driver_integration_test + +"${TEST_FIXTURE_SCRIPT}" teardown "${ROOT_CGROUP}" "${BASE_CGROUP}" "${UNPRIV_USER}" diff --git a/src/ray/common/cgroup2/integration_tests/sysfs_cgroup_driver_integration_test_fixture.sh b/src/ray/common/cgroup2/integration_tests/sysfs_cgroup_driver_integration_test_fixture.sh new file mode 100755 index 000000000000..b2f65ef36b0a --- /dev/null +++ b/src/ray/common/cgroup2/integration_tests/sysfs_cgroup_driver_integration_test_fixture.sh @@ -0,0 +1,168 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + echo "Usage: $0 " + echo " ACTION - One of {setup, teardown}." + echo " ROOT_CGROUP - The root cgroup path. Assumes the cgroup already exists." + echo " BASE_CGROUP - The base cgroup path. Assumes the cgroup already exists." + echo " UNPRIV_USER - The name of the unprivileged user. Will create if doesn't exist." + exit 1 +} + +ACTION=${1:-} +ROOT_CGROUP=${2:-} +BASE_CGROUP=${3:-} +UNPRIV_USER=${4:-} + +validate_args() { + if [[ -z "$ACTION" || -z "$ROOT_CGROUP" || -z "$BASE_CGROUP" || -z "$UNPRIV_USER" ]]; then + echo "ERROR: Missing arguments." + usage + fi +} + +# Helper function to move all processes from the src cgroup +# into the dest cgroup. +move_all_processes() { + # Errexit is disabled because pids can be transient i.e. + # you can fail to move a pid that existed when you read the file + # but exited by the time you tried to move it. 
+ set +e + local src="$1" dst="$2" + local count=0 + while IFS= read -r pid + do + if echo "${pid}" > "${dst}" 2>/dev/null; then + ((count++)) + fi + done < <(grep -v '^ *#' "${src}") + echo "Moved ${count} procs from ${src} to ${dst}." + set -e +} + +update_controllers() { + local CONTROLLER_FILE=$1 + local UPDATE=$2 + if echo "${UPDATE}" > "${CONTROLLER_FILE}"; then + echo "Updated ${UPDATE} controllers for ${CONTROLLER_FILE}" + else + echo "ERROR: Failed to update controllers ${UPDATE} for ${CONTROLLER_FILE}" >&2 + exit 1 + fi + +} + +# Setup involves the following steps: +# +# 1. Create the LEAF_CGROUP and TEST_CGROUP. +# 2. Move all processes from the ROOT_CGROUP into the LEAF_CGROUP. +# 3. Enable cpu, memory controllers on the ROOT, BASE, and TEST cgroups. +# 4. Create the UNPRIV_USER to run the tests as a non-root user. +# 5. Make UNPRIV_USER owner of the cgroup subtree starting at BASE_CGROUP. +# +# NOTE: The tests need to be run as a separate user because access control +# checks will always pass for the root user so they cannot be tested properly +# without creating an unprivileged user. +setup() { + +mkdir -p "${LEAF_CGROUP}" +mkdir -p "${TEST_CGROUP}" + +echo "Created LEAF_CGROUP at ${LEAF_CGROUP}." +echo "Created TEST_CGROUP at ${TEST_CGROUP}." + +move_all_processes "${ROOT_CGROUP_PROCS}" "${LEAF_CGROUP_PROCS}" + +if [[ -s "${ROOT_CGROUP_PROCS}" ]]; then + echo "ERROR: Failed to move all processes out of ${ROOT_CGROUP_PROCS}." + echo " Expected cgroup.procs to be empty, but it's not:" + cat "${ROOT_CGROUP_PROCS}" + exit 1 +fi + +update_controllers "${ROOT_CGROUP}/cgroup.subtree_control" "+cpu +memory" +update_controllers "${BASE_CGROUP}/cgroup.subtree_control" "+cpu +memory" +update_controllers "${TEST_CGROUP}/cgroup.subtree_control" "+cpu +memory" + +if ! id -u "${UNPRIV_USER}" >/dev/null 2>&1; then + sudo useradd -m -s /usr/sbin/nologin "${UNPRIV_USER}" + echo "Created unprivilged user ${UNPRIV_USER}." 
+fi + +sudo chown -R "${UNPRIV_USER}":"${UNPRIV_USER}" "${BASE_CGROUP}" +sudo chmod -R u+rwx "${BASE_CGROUP}" +echo "${UNPRIV_USER} is the owner the cgroup subtree starting at ${BASE_CGROUP}" + +} + +# Cleanup is the reverse of setup +# 1) Delete the user we created. +# 2) Disable controllers throughout the hierarchy. +# 3) Migrate all processes back into the ROOT_CGROUP. +# 4) Recursively delete all created subcgroups. +# +# This is best effort. There can be leaks. The recommended thing +# to do is to run these tests inside a container. +# Teardown involves the following steps: +# +# 1. Delete the UNPRIV_USER. +# 2. Disable cpu, memory controllers on the ROOT, BASE, and TEST cgroups. +# 3. Move all processes from the LEAF_CGROUP into the ROOT_CGROUP. +# 4. Delete the TEST, LEAF, and BASE cgroups in that order. +# +# NOTE: This assumes that all C++ tests will clean up their own cgroups. +# If they do not, teardown will fail. +teardown() { + +# Delete the user we created +if id -u "${UNPRIV_USER}" >/dev/null 2>&1; then + pkill -KILL -u "${UNPRIV_USER}" 2>/dev/null || true + deluser -f "${UNPRIV_USER}" --remove-home 2>/dev/null || true + echo "Deleted unprivilged user ${UNPRIV_USER}." +fi + +update_controllers "${TEST_CGROUP}/cgroup.subtree_control" "-cpu -memory" +update_controllers "${BASE_CGROUP}/cgroup.subtree_control" "-cpu -memory" +update_controllers "${ROOT_CGROUP}/cgroup.subtree_control" "-cpu -memory" + +move_all_processes "${LEAF_CGROUP_PROCS}" "${ROOT_CGROUP_PROCS}" + +rmdir "${TEST_CGROUP}" +echo "Deleted ${TEST_CGROUP}" +rmdir "${LEAF_CGROUP}" +echo "Deleted ${LEAF_CGROUP}" +rmdir "${BASE_CGROUP}" +echo "Deleted ${BASE_CGROUP}" + +echo "Teardown successful." 
+ +} + +validate_args + +LEAF_CGROUP="${BASE_CGROUP}/leaf" +TEST_CGROUP="${BASE_CGROUP}/test" +ROOT_CGROUP_PROCS="${ROOT_CGROUP}/cgroup.procs" +LEAF_CGROUP_PROCS="${LEAF_CGROUP}/cgroup.procs" + +echo "Starting integration test fixture with:" +echo " ACTION=${ACTION}" +echo " ROOT_CGROUP=${ROOT_CGROUP}" +echo " BASE_CGROUP=${BASE_CGROUP}" +echo " TEST_CGROUP=${TEST_CGROUP}" +echo " UNPRIV_USER=${UNPRIV_USER}" + +SETUP_ACTION=setup +TEARDOWN_ACTION=teardown + +if [[ "${ACTION}" == "${SETUP_ACTION}" ]]; then + echo "Running ACTION: ${SETUP_ACTION}" + setup +elif [[ "${ACTION}" == "${TEARDOWN_ACTION}" ]]; then + echo "Running ACTION: ${TEARDOWN_ACTION}" + teardown +else + echo "[ERROR]: Unknown action ${ACTION}." + usage +fi diff --git a/src/ray/common/cgroup2/noop_cgroup_manager.cc b/src/ray/common/cgroup2/noop_cgroup_manager.cc new file mode 100644 index 000000000000..1accae4827df --- /dev/null +++ b/src/ray/common/cgroup2/noop_cgroup_manager.cc @@ -0,0 +1,39 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include +#include +#include + +#include "ray/common/cgroup2/cgroup_driver_interface.h" +#include "ray/common/cgroup2/cgroup_manager.h" +#include "ray/common/status_or.h" + +namespace ray { + +CgroupManager::CgroupManager(std::string base_cgroup_path, + const std::string &node_id, + std::unique_ptr cgroup_driver) {} + +CgroupManager::~CgroupManager() {} + +StatusOr> CgroupManager::Create( + std::string base_cgroup_path, + const std::string &node_id, + const int64_t system_reserved_cpu_weight, + const int64_t system_reserved_memory_bytes, + std::unique_ptr cgroup_driver) { + return std::unique_ptr( + new CgroupManager(base_cgroup_path, node_id, std::move(cgroup_driver))); +} +} // namespace ray diff --git a/src/ray/common/cgroup2/noop_sysfs_cgroup_driver.cc b/src/ray/common/cgroup2/noop_sysfs_cgroup_driver.cc new file mode 100644 index 000000000000..b448f021a8ad --- /dev/null +++ b/src/ray/common/cgroup2/noop_sysfs_cgroup_driver.cc @@ -0,0 +1,74 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "ray/common/cgroup2/sysfs_cgroup_driver.h" +#include "ray/common/status.h" +#include "ray/common/status_or.h" + +namespace ray { +Status SysFsCgroupDriver::CheckCgroupv2Enabled() { return Status::OK(); } + +Status SysFsCgroupDriver::CheckCgroup(const std::string &cgroup_path) { + return Status::OK(); +} + +Status SysFsCgroupDriver::CreateCgroup(const std::string &cgroup_path) { + return Status::OK(); +} + +Status SysFsCgroupDriver::DeleteCgroup(const std::string &cgroup_path) { + return Status::OK(); +} + +StatusOr> SysFsCgroupDriver::GetAvailableControllers( + const std::string &cgroup_dir) { + return std::unordered_set{}; +} + +StatusOr> SysFsCgroupDriver::GetEnabledControllers( + const std::string &cgroup_dir) { + return std::unordered_set{}; +} + +Status SysFsCgroupDriver::MoveAllProcesses(const std::string &from, + const std::string &to) { + return Status::OK(); +} + +Status SysFsCgroupDriver::EnableController(const std::string &cgroup_path, + const std::string &controller) { + return Status::OK(); +} + +Status SysFsCgroupDriver::DisableController(const std::string &cgroup_path, + const std::string &controller) { + return Status::OK(); +} + +Status SysFsCgroupDriver::AddConstraint(const std::string &cgroup_path, + const std::string &controller, + const std::string &constraint, + const std::string &constraint_value) { + return Status::OK(); +} + +StatusOr> SysFsCgroupDriver::ReadControllerFile( + const std::string &controller_file_path) { + return std::unordered_set{}; +} + +} // namespace ray diff --git a/src/ray/common/cgroup2/scoped_cgroup_operation.h b/src/ray/common/cgroup2/scoped_cgroup_operation.h new file mode 100644 index 000000000000..4f8f26992ab2 --- /dev/null +++ b/src/ray/common/cgroup2/scoped_cgroup_operation.h @@ -0,0 +1,54 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +namespace ray { + +/** + A simple RAII style guard that calls the registered callback on destruction. + ScopedCgroupOperation instances can be moved, but they cannot be copied. + + Usage: + ScopedCgroupOperation say_hello_on_death([]() { + RAY_INFO(INFO) << "Hi, I'm dying!"; + }); +*/ +class ScopedCgroupOperation { + public: + explicit ScopedCgroupOperation(std::function cleanup_fcn) + : cleanup_fcn_(std::move(cleanup_fcn)) {} + + ~ScopedCgroupOperation() { cleanup_fcn_(); } + + ScopedCgroupOperation(const ScopedCgroupOperation &) = delete; + ScopedCgroupOperation &operator=(const ScopedCgroupOperation &other) = delete; + + ScopedCgroupOperation(ScopedCgroupOperation &&other) noexcept + : cleanup_fcn_(std::move(other.cleanup_fcn_)) { + other.cleanup_fcn_ = []() {}; + } + + ScopedCgroupOperation &operator=(ScopedCgroupOperation &&other) noexcept { + cleanup_fcn_ = std::move(other.cleanup_fcn_); + other.cleanup_fcn_ = []() {}; + return *this; + } + + private: + // Defaults to no cleanup. 
+ std::function cleanup_fcn_ = []() {}; +}; +} // namespace ray diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc index fc564dfb7fd9..afa5be6ce544 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc @@ -28,9 +28,7 @@ #include #include #include -#include #include -#include #include #include @@ -39,6 +37,12 @@ #include "ray/common/status.h" #include "ray/common/status_or.h" +// Used to identify if a filesystem is mounted using cgroupv2. +// See: https://docs.kernel.org/admin-guide/cgroup-v2.html#mounting +#ifndef CGROUP2_SUPER_MAGIC +#define CGROUP2_SUPER_MAGIC 0x63677270 +#endif + namespace ray { Status SysFsCgroupDriver::CheckCgroupv2Enabled() { FILE *fp = setmntent(mount_file_path_.c_str(), "r"); @@ -56,8 +60,8 @@ Status SysFsCgroupDriver::CheckCgroupv2Enabled() { struct mntent *mnt; while ((mnt = getmntent(fp)) != nullptr) { - found_cgroupv1 = found_cgroupv1 || strcmp(mnt->mnt_fsname, "cgroup") == 0; - found_cgroupv2 = found_cgroupv2 || strcmp(mnt->mnt_fsname, "cgroup2") == 0; + found_cgroupv1 = found_cgroupv1 || strcmp(mnt->mnt_type, "cgroup") == 0; + found_cgroupv2 = found_cgroupv2 || strcmp(mnt->mnt_type, "cgroup2") == 0; } // After parsing the mount file, the file should be at the EOF position. @@ -136,14 +140,14 @@ Status SysFsCgroupDriver::CreateCgroup(const std::string &cgroup_path) { strerror(errno))); } if (errno == EACCES) { - return Status::PermissionDenied(absl::StrFormat( - "Failed to create cgroup at path %s with permissions %#o. " - "The current user does not have read, write, execute permissions " - "for the parent cgroup.\n" - "Error: %s.", - cgroup_path, - S_IRWXU, - strerror(errno))); + return Status::PermissionDenied( + absl::StrFormat("Failed to create cgroup at path %s with permissions %#o. 
" + "The process does not have read, write, execute permissions " + "for the parent cgroup.\n" + "Error: %s.", + cgroup_path, + S_IRWXU, + strerror(errno))); } if (errno == EEXIST) { return Status::AlreadyExists( @@ -164,6 +168,35 @@ Status SysFsCgroupDriver::CreateCgroup(const std::string &cgroup_path) { return Status::OK(); } +Status SysFsCgroupDriver::DeleteCgroup(const std::string &cgroup_path) { + RAY_RETURN_NOT_OK(CheckCgroup(cgroup_path)); + if (rmdir(cgroup_path.c_str()) == -1) { + if (errno == ENOENT) { + return Status::NotFound(absl::StrFormat( + "Failed to delete cgroup at path %s. The parent cgroup does not exist.\n" + "Error: %s.", + cgroup_path, + strerror(errno))); + } + if (errno == EACCES) { + return Status::PermissionDenied( + absl::StrFormat("Failed to delete cgroup at path %s. " + "The process does not have read, write, execute permissions " + "for the parent cgroup.\n" + "Error: %s.", + cgroup_path, + strerror(errno))); + } + return Status::InvalidArgument( + absl::StrFormat("Failed to delete cgroup at path %s. 
To delete a cgroup, it must " + "have no children and it must not have any processes.\n" + "Error: %s.", + cgroup_path, + strerror(errno))); + } + return Status::OK(); +} + StatusOr> SysFsCgroupDriver::GetAvailableControllers( const std::string &cgroup_dir) { RAY_RETURN_NOT_OK(CheckCgroup(cgroup_dir)); @@ -297,61 +330,28 @@ Status SysFsCgroupDriver::DisableController(const std::string &cgroup_path, return Status::OK(); } -Status SysFsCgroupDriver::AddConstraint(const std::string &cgroup, +Status SysFsCgroupDriver::AddConstraint(const std::string &cgroup_path, + const std::string &controller, const std::string &constraint, const std::string &constraint_value) { - RAY_RETURN_NOT_OK(CheckCgroup(cgroup)); - auto constraint_it = supported_constraints_.find(constraint); - if (constraint_it == supported_constraints_.end()) { - std::string supported_constraint_names("["); - for (auto it = supported_constraints_.begin(); it != supported_constraints_.end(); - ++it) { - supported_constraint_names.append(it->first); - if (std::next(it) != supported_constraints_.end()) { - supported_constraint_names.append(", "); - } - } - supported_constraint_names.append("]"); - return Status::InvalidArgument(absl::StrFormat( - "Failed to apply constraint %s to cgroup %s. Ray only supports %s", - constraint, - cgroup, - supported_constraint_names)); - } - - // Check if the constraint value is out of range and therefore invalid. - auto [low, high] = constraint_it->second.range; - size_t value = static_cast(std::stoi(constraint_value)); - if (value < low || value > high) { - return Status::InvalidArgument(absl::StrFormat( - "Failed to apply constraint %s=%s to cgroup %s. %s can only have values " - "in the range[%i, %i].", - constraint, - constraint_value, - cgroup, - constraint, - low, - high)); - } - + RAY_RETURN_NOT_OK(CheckCgroup(cgroup_path)); // Check if the required controller for the constraint is enabled. 
- const std::string &controller = constraint_it->second.controller; StatusOr> available_controllers_s = - GetEnabledControllers(cgroup); + GetEnabledControllers(cgroup_path); RAY_RETURN_NOT_OK(available_controllers_s.status()); const auto &controllers = available_controllers_s.value(); if (controllers.find(controller) == controllers.end()) { return Status::InvalidArgument(absl::StrFormat( "Failed to apply %s to cgroup %s. To use %s, enable the %s controller.", constraint, - cgroup, + cgroup_path, constraint, controller)); } // Try to apply the constraint and propagate the appropriate failure error. std::string file_path = - cgroup + std::filesystem::path::preferred_separator + constraint; + cgroup_path + std::filesystem::path::preferred_separator + constraint; int fd = open(file_path.c_str(), O_RDWR); @@ -361,7 +361,7 @@ Status SysFsCgroupDriver::AddConstraint(const std::string &cgroup, "Error: %s", constraint, constraint_value, - cgroup, + cgroup_path, strerror(errno))); } @@ -374,7 +374,7 @@ Status SysFsCgroupDriver::AddConstraint(const std::string &cgroup, "Error: %s", constraint, constraint_value, - cgroup, + cgroup_path, strerror(errno))); } close(fd); diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.h b/src/ray/common/cgroup2/sysfs_cgroup_driver.h index fd56d129617b..6b01fbe4886f 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.h +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.h @@ -13,8 +13,10 @@ // limitations under the License. #pragma once -#include -#include +// TODO(#54703): SysFsCgroupDriver should not be a public target. +// It will be hidden behind a CgroupManagerFactory which will create +// an appropriate depending on configuration and platform. +// #include #include #include @@ -24,12 +26,6 @@ #include "ray/common/status.h" #include "ray/common/status_or.h" -// Used to identify if a filesystem is mounted using cgroupv2. 
-// See: https://docs.kernel.org/admin-guide/cgroup-v2.html#mounting -#ifndef CGROUP2_SUPER_MAGIC -#define CGROUP2_SUPER_MAGIC 0x63677270 -#endif - namespace ray { /** @@ -46,12 +42,9 @@ namespace ray { class SysFsCgroupDriver : public CgroupDriverInterface { public: /** - * MOUNTED is defined in mntent.h (and typically refers to /etc/mtab) - * @see https://www.gnu.org/software/libc/manual/2.24/html_node/Mount-Information.html - * * @param mount_file_path only used for testing. */ - explicit SysFsCgroupDriver(std::string mount_file_path = MOUNTED) + explicit SysFsCgroupDriver(std::string mount_file_path = kMountFilePath) : mount_file_path_(std::move(mount_file_path)) {} ~SysFsCgroupDriver() override = default; @@ -121,6 +114,25 @@ class SysFsCgroupDriver : public CgroupDriverInterface { */ Status CreateCgroup(const std::string &cgroup_path) override; + /** + To delete a cgroup using the cgroupv2 vfs, the current user needs to read, write, and + execute permissions for the parent cgroup. This can be achieved through cgroup + delegation. The cgroup must also have no processes or children. + + @see The relevant manpage section on delegation for more details + https://docs.kernel.org/admin-guide/cgroup-v2.html#delegation + + @param cgroup_path the absolute path of the cgroup directory to create. + + @return Status::OK if no errors are encounted. + @return Status::NotFound if an ancestor cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute + permissions. + @return Status::InvalidArgument if the cgroup has children, processes, or for any + other reason. + */ + Status DeleteCgroup(const std::string &cgroup_path) override; + /** Parses the cgroup.controllers file which has a space separated list of all controllers available to the cgroup. 
@@ -188,8 +200,7 @@ class SysFsCgroupDriver : public CgroupDriverInterface { https://docs.kernel.org/admin-guide/cgroup-v2.html#controlling-controllers @param cgroup_path absolute path of the cgroup. - @param controller name of the controller i.e. "cpu" or "memory" from - @ref CgroupDriverInterface::supported_controllers_ "supported controllers". + @param controller name of the controller e.g. "cpu", "memory" etc. @return Status::OK if successful @return Status::NotFound if the cgroup does not exist. @@ -225,19 +236,22 @@ class SysFsCgroupDriver : public CgroupDriverInterface { const std::string &controller) override; /** - Adds a constraint to the respective cgroup file. See - @ref CgroupDriverInterface::supported_constraints_ "supported constraints" and valid - values. + Adds a constraint to the respective cgroup file. + + @param cgroup_path absolute path of the cgroup. + @param controller the name of the controller + @param constraint the name of the cgroup file to add the constraint to e.g. cpu.weight + @param constraint_value @return Status::OK if no errors are encounted. @return Status::NotFound if the cgroup does not exist. @return Status::PermissionDenied if current user doesn't have read, write, and execute permissions. - @return Status::InvalidArgument if the cgroup is not using cgroupv2, the constraint - is not supported in ray, the constraint value is out of range, or if cannot write - to the relevant constraint file. + @return Status::InvalidArgument if the cgroup is not using cgroupv2, controller is not + enabled, or cannot write to the constraint file. 
*/ Status AddConstraint(const std::string &cgroup, + const std::string &controller, const std::string &constraint, const std::string &constraint_value) override; @@ -259,5 +273,6 @@ class SysFsCgroupDriver : public CgroupDriverInterface { static constexpr std::string_view kCgroupSubtreeControlFilename = "cgroup.subtree_control"; static constexpr std::string_view kCgroupControllersFilename = "cgroup.controllers"; + static inline std::string kMountFilePath = "/etc/mtab"; }; } // namespace ray diff --git a/src/ray/common/cgroup2/test/cgroup_test_utils.cc b/src/ray/common/cgroup2/test/cgroup_test_utils.cc deleted file mode 100644 index e61ad82e633c..000000000000 --- a/src/ray/common/cgroup2/test/cgroup_test_utils.cc +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright 2025 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ray/common/cgroup2/test/cgroup_test_utils.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "absl/strings/str_format.h" -#include "ray/common/status.h" -#include "ray/common/status_or.h" -#include "ray/util/logging.h" - -ray::StatusOr> TempDirectory::Create() { - std::string path = "/tmp/XXXXXX"; - char *ret = mkdtemp(path.data()); - if (ret == nullptr) { - return ray::Status::UnknownError( - absl::StrFormat("Failed to create a temp directory. 
" - "Cgroup tests expect tmpfs to be mounted and only run on Linux.\n" - "Error: %s", - strerror(errno))); - } - std::unique_ptr temp_dir = - std::make_unique(std::move(path)); - return ray::StatusOr>(std::move(temp_dir)); -} - -TempDirectory::~TempDirectory() { std::filesystem::remove_all(path_); } - -TempFile::TempFile(std::string path) { - path_ = path; - fd_ = open(path_.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); // NOLINT - if (fd_ == -1) { - throw std::runtime_error( - absl::StrFormat("Failed to create a temp file. Cgroup tests expect " - "tmpfs to be mounted " - "and only run on Linux. Error: %s", - strerror(errno))); - } - file_output_stream_ = std::ofstream(path_, std::ios::trunc); - if (!file_output_stream_.is_open()) { - throw std::runtime_error("Could not open file on tmpfs."); - } -} - -TempFile::TempFile() { - fd_ = mkstemp(path_.data()); // NOLINT - if (fd_ == -1) { - throw std::runtime_error( - "Failed to create a temp file. Cgroup tests expect tmpfs to be " - "mounted " - "and only run on Linux"); - } - if (unlink(path_.c_str()) == -1) { - close(fd_); - throw std::runtime_error("Failed to unlink temporary file."); - } - file_output_stream_ = std::ofstream(path_, std::ios::trunc); - if (!file_output_stream_.is_open()) { - throw std::runtime_error("Could not open mount file on tmpfs."); - } -} - -TempFile::~TempFile() { - close(fd_); - file_output_stream_.close(); -} - -void TempFile::AppendLine(const std::string &line) { - file_output_stream_ << line; - file_output_stream_.flush(); - if (file_output_stream_.fail()) { - throw std::runtime_error("Could not write to mount file on tmpfs"); - } -} diff --git a/src/ray/common/cgroup2/test/cgroup_test_utils.h b/src/ray/common/cgroup2/test/cgroup_test_utils.h deleted file mode 100644 index f1622d413573..000000000000 --- a/src/ray/common/cgroup2/test/cgroup_test_utils.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2025 The Ray Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once - -#include - -#include -#include -#include -#include - -#include "ray/common/status.h" -#include "ray/common/status_or.h" - -/** - RAII style class for creating and destroying temporary directory for testing. - TODO(irabbani): add full documentation once complete. - */ -class TempDirectory { - public: - static ray::StatusOr> Create(); - explicit TempDirectory(std::string &&path) : path_(path) {} - - TempDirectory(const TempDirectory &) = delete; - TempDirectory(TempDirectory &&) = delete; - TempDirectory &operator=(const TempDirectory &) = delete; - TempDirectory &operator=(TempDirectory &&) = delete; - - const std::string &GetPath() const { return path_; } - - ~TempDirectory(); - - private: - const std::string path_; -}; - -/** - RAII wrapper that creates a file that can be written to. - TODO(irabbani): Add full documentation once the API is complete. 
-*/ -class TempFile { - public: - explicit TempFile(std::string path); - TempFile(); - - TempFile(TempFile &other) = delete; - TempFile(TempFile &&other) = delete; - TempFile operator=(TempFile &other) = delete; - TempFile &operator=(TempFile &&other) = delete; - - ~TempFile(); - void AppendLine(const std::string &line); - - const std::string &GetPath() const { return path_; } - - private: - std::string path_ = "/tmp/XXXXXX"; - std::ofstream file_output_stream_; - int fd_; -}; diff --git a/src/ray/common/cgroup2/test/BUILD.bazel b/src/ray/common/cgroup2/tests/BUILD.bazel similarity index 50% rename from src/ray/common/cgroup2/test/BUILD.bazel rename to src/ray/common/cgroup2/tests/BUILD.bazel index e829d9c9e080..06d0ca6d1221 100644 --- a/src/ray/common/cgroup2/test/BUILD.bazel +++ b/src/ray/common/cgroup2/tests/BUILD.bazel @@ -1,32 +1,40 @@ -load("//bazel:ray.bzl", "ray_cc_library", "ray_cc_test") +load("//bazel:ray.bzl", "ray_cc_test") -ray_cc_library( - name = "cgroup_test_utils", - srcs = ["cgroup_test_utils.cc"], - hdrs = ["cgroup_test_utils.h"], +ray_cc_test( + name = "sysfs_cgroup_driver_test", + srcs = ["sysfs_cgroup_driver_test.cc"], tags = [ - "no_windows", + "cgroup", + "team:core", + ], + target_compatible_with = [ + "@platforms//os:linux", ], deps = [ "//src/ray/common:status", "//src/ray/common:status_or", + "//src/ray/common/cgroup2:cgroup_test_utils", + "//src/ray/common/cgroup2:sysfs_cgroup_driver", + "//src/ray/common/tests:testing", "@com_google_absl//absl/strings:str_format", + "@com_google_googletest//:gtest_main", ], ) ray_cc_test( - name = "sysfs_cgroup_driver_test", - srcs = ["sysfs_cgroup_driver_test.cc"], + name = "cgroup_manager_test", + srcs = ["cgroup_manager_test.cc"], tags = [ + "cgroup", "no_windows", "team:core", ], deps = [ - ":cgroup_test_utils", "//src/ray/common:status", "//src/ray/common:status_or", - "//src/ray/common/cgroup2:sysfs_cgroup_driver", - "//src/ray/common/test:testing", + 
"//src/ray/common/cgroup2:cgroup_driver_interface", + "//src/ray/common/cgroup2:cgroup_manager", + "//src/ray/common/cgroup2:fake_cgroup_driver", "@com_google_absl//absl/strings:str_format", "@com_google_googletest//:gtest_main", ], diff --git a/src/ray/common/cgroup2/tests/cgroup_manager_test.cc b/src/ray/common/cgroup2/tests/cgroup_manager_test.cc new file mode 100644 index 000000000000..aa83dfa64828 --- /dev/null +++ b/src/ray/common/cgroup2/tests/cgroup_manager_test.cc @@ -0,0 +1,276 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/common/cgroup2/cgroup_manager.h" + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "ray/common/cgroup2/fake_cgroup_driver.h" +#include "ray/common/status.h" +namespace ray { + +TEST(CgroupManagerTest, CreateReturnsInvalidIfCgroupv2NotAvailable) { + std::shared_ptr> cgroups = + std::make_shared>(); + cgroups->emplace("/sys/fs/cgroup", FakeCgroup{"/sys/fs/cgroup"}); + FakeCgroup base_cgroup{"/sys/fs/cgroup"}; + + std::unique_ptr driver = FakeCgroupDriver::Create(cgroups); + + driver->check_cgroup_enabled_s_ = Status::Invalid(""); + auto cgroup_manager_s = CgroupManager::Create( + "/sys/fs/cgroup/ray", "node_id_123", 100, 1000000, std::move(driver)); + ASSERT_TRUE(cgroup_manager_s.IsInvalid()) << cgroup_manager_s.ToString(); + // No visible side-effects + ASSERT_EQ(cgroups->size(), 1); + ASSERT_EQ(cgroups->begin()->second, base_cgroup); +} + +TEST(CgroupManagerTest, CreateReturnsNotFoundIfBaseCgroupDoesNotExist) { + std::shared_ptr> cgroups = + std::make_shared>(); + std::unique_ptr driver = FakeCgroupDriver::Create(cgroups); + driver->check_cgroup_s_ = Status::NotFound(""); + auto cgroup_manager_s = CgroupManager::Create( + "/sys/fs/cgroup/ray", "node_id_123", 100, 1000000, std::move(driver)); + ASSERT_TRUE(cgroup_manager_s.IsNotFound()) << cgroup_manager_s.ToString(); + // No visible side-effects + ASSERT_EQ(cgroups->size(), 0); +} + +TEST(CgroupManagerTest, + CreateReturnsNotFoundIfProcessDoesNotHavePermissionsForBaseCgroup) { + std::shared_ptr> cgroups = + std::make_shared>(); + cgroups->emplace("/sys/fs/cgroup", FakeCgroup{"/sys/fs/cgroup"}); + FakeCgroup base_cgroup{"/sys/fs/cgroup"}; + std::unique_ptr driver = FakeCgroupDriver::Create(cgroups); + driver->check_cgroup_s_ = Status::PermissionDenied(""); + auto cgroup_manager_s = CgroupManager::Create( + "/sys/fs/cgroup/ray", "node_id_123", 100, 1000000, std::move(driver)); + ASSERT_TRUE(cgroup_manager_s.IsPermissionDenied()) << cgroup_manager_s.ToString(); + // No 
visible side-effects + ASSERT_EQ(cgroups->size(), 1); + ASSERT_EQ(cgroups->begin()->second, base_cgroup); +} + +TEST(CgroupManagerTest, CreateReturnsInvalidIfSupportedControllersAreNotAvailable) { + std::shared_ptr> cgroups = + std::make_shared>(); + cgroups->emplace("/sys/fs/cgroup", FakeCgroup{"/sys/fs/cgroup"}); + FakeCgroup base_cgroup{"/sys/fs/cgroup"}; + std::unique_ptr driver = FakeCgroupDriver::Create(cgroups); + auto cgroup_manager_s = CgroupManager::Create( + "/sys/fs/cgroup", "node_id_123", 100, 1000000, std::move(driver)); + ASSERT_TRUE(cgroup_manager_s.IsInvalid()) << cgroup_manager_s.ToString(); + // No visible side-effects + ASSERT_EQ(cgroups->size(), 1); + ASSERT_EQ(cgroups->begin()->second, base_cgroup); +} + +TEST(CgroupManagerTest, CreateReturnsInvalidArgumentIfConstraintValuesOutOfBounds) { + std::shared_ptr> cgroups = + std::make_shared>(); + cgroups->emplace("/sys/fs/cgroup", FakeCgroup{"/sys/fs/cgroup"}); + FakeCgroup base_cgroup{"/sys/fs/cgroup"}; + std::unique_ptr driver = FakeCgroupDriver::Create(cgroups); + auto cgroup_manager_s = + CgroupManager::Create("/sys/fs/cgroup", "node_id_123", -1, -1, std::move(driver)); + ASSERT_TRUE(cgroup_manager_s.IsInvalidArgument()) << cgroup_manager_s.ToString(); + // No visible side-effects + ASSERT_EQ(cgroups->size(), 1); + ASSERT_EQ(cgroups->begin()->second, base_cgroup); +} + +TEST(CgroupManagerTest, CreateSucceedsWithCleanupInOrder) { + std::shared_ptr> cgroups = + std::make_shared>(); + + cgroups->emplace("/sys/fs/cgroup", + FakeCgroup{"/sys/fs/cgroup", {5}, {}, {"cpu", "memory"}, {}}); + + auto deleted_cgroups = std::make_shared>>(); + auto constraints_disabled = + std::make_shared>>(); + auto controllers_disabled = + std::make_shared>>(); + auto processes_moved = + std::make_shared>>(); + + std::unique_ptr owned_driver = + FakeCgroupDriver::Create(cgroups, + deleted_cgroups, + constraints_disabled, + controllers_disabled, + processes_moved); + + FakeCgroupDriver *driver = owned_driver.get(); + + 
// node, system, and application cgroups were created in the fake + std::string node_id = "id_123"; + std::string base_cgroup_path = "/sys/fs/cgroup"; + std::string node_cgroup_path = "/sys/fs/cgroup/ray_node_id_123"; + std::string system_cgroup_path = "/sys/fs/cgroup/ray_node_id_123/system"; + std::string application_cgroup_path = "/sys/fs/cgroup/ray_node_id_123/application"; + int64_t system_reserved_cpu_weight = 1000; + int64_t system_reserved_memory_bytes = 1024 * 1024 * 1024; + + auto cgroup_manager_s = CgroupManager::Create(base_cgroup_path, + node_id, + system_reserved_cpu_weight, + system_reserved_memory_bytes, + std::move(owned_driver)); + + // The cgroup hierarchy was created correctly. + ASSERT_EQ(cgroups->size(), 4); + ASSERT_NE(cgroups->find(base_cgroup_path), cgroups->end()); + ASSERT_NE(cgroups->find(node_cgroup_path), cgroups->end()); + ASSERT_NE(cgroups->find(system_cgroup_path), cgroups->end()); + ASSERT_NE(cgroups->find(application_cgroup_path), cgroups->end()); + + std::array created_cgroups{&cgroups->at(base_cgroup_path), + &cgroups->at(node_cgroup_path), + &cgroups->at(system_cgroup_path), + &cgroups->at(application_cgroup_path)}; + + // Controllers are enabled on base, node, application, and system cgroups. + for (const FakeCgroup *cg : created_cgroups) { + ASSERT_EQ(cg->enabled_controllers_.size(), 2); + ASSERT_NE(cg->enabled_controllers_.find("cpu"), cg->enabled_controllers_.end()); + ASSERT_NE(cg->enabled_controllers_.find("memory"), cg->enabled_controllers_.end()); + } + + // Processes were moved out of the base cgroup into the system cgroup. + const FakeCgroup &base_cgroup = cgroups->find(base_cgroup_path)->second; + const FakeCgroup &system_cgroup = cgroups->find(system_cgroup_path)->second; + ASSERT_TRUE(base_cgroup.processes_.empty()); + ASSERT_EQ(system_cgroup.processes_.size(), 1); + + // Check to see that the memory and cpu constraints were enabled correctly + // for the system and application cgroups. 
+ ASSERT_EQ(system_cgroup.constraints_.size(), 2); + ASSERT_NE(system_cgroup.constraints_.find("cpu.weight"), + system_cgroup.constraints_.end()); + ASSERT_EQ(system_cgroup.constraints_.at("cpu.weight"), + std::to_string(system_reserved_cpu_weight)); + ASSERT_EQ(system_cgroup.constraints_.at("memory.min"), + std::to_string(system_reserved_memory_bytes)); + + const FakeCgroup &app_cgroup = cgroups->find(application_cgroup_path)->second; + ASSERT_EQ(app_cgroup.constraints_.size(), 1); + ASSERT_NE(app_cgroup.constraints_.find("cpu.weight"), app_cgroup.constraints_.end()); + ASSERT_EQ(app_cgroup.constraints_.at("cpu.weight"), + std::to_string(10000 - system_reserved_cpu_weight)); + + // Switching the mode of the FakeCgroupDriver to cleanup to record cleanup + // operations + driver->cleanup_mode_ = true; + // Destroying the cgroup manager triggers automatic cleanup. + std::unique_ptr cgroup_manager = std::move(cgroup_manager_s.value()); + cgroup_manager.reset(); + + // Only the base cgroup is left after the cgroup_manager is destroyed. + ASSERT_EQ(cgroups->size(), 1); + ASSERT_NE(cgroups->find(base_cgroup_path), cgroups->end()); + + // Since the order of operation matters during cleanup for cgroups, we're going + // to have to check the fake for side-effects extensively: + // + // Constraints have to be disabled before controllers are disabled. + ASSERT_EQ(constraints_disabled->size(), 3); + // Since constraints were only enabled on leaf nodes, the order does not matter. 
+ ASSERT_EQ( + std::count_if(constraints_disabled->begin(), + constraints_disabled->end(), + [&system_cgroup_path](const std::pair &item) { + return item.second.cgroup_ == system_cgroup_path && + item.second.name_ == "cpu.weight"; + }), + 1); + ASSERT_EQ( + std::count_if(constraints_disabled->begin(), + constraints_disabled->end(), + [&system_cgroup_path](const std::pair &item) { + return item.second.cgroup_ == system_cgroup_path && + item.second.name_ == "memory.min"; + }), + 1); + ASSERT_EQ(std::count_if( + constraints_disabled->begin(), + constraints_disabled->end(), + [&application_cgroup_path](const std::pair &item) { + return item.second.cgroup_ == application_cgroup_path && + item.second.name_ == "cpu.weight"; + }), + 1); + + // Controllers were disabled second. + ASSERT_EQ(controllers_disabled->size(), 8); + // Controllers must be disabled after the constraints are removed. + ASSERT_LT(constraints_disabled->back().first, controllers_disabled->front().first); + // Check to see controllers are disabled on all cgroups from the leaves to + // the root. 
+ ASSERT_EQ((*controllers_disabled)[0].second.cgroup_, application_cgroup_path); + ASSERT_EQ((*controllers_disabled)[1].second.cgroup_, system_cgroup_path); + ASSERT_EQ((*controllers_disabled)[2].second.cgroup_, node_cgroup_path); + ASSERT_EQ((*controllers_disabled)[3].second.cgroup_, base_cgroup_path); + ASSERT_EQ((*controllers_disabled)[4].second.cgroup_, application_cgroup_path); + ASSERT_EQ((*controllers_disabled)[5].second.cgroup_, system_cgroup_path); + ASSERT_EQ((*controllers_disabled)[6].second.cgroup_, node_cgroup_path); + ASSERT_EQ((*controllers_disabled)[7].second.cgroup_, base_cgroup_path); + + // The memory and cpu controller are both disabled for each cgroup + std::array cgroup_names{ + base_cgroup_path, + node_cgroup_path, + system_cgroup_path, + application_cgroup_path, + }; + + for (const auto &cgroup_name : cgroup_names) { + ASSERT_EQ(std::count_if(controllers_disabled->begin(), + controllers_disabled->end(), + [&cgroup_name](const std::pair &item) { + return item.second.cgroup_ == cgroup_name && + item.second.name_ == "cpu"; + }), + 1); + ASSERT_EQ(std::count_if(controllers_disabled->begin(), + controllers_disabled->end(), + [&cgroup_name](const std::pair &item) { + return item.second.cgroup_ == cgroup_name && + item.second.name_ == "memory"; + }), + 1); + } + + // Processes were moved third. + ASSERT_EQ(processes_moved->size(), 1); + ASSERT_EQ((*processes_moved)[0].second.from_, system_cgroup_path); + ASSERT_EQ((*processes_moved)[0].second.to_, base_cgroup_path); + ASSERT_LT(constraints_disabled->back().first, processes_moved->front().first); + + // Cgroups were deleted last and in reverse order i.e. application, system, node. 
+ ASSERT_EQ(deleted_cgroups->size(), 3); + ASSERT_LT(processes_moved->back().first, deleted_cgroups->front().first); + ASSERT_EQ((*deleted_cgroups)[0].second, application_cgroup_path); + ASSERT_EQ((*deleted_cgroups)[1].second, system_cgroup_path); + ASSERT_EQ((*deleted_cgroups)[2].second, node_cgroup_path); +} + +} // namespace ray diff --git a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_test.cc b/src/ray/common/cgroup2/tests/sysfs_cgroup_driver_test.cc similarity index 84% rename from src/ray/common/cgroup2/test/sysfs_cgroup_driver_test.cc rename to src/ray/common/cgroup2/tests/sysfs_cgroup_driver_test.cc index 275a122e808f..0d712c1443c3 100644 --- a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_test.cc +++ b/src/ray/common/cgroup2/tests/sysfs_cgroup_driver_test.cc @@ -20,7 +20,7 @@ #include #include "gtest/gtest.h" -#include "ray/common/cgroup2/test/cgroup_test_utils.h" +#include "ray/common/cgroup2/cgroup_test_utils.h" #include "ray/common/status.h" #include "ray/common/status_or.h" @@ -64,7 +64,7 @@ TEST(SysFsCgroupDriverTest, TEST(SysFsCgroupDriverTest, CheckCgroupv2EnabledSucceedsIfOnlyCgroupv2Mounted) { TempFile temp_mount_file; - temp_mount_file.AppendLine("cgroup2 /sys/fs/cgroup rw 0 0\n"); + temp_mount_file.AppendLine("cgroup2 /sys/fs/cgroup cgroup2 rw 0 0\n"); SysFsCgroupDriver driver(temp_mount_file.GetPath()); Status s = driver.CheckCgroupv2Enabled(); EXPECT_TRUE(s.ok()) << s.ToString(); @@ -87,6 +87,23 @@ TEST(SysFsCgroupDriver, CheckCgroupFailsIfCgroupDoesNotExist) { EXPECT_TRUE(s.IsNotFound()) << s.ToString(); } +TEST(SysFsCgroupDriver, DeleteCgroupFailsIfNotCgroup2Path) { + // This is not a directory on the cgroupv2 vfs. 
+ auto temp_dir_or_status = TempDirectory::Create(); + ASSERT_TRUE(temp_dir_or_status.ok()) << temp_dir_or_status.ToString(); + std::unique_ptr temp_dir = std::move(temp_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.DeleteCgroup(temp_dir->GetPath()); + EXPECT_TRUE(s.IsInvalidArgument()) << s.ToString(); +} + +TEST(SysFsCgroupDriver, DeleteCgroupFailsIfCgroupDoesNotExist) { + // This is not a directory on the cgroupv2 vfs. + SysFsCgroupDriver driver; + Status s = driver.DeleteCgroup("/some/path/that/doesnt/exist"); + EXPECT_TRUE(s.IsNotFound()) << s.ToString(); +} + TEST(SysFsCgroupDriver, GetAvailableControllersFailsIfNotCgroup2Path) { auto temp_dir_or_status = TempDirectory::Create(); ASSERT_TRUE(temp_dir_or_status.ok()) << temp_dir_or_status.ToString(); @@ -125,7 +142,7 @@ TEST(SysFsCgroupDriver, AddConstraintFailsIfNotCgroupv2Path) { ASSERT_TRUE(temp_dir_or_status.ok()) << temp_dir_or_status.ToString(); std::unique_ptr temp_dir = std::move(temp_dir_or_status.value()); SysFsCgroupDriver driver; - Status s = driver.AddConstraint(temp_dir->GetPath(), "memory.min", "1"); + Status s = driver.AddConstraint(temp_dir->GetPath(), "memory", "memory.min", "1"); ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); } diff --git a/src/ray/common/common_protocol.cc b/src/ray/common/common_protocol.cc deleted file mode 100644 index 03043efc2dc0..000000000000 --- a/src/ray/common/common_protocol.cc +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ray/common/common_protocol.h" - -#include "ray/util/logging.h" - -std::string string_from_flatbuf(const flatbuffers::String &string) { - return std::string(string.data(), string.size()); -} diff --git a/src/ray/common/common_protocol.h b/src/ray/common/common_protocol.h deleted file mode 100644 index e5c06e6fc401..000000000000 --- a/src/ray/common/common_protocol.h +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include - -#include "ray/common/id.h" -#include "ray/util/logging.h" -#include "src/ray/protobuf/common.pb.h" -#include "src/ray/protobuf/gcs.pb.h" - -/// Convert an unique ID to a flatbuffer string. -/// -/// @param fbb Reference to the flatbuffer builder. -/// @param id The ID to be converted. -/// @return The flatbuffer string containing the ID. -template -flatbuffers::Offset to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, - ID id); - -/// Convert a flatbuffer string to an unique ID. -/// -/// @param string The flatbuffer string. -/// @return The ID. -template -ID from_flatbuf(const flatbuffers::String &string); - -/// Convert a flatbuffer vector of strings to a vector of unique IDs. -/// -/// @param vector The flatbuffer vector. -/// @return The vector of IDs. 
-template -const std::vector from_flatbuf( - const flatbuffers::Vector> &vector); - -/// Convert an array of unique IDs to a flatbuffer vector of strings. -/// -/// @param fbb Reference to the flatbuffer builder. -/// @param ids Array of unique IDs. -/// @param num_ids Number of elements in the array. -/// @return Flatbuffer vector of strings. -template -flatbuffers::Offset>> -to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, ID ids[], int64_t num_ids); - -/// Convert a vector of unique IDs to a flatbuffer vector of strings. -/// -/// @param fbb Reference to the flatbuffer builder. -/// @param ids Vector of IDs. -/// @return Flatbuffer vector of strings. -template -flatbuffers::Offset>> -to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, const std::vector &ids); - -/// Convert an unordered_set of unique IDs to a flatbuffer vector of strings. -/// -/// @param fbb Reference to the flatbuffer builder. -/// @param ids Unordered set of IDs. -/// @return Flatbuffer vector of strings. -template -flatbuffers::Offset>> -to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, const std::unordered_set &ids); - -/// Convert a flatbuffer string to a std::string. -/// -/// @param fbb Reference to the flatbuffer builder. -/// @param string A flatbuffers string. -/// @return The std::string version of the flatbuffer string. 
-std::string string_from_flatbuf(const flatbuffers::String &string); - -template -flatbuffers::Offset to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, - ID id) { - return fbb.CreateString(reinterpret_cast(id.Data()), id.Size()); -} - -template -ID from_flatbuf(const flatbuffers::String &string) { - return ID::FromBinary(string.str()); -} - -template -const std::vector from_flatbuf( - const flatbuffers::Vector> &vector) { - std::vector ids; - for (int64_t i = 0; i < vector.size(); i++) { - ids.push_back(from_flatbuf(*vector.Get(i))); - } - return ids; -} - -template -flatbuffers::Offset>> -to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, ID ids[], int64_t num_ids) { - std::vector> results; - for (int64_t i = 0; i < num_ids; i++) { - results.push_back(to_flatbuf(fbb, ids[i])); - } - return fbb.CreateVector(results); -} - -template -flatbuffers::Offset>> -to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, const std::vector &ids) { - std::vector> results; - for (auto id : ids) { - results.push_back(to_flatbuf(fbb, id)); - } - return fbb.CreateVector(results); -} - -template -flatbuffers::Offset>> -to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, const std::unordered_set &ids) { - std::vector> results; - for (auto id : ids) { - results.push_back(to_flatbuf(fbb, id)); - } - return fbb.CreateVector(results); -} - -static inline ray::rpc::ObjectReference ObjectIdToRef( - const ray::ObjectID &object_id, const ray::rpc::Address owner_address) { - ray::rpc::ObjectReference ref; - ref.set_object_id(object_id.Binary()); - ref.mutable_owner_address()->CopyFrom(owner_address); - return ref; -} - -static inline ray::ObjectID ObjectRefToId(const ray::rpc::ObjectReference &object_ref) { - return ray::ObjectID::FromBinary(object_ref.object_id()); -} - -static inline std::vector ObjectRefsToIds( - const std::vector &object_refs) { - std::vector object_ids; - for (const auto &ref : object_refs) { - object_ids.push_back(ObjectRefToId(ref)); - } - return object_ids; -} - -static inline 
ray::rpc::ActorTableData::ActorState StringToActorState( - const std::string &actor_state_name) { - if (actor_state_name == "DEPENDENCIES_UNREADY") { - return ray::rpc::ActorTableData::DEPENDENCIES_UNREADY; - } else if (actor_state_name == "PENDING_CREATION") { - return ray::rpc::ActorTableData::PENDING_CREATION; - } else if (actor_state_name == "ALIVE") { - return ray::rpc::ActorTableData::ALIVE; - } else if (actor_state_name == "RESTARTING") { - return ray::rpc::ActorTableData::RESTARTING; - } else if (actor_state_name == "DEAD") { - return ray::rpc::ActorTableData::DEAD; - } else { - RAY_CHECK(false) << "Invalid actor state name:" << actor_state_name; - return {}; - } -} diff --git a/src/ray/common/constants.h b/src/ray/common/constants.h index 9ee40fd92673..aa9d858f2811 100644 --- a/src/ray/common/constants.h +++ b/src/ray/common/constants.h @@ -14,8 +14,7 @@ #pragma once -#include -#include +#include /// Default value for enable_task_events within core. constexpr bool kDefaultTaskEventEnabled = true; diff --git a/src/ray/common/event_stats.cc b/src/ray/common/event_stats.cc index 6e4f3a8b1800..b18cfd442374 100644 --- a/src/ray/common/event_stats.cc +++ b/src/ray/common/event_stats.cc @@ -60,26 +60,31 @@ std::string to_human_readable(int64_t duration) { } // namespace std::shared_ptr EventTracker::RecordStart( - std::string name, int64_t expected_queueing_delay_ns) { + std::string name, + bool emit_metrics, + const int64_t expected_queueing_delay_ns, + const std::optional &event_context_name) { auto stats = GetOrCreate(name); - int64_t cum_count = 0; int64_t curr_count = 0; { absl::MutexLock lock(&(stats->mutex)); - cum_count = ++stats->stats.cum_count; + ++stats->stats.cum_count; curr_count = ++stats->stats.curr_count; } - if (RayConfig::instance().event_stats_metrics()) { - ray::stats::STATS_operation_count.Record(cum_count, name); - ray::stats::STATS_operation_active_count.Record(curr_count, name); + if (emit_metrics) { + 
ray::stats::STATS_operation_count.Record(1, event_context_name.value_or(name)); + ray::stats::STATS_operation_active_count.Record(curr_count, + event_context_name.value_or(name)); } return std::make_shared( std::move(name), absl::GetCurrentTimeNanos() + expected_queueing_delay_ns, std::move(stats), - global_stats_); + global_stats_, + emit_metrics, + event_context_name); } void EventTracker::RecordEnd(std::shared_ptr handle) { @@ -89,11 +94,12 @@ void EventTracker::RecordEnd(std::shared_ptr handle) { const auto execution_time_ns = absl::GetCurrentTimeNanos() - handle->start_time; handle->handler_stats->stats.cum_execution_time += execution_time_ns; - if (RayConfig::instance().event_stats_metrics()) { + if (handle->emit_stats) { // Update event-specific stats. - ray::stats::STATS_operation_run_time_ms.Record(execution_time_ns / 1000000, - handle->event_name); - ray::stats::STATS_operation_active_count.Record(curr_count, handle->event_name); + ray::stats::STATS_operation_run_time_ms.Record( + execution_time_ns / 1000000, handle->context_name.value_or(handle->event_name)); + ray::stats::STATS_operation_active_count.Record( + curr_count, handle->context_name.value_or(handle->event_name)); } handle->end_or_execution_recorded = true; @@ -134,14 +140,15 @@ void EventTracker::RecordExecution(const std::function &fn, stats->stats.running_count--; } - if (RayConfig::instance().event_stats_metrics()) { + if (handle->emit_stats) { // Update event-specific stats. - ray::stats::STATS_operation_run_time_ms.Record(execution_time_ns / 1000000, - handle->event_name); - ray::stats::STATS_operation_active_count.Record(curr_count, handle->event_name); + ray::stats::STATS_operation_run_time_ms.Record( + execution_time_ns / 1000000, handle->context_name.value_or(handle->event_name)); + ray::stats::STATS_operation_active_count.Record( + curr_count, handle->context_name.value_or(handle->event_name)); // Update global stats. 
- ray::stats::STATS_operation_queue_time_ms.Record(queue_time_ns / 1000000, - handle->event_name); + ray::stats::STATS_operation_queue_time_ms.Record( + queue_time_ns / 1000000, handle->context_name.value_or(handle->event_name)); } { @@ -186,6 +193,7 @@ GlobalStats EventTracker::get_global_stats() const { return to_global_stats_view(global_stats_); } +// Testing only method std::optional EventTracker::get_event_stats( const std::string &event_name) const { absl::ReaderMutexLock lock(&mutex_); @@ -196,6 +204,7 @@ std::optional EventTracker::get_event_stats( return to_event_stats_view(it->second); } +// Logging only method std::vector> EventTracker::get_event_stats() const { // We lock the stats table while copying the table into a vector. absl::ReaderMutexLock lock(&mutex_); diff --git a/src/ray/common/event_stats.h b/src/ray/common/event_stats.h index 1650733e7770..d687d06de141 100644 --- a/src/ray/common/event_stats.h +++ b/src/ray/common/event_stats.h @@ -73,16 +73,23 @@ struct StatsHandle { const std::shared_ptr global_stats; // Whether RecordEnd or RecordExecution is called. std::atomic end_or_execution_recorded; + // Metric emission specific configurations + const bool emit_stats; + const std::optional context_name; StatsHandle(std::string event_name_, - int64_t start_time_, + const int64_t start_time_, std::shared_ptr handler_stats_, - std::shared_ptr global_stats_) + std::shared_ptr global_stats_, + const bool emit_stats_, + const std::optional &context_name_) : event_name(std::move(event_name_)), start_time(start_time_), handler_stats(std::move(handler_stats_)), global_stats(std::move(global_stats_)), - end_or_execution_recorded(false) {} + end_or_execution_recorded(false), + emit_stats(emit_stats_), + context_name(context_name_) {} ~StatsHandle() { if (!end_or_execution_recorded) { @@ -106,12 +113,19 @@ class EventTracker { /// The returned opaque stats handle MUST be given to a subsequent /// RecordExecution() or RecordEnd() call. 
/// - /// \param name A human-readable name to which collected stats will be associated. - /// \param expected_queueing_delay_ns How much to pad the observed queueing start time, + /// \param name A human-readable name to which collected stats will be associated for + /// logging. \param expected_queueing_delay_ns How much to pad the observed queueing + /// start time, /// in nanoseconds. + /// \param emit_metrics Emit the underlying stat as a service metric + /// \param event_context_name A human-readable name to which collected stats will be + /// associated for metrics. /// \return An opaque stats handle, to be given to RecordExecution() or RecordEnd(). - std::shared_ptr RecordStart(std::string name, - int64_t expected_queueing_delay_ns = 0); + std::shared_ptr RecordStart( + std::string name, + bool emit_metrics = false, + int64_t expected_queueing_delay_ns = 0, + const std::optional &event_context_name = std::nullopt); /// Records stats about the provided function's execution. This is used in conjunction /// with RecordStart() to manually instrument an event loop handler that calls .post(). diff --git a/src/ray/common/file_system_monitor.h b/src/ray/common/file_system_monitor.h index ccba1d5e8696..eae48ae93e3f 100644 --- a/src/ray/common/file_system_monitor.h +++ b/src/ray/common/file_system_monitor.h @@ -67,7 +67,7 @@ class FileSystemMonitor { const std::vector paths_; const double capacity_threshold_; std::atomic over_capacity_; - instrumented_io_context io_context_{/*enable_lag_probe=*/false, + instrumented_io_context io_context_{/*enable_metrics=*/false, /*running_on_single_thread=*/true}; std::thread monitor_thread_; std::shared_ptr runner_; diff --git a/src/ray/common/flatbuf_utils.h b/src/ray/common/flatbuf_utils.h new file mode 100644 index 000000000000..7a1d56854a16 --- /dev/null +++ b/src/ray/common/flatbuf_utils.h @@ -0,0 +1,72 @@ +// Copyright 2025 The Ray Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include + +namespace ray { + +namespace flatbuf { + +using flatbuffers::FlatBufferBuilder; +using flatbuffers::Offset; +using flatbuffers::String; +using flatbuffers::uoffset_t; +using flatbuffers::Vector; + +template +Offset to_flatbuf(FlatBufferBuilder &fbb, const ID &id) { + return fbb.CreateString(reinterpret_cast(id.Data()), id.Size()); +} + +template +Offset>> to_flatbuf(FlatBufferBuilder &fbb, + ID ids[], + int64_t num_ids) { + std::vector> results; + results.reserve(num_ids); + for (int64_t i = 0; i < num_ids; i++) { + results.push_back(to_flatbuf(fbb, ids[i])); + } + return fbb.CreateVector(results); +} + +template +Offset>> to_flatbuf(FlatBufferBuilder &fbb, + const std::vector &ids) { + std::vector> results; + results.reserve(ids.size()); + for (const auto &id : ids) { + results.push_back(to_flatbuf(fbb, id)); + } + return fbb.CreateVector(results); +} + +template +Offset>> to_flatbuf(FlatBufferBuilder &fbb, + const std::unordered_set &ids) { + std::vector> results; + results.reserve(ids.size()); + for (const auto &id : ids) { + results.push_back(to_flatbuf(fbb, id)); + } + return fbb.CreateVector(results); +} + +} // namespace flatbuf + +} // namespace ray diff --git a/src/ray/common/function_descriptor.cc b/src/ray/common/function_descriptor.cc index 22a997932266..8df6c3e1ee1f 100644 --- a/src/ray/common/function_descriptor.cc +++ 
b/src/ray/common/function_descriptor.cc @@ -14,6 +14,8 @@ #include "ray/common/function_descriptor.h" +#include "ray/util/logging.h" + namespace ray { FunctionDescriptor FunctionDescriptorBuilder::Empty() { static ray::FunctionDescriptor empty = diff --git a/src/ray/common/function_descriptor.h b/src/ray/common/function_descriptor.h index b4f7ca3cd92a..452fc446ae6c 100644 --- a/src/ray/common/function_descriptor.h +++ b/src/ray/common/function_descriptor.h @@ -145,7 +145,7 @@ class JavaFunctionDescriptor : public FunctionDescriptorInterface { virtual std::string ClassName() const { return typed_message_->class_name(); } - const std::string &FunctionName() const { return typed_message_->function_name(); } + virtual std::string FunctionName() const { return typed_message_->function_name(); } const std::string &Signature() const { return typed_message_->signature(); } diff --git a/src/ray/gcs/callback.h b/src/ray/common/gcs_callbacks.h similarity index 83% rename from src/ray/gcs/callback.h rename to src/ray/common/gcs_callbacks.h index e4ac07a57407..1d5da52fec9b 100644 --- a/src/ray/gcs/callback.h +++ b/src/ray/common/gcs_callbacks.h @@ -20,12 +20,8 @@ #include "ray/common/status.h" namespace ray { - namespace gcs { -/// This callback is used to notify when a operation completes. -using EmptyCallback = std::function; - /// This callback is used to notify when a write/subscribe to GCS completes. /// \param status Status indicates whether the write/subscribe was successful. using StatusCallback = std::function; @@ -34,7 +30,6 @@ using StatusCallback = std::function; /// \param status Status indicates whether the read was successful. /// \param result The item returned by GCS. If the item to read doesn't exist, /// this optional object is empty. -/// TODO(ryw): make an Either union type to avoid the optional. 
template using OptionalItemCallback = std::function result)>; @@ -56,11 +51,5 @@ using SubscribeCallback = std::function; template using ItemCallback = std::function; -/// This callback is used to receive multiple key-value items from GCS. -/// \param result The key-value items returned by GCS. -template -using MapCallback = std::function &&result)>; - } // namespace gcs - } // namespace ray diff --git a/src/ray/common/grpc_util.h b/src/ray/common/grpc_util.h index 458c5d17d3e8..ae99eaf79081 100644 --- a/src/ray/common/grpc_util.h +++ b/src/ray/common/grpc_util.h @@ -32,6 +32,7 @@ namespace ray { /// Wrap a protobuf message. template +// TODO(#55921): Remove MessageWrapper class and clean up LeaseSpec/TaskSpec classes class MessageWrapper { public: /// Construct an empty message wrapper. This should not be used directly. @@ -243,4 +244,11 @@ inline google::protobuf::Timestamp AbslTimeNanosToProtoTimestamp(int64_t nanos) return timestamp; } +// Convert a protobuf timestamp to an epoch time in nanoseconds +// Ref: https://protobuf.dev/reference/php/api-docs/Google/Protobuf/Timestamp.html +inline int64_t ProtoTimestampToAbslTimeNanos( + const google::protobuf::Timestamp &timestamp) { + return timestamp.seconds() * 1000000000LL + timestamp.nanos(); +} + } // namespace ray diff --git a/src/ray/common/id.cc b/src/ray/common/id.cc index 91041d75d70f..9883ef0c26dc 100644 --- a/src/ray/common/id.cc +++ b/src/ray/common/id.cc @@ -14,18 +14,11 @@ #include "ray/common/id.h" -#include - #include -#include -#include -#include #include "absl/time/clock.h" #include "ray/common/constants.h" -#include "ray/common/status.h" #include "ray/util/macros.h" -#include "ray/util/util.h" extern "C" { #include "ray/thirdparty/sha256.h" @@ -144,7 +137,6 @@ ActorID ActorID::Of(const JobID &job_id, absl::GetCurrentTimeNanos(), ActorID::kUniqueBytesLength); std::copy_n(job_id.Data(), JobID::kLength, std::back_inserter(data)); - RAY_CHECK(data.size() == kLength); return ActorID::FromBinary(data); } @@ 
-152,7 +144,6 @@ ActorID ActorID::NilFromJob(const JobID &job_id) { std::string data(kUniqueBytesLength, 0); FillNil(&data); std::copy_n(job_id.Data(), JobID::kLength, std::back_inserter(data)); - RAY_CHECK(data.size() == kLength); return ActorID::FromBinary(data); } @@ -167,7 +158,6 @@ TaskID TaskID::ForDriverTask(const JobID &job_id) { FillNil(&data); const auto dummy_actor_id = ActorID::NilFromJob(job_id); std::copy_n(dummy_actor_id.Data(), ActorID::kLength, std::back_inserter(data)); - RAY_CHECK(data.size() == TaskID::kLength); return TaskID::FromBinary(data); } @@ -182,7 +172,6 @@ TaskID TaskID::ForActorCreationTask(const ActorID &actor_id) { std::string data(kUniqueBytesLength, 0); FillNil(&data); std::copy_n(actor_id.Data(), ActorID::kLength, std::back_inserter(data)); - RAY_CHECK(data.size() == TaskID::kLength); return TaskID::FromBinary(data); } @@ -193,7 +182,6 @@ TaskID TaskID::ForActorTask(const JobID &job_id, std::string data = GenerateUniqueBytes( job_id, parent_task_id, parent_task_counter, 0, TaskID::kUniqueBytesLength); std::copy_n(actor_id.Data(), ActorID::kLength, std::back_inserter(data)); - RAY_CHECK(data.size() == TaskID::kLength); return TaskID::FromBinary(data); } @@ -204,7 +192,6 @@ TaskID TaskID::ForNormalTask(const JobID &job_id, job_id, parent_task_id, parent_task_counter, 0, TaskID::kUniqueBytesLength); const auto dummy_actor_id = ActorID::NilFromJob(job_id); std::copy_n(dummy_actor_id.Data(), ActorID::kLength, std::back_inserter(data)); - RAY_CHECK(data.size() == TaskID::kLength); return TaskID::FromBinary(data); } @@ -313,7 +300,6 @@ PlacementGroupID PlacementGroupID::Of(const JobID &job_id) { std::string data(PlacementGroupID::kUniqueBytesLength, 0); FillRandom(&data); std::copy_n(job_id.Data(), JobID::kLength, std::back_inserter(data)); - RAY_CHECK(data.size() == kLength); return PlacementGroupID::FromBinary(data); } @@ -323,6 +309,24 @@ JobID PlacementGroupID::JobId() const { reinterpret_cast(this->Data() + kUniqueBytesLength), 
JobID::kLength)); } +LeaseID LeaseID::FromRandom() { + std::string data(kLength, 0); + FillRandom(&data); + return LeaseID::FromBinary(data); +} + +LeaseID LeaseID::FromWorker(const WorkerID &worker_id, uint32_t counter) { + std::string data(kUniqueBytesLength, 0); + std::memcpy(data.data(), &counter, sizeof(counter)); + std::copy_n(worker_id.Data(), kUniqueIDSize, std::back_inserter(data)); + return LeaseID::FromBinary(data); +} + +WorkerID LeaseID::WorkerId() const { + return WorkerID::FromBinary(std::string( + reinterpret_cast(id_ + kUniqueBytesLength), kUniqueIDSize)); +} + #define ID_OSTREAM_OPERATOR(id_type) \ std::ostream &operator<<(std::ostream &os, const id_type &id) { \ if (id.IsNil()) { \ @@ -339,6 +343,7 @@ ID_OSTREAM_OPERATOR(ActorID); ID_OSTREAM_OPERATOR(TaskID); ID_OSTREAM_OPERATOR(ObjectID); ID_OSTREAM_OPERATOR(PlacementGroupID); +ID_OSTREAM_OPERATOR(LeaseID); const NodeID kGCSNodeID = NodeID::FromBinary(std::string(kUniqueIDSize, 0)); diff --git a/src/ray/common/id.h b/src/ray/common/id.h index 6296c717253c..8e89d7e55cca 100644 --- a/src/ray/common/id.h +++ b/src/ray/common/id.h @@ -14,21 +14,15 @@ #pragma once -#include -#include - -#include #include #include -#include -#include #include #include "ray/common/constants.h" #include "ray/util/logging.h" #include "ray/util/random.h" -#include "ray/util/util.h" #include "ray/util/visibility.h" +#include "src/ray/protobuf/common.pb.h" namespace ray { @@ -132,12 +126,8 @@ class ActorID : public BaseID { static constexpr size_t kUniqueBytesLength = 12; public: - /// Length of `ActorID` in bytes. static constexpr size_t kLength = kUniqueBytesLength + JobID::kLength; - /// Size of `ActorID` in bytes. - /// - /// \return Size of `ActorID` in bytes. static constexpr size_t Size() { return kLength; } /// Creates an `ActorID` by hashing the given information. 
@@ -151,22 +141,13 @@ class ActorID : public BaseID { const TaskID &parent_task_id, const size_t parent_task_counter); - /// Creates a nil ActorID with the given job. - /// - /// \param job_id The job id to which this actor belongs. - /// - /// \return The `ActorID` with unique bytes being nil. static ActorID NilFromJob(const JobID &job_id); // Warning: this can duplicate IDs after a fork() call. We assume this never happens. static ActorID FromRandom() = delete; - /// Constructor of `ActorID`. ActorID() : BaseID() {} - /// Get the job id to which this actor belongs. - /// - /// \return The job id to which this actor belongs. JobID JobId() const; MSGPACK_DEFINE(id_); @@ -191,18 +172,11 @@ class TaskID : public BaseID { // Warning: this can duplicate IDs after a fork() call. We assume this never happens. static TaskID FromRandom() = delete; - /// The ID generated for driver task. static TaskID ForDriverTask(const JobID &job_id); /// Generate driver task id for the given job. static TaskID FromRandom(const JobID &job_id); - /// Creates a TaskID for an actor creation task. - /// - /// \param actor_id The ID of the actor that will be created - /// by this actor creation task. - /// - /// \return The ID of the actor creation task. static TaskID ForActorCreationTask(const ActorID &actor_id); /// Creates a TaskID for actor task. @@ -242,17 +216,10 @@ class TaskID : public BaseID { /// \return The ID of the n-th execution of the task. static TaskID ForExecutionAttempt(const TaskID &task_id, uint64_t attempt_number); - /// Get the id of the actor to which this task belongs. - /// - /// \return The `ActorID` of the actor which creates this task. ActorID ActorId() const; - /// Returns whether this is the ID of an actor creation task. bool IsForActorCreationTask() const; - /// Get the id of the job to which this task belongs. - /// - /// \return The `JobID` of the job which creates this task. 
JobID JobId() const; MSGPACK_DEFINE(id_); @@ -269,7 +236,6 @@ class ObjectID : public BaseID { /// The maximum number of objects that can be returned or put by a task. static constexpr int64_t kMaxObjectIndex = ((int64_t)1 << kObjectIdIndexSize) - 1; - /// The length of ObjectID in bytes. static constexpr size_t kLength = kIndexBytesLength + TaskID::kLength; ObjectID() : BaseID() {} @@ -289,9 +255,6 @@ class ObjectID : public BaseID { /// this object. ObjectIDIndexType ObjectIndex() const; - /// Compute the task ID of the task that created the object. - /// - /// \return The task ID of the task that created this object. TaskID TaskId() const; /// Compute the object ID of an object created by a task, either via an object put @@ -303,12 +266,8 @@ class ObjectID : public BaseID { /// \return The computed object ID. static ObjectID FromIndex(const TaskID &task_id, ObjectIDIndexType index); - /// Create an object id randomly. - /// /// Warning: this can duplicate IDs after a fork() call. We assume this /// never happens. - /// - /// \return A random object id. static ObjectID FromRandom(); /// Compute the object ID that is used to track an actor's lifetime. This @@ -322,6 +281,7 @@ class ObjectID : public BaseID { /// Whether this ObjectID represents an actor handle. This is the ObjectID /// returned by the actor's creation task. static bool IsActorID(const ObjectID &object_id); + /// Return the ID of the actor that produces this object. For the actor /// creation task and for tasks executed by the actor, this will return a /// non-nil ActorID. @@ -330,7 +290,6 @@ class ObjectID : public BaseID { MSGPACK_DEFINE(id_); private: - /// A helper method to generate an ObjectID. static ObjectID GenerateObjectId(const std::string &task_id_binary, ObjectIDIndexType object_index = 0); @@ -343,12 +302,8 @@ class PlacementGroupID : public BaseID { static constexpr size_t kUniqueBytesLength = 14; public: - /// Length of `PlacementGroupID` in bytes. 
static constexpr size_t kLength = kUniqueBytesLength + JobID::kLength; - /// Size of `PlacementGroupID` in bytes. - /// - /// \return Size of `PlacementGroupID` in bytes. static constexpr size_t Size() { return kLength; } /// Creates a `PlacementGroupID` by hashing the given information. @@ -360,12 +315,8 @@ class PlacementGroupID : public BaseID { static PlacementGroupID FromRandom() = delete; - /// Constructor of `PlacementGroupID`. PlacementGroupID() : BaseID() {} - /// Get the job id to which this placement group belongs. - /// - /// \return The job id to which this placement group belongs. JobID JobId() const; MSGPACK_DEFINE(id_); @@ -376,6 +327,39 @@ class PlacementGroupID : public BaseID { typedef std::pair BundleID; +class LeaseID : public BaseID { + private: + static constexpr size_t kUniqueBytesLength = 4; + + public: + static constexpr size_t kLength = kUniqueBytesLength + kUniqueIDSize; + + static constexpr size_t Size() { return kLength; } + + /// Creates a `LeaseID` from a specific worker ID. + /// + /// \param worker_id The worker ID from which this lease is requested. + /// \param counter The n-th lease requested by this worker, starting from 1 + /// + /// \return The `LeaseID` for the worker lease. + static LeaseID FromWorker(const WorkerID &worker_id, uint32_t counter); + + /// Creates a random `LeaseID`. + /// + /// \return A `LeaseID` generated with random bytes + /// Warning: this can duplicate IDs after a fork() call. We assume this never happens. 
+ static LeaseID FromRandom(); + + LeaseID() : BaseID() {} + + WorkerID WorkerId() const; + + MSGPACK_DEFINE(id_); + + private: + uint8_t id_[kLength]; +}; + static_assert(sizeof(JobID) == JobID::kLength + sizeof(size_t), "JobID size is not as expected"); static_assert(sizeof(ActorID) == ActorID::kLength + sizeof(size_t), @@ -386,6 +370,8 @@ static_assert(sizeof(ObjectID) == ObjectID::kLength + sizeof(size_t), "ObjectID size is not as expected"); static_assert(sizeof(PlacementGroupID) == PlacementGroupID::kLength + sizeof(size_t), "PlacementGroupID size is not as expected"); +static_assert(sizeof(LeaseID) == LeaseID::kLength + sizeof(size_t), + "LeaseID size is not as expected"); std::ostream &operator<<(std::ostream &os, const UniqueID &id); std::ostream &operator<<(std::ostream &os, const JobID &id); @@ -393,6 +379,7 @@ std::ostream &operator<<(std::ostream &os, const ActorID &id); std::ostream &operator<<(std::ostream &os, const TaskID &id); std::ostream &operator<<(std::ostream &os, const ObjectID &id); std::ostream &operator<<(std::ostream &os, const PlacementGroupID &id); +std::ostream &operator<<(std::ostream &os, const LeaseID &id); #define DEFINE_UNIQUE_ID(type) \ class RAY_EXPORT type : public UniqueID { \ @@ -591,6 +578,25 @@ struct DefaultLogKey { constexpr static std::string_view key = kLogKeyPlacementGroupID; }; +template <> +struct DefaultLogKey { + constexpr static std::string_view key = kLogKeyLeaseID; +}; + +inline ObjectID ObjectRefToId(const rpc::ObjectReference &object_ref) { + return ObjectID::FromBinary(object_ref.object_id()); +} + +inline std::vector ObjectRefsToIds( + const std::vector &object_refs) { + std::vector object_ids; + object_ids.reserve(object_refs.size()); + for (const auto &ref : object_refs) { + object_ids.push_back(ObjectRefToId(ref)); + } + return object_ids; +} + } // namespace ray namespace std { @@ -607,6 +613,7 @@ DEFINE_UNIQUE_ID(ActorID); DEFINE_UNIQUE_ID(TaskID); DEFINE_UNIQUE_ID(ObjectID); 
DEFINE_UNIQUE_ID(PlacementGroupID); +DEFINE_UNIQUE_ID(LeaseID); #include "ray/common/id_def.h" #undef DEFINE_UNIQUE_ID diff --git a/src/ray/common/lease/lease.h b/src/ray/common/lease/lease.h new file mode 100644 index 000000000000..1dd4853c4064 --- /dev/null +++ b/src/ray/common/lease/lease.h @@ -0,0 +1,82 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include + +#include "ray/common/lease/lease_spec.h" + +namespace ray { + +/// \class RayLease +/// +/// A RayLease represents a Ray lease and a specification of its execution (e.g., +/// resource demands). The lease's specification contains both immutable fields, +/// determined at submission time, and mutable fields, determined at execution +/// time. +class RayLease { + public: + /// Construct an empty lease. This should only be used to pass a lease + /// as an out parameter to a function or method. + // TODO(#55923): Remove this constructor and refactor worker.h to use unique_ptr + RayLease() = default; + + /// Construct a `RayLease` object from a protobuf message. + explicit RayLease(rpc::LeaseSpec lease_spec) + : lease_spec_(LeaseSpecification(std::move(lease_spec))) {} + + /// Construct a `RayLease` object from a `LeaseSpecification`. 
+ explicit RayLease(LeaseSpecification lease_spec) : lease_spec_(std::move(lease_spec)) {} + + RayLease(LeaseSpecification lease_spec, std::string preferred_node_id) + : lease_spec_(std::move(lease_spec)), + preferred_node_id_(std::move(preferred_node_id)) {} + + /// Get the immutable specification for the lease. + /// + /// \return The immutable specification for the lease. + const LeaseSpecification &GetLeaseSpecification() const { return lease_spec_; } + + /// Get the lease's object dependencies. This comprises the immutable lease + /// arguments and the mutable execution dependencies. + /// + /// \return The object dependencies. + const std::vector &GetDependencies() const { + return lease_spec_.GetDependencies(); + } + + /// Get the lease's preferred node id for scheduling. If the returned value + /// is empty, then it means the lease has no preferred node. + /// + /// \return The preferred node id. + const std::string &GetPreferredNodeID() const { return preferred_node_id_; } + + std::string DebugString() const { + return absl::StrFormat("lease_spec={%s}", lease_spec_.DebugString()); + } + + private: + /// RayLease specification object, consisting of immutable information about this + /// lease determined at submission time. Includes resource demand, object + /// dependencies, etc. + LeaseSpecification lease_spec_; + + std::string preferred_node_id_; +}; + +} // namespace ray diff --git a/src/ray/common/lease/lease_spec.cc b/src/ray/common/lease/lease_spec.cc new file mode 100644 index 000000000000..7d84ebd92144 --- /dev/null +++ b/src/ray/common/lease/lease_spec.cc @@ -0,0 +1,361 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/common/lease/lease_spec.h" + +#include "ray/common/function_descriptor.h" +#include "ray/common/runtime_env_common.h" + +namespace ray { + +using SchedulingClass = int; + +LeaseSpecification::LeaseSpecification(const rpc::TaskSpec &task_spec) + : MessageWrapper(std::make_shared()) { + RAY_CHECK(task_spec.type() == rpc::TaskType::NORMAL_TASK || + task_spec.type() == rpc::TaskType::ACTOR_CREATION_TASK); + message_->set_job_id(task_spec.job_id()); + message_->mutable_caller_address()->CopyFrom(task_spec.caller_address()); + message_->mutable_required_resources()->insert(task_spec.required_resources().begin(), + task_spec.required_resources().end()); + message_->mutable_required_placement_resources()->insert( + task_spec.required_placement_resources().begin(), + task_spec.required_placement_resources().end()); + message_->mutable_scheduling_strategy()->CopyFrom(task_spec.scheduling_strategy()); + message_->mutable_label_selector()->insert(task_spec.label_selector().begin(), + task_spec.label_selector().end()); + message_->set_depth(task_spec.depth()); + message_->set_parent_task_id(task_spec.parent_task_id()); + message_->mutable_dependencies()->Reserve(task_spec.args_size()); + for (size_t i = 0; i < static_cast(task_spec.args_size()); ++i) { + if (task_spec.args(i).has_object_ref() && !task_spec.args(i).is_inlined()) { + message_->add_dependencies()->CopyFrom(task_spec.args(i).object_ref()); + } + } + message_->mutable_function_descriptor()->CopyFrom(task_spec.function_descriptor()); + message_->set_language(task_spec.language()); + 
message_->mutable_runtime_env_info()->CopyFrom(task_spec.runtime_env_info()); + message_->set_attempt_number(task_spec.attempt_number()); + message_->set_root_detached_actor_id(task_spec.root_detached_actor_id()); + message_->set_task_name(task_spec.name()); + message_->set_type(task_spec.type()); + if (IsActorCreationTask()) { + message_->set_actor_id(task_spec.actor_creation_task_spec().actor_id()); + message_->set_is_detached_actor(task_spec.actor_creation_task_spec().is_detached()); + message_->set_max_actor_restarts( + task_spec.actor_creation_task_spec().max_actor_restarts()); + for (const auto &option : + task_spec.actor_creation_task_spec().dynamic_worker_options()) { + message_->add_dynamic_worker_options(option); + } + } else { + message_->set_max_retries(task_spec.max_retries()); + } + ComputeResources(); +} + +LeaseID LeaseSpecification::LeaseId() const { + return LeaseID::FromBinary(message_->lease_id()); +} + +JobID LeaseSpecification::JobId() const { return JobID::FromBinary(message_->job_id()); } + +const rpc::Address &LeaseSpecification::CallerAddress() const { + return message_->caller_address(); +} + +rpc::Language LeaseSpecification::GetLanguage() const { return message_->language(); } + +bool LeaseSpecification::IsNormalTask() const { + return message_->type() == rpc::TaskType::NORMAL_TASK; +} + +bool LeaseSpecification::IsActorCreationTask() const { + return message_->type() == rpc::TaskType::ACTOR_CREATION_TASK; +} + +bool LeaseSpecification::IsNodeAffinitySchedulingStrategy() const { + return GetSchedulingStrategy().scheduling_strategy_case() == + rpc::SchedulingStrategy::kNodeAffinitySchedulingStrategy; +} + +NodeID LeaseSpecification::GetNodeAffinitySchedulingStrategyNodeId() const { + if (!IsNodeAffinitySchedulingStrategy()) { + return NodeID::Nil(); + } + return NodeID::FromBinary( + GetSchedulingStrategy().node_affinity_scheduling_strategy().node_id()); +} + +bool LeaseSpecification::GetNodeAffinitySchedulingStrategySoft() const { + if 
(!IsNodeAffinitySchedulingStrategy()) { + return false; + } + return GetSchedulingStrategy().node_affinity_scheduling_strategy().soft(); +} + +std::vector LeaseSpecification::GetDependencyIds() const { + std::vector ids; + ids.reserve(dependencies_.size()); + for (const auto &ref : dependencies_) { + ids.emplace_back(ObjectRefToId(ref)); + } + return ids; +} + +const std::vector &LeaseSpecification::GetDependencies() const { + return dependencies_; +} + +WorkerID LeaseSpecification::CallerWorkerId() const { + return WorkerID::FromBinary(message_->caller_address().worker_id()); +} + +NodeID LeaseSpecification::CallerNodeId() const { + return NodeID::FromBinary(message_->caller_address().node_id()); +} + +BundleID LeaseSpecification::PlacementGroupBundleId() const { + if (GetSchedulingStrategy().scheduling_strategy_case() != + rpc::SchedulingStrategy::kPlacementGroupSchedulingStrategy) { + return std::make_pair(PlacementGroupID::Nil(), -1); + } + const auto &pg = GetSchedulingStrategy().placement_group_scheduling_strategy(); + return std::make_pair(PlacementGroupID::FromBinary(pg.placement_group_id()), + pg.placement_group_bundle_index()); +} + +int64_t LeaseSpecification::MaxActorRestarts() const { + RAY_CHECK(IsActorCreationTask()); + return message_->max_actor_restarts(); +} + +int32_t LeaseSpecification::MaxRetries() const { + RAY_CHECK(IsNormalTask()); + return message_->max_retries(); +} + +bool LeaseSpecification::IsRetriable() const { + if (IsActorCreationTask() && MaxActorRestarts() == 0) { + return false; + } + if (IsNormalTask() && MaxRetries() == 0) { + return false; + } + return true; +} + +uint64_t LeaseSpecification::AttemptNumber() const { return message_->attempt_number(); } + +bool LeaseSpecification::IsRetry() const { return AttemptNumber() > 0; } + +std::string LeaseSpecification::GetTaskName() const { return message_->task_name(); } + +std::string LeaseSpecification::GetFunctionOrActorName() const { + if (IsActorCreationTask()) { + return 
FunctionDescriptor()->ClassName(); + } + return FunctionDescriptor()->CallString(); +} + +TaskID LeaseSpecification::ParentTaskId() const { + // Set to Nil for driver tasks. + if (message_->parent_task_id().empty()) { + return TaskID::Nil(); + } + return TaskID::FromBinary(message_->parent_task_id()); +} + +ActorID LeaseSpecification::ActorId() const { + if (message_->actor_id().empty()) { + return ActorID::Nil(); + } + return ActorID::FromBinary(message_->actor_id()); +} + +ActorID LeaseSpecification::RootDetachedActorId() const { + if (message_->root_detached_actor_id().empty()) { + return ActorID::Nil(); + } + return ActorID::FromBinary(message_->root_detached_actor_id()); +} + +bool LeaseSpecification::IsDetachedActor() const { return message_->is_detached_actor(); } + +int LeaseSpecification::GetRuntimeEnvHash() const { return runtime_env_hash_; } + +std::string LeaseSpecification::DebugString() const { + std::ostringstream stream; + stream << "Type=" << TaskType_Name(message_->type()) + << ", Language=" << Language_Name(message_->language()); + + if (required_resources_ != nullptr) { + stream << ", Resources: {"; + + // Print resource description. + for (const auto &entry : GetRequiredResources().GetResourceMap()) { + stream << entry.first << ": " << entry.second << ", "; + } + stream << "}"; + } + + stream << ", function_descriptor="; + + // Print function descriptor. + stream << FunctionDescriptor()->ToString(); + + stream << ", lease_id=" << LeaseId() << ", task_name=" << GetTaskName() + << ", job_id=" << JobId() << ", depth=" << GetDepth() + << ", attempt_number=" << AttemptNumber(); + + if (IsActorCreationTask()) { + // Print actor creation task spec. + stream << ", actor_creation_task_spec={actor_id=" << ActorId() + << ", max_restarts=" << MaxActorRestarts() + << ", is_detached=" << IsDetachedActor() << "}"; + } else { + stream << ", normal_task_spec={max_retries=" << MaxRetries() << "}"; + } + + // Print non-sensitive runtime env info. 
+ if (HasRuntimeEnv()) { + const auto &runtime_env_info = RuntimeEnvInfo(); + stream << ", runtime_env_hash=" << GetRuntimeEnvHash(); + if (runtime_env_info.has_runtime_env_config()) { + stream << ", eager_install=" + << runtime_env_info.runtime_env_config().eager_install(); + stream << ", setup_timeout_seconds=" + << runtime_env_info.runtime_env_config().setup_timeout_seconds(); + } + } + + return stream.str(); +} + +bool LeaseSpecification::HasRuntimeEnv() const { + return !IsRuntimeEnvEmpty(SerializedRuntimeEnv()); +} + +const std::string &LeaseSpecification::SerializedRuntimeEnv() const { + return message_->runtime_env_info().serialized_runtime_env(); +} + +const rpc::RuntimeEnvInfo &LeaseSpecification::RuntimeEnvInfo() const { + return message_->runtime_env_info(); +} + +int64_t LeaseSpecification::GetDepth() const { return message_->depth(); } + +const rpc::SchedulingStrategy &LeaseSpecification::GetSchedulingStrategy() const { + return message_->scheduling_strategy(); +} + +const ResourceSet &LeaseSpecification::GetRequiredResources() const { + return *required_resources_; +} + +const ResourceSet &LeaseSpecification::GetRequiredPlacementResources() const { + return *required_placement_resources_; +} + +const LabelSelector &LeaseSpecification::GetLabelSelector() const { + return *label_selector_; +} + +ray::FunctionDescriptor LeaseSpecification::FunctionDescriptor() const { + return ray::FunctionDescriptorBuilder::FromProto(message_->function_descriptor()); +} + +void LeaseSpecification::ComputeResources() { + auto &required_resources = message_->required_resources(); + + if (required_resources.empty()) { + // A static nil object is used here to avoid allocating the empty object every time. + required_resources_ = ResourceSet::Nil(); + } else { + required_resources_ = + std::make_shared(MapFromProtobuf(required_resources)); + } + + auto &required_placement_resources = message_->required_placement_resources().empty() + ? 
required_resources + : message_->required_placement_resources(); + + if (required_placement_resources.empty()) { + required_placement_resources_ = ResourceSet::Nil(); + } else { + required_placement_resources_ = + std::make_shared(MapFromProtobuf(required_placement_resources)); + } + + // Set LabelSelector required for scheduling if specified. Parses string map + // from proto to LabelSelector data type. + label_selector_ = std::make_shared(message_->label_selector()); + + // Copy dependencies from message + dependencies_.reserve(message_->dependencies_size()); + for (int i = 0; i < message_->dependencies_size(); ++i) { + dependencies_.push_back(message_->dependencies(i)); + } + + // There is no need to compute `SchedulingClass` for actor tasks since + // the actor tasks need not be scheduled. + const bool is_actor_creation_task = IsActorCreationTask(); + const bool should_report_placement_resources = + RayConfig::instance().report_actor_placement_resources(); + const auto &resource_set = (is_actor_creation_task && should_report_placement_resources) + ? GetRequiredPlacementResources() + : GetRequiredResources(); + auto depth = GetDepth(); + auto label_selector = GetLabelSelector(); + const auto &function_descriptor = FunctionDescriptor(); + auto sched_cls_desc = SchedulingClassDescriptor( + resource_set, label_selector, function_descriptor, depth, GetSchedulingStrategy()); + // Map the scheduling class descriptor to an integer for performance. 
+ sched_cls_id_ = SchedulingClassToIds::GetSchedulingClass(sched_cls_desc); + RAY_CHECK_GT(sched_cls_id_, 0); + + runtime_env_hash_ = CalculateRuntimeEnvHash(SerializedRuntimeEnv()); +} + +std::vector LeaseSpecification::DynamicWorkerOptionsOrEmpty() const { + if (!IsActorCreationTask()) { + return {}; + } + return VectorFromProtobuf(message_->dynamic_worker_options()); +} + +std::vector LeaseSpecification::DynamicWorkerOptions() const { + RAY_CHECK(IsActorCreationTask()); + return VectorFromProtobuf(message_->dynamic_worker_options()); +} + +size_t LeaseSpecification::DynamicWorkerOptionsSize() const { + return message_->dynamic_worker_options_size(); +} + +const rpc::RuntimeEnvConfig &LeaseSpecification::RuntimeEnvConfig() const { + return message_->runtime_env_info().runtime_env_config(); +} + +bool LeaseSpecification::IsSpreadSchedulingStrategy() const { + return message_->scheduling_strategy().scheduling_strategy_case() == + rpc::SchedulingStrategy::SchedulingStrategyCase::kSpreadSchedulingStrategy; +} + +SchedulingClass LeaseSpecification::GetSchedulingClass() const { return sched_cls_id_; } + +const rpc::LeaseSpec &LeaseSpecification::GetMessage() const { return *message_; } + +} // namespace ray diff --git a/src/ray/common/lease/lease_spec.h b/src/ray/common/lease/lease_spec.h new file mode 100644 index 000000000000..ab507a4e5544 --- /dev/null +++ b/src/ray/common/lease/lease_spec.h @@ -0,0 +1,112 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "ray/common/grpc_util.h" +#include "ray/common/id.h" +#include "ray/common/scheduling/label_selector.h" +#include "ray/common/scheduling/resource_set.h" +#include "ray/common/scheduling/scheduling_class_util.h" +#include "src/ray/protobuf/common.pb.h" + +namespace ray { + +// LeaseSpec captures only the subset of TaskSpec used by the raylet for +// leasing, scheduling, dependency resolution, and cancellation. +class LeaseSpecification : public MessageWrapper { + public: + explicit LeaseSpecification(const rpc::TaskSpec &task_spec); + + /// Construct an empty task specification. This should not be used directly. + LeaseSpecification() { ComputeResources(); } + + explicit LeaseSpecification(rpc::LeaseSpec lease_spec) + : MessageWrapper(std::move(lease_spec)) { + ComputeResources(); + } + + explicit LeaseSpecification(std::shared_ptr message) + : MessageWrapper(std::move(message)) { + ComputeResources(); + } + + LeaseID LeaseId() const; + JobID JobId() const; + + const ResourceSet &GetRequiredResources() const; + const ResourceSet &GetRequiredPlacementResources() const; + const LabelSelector &GetLabelSelector() const; + const rpc::SchedulingStrategy &GetSchedulingStrategy() const; + bool IsNodeAffinitySchedulingStrategy() const; + NodeID GetNodeAffinitySchedulingStrategyNodeId() const; + bool GetNodeAffinitySchedulingStrategySoft() const; + std::vector GetDependencyIds() const; + const std::vector &GetDependencies() const; + + bool IsNormalTask() const; + bool IsActorCreationTask() const; + ActorID ActorId() const; + + const rpc::Address &CallerAddress() const; + WorkerID CallerWorkerId() const; + NodeID CallerNodeId() const; + BundleID PlacementGroupBundleId() const; + bool IsRetriable() const; + TaskID ParentTaskId() const; + bool IsDetachedActor() const; + std::string DebugString() const; + 
int GetRuntimeEnvHash() const; + rpc::Language GetLanguage() const; + bool HasRuntimeEnv() const; + const rpc::RuntimeEnvInfo &RuntimeEnvInfo() const; + const std::string &SerializedRuntimeEnv() const; + int64_t GetDepth() const; + ActorID RootDetachedActorId() const; + ray::FunctionDescriptor FunctionDescriptor() const; + int64_t MaxActorRestarts() const; + int32_t MaxRetries() const; + uint64_t AttemptNumber() const; + bool IsRetry() const; + std::string GetTaskName() const; + std::string GetFunctionOrActorName() const; + std::vector DynamicWorkerOptionsOrEmpty() const; + std::vector DynamicWorkerOptions() const; + size_t DynamicWorkerOptionsSize() const; + const rpc::RuntimeEnvConfig &RuntimeEnvConfig() const; + bool IsSpreadSchedulingStrategy() const; + SchedulingClass GetSchedulingClass() const; + const rpc::LeaseSpec &GetMessage() const; + + private: + void ComputeResources(); + + SchedulingClass GetSchedulingClass(const SchedulingClassDescriptor &sched_cls); + + SchedulingClass sched_cls_id_ = 0; + std::shared_ptr required_resources_; + std::shared_ptr required_placement_resources_; + std::shared_ptr label_selector_; + + std::vector dependencies_; + + int runtime_env_hash_ = 0; +}; + +} // namespace ray diff --git a/src/ray/common/memory_monitor.cc b/src/ray/common/memory_monitor.cc index 98a33da0cd84..1c60943402a9 100644 --- a/src/ray/common/memory_monitor.cc +++ b/src/ray/common/memory_monitor.cc @@ -23,7 +23,6 @@ #include "ray/common/ray_config.h" #include "ray/util/logging.h" #include "ray/util/process.h" -#include "ray/util/util.h" namespace ray { @@ -51,10 +50,10 @@ MemoryMonitor::MemoryMonitor(instrumented_io_context &io_service, << " system memory), total system memory bytes: " << total_memory_bytes; runner_->RunFnPeriodically( [this] { - auto [used_memory_bytes, total_memory_bytes] = GetMemoryBytes(); + auto [used_mem_bytes, total_mem_bytes] = GetMemoryBytes(); MemorySnapshot system_memory; - system_memory.used_bytes = used_memory_bytes; - 
system_memory.total_bytes = total_memory_bytes; + system_memory.used_bytes = used_mem_bytes; + system_memory.total_bytes = total_mem_bytes; bool is_usage_above_threshold = IsUsageAboveThreshold(system_memory, computed_threshold_bytes_); diff --git a/src/ray/common/placement_group.cc b/src/ray/common/placement_group.cc index 93431cf4c0f4..a0ec994088f3 100644 --- a/src/ray/common/placement_group.cc +++ b/src/ray/common/placement_group.cc @@ -17,7 +17,7 @@ namespace ray { void PlacementGroupSpecification::ConstructBundles() { for (int i = 0; i < message_->bundles_size(); i++) { - bundles_.push_back(BundleSpecification(message_->bundles(i))); + bundles_.emplace_back(message_->bundles(i)); } } @@ -44,8 +44,4 @@ BundleSpecification PlacementGroupSpecification::GetBundle(int position) const { std::string PlacementGroupSpecification::GetName() const { return std::string(message_->name()); } - -double PlacementGroupSpecification::GetMaxCpuFractionPerNode() const { - return message_->max_cpu_fraction_per_node(); -} } // namespace ray diff --git a/src/ray/common/placement_group.h b/src/ray/common/placement_group.h index e20776e3aa5b..c3d0057d88b2 100644 --- a/src/ray/common/placement_group.h +++ b/src/ray/common/placement_group.h @@ -14,6 +14,7 @@ #pragma once +#include "absl/container/flat_hash_map.h" #include "ray/common/bundle_spec.h" #include "ray/common/grpc_util.h" #include "ray/common/id.h" @@ -40,14 +41,14 @@ class PlacementGroupSpecification : public MessageWrapper message) - : MessageWrapper(message) { + : MessageWrapper(std::move(message)) { ConstructBundles(); } /// Return the placement group id. 
@@ -60,8 +61,6 @@ class PlacementGroupSpecification : public MessageWrapper> &bundles, const rpc::PlacementStrategy strategy, const bool is_detached, - double max_cpu_fraction_per_node, NodeID soft_target_node_id, const JobID &creator_job_id, const ActorID &creator_actor_id, @@ -105,7 +103,6 @@ class PlacementGroupSpecBuilder { message_->set_creator_actor_id(creator_actor_id.Binary()); message_->set_creator_actor_dead(creator_actor_id.IsNil()); message_->set_is_detached(is_detached); - message_->set_max_cpu_fraction_per_node(max_cpu_fraction_per_node); message_->set_soft_target_node_id(soft_target_node_id.Binary()); for (size_t i = 0; i < bundles.size(); i++) { diff --git a/src/ray/gcs/pb_util.h b/src/ray/common/protobuf_utils.cc similarity index 71% rename from src/ray/gcs/pb_util.h rename to src/ray/common/protobuf_utils.cc index 2733cf470e86..29a57343a44c 100644 --- a/src/ray/gcs/pb_util.h +++ b/src/ray/common/protobuf_utils.cc @@ -1,4 +1,4 @@ -// Copyright 2017 The Ray Authors. +// Copyright 2025 The Ray Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,46 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once +#include "ray/common/protobuf_utils.h" #include #include +#include #include -#include "absl/time/time.h" -#include "ray/common/constants.h" -#include "ray/common/id.h" #include "ray/common/ray_config.h" -#include "ray/common/task/task_spec.h" -#include "src/ray/protobuf/autoscaler.pb.h" -#include "src/ray/protobuf/export_task_event.pb.h" -#include "src/ray/protobuf/gcs.pb.h" +#include "ray/util/time.h" namespace ray { - namespace gcs { -using ContextCase = rpc::ActorDeathCause::ContextCase; -// Forward declaration. 
-std::string GenErrorMessageFromDeathCause(const rpc::ActorDeathCause &death_cause); - -/// Helper function to produce job table data (for newly created job or updated job). -/// -/// \param job_id The ID of job that needs to be registered or updated. -/// \param is_dead Whether the driver of this job is dead. -/// \param timestamp The UNIX timestamp corresponding to this event. -/// \param driver_address Address of the driver that started this job. -/// \param driver_pid Process ID of the driver running this job. -/// \param entrypoint The entrypoint name of the job. -/// \param job_config The config of this job. -/// \return The job table data created by this method. -inline std::shared_ptr CreateJobTableData( +std::shared_ptr CreateJobTableData( const ray::JobID &job_id, bool is_dead, const ray::rpc::Address &driver_address, int64_t driver_pid, const std::string &entrypoint, - const ray::rpc::JobConfig &job_config = {}) { + const ray::rpc::JobConfig &job_config) { auto job_info_ptr = std::make_shared(); job_info_ptr->set_job_id(job_id.Binary()); job_info_ptr->set_is_dead(is_dead); @@ -63,15 +43,29 @@ inline std::shared_ptr CreateJobTableData( return job_info_ptr; } -/// Helper function to produce error table data. -std::shared_ptr CreateErrorTableData( - const std::string &error_type, - const std::string &error_msg, - absl::Time timestamp, - const JobID &job_id = JobID::Nil()); +rpc::ErrorTableData CreateErrorTableData(const std::string &error_type, + const std::string &error_msg, + absl::Time timestamp, + const JobID &job_id) { + uint32_t max_error_msg_size_bytes = RayConfig::instance().max_error_msg_size_bytes(); + rpc::ErrorTableData error_info; + error_info.set_type(error_type); + if (error_msg.length() > max_error_msg_size_bytes) { + std::string formatted_error_message = absl::StrFormat( + "The message size exceeds %d bytes. Find the full log from the log files. 
Here " + "is abstract: %s", + max_error_msg_size_bytes, + std::string_view{error_msg}.substr(0, max_error_msg_size_bytes)); + error_info.set_error_message(std::move(formatted_error_message)); + } else { + error_info.set_error_message(error_msg); + } + error_info.set_timestamp(absl::ToUnixMillis(timestamp)); + error_info.set_job_id(job_id.Binary()); + return error_info; +} -/// Helper function to produce worker failure data. -inline std::shared_ptr CreateWorkerFailureData( +std::shared_ptr CreateWorkerFailureData( const WorkerID &worker_id, const NodeID &node_id, const std::string &ip_address, @@ -79,12 +73,12 @@ inline std::shared_ptr CreateWorkerFailureData( rpc::WorkerExitType disconnect_type, const std::string &disconnect_detail, int pid, - const rpc::RayException *creation_task_exception = nullptr) { + const rpc::RayException *creation_task_exception) { auto worker_failure_info_ptr = std::make_shared(); // Only report the worker id + delta (new data upon worker failures). // GCS will merge the data with original worker data. worker_failure_info_ptr->mutable_worker_address()->set_worker_id(worker_id.Binary()); - worker_failure_info_ptr->mutable_worker_address()->set_raylet_id(node_id.Binary()); + worker_failure_info_ptr->mutable_worker_address()->set_node_id(node_id.Binary()); worker_failure_info_ptr->mutable_worker_address()->set_ip_address(ip_address); worker_failure_info_ptr->set_timestamp(timestamp); worker_failure_info_ptr->set_exit_type(disconnect_type); @@ -98,9 +92,7 @@ inline std::shared_ptr CreateWorkerFailureData( return worker_failure_info_ptr; } -/// Get actor creation task exception from ActorDeathCause. -/// Returns nullptr if actor isn't dead due to creation task failure. 
-inline const rpc::RayException *GetCreationTaskExceptionFromDeathCause( +const rpc::RayException *GetCreationTaskExceptionFromDeathCause( const rpc::ActorDeathCause *death_cause) { if (death_cause == nullptr || death_cause->context_case() != ContextCase::kCreationTaskFailureContext) { @@ -109,8 +101,7 @@ inline const rpc::RayException *GetCreationTaskExceptionFromDeathCause( return &(death_cause->creation_task_failure_context()); } -inline const std::string &GetActorDeathCauseString( - const rpc::ActorDeathCause &death_cause) { +const std::string &GetActorDeathCauseString(const rpc::ActorDeathCause &death_cause) { static absl::flat_hash_map death_cause_string{ {ContextCase::CONTEXT_NOT_SET, "CONTEXT_NOT_SET"}, {ContextCase::kRuntimeEnvFailedContext, "RuntimeEnvFailedContext"}, @@ -124,11 +115,7 @@ inline const std::string &GetActorDeathCauseString( return it->second; } -/// Get the error information from the actor death cause. -/// -/// \param[in] death_cause The rpc message that contains the actos death information. -/// \return RayErrorInfo that has propagated death cause. -inline rpc::RayErrorInfo GetErrorInfoFromActorDeathCause( +rpc::RayErrorInfo GetErrorInfoFromActorDeathCause( const rpc::ActorDeathCause &death_cause) { rpc::RayErrorInfo error_info; switch (death_cause.context_case()) { @@ -157,9 +144,7 @@ inline rpc::RayErrorInfo GetErrorInfoFromActorDeathCause( return error_info; } -/// Generate object error type from ActorDeathCause. 
-inline std::string GenErrorMessageFromDeathCause( - const rpc::ActorDeathCause &death_cause) { +std::string GenErrorMessageFromDeathCause(const rpc::ActorDeathCause &death_cause) { if (death_cause.context_case() == ContextCase::kCreationTaskFailureContext) { return death_cause.creation_task_failure_context().formatted_exception_string(); } else if (death_cause.context_case() == ContextCase::kRuntimeEnvFailedContext) { @@ -176,7 +161,7 @@ inline std::string GenErrorMessageFromDeathCause( } } -inline bool IsActorRestartable(const rpc::ActorTableData &actor) { +bool IsActorRestartable(const rpc::ActorTableData &actor) { RAY_CHECK_EQ(actor.state(), rpc::ActorTableData::DEAD); return actor.death_cause().context_case() == ContextCase::kActorDiedErrorContext && actor.death_cause().actor_died_error_context().reason() == @@ -189,27 +174,21 @@ inline bool IsActorRestartable(const rpc::ActorTableData &actor) { actor.max_restarts())); } -inline std::string RayErrorInfoToString(const ray::rpc::RayErrorInfo &error_info) { +std::string RayErrorInfoToString(const ray::rpc::RayErrorInfo &error_info) { std::stringstream ss; ss << "Error type " << error_info.error_type() << " exception string " << error_info.error_message(); return ss.str(); } -/// Get the parent task id from the task event. -/// -/// \param task_event Task event. -/// \return TaskID::Nil() if parent task id info not available, else the parent task id -/// for the task. 
-inline TaskID GetParentTaskId(const rpc::TaskEvents &task_event) { +TaskID GetParentTaskId(const rpc::TaskEvents &task_event) { if (task_event.has_task_info()) { return TaskID::FromBinary(task_event.task_info().parent_task_id()); } return TaskID::Nil(); } -inline void FillTaskInfo(rpc::TaskInfoEntry *task_info, - const TaskSpecification &task_spec) { +void FillTaskInfo(rpc::TaskInfoEntry *task_info, const TaskSpecification &task_spec) { rpc::TaskType type; if (task_spec.IsNormalTask()) { type = rpc::TaskType::NORMAL_TASK; @@ -256,9 +235,8 @@ inline void FillTaskInfo(rpc::TaskInfoEntry *task_info, } } -// Fill task_info for the export API with task specification from task_spec -inline void FillExportTaskInfo(rpc::ExportTaskEventData::TaskInfoEntry *task_info, - const TaskSpecification &task_spec) { +void FillExportTaskInfo(rpc::ExportTaskEventData::TaskInfoEntry *task_info, + const TaskSpecification &task_spec) { rpc::TaskType type; if (task_spec.IsNormalTask()) { type = rpc::TaskType::NORMAL_TASK; @@ -316,31 +294,22 @@ inline void FillExportTaskInfo(rpc::ExportTaskEventData::TaskInfoEntry *task_inf } } -/// Generate a RayErrorInfo from ErrorType -inline rpc::RayErrorInfo GetRayErrorInfo(const rpc::ErrorType &error_type, - const std::string &error_msg = "") { +rpc::RayErrorInfo GetRayErrorInfo(const rpc::ErrorType &error_type, + const std::string &error_msg) { rpc::RayErrorInfo error_info; error_info.set_error_type(error_type); error_info.set_error_message(error_msg); return error_info; } -/// Get the worker id from the task event. -/// -/// \param task_event Task event. -/// \return WorkerID::Nil() if worker id info not available, else the worker id. 
-inline WorkerID GetWorkerID(const rpc::TaskEvents &task_event) { +WorkerID GetWorkerID(const rpc::TaskEvents &task_event) { if (task_event.has_state_updates() && task_event.state_updates().has_worker_id()) { return WorkerID::FromBinary(task_event.state_updates().worker_id()); } return WorkerID::Nil(); } -/// Return if the task has already terminated (finished or failed) -/// -/// \param task_event Task event. -/// \return True if the task has already terminated, false otherwise. -inline bool IsTaskTerminated(const rpc::TaskEvents &task_event) { +bool IsTaskTerminated(const rpc::TaskEvents &task_event) { if (!task_event.has_state_updates()) { return false; } @@ -350,19 +319,19 @@ inline bool IsTaskTerminated(const rpc::TaskEvents &task_event) { state_updates.state_ts_ns().contains(rpc::TaskStatus::FAILED); } -inline size_t NumProfileEvents(const rpc::TaskEvents &task_event) { +size_t NumProfileEvents(const rpc::TaskEvents &task_event) { if (!task_event.has_profile_events()) { return 0; } return static_cast(task_event.profile_events().events_size()); } -inline TaskAttempt GetTaskAttempt(const rpc::TaskEvents &task_event) { +TaskAttempt GetTaskAttempt(const rpc::TaskEvents &task_event) { return std::make_pair(TaskID::FromBinary(task_event.task_id()), task_event.attempt_number()); } -inline bool IsActorTask(const rpc::TaskEvents &task_event) { +bool IsActorTask(const rpc::TaskEvents &task_event) { if (!task_event.has_task_info()) { return false; } @@ -372,7 +341,7 @@ inline bool IsActorTask(const rpc::TaskEvents &task_event) { task_info.type() == rpc::TaskType::ACTOR_CREATION_TASK; } -inline bool IsTaskFinished(const rpc::TaskEvents &task_event) { +bool IsTaskFinished(const rpc::TaskEvents &task_event) { if (!task_event.has_state_updates()) { return false; } @@ -381,14 +350,9 @@ inline bool IsTaskFinished(const rpc::TaskEvents &task_event) { return state_updates.state_ts_ns().contains(rpc::TaskStatus::FINISHED); } -/// Fill the rpc::TaskStateUpdate with the timestamps 
according to the status change. -/// -/// \param task_status The task status. -/// \param timestamp The timestamp. -/// \param[out] state_updates The state updates with timestamp to be updated. -inline void FillTaskStatusUpdateTime(const ray::rpc::TaskStatus &task_status, - int64_t timestamp, - ray::rpc::TaskStateUpdate *state_updates) { +void FillTaskStatusUpdateTime(const ray::rpc::TaskStatus &task_status, + int64_t timestamp, + ray::rpc::TaskStateUpdate *state_updates) { if (task_status == rpc::TaskStatus::NIL) { // Not status change. return; @@ -396,13 +360,7 @@ inline void FillTaskStatusUpdateTime(const ray::rpc::TaskStatus &task_status, (*state_updates->mutable_state_ts_ns())[task_status] = timestamp; } -/// Fill the rpc::ExportTaskEventData::TaskStateUpdate with the timestamps -/// according to the status change. -/// -/// \param task_status The task status. -/// \param timestamp The timestamp. -/// \param[out] state_updates The state updates with timestamp to be updated. -inline void FillExportTaskStatusUpdateTime( +void FillExportTaskStatusUpdateTime( const ray::rpc::TaskStatus &task_status, int64_t timestamp, rpc::ExportTaskEventData::TaskStateUpdate *state_updates) { @@ -413,9 +371,8 @@ inline void FillExportTaskStatusUpdateTime( (*state_updates->mutable_state_ts_ns())[task_status] = timestamp; } -/// Convert rpc::TaskLogInfo to rpc::ExportTaskEventData::TaskLogInfo -inline void TaskLogInfoToExport(const rpc::TaskLogInfo &src, - rpc::ExportTaskEventData::TaskLogInfo *dest) { +void TaskLogInfoToExport(const rpc::TaskLogInfo &src, + rpc::ExportTaskEventData::TaskLogInfo *dest) { dest->set_stdout_file(src.stdout_file()); dest->set_stderr_file(src.stderr_file()); dest->set_stdout_start(src.stdout_start()); @@ -424,36 +381,15 @@ inline void TaskLogInfoToExport(const rpc::TaskLogInfo &src, dest->set_stderr_end(src.stderr_end()); } -inline std::string FormatPlacementGroupLabelName(const std::string &pg_id) { - return kPlacementGroupConstraintKeyPrefix + pg_id; 
-} - -/// \brief Format placement group details. -/// Format: -/// :: -/// -/// \param pg_data -/// \return -inline std::string FormatPlacementGroupDetails( - const rpc::PlacementGroupTableData &pg_data) { - return PlacementGroupID::FromBinary(pg_data.placement_group_id()).Hex() + ":" + - rpc::PlacementStrategy_Name(pg_data.strategy()) + "|" + - rpc::PlacementGroupTableData::PlacementGroupState_Name(pg_data.state()); -} - -/// Generate a placement constraint for placement group. -/// -/// \param pg_id The ID of placement group. -/// \param strategy The placement strategy of placement group. -/// \return The placement constraint for placement group if it's not a strict -/// strategy, else absl::nullopt. -inline std::optional +std::optional GenPlacementConstraintForPlacementGroup(const std::string &pg_id, rpc::PlacementStrategy strategy) { rpc::autoscaler::PlacementConstraint pg_constraint; // We are embedding the PG id into the key for the same reasons as we do for // dynamic labels (a node will have multiple PGs thus having a common PG key // is not enough). + // Note that this is only use case for dynamic labels and is retained + // purely for backward compatibility purposes. const std::string name = FormatPlacementGroupLabelName(pg_id); switch (strategy) { case rpc::PlacementStrategy::STRICT_SPREAD: { @@ -478,5 +414,4 @@ GenPlacementConstraintForPlacementGroup(const std::string &pg_id, } } // namespace gcs - } // namespace ray diff --git a/src/ray/common/protobuf_utils.h b/src/ray/common/protobuf_utils.h new file mode 100644 index 000000000000..2017107c3db8 --- /dev/null +++ b/src/ray/common/protobuf_utils.h @@ -0,0 +1,178 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "absl/time/time.h" +#include "ray/common/id.h" +#include "ray/common/task/task_spec.h" +#include "src/ray/protobuf/autoscaler.pb.h" +#include "src/ray/protobuf/export_task_event.pb.h" +#include "src/ray/protobuf/gcs.pb.h" + +namespace ray { +namespace gcs { + +using ContextCase = rpc::ActorDeathCause::ContextCase; + +/// Helper function to produce job table data (for newly created job or updated job). +/// +/// \param job_id The ID of job that needs to be registered or updated. +/// \param is_dead Whether the driver of this job is dead. +/// \param timestamp The UNIX timestamp corresponding to this event. +/// \param driver_address Address of the driver that started this job. +/// \param driver_pid Process ID of the driver running this job. +/// \param entrypoint The entrypoint name of the job. +/// \param job_config The config of this job. +/// \return The job table data created by this method. +std::shared_ptr CreateJobTableData( + const ray::JobID &job_id, + bool is_dead, + const ray::rpc::Address &driver_address, + int64_t driver_pid, + const std::string &entrypoint, + const ray::rpc::JobConfig &job_config = {}); + +/// Helper function to produce error table data. +rpc::ErrorTableData CreateErrorTableData(const std::string &error_type, + const std::string &error_msg, + absl::Time timestamp, + const JobID &job_id = JobID::Nil()); + +/// Helper function to produce worker failure data. 
+std::shared_ptr CreateWorkerFailureData( + const WorkerID &worker_id, + const NodeID &node_id, + const std::string &ip_address, + int64_t timestamp, + rpc::WorkerExitType disconnect_type, + const std::string &disconnect_detail, + int pid, + const rpc::RayException *creation_task_exception = nullptr); + +/// Get actor creation task exception from ActorDeathCause. +/// Returns nullptr if actor isn't dead due to creation task failure. +const rpc::RayException *GetCreationTaskExceptionFromDeathCause( + const rpc::ActorDeathCause *death_cause); + +const std::string &GetActorDeathCauseString(const rpc::ActorDeathCause &death_cause); + +/// Get the error information from the actor death cause. +/// +/// \param[in] death_cause The rpc message that contains the actor's death information. +/// \return RayErrorInfo that has propagated death cause. +rpc::RayErrorInfo GetErrorInfoFromActorDeathCause( + const rpc::ActorDeathCause &death_cause); + +/// Generate object error type from ActorDeathCause. +std::string GenErrorMessageFromDeathCause(const rpc::ActorDeathCause &death_cause); + +bool IsActorRestartable(const rpc::ActorTableData &actor); + +std::string RayErrorInfoToString(const ray::rpc::RayErrorInfo &error_info); + +/// Get the parent task id from the task event. +/// +/// \param task_event Task event. +/// \return TaskID::Nil() if parent task id info not available, else the parent task id +/// for the task. +TaskID GetParentTaskId(const rpc::TaskEvents &task_event); + +void FillTaskInfo(rpc::TaskInfoEntry *task_info, const TaskSpecification &task_spec); + +// Fill task_info for the export API with task specification from task_spec +void FillExportTaskInfo(rpc::ExportTaskEventData::TaskInfoEntry *task_info, + const TaskSpecification &task_spec); + +/// Generate a RayErrorInfo from ErrorType +rpc::RayErrorInfo GetRayErrorInfo(const rpc::ErrorType &error_type, + const std::string &error_msg = ""); + +/// Get the worker id from the task event. 
+/// +/// \param task_event Task event. +/// \return WorkerID::Nil() if worker id info not available, else the worker id. +WorkerID GetWorkerID(const rpc::TaskEvents &task_event); + +/// Return if the task has already terminated (finished or failed) +/// +/// \param task_event Task event. +/// \return True if the task has already terminated, false otherwise. +bool IsTaskTerminated(const rpc::TaskEvents &task_event); + +size_t NumProfileEvents(const rpc::TaskEvents &task_event); + +TaskAttempt GetTaskAttempt(const rpc::TaskEvents &task_event); + +bool IsActorTask(const rpc::TaskEvents &task_event); + +bool IsTaskFinished(const rpc::TaskEvents &task_event); + +/// Fill the rpc::TaskStateUpdate with the timestamps according to the status change. +/// +/// \param task_status The task status. +/// \param timestamp The timestamp. +/// \param[out] state_updates The state updates with timestamp to be updated. +void FillTaskStatusUpdateTime(const ray::rpc::TaskStatus &task_status, + int64_t timestamp, + ray::rpc::TaskStateUpdate *state_updates); + +/// Fill the rpc::ExportTaskEventData::TaskStateUpdate with the timestamps +/// according to the status change. +/// +/// \param task_status The task status. +/// \param timestamp The timestamp. +/// \param[out] state_updates The state updates with timestamp to be updated. +void FillExportTaskStatusUpdateTime( + const ray::rpc::TaskStatus &task_status, + int64_t timestamp, + rpc::ExportTaskEventData::TaskStateUpdate *state_updates); + +/// Convert rpc::TaskLogInfo to rpc::ExportTaskEventData::TaskLogInfo +void TaskLogInfoToExport(const rpc::TaskLogInfo &src, + rpc::ExportTaskEventData::TaskLogInfo *dest); + +inline std::string FormatPlacementGroupLabelName(const std::string &pg_id) { + return kPlacementGroupConstraintKeyPrefix + pg_id; +} + +/// \brief Format placement group details. 
+/// Format: +/// :: +/// +/// \param pg_data +/// \return +inline std::string FormatPlacementGroupDetails( + const rpc::PlacementGroupTableData &pg_data) { + return PlacementGroupID::FromBinary(pg_data.placement_group_id()).Hex() + ":" + + rpc::PlacementStrategy_Name(pg_data.strategy()) + "|" + + rpc::PlacementGroupTableData::PlacementGroupState_Name(pg_data.state()); +} + +/// Generate a placement constraint for placement group. +/// +/// \param pg_id The ID of placement group. +/// \param strategy The placement strategy of placement group. +/// \return The placement constraint for placement group if it's not a strict +/// strategy, else absl::nullopt. +std::optional +GenPlacementConstraintForPlacementGroup(const std::string &pg_id, + rpc::PlacementStrategy strategy); + +} // namespace gcs +} // namespace ray diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index 5aa0e3ef306e..4f166f891816 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -24,10 +24,10 @@ RAY_CONFIG(uint64_t, debug_dump_period_milliseconds, 10000) /// Whether to enable Ray event stats collection. RAY_CONFIG(bool, event_stats, true) -/// Whether to enable Ray event stats metrics export. -/// Note that enabling this adds high overhead to -/// Ray metrics agent. -RAY_CONFIG(bool, event_stats_metrics, false) +/// Whether to enable Ray event stats metrics for main services +/// such as gcs and raylet (which today are the sole consumers of +/// this config) +RAY_CONFIG(bool, emit_main_service_metrics, true) /// Whether to enable cluster authentication. RAY_CONFIG(bool, enable_cluster_auth, true) @@ -360,8 +360,9 @@ RAY_CONFIG(uint32_t, RAY_CONFIG(int64_t, gcs_service_connect_retries, 50) /// Waiting time for each gcs service connection. RAY_CONFIG(int64_t, internal_gcs_service_connect_wait_milliseconds, 100) -/// The interval at which the gcs server will check if redis has gone down. -/// When this happens, gcs server will kill itself. 
+/// The interval at which the gcs server will health check the connection to the +/// external Redis server. If a health check fails, the GCS will crash itself. +/// Set to zero to disable health checking. RAY_CONFIG(uint64_t, gcs_redis_heartbeat_interval_milliseconds, 100) /// Duration to wait between retries for leasing worker in gcs server. RAY_CONFIG(uint32_t, gcs_lease_worker_retry_interval_ms, 200) @@ -457,6 +458,11 @@ RAY_CONFIG(bool, task_events_skip_driver_for_test, false) /// Setting the value to 0 disables the task event recording and reporting. RAY_CONFIG(int64_t, task_events_report_interval_ms, 1000) +/// The interval duration for which ray events will be reported to the event aggregator. +/// The reported data should only be used for observability. +/// Setting the value to 0 disables the ray event recording and reporting. +RAY_CONFIG(int64_t, ray_events_report_interval_ms, 1000) + /// The number of tasks tracked in GCS for task state events. Any additional events /// from new tasks will evict events of tasks reported earlier. /// Setting the value to -1 allows for unlimited task events stored in GCS. @@ -520,16 +526,13 @@ RAY_CONFIG(bool, enable_metrics_collection, true) /// RAY_METRIC_CARDINALITY_LEVEL in ray_constants.py RAY_CONFIG(std::string, metric_cardinality_level, "legacy") -/// Whether enable OpenTelemetry as the metrics collection backend on the driver -/// component. This flag is only used during the migration of the metric collection -/// backend from OpenCensus to OpenTelemetry. It will be removed in the future. -RAY_CONFIG(bool, experimental_enable_open_telemetry_on_agent, false) +/// Whether to enable OpenTelemetry as the metrics collection backend. The default is +/// using OpenCensus. +RAY_CONFIG(bool, enable_open_telemetry, false) -/// Whether enable OpenTelemetry as the metrics collection backend on the core -/// components (core workers, gcs server, raylet, etc.). 
This flag is only used during -/// the migration of the metric collection backend from OpenCensus to OpenTelemetry. -/// It will be removed in the future. -RAY_CONFIG(bool, experimental_enable_open_telemetry_on_core, false) +/// Whether to enable Ray Event as the event collection backend. The default is +/// using the Export API. +RAY_CONFIG(bool, enable_ray_event, false) /// Comma separated list of components we enable grpc metrics collection for. /// Only effective if `enable_metrics_collection` is also true. Will have some performance @@ -758,12 +761,6 @@ RAY_CONFIG(std::string, custom_unit_instance_resources, "neuron_cores,TPU,NPU,HP /// Ray-internal auxiliary tasks (e.g., compiled graph workers). RAY_CONFIG(std::string, system_concurrency_group_name, "_ray_system") -// Maximum size of the batches when broadcasting resources to raylet. -RAY_CONFIG(uint64_t, resource_broadcast_batch_size, 512) - -// Maximum ray sync message batch size in bytes (1MB by default) between nodes. -RAY_CONFIG(uint64_t, max_sync_message_batch_bytes, 1 * 1024 * 1024) - /// ServerCall instance number of each RPC service handler /// /// NOTE: Default value is temporarily pegged at `gcs_server_rpc_server_thread_num * 100` @@ -840,9 +837,15 @@ RAY_CONFIG(std::string, REDIS_SERVER_NAME, "") // it will apply to all methods. RAY_CONFIG(std::string, testing_asio_delay_us, "") -/// To use this, simply do -/// export -/// RAY_testing_rpc_failure="method1=max_num_failures:req_failure_prob:resp_failure_prob,method2=max_num_failures:req_failure_prob:resp_failure_prob" +/// To use this, simply do +/// export +/// RAY_testing_rpc_failure="method1=max_num_failures:req_failure_prob:resp_failure_prob,method2=max_num_failures:req_failure_prob:resp_failure_prob" +/// If you want to test all rpc failures you can use * as the method name and you can set +/// -1 max_num_failures to have unlimited failures. +/// Ex. unlimited failures for all rpc's with 25% request failures and 50% response +/// failures. 
+/// export RAY_testing_rpc_failure="*=-1:25:50" +/// NOTE: Setting the wildcard will override any configuration for other methods. RAY_CONFIG(std::string, testing_rpc_failure, "") /// The following are configs for the health check. They are borrowed @@ -959,3 +962,6 @@ RAY_CONFIG(int32_t, raylet_rpc_server_reconnect_timeout_s, 60) // process getting spawned. Setting to zero or less maintains the default // number of threads grpc will spawn. RAY_CONFIG(int64_t, worker_num_grpc_internal_threads, 0) + +// Whether to start a background thread to manage Python GC in workers. +RAY_CONFIG(bool, start_python_gc_manager_thread, true) diff --git a/src/ray/common/ray_syncer/ray_syncer.cc b/src/ray/common/ray_syncer/ray_syncer.cc index 8fadae0f9957..7991fdcd2c92 100644 --- a/src/ray/common/ray_syncer/ray_syncer.cc +++ b/src/ray/common/ray_syncer/ray_syncer.cc @@ -86,11 +86,11 @@ void RaySyncer::Connect(const std::string &node_id, /* message_processor */ [this](auto msg) { BroadcastMessage(std::move(msg)); }, /* cleanup_cb */ - [this, channel](RaySyncerBidiReactor *reactor, bool restart) { - const std::string &node_id = reactor->GetRemoteNodeID(); - auto iter = sync_reactors_.find(node_id); + [this, channel](RaySyncerBidiReactor *bidi_reactor, bool restart) { + const std::string &remote_node_id = bidi_reactor->GetRemoteNodeID(); + auto iter = sync_reactors_.find(remote_node_id); if (iter != sync_reactors_.end()) { - if (iter->second != reactor) { + if (iter->second != bidi_reactor) { // The client is already reconnected. return; } @@ -99,14 +99,14 @@ void RaySyncer::Connect(const std::string &node_id, if (restart) { execute_after( io_context_, - [this, node_id, channel]() { - RAY_LOG(INFO).WithField(NodeID::FromBinary(node_id)) + [this, remote_node_id, channel]() { + RAY_LOG(INFO).WithField(NodeID::FromBinary(remote_node_id)) << "Connection is broken. 
Reconnect to node."; - Connect(node_id, channel); + Connect(remote_node_id, channel); }, /* delay_microseconds = */ std::chrono::milliseconds(2000)); } else { - node_state_->RemoveNode(node_id); + node_state_->RemoveNode(remote_node_id); } }, /* stub */ std::move(stub)); @@ -124,7 +124,7 @@ void RaySyncer::Connect(RaySyncerBidiReactor *reactor) { boost::asio::dispatch( io_context_.get_executor(), std::packaged_task([this, reactor]() { - auto [_, is_new] = sync_reactors_.emplace(reactor->GetRemoteNodeID(), reactor); + auto is_new = sync_reactors_.emplace(reactor->GetRemoteNodeID(), reactor).second; RAY_CHECK(is_new) << NodeID::FromBinary(reactor->GetRemoteNodeID()) << " has already registered."; // Send the view for new connections. @@ -223,13 +223,13 @@ ServerBidiReactor *RaySyncerService::StartSync(grpc::CallbackServerContext *cont syncer_.GetLocalNodeID(), /*message_processor=*/[this](auto msg) mutable { syncer_.BroadcastMessage(msg); }, /*cleanup_cb=*/ - [this](RaySyncerBidiReactor *reactor, bool reconnect) mutable { + [this](RaySyncerBidiReactor *bidi_reactor, bool reconnect) mutable { // No need to reconnect for server side. RAY_CHECK(!reconnect); - const auto &node_id = reactor->GetRemoteNodeID(); + const auto &node_id = bidi_reactor->GetRemoteNodeID(); auto iter = syncer_.sync_reactors_.find(node_id); if (iter != syncer_.sync_reactors_.end()) { - if (iter->second != reactor) { + if (iter->second != bidi_reactor) { // There is a new connection to the node, no need to clean up. // This can happen when there is transient network error and the client // reconnects. 
The sequence of events are: diff --git a/src/ray/common/scheduling/BUILD.bazel b/src/ray/common/scheduling/BUILD.bazel new file mode 100644 index 000000000000..baac27e1096f --- /dev/null +++ b/src/ray/common/scheduling/BUILD.bazel @@ -0,0 +1,110 @@ +load("//bazel:ray.bzl", "ray_cc_library") + +ray_cc_library( + name = "scheduling_ids", + srcs = ["scheduling_ids.cc"], + hdrs = ["scheduling_ids.h"], + deps = [ + "//src/ray/common:constants", + "//src/ray/common:ray_config", + "//src/ray/util:logging", + "@boost//:algorithm", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", + ], +) + +ray_cc_library( + name = "label_selector", + srcs = ["label_selector.cc"], + hdrs = ["label_selector.h"], + deps = [ + "//src/ray/protobuf:common_cc_proto", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@com_google_protobuf//:protobuf", + ], +) + +ray_cc_library( + name = "fixed_point", + srcs = ["fixed_point.cc"], + hdrs = ["fixed_point.h"], + deps = [ + "//src/ray/common:constants", + ], +) + +ray_cc_library( + name = "placement_group_util", + srcs = ["placement_group_util.cc"], + hdrs = ["placement_group_util.h"], + deps = [ + ":scheduling_ids", + "//src/ray/util:logging", + ], +) + +ray_cc_library( + name = "resource_set", + srcs = ["resource_set.cc"], + hdrs = ["resource_set.h"], + deps = [ + ":fixed_point", + ":scheduling_ids", + "@boost//:range", + "@com_google_absl//absl/container:flat_hash_map", + ], +) + +ray_cc_library( + name = "cluster_resource_data", + srcs = ["cluster_resource_data.cc"], + hdrs = ["cluster_resource_data.h"], + deps = [ + ":fixed_point", + ":label_selector", + ":resource_instance_set", + ":resource_set", + ":scheduling_ids", + "//src/ray/util:logging", + "@boost//:range", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/time", + ], +) + 
+ray_cc_library( + name = "scheduling_class_util", + srcs = ["scheduling_class_util.cc"], + hdrs = ["scheduling_class_util.h"], + deps = [ + ":label_selector", + ":resource_set", + "//src/ray/common:function_descriptor", + "//src/ray/common:runtime_env", + "//src/ray/protobuf:common_cc_proto", + "//src/ray/util:logging", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/synchronization", + "@com_google_protobuf//:protobuf", + ], +) + +ray_cc_library( + name = "resource_instance_set", + srcs = ["resource_instance_set.cc"], + hdrs = ["resource_instance_set.h"], + deps = [ + ":fixed_point", + ":placement_group_util", + ":resource_set", + ":scheduling_ids", + "//src/ray/util:container_util", + "//src/ray/util:logging", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + ], +) diff --git a/src/ray/common/scheduling/cluster_resource_data.cc b/src/ray/common/scheduling/cluster_resource_data.cc index 7555a329879e..4028de3ee4c8 100644 --- a/src/ray/common/scheduling/cluster_resource_data.cc +++ b/src/ray/common/scheduling/cluster_resource_data.cc @@ -17,9 +17,6 @@ #include #include -#include "ray/common/bundle_spec.h" -#include "ray/common/scheduling/resource_set.h" - namespace ray { /// Convert a map of resources to a ResourceRequest data structure. 
@@ -27,7 +24,7 @@ ResourceRequest ResourceMapToResourceRequest( const absl::flat_hash_map &resource_map, bool requires_object_store_memory) { ResourceRequest res({}, requires_object_store_memory); - for (auto entry : resource_map) { + for (const auto &entry : resource_map) { res.Set(ResourceID(entry.first), FixedPoint(entry.second)); } return res; @@ -116,7 +113,7 @@ bool NodeResources::IsFeasible(const ResourceRequest &resource_request) const { bool NodeResources::HasRequiredLabels(const LabelSelector &label_selector) const { // Check if node labels satisfy all label constraints - const auto constraints = label_selector.GetConstraints(); + const auto &constraints = label_selector.GetConstraints(); for (const auto &constraint : constraints) { if (!NodeLabelMatchesConstraint(constraint)) { return false; @@ -175,7 +172,7 @@ std::string NodeResources::DebugString() const { std::string NodeResources::DictString() const { return DebugString(); } -bool NodeResourceInstances::operator==(const NodeResourceInstances &other) { +bool NodeResourceInstances::operator==(const NodeResourceInstances &other) const { return this->total == other.total && this->available == other.available; } diff --git a/src/ray/common/scheduling/cluster_resource_data.h b/src/ray/common/scheduling/cluster_resource_data.h index cb3b4e8028a4..4ed7b77a79b5 100644 --- a/src/ray/common/scheduling/cluster_resource_data.h +++ b/src/ray/common/scheduling/cluster_resource_data.h @@ -15,15 +15,14 @@ #pragma once #include -#include +#include #include #include #include #include #include "absl/container/flat_hash_map.h" -#include "absl/container/flat_hash_set.h" -#include "ray/common/id.h" +#include "absl/time/time.h" #include "ray/common/scheduling/fixed_point.h" #include "ray/common/scheduling/label_selector.h" #include "ray/common/scheduling/resource_instance_set.h" @@ -376,7 +375,7 @@ class NodeResourceInstances { const NodeResourceInstanceSet &GetAvailableResourceInstances() const; const 
NodeResourceInstanceSet &GetTotalResourceInstances() const; /// Returns if this equals another node resources. - bool operator==(const NodeResourceInstances &other); + bool operator==(const NodeResourceInstances &other) const; /// Returns human-readable string for these resources. [[nodiscard]] std::string DebugString() const; }; diff --git a/src/ray/common/scheduling/label_selector.cc b/src/ray/common/scheduling/label_selector.cc index 4b27d25955c3..a2255549d080 100644 --- a/src/ray/common/scheduling/label_selector.cc +++ b/src/ray/common/scheduling/label_selector.cc @@ -18,7 +18,6 @@ #include #include "absl/strings/match.h" -#include "ray/util/logging.h" namespace ray { diff --git a/src/ray/common/scheduling/placement_group_util.cc b/src/ray/common/scheduling/placement_group_util.cc new file mode 100644 index 000000000000..7fab5a56efff --- /dev/null +++ b/src/ray/common/scheduling/placement_group_util.cc @@ -0,0 +1,77 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/common/scheduling/placement_group_util.h" + +#include +#include + +#include "ray/util/logging.h" + +namespace ray { + +bool IsCPUOrPlacementGroupCPUResource(ResourceID resource_id) { + // Check whether the resource is CPU resource or CPU resource inside PG. 
+ if (resource_id == ResourceID::CPU()) { + return true; + } + + auto possible_pg_resource = ParsePgFormattedResource(resource_id.Binary(), + /*for_wildcard_resource*/ true, + /*for_indexed_resource*/ true); + if (possible_pg_resource.has_value() && + possible_pg_resource->original_resource == ResourceID::CPU().Binary()) { + return true; + } + + return false; +} + +std::optional ParsePgFormattedResource( + const std::string &resource, bool for_wildcard_resource, bool for_indexed_resource) { + // Check if it is a wildcard pg resource. + PgFormattedResourceData data; + std::smatch match_groups; + RAY_CHECK(for_wildcard_resource || for_indexed_resource) + << "Either one of for_wildcard_resource or for_indexed_resource must be true"; + + if (for_wildcard_resource) { + static const std::regex wild_card_resource_pattern("^(.*)_group_([0-9a-f]+)$"); + + if (std::regex_match(resource, match_groups, wild_card_resource_pattern) && + match_groups.size() == 3) { + data.original_resource = match_groups[1].str(); + data.bundle_index = -1; + data.group_id = match_groups[2].str(); + return data; + } + } + + // Check if it is a regular pg resource. + if (for_indexed_resource) { + static const std::regex pg_resource_pattern("^(.+)_group_(\\d+)_([0-9a-zA-Z]+)"); + if (std::regex_match(resource, match_groups, pg_resource_pattern) && + match_groups.size() == 4) { + data.original_resource = match_groups[1].str(); + data.bundle_index = stoi(match_groups[2].str()); + data.group_id = match_groups[3].str(); + return data; + } + } + + // If it is not a wildcard or pg formatted resource, return nullopt. + return std::nullopt; +} + +} // namespace ray diff --git a/src/ray/common/scheduling/placement_group_util.h b/src/ray/common/scheduling/placement_group_util.h new file mode 100644 index 000000000000..56c2137c5cd9 --- /dev/null +++ b/src/ray/common/scheduling/placement_group_util.h @@ -0,0 +1,50 @@ +// Copyright 2025 The Ray Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "ray/common/scheduling/scheduling_ids.h" + +namespace ray { + +using scheduling::ResourceID; + +struct PgFormattedResourceData { + std::string original_resource; + /// -1 if it is a wildcard resource. + int64_t bundle_index; + std::string group_id; +}; + +/// Return whether the resource specified by the resource_id is a CPU resource +/// or CPU resource inside a placement group. +bool IsCPUOrPlacementGroupCPUResource(ResourceID resource_id); + +/// Parse the given resource and get the pg related information. +/// +/// \param resource name of the resource. +/// \param for_wildcard_resource if true, it parses wildcard pg resources. +/// E.g., [resource]_group_[pg_id] +/// \param for_indexed_resource if true, it parses indexed pg resources. +/// E.g., [resource]_group_[index]_[pg_id] +/// \return nullopt if it is not a pg resource. Otherwise, it returns the +/// struct with pg information parsed from the resource. +/// If a returned bundle index is -1, it means the resource is the wildcard resource. 
+std::optional ParsePgFormattedResource( + const std::string &resource, bool for_wildcard_resource, bool for_indexed_resource); + +} // namespace ray diff --git a/src/ray/common/scheduling/resource_instance_set.cc b/src/ray/common/scheduling/resource_instance_set.cc index 4765dd7c5c5e..e64d9b329aab 100644 --- a/src/ray/common/scheduling/resource_instance_set.cc +++ b/src/ray/common/scheduling/resource_instance_set.cc @@ -20,7 +20,7 @@ #include #include -#include "ray/common/bundle_spec.h" +#include "ray/common/scheduling/placement_group_util.h" #include "ray/util/container_util.h" #include "ray/util/logging.h" @@ -191,8 +191,7 @@ NodeResourceInstanceSet::TryAllocate(const ResourceSet &resource_demands) { if (data) { // Aggregate based on resource type ResourceID original_resource_id{data->original_resource}; - pg_resource_map[original_resource_id].push_back( - std::make_pair(resource_id, data.value())); + pg_resource_map[original_resource_id].emplace_back(resource_id, data.value()); } else { // Directly allocate the resources if the resource is not with a placement group auto allocation = TryAllocate(resource_id, demand); @@ -202,8 +201,8 @@ NodeResourceInstanceSet::TryAllocate(const ResourceSet &resource_demands) { allocations[resource_id] = std::move(*allocation); } else { // Allocation failed. Restore partially allocated resources. 
- for (const auto &[resource_id, allocation] : allocations) { - Free(resource_id, allocation); + for (const auto &[id, allocated] : allocations) { + Free(id, allocated); } return std::nullopt; } diff --git a/src/ray/common/scheduling/resource_instance_set.h b/src/ray/common/scheduling/resource_instance_set.h index 61ad263a59fb..f49b2d01fccf 100644 --- a/src/ray/common/scheduling/resource_instance_set.h +++ b/src/ray/common/scheduling/resource_instance_set.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include diff --git a/src/ray/common/scheduling/resource_set.cc b/src/ray/common/scheduling/resource_set.cc index b33e82e97907..871b6655ff2d 100644 --- a/src/ray/common/scheduling/resource_set.cc +++ b/src/ray/common/scheduling/resource_set.cc @@ -14,14 +14,11 @@ #include "ray/common/scheduling/resource_set.h" -#include #include #include #include #include -#include "ray/util/logging.h" - namespace ray { ResourceSet::ResourceSet( diff --git a/src/ray/common/scheduling/scheduling_class_util.cc b/src/ray/common/scheduling/scheduling_class_util.cc new file mode 100644 index 000000000000..0e1248fe569e --- /dev/null +++ b/src/ray/common/scheduling/scheduling_class_util.cc @@ -0,0 +1,169 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/common/scheduling/scheduling_class_util.h" + +#include +#include +#include + +#include "google/protobuf/util/message_differencer.h" +#include "ray/common/runtime_env_common.h" +#include "ray/util/logging.h" + +namespace ray { + +SchedulingClassDescriptor::SchedulingClassDescriptor( + ResourceSet rs, + LabelSelector ls, + FunctionDescriptor fd, + int64_t d, + rpc::SchedulingStrategy sched_strategy) + : resource_set(std::move(rs)), + label_selector(std::move(ls)), + function_descriptor(std::move(fd)), + depth(d), + scheduling_strategy(std::move(sched_strategy)) {} + +bool operator==(const ray::rpc::SchedulingStrategy &lhs, + const ray::rpc::SchedulingStrategy &rhs) { + if (lhs.scheduling_strategy_case() != rhs.scheduling_strategy_case()) { + return false; + } + + switch (lhs.scheduling_strategy_case()) { + case ray::rpc::SchedulingStrategy::kNodeAffinitySchedulingStrategy: { + return (lhs.node_affinity_scheduling_strategy().node_id() == + rhs.node_affinity_scheduling_strategy().node_id()) && + (lhs.node_affinity_scheduling_strategy().soft() == + rhs.node_affinity_scheduling_strategy().soft()) && + (lhs.node_affinity_scheduling_strategy().spill_on_unavailable() == + rhs.node_affinity_scheduling_strategy().spill_on_unavailable()) && + (lhs.node_affinity_scheduling_strategy().fail_on_unavailable() == + rhs.node_affinity_scheduling_strategy().fail_on_unavailable()); + } + case ray::rpc::SchedulingStrategy::kPlacementGroupSchedulingStrategy: { + return (lhs.placement_group_scheduling_strategy().placement_group_id() == + rhs.placement_group_scheduling_strategy().placement_group_id()) && + (lhs.placement_group_scheduling_strategy().placement_group_bundle_index() == + rhs.placement_group_scheduling_strategy().placement_group_bundle_index()) && + (lhs.placement_group_scheduling_strategy() + .placement_group_capture_child_tasks() == + rhs.placement_group_scheduling_strategy() + .placement_group_capture_child_tasks()); + } + case 
ray::rpc::SchedulingStrategy::kNodeLabelSchedulingStrategy: { + return google::protobuf::util::MessageDifferencer::Equivalent( + lhs.node_label_scheduling_strategy(), rhs.node_label_scheduling_strategy()); + } + default: + return true; + } +} + +// SchedulingClassDescriptor methods +bool SchedulingClassDescriptor::operator==(const SchedulingClassDescriptor &other) const { + return depth == other.depth && resource_set == other.resource_set && + label_selector == other.label_selector && + function_descriptor == other.function_descriptor && + scheduling_strategy == other.scheduling_strategy; +} + +std::string SchedulingClassDescriptor::DebugString() const { + std::stringstream buffer; + buffer << "{" + << "depth=" << depth << " " + << "function_descriptor=" << function_descriptor->ToString() << " " + << "scheduling_strategy=" << scheduling_strategy.DebugString() << " " + << "resource_set=" + << "{"; + for (const auto &pair : resource_set.GetResourceMap()) { + buffer << pair.first << " : " << pair.second << ", "; + } + buffer << "}"; + + buffer << "label_selector={"; + for (const auto &constraint : label_selector.GetConstraints()) { + buffer << constraint.GetLabelKey() << " " + << (constraint.GetOperator() == ray::LabelSelectorOperator::LABEL_IN ? 
"in" + : "!in") + << " ("; + for (const auto &val : constraint.GetLabelValues()) { + buffer << val << ", "; + } + buffer << "), "; + } + buffer << "}}"; + + return buffer.str(); +} + +std::string SchedulingClassDescriptor::ResourceSetStr() const { + std::stringstream buffer; + buffer << "{"; + for (const auto &pair : resource_set.GetResourceMap()) { + buffer << pair.first << " : " << pair.second << ", "; + } + buffer << "}"; + return buffer.str(); +} + +// Static member definitions +absl::Mutex SchedulingClassToIds::mutex_; +absl::flat_hash_map + SchedulingClassToIds::sched_cls_to_id_; +absl::flat_hash_map + SchedulingClassToIds::sched_id_to_cls_; +int SchedulingClassToIds::next_sched_id_; + +SchedulingClassDescriptor &SchedulingClassToIds::GetSchedulingClassDescriptor( + SchedulingClass id) { + absl::MutexLock lock(&mutex_); + auto it = sched_id_to_cls_.find(id); + RAY_CHECK(it != sched_id_to_cls_.end()) << "invalid id: " << id; + return it->second; +} + +SchedulingClass SchedulingClassToIds::GetSchedulingClass( + const SchedulingClassDescriptor &sched_cls) { + SchedulingClass sched_cls_id = 0; + absl::MutexLock lock(&mutex_); + auto it = sched_cls_to_id_.find(sched_cls); + if (it == sched_cls_to_id_.end()) { + sched_cls_id = ++next_sched_id_; + // TODO(ekl) we might want to try cleaning up task types in these cases + if (sched_cls_id > 100) { + RAY_LOG_EVERY_MS(WARNING, 1000) + << "More than " << sched_cls_id + << " types of tasks seen, this may reduce performance."; + } + sched_cls_to_id_[sched_cls] = sched_cls_id; + sched_id_to_cls_.emplace(sched_cls_id, sched_cls); + } else { + sched_cls_id = it->second; + } + return sched_cls_id; +} + +int CalculateRuntimeEnvHash(const std::string &serialized_runtime_env) { + if (IsRuntimeEnvEmpty(serialized_runtime_env)) { + // It's useful to have the same predetermined value for both unspecified and empty + // runtime envs. 
+ return 0; + } + size_t hash = std::hash()(serialized_runtime_env); + return static_cast(hash); +} + +} // namespace ray diff --git a/src/ray/common/scheduling/scheduling_class_util.h b/src/ray/common/scheduling/scheduling_class_util.h new file mode 100644 index 000000000000..37b56736667d --- /dev/null +++ b/src/ray/common/scheduling/scheduling_class_util.h @@ -0,0 +1,170 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/synchronization/mutex.h" +#include "ray/common/function_descriptor.h" +#include "ray/common/scheduling/label_selector.h" +#include "ray/common/scheduling/resource_set.h" +#include "src/ray/protobuf/common.pb.h" + +namespace ray { + +bool operator==(const ray::rpc::SchedulingStrategy &lhs, + const ray::rpc::SchedulingStrategy &rhs); + +struct SchedulingClassDescriptor { + public: + explicit SchedulingClassDescriptor(ResourceSet rs, + LabelSelector ls, + FunctionDescriptor fd, + int64_t d, + rpc::SchedulingStrategy sched_strategy); + ResourceSet resource_set; + LabelSelector label_selector; + FunctionDescriptor function_descriptor; + int64_t depth; + rpc::SchedulingStrategy scheduling_strategy; + + bool operator==(const SchedulingClassDescriptor &other) const; + std::string DebugString() const; + std::string ResourceSetStr() const; +}; + +template +H AbslHashValue(H h, const SchedulingClassDescriptor 
&sched_cls) { + return H::combine(std::move(h), + sched_cls.resource_set, + sched_cls.function_descriptor->Hash(), + sched_cls.depth, + sched_cls.scheduling_strategy, + sched_cls.label_selector); +} + +using SchedulingClass = int; + +struct SchedulingClassToIds { + /// Below static fields could be mutated in `ComputeResources` concurrently due to + /// multi-threading, we need a mutex to protect it. + static absl::Mutex mutex_; + /// Keep global static id mappings for SchedulingClass for performance. + static absl::flat_hash_map sched_cls_to_id_ + ABSL_GUARDED_BY(mutex_); + static absl::flat_hash_map sched_id_to_cls_ + ABSL_GUARDED_BY(mutex_); + static int next_sched_id_ ABSL_GUARDED_BY(mutex_); + + /// Gets the scheduling class descriptor for the given id. + static SchedulingClassDescriptor &GetSchedulingClassDescriptor(SchedulingClass id); + + /// Gets or creates a scheduling class id for the given descriptor. + static SchedulingClass GetSchedulingClass(const SchedulingClassDescriptor &sched_cls); +}; + +// Get a Hash for the runtime environment string. +// "" and "{}" have the same hash. +// Other than that, only compare literal strings. i.e. '{"a": 1, "b": 2}' and '{"b": 2, +// "a": 1}' have different hashes. 
+int CalculateRuntimeEnvHash(const std::string &serialized_runtime_env); +} // namespace ray + +// Template specializations for std::hash +namespace std { + +template <> +struct hash { + size_t operator()(const ray::rpc::LabelOperator &label_operator) const { + size_t hash_value = std::hash()(label_operator.label_operator_case()); + if (label_operator.has_label_in()) { + for (const auto &value : label_operator.label_in().values()) { + hash_value ^= std::hash()(value); + } + } else if (label_operator.has_label_not_in()) { + for (const auto &value : label_operator.label_not_in().values()) { + hash_value ^= std::hash()(value); + } + } + return hash_value; + } +}; + +template <> +struct hash { + size_t operator()(const ray::rpc::LabelMatchExpression &expression) const { + size_t hash_val = std::hash()(expression.key()); + hash_val ^= std::hash()(expression.operator_()); + return hash_val; + } +}; + +template <> +struct hash { + size_t operator()(const ray::rpc::LabelMatchExpressions &expressions) const { + size_t hash_val = 0; + for (const auto &expression : expressions.expressions()) { + hash_val ^= std::hash()(expression); + } + return hash_val; + } +}; + +template <> +struct hash { + size_t operator()(const ray::rpc::SchedulingStrategy &scheduling_strategy) const { + size_t hash_val = std::hash()(scheduling_strategy.scheduling_strategy_case()); + if (scheduling_strategy.scheduling_strategy_case() == + ray::rpc::SchedulingStrategy::kNodeAffinitySchedulingStrategy) { + hash_val ^= std::hash()( + scheduling_strategy.node_affinity_scheduling_strategy().node_id()); + // soft returns a bool + hash_val ^= static_cast( + scheduling_strategy.node_affinity_scheduling_strategy().soft()); + hash_val ^= static_cast( + scheduling_strategy.node_affinity_scheduling_strategy().spill_on_unavailable()); + hash_val ^= static_cast( + scheduling_strategy.node_affinity_scheduling_strategy().fail_on_unavailable()); + } else if (scheduling_strategy.scheduling_strategy_case() == + 
ray::rpc::SchedulingStrategy::kPlacementGroupSchedulingStrategy) { + hash_val ^= std::hash()( + scheduling_strategy.placement_group_scheduling_strategy().placement_group_id()); + hash_val ^= scheduling_strategy.placement_group_scheduling_strategy() + .placement_group_bundle_index(); + // placement_group_capture_child_tasks returns a bool + hash_val ^= + static_cast(scheduling_strategy.placement_group_scheduling_strategy() + .placement_group_capture_child_tasks()); + } else if (scheduling_strategy.has_node_label_scheduling_strategy()) { + if (scheduling_strategy.node_label_scheduling_strategy().hard().expressions_size() > + 0) { + hash_val ^= std::hash()("hard"); + hash_val ^= std::hash()( + scheduling_strategy.node_label_scheduling_strategy().hard()); + } + if (scheduling_strategy.node_label_scheduling_strategy().soft().expressions_size() > + 0) { + hash_val ^= std::hash()("soft"); + hash_val ^= std::hash()( + scheduling_strategy.node_label_scheduling_strategy().soft()); + } + } + return hash_val; + } +}; + +} // namespace std diff --git a/src/ray/common/scheduling/scheduling_ids.cc b/src/ray/common/scheduling/scheduling_ids.cc index d1d128c82f02..87dbb86abdc5 100644 --- a/src/ray/common/scheduling/scheduling_ids.cc +++ b/src/ray/common/scheduling/scheduling_ids.cc @@ -14,9 +14,13 @@ #include "ray/common/scheduling/scheduling_ids.h" +#include #include #include +#include "ray/common/ray_config.h" +#include "ray/util/logging.h" + namespace ray { int64_t StringIdMap::Get(const std::string &string_id) const { diff --git a/src/ray/common/scheduling/scheduling_ids.h b/src/ray/common/scheduling/scheduling_ids.h index 1e8ead6ba118..ce97202130cc 100644 --- a/src/ray/common/scheduling/scheduling_ids.h +++ b/src/ray/common/scheduling/scheduling_ids.h @@ -14,18 +14,15 @@ #pragma once -#include #include #include #include #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" +#include "absl/strings/match.h" #include "absl/synchronization/mutex.h" 
#include "ray/common/constants.h" -#include "ray/common/ray_config.h" -#include "ray/util/logging.h" -#include "ray/util/util.h" namespace ray { @@ -143,15 +140,15 @@ inline std::ostream &operator<<( /// the singleton map with PredefinedResources. template <> inline StringIdMap &BaseSchedulingID::GetMap() { - static std::unique_ptr map{[]() { - std::unique_ptr map(new StringIdMap()); - map->InsertOrDie(kCPU_ResourceLabel, CPU) + static std::unique_ptr singleton_map{[]() { + std::unique_ptr map_ptr(new StringIdMap()); + map_ptr->InsertOrDie(kCPU_ResourceLabel, CPU) .InsertOrDie(kGPU_ResourceLabel, GPU) .InsertOrDie(kObjectStoreMemory_ResourceLabel, OBJECT_STORE_MEM) .InsertOrDie(kMemory_ResourceLabel, MEM); - return map; + return map_ptr; }()}; - return *map; + return *singleton_map; } namespace scheduling { diff --git a/src/ray/common/scheduling/tests/BUILD.bazel b/src/ray/common/scheduling/tests/BUILD.bazel new file mode 100644 index 000000000000..2833b8227f76 --- /dev/null +++ b/src/ray/common/scheduling/tests/BUILD.bazel @@ -0,0 +1,67 @@ +load("//bazel:ray.bzl", "ray_cc_test") + +ray_cc_test( + name = "resource_request_test", + size = "small", + srcs = [ + "resource_request_test.cc", + ], + tags = ["team:core"], + deps = [ + "//src/ray/common/scheduling:cluster_resource_data", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "resource_set_test", + size = "small", + srcs = [ + "resource_set_test.cc", + ], + tags = ["team:core"], + deps = [ + "//src/ray/common/scheduling:resource_set", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "resource_instance_set_test", + size = "small", + srcs = [ + "resource_instance_set_test.cc", + ], + tags = ["team:core"], + deps = [ + "//src/ray/common/scheduling:resource_instance_set", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "scheduling_ids_test", + size = "small", + srcs = [ + "scheduling_ids_test.cc", + ], + tags = ["team:core"], + deps = [ + 
"//src/ray/common:ray_config", + "//src/ray/common/scheduling:scheduling_ids", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "label_selector_test", + size = "small", + srcs = [ + "label_selector_test.cc", + ], + tags = ["team:core"], + deps = [ + "//src/ray/common/scheduling:label_selector", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/src/ray/common/test/label_selector_test.cc b/src/ray/common/scheduling/tests/label_selector_test.cc similarity index 99% rename from src/ray/common/test/label_selector_test.cc rename to src/ray/common/scheduling/tests/label_selector_test.cc index f30ad26caa05..89c1fe20aaea 100644 --- a/src/ray/common/test/label_selector_test.cc +++ b/src/ray/common/scheduling/tests/label_selector_test.cc @@ -14,6 +14,8 @@ #include "ray/common/scheduling/label_selector.h" +#include + #include "gtest/gtest.h" namespace ray { diff --git a/src/ray/common/test/resource_instance_set_test.cc b/src/ray/common/scheduling/tests/resource_instance_set_test.cc similarity index 99% rename from src/ray/common/test/resource_instance_set_test.cc rename to src/ray/common/scheduling/tests/resource_instance_set_test.cc index ba969f54509c..b5745caabf60 100644 --- a/src/ray/common/test/resource_instance_set_test.cc +++ b/src/ray/common/scheduling/tests/resource_instance_set_test.cc @@ -14,6 +14,9 @@ #include "ray/common/scheduling/resource_instance_set.h" +#include +#include + #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "gtest/gtest.h" diff --git a/src/ray/common/test/resource_request_test.cc b/src/ray/common/scheduling/tests/resource_request_test.cc similarity index 99% rename from src/ray/common/test/resource_request_test.cc rename to src/ray/common/scheduling/tests/resource_request_test.cc index 6b58e63e2757..50d9b14223ef 100644 --- a/src/ray/common/test/resource_request_test.cc +++ b/src/ray/common/scheduling/tests/resource_request_test.cc @@ -12,6 +12,8 @@ // See the License 
for the specific language governing permissions and // limitations under the License. +#include + #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "gtest/gtest.h" diff --git a/src/ray/common/test/resource_set_test.cc b/src/ray/common/scheduling/tests/resource_set_test.cc similarity index 99% rename from src/ray/common/test/resource_set_test.cc rename to src/ray/common/scheduling/tests/resource_set_test.cc index 00ae8343853f..5eb5ae1eb822 100644 --- a/src/ray/common/test/resource_set_test.cc +++ b/src/ray/common/scheduling/tests/resource_set_test.cc @@ -14,6 +14,9 @@ #include "ray/common/scheduling/resource_set.h" +#include +#include + #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "gtest/gtest.h" diff --git a/src/ray/common/scheduling/tests/scheduling_ids_test.cc b/src/ray/common/scheduling/tests/scheduling_ids_test.cc new file mode 100644 index 000000000000..eabd09d3fe54 --- /dev/null +++ b/src/ray/common/scheduling/tests/scheduling_ids_test.cc @@ -0,0 +1,74 @@ +// Copyright 2021 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/common/scheduling/scheduling_ids.h" + +#include +#include + +#include "gtest/gtest.h" +#include "ray/common/ray_config.h" + +namespace ray { + +struct SchedulingIDsTest : public ::testing::Test {}; + +TEST_F(SchedulingIDsTest, BasicTest) { + std::vector string_ids = {"hello", "whaaat", "yes"}; + std::vector node_ids; + for (auto &string_id : string_ids) { + node_ids.emplace_back(string_id); + ASSERT_EQ(node_ids.back().Binary(), string_id); + } + ASSERT_EQ(node_ids[0], scheduling::NodeID(string_ids[0])); + ASSERT_EQ(node_ids[0], scheduling::NodeID(node_ids[0].ToInt())); + + ASSERT_TRUE(scheduling::NodeID::Nil().IsNil()); + ASSERT_EQ(scheduling::NodeID::Nil().ToInt(), -1); + ASSERT_EQ(scheduling::NodeID::Nil().Binary(), "-1"); + + ASSERT_EQ(scheduling::NodeID(13), scheduling::NodeID(13)); + ASSERT_NE(scheduling::NodeID(1), scheduling::NodeID(2)); + ASSERT_TRUE(scheduling::NodeID(1) < scheduling::NodeID(2)); +} + +TEST_F(SchedulingIDsTest, PrepopulateResourceIDTest) { + ASSERT_EQ(kCPU_ResourceLabel, scheduling::ResourceID(CPU).Binary()); + ASSERT_EQ(kGPU_ResourceLabel, scheduling::ResourceID(GPU).Binary()); + ASSERT_EQ(kObjectStoreMemory_ResourceLabel, + scheduling::ResourceID(OBJECT_STORE_MEM).Binary()); + ASSERT_EQ(kMemory_ResourceLabel, scheduling::ResourceID(MEM).Binary()); + + // mean while NodeID is not populated. 
+ ASSERT_NE(kCPU_ResourceLabel, scheduling::NodeID(CPU).Binary()); +} + +TEST_F(SchedulingIDsTest, UnitInstanceResourceTest) { + RayConfig::instance().initialize( + R"( +{ + "predefined_unit_instance_resources": "CPU,GPU", + "custom_unit_instance_resources": "neuron_cores,TPU,custom1" +} + )"); + ASSERT_TRUE(scheduling::ResourceID::CPU().IsUnitInstanceResource()); + ASSERT_TRUE(scheduling::ResourceID::GPU().IsUnitInstanceResource()); + ASSERT_TRUE(scheduling::ResourceID("custom1").IsUnitInstanceResource()); + ASSERT_TRUE(scheduling::ResourceID("neuron_cores").IsUnitInstanceResource()); + ASSERT_TRUE(scheduling::ResourceID("TPU").IsUnitInstanceResource()); + + ASSERT_FALSE(scheduling::ResourceID::Memory().IsUnitInstanceResource()); + ASSERT_FALSE(scheduling::ResourceID("custom2").IsUnitInstanceResource()); +} +} // namespace ray diff --git a/src/ray/common/status.h b/src/ray/common/status.h index 3b9ba8a6cc43..58c83f3abb8f 100644 --- a/src/ray/common/status.h +++ b/src/ray/common/status.h @@ -33,7 +33,9 @@ #include #include "absl/strings/str_cat.h" +#include "ray/common/macros.h" #include "ray/common/source_location.h" +#include "ray/util/logging.h" #include "ray/util/macros.h" #include "ray/util/visibility.h" @@ -52,10 +54,10 @@ class error_code; // If the status is not OK, CHECK-fail immediately, appending the status to the // logged message. The message can be appended with <<. 
-#define RAY_CHECK_OK(s) \ - if (const ::ray::Status &_status_ = (s); true) \ - RAY_CHECK_WITH_DISPLAY(_status_.ok(), #s) \ - << "Status not OK: " << _status_.ToString() << " " +#define RAY_CHECK_OK(s) \ + if (const ::ray::Status & RAY_UNIQUE_VARIABLE(_s) = (s); true) \ + RAY_CHECK_WITH_DISPLAY(RAY_UNIQUE_VARIABLE(_s).ok(), #s) \ + << "Status not OK: " << RAY_UNIQUE_VARIABLE(_s).ToString() << " " namespace ray { diff --git a/src/ray/common/status_or.h b/src/ray/common/status_or.h index 2a94bb4bec99..12c7ed8f7b44 100644 --- a/src/ray/common/status_or.h +++ b/src/ray/common/status_or.h @@ -155,6 +155,7 @@ class StatusOr { bool IsNotFound() const { return code() == StatusCode::NotFound; } bool IsInvalidArgument() const { return code() == StatusCode::InvalidArgument; } + bool IsInvalid() const { return code() == StatusCode::Invalid; } bool IsPermissionDenied() const { return code() == StatusCode::PermissionDenied; } // Returns a reference to the current `ray::Status` contained within the diff --git a/src/ray/common/task/task.cc b/src/ray/common/task/task.cc deleted file mode 100644 index 8be3a423c1b5..000000000000 --- a/src/ray/common/task/task.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2019-2020 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "ray/common/task/task.h" - -#include "absl/strings/str_format.h" - -namespace ray { - -RayTask::RayTask(rpc::TaskSpec task_spec) : task_spec_(std::move(task_spec)) { - ComputeDependencies(); -} - -RayTask::RayTask(rpc::Task message) - : task_spec_(std::move(*message.mutable_task_spec())) { - ComputeDependencies(); -} - -RayTask::RayTask(TaskSpecification task_spec) : task_spec_(std::move(task_spec)) { - ComputeDependencies(); -} - -RayTask::RayTask(TaskSpecification task_spec, std::string preferred_node_id) - : task_spec_(std::move(task_spec)), preferred_node_id_(std::move(preferred_node_id)) { - ComputeDependencies(); -} - -const TaskSpecification &RayTask::GetTaskSpecification() const { return task_spec_; } - -const std::vector &RayTask::GetDependencies() const { - return dependencies_; -} - -const std::string &RayTask::GetPreferredNodeID() const { return preferred_node_id_; } - -void RayTask::ComputeDependencies() { dependencies_ = task_spec_.GetDependencies(); } - -std::string RayTask::DebugString() const { - return absl::StrFormat("task_spec={%s}", task_spec_.DebugString()); -} - -} // namespace ray diff --git a/src/ray/common/task/task.h b/src/ray/common/task/task.h deleted file mode 100644 index fa9f4db14b3e..000000000000 --- a/src/ray/common/task/task.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2019-2020 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include - -#include "ray/common/task/task_common.h" -#include "ray/common/task/task_spec.h" - -namespace ray { - -/// \class RayTask -/// -/// A RayTask represents a Ray task and a specification of its execution (e.g., -/// resource demands). The task's specification contains both immutable fields, -/// determined at submission time, and mutable fields, determined at execution -/// time. -class RayTask { - public: - /// Construct an empty task. This should only be used to pass a task - /// as an out parameter to a function or method. - RayTask() = default; - - /// Construct a `RayTask` object from a protobuf message. - explicit RayTask(rpc::TaskSpec task_spec); - - /// Construct a `RayTask` object from a protobuf message. - /// - /// \param message The protobuf message. - explicit RayTask(rpc::Task message); - - /// Construct a `RayTask` object from a `TaskSpecification`. - explicit RayTask(TaskSpecification task_spec); - - RayTask(TaskSpecification task_spec, std::string preferred_node_id); - - /// Get the immutable specification for the task. - /// - /// \return The immutable specification for the task. - const TaskSpecification &GetTaskSpecification() const; - - /// Get the task's object dependencies. This comprises the immutable task - /// arguments and the mutable execution dependencies. - /// - /// \return The object dependencies. - const std::vector &GetDependencies() const; - - /// Get the task's preferred node id for scheduling. If the returned value - /// is empty, then it means the task has no preferred node. - /// - /// \return The preferred node id. - const std::string &GetPreferredNodeID() const; - - std::string DebugString() const; - - private: - void ComputeDependencies(); - - /// RayTask specification object, consisting of immutable information about this - /// task determined at submission time. Includes resource demand, object - /// dependencies, etc. 
- TaskSpecification task_spec_; - /// A cached copy of the task's object dependencies, including arguments from - /// the TaskSpecification. - std::vector dependencies_; - - std::string preferred_node_id_; -}; - -} // namespace ray diff --git a/src/ray/common/task/task_spec.cc b/src/ray/common/task/task_spec.cc index 1101d0f7c953..94c1199a7d27 100644 --- a/src/ray/common/task/task_spec.cc +++ b/src/ray/common/task/task_spec.cc @@ -15,7 +15,11 @@ #include "ray/common/task/task_spec.h" #include +#include #include +#include +#include +#include #include "ray/common/ray_config.h" #include "ray/common/runtime_env_common.h" @@ -24,42 +28,6 @@ namespace ray { -absl::Mutex TaskSpecification::mutex_; -absl::flat_hash_map - TaskSpecification::sched_cls_to_id_; -absl::flat_hash_map - TaskSpecification::sched_id_to_cls_; -int TaskSpecification::next_sched_id_; - -SchedulingClassDescriptor &TaskSpecification::GetSchedulingClassDescriptor( - SchedulingClass id) { - absl::MutexLock lock(&mutex_); - auto it = sched_id_to_cls_.find(id); - RAY_CHECK(it != sched_id_to_cls_.end()) << "invalid id: " << id; - return it->second; -} - -SchedulingClass TaskSpecification::GetSchedulingClass( - const SchedulingClassDescriptor &sched_cls) { - SchedulingClass sched_cls_id; - absl::MutexLock lock(&mutex_); - auto it = sched_cls_to_id_.find(sched_cls); - if (it == sched_cls_to_id_.end()) { - sched_cls_id = ++next_sched_id_; - // TODO(ekl) we might want to try cleaning up task types in these cases - if (sched_cls_id > 100) { - RAY_LOG_EVERY_MS(WARNING, 1000) - << "More than " << sched_cls_id - << " types of tasks seen, this may reduce performance."; - } - sched_cls_to_id_[sched_cls] = sched_cls_id; - sched_id_to_cls_.emplace(sched_cls_id, sched_cls); - } else { - sched_cls_id = it->second; - } - return sched_cls_id; -} - const BundleID TaskSpecification::PlacementGroupBundleId() const { if (message_->scheduling_strategy().scheduling_strategy_case() == 
rpc::SchedulingStrategy::SchedulingStrategyCase:: @@ -95,7 +63,8 @@ void TaskSpecification::ComputeResources() { // A static nil object is used here to avoid allocating the empty object every time. required_resources_ = ResourceSet::Nil(); } else { - required_resources_.reset(new ResourceSet(MapFromProtobuf(required_resources))); + required_resources_ = + std::make_shared(MapFromProtobuf(required_resources)); } auto &required_placement_resources = message_->required_placement_resources().empty() @@ -105,8 +74,8 @@ void TaskSpecification::ComputeResources() { if (required_placement_resources.empty()) { required_placement_resources_ = ResourceSet::Nil(); } else { - required_placement_resources_.reset( - new ResourceSet(MapFromProtobuf(required_placement_resources))); + required_placement_resources_ = + std::make_shared(MapFromProtobuf(required_placement_resources)); } // Set LabelSelector required for scheduling if specified. Parses string map @@ -132,7 +101,7 @@ void TaskSpecification::ComputeResources() { depth, GetSchedulingStrategy()); // Map the scheduling class descriptor to an integer for performance. 
- sched_cls_id_ = GetSchedulingClass(sched_cls_desc); + sched_cls_id_ = SchedulingClassToIds::GetSchedulingClass(sched_cls_desc); } runtime_env_hash_ = CalculateRuntimeEnvHash(SerializedRuntimeEnv()); @@ -162,12 +131,7 @@ const std::string TaskSpecification::GetSerializedActorHandle() const { return message_->actor_creation_task_spec().serialized_actor_handle(); } -JobID TaskSpecification::JobId() const { - if (message_->job_id().empty() /* e.g., empty proto default */) { - return JobID::Nil(); - } - return JobID::FromBinary(message_->job_id()); -} +JobID TaskSpecification::JobId() const { return JobID::FromBinary(message_->job_id()); } const rpc::JobConfig &TaskSpecification::JobConfig() const { return message_->job_config(); @@ -491,7 +455,7 @@ std::string TaskSpecification::CallerWorkerIdBinary() const { } NodeID TaskSpecification::CallerNodeId() const { - return NodeID::FromBinary(message_->caller_address().raylet_id()); + return NodeID::FromBinary(message_->caller_address().node_id()); } // === Below are getter methods specific to actor tasks. @@ -666,16 +630,6 @@ std::string TaskSpecification::CallSiteString() const { return stream.str(); } -int CalculateRuntimeEnvHash(const std::string &serialized_runtime_env) { - if (IsRuntimeEnvEmpty(serialized_runtime_env)) { - // It's useful to have the same predetermined value for both unspecified and empty - // runtime envs. 
- return 0; - } - size_t hash = std::hash()(serialized_runtime_env); - return static_cast(hash); -} - std::vector TaskSpecification::ConcurrencyGroups() const { RAY_CHECK(IsActorCreationTask()); std::vector concurrency_groups; @@ -692,10 +646,10 @@ std::vector TaskSpecification::ConcurrencyGroups() const { curr_group_message.function_descriptors(j))); } - concurrency_groups.push_back( - {std::string{curr_group_message.name()}, - static_cast(curr_group_message.max_concurrency()), - function_descriptors}); + concurrency_groups.emplace_back( + std::string{curr_group_message.name()}, + static_cast(curr_group_message.max_concurrency()), + function_descriptors); } return concurrency_groups; diff --git a/src/ray/common/task/task_spec.h b/src/ray/common/task/task_spec.h index dca4db743701..3ead82c55128 100644 --- a/src/ray/common/task/task_spec.h +++ b/src/ray/common/task/task_spec.h @@ -17,246 +17,50 @@ #include #include +#include #include -#include #include #include -#include "absl/hash/hash.h" -#include "absl/synchronization/mutex.h" #include "ray/common/function_descriptor.h" #include "ray/common/grpc_util.h" #include "ray/common/id.h" #include "ray/common/scheduling/label_selector.h" #include "ray/common/scheduling/resource_set.h" +#include "ray/common/scheduling/scheduling_class_util.h" #include "ray/common/task/task_common.h" extern "C" { #include "ray/thirdparty/sha256.h" } -namespace ray { -inline bool operator==(const ray::rpc::SchedulingStrategy &lhs, - const ray::rpc::SchedulingStrategy &rhs) { - if (lhs.scheduling_strategy_case() != rhs.scheduling_strategy_case()) { - return false; - } - - switch (lhs.scheduling_strategy_case()) { - case ray::rpc::SchedulingStrategy::kNodeAffinitySchedulingStrategy: { - return (lhs.node_affinity_scheduling_strategy().node_id() == - rhs.node_affinity_scheduling_strategy().node_id()) && - (lhs.node_affinity_scheduling_strategy().soft() == - rhs.node_affinity_scheduling_strategy().soft()) && - 
(lhs.node_affinity_scheduling_strategy().spill_on_unavailable() == - rhs.node_affinity_scheduling_strategy().spill_on_unavailable()) && - (lhs.node_affinity_scheduling_strategy().fail_on_unavailable() == - rhs.node_affinity_scheduling_strategy().fail_on_unavailable()); - } - case ray::rpc::SchedulingStrategy::kPlacementGroupSchedulingStrategy: { - return (lhs.placement_group_scheduling_strategy().placement_group_id() == - rhs.placement_group_scheduling_strategy().placement_group_id()) && - (lhs.placement_group_scheduling_strategy().placement_group_bundle_index() == - rhs.placement_group_scheduling_strategy().placement_group_bundle_index()) && - (lhs.placement_group_scheduling_strategy() - .placement_group_capture_child_tasks() == - rhs.placement_group_scheduling_strategy() - .placement_group_capture_child_tasks()); - } - case ray::rpc::SchedulingStrategy::kNodeLabelSchedulingStrategy: { - return google::protobuf::util::MessageDifferencer::Equivalent( - lhs.node_label_scheduling_strategy(), rhs.node_label_scheduling_strategy()); - } - default: - return true; - } -} - -typedef int SchedulingClass; - -struct SchedulingClassDescriptor { - public: - explicit SchedulingClassDescriptor(ResourceSet rs, - LabelSelector ls, - FunctionDescriptor fd, - int64_t d, - rpc::SchedulingStrategy scheduling_strategy) - : resource_set(std::move(rs)), - label_selector(std::move(ls)), - function_descriptor(std::move(fd)), - depth(d), - scheduling_strategy(std::move(scheduling_strategy)) {} - ResourceSet resource_set; - LabelSelector label_selector; - FunctionDescriptor function_descriptor; - int64_t depth; - rpc::SchedulingStrategy scheduling_strategy; - - bool operator==(const SchedulingClassDescriptor &other) const { - return depth == other.depth && resource_set == other.resource_set && - label_selector == other.label_selector && - function_descriptor == other.function_descriptor && - scheduling_strategy == other.scheduling_strategy; - } - - std::string DebugString() const { - 
std::stringstream buffer; - buffer << "{" - << "depth=" << depth << " " - << "function_descriptor=" << function_descriptor->ToString() << " " - << "scheduling_strategy=" << scheduling_strategy.DebugString() << " " - << "resource_set=" - << "{"; - for (const auto &pair : resource_set.GetResourceMap()) { - buffer << pair.first << " : " << pair.second << ", "; - } - buffer << "}"; - - buffer << "label_selector={"; - for (const auto &constraint : label_selector.GetConstraints()) { - buffer << constraint.GetLabelKey() << " " - << (constraint.GetOperator() == ray::LabelSelectorOperator::LABEL_IN ? "in" - : "!in") - << " ("; - for (const auto &val : constraint.GetLabelValues()) { - buffer << val << ", "; - } - buffer << "), "; - } - buffer << "}}"; - - return buffer.str(); - } - - std::string ResourceSetStr() const { - std::stringstream buffer; - buffer << "{"; - for (const auto &pair : resource_set.GetResourceMap()) { - buffer << pair.first << " : " << pair.second << ", "; - } - buffer << "}"; - return buffer.str(); - } -}; - -template -H AbslHashValue(H h, const SchedulingClassDescriptor &sched_cls) { - return H::combine(std::move(h), - sched_cls.resource_set, - sched_cls.function_descriptor->Hash(), - sched_cls.depth, - sched_cls.scheduling_strategy, - sched_cls.label_selector); -} -} // namespace ray - -namespace std { -template <> -struct hash { - size_t operator()(const ray::rpc::LabelOperator &label_operator) const { - size_t hash = std::hash()(label_operator.label_operator_case()); - if (label_operator.has_label_in()) { - for (const auto &value : label_operator.label_in().values()) { - hash ^= std::hash()(value); - } - } else if (label_operator.has_label_not_in()) { - for (const auto &value : label_operator.label_not_in().values()) { - hash ^= std::hash()(value); - } - } - return hash; - } -}; - -template <> -struct hash { - size_t operator()(const ray::rpc::LabelMatchExpression &expression) const { - size_t hash_val = std::hash()(expression.key()); - hash_val ^= 
std::hash()(expression.operator_()); - return hash_val; - } -}; - -template <> -struct hash { - size_t operator()(const ray::rpc::LabelMatchExpressions &expressions) const { - size_t hash_val = 0; - for (const auto &expression : expressions.expressions()) { - hash_val ^= std::hash()(expression); - } - return hash_val; - } -}; - -template <> -struct hash { - size_t operator()(const ray::rpc::SchedulingStrategy &scheduling_strategy) const { - size_t hash_val = std::hash()(scheduling_strategy.scheduling_strategy_case()); - if (scheduling_strategy.scheduling_strategy_case() == - ray::rpc::SchedulingStrategy::kNodeAffinitySchedulingStrategy) { - hash_val ^= std::hash()( - scheduling_strategy.node_affinity_scheduling_strategy().node_id()); - // soft returns a bool - hash_val ^= static_cast( - scheduling_strategy.node_affinity_scheduling_strategy().soft()); - hash_val ^= static_cast( - scheduling_strategy.node_affinity_scheduling_strategy().spill_on_unavailable()); - hash_val ^= static_cast( - scheduling_strategy.node_affinity_scheduling_strategy().fail_on_unavailable()); - } else if (scheduling_strategy.scheduling_strategy_case() == - ray::rpc::SchedulingStrategy::kPlacementGroupSchedulingStrategy) { - hash_val ^= std::hash()( - scheduling_strategy.placement_group_scheduling_strategy().placement_group_id()); - hash_val ^= scheduling_strategy.placement_group_scheduling_strategy() - .placement_group_bundle_index(); - // placement_group_capture_child_tasks returns a bool - hash_val ^= - static_cast(scheduling_strategy.placement_group_scheduling_strategy() - .placement_group_capture_child_tasks()); - } else if (scheduling_strategy.has_node_label_scheduling_strategy()) { - if (scheduling_strategy.node_label_scheduling_strategy().hard().expressions_size() > - 0) { - hash_val ^= std::hash()("hard"); - hash_val ^= std::hash()( - scheduling_strategy.node_label_scheduling_strategy().hard()); - } - if (scheduling_strategy.node_label_scheduling_strategy().soft().expressions_size() > 
- 0) { - hash_val ^= std::hash()("soft"); - hash_val ^= std::hash()( - scheduling_strategy.node_label_scheduling_strategy().soft()); - } - } - return hash_val; - } -}; -} // namespace std - namespace ray { /// ConcurrencyGroup is a group of actor methods that shares /// a executing thread pool. struct ConcurrencyGroup { // Name of this group. - std::string name; + std::string name_; // Max concurrency of this group. - uint32_t max_concurrency; + uint32_t max_concurrency_; // Function descriptors of the actor methods in this group. - std::vector function_descriptors; + std::vector function_descriptors_; ConcurrencyGroup() = default; - ConcurrencyGroup(const std::string &name, + ConcurrencyGroup(std::string name, uint32_t max_concurrency, - const std::vector &fds) - : name(name), max_concurrency(max_concurrency), function_descriptors(fds) {} + std::vector fds) + : name_(std::move(name)), + max_concurrency_(max_concurrency), + function_descriptors_(std::move(fds)) {} - std::string GetName() const { return name; } + std::string GetName() const { return name_; } - uint32_t GetMaxConcurrency() const { return max_concurrency; } + uint32_t GetMaxConcurrency() const { return max_concurrency_; } std::vector GetFunctionDescriptors() const { - return function_descriptors; + return function_descriptors_; } }; @@ -282,12 +86,7 @@ class TaskSpecification : public MessageWrapper { /// The input message will be copied/moved into this object. /// /// \param message The protobuf message. - explicit TaskSpecification(rpc::TaskSpec &&message) - : MessageWrapper(std::move(message)) { - ComputeResources(); - } - - explicit TaskSpecification(const rpc::TaskSpec &message) : MessageWrapper(message) { + explicit TaskSpecification(rpc::TaskSpec message) : MessageWrapper(std::move(message)) { ComputeResources(); } @@ -583,21 +382,6 @@ class TaskSpecification : public MessageWrapper { // Field storing label selector for scheduling Task on a node. 
Initialized in constuctor // in ComputeResources() call. std::shared_ptr label_selector_; - /// Below static fields could be mutated in `ComputeResources` concurrently due to - /// multi-threading, we need a mutex to protect it. - static absl::Mutex mutex_; - /// Keep global static id mappings for SchedulingClass for performance. - static absl::flat_hash_map sched_cls_to_id_ - ABSL_GUARDED_BY(mutex_); - static absl::flat_hash_map sched_id_to_cls_ - ABSL_GUARDED_BY(mutex_); - static int next_sched_id_ ABSL_GUARDED_BY(mutex_); }; -// Get a Hash for the runtime environment string. -// "" and "{}" have the same hash. -// Other than that, only compare literal strings. i.e. '{"a": 1, "b": 2}' and '{"b": 2, -// "a": 1}' have different hashes. -int CalculateRuntimeEnvHash(const std::string &serialized_runtime_env); - } // namespace ray diff --git a/src/ray/common/task/task_util.h b/src/ray/common/task/task_util.h index bf9e35eb0c38..265aed3fb51e 100644 --- a/src/ray/common/task/task_util.h +++ b/src/ray/common/task/task_util.h @@ -14,6 +14,12 @@ #pragma once +#include +#include +#include +#include +#include + #include "ray/common/buffer.h" #include "ray/common/ray_object.h" #include "ray/common/task/task_spec.h" @@ -24,17 +30,17 @@ namespace ray { /// Stores the task failure reason. struct TaskFailureEntry { /// The task failure details. - rpc::RayErrorInfo ray_error_info; + rpc::RayErrorInfo ray_error_info_; /// The creation time of this entry. - std::chrono::steady_clock::time_point creation_time; + std::chrono::steady_clock::time_point creation_time_; /// Whether this task should be retried. 
- bool should_retry; + bool should_retry_; TaskFailureEntry(const rpc::RayErrorInfo &ray_error_info, bool should_retry) - : ray_error_info(ray_error_info), - creation_time(std::chrono::steady_clock::now()), - should_retry(should_retry) {} + : ray_error_info_(ray_error_info), + creation_time_(std::chrono::steady_clock::now()), + should_retry_(should_retry) {} }; /// Argument of a task. @@ -50,16 +56,22 @@ class TaskArgByReference : public TaskArg { /// /// \param[in] object_id Id of the argument. /// \return The task argument. - TaskArgByReference(const ObjectID &object_id, - const rpc::Address &owner_address, - const std::string &call_site) - : id_(object_id), owner_address_(owner_address), call_site_(call_site) {} + TaskArgByReference( + const ObjectID &object_id, + const rpc::Address &owner_address, + const std::string &call_site, + const rpc::TensorTransport &tensor_transport = rpc::TensorTransport::OBJECT_STORE) + : id_(object_id), + owner_address_(owner_address), + call_site_(call_site), + tensor_transport_(tensor_transport) {} void ToProto(rpc::TaskArg *arg_proto) const { auto ref = arg_proto->mutable_object_ref(); ref->set_object_id(id_.Binary()); ref->mutable_owner_address()->CopyFrom(owner_address_); ref->set_call_site(call_site_); + ref->set_tensor_transport(tensor_transport_); } private: @@ -67,6 +79,7 @@ class TaskArgByReference : public TaskArg { const ObjectID id_; const rpc::Address owner_address_; const std::string call_site_; + const rpc::TensorTransport tensor_transport_; }; class TaskArgByValue : public TaskArg { @@ -265,10 +278,10 @@ class TaskSpecBuilder { actor_creation_spec->set_serialized_actor_handle(serialized_actor_handle); for (const auto &concurrency_group : concurrency_groups) { rpc::ConcurrencyGroup *group = actor_creation_spec->add_concurrency_groups(); - group->set_name(concurrency_group.name); - group->set_max_concurrency(concurrency_group.max_concurrency); + group->set_name(concurrency_group.name_); + 
group->set_max_concurrency(concurrency_group.max_concurrency_); // Fill into function descriptor. - for (auto &item : concurrency_group.function_descriptors) { + for (auto &item : concurrency_group.function_descriptors_) { rpc::FunctionDescriptor *fd = group->add_function_descriptors(); *fd = item->GetMessage(); } diff --git a/src/ray/common/test/scheduling_ids_test.cc b/src/ray/common/test/scheduling_ids_test.cc deleted file mode 100644 index f06a5cd10544..000000000000 --- a/src/ray/common/test/scheduling_ids_test.cc +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2021 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "ray/common/scheduling/scheduling_ids.h" - -#include "gtest/gtest.h" - -namespace ray { -using namespace ray::scheduling; - -struct SchedulingIDsTest : public ::testing::Test {}; - -TEST_F(SchedulingIDsTest, BasicTest) { - std::vector string_ids = {"hello", "whaaat", "yes"}; - std::vector node_ids; - for (auto &string_id : string_ids) { - node_ids.emplace_back(NodeID(string_id)); - ASSERT_EQ(node_ids.back().Binary(), string_id); - } - ASSERT_EQ(node_ids[0], NodeID(string_ids[0])); - ASSERT_EQ(node_ids[0], NodeID(node_ids[0].ToInt())); - - ASSERT_TRUE(NodeID::Nil().IsNil()); - ASSERT_EQ(NodeID::Nil().ToInt(), -1); - ASSERT_EQ(NodeID::Nil().Binary(), "-1"); - - ASSERT_EQ(NodeID(13), NodeID(13)); - ASSERT_NE(NodeID(1), NodeID(2)); - ASSERT_TRUE(NodeID(1) < NodeID(2)); -} - -TEST_F(SchedulingIDsTest, PrepopulateResourceIDTest) { - ASSERT_EQ(kCPU_ResourceLabel, ResourceID(CPU).Binary()); - ASSERT_EQ(kGPU_ResourceLabel, ResourceID(GPU).Binary()); - ASSERT_EQ(kObjectStoreMemory_ResourceLabel, ResourceID(OBJECT_STORE_MEM).Binary()); - ASSERT_EQ(kMemory_ResourceLabel, ResourceID(MEM).Binary()); - - // mean while NodeID is not populated. 
- ASSERT_NE(kCPU_ResourceLabel, NodeID(CPU).Binary()); -} - -TEST_F(SchedulingIDsTest, UnitInstanceResourceTest) { - RayConfig::instance().initialize( - R"( -{ - "predefined_unit_instance_resources": "CPU,GPU", - "custom_unit_instance_resources": "neuron_cores,TPU,custom1" -} - )"); - ASSERT_TRUE(ResourceID::CPU().IsUnitInstanceResource()); - ASSERT_TRUE(ResourceID::GPU().IsUnitInstanceResource()); - ASSERT_TRUE(ResourceID("custom1").IsUnitInstanceResource()); - ASSERT_TRUE(ResourceID("neuron_cores").IsUnitInstanceResource()); - ASSERT_TRUE(ResourceID("TPU").IsUnitInstanceResource()); - - ASSERT_FALSE(ResourceID::Memory().IsUnitInstanceResource()); - ASSERT_FALSE(ResourceID("custom2").IsUnitInstanceResource()); -} -} // namespace ray diff --git a/src/ray/common/test_util.cc b/src/ray/common/test_util.cc deleted file mode 100644 index 14307488c87e..000000000000 --- a/src/ray/common/test_util.cc +++ /dev/null @@ -1,193 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "ray/common/test_util.h" - -#include -#include - -#include "absl/strings/escaping.h" -#include "ray/common/buffer.h" -#include "ray/common/ray_config.h" -#include "ray/common/ray_object.h" -#include "ray/common/test_util.h" -#include "ray/util/cmd_line_utils.h" -#include "ray/util/filesystem.h" -#include "ray/util/logging.h" -#include "ray/util/network_util.h" -#include "ray/util/path_utils.h" -#include "ray/util/process.h" -#include "ray/util/util.h" - -namespace ray { - -void TestSetupUtil::StartUpRedisServers(const std::vector &redis_server_ports, - bool save) { - if (redis_server_ports.empty()) { - TEST_REDIS_SERVER_PORTS.push_back(StartUpRedisServer(0, save)); - } else { - for (const auto &port : redis_server_ports) { - TEST_REDIS_SERVER_PORTS.push_back(StartUpRedisServer(port, save)); - } - } -} - -// start a redis server with specified port, use random one when 0 given -int TestSetupUtil::StartUpRedisServer(int port, bool save) { - int actual_port = port; - if (port == 0) { - static std::atomic srand_called(false); - if (!srand_called.exchange(true)) { - srand(current_time_ms() % RAND_MAX); - } - // Use random port (in range [2000, 7000) to avoid port conflicts between UTs. 
- do { - actual_port = rand() % 5000 + 2000; - } while (!CheckPortFree(actual_port)); - } - - std::string program = TEST_REDIS_SERVER_EXEC_PATH; -#ifdef _WIN32 - std::vector cmdargs({program, "--loglevel", "warning"}); -#else - std::vector cmdargs; - if (!save) { - cmdargs = {program, "--loglevel", "warning", "--save", "", "--appendonly", "no"}; - } else { - cmdargs = {program, "--loglevel", "warning"}; - } -#endif - cmdargs.insert(cmdargs.end(), {"--port", std::to_string(actual_port)}); - RAY_LOG(INFO) << "Start redis command is: " << CreateCommandLine(cmdargs); - RAY_CHECK(!Process::Spawn(cmdargs, true).second); - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - return actual_port; -} - -void TestSetupUtil::ShutDownRedisServers() { - for (const auto &port : TEST_REDIS_SERVER_PORTS) { - ShutDownRedisServer(port); - } - TEST_REDIS_SERVER_PORTS = std::vector(); -} - -void TestSetupUtil::ShutDownRedisServer(int port) { - std::vector cmdargs( - {TEST_REDIS_CLIENT_EXEC_PATH, "-p", std::to_string(port), "shutdown"}); - RAY_LOG(INFO) << "Stop redis command is: " << CreateCommandLine(cmdargs); - if (Process::Call(cmdargs) != std::error_code()) { - RAY_LOG(WARNING) << "Failed to stop redis. 
The redis process may no longer exist."; - } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); -} - -void TestSetupUtil::FlushAllRedisServers() { - for (const auto &port : TEST_REDIS_SERVER_PORTS) { - FlushRedisServer(port); - } -} - -void TestSetupUtil::ExecuteRedisCmd(int port, std::vector cmd) { - std::vector cmdargs( - {TEST_REDIS_CLIENT_EXEC_PATH, "-p", std::to_string(port)}); - cmdargs.insert(cmdargs.end(), cmd.begin(), cmd.end()); - RAY_LOG(INFO) << "Send command to redis: " << CreateCommandLine(cmdargs); - if (Process::Call(cmdargs)) { - RAY_LOG(WARNING) << "Failed to send request to redis."; - } -} - -void TestSetupUtil::FlushRedisServer(int port) { - std::vector cmdargs( - {TEST_REDIS_CLIENT_EXEC_PATH, "-p", std::to_string(port), "flushall"}); - RAY_LOG(INFO) << "Cleaning up redis with command: " << CreateCommandLine(cmdargs); - if (Process::Call(cmdargs)) { - RAY_LOG(WARNING) << "Failed to flush redis. The redis process may no longer exist."; - } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); -} - -bool WaitReady(std::future future, const std::chrono::milliseconds &timeout_ms) { - auto status = future.wait_for(timeout_ms); - return status == std::future_status::ready && future.get(); -} - -bool WaitForCondition(std::function condition, int timeout_ms) { - int wait_time = 0; - while (true) { - if (condition()) { - return true; - } - - // sleep 10ms. 
- const int wait_interval_ms = 10; - std::this_thread::sleep_for(std::chrono::milliseconds(wait_interval_ms)); - wait_time += wait_interval_ms; - if (wait_time > timeout_ms) { - break; - } - } - return false; -} - -void WaitForExpectedCount(std::atomic ¤t_count, - int expected_count, - int timeout_ms) { - auto condition = [¤t_count, expected_count]() { - return current_count == expected_count; - }; - EXPECT_TRUE(WaitForCondition(condition, timeout_ms)); -} - -TaskID RandomTaskId() { - std::string data(TaskID::Size(), 0); - FillRandom(&data); - return TaskID::FromBinary(data); -} - -JobID RandomJobId() { - std::string data(JobID::Size(), 0); - FillRandom(&data); - return JobID::FromBinary(data); -} - -std::shared_ptr GenerateRandomBuffer() { - auto seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); - std::mt19937 gen(seed); - std::uniform_int_distribution<> dis(1, 10); - std::uniform_int_distribution<> value_dis(1, 255); - - std::vector arg1(dis(gen), value_dis(gen)); - return std::make_shared(arg1.data(), arg1.size(), true); -} - -std::shared_ptr GenerateRandomObject( - const std::vector &inlined_ids) { - std::vector refs; - for (const auto &inlined_id : inlined_ids) { - rpc::ObjectReference ref; - ref.set_object_id(inlined_id.Binary()); - refs.push_back(ref); - } - return std::make_shared(GenerateRandomBuffer(), nullptr, refs); -} - -/// Path to redis server executable binary. -std::string TEST_REDIS_SERVER_EXEC_PATH; -/// Path to redis client executable binary. -std::string TEST_REDIS_CLIENT_EXEC_PATH; -/// Ports of redis server. -std::vector TEST_REDIS_SERVER_PORTS; - -} // namespace ray diff --git a/src/ray/common/test_util.h b/src/ray/common/test_util.h deleted file mode 100644 index 8be836b49e01..000000000000 --- a/src/ray/common/test_util.h +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright 2017 The Ray Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "gtest/gtest.h" -#include "ray/common/asio/asio_util.h" -#include "ray/common/id.h" -#include "ray/util/util.h" -#include "src/ray/protobuf/common.pb.h" -namespace ray { - -static inline std::vector ObjectIdsToRefs( - std::vector object_ids) { - std::vector refs; - for (const auto &object_id : object_ids) { - rpc::ObjectReference ref; - ref.set_object_id(object_id.Binary()); - refs.push_back(ref); - } - return refs; -} - -class Buffer; -class RayObject; - -/// Wait until the future is ready, or timeout is reached. -/// -/// \param[in] future The future to wait for. -/// \param[in] timeout_ms Timeout in milliseconds to wait for for. -/// \return Whether the future is ready. -bool WaitReady(std::future future, const std::chrono::milliseconds &timeout_ms); - -/// Wait until the condition is met, or timeout is reached. -/// -/// \param[in] condition The condition to wait for. -/// \param[in] timeout_ms Timeout in milliseconds to wait for for. -/// \return Whether the condition is met. -bool WaitForCondition(std::function condition, int timeout_ms); - -/// Wait until the expected count is met, or timeout is reached. -/// -/// \param[in] current_count The current count. -/// \param[in] expected_count The expected count. -/// \param[in] timeout_ms Timeout in milliseconds to wait for for. -/// \return Whether the expected count is met. 
-void WaitForExpectedCount(std::atomic ¤t_count, - int expected_count, - int timeout_ms = 60000); - -// A helper function to return a random task id. -TaskID RandomTaskId(); - -// A helper function to return a random job id. -JobID RandomJobId(); - -std::shared_ptr GenerateRandomBuffer(); - -std::shared_ptr GenerateRandomObject( - const std::vector &inlined_ids = {}); - -/// Path to redis server executable binary. -extern std::string TEST_REDIS_SERVER_EXEC_PATH; -/// Path to redis client executable binary. -extern std::string TEST_REDIS_CLIENT_EXEC_PATH; -/// Ports of redis server. -extern std::vector TEST_REDIS_SERVER_PORTS; - -//-------------------------------------------------------------------------------- -// COMPONENT MANAGEMENT CLASSES FOR TEST CASES -//-------------------------------------------------------------------------------- -/// Test cases can use it to start/stop/flush redis server(s). -class TestSetupUtil { - public: - static void StartUpRedisServers(const std::vector &redis_server_ports, - bool save = false); - static void ShutDownRedisServers(); - static void FlushAllRedisServers(); - - static void ExecuteRedisCmd(int port, std::vector cmd); - static int StartUpRedisServer(int port, bool save = false); - static void ShutDownRedisServer(int port); - static void FlushRedisServer(int port); -}; - -template -struct SaveArgToUniquePtrAction { - std::unique_ptr *pointer; - - template - void operator()(const Args &...args) const { - *pointer = std::make_unique(std::get(std::tie(args...))); - } -}; - -// Copies the k-th arg with make_unique(arg) into ptr. -template -SaveArgToUniquePtrAction SaveArgToUniquePtr(std::unique_ptr *ptr) { - return {ptr}; -} - -} // namespace ray diff --git a/src/ray/common/test_utils.cc b/src/ray/common/test_utils.cc new file mode 100644 index 000000000000..31b4530df525 --- /dev/null +++ b/src/ray/common/test_utils.cc @@ -0,0 +1,594 @@ +// Copyright 2017 The Ray Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/common/test_utils.h" + +#include +#include + +#include "absl/strings/escaping.h" +#include "ray/common/buffer.h" +#include "ray/common/ray_object.h" +#include "ray/common/task/task_util.h" +#include "ray/util/cmd_line_utils.h" +#include "ray/util/filesystem.h" +#include "ray/util/logging.h" +#include "ray/util/network_util.h" +#include "ray/util/path_utils.h" +#include "ray/util/process.h" +#include "ray/util/time.h" + +namespace ray { + +void TestSetupUtil::StartUpRedisServers(const std::vector &redis_server_ports, + bool save) { + if (redis_server_ports.empty()) { + TEST_REDIS_SERVER_PORTS.push_back(StartUpRedisServer(0, save)); + } else { + for (const auto &port : redis_server_ports) { + TEST_REDIS_SERVER_PORTS.push_back(StartUpRedisServer(port, save)); + } + } +} + +// start a redis server with specified port, use random one when 0 given +int TestSetupUtil::StartUpRedisServer(int port, bool save) { + int actual_port = port; + if (port == 0) { + static std::atomic srand_called(false); + if (!srand_called.exchange(true)) { + srand(current_time_ms() % RAND_MAX); + } + // Use random port (in range [2000, 7000) to avoid port conflicts between UTs. 
+ do { + actual_port = rand() % 5000 + 2000; + } while (!CheckPortFree(actual_port)); + } + + std::string program = TEST_REDIS_SERVER_EXEC_PATH; +#ifdef _WIN32 + std::vector cmdargs({program, "--loglevel", "warning"}); +#else + std::vector cmdargs; + if (!save) { + cmdargs = {program, "--loglevel", "warning", "--save", "", "--appendonly", "no"}; + } else { + cmdargs = {program, "--loglevel", "warning"}; + } +#endif + cmdargs.insert(cmdargs.end(), {"--port", std::to_string(actual_port)}); + RAY_LOG(INFO) << "Start redis command is: " << CreateCommandLine(cmdargs); + RAY_CHECK(!Process::Spawn(cmdargs, true).second); + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + return actual_port; +} + +void TestSetupUtil::ShutDownRedisServers() { + for (const auto &port : TEST_REDIS_SERVER_PORTS) { + ShutDownRedisServer(port); + } + TEST_REDIS_SERVER_PORTS = std::vector(); +} + +void TestSetupUtil::ShutDownRedisServer(int port) { + std::vector cmdargs( + {TEST_REDIS_CLIENT_EXEC_PATH, "-p", std::to_string(port), "shutdown"}); + RAY_LOG(INFO) << "Stop redis command is: " << CreateCommandLine(cmdargs); + if (Process::Call(cmdargs) != std::error_code()) { + RAY_LOG(WARNING) << "Failed to stop redis. 
The redis process may no longer exist."; + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); +} + +void TestSetupUtil::FlushAllRedisServers() { + for (const auto &port : TEST_REDIS_SERVER_PORTS) { + FlushRedisServer(port); + } +} + +void TestSetupUtil::ExecuteRedisCmd(int port, std::vector cmd) { + std::vector cmdargs( + {TEST_REDIS_CLIENT_EXEC_PATH, "-p", std::to_string(port)}); + cmdargs.insert(cmdargs.end(), cmd.begin(), cmd.end()); + RAY_LOG(INFO) << "Send command to redis: " << CreateCommandLine(cmdargs); + if (Process::Call(cmdargs)) { + RAY_LOG(WARNING) << "Failed to send request to redis."; + } +} + +void TestSetupUtil::FlushRedisServer(int port) { + std::vector cmdargs( + {TEST_REDIS_CLIENT_EXEC_PATH, "-p", std::to_string(port), "flushall"}); + RAY_LOG(INFO) << "Cleaning up redis with command: " << CreateCommandLine(cmdargs); + if (Process::Call(cmdargs)) { + RAY_LOG(WARNING) << "Failed to flush redis. The redis process may no longer exist."; + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); +} + +bool WaitReady(std::future future, const std::chrono::milliseconds &timeout_ms) { + auto status = future.wait_for(timeout_ms); + return status == std::future_status::ready && future.get(); +} + +bool WaitForCondition(std::function condition, int timeout_ms) { + int wait_time = 0; + while (true) { + if (condition()) { + return true; + } + + // sleep 10ms. 
+ const int wait_interval_ms = 10; + std::this_thread::sleep_for(std::chrono::milliseconds(wait_interval_ms)); + wait_time += wait_interval_ms; + if (wait_time > timeout_ms) { + break; + } + } + return false; +} + +void WaitForExpectedCount(std::atomic ¤t_count, + int expected_count, + int timeout_ms) { + auto condition = [¤t_count, expected_count]() { + return current_count == expected_count; + }; + EXPECT_TRUE(WaitForCondition(condition, timeout_ms)); +} + +TaskID RandomTaskId() { + std::string data(TaskID::Size(), 0); + FillRandom(&data); + return TaskID::FromBinary(data); +} + +JobID RandomJobId() { + std::string data(JobID::Size(), 0); + FillRandom(&data); + return JobID::FromBinary(data); +} + +std::shared_ptr GenerateRandomBuffer() { + auto seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + std::mt19937 gen(seed); + std::uniform_int_distribution<> dis(1, 10); + std::uniform_int_distribution<> value_dis(1, 255); + + std::vector arg1(dis(gen), value_dis(gen)); + return std::make_shared(arg1.data(), arg1.size(), true); +} + +std::shared_ptr GenerateRandomObject( + const std::vector &inlined_ids) { + std::vector refs; + for (const auto &inlined_id : inlined_ids) { + rpc::ObjectReference ref; + ref.set_object_id(inlined_id.Binary()); + refs.push_back(ref); + } + return std::make_shared(GenerateRandomBuffer(), nullptr, refs); +} + +TaskSpecification GenActorCreationTask( + const JobID &job_id, + int max_restarts, + bool detached, + const std::string &name, + const std::string &ray_namespace, + const rpc::Address &owner_address, + std::unordered_map required_resources, + std::unordered_map required_placement_resources) { + TaskSpecBuilder builder; + rpc::JobConfig kJobConfig; + auto actor_id = ActorID::Of(job_id, RandomTaskId(), 0); + auto task_id = TaskID::ForActorCreationTask(actor_id); + FunctionDescriptor function_descriptor; + function_descriptor = FunctionDescriptorBuilder::BuildPython("", "", "", ""); + 
builder.SetCommonTaskSpec(task_id, + name + ":" + function_descriptor->CallString(), + Language::PYTHON, + function_descriptor, + job_id, + kJobConfig, + TaskID::Nil(), + 0, + TaskID::Nil(), + owner_address, + 1, + false, + false, + -1, + required_resources, + required_placement_resources, + "", + 0, + TaskID::Nil(), + ""); + rpc::SchedulingStrategy scheduling_strategy; + scheduling_strategy.mutable_default_scheduling_strategy(); + builder.SetActorCreationTaskSpec(actor_id, + {}, + scheduling_strategy, + max_restarts, + /*max_task_retries=*/0, + {}, + 1, + detached, + name, + ray_namespace); + return std::move(builder).ConsumeAndBuild(); +} + +rpc::CreateActorRequest GenCreateActorRequest(const JobID &job_id, + int max_restarts, + bool detached, + const std::string &name, + const std::string &ray_namespace) { + rpc::Address owner_address; + owner_address.set_node_id(NodeID::FromRandom().Binary()); + owner_address.set_ip_address("1234"); + owner_address.set_port(5678); + owner_address.set_worker_id(WorkerID::FromRandom().Binary()); + auto actor_creation_task_spec = GenActorCreationTask( + job_id, max_restarts, detached, name, ray_namespace, owner_address); + rpc::CreateActorRequest request; + request.mutable_task_spec()->CopyFrom(actor_creation_task_spec.GetMessage()); + return request; +} + +rpc::RegisterActorRequest GenRegisterActorRequest(const JobID &job_id, + int max_restarts, + bool detached, + const std::string &name, + const std::string &ray_namespace) { + rpc::Address owner_address; + owner_address.set_node_id(NodeID::FromRandom().Binary()); + owner_address.set_ip_address("1234"); + owner_address.set_port(5678); + owner_address.set_worker_id(WorkerID::FromRandom().Binary()); + auto actor_creation_task_spec = GenActorCreationTask( + job_id, max_restarts, detached, name, ray_namespace, owner_address); + rpc::RegisterActorRequest request; + request.mutable_task_spec()->CopyFrom(actor_creation_task_spec.GetMessage()); + return request; +} + 
+PlacementGroupSpecification GenPlacementGroupCreation( + const std::string &name, + std::vector> &bundles, + rpc::PlacementStrategy strategy, + const JobID &job_id, + const ActorID &actor_id) { + PlacementGroupSpecBuilder builder; + + auto placement_group_id = PlacementGroupID::Of(job_id); + builder.SetPlacementGroupSpec(placement_group_id, + name, + bundles, + strategy, + /* is_detached */ false, + /* soft_target_node_id */ NodeID::Nil(), + job_id, + actor_id, + /* is_creator_detached */ false); + return builder.Build(); +} + +rpc::CreatePlacementGroupRequest GenCreatePlacementGroupRequest( + const std::string name, + rpc::PlacementStrategy strategy, + int bundles_count, + double cpu_num, + const JobID job_id, + const ActorID &actor_id) { + rpc::CreatePlacementGroupRequest request; + std::vector> bundles; + std::unordered_map bundle; + bundle["CPU"] = cpu_num; + for (int index = 0; index < bundles_count; ++index) { + bundles.push_back(bundle); + } + auto placement_group_creation_spec = + GenPlacementGroupCreation(name, bundles, strategy, job_id, actor_id); + request.mutable_placement_group_spec()->CopyFrom( + placement_group_creation_spec.GetMessage()); + return request; +} +std::shared_ptr GenNodeInfo(uint16_t port, + const std::string address, + const std::string node_name) { + auto node = std::make_shared(); + node->set_node_id(NodeID::FromRandom().Binary()); + node->set_node_manager_port(port); + node->set_node_manager_address(address); + node->set_node_name(node_name); + node->set_instance_id("instance_x"); + node->set_state(rpc::GcsNodeInfo::ALIVE); + return node; +} + +std::shared_ptr GenJobTableData(JobID job_id) { + auto job_table_data = std::make_shared(); + job_table_data->set_job_id(job_id.Binary()); + job_table_data->set_is_dead(false); + job_table_data->set_timestamp(current_sys_time_ms()); + job_table_data->set_driver_ip_address("127.0.0.1"); + rpc::Address address; + address.set_ip_address("127.0.0.1"); + address.set_port(1234); + 
address.set_node_id(UniqueID::FromRandom().Binary()); + address.set_worker_id(UniqueID::FromRandom().Binary()); + job_table_data->mutable_driver_address()->CopyFrom(address); + job_table_data->set_driver_pid(5667L); + return job_table_data; +} + +std::shared_ptr GenActorTableData(const JobID &job_id) { + auto actor_table_data = std::make_shared(); + ActorID actor_id = ActorID::Of(job_id, RandomTaskId(), 0); + actor_table_data->set_actor_id(actor_id.Binary()); + actor_table_data->set_job_id(job_id.Binary()); + actor_table_data->set_state(rpc::ActorTableData::ALIVE); + actor_table_data->set_max_restarts(1); + actor_table_data->set_num_restarts(0); + return actor_table_data; +} + +std::shared_ptr GenErrorTableData(const JobID &job_id) { + auto error_table_data = std::make_shared(); + error_table_data->set_job_id(job_id.Binary()); + return error_table_data; +} + +std::shared_ptr GenWorkerTableData() { + auto worker_table_data = std::make_shared(); + worker_table_data->set_timestamp(std::time(nullptr)); + return worker_table_data; +} + +std::shared_ptr GenAddJobRequest( + const JobID &job_id, + const std::string &ray_namespace, + const std::optional &submission_id, + const std::optional &address) { + auto job_config_data = std::make_shared(); + job_config_data->set_ray_namespace(ray_namespace); + + auto job_table_data = std::make_shared(); + job_table_data->set_job_id(job_id.Binary()); + job_table_data->mutable_config()->CopyFrom(*job_config_data); + if (address.has_value()) { + job_table_data->mutable_driver_address()->CopyFrom(address.value()); + } else { + rpc::Address dummy_address; + dummy_address.set_port(1234); + dummy_address.set_node_id(NodeID::FromRandom().Binary()); + dummy_address.set_ip_address("123.456.7.8"); + dummy_address.set_worker_id(WorkerID::FromRandom().Binary()); + job_table_data->mutable_driver_address()->CopyFrom(dummy_address); + } + if (submission_id.has_value()) { + job_table_data->mutable_config()->mutable_metadata()->insert( + 
{"job_submission_id", submission_id.value()}); + } + + auto add_job_request = std::make_shared(); + add_job_request->mutable_data()->CopyFrom(*job_table_data); + return add_job_request; +} + +rpc::TaskEventData GenTaskEventsData(const std::vector &task_events, + int32_t num_profile_task_events_dropped, + int32_t num_status_task_events_dropped) { + rpc::TaskEventData data; + for (auto &events : task_events) { + auto new_events = data.add_events_by_task(); + new_events->CopyFrom(events); + } + + for (int i = 0; i < num_status_task_events_dropped; ++i) { + rpc::TaskAttempt rpc_task_attempt; + rpc_task_attempt.set_task_id(RandomTaskId().Binary()); + rpc_task_attempt.set_attempt_number(0); + *(data.add_dropped_task_attempts()) = rpc_task_attempt; + } + + data.set_num_profile_events_dropped(num_profile_task_events_dropped); + data.set_job_id(JobID::FromInt(0).Binary()); + + return data; +} + +rpc::events::RayEventsData GenRayEventsData( + const std::vector &task_events, + const std::vector &drop_tasks) { + rpc::events::RayEventsData data; + rpc::events::TaskEventsMetadata metadata; + for (const auto &task_attempt : drop_tasks) { + rpc::TaskAttempt rpc_task_attempt; + rpc_task_attempt.set_task_id(task_attempt.first.Binary()); + rpc_task_attempt.set_attempt_number(task_attempt.second); + *(metadata.add_dropped_task_attempts()) = rpc_task_attempt; + } + data.mutable_task_events_metadata()->CopyFrom(metadata); + for (const auto &task_event : task_events) { + rpc::events::RayEvent ray_event; + rpc::events::TaskDefinitionEvent task_definition_event; + task_definition_event.set_task_id(task_event.task_id()); + task_definition_event.set_task_attempt(task_event.attempt_number()); + task_definition_event.set_job_id(task_event.job_id()); + if (task_event.has_task_info()) { + const auto &task_info = task_event.task_info(); + task_definition_event.set_task_type(task_info.type()); + task_definition_event.set_task_name(task_info.name()); + 
task_definition_event.set_language(task_info.language()); + } + ray_event.set_event_id(task_event.task_id()); + ray_event.set_event_type(rpc::events::RayEvent::TASK_DEFINITION_EVENT); + ray_event.set_message("test"); + ray_event.mutable_task_definition_event()->CopyFrom(task_definition_event); + *(data.add_events()) = ray_event; + } + + return data; +} + +rpc::TaskEventData GenTaskEventsDataLoss(const std::vector &drop_tasks, + int job_id) { + rpc::TaskEventData data; + for (const auto &task_attempt : drop_tasks) { + rpc::TaskAttempt rpc_task_attempt; + rpc_task_attempt.set_task_id(task_attempt.first.Binary()); + rpc_task_attempt.set_attempt_number(task_attempt.second); + *(data.add_dropped_task_attempts()) = rpc_task_attempt; + } + data.set_job_id(JobID::FromInt(job_id).Binary()); + + return data; +} + +rpc::ResourceDemand GenResourceDemand( + const absl::flat_hash_map &resource_demands, + int64_t num_ready_queued, + int64_t num_infeasible, + int64_t num_backlog, + const std::vector &label_selectors) { + rpc::ResourceDemand resource_demand; + for (const auto &resource : resource_demands) { + (*resource_demand.mutable_shape())[resource.first] = resource.second; + } + resource_demand.set_num_ready_requests_queued(num_ready_queued); + resource_demand.set_num_infeasible_requests_queued(num_infeasible); + resource_demand.set_backlog_size(num_backlog); + for (const auto &selector : label_selectors) { + *resource_demand.add_label_selectors() = selector; + } + return resource_demand; +} + +void FillResourcesData( + rpc::ResourcesData &resources_data, + const NodeID &node_id, + const absl::flat_hash_map &available_resources, + const absl::flat_hash_map &total_resources, + int64_t idle_ms, + bool is_draining, + int64_t draining_deadline_timestamp_ms) { + resources_data.set_node_id(node_id.Binary()); + for (const auto &resource : available_resources) { + (*resources_data.mutable_resources_available())[resource.first] = resource.second; + } + for (const auto &resource : 
total_resources) { + (*resources_data.mutable_resources_total())[resource.first] = resource.second; + } + resources_data.set_idle_duration_ms(idle_ms); + resources_data.set_is_draining(is_draining); + resources_data.set_draining_deadline_timestamp_ms(draining_deadline_timestamp_ms); +} + +void FillResourcesData(rpc::ResourcesData &data, + const std::string &node_id, + std::vector demands) { + auto load_by_shape = data.mutable_resource_load_by_shape(); + auto agg_load = data.mutable_resource_load(); + for (const auto &demand : demands) { + load_by_shape->add_resource_demands()->CopyFrom(demand); + for (const auto &resource : demand.shape()) { + (*agg_load)[resource.first] += + (resource.second * (demand.num_ready_requests_queued() + + demand.num_infeasible_requests_queued())); + } + } + data.set_node_id(node_id); +} + +std::shared_ptr GenPlacementGroupLoad( + std::vector placement_group_table_data_vec) { + auto placement_group_load = std::make_shared(); + for (auto &placement_group_table_data : placement_group_table_data_vec) { + placement_group_load->add_placement_group_data()->CopyFrom( + placement_group_table_data); + } + return placement_group_load; +} + +rpc::PlacementGroupTableData GenPlacementGroupTableData( + const PlacementGroupID &placement_group_id, + const JobID &job_id, + const std::vector> &bundles, + const std::vector &nodes, + rpc::PlacementStrategy strategy, + const rpc::PlacementGroupTableData::PlacementGroupState state, + const std::string &name, + const ActorID &actor_id) { + rpc::PlacementGroupTableData placement_group_table_data; + placement_group_table_data.set_placement_group_id(placement_group_id.Binary()); + placement_group_table_data.set_state(state); + placement_group_table_data.set_name(name); + placement_group_table_data.set_strategy(strategy); + RAY_CHECK(bundles.size() == nodes.size()); + size_t i = 0; + for (auto &bundle : bundles) { + // Add unit resources + auto bundle_spec = placement_group_table_data.add_bundles(); + for (auto 
&resource : bundle) { + (*bundle_spec->mutable_unit_resources())[resource.first] = resource.second; + } + + // Add node id + const auto &node = nodes[i]; + if (!node.empty()) { + bundle_spec->set_node_id(node); + } + + i++; + } + return placement_group_table_data; +} +rpc::autoscaler::ClusterResourceConstraint GenClusterResourcesConstraint( + const std::vector> &request_resources, + const std::vector &count_array) { + rpc::autoscaler::ClusterResourceConstraint constraint; + RAY_CHECK(request_resources.size() == count_array.size()); + for (size_t i = 0; i < request_resources.size(); i++) { + auto &resource = request_resources[i]; + auto count = count_array[i]; + auto bundle = constraint.add_resource_requests(); + bundle->set_count(count); + bundle->mutable_request()->mutable_resources_bundle()->insert(resource.begin(), + resource.end()); + } + return constraint; +} +// Read all lines of a file into vector vc +void ReadContentFromFile(std::vector &vc, std::string log_file) { + std::string line; + std::ifstream read_file; + read_file.open(log_file, std::ios::binary); + while (std::getline(read_file, line)) { + vc.push_back(line); + } + read_file.close(); +} + +/// Path to redis server executable binary. +std::string TEST_REDIS_SERVER_EXEC_PATH; +/// Path to redis client executable binary. +std::string TEST_REDIS_CLIENT_EXEC_PATH; +/// Ports of redis server. +std::vector TEST_REDIS_SERVER_PORTS; + +} // namespace ray diff --git a/src/ray/common/test_utils.h b/src/ray/common/test_utils.h new file mode 100644 index 000000000000..70be73269c9c --- /dev/null +++ b/src/ray/common/test_utils.h @@ -0,0 +1,232 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "gtest/gtest.h" +#include "ray/common/asio/asio_util.h" +#include "ray/common/id.h" +#include "ray/common/placement_group.h" +#include "ray/common/task/task_spec.h" +#include "src/ray/protobuf/autoscaler.pb.h" +#include "src/ray/protobuf/common.pb.h" +#include "src/ray/protobuf/gcs.pb.h" +#include "src/ray/protobuf/gcs_service.grpc.pb.h" + +namespace ray { + +static inline std::vector ObjectIdsToRefs( + std::vector object_ids) { + std::vector refs; + for (const auto &object_id : object_ids) { + rpc::ObjectReference ref; + ref.set_object_id(object_id.Binary()); + refs.push_back(ref); + } + return refs; +} + +class Buffer; +class RayObject; + +/// Wait until the future is ready, or timeout is reached. +/// +/// \param[in] future The future to wait for. +/// \param[in] timeout_ms Timeout in milliseconds to wait for. +/// \return Whether the future is ready. +bool WaitReady(std::future future, const std::chrono::milliseconds &timeout_ms); + +/// Wait until the condition is met, or timeout is reached. +/// +/// \param[in] condition The condition to wait for. +/// \param[in] timeout_ms Timeout in milliseconds to wait for. +/// \return Whether the condition is met. +bool WaitForCondition(std::function condition, int timeout_ms); + +/// Wait until the expected count is met, or timeout is reached. +/// +/// \param[in] current_count The current count. +/// \param[in] expected_count The expected count. +/// \param[in] timeout_ms Timeout in milliseconds to wait for.
+/// \return Whether the expected count is met. +void WaitForExpectedCount(std::atomic &current_count, + int expected_count, + int timeout_ms = 60000); + +// A helper function to return a random task id. +TaskID RandomTaskId(); + +// A helper function to return a random job id. +JobID RandomJobId(); + +std::shared_ptr GenerateRandomBuffer(); + +std::shared_ptr GenerateRandomObject( + const std::vector &inlined_ids = {}); + +/// Path to redis server executable binary. +extern std::string TEST_REDIS_SERVER_EXEC_PATH; +/// Path to redis client executable binary. +extern std::string TEST_REDIS_CLIENT_EXEC_PATH; +/// Ports of redis server. +extern std::vector TEST_REDIS_SERVER_PORTS; + +//-------------------------------------------------------------------------------- +// COMPONENT MANAGEMENT CLASSES FOR TEST CASES +//-------------------------------------------------------------------------------- +/// Test cases can use it to start/stop/flush redis server(s). +class TestSetupUtil { + public: + static void StartUpRedisServers(const std::vector &redis_server_ports, + bool save = false); + static void ShutDownRedisServers(); + static void FlushAllRedisServers(); + + static void ExecuteRedisCmd(int port, std::vector cmd); + static int StartUpRedisServer(int port, bool save = false); + static void ShutDownRedisServer(int port); + static void FlushRedisServer(int port); +}; + +template +struct SaveArgToUniquePtrAction { + std::unique_ptr *pointer; + + template + void operator()(const Args &...args) const { + *pointer = std::make_unique(std::get(std::tie(args...))); + } +}; + +// Copies the k-th arg with make_unique(arg) into ptr.
+template +SaveArgToUniquePtrAction SaveArgToUniquePtr(std::unique_ptr *ptr) { + return {ptr}; +} + +TaskSpecification GenActorCreationTask( + const JobID &job_id, + int max_restarts, + bool detached, + const std::string &name, + const std::string &ray_namespace, + const rpc::Address &owner_address, + std::unordered_map required_resources = + std::unordered_map(), + std::unordered_map required_placement_resources = + std::unordered_map()); + +rpc::CreateActorRequest GenCreateActorRequest(const JobID &job_id, + int max_restarts = 0, + bool detached = false, + const std::string &name = "", + const std::string &ray_namespace = ""); + +rpc::RegisterActorRequest GenRegisterActorRequest( + const JobID &job_id, + int max_restarts = 0, + bool detached = false, + const std::string &name = "", + const std::string &ray_namespace = "test"); + +PlacementGroupSpecification GenPlacementGroupCreation( + const std::string &name, + std::vector> &bundles, + rpc::PlacementStrategy strategy, + const JobID &job_id, + const ActorID &actor_id); + +rpc::CreatePlacementGroupRequest GenCreatePlacementGroupRequest( + const std::string name = "", + rpc::PlacementStrategy strategy = rpc::PlacementStrategy::SPREAD, + int bundles_count = 2, + double cpu_num = 1.0, + const JobID job_id = JobID::FromInt(1), + const ActorID &actor_id = ActorID::Nil()); + +std::shared_ptr GenNodeInfo( + uint16_t port = 0, + const std::string address = "127.0.0.1", + const std::string node_name = "Mocker_node"); + +std::shared_ptr GenJobTableData(JobID job_id); + +std::shared_ptr GenActorTableData(const JobID &job_id); + +std::shared_ptr GenErrorTableData(const JobID &job_id); + +std::shared_ptr GenWorkerTableData(); + +std::shared_ptr GenAddJobRequest( + const JobID &job_id, + const std::string &ray_namespace, + const std::optional &submission_id = std::nullopt, + const std::optional &address = std::nullopt); + +rpc::TaskEventData GenTaskEventsData(const std::vector &task_events, + int32_t 
num_profile_task_events_dropped = 0, + int32_t num_status_task_events_dropped = 0); + +rpc::events::RayEventsData GenRayEventsData( + const std::vector &task_events, + const std::vector &drop_tasks); + +rpc::TaskEventData GenTaskEventsDataLoss(const std::vector &drop_tasks, + int job_id = 0); + +rpc::ResourceDemand GenResourceDemand( + const absl::flat_hash_map &resource_demands, + int64_t num_ready_queued, + int64_t num_infeasible, + int64_t num_backlog, + const std::vector &label_selectors = {}); + +void FillResourcesData( + rpc::ResourcesData &resources_data, + const NodeID &node_id, + const absl::flat_hash_map &available_resources, + const absl::flat_hash_map &total_resources, + int64_t idle_ms = 0, + bool is_draining = false, + int64_t draining_deadline_timestamp_ms = -1); + +void FillResourcesData(rpc::ResourcesData &data, + const std::string &node_id, + std::vector demands); + +std::shared_ptr GenPlacementGroupLoad( + std::vector placement_group_table_data_vec); + +rpc::PlacementGroupTableData GenPlacementGroupTableData( + const PlacementGroupID &placement_group_id, + const JobID &job_id, + const std::vector> &bundles, + const std::vector &nodes, + rpc::PlacementStrategy strategy, + const rpc::PlacementGroupTableData::PlacementGroupState state, + const std::string &name = "", + const ActorID &actor_id = ActorID::Nil()); + +rpc::autoscaler::ClusterResourceConstraint GenClusterResourcesConstraint( + const std::vector> &request_resources, + const std::vector &count_array); + +// Read all lines of a file into vector vc +void ReadContentFromFile(std::vector &vc, std::string log_file); + +} // namespace ray diff --git a/src/ray/common/test/BUILD.bazel b/src/ray/common/tests/BUILD.bazel similarity index 75% rename from src/ray/common/test/BUILD.bazel rename to src/ray/common/tests/BUILD.bazel index ca25a451e3ea..840a4a2b4f5a 100644 --- a/src/ray/common/test/BUILD.bazel +++ b/src/ray/common/tests/BUILD.bazel @@ -1,44 +1,5 @@ load("//bazel:ray.bzl", "ray_cc_binary", 
"ray_cc_library", "ray_cc_test") -ray_cc_test( - name = "resource_request_test", - size = "small", - srcs = [ - "resource_request_test.cc", - ], - tags = ["team:core"], - deps = [ - "//src/ray/common:task_common", - "@com_google_googletest//:gtest_main", - ], -) - -ray_cc_test( - name = "resource_set_test", - size = "small", - srcs = [ - "resource_set_test.cc", - ], - tags = ["team:core"], - deps = [ - "//src/ray/common:task_common", - "@com_google_googletest//:gtest_main", - ], -) - -ray_cc_test( - name = "resource_instance_set_test", - size = "small", - srcs = [ - "resource_instance_set_test.cc", - ], - tags = ["team:core"], - deps = [ - "//src/ray/common:task_common", - "@com_google_googletest//:gtest_main", - ], -) - ray_cc_test( name = "ray_syncer_test", srcs = ["ray_syncer_test.cc"], @@ -54,6 +15,7 @@ ray_cc_test( "//src/ray/rpc:grpc_server", "//src/ray/util:network_util", "//src/ray/util:path_utils", + "//src/ray/util:raii", "@com_github_grpc_grpc//:grpc++", "@com_google_googletest//:gtest", ], @@ -111,7 +73,6 @@ ray_cc_test( tags = ["team:core"], deps = [ "//src/ray/common:id", - "//src/ray/common:task_common", "//src/ray/protobuf:common_cc_proto", "@com_google_googletest//:gtest_main", ], @@ -132,6 +93,7 @@ ray_cc_test( tags = ["team:core"], deps = [ "//src/ray/common:task_common", + "//src/ray/common/scheduling:scheduling_class_util", "@com_google_googletest//:gtest_main", ], ) @@ -143,7 +105,7 @@ ray_cc_test( ], tags = ["team:core"], deps = [ - "//src/ray/common:task_common", + "//src/ray/common:bundle_location_index", "@com_google_googletest//:gtest_main", ], ) @@ -204,19 +166,6 @@ ray_cc_test( ], ) -ray_cc_test( - name = "scheduling_ids_test", - size = "small", - srcs = [ - "scheduling_ids_test.cc", - ], - tags = ["team:core"], - deps = [ - "//src/ray/common:task_common", - "@com_google_googletest//:gtest_main", - ], -) - ray_cc_test( name = "grpc_util_test", size = "small", @@ -232,14 +181,12 @@ ray_cc_test( ) ray_cc_test( - name = 
"label_selector_test", + name = "source_location_test", size = "small", - srcs = [ - "label_selector_test.cc", - ], + srcs = ["source_location_test.cc"], tags = ["team:core"], deps = [ - "//src/ray/common:task_common", + "//src/ray/common:source_location", "@com_google_googletest//:gtest_main", ], ) diff --git a/src/ray/common/test/asio_defer_test.cc b/src/ray/common/tests/asio_defer_test.cc similarity index 100% rename from src/ray/common/test/asio_defer_test.cc rename to src/ray/common/tests/asio_defer_test.cc diff --git a/src/ray/common/test/bundle_location_index_test.cc b/src/ray/common/tests/bundle_location_index_test.cc similarity index 99% rename from src/ray/common/test/bundle_location_index_test.cc rename to src/ray/common/tests/bundle_location_index_test.cc index 49468d978274..9e3bba19c8ee 100644 --- a/src/ray/common/test/bundle_location_index_test.cc +++ b/src/ray/common/tests/bundle_location_index_test.cc @@ -15,6 +15,9 @@ #include "ray/common/bundle_location_index.h" +#include +#include + #include "gtest/gtest.h" namespace ray { diff --git a/src/ray/common/test/event_stats_test.cc b/src/ray/common/tests/event_stats_test.cc similarity index 98% rename from src/ray/common/test/event_stats_test.cc rename to src/ray/common/tests/event_stats_test.cc index 9d88065ea8ff..3b1bf67c1e56 100644 --- a/src/ray/common/test/event_stats_test.cc +++ b/src/ray/common/tests/event_stats_test.cc @@ -14,6 +14,9 @@ #include "ray/common/event_stats.h" +#include +#include + #include "gtest/gtest.h" TEST(EventStatsTest, TestRecordEnd) { diff --git a/src/ray/common/test/grpc_util_test.cc b/src/ray/common/tests/grpc_util_test.cc similarity index 99% rename from src/ray/common/test/grpc_util_test.cc rename to src/ray/common/tests/grpc_util_test.cc index 17c50faf28dc..5170446c123b 100644 --- a/src/ray/common/test/grpc_util_test.cc +++ b/src/ray/common/tests/grpc_util_test.cc @@ -14,6 +14,8 @@ #include "ray/common/grpc_util.h" +#include + #include "gtest/gtest.h" #include 
"src/ray/protobuf/common.pb.h" diff --git a/src/ray/common/test/id_test.cc b/src/ray/common/tests/id_test.cc similarity index 82% rename from src/ray/common/test/id_test.cc rename to src/ray/common/tests/id_test.cc index 2c11d7681a1e..3cbf782eb5a3 100644 --- a/src/ray/common/test/id_test.cc +++ b/src/ray/common/tests/id_test.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "ray/common/id.h" + #include +#include + #include "absl/container/flat_hash_set.h" -#include "ray/common/common_protocol.h" -#include "ray/common/task/task_spec.h" namespace ray { @@ -35,9 +37,9 @@ void TestRandomObjectId() { ASSERT_EQ(random_object_id.ObjectIndex(), 0); } -const static JobID kDefaultJobId = JobID::FromInt(199); +static const JobID kDefaultJobId = JobID::FromInt(199); -const static TaskID kDefaultDriverTaskId = TaskID::ForDriverTask(kDefaultJobId); +static const TaskID kDefaultDriverTaskId = TaskID::ForDriverTask(kDefaultJobId); TEST(JobIDTest, TestJobID) { uint32_t id = 100; @@ -104,9 +106,9 @@ TEST(TaskIDTest, TestTaskIDForExecution) { } TEST(ObjectIDTest, TestObjectID) { - const static ActorID default_actor_id = + static const ActorID default_actor_id = ActorID::Of(kDefaultJobId, kDefaultDriverTaskId, 1); - const static TaskID default_task_id = + static const TaskID default_task_id = TaskID::ForActorTask(kDefaultJobId, kDefaultDriverTaskId, 1, default_actor_id); { @@ -174,4 +176,31 @@ TEST(PlacementGroupIDTest, TestPlacementGroup) { } } +TEST(LeaseIDTest, TestLeaseID) { + // Test basic LeaseID creation, size, and worker extraction + const WorkerID worker_id = WorkerID::FromRandom(); + const LeaseID lease_id = LeaseID::FromWorker(worker_id, 2); + const size_t lease_id_size = 32; + ASSERT_FALSE(lease_id.IsNil()); + ASSERT_EQ(lease_id.WorkerId(), worker_id); + ASSERT_EQ(LeaseID::Size(), lease_id_size); + ASSERT_EQ(lease_id.Binary().size(), lease_id_size); + + const LeaseID random_lease = 
LeaseID::FromRandom(); + const LeaseID another_lease = LeaseID::FromWorker(worker_id, 1); + + ASSERT_FALSE(random_lease.IsNil()); + ASSERT_NE(lease_id, another_lease); + ASSERT_NE(lease_id, random_lease); + ASSERT_EQ(lease_id.WorkerId(), another_lease.WorkerId()); + + // Test serialization roundtrip + const LeaseID from_hex = LeaseID::FromHex(lease_id.Hex()); + const LeaseID from_binary = LeaseID::FromBinary(lease_id.Binary()); + + ASSERT_EQ(lease_id, from_hex); + ASSERT_EQ(lease_id, from_binary); + ASSERT_EQ(lease_id.WorkerId(), from_hex.WorkerId()); +} + } // namespace ray diff --git a/src/ray/common/test/memory_monitor_test.cc b/src/ray/common/tests/memory_monitor_test.cc similarity index 99% rename from src/ray/common/test/memory_monitor_test.cc rename to src/ray/common/tests/memory_monitor_test.cc index 3f9e8f0071b6..03ccb4d693c5 100644 --- a/src/ray/common/test/memory_monitor_test.cc +++ b/src/ray/common/tests/memory_monitor_test.cc @@ -20,6 +20,9 @@ #include #include #include +#include +#include +#include #include "gtest/gtest.h" #include "ray/common/asio/instrumented_io_context.h" diff --git a/src/ray/common/test/postable_test.cc b/src/ray/common/tests/postable_test.cc similarity index 99% rename from src/ray/common/test/postable_test.cc rename to src/ray/common/tests/postable_test.cc index 60f8d18571ff..1490d18f8684 100644 --- a/src/ray/common/test/postable_test.cc +++ b/src/ray/common/tests/postable_test.cc @@ -16,6 +16,8 @@ #include +#include + namespace ray { TEST(PostableTest, TestPostable) { diff --git a/src/ray/common/test/ray_config_test.cc b/src/ray/common/tests/ray_config_test.cc similarity index 96% rename from src/ray/common/test/ray_config_test.cc rename to src/ray/common/tests/ray_config_test.cc index 6522640a499d..5584ab647eb8 100644 --- a/src/ray/common/test/ray_config_test.cc +++ b/src/ray/common/tests/ray_config_test.cc @@ -14,6 +14,9 @@ #include "ray/common/ray_config.h" +#include +#include + #include "gtest/gtest.h" #include 
"ray/common/grpc_util.h" diff --git a/src/ray/common/test/ray_syncer_test.cc b/src/ray/common/tests/ray_syncer_test.cc similarity index 95% rename from src/ray/common/test/ray_syncer_test.cc rename to src/ray/common/tests/ray_syncer_test.cc index 6b8757f5e080..cb2b81579eb4 100644 --- a/src/ray/common/test/ray_syncer_test.cc +++ b/src/ray/common/tests/ray_syncer_test.cc @@ -25,7 +25,13 @@ #include #include +#include #include +#include +#include +#include +#include +#include #include "ray/common/ray_syncer/node_state.h" #include "ray/common/ray_syncer/ray_syncer.h" @@ -34,9 +40,8 @@ #include "ray/rpc/grpc_server.h" #include "ray/util/network_util.h" #include "ray/util/path_utils.h" +#include "ray/util/raii.h" -using namespace std::chrono; -using namespace ray::syncer; using ray::NodeID; using ::testing::_; using ::testing::Eq; @@ -204,7 +209,7 @@ TEST_F(RaySyncerTest, RaySyncerBidiReactorBase) { } struct SyncerServerTest { - SyncerServerTest(std::string port) + explicit SyncerServerTest(std::string port) : SyncerServerTest( std::move(port), /*node_id=*/NodeID::FromRandom(), /*ray_sync_observer=*/{}) { } @@ -306,22 +311,24 @@ struct SyncerServerTest { if (f.get()) { return; } else { - std::this_thread::sleep_for(1s); + std::this_thread::sleep_for(std::chrono::seconds(1)); } } } bool WaitUntil(std::function predicate, int64_t time_s) { - auto start = steady_clock::now(); + auto start = std::chrono::steady_clock::now(); - while (duration_cast(steady_clock::now() - start).count() <= time_s) { + while (std::chrono::duration_cast( + std::chrono::steady_clock::now() - start) + .count() <= time_s) { std::promise p; auto f = p.get_future(); io_context.post([&p, predicate]() mutable { p.set_value(predicate()); }, "TEST"); if (f.get()) { return true; } else { - std::this_thread::sleep_for(1s); + std::this_thread::sleep_for(std::chrono::seconds(1)); } } return false; @@ -445,7 +452,7 @@ class SyncerTest : public ::testing::Test { s->Stop(); } - std::this_thread::sleep_for(1s); + 
std::this_thread::sleep_for(std::chrono::seconds(1)); } std::vector> servers; }; @@ -526,10 +533,10 @@ TEST_F(SyncerTest, Test1To1) { // Make sure no new messages are sent s2.local_versions[0] = 0; - std::this_thread::sleep_for(1s); + std::this_thread::sleep_for(std::chrono::seconds(1)); - ASSERT_TRUE(s1.GetNumConsumedMessages(s2.syncer->GetLocalNodeID()) == 2); - ASSERT_TRUE(s2.GetNumConsumedMessages(s1.syncer->GetLocalNodeID()) == 1); + ASSERT_EQ(s1.GetNumConsumedMessages(s2.syncer->GetLocalNodeID()), 2); + ASSERT_EQ(s2.GetNumConsumedMessages(s1.syncer->GetLocalNodeID()), 1); // Change it back s2.local_versions[0] = 1; @@ -539,7 +546,7 @@ TEST_F(SyncerTest, Test1To1) { std::uniform_int_distribution<> rand_sleep(0, 10000); std::uniform_int_distribution<> choose_component(0, kTestComponents - 1); - auto start = steady_clock::now(); + auto start = std::chrono::steady_clock::now(); for (int i = 0; i < 10000; ++i) { if (choose_component(gen) == 0) { s1.local_versions[0]++; @@ -547,16 +554,16 @@ TEST_F(SyncerTest, Test1To1) { s2.local_versions[choose_component(gen)]++; } if (rand_sleep(gen) < 5) { - std::this_thread::sleep_for(1s); + std::this_thread::sleep_for(std::chrono::seconds(1)); } } - auto end = steady_clock::now(); + auto end = std::chrono::steady_clock::now(); // Max messages can be send during this period of time. // +1 is for corner cases. auto max_sends = - duration_cast(end - start).count() / + std::chrono::duration_cast(end - start).count() / RayConfig::instance().raylet_report_resources_period_milliseconds() + 1; @@ -721,7 +728,7 @@ bool TestCorrectness(std::function get_cluster_ for (size_t i = 0; i < 10; ++i) { if (!check()) { - std::this_thread::sleep_for(1s); + std::this_thread::sleep_for(std::chrono::seconds(1)); } else { break; } @@ -747,7 +754,7 @@ bool TestCorrectness(std::function get_cluster_ servers[server_idx]->local_versions[message_type]++; // expect to sleep for 100 times for the whole loop. 
if (rand_sleep(gen) < 100) { - std::this_thread::sleep_for(100ms); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); } } @@ -757,7 +764,7 @@ bool TestCorrectness(std::function get_cluster_ // Make sure everything is synced. for (size_t i = 0; i < 10; ++i) { if (!check()) { - std::this_thread::sleep_for(1s); + std::this_thread::sleep_for(std::chrono::seconds(1)); } else { break; } @@ -881,14 +888,18 @@ class SyncerReactorTest : public ::testing::Test { work_guard_ = std::make_unique(io_context_.get_executor()); thread_ = std::make_unique([this]() { io_context_.run(); }); - auto start = steady_clock::now(); - while (duration_cast(steady_clock::now() - start).count() <= 5) { + auto start = std::chrono::steady_clock::now(); + while (std::chrono::duration_cast( + std::chrono::steady_clock::now() - start) + .count() <= 5) { RAY_LOG(INFO) << "Waiting: " - << duration_cast(steady_clock::now() - start).count(); + << std::chrono::duration_cast( + std::chrono::steady_clock::now() - start) + .count(); if (rpc_service_->reactor != nullptr) { break; }; - std::this_thread::sleep_for(1s); + std::this_thread::sleep_for(std::chrono::seconds(1)); } } @@ -991,6 +1002,6 @@ int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); auto ret = RUN_ALL_TESTS(); // Sleep for gRPC to gracefully shutdown. 
- std::this_thread::sleep_for(2s); + std::this_thread::sleep_for(std::chrono::seconds(2)); return ret; } diff --git a/src/ray/common/source_location_test.cc b/src/ray/common/tests/source_location_test.cc similarity index 93% rename from src/ray/common/source_location_test.cc rename to src/ray/common/tests/source_location_test.cc index fe5c5b3078ec..74d938cee0d6 100644 --- a/src/ray/common/source_location_test.cc +++ b/src/ray/common/tests/source_location_test.cc @@ -35,7 +35,7 @@ TEST(SourceLocationTest, StringifyTest) { auto loc = RAY_LOC(); std::stringstream ss{}; ss << loc; - EXPECT_EQ(ss.str(), "src/ray/common/source_location_test.cc:35"); + EXPECT_EQ(ss.str(), "src/ray/common/tests/source_location_test.cc:35"); } } diff --git a/src/ray/common/test/status_or_test.cc b/src/ray/common/tests/status_or_test.cc similarity index 99% rename from src/ray/common/test/status_or_test.cc rename to src/ray/common/tests/status_or_test.cc index 4a5f41a4542e..5c20ab4e387a 100644 --- a/src/ray/common/test/status_or_test.cc +++ b/src/ray/common/tests/status_or_test.cc @@ -19,7 +19,7 @@ #include #include -#include "ray/common/test/testing.h" +#include "ray/common/tests/testing.h" namespace ray { diff --git a/src/ray/common/test/status_test.cc b/src/ray/common/tests/status_test.cc similarity index 99% rename from src/ray/common/test/status_test.cc rename to src/ray/common/tests/status_test.cc index aa3597193d25..84166968c5b4 100644 --- a/src/ray/common/test/status_test.cc +++ b/src/ray/common/tests/status_test.cc @@ -14,6 +14,8 @@ #include "ray/common/status.h" +#include + #include "gtest/gtest.h" #include "ray/common/grpc_util.h" diff --git a/src/ray/common/test/syncer_service_e2e_test.cc b/src/ray/common/tests/syncer_service_e2e_test.cc similarity index 87% rename from src/ray/common/test/syncer_service_e2e_test.cc rename to src/ray/common/tests/syncer_service_e2e_test.cc index 650a6fed0e47..90dbf97619c4 100644 --- a/src/ray/common/test/syncer_service_e2e_test.cc +++ 
b/src/ray/common/tests/syncer_service_e2e_test.cc @@ -22,16 +22,17 @@ #include #include #include +#include +#include +#include #include "ray/common/asio/periodical_runner.h" #include "ray/common/id.h" #include "ray/common/ray_syncer/ray_syncer.h" #include "ray/util/network_util.h" -using namespace std; -using namespace ray::syncer; using ray::PeriodicalRunner; -class LocalNode : public ReporterInterface { +class LocalNode : public ray::syncer::ReporterInterface { public: LocalNode(instrumented_io_context &io_context, ray::NodeID node_id) : node_id_(node_id), timer_(PeriodicalRunner::Create(io_context)) { @@ -51,8 +52,8 @@ class LocalNode : public ReporterInterface { "LocalNodeStateUpdate"); } - std::optional CreateSyncMessage(int64_t current_version, - MessageType) const override { + std::optional CreateSyncMessage( + int64_t current_version, ray::syncer::MessageType) const override { if (current_version > version_) { return std::nullopt; } @@ -72,7 +73,7 @@ class LocalNode : public ReporterInterface { std::shared_ptr timer_; }; -class RemoteNodes : public ReceiverInterface { +class RemoteNodes : public ray::syncer::ReceiverInterface { public: RemoteNodes() {} void ConsumeSyncMessage( @@ -100,18 +101,18 @@ int main(int argc, char *argv[]) { auto leader_port = std::string(argv[2]); auto local_node = std::make_unique(io_context, node_id); auto remote_node = std::make_unique(); - RaySyncer syncer(io_context, node_id.Binary()); + ray::syncer::RaySyncer syncer(io_context, node_id.Binary()); // RPC related field grpc::ServerBuilder builder; - std::unique_ptr service; + std::unique_ptr service; std::unique_ptr server; std::shared_ptr channel; syncer.Register( ray::rpc::syncer::MessageType::RESOURCE_VIEW, local_node.get(), remote_node.get()); if (server_port != ".") { RAY_LOG(INFO) << "Start server on port " << server_port; - auto server_address = BuildAddress("0.0.0.0", server_port); - service = std::make_unique(syncer); + auto server_address = 
ray::BuildAddress("0.0.0.0", server_port); + service = std::make_unique(syncer); builder.AddListeningPort(server_address, grpc::InsecureServerCredentials()); builder.RegisterService(service.get()); server = builder.BuildAndStart(); @@ -123,7 +124,7 @@ int main(int argc, char *argv[]) { argument.SetMaxSendMessageSize(::RayConfig::instance().max_grpc_message_size()); argument.SetMaxReceiveMessageSize(::RayConfig::instance().max_grpc_message_size()); - channel = grpc::CreateCustomChannel(BuildAddress("localhost", leader_port), + channel = grpc::CreateCustomChannel(ray::BuildAddress("localhost", leader_port), grpc::InsecureChannelCredentials(), argument); diff --git a/src/ray/common/test/task_spec_test.cc b/src/ray/common/tests/task_spec_test.cc similarity index 89% rename from src/ray/common/test/task_spec_test.cc rename to src/ray/common/tests/task_spec_test.cc index 5383adcd707f..bb8a64636ffe 100644 --- a/src/ray/common/test/task_spec_test.cc +++ b/src/ray/common/tests/task_spec_test.cc @@ -14,6 +14,10 @@ #include "ray/common/task/task_spec.h" +#include +#include +#include + #include "gtest/gtest.h" #include "ray/common/task/task_util.h" @@ -62,62 +66,62 @@ TEST(TaskSpecTest, TestSchedulingClassDescriptor) { ASSERT_TRUE(descriptor1 == descriptor1); ASSERT_TRUE(absl::Hash()(descriptor1) == absl::Hash()(descriptor1)); - ASSERT_TRUE(TaskSpecification::GetSchedulingClass(descriptor1) == - TaskSpecification::GetSchedulingClass(descriptor1)); + ASSERT_TRUE(SchedulingClassToIds::GetSchedulingClass(descriptor1) == + SchedulingClassToIds::GetSchedulingClass(descriptor1)); ASSERT_FALSE(descriptor1 == descriptor2); ASSERT_FALSE(absl::Hash()(descriptor1) == absl::Hash()(descriptor2)); - ASSERT_FALSE(TaskSpecification::GetSchedulingClass(descriptor1) == - TaskSpecification::GetSchedulingClass(descriptor2)); + ASSERT_FALSE(SchedulingClassToIds::GetSchedulingClass(descriptor1) == + SchedulingClassToIds::GetSchedulingClass(descriptor2)); ASSERT_FALSE(descriptor1 == descriptor3); 
ASSERT_FALSE(absl::Hash()(descriptor1) == absl::Hash()(descriptor3)); - ASSERT_FALSE(TaskSpecification::GetSchedulingClass(descriptor1) == - TaskSpecification::GetSchedulingClass(descriptor3)); + ASSERT_FALSE(SchedulingClassToIds::GetSchedulingClass(descriptor1) == + SchedulingClassToIds::GetSchedulingClass(descriptor3)); ASSERT_FALSE(descriptor1 == descriptor4); ASSERT_FALSE(absl::Hash()(descriptor1) == absl::Hash()(descriptor4)); - ASSERT_FALSE(TaskSpecification::GetSchedulingClass(descriptor1) == - TaskSpecification::GetSchedulingClass(descriptor4)); + ASSERT_FALSE(SchedulingClassToIds::GetSchedulingClass(descriptor1) == + SchedulingClassToIds::GetSchedulingClass(descriptor4)); ASSERT_FALSE(descriptor4 == descriptor5); ASSERT_FALSE(absl::Hash()(descriptor4) == absl::Hash()(descriptor5)); - ASSERT_FALSE(TaskSpecification::GetSchedulingClass(descriptor4) == - TaskSpecification::GetSchedulingClass(descriptor5)); + ASSERT_FALSE(SchedulingClassToIds::GetSchedulingClass(descriptor4) == + SchedulingClassToIds::GetSchedulingClass(descriptor5)); ASSERT_TRUE(descriptor5 == descriptor6); ASSERT_TRUE(absl::Hash()(descriptor5) == absl::Hash()(descriptor6)); - ASSERT_TRUE(TaskSpecification::GetSchedulingClass(descriptor5) == - TaskSpecification::GetSchedulingClass(descriptor6)); + ASSERT_TRUE(SchedulingClassToIds::GetSchedulingClass(descriptor5) == + SchedulingClassToIds::GetSchedulingClass(descriptor6)); ASSERT_FALSE(descriptor6 == descriptor10); ASSERT_FALSE(absl::Hash()(descriptor6) == absl::Hash()(descriptor10)); - ASSERT_FALSE(TaskSpecification::GetSchedulingClass(descriptor6) == - TaskSpecification::GetSchedulingClass(descriptor10)); + ASSERT_FALSE(SchedulingClassToIds::GetSchedulingClass(descriptor6) == + SchedulingClassToIds::GetSchedulingClass(descriptor10)); ASSERT_FALSE(descriptor6 == descriptor7); ASSERT_FALSE(absl::Hash()(descriptor6) == absl::Hash()(descriptor7)); - ASSERT_FALSE(TaskSpecification::GetSchedulingClass(descriptor6) == - 
TaskSpecification::GetSchedulingClass(descriptor7)); + ASSERT_FALSE(SchedulingClassToIds::GetSchedulingClass(descriptor6) == + SchedulingClassToIds::GetSchedulingClass(descriptor7)); ASSERT_FALSE(descriptor7 == descriptor8); ASSERT_FALSE(absl::Hash()(descriptor7) == absl::Hash()(descriptor8)); - ASSERT_FALSE(TaskSpecification::GetSchedulingClass(descriptor7) == - TaskSpecification::GetSchedulingClass(descriptor8)); + ASSERT_FALSE(SchedulingClassToIds::GetSchedulingClass(descriptor7) == + SchedulingClassToIds::GetSchedulingClass(descriptor8)); ASSERT_TRUE(descriptor7 == descriptor9); ASSERT_TRUE(absl::Hash()(descriptor7) == absl::Hash()(descriptor9)); - ASSERT_TRUE(TaskSpecification::GetSchedulingClass(descriptor7) == - TaskSpecification::GetSchedulingClass(descriptor9)); + ASSERT_TRUE(SchedulingClassToIds::GetSchedulingClass(descriptor7) == + SchedulingClassToIds::GetSchedulingClass(descriptor9)); } TEST(TaskSpecTest, TestActorSchedulingClass) { @@ -234,7 +238,7 @@ TEST(TaskSpecTest, TestCallerAddress) { rpc::Address caller_address; NodeID caller_node_id = NodeID::FromRandom(); WorkerID caller_worker_id = WorkerID::FromRandom(); - caller_address.set_raylet_id(caller_node_id.Binary()); + caller_address.set_node_id(caller_node_id.Binary()); caller_address.set_worker_id(caller_worker_id.Binary()); TaskSpecBuilder task_spec_builder; task_spec_builder.SetCommonTaskSpec( diff --git a/src/ray/common/test/testing.h b/src/ray/common/tests/testing.h similarity index 100% rename from src/ray/common/test/testing.h rename to src/ray/common/tests/testing.h diff --git a/src/ray/core_worker/BUILD.bazel b/src/ray/core_worker/BUILD.bazel index 931ad1959ad0..429655af4fe1 100644 --- a/src/ray/core_worker/BUILD.bazel +++ b/src/ray/core_worker/BUILD.bazel @@ -5,11 +5,13 @@ ray_cc_library( srcs = [ "core_worker.cc", "core_worker_process.cc", + "core_worker_shutdown_executor.cc", ], hdrs = [ "core_worker.h", "core_worker_process.h", "core_worker_rpc_proxy.h", + 
"core_worker_shutdown_executor.h", ], deps = [ ":actor_handle", @@ -21,28 +23,30 @@ ray_cc_library( ":experimental_mutable_object_provider", ":future_resolver", ":generator_waiter", + ":grpc_service", ":memory_store", - ":normal_task_submitter", + ":metrics", ":object_recovery_manager", ":plasma_store_provider", ":profile_event", ":reference_count", + ":shutdown_coordinator", ":task_event_buffer", + "//src/ray/common:protobuf_utils", "//src/ray/common/cgroup:cgroup_context", "//src/ray/common/cgroup:cgroup_manager", "//src/ray/common/cgroup:constants", "//src/ray/core_worker/task_execution:task_receiver", - "//src/ray/gcs:gcs_pb_util", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/core_worker/task_submission:normal_task_submitter", + "//src/ray/gcs_client", "//src/ray/ipc:raylet_ipc_client", "//src/ray/protobuf:pubsub_cc_proto", "//src/ray/pubsub:publisher", "//src/ray/pubsub:subscriber", - "//src/ray/raylet_client:raylet_client_lib", "//src/ray/rpc:core_worker_client", - "//src/ray/rpc:core_worker_server", + "//src/ray/rpc:metrics_agent_client", + "//src/ray/rpc:raylet_client_lib", "//src/ray/stats:stats_lib", - "//src/ray/util", "//src/ray/util:container_util", "//src/ray/util:env", "//src/ray/util:event", @@ -53,6 +57,7 @@ ray_cc_library( "//src/ray/util:shared_lru", "//src/ray/util:stream_redirection", "//src/ray/util:stream_redirection_options", + "//src/ray/util:time", "@com_google_absl//absl/cleanup", "@com_google_absl//absl/strings", "@com_google_googletest//:gtest_prod", @@ -60,35 +65,48 @@ ray_cc_library( ) ray_cc_library( - name = "core_worker_options", - hdrs = ["core_worker_options.h"], + name = "grpc_service", + srcs = [ + "grpc_service.cc", + ], + hdrs = [ + "grpc_service.h", + ], + visibility = [":__subpackages__"], deps = [ - ":common", - "//src/ray/common:id", - "//src/ray/common:ray_object", - "//src/ray/common:status", - "//src/ray/common:task_common", - "//src/ray/gcs/gcs_client:gcs_client_lib", - "//src/ray/util:process", + 
"//src/ray/common:asio", + "//src/ray/protobuf:core_worker_cc_grpc", + "//src/ray/protobuf:core_worker_cc_proto", + "//src/ray/rpc:grpc_server", + "//src/ray/rpc:server_call", ], ) ray_cc_library( - name = "core_worker_fiber", - hdrs = ["fiber.h"], + name = "shutdown_coordinator", + srcs = [ + "shutdown_coordinator.cc", + ], + hdrs = [ + "shutdown_coordinator.h", + ], + visibility = [":__subpackages__"], deps = [ - "//src/ray/util:logging", - "@boost//:fiber", + "//src/ray/common:buffer", + "//src/ray/protobuf:common_cc_proto", ], ) ray_cc_library( - name = "actor_submit_queue", - hdrs = ["transport/actor_submit_queue.h"], + name = "core_worker_options", + hdrs = ["core_worker_options.h"], deps = [ + ":common", "//src/ray/common:id", - "//src/ray/common:task_common", - "@com_google_absl//absl/types:optional", + "//src/ray/common:ray_object", + "//src/ray/common:status", + "//src/ray/gcs_client", + "//src/ray/util:process", ], ) @@ -101,6 +119,7 @@ ray_cc_library( "//src/ray/common:id", "//src/ray/common:ray_object", "//src/ray/common:task_common", + "//src/ray/util:process", ], ) @@ -136,10 +155,20 @@ ray_cc_library( ray_cc_library( name = "actor_creator", + srcs = ["actor_creator.cc"], hdrs = ["actor_creator.h"], + visibility = [":__subpackages__"], deps = [ - "//src/ray/common:ray_config", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/gcs_client", + ], +) + +ray_cc_library( + name = "fake_actor_creator", + hdrs = ["fake_actor_creator.h"], + visibility = [":__subpackages__"], + deps = [ + ":actor_creator", ], ) @@ -150,14 +179,14 @@ ray_cc_library( deps = [ ":actor_creator", ":actor_handle", - ":actor_task_submitter", ":common", ":core_worker_context", ":reference_count", "//src/ray/common:id", + "//src/ray/common:protobuf_utils", "//src/ray/common:task_common", - "//src/ray/gcs:gcs_pb_util", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/core_worker/task_submission:actor_task_submitter", + "//src/ray/gcs_client", 
"//src/ray/protobuf:core_worker_cc_proto", "@com_google_absl//absl/container:flat_hash_map", "@com_google_googletest//:gtest_prod", @@ -169,11 +198,11 @@ ray_cc_library( srcs = ["reference_count.cc"], hdrs = ["reference_count.h"], deps = [ - ":lease_policy", "//src/ray/common:id", + "//src/ray/core_worker:lease_policy", "//src/ray/protobuf:common_cc_proto", - "//src/ray/pubsub:publisher", - "//src/ray/pubsub:subscriber", + "//src/ray/pubsub:publisher_interface", + "//src/ray/pubsub:subscriber_interface", "//src/ray/rpc:core_worker_client", "//src/ray/rpc:grpc_server", "//src/ray/util:logging", @@ -187,9 +216,10 @@ ray_cc_library( name = "lease_policy", srcs = ["lease_policy.cc"], hdrs = ["lease_policy.h"], + visibility = [":__subpackages__"], deps = [ "//src/ray/common:id", - "//src/ray/common:task_common", + "//src/ray/common:lease", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -204,9 +234,9 @@ ray_cc_library( deps = [ "//src/ray/common:asio", "//src/ray/common:id", + "//src/ray/common:protobuf_utils", "//src/ray/common:task_common", - "//src/ray/gcs:gcs_pb_util", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/gcs_client", "//src/ray/protobuf:export_task_event_cc_proto", "//src/ray/protobuf:gcs_cc_proto", "//src/ray/rpc:event_aggregator_client", @@ -219,29 +249,6 @@ ray_cc_library( ], ) -ray_cc_library( - name = "out_of_order_actor_submit_queue", - srcs = ["transport/out_of_order_actor_submit_queue.cc"], - hdrs = ["transport/out_of_order_actor_submit_queue.h"], - deps = [ - ":actor_submit_queue", - "//src/ray/common:id", - "@com_google_absl//absl/container:btree", - "@com_google_absl//absl/types:optional", - ], -) - -ray_cc_library( - name = "sequential_actor_submit_queue", - srcs = ["transport/sequential_actor_submit_queue.cc"], - hdrs = ["transport/sequential_actor_submit_queue.h"], - deps = [ - "actor_submit_queue", - "//src/ray/common:id", - 
"@com_google_absl//absl/types:optional", - ], -) - ray_cc_library( name = "memory_store", srcs = ["store_provider/memory_store/memory_store.cc"], @@ -253,7 +260,7 @@ ray_cc_library( "//src/ray/common:id", "//src/ray/common:ray_config", "//src/ray/common:status", - "//src/ray/ipc:raylet_ipc_client", + "//src/ray/ipc:raylet_ipc_client_interface", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/synchronization", @@ -284,13 +291,13 @@ ray_cc_library( ":task_manager_interface", "//src/ray/common:buffer", "//src/ray/common:id", - "//src/ray/gcs:gcs_pb_util", + "//src/ray/common:protobuf_utils", "//src/ray/protobuf:common_cc_proto", "//src/ray/protobuf:core_worker_cc_proto", "//src/ray/stats:stats_metric", - "//src/ray/util", "//src/ray/util:counter_map", "//src/ray/util:exponential_backoff", + "//src/ray/util:time", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", @@ -298,44 +305,6 @@ ray_cc_library( ], ) -ray_cc_library( - name = "dependency_resolver", - srcs = ["transport/dependency_resolver.cc"], - hdrs = ["transport/dependency_resolver.h"], - deps = [ - ":actor_creator", - ":memory_store", - ":task_manager_interface", - "//src/ray/common:id", - "//src/ray/common:task_common", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - ], -) - -ray_cc_library( - name = "actor_task_submitter", - srcs = ["transport/actor_task_submitter.cc"], - hdrs = ["transport/actor_task_submitter.h"], - deps = [ - ":actor_creator", - ":actor_submit_queue", - ":core_worker_context", - ":dependency_resolver", - ":out_of_order_actor_submit_queue", - ":sequential_actor_submit_queue", - "//src/ray/common:asio", - "//src/ray/common:id", - "//src/ray/common:ray_object", - "//src/ray/gcs:gcs_pb_util", - "//src/ray/gcs/gcs_client:gcs_client_lib", - "//src/ray/rpc:core_worker_client", - 
"@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - ], -) - ray_cc_library( name = "experimental_mutable_object_manager", srcs = ["experimental_mutable_object_manager.cc"], @@ -344,9 +313,9 @@ ray_cc_library( "//src/ray/common:ray_config", "//src/ray/common:ray_object", "//src/ray/common:status", - "//src/ray/common:task_common", "//src/ray/object_manager:object_manager_common", "//src/ray/object_manager/plasma:plasma_client", + "//src/ray/util:time", "@com_google_absl//absl/container:node_hash_map", "@com_google_absl//absl/strings", "@com_google_googletest//:gtest_prod", @@ -371,8 +340,8 @@ ray_cc_library( hdrs = ["experimental_mutable_object_provider.h"], deps = [ ":experimental_mutable_object_manager", - "//src/ray/raylet_client:raylet_client_lib", "//src/ray/rpc:client_call", + "//src/ray/rpc:raylet_client_interface", ], ) @@ -395,7 +364,7 @@ ray_cc_library( ":reference_count", ":task_manager", "//src/ray/common:id", - "//src/ray/raylet_client:raylet_client_lib", + "//src/ray/rpc:raylet_client_pool", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/synchronization", ], @@ -424,30 +393,19 @@ ray_cc_library( "//src/ray/common:id", "//src/ray/common:ray_config", "//src/ray/common:status", - "//src/ray/common:task_common", - "//src/ray/ipc:raylet_ipc_client", + "//src/ray/ipc:raylet_ipc_client_interface", "//src/ray/object_manager/plasma:plasma_client", "//src/ray/protobuf:common_cc_proto", + "//src/ray/util:time", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", ], ) ray_cc_library( - name = "normal_task_submitter", - srcs = ["transport/normal_task_submitter.cc"], - hdrs = ["transport/normal_task_submitter.h"], + name = "metrics", + hdrs = ["metrics.h"], deps = [ - ":actor_manager", - ":core_worker_context", - ":dependency_resolver", - ":lease_policy", - ":memory_store", - ":task_manager", - 
"//src/ray/common:id", - "//src/ray/gcs:gcs_pb_util", - "//src/ray/raylet_client:raylet_client_lib", - "//src/ray/rpc:core_worker_client", - "@com_google_absl//absl/base:core_headers", + "//src/ray/stats:stats_lib", ], ) diff --git a/src/ray/core_worker/actor_creator.cc b/src/ray/core_worker/actor_creator.cc new file mode 100644 index 000000000000..b5d9e10c99a3 --- /dev/null +++ b/src/ray/core_worker/actor_creator.cc @@ -0,0 +1,86 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/core_worker/actor_creator.h" + +#include +#include +#include + +namespace ray { +namespace core { + +Status ActorCreator::RegisterActor(const TaskSpecification &task_spec) const { + const auto status = actor_client_.SyncRegisterActor(task_spec); + if (status.IsTimedOut()) { + std::ostringstream stream; + stream << "There was timeout in registering an actor. 
It is probably " + "because GCS server is dead or there's a high load there."; + return Status::TimedOut(stream.str()); + } + return status; +} + +void ActorCreator::AsyncRegisterActor(const TaskSpecification &task_spec, + gcs::StatusCallback callback) { + auto actor_id = task_spec.ActorCreationId(); + (*registering_actors_)[actor_id] = {}; + if (callback != nullptr) { + (*registering_actors_)[actor_id].emplace_back(std::move(callback)); + } + actor_client_.AsyncRegisterActor(task_spec, [actor_id, this](Status status) { + std::vector cbs; + cbs = std::move((*registering_actors_)[actor_id]); + registering_actors_->erase(actor_id); + for (auto &cb : cbs) { + cb(status); + } + }); +} + +void ActorCreator::AsyncRestartActorForLineageReconstruction( + const ActorID &actor_id, + uint64_t num_restarts_due_to_lineage_reconstructions, + gcs::StatusCallback callback) { + actor_client_.AsyncRestartActorForLineageReconstruction( + actor_id, num_restarts_due_to_lineage_reconstructions, callback); +} + +void ActorCreator::AsyncReportActorOutOfScope( + const ActorID &actor_id, + uint64_t num_restarts_due_to_lineage_reconstruction, + gcs::StatusCallback callback) { + actor_client_.AsyncReportActorOutOfScope( + actor_id, num_restarts_due_to_lineage_reconstruction, callback); +} + +bool ActorCreator::IsActorInRegistering(const ActorID &actor_id) const { + return registering_actors_->find(actor_id) != registering_actors_->end(); +} + +void ActorCreator::AsyncWaitForActorRegisterFinish(const ActorID &actor_id, + gcs::StatusCallback callback) { + auto iter = registering_actors_->find(actor_id); + RAY_CHECK(iter != registering_actors_->end()); + iter->second.emplace_back(std::move(callback)); +} + +void ActorCreator::AsyncCreateActor( + const TaskSpecification &task_spec, + const rpc::ClientCallback &callback) { + actor_client_.AsyncCreateActor(task_spec, callback); +} + +} // namespace core +} // namespace ray diff --git a/src/ray/core_worker/actor_creator.h 
b/src/ray/core_worker/actor_creator.h index fb92ce99ac94..e34751f1f116 100644 --- a/src/ray/core_worker/actor_creator.h +++ b/src/ray/core_worker/actor_creator.h @@ -18,8 +18,7 @@ #include #include -#include "ray/common/ray_config.h" -#include "ray/gcs/gcs_client/gcs_client.h" +#include "ray/gcs_client/accessor.h" namespace ray { namespace core { @@ -72,73 +71,36 @@ class ActorCreatorInterface { virtual bool IsActorInRegistering(const ActorID &actor_id) const = 0; }; -class DefaultActorCreator : public ActorCreatorInterface { +class ActorCreator : public ActorCreatorInterface { public: - explicit DefaultActorCreator(std::shared_ptr gcs_client) - : gcs_client_(std::move(gcs_client)) {} - - Status RegisterActor(const TaskSpecification &task_spec) const override { - const auto status = gcs_client_->Actors().SyncRegisterActor(task_spec); - if (status.IsTimedOut()) { - std::ostringstream stream; - stream << "There was timeout in registering an actor. It is probably " - "because GCS server is dead or there's a high load there."; - return Status::TimedOut(stream.str()); - } - return status; - } + explicit ActorCreator(gcs::ActorInfoAccessor &actor_client) + : actor_client_(actor_client) {} + + Status RegisterActor(const TaskSpecification &task_spec) const override; void AsyncRegisterActor(const TaskSpecification &task_spec, - gcs::StatusCallback callback) override { - auto actor_id = task_spec.ActorCreationId(); - (*registering_actors_)[actor_id] = {}; - if (callback != nullptr) { - (*registering_actors_)[actor_id].emplace_back(std::move(callback)); - } - gcs_client_->Actors().AsyncRegisterActor(task_spec, [actor_id, this](Status status) { - std::vector cbs; - cbs = std::move((*registering_actors_)[actor_id]); - registering_actors_->erase(actor_id); - for (auto &cb : cbs) { - cb(status); - } - }); - } + gcs::StatusCallback callback) override; void AsyncRestartActorForLineageReconstruction( const ActorID &actor_id, uint64_t num_restarts_due_to_lineage_reconstructions, - 
gcs::StatusCallback callback) override { - gcs_client_->Actors().AsyncRestartActorForLineageReconstruction( - actor_id, num_restarts_due_to_lineage_reconstructions, callback); - } + gcs::StatusCallback callback) override; void AsyncReportActorOutOfScope(const ActorID &actor_id, uint64_t num_restarts_due_to_lineage_reconstruction, - gcs::StatusCallback callback) override { - gcs_client_->Actors().AsyncReportActorOutOfScope( - actor_id, num_restarts_due_to_lineage_reconstruction, callback); - } + gcs::StatusCallback callback) override; - bool IsActorInRegistering(const ActorID &actor_id) const override { - return registering_actors_->find(actor_id) != registering_actors_->end(); - } + bool IsActorInRegistering(const ActorID &actor_id) const override; void AsyncWaitForActorRegisterFinish(const ActorID &actor_id, - gcs::StatusCallback callback) override { - auto iter = registering_actors_->find(actor_id); - RAY_CHECK(iter != registering_actors_->end()); - iter->second.emplace_back(std::move(callback)); - } + gcs::StatusCallback callback) override; void AsyncCreateActor( const TaskSpecification &task_spec, - const rpc::ClientCallback &callback) override { - gcs_client_->Actors().AsyncCreateActor(task_spec, callback); - } + const rpc::ClientCallback &callback) override; private: - std::shared_ptr gcs_client_; + gcs::ActorInfoAccessor &actor_client_; using RegisteringActorType = absl::flat_hash_map>; ThreadPrivate registering_actors_; diff --git a/src/ray/core_worker/actor_handle.cc b/src/ray/core_worker/actor_handle.cc index b11065c8bffc..4f3e04875345 100644 --- a/src/ray/core_worker/actor_handle.cc +++ b/src/ray/core_worker/actor_handle.cc @@ -35,6 +35,7 @@ rpc::ActorHandle CreateInnerActorHandle( const std::string &ray_namespace, int32_t max_pending_calls, bool allow_out_of_order_execution, + bool enable_tensor_transport, std::optional enable_task_events, const std::unordered_map &labels) { rpc::ActorHandle inner; @@ -50,8 +51,9 @@ rpc::ActorHandle 
CreateInnerActorHandle( inner.set_max_task_retries(max_task_retries); inner.set_name(name); inner.set_ray_namespace(ray_namespace); - inner.set_allow_out_of_order_execution(allow_out_of_order_execution); inner.set_max_pending_calls(max_pending_calls); + inner.set_allow_out_of_order_execution(allow_out_of_order_execution); + inner.set_enable_tensor_transport(enable_tensor_transport); inner.set_enable_task_events(enable_task_events.value_or(kDefaultTaskEventEnabled)); inner.mutable_labels()->insert(labels.begin(), labels.end()); return inner; @@ -105,6 +107,7 @@ ActorHandle::ActorHandle( const std::string &ray_namespace, int32_t max_pending_calls, bool allow_out_of_order_execution, + bool enable_tensor_transport, std::optional enable_task_events, const std::unordered_map &labels) : ActorHandle(CreateInnerActorHandle(actor_id, @@ -120,6 +123,7 @@ ActorHandle::ActorHandle( ray_namespace, max_pending_calls, allow_out_of_order_execution, + enable_tensor_transport, enable_task_events, labels)) {} diff --git a/src/ray/core_worker/actor_handle.h b/src/ray/core_worker/actor_handle.h index b3ec2294befb..d9aa163f7dfa 100644 --- a/src/ray/core_worker/actor_handle.h +++ b/src/ray/core_worker/actor_handle.h @@ -49,6 +49,7 @@ class ActorHandle { const std::string &ray_namespace, int32_t max_pending_calls, bool allow_out_of_order_execution = false, + bool enable_tensor_transport = false, std::optional enable_task_events = absl::nullopt, const std::unordered_map &labels = {}); @@ -110,6 +111,8 @@ class ActorHandle { bool AllowOutOfOrderExecution() const { return inner_.allow_out_of_order_execution(); } + bool EnableTensorTransport() const { return inner_.enable_tensor_transport(); } + const ::google::protobuf::Map &GetLabels() const { return inner_.labels(); } diff --git a/src/ray/core_worker/actor_manager.cc b/src/ray/core_worker/actor_manager.cc index 4e6586dc10d1..767525be2937 100644 --- a/src/ray/core_worker/actor_manager.cc +++ b/src/ray/core_worker/actor_manager.cc @@ -19,7 
+19,7 @@ #include #include -#include "ray/gcs/pb_util.h" +#include "ray/common/protobuf_utils.h" namespace ray { namespace core { @@ -214,8 +214,8 @@ void ActorManager::HandleActorStateNotification(const ActorID &actor_id, const rpc::ActorTableData &actor_data) { const auto &actor_state = rpc::ActorTableData::ActorState_Name(actor_data.state()); const auto worker_id = WorkerID::FromBinary(actor_data.address().worker_id()); - const auto raylet_id = NodeID::FromBinary(actor_data.address().raylet_id()); - RAY_LOG(INFO).WithField(actor_id).WithField(worker_id).WithField(raylet_id) + const auto node_id = NodeID::FromBinary(actor_data.address().node_id()); + RAY_LOG(INFO).WithField(actor_id).WithField(worker_id).WithField(node_id) << "received notification on actor, state: " << actor_state << ", ip address: " << actor_data.address().ip_address() << ", port: " << actor_data.address().port() diff --git a/src/ray/core_worker/actor_manager.h b/src/ray/core_worker/actor_manager.h index 897aa45e5b5b..ee9eaf798563 100644 --- a/src/ray/core_worker/actor_manager.h +++ b/src/ray/core_worker/actor_manager.h @@ -25,8 +25,8 @@ #include "ray/core_worker/actor_creator.h" #include "ray/core_worker/actor_handle.h" #include "ray/core_worker/reference_count.h" -#include "ray/core_worker/transport/actor_task_submitter.h" -#include "ray/gcs/gcs_client/gcs_client.h" +#include "ray/core_worker/task_submission/actor_task_submitter.h" +#include "ray/gcs_client/gcs_client.h" namespace ray { namespace core { diff --git a/src/ray/core_worker/common.cc b/src/ray/core_worker/common.cc index e82ed8d8fbed..e372c925b9bd 100644 --- a/src/ray/core_worker/common.cc +++ b/src/ray/core_worker/common.cc @@ -17,7 +17,8 @@ #include #include #include -#include + +#include "ray/util/process.h" namespace ray { namespace core { diff --git a/src/ray/core_worker/common.h b/src/ray/core_worker/common.h index de2a41013a7a..c68bed12d18e 100644 --- a/src/ray/core_worker/common.h +++ b/src/ray/core_worker/common.h @@ 
-22,7 +22,6 @@ #include "ray/common/id.h" #include "ray/common/ray_object.h" -#include "ray/common/scheduling/label_selector.h" #include "ray/common/task/task_spec.h" #include "src/ray/protobuf/common.pb.h" @@ -130,6 +129,7 @@ struct ActorCreationOptions { std::vector concurrency_groups_p = {}, bool allow_out_of_order_execution_p = false, int32_t max_pending_calls_p = -1, + bool enable_tensor_transport_p = false, bool enable_task_events_p = kDefaultTaskEventEnabled, std::unordered_map labels_p = {}, std::unordered_map label_selector_p = {}) @@ -148,6 +148,7 @@ struct ActorCreationOptions { concurrency_groups(std::move(concurrency_groups_p)), allow_out_of_order_execution(allow_out_of_order_execution_p), max_pending_calls(max_pending_calls_p), + enable_tensor_transport(enable_tensor_transport_p), scheduling_strategy(std::move(scheduling_strategy_p)), enable_task_events(enable_task_events_p), labels(std::move(labels_p)), @@ -201,6 +202,7 @@ struct ActorCreationOptions { const bool allow_out_of_order_execution = false; /// The maximum actor call pending count. const int max_pending_calls = -1; + const bool enable_tensor_transport = false; // The strategy about how to schedule this actor. 
rpc::SchedulingStrategy scheduling_strategy; /// True if task events (worker::TaskEvent) from this creation task should be reported @@ -219,39 +221,35 @@ struct PlacementGroupCreationOptions { PlacementStrategy strategy, std::vector> bundles, bool is_detached_p, - double max_cpu_fraction_per_node, NodeID soft_target_node_id = NodeID::Nil(), std::vector> bundle_label_selector = {}) - : name(std::move(name)), - strategy(strategy), - bundles(std::move(bundles)), - is_detached(is_detached_p), - max_cpu_fraction_per_node(max_cpu_fraction_per_node), - soft_target_node_id(soft_target_node_id), - bundle_label_selector(std::move(bundle_label_selector)) { - RAY_CHECK(soft_target_node_id.IsNil() || strategy == PlacementStrategy::STRICT_PACK) + : name_(std::move(name)), + strategy_(strategy), + bundles_(std::move(bundles)), + is_detached_(is_detached_p), + soft_target_node_id_(soft_target_node_id), + bundle_label_selector_(std::move(bundle_label_selector)) { + RAY_CHECK(soft_target_node_id_.IsNil() || strategy_ == PlacementStrategy::STRICT_PACK) << "soft_target_node_id only works with STRICT_PACK now"; } /// The name of the placement group. - const std::string name; + const std::string name_; /// The strategy to place the bundle in Placement Group. - const PlacementStrategy strategy = rpc::PACK; + const PlacementStrategy strategy_ = rpc::PACK; /// The resource bundles in this placement group. - const std::vector> bundles; + const std::vector> bundles_; /// Whether to keep the placement group persistent after its creator dead. - const bool is_detached = false; - /// The maximum fraction of CPU cores this placement group can take up on each node. - const double max_cpu_fraction_per_node; + const bool is_detached_ = false; /// ID of the target node where bundles should be placed /// iff the target node has enough available resources and alive. /// Otherwise, the bundles can be placed elsewhere. /// Nil means there is no target node. /// This only applies to STRICT_PACK pg. 
- const NodeID soft_target_node_id; + const NodeID soft_target_node_id_; /// The label selectors to apply per-bundle in this placement group. - const std::vector> bundle_label_selector; + const std::vector> bundle_label_selector_; }; class ObjectLocation { @@ -311,13 +309,13 @@ namespace std { template <> struct hash { size_t operator()(const ray::rpc::LineageReconstructionTask &task) const { - size_t hash = std::hash()(task.name()); - hash ^= std::hash()(task.status()); + size_t hash_value = std::hash()(task.name()); + hash_value ^= std::hash()(task.status()); for (const auto &label : task.labels()) { - hash ^= std::hash()(label.first); - hash ^= std::hash()(label.second); + hash_value ^= std::hash()(label.first); + hash_value ^= std::hash()(label.second); } - return hash; + return hash_value; } }; } // namespace std diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 6cebf980861a..c9b93f4e0d22 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -23,6 +23,9 @@ #include #include +#include "ray/core_worker/core_worker_shutdown_executor.h" +#include "ray/core_worker/shutdown_coordinator.h" + #ifndef _WIN32 #include #endif @@ -35,22 +38,24 @@ #include "ray/common/cgroup/cgroup_context.h" #include "ray/common/cgroup/cgroup_manager.h" #include "ray/common/cgroup/constants.h" +#include "ray/common/protobuf_utils.h" #include "ray/common/ray_config.h" #include "ray/common/runtime_env_common.h" #include "ray/common/task/task_util.h" -#include "ray/gcs/gcs_client/gcs_client.h" -#include "ray/gcs/pb_util.h" +#include "ray/gcs_client/gcs_client.h" #include "ray/rpc/event_aggregator_client.h" #include "ray/util/container_util.h" #include "ray/util/event.h" #include "ray/util/subreaper.h" -#include "ray/util/util.h" +#include "ray/util/time.h" using json = nlohmann::json; using MessageType = ray::protocol::MessageType; namespace ray::core { +using std::literals::operator""sv; + namespace { // Default 
capacity for serialization caches. constexpr size_t kDefaultSerializationCacheCap = 500; @@ -123,6 +128,28 @@ std::optional TryGetLocalObjectLocation( return CreateObjectLocation(object_info); } +/// Converts rpc::WorkerExitType to ShutdownReason +/// \param exit_type The worker exit type to convert +/// \param is_force_exit If true, INTENDED_USER_EXIT maps to kForcedExit; otherwise +/// kGracefulExit +ShutdownReason ConvertExitTypeToShutdownReason(rpc::WorkerExitType exit_type, + bool is_force_exit = false) { + switch (exit_type) { + case rpc::WorkerExitType::INTENDED_SYSTEM_EXIT: + return ShutdownReason::kIntentionalShutdown; + case rpc::WorkerExitType::INTENDED_USER_EXIT: + return is_force_exit ? ShutdownReason::kForcedExit : ShutdownReason::kGracefulExit; + case rpc::WorkerExitType::USER_ERROR: + return ShutdownReason::kUserError; + case rpc::WorkerExitType::SYSTEM_ERROR: + return ShutdownReason::kUnexpectedError; + case rpc::WorkerExitType::NODE_OUT_OF_MEMORY: + return ShutdownReason::kOutOfMemory; + default: + return ShutdownReason::kUnexpectedError; + } +} + } // namespace JobID GetProcessJobID(const CoreWorkerOptions &options) { @@ -141,7 +168,8 @@ JobID GetProcessJobID(const CoreWorkerOptions &options) { return options.job_id; } -TaskCounter::TaskCounter() { +TaskCounter::TaskCounter(ray::observability::MetricInterface &task_by_state_counter) + : task_by_state_counter_(task_by_state_counter) { counter_.SetOnChangeCallback( [this](const std::tuple &key) ABSL_EXCLUSIVE_LOCKS_REQUIRED(&mu_) mutable { @@ -156,37 +184,37 @@ TaskCounter::TaskCounter() { const auto is_retry_label = is_retry ? "1" : "0"; // RUNNING_IN_RAY_GET/WAIT are sub-states of RUNNING, so we need to subtract // them out to avoid double-counting. 
- ray::stats::STATS_tasks.Record( + task_by_state_counter_.Record( running_total - num_in_get - num_in_wait, - {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING)}, - {"Name", func_name}, - {"IsRetry", is_retry_label}, - {"JobId", job_id_}, - {"Source", "executor"}}); + {{"State"sv, rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING)}, + {"Name"sv, func_name}, + {"IsRetry"sv, is_retry_label}, + {"JobId"sv, job_id_}, + {"Source"sv, "executor"}}); // Negate the metrics recorded from the submitter process for these tasks. - ray::stats::STATS_tasks.Record( + task_by_state_counter_.Record( -running_total, - {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::SUBMITTED_TO_WORKER)}, - {"Name", func_name}, - {"IsRetry", is_retry_label}, - {"JobId", job_id_}, - {"Source", "executor"}}); + {{"State"sv, rpc::TaskStatus_Name(rpc::TaskStatus::SUBMITTED_TO_WORKER)}, + {"Name"sv, func_name}, + {"IsRetry"sv, is_retry_label}, + {"JobId"sv, job_id_}, + {"Source"sv, "executor"}}); // Record sub-state for get. - ray::stats::STATS_tasks.Record( + task_by_state_counter_.Record( num_in_get, - {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING_IN_RAY_GET)}, - {"Name", func_name}, - {"IsRetry", is_retry_label}, - {"JobId", job_id_}, - {"Source", "executor"}}); + {{"State"sv, rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING_IN_RAY_GET)}, + {"Name"sv, func_name}, + {"IsRetry"sv, is_retry_label}, + {"JobId"sv, job_id_}, + {"Source"sv, "executor"}}); // Record sub-state for wait. 
- ray::stats::STATS_tasks.Record( + task_by_state_counter_.Record( num_in_wait, - {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING_IN_RAY_WAIT)}, - {"Name", func_name}, - {"IsRetry", is_retry_label}, - {"JobId", job_id_}, - {"Source", "executor"}}); + {{"State"sv, rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING_IN_RAY_WAIT)}, + {"Name"sv, func_name}, + {"IsRetry"sv, is_retry_label}, + {"JobId"sv, job_id_}, + {"Source"sv, "executor"}}); }); } @@ -273,7 +301,7 @@ CoreWorker::CoreWorker( std::unique_ptr core_worker_server, rpc::Address rpc_address, std::shared_ptr gcs_client, - std::shared_ptr raylet_ipc_client, + std::shared_ptr raylet_ipc_client, std::shared_ptr local_raylet_rpc_client, boost::thread &io_thread, std::shared_ptr reference_counter, @@ -293,7 +321,8 @@ CoreWorker::CoreWorker( std::unique_ptr actor_manager, instrumented_io_context &task_execution_service, std::unique_ptr task_event_buffer, - uint32_t pid) + uint32_t pid, + ray::observability::MetricInterface &task_by_state_counter) : options_(std::move(options)), get_call_site_(RayConfig::instance().record_ref_creation_sites() ? options_.get_lang_stack @@ -332,8 +361,10 @@ CoreWorker::CoreWorker( task_execution_service_(task_execution_service), exiting_detail_(std::nullopt), max_direct_call_object_size_(RayConfig::instance().max_direct_call_object_size()), + task_counter_(task_by_state_counter), task_event_buffer_(std::move(task_event_buffer)), pid_(pid), + actor_shutdown_callback_(std::move(options_.actor_shutdown_callback)), runtime_env_json_serialization_cache_(kDefaultSerializationCacheCap) { // Initialize task receivers. 
if (options_.worker_type == WorkerType::WORKER || options_.is_local_mode) { @@ -396,7 +427,8 @@ CoreWorker::CoreWorker( /*attempt_number=*/0, rpc::TaskStatus::RUNNING, /*timestamp=*/absl::GetCurrentTimeNanos(), - /*is_actor_task=*/false, + /*is_actor_task_event=*/false, + options_.session_name, std::make_shared(std::move(spec))); task_event_buffer_->AddTaskEvent(std::move(task_event)); } @@ -487,55 +519,23 @@ CoreWorker::CoreWorker( // NOTE: This also marks the worker as available in Raylet. We do this at the very end // in case there is a problem during construction. ConnectToRayletInternal(); + + // Initialize shutdown coordinator last - after all services are ready + // Create concrete shutdown executor that implements real shutdown operations + auto shutdown_executor = std::make_unique(this); + shutdown_coordinator_ = std::make_unique( + std::move(shutdown_executor), options_.worker_type); + + RAY_LOG(DEBUG) << "Initialized unified shutdown coordinator with concrete executor for " + "worker type: " + << WorkerTypeString(options_.worker_type); } // NOLINT(readability/fn_size) CoreWorker::~CoreWorker() { RAY_LOG(INFO) << "Core worker is destructed"; } void CoreWorker::Shutdown() { - // Ensure that the shutdown logic runs at most once. - bool expected = false; - if (!is_shutdown_.compare_exchange_strong(expected, /*desired=*/true)) { - RAY_LOG(INFO) << "Shutdown was called more than once, ignoring."; - return; - } - RAY_LOG(INFO) << "Shutting down."; - - if (options_.worker_type == WorkerType::WORKER) { - // Running in a main thread. - // Asyncio coroutines could still run after CoreWorker is removed because it is - // running in a different thread. This can cause segfault because coroutines try to - // access CoreWorker methods that are already garbage collected. We should complete - // all coroutines before shutting down in order to prevent this. 
- if (worker_context_->CurrentActorIsAsync()) { - options_.terminate_asyncio_thread(); - } - task_execution_service_.stop(); - } - - task_event_buffer_->FlushEvents(/*forced=*/true); - task_event_buffer_->Stop(); - - io_service_.stop(); - RAY_LOG(INFO) << "Waiting for joining a core worker io thread. If it hangs here, there " - "might be deadlock or a high load in the core worker io service."; - if (io_thread_.joinable()) { - io_thread_.join(); - } - - // Shutdown gRPC server - core_worker_server_->Shutdown(); - - // Now that gcs_client is not used within io service, we can reset the pointer and clean - // it up. - if (gcs_client_) { - RAY_LOG(INFO) << "Disconnecting a GCS client."; - // TODO(hjiang): Move the Disconnect() logic - // to GcsClient destructor. - gcs_client_->Disconnect(); - gcs_client_.reset(); - } - - RAY_LOG(INFO) << "Core worker ready to be deallocated."; + shutdown_coordinator_->RequestShutdown( + /*force_shutdown=*/false, ShutdownReason::kGracefulExit, "ray.shutdown() called"); } void CoreWorker::ConnectToRayletInternal() { @@ -569,7 +569,8 @@ void CoreWorker::Disconnect( /*attempt_number=*/0, rpc::TaskStatus::FINISHED, /*timestamp=*/absl::GetCurrentTimeNanos(), - /*is_actor_task_event=*/worker_context_->GetCurrentActorID().IsNil()); + /*is_actor_task_event=*/worker_context_->GetCurrentActorID().IsNil(), + options_.session_name); task_event_buffer_->AddTaskEvent(std::move(task_event)); } @@ -641,118 +642,29 @@ void CoreWorker::Exit( const rpc::WorkerExitType exit_type, const std::string &detail, const std::shared_ptr &creation_task_exception_pb_bytes) { - // Ensure that the exit logic runs at most once. - bool expected = false; - if (!is_exited_.compare_exchange_strong(expected, /*desired=*/true)) { - RAY_LOG(INFO) << "Exit was called multipled times, ignoring."; - return; - } + // Preserve actor creation failure details by marking a distinct shutdown reason + // when initialization raised an exception. An exception payload is provided. 
+ ShutdownReason reason = creation_task_exception_pb_bytes != nullptr + ? ShutdownReason::kActorCreationFailed + : ConvertExitTypeToShutdownReason(exit_type); - RAY_LOG(INFO) << "Exit signal received, this process will exit after all outstanding " - "tasks have finished" - << ", exit_type=" << rpc::WorkerExitType_Name(exit_type) - << ", detail=" << detail; - { - absl::MutexLock lock(&mutex_); - RAY_CHECK_NE(detail, ""); - exiting_detail_ = std::optional{detail}; - } - - // Callback to shutdown. - auto shutdown = [this, exit_type, detail, creation_task_exception_pb_bytes]() { - // To avoid problems, make sure shutdown is always called from the same - // event loop each time. - task_execution_service_.post( - [this, exit_type, detail, creation_task_exception_pb_bytes]() { - rpc::DrainServerCallExecutor(); - KillChildProcs(); - // Disconnect should be put close to Shutdown - // https://github.com/ray-project/ray/pull/34883 - // TODO(iycheng): Improve the Process.h and make it able to monitor - // process liveness - Disconnect(exit_type, detail, creation_task_exception_pb_bytes); - Shutdown(); - }, - "CoreWorker.Shutdown"); - }; - // Callback to drain objects once all pending tasks have been drained. - auto drain_references_callback = [this, shutdown]() { - // Post to the event loop to avoid a deadlock between the TaskManager and - // the ReferenceCounter. The deadlock can occur because this callback may - // get called by the TaskManager while the ReferenceCounter's lock is held, - // but the callback itself must acquire the ReferenceCounter's lock to - // drain the object references. - task_execution_service_.post( - [this, shutdown]() { - RAY_LOG(INFO) << "Wait for currently executing tasks in the underlying thread " - "pools to finish."; - // Wait for currently executing tasks in the underlying thread pools to - // finish. Note that if tasks have been posted to the thread pools but not - // started yet, they will not be executed. 
- task_receiver_->Stop(); - - // Release resources only after tasks have stopped executing. - auto status = raylet_ipc_client_->NotifyDirectCallTaskBlocked(); - if (!status.ok()) { - RAY_LOG(WARNING) - << "Failed to notify Raylet. The raylet may have already shut down or " - << "the connection was lost."; - } - - bool not_actor_task = false; - { - absl::MutexLock lock(&mutex_); - not_actor_task = actor_id_.IsNil(); - } - if (not_actor_task) { - // Normal tasks should not hold any object references in the heap after - // executing, but they could in the case that one was stored as a glob - // variable (anti-pattern, but possible). We decrement the reference count - // for all local references to account for this. After this call, the only - // references left to drain should be those that are in use by remote - // workers. If these workers hold their references forever, the call to - // drain the reference counter will hang forever and this process will not - // exit until it is forcibly removed (e.g., via SIGKILL). - // - // NOTE(edoakes): this is only safe to do _after_ we have drained executing - // tasks in the task_receiver_, otherwise there might still be user code - // running that relies on the state of the reference counter. - // See: https://github.com/ray-project/ray/pull/53002. - RAY_LOG(INFO) - << "Releasing local references, then draining reference counter."; - reference_counter_->ReleaseAllLocalReferences(); - reference_counter_->DrainAndShutdown(shutdown); - } else { - // If we are an actor, then we may be holding object references in the - // heap. Then, we should not wait to drain the object references before - // shutdown since this could hang. 
- RAY_LOG(INFO) - << "Not draining reference counter since this is an actor worker."; - shutdown(); - } - }, - "CoreWorker.DrainAndShutdown"); - }; - - task_manager_->DrainAndShutdown(drain_references_callback); + shutdown_coordinator_->RequestShutdown(/*force_shutdown=*/false, + reason, + detail, + ShutdownCoordinator::kInfiniteTimeout, + creation_task_exception_pb_bytes); } void CoreWorker::ForceExit(const rpc::WorkerExitType exit_type, const std::string &detail) { - RAY_LOG(WARNING) << "Force exit the process. " - << " Details: " << detail; + RAY_LOG(DEBUG) << "ForceExit called: exit_type=" << static_cast(exit_type) + << ", detail=" << detail; - KillChildProcs(); - // Disconnect should be put close to Exit - // https://github.com/ray-project/ray/pull/34883 - // TODO(iycheng): Improve the Process.h and make it able to monitor - // process liveness - Disconnect(exit_type, detail); + ShutdownReason reason = ConvertExitTypeToShutdownReason(exit_type, true); + shutdown_coordinator_->RequestShutdown( + /*force_shutdown=*/true, reason, detail, std::chrono::milliseconds{0}, nullptr); - // NOTE(hchen): Use `QuickExit()` to force-exit this process without doing cleanup. - // `exit()` will destruct static objects in an incorrect order, which will lead to - // core dumps. 
- QuickExit(); + RAY_LOG(DEBUG) << "ForceExit: shutdown request completed"; } const WorkerID &CoreWorker::GetWorkerID() const { return worker_context_->GetWorkerID(); } @@ -788,7 +700,7 @@ void CoreWorker::RegisterToGcs(int64_t worker_launch_time_ms, } auto worker_data = std::make_shared(); - worker_data->mutable_worker_address()->set_raylet_id(rpc_address_.raylet_id()); + worker_data->mutable_worker_address()->set_node_id(rpc_address_.node_id()); worker_data->mutable_worker_address()->set_ip_address(rpc_address_.ip_address()); worker_data->mutable_worker_address()->set_port(rpc_address_.port()); worker_data->mutable_worker_address()->set_worker_id(worker_id.Binary()); @@ -871,11 +783,11 @@ void CoreWorker::InternalHeartbeat() { if (spec.IsActorTask()) { auto actor_handle = actor_manager_->GetActorHandle(spec.ActorId()); actor_handle->SetResubmittedActorTaskSpec(spec); - RAY_CHECK_OK(actor_task_submitter_->SubmitTask(spec)); + actor_task_submitter_->SubmitTask(spec); } else if (spec.IsActorCreationTask()) { - RAY_CHECK_OK(actor_task_submitter_->SubmitActorCreationTask(spec)); + actor_task_submitter_->SubmitActorCreationTask(spec); } else { - RAY_CHECK_OK(normal_task_submitter_->SubmitTask(spec)); + normal_task_submitter_->SubmitTask(spec); } } @@ -1040,7 +952,7 @@ Status CoreWorker::Put(const RayObject &object, object.GetSize(), /*is_reconstructable=*/false, /*add_local_ref=*/true, - NodeID::FromBinary(rpc_address_.raylet_id())); + NodeID::FromBinary(rpc_address_.node_id())); auto status = Put(object, contained_object_ids, *object_id, /*pin_object=*/true); if (!status.ok()) { RemoveLocalReference(*object_id); @@ -1105,7 +1017,8 @@ Status CoreWorker::CreateOwnedAndIncrementLocalRef( ObjectID *object_id, std::shared_ptr *data, const std::unique_ptr &owner_address, - bool inline_small_object) { + bool inline_small_object, + rpc::TensorTransport tensor_transport) { auto status = WaitForActorRegistered(contained_object_ids); if (!status.ok()) { return status; @@ -1124,7 
+1037,14 @@ Status CoreWorker::CreateOwnedAndIncrementLocalRef( data_size + metadata->Size(), /*is_reconstructable=*/false, /*add_local_ref=*/true, - NodeID::FromBinary(rpc_address_.raylet_id())); + NodeID::FromBinary(rpc_address_.node_id()), + /*tensor_transport=*/tensor_transport); + + // Register the callback to free the GPU object when it is out of scope. + if (tensor_transport != rpc::TensorTransport::OBJECT_STORE) { + reference_counter_->AddObjectOutOfScopeOrFreedCallback( + *object_id, options_.free_actor_object_callback); + } } else { // Because in the remote worker's `HandleAssignObjectOwner`, // a `WaitForRefRemoved` RPC request will be sent back to @@ -1329,7 +1249,7 @@ Status CoreWorker::ExperimentalRegisterMutableObjectReaderRemote( conn->RegisterMutableObjectReader( req, [&promise, num_replied, num_requests, addr]( - const Status &status, const rpc::RegisterMutableObjectReaderReply &reply) { + const Status &status, const rpc::RegisterMutableObjectReaderReply &) { RAY_CHECK_OK(status); *num_replied += 1; if (*num_replied == num_requests) { @@ -1844,8 +1764,8 @@ json CoreWorker::OverrideRuntimeEnv(const json &child, std::shared_ptr CoreWorker::OverrideTaskOrActorRuntimeEnvInfo( const std::string &serialized_runtime_env_info) const { - auto factory = [this](const std::string &serialized_runtime_env_info) { - return OverrideTaskOrActorRuntimeEnvInfoImpl(serialized_runtime_env_info); + auto factory = [this](const std::string &runtime_env_info_str) { + return OverrideTaskOrActorRuntimeEnvInfoImpl(runtime_env_info_str); }; return runtime_env_json_serialization_cache_.GetOrCreate(serialized_runtime_env_info, std::move(factory)); @@ -2089,7 +2009,7 @@ std::vector CoreWorker::SubmitTask( io_service_.post( [this, task_spec = std::move(task_spec)]() mutable { - RAY_UNUSED(normal_task_submitter_->SubmitTask(std::move(task_spec))); + normal_task_submitter_->SubmitTask(std::move(task_spec)); }, "CoreWorker.SubmitTask"); } @@ -2184,6 +2104,7 @@ Status 
CoreWorker::CreateActor(const RayFunction &function, ray_namespace, actor_creation_options.max_pending_calls, actor_creation_options.allow_out_of_order_execution, + actor_creation_options.enable_tensor_transport, actor_creation_options.enable_task_events, actor_creation_options.labels); std::string serialized_actor_handle; @@ -2269,7 +2190,7 @@ Status CoreWorker::CreateActor(const RayFunction &function, task_manager_->FailPendingTask( task_spec.TaskId(), rpc::ErrorType::ACTOR_CREATION_FAILED, &status); } else { - RAY_UNUSED(actor_task_submitter_->SubmitActorCreationTask(task_spec)); + actor_task_submitter_->SubmitActorCreationTask(task_spec); } }); }, @@ -2284,7 +2205,7 @@ Status CoreWorker::CreateActor(const RayFunction &function, } io_service_.post( [this, task_spec = std::move(task_spec)]() { - RAY_UNUSED(actor_task_submitter_->SubmitActorCreationTask(task_spec)); + actor_task_submitter_->SubmitActorCreationTask(task_spec); }, "CoreWorker.SubmitTask"); } @@ -2294,7 +2215,7 @@ Status CoreWorker::CreateActor(const RayFunction &function, Status CoreWorker::CreatePlacementGroup( const PlacementGroupCreationOptions &placement_group_creation_options, PlacementGroupID *return_placement_group_id) { - const auto &bundles = placement_group_creation_options.bundles; + const auto &bundles = placement_group_creation_options.bundles_; for (const auto &bundle : bundles) { for (const auto &resource : bundle) { if (resource.first == kBundle_ResourceLabel) { @@ -2307,18 +2228,16 @@ Status CoreWorker::CreatePlacementGroup( } const PlacementGroupID placement_group_id = PlacementGroupID::Of(GetCurrentJobId()); PlacementGroupSpecBuilder builder; - builder.SetPlacementGroupSpec( - placement_group_id, - placement_group_creation_options.name, - placement_group_creation_options.bundles, - placement_group_creation_options.strategy, - placement_group_creation_options.is_detached, - placement_group_creation_options.max_cpu_fraction_per_node, - 
placement_group_creation_options.soft_target_node_id, - worker_context_->GetCurrentJobID(), - worker_context_->GetCurrentActorID(), - worker_context_->CurrentActorDetached(), - placement_group_creation_options.bundle_label_selector); + builder.SetPlacementGroupSpec(placement_group_id, + placement_group_creation_options.name_, + placement_group_creation_options.bundles_, + placement_group_creation_options.strategy_, + placement_group_creation_options.is_detached_, + placement_group_creation_options.soft_target_node_id_, + worker_context_->GetCurrentJobID(), + worker_context_->GetCurrentActorID(), + worker_context_->CurrentActorDetached(), + placement_group_creation_options.bundle_label_selector_); PlacementGroupSpecification placement_group_spec = builder.Build(); *return_placement_group_id = placement_group_id; RAY_LOG(INFO).WithField(placement_group_id) @@ -2382,7 +2301,6 @@ Status CoreWorker::SubmitActorTask( std::string err_msg = absl::StrFormat( "Can't find actor %s. It might be dead or it's from a different cluster", actor_id.Hex()); - // TODO(dayshah): make status take by value return Status::NotFound(err_msg); } /// Check whether backpressure may happen at the very beginning of submitting a task. 
@@ -2468,7 +2386,7 @@ Status CoreWorker::SubmitActorTask( returned_refs = task_manager_->AddPendingTask( rpc_address_, task_spec, CurrentCallSite(), max_retries); - RAY_CHECK_OK(actor_task_submitter_->SubmitTask(task_spec)); + actor_task_submitter_->SubmitTask(task_spec); } task_returns = std::move(returned_refs); return Status::OK(); @@ -2488,8 +2406,8 @@ Status CoreWorker::CancelTask(const ObjectID &object_id, RAY_LOG(DEBUG).WithField(object_id) << "Request to cancel a task of object to an owner " << obj_addr.SerializeAsString(); - return normal_task_submitter_->CancelRemoteTask( - object_id, obj_addr, force_kill, recursive); + normal_task_submitter_->CancelRemoteTask(object_id, obj_addr, force_kill, recursive); + return Status::OK(); } auto task_spec = task_manager_->GetTaskSpec(object_id.TaskId()); @@ -2510,58 +2428,51 @@ Status CoreWorker::CancelTask(const ObjectID &object_id, return Status::InvalidArgument("force=True is not supported for actor tasks."); } - return actor_task_submitter_->CancelTask(task_spec.value(), recursive); + actor_task_submitter_->CancelTask(task_spec.value(), recursive); } else { - return normal_task_submitter_->CancelTask(task_spec.value(), force_kill, recursive); + normal_task_submitter_->CancelTask(task_spec.value(), force_kill, recursive); } + return Status::OK(); } Status CoreWorker::CancelChildren(const TaskID &task_id, bool force_kill) { - std::vector> recursive_cancellation_status; - bool recursive_success = true; - for (const auto &child_id : task_manager_->GetPendingChildrenTasks(task_id)) { + absl::flat_hash_set unknown_child_task_ids; + auto child_task_ids = task_manager_->GetPendingChildrenTasks(task_id); + for (const auto &child_id : child_task_ids) { auto child_spec = task_manager_->GetTaskSpec(child_id); if (!child_spec.has_value()) { - recursive_success = false; - recursive_cancellation_status.emplace_back( - child_id, - Status::UnknownError( - "Recursive task cancellation failed--check warning logs.")); + 
unknown_child_task_ids.insert(child_id); } else if (child_spec->IsActorTask()) { - auto result = actor_task_submitter_->CancelTask(child_spec.value(), true); - recursive_cancellation_status.emplace_back(child_id, result); + actor_task_submitter_->CancelTask(std::move(*child_spec), true); } else { - auto result = - normal_task_submitter_->CancelTask(child_spec.value(), force_kill, true); - recursive_cancellation_status.emplace_back(child_id, result); + normal_task_submitter_->CancelTask(std::move(*child_spec), force_kill, true); } } - if (recursive_success) { + if (unknown_child_task_ids.empty()) { return Status::OK(); - } else { - auto kMaxFailedTaskSampleSize = 10; - std::ostringstream ostr; - ostr << "Failed to cancel all the children tasks of " << task_id << " recursively.\n" - << "Here are up to " << kMaxFailedTaskSampleSize - << " samples tasks that failed to be canceled\n"; - auto success = 0; - auto failures = 0; - for (const auto &[child_id, status] : recursive_cancellation_status) { - if (status.ok()) { - success += 1; - } else { - // Only record up to sample sizes. 
- if (failures < kMaxFailedTaskSampleSize) { - ostr << "\t" << child_id << ", " << status << "\n"; - } - failures += 1; - } + } + + constexpr size_t kMaxFailedTaskSampleSize = 10; + std::ostringstream ostr; + ostr << "Failed to cancel all the children tasks of " << task_id << " recursively.\n" + << "Here are up to " << kMaxFailedTaskSampleSize + << " samples tasks that failed to be canceled\n"; + const auto failure_status_str = + Status::UnknownError("Recursive task cancellation failed--check warning logs.") + .ToString(); + size_t failures = 0; + for (const auto &child_id : unknown_child_task_ids) { + ostr << "\t" << child_id << ", " << failure_status_str << "\n"; + failures += 1; + if (failures >= kMaxFailedTaskSampleSize) { + break; } - ostr << "Total Recursive cancelation success: " << success - << ", failures: " << failures; - return Status::UnknownError(ostr.str()); } + ostr << "Total Recursive cancelation success: " + << (child_task_ids.size() - unknown_child_task_ids.size()) + << ", failures: " << unknown_child_task_ids.size(); + return Status::UnknownError(ostr.str()); } Status CoreWorker::KillActor(const ActorID &actor_id, bool force_kill, bool no_restart) { @@ -2747,7 +2658,7 @@ void CoreWorker::RunTaskExecutionLoop() { "CoreWorker.CheckSignal"); } task_execution_service_.run(); - RAY_CHECK(is_shutdown_) + RAY_CHECK(shutdown_coordinator_ && shutdown_coordinator_->IsShuttingDown()) << "Task execution loop was terminated without calling shutdown API."; } @@ -3123,12 +3034,12 @@ bool CoreWorker::PinExistingReturnObject(const ObjectID &return_id, owner_address, {return_id}, generator_id, - [return_id, pinned_return_object](const Status &status, + [return_id, pinned_return_object](const Status &pin_object_status, const rpc::PinObjectIDsReply &reply) { // RPC to the local raylet should never fail. 
- if (!status.ok()) { + if (!pin_object_status.ok()) { RAY_LOG(ERROR) << "Request to local raylet to pin object failed: " - << status.ToString(); + << pin_object_status.ToString(); return; } if (!reply.successes(0)) { @@ -3192,7 +3103,7 @@ Status CoreWorker::ReportGeneratorItemReturns( waiter->IncrementObjectGenerated(); client->ReportGeneratorItemReturns( - request, + std::move(request), [waiter, generator_id, return_id, item_index]( const Status &status, const rpc::ReportGeneratorItemReturnsReply &reply) { RAY_LOG(DEBUG) << "ReportGeneratorItemReturns replied. " << generator_id @@ -3577,8 +3488,9 @@ void CoreWorker::HandleWaitForActorRefDeleted( // Send a response to trigger cleaning up the actor state once the handle is // no longer in scope. - auto respond = [send_reply_callback](const ActorID &actor_id) { - RAY_LOG(DEBUG).WithField(actor_id) << "Replying to HandleWaitForActorRefDeleted"; + auto respond = [send_reply_callback](const ActorID &respond_actor_id) { + RAY_LOG(DEBUG).WithField(respond_actor_id) + << "Replying to HandleWaitForActorRefDeleted"; send_reply_callback(Status::OK(), nullptr, nullptr); }; @@ -3698,8 +3610,10 @@ void CoreWorker::HandlePubsubLongPolling(rpc::PubsubLongPollingRequest request, rpc::SendReplyCallback send_reply_callback) { const auto subscriber_id = NodeID::FromBinary(request.subscriber_id()); RAY_LOG(DEBUG).WithField(subscriber_id) << "Got a long polling request from a node"; - object_info_publisher_->ConnectToSubscriber( - request, reply, std::move(send_reply_callback)); + object_info_publisher_->ConnectToSubscriber(request, + reply->mutable_publisher_id(), + reply->mutable_pub_messages(), + std::move(send_reply_callback)); } void CoreWorker::HandlePubsubCommandBatch(rpc::PubsubCommandBatchRequest request, @@ -3883,24 +3797,20 @@ void CoreWorker::ProcessSubscribeForRefRemoved( const rpc::WorkerRefRemovedSubMessage &message) { const ObjectID &object_id = ObjectID::FromBinary(message.reference().object_id()); - // Set a callback 
to publish the message when the requested object ID's ref count - // goes to 0. - auto ref_removed_callback = - boost::bind(&ReferenceCounter::HandleRefRemoved, reference_counter_, object_id); - const auto intended_worker_id = WorkerID::FromBinary(message.intended_worker_id()); if (intended_worker_id != worker_context_->GetWorkerID()) { RAY_LOG(INFO) << "The ProcessSubscribeForRefRemoved message is for worker " << intended_worker_id << ", but the current worker is " << worker_context_->GetWorkerID() << ". The RPC will be no-op."; - ref_removed_callback(object_id); + reference_counter_->PublishRefRemoved(object_id); return; } const auto owner_address = message.reference().owner_address(); ObjectID contained_in_id = ObjectID::FromBinary(message.contained_in_id()); - reference_counter_->SetRefRemovedCallback( - object_id, contained_in_id, owner_address, ref_removed_callback); + // So it will call PublishRefRemovedInternal to publish a message when the requested + // object ID's ref count goes to 0. + reference_counter_->SubscribeRefRemoved(object_id, contained_in_id, owner_address); } void CoreWorker::HandleRemoteCancelTask(rpc::RemoteCancelTaskRequest request, @@ -4079,11 +3989,14 @@ void CoreWorker::HandleKillActor(rpc::KillActorRequest request, if (request.force_kill()) { RAY_LOG(INFO) << "Force kill actor request has received. exiting immediately... " << kill_actor_reason; + RAY_LOG(DEBUG) << "HandleKillActor: About to call ForceExit"; // If we don't need to restart this actor, we notify raylet before force killing it. ForceExit( rpc::WorkerExitType::INTENDED_SYSTEM_EXIT, absl::StrCat("Worker exits because the actor is killed. ", kill_actor_reason)); + RAY_LOG(DEBUG) << "HandleKillActor: ForceExit completed"; } else { + RAY_LOG(DEBUG) << "HandleKillActor: About to call Exit"; Exit(rpc::WorkerExitType::INTENDED_SYSTEM_EXIT, absl::StrCat("Worker exits because the actor is killed. 
", kill_actor_reason)); } @@ -4142,7 +4055,6 @@ void CoreWorker::HandleGetCoreWorkerStats(rpc::GetCoreWorkerStatsRequest request } (*used_resources_map)[resource_name] = allocations; } - stats->set_actor_title(actor_title_); google::protobuf::Map webui_map(webui_display_.begin(), webui_display_.end()); (*stats->mutable_webui_display()) = webui_map; @@ -4281,16 +4193,12 @@ void CoreWorker::HandleDeleteSpilledObjects(rpc::DeleteSpilledObjectsRequest req void CoreWorker::HandleExit(rpc::ExitRequest request, rpc::ExitReply *reply, rpc::SendReplyCallback send_reply_callback) { - const size_t num_objects_with_references = reference_counter_->Size(); - const size_t num_pending_tasks = task_manager_->NumPendingTasks(); - const int64_t pins_in_flight = local_raylet_rpc_client_->GetPinsInFlight(); - // We consider the worker to be idle if it doesn't have object references and it doesn't - // have any object pinning RPCs in flight and it doesn't have pending tasks. - bool is_idle = (num_objects_with_references == 0) && (pins_in_flight == 0) && - (num_pending_tasks == 0); + bool is_idle = IsIdle(); bool force_exit = request.force_exit(); RAY_LOG(DEBUG) << "Exiting: is_idle: " << is_idle << " force_exit: " << force_exit; if (!is_idle) { + const size_t num_pending_tasks = task_manager_->NumPendingTasks(); + const int64_t pins_in_flight = local_raylet_rpc_client_->GetPinsInFlight(); RAY_LOG_EVERY_MS(INFO, 60000) << "Worker is not idle: reference counter: " << reference_counter_->DebugString() << " # pins in flight: " << pins_in_flight @@ -4307,21 +4215,29 @@ void CoreWorker::HandleExit(rpc::ExitRequest request, send_reply_callback( Status::OK(), [this, will_exit, force_exit]() { - // If the worker is idle, we exit. 
+ if (!will_exit) { + return; + } + + ShutdownReason reason; + std::string detail; + if (force_exit) { - ForceExit(rpc::WorkerExitType::INTENDED_SYSTEM_EXIT, - "Worker force exits because its job has finished"); - } else if (will_exit) { - Exit(rpc::WorkerExitType::INTENDED_SYSTEM_EXIT, - "Worker exits because it was idle (it doesn't have objects it owns while " - "no task or actor has been scheduled) for a long time."); + reason = ShutdownReason::kForcedExit; + detail = "Worker force exited because its job has finished"; + } else { + reason = ShutdownReason::kIdleTimeout; + detail = "Worker exited because it was idle for a long time"; } + + shutdown_coordinator_->RequestShutdown(force_exit, reason, detail); }, - // We need to kill it regardless if the RPC failed. + // Fallback on RPC failure - still attempt shutdown [this]() { - Exit(rpc::WorkerExitType::INTENDED_SYSTEM_EXIT, - "Worker exits because it was idle (it doesn't have objects it owns while " - "no task or actor has been scheduled) for a long time."); + shutdown_coordinator_->RequestShutdown( + /*force_shutdown=*/false, + ShutdownReason::kIdleTimeout, + "Worker exited due to RPC failure during idle exit"); }); } @@ -4346,7 +4262,7 @@ void CoreWorker::HandleAssignObjectOwner(rpc::AssignObjectOwnerRequest request, request.object_size(), /*is_reconstructable=*/false, /*add_local_ref=*/false, - /*pinned_at_raylet_id=*/NodeID::FromBinary(borrower_address.raylet_id())); + /*pinned_at_node_id=*/NodeID::FromBinary(borrower_address.node_id())); reference_counter_->AddBorrowerAddress(object_id, borrower_address); memory_store_->Put(RayObject(rpc::ErrorType::OBJECT_IN_PLASMA), object_id); send_reply_callback(Status::OK(), nullptr, nullptr); @@ -4472,11 +4388,6 @@ void CoreWorker::SetWebuiDisplay(const std::string &key, const std::string &mess webui_display_[key] = message; } -void CoreWorker::SetActorTitle(const std::string &title) { - absl::MutexLock lock(&mutex_); - actor_title_ = title; -} - void 
CoreWorker::SetActorReprName(const std::string &repr_name) { RAY_CHECK(task_receiver_ != nullptr); task_receiver_->SetActorReprName(repr_name); @@ -4489,9 +4400,20 @@ rpc::JobConfig CoreWorker::GetJobConfig() const { return worker_context_->GetCurrentJobConfig(); } -bool CoreWorker::IsExiting() const { - absl::MutexLock lock(&mutex_); - return exiting_detail_.has_value(); +bool CoreWorker::IsExiting() const { return shutdown_coordinator_->ShouldEarlyExit(); } + +bool CoreWorker::IsIdle(size_t num_objects_with_references, + int64_t pins_in_flight, + size_t num_pending_tasks) const { + return (num_objects_with_references == 0) && (pins_in_flight == 0) && + (num_pending_tasks == 0); +} + +bool CoreWorker::IsIdle() const { + const size_t num_objects_with_references = reference_counter_->Size(); + const size_t num_pending_tasks = task_manager_->NumPendingTasks(); + const int64_t pins_in_flight = local_raylet_rpc_client_->GetPinsInFlight(); + return IsIdle(num_objects_with_references, pins_in_flight, num_pending_tasks); } Status CoreWorker::WaitForActorRegistered(const std::vector &ids) { @@ -4631,52 +4553,13 @@ void CoreWorker::UpdateTaskIsDebuggerPaused(const TaskID &task_id, worker::TaskStatusEvent::TaskStateUpdate(is_debugger_paused))); } -void CoreWorker::TaskManagerRetryTask(TaskSpecification &spec, - bool object_recovery, - uint32_t delay_ms) { +void CoreWorker::AsyncRetryTask(TaskSpecification &spec, uint32_t delay_ms) { spec.GetMutableMessage().set_attempt_number(spec.AttemptNumber() + 1); - if (!object_recovery) { - // Retry after a delay to emulate the existing Raylet reconstruction - // behaviour. TODO(ekl) backoff exponentially. 
- RAY_LOG(INFO) << "Will resubmit task after a " << delay_ms - << "ms delay: " << spec.DebugString(); - absl::MutexLock lock(&mutex_); - TaskToRetry task_to_retry{current_time_ms() + delay_ms, spec}; - to_resubmit_.push(std::move(task_to_retry)); - } else { - if (spec.IsActorTask()) { - auto actor_handle = actor_manager_->GetActorHandle(spec.ActorId()); - actor_handle->SetResubmittedActorTaskSpec(spec); - RAY_CHECK_OK(actor_task_submitter_->SubmitTask(spec)); - } else { - RAY_CHECK(spec.IsNormalTask()); - RAY_CHECK_OK(normal_task_submitter_->SubmitTask(spec)); - } - } -} - -ClusterSizeBasedLeaseRequestRateLimiter::ClusterSizeBasedLeaseRequestRateLimiter( - size_t min_concurrent_lease_limit) - : min_concurrent_lease_cap_(min_concurrent_lease_limit), num_alive_nodes_(0) {} - -size_t ClusterSizeBasedLeaseRequestRateLimiter:: - GetMaxPendingLeaseRequestsPerSchedulingCategory() { - return std::max(min_concurrent_lease_cap_, num_alive_nodes_.load()); -} - -void ClusterSizeBasedLeaseRequestRateLimiter::OnNodeChanges( - const rpc::GcsNodeInfo &data) { - if (data.state() == rpc::GcsNodeInfo::DEAD) { - if (num_alive_nodes_ != 0) { - num_alive_nodes_--; - } else { - RAY_LOG(WARNING) << "Node" << data.node_manager_address() - << " change state to DEAD but num_alive_node is 0."; - } - } else { - num_alive_nodes_++; - } - RAY_LOG_EVERY_MS(INFO, 60000) << "Number of alive nodes:" << num_alive_nodes_.load(); + absl::MutexLock lock(&mutex_); + TaskToRetry task_to_retry{current_time_ms() + delay_ms, spec}; + RAY_LOG(INFO) << "Will resubmit task after a " << delay_ms + << "ms delay: " << spec.DebugString(); + to_resubmit_.push(std::move(task_to_retry)); } } // namespace ray::core diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 003a9954b683..1f1ccda1d196 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -42,31 +42,21 @@ #include "ray/core_worker/object_recovery_manager.h" #include 
"ray/core_worker/profile_event.h" #include "ray/core_worker/reference_count.h" +#include "ray/core_worker/shutdown_coordinator.h" #include "ray/core_worker/store_provider/memory_store/memory_store.h" #include "ray/core_worker/store_provider/plasma_store_provider.h" #include "ray/core_worker/task_event_buffer.h" #include "ray/core_worker/task_execution/task_receiver.h" -#include "ray/core_worker/transport/normal_task_submitter.h" -#include "ray/gcs/gcs_client/gcs_client.h" -#include "ray/ipc/raylet_ipc_client.h" +#include "ray/core_worker/task_submission/normal_task_submitter.h" +#include "ray/gcs_client/gcs_client.h" +#include "ray/ipc/raylet_ipc_client_interface.h" #include "ray/pubsub/publisher.h" #include "ray/pubsub/subscriber.h" -#include "ray/raylet_client/raylet_client.h" -#include "ray/rpc/worker/core_worker_server.h" +#include "ray/rpc/raylet/raylet_client_interface.h" #include "ray/util/process.h" #include "ray/util/shared_lru.h" #include "src/ray/protobuf/pubsub.pb.h" -/// The set of gRPC handlers and their associated level of concurrency. If you want to -/// add a new call to the worker gRPC server, do the following: -/// 1) Add the rpc to the CoreWorkerService in core_worker.proto, e.g., "ExampleCall" -/// 2) Add a new macro to RAY_CORE_WORKER_DECLARE_RPC_HANDLERS -/// in core_worker_server.h, -// e.g. "DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(ExampleCall)" -/// 3) Add a new macro to RAY_CORE_WORKER_RPC_HANDLERS in core_worker_server.h, e.g. 
-/// "RPC_SERVICE_HANDLER(CoreWorkerService, ExampleCall, 1)" -/// 4) Add a method to the CoreWorker class below: "CoreWorker::HandleExampleCall" - namespace ray::core { JobID GetProcessJobID(const CoreWorkerOptions &options); @@ -79,7 +69,7 @@ class TaskCounter { enum class TaskStatusType { kPending, kRunning, kFinished }; public: - TaskCounter(); + explicit TaskCounter(ray::observability::MetricInterface &task_by_state_counter); void BecomeActor(const std::string &actor_name) { absl::MutexLock l(&mu_); @@ -137,6 +127,14 @@ class TaskCounter { // Used for actor state tracking. std::string actor_name_ ABSL_GUARDED_BY(mu_); int64_t num_tasks_running_ ABSL_GUARDED_BY(mu_) = 0; + + // Metric to track the number of tasks by state. + // Expected tags: + // - State: the task state, as described by rpc::TaskState proto in common.proto + // - Name: the name of the function called + // - IsRetry: whether the task is a retry + // - Source: component reporting, e.g., "core_worker", "executor", or "pull_manager" + ray::observability::MetricInterface &task_by_state_counter_; }; struct TaskToRetry { @@ -180,7 +178,7 @@ class CoreWorker { std::unique_ptr core_worker_server, rpc::Address rpc_address, std::shared_ptr gcs_client, - std::shared_ptr raylet_ipc_client, + std::shared_ptr raylet_ipc_client, std::shared_ptr local_raylet_rpc_client, boost::thread &io_thread, std::shared_ptr reference_counter, @@ -200,7 +198,8 @@ class CoreWorker { std::unique_ptr actor_manager, instrumented_io_context &task_execution_service, std::unique_ptr task_event_buffer, - uint32_t pid); + uint32_t pid, + ray::observability::MetricInterface &task_by_state_counter); CoreWorker(CoreWorker const &) = delete; @@ -288,7 +287,7 @@ class CoreWorker { int64_t GetTaskDepth() const { return worker_context_->GetTaskDepth(); } - NodeID GetCurrentNodeId() const { return NodeID::FromBinary(rpc_address_.raylet_id()); } + NodeID GetCurrentNodeId() const { return NodeID::FromBinary(rpc_address_.node_id()); } /// Read 
the next index of a ObjectRefStream of generator_id. /// This API always return immediately. @@ -350,8 +349,6 @@ class CoreWorker { void SetWebuiDisplay(const std::string &key, const std::string &message); - void SetActorTitle(const std::string &title); - /// Sets the actor's repr name. /// /// This is set explicitly rather than included as part of actor creation task spec @@ -519,6 +516,7 @@ class CoreWorker { /// defaults to this worker. /// \param[in] inline_small_object Whether to inline create this object if it's /// small. + /// \param[in] tensor_transport The tensor transport to use for the object. /// \return Status. Status CreateOwnedAndIncrementLocalRef( bool is_experimental_mutable_object, @@ -528,7 +526,8 @@ class CoreWorker { ObjectID *object_id, std::shared_ptr *data, const std::unique_ptr &owner_address = nullptr, - bool inline_small_object = true); + bool inline_small_object = true, + rpc::TensorTransport tensor_transport = rpc::TensorTransport::OBJECT_STORE); /// Create and return a buffer in the object store that can be directly written /// into, for an object ID that already exists. After writing to the buffer, the @@ -1343,9 +1342,7 @@ class CoreWorker { const std::shared_ptr &creation_task_exception_pb_bytes = nullptr); - void TaskManagerRetryTask(TaskSpecification &spec, - bool object_recovery, - uint32_t delay_ms); + void AsyncRetryTask(TaskSpecification &spec, uint32_t delay_ms); private: static nlohmann::json OverrideRuntimeEnv(const nlohmann::json &child, @@ -1493,6 +1490,19 @@ class CoreWorker { std::string *application_error); /// Put an object in the local plasma store. + /// + /// Return status semantics: + /// - Status::OK(): The object was created (or already existed) and bookkeeping was + /// updated. Note: an internal ObjectExists from the plasma provider is treated + /// as OK and does not surface here. + /// - Status::ObjectStoreFull(): The local plasma store is out of memory (or out of + /// disk when spilling). 
The error message contains context and a short memory + /// report. + /// - Status::IOError(): IPC/connection failures while talking to the plasma store + /// (e.g., broken pipe/connection reset during shutdown, store not reachable). + /// + /// Call sites that run during shutdown may choose to tolerate IOError specifically, + /// but should treat all other statuses as real failures. Status PutInLocalPlasmaStore(const RayObject &object, const ObjectID &object_id, bool pin_object); @@ -1673,6 +1683,17 @@ class CoreWorker { const int64_t timeout_ms, std::vector> &results); + /// Helper to compute idleness from precomputed counters. + /// + /// We consider the worker to be idle if it doesn't have object references and it + /// doesn't have any object pinning RPCs in flight and it doesn't have pending tasks. + bool IsIdle(size_t num_objects_with_references, + int64_t pins_in_flight, + size_t num_pending_tasks) const; + + /// Convenience overload that fetches counters and evaluates idleness. + bool IsIdle() const; + /// Get the caller ID used to submit tasks from this worker to an actor. /// /// \return The caller ID. For non-actor tasks, this is the current task ID. @@ -1740,7 +1761,7 @@ class CoreWorker { std::shared_ptr gcs_client_; // Client to the local Raylet that goes over a local socket. - std::shared_ptr raylet_ipc_client_; + std::shared_ptr raylet_ipc_client_; // Client to the local Raylet that goes over a gRPC connection. std::shared_ptr local_raylet_rpc_client_; @@ -1824,9 +1845,6 @@ class CoreWorker { /// Key value pairs to be displayed on Web UI. std::unordered_map webui_display_ ABSL_GUARDED_BY(mutex_); - /// Actor title that consists of class name, args, kwargs for actor construction. - std::string actor_title_ ABSL_GUARDED_BY(mutex_); - /// Actor repr name if overrides by the user, empty string if not. 
std::string actor_repr_name_ ABSL_GUARDED_BY(mutex_); @@ -1872,19 +1890,13 @@ class CoreWorker { /// If this value is set, it means the exit process has begun. std::optional exiting_detail_ ABSL_GUARDED_BY(mutex_); - /// TODO(kevin85421): the shutdown logic contained in `Disconnect`, `Exit`, and - /// `Shutdown` should be unified to avoid mistakes due to complex dependent semantics. - /// See https://github.com/ray-project/ray/issues/51642. - - /// Used to ensure that the `CoreWorker::Exit` method is called at most once. - std::atomic is_exited_ = false; - /// Used to ensure that the `CoreWorker::Shutdown` method is called at most once. - std::atomic is_shutdown_ = false; + /// Unified shutdown coordinator that manages all shutdown operations. + /// Implements a thread-safe, single state machine that coordinates + /// all shutdown entry points. + std::unique_ptr shutdown_coordinator_; int64_t max_direct_call_object_size_; - friend class CoreWorkerTest; - TaskCounter task_counter_; /// Used to guarantee that submitting actor task is thread safe. @@ -1900,6 +1912,9 @@ class CoreWorker { /// Worker's PID uint32_t pid_; + /// Callback to cleanup actor instance before shutdown + std::function actor_shutdown_callback_; + // Guards generator_ids_pending_deletion_. absl::Mutex generator_ids_pending_deletion_mutex_; @@ -1923,22 +1938,13 @@ class CoreWorker { /// Used to ensure we only subscribe to node changes once. std::once_flag subscribe_to_node_changes_flag_; + // Grant CoreWorkerShutdownExecutor access to CoreWorker internals for orchestrating + // the shutdown procedure without exposing additional public APIs. + friend class CoreWorkerShutdownExecutor; + /// Used to block in certain spots if the GCS node cache is needed. std::mutex gcs_client_node_cache_populated_mutex_; std::condition_variable gcs_client_node_cache_populated_cv_; bool gcs_client_node_cache_populated_ = false; }; - -// Lease request rate-limiter based on cluster node size. 
-// It returns max(num_nodes_in_cluster, min_concurrent_lease_limit) -class ClusterSizeBasedLeaseRequestRateLimiter : public LeaseRequestRateLimiter { - public: - explicit ClusterSizeBasedLeaseRequestRateLimiter(size_t min_concurrent_lease_limit); - size_t GetMaxPendingLeaseRequestsPerSchedulingCategory() override; - void OnNodeChanges(const rpc::GcsNodeInfo &data); - - private: - const size_t min_concurrent_lease_cap_; - std::atomic num_alive_nodes_; -}; } // namespace ray::core diff --git a/src/ray/core_worker/core_worker_options.h b/src/ray/core_worker/core_worker_options.h index 5c89fb56db49..fc59a55495e5 100644 --- a/src/ray/core_worker/core_worker_options.h +++ b/src/ray/core_worker/core_worker_options.h @@ -25,9 +25,8 @@ #include "ray/common/ray_object.h" #include "ray/common/status.h" #include "ray/common/task/task_common.h" -#include "ray/common/task/task_spec.h" #include "ray/core_worker/common.h" -#include "ray/gcs/gcs_client/gcs_client.h" +#include "ray/gcs_client/gcs_client.h" #include "ray/util/process.h" namespace ray { @@ -83,7 +82,6 @@ struct CoreWorkerOptions { interactive(false), node_ip_address(""), node_manager_port(0), - raylet_ip_address(""), driver_name(""), task_execution_callback(nullptr), free_actor_object_callback(nullptr), @@ -97,6 +95,7 @@ struct CoreWorkerOptions { get_lang_stack(nullptr), kill_main(nullptr), cancel_async_actor_task(nullptr), + actor_shutdown_callback(nullptr), is_local_mode(false), terminate_asyncio_thread(nullptr), serialized_job_config(""), @@ -107,8 +106,7 @@ struct CoreWorkerOptions { entrypoint(""), worker_launch_time_ms(-1), worker_launched_time_ms(-1), - debug_source(""), - enable_resource_isolation(false) {} + debug_source("") {} /// Type of this worker (i.e., DRIVER or WORKER). WorkerType worker_type; @@ -135,8 +133,6 @@ struct CoreWorkerOptions { std::string node_ip_address; /// Port of the local raylet. int node_manager_port; - /// IP address of the raylet. 
- std::string raylet_ip_address; /// The name of the driver. std::string driver_name; /// Application-language worker callback to execute tasks. @@ -177,6 +173,8 @@ struct CoreWorkerOptions { // Should return a boolean indicating if the task was successfully cancelled or not. // If not, the client will retry. std::function cancel_async_actor_task; + /// Callback to shutdown actor instance before shutdown. + std::function actor_shutdown_callback; /// Is local mode being used. bool is_local_mode; /// The function to destroy asyncio event and loops. @@ -203,7 +201,7 @@ struct CoreWorkerOptions { std::function(const ray::RayObject &object, const ObjectID &object_id)> object_allocator; - /// Session name (Cluster ID) of the cluster. + /// The current Ray session name. std::string session_name; std::string entrypoint; int64_t worker_launch_time_ms; @@ -212,10 +210,6 @@ struct CoreWorkerOptions { // Source information for `CoreWorker`, used for debugging and informational purpose, // rather than functional purpose. std::string debug_source; - - // If true, core worker enables resource isolation through cgroupv2 by reserving - // resources for ray system processes. 
- bool enable_resource_isolation = false; }; } // namespace core } // namespace ray diff --git a/src/ray/core_worker/core_worker_process.cc b/src/ray/core_worker/core_worker_process.cc index a0b5109fc8da..9513bbd6af05 100644 --- a/src/ray/core_worker/core_worker_process.cc +++ b/src/ray/core_worker/core_worker_process.cc @@ -27,14 +27,16 @@ #include "ray/common/cgroup/cgroup_context.h" #include "ray/common/cgroup/cgroup_manager.h" #include "ray/common/cgroup/constants.h" +#include "ray/common/protobuf_utils.h" #include "ray/common/ray_config.h" #include "ray/common/runtime_env_common.h" #include "ray/common/task/task_util.h" #include "ray/core_worker/core_worker.h" #include "ray/core_worker/core_worker_rpc_proxy.h" -#include "ray/gcs/gcs_client/gcs_client.h" -#include "ray/gcs/pb_util.h" +#include "ray/gcs_client/gcs_client.h" #include "ray/ipc/raylet_ipc_client.h" +#include "ray/object_manager/plasma/client.h" +#include "ray/rpc/raylet/raylet_client.h" #include "ray/stats/stats.h" #include "ray/util/container_util.h" #include "ray/util/env.h" @@ -45,7 +47,6 @@ #include "ray/util/stream_redirection.h" #include "ray/util/stream_redirection_options.h" #include "ray/util/subreaper.h" -#include "ray/util/util.h" namespace ray { namespace core { @@ -143,13 +144,6 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( options.worker_type, worker_id, GetProcessJobID(options)); auto pid = getpid(); - // Move worker process into cgroup on startup. 
- AppProcCgroupMetadata app_cgroup_metadata; - app_cgroup_metadata.pid = pid; - app_cgroup_metadata.max_memory = kUnlimitedCgroupMemory; - GetCgroupSetup(options.enable_resource_isolation) - .ApplyCgroupContext(app_cgroup_metadata); - RAY_LOG(DEBUG) << "Creating core worker with debug source: " << options.debug_source; RAY_LOG(DEBUG).WithField(worker_id) << "Constructing CoreWorker"; @@ -175,7 +169,8 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( auto task_event_buffer = std::make_unique( std::make_unique(options.gcs_options), std::make_unique(options.metrics_agent_port, - *client_call_manager)); + *client_call_manager), + options.session_name); // Start the IO thread first to make sure the checker is working. boost::thread::attributes io_thread_attrs; @@ -214,7 +209,7 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( auto raylet_ipc_client = std::make_shared( io_service_, options.raylet_socket, /*num_retries=*/-1, /*timeout=*/-1); - NodeID local_raylet_id; + NodeID local_node_id; int assigned_port = 0; Status status = raylet_ipc_client->RegisterClient(worker_context->GetWorkerID(), options.worker_type, @@ -224,7 +219,7 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( options.node_ip_address, options.serialized_job_config, options.startup_token, - &local_raylet_id, + &local_node_id, &assigned_port); if (!status.ok()) { // Avoid using FATAL log or RAY_CHECK here because they may create a core dump file. @@ -242,11 +237,11 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( // so that the worker (java/python .etc) can retrieve and handle the error // instead of crashing. 
auto raylet_address = rpc::RayletClientPool::GenerateRayletAddress( - local_raylet_id, options.node_ip_address, options.node_manager_port); - auto local_raylet_rpc_client = std::make_shared( - std::move(raylet_address), - *client_call_manager, - /*raylet_unavailable_timeout_callback=*/[] {}); + local_node_id, options.node_ip_address, options.node_manager_port); + auto local_raylet_rpc_client = + std::make_shared(std::move(raylet_address), + *client_call_manager, + /*raylet_unavailable_timeout_callback=*/[] {}); auto core_worker_server = std::make_unique(WorkerTypeString(options.worker_type), assigned_port, @@ -254,18 +249,19 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( // Start RPC server after all the task receivers are properly initialized and we have // our assigned port from the raylet. core_worker_server->RegisterService( - std::make_unique(io_service_, *service_handler_), + std::make_unique( + io_service_, *service_handler_, /*max_active_rpcs_per_handler_=*/-1), false /* token_auth */); core_worker_server->Run(); // Set our own address. 
- RAY_CHECK(!local_raylet_id.IsNil()); + RAY_CHECK(!local_node_id.IsNil()); rpc::Address rpc_address; rpc_address.set_ip_address(options.node_ip_address); rpc_address.set_port(core_worker_server->GetPort()); - rpc_address.set_raylet_id(local_raylet_id.Binary()); + rpc_address.set_node_id(local_node_id.Binary()); rpc_address.set_worker_id(worker_context->GetWorkerID().Binary()); - RAY_LOG(INFO).WithField(worker_context->GetWorkerID()).WithField(local_raylet_id) + RAY_LOG(INFO).WithField(worker_context->GetWorkerID()).WithField(local_node_id) << "Initializing worker at address: " << BuildAddress(rpc_address.ip_address(), rpc_address.port()); @@ -282,7 +278,7 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( auto raylet_client_pool = std::make_shared([&](const rpc::Address &addr) { auto core_worker = GetCoreWorker(); - return std::make_shared( + return std::make_shared( addr, *core_worker->client_call_manager_, rpc::RayletClientPool::GetDefaultUnavailableTimeoutCallback( @@ -351,6 +347,13 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( /*min_concurrent_lease_cap_*/ 10); } + // We can turn on exit_on_connection_failure on for the core worker plasma + // client to early exit core worker after the raylet's death because on the + // raylet side, we never proactively close the plasma store connection even + // during shutdown. So any error from the raylet side should be a sign of raylet + // death. 
+ auto plasma_client = std::shared_ptr( + new plasma::PlasmaClient(/*exit_on_connection_failure*/ true)); auto plasma_store_provider = std::make_shared( options.store_socket, raylet_ipc_client, @@ -359,6 +362,8 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( /*warmup=*/ (options.worker_type != WorkerType::SPILL_WORKER && options.worker_type != WorkerType::RESTORE_WORKER), + /*store_client=*/plasma_client, + /*fetch_batch_size=*/RayConfig::instance().worker_fetch_request_size(), /*get_current_call_site=*/[this]() { auto core_worker = GetCoreWorker(); return core_worker->CurrentCallSite(); @@ -381,9 +386,9 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( // from the middle of user operations. core_worker->io_service_.post( [this, obj]() { - auto core_worker = GetCoreWorker(); - if (core_worker->options_.unhandled_exception_handler != nullptr) { - core_worker->options_.unhandled_exception_handler(obj); + auto this_core_worker = GetCoreWorker(); + if (this_core_worker->options_.unhandled_exception_handler != nullptr) { + this_core_worker->options_.unhandled_exception_handler(obj); } }, "CoreWorker.HandleException"); @@ -423,13 +428,29 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( /*put_in_local_plasma_callback=*/ [this](const RayObject &object, const ObjectID &object_id) { auto core_worker = GetCoreWorker(); - RAY_CHECK_OK( - core_worker->PutInLocalPlasmaStore(object, object_id, /*pin_object=*/true)); + constexpr int max_retries = 3; + int attempt = 0; + int64_t backoff_ms = 10; + Status put_status; + while (attempt++ < max_retries) { + put_status = + core_worker->PutInLocalPlasmaStore(object, object_id, /*pin_object=*/true); + if (put_status.ok()) { + return Status::OK(); + } + // Backoff before retrying. 
+ std::this_thread::sleep_for(std::chrono::milliseconds(backoff_ms)); + backoff_ms *= 2; + } + RAY_LOG(WARNING).WithField(object_id) + << "Exhausted plasma put retries (attempts=" << attempt + << ") with status: " << put_status; + return put_status; }, - /* retry_task_callback= */ - [this](TaskSpecification &spec, bool object_recovery, uint32_t delay_ms) { + /* async_retry_task_callback=*/ + [this](TaskSpecification &spec, uint32_t delay_ms) { auto core_worker = GetCoreWorker(); - core_worker->TaskManagerRetryTask(spec, object_recovery, delay_ms); + core_worker->AsyncRetryTask(spec, delay_ms); }, /*queue_generator_resubmit=*/ [this](const TaskSpecification &spec) { @@ -442,13 +463,17 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( RayConfig::instance().max_lineage_bytes(), *task_event_buffer, /*get_actor_rpc_client_callback=*/ - [this](const ActorID &actor_id) { + [this](const ActorID &actor_id) + -> std::optional> { auto core_worker = GetCoreWorker(); auto addr = core_worker->actor_task_submitter_->GetActorAddress(actor_id); - RAY_CHECK(addr.has_value()) << "Actor address not found for actor " << actor_id; - return core_worker->core_worker_client_pool_->GetOrConnect(addr.value()); + if (!addr.has_value()) { + return std::nullopt; + } + return core_worker->core_worker_client_pool_->GetOrConnect(*addr); }, - gcs_client); + gcs_client, + task_by_state_counter_); auto on_excess_queueing = [this](const ActorID &actor_id, uint64_t num_queued) { auto timestamp = std::chrono::duration_cast( @@ -466,7 +491,7 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( timestamp)); }; - auto actor_creator = std::make_shared(gcs_client); + auto actor_creator = std::make_shared(gcs_client->Actors()); auto actor_task_submitter = std::make_unique( *core_worker_client_pool, @@ -487,19 +512,20 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( std::optional address_opt; if (auto node_info = core_worker->gcs_client_->Nodes().Get(node_id)) { auto &address = 
address_opt.emplace(); - address.set_raylet_id(node_info->node_id()); + address.set_node_id(node_info->node_id()); address.set_ip_address(node_info->node_manager_address()); address.set_port(node_info->node_manager_port()); } return address_opt; }; - auto lease_policy = RayConfig::instance().locality_aware_leasing_enabled() - ? std::unique_ptr( - std::make_unique( - *reference_counter, node_addr_factory, rpc_address)) - : std::unique_ptr( - std::make_unique(rpc_address)); + auto lease_policy = + RayConfig::instance().locality_aware_leasing_enabled() + ? std::unique_ptr( + std::make_unique( + *reference_counter, node_addr_factory, raylet_address)) + : std::unique_ptr( + std::make_unique(raylet_address)); auto normal_task_submitter = std::make_unique( rpc_address, @@ -509,7 +535,7 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( std::move(lease_policy), memory_store, *task_manager, - local_raylet_id, + local_node_id, options.worker_type, RayConfig::instance().worker_lease_timeout_milliseconds(), actor_creator, @@ -570,7 +596,7 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( continue; } rpc::Address addr; - addr.set_raylet_id(node_info->node_id()); + addr.set_node_id(node_info->node_id()); addr.set_ip_address(node_info->node_manager_address()); addr.set_port(node_info->node_manager_port()); locations.push_back(std::move(addr)); @@ -586,7 +612,7 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( for (const auto &node_info : node_infos) { if (node_info.state() != rpc::GcsNodeInfo::DEAD) { rpc::Address addr; - addr.set_raylet_id(node_info.node_id()); + addr.set_node_id(node_info.node_id()); addr.set_ip_address(node_info.node_manager_address()); addr.set_port(node_info.node_manager_port()); locations.push_back(std::move(addr)); @@ -651,7 +677,8 @@ std::shared_ptr CoreWorkerProcessImpl::CreateCoreWorker( std::move(actor_manager), task_execution_service_, std::move(task_event_buffer), - pid); + pid, + task_by_state_counter_); return 
core_worker; } @@ -779,6 +806,15 @@ CoreWorkerProcessImpl::CoreWorkerProcessImpl(const CoreWorkerOptions &options) auto worker = CreateCoreWorker(options_, worker_id_); auto write_locked = core_worker_.LockForWrite(); write_locked.Get() = worker; + // Initialize metrics agent client. + metrics_agent_client_ = std::make_unique( + "127.0.0.1", + options_.metrics_agent_port, + io_service_, + *write_locked.Get()->client_call_manager_); + metrics_agent_client_->WaitForServerReady([this](const Status &server_status) { + stats::InitOpenTelemetryExporter(options_.metrics_agent_port, server_status); + }); } } @@ -823,8 +859,7 @@ void CoreWorkerProcessImpl::InitializeSystemConfig() { // TODO(joshlee): This local raylet client has a custom retry policy below since its // likely the driver can start up before the raylet is ready. We want to move away // from this and will be fixed in https://github.com/ray-project/ray/issues/55200 - raylet::RayletClient local_raylet_rpc_client( - raylet_address, client_call_manager, [] {}); + rpc::RayletClient local_raylet_rpc_client(raylet_address, client_call_manager, [] {}); std::function get_once = [this, &get_once, diff --git a/src/ray/core_worker/core_worker_process.h b/src/ray/core_worker/core_worker_process.h index 5a6eb569b0ab..9081360c02ed 100644 --- a/src/ray/core_worker/core_worker_process.h +++ b/src/ray/core_worker/core_worker_process.h @@ -19,6 +19,9 @@ #include #include "ray/core_worker/core_worker_options.h" +#include "ray/core_worker/grpc_service.h" +#include "ray/core_worker/metrics.h" +#include "ray/rpc/metrics_agent_client.h" #include "ray/util/mutex_protected.h" namespace ray { @@ -177,6 +180,11 @@ class CoreWorkerProcessImpl { /// The proxy service handler that routes the RPC calls to the core worker. std::unique_ptr service_handler_; + + /// The client to export metrics to the metrics agent. 
+ std::unique_ptr metrics_agent_client_; + + ray::stats::Gauge task_by_state_counter_{GetTaskMetric()}; }; } // namespace core } // namespace ray diff --git a/src/ray/core_worker/core_worker_shutdown_executor.cc b/src/ray/core_worker/core_worker_shutdown_executor.cc new file mode 100644 index 000000000000..6b1d66ddc836 --- /dev/null +++ b/src/ray/core_worker/core_worker_shutdown_executor.cc @@ -0,0 +1,306 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/core_worker/core_worker_shutdown_executor.h" + +#include +#include +#include +#include + +#include "ray/core_worker/core_worker.h" + +namespace ray { + +namespace core { + +CoreWorkerShutdownExecutor::CoreWorkerShutdownExecutor(CoreWorker *core_worker) + : core_worker_(core_worker) {} + +void CoreWorkerShutdownExecutor::ExecuteGracefulShutdown( + std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms) { + RAY_LOG(INFO) << "Executing graceful shutdown: " << exit_type << " - " << detail + << " (timeout: " << timeout_ms.count() << "ms)"; + + // For actors, perform cleanup before shutdown proceeds. 
+ if (!core_worker_->worker_context_->GetCurrentActorID().IsNil() && + core_worker_->actor_shutdown_callback_) { + RAY_LOG(INFO) << "Calling actor shutdown callback before shutdown"; + core_worker_->actor_shutdown_callback_(); + } + + if (core_worker_->options_.worker_type == WorkerType::WORKER) { + // Running in a main thread. + // Asyncio coroutines could still run after CoreWorker is removed because it is + // running in a different thread. This can cause segfault because coroutines try to + // access CoreWorker methods that are already garbage collected. We should complete + // all coroutines before shutting down in order to prevent this. + if (core_worker_->worker_context_->CurrentActorIsAsync()) { + core_worker_->options_.terminate_asyncio_thread(); + } + core_worker_->task_execution_service_.stop(); + } + + core_worker_->task_event_buffer_->FlushEvents(/*forced=*/true); + core_worker_->task_event_buffer_->Stop(); + + core_worker_->io_service_.stop(); + RAY_LOG(INFO) << "Waiting for joining a core worker io thread. If it hangs here, there " + "might be deadlock or a high load in the core worker io service."; + if (core_worker_->io_thread_.joinable()) { + // Check if we're already running in the IO thread to avoid self-join deadlock + if (core_worker_->io_thread_.get_id() != boost::this_thread::get_id()) { + core_worker_->io_thread_.join(); + } else { + RAY_LOG(INFO) + << "Skipping IO thread join since we're already running in the IO thread"; + } + } + + // Shutdown gRPC server + core_worker_->core_worker_server_->Shutdown(); + + // Now that gcs_client is not used within io service, we can reset the pointer and clean + // it up. + if (core_worker_->gcs_client_) { + RAY_LOG(INFO) << "Disconnecting a GCS client."; + // TODO(55607): Move the Disconnect() logic to GcsClient destructor. 
+ // https://github.com/ray-project/ray/issues/55607 + core_worker_->gcs_client_->Disconnect(); + core_worker_->gcs_client_.reset(); + } + + RAY_LOG(INFO) << "Core worker ready to be deallocated."; +} + +void CoreWorkerShutdownExecutor::ExecuteForceShutdown(std::string_view exit_type, + std::string_view detail) { + KillChildProcessesImmediately(); + DisconnectServices(exit_type, detail, nullptr); + QuickExit(); +} + +void CoreWorkerShutdownExecutor::ExecuteWorkerExit(std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms) { + ExecuteExit(exit_type, detail, timeout_ms, nullptr); +} + +void CoreWorkerShutdownExecutor::ExecuteExit( + std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms, + const std::shared_ptr &creation_task_exception_pb_bytes) { + RAY_LOG(INFO) << "Executing worker exit: " << exit_type << " - " << detail + << " (timeout: " << timeout_ms.count() << "ms)"; + + { + absl::MutexLock lock(&core_worker_->mutex_); + RAY_CHECK_NE(detail, ""); + core_worker_->exiting_detail_ = std::optional{detail}; + } + + auto shutdown_callback = [this, + exit_type = std::string(exit_type), + detail = std::string(detail), + creation_task_exception_pb_bytes]() { + // To avoid problems, make sure shutdown is always called from the same + // event loop each time. + core_worker_->task_execution_service_.post( + [this, exit_type, detail, creation_task_exception_pb_bytes]() { + rpc::DrainServerCallExecutor(); + KillChildProcessesImmediately(); + DisconnectServices(exit_type, detail, creation_task_exception_pb_bytes); + ExecuteGracefulShutdown( + exit_type, "Post-exit graceful shutdown", std::chrono::milliseconds{30000}); + }, + "CoreWorker.Shutdown"); + }; + + auto drain_references_callback = [this, shutdown_callback]() { + // Post to the event loop to avoid a deadlock between the TaskManager and + // the ReferenceCounter. 
The deadlock can occur because this callback may + // get called by the TaskManager while the ReferenceCounter's lock is held, + // but the callback itself must acquire the ReferenceCounter's lock to + // drain the object references. + core_worker_->task_execution_service_.post( + [this, shutdown_callback]() { + RAY_LOG(INFO) << "Wait for currently executing tasks in the underlying thread " + "pools to finish."; + // Wait for currently executing tasks in the underlying thread pools to + // finish. Note that if tasks have been posted to the thread pools but not + // started yet, they will not be executed. + core_worker_->task_receiver_->Stop(); + + // Release resources only after tasks have stopped executing. + auto status = core_worker_->raylet_ipc_client_->NotifyDirectCallTaskBlocked(); + if (!status.ok()) { + RAY_LOG(WARNING) + << "Failed to notify Raylet. The raylet may have already shut down or " + << "the connection was lost."; + } + + bool not_actor_task = false; + { + absl::MutexLock lock(&core_worker_->mutex_); + not_actor_task = core_worker_->actor_id_.IsNil(); + } + if (not_actor_task) { + // Normal tasks should not hold any object references in the heap after + // executing, but they could in the case that one was stored as a global + // variable (anti-pattern, but possible). We decrement the reference count + // for all local references to account for this. After this call, the only + // references left to drain should be those that are in use by remote + // workers. If these workers hold their references forever, the call to + // drain the reference counter will hang forever and this process will not + // exit until it is forcibly removed (e.g., via SIGKILL). + // + // NOTE(edoakes): this is only safe to do _after_ we have drained executing + // tasks in the task_receiver_, otherwise there might still be user code + // running that relies on the state of the reference counter. + // See: https://github.com/ray-project/ray/pull/53002. 
+ RAY_LOG(INFO) + << "Releasing local references, then draining reference counter."; + core_worker_->reference_counter_->ReleaseAllLocalReferences(); + core_worker_->reference_counter_->DrainAndShutdown(shutdown_callback); + } else { + // If we are an actor, then we may be holding object references in the + // heap. Then, we should not wait to drain the object references before + // shutdown since this could hang. + RAY_LOG(INFO) + << "Not draining reference counter since this is an actor worker."; + shutdown_callback(); + } + }, + "CoreWorker.DrainAndShutdown"); + }; + + core_worker_->task_manager_->DrainAndShutdown(drain_references_callback); +} + +void CoreWorkerShutdownExecutor::ExecuteHandleExit(std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms) { + RAY_LOG(INFO) << "Executing handle exit: " << exit_type << " - " << detail + << " (timeout: " << timeout_ms.count() << "ms)"; + + if (ShouldWorkerIdleExit()) { + auto actual_timeout = timeout_ms; + if (actual_timeout.count() == -1) { + actual_timeout = std::chrono::milliseconds{10000}; // 10s default + } + + ExecuteWorkerExit(exit_type, detail, actual_timeout); + } else { + RAY_LOG(INFO) << "Worker not idle, ignoring exit request: " << detail; + } +} + +void CoreWorkerShutdownExecutor::KillChildProcessesImmediately() { + if (!RayConfig::instance().kill_child_processes_on_worker_exit()) { + RAY_LOG(DEBUG) + << "kill_child_processes_on_worker_exit is not true, skipping KillChildProcs"; + return; + } + + RAY_LOG(DEBUG) << "kill_child_processes_on_worker_exit true, KillChildProcs"; + auto maybe_child_procs = GetAllProcsWithPpid(GetPID()); + + // Enumerating child procs is not supported on this platform. 
+ if (!maybe_child_procs) { + RAY_LOG(DEBUG) << "Killing leaked procs not supported on this platform."; + return; + } + + const auto &child_procs = *maybe_child_procs; + const auto child_procs_str = absl::StrJoin(child_procs, ","); + RAY_LOG(INFO) << "Try killing all child processes of this worker as it exits. " + << "Child process pids: " << child_procs_str; + + for (const auto &child_pid : child_procs) { + auto maybe_error_code = KillProc(child_pid); + RAY_CHECK(maybe_error_code) + << "Expected this path to only be called when KillProc is supported."; + auto error_code = *maybe_error_code; + + RAY_LOG(INFO) << "Kill result for child pid " << child_pid << ": " + << error_code.message() << ", bool " << static_cast(error_code); + if (error_code) { + RAY_LOG(WARNING) << "Unable to kill potentially leaked process " << child_pid + << ": " << error_code.message(); + } + } +} + +bool CoreWorkerShutdownExecutor::ShouldWorkerIdleExit() const { + return core_worker_->IsIdle(); +} + +void CoreWorkerShutdownExecutor::DisconnectServices( + std::string_view exit_type, + std::string_view detail, + const std::shared_ptr &creation_task_exception_pb_bytes) { + core_worker_->RecordMetrics(); + + if (core_worker_->options_.worker_type == WorkerType::DRIVER && + core_worker_->task_event_buffer_->Enabled() && + !RayConfig::instance().task_events_skip_driver_for_test()) { + auto task_event = std::make_unique( + core_worker_->worker_context_->GetCurrentTaskID(), + core_worker_->worker_context_->GetCurrentJobID(), + /* attempt_number */ 0, + rpc::TaskStatus::FINISHED, + /* timestamp */ absl::GetCurrentTimeNanos(), + /*is_actor_task_event=*/ + core_worker_->worker_context_->GetCurrentActorID().IsNil(), + core_worker_->options_.session_name); + core_worker_->task_event_buffer_->AddTaskEvent(std::move(task_event)); + } + + opencensus::stats::StatsExporter::ExportNow(); + if (core_worker_->connected_) { + RAY_LOG(INFO) << "Sending disconnect message to the local raylet."; + 
core_worker_->connected_ = false; + if (core_worker_->raylet_ipc_client_) { + rpc::WorkerExitType worker_exit_type = rpc::WorkerExitType::INTENDED_USER_EXIT; + if (exit_type == "INTENDED_SYSTEM_EXIT") { + worker_exit_type = rpc::WorkerExitType::INTENDED_SYSTEM_EXIT; + } else if (exit_type == "USER_ERROR") { + worker_exit_type = rpc::WorkerExitType::USER_ERROR; + } else if (exit_type == "SYSTEM_ERROR") { + worker_exit_type = rpc::WorkerExitType::SYSTEM_ERROR; + } else if (exit_type == "NODE_OUT_OF_MEMORY") { + worker_exit_type = rpc::WorkerExitType::NODE_OUT_OF_MEMORY; + } + + Status status = core_worker_->raylet_ipc_client_->Disconnect( + worker_exit_type, std::string(detail), creation_task_exception_pb_bytes); + if (status.ok()) { + RAY_LOG(INFO) << "Disconnected from the local raylet."; + } else { + RAY_LOG(WARNING) << "Failed to disconnect from the local raylet: " << status; + } + } + } +} + +void CoreWorkerShutdownExecutor::QuickExit() { + RAY_LOG(WARNING) << "Quick exit - terminating process immediately"; + ray::QuickExit(); + RAY_LOG(WARNING) << "Quick exit - this line should never be reached"; +} +} // namespace core +} // namespace ray diff --git a/src/ray/core_worker/core_worker_shutdown_executor.h b/src/ray/core_worker/core_worker_shutdown_executor.h new file mode 100644 index 000000000000..9617f0c1564f --- /dev/null +++ b/src/ray/core_worker/core_worker_shutdown_executor.h @@ -0,0 +1,104 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "ray/core_worker/shutdown_coordinator.h" + +namespace ray { + +namespace core { + +class CoreWorker; + +/// Concrete implementation of `ShutdownExecutorInterface` that executes actual +/// shutdown operations for `CoreWorker`. +/// +/// Semantics overview: +/// - Graceful shutdown (ExecuteGracefulShutdown): stop accepting new work, drain ongoing +/// work, flush task +/// events, stop services (task execution service, gRPC server, IO service), +/// disconnect from the GCS/raylet, and join the IO thread if safe. This path +/// attempts best-effort cleanup to preserve observability and avoid resource +/// leaks. It may take up to `timeout_ms` for certain steps. +/// - Force shutdown (ExecuteForceShutdown): immediately kill child processes, disconnect +/// services, and +/// terminate the process without draining or cleanup. This path is used to +/// break out of hung or long-running shutdowns and should be considered +/// preemptive; it sacrifices cleanup for determinism. +/// - Worker exit (ExecuteWorkerExit): worker-type-specific graceful +/// shutdown that handles task draining and optional actor creation failure +/// payloads, then proceeds with the graceful sequence. +/// - Handle exit (ExecuteHandleExit): conditional exit that first checks worker +/// idleness and only proceeds when idle; otherwise it is ignored. +class CoreWorkerShutdownExecutor : public ShutdownExecutorInterface { + public: + /// Constructor with CoreWorker reference for accessing internals + /// \param core_worker Reference to the CoreWorker instance + explicit CoreWorkerShutdownExecutor(CoreWorker *core_worker); + + ~CoreWorkerShutdownExecutor() override = default; + + /// Execute graceful shutdown sequence. 
+ /// Stops task execution, flushes task events, stops IO/gRPC services, joins IO + /// thread when not self, and disconnects from GCS. Best-effort cleanup. + void ExecuteGracefulShutdown(std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms) override; + + /// Execute force shutdown sequence. + /// Kills child processes, disconnects services, and terminates the process. + /// Skips draining/cleanup for fast, deterministic termination. + void ExecuteForceShutdown(std::string_view exit_type, std::string_view detail) override; + + /// Execute worker exit sequence with task draining. + /// Drains tasks/references as applicable for worker mode, then performs + /// graceful shutdown. + void ExecuteWorkerExit(std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms) override; + + void ExecuteExit(std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms, + const std::shared_ptr + &creation_task_exception_pb_bytes) override; + + /// Execute handle exit sequence with idle checking. + /// Only performs worker exit if the worker is currently idle; otherwise, it + /// logs and returns without action. 
+ void ExecuteHandleExit(std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms) override; + + void KillChildProcessesImmediately() override; + + bool ShouldWorkerIdleExit() const override; + + private: + /// Reference to CoreWorker for accessing shutdown operations + CoreWorker *core_worker_; + + void DisconnectServices( + std::string_view exit_type, + std::string_view detail, + const std::shared_ptr &creation_task_exception_pb_bytes); + void QuickExit(); +}; +} // namespace core +} // namespace ray diff --git a/src/ray/core_worker/experimental_mutable_object_manager.cc b/src/ray/core_worker/experimental_mutable_object_manager.cc index 9b083e4830e7..ca551d97eb59 100644 --- a/src/ray/core_worker/experimental_mutable_object_manager.cc +++ b/src/ray/core_worker/experimental_mutable_object_manager.cc @@ -24,6 +24,7 @@ #include "absl/strings/str_format.h" #include "ray/common/ray_config.h" #include "ray/object_manager/common.h" +#include "ray/util/time.h" namespace ray { namespace experimental { diff --git a/src/ray/core_worker/experimental_mutable_object_provider.cc b/src/ray/core_worker/experimental_mutable_object_provider.cc index ff5d2addac85..b97b19347278 100644 --- a/src/ray/core_worker/experimental_mutable_object_provider.cc +++ b/src/ray/core_worker/experimental_mutable_object_provider.cc @@ -242,9 +242,9 @@ void MutableObjectProvider::PollWriterClosure( object->GetData()->Data(), object->GetMetadata()->Data(), [this, &io_context, writer_object_id, remote_readers, num_replied]( - const Status &status, const rpc::PushMutableObjectReply &reply) { + const Status &push_object_status, const rpc::PushMutableObjectReply &reply) { *num_replied += 1; - if (!status.ok()) { + if (!push_object_status.ok()) { RAY_LOG(ERROR) << "Failed to transfer object to a remote node for an object id " << writer_object_id << ". 
It can cause hang."; diff --git a/src/ray/core_worker/experimental_mutable_object_provider.h b/src/ray/core_worker/experimental_mutable_object_provider.h index 364d1f7d3aa1..085f8994cfea 100644 --- a/src/ray/core_worker/experimental_mutable_object_provider.h +++ b/src/ray/core_worker/experimental_mutable_object_provider.h @@ -18,8 +18,8 @@ #include #include "ray/core_worker/experimental_mutable_object_manager.h" -#include "ray/raylet_client/raylet_client.h" #include "ray/rpc/client_call.h" +#include "ray/rpc/raylet/raylet_client_interface.h" namespace ray { namespace core { diff --git a/src/ray/core_worker/fake_actor_creator.h b/src/ray/core_worker/fake_actor_creator.h new file mode 100644 index 000000000000..08deb9bf6cda --- /dev/null +++ b/src/ray/core_worker/fake_actor_creator.h @@ -0,0 +1,63 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include "ray/core_worker/actor_creator.h" + +namespace ray { +namespace core { + +class FakeActorCreator : public ActorCreatorInterface { + public: + Status RegisterActor(const TaskSpecification &task_spec) const override { + return Status::OK(); + }; + + void AsyncRegisterActor(const TaskSpecification &task_spec, + gcs::StatusCallback callback) override {} + + void AsyncRestartActorForLineageReconstruction( + const ActorID &actor_id, + uint64_t num_restarts_due_to_lineage_reconstructions, + gcs::StatusCallback callback) override {} + + void AsyncReportActorOutOfScope(const ActorID &actor_id, + uint64_t num_restarts_due_to_lineage_reconstruction, + gcs::StatusCallback callback) override {} + + void AsyncCreateActor( + const TaskSpecification &task_spec, + const rpc::ClientCallback &callback) override {} + + void AsyncWaitForActorRegisterFinish(const ActorID &, + gcs::StatusCallback callback) override { + callbacks.push_back(callback); + } + + [[nodiscard]] bool IsActorInRegistering(const ActorID &actor_id) const override { + return actor_pending; + } + + std::list callbacks; + bool actor_pending = false; +}; + +} // namespace core +} // namespace ray diff --git a/src/ray/core_worker/future_resolver.cc b/src/ray/core_worker/future_resolver.cc index 3231b9595bd9..153c06a0f84f 100644 --- a/src/ray/core_worker/future_resolver.cc +++ b/src/ray/core_worker/future_resolver.cc @@ -15,6 +15,7 @@ #include "ray/core_worker/future_resolver.h" #include +#include namespace ray { namespace core { @@ -32,7 +33,7 @@ void FutureResolver::ResolveFutureAsync(const ObjectID &object_id, request.set_object_id(object_id.Binary()); request.set_owner_worker_id(owner_address.worker_id()); conn->GetObjectStatus( - request, + std::move(request), [this, object_id, owner_address](const Status &status, const rpc::GetObjectStatusReply &reply) { ProcessResolvedObject(object_id, owner_address, status, reply); diff --git 
a/src/ray/core_worker/grpc_service.cc b/src/ray/core_worker/grpc_service.cc new file mode 100644 index 000000000000..128fb3cd72c9 --- /dev/null +++ b/src/ray/core_worker/grpc_service.cc @@ -0,0 +1,118 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/core_worker/grpc_service.h" + +#include +#include + +namespace ray { +namespace rpc { + +void CoreWorkerGrpcService::InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) { + /// TODO(vitsai): Remove this when auth is implemented for node manager. + /// Disable gRPC server metrics since it incurs too high cardinality. 
+ RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED( + CoreWorkerService, PushTask, max_active_rpcs_per_handler_, AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + ActorCallArgWaitComplete, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + RayletNotifyGCSRestart, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + GetObjectStatus, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + WaitForActorRefDeleted, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + PubsubLongPolling, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + PubsubCommandBatch, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + UpdateObjectLocationBatch, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + GetObjectLocationsOwner, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + ReportGeneratorItemReturns, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED( + CoreWorkerService, KillActor, max_active_rpcs_per_handler_, AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED( + CoreWorkerService, CancelTask, max_active_rpcs_per_handler_, AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + RemoteCancelTask, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + 
RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + RegisterMutableObjectReader, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + GetCoreWorkerStats, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED( + CoreWorkerService, LocalGC, max_active_rpcs_per_handler_, AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED( + CoreWorkerService, DeleteObjects, max_active_rpcs_per_handler_, AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED( + CoreWorkerService, SpillObjects, max_active_rpcs_per_handler_, AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + RestoreSpilledObjects, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + DeleteSpilledObjects, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + PlasmaObjectReady, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED( + CoreWorkerService, Exit, max_active_rpcs_per_handler_, AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + AssignObjectOwner, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + NumPendingTasks, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); + RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED(CoreWorkerService, + FreeActorObject, + max_active_rpcs_per_handler_, + AuthType::NO_AUTH); +} + +} // namespace rpc +} // namespace ray diff --git a/src/ray/core_worker/grpc_service.h b/src/ray/core_worker/grpc_service.h new file mode 100644 index 000000000000..fdb2b09fe2c7 --- /dev/null +++ 
b/src/ray/core_worker/grpc_service.h @@ -0,0 +1,174 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines the gRPC service handlers for the core worker server. + * + * core_worker_process should be the only user of this target. If other classes need the + * CoreWorkerInterface in the future, split it into its own target that does not include + * the heavyweight gRPC headers. + * + * To add a new RPC handler: + * - Update core_worker.proto. + * - Add a virtual method to CoreWorkerServiceHandler. + * - Initialize the handler for the method in InitServerCallFactories. + * - Implement the method in core_worker. + */ + +#pragma once + +#include +#include + +#include "ray/common/asio/instrumented_io_context.h" +#include "ray/rpc/grpc_server.h" +#include "ray/rpc/server_call.h" +#include "src/ray/protobuf/core_worker.grpc.pb.h" +#include "src/ray/protobuf/core_worker.pb.h" + +namespace ray { +namespace rpc { + +class CoreWorkerServiceHandler : public DelayedServiceHandler { + public: + /// Blocks until the service is ready to serve RPCs. 
+ virtual void WaitUntilInitialized() = 0; + + virtual void HandlePushTask(PushTaskRequest request, + PushTaskReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleActorCallArgWaitComplete(ActorCallArgWaitCompleteRequest request, + ActorCallArgWaitCompleteReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleRayletNotifyGCSRestart(RayletNotifyGCSRestartRequest request, + RayletNotifyGCSRestartReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetObjectStatus(GetObjectStatusRequest request, + GetObjectStatusReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleWaitForActorRefDeleted(WaitForActorRefDeletedRequest request, + WaitForActorRefDeletedReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandlePubsubLongPolling(PubsubLongPollingRequest request, + PubsubLongPollingReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandlePubsubCommandBatch(PubsubCommandBatchRequest request, + PubsubCommandBatchReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleUpdateObjectLocationBatch(UpdateObjectLocationBatchRequest request, + UpdateObjectLocationBatchReply *reply, + SendReplyCallback send_reply_callback) = 0; + virtual void HandleGetObjectLocationsOwner(GetObjectLocationsOwnerRequest request, + GetObjectLocationsOwnerReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleReportGeneratorItemReturns( + ReportGeneratorItemReturnsRequest request, + ReportGeneratorItemReturnsReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleKillActor(KillActorRequest request, + KillActorReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleCancelTask(CancelTaskRequest request, + CancelTaskReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void 
HandleRemoteCancelTask(RemoteCancelTaskRequest request, + RemoteCancelTaskReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleRegisterMutableObjectReader( + RegisterMutableObjectReaderRequest request, + RegisterMutableObjectReaderReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetCoreWorkerStats(GetCoreWorkerStatsRequest request, + GetCoreWorkerStatsReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleLocalGC(LocalGCRequest request, + LocalGCReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleDeleteObjects(DeleteObjectsRequest request, + DeleteObjectsReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleSpillObjects(SpillObjectsRequest request, + SpillObjectsReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleRestoreSpilledObjects(RestoreSpilledObjectsRequest request, + RestoreSpilledObjectsReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleDeleteSpilledObjects(DeleteSpilledObjectsRequest request, + DeleteSpilledObjectsReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandlePlasmaObjectReady(PlasmaObjectReadyRequest request, + PlasmaObjectReadyReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleExit(ExitRequest request, + ExitReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleAssignObjectOwner(AssignObjectOwnerRequest request, + AssignObjectOwnerReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleNumPendingTasks(NumPendingTasksRequest request, + NumPendingTasksReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleFreeActorObject(FreeActorObjectRequest request, + FreeActorObjectReply *reply, + SendReplyCallback send_reply_callback) = 0; +}; + +class CoreWorkerGrpcService : public GrpcService { + public: 
+ CoreWorkerGrpcService(instrumented_io_context &main_service, + CoreWorkerServiceHandler &service_handler, + int64_t max_active_rpcs_per_handler) + : GrpcService(main_service), + service_handler_(service_handler), + max_active_rpcs_per_handler_(max_active_rpcs_per_handler) {} + + protected: + grpc::Service &GetGrpcService() override { return service_; } + + void InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) override; + + private: + CoreWorkerService::AsyncService service_; + CoreWorkerServiceHandler &service_handler_; + int64_t max_active_rpcs_per_handler_; +}; + +} // namespace rpc +} // namespace ray diff --git a/src/ray/core_worker/lease_policy.cc b/src/ray/core_worker/lease_policy.cc index 10e4471cf39c..f1efd15e951c 100644 --- a/src/ray/core_worker/lease_policy.cc +++ b/src/ray/core_worker/lease_policy.cc @@ -21,8 +21,8 @@ namespace ray { namespace core { -std::pair LocalityAwareLeasePolicy::GetBestNodeForTask( - const TaskSpecification &spec) { +std::pair LocalityAwareLeasePolicy::GetBestNodeForLease( + const LeaseSpecification &spec) { if (spec.GetMessage().scheduling_strategy().scheduling_strategy_case() == rpc::SchedulingStrategy::SchedulingStrategyCase::kSpreadSchedulingStrategy) { // The explicit spread scheduling strategy @@ -40,7 +40,7 @@ std::pair LocalityAwareLeasePolicy::GetBestNodeForTask( } // Pick node based on locality. - if (auto node_id = GetBestNodeIdForTask(spec)) { + if (auto node_id = GetBestNodeIdForLease(spec)) { if (auto addr = node_addr_factory_(node_id.value())) { return std::make_pair(addr.value(), true); } @@ -49,8 +49,8 @@ std::pair LocalityAwareLeasePolicy::GetBestNodeForTask( } /// Criteria for "best" node: The node with the most object bytes (from object_ids) local. 
-std::optional LocalityAwareLeasePolicy::GetBestNodeIdForTask( - const TaskSpecification &spec) { +std::optional LocalityAwareLeasePolicy::GetBestNodeIdForLease( + const LeaseSpecification &spec) { const auto object_ids = spec.GetDependencyIds(); // Number of object bytes (from object_ids) that a given node has local. absl::flat_hash_map bytes_local_table; @@ -76,8 +76,8 @@ std::optional LocalityAwareLeasePolicy::GetBestNodeIdForTask( return max_bytes_node; } -std::pair LocalLeasePolicy::GetBestNodeForTask( - const TaskSpecification &spec) { +std::pair LocalLeasePolicy::GetBestNodeForLease( + const LeaseSpecification &spec) { // Always return the local node. return std::make_pair(local_node_rpc_address_, false); } diff --git a/src/ray/core_worker/lease_policy.h b/src/ray/core_worker/lease_policy.h index 78c927802987..78ae5d4aefd6 100644 --- a/src/ray/core_worker/lease_policy.h +++ b/src/ray/core_worker/lease_policy.h @@ -18,7 +18,7 @@ #include "absl/container/flat_hash_set.h" #include "ray/common/id.h" -#include "ray/common/task/task_spec.h" +#include "ray/common/lease/lease_spec.h" #include "src/ray/protobuf/common.pb.h" namespace ray { @@ -41,9 +41,9 @@ class LocalityDataProviderInterface { /// Interface for mocking the lease policy. class LeasePolicyInterface { public: - /// Get the address of the best worker node for a lease request for the provided task. - virtual std::pair GetBestNodeForTask( - const TaskSpecification &spec) = 0; + /// Get the address of the best worker node for a lease request. + virtual std::pair GetBestNodeForLease( + const LeaseSpecification &spec) = 0; virtual ~LeasePolicyInterface() = default; }; @@ -63,13 +63,13 @@ class LocalityAwareLeasePolicy : public LeasePolicyInterface { ~LocalityAwareLeasePolicy() override = default; - /// Get the address of the best worker node for a lease request for the provided task. 
- std::pair GetBestNodeForTask( - const TaskSpecification &spec) override; + /// Get the address of the best worker node for a lease request. + std::pair GetBestNodeForLease( + const LeaseSpecification &spec) override; private: - /// Get the best worker node for a lease request for the provided task. - std::optional GetBestNodeIdForTask(const TaskSpecification &spec); + /// Get the best worker node for a lease request. + std::optional GetBestNodeIdForLease(const LeaseSpecification &spec); /// Provider of locality data that will be used in choosing the best lessor. LocalityDataProviderInterface &locality_data_provider_; @@ -90,9 +90,9 @@ class LocalLeasePolicy : public LeasePolicyInterface { ~LocalLeasePolicy() override = default; - /// Get the address of the local node for a lease request for the provided task. - std::pair GetBestNodeForTask( - const TaskSpecification &spec) override; + /// Get the address of the local node for a lease request. + std::pair GetBestNodeForLease( + const LeaseSpecification &spec) override; private: /// RPC address of the local node. 
diff --git a/src/ray/core_worker/lib/java/BUILD.bazel b/src/ray/core_worker/lib/java/BUILD.bazel index bce303e200b8..470e5a775838 100644 --- a/src/ray/core_worker/lib/java/BUILD.bazel +++ b/src/ray/core_worker/lib/java/BUILD.bazel @@ -24,8 +24,9 @@ ray_cc_binary( "//:src/ray/ray_exported_symbols.lds", "//:src/ray/ray_version_script.lds", "//src/ray/core_worker:core_worker_lib", - "//src/ray/gcs/gcs_client:global_state_accessor_lib", + "//src/ray/gcs_client:global_state_accessor_lib", "//src/ray/stats:stats_lib", + "//src/ray/util:time", "@bazel_tools//tools/jdk:jni", ], ) diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.cc b/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.cc index 40cac1dc9ecc..1b8b72cbb8e6 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.cc +++ b/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.cc @@ -28,6 +28,7 @@ #include "ray/common/ray_config.h" #include "ray/core_worker/actor_handle.h" #include "ray/core_worker/core_worker.h" +#include "ray/util/time.h" thread_local JNIEnv *local_env = nullptr; jobject java_task_executor = nullptr; @@ -69,18 +70,18 @@ jobject ToJavaArgs(JNIEnv *env, jobject args_array_list = NativeVectorToJavaList>( env, args, - [check_results, &i](JNIEnv *env, + [check_results, &i](JNIEnv *inner_env, const std::shared_ptr &native_object) { if (*(check_results + (i++))) { // If the type of this argument is ByteBuffer, we create a // DirectByteBuffer here To avoid data copy. 
// TODO(kfstorm): Check native_object->GetMetadata() == "RAW" - jobject obj = env->NewDirectByteBuffer(native_object->GetData()->Data(), - native_object->GetData()->Size()); + jobject obj = inner_env->NewDirectByteBuffer( + native_object->GetData()->Data(), native_object->GetData()->Size()); RAY_CHECK(obj); return obj; } - return NativeRayObjectToJavaNativeRayObject(env, native_object); + return NativeRayObjectToJavaNativeRayObject(inner_env, native_object); }); env->ReleaseBooleanArrayElements(java_check_results, check_results, JNI_ABORT); return args_array_list; @@ -152,7 +153,7 @@ Java_io_ray_runtime_RayNativeRuntime_nativeInitialize(JNIEnv *env, // errors for Java. *is_retryable_error = false; - JNIEnv *env = GetJNIEnv(); + JNIEnv *inner_env = GetJNIEnv(); RAY_CHECK(java_task_executor); // convert RayFunction @@ -168,53 +169,56 @@ Java_io_ray_runtime_RayNativeRuntime_nativeInitialize(JNIEnv *env, } if (!ray_function_array_list) { ray_function_array_list = - NativeRayFunctionDescriptorToJavaStringList(env, function_descriptor); + NativeRayFunctionDescriptorToJavaStringList(inner_env, function_descriptor); fd_vector.emplace_back(function_descriptor, ray_function_array_list); } // convert args // TODO(kfstorm): Avoid copying binary data from Java to C++ jbooleanArray java_check_results = static_cast( - env->CallObjectMethod(java_task_executor, - java_task_executor_parse_function_arguments, - ray_function_array_list)); - RAY_CHECK_JAVA_EXCEPTION(env); - jobject args_array_list = ToJavaArgs(env, java_check_results, args); + inner_env->CallObjectMethod(java_task_executor, + java_task_executor_parse_function_arguments, + ray_function_array_list)); + RAY_CHECK_JAVA_EXCEPTION(inner_env); + jobject args_array_list = ToJavaArgs(inner_env, java_check_results, args); // invoke Java method - jobject java_return_objects = env->CallObjectMethod(java_task_executor, - java_task_executor_execute, - ray_function_array_list, - args_array_list); + jobject java_return_objects = + 
inner_env->CallObjectMethod(java_task_executor, + java_task_executor_execute, + ray_function_array_list, + args_array_list); // Check whether the exception is `IntentionalSystemExit`. - jthrowable throwable = env->ExceptionOccurred(); + jthrowable throwable = inner_env->ExceptionOccurred(); if (throwable) { Status status_to_return = Status::OK(); - if (env->IsInstanceOf(throwable, - java_ray_intentional_system_exit_exception_class)) { + if (inner_env->IsInstanceOf(throwable, + java_ray_intentional_system_exit_exception_class)) { status_to_return = Status::IntentionalSystemExit(""); - } else if (env->IsInstanceOf(throwable, java_ray_actor_exception_class)) { - creation_task_exception_pb = SerializeActorCreationException(env, throwable); + } else if (inner_env->IsInstanceOf(throwable, java_ray_actor_exception_class)) { + creation_task_exception_pb = + SerializeActorCreationException(inner_env, throwable); status_to_return = Status::CreationTaskError(""); } else { RAY_LOG(ERROR) << "Unknown java exception was thrown while executing tasks."; } *application_error = status_to_return.ToString(); - env->ExceptionClear(); + inner_env->ExceptionClear(); return status_to_return; } - RAY_CHECK_JAVA_EXCEPTION(env); + RAY_CHECK_JAVA_EXCEPTION(inner_env); int64_t task_output_inlined_bytes = 0; // Process return objects. 
if (!returns->empty()) { std::vector> return_objects; JavaListToNativeVector>( - env, + inner_env, java_return_objects, &return_objects, - [](JNIEnv *env, jobject java_native_ray_object) { - return JavaNativeRayObjectToNativeRayObject(env, java_native_ray_object); + [](JNIEnv *object_env, jobject java_native_ray_object) { + return JavaNativeRayObjectToNativeRayObject(object_env, + java_native_ray_object); }); for (size_t i = 0; i < return_objects.size(); i++) { auto &result_id = (*returns)[i].first; @@ -251,9 +255,9 @@ Java_io_ray_runtime_RayNativeRuntime_nativeInitialize(JNIEnv *env, } } - env->DeleteLocalRef(java_check_results); - env->DeleteLocalRef(java_return_objects); - env->DeleteLocalRef(args_array_list); + inner_env->DeleteLocalRef(java_check_results); + inner_env->DeleteLocalRef(java_return_objects); + inner_env->DeleteLocalRef(args_array_list); return Status::OK(); }; @@ -273,9 +277,9 @@ Java_io_ray_runtime_RayNativeRuntime_nativeInitialize(JNIEnv *env, absl::MutexLock lock(&mutex); int64_t start = current_time_ms(); if (last_gc_time_ms + 1000 < start) { - JNIEnv *env = GetJNIEnv(); + JNIEnv *inner_env = GetJNIEnv(); RAY_LOG(DEBUG) << "Calling System.gc() ..."; - env->CallStaticObjectMethod(java_system_class, java_system_gc); + inner_env->CallStaticObjectMethod(java_system_class, java_system_gc); last_gc_time_ms = current_time_ms(); RAY_LOG(DEBUG) << "GC finished in " << static_cast(last_gc_time_ms - start) / 1000 @@ -299,7 +303,6 @@ Java_io_ray_runtime_RayNativeRuntime_nativeInitialize(JNIEnv *env, options.install_failure_signal_handler = false; options.node_ip_address = JavaStringToNativeString(env, nodeIpAddress); options.node_manager_port = static_cast(nodeManagerPort); - options.raylet_ip_address = JavaStringToNativeString(env, nodeIpAddress); options.driver_name = JavaStringToNativeString(env, driverName); options.task_execution_callback = task_execution_callback; options.gc_collect = gc_collect; @@ -316,34 +319,36 @@ 
Java_io_ray_runtime_RayNativeRuntime_nativeInitialize(JNIEnv *env, return std::make_shared( object.GetData(), object.GetMetadata(), object.GetNestedRefs(), true); } - JNIEnv *env = GetJNIEnv(); - auto java_byte_array = NativeBufferToJavaByteArray(env, object.GetData()); - auto raw_object_id_byte_array = NativeStringToJavaByteArray(env, object_id.Binary()); + JNIEnv *inner_env = GetJNIEnv(); + auto java_byte_array = NativeBufferToJavaByteArray(inner_env, object.GetData()); + auto raw_object_id_byte_array = + NativeStringToJavaByteArray(inner_env, object_id.Binary()); RAY_LOG(DEBUG) << "Allocating Java byte array for object " << object_id; - env->CallStaticVoidMethod(java_object_ref_impl_class, - java_object_ref_impl_class_on_memory_store_object_allocated, - raw_object_id_byte_array, - java_byte_array); - auto java_weak_ref = CreateJavaWeakRef(env, java_byte_array); + inner_env->CallStaticVoidMethod( + java_object_ref_impl_class, + java_object_ref_impl_class_on_memory_store_object_allocated, + raw_object_id_byte_array, + java_byte_array); + auto java_weak_ref = CreateJavaWeakRef(inner_env, java_byte_array); // This shared_ptr will be captured by the data_factory. So when the data_factory // is destructed, we deference the java_weak_ref. std::shared_ptr java_weak_ref_ptr{ reinterpret_cast(java_weak_ref), [](auto p) { - JNIEnv *env = GetJNIEnv(); - env->DeleteLocalRef(reinterpret_cast(p)); + JNIEnv *deleter_env = GetJNIEnv(); + deleter_env->DeleteLocalRef(reinterpret_cast(p)); }}; // Remove this local reference because this byte array is fate-sharing with the // ObjectRefImpl in Java frontend. 
- env->DeleteLocalRef(java_byte_array); - env->DeleteLocalRef(raw_object_id_byte_array); + inner_env->DeleteLocalRef(java_byte_array); + inner_env->DeleteLocalRef(raw_object_id_byte_array); auto data_factory = [java_weak_ref_ptr, object_id]() -> std::shared_ptr { - JNIEnv *env = GetJNIEnv(); - jbyteArray java_byte_array = (jbyteArray)env->CallObjectMethod( + JNIEnv *data_env = GetJNIEnv(); + jbyteArray _java_byte_array = (jbyteArray)data_env->CallObjectMethod( reinterpret_cast(java_weak_ref_ptr.get()), java_weak_reference_get); - RAY_CHECK_JAVA_EXCEPTION(env); - RAY_CHECK(java_byte_array != nullptr) + RAY_CHECK_JAVA_EXCEPTION(data_env); + RAY_CHECK(_java_byte_array != nullptr) << "The java byte array is null of object " << object_id; - return std::make_shared(env, java_byte_array); + return std::make_shared(data_env, _java_byte_array); }; std::shared_ptr metadata_buffer = object.GetMetadata(); return std::make_shared(metadata_buffer, @@ -408,22 +413,23 @@ JNIEXPORT void JNICALL Java_io_ray_runtime_RayNativeRuntime_nativeKillActor( JNIEXPORT jobject JNICALL Java_io_ray_runtime_RayNativeRuntime_nativeGetResourceIds(JNIEnv *env, jclass) { - auto key_converter = [](JNIEnv *env, const std::string &str) -> jstring { - return env->NewStringUTF(str.c_str()); + auto key_converter = [](JNIEnv *inner_env, const std::string &str) -> jstring { + return inner_env->NewStringUTF(str.c_str()); }; auto value_converter = - [](JNIEnv *env, const std::vector> &value) -> jobject { - auto elem_converter = [](JNIEnv *env, + [](JNIEnv *inner_env, + const std::vector> &value) -> jobject { + auto elem_converter = [](JNIEnv *object_env, const std::pair &elem) -> jobject { - jobject java_item = env->NewObject(java_resource_value_class, - java_resource_value_init, - (jlong)elem.first, - (jdouble)elem.second); - RAY_CHECK_JAVA_EXCEPTION(env); + jobject java_item = object_env->NewObject(java_resource_value_class, + java_resource_value_init, + (jlong)elem.first, + (jdouble)elem.second); + 
RAY_CHECK_JAVA_EXCEPTION(object_env); return java_item; }; return NativeVectorToJavaList>( - env, value, std::move(elem_converter)); + inner_env, value, std::move(elem_converter)); }; ResourceMappingType resource_mapping = CoreWorkerProcess::GetCoreWorker().GetResourceIDs(); diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_gcs_GlobalStateAccessor.cc b/src/ray/core_worker/lib/java/io_ray_runtime_gcs_GlobalStateAccessor.cc index e99c24530581..d1b5f56c4699 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_gcs_GlobalStateAccessor.cc +++ b/src/ray/core_worker/lib/java/io_ray_runtime_gcs_GlobalStateAccessor.cc @@ -22,7 +22,7 @@ #include "jni_utils.h" // NOLINT(build/include_subdir) #include "ray/common/ray_config.h" #include "ray/core_worker/common.h" -#include "ray/gcs/gcs_client/global_state_accessor.h" +#include "ray/gcs_client/global_state_accessor.h" #ifdef __cplusplus extern "C" { @@ -64,8 +64,8 @@ JNIEXPORT jobject JNICALL Java_io_ray_runtime_gcs_GlobalStateAccessor_nativeGetA auto *gcs_accessor = reinterpret_cast(gcs_accessor_ptr); auto job_info_list = gcs_accessor->GetAllJobInfo(); return NativeVectorToJavaList( - env, job_info_list, [](JNIEnv *env, const std::string &str) { - return NativeStringToJavaByteArray(env, str); + env, job_info_list, [](JNIEnv *inner_env, const std::string &str) { + return NativeStringToJavaByteArray(inner_env, str); }); } @@ -85,8 +85,8 @@ Java_io_ray_runtime_gcs_GlobalStateAccessor_nativeGetAllNodeInfo(JNIEnv *env, auto *gcs_accessor = reinterpret_cast(gcs_accessor_ptr); auto node_info_list = gcs_accessor->GetAllNodeInfo(); return NativeVectorToJavaList( - env, node_info_list, [](JNIEnv *env, const std::string &str) { - return NativeStringToJavaByteArray(env, str); + env, node_info_list, [](JNIEnv *inner_env, const std::string &str) { + return NativeStringToJavaByteArray(inner_env, str); }); } @@ -110,8 +110,8 @@ Java_io_ray_runtime_gcs_GlobalStateAccessor_nativeGetAllActorInfo( auto actor_info_list = 
gcs_accessor->GetAllActorInfo(std::nullopt, job_id, actor_state_name); return NativeVectorToJavaList( - env, actor_info_list, [](JNIEnv *env, const std::string &str) { - return NativeStringToJavaByteArray(env, str); + env, actor_info_list, [](JNIEnv *inner_env, const std::string &str) { + return NativeStringToJavaByteArray(inner_env, str); }); } @@ -161,8 +161,8 @@ Java_io_ray_runtime_gcs_GlobalStateAccessor_nativeGetAllPlacementGroupInfo( auto *gcs_accessor = reinterpret_cast(gcs_accessor_ptr); auto placement_group_info_list = gcs_accessor->GetAllPlacementGroupInfo(); return NativeVectorToJavaList( - env, placement_group_info_list, [](JNIEnv *env, const std::string &str) { - return NativeStringToJavaByteArray(env, str); + env, placement_group_info_list, [](JNIEnv *inner_env, const std::string &str) { + return NativeStringToJavaByteArray(inner_env, str); }); } diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.cc b/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.cc index e83a3773f184..bb942786ab1b 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.cc +++ b/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.cc @@ -25,7 +25,7 @@ #include "ray/common/id.h" #include "ray/core_worker/common.h" #include "ray/core_worker/core_worker.h" -#include "ray/gcs/gcs_client/global_state_accessor.h" +#include "ray/gcs_client/global_state_accessor.h" Status PutSerializedObject(JNIEnv *env, jobject obj, @@ -128,9 +128,10 @@ Java_io_ray_runtime_object_NativeObjectStore_nativePut___3BLio_ray_runtime_objec JNIEXPORT jobject JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeGet( JNIEnv *env, jclass, jobject ids, jlong timeoutMs) { std::vector object_ids; - JavaListToNativeVector(env, ids, &object_ids, [](JNIEnv *env, jobject id) { - return JavaByteArrayToId(env, static_cast(id)); - }); + JavaListToNativeVector( + env, ids, &object_ids, [](JNIEnv *inner_env, jobject id) { + return 
JavaByteArrayToId(inner_env, static_cast(id)); + }); std::vector> results; auto status = CoreWorkerProcess::GetCoreWorker().Get( object_ids, static_cast(timeoutMs), results); @@ -148,8 +149,8 @@ Java_io_ray_runtime_object_NativeObjectStore_nativeWait(JNIEnv *env, jboolean fetch_local) { std::vector object_ids; JavaListToNativeVector( - env, objectIds, &object_ids, [](JNIEnv *env, jobject id) { - return JavaByteArrayToId(env, static_cast(id)); + env, objectIds, &object_ids, [](JNIEnv *inner_env, jobject id) { + return JavaByteArrayToId(inner_env, static_cast(id)); }); std::vector results; auto status = CoreWorkerProcess::GetCoreWorker().Wait(object_ids, @@ -158,20 +159,21 @@ Java_io_ray_runtime_object_NativeObjectStore_nativeWait(JNIEnv *env, &results, static_cast(fetch_local)); THROW_EXCEPTION_AND_RETURN_IF_NOT_OK(env, status, nullptr); - return NativeVectorToJavaList(env, results, [](JNIEnv *env, const bool &item) { - jobject java_item = - env->NewObject(java_boolean_class, java_boolean_init, (jboolean)item); - RAY_CHECK_JAVA_EXCEPTION(env); - return java_item; - }); + return NativeVectorToJavaList( + env, results, [](JNIEnv *inner_env, const bool &item) { + jobject java_item = + inner_env->NewObject(java_boolean_class, java_boolean_init, (jboolean)item); + RAY_CHECK_JAVA_EXCEPTION(inner_env); + return java_item; + }); } JNIEXPORT void JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeDelete( JNIEnv *env, jclass, jobject objectIds, jboolean localOnly) { std::vector object_ids; JavaListToNativeVector( - env, objectIds, &object_ids, [](JNIEnv *env, jobject id) { - return JavaByteArrayToId(env, static_cast(id)); + env, objectIds, &object_ids, [](JNIEnv *inner_env, jobject id) { + return JavaByteArrayToId(inner_env, static_cast(id)); }); auto status = CoreWorkerProcess::GetCoreWorker().Delete(object_ids, static_cast(localOnly)); @@ -207,15 +209,15 @@ Java_io_ray_runtime_object_NativeObjectStore_nativeGetAllReferenceCounts(JNIEnv return NativeMapToJavaMap>( 
env, reference_counts, - [](JNIEnv *env, const ObjectID &key) { - return IdToJavaByteArray(env, key); + [](JNIEnv *inner_env, const ObjectID &key) { + return IdToJavaByteArray(inner_env, key); }, - [](JNIEnv *env, const std::pair &value) { - jlongArray array = env->NewLongArray(2); - jlong *elements = env->GetLongArrayElements(array, nullptr); + [](JNIEnv *inner_env, const std::pair &value) { + jlongArray array = inner_env->NewLongArray(2); + jlong *elements = inner_env->GetLongArrayElements(array, nullptr); elements[0] = static_cast(value.first); elements[1] = static_cast(value.second); - env->ReleaseLongArrayElements(array, elements, 0); + inner_env->ReleaseLongArrayElements(array, elements, 0); return array; }); } diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.cc b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.cc index 9c0adc401893..b0077962cb2a 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.cc +++ b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.cc @@ -20,7 +20,7 @@ #include "ray/common/id.h" #include "ray/core_worker/common.h" #include "ray/core_worker/core_worker.h" -#include "ray/raylet_client/raylet_client.h" +#include "ray/rpc/raylet/raylet_client_interface.h" #ifdef __cplusplus extern "C" { diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc index 531b19c3d58e..363d51234a12 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc +++ b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc @@ -66,10 +66,10 @@ inline const RayFunction &ToRayFunction(JNIEnv *env, return fd_vector.back().second; } -inline std::vector> ToTaskArgs(JNIEnv *env, jobject args) { +inline std::vector> ToTaskArgs(JNIEnv *inner_env, jobject args) { std::vector> task_args; JavaListToNativeVector>( - env, args, &task_args, 
[](JNIEnv *env, jobject arg) { + inner_env, args, &task_args, [](JNIEnv *env, jobject arg) { auto java_id = env->GetObjectField(arg, java_function_arg_id); if (java_id) { auto java_id_bytes = static_cast( @@ -99,12 +99,12 @@ inline std::unordered_map ToResources(JNIEnv *env, return JavaMapToNativeMap( env, java_resources, - [](JNIEnv *env, jobject java_key) { - return JavaStringToNativeString(env, (jstring)java_key); + [](JNIEnv *inner_env, jobject java_key) { + return JavaStringToNativeString(inner_env, (jstring)java_key); }, - [](JNIEnv *env, jobject java_value) { - double value = env->CallDoubleMethod(java_value, java_double_double_value); - RAY_CHECK_JAVA_EXCEPTION(env); + [](JNIEnv *inner_env, jobject java_value) { + double value = inner_env->CallDoubleMethod(java_value, java_double_double_value); + RAY_CHECK_JAVA_EXCEPTION(inner_env); return value; }); } @@ -232,34 +232,35 @@ inline ActorCreationOptions ToActorCreationOptions(JNIEnv *env, env, java_concurrency_groups_field, &concurrency_groups, - [](JNIEnv *env, jobject java_concurrency_group_impl) { + [](JNIEnv *inner_env, jobject java_concurrency_group_impl) { RAY_CHECK(java_concurrency_group_impl != nullptr); - jobject java_func_descriptors = - env->CallObjectMethod(java_concurrency_group_impl, - java_concurrency_group_impl_get_function_descriptors); - RAY_CHECK_JAVA_EXCEPTION(env); + jobject java_func_descriptors = inner_env->CallObjectMethod( + java_concurrency_group_impl, + java_concurrency_group_impl_get_function_descriptors); + RAY_CHECK_JAVA_EXCEPTION(inner_env); std::vector native_func_descriptors; JavaListToNativeVector( - env, + inner_env, java_func_descriptors, &native_func_descriptors, - [](JNIEnv *env, jobject java_func_descriptor) { + [](JNIEnv *converter_env, jobject java_func_descriptor) { RAY_CHECK(java_func_descriptor != nullptr); - const jint hashcode = GetHashCodeOfJavaObject(env, java_func_descriptor); + const jint hashcode = + GetHashCodeOfJavaObject(converter_env, 
java_func_descriptor); ray::FunctionDescriptor native_func = - ToRayFunction(env, java_func_descriptor, hashcode) + ToRayFunction(converter_env, java_func_descriptor, hashcode) .GetFunctionDescriptor(); return native_func; }); // Put func_descriptors into this task group. const std::string concurrency_group_name = JavaStringToNativeString( - env, - (jstring)env->GetObjectField(java_concurrency_group_impl, - java_concurrency_group_impl_name)); - const uint32_t max_concurrency = env->GetIntField( + inner_env, + (jstring)inner_env->GetObjectField(java_concurrency_group_impl, + java_concurrency_group_impl_name)); + const uint32_t _max_concurrency = inner_env->GetIntField( java_concurrency_group_impl, java_concurrency_group_impl_max_concurrency); return ray::ConcurrencyGroup{ - concurrency_group_name, max_concurrency, native_func_descriptors}; + concurrency_group_name, _max_concurrency, native_func_descriptors}; }); auto java_serialized_runtime_env = (jstring)env->GetObjectField( actorCreationOptions, java_actor_creation_options_serialized_runtime_env); @@ -340,24 +341,24 @@ inline PlacementGroupCreationOptions ToPlacementGroupCreationOptions( placementGroupCreationOptions, java_placement_group_creation_options_bundles); std::vector> bundles; JavaListToNativeVector>( - env, java_bundles, &bundles, [](JNIEnv *env, jobject java_bundle) { + env, java_bundles, &bundles, [](JNIEnv *inner_env, jobject java_bundle) { return JavaMapToNativeMap( - env, + inner_env, java_bundle, - [](JNIEnv *env, jobject java_key) { - return JavaStringToNativeString(env, (jstring)java_key); + [](JNIEnv *key_env, jobject java_key) { + return JavaStringToNativeString(key_env, (jstring)java_key); }, - [](JNIEnv *env, jobject java_value) { - double value = env->CallDoubleMethod(java_value, java_double_double_value); - RAY_CHECK_JAVA_EXCEPTION(env); + [](JNIEnv *value_env, jobject java_value) { + double value = + value_env->CallDoubleMethod(java_value, java_double_double_value); + 
RAY_CHECK_JAVA_EXCEPTION(value_env); return value; }); }); return PlacementGroupCreationOptions(name, ConvertStrategy(java_strategy), bundles, - /*is_detached=*/false, - /*max_cpu_fraction_per_node*/ 1.0); + /*is_detached=*/false); } #ifdef __cplusplus diff --git a/src/ray/core_worker/metrics.h b/src/ray/core_worker/metrics.h new file mode 100644 index 000000000000..e4a8b207ba25 --- /dev/null +++ b/src/ray/core_worker/metrics.h @@ -0,0 +1,45 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "ray/stats/metric.h" + +namespace ray { +namespace core { + +inline ray::stats::Gauge GetTaskMetric() { + /// Tracks tasks by state, including pending, running, and finished tasks. + /// This metric may be recorded from multiple components processing the task in Ray, + /// including the submitting core worker, executor core worker, and pull manager. + /// + /// To avoid metric collection conflicts between components reporting on the same task, + /// we use the "Source" required label. 
+ return ray::stats::Gauge{ + /*name=*/"tasks", + /*description=*/"Current number of tasks currently in a particular state.", + /*unit=*/"", + // Expected tags: + // - State: the task state, as described by rpc::TaskState proto in common.proto + // - Name: the name of the function called (Keep this tag name in sync with the + // TASK_OR_ACTOR_NAME_TAG_KEY in + // python/ray/_private/telemetry/metric_cardinality.py) + // - IsRetry: whether the task is a retry + // - Source: component reporting, e.g., "core_worker", "executor", or "pull_manager" + /*tag_keys=*/{"State", "Name", "Source", "IsRetry", "JobId"}, + }; +} + +} // namespace core +} // namespace ray diff --git a/src/ray/core_worker/object_recovery_manager.cc b/src/ray/core_worker/object_recovery_manager.cc index 8aa30697d53e..893fb4fabd6d 100644 --- a/src/ray/core_worker/object_recovery_manager.cc +++ b/src/ray/core_worker/object_recovery_manager.cc @@ -18,8 +18,6 @@ #include #include -#include "ray/util/util.h" - namespace ray { namespace core { @@ -71,8 +69,8 @@ bool ObjectRecoveryManager::RecoverObject(const ObjectID &object_id) { // gcs_client. object_lookup_( object_id, - [this](const ObjectID &object_id, std::vector locations) { - PinOrReconstructObject(object_id, std::move(locations)); + [this](const ObjectID &object_id_to_lookup, std::vector locations) { + PinOrReconstructObject(object_id_to_lookup, std::move(locations)); }); } else if (requires_recovery) { RAY_LOG(DEBUG).WithField(object_id) << "Recovery already started for object"; @@ -110,7 +108,7 @@ void ObjectRecoveryManager::PinExistingObjectCopy( std::vector other_locations) { // If a copy still exists, pin the object by sending a // PinObjectIDs RPC. 
- const auto node_id = NodeID::FromBinary(raylet_address.raylet_id()); + const auto node_id = NodeID::FromBinary(raylet_address.node_id()); RAY_LOG(DEBUG).WithField(object_id).WithField(node_id) << "Trying to pin copy of lost object at node"; diff --git a/src/ray/core_worker/object_recovery_manager.h b/src/ray/core_worker/object_recovery_manager.h index ede91805c789..77cd9dac636c 100644 --- a/src/ray/core_worker/object_recovery_manager.h +++ b/src/ray/core_worker/object_recovery_manager.h @@ -25,8 +25,8 @@ #include "ray/core_worker/reference_count.h" #include "ray/core_worker/store_provider/memory_store/memory_store.h" #include "ray/core_worker/task_manager.h" -#include "ray/raylet_client/raylet_client.h" -#include "ray/rpc/node_manager/raylet_client_pool.h" +#include "ray/rpc/raylet/raylet_client_interface.h" +#include "ray/rpc/raylet/raylet_client_pool.h" namespace ray { namespace core { diff --git a/src/ray/core_worker/profile_event.cc b/src/ray/core_worker/profile_event.cc index a6c8348dd8b7..6da5ec40c0c0 100644 --- a/src/ray/core_worker/profile_event.cc +++ b/src/ray/core_worker/profile_event.cc @@ -48,7 +48,8 @@ ProfileEvent::ProfileEvent(TaskEventBuffer &task_event_buffer, worker_context.GetWorkerID().Binary(), node_ip_address, event_name, - absl::GetCurrentTimeNanos()); + absl::GetCurrentTimeNanos(), + task_event_buffer_.GetSessionName()); } ProfileEvent::~ProfileEvent() { diff --git a/src/ray/core_worker/reference_count.cc b/src/ray/core_worker/reference_count.cc index 0e8e842ffc8d..2f51b14467da 100644 --- a/src/ray/core_worker/reference_count.cc +++ b/src/ray/core_worker/reference_count.cc @@ -38,7 +38,7 @@ bool ReferenceCounter::OwnedByUs(const ObjectID &object_id) const { absl::MutexLock lock(&mutex_); auto it = object_id_refs_.find(object_id); if (it != object_id_refs_.end()) { - return it->second.owned_by_us; + return it->second.owned_by_us_; } return false; } @@ -102,12 +102,12 @@ bool ReferenceCounter::AddBorrowedObjectInternal(const ObjectID 
&object_id, } RAY_LOG(DEBUG) << "Adding borrowed object " << object_id; - it->second.owner_address = owner_address; + it->second.owner_address_ = owner_address; it->second.foreign_owner_already_monitoring |= foreign_owner_already_monitoring; if (!outer_id.IsNil()) { auto outer_it = object_id_refs_.find(outer_id); - if (outer_it != object_id_refs_.end() && !outer_it->second.owned_by_us) { + if (outer_it != object_id_refs_.end() && !outer_it->second.owned_by_us_) { RAY_LOG(DEBUG) << "Setting borrowed inner ID " << object_id << " contained_in_borrowed: " << outer_id; RAY_CHECK_NE(object_id, outer_id); @@ -143,18 +143,18 @@ void ReferenceCounter::AddObjectRefStats( auto ref_proto = stats->add_object_refs(); ref_proto->set_object_id(ref.first.Binary()); - ref_proto->set_call_site(ref.second.call_site); - ref_proto->set_object_size(ref.second.object_size); + ref_proto->set_call_site(ref.second.call_site_); + ref_proto->set_object_size(ref.second.object_size_); ref_proto->set_local_ref_count(ref.second.local_ref_count); ref_proto->set_submitted_task_ref_count(ref.second.submitted_task_ref_count); auto it = pinned_objects.find(ref.first); if (it != pinned_objects.end()) { ref_proto->set_pinned_in_memory(true); // If some info isn't available, fallback to getting it from the pinned info. - if (ref.second.object_size <= 0) { + if (ref.second.object_size_ <= 0) { ref_proto->set_object_size(it->second.first); } - if (ref.second.call_site.empty()) { + if (ref.second.call_site_.empty()) { ref_proto->set_call_site(it->second.second); } } @@ -162,7 +162,7 @@ void ReferenceCounter::AddObjectRefStats( ref_proto->add_contained_in_owned(obj_id.Binary()); } - if (ref.second.owned_by_us && !ref.second.pending_creation) { + if (ref.second.owned_by_us_ && !ref.second.pending_creation_) { // For finished tasks only, we set the status here instead of in the // TaskManager in case the task spec has already been GCed. 
ref_proto->set_task_status(rpc::TaskStatus::FINISHED); @@ -195,7 +195,7 @@ void ReferenceCounter::AddOwnedObject(const ObjectID &object_id, const int64_t object_size, bool is_reconstructable, bool add_local_ref, - const std::optional &pinned_at_raylet_id, + const std::optional &pinned_at_node_id, rpc::TensorTransport tensor_transport) { absl::MutexLock lock(&mutex_); RAY_CHECK(AddOwnedObjectInternal(object_id, @@ -205,7 +205,7 @@ void ReferenceCounter::AddOwnedObject(const ObjectID &object_id, object_size, is_reconstructable, add_local_ref, - pinned_at_raylet_id, + pinned_at_node_id, tensor_transport)) << "Tried to create an owned object that already exists: " << object_id; } @@ -225,15 +225,15 @@ void ReferenceCounter::AddDynamicReturn(const ObjectID &object_id, } RAY_LOG(DEBUG) << "Adding dynamic return " << object_id << " contained in generator object " << generator_id; - RAY_CHECK(outer_it->second.owned_by_us); - RAY_CHECK(outer_it->second.owner_address.has_value()); - rpc::Address owner_address(outer_it->second.owner_address.value()); + RAY_CHECK(outer_it->second.owned_by_us_); + RAY_CHECK(outer_it->second.owner_address_.has_value()); + rpc::Address owner_address(outer_it->second.owner_address_.value()); RAY_UNUSED(AddOwnedObjectInternal(object_id, {}, owner_address, - outer_it->second.call_site, + outer_it->second.call_site_, /*object_size=*/-1, - outer_it->second.is_reconstructable, + outer_it->second.is_reconstructable_, /*add_local_ref=*/false, std::optional())); AddNestedObjectIdsInternal(generator_id, {object_id}, owner_address); @@ -258,17 +258,17 @@ void ReferenceCounter::OwnDynamicStreamingTaskReturnRef(const ObjectID &object_i } RAY_LOG(DEBUG) << "Adding dynamic return " << object_id << " contained in generator object " << generator_id; - RAY_CHECK(outer_it->second.owned_by_us); - RAY_CHECK(outer_it->second.owner_address.has_value()); - rpc::Address owner_address(outer_it->second.owner_address.value()); + RAY_CHECK(outer_it->second.owned_by_us_); + 
RAY_CHECK(outer_it->second.owner_address_.has_value()); + rpc::Address owner_address(outer_it->second.owner_address_.value()); // We add a local reference here. The ref removal will be handled // by the ObjectRefStream. RAY_UNUSED(AddOwnedObjectInternal(object_id, {}, owner_address, - outer_it->second.call_site, + outer_it->second.call_site_, /*object_size=*/-1, - outer_it->second.is_reconstructable, + outer_it->second.is_reconstructable_, /*add_local_ref=*/true, std::optional())); } @@ -319,7 +319,7 @@ bool ReferenceCounter::AddOwnedObjectInternal( const int64_t object_size, bool is_reconstructable, bool add_local_ref, - const std::optional &pinned_at_raylet_id, + const std::optional &pinned_at_node_id, rpc::TensorTransport tensor_transport) { if (object_id_refs_.contains(object_id)) { return false; @@ -341,7 +341,7 @@ bool ReferenceCounter::AddOwnedObjectInternal( call_site, object_size, is_reconstructable, - pinned_at_raylet_id, + pinned_at_node_id, tensor_transport)) .first; if (!inner_ids.empty()) { @@ -349,9 +349,9 @@ bool ReferenceCounter::AddOwnedObjectInternal( // the inner objects until the outer object ID goes out of scope. AddNestedObjectIdsInternal(object_id, inner_ids, rpc_address_); } - if (pinned_at_raylet_id.has_value()) { + if (pinned_at_node_id.has_value()) { // We eagerly add the pinned location to the set of object locations. 
- AddObjectLocationInternal(it, pinned_at_raylet_id.value()); + AddObjectLocationInternal(it, pinned_at_node_id.value()); } reconstructable_owned_objects_.emplace_back(object_id); @@ -370,7 +370,7 @@ void ReferenceCounter::UpdateObjectSize(const ObjectID &object_id, int64_t objec absl::MutexLock lock(&mutex_); auto it = object_id_refs_.find(object_id); if (it != object_id_refs_.end()) { - it->second.object_size = object_size; + it->second.object_size_ = object_size; PushToLocationSubscribers(it); } } @@ -529,16 +529,16 @@ void ReferenceCounter::UpdateFinishedTaskReferences( int64_t ReferenceCounter::ReleaseLineageReferences(ReferenceTable::iterator ref) { int64_t lineage_bytes_evicted = 0; std::vector argument_ids; - if (on_lineage_released_ && ref->second.owned_by_us) { + if (on_lineage_released_ && ref->second.owned_by_us_) { RAY_LOG(DEBUG) << "Releasing lineage for object " << ref->first; lineage_bytes_evicted += on_lineage_released_(ref->first, &argument_ids); // The object is still in scope by the application and it was // reconstructable with lineage. Mark that its lineage has been evicted so // we can return the right error during reconstruction. 
if (!ref->second.OutOfScope(lineage_pinning_enabled_) && - ref->second.is_reconstructable) { + ref->second.is_reconstructable_) { ref->second.lineage_evicted = true; - ref->second.is_reconstructable = false; + ref->second.is_reconstructable_ = false; } } @@ -558,7 +558,7 @@ int64_t ReferenceCounter::ReleaseLineageReferences(ReferenceTable::iterator ref) OnObjectOutOfScopeOrFreed(arg_it); } if (arg_it->second.ShouldDelete(lineage_pinning_enabled_)) { - RAY_CHECK(arg_it->second.on_ref_removed == nullptr); + RAY_CHECK(!arg_it->second.publish_ref_removed); lineage_bytes_evicted += ReleaseLineageReferences(arg_it); EraseReference(arg_it); } @@ -609,8 +609,8 @@ bool ReferenceCounter::GetOwnerInternal(const ObjectID &object_id, return false; } - if (it->second.owner_address) { - *owner_address = *it->second.owner_address; + if (it->second.owner_address_) { + *owner_address = *it->second.owner_address_; return true; } else { return false; @@ -667,7 +667,7 @@ void ReferenceCounter::FreePlasmaObjects(const std::vector &object_ids // The object is still in scope. It will be removed from this set // once its Reference has been deleted. freed_objects_.insert(object_id); - if (!it->second.owned_by_us) { + if (!it->second.owned_by_us_) { RAY_LOG(WARNING) << "Tried to free an object " << object_id << " that we did not create. 
The object value may not be released."; @@ -683,10 +683,10 @@ void ReferenceCounter::DeleteReferenceInternal(ReferenceTable::iterator it, std::vector *deleted) { const ObjectID id = it->first; RAY_LOG(DEBUG) << "Attempting to delete object " << id; - if (it->second.RefCount() == 0 && it->second.on_ref_removed) { - RAY_LOG(DEBUG) << "Calling on_ref_removed for object " << id; - it->second.on_ref_removed(id); - it->second.on_ref_removed = nullptr; + if (it->second.RefCount() == 0 && it->second.publish_ref_removed) { + RAY_LOG(DEBUG) << "Calling PublishRefRemoved for object " << id; + PublishRefRemovedInternal(id); + it->second.publish_ref_removed = false; } PRINT_REF_COUNT(it); @@ -697,7 +697,7 @@ void ReferenceCounter::DeleteReferenceInternal(ReferenceTable::iterator it, auto inner_it = object_id_refs_.find(inner_id); if (inner_it != object_id_refs_.end()) { RAY_LOG(DEBUG) << "Try to delete inner object " << inner_id; - if (it->second.owned_by_us) { + if (it->second.owned_by_us_) { // If this object ID was nested in an owned object, make sure that // the outer object counted towards the ref count for the inner // object. 
@@ -745,7 +745,7 @@ void ReferenceCounter::EraseReference(ReferenceTable::iterator it) { reconstructable_owned_objects_index_.erase(index_it); } freed_objects_.erase(it->first); - if (it->second.owned_by_us) { + if (it->second.owned_by_us_) { if (ObjectID::IsActorID(it->first)) { num_actors_owned_by_us_--; } else { @@ -787,7 +787,7 @@ void ReferenceCounter::OnObjectOutOfScopeOrFreed(ReferenceTable::iterator it) { } void ReferenceCounter::UnsetObjectPrimaryCopy(ReferenceTable::iterator it) { - it->second.pinned_at_raylet_id.reset(); + it->second.pinned_at_node_id_.reset(); if (it->second.spilled && !it->second.spilled_node_id.IsNil()) { it->second.spilled = false; it->second.spilled_url = ""; @@ -827,18 +827,18 @@ bool ReferenceCounter::AddObjectOutOfScopeOrFreedCallback( return true; } -void ReferenceCounter::ResetObjectsOnRemovedNode(const NodeID &raylet_id) { +void ReferenceCounter::ResetObjectsOnRemovedNode(const NodeID &node_id) { absl::MutexLock lock(&mutex_); for (auto it = object_id_refs_.begin(); it != object_id_refs_.end(); it++) { const auto &object_id = it->first; - if (it->second.pinned_at_raylet_id.value_or(NodeID::Nil()) == raylet_id || - it->second.spilled_node_id == raylet_id) { + if (it->second.pinned_at_node_id_.value_or(NodeID::Nil()) == node_id || + it->second.spilled_node_id == node_id) { UnsetObjectPrimaryCopy(it); if (!it->second.OutOfScope(lineage_pinning_enabled_)) { objects_to_recover_.push_back(object_id); } } - RemoveObjectLocationInternal(it, raylet_id); + RemoveObjectLocationInternal(it, node_id); } } @@ -850,7 +850,7 @@ std::vector ReferenceCounter::FlushObjectsToRecover() { } void ReferenceCounter::UpdateObjectPinnedAtRaylet(const ObjectID &object_id, - const NodeID &raylet_id) { + const NodeID &node_id) { absl::MutexLock lock(&mutex_); auto it = object_id_refs_.find(object_id); if (it != object_id_refs_.end()) { @@ -861,17 +861,17 @@ void ReferenceCounter::UpdateObjectPinnedAtRaylet(const ObjectID &object_id, // The object is still 
in scope. Track the raylet location until the object // has gone out of scope or the raylet fails, whichever happens first. - if (it->second.pinned_at_raylet_id.has_value()) { + if (it->second.pinned_at_node_id_.has_value()) { RAY_LOG(INFO).WithField(object_id) - << "Updating primary location for object to node " << raylet_id - << ", but it already has a primary location " << *it->second.pinned_at_raylet_id + << "Updating primary location for object to node " << node_id + << ", but it already has a primary location " << *it->second.pinned_at_node_id_ << ". This should only happen during reconstruction"; } // Only the owner tracks the location. - RAY_CHECK(it->second.owned_by_us); + RAY_CHECK(it->second.owned_by_us_); if (!it->second.OutOfScope(lineage_pinning_enabled_)) { - if (!is_node_dead_(raylet_id)) { - it->second.pinned_at_raylet_id = raylet_id; + if (!is_node_dead_(node_id)) { + it->second.pinned_at_node_id_ = node_id; } else { UnsetObjectPrimaryCopy(it); objects_to_recover_.push_back(object_id); @@ -887,10 +887,10 @@ bool ReferenceCounter::IsPlasmaObjectPinnedOrSpilled(const ObjectID &object_id, absl::MutexLock lock(&mutex_); auto it = object_id_refs_.find(object_id); if (it != object_id_refs_.end()) { - if (it->second.owned_by_us) { + if (it->second.owned_by_us_) { *owned_by_us = true; *spilled = it->second.spilled; - *pinned_at = it->second.pinned_at_raylet_id.value_or(NodeID::Nil()); + *pinned_at = it->second.pinned_at_node_id_.value_or(NodeID::Nil()); } return true; } @@ -996,7 +996,7 @@ bool ReferenceCounter::GetAndClearLocalBorrowersInternal( // because it is possible to receive a reference to an object that we already // own, e.g., if we execute a task that has an object ID in its arguments // that we created in an earlier task. - if (ref.owned_by_us) { + if (ref.owned_by_us_) { // Return true because we have the ref, but there is no need to return it // since we own the object. 
return true; @@ -1081,16 +1081,16 @@ void ReferenceCounter::MergeRemoteBorrowers(const ObjectID &object_id, // local table. for (const auto &contained_in_borrowed_id : borrower_it->second.nested().contained_in_borrowed_ids) { - RAY_CHECK(borrower_ref.owner_address); + RAY_CHECK(borrower_ref.owner_address_); AddBorrowedObjectInternal(object_id, contained_in_borrowed_id, - *borrower_ref.owner_address, + *borrower_ref.owner_address_, /*foreign_owner_already_monitoring=*/false); } // If we own this ID, then wait for all new borrowers to reach a ref count // of 0 before GCing the object value. - if (it->second.owned_by_us) { + if (it->second.owned_by_us_) { for (const auto &addr : new_borrowers) { WaitForRefRemoved(it, addr); } @@ -1139,10 +1139,10 @@ void ReferenceCounter::WaitForRefRemoved(const ReferenceTable::iterator &ref_it, auto sub_message = std::make_unique(); auto *request = sub_message->mutable_worker_ref_removed_message(); // Only the owner should send requests to borrowers. - RAY_CHECK(ref_it->second.owned_by_us); + RAY_CHECK(ref_it->second.owned_by_us_); request->mutable_reference()->set_object_id(object_id.Binary()); request->mutable_reference()->mutable_owner_address()->CopyFrom( - *ref_it->second.owner_address); + *ref_it->second.owner_address_); request->set_contained_in_id(contained_in_id.Binary()); request->set_intended_worker_id(addr.worker_id()); request->set_subscriber_worker_id(rpc_address_.worker_id()); @@ -1167,20 +1167,21 @@ void ReferenceCounter::WaitForRefRemoved(const ReferenceTable::iterator &ref_it, const Status &) { // When the request is failed, there's no new borrowers ref published from this // borrower. 
- const auto object_id = ObjectID::FromBinary(object_id_binary); - RAY_LOG(DEBUG).WithField(object_id).WithField(WorkerID::FromBinary(addr.worker_id())) + const auto failed_borrower_object_id = ObjectID::FromBinary(object_id_binary); + RAY_LOG(DEBUG) + .WithField(failed_borrower_object_id) + .WithField(WorkerID::FromBinary(addr.worker_id())) << "WaitForRefRemoved failed for object, dest worker"; - CleanupBorrowersOnRefRemoved({}, object_id, addr); + CleanupBorrowersOnRefRemoved({}, failed_borrower_object_id, addr); }; - RAY_CHECK( - object_info_subscriber_->Subscribe(std::move(sub_message), - rpc::ChannelType::WORKER_REF_REMOVED_CHANNEL, - addr, - object_id.Binary(), - /*subscribe_done_callback=*/nullptr, - message_published_callback, - publisher_failed_callback)); + object_info_subscriber_->Subscribe(std::move(sub_message), + rpc::ChannelType::WORKER_REF_REMOVED_CHANNEL, + addr, + object_id.Binary(), + /*subscribe_done_callback=*/nullptr, + message_published_callback, + publisher_failed_callback); } void ReferenceCounter::AddNestedObjectIds(const ObjectID &object_id, @@ -1199,7 +1200,7 @@ void ReferenceCounter::AddNestedObjectIdsInternal(const ObjectID &object_id, // We own object_id. This is a `ray.put()` case OR returning an object ID // from a task and the task's caller executed in the same process as us. if (it != object_id_refs_.end()) { - RAY_CHECK(it->second.owned_by_us); + RAY_CHECK(it->second.owned_by_us_); // The outer object is still in scope. Mark the inner ones as being // contained in the outer object ID so we do not GC the inner objects // until the outer object goes out of scope. @@ -1232,7 +1233,7 @@ void ReferenceCounter::AddNestedObjectIdsInternal(const ObjectID &object_id, inner_it = object_id_refs_.emplace(inner_id, Reference()).first; } // Add the task's caller as a borrower. 
- if (inner_it->second.owned_by_us) { + if (inner_it->second.owned_by_us_) { auto inserted = inner_it->second.mutable_borrow()->borrowers.insert(owner_address).second; if (inserted) { @@ -1252,8 +1253,13 @@ void ReferenceCounter::AddNestedObjectIdsInternal(const ObjectID &object_id, } } -void ReferenceCounter::HandleRefRemoved(const ObjectID &object_id) { - RAY_LOG(DEBUG).WithField(object_id) << "HandleRefRemoved "; +void ReferenceCounter::PublishRefRemoved(const ObjectID &object_id) { + absl::MutexLock lock(&mutex_); + PublishRefRemovedInternal(object_id); +} + +void ReferenceCounter::PublishRefRemovedInternal(const ObjectID &object_id) { + RAY_LOG(DEBUG).WithField(object_id) << "PublishRefRemoved "; auto it = object_id_refs_.find(object_id); if (it != object_id_refs_.end()) { PRINT_REF_COUNT(it); @@ -1283,11 +1289,9 @@ void ReferenceCounter::HandleRefRemoved(const ObjectID &object_id) { object_info_publisher_->Publish(std::move(pub_message)); } -void ReferenceCounter::SetRefRemovedCallback( - const ObjectID &object_id, - const ObjectID &contained_in_id, - const rpc::Address &owner_address, - const ReferenceCounter::ReferenceRemovedCallback &ref_removed_callback) { +void ReferenceCounter::SubscribeRefRemoved(const ObjectID &object_id, + const ObjectID &contained_in_id, + const rpc::Address &owner_address) { absl::MutexLock lock(&mutex_); RAY_LOG(DEBUG).WithField(object_id) << "Received WaitForRefRemoved object contained in " << contained_in_id; @@ -1297,6 +1301,8 @@ void ReferenceCounter::SetRefRemovedCallback( it = object_id_refs_.emplace(object_id, Reference()).first; } + auto &reference = it->second; + // If we are borrowing the ID because we own an object that contains it, then // add the outer object to the inner ID's ref count. We will not respond to // the owner of the inner ID until the outer object ID goes out of scope. 
@@ -1304,28 +1310,28 @@ void ReferenceCounter::SetRefRemovedCallback( AddNestedObjectIdsInternal(contained_in_id, {object_id}, rpc_address_); } - if (it->second.RefCount() == 0) { + if (reference.RefCount() == 0) { RAY_LOG(DEBUG).WithField(object_id) << "Ref count for borrowed object is already 0, responding to WaitForRefRemoved"; // We already stopped borrowing the object ID. Respond to the owner // immediately. - ref_removed_callback(object_id); + PublishRefRemovedInternal(object_id); DeleteReferenceInternal(it, nullptr); } else { // We are still borrowing the object ID. Respond to the owner once we have // stopped borrowing it. - if (it->second.on_ref_removed != nullptr) { + if (reference.publish_ref_removed) { // TODO(swang): If the owner of an object dies and and is re-executed, it // is possible that we will receive a duplicate request to set - // on_ref_removed. If messages are delayed and we overwrite the + // publish_ref_removed. If messages are delayed and we overwrite the // callback here, it's possible we will drop the request that was sent by // the more recent owner. We should fix this by setting multiple // callbacks or by versioning the owner requests. RAY_LOG(WARNING).WithField(object_id) - << "on_ref_removed already set for object. The owner task must have died and " - "been re-executed."; + << "publish_ref_removed already set for object. 
The owner task must have " + "died and been re-executed."; } - it->second.on_ref_removed = ref_removed_callback; + reference.publish_ref_removed = true; } } @@ -1389,8 +1395,8 @@ void ReferenceCounter::UpdateObjectPendingCreationInternal(const ObjectID &objec auto it = object_id_refs_.find(object_id); bool push = false; if (it != object_id_refs_.end()) { - push = (it->second.pending_creation != pending_creation); - it->second.pending_creation = pending_creation; + push = (it->second.pending_creation_ != pending_creation); + it->second.pending_creation_ = pending_creation; } if (push) { PushToLocationSubscribers(it); @@ -1462,11 +1468,11 @@ std::optional ReferenceCounter::GetLocalityData( } // The size of this object. - const auto object_size = it->second.object_size; + const auto object_size = it->second.object_size_; if (object_size < 0) { // We don't know the object size so we can't returned valid locality data. RAY_LOG(DEBUG).WithField(object_id) - << "Reference [" << it->second.call_site + << "Reference [" << it->second.call_site_ << "] for object has an unknown object size, locality data not available"; return absl::nullopt; } @@ -1479,8 +1485,8 @@ std::optional ReferenceCounter::GetLocalityData( auto node_ids = it->second.locations; // Add location of the primary copy since the object must be there: either in memory or // spilled. - if (it->second.pinned_at_raylet_id.has_value()) { - node_ids.emplace(it->second.pinned_at_raylet_id.value()); + if (it->second.pinned_at_node_id_.has_value()) { + node_ids.emplace(it->second.pinned_at_node_id_.value()); } // We should only reach here if we have valid locality data to return. 
@@ -1500,13 +1506,13 @@ bool ReferenceCounter::ReportLocalityData(const ObjectID &object_id, << " The object has probably already been freed."; return false; } - RAY_CHECK(!it->second.owned_by_us) + RAY_CHECK(!it->second.owned_by_us_) << "ReportLocalityData should only be used for borrowed references."; for (const auto &location : locations) { it->second.locations.emplace(location); } if (object_size > 0) { - it->second.object_size = object_size; + it->second.object_size_ = object_size; } return true; } @@ -1517,7 +1523,7 @@ void ReferenceCounter::AddBorrowerAddress(const ObjectID &object_id, auto it = object_id_refs_.find(object_id); RAY_CHECK(it != object_id_refs_.end()); - RAY_CHECK(it->second.owned_by_us) + RAY_CHECK(it->second.owned_by_us_) << "AddBorrowerAddress should only be used for owner references."; RAY_CHECK(borrower_address.worker_id() != rpc_address_.worker_id()) @@ -1542,7 +1548,7 @@ bool ReferenceCounter::IsObjectReconstructable(const ObjectID &object_id, return false; } *lineage_evicted = it->second.lineage_evicted; - return it->second.is_reconstructable; + return it->second.is_reconstructable_; } void ReferenceCounter::UpdateObjectPendingCreation(const ObjectID &object_id, @@ -1557,23 +1563,23 @@ bool ReferenceCounter::IsObjectPendingCreation(const ObjectID &object_id) const if (it == object_id_refs_.end()) { return false; } - return it->second.pending_creation; + return it->second.pending_creation_; } void ReferenceCounter::PushToLocationSubscribers(ReferenceTable::iterator it) { const auto &object_id = it->first; const auto &locations = it->second.locations; - auto object_size = it->second.object_size; + auto object_size = it->second.object_size_; const auto &spilled_url = it->second.spilled_url; const auto &spilled_node_id = it->second.spilled_node_id; - const auto &optional_primary_node_id = it->second.pinned_at_raylet_id; + const auto &optional_primary_node_id = it->second.pinned_at_node_id_; const auto &primary_node_id = 
optional_primary_node_id.value_or(NodeID::Nil()); RAY_LOG(DEBUG).WithField(object_id) << "Published message for object, " << locations.size() << " locations, spilled url: [" << spilled_url << "], spilled node ID: " << spilled_node_id << ", and object size: " << object_size << ", and primary node ID: " << primary_node_id << ", pending creation? " - << it->second.pending_creation; + << it->second.pending_creation_; rpc::PubMessage pub_message; pub_message.set_key_id(object_id.Binary()); pub_message.set_channel_type(rpc::ChannelType::WORKER_OBJECT_LOCATIONS_CHANNEL); @@ -1604,15 +1610,15 @@ void ReferenceCounter::FillObjectInformationInternal( for (const auto &node_id : it->second.locations) { object_info->add_node_ids(node_id.Binary()); } - int64_t object_size = it->second.object_size; + int64_t object_size = it->second.object_size_; if (object_size > 0) { - object_info->set_object_size(it->second.object_size); + object_info->set_object_size(it->second.object_size_); } object_info->set_spilled_url(it->second.spilled_url); object_info->set_spilled_node_id(it->second.spilled_node_id.Binary()); - auto primary_node_id = it->second.pinned_at_raylet_id.value_or(NodeID::Nil()); + auto primary_node_id = it->second.pinned_at_node_id_.value_or(NodeID::Nil()); object_info->set_primary_node_id(primary_node_id.Binary()); - object_info->set_pending_creation(it->second.pending_creation); + object_info->set_pending_creation(it->second.pending_creation_); object_info->set_did_spill(it->second.did_spill); } @@ -1670,7 +1676,7 @@ std::string ReferenceCounter::Reference::DebugString() const { ReferenceCounter::Reference ReferenceCounter::Reference::FromProto( const rpc::ObjectReferenceCount &ref_count) { Reference ref; - ref.owner_address = ref_count.reference().owner_address(); + ref.owner_address_ = ref_count.reference().owner_address(); ref.local_ref_count = ref_count.has_local_ref() ? 
1 : 0; for (const auto &borrower : ref_count.borrowers()) { @@ -1692,8 +1698,8 @@ ReferenceCounter::Reference ReferenceCounter::Reference::FromProto( void ReferenceCounter::Reference::ToProto(rpc::ObjectReferenceCount *ref, bool deduct_local_ref) const { - if (owner_address) { - ref->mutable_reference()->mutable_owner_address()->CopyFrom(*owner_address); + if (owner_address_) { + ref->mutable_reference()->mutable_owner_address()->CopyFrom(*owner_address_); } ref->set_has_local_ref(RefCount() > (deduct_local_ref ? 1 : 0)); for (const auto &borrower : borrow().borrowers) { @@ -1719,7 +1725,7 @@ std::optional ReferenceCounter::GetTensorTransport( if (it == object_id_refs_.end()) { return absl::nullopt; } - return it->second.tensor_transport; + return it->second.tensor_transport_; } } // namespace core diff --git a/src/ray/core_worker/reference_count.h b/src/ray/core_worker/reference_count.h index 1ede49834e94..75e8423ca86e 100644 --- a/src/ray/core_worker/reference_count.h +++ b/src/ray/core_worker/reference_count.h @@ -28,8 +28,8 @@ #include "absl/synchronization/mutex.h" #include "ray/common/id.h" #include "ray/core_worker/lease_policy.h" -#include "ray/pubsub/publisher.h" -#include "ray/pubsub/subscriber.h" +#include "ray/pubsub/publisher_interface.h" +#include "ray/pubsub/subscriber_interface.h" #include "ray/rpc/grpc_server.h" #include "ray/rpc/worker/core_worker_client.h" #include "ray/rpc/worker/core_worker_client_pool.h" @@ -56,7 +56,7 @@ class ReferenceCounterInterface { const int64_t object_size, bool is_reconstructable, bool add_local_ref, - const std::optional &pinned_at_raylet_id = std::optional(), + const std::optional &pinned_at_node_id = std::optional(), rpc::TensorTransport tensor_transport = rpc::TensorTransport::OBJECT_STORE) = 0; virtual bool AddObjectOutOfScopeOrFreedCallback( const ObjectID &object_id, @@ -188,7 +188,7 @@ class ReferenceCounter : public ReferenceCounterInterface, /// \param[in] add_local_ref Whether to initialize the local ref 
count to 1. /// This is used to ensure that the ref is considered in scope before the /// corresponding ObjectRef has been returned to the language frontend. - /// \param[in] pinned_at_raylet_id The primary location for the object, if it + /// \param[in] pinned_at_node_id The primary location for the object, if it /// is already known. This is only used for ray.put calls. /// \param[in] tensor_transport The transport used for the object. void AddOwnedObject( @@ -199,7 +199,7 @@ class ReferenceCounter : public ReferenceCounterInterface, const int64_t object_size, bool is_reconstructable, bool add_local_ref, - const std::optional &pinned_at_raylet_id = std::optional(), + const std::optional &pinned_at_node_id = std::optional(), rpc::TensorTransport tensor_transport = rpc::TensorTransport::OBJECT_STORE) override ABSL_LOCKS_EXCLUDED(mutex_); @@ -345,8 +345,8 @@ class ReferenceCounter : public ReferenceCounterInterface, const std::function callback) override ABSL_LOCKS_EXCLUDED(mutex_); - /// Set a callback for when we are no longer borrowing this object (when our - /// ref count goes to 0). + /// So we call PublishRefRemovedInternal when we are no longer borrowing this object + /// (when our ref count goes to 0). /// /// \param[in] object_id The object ID to set the callback for. /// \param[in] contained_in_id The object ID that contains object_id, if any. @@ -354,13 +354,9 @@ class ReferenceCounter : public ReferenceCounterInterface, /// submitted. Then, as long as we have contained_in_id in scope, we are /// borrowing object_id. /// \param[in] owner_address The owner of object_id's address. - /// \param[in] ref_removed_callback The callback to call when we are no - /// longer borrowing the object. 
- void SetRefRemovedCallback(const ObjectID &object_id, - const ObjectID &contained_in_id, - const rpc::Address &owner_address, - const ReferenceRemovedCallback &ref_removed_callback) - ABSL_LOCKS_EXCLUDED(mutex_); + void SubscribeRefRemoved(const ObjectID &object_id, + const ObjectID &contained_in_id, + const rpc::Address &owner_address) ABSL_LOCKS_EXCLUDED(mutex_); /// Set a callback to call whenever a Reference that we own is deleted. A /// Reference can only be deleted if: @@ -371,12 +367,8 @@ class ReferenceCounter : public ReferenceCounterInterface, /// \param[in] callback The callback to call. void SetReleaseLineageCallback(const LineageReleasedCallback &callback); - /// Respond to the object's owner once we are no longer borrowing it. The - /// sender is the owner of the object ID. We will send the reply when our - /// RefCount() for the object ID goes to 0. - /// - /// \param[in] object_id The object that we were borrowing. - void HandleRefRemoved(const ObjectID &object_id) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + /// Just calls PublishRefRemovedInternal with a lock. + void PublishRefRemoved(const ObjectID &object_id) ABSL_LOCKS_EXCLUDED(mutex_); /// Returns the total number of ObjectIDs currently in scope. size_t NumObjectIDsInScope() const ABSL_LOCKS_EXCLUDED(mutex_); @@ -444,8 +436,8 @@ class ReferenceCounter : public ReferenceCounterInterface, /// Update the pinned location of an object stored in plasma. /// /// \param[in] object_id The object to update. - /// \param[in] raylet_id The raylet that is now pinning the object ID. - void UpdateObjectPinnedAtRaylet(const ObjectID &object_id, const NodeID &raylet_id) + /// \param[in] node_id The raylet that is now pinning the object ID. 
+ void UpdateObjectPinnedAtRaylet(const ObjectID &object_id, const NodeID &node_id) ABSL_LOCKS_EXCLUDED(mutex_); /// Check whether the object is pinned at a remote plasma store node or @@ -473,7 +465,7 @@ class ReferenceCounter : public ReferenceCounterInterface, /// /// \param[in] node_id The node whose object store has been removed. /// \return The set of objects that were pinned on the given node. - void ResetObjectsOnRemovedNode(const NodeID &raylet_id); + void ResetObjectsOnRemovedNode(const NodeID &node_id); std::vector FlushObjectsToRecover(); @@ -646,22 +638,22 @@ class ReferenceCounter : public ReferenceCounterInterface, /// Constructor for a reference whose origin is unknown. Reference() = default; Reference(std::string call_site, int64_t object_size) - : call_site(std::move(call_site)), object_size(object_size) {} + : call_site_(std::move(call_site)), object_size_(object_size) {} /// Constructor for a reference that we created. Reference(rpc::Address owner_address, std::string call_site, int64_t object_size, bool is_reconstructable, - std::optional pinned_at_raylet_id, + std::optional pinned_at_node_id, rpc::TensorTransport tensor_transport) - : call_site(std::move(call_site)), - object_size(object_size), - owner_address(std::move(owner_address)), - pinned_at_raylet_id(std::move(pinned_at_raylet_id)), - tensor_transport(tensor_transport), - owned_by_us(true), - is_reconstructable(is_reconstructable), - pending_creation(!pinned_at_raylet_id.has_value()) {} + : call_site_(std::move(call_site)), + object_size_(object_size), + owner_address_(std::move(owner_address)), + pinned_at_node_id_(std::move(pinned_at_node_id)), + tensor_transport_(tensor_transport), + owned_by_us_(true), + is_reconstructable_(is_reconstructable), + pending_creation_(!pinned_at_node_id_.has_value()) {} /// Constructor from a protobuf. This is assumed to be a message from /// another process, so the object defaults to not being owned by us. 
@@ -694,7 +686,7 @@ class ReferenceCounter : public ReferenceCounterInterface, bool was_stored_in_objects = !borrow().stored_in_objects.empty(); bool has_lineage_references = false; - if (lineage_pinning_enabled && owned_by_us && !is_reconstructable) { + if (lineage_pinning_enabled && owned_by_us_ && !is_reconstructable_) { has_lineage_references = lineage_ref_count > 0; } @@ -756,9 +748,9 @@ class ReferenceCounter : public ReferenceCounterInterface, std::string DebugString() const; /// Description of the call site where the reference was created. - std::string call_site = ""; + std::string call_site_ = ""; /// Object size if known, otherwise -1; - int64_t object_size = -1; + int64_t object_size_ = -1; /// If this object is owned by us and stored in plasma, this contains all /// object locations. absl::flat_hash_set locations; @@ -766,25 +758,25 @@ class ReferenceCounter : public ReferenceCounterInterface, /// owner, then this is added during creation of the Reference. If this is /// process is a borrower, the borrower must add the owner's address before /// using the ObjectID. - std::optional owner_address; + std::optional owner_address_; /// If this object is owned by us and stored in plasma, and reference /// counting is enabled, then some raylet must be pinning the object value. /// This is the address of that raylet. - std::optional pinned_at_raylet_id; + std::optional pinned_at_node_id_; /// TODO(kevin85421): Make tensor_transport a required field for all constructors. /// /// The transport used for the object. - rpc::TensorTransport tensor_transport = rpc::TensorTransport::OBJECT_STORE; + rpc::TensorTransport tensor_transport_ = rpc::TensorTransport::OBJECT_STORE; /// Whether we own the object. If we own the object, then we are /// responsible for tracking the state of the task that creates the object /// (see task_manager.h). - bool owned_by_us = false; + bool owned_by_us_ = false; // Whether this object can be reconstructed via lineage. 
If false, then the // object's value will be pinned as long as it is referenced by any other // object's lineage. This should be set to false if the object was created // by ray.put(), a task that cannot be retried, or its lineage was evicted. - bool is_reconstructable = false; + bool is_reconstructable_ = false; /// Whether the lineage of this object was evicted due to memory pressure. bool lineage_evicted = false; /// The number of tasks that depend on this object that may be retried in @@ -815,9 +807,9 @@ class ReferenceCounter : public ReferenceCounterInterface, /// Callback that will be called when the object ref is deleted /// from the reference table (all refs including lineage ref count go to 0). std::function on_object_ref_delete; - /// Callback that is called when this process is no longer a borrower - /// (RefCount() == 0). - std::function on_ref_removed; + /// If this is set, we'll call PublishRefRemovedInternal when this process is no + /// longer a borrower (RefCount() == 0). + bool publish_ref_removed = false; /// For objects that have been spilled to external storage, the URL from which /// they can be retrieved. @@ -843,7 +835,7 @@ class ReferenceCounter : public ReferenceCounterInterface, bool has_nested_refs_to_report = false; /// Whether the task that creates this object is scheduled/executing. - bool pending_creation = false; + bool pending_creation_ = false; /// Whether or not this object was spilled. 
bool did_spill = false; @@ -860,7 +852,7 @@ class ReferenceCounter : public ReferenceCounterInterface, const int64_t object_size, bool is_reconstructable, bool add_local_ref, - const std::optional &pinned_at_raylet_id, + const std::optional &pinned_at_node_id, rpc::TensorTransport tensor_transport = rpc::TensorTransport::OBJECT_STORE) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); @@ -990,6 +982,12 @@ class ReferenceCounter : public ReferenceCounterInterface, std::vector *deleted) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + /// To respond to the object's owner once we are no longer borrowing it. The + /// sender is the owner of the object ID. We will send the reply when our + /// RefCount() for the object ID goes to 0. + void PublishRefRemovedInternal(const ObjectID &object_id) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + /// Erase the Reference from the table. Assumes that the entry has no more /// references, normal or lineage. void EraseReference(ReferenceTable::iterator entry) diff --git a/src/ray/core_worker/shutdown_coordinator.cc b/src/ray/core_worker/shutdown_coordinator.cc new file mode 100644 index 000000000000..a08cb8ec0f88 --- /dev/null +++ b/src/ray/core_worker/shutdown_coordinator.cc @@ -0,0 +1,287 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/core_worker/shutdown_coordinator.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "ray/common/buffer.h" // LocalMemoryBuffer +namespace ray { + +namespace core { + +ShutdownCoordinator::ShutdownCoordinator( + std::unique_ptr executor, rpc::WorkerType worker_type) + : executor_(std::move(executor)), worker_type_(worker_type) { + RAY_CHECK(executor_) + << "ShutdownCoordinator requires a non-null ShutdownExecutorInterface. " + << "This indicates a construction-time bug. " + << "Pass a concrete executor (e.g., CoreWorkerShutdownExecutor) " + << "when creating the coordinator."; +} + +bool ShutdownCoordinator::RequestShutdown( + bool force_shutdown, + ShutdownReason reason, + std::string_view detail, + std::chrono::milliseconds timeout_ms, + const std::shared_ptr &creation_task_exception_pb_bytes) { + bool should_execute = false; + bool execute_force = force_shutdown; + { + std::lock_guard lock(mu_); + if (state_ == ShutdownState::kShutdown) { + return false; + } + // If a force request arrives, latch it immediately to guarantee single execution. 
+ if (force_shutdown) { + if (force_started_) { + return false; + } + force_started_ = true; + reason_ = reason; + shutdown_detail_ = std::string(detail); + if (state_ == ShutdownState::kRunning) { + state_ = ShutdownState::kShuttingDown; + } + should_execute = true; + } else { + if (state_ != ShutdownState::kRunning) { + return false; + } + state_ = ShutdownState::kShuttingDown; + reason_ = reason; + shutdown_detail_ = std::string(detail); + should_execute = true; + } + } + + if (!should_execute) { + return false; + } + + ExecuteShutdownSequence( + execute_force, detail, timeout_ms, creation_task_exception_pb_bytes); + return true; +} + +bool ShutdownCoordinator::TryTransitionToDisconnecting() { + std::lock_guard lock(mu_); + if (state_ != ShutdownState::kShuttingDown) { + return false; + } + state_ = ShutdownState::kDisconnecting; + return true; +} + +bool ShutdownCoordinator::TryTransitionToShutdown() { + std::lock_guard lock(mu_); + if (state_ != ShutdownState::kShuttingDown && state_ != ShutdownState::kDisconnecting) { + return false; + } + state_ = ShutdownState::kShutdown; + return true; +} + +ShutdownState ShutdownCoordinator::GetState() const { + std::lock_guard lock(mu_); + return state_; +} + +ShutdownReason ShutdownCoordinator::GetReason() const { + std::lock_guard lock(mu_); + return reason_; +} + +bool ShutdownCoordinator::ShouldEarlyExit() const { + std::lock_guard lock(mu_); + return state_ != ShutdownState::kRunning; +} + +bool ShutdownCoordinator::IsRunning() const { + return GetState() == ShutdownState::kRunning; +} + +bool ShutdownCoordinator::IsShuttingDown() const { + return GetState() != ShutdownState::kRunning; +} + +bool ShutdownCoordinator::IsShutdown() const { + return GetState() == ShutdownState::kShutdown; +} + +std::string ShutdownCoordinator::GetStateString() const { + switch (GetState()) { + case ShutdownState::kRunning: + return "Running"; + case ShutdownState::kShuttingDown: + return "ShuttingDown"; + case 
ShutdownState::kDisconnecting: + return "Disconnecting"; + case ShutdownState::kShutdown: + return "Shutdown"; + default: + return "Unknown"; + } +} + +// Methods that execute shutdown logic + +void ShutdownCoordinator::ExecuteShutdownSequence( + bool force_shutdown, + std::string_view detail, + std::chrono::milliseconds timeout_ms, + const std::shared_ptr &creation_task_exception_pb_bytes) { + switch (worker_type_) { + case rpc::WorkerType::DRIVER: + ExecuteDriverShutdown(force_shutdown, detail, timeout_ms); + break; + case rpc::WorkerType::WORKER: + case rpc::WorkerType::SPILL_WORKER: + case rpc::WorkerType::RESTORE_WORKER: + ExecuteWorkerShutdown( + force_shutdown, detail, timeout_ms, creation_task_exception_pb_bytes); + break; + default: + RAY_LOG(FATAL) << "Unknown worker type: " << static_cast(worker_type_) + << ". This should be unreachable. Please file a bug at " + << "https://github.com/ray-project/ray/issues."; + break; + } +} + +void ShutdownCoordinator::ExecuteGracefulShutdown(std::string_view detail, + std::chrono::milliseconds timeout_ms) { + TryTransitionToDisconnecting(); + executor_->ExecuteGracefulShutdown(GetExitTypeString(), detail, timeout_ms); + TryTransitionToShutdown(); +} + +void ShutdownCoordinator::ExecuteForceShutdown(std::string_view detail) { + // Force shutdown bypasses normal state transitions and terminates immediately + // This ensures that force shutdowns can interrupt hanging graceful shutdowns + { + std::lock_guard lock(mu_); + if (force_executed_) { + return; + } + force_executed_ = true; + } + executor_->ExecuteForceShutdown(GetExitTypeString(), detail); + + // Only update state if we're not already in final state + // (force shutdown should have terminated the process by now) + TryTransitionToShutdown(); +} + +void ShutdownCoordinator::ExecuteDriverShutdown(bool force_shutdown, + std::string_view detail, + std::chrono::milliseconds timeout_ms) { + if (force_shutdown) { + ExecuteForceShutdown(detail); + } else { + 
ExecuteGracefulShutdown(detail, timeout_ms); + } +} + +void ShutdownCoordinator::ExecuteWorkerShutdown( + bool force_shutdown, + std::string_view detail, + std::chrono::milliseconds timeout_ms, + const std::shared_ptr &creation_task_exception_pb_bytes) { + if (force_shutdown) { + ExecuteForceShutdown(detail); + return; + } + + ShutdownReason reason = GetReason(); + + if (reason == ShutdownReason::kActorCreationFailed) { + TryTransitionToDisconnecting(); + executor_->ExecuteExit( + GetExitTypeString(), detail, timeout_ms, creation_task_exception_pb_bytes); + } else if (reason == ShutdownReason::kUserError || + reason == ShutdownReason::kGracefulExit || + reason == ShutdownReason::kIntentionalShutdown || + reason == ShutdownReason::kUnexpectedError || + reason == ShutdownReason::kOutOfMemory || + reason == ShutdownReason::kActorKilled) { + TryTransitionToDisconnecting(); + executor_->ExecuteWorkerExit(GetExitTypeString(), detail, timeout_ms); + } else if (reason == ShutdownReason::kIdleTimeout || + reason == ShutdownReason::kJobFinished) { + TryTransitionToDisconnecting(); + executor_->ExecuteHandleExit(GetExitTypeString(), detail, timeout_ms); + } else { + ExecuteGracefulShutdown(detail, timeout_ms); + } +} + +std::string ShutdownCoordinator::GetExitTypeString() const { + switch (GetReason()) { + case ShutdownReason::kIdleTimeout: + case ShutdownReason::kIntentionalShutdown: + return "INTENDED_SYSTEM_EXIT"; + case ShutdownReason::kUserError: + return "USER_ERROR"; + case ShutdownReason::kActorCreationFailed: + return "USER_ERROR"; + case ShutdownReason::kUnexpectedError: + return "SYSTEM_ERROR"; + case ShutdownReason::kOutOfMemory: + return "NODE_OUT_OF_MEMORY"; + case ShutdownReason::kForcedExit: + case ShutdownReason::kGracefulExit: + default: + return "INTENDED_USER_EXIT"; + } +} + +std::string ShutdownCoordinator::GetReasonString() const { + switch (GetReason()) { + case ShutdownReason::kNone: + return "None"; + case ShutdownReason::kIntentionalShutdown: + 
return "IntentionalShutdown"; + case ShutdownReason::kUnexpectedError: + return "UnexpectedError"; + case ShutdownReason::kIdleTimeout: + return "IdleTimeout"; + case ShutdownReason::kGracefulExit: + return "GracefulExit"; + case ShutdownReason::kForcedExit: + return "ForcedExit"; + case ShutdownReason::kUserError: + return "UserError"; + case ShutdownReason::kOutOfMemory: + return "OutOfMemory"; + case ShutdownReason::kJobFinished: + return "JobFinished"; + case ShutdownReason::kActorKilled: + return "ActorKilled"; + case ShutdownReason::kActorCreationFailed: + return "ActorCreationFailed"; + default: + return "Unknown"; + } +} + +} // namespace core +} // namespace ray diff --git a/src/ray/core_worker/shutdown_coordinator.h b/src/ray/core_worker/shutdown_coordinator.h new file mode 100644 index 000000000000..2692cb72a28d --- /dev/null +++ b/src/ray/core_worker/shutdown_coordinator.h @@ -0,0 +1,271 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "src/ray/protobuf/common.pb.h" + +namespace ray { +class LocalMemoryBuffer; +} // namespace ray + +namespace ray { + +namespace core { + +/// Interface for executing shutdown operations that the coordinator invokes. 
+class ShutdownExecutorInterface { + public: + virtual ~ShutdownExecutorInterface() = default; + + virtual void ExecuteGracefulShutdown(std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms) = 0; + + virtual void ExecuteForceShutdown(std::string_view exit_type, + std::string_view detail) = 0; + + virtual void ExecuteWorkerExit(std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms) = 0; + + virtual void ExecuteExit(std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms, + const std::shared_ptr<::ray::LocalMemoryBuffer> + &creation_task_exception_pb_bytes) = 0; + + virtual void ExecuteHandleExit(std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms) = 0; + + // Best-effort cleanup of child processes spawned by this worker process to + // avoid leaked subprocesses holding expensive resources (e.g., CUDA contexts). + // + // - Intended to be called during shutdown (including force paths). + // - Only targets direct children of the current process; crash paths can still leak + // (subreaper not yet used). + // - No-ops when disabled by configuration + // (RayConfig::kill_child_processes_on_worker_exit()). + // - Platform-dependent: process enumeration may be unavailable on some OSes. + virtual void KillChildProcessesImmediately() = 0; + + virtual bool ShouldWorkerIdleExit() const = 0; +}; + +/// Reasons for worker shutdown. Used for observability and debugging. +enum class ShutdownReason : std::uint8_t { + kNone = 0, + kIntentionalShutdown = 1, + kUnexpectedError = 2, + kIdleTimeout = 3, + kGracefulExit = 4, + kForcedExit = 5, + kUserError = 6, + kOutOfMemory = 7, + kJobFinished = 8, + kActorKilled = 9, + kActorCreationFailed = 10 +}; + +/// Shutdown state representing the current lifecycle phase of worker shutdown. 
+/// The state machine supports two paths with only forward transitions: +/// +/// Normal shutdown: kRunning -> kShuttingDown -> kDisconnecting -> kShutdown +/// Force shutdown: kRunning -> kShuttingDown -> kShutdown (bypasses kDisconnecting) +/// +/// State semantics: +/// - kRunning: Normal operation, accepting new work +/// - kShuttingDown: Shutdown initiated, draining existing work, no new work accepted +/// - kDisconnecting: Disconnecting from services (raylet, GCS), cleanup phase +/// - kShutdown: Final state, all cleanup complete, ready for process termination +enum class ShutdownState : std::uint8_t { + kRunning = 0, + kShuttingDown = 1, + kDisconnecting = 2, + kShutdown = 3 +}; + +/// Thread-safe coordinator for managing worker shutdown state and transitions. +/// +/// Uses a single mutex to serialize state transitions and to capture the shutdown +/// reason exactly once. We favor simple, readable synchronization because shutdown is +/// control-path, not throughput-critical. +/// +/// Key features: +/// - Atomic state transitions with integrated reason tracking +/// - Idempotent shutdown operations +/// - Performance optimized for hot-path checks +/// - Thread-safe from any thread context +/// +/// Usage: +/// auto coordinator = std::make_unique(); +/// +/// // Try to initiate shutdown (only the first caller succeeds) +/// if (coordinator->TryInitiateShutdown(ShutdownReason::kGracefulExit)) { +/// // This thread should execute shutdown sequence +/// } +/// +/// // Fast check for early exit in performance-critical paths +/// if (coordinator->ShouldEarlyExit()) { +/// return Status::Invalid("Worker is shutting down"); +/// } +class ShutdownCoordinator { + public: + static constexpr std::chrono::milliseconds kInfiniteTimeout{-1}; + /// Constructor + /// + /// \param executor Shutdown executor implementation + /// \param worker_type Type of worker for shutdown behavior customization + explicit ShutdownCoordinator(std::unique_ptr executor, + rpc::WorkerType 
worker_type = rpc::WorkerType::WORKER); + + ~ShutdownCoordinator() = default; + + // Non-copyable and non-movable for safety + ShutdownCoordinator(const ShutdownCoordinator &) = delete; + ShutdownCoordinator &operator=(const ShutdownCoordinator &) = delete; + ShutdownCoordinator(ShutdownCoordinator &&) = delete; + ShutdownCoordinator &operator=(ShutdownCoordinator &&) = delete; + + /// Request shutdown with configurable timeout and fallback behavior. + /// + /// Single entry-point that captures the first shutdown reason, chooses the + /// worker-type-specific path, and optionally falls back to force. Additional + /// graceful requests are ignored; a concurrent force may override the reason + /// and proceed. + /// + /// \param force_shutdown If true, force immediate shutdown; if false, graceful shutdown + /// \param reason The reason for shutdown initiation + /// \param detail Optional detailed explanation + /// \param timeout_ms Timeout for graceful shutdown (-1 = no timeout) + /// \return true if this call initiated shutdown, false if already shutting down + bool RequestShutdown(bool force_shutdown, + ShutdownReason reason, + std::string_view detail = "", + std::chrono::milliseconds timeout_ms = kInfiniteTimeout, + const std::shared_ptr<::ray::LocalMemoryBuffer> + &creation_task_exception_pb_bytes = nullptr); + + /// Get the current shutdown state (mutex-protected, fast path safe). + /// + /// \return Current shutdown state + ShutdownState GetState() const; + + /// Get the shutdown reason. + /// + /// The reason is set when shutdown is first initiated and remains + /// constant throughout the shutdown process. + /// + /// \return Shutdown reason (kNone if not shutting down) + ShutdownReason GetReason() const; + + /// Check if worker should early-exit from operations. + /// + /// Recommended hot-path check; returns true for any non-running state. 
+ /// + /// \return true if operations should be aborted, false if normal operation + bool ShouldEarlyExit() const; + + /// Check if worker is in running state. + /// + /// \return true if in kRunning state, false otherwise + bool IsRunning() const; + + /// Check if shutdown has been initiated. + /// + /// \return true if in any shutdown state, false if still running + bool IsShuttingDown() const; + + /// Check if worker has completed shutdown. + /// + /// \return true if in kShutdown state, false otherwise + bool IsShutdown() const; + + /// Get string representation of current state. + /// + /// \return Human-readable state description + std::string GetStateString() const; + + /// Get string representation of exit type based on shutdown reason. + std::string GetExitTypeString() const; + + /// Get string representation of shutdown reason. + /// + /// \return Human-readable reason description + std::string GetReasonString() const; + + private: + /// Attempt to transition to disconnecting state. + /// Begins the disconnection/cleanup phase (e.g., GCS/raylet disconnect). Only + /// valid from kShuttingDown. + /// \return true if transition succeeded, false if invalid state + bool TryTransitionToDisconnecting(); + + /// Attempt to transition to final shutdown state. + /// Finalizes shutdown. Allowed from kDisconnecting (normal) or kShuttingDown + /// (force path). 
+ /// \return true if transition succeeded, false if invalid state + bool TryTransitionToShutdown(); + + /// Execute shutdown sequence based on worker type and mode + void ExecuteShutdownSequence( + bool force_shutdown, + std::string_view detail, + std::chrono::milliseconds timeout_ms, + const std::shared_ptr<::ray::LocalMemoryBuffer> &creation_task_exception_pb_bytes); + + /// Executes graceful path; transitions to Disconnecting/Shutdown + void ExecuteGracefulShutdown(std::string_view detail, + std::chrono::milliseconds timeout_ms); + + /// Executes force path; guarded to run at most once + void ExecuteForceShutdown(std::string_view detail); + + void ExecuteDriverShutdown(bool force_shutdown, + std::string_view detail, + std::chrono::milliseconds timeout_ms); + /// Worker-type specific shutdown behavior + /// - Honors kActorCreationFailed with serialized exception payloads + /// - Uses worker-idle checks for idle exits + /// - Drains tasks/references before disconnect + void ExecuteWorkerShutdown( + bool force_shutdown, + std::string_view detail, + std::chrono::milliseconds timeout_ms, + const std::shared_ptr<::ray::LocalMemoryBuffer> &creation_task_exception_pb_bytes); + + // Executor and configuration + std::unique_ptr executor_; + rpc::WorkerType worker_type_; + + // Mutex-guarded shutdown state + mutable std::mutex mu_; + ShutdownState state_ = ShutdownState::kRunning; + ShutdownReason reason_ = ShutdownReason::kNone; + bool force_executed_ = false; + bool force_started_ = false; + + /// Shutdown detail for observability (set once during shutdown initiation) + std::string shutdown_detail_; +}; +} // namespace core +} // namespace ray diff --git a/src/ray/core_worker/store_provider/memory_store/memory_store.cc b/src/ray/core_worker/store_provider/memory_store/memory_store.cc index e6b58485728f..3898e048354e 100644 --- a/src/ray/core_worker/store_provider/memory_store/memory_store.cc +++ b/src/ray/core_worker/store_provider/memory_store/memory_store.cc @@ -21,7 
+21,7 @@ #include #include "ray/common/ray_config.h" -#include "ray/ipc/raylet_ipc_client.h" +#include "ray/ipc/raylet_ipc_client_interface.h" namespace ray { namespace core { @@ -136,7 +136,7 @@ std::shared_ptr GetRequest::Get(const ObjectID &object_id) const { CoreWorkerMemoryStore::CoreWorkerMemoryStore( instrumented_io_context &io_context, ReferenceCounter *counter, - std::shared_ptr raylet_ipc_client, + std::shared_ptr raylet_ipc_client, std::function check_signals, std::function unhandled_exception_handler, std::function( diff --git a/src/ray/core_worker/store_provider/memory_store/memory_store.h b/src/ray/core_worker/store_provider/memory_store/memory_store.h index 509938d6dd61..17d1d4df1555 100644 --- a/src/ray/core_worker/store_provider/memory_store/memory_store.h +++ b/src/ray/core_worker/store_provider/memory_store/memory_store.h @@ -27,7 +27,7 @@ #include "ray/common/status.h" #include "ray/core_worker/context.h" #include "ray/core_worker/reference_count.h" -#include "ray/ipc/raylet_ipc_client.h" +#include "ray/ipc/raylet_ipc_client_interface.h" namespace ray { namespace core { @@ -54,7 +54,7 @@ class CoreWorkerMemoryStore { explicit CoreWorkerMemoryStore( instrumented_io_context &io_context, ReferenceCounter *counter = nullptr, - std::shared_ptr raylet_ipc_client = nullptr, + std::shared_ptr raylet_ipc_client = nullptr, std::function check_signals = nullptr, std::function unhandled_exception_handler = nullptr, std::function(const RayObject &object, @@ -210,7 +210,7 @@ class CoreWorkerMemoryStore { ReferenceCounter *ref_counter_; // If set, this will be used to notify worker blocked / unblocked on get calls. - std::shared_ptr raylet_ipc_client_; + std::shared_ptr raylet_ipc_client_; /// Protects the data structures below. 
mutable absl::Mutex mu_; diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.cc b/src/ray/core_worker/store_provider/plasma_store_provider.cc index caaa2b30ef34..3323a924ec50 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.cc +++ b/src/ray/core_worker/store_provider/plasma_store_provider.cc @@ -23,7 +23,8 @@ #include "ray/common/ray_config.h" #include "ray/common/status.h" #include "ray/common/status_or.h" -#include "ray/ipc/raylet_ipc_client.h" +#include "ray/ipc/raylet_ipc_client_interface.h" +#include "ray/util/time.h" #include "src/ray/protobuf/common.pb.h" namespace ray { @@ -62,21 +63,18 @@ BufferTracker::UsedObjects() const { CoreWorkerPlasmaStoreProvider::CoreWorkerPlasmaStoreProvider( const std::string &store_socket, - const std::shared_ptr raylet_ipc_client, + const std::shared_ptr raylet_ipc_client, ReferenceCounter &reference_counter, std::function check_signals, bool warmup, + std::shared_ptr store_client, + int64_t fetch_batch_size, std::function get_current_call_site) : raylet_ipc_client_(raylet_ipc_client), - // We can turn on exit_on_connection_failure on for the core worker plasma - // client to early exit core worker after the raylet's death because on the - // raylet side, we never proactively close the plasma store connection even - // during shutdown. So any error from the raylet side should be a sign of raylet - // death. 
- store_client_( - std::make_shared(/*exit_on_connection_failure*/ true)), + store_client_(std::move(store_client)), reference_counter_(reference_counter), - check_signals_(std::move(check_signals)) { + check_signals_(std::move(check_signals)), + fetch_batch_size_(fetch_batch_size) { if (get_current_call_site != nullptr) { get_current_call_site_ = get_current_call_site; } else { @@ -84,7 +82,10 @@ CoreWorkerPlasmaStoreProvider::CoreWorkerPlasmaStoreProvider( } object_store_full_delay_ms_ = RayConfig::instance().object_store_full_delay_ms(); buffer_tracker_ = std::make_shared(); - RAY_CHECK_OK(store_client_->Connect(store_socket)); + if (!store_socket.empty()) { + RAY_CHECK(store_client_ != nullptr) << "Plasma client must be provided"; + RAY_CHECK_OK(store_client_->Connect(store_socket)); + } if (warmup) { RAY_CHECK_OK(WarmupStore()); } @@ -223,9 +224,7 @@ Status CoreWorkerPlasmaStoreProvider::GetIfLocal( const std::vector &object_ids, absl::flat_hash_map> *results) { std::vector plasma_results; - RAY_RETURN_NOT_OK(store_client_->Get(object_ids, - /*timeout_ms=*/0, - &plasma_results)); + RAY_RETURN_NOT_OK(store_client_->Get(object_ids, /*timeout_ms=*/0, &plasma_results)); for (size_t i = 0; i < object_ids.size(); i++) { if (plasma_results[i].data != nullptr || plasma_results[i].metadata != nullptr) { @@ -255,8 +254,9 @@ Status CoreWorkerPlasmaStoreProvider::GetExperimentalMutableObject( return store_client_->GetExperimentalMutableObject(object_id, mutable_object); } -Status UnblockIfNeeded(const std::shared_ptr &raylet_client, - const WorkerContext &ctx) { +Status UnblockIfNeeded( + const std::shared_ptr &raylet_client, + const WorkerContext &ctx) { if (ctx.CurrentTaskIsDirectCall()) { // NOTE: for direct call actors, we still need to issue an unblock IPC to release // get subscriptions, even if the worker isn't blocked. 
@@ -276,17 +276,16 @@ Status CoreWorkerPlasmaStoreProvider::Get( const WorkerContext &ctx, absl::flat_hash_map> *results, bool *got_exception) { - int64_t batch_size = RayConfig::instance().worker_fetch_request_size(); std::vector batch_ids; absl::flat_hash_set remaining(object_ids.begin(), object_ids.end()); // Send initial requests to pull all objects in parallel. std::vector id_vector(object_ids.begin(), object_ids.end()); int64_t total_size = static_cast(object_ids.size()); - for (int64_t start = 0; start < total_size; start += batch_size) { + for (int64_t start = 0; start < total_size; start += fetch_batch_size_) { batch_ids.clear(); - for (int64_t i = start; i < batch_size && i < total_size; i++) { - batch_ids.push_back(id_vector[start + i]); + for (int64_t i = start; i < start + fetch_batch_size_ && i < total_size; i++) { + batch_ids.push_back(id_vector[i]); } RAY_RETURN_NOT_OK( PullObjectsAndGetFromPlasmaStore(remaining, @@ -313,7 +312,7 @@ Status CoreWorkerPlasmaStoreProvider::Get( while (!remaining.empty() && !should_break) { batch_ids.clear(); for (const auto &id : remaining) { - if (static_cast(batch_ids.size()) == batch_size) { + if (static_cast(batch_ids.size()) == fetch_batch_size_) { break; } batch_ids.push_back(id); diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.h b/src/ray/core_worker/store_provider/plasma_store_provider.h index 867fb739e9e6..448dc5e6f40c 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.h +++ b/src/ray/core_worker/store_provider/plasma_store_provider.h @@ -25,10 +25,9 @@ #include "ray/common/id.h" #include "ray/common/status.h" #include "ray/common/status_or.h" -#include "ray/core_worker/common.h" #include "ray/core_worker/context.h" #include "ray/core_worker/reference_count.h" -#include "ray/ipc/raylet_ipc_client.h" +#include "ray/ipc/raylet_ipc_client_interface.h" #include "ray/object_manager/plasma/client.h" #include "src/ray/protobuf/common.pb.h" @@ -96,10 +95,12 @@ class 
CoreWorkerPlasmaStoreProvider { public: CoreWorkerPlasmaStoreProvider( const std::string &store_socket, - const std::shared_ptr raylet_ipc_client, + const std::shared_ptr raylet_ipc_client, ReferenceCounter &reference_counter, std::function check_signals, bool warmup, + std::shared_ptr store_client, + int64_t fetch_batch_size, std::function get_current_call_site = nullptr); ~CoreWorkerPlasmaStoreProvider(); @@ -201,7 +202,7 @@ class CoreWorkerPlasmaStoreProvider { StatusOr GetMemoryUsage(); - std::shared_ptr &store_client() { return store_client_; } + std::shared_ptr &store_client() { return store_client_; } private: /// Ask the raylet to pull a set of objects and then attempt to get them @@ -235,8 +236,8 @@ class CoreWorkerPlasmaStoreProvider { /// \return status Status WarmupStore(); - const std::shared_ptr raylet_ipc_client_; - std::shared_ptr store_client_; + const std::shared_ptr raylet_ipc_client_; + std::shared_ptr store_client_; /// Used to look up a plasma object's owner. ReferenceCounter &reference_counter_; std::function check_signals_; @@ -244,6 +245,7 @@ class CoreWorkerPlasmaStoreProvider { uint32_t object_store_full_delay_ms_; // Pointer to the shared buffer tracker. 
std::shared_ptr buffer_tracker_; + int64_t fetch_batch_size_ = 0; }; } // namespace core diff --git a/src/ray/core_worker/task_event_buffer.cc b/src/ray/core_worker/task_event_buffer.cc index 48348add2176..f6f632b581c1 100644 --- a/src/ray/core_worker/task_event_buffer.cc +++ b/src/ray/core_worker/task_event_buffer.cc @@ -37,12 +37,14 @@ TaskStatusEvent::TaskStatusEvent( const rpc::TaskStatus &task_status, int64_t timestamp, bool is_actor_task_event, + std::string session_name, const std::shared_ptr &task_spec, std::optional state_update) : TaskEvent(task_id, job_id, attempt_number), task_status_(task_status), timestamp_(timestamp), is_actor_task_event_(is_actor_task_event), + session_name_(session_name), task_spec_(task_spec), state_update_(std::move(state_update)) {} @@ -53,13 +55,15 @@ TaskProfileEvent::TaskProfileEvent(TaskID task_id, std::string component_id, std::string node_ip_address, std::string event_name, - int64_t start_time) + int64_t start_time, + std::string session_name) : TaskEvent(task_id, job_id, attempt_number), component_type_(std::move(component_type)), component_id_(std::move(component_id)), node_ip_address_(std::move(node_ip_address)), event_name_(std::move(event_name)), - start_time_(start_time) {} + start_time_(start_time), + session_name_(session_name) {} void TaskStatusEvent::ToRpcTaskEvents(rpc::TaskEvents *rpc_task_events) { // Base fields @@ -181,6 +185,7 @@ void TaskStatusEvent::PopulateRpcRayTaskDefinitionEvent(T &definition_event_data definition_event_data.set_task_attempt(attempt_number_); // Common fields + definition_event_data.set_language(task_spec_->GetLanguage()); const auto &required_resources = task_spec_->GetRequiredResources().GetResourceMap(); definition_event_data.mutable_required_resources()->insert( std::make_move_iterator(required_resources.begin()), @@ -199,16 +204,18 @@ void TaskStatusEvent::PopulateRpcRayTaskDefinitionEvent(T &definition_event_data definition_event_data.mutable_actor_func()->CopyFrom( 
task_spec_->FunctionDescriptor()->GetMessage()); definition_event_data.set_actor_id(task_spec_->ActorId().Binary()); + definition_event_data.set_actor_task_name(task_spec_->GetName()); } else { definition_event_data.mutable_task_func()->CopyFrom( task_spec_->FunctionDescriptor()->GetMessage()); + definition_event_data.set_task_type(task_spec_->GetMessage().type()); definition_event_data.set_task_name(task_spec_->GetName()); } } -template void TaskStatusEvent::PopulateRpcRayTaskExecutionEvent( - T &execution_event_data, google::protobuf::Timestamp timestamp) { + rpc::events::TaskExecutionEvent &execution_event_data, + google::protobuf::Timestamp timestamp) { // Task identifier execution_event_data.set_task_id(task_id_.Binary()); execution_event_data.set_task_attempt(attempt_number_); @@ -247,6 +254,8 @@ void TaskStatusEvent::PopulateRpcRayTaskExecutionEvent( if (state_update_->pid_.has_value()) { execution_event_data.set_worker_pid(state_update_->pid_.value()); } + + execution_event_data.set_job_id(job_id_.Binary()); } void TaskStatusEvent::PopulateRpcRayEventBaseFields( @@ -257,24 +266,22 @@ void TaskStatusEvent::PopulateRpcRayEventBaseFields( ray_event.set_source_type(rpc::events::RayEvent::CORE_WORKER); ray_event.mutable_timestamp()->CopyFrom(timestamp); ray_event.set_severity(rpc::events::RayEvent::INFO); + ray_event.set_session_name(session_name_); - if (is_actor_task_event_) { - if (is_definition_event) { + if (is_definition_event) { + if (is_actor_task_event_) { ray_event.set_event_type(rpc::events::RayEvent::ACTOR_TASK_DEFINITION_EVENT); } else { - ray_event.set_event_type(rpc::events::RayEvent::ACTOR_TASK_EXECUTION_EVENT); - } - } else { - if (is_definition_event) { ray_event.set_event_type(rpc::events::RayEvent::TASK_DEFINITION_EVENT); - } else { - ray_event.set_event_type(rpc::events::RayEvent::TASK_EXECUTION_EVENT); } + } else { + ray_event.set_event_type(rpc::events::RayEvent::TASK_EXECUTION_EVENT); } } -void 
TaskStatusEvent::ToRpcRayEvents(RayEventsPair &ray_events_pair) { - auto &[task_definition_event_rpc, task_execution_event_rpc] = ray_events_pair; +void TaskStatusEvent::ToRpcRayEvents(RayEventsTuple &ray_events_tuple) { + auto &[task_definition_event_rpc, task_execution_event_rpc, task_profile_event_rpc] = + ray_events_tuple; google::protobuf::Timestamp timestamp = AbslTimeNanosToProtoTimestamp(timestamp_); @@ -298,15 +305,9 @@ void TaskStatusEvent::ToRpcRayEvents(RayEventsPair &ray_events_pair) { : task_execution_event_rpc.emplace(), false, timestamp); - if (is_actor_task_event_) { - auto actor_task_execution_event = - task_execution_event_rpc.value().mutable_actor_task_execution_event(); - PopulateRpcRayTaskExecutionEvent(*actor_task_execution_event, timestamp); - } else { - auto task_execution_event = - task_execution_event_rpc.value().mutable_task_execution_event(); - PopulateRpcRayTaskExecutionEvent(*task_execution_event, timestamp); - } + auto task_execution_event = + task_execution_event_rpc.value().mutable_task_execution_event(); + PopulateRpcRayTaskExecutionEvent(*task_execution_event, timestamp); } void TaskProfileEvent::ToRpcTaskEvents(rpc::TaskEvents *rpc_task_events) { @@ -346,12 +347,44 @@ void TaskProfileEvent::ToRpcTaskExportEvents( event_entry->set_extra_data(std::move(extra_data_)); } -void TaskProfileEvent::ToRpcRayEvents(RayEventsPair &ray_events_pair) { - // TODO(myan): #54515 need to further figure out how to migrate the task profile event - // to the new ray event format. 
+void TaskProfileEvent::PopulateRpcRayEventBaseFields( + rpc::events::RayEvent &ray_event, google::protobuf::Timestamp timestamp) { + ray_event.set_event_id(UniqueID::FromRandom().Binary()); + ray_event.set_source_type(rpc::events::RayEvent::CORE_WORKER); + ray_event.mutable_timestamp()->CopyFrom(timestamp); + ray_event.set_severity(rpc::events::RayEvent::INFO); + ray_event.set_event_type(rpc::events::RayEvent::TASK_PROFILE_EVENT); + ray_event.set_session_name(session_name_); +} + +void TaskProfileEvent::ToRpcRayEvents(RayEventsTuple &ray_events_tuple) { + auto &[task_definition_event, task_execution_event, task_profile_event] = + ray_events_tuple; + + // Using profile start time as the event generation timestamp + google::protobuf::Timestamp timestamp = AbslTimeNanosToProtoTimestamp(start_time_); + + // Populate Ray event base fields + auto &ray_event = task_profile_event.emplace(); + PopulateRpcRayEventBaseFields(ray_event, timestamp); + + // Populate the task profile event + auto *task_profile_events = ray_event.mutable_task_profile_events(); + task_profile_events->set_task_id(task_id_.Binary()); + task_profile_events->set_job_id(job_id_.Binary()); + task_profile_events->set_attempt_number(attempt_number_); + auto profile_events = task_profile_events->mutable_profile_events(); + profile_events->set_component_type(component_type_); + profile_events->set_component_id(component_id_); + profile_events->set_node_ip_address(node_ip_address_); + auto event_entry = profile_events->add_events(); + event_entry->set_event_name(event_name_); + event_entry->set_start_time(start_time_); + event_entry->set_end_time(end_time_); + event_entry->set_extra_data(std::move(extra_data_)); } -bool TaskEventBuffer::RecordTaskStatusEventIfNeeded( +bool TaskEventBufferImpl::RecordTaskStatusEventIfNeeded( const TaskID &task_id, const JobID &job_id, int32_t attempt_number, @@ -373,6 +406,7 @@ bool TaskEventBuffer::RecordTaskStatusEventIfNeeded( status, /* timestamp */ 
absl::GetCurrentTimeNanos(), /*is_actor_task_event=*/spec.IsActorTask(), + session_name_, include_task_info ? std::make_shared(spec) : nullptr, std::move(state_update)); @@ -382,21 +416,28 @@ bool TaskEventBuffer::RecordTaskStatusEventIfNeeded( TaskEventBufferImpl::TaskEventBufferImpl( std::unique_ptr gcs_client, - std::unique_ptr event_aggregator_client) + std::unique_ptr event_aggregator_client, + std::string session_name) : work_guard_(boost::asio::make_work_guard(io_service_)), periodical_runner_(PeriodicalRunner::Create(io_service_)), gcs_client_(std::move(gcs_client)), - event_aggregator_client_(std::move(event_aggregator_client)) {} + event_aggregator_client_(std::move(event_aggregator_client)), + session_name_(session_name) {} TaskEventBufferImpl::~TaskEventBufferImpl() { Stop(); } Status TaskEventBufferImpl::Start(bool auto_flush) { absl::MutexLock lock(&mutex_); - export_event_write_enabled_ = TaskEventBufferImpl::IsExportAPIEnabledTask(); send_task_events_to_gcs_enabled_ = RayConfig::instance().enable_core_worker_task_event_to_gcs(); send_ray_events_to_aggregator_enabled_ = RayConfig::instance().enable_core_worker_ray_event_to_aggregator(); + + // We want to make sure that only one of the event export mechanism is enabled. And + // if both are enabled, we will use the event aggregator instead of the export API. + // This code will be removed when we deprecate the export API implementation. 
+ export_event_write_enabled_ = !send_ray_events_to_aggregator_enabled_ && + TaskEventBufferImpl::IsExportAPIEnabledTask(); auto report_interval_ms = RayConfig::instance().task_events_report_interval_ms(); RAY_CHECK(report_interval_ms > 0) << "RAY_task_events_report_interval_ms should be > 0 to use TaskEventBuffer."; @@ -581,19 +622,24 @@ std::unique_ptr TaskEventBufferImpl::CreateTaskEventDataToSe std::unique_ptr TaskEventBufferImpl::CreateRayEventsDataToSend( - absl::flat_hash_map &&agg_task_events, + absl::flat_hash_map &&agg_task_events, const absl::flat_hash_set &dropped_task_attempts_to_send) { auto data = std::make_unique(); // Move the ray events. - for (auto &[task_attempt, ray_events_pair] : agg_task_events) { - auto &[task_definition_event_rpc, task_execution_event_rpc] = ray_events_pair; - if (task_definition_event_rpc) { + for (auto &[task_attempt, ray_events_tuple] : agg_task_events) { + auto &[task_definition_event, task_execution_event, task_profile_event] = + ray_events_tuple; + if (task_definition_event) { + auto events = data->add_events(); + *events = std::move(task_definition_event.value()); + } + if (task_execution_event) { auto events = data->add_events(); - *events = std::move(task_definition_event_rpc.value()); + *events = std::move(task_execution_event.value()); } - if (task_execution_event_rpc) { + if (task_profile_event) { auto events = data->add_events(); - *events = std::move(task_execution_event_rpc.value()); + *events = std::move(task_profile_event.value()); } } @@ -614,8 +660,8 @@ TaskEventBuffer::TaskEventDataToSend TaskEventBufferImpl::CreateDataToSend( const absl::flat_hash_set &dropped_task_attempts_to_send) { // Aggregate the task events by TaskAttempt. 
absl::flat_hash_map agg_task_events; - // (task_attempt, (task_definition_event, task_execution_events)) - absl::flat_hash_map agg_ray_events; + // (task_attempt, (task_definition_event, task_execution_event, task_profile_event)) + absl::flat_hash_map agg_ray_events; auto to_rpc_event_fn = [this, &agg_task_events, &agg_ray_events, &dropped_task_attempts_to_send]( diff --git a/src/ray/core_worker/task_event_buffer.h b/src/ray/core_worker/task_event_buffer.h index 131c905a35d1..2e9ce16fc4dc 100644 --- a/src/ray/core_worker/task_event_buffer.h +++ b/src/ray/core_worker/task_event_buffer.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -26,9 +27,9 @@ #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/asio/periodical_runner.h" #include "ray/common/id.h" +#include "ray/common/protobuf_utils.h" #include "ray/common/task/task_spec.h" -#include "ray/gcs/gcs_client/gcs_client.h" -#include "ray/gcs/pb_util.h" +#include "ray/gcs_client/gcs_client.h" #include "ray/rpc/event_aggregator_client.h" #include "ray/util/counter_map.h" #include "ray/util/event.h" @@ -41,11 +42,16 @@ namespace core { namespace worker { using TaskAttempt = std::pair; -/// A pair of rpc::events::RayEvent. -/// When converting the TaskStatusEvent, the pair will be populated with the -/// rpc::events::TaskDefinitionEvent and rpc::events::TaskExecutionEvent respectively. -using RayEventsPair = - std::pair, std::optional>; +/// A tuple of rpc::events::RayEvent. +/// When converting the TaskStatusEvent, the first 2 elements of the tuple will be +/// populated with rpc::events::TaskDefinitionEvent and rpc::events::TaskExecutionEvent +/// respectively. When converting the TaskProfileEvent, the last element of the tuple will +/// be populated with rpc::events::TaskProfileEvent. A tuple is needed because the +/// TaskProfileEvent, TaskDefinitionEvent and TaskExecutionEvent all can share the same +/// task_id and attempt_number. 
+using RayEventsTuple = std::tuple, + std::optional, + std::optional>; /// A wrapper class that will be converted to protobuf task events representation. /// @@ -82,7 +88,7 @@ class TaskEvent { /// Convert itself to a pair of RayEvent. /// /// \param[out] ray_events The pair of rpc::events::RayEvent - virtual void ToRpcRayEvents(RayEventsPair &ray_events) = 0; + virtual void ToRpcRayEvents(RayEventsTuple &ray_events_tuple) = 0; /// If it is a profile event. virtual bool IsProfileEvent() const = 0; @@ -150,6 +156,7 @@ class TaskStatusEvent : public TaskEvent { const rpc::TaskStatus &task_status, int64_t timestamp, bool is_actor_task_event, + std::string session_name, const std::shared_ptr &task_spec = nullptr, std::optional state_update = std::nullopt); @@ -166,26 +173,22 @@ class TaskStatusEvent : public TaskEvent { /// NOTE: this method will modify internal states by moving fields of task_spec_ to /// the rpc::events::RayEvent. /// - /// \param[out] ray_events The pair of rpc::events::RayEvent protobuf messages to be + /// \param[out] ray_events The tuple of rpc::events::RayEvent protobuf messages to be /// filled. - void ToRpcRayEvents(RayEventsPair &ray_events) override; + void ToRpcRayEvents(RayEventsTuple &ray_events_tuple) override; bool IsProfileEvent() const override { return false; } private: // Helper functions to populate the task definition event of rpc::events::RayEvent // This function assumes task_spec_ is not null. 
- // This function also checks T must be one of rpc::events::ActorTaskDefinitionEvent or - // rpc::events::TaskDefinitionEvent template void PopulateRpcRayTaskDefinitionEvent(T &definition_event_data); // Helper functions to populate the task execution event of rpc::events::RayEvent - // This function checks T must be one of rpc::events::ActorTaskExecutionEvent or - // rpc::events::TaskExecutionEvent - template - void PopulateRpcRayTaskExecutionEvent(T &execution_event_data, - google::protobuf::Timestamp timestamp); + void PopulateRpcRayTaskExecutionEvent( + rpc::events::TaskExecutionEvent &execution_event_data, + google::protobuf::Timestamp timestamp); // Helper functions to populate the base fields of rpc::events::RayEvent void PopulateRpcRayEventBaseFields(rpc::events::RayEvent &ray_event, @@ -198,6 +201,8 @@ class TaskStatusEvent : public TaskEvent { int64_t timestamp_ = -1; /// Whether the task is an actor task. bool is_actor_task_event_ = false; + /// The current Ray session name. + std::string session_name_; /// Pointer to the task spec. std::shared_ptr task_spec_ = nullptr; /// Optional task state update @@ -214,14 +219,17 @@ class TaskProfileEvent : public TaskEvent { std::string component_id, std::string node_ip_address, std::string event_name, - int64_t start_time); + int64_t start_time, + std::string session_name); void ToRpcTaskEvents(rpc::TaskEvents *rpc_task_events) override; void ToRpcTaskExportEvents( std::shared_ptr rpc_task_export_event_data) override; - void ToRpcRayEvents(RayEventsPair &ray_events) override; + /// Note: The extra data will be moved when this is called and will no longer be usable. + /// Second element of the RayEventsTuple will always be empty for TaskProfileEvent. 
+ void ToRpcRayEvents(RayEventsTuple &ray_events_tuple) override; bool IsProfileEvent() const override { return true; } @@ -230,6 +238,9 @@ class TaskProfileEvent : public TaskEvent { void SetExtraData(const std::string &extra_data) { extra_data_ = extra_data; } private: + // Helper functions to populate the base fields of rpc::events::RayEvent + void PopulateRpcRayEventBaseFields(rpc::events::RayEvent &ray_event, + google::protobuf::Timestamp timestamp); /// The below fields mirror rpc::ProfileEvent std::string component_type_; std::string component_id_; @@ -238,6 +249,8 @@ class TaskProfileEvent : public TaskEvent { int64_t start_time_{}; int64_t end_time_{}; std::string extra_data_; + /// The current Ray session name. + std::string session_name_; }; /// @brief An enum class defining counters to be used in TaskEventBufferImpl. @@ -300,14 +313,15 @@ class TaskEventBuffer { /// \param status the changed status. /// \param state_update optional task state updates. /// \return true if the event is recorded, false otherwise. - bool RecordTaskStatusEventIfNeeded( + virtual bool RecordTaskStatusEventIfNeeded( const TaskID &task_id, const JobID &job_id, int32_t attempt_number, const TaskSpecification &spec, rpc::TaskStatus status, bool include_task_info = false, - std::optional state_update = absl::nullopt); + std::optional state_update = + absl::nullopt) = 0; /// Add a task event to be reported. /// @@ -351,6 +365,9 @@ class TaskEventBuffer { /// Return a string that describes the task event buffer stats. virtual std::string DebugString() = 0; + + /// Return the current Ray session name. + virtual std::string GetSessionName() const = 0; }; /// Implementation of TaskEventBuffer. 
@@ -367,13 +384,23 @@ class TaskEventBufferImpl : public TaskEventBuffer { /// \param event_aggregator_client Event aggregator client explicit TaskEventBufferImpl( std::unique_ptr gcs_client, - std::unique_ptr event_aggregator_client); + std::unique_ptr event_aggregator_client, + std::string session_name); TaskEventBufferImpl(const TaskEventBufferImpl &) = delete; TaskEventBufferImpl &operator=(const TaskEventBufferImpl &) = delete; ~TaskEventBufferImpl() override; + bool RecordTaskStatusEventIfNeeded(const TaskID &task_id, + const JobID &job_id, + int32_t attempt_number, + const TaskSpecification &spec, + rpc::TaskStatus status, + bool include_task_info = false, + std::optional + state_update = absl::nullopt) override; + void AddTaskEvent(std::unique_ptr task_event) ABSL_LOCKS_EXCLUDED(mutex_) override; @@ -387,6 +414,8 @@ class TaskEventBufferImpl : public TaskEventBuffer { std::string DebugString() override; + std::string GetSessionName() const override { return session_name_; } + private: /// Add a task status event to be reported. /// @@ -439,7 +468,7 @@ class TaskEventBufferImpl : public TaskEventBuffer { /// status events being dropped. /// \return data The ray event data to be sent. std::unique_ptr CreateRayEventsDataToSend( - absl::flat_hash_map &&agg_task_events, + absl::flat_hash_map &&agg_task_events, const absl::flat_hash_set &dropped_task_attempts_to_send); /// Reset the metrics counters for flush. @@ -566,6 +595,9 @@ class TaskEventBufferImpl : public TaskEventBuffer { /// If true, ray events from the event buffer are sent to the event aggregator bool send_ray_events_to_aggregator_enabled_ = false; + /// The current Ray session name. 
Passed in from the core worker + std::string session_name_ = ""; + FRIEND_TEST(TaskEventBufferTestManualStart, TestGcsClientFail); FRIEND_TEST(TaskEventBufferTestBatchSendDifferentDestination, TestBatchedSend); FRIEND_TEST(TaskEventBufferTest, TestAddEvents); @@ -578,6 +610,9 @@ class TaskEventBufferImpl : public TaskEventBuffer { FRIEND_TEST(TaskEventBufferTestLimitProfileEvents, TestBufferSizeLimitProfileEvents); FRIEND_TEST(TaskEventBufferTestLimitProfileEvents, TestLimitProfileEventsPerTask); FRIEND_TEST(TaskEventTestWriteExport, TestWriteTaskExportEvents); + FRIEND_TEST(TaskEventBufferTest, TestCreateRayEventsDataWithProfileEvents); + FRIEND_TEST(TaskEventBufferTestDifferentDestination, + TestMixedStatusAndProfileEventsToRayEvents); }; } // namespace worker diff --git a/src/ray/core_worker/task_execution/BUILD.bazel b/src/ray/core_worker/task_execution/BUILD.bazel index 6f432bd339ca..49b0dc755cb4 100644 --- a/src/ray/core_worker/task_execution/BUILD.bazel +++ b/src/ray/core_worker/task_execution/BUILD.bazel @@ -113,7 +113,6 @@ ray_cc_library( "//src/ray/rpc:server_call", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/synchronization", ], ) @@ -142,7 +141,5 @@ ray_cc_library( "//src/ray/rpc:server_call", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/synchronization", ], ) diff --git a/src/ray/core_worker/task_execution/actor_scheduling_queue.cc b/src/ray/core_worker/task_execution/actor_scheduling_queue.cc index ebd201a4481b..9eee6f987a8a 100644 --- a/src/ray/core_worker/task_execution/actor_scheduling_queue.cc +++ b/src/ray/core_worker/task_execution/actor_scheduling_queue.cc @@ -104,29 +104,29 @@ void ActorSchedulingQueue::Add( rpc::TaskStatus::PENDING_ACTOR_TASK_ARGS_FETCH, /* include_task_info */ false)); 
waiter_.Wait(dependencies, [this, seq_no, is_retry, retry_request]() mutable { - InboundRequest *inbound_request = nullptr; + InboundRequest *inbound_req = nullptr; if (is_retry) { // retry_request is guaranteed to be a valid pointer for retries because it // won't be erased from the retry list until its dependencies are fetched and // ExecuteRequest happens. - inbound_request = retry_request; + inbound_req = retry_request; } else if (auto it = pending_actor_tasks_.find(seq_no); it != pending_actor_tasks_.end()) { // For non-retry tasks, we need to check if the task is still in the map because // it can be erased due to being canceled via a higher `client_processed_up_to_`. - inbound_request = &it->second; + inbound_req = &it->second; } - if (inbound_request != nullptr) { - const auto &task_spec = inbound_request->TaskSpec(); + if (inbound_req != nullptr) { + const auto &inbound_req_task_spec = inbound_req->TaskSpec(); RAY_UNUSED(task_event_buffer_.RecordTaskStatusEventIfNeeded( - task_spec.TaskId(), - task_spec.JobId(), - task_spec.AttemptNumber(), - task_spec, + inbound_req_task_spec.TaskId(), + inbound_req_task_spec.JobId(), + inbound_req_task_spec.AttemptNumber(), + inbound_req_task_spec, rpc::TaskStatus::PENDING_ACTOR_TASK_ORDERING_OR_CONCURRENCY, /* include_task_info */ false)); - inbound_request->MarkDependenciesResolved(); + inbound_req->MarkDependenciesResolved(); ScheduleRequests(); } }); diff --git a/src/ray/core_worker/task_execution/concurrency_group_manager.cc b/src/ray/core_worker/task_execution/concurrency_group_manager.cc index b1eaf375637a..ce58694d06c4 100644 --- a/src/ray/core_worker/task_execution/concurrency_group_manager.cc +++ b/src/ray/core_worker/task_execution/concurrency_group_manager.cc @@ -15,7 +15,6 @@ #include "ray/core_worker/task_execution/concurrency_group_manager.h" #include -#include #include #include #include @@ -33,11 +32,11 @@ ConcurrencyGroupManager::ConcurrencyGroupManager( std::function()> initialize_thread_callback) : 
initialize_thread_callback_(std::move(initialize_thread_callback)) { for (auto &group : concurrency_groups) { - const auto name = group.name; - const auto max_concurrency = group.max_concurrency; + const auto name = group.name_; + const auto max_concurrency = group.max_concurrency_; auto executor = std::make_shared(max_concurrency, initialize_thread_callback_); - auto &fds = group.function_descriptors; + auto &fds = group.function_descriptors_; for (auto fd : fds) { functions_to_executor_index_[fd->ToString()] = executor; } diff --git a/src/ray/core_worker/task_execution/concurrency_group_manager.h b/src/ray/core_worker/task_execution/concurrency_group_manager.h index c976523a56b4..4aa3bd16c6a1 100644 --- a/src/ray/core_worker/task_execution/concurrency_group_manager.h +++ b/src/ray/core_worker/task_execution/concurrency_group_manager.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include diff --git a/src/ray/core_worker/task_execution/out_of_order_actor_scheduling_queue.cc b/src/ray/core_worker/task_execution/out_of_order_actor_scheduling_queue.cc index 7c6740b1898c..f68843ad6829 100644 --- a/src/ray/core_worker/task_execution/out_of_order_actor_scheduling_queue.cc +++ b/src/ray/core_worker/task_execution/out_of_order_actor_scheduling_queue.cc @@ -45,7 +45,8 @@ OutOfOrderActorSchedulingQueue::OutOfOrderActorSchedulingQueue( ss << "Setting actor as asyncio with max_concurrency=" << fiber_max_concurrency << ", and defined concurrency groups are:" << std::endl; for (const auto &concurrency_group : concurrency_groups) { - ss << "\t" << concurrency_group.name << " : " << concurrency_group.max_concurrency; + ss << "\t" << concurrency_group.name_ << " : " + << concurrency_group.max_concurrency_; } RAY_LOG(INFO) << ss.str(); } @@ -188,12 +189,12 @@ void OutOfOrderActorSchedulingQueue::RunRequest(InboundRequest request) { waiter_.Wait(dependencies, [this, request = std::move(request)]() mutable { RAY_CHECK_EQ(std::this_thread::get_id(), main_thread_id_); - 
const TaskSpecification &task_spec = request.TaskSpec(); + const TaskSpecification &task = request.TaskSpec(); RAY_UNUSED(task_event_buffer_.RecordTaskStatusEventIfNeeded( - task_spec.TaskId(), - task_spec.JobId(), - task_spec.AttemptNumber(), - task_spec, + task.TaskId(), + task.JobId(), + task.AttemptNumber(), + task, rpc::TaskStatus::PENDING_ACTOR_TASK_ORDERING_OR_CONCURRENCY, /* include_task_info */ false)); diff --git a/src/ray/core_worker/task_execution/out_of_order_actor_scheduling_queue.h b/src/ray/core_worker/task_execution/out_of_order_actor_scheduling_queue.h index 24ef6e1d505c..46d144e5fe97 100644 --- a/src/ray/core_worker/task_execution/out_of_order_actor_scheduling_queue.h +++ b/src/ray/core_worker/task_execution/out_of_order_actor_scheduling_queue.h @@ -20,7 +20,6 @@ #include "absl/base/thread_annotations.h" #include "absl/container/flat_hash_map.h" -#include "absl/container/flat_hash_set.h" #include "absl/synchronization/mutex.h" #include "ray/common/id.h" #include "ray/common/task/task_spec.h" diff --git a/src/ray/core_worker/task_execution/task_receiver.cc b/src/ray/core_worker/task_execution/task_receiver.cc index 18236a0c72a4..759ce1ee2e28 100644 --- a/src/ray/core_worker/task_execution/task_receiver.cc +++ b/src/ray/core_worker/task_execution/task_receiver.cc @@ -51,9 +51,10 @@ void TaskReceiver::HandleTask(rpc::PushTaskRequest request, } auto accept_callback = [this, reply, resource_ids = std::move(resource_ids)]( - const TaskSpecification &task_spec, - const rpc::SendReplyCallback &send_reply_callback) mutable { - auto num_returns = task_spec.NumReturns(); + const TaskSpecification &accepted_task_spec, + const rpc::SendReplyCallback + &accepted_send_reply_callback) mutable { + auto num_returns = accepted_task_spec.NumReturns(); RAY_CHECK(num_returns >= 0); std::vector>> return_objects; @@ -61,7 +62,7 @@ void TaskReceiver::HandleTask(rpc::PushTaskRequest request, std::vector> streaming_generator_returns; bool is_retryable_error = false; 
std::string application_error; - auto status = task_handler_(task_spec, + auto status = task_handler_(accepted_task_spec, std::move(resource_ids), &return_objects, &dynamic_return_objects, @@ -108,8 +109,9 @@ void TaskReceiver::HandleTask(rpc::PushTaskRequest request, } if (objects_valid) { - if (task_spec.ReturnsDynamic()) { - size_t num_dynamic_returns_expected = task_spec.DynamicReturnIds().size(); + if (accepted_task_spec.ReturnsDynamic()) { + size_t num_dynamic_returns_expected = + accepted_task_spec.DynamicReturnIds().size(); if (num_dynamic_returns_expected > 0) { RAY_CHECK(dynamic_return_objects.size() == num_dynamic_returns_expected) << "Expected " << num_dynamic_returns_expected @@ -132,15 +134,15 @@ void TaskReceiver::HandleTask(rpc::PushTaskRequest request, return_object.first, return_object.second, return_object_proto); } - if (task_spec.IsActorCreationTask()) { - concurrency_groups_ = task_spec.ConcurrencyGroups(); + if (accepted_task_spec.IsActorCreationTask()) { + concurrency_groups_ = accepted_task_spec.ConcurrencyGroups(); if (is_asyncio_) { fiber_state_manager_ = std::make_shared>( concurrency_groups_, fiber_max_concurrency_, initialize_thread_callback_); } else { // If the actor is an asyncio actor, then this concurrency group manager // for BoundedExecutor will never be used, so we don't need to initialize it. 
- const int default_max_concurrency = task_spec.MaxActorConcurrency(); + const int default_max_concurrency = accepted_task_spec.MaxActorConcurrency(); pool_manager_ = std::make_shared>( concurrency_groups_, default_max_concurrency, initialize_thread_callback_); } @@ -151,16 +153,17 @@ void TaskReceiver::HandleTask(rpc::PushTaskRequest request, RAY_CHECK_OK(actor_creation_task_done_()); if (status.IsCreationTaskError()) { RAY_LOG(WARNING) << "Actor creation task finished with errors, task_id: " - << task_spec.TaskId() - << ", actor_id: " << task_spec.ActorCreationId() + << accepted_task_spec.TaskId() + << ", actor_id: " << accepted_task_spec.ActorCreationId() << ", status: " << status; } else { // Set the actor repr name if it's customized by the actor. if (!actor_repr_name_.empty()) { reply->set_actor_repr_name(actor_repr_name_); } - RAY_LOG(INFO) << "Actor creation task finished, task_id: " << task_spec.TaskId() - << ", actor_id: " << task_spec.ActorCreationId() + RAY_LOG(INFO) << "Actor creation task finished, task_id: " + << accepted_task_spec.TaskId() + << ", actor_id: " << accepted_task_spec.ActorCreationId() << ", actor_repr_name: " << actor_repr_name_; } } @@ -172,28 +175,29 @@ void TaskReceiver::HandleTask(rpc::PushTaskRequest request, reply->set_worker_exiting(true); if (objects_valid) { // This happens when max_calls is hit. We still need to return the objects. 
- send_reply_callback(Status::OK(), nullptr, nullptr); + accepted_send_reply_callback(Status::OK(), nullptr, nullptr); } else { - send_reply_callback(status, nullptr, nullptr); + accepted_send_reply_callback(status, nullptr, nullptr); } } else { RAY_CHECK_OK(status); RAY_CHECK(objects_valid); - send_reply_callback(Status::OK(), nullptr, nullptr); + accepted_send_reply_callback(Status::OK(), nullptr, nullptr); } }; - auto cancel_callback = [reply](const TaskSpecification &task_spec, - const Status &status, - const rpc::SendReplyCallback &send_reply_callback) { - if (task_spec.IsActorTask()) { + auto cancel_callback = [reply]( + const TaskSpecification &canceled_task_spec, + const Status &status, + const rpc::SendReplyCallback &canceled_send_reply_callback) { + if (canceled_task_spec.IsActorTask()) { // We consider cancellation of actor tasks to be a push task RPC failure. - send_reply_callback(status, nullptr, nullptr); + canceled_send_reply_callback(status, nullptr, nullptr); } else { // We consider cancellation of normal tasks to be an in-band cancellation of a // successful RPC. 
reply->set_was_cancelled_before_running(true); - send_reply_callback(status, nullptr, nullptr); + canceled_send_reply_callback(status, nullptr, nullptr); } }; diff --git a/src/ray/core_worker/task_execution/task_receiver.h b/src/ray/core_worker/task_execution/task_receiver.h index 9ae7ce3c4f7b..157e597fd8e7 100644 --- a/src/ray/core_worker/task_execution/task_receiver.h +++ b/src/ray/core_worker/task_execution/task_receiver.h @@ -14,19 +14,13 @@ #pragma once -#include #include -#include -#include #include #include #include #include -#include "absl/base/thread_annotations.h" #include "absl/container/flat_hash_map.h" -#include "absl/container/flat_hash_set.h" -#include "absl/synchronization/mutex.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/id.h" #include "ray/common/ray_object.h" diff --git a/src/ray/core_worker/task_execution/test/BUILD.bazel b/src/ray/core_worker/task_execution/tests/BUILD.bazel similarity index 92% rename from src/ray/core_worker/task_execution/test/BUILD.bazel rename to src/ray/core_worker/task_execution/tests/BUILD.bazel index 48666ca86ce0..0f7c8e4a5cc2 100644 --- a/src/ray/core_worker/task_execution/test/BUILD.bazel +++ b/src/ray/core_worker/task_execution/tests/BUILD.bazel @@ -29,7 +29,7 @@ ray_cc_test( tags = ["team:core"], deps = [ "//src/ray/common:asio", - "//src/ray/common:test_util", + "//src/ray/common:test_utils", "//src/ray/core_worker/task_execution:concurrency_group_manager", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", @@ -42,7 +42,7 @@ ray_cc_test( tags = ["team:core"], deps = [ "//src/ray/common:asio", - "//src/ray/common:test_util", + "//src/ray/common:test_utils", "//src/ray/core_worker/task_execution:actor_scheduling_queue", "//src/ray/core_worker/task_execution:normal_scheduling_queue", "//src/ray/core_worker/task_execution:out_of_order_actor_scheduling_queue", @@ -58,9 +58,10 @@ ray_cc_test( deps = [ "//:ray_mock", "//src/ray/common:asio", - 
"//src/ray/common:test_util", + "//src/ray/common:test_utils", "//src/ray/core_worker/task_execution:task_receiver", "//src/ray/rpc:core_worker_client", + "//src/ray/util:time", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", ], diff --git a/src/ray/core_worker/task_execution/test/concurrency_group_manager_test.cc b/src/ray/core_worker/task_execution/tests/concurrency_group_manager_test.cc similarity index 98% rename from src/ray/core_worker/task_execution/test/concurrency_group_manager_test.cc rename to src/ray/core_worker/task_execution/tests/concurrency_group_manager_test.cc index b12bb8876535..6d3f95030484 100644 --- a/src/ray/core_worker/task_execution/test/concurrency_group_manager_test.cc +++ b/src/ray/core_worker/task_execution/tests/concurrency_group_manager_test.cc @@ -19,7 +19,7 @@ #include "gtest/gtest.h" #include "ray/common/asio/instrumented_io_context.h" -#include "ray/common/test_util.h" +#include "ray/common/test_utils.h" #include "ray/core_worker/task_execution/fiber.h" #include "ray/core_worker/task_execution/thread_pool.h" diff --git a/src/ray/core_worker/task_execution/test/fiber_state_test.cc b/src/ray/core_worker/task_execution/tests/fiber_state_test.cc similarity index 100% rename from src/ray/core_worker/task_execution/test/fiber_state_test.cc rename to src/ray/core_worker/task_execution/tests/fiber_state_test.cc diff --git a/src/ray/core_worker/task_execution/test/scheduling_queue_test.cc b/src/ray/core_worker/task_execution/tests/scheduling_queue_test.cc similarity index 97% rename from src/ray/core_worker/task_execution/test/scheduling_queue_test.cc rename to src/ray/core_worker/task_execution/tests/scheduling_queue_test.cc index 057fde36ec89..5a779ff091b7 100644 --- a/src/ray/core_worker/task_execution/test/scheduling_queue_test.cc +++ b/src/ray/core_worker/task_execution/tests/scheduling_queue_test.cc @@ -19,7 +19,7 @@ #include "gtest/gtest.h" #include "ray/common/asio/instrumented_io_context.h" -#include 
"ray/common/test_util.h" +#include "ray/common/test_utils.h" #include "ray/core_worker/task_execution/actor_scheduling_queue.h" #include "ray/core_worker/task_execution/normal_scheduling_queue.h" #include "ray/core_worker/task_execution/out_of_order_actor_scheduling_queue.h" @@ -61,6 +61,30 @@ class MockTaskEventBuffer : public worker::TaskEventBuffer { std::string DebugString() override { return ""; } + bool RecordTaskStatusEventIfNeeded( + const TaskID &task_id, + const JobID &job_id, + int32_t attempt_number, + const TaskSpecification &spec, + rpc::TaskStatus status, + bool include_task_info, + std::optional state_update) + override { + AddTaskEvent(std::make_unique( + task_id, + job_id, + attempt_number, + status, + /* timestamp */ absl::GetCurrentTimeNanos(), + /*is_actor_task_event=*/spec.IsActorTask(), + "test-session-name", + include_task_info ? std::make_shared(spec) : nullptr, + std::move(state_update))); + return true; + } + + std::string GetSessionName() const override { return "test-session-name"; } + std::vector> task_events; }; diff --git a/src/ray/core_worker/task_execution/test/task_receiver_test.cc b/src/ray/core_worker/task_execution/tests/task_receiver_test.cc similarity index 95% rename from src/ray/core_worker/task_execution/test/task_receiver_test.cc rename to src/ray/core_worker/task_execution/tests/task_receiver_test.cc index 915299250fc8..94b143bc9cb4 100644 --- a/src/ray/core_worker/task_execution/test/task_receiver_test.cc +++ b/src/ray/core_worker/task_execution/tests/task_receiver_test.cc @@ -21,8 +21,9 @@ #include "gtest/gtest.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/task/task_spec.h" -#include "ray/common/test_util.h" +#include "ray/common/test_utils.h" #include "ray/rpc/worker/core_worker_client.h" +#include "ray/util/time.h" namespace ray { namespace core { @@ -107,7 +108,21 @@ class MockTaskEventBuffer : public worker::TaskEventBuffer { bool Enabled() const override { return true; } + bool 
RecordTaskStatusEventIfNeeded( + const TaskID &task_id, + const JobID &job_id, + int32_t attempt_number, + const TaskSpecification &spec, + rpc::TaskStatus status, + bool include_task_info, + std::optional state_update) + override { + return true; + } + std::string DebugString() override { return ""; } + + std::string GetSessionName() const override { return "test-session-name"; } }; class TaskReceiverTest : public ::testing::Test { diff --git a/src/ray/core_worker/task_execution/test/thread_pool_test.cc b/src/ray/core_worker/task_execution/tests/thread_pool_test.cc similarity index 100% rename from src/ray/core_worker/task_execution/test/thread_pool_test.cc rename to src/ray/core_worker/task_execution/tests/thread_pool_test.cc diff --git a/src/ray/core_worker/task_manager.cc b/src/ray/core_worker/task_manager.cc index b41e0616ba29..ca5a08b64dc0 100644 --- a/src/ray/core_worker/task_manager.cc +++ b/src/ray/core_worker/task_manager.cc @@ -23,11 +23,10 @@ #include "absl/strings/match.h" #include "ray/common/buffer.h" -#include "ray/common/common_protocol.h" +#include "ray/common/protobuf_utils.h" #include "ray/core_worker/actor_manager.h" -#include "ray/gcs/pb_util.h" #include "ray/util/exponential_backoff.h" -#include "ray/util/util.h" +#include "ray/util/time.h" #include "src/ray/protobuf/common.pb.h" namespace ray { @@ -39,6 +38,31 @@ constexpr int64_t kTaskFailureThrottlingThreshold = 50; // Throttle task failure logs to once this interval. constexpr int64_t kTaskFailureLoggingFrequencyMillis = 5000; +namespace { + +rpc::ErrorType MapPlasmaPutStatusToErrorType(const Status &status) { + // Only the following should be returned from plasma put paths today. 
+ RAY_DCHECK(status.IsObjectStoreFull() || status.IsTransientObjectStoreFull() || + status.IsOutOfDisk() || status.IsIOError()) + << "Unexpected status from plasma put: " << status; + + if (status.IsObjectStoreFull() || status.IsTransientObjectStoreFull()) { + // TODO(codope): add a dedicated OBJECT_STORE_FULL error type and map to it. + // https://github.com/ray-project/ray/pull/56070 + return rpc::ErrorType::OUT_OF_MEMORY; + } + if (status.IsOutOfDisk()) { + return rpc::ErrorType::OUT_OF_DISK_ERROR; + } + if (status.IsIOError()) { + // Local IPC failure to plasma/raylet; attribute to local control-plane failure. + return rpc::ErrorType::LOCAL_RAYLET_DIED; + } + return rpc::ErrorType::WORKER_DIED; +} + +} // namespace + absl::flat_hash_set ObjectRefStream::GetItemsUnconsumed() const { absl::flat_hash_set result; for (int64_t index = 0; index <= max_index_seen_; index++) { @@ -267,30 +291,35 @@ std::vector TaskManager::AddPendingTask( -1, is_reconstructable, /*add_local_ref=*/true, - /*pinned_at_raylet_id=*/std::optional(), + /*pinned_at_node_id=*/std::optional(), /*tensor_transport=*/spec.TensorTransport()); } return_ids.push_back(return_id); rpc::ObjectReference ref; - auto object_id = spec.ReturnId(i); - ref.set_object_id(object_id.Binary()); + auto return_object_id = spec.ReturnId(i); + ref.set_object_id(return_object_id.Binary()); ref.mutable_owner_address()->CopyFrom(caller_address); ref.set_call_site(call_site); + ref.set_tensor_transport(spec.TensorTransport()); // Register the callback to free the GPU object when it is out of scope. 
- auto tensor_transport = reference_counter_.GetTensorTransport(object_id); + auto tensor_transport = reference_counter_.GetTensorTransport(return_object_id); if (tensor_transport.value_or(rpc::TensorTransport::OBJECT_STORE) != rpc::TensorTransport::OBJECT_STORE) { reference_counter_.AddObjectOutOfScopeOrFreedCallback( - object_id, [this](const ObjectID &object_id) { + return_object_id, [this](const ObjectID &object_id) { auto actor_id = ObjectID::ToActorID(object_id); auto rpc_client = get_actor_rpc_client_callback_(actor_id); - auto request = rpc::FreeActorObjectRequest(); + if (!rpc_client.has_value()) { + // ActorTaskSubmitter already knows the actor is already dead. + return; + } + rpc::FreeActorObjectRequest request; request.set_object_id(object_id.Binary()); - rpc_client->FreeActorObject( + rpc_client.value()->FreeActorObject( request, - [object_id, actor_id](Status status, + [object_id, actor_id](const Status &status, const rpc::FreeActorObjectReply &reply) { if (!status.ok()) { RAY_LOG(ERROR).WithField(object_id).WithField(actor_id) @@ -351,13 +380,13 @@ std::optional TaskManager::ResubmitTask( return rpc::ErrorType::OBJECT_UNRECONSTRUCTABLE_MAX_ATTEMPTS_EXCEEDED; } auto &task_entry = it->second; - if (task_entry.is_canceled) { + if (task_entry.is_canceled_) { return rpc::ErrorType::TASK_CANCELLED; } - if (task_entry.spec.IsStreamingGenerator() && + if (task_entry.spec_.IsStreamingGenerator() && task_entry.GetStatus() == rpc::TaskStatus::SUBMITTED_TO_WORKER) { - if (task_entry.num_retries_left == 0) { + if (task_entry.num_retries_left_ == 0) { // If the last attempt is in progress. return rpc::ErrorType::OBJECT_UNRECONSTRUCTABLE_MAX_ATTEMPTS_EXCEEDED; } @@ -374,7 +403,7 @@ std::optional TaskManager::ResubmitTask( SetupTaskEntryForResubmit(task_entry); } - spec = task_entry.spec; + spec = task_entry.spec_; } if (should_queue_generator_resubmit) { @@ -391,7 +420,7 @@ std::optional TaskManager::ResubmitTask( // issue #54260. 
RAY_LOG(INFO) << "Resubmitting task that produced lost plasma object, attempt #" << spec.AttemptNumber() << ": " << spec.DebugString(); - retry_task_callback_(spec, /*object_recovery*/ true, /*delay_ms*/ 0); + async_retry_task_callback_(spec, /*delay_ms=*/0); return std::nullopt; } @@ -399,25 +428,25 @@ std::optional TaskManager::ResubmitTask( void TaskManager::SetupTaskEntryForResubmit(TaskEntry &task_entry) { task_entry.MarkRetry(); // NOTE(rickyx): We only increment the AttemptNumber on the task spec when - // `retry_task_callback_` is invoked. In order to record the correct status change for - // the new task attempt, we pass the attempt number explicitly. + // `async_retry_task_callback_` is invoked. In order to record the correct status change + // for the new task attempt, we pass the attempt number explicitly. SetTaskStatus(task_entry, rpc::TaskStatus::PENDING_ARGS_AVAIL, /* state_update */ std::nullopt, /* include_task_info */ true, - task_entry.spec.AttemptNumber() + 1); + task_entry.spec_.AttemptNumber() + 1); num_pending_tasks_++; // The task is pending again, so it's no longer counted as lineage. If // the task finishes and we still need the spec, we'll add the task back // to the footprint sum. 
- total_lineage_footprint_bytes_ -= task_entry.lineage_footprint_bytes; - task_entry.lineage_footprint_bytes = 0; + total_lineage_footprint_bytes_ -= task_entry.lineage_footprint_bytes_; + task_entry.lineage_footprint_bytes_ = 0; - if (task_entry.num_retries_left > 0) { - task_entry.num_retries_left--; + if (task_entry.num_retries_left_ > 0) { + task_entry.num_retries_left_--; } else { - RAY_CHECK(task_entry.num_retries_left == -1); + RAY_CHECK(task_entry.num_retries_left_ == -1); } } @@ -472,13 +501,13 @@ void TaskManager::MarkGeneratorFailedAndResubmit(const TaskID &task_id) { worker::TaskStatusEvent::TaskStateUpdate(error_info)); SetupTaskEntryForResubmit(task_entry); - spec = task_entry.spec; + spec = task_entry.spec_; } // Note: Don't need to call UpdateReferencesForResubmit because CompletePendingTask or // FailPendingTask are not called when this is. Therefore, RemoveFinishedTaskReferences // never happened for this task. - retry_task_callback_(spec, /*object_recovery*/ true, /*delay_ms*/ 0); + async_retry_task_callback_(spec, /*delay_ms*/ 0); } void TaskManager::DrainAndShutdown(std::function shutdown) { @@ -533,10 +562,10 @@ size_t TaskManager::NumPendingTasks() const { return num_pending_tasks_; } -bool TaskManager::HandleTaskReturn(const ObjectID &object_id, - const rpc::ReturnObject &return_object, - const NodeID &worker_raylet_id, - bool store_in_plasma) { +StatusOr TaskManager::HandleTaskReturn(const ObjectID &object_id, + const rpc::ReturnObject &return_object, + const NodeID &worker_node_id, + bool store_in_plasma) { bool direct_return = false; reference_counter_.UpdateObjectSize(object_id, return_object.size()); RAY_LOG(DEBUG) << "Task return object " << object_id << " has size " @@ -548,7 +577,7 @@ bool TaskManager::HandleTaskReturn(const ObjectID &object_id, // NOTE(swang): We need to add the location of the object before marking // it as local in the in-memory store so that the data locality policy // will choose the right raylet for any queued 
dependent tasks. - reference_counter_.UpdateObjectPinnedAtRaylet(object_id, worker_raylet_id); + reference_counter_.UpdateObjectPinnedAtRaylet(object_id, worker_node_id); // Mark it as in plasma with a dummy object. in_memory_store_.Put(RayObject(rpc::ErrorType::OBJECT_IN_PLASMA), object_id); } else { @@ -579,7 +608,10 @@ bool TaskManager::HandleTaskReturn(const ObjectID &object_id, /*copy_data=*/false, tensor_transport.value_or(rpc::TensorTransport::OBJECT_STORE)); if (store_in_plasma) { - put_in_local_plasma_callback_(object, object_id); + Status s = put_in_local_plasma_callback_(object, object_id); + if (!s.ok()) { + return s; + } } else { in_memory_store_.Put(object, object_id); direct_return = true; @@ -619,7 +651,7 @@ Status TaskManager::TryReadObjectRefStream(const ObjectID &generator_id, absl::MutexLock lock(&mu_); auto it = submissible_tasks_.find(generator_id.TaskId()); if (it != submissible_tasks_.end()) { - backpressure_threshold = it->second.spec.GeneratorBackpressureNumObjects(); + backpressure_threshold = it->second.spec_.GeneratorBackpressureNumObjects(); } } @@ -770,8 +802,8 @@ bool TaskManager::HandleReportGeneratorItemReturns( absl::MutexLock lock(&mu_); auto it = submissible_tasks_.find(task_id); if (it != submissible_tasks_.end()) { - backpressure_threshold = it->second.spec.GeneratorBackpressureNumObjects(); - if (it->second.spec.AttemptNumber() > attempt_number) { + backpressure_threshold = it->second.spec_.GeneratorBackpressureNumObjects(); + if (it->second.spec_.AttemptNumber() > attempt_number) { // Generator task reports can arrive at any time. If the first attempt // fails, we may receive a report from the first executor after the // second attempt has started. In this case, we should ignore the first @@ -813,10 +845,15 @@ bool TaskManager::HandleReportGeneratorItemReturns( } // When an object is reported, the object is ready to be fetched. 
reference_counter_.UpdateObjectPendingCreation(object_id, false); - HandleTaskReturn(object_id, - return_object, - NodeID::FromBinary(request.worker_addr().raylet_id()), - /*store_in_plasma=*/store_in_plasma_ids.contains(object_id)); + StatusOr put_res = + HandleTaskReturn(object_id, + return_object, + NodeID::FromBinary(request.worker_addr().node_id()), + /*store_in_plasma=*/store_in_plasma_ids.contains(object_id)); + if (!put_res.ok()) { + RAY_LOG(WARNING).WithField(object_id) + << "Failed to handle streaming dynamic return: " << put_res.status(); + } } // Handle backpressure if needed. @@ -900,23 +937,54 @@ void TaskManager::CompletePendingTask(const TaskID &task_id, reference_counter_.AddDynamicReturn(object_id, generator_id); dynamic_return_ids.push_back(object_id); } - if (!HandleTaskReturn(object_id, - return_object, - NodeID::FromBinary(worker_addr.raylet_id()), - store_in_plasma_ids.contains(object_id))) { - if (first_execution) { - dynamic_returns_in_plasma.push_back(object_id); - } + StatusOr direct_or = + HandleTaskReturn(object_id, + return_object, + NodeID::FromBinary(worker_addr.node_id()), + store_in_plasma_ids.contains(object_id)); + if (!direct_or.ok()) { + RAY_LOG(WARNING).WithField(object_id) + << "Failed to handle dynamic task return: " << direct_or.status(); + Status st = direct_or.status(); + rpc::ErrorType err_type = MapPlasmaPutStatusToErrorType(st); + rpc::RayErrorInfo err_info; + err_info.set_error_message(st.ToString()); + FailOrRetryPendingTask(task_id, + err_type, + &st, + /*ray_error_info=*/&err_info, + /*mark_task_object_failed=*/true, + /*fail_immediately=*/true); + return; + } else if (!direct_or.value() && first_execution) { + dynamic_returns_in_plasma.push_back(object_id); } } } for (const auto &return_object : reply.return_objects()) { const auto object_id = ObjectID::FromBinary(return_object.object_id()); - if (HandleTaskReturn(object_id, - return_object, - NodeID::FromBinary(worker_addr.raylet_id()), - 
store_in_plasma_ids.contains(object_id))) { + StatusOr direct_or = HandleTaskReturn(object_id, + return_object, + NodeID::FromBinary(worker_addr.node_id()), + store_in_plasma_ids.contains(object_id)); + if (!direct_or.ok()) { + RAY_LOG(WARNING).WithField(object_id) + << "Failed to handle task return: " << direct_or.status(); + // If storing return in plasma failed, treat as system failure for this attempt. + // Do not proceed with normal completion. Mark task failed immediately. + Status st = direct_or.status(); + rpc::ErrorType err_type = MapPlasmaPutStatusToErrorType(st); + rpc::RayErrorInfo err_info; + err_info.set_error_message(st.ToString()); + FailOrRetryPendingTask(task_id, + err_type, + &st, + /*ray_error_info=*/&err_info, + /*mark_task_object_failed=*/true, + /*fail_immediately=*/true); + return; + } else if (direct_or.value()) { direct_return_ids.push_back(object_id); } } @@ -929,7 +997,7 @@ void TaskManager::CompletePendingTask(const TaskID &task_id, auto it = submissible_tasks_.find(task_id); RAY_CHECK(it != submissible_tasks_.end()) << "Tried to complete task that was not pending " << task_id; - spec = it->second.spec; + spec = it->second.spec_; // Record any dynamically returned objects. We need to store these with the // task spec so that the worker will recreate them if the task gets @@ -942,7 +1010,7 @@ void TaskManager::CompletePendingTask(const TaskID &task_id, spec.AddDynamicReturnId(dynamic_return_id); } for (const auto &dynamic_return_id : dynamic_returns_in_plasma) { - it->second.reconstructable_return_ids.insert(dynamic_return_id); + it->second.reconstructable_return_ids_.insert(dynamic_return_id); } if (spec.IsStreamingGenerator()) { @@ -962,7 +1030,7 @@ void TaskManager::CompletePendingTask(const TaskID &task_id, // cause a memory leak of the task metadata, because we will // never receive a callback from the ReferenceCounter to erase // the task. 
- it->second.reconstructable_return_ids.insert( + it->second.reconstructable_return_ids_.insert( ObjectID::FromBinary(return_id_info.object_id())); } } @@ -975,14 +1043,14 @@ void TaskManager::CompletePendingTask(const TaskID &task_id, for (const auto &direct_return_id : direct_return_ids) { RAY_LOG(DEBUG) << "Task " << it->first << " returned direct object " << direct_return_id << ", now has " - << it->second.reconstructable_return_ids.size() + << it->second.reconstructable_return_ids_.size() << " plasma returns in scope"; - it->second.reconstructable_return_ids.erase(direct_return_id); + it->second.reconstructable_return_ids_.erase(direct_return_id); } RAY_LOG(DEBUG) << "Task " << it->first << " now has " - << it->second.reconstructable_return_ids.size() + << it->second.reconstructable_return_ids_.size() << " plasma returns in scope"; - it->second.num_successful_executions++; + it->second.num_successful_executions_++; if (is_application_error) { SetTaskStatus( @@ -998,13 +1066,13 @@ void TaskManager::CompletePendingTask(const TaskID &task_id, // A finished task can only be re-executed if it has some number of // retries left and returned at least one object that is still in use and // stored in plasma. - bool task_retryable = it->second.num_retries_left != 0 && - !it->second.reconstructable_return_ids.empty(); + bool task_retryable = it->second.num_retries_left_ != 0 && + !it->second.reconstructable_return_ids_.empty(); if (task_retryable) { // Pin the task spec if it may be retried again. 
release_lineage = false; - it->second.lineage_footprint_bytes = it->second.spec.GetMessage().ByteSizeLong(); - total_lineage_footprint_bytes_ += it->second.lineage_footprint_bytes; + it->second.lineage_footprint_bytes_ = it->second.spec_.GetMessage().ByteSizeLong(); + total_lineage_footprint_bytes_ += it->second.lineage_footprint_bytes_; if (total_lineage_footprint_bytes_ > max_lineage_bytes_) { RAY_LOG(INFO) << "Total lineage size is " << total_lineage_footprint_bytes_ / 1e6 << "MB, which exceeds the limit of " << max_lineage_bytes_ / 1e6 @@ -1040,10 +1108,27 @@ void TaskManager::CompletePendingTask(const TaskID &task_id, const auto generator_return_id = spec.StreamingGeneratorReturnId(i); RAY_CHECK_EQ(reply.return_objects_size(), 1); const auto &return_object = reply.return_objects(0); - HandleTaskReturn(generator_return_id, - return_object, - NodeID::FromBinary(worker_addr.raylet_id()), - store_in_plasma_ids.contains(generator_return_id)); + StatusOr res = + HandleTaskReturn(generator_return_id, + return_object, + NodeID::FromBinary(worker_addr.node_id()), + store_in_plasma_ids.contains(generator_return_id)); + if (!res.ok()) { + RAY_LOG(WARNING).WithField(generator_return_id) + << "Failed to handle generator return during app error propagation: " + << res.status(); + Status st = res.status(); + rpc::ErrorType err_type = MapPlasmaPutStatusToErrorType(st); + rpc::RayErrorInfo err_info; + err_info.set_error_message(st.ToString()); + FailOrRetryPendingTask(spec.TaskId(), + err_type, + &st, + /*ray_error_info=*/&err_info, + /*mark_task_object_failed=*/true, + /*fail_immediately=*/true); + return; + } } } } @@ -1074,13 +1159,13 @@ bool TaskManager::RetryTaskIfPossible(const TaskID &task_id, auto &task_entry = it->second; RAY_CHECK(task_entry.IsPending()) << "Tried to retry task that was not pending " << task_id; - spec = task_entry.spec; - num_retries_left = task_entry.num_retries_left; - num_oom_retries_left = task_entry.num_oom_retries_left; + spec = 
task_entry.spec_; + num_retries_left = task_entry.num_retries_left_; + num_oom_retries_left = task_entry.num_oom_retries_left_; if (task_failed_due_to_oom) { if (num_oom_retries_left > 0) { will_retry = true; - task_entry.num_oom_retries_left--; + task_entry.num_oom_retries_left_--; } else if (num_oom_retries_left == -1) { will_retry = true; } else { @@ -1095,13 +1180,13 @@ bool TaskManager::RetryTaskIfPossible(const TaskID &task_id, node_info->death_info().reason() == rpc::NodeDeathInfo::AUTOSCALER_DRAIN_PREEMPTED; } - if (num_retries_left > 0 || (is_preempted && task_entry.spec.IsRetriable())) { + if (num_retries_left > 0 || (is_preempted && task_entry.spec_.IsRetriable())) { will_retry = true; if (is_preempted) { RAY_LOG(INFO) << "Task " << task_id << " failed due to node preemption on node " << task_entry.GetNodeId() << ", not counting against retries"; } else { - task_entry.num_retries_left--; + task_entry.num_retries_left_--; } } else if (num_retries_left == -1) { will_retry = true; @@ -1110,8 +1195,8 @@ bool TaskManager::RetryTaskIfPossible(const TaskID &task_id, } } // Keep `num_retries_left` and `num_oom_retries_left` up to date - num_retries_left = task_entry.num_retries_left; - num_oom_retries_left = task_entry.num_oom_retries_left; + num_retries_left = task_entry.num_retries_left_; + num_oom_retries_left = task_entry.num_oom_retries_left_; if (will_retry) { // Record the old attempt status as FAILED. 
@@ -1125,7 +1210,7 @@ bool TaskManager::RetryTaskIfPossible(const TaskID &task_id, rpc::TaskStatus::PENDING_ARGS_AVAIL, /* state_update */ std::nullopt, /* include_task_info */ true, - task_entry.spec.AttemptNumber() + 1); + task_entry.spec_.AttemptNumber() + 1); } } @@ -1144,7 +1229,7 @@ bool TaskManager::RetryTaskIfPossible(const TaskID &task_id, spec.AttemptNumber(), RayConfig::instance().task_oom_retry_delay_base_ms()) : RayConfig::instance().task_retry_delay_ms(); - retry_task_callback_(spec, /*object_recovery*/ false, delay_ms); + async_retry_task_callback_(spec, delay_ms); return true; } else { RAY_LOG(INFO) << "No retries left for task " << spec.TaskId() @@ -1185,8 +1270,8 @@ void TaskManager::FailPendingTask(const TaskID &task_id, } RAY_CHECK(it->second.IsPending()) << "Tried to fail task that was not pending " << task_id; - spec = it->second.spec; - if (it->second.is_canceled && error_type != rpc::ErrorType::TASK_CANCELLED) { + spec = it->second.spec_; + if (it->second.is_canceled_ && error_type != rpc::ErrorType::TASK_CANCELLED) { // If the task is marked as cancelled before reaching FailPendingTask (which is // essentially the final state of the task lifecycle), that failure reason takes // precedence. 
@@ -1352,36 +1437,36 @@ int64_t TaskManager::RemoveLineageReference(const ObjectID &object_id, } RAY_LOG(DEBUG) << "Plasma object " << object_id << " out of scope"; - for (const auto &plasma_id : it->second.reconstructable_return_ids) { + for (const auto &plasma_id : it->second.reconstructable_return_ids_) { RAY_LOG(DEBUG) << "Task " << task_id << " has " << plasma_id << " in scope"; } - it->second.reconstructable_return_ids.erase(object_id); + it->second.reconstructable_return_ids_.erase(object_id); RAY_LOG(DEBUG) << "Task " << task_id << " now has " - << it->second.reconstructable_return_ids.size() + << it->second.reconstructable_return_ids_.size() << " plasma returns in scope"; - if (it->second.reconstructable_return_ids.empty() && !it->second.IsPending()) { + if (it->second.reconstructable_return_ids_.empty() && !it->second.IsPending()) { // If the task can no longer be retried, decrement the lineage ref count // for each of the task's args. - for (size_t i = 0; i < it->second.spec.NumArgs(); i++) { - if (it->second.spec.ArgByRef(i)) { - released_objects->push_back(it->second.spec.ArgObjectId(i)); + for (size_t i = 0; i < it->second.spec_.NumArgs(); i++) { + if (it->second.spec_.ArgByRef(i)) { + released_objects->push_back(it->second.spec_.ArgObjectId(i)); } else { - const auto &inlined_refs = it->second.spec.ArgInlinedRefs(i); + const auto &inlined_refs = it->second.spec_.ArgInlinedRefs(i); for (const auto &inlined_ref : inlined_refs) { released_objects->push_back(ObjectID::FromBinary(inlined_ref.object_id())); } } } - if (it->second.spec.IsActorTask()) { + if (it->second.spec_.IsActorTask()) { // We need to decrement the actor lineage ref count here // since it's incremented during TaskManager::AddPendingTask. 
- const auto actor_creation_return_id = it->second.spec.ActorCreationDummyObjectId(); + const auto actor_creation_return_id = it->second.spec_.ActorCreationDummyObjectId(); released_objects->push_back(actor_creation_return_id); } - total_lineage_footprint_bytes_ -= it->second.lineage_footprint_bytes; + total_lineage_footprint_bytes_ -= it->second.lineage_footprint_bytes_; // The task has finished and none of the return IDs are in scope anymore, // so it is safe to remove the task spec. submissible_tasks_.erase(it); @@ -1404,10 +1489,10 @@ void TaskManager::MarkTaskNoRetryInternal(const TaskID &task_id, bool canceled) absl::MutexLock lock(&mu_); auto it = submissible_tasks_.find(task_id); if (it != submissible_tasks_.end()) { - it->second.num_retries_left = 0; - it->second.num_oom_retries_left = 0; + it->second.num_retries_left_ = 0; + it->second.num_oom_retries_left_ = 0; if (canceled) { - it->second.is_canceled = true; + it->second.is_canceled_ = true; } } } @@ -1432,9 +1517,9 @@ absl::flat_hash_set TaskManager::GetTaskReturnObjectsToStoreInPlasma( // from submissible_tasks_. Do nothing in this case. 
return {}; } - first_execution = it->second.num_successful_executions == 0; + first_execution = it->second.num_successful_executions_ == 0; if (!first_execution) { - store_in_plasma_ids = it->second.reconstructable_return_ids; + store_in_plasma_ids = it->second.reconstructable_return_ids_; } if (first_execution_out != nullptr) { *first_execution_out = first_execution; @@ -1455,7 +1540,12 @@ void TaskManager::MarkTaskReturnObjectsFailed( for (int i = 0; i < num_returns; i++) { const auto object_id = ObjectID::FromIndex(task_id, /*index=*/i + 1); if (store_in_plasma_ids.contains(object_id)) { - put_in_local_plasma_callback_(error, object_id); + Status s = put_in_local_plasma_callback_(error, object_id); + if (!s.ok()) { + RAY_LOG(WARNING).WithField(object_id) + << "Failed to put error object in plasma: " << s; + in_memory_store_.Put(error, object_id); + } } else { in_memory_store_.Put(error, object_id); } @@ -1463,7 +1553,12 @@ void TaskManager::MarkTaskReturnObjectsFailed( if (spec.ReturnsDynamic()) { for (const auto &dynamic_return_id : spec.DynamicReturnIds()) { if (store_in_plasma_ids.contains(dynamic_return_id)) { - put_in_local_plasma_callback_(error, dynamic_return_id); + Status s = put_in_local_plasma_callback_(error, dynamic_return_id); + if (!s.ok()) { + RAY_LOG(WARNING).WithField(dynamic_return_id) + << "Failed to put error object in plasma: " << s; + in_memory_store_.Put(error, dynamic_return_id); + } } else { in_memory_store_.Put(error, dynamic_return_id); } @@ -1488,7 +1583,12 @@ void TaskManager::MarkTaskReturnObjectsFailed( for (size_t i = 0; i < num_streaming_generator_returns; i++) { const auto generator_return_id = spec.StreamingGeneratorReturnId(i); if (store_in_plasma_ids.contains(generator_return_id)) { - put_in_local_plasma_callback_(error, generator_return_id); + Status s = put_in_local_plasma_callback_(error, generator_return_id); + if (!s.ok()) { + RAY_LOG(WARNING).WithField(generator_return_id) + << "Failed to put error object in plasma: " 
<< s; + in_memory_store_.Put(error, generator_return_id); + } } else { in_memory_store_.Put(error, generator_return_id); } @@ -1502,7 +1602,7 @@ std::optional TaskManager::GetTaskSpec(const TaskID &task_id) if (it == submissible_tasks_.end()) { return std::optional(); } - return it->second.spec; + return it->second.spec_; } std::vector TaskManager::GetPendingChildrenTasks( @@ -1510,7 +1610,7 @@ std::vector TaskManager::GetPendingChildrenTasks( std::vector ret_vec; absl::MutexLock lock(&mu_); for (const auto &it : submissible_tasks_) { - if (it.second.IsPending() && (it.second.spec.ParentTaskId() == parent_task_id)) { + if (it.second.IsPending() && (it.second.spec_.ParentTaskId() == parent_task_id)) { ret_vec.push_back(it.first); } } @@ -1528,7 +1628,7 @@ void TaskManager::AddTaskStatusInfo(rpc::CoreWorkerStats *stats) const { continue; } ref->set_task_status(it->second.GetStatus()); - ref->set_attempt_number(it->second.spec.AttemptNumber()); + ref->set_attempt_number(it->second.spec_.AttemptNumber()); } } @@ -1568,19 +1668,19 @@ void TaskManager::SetTaskStatus( std::optional state_update, bool include_task_info, std::optional attempt_number) { - RAY_LOG(DEBUG).WithField(task_entry.spec.TaskId()) + RAY_LOG(DEBUG).WithField(task_entry.spec_.TaskId()) << "Setting task status from " << rpc::TaskStatus_Name(task_entry.GetStatus()) << " to " << rpc::TaskStatus_Name(status); task_entry.SetStatus(status); const int32_t attempt_number_to_record = - attempt_number.value_or(task_entry.spec.AttemptNumber()); + attempt_number.value_or(task_entry.spec_.AttemptNumber()); const auto state_update_to_record = state_update.value_or(worker::TaskStatusEvent::TaskStateUpdate()); - RAY_UNUSED(task_event_buffer_.RecordTaskStatusEventIfNeeded(task_entry.spec.TaskId(), - task_entry.spec.JobId(), + RAY_UNUSED(task_event_buffer_.RecordTaskStatusEventIfNeeded(task_entry.spec_.TaskId(), + task_entry.spec_.JobId(), attempt_number_to_record, - task_entry.spec, + task_entry.spec_, status, 
include_task_info, state_update_to_record)); @@ -1597,19 +1697,19 @@ TaskManager::GetOngoingLineageReconstructionTasks( continue; } - if (task_entry.num_successful_executions == 0) { + if (task_entry.num_successful_executions_ == 0) { // Not lineage reconstruction task continue; } rpc::LineageReconstructionTask task; - task.set_name(task_entry.spec.GetName()); + task.set_name(task_entry.spec_.GetName()); task.set_status(task_entry.GetStatus()); - if (task_entry.spec.IsNormalTask()) { - task.mutable_labels()->insert(task_entry.spec.GetMessage().labels().begin(), - task_entry.spec.GetMessage().labels().end()); - } else if (task_entry.spec.IsActorTask()) { - auto actor_handle = actor_manager.GetActorHandle(task_entry.spec.ActorId()); + if (task_entry.spec_.IsNormalTask()) { + task.mutable_labels()->insert(task_entry.spec_.GetMessage().labels().begin(), + task_entry.spec_.GetMessage().labels().end()); + } else if (task_entry.spec_.IsActorTask()) { + auto actor_handle = actor_manager.GetActorHandle(task_entry.spec_.ActorId()); RAY_CHECK(actor_handle) << "Actor task must be submitted via actor handle"; const auto &labels = actor_handle->GetLabels(); task.mutable_labels()->insert(labels.begin(), labels.end()); @@ -1638,7 +1738,7 @@ void TaskManager::FillTaskInfo(rpc::GetCoreWorkerStatsReply *reply, const auto &task_entry = task_it.second; auto entry = reply->add_owned_task_info_entries(); - const auto &task_spec = task_entry.spec; + const auto &task_spec = task_entry.spec_; const auto &task_state = task_entry.GetStatus(); const auto &node_id = task_entry.GetNodeId(); rpc::TaskType type; @@ -1683,10 +1783,10 @@ ObjectID TaskManager::TaskGeneratorId(const TaskID &task_id) const { if (it == submissible_tasks_.end()) { return ObjectID::Nil(); } - if (!it->second.spec.ReturnsDynamic()) { + if (!it->second.spec_.ReturnsDynamic()) { return ObjectID::Nil(); } - return it->second.spec.ReturnId(0); + return it->second.spec_.ReturnId(0); } std::vector ExtractPlasmaDependencies(const 
TaskSpecification &spec) { diff --git a/src/ray/core_worker/task_manager.h b/src/ray/core_worker/task_manager.h index 1d841f21f284..43f6fce25ea9 100644 --- a/src/ray/core_worker/task_manager.h +++ b/src/ray/core_worker/task_manager.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -25,10 +26,12 @@ #include "absl/container/flat_hash_map.h" #include "absl/synchronization/mutex.h" #include "ray/common/id.h" +#include "ray/common/status.h" #include "ray/core_worker/store_provider/memory_store/memory_store.h" #include "ray/core_worker/task_event_buffer.h" #include "ray/core_worker/task_manager_interface.h" -#include "ray/gcs/gcs_client/gcs_client.h" +#include "ray/gcs_client/gcs_client.h" +#include "ray/observability/metric_interface.h" #include "ray/stats/metric_defs.h" #include "ray/util/counter_map.h" #include "src/ray/protobuf/common.pb.h" @@ -38,13 +41,15 @@ namespace ray { namespace core { +using std::literals::operator""sv; + class ActorManager; using TaskStatusCounter = CounterMap>; using PutInLocalPlasmaCallback = - std::function; -using RetryTaskCallback = - std::function; + std::function; +using AsyncRetryTaskCallback = + std::function; using ReconstructObjectCallback = std::function; using PushErrorCallback = std::function queue_generator_resubmit, PushErrorCallback push_error_callback, int64_t max_lineage_bytes, worker::TaskEventBuffer &task_event_buffer, - std::function(const ActorID &)> - client_factory, - std::shared_ptr gcs_client) + std::function>( + const ActorID &)> get_actor_rpc_client_callback, + std::shared_ptr gcs_client, + ray::observability::MetricInterface &task_by_state_counter) : in_memory_store_(in_memory_store), reference_counter_(reference_counter), put_in_local_plasma_callback_(std::move(put_in_local_plasma_callback)), - retry_task_callback_(std::move(retry_task_callback)), + async_retry_task_callback_(std::move(async_retry_task_callback)), queue_generator_resubmit_(std::move(queue_generator_resubmit)), 
push_error_callback_(std::move(push_error_callback)), max_lineage_bytes_(max_lineage_bytes), task_event_buffer_(task_event_buffer), - get_actor_rpc_client_callback_(std::move(client_factory)), - gcs_client_(std::move(gcs_client)) { + get_actor_rpc_client_callback_(std::move(get_actor_rpc_client_callback)), + gcs_client_(std::move(gcs_client)), + task_by_state_counter_(task_by_state_counter) { task_counter_.SetOnChangeCallback( [this](const std::tuple &key) ABSL_EXCLUSIVE_LOCKS_REQUIRED(&mu_) { - ray::stats::STATS_tasks.Record( + task_by_state_counter_.Record( task_counter_.Get(key), - {{"State", rpc::TaskStatus_Name(std::get<1>(key))}, - {"Name", std::get<0>(key)}, - {"IsRetry", std::get<2>(key) ? "1" : "0"}, - {"Source", "owner"}}); + {{"State"sv, rpc::TaskStatus_Name(std::get<1>(key))}, + {"Name"sv, std::get<0>(key)}, + {"IsRetry"sv, std::get<2>(key) ? "1" : "0"}, + {"Source"sv, "owner"}}); }); reference_counter_.SetReleaseLineageCallback( [this](const ObjectID &object_id, std::vector *ids_to_release) { @@ -501,41 +508,41 @@ class TaskManager : public TaskManagerInterface { private: struct TaskEntry { - TaskEntry(TaskSpecification spec_arg, - int num_retries_left_arg, + TaskEntry(TaskSpecification spec, + int num_retries_left, size_t num_returns, TaskStatusCounter &counter, int64_t num_oom_retries_left) - : spec(std::move(spec_arg)), - num_retries_left(num_retries_left_arg), - counter(&counter), - num_oom_retries_left(num_oom_retries_left), - is_canceled(false) { - reconstructable_return_ids.reserve(num_returns); + : spec_(std::move(spec)), + num_retries_left_(num_retries_left), + counter_(&counter), + num_oom_retries_left_(num_oom_retries_left), + is_canceled_(false) { + reconstructable_return_ids_.reserve(num_returns); for (size_t i = 0; i < num_returns; i++) { - reconstructable_return_ids.insert(spec.ReturnId(i)); + reconstructable_return_ids_.insert(spec_.ReturnId(i)); } - status = - std::make_tuple(spec.GetName(), rpc::TaskStatus::PENDING_ARGS_AVAIL, false); 
- counter.Increment(status); + status_ = + std::make_tuple(spec_.GetName(), rpc::TaskStatus::PENDING_ARGS_AVAIL, false); + counter_->Increment(status_); } void SetStatus(rpc::TaskStatus new_status) { - auto new_tuple = std::make_tuple(spec.GetName(), new_status, is_retry_); + auto new_tuple = std::make_tuple(spec_.GetName(), new_status, is_retry_); if (IsPending()) { - counter->Swap(status, new_tuple); + counter_->Swap(status_, new_tuple); } else { // FINISHED and FAILED are monotonically increasing. // TODO(jjyao): We should use Counter instead of Gauge // for FINISHED and FAILED tasks. - counter->Increment(new_tuple); + counter_->Increment(new_tuple); } - status = std::move(new_tuple); + status_ = std::move(new_tuple); } void MarkRetry() { is_retry_ = true; } - rpc::TaskStatus GetStatus() const { return std::get<1>(status); } + rpc::TaskStatus GetStatus() const { return std::get<1>(status_); } // Get the NodeID where the task is executed. NodeID GetNodeId() const { return node_id_; } @@ -555,25 +562,25 @@ class TaskManager : public TaskManagerInterface { /// - The task is still pending execution. This means that the task may /// fail and so it may be retried in the future. /// - The task finished execution, but it has num_retries_left > 0 and - /// reconstructable_return_ids is not empty. This means that the task may + /// reconstructable_return_ids_ is not empty. This means that the task may /// be retried in the future to recreate its return objects. /// TODO(swang): The TaskSpec protobuf must be copied into the /// PushTaskRequest protobuf when sent to a worker so that we can retry it if /// the worker fails. We could avoid this by either not caching the full /// TaskSpec for tasks that cannot be retried (e.g., actor tasks), or by /// storing a shared_ptr to a PushTaskRequest protobuf for all tasks. - TaskSpecification spec; + TaskSpecification spec_; // Number of times this task may be resubmitted. If this reaches 0, then // the task entry may be erased. 
- int32_t num_retries_left; + int32_t num_retries_left_; // Reference to the task stats tracker. - TaskStatusCounter *counter; + TaskStatusCounter *counter_; // Number of times this task may be resubmitted if the task failed // due to out of memory failure. - int32_t num_oom_retries_left; + int32_t num_oom_retries_left_; // Whether the task has been marked for cancellation. // Canceled tasks will never be retried. - bool is_canceled; + bool is_canceled_; // Objects returned by this task that are reconstructable. This is set // objects may be reconstructed by resubmitting the task. Once the task // finishes its first execution, then the objects that the task returned by @@ -584,18 +591,18 @@ class TaskManager : public TaskManagerInterface { // 2) There are no tasks that depend on the object. This includes both // pending tasks and tasks that finished execution but that may be // retried in the future. - absl::flat_hash_set reconstructable_return_ids; + absl::flat_hash_set reconstructable_return_ids_; // The size of this (serialized) task spec in bytes, if the task spec is // not pending, i.e. it is being pinned because it's in another object's // lineage. We cache this because the task spec protobuf can mutate // out-of-band. - int64_t lineage_footprint_bytes = 0; + int64_t lineage_footprint_bytes_ = 0; // Number of times this task successfully completed execution so far. - int num_successful_executions = 0; + int num_successful_executions_ = 0; private: // The task's current execution and metric status (name, status, is_retry). - std::tuple status; + std::tuple status_; // The node id where task is executed. NodeID node_id_; // Whether this is a task retry due to task failure. @@ -607,13 +614,13 @@ class TaskManager : public TaskManagerInterface { void MarkTaskNoRetryInternal(const TaskID &task_id, bool canceled) ABSL_LOCKS_EXCLUDED(mu_); - /// Update nested ref count info and store the in-memory value for a task's - /// return object. 
Returns true if the task's return object was returned - /// directly by value. - bool HandleTaskReturn(const ObjectID &object_id, - const rpc::ReturnObject &return_object, - const NodeID &worker_raylet_id, - bool store_in_plasma) ABSL_LOCKS_EXCLUDED(mu_); + /// Update nested ref count info and store the task's return object. + /// Returns StatusOr where the bool indicates the object was returned + /// directly in-memory (not stored in plasma) when true. + StatusOr HandleTaskReturn(const ObjectID &object_id, + const rpc::ReturnObject &return_object, + const NodeID &worker_node_id, + bool store_in_plasma) ABSL_LOCKS_EXCLUDED(mu_); /// Remove a lineage reference to this object ID. This should be called /// whenever a task that depended on this object ID can no longer be retried. @@ -656,7 +663,7 @@ class TaskManager : public TaskManagerInterface { /// Shutdown if all tasks are finished and shutdown is scheduled. void ShutdownIfNeeded() ABSL_LOCKS_EXCLUDED(mu_); - /// Updates the task entry state (e.g. status, is_retry, lineage_footprint_bytes, + /// Updates the task entry state (e.g. status, is_retry, lineage_footprint_bytes_, /// num_retries_left) + related global task manager state. void SetupTaskEntryForResubmit(TaskEntry &task_entry) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); @@ -741,7 +748,7 @@ class TaskManager : public TaskManagerInterface { const PutInLocalPlasmaCallback put_in_local_plasma_callback_; /// Called when a task should be retried. - const RetryTaskCallback retry_task_callback_; + const AsyncRetryTaskCallback async_retry_task_callback_; /// For when a streaming generator task currently in progress needs to be resubmitted. std::function queue_generator_resubmit_; @@ -789,12 +796,20 @@ class TaskManager : public TaskManagerInterface { worker::TaskEventBuffer &task_event_buffer_; /// Callback to get the actor RPC client. 
- std::function( + std::function>( const ActorID &actor_id)> get_actor_rpc_client_callback_; std::shared_ptr gcs_client_; + // Metric to track the number of tasks by state. + // Expected tags: + // - State: the task state, as described by rpc::TaskState proto in common.proto + // - Name: the name of the function called + // - IsRetry: whether the task is a retry + // - Source: component reporting, e.g., "core_worker", "executor", or "pull_manager" + observability::MetricInterface &task_by_state_counter_; + friend class TaskManagerTest; }; diff --git a/src/ray/core_worker/task_manager_interface.h b/src/ray/core_worker/task_manager_interface.h index 9ba0260b40bd..34e04140984d 100644 --- a/src/ray/core_worker/task_manager_interface.h +++ b/src/ray/core_worker/task_manager_interface.h @@ -19,9 +19,9 @@ #include "absl/types/optional.h" #include "ray/common/id.h" +#include "ray/common/lease/lease.h" #include "ray/common/scheduling/scheduling_ids.h" #include "ray/common/status.h" -#include "ray/common/task/task.h" #include "ray/common/task/task_spec.h" #include "src/ray/protobuf/common.pb.h" #include "src/ray/protobuf/core_worker.pb.h" diff --git a/src/ray/core_worker/task_submission/BUILD.bazel b/src/ray/core_worker/task_submission/BUILD.bazel new file mode 100644 index 000000000000..53fcf8b306d1 --- /dev/null +++ b/src/ray/core_worker/task_submission/BUILD.bazel @@ -0,0 +1,102 @@ +load("//bazel:ray.bzl", "ray_cc_library") + +ray_cc_library( + name = "dependency_resolver", + srcs = ["dependency_resolver.cc"], + hdrs = ["dependency_resolver.h"], + visibility = [":__subpackages__"], + deps = [ + "//src/ray/common:id", + "//src/ray/common:task_common", + "//src/ray/core_worker:actor_creator", + "//src/ray/core_worker:lease_policy", + "//src/ray/core_worker:memory_store", + "//src/ray/core_worker:task_manager_interface", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + ], +) + +ray_cc_library( + name = 
"actor_submit_queue", + hdrs = ["actor_submit_queue.h"], + visibility = ["//visibility:private"], + deps = [ + "//src/ray/common:id", + "//src/ray/common:task_common", + "@com_google_absl//absl/types:optional", + ], +) + +ray_cc_library( + name = "out_of_order_actor_submit_queue", + srcs = ["out_of_order_actor_submit_queue.cc"], + hdrs = ["out_of_order_actor_submit_queue.h"], + visibility = [":__subpackages__"], + deps = [ + ":actor_submit_queue", + "//src/ray/common:id", + "@com_google_absl//absl/container:btree", + "@com_google_absl//absl/types:optional", + ], +) + +ray_cc_library( + name = "sequential_actor_submit_queue", + srcs = ["sequential_actor_submit_queue.cc"], + hdrs = ["sequential_actor_submit_queue.h"], + visibility = [":__subpackages__"], + deps = [ + ":actor_submit_queue", + "//src/ray/common:id", + "@com_google_absl//absl/types:optional", + ], +) + +ray_cc_library( + name = "actor_task_submitter", + srcs = ["actor_task_submitter.cc"], + hdrs = ["actor_task_submitter.h"], + visibility = [ + ":__subpackages__", + "//src/ray/core_worker:__pkg__", + ], + deps = [ + ":actor_submit_queue", + ":dependency_resolver", + ":out_of_order_actor_submit_queue", + ":sequential_actor_submit_queue", + "//src/ray/common:asio", + "//src/ray/common:id", + "//src/ray/common:protobuf_utils", + "//src/ray/core_worker:actor_creator", + "//src/ray/rpc:core_worker_client", + "//src/ray/util:time", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + ], +) + +ray_cc_library( + name = "normal_task_submitter", + srcs = ["normal_task_submitter.cc"], + hdrs = ["normal_task_submitter.h"], + visibility = [ + ":__subpackages__", + "//src/ray/core_worker:__pkg__", + ], + deps = [ + ":dependency_resolver", + "//src/ray/common:id", + "//src/ray/common:lease", + "//src/ray/common:protobuf_utils", + "//src/ray/core_worker:lease_policy", + "//src/ray/core_worker:memory_store", + 
"//src/ray/core_worker:task_manager_interface", + "//src/ray/rpc:core_worker_client", + "//src/ray/rpc:raylet_client_interface", + "//src/ray/util:time", + "@com_google_absl//absl/base:core_headers", + ], +) diff --git a/src/ray/core_worker/transport/actor_submit_queue.h b/src/ray/core_worker/task_submission/actor_submit_queue.h similarity index 98% rename from src/ray/core_worker/transport/actor_submit_queue.h rename to src/ray/core_worker/task_submission/actor_submit_queue.h index 0f3dbd6c4182..e84f662a380f 100644 --- a/src/ray/core_worker/transport/actor_submit_queue.h +++ b/src/ray/core_worker/task_submission/actor_submit_queue.h @@ -37,7 +37,6 @@ namespace core { * to know the actual sequence_no to send over the network. * * This class is not thread safe. - * TODO(scv119): the protocol could be improved. */ class IActorSubmitQueue { public: diff --git a/src/ray/core_worker/transport/actor_task_submitter.cc b/src/ray/core_worker/task_submission/actor_task_submitter.cc similarity index 74% rename from src/ray/core_worker/transport/actor_task_submitter.cc rename to src/ray/core_worker/task_submission/actor_task_submitter.cc index 7de64c9e1330..59c9797dbec2 100644 --- a/src/ray/core_worker/transport/actor_task_submitter.cc +++ b/src/ray/core_worker/task_submission/actor_task_submitter.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/core_worker/transport/actor_task_submitter.h" +#include "ray/core_worker/task_submission/actor_task_submitter.h" #include #include @@ -20,7 +20,8 @@ #include #include -#include "ray/gcs/pb_util.h" +#include "ray/common/protobuf_utils.h" +#include "ray/util/time.h" namespace ray { namespace core { @@ -35,8 +36,8 @@ void ActorTaskSubmitter::NotifyGCSWhenActorOutOfScope( { absl::MutexLock lock(&mu_); if (auto iter = client_queues_.find(actor_id); iter != client_queues_.end()) { - if (iter->second.state != rpc::ActorTableData::DEAD) { - iter->second.pending_out_of_scope_death = true; + if (iter->second.state_ != rpc::ActorTableData::DEAD) { + iter->second.pending_out_of_scope_death_ = true; } } } @@ -74,8 +75,7 @@ void ActorTaskSubmitter::AddActorQueueIfNotExists(const ActorID &actor_id, << "Set actor max pending calls to " << max_pending_calls; inserted = client_queues_ .emplace(actor_id, - ClientQueue(actor_id, - allow_out_of_order_execution, + ClientQueue(allow_out_of_order_execution, max_pending_calls, fail_if_actor_unreachable, owned)) @@ -89,11 +89,9 @@ void ActorTaskSubmitter::AddActorQueueIfNotExists(const ActorID &actor_id, } } -Status ActorTaskSubmitter::SubmitActorCreationTask(TaskSpecification task_spec) { +void ActorTaskSubmitter::SubmitActorCreationTask(TaskSpecification task_spec) { RAY_CHECK(task_spec.IsActorCreationTask()); - const auto actor_id = task_spec.ActorCreationId(); - const auto task_id = task_spec.TaskId(); - RAY_LOG(DEBUG).WithField(actor_id).WithField(task_id) + RAY_LOG(DEBUG).WithField(task_spec.ActorCreationId()).WithField(task_spec.TaskId()) << "Submitting actor creation task"; resolver_.ResolveDependencies(task_spec, [this, task_spec](Status status) mutable { // NOTE: task_spec here is capture copied (from a stack variable) and also @@ -118,16 +116,17 @@ Status ActorTaskSubmitter::SubmitActorCreationTask(TaskSpecification task_spec) RAY_LOG(DEBUG).WithField(actor_id).WithField(task_id) << "Creating actor via GCS"; 
actor_creator_.AsyncCreateActor( task_spec, - [this, actor_id, task_id](Status status, const rpc::CreateActorReply &reply) { - if (status.ok() || status.IsCreationTaskError()) { + [this, actor_id, task_id](Status create_actor_status, + const rpc::CreateActorReply &reply) { + if (create_actor_status.ok() || create_actor_status.IsCreationTaskError()) { rpc::PushTaskReply push_task_reply; push_task_reply.mutable_borrowed_refs()->CopyFrom(reply.borrowed_refs()); - if (status.IsCreationTaskError()) { + if (create_actor_status.IsCreationTaskError()) { RAY_LOG(INFO).WithField(actor_id).WithField(task_id) << "Actor creation failed and we will not be retrying the " "creation task"; // Update the task execution error to be CreationTaskError. - push_task_reply.set_task_execution_error(status.ToString()); + push_task_reply.set_task_execution_error(create_actor_status.ToString()); } else { RAY_LOG(DEBUG).WithField(actor_id).WithField(task_id) << "Created actor"; } @@ -137,11 +136,11 @@ Status ActorTaskSubmitter::SubmitActorCreationTask(TaskSpecification task_spec) task_id, push_task_reply, reply.actor_address(), - /*is_application_error=*/status.IsCreationTaskError()); + /*is_application_error=*/create_actor_status.IsCreationTaskError()); } else { // Either fails the rpc call or actor scheduling cancelled. 
rpc::RayErrorInfo ray_error_info; - if (status.IsSchedulingCancelled()) { + if (create_actor_status.IsSchedulingCancelled()) { RAY_LOG(DEBUG).WithField(actor_id).WithField(task_id) << "Actor creation cancelled"; task_manager_.MarkTaskNoRetry(task_id); @@ -150,7 +149,7 @@ Status ActorTaskSubmitter::SubmitActorCreationTask(TaskSpecification task_spec) } } else { RAY_LOG(INFO).WithField(actor_id).WithField(task_id) - << "Failed to create actor with status: " << status; + << "Failed to create actor with status: " << create_actor_status; } // Actor creation task retry happens in GCS // and transient rpc errors are retried in gcs client @@ -158,16 +157,14 @@ Status ActorTaskSubmitter::SubmitActorCreationTask(TaskSpecification task_spec) RAY_UNUSED(task_manager_.FailPendingTask( task_id, rpc::ErrorType::ACTOR_CREATION_FAILED, - &status, + &create_actor_status, ray_error_info.has_actor_died_error() ? &ray_error_info : nullptr)); } }); }); - - return Status::OK(); } -Status ActorTaskSubmitter::SubmitTask(TaskSpecification task_spec) { +void ActorTaskSubmitter::SubmitTask(TaskSpecification task_spec) { auto task_id = task_spec.TaskId(); auto actor_id = task_spec.ActorId(); RAY_LOG(DEBUG).WithField(task_id) << "Submitting task"; @@ -176,59 +173,67 @@ Status ActorTaskSubmitter::SubmitTask(TaskSpecification task_spec) { bool task_queued = false; uint64_t send_pos = 0; { + // We must release mu_ before resolving the task dependencies since the callback that + // reacquires mu_ may get called in the same call stack. 
absl::MutexLock lock(&mu_); auto queue = client_queues_.find(actor_id); RAY_CHECK(queue != client_queues_.end()); - if (queue->second.state == rpc::ActorTableData::DEAD && - queue->second.is_restartable && queue->second.owned) { + if (queue->second.state_ == rpc::ActorTableData::DEAD && + queue->second.is_restartable_ && queue->second.owned_) { RestartActorForLineageReconstruction(actor_id); } - if (queue->second.state != rpc::ActorTableData::DEAD) { + if (queue->second.state_ != rpc::ActorTableData::DEAD) { // We must fix the send order prior to resolving dependencies, which may // complete out of order. This ensures that we will not deadlock due to // backpressure. The receiving actor will execute the tasks according to // this sequence number. send_pos = task_spec.SequenceNumber(); - queue->second.actor_submit_queue->Emplace(send_pos, task_spec); - queue->second.cur_pending_calls++; + queue->second.actor_submit_queue_->Emplace(send_pos, task_spec); + queue->second.cur_pending_calls_++; task_queued = true; } } if (task_queued) { + { + absl::MutexLock resolver_lock(&resolver_mu_); + pending_dependency_resolution_.insert(task_id); + } io_service_.post( - [task_spec, send_pos, this]() mutable { - // We must release the lock before resolving the task dependencies since - // the callback may get called in the same call stack. - auto actor_id = task_spec.ActorId(); - auto task_id = task_spec.TaskId(); - resolver_.ResolveDependencies( - task_spec, [this, send_pos, actor_id, task_id](Status status) { - task_manager_.MarkDependenciesResolved(task_id); - bool fail_or_retry_task = false; - { - absl::MutexLock lock(&mu_); - auto queue = client_queues_.find(actor_id); - RAY_CHECK(queue != client_queues_.end()); - auto &actor_submit_queue = queue->second.actor_submit_queue; - // Only dispatch tasks if the submitted task is still queued. The task - // may have been dequeued if the actor has since failed. 
- if (actor_submit_queue->Contains(send_pos)) { - if (status.ok()) { - actor_submit_queue->MarkDependencyResolved(send_pos); - SendPendingTasks(actor_id); - } else { - fail_or_retry_task = true; - actor_submit_queue->MarkDependencyFailed(send_pos); + [task_spec, task_id, actor_id, send_pos, this]() mutable { + { + absl::MutexLock resolver_lock(&resolver_mu_); + if (pending_dependency_resolution_.erase(task_id) == 0) { + return; + } + resolver_.ResolveDependencies( + task_spec, [this, send_pos, actor_id, task_id](Status status) { + task_manager_.MarkDependenciesResolved(task_id); + bool fail_or_retry_task = false; + { + absl::MutexLock lock(&mu_); + auto queue = client_queues_.find(actor_id); + RAY_CHECK(queue != client_queues_.end()); + auto &actor_submit_queue = queue->second.actor_submit_queue_; + // Only dispatch tasks if the submitted task is still queued. The task + // may have been dequeued if the actor has since failed. + if (actor_submit_queue->Contains(send_pos)) { + if (status.ok()) { + actor_submit_queue->MarkDependencyResolved(send_pos); + SendPendingTasks(actor_id); + } else { + fail_or_retry_task = true; + actor_submit_queue->MarkDependencyFailed(send_pos); + } } } - } - if (fail_or_retry_task) { - GetTaskManagerWithoutMu().FailOrRetryPendingTask( - task_id, rpc::ErrorType::DEPENDENCY_RESOLUTION_FAILED, &status); - } - }); + if (fail_or_retry_task) { + task_manager_.FailOrRetryPendingTask( + task_id, rpc::ErrorType::DEPENDENCY_RESOLUTION_FAILED, &status); + } + }); + } }, "ActorTaskSubmitter::SubmitTask"); } else { @@ -239,7 +244,7 @@ Status ActorTaskSubmitter::SubmitTask(TaskSpecification task_spec) { { absl::MutexLock lock(&mu_); const auto queue_it = client_queues_.find(task_spec.ActorId()); - const auto &death_cause = queue_it->second.death_cause; + const auto &death_cause = queue_it->second.death_cause_; error_info = gcs::GetErrorInfoFromActorDeathCause(death_cause); error_type = error_info.error_type(); } @@ -250,31 +255,33 @@ Status 
ActorTaskSubmitter::SubmitTask(TaskSpecification task_spec) { error_info.has_actor_died_error() && error_info.actor_died_error().has_oom_context() && error_info.actor_died_error().oom_context().fail_immediately(); - GetTaskManagerWithoutMu().FailOrRetryPendingTask(task_id, - error_type, - &status, - &error_info, - /*mark_task_object_failed*/ true, - fail_immediately); + task_manager_.FailOrRetryPendingTask(task_id, + error_type, + &status, + &error_info, + /*mark_task_object_failed*/ true, + fail_immediately); } +} - // If the task submission subsequently fails, then the client will receive - // the error in a callback. - return Status::OK(); +void ActorTaskSubmitter::CancelDependencyResolution(const TaskID &task_id) { + absl::MutexLock resolver_lock(&resolver_mu_); + pending_dependency_resolution_.erase(task_id); + RAY_UNUSED(resolver_.CancelDependencyResolution(task_id)); } void ActorTaskSubmitter::DisconnectRpcClient(ClientQueue &queue) { - queue.rpc_client = nullptr; - core_worker_client_pool_.Disconnect(WorkerID::FromBinary(queue.worker_id)); - queue.worker_id.clear(); + queue.client_address_ = std::nullopt; + core_worker_client_pool_.Disconnect(WorkerID::FromBinary(queue.worker_id_)); + queue.worker_id_.clear(); } void ActorTaskSubmitter::FailInflightTasksOnRestart( const absl::flat_hash_map> &inflight_task_callbacks) { // NOTE(kfstorm): We invoke the callbacks with a bad status to act like there's a - // network issue. We don't call `task_manager_.FailOrRetryPendingTask` directly because - // there's much more work to do in the callback. + // network issue. We don't call `task_manager_.FailOrRetryPendingTask` directly + // because there's much more work to do in the callback. 
auto status = Status::IOError("The actor was restarted"); for (const auto &[_, callback] : inflight_task_callbacks) { callback(status, rpc::PushTaskReply()); @@ -295,7 +302,7 @@ void ActorTaskSubmitter::ConnectActor(const ActorID &actor_id, auto queue = client_queues_.find(actor_id); RAY_CHECK(queue != client_queues_.end()); - if (num_restarts < queue->second.num_restarts) { + if (num_restarts < queue->second.num_restarts_) { // This message is about an old version of the actor and the actor has // already restarted since then. Skip the connection. RAY_LOG(INFO).WithField(actor_id) @@ -303,32 +310,31 @@ void ActorTaskSubmitter::ConnectActor(const ActorID &actor_id, return; } - if (queue->second.rpc_client && - queue->second.rpc_client->Addr().ip_address() == address.ip_address() && - queue->second.rpc_client->Addr().port() == address.port()) { + if (queue->second.client_address_.has_value() && + queue->second.client_address_->ip_address() == address.ip_address() && + queue->second.client_address_->port() == address.port()) { RAY_LOG(DEBUG).WithField(actor_id) << "Skip actor that has already been connected"; return; } - if (queue->second.state == rpc::ActorTableData::DEAD) { + if (queue->second.state_ == rpc::ActorTableData::DEAD) { // This message is about an old version of the actor and the actor has // already died since then. Skip the connection. return; } - queue->second.num_restarts = num_restarts; - if (queue->second.rpc_client) { + queue->second.num_restarts_ = num_restarts; + if (queue->second.client_address_.has_value()) { // Clear the client to the old version of the actor. 
DisconnectRpcClient(queue->second); - inflight_task_callbacks = std::move(queue->second.inflight_task_callbacks); - queue->second.inflight_task_callbacks.clear(); + inflight_task_callbacks = std::move(queue->second.inflight_task_callbacks_); + queue->second.inflight_task_callbacks_.clear(); } - queue->second.state = rpc::ActorTableData::ALIVE; - // Update the mapping so new RPCs go out with the right intended worker id. - queue->second.worker_id = address.worker_id(); - // Create a new connection to the actor. - queue->second.rpc_client = core_worker_client_pool_.GetOrConnect(address); + queue->second.state_ = rpc::ActorTableData::ALIVE; + // So new RPCs go out with the right intended worker id to the right address. + queue->second.worker_id_ = address.worker_id(); + queue->second.client_address_ = address; SendPendingTasks(actor_id); } @@ -341,17 +347,17 @@ void ActorTaskSubmitter::RestartActorForLineageReconstruction(const ActorID &act RAY_LOG(INFO).WithField(actor_id) << "Reconstructing actor"; auto queue = client_queues_.find(actor_id); RAY_CHECK(queue != client_queues_.end()); - RAY_CHECK(queue->second.owned) << "Only owner can restart the dead actor"; - RAY_CHECK(queue->second.is_restartable) << "This actor is no longer restartable"; - queue->second.state = rpc::ActorTableData::RESTARTING; - queue->second.num_restarts_due_to_lineage_reconstructions += 1; + RAY_CHECK(queue->second.owned_) << "Only owner can restart the dead actor"; + RAY_CHECK(queue->second.is_restartable_) << "This actor is no longer restartable"; + queue->second.state_ = rpc::ActorTableData::RESTARTING; + queue->second.num_restarts_due_to_lineage_reconstructions_ += 1; actor_creator_.AsyncRestartActorForLineageReconstruction( actor_id, - queue->second.num_restarts_due_to_lineage_reconstructions, + queue->second.num_restarts_due_to_lineage_reconstructions_, [this, actor_id, num_restarts_due_to_lineage_reconstructions = - queue->second.num_restarts_due_to_lineage_reconstructions](Status 
status) { + queue->second.num_restarts_due_to_lineage_reconstructions_](Status status) { if (!status.ok()) { RAY_LOG(ERROR).WithField(actor_id) << "Failed to reconstruct actor. Error message: " << status.ToString(); @@ -382,7 +388,7 @@ void ActorTaskSubmitter::DisconnectActor(const ActorID &actor_id, if (!dead) { RAY_CHECK_GT(num_restarts, 0); } - if (num_restarts <= queue->second.num_restarts && !dead) { + if (num_restarts <= queue->second.num_restarts_ && !dead) { // This message is about an old version of the actor that has already been // restarted successfully. Skip the message handling. RAY_LOG(INFO).WithField(actor_id) @@ -394,20 +400,20 @@ void ActorTaskSubmitter::DisconnectActor(const ActorID &actor_id, // permanently dead or the new client will be inserted once the actor is // restarted. DisconnectRpcClient(queue->second); - inflight_task_callbacks = std::move(queue->second.inflight_task_callbacks); - queue->second.inflight_task_callbacks.clear(); + inflight_task_callbacks = std::move(queue->second.inflight_task_callbacks_); + queue->second.inflight_task_callbacks_.clear(); if (dead) { - queue->second.state = rpc::ActorTableData::DEAD; - queue->second.death_cause = death_cause; - queue->second.pending_out_of_scope_death = false; - queue->second.is_restartable = is_restartable; + queue->second.state_ = rpc::ActorTableData::DEAD; + queue->second.death_cause_ = death_cause; + queue->second.pending_out_of_scope_death_ = false; + queue->second.is_restartable_ = is_restartable; - if (queue->second.is_restartable && queue->second.owned) { + if (queue->second.is_restartable_ && queue->second.owned_) { // Actor is out of scope so there should be no inflight actor tasks. 
- RAY_CHECK(queue->second.wait_for_death_info_tasks.empty()); + RAY_CHECK(queue->second.wait_for_death_info_tasks_.empty()); RAY_CHECK(inflight_task_callbacks.empty()); - if (!queue->second.actor_submit_queue->Empty()) { + if (!queue->second.actor_submit_queue_->Empty()) { // There are pending lineage reconstruction tasks. RestartActorForLineageReconstruction(actor_id); } @@ -416,18 +422,18 @@ void ActorTaskSubmitter::DisconnectActor(const ActorID &actor_id, RAY_LOG(INFO).WithField(actor_id) << "Failing pending tasks for actor because the actor is already dead."; - task_ids_to_fail = queue->second.actor_submit_queue->ClearAllTasks(); + task_ids_to_fail = queue->second.actor_submit_queue_->ClearAllTasks(); // We need to execute this outside of the lock to prevent deadlock. - wait_for_death_info_tasks = std::move(queue->second.wait_for_death_info_tasks); + wait_for_death_info_tasks = std::move(queue->second.wait_for_death_info_tasks_); // Reset the queue - queue->second.wait_for_death_info_tasks = + queue->second.wait_for_death_info_tasks_ = std::deque>(); } - } else if (queue->second.state != rpc::ActorTableData::DEAD) { + } else if (queue->second.state_ != rpc::ActorTableData::DEAD) { // Only update the actor's state if it is not permanently dead. The actor // will eventually get restarted or marked as permanently dead. - queue->second.state = rpc::ActorTableData::RESTARTING; - queue->second.num_restarts = num_restarts; + queue->second.state_ = rpc::ActorTableData::RESTARTING; + queue->second.num_restarts_ = num_restarts; } } @@ -444,24 +450,24 @@ void ActorTaskSubmitter::DisconnectActor(const ActorID &actor_id, task_manager_.MarkTaskNoRetry(task_id); // This task may have been waiting for dependency resolution, so cancel // this first. 
- RAY_UNUSED(resolver_.CancelDependencyResolution(task_id)); + CancelDependencyResolution(task_id); bool fail_immediatedly = error_info.has_actor_died_error() && error_info.actor_died_error().has_oom_context() && error_info.actor_died_error().oom_context().fail_immediately(); - GetTaskManagerWithoutMu().FailOrRetryPendingTask(task_id, - error_type, - &status, - &error_info, - /*mark_task_object_failed*/ true, - fail_immediatedly); + task_manager_.FailOrRetryPendingTask(task_id, + error_type, + &status, + &error_info, + /*mark_task_object_failed*/ true, + fail_immediatedly); } if (!wait_for_death_info_tasks.empty()) { RAY_LOG(DEBUG).WithField(actor_id) << "Failing tasks waiting for death info, size=" << wait_for_death_info_tasks.size(); for (auto &task : wait_for_death_info_tasks) { - GetTaskManagerWithoutMu().FailPendingTask( - task->task_spec.TaskId(), error_type, &task->status, &error_info); + task_manager_.FailPendingTask( + task->task_spec_.TaskId(), error_type, &task->status_, &error_info); } } } @@ -471,8 +477,8 @@ void ActorTaskSubmitter::DisconnectActor(const ActorID &actor_id, void ActorTaskSubmitter::FailTaskWithError(const PendingTaskWaitingForDeathInfo &task) { rpc::RayErrorInfo error_info; - if (!task.actor_preempted) { - error_info = task.timeout_error_info; + if (!task.actor_preempted_) { + error_info = task.timeout_error_info_; } else { // Special error for preempted actor. 
The task "timed out" because the actor may // not have sent a notification to the gcs; regardless we already know it's @@ -480,7 +486,7 @@ void ActorTaskSubmitter::FailTaskWithError(const PendingTaskWaitingForDeathInfo auto actor_death_cause = error_info.mutable_actor_died_error(); auto actor_died_error_context = actor_death_cause->mutable_actor_died_error_context(); actor_died_error_context->set_reason(rpc::ActorDiedErrorContext::NODE_DIED); - actor_died_error_context->set_actor_id(task.task_spec.ActorId().Binary()); + actor_died_error_context->set_actor_id(task.task_spec_.ActorId().Binary()); auto node_death_info = actor_died_error_context->mutable_node_death_info(); node_death_info->set_reason(rpc::NodeDeathInfo::AUTOSCALER_DRAIN_PREEMPTED); node_death_info->set_reason_message( @@ -488,26 +494,26 @@ void ActorTaskSubmitter::FailTaskWithError(const PendingTaskWaitingForDeathInfo error_info.set_error_type(rpc::ErrorType::ACTOR_DIED); error_info.set_error_message("Actor died by preemption."); } - GetTaskManagerWithoutMu().FailPendingTask( - task.task_spec.TaskId(), error_info.error_type(), &task.status, &error_info); + task_manager_.FailPendingTask( + task.task_spec_.TaskId(), error_info.error_type(), &task.status_, &error_info); } void ActorTaskSubmitter::CheckTimeoutTasks() { // For each task in `wait_for_death_info_tasks`, if it times out, fail it with // timeout_error_info. But operating on the queue requires the mu_ lock; while calling - // FailPendingTask requires the opposite. So we copy the tasks out from the queue within - // the lock. This requires putting the data into shared_ptr. + // FailPendingTask requires the opposite. So we copy the tasks out from the queue + // within the lock. This requires putting the data into shared_ptr. 
std::vector> timeout_tasks; int64_t now = current_time_ms(); { absl::MutexLock lock(&mu_); for (auto &[actor_id, client_queue] : client_queues_) { - auto &deque = client_queue.wait_for_death_info_tasks; + auto &deque = client_queue.wait_for_death_info_tasks_; auto deque_itr = deque.begin(); - while (deque_itr != deque.end() && (*deque_itr)->deadline_ms < now) { + while (deque_itr != deque.end() && (*deque_itr)->deadline_ms_ < now) { // Populate the info of whether the actor is preempted. If so we hard fail the // task. - (*deque_itr)->actor_preempted = client_queue.preempted; + (*deque_itr)->actor_preempted_ = client_queue.preempted_; timeout_tasks.push_back(*deque_itr); deque_itr = deque.erase(deque_itr); } @@ -523,17 +529,17 @@ void ActorTaskSubmitter::SendPendingTasks(const ActorID &actor_id) { auto it = client_queues_.find(actor_id); RAY_CHECK(it != client_queues_.end()); auto &client_queue = it->second; - auto &actor_submit_queue = client_queue.actor_submit_queue; - if (client_queue.pending_out_of_scope_death) { + auto &actor_submit_queue = client_queue.actor_submit_queue_; + if (client_queue.pending_out_of_scope_death_) { // Wait until the actor is dead and then decide // whether we should fail pending tasks or restart the actor. // If the actor is restarted, ConnectActor will be called // and pending tasks will be sent at that time. return; } - if (!client_queue.rpc_client) { - if (client_queue.state == rpc::ActorTableData::RESTARTING && - client_queue.fail_if_actor_unreachable) { + if (!client_queue.client_address_.has_value()) { + if (client_queue.state_ == rpc::ActorTableData::RESTARTING && + client_queue.fail_if_actor_unreachable_) { // When `fail_if_actor_unreachable` is true, tasks submitted while the actor is in // `RESTARTING` state fail immediately. 
while (true) { @@ -561,7 +567,7 @@ void ActorTaskSubmitter::SendPendingTasks(const ActorID &actor_id) { if (!task.has_value()) { break; } - RAY_CHECK(!client_queue.worker_id.empty()); + RAY_CHECK(!client_queue.worker_id_.empty()); PushActorTask(client_queue, /*task_spec=*/task->first, /*skip_queue=*/task->second); } } @@ -577,12 +583,12 @@ void ActorTaskSubmitter::PushActorTask(ClientQueue &queue, // access the task. request->mutable_task_spec()->CopyFrom(task_spec.GetMessage()); - request->set_intended_worker_id(queue.worker_id); + request->set_intended_worker_id(queue.worker_id_); request->set_sequence_number(task_spec.SequenceNumber()); const auto actor_id = task_spec.ActorId(); - const auto num_queued = queue.inflight_task_callbacks.size(); + const auto num_queued = queue.inflight_task_callbacks_.size(); RAY_LOG(DEBUG).WithField(task_id).WithField(actor_id) << "Pushing task to actor, actor id " << actor_id << " seq no " << request->sequence_number() << " num queued " << num_queued; @@ -592,38 +598,38 @@ void ActorTaskSubmitter::PushActorTask(ClientQueue &queue, next_queueing_warn_threshold_ *= 2; } - rpc::Address addr(queue.rpc_client->Addr()); + auto &addr = queue.client_address_.value(); rpc::ClientCallback reply_callback = [this, addr, task_spec](const Status &status, const rpc::PushTaskReply &reply) { HandlePushTaskReply(status, reply, addr, task_spec); }; const TaskAttempt task_attempt = std::make_pair(task_id, task_spec.AttemptNumber()); - queue.inflight_task_callbacks.emplace(task_attempt, std::move(reply_callback)); + queue.inflight_task_callbacks_.emplace(task_attempt, std::move(reply_callback)); rpc::ClientCallback wrapped_callback = [this, task_attempt, actor_id](const Status &status, rpc::PushTaskReply &&reply) { - rpc::ClientCallback reply_callback; + rpc::ClientCallback push_task_reply_callback; { absl::MutexLock lock(&mu_); auto it = client_queues_.find(actor_id); RAY_CHECK(it != client_queues_.end()); - auto &queue = it->second; - auto 
callback_it = queue.inflight_task_callbacks.find(task_attempt); - if (callback_it == queue.inflight_task_callbacks.end()) { + auto &client_queue = it->second; + auto callback_it = client_queue.inflight_task_callbacks_.find(task_attempt); + if (callback_it == client_queue.inflight_task_callbacks_.end()) { RAY_LOG(DEBUG).WithField(task_attempt.first) << "The task has already been marked as failed. Ignore the reply."; return; } - reply_callback = std::move(callback_it->second); - queue.inflight_task_callbacks.erase(callback_it); + push_task_reply_callback = std::move(callback_it->second); + client_queue.inflight_task_callbacks_.erase(callback_it); } - reply_callback(status, std::move(reply)); + push_task_reply_callback(status, std::move(reply)); }; task_manager_.MarkTaskWaitingForExecution(task_id, - NodeID::FromBinary(addr.raylet_id()), + NodeID::FromBinary(addr.node_id()), WorkerID::FromBinary(addr.worker_id())); - queue.rpc_client->PushActorTask( + core_worker_client_pool_.GetOrConnect(addr)->PushActorTask( std::move(request), skip_queue, std::move(wrapped_callback)); } @@ -644,11 +650,11 @@ void ActorTaskSubmitter::HandlePushTaskReply(const Status &status, auto queue_pair = client_queues_.find(actor_id); RAY_CHECK(queue_pair != client_queues_.end()); auto &queue = queue_pair->second; - queue.cur_pending_calls--; + queue.cur_pending_calls_--; } } if (resubmit_generator) { - GetTaskManagerWithoutMu().MarkGeneratorFailedAndResubmit(task_id); + task_manager_.MarkGeneratorFailedAndResubmit(task_id); return; } @@ -669,10 +675,10 @@ void ActorTaskSubmitter::HandlePushTaskReply(const Status &status, rpc::RayErrorInfo error_info; error_info.set_error_message(msg); error_info.set_error_type(rpc::ErrorType::TASK_CANCELLED); - GetTaskManagerWithoutMu().FailPendingTask(task_spec.TaskId(), - rpc::ErrorType::TASK_CANCELLED, - /*status*/ nullptr, - &error_info); + task_manager_.FailPendingTask(task_spec.TaskId(), + rpc::ErrorType::TASK_CANCELLED, + /*status*/ nullptr, + 
&error_info); } else { bool is_actor_dead = false; bool fail_immediately = false; @@ -691,22 +697,22 @@ void ActorTaskSubmitter::HandlePushTaskReply(const Status &status, auto &queue = queue_pair->second; // If the actor is already dead, immediately mark the task object as failed. - // Otherwise, start the grace period, waiting for the actor death reason. Before the - // deadline: + // Otherwise, start the grace period, waiting for the actor death reason. Before + // the deadline: // - If we got the death reason: mark the object as failed with that reason. // - If we did not get the death reason: raise ACTOR_UNAVAILABLE with the status. // - If we did not get the death reason, but *the actor is preempted*: raise // ACTOR_DIED. See `CheckTimeoutTasks`. - is_actor_dead = queue.state == rpc::ActorTableData::DEAD; + is_actor_dead = queue.state_ == rpc::ActorTableData::DEAD; if (is_actor_dead) { - const auto &death_cause = queue.death_cause; + const auto &death_cause = queue.death_cause_; error_info = gcs::GetErrorInfoFromActorDeathCause(death_cause); fail_immediately = error_info.has_actor_died_error() && error_info.actor_died_error().has_oom_context() && error_info.actor_died_error().oom_context().fail_immediately(); } else { - // The actor may or may not be dead, but the request failed. Consider the failure - // temporary. May recognize retry, so fail_immediately = false. + // The actor may or may not be dead, but the request failed. Consider the + // failure temporary. May recognize retry, so fail_immediately = false. error_info.set_error_message("The actor is temporarily unavailable: " + status.ToString()); error_info.set_error_type(rpc::ErrorType::ACTOR_UNAVAILABLE); @@ -716,27 +722,27 @@ void ActorTaskSubmitter::HandlePushTaskReply(const Status &status, // This task may have been waiting for dependency resolution, so cancel // this first. 
- RAY_UNUSED(resolver_.CancelDependencyResolution(task_id)); - - will_retry = GetTaskManagerWithoutMu().FailOrRetryPendingTask( - task_id, - error_info.error_type(), - &status, - &error_info, - /*mark_task_object_failed*/ is_actor_dead, - fail_immediately); + CancelDependencyResolution(task_id); + + will_retry = + task_manager_.FailOrRetryPendingTask(task_id, + error_info.error_type(), + &status, + &error_info, + /*mark_task_object_failed*/ is_actor_dead, + fail_immediately); if (!is_actor_dead && !will_retry) { // Ran out of retries, last failure = either user exception or actor death. if (status.ok()) { // last failure = user exception, just complete it with failure. RAY_CHECK(reply.is_retryable_error()); - GetTaskManagerWithoutMu().CompletePendingTask( + task_manager_.CompletePendingTask( task_id, reply, addr, reply.is_application_error()); } else if (RayConfig::instance().timeout_ms_task_wait_for_death_info() != 0) { - // last failure = Actor death, but we still see the actor "alive" so we optionally - // wait for a grace period for the death info. + // last failure = Actor death, but we still see the actor "alive" so we + // optionally wait for a grace period for the death info. 
int64_t death_info_grace_period_ms = current_time_ms() + @@ -745,14 +751,14 @@ void ActorTaskSubmitter::HandlePushTaskReply(const Status &status, auto queue_pair = client_queues_.find(actor_id); RAY_CHECK(queue_pair != client_queues_.end()); auto &queue = queue_pair->second; - queue.wait_for_death_info_tasks.push_back( + queue.wait_for_death_info_tasks_.push_back( std::make_shared( death_info_grace_period_ms, task_spec, status, error_info)); RAY_LOG(INFO).WithField(task_spec.TaskId()) << "PushActorTask failed because of network error, this task " "will be stashed away and waiting for Death info from GCS" ", wait_queue_size=" - << queue.wait_for_death_info_tasks.size(); + << queue.wait_for_death_info_tasks_.size(); } else { // TODO(vitsai): if we don't need death info, just fail the request. { @@ -760,7 +766,7 @@ void ActorTaskSubmitter::HandlePushTaskReply(const Status &status, auto queue_pair = client_queues_.find(actor_id); RAY_CHECK(queue_pair != client_queues_.end()); } - GetTaskManagerWithoutMu().FailPendingTask( + task_manager_.FailPendingTask( task_spec.TaskId(), error_info.error_type(), &status, &error_info); } } @@ -770,7 +776,7 @@ void ActorTaskSubmitter::HandlePushTaskReply(const Status &status, auto queue_pair = client_queues_.find(actor_id); RAY_CHECK(queue_pair != client_queues_.end()); auto &queue = queue_pair->second; - queue.cur_pending_calls--; + queue.cur_pending_calls_--; } } @@ -782,7 +788,7 @@ std::optional ActorTaskSubmitter::GetLocalActor if (iter == client_queues_.end()) { return std::nullopt; } else { - return iter->second.state; + return iter->second.state_; } } @@ -790,39 +796,32 @@ bool ActorTaskSubmitter::IsActorAlive(const ActorID &actor_id) const { absl::MutexLock lock(&mu_); auto iter = client_queues_.find(actor_id); - return (iter != client_queues_.end() && iter->second.rpc_client); + return (iter != client_queues_.end() && iter->second.client_address_.has_value()); } std::optional ActorTaskSubmitter::GetActorAddress( const ActorID 
&actor_id) const { absl::MutexLock lock(&mu_); - auto iter = client_queues_.find(actor_id); if (iter == client_queues_.end()) { return std::nullopt; } - - const auto &rpc_client = iter->second.rpc_client; - if (rpc_client == nullptr) { - return std::nullopt; - } - - return iter->second.rpc_client->Addr(); + return iter->second.client_address_; } bool ActorTaskSubmitter::PendingTasksFull(const ActorID &actor_id) const { absl::MutexLock lock(&mu_); auto it = client_queues_.find(actor_id); RAY_CHECK(it != client_queues_.end()); - return it->second.max_pending_calls > 0 && - it->second.cur_pending_calls >= it->second.max_pending_calls; + return it->second.max_pending_calls_ > 0 && + it->second.cur_pending_calls_ >= it->second.max_pending_calls_; } size_t ActorTaskSubmitter::NumPendingTasks(const ActorID &actor_id) const { absl::MutexLock lock(&mu_); auto it = client_queues_.find(actor_id); RAY_CHECK(it != client_queues_.end()); - return it->second.cur_pending_calls; + return it->second.cur_pending_calls_; } bool ActorTaskSubmitter::CheckActorExists(const ActorID &actor_id) const { @@ -848,12 +847,12 @@ void ActorTaskSubmitter::RetryCancelTask(TaskSpecification task_spec, execute_after( io_service_, [this, task_spec = std::move(task_spec), recursive] { - RAY_UNUSED(CancelTask(task_spec, recursive)); + CancelTask(task_spec, recursive); }, std::chrono::milliseconds(milliseconds)); } -Status ActorTaskSubmitter::CancelTask(TaskSpecification task_spec, bool recursive) { +void ActorTaskSubmitter::CancelTask(TaskSpecification task_spec, bool recursive) { // We don't support force_kill = true for actor tasks. bool force_kill = false; RAY_LOG(INFO).WithField(task_spec.TaskId()).WithField(task_spec.ActorId()) @@ -872,10 +871,10 @@ Status ActorTaskSubmitter::CancelTask(TaskSpecification task_spec, bool recursiv // Shouldn't hold a lock while accessing task_manager_. // Task is already canceled or finished. 
- GetTaskManagerWithoutMu().MarkTaskCanceled(task_id); - if (!GetTaskManagerWithoutMu().IsTaskPending(task_id)) { + task_manager_.MarkTaskCanceled(task_id); + if (!task_manager_.IsTaskPending(task_id)) { RAY_LOG(DEBUG).WithField(task_id) << "Task is already finished or canceled"; - return Status::OK(); + return; } auto task_queued = false; @@ -886,25 +885,18 @@ Status ActorTaskSubmitter::CancelTask(TaskSpecification task_spec, bool recursiv auto queue = client_queues_.find(actor_id); RAY_CHECK(queue != client_queues_.end()); - if (queue->second.state == rpc::ActorTableData::DEAD) { + if (queue->second.state_ == rpc::ActorTableData::DEAD) { // No need to decrement cur_pending_calls because it doesn't matter. RAY_LOG(DEBUG).WithField(task_id) << "Task's actor is already dead. Ignoring the cancel request."; - return Status::OK(); + return; } - task_queued = queue->second.actor_submit_queue->Contains(send_pos); + task_queued = queue->second.actor_submit_queue_->Contains(send_pos); if (task_queued) { - auto dep_resolved = - queue->second.actor_submit_queue->DependenciesResolved(send_pos); - if (!dep_resolved) { - RAY_LOG(DEBUG).WithField(task_id) - << "Task has been resolving dependencies. Cancel to resolve dependencies"; - RAY_UNUSED(resolver_.CancelDependencyResolution(task_id)); - } RAY_LOG(DEBUG).WithField(task_id) << "Task was queued. Mark a task is canceled from a queue."; - queue->second.actor_submit_queue->MarkTaskCanceled(send_pos); + queue->second.actor_submit_queue_->MarkTaskCanceled(send_pos); } } @@ -912,15 +904,17 @@ Status ActorTaskSubmitter::CancelTask(TaskSpecification task_spec, bool recursiv // The task won't be sent to an actor in this case. // We cannot hold a lock when calling `FailOrRetryPendingTask`. 
if (task_queued) { + // Could be in dependency resolution or ResolveDependencies call may be queued up + CancelDependencyResolution(task_id); rpc::RayErrorInfo error_info; std::ostringstream stream; stream << "The task " << task_id << " is canceled from an actor " << actor_id << " before it executes."; error_info.set_error_message(stream.str()); error_info.set_error_type(rpc::ErrorType::TASK_CANCELLED); - GetTaskManagerWithoutMu().FailOrRetryPendingTask( + task_manager_.FailOrRetryPendingTask( task_id, rpc::ErrorType::TASK_CANCELLED, /*status*/ nullptr, &error_info); - return Status::OK(); + return; } // At this point, the task is in "sent" state and not finished yet. @@ -936,17 +930,17 @@ Status ActorTaskSubmitter::CancelTask(TaskSpecification task_spec, bool recursiv RAY_LOG(DEBUG).WithField(task_id) << "Task was sent to an actor. Send a cancel RPC."; auto queue = client_queues_.find(actor_id); RAY_CHECK(queue != client_queues_.end()); - if (!queue->second.rpc_client) { + if (!queue->second.client_address_.has_value()) { RetryCancelTask(task_spec, recursive, 1000); - return Status::OK(); + return; } - const auto &client = queue->second.rpc_client; - auto request = rpc::CancelTaskRequest(); + rpc::CancelTaskRequest request; request.set_intended_task_id(task_spec.TaskIdBinary()); request.set_force_kill(force_kill); request.set_recursive(recursive); request.set_caller_worker_id(task_spec.CallerWorkerIdBinary()); + auto client = core_worker_client_pool_.GetOrConnect(*queue->second.client_address_); client->CancelTask(request, [this, task_spec = std::move(task_spec), recursive, task_id]( const Status &status, const rpc::CancelTaskReply &reply) { @@ -956,7 +950,7 @@ Status ActorTaskSubmitter::CancelTask(TaskSpecification task_spec, bool recursiv // Keep retrying every 2 seconds until a task is officially // finished. - if (!GetTaskManagerWithoutMu().GetTaskSpec(task_id)) { + if (!task_manager_.GetTaskSpec(task_id)) { // Task is already finished. 
RAY_LOG(DEBUG).WithField(task_spec.TaskId()) << "Task is finished. Stop a cancel request."; @@ -968,11 +962,6 @@ Status ActorTaskSubmitter::CancelTask(TaskSpecification task_spec, bool recursiv } }); } - - // NOTE: Currently, ray.cancel is asynchronous. - // If we want to have a better guarantee in the cancelation result - // we should make it synchronos, but that can regress the performance. - return Status::OK(); } bool ActorTaskSubmitter::QueueGeneratorForResubmit(const TaskSpecification &spec) { diff --git a/src/ray/core_worker/transport/actor_task_submitter.h b/src/ray/core_worker/task_submission/actor_task_submitter.h similarity index 81% rename from src/ray/core_worker/transport/actor_task_submitter.h rename to src/ray/core_worker/task_submission/actor_task_submitter.h index 8a8350a9f64c..b077d7edb77c 100644 --- a/src/ray/core_worker/transport/actor_task_submitter.h +++ b/src/ray/core_worker/task_submission/actor_task_submitter.h @@ -14,14 +14,9 @@ #pragma once -#include -#include #include -#include #include #include -#include -#include #include #include @@ -29,17 +24,13 @@ #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "absl/synchronization/mutex.h" -#include "ray/common/asio/asio_util.h" #include "ray/common/id.h" -#include "ray/common/ray_object.h" #include "ray/core_worker/actor_creator.h" -#include "ray/core_worker/context.h" #include "ray/core_worker/store_provider/memory_store/memory_store.h" -#include "ray/core_worker/transport/actor_submit_queue.h" -#include "ray/core_worker/transport/dependency_resolver.h" -#include "ray/core_worker/transport/out_of_order_actor_submit_queue.h" -#include "ray/core_worker/transport/sequential_actor_submit_queue.h" -#include "ray/gcs/gcs_client/gcs_client.h" +#include "ray/core_worker/task_submission/actor_submit_queue.h" +#include "ray/core_worker/task_submission/dependency_resolver.h" +#include "ray/core_worker/task_submission/out_of_order_actor_submit_queue.h" +#include 
"ray/core_worker/task_submission/sequential_actor_submit_queue.h" #include "ray/rpc/worker/core_worker_client.h" namespace ray { @@ -68,7 +59,7 @@ class ActorTaskSubmitterInterface { /// If called, preempted = true will be set in the death cause upon actor death. virtual void SetPreempted(const ActorID &actor_id) = 0; - virtual ~ActorTaskSubmitterInterface() {} + virtual ~ActorTaskSubmitterInterface() = default; }; // This class is thread-safe. @@ -86,17 +77,16 @@ class ActorTaskSubmitter : public ActorTaskSubmitterInterface { actor_creator_(actor_creator), resolver_(store, task_manager, actor_creator, tensor_transport_getter), task_manager_(task_manager), - warn_excess_queueing_(warn_excess_queueing), + warn_excess_queueing_(std::move(warn_excess_queueing)), + next_queueing_warn_threshold_( + ::RayConfig::instance().actor_excess_queueing_warn_threshold()), io_service_(io_service), - reference_counter_(reference_counter) { - next_queueing_warn_threshold_ = - ::RayConfig::instance().actor_excess_queueing_warn_threshold(); - } + reference_counter_(std::move(reference_counter)) {} - void SetPreempted(const ActorID &actor_id) { + void SetPreempted(const ActorID &actor_id) override { absl::MutexLock lock(&mu_); if (auto iter = client_queues_.find(actor_id); iter != client_queues_.end()) { - iter->second.preempted = true; + iter->second.preempted_ = true; } } @@ -115,17 +105,13 @@ class ActorTaskSubmitter : public ActorTaskSubmitterInterface { int32_t max_pending_calls, bool allow_out_of_order_execution, bool fail_if_actor_unreachable, - bool owned); + bool owned) override; /// Submit a task to an actor for execution. - /// - /// \param[in] task_spec The task spec to submit. - /// - /// \return Status::Invalid if the task is not yet supported. - Status SubmitTask(TaskSpecification task_spec); + void SubmitTask(TaskSpecification task_spec); /// Submit an actor creation task to an actor via GCS. 
- Status SubmitActorCreationTask(TaskSpecification task_spec); + void SubmitActorCreationTask(TaskSpecification task_spec); /// Create connection to actor and send all pending tasks. /// @@ -136,7 +122,7 @@ class ActorTaskSubmitter : public ActorTaskSubmitterInterface { /// ignore the command to connect. void ConnectActor(const ActorID &actor_id, const rpc::Address &address, - int64_t num_restarts); + int64_t num_restarts) override; /// Disconnect from a failed actor. /// @@ -152,13 +138,13 @@ class ActorTaskSubmitter : public ActorTaskSubmitterInterface { int64_t num_restarts, bool dead, const rpc::ActorDeathCause &death_cause, - bool is_restartable); + bool is_restartable) override; /// Set the timerstamp for the caller. void SetCallerCreationTimestamp(int64_t timestamp); /// Check timeout tasks that are waiting for Death info. - void CheckTimeoutTasks(); + void CheckTimeoutTasks() override; /// If the number of tasks in requests is greater than or equal to /// max_pending_calls. @@ -241,11 +227,7 @@ class ActorTaskSubmitter : public ActorTaskSubmitterInterface { /// /// \param task_spec The task spec of a task that will be canceled. /// \param recursive If true, it will cancel all child tasks. - /// \return True if cancel request is not needed or it will be - /// requested. False otherwise. Note that tasks could be "not" - /// canceled although the status is true because it is an - /// asynchronous API. - Status CancelTask(TaskSpecification task_spec, bool recursive); + void CancelTask(TaskSpecification task_spec, bool recursive); /// Retry the CancelTask in milliseconds. 
void RetryCancelTask(TaskSpecification task_spec, bool recursive, int64_t milliseconds); @@ -257,75 +239,65 @@ class ActorTaskSubmitter : public ActorTaskSubmitterInterface { private: struct PendingTaskWaitingForDeathInfo { - int64_t deadline_ms; - TaskSpecification task_spec; - ray::Status status; - rpc::RayErrorInfo timeout_error_info; - bool actor_preempted = false; + int64_t deadline_ms_; + TaskSpecification task_spec_; + ray::Status status_; + rpc::RayErrorInfo timeout_error_info_; + bool actor_preempted_ = false; PendingTaskWaitingForDeathInfo(int64_t deadline_ms, TaskSpecification task_spec, ray::Status status, rpc::RayErrorInfo timeout_error_info) - : deadline_ms(deadline_ms), - task_spec(std::move(task_spec)), - status(std::move(status)), - timeout_error_info(std::move(timeout_error_info)) {} + : deadline_ms_(deadline_ms), + task_spec_(std::move(task_spec)), + status_(std::move(status)), + timeout_error_info_(std::move(timeout_error_info)) {} }; - /// A helper function to get task manager without holding mu_ - /// We should use this function when access - /// - FailOrRetryPendingTask - /// - FailPendingTask - TaskManagerInterface &GetTaskManagerWithoutMu() { - mu_.AssertNotHeld(); - return task_manager_; - } struct ClientQueue { - ClientQueue(ActorID actor_id, - bool allow_out_of_order_execution, + ClientQueue(bool allow_out_of_order_execution, int32_t max_pending_calls, bool fail_if_actor_unreachable, bool owned) - : max_pending_calls(max_pending_calls), - fail_if_actor_unreachable(fail_if_actor_unreachable), - owned(owned) { + : max_pending_calls_(max_pending_calls), + fail_if_actor_unreachable_(fail_if_actor_unreachable), + owned_(owned) { if (allow_out_of_order_execution) { - actor_submit_queue = std::make_unique(actor_id); + actor_submit_queue_ = std::make_unique(); } else { - actor_submit_queue = std::make_unique(actor_id); + actor_submit_queue_ = std::make_unique(); } } /// The current state of the actor. 
If this is ALIVE, then we should have /// an RPC client to the actor. If this is DEAD, then all tasks in the /// queue will be marked failed and all other ClientQueue state is ignored. - rpc::ActorTableData::ActorState state = rpc::ActorTableData::DEPENDENCIES_UNREADY; + rpc::ActorTableData::ActorState state_ = rpc::ActorTableData::DEPENDENCIES_UNREADY; /// The reason why this actor is dead. /// If the context is not set, it means the actor is not dead. - rpc::ActorDeathCause death_cause; + rpc::ActorDeathCause death_cause_; /// How many times this actor has been restarted before. Starts at -1 to /// indicate that the actor is not yet created. This is used to drop stale /// messages from the GCS. - int64_t num_restarts = -1; + int64_t num_restarts_ = -1; /// How many times this actor has been lineage reconstructured. /// This is used to drop stale messages. - int64_t num_restarts_due_to_lineage_reconstructions = 0; + int64_t num_restarts_due_to_lineage_reconstructions_ = 0; /// Whether this actor exits by spot preemption. - bool preempted = false; - /// The RPC client. We use shared_ptr to enable shared_from_this for - /// pending client callbacks. - std::shared_ptr rpc_client = nullptr; + bool preempted_ = false; + /// The RPC client address. + std::optional client_address_; /// The intended worker ID of the actor. - std::string worker_id = ""; + std::string worker_id_; /// The actor is out of scope but the death info is not published /// to this worker yet. - bool pending_out_of_scope_death = false; + bool pending_out_of_scope_death_ = false; /// If the actor is dead, whether it can be restarted. - bool is_restartable = false; + bool is_restartable_ = false; /// The queue that orders actor requests. - std::unique_ptr actor_submit_queue; + std::unique_ptr actor_submit_queue_; /// Tasks that can't be sent because 1) the callee actor is dead. 2) network error. 
/// For 1) the task will wait for the DEAD state notification, then mark task as @@ -341,38 +313,42 @@ class ActorTaskSubmitter : public ActorTaskSubmitterInterface { /// `timeout_error_info`. One special case is when the actor is preempted, where /// the actor may not be dead *just yet* but we want to treat it as dead. In this /// case we hard code an error info. - std::deque> wait_for_death_info_tasks; + std::deque> + wait_for_death_info_tasks_; /// Stores all callbacks of inflight tasks. An actor task is inflight /// if the PushTask RPC is sent but the reply is not received yet. absl::flat_hash_map> - inflight_task_callbacks; + inflight_task_callbacks_; /// The max number limit of task capacity used for back pressure. /// If the number of tasks in requests >= max_pending_calls, it can't continue to /// push task to ClientQueue. - const int32_t max_pending_calls; + const int32_t max_pending_calls_; /// The current task number in this client queue. - int32_t cur_pending_calls = 0; + int32_t cur_pending_calls_ = 0; /// Whether to fail newly submitted tasks immediately when the actor is unreachable. - bool fail_if_actor_unreachable = true; + bool fail_if_actor_unreachable_ = true; /// Whether the current process is owner of the actor. - bool owned; + bool owned_; /// Returns debug string for class. /// /// \return string. std::string DebugString() const { std::ostringstream stream; - stream << "max_pending_calls=" << max_pending_calls - << " cur_pending_calls=" << cur_pending_calls; + stream << "max_pending_calls=" << max_pending_calls_ + << " cur_pending_calls=" << cur_pending_calls_; return stream.str(); } }; + void CancelDependencyResolution(const TaskID &task_id) + ABSL_LOCKS_EXCLUDED(resolver_mu_); + /// Fail the task with the timeout error, or the preempted error. 
void FailTaskWithError(const PendingTaskWaitingForDeathInfo &task); @@ -431,6 +407,13 @@ class ActorTaskSubmitter : public ActorTaskSubmitterInterface { // Generators that are currently running and need to be resubmitted. absl::flat_hash_set generators_to_resubmit_ ABSL_GUARDED_BY(mu_); + // For when kicking off dependency resolution is still queued on the io_context. + // We need an extra mutex because the ResolveDependencies callback could be called + // immediately and it acquires mu_ and needs to call GetTaskManagerWithoutMu. + absl::Mutex resolver_mu_ ABSL_ACQUIRED_BEFORE(mu_); + absl::flat_hash_set pending_dependency_resolution_ + ABSL_GUARDED_BY(resolver_mu_); + /// Resolve object dependencies. LocalDependencyResolver resolver_; @@ -448,8 +431,6 @@ class ActorTaskSubmitter : public ActorTaskSubmitterInterface { instrumented_io_context &io_service_; std::shared_ptr reference_counter_; - - friend class CoreWorkerTest; }; } // namespace core diff --git a/src/ray/core_worker/transport/dependency_resolver.cc b/src/ray/core_worker/task_submission/dependency_resolver.cc similarity index 92% rename from src/ray/core_worker/transport/dependency_resolver.cc rename to src/ray/core_worker/task_submission/dependency_resolver.cc index f35fd39175e0..3b3c521cb8d1 100644 --- a/src/ray/core_worker/transport/dependency_resolver.cc +++ b/src/ray/core_worker/task_submission/dependency_resolver.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/core_worker/transport/dependency_resolver.h" +#include "ray/core_worker/task_submission/dependency_resolver.h" #include #include @@ -77,6 +77,9 @@ void InlineDependencies( mutable_arg->add_nested_inlined_refs()->CopyFrom(nested_ref); contained_ids->push_back(ObjectID::FromBinary(nested_ref.object_id())); } + } else { + auto tensor_transport = mutable_arg->object_ref().tensor_transport(); + mutable_arg->set_tensor_transport(tensor_transport); } found++; } @@ -101,12 +104,12 @@ void LocalDependencyResolver::ResolveDependencies( if (task.ArgByRef(i)) { local_dependency_ids.insert(task.ArgObjectId(i)); } - for (const auto &in : task.ArgInlinedRefs(i)) { - auto object_id = ObjectID::FromBinary(in.object_id()); + for (const auto &inlined_ref : task.ArgInlinedRefs(i)) { + const auto object_id = ObjectID::FromBinary(inlined_ref.object_id()); if (ObjectID::IsActorID(object_id)) { - auto actor_id = ObjectID::ToActorID(object_id); + const auto actor_id = ObjectID::ToActorID(object_id); if (actor_creator_.IsActorInRegistering(actor_id)) { - actor_dependency_ids.insert(ObjectID::ToActorID(object_id)); + actor_dependency_ids.insert(actor_id); } } } @@ -165,7 +168,7 @@ void LocalDependencyResolver::ResolveDependencies( contained_ids); } if (resolved_task_state) { - resolved_task_state->on_dependencies_resolved(resolved_task_state->status); + resolved_task_state->on_dependencies_resolved_(resolved_task_state->status); } }); } @@ -195,7 +198,7 @@ void LocalDependencyResolver::ResolveDependencies( } if (resolved_task_state) { - resolved_task_state->on_dependencies_resolved(resolved_task_state->status); + resolved_task_state->on_dependencies_resolved_(resolved_task_state->status); } }); } diff --git a/src/ray/core_worker/transport/dependency_resolver.h b/src/ray/core_worker/task_submission/dependency_resolver.h similarity index 97% rename from src/ray/core_worker/transport/dependency_resolver.h rename to src/ray/core_worker/task_submission/dependency_resolver.h index 
2d9624144017..aa625ba9a266 100644 --- a/src/ray/core_worker/transport/dependency_resolver.h +++ b/src/ray/core_worker/task_submission/dependency_resolver.h @@ -79,7 +79,7 @@ class LocalDependencyResolver { : task(std::move(t)), actor_dependencies_remaining(actor_ids.size()), status(Status::OK()), - on_dependencies_resolved(std::move(on_dependencies_resolved)) { + on_dependencies_resolved_(std::move(on_dependencies_resolved)) { local_dependencies.reserve(deps.size()); for (const auto &dep : deps) { local_dependencies.emplace(dep, /*ray_object=*/nullptr); @@ -97,7 +97,7 @@ class LocalDependencyResolver { size_t obj_dependencies_remaining; /// Dependency resolution status. Status status; - std::function on_dependencies_resolved; + std::function on_dependencies_resolved_; }; /// The in-memory store. diff --git a/src/ray/core_worker/transport/normal_task_submitter.cc b/src/ray/core_worker/task_submission/normal_task_submitter.cc similarity index 73% rename from src/ray/core_worker/transport/normal_task_submitter.cc rename to src/ray/core_worker/task_submission/normal_task_submitter.cc index b95d61224e0f..60f87076e069 100644 --- a/src/ray/core_worker/transport/normal_task_submitter.cc +++ b/src/ray/core_worker/task_submission/normal_task_submitter.cc @@ -12,20 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/core_worker/transport/normal_task_submitter.h" +#include "ray/core_worker/task_submission/normal_task_submitter.h" +#include #include #include #include #include #include -#include "ray/gcs/pb_util.h" +#include "ray/common/lease/lease_spec.h" +#include "ray/common/protobuf_utils.h" +#include "ray/util/time.h" namespace ray { namespace core { -Status NormalTaskSubmitter::SubmitTask(TaskSpecification task_spec) { +void NormalTaskSubmitter::SubmitTask(TaskSpecification task_spec) { RAY_CHECK(task_spec.IsNormalTask()); RAY_LOG(DEBUG) << "Submit task " << task_spec.TaskId(); @@ -61,9 +64,10 @@ Status NormalTaskSubmitter::SubmitTask(TaskSpecification task_spec) { const SchedulingKey scheduling_key(task_spec.GetSchedulingClass(), task_spec.GetDependencyIds(), task_spec.GetRuntimeEnvHash()); + // TODO(#56107): Only create the lease spec if this is a new scheduling key entry auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key]; - scheduling_key_entry.task_queue.push_back(task_spec); - scheduling_key_entry.resource_spec = std::move(task_spec); + scheduling_key_entry.lease_spec = LeaseSpecification(task_spec.GetMessage()); + scheduling_key_entry.task_queue.push_back(std::move(task_spec)); if (!scheduling_key_entry.AllWorkersBusy()) { // There are idle workers, so we don't need more @@ -85,19 +89,18 @@ Status NormalTaskSubmitter::SubmitTask(TaskSpecification task_spec) { } RequestNewWorkerIfNeeded(scheduling_key); }); - return Status::OK(); } void NormalTaskSubmitter::AddWorkerLeaseClient( const rpc::Address &addr, - std::shared_ptr raylet_client, + const NodeID &node_id, const google::protobuf::RepeatedPtrField &assigned_resources, const SchedulingKey &scheduling_key, - const TaskID &task_id) { + const LeaseID &lease_id) { core_worker_client_pool_->GetOrConnect(addr); int64_t expiration = current_time_ms() + lease_timeout_ms_; LeaseEntry new_lease_entry{ - std::move(raylet_client), expiration, assigned_resources, scheduling_key, task_id}; + 
node_id, expiration, assigned_resources, scheduling_key, lease_id}; worker_to_lease_entry_.emplace(addr, new_lease_entry); auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key]; @@ -105,17 +108,17 @@ void NormalTaskSubmitter::AddWorkerLeaseClient( RAY_CHECK(scheduling_key_entry.active_workers.size() >= 1); } -void NormalTaskSubmitter::ReturnWorker(const rpc::Address &addr, - bool was_error, - const std::string &error_detail, - bool worker_exiting, - const SchedulingKey &scheduling_key) { +void NormalTaskSubmitter::ReturnWorkerLease(const rpc::Address &addr, + bool was_error, + const std::string &error_detail, + bool worker_exiting, + const SchedulingKey &scheduling_key) { RAY_LOG(DEBUG) << "Returning worker " << WorkerID::FromBinary(addr.worker_id()) - << " to raylet " << NodeID::FromBinary(addr.raylet_id()); + << " to raylet " << NodeID::FromBinary(addr.node_id()); auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key]; RAY_CHECK(scheduling_key_entry.active_workers.size() >= 1); auto &lease_entry = worker_to_lease_entry_[addr]; - RAY_CHECK(lease_entry.raylet_client); + RAY_CHECK(!lease_entry.node_id.IsNil()); RAY_CHECK(!lease_entry.is_busy); // Decrement the number of active workers consuming tasks from the queue associated @@ -126,16 +129,10 @@ void NormalTaskSubmitter::ReturnWorker(const rpc::Address &addr, // scheduling_key_entries_ hashmap. 
scheduling_key_entries_.erase(scheduling_key); } - - auto status = - lease_entry.raylet_client->ReturnWorker(addr.port(), - WorkerID::FromBinary(addr.worker_id()), - was_error, - error_detail, - worker_exiting); - if (!status.ok()) { - RAY_LOG(ERROR) << "Error returning worker to raylet: " << status.ToString(); - } + auto raylet_client = raylet_client_pool_->GetByID(lease_entry.node_id); + RAY_CHECK(raylet_client); + raylet_client->ReturnWorkerLease( + addr.port(), lease_entry.lease_id, was_error, error_detail, worker_exiting); worker_to_lease_entry_.erase(addr); } @@ -147,7 +144,7 @@ void NormalTaskSubmitter::OnWorkerIdle( bool worker_exiting, const google::protobuf::RepeatedPtrField &assigned_resources) { auto &lease_entry = worker_to_lease_entry_[addr]; - if (!lease_entry.raylet_client) { + if (lease_entry.node_id.IsNil()) { return; } @@ -163,12 +160,12 @@ void NormalTaskSubmitter::OnWorkerIdle( // Return the worker only if there are no tasks to do. if (!lease_entry.is_busy) { - ReturnWorker(addr, was_error, error_detail, worker_exiting, scheduling_key); + ReturnWorkerLease(addr, was_error, error_detail, worker_exiting, scheduling_key); } } else { auto client = core_worker_client_pool_->GetOrConnect(addr); - while (!current_queue.empty() && !lease_entry.is_busy) { + if (!current_queue.empty() && !lease_entry.is_busy) { auto task_spec = std::move(current_queue.front()); current_queue.pop_front(); @@ -204,11 +201,12 @@ void NormalTaskSubmitter::CancelWorkerLeaseIfNeeded(const SchedulingKey &schedul for (auto &pending_lease_request : scheduling_key_entry.pending_lease_requests) { // There is an in-flight lease request. Cancel it. 
- auto raylet_client = GetOrConnectRayletClient(&pending_lease_request.second); - auto &task_id = pending_lease_request.first; - RAY_LOG(DEBUG) << "Canceling lease request " << task_id; + auto raylet_client = + raylet_client_pool_->GetOrConnectByAddress(pending_lease_request.second); + const auto &lease_id = pending_lease_request.first; + RAY_LOG(DEBUG) << "Canceling lease request " << lease_id; raylet_client->CancelWorkerLease( - task_id, + lease_id, [this, scheduling_key](const Status &status, const rpc::CancelWorkerLeaseReply &reply) { absl::MutexLock lock(&mu_); @@ -216,51 +214,31 @@ void NormalTaskSubmitter::CancelWorkerLeaseIfNeeded(const SchedulingKey &schedul // The cancellation request can fail if the raylet does not have // the request queued. This can happen if: a) due to message // reordering, the raylet has not yet received the worker lease - // request, or b) we have already returned the worker lease - // request. In the former case, we should try the cancellation - // request again. In the latter case, the in-flight lease request - // should already have been removed from our local state, so we no - // longer need to cancel. + // request, b) we have already returned the worker lease + // request, or c) the current request is a retry and the server response to + // the initial request was lost after cancelling the lease. In case a), we + // should try the cancellation request again. In case b), the in-flight lease + // request should already have been removed from our local state, so we no + // longer need to cancel. In case c), the response for ReturnWorkerLease + // should have already been triggered and the pending lease request will be + // cleaned up. 
CancelWorkerLeaseIfNeeded(scheduling_key); } }); } } -std::shared_ptr NormalTaskSubmitter::GetOrConnectRayletClient( - const rpc::Address *raylet_address) { - std::shared_ptr raylet_client; - RAY_CHECK(raylet_address != nullptr); - if (NodeID::FromBinary(raylet_address->raylet_id()) != local_raylet_id_) { - // A remote raylet was specified. Connect to the raylet if needed. - NodeID raylet_id = NodeID::FromBinary(raylet_address->raylet_id()); - auto it = remote_raylet_clients_.find(raylet_id); - if (it == remote_raylet_clients_.end()) { - RAY_LOG(INFO) << "Connecting to raylet " << raylet_id; - it = remote_raylet_clients_ - .emplace(raylet_id, - raylet_client_pool_->GetOrConnectByAddress(*raylet_address)) - .first; - } - raylet_client = it->second; - } else { - raylet_client = local_raylet_client_; - } - - return raylet_client; -} - void NormalTaskSubmitter::ReportWorkerBacklog() { absl::MutexLock lock(&mu_); ReportWorkerBacklogInternal(); } void NormalTaskSubmitter::ReportWorkerBacklogInternal() { - absl::flat_hash_map> backlogs; + absl::flat_hash_map> backlogs; for (auto &scheduling_key_and_entry : scheduling_key_entries_) { const SchedulingClass scheduling_class = std::get<0>(scheduling_key_and_entry.first); if (backlogs.find(scheduling_class) == backlogs.end()) { - backlogs[scheduling_class].first = scheduling_key_and_entry.second.resource_spec; + backlogs[scheduling_class].first = scheduling_key_and_entry.second.lease_spec; backlogs[scheduling_class].second = 0; } // We report backlog size per scheduling class not per scheduling key @@ -274,12 +252,11 @@ void NormalTaskSubmitter::ReportWorkerBacklogInternal() { std::vector backlog_reports; for (const auto &backlog : backlogs) { rpc::WorkerBacklogReport backlog_report; - backlog_report.mutable_resource_spec()->CopyFrom(backlog.second.first.GetMessage()); + backlog_report.mutable_lease_spec()->CopyFrom(backlog.second.first.GetMessage()); backlog_report.set_backlog_size(backlog.second.second); 
backlog_reports.emplace_back(backlog_report); } - local_raylet_client_->ReportWorkerBacklog( - WorkerID::FromBinary(rpc_address_.worker_id()), backlog_reports); + local_raylet_client_->ReportWorkerBacklog(worker_id_, backlog_reports); } void NormalTaskSubmitter::ReportWorkerBacklogIfNeeded( @@ -324,36 +301,35 @@ void NormalTaskSubmitter::RequestNewWorkerIfNeeded(const SchedulingKey &scheduli // All tasks have corresponding pending leases, no need to request more return; } - - // Create a TaskSpecification with an overwritten TaskID to make sure we don't reuse the - // same TaskID to request a worker - auto resource_spec_msg = scheduling_key_entry.resource_spec.GetMutableMessage(); - resource_spec_msg.set_task_id(TaskID::FromRandom(job_id_).Binary()); - const TaskSpecification resource_spec = TaskSpecification(std::move(resource_spec_msg)); + // Counter for generating unique lease IDs. + static uint32_t lease_id_counter = 0; + const LeaseID lease_id = LeaseID::FromWorker(worker_id_, lease_id_counter++); + rpc::LeaseSpec lease_spec_msg = scheduling_key_entry.lease_spec.GetMessage(); + lease_spec_msg.set_lease_id(lease_id.Binary()); + const LeaseSpecification lease_spec = LeaseSpecification(std::move(lease_spec_msg)); rpc::Address best_node_address; const bool is_spillback = (raylet_address != nullptr); bool is_selected_based_on_locality = false; if (raylet_address == nullptr) { // If no raylet address is given, find the best worker for our next lease request. 
std::tie(best_node_address, is_selected_based_on_locality) = - lease_policy_->GetBestNodeForTask(resource_spec); + lease_policy_->GetBestNodeForLease(lease_spec); raylet_address = &best_node_address; } - auto raylet_client = GetOrConnectRayletClient(raylet_address); - const TaskID task_id = resource_spec.TaskId(); - const std::string task_name = resource_spec.GetName(); - RAY_LOG(DEBUG) << "Requesting lease from raylet " - << NodeID::FromBinary(raylet_address->raylet_id()) << " for task " - << task_id; + auto raylet_client = raylet_client_pool_->GetOrConnectByAddress(*raylet_address); + const std::string function_or_actor_name = lease_spec.GetFunctionOrActorName(); + RAY_LOG(DEBUG) << "Requesting lease " << lease_id << " from raylet " + << NodeID::FromBinary(raylet_address->node_id()) << " for " + << function_or_actor_name; raylet_client->RequestWorkerLease( - resource_spec.GetMessage(), + lease_spec.GetMessage(), /*grant_or_reject=*/is_spillback, [this, scheduling_key, - task_id, - task_name, + lease_id, + function_or_actor_name, is_spillback, raylet_address = *raylet_address](const Status &status, const rpc::RequestWorkerLeaseReply &reply) { @@ -364,14 +340,14 @@ void NormalTaskSubmitter::RequestNewWorkerIfNeeded(const SchedulingKey &scheduli { absl::MutexLock lock(&mu_); - auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key]; - auto raylet_client = GetOrConnectRayletClient(&raylet_address); - scheduling_key_entry.pending_lease_requests.erase(task_id); + auto &sched_entry = scheduling_key_entries_[scheduling_key]; + auto raylet_lease_client = + raylet_client_pool_->GetOrConnectByAddress(raylet_address); + sched_entry.pending_lease_requests.erase(lease_id); if (status.ok()) { if (reply.canceled()) { - RAY_LOG(DEBUG) << "Lease canceled for task: " << task_id - << ", canceled type: " + RAY_LOG(DEBUG) << "Lease canceled for: " << lease_id << ", canceled type: " << rpc::RequestWorkerLeaseReply::SchedulingFailureType_Name( reply.failure_type()); if 
(reply.failure_type() == @@ -404,41 +380,40 @@ void NormalTaskSubmitter::RequestNewWorkerIfNeeded(const SchedulingKey &scheduli } error_info.set_error_message( absl::StrCat(reply.scheduling_failure_message(), - " task_id=", - task_id.Hex(), - ", task_name=", - task_name)); - - tasks_to_fail = std::move(scheduling_key_entry.task_queue); - scheduling_key_entry.task_queue.clear(); - if (scheduling_key_entry.CanDelete()) { + " lease_id=", + lease_id.Hex(), + ", name=", + function_or_actor_name)); + + tasks_to_fail = std::move(sched_entry.task_queue); + sched_entry.task_queue.clear(); + if (sched_entry.CanDelete()) { scheduling_key_entries_.erase(scheduling_key); } } else { RequestNewWorkerIfNeeded(scheduling_key); } } else if (reply.rejected()) { - RAY_LOG(DEBUG) << "Lease rejected " << task_id; + RAY_LOG(DEBUG) << "Lease rejected " << lease_id; // It might happen when the first raylet has a stale view // of the spillback raylet resources. // Retry the request at the first raylet since the resource view may be // refreshed. RAY_CHECK(is_spillback); RequestNewWorkerIfNeeded(scheduling_key); - } else if (!reply.worker_address().raylet_id().empty()) { + } else if (!reply.worker_address().node_id().empty()) { // We got a lease for a worker. Add the lease client state and try to // assign work to the worker. 
- RAY_LOG(DEBUG) << "Lease granted to task " << task_id << " from raylet " - << NodeID::FromBinary(reply.worker_address().raylet_id()) + RAY_LOG(DEBUG) << "Lease granted to task " << lease_id << " from raylet " + << NodeID::FromBinary(reply.worker_address().node_id()) << " with worker " << WorkerID::FromBinary(reply.worker_address().worker_id()); - AddWorkerLeaseClient(reply.worker_address(), - std::move(raylet_client), + NodeID::FromBinary(reply.worker_address().node_id()), reply.resource_mapping(), scheduling_key, - task_id); - RAY_CHECK(scheduling_key_entry.active_workers.size() >= 1); + lease_id); + RAY_CHECK(sched_entry.active_workers.size() >= 1); OnWorkerIdle(reply.worker_address(), scheduling_key, /*was_error=*/false, @@ -448,65 +423,53 @@ void NormalTaskSubmitter::RequestNewWorkerIfNeeded(const SchedulingKey &scheduli } else { // The raylet redirected us to a different raylet to retry at. RAY_CHECK(!is_spillback); - RAY_LOG(DEBUG) << "Redirect lease for task " << task_id << " from raylet " - << NodeID::FromBinary(raylet_address.raylet_id()) + RAY_LOG(DEBUG) << "Redirect lease " << lease_id << " from raylet " + << NodeID::FromBinary(raylet_address.node_id()) << " to raylet " << NodeID::FromBinary( - reply.retry_at_raylet_address().raylet_id()); + reply.retry_at_raylet_address().node_id()) + << " for " << function_or_actor_name; RequestNewWorkerIfNeeded(scheduling_key, &reply.retry_at_raylet_address()); } - } else if (raylet_client != local_raylet_client_) { + } else if (NodeID::FromBinary(raylet_address.node_id()) != local_node_id_) { // A lease request to a remote raylet failed. Retry locally if the lease is // still needed. // TODO(swang): Fail after some number of retries? 
RAY_LOG_EVERY_MS(INFO, 30 * 1000) - << "Retrying attempt to schedule task (id: " << task_id - << " name: " << task_name - << ") at remote node (id: " << raylet_address.raylet_id() + << "Retrying attempt to schedule lease (id: " << lease_id + << " name: " << function_or_actor_name + << ") at remote node (id: " << raylet_address.node_id() << " ip: " << raylet_address.ip_address() << "). Try again " "on a local node. Error: " << status.ToString(); RequestNewWorkerIfNeeded(scheduling_key); - } else { - if (status.IsRpcError() && - status.rpc_code() == grpc::StatusCode::UNAVAILABLE) { - RAY_LOG(WARNING) - << "The worker failed to receive a response from the local " - << "raylet because the raylet is unavailable (crashed). " - << "Error: " << status; - if (worker_type_ == WorkerType::WORKER) { - // Exit the worker so that caller can retry somewhere else. - RAY_LOG(WARNING) << "Terminating the worker due to local raylet death"; - QuickExit(); - } - RAY_CHECK(worker_type_ == WorkerType::DRIVER); - error_type = rpc::ErrorType::LOCAL_RAYLET_DIED; - error_status = status; - // Grpc errors are not helpful at all. So we are overwriting it. - std::stringstream ss; - ss << "The worker failed to receive a response from the local raylet" - << "(id: " << NodeID::FromBinary(raylet_address.raylet_id()).Hex() - << " ,ip: " << raylet_address.ip_address() << ") " - << "because the raylet is " - "unavailable (crashed)."; - error_info.set_error_message(ss.str()); - tasks_to_fail = std::move(scheduling_key_entry.task_queue); - scheduling_key_entry.task_queue.clear(); - if (scheduling_key_entry.CanDelete()) { - scheduling_key_entries_.erase(scheduling_key); - } - } else { - RAY_LOG(WARNING) - << "The worker failed to receive a response from the local raylet, but " - "raylet is still alive. Try again on a local node. Error: " - << status; - // TODO(sang): Maybe we should raise FATAL error if it happens too many - // times. 
- RequestNewWorkerIfNeeded(scheduling_key); + RAY_LOG(WARNING) << "The worker failed to receive a response from the local " + << "raylet because the raylet is unavailable (crashed). " + << "Error: " << status; + if (worker_type_ == WorkerType::WORKER) { + // Exit the worker so that caller can retry somewhere else. + RAY_LOG(WARNING) << "Terminating the worker due to local raylet death"; + QuickExit(); + } + RAY_CHECK(worker_type_ == WorkerType::DRIVER); + error_type = rpc::ErrorType::LOCAL_RAYLET_DIED; + error_status = status; + // Grpc errors are not helpful at all. So we are overwriting it. + std::stringstream ss; + ss << "The worker failed to receive a response from the local raylet" + << "(id: " << NodeID::FromBinary(raylet_address.node_id()).Hex() + << " ,ip: " << raylet_address.ip_address() << ") " + << "because the raylet is " + "unavailable (crashed)."; + error_info.set_error_message(ss.str()); + tasks_to_fail = std::move(sched_entry.task_queue); + sched_entry.task_queue.clear(); + if (sched_entry.CanDelete()) { + scheduling_key_entries_.erase(scheduling_key); } } } @@ -520,7 +483,7 @@ void NormalTaskSubmitter::RequestNewWorkerIfNeeded(const SchedulingKey &scheduli }, task_queue.size(), is_selected_based_on_locality); - scheduling_key_entry.pending_lease_requests.emplace(task_id, *raylet_address); + scheduling_key_entry.pending_lease_requests.emplace(lease_id, *raylet_address); ReportWorkerBacklogIfNeeded(scheduling_key); // Lease more workers if there are still pending tasks and @@ -541,7 +504,7 @@ void NormalTaskSubmitter::PushNormalTask( const google::protobuf::RepeatedPtrField &assigned_resources) { RAY_LOG(DEBUG) << "Pushing task " << task_spec.TaskId() << " to worker " << WorkerID::FromBinary(addr.worker_id()) << " of raylet " - << NodeID::FromBinary(addr.raylet_id()); + << NodeID::FromBinary(addr.node_id()); auto task_id = task_spec.TaskId(); auto request = std::make_unique(); // NOTE(swang): CopyFrom is needed because if we use Swap here and the 
task @@ -551,7 +514,7 @@ void NormalTaskSubmitter::PushNormalTask( request->mutable_resource_mapping()->CopyFrom(assigned_resources); request->set_intended_worker_id(addr.worker_id()); task_manager_.MarkTaskWaitingForExecution(task_id, - NodeID::FromBinary(addr.raylet_id()), + NodeID::FromBinary(addr.node_id()), WorkerID::FromBinary(addr.worker_id())); client->PushNormalTask( std::move(request), @@ -565,7 +528,7 @@ void NormalTaskSubmitter::PushNormalTask( { RAY_LOG(DEBUG) << "Task " << task_id << " finished from worker " << WorkerID::FromBinary(addr.worker_id()) << " of raylet " - << NodeID::FromBinary(addr.raylet_id()); + << NodeID::FromBinary(addr.node_id()); absl::MutexLock lock(&mu_); executing_tasks_.erase(task_id); @@ -586,17 +549,18 @@ void NormalTaskSubmitter::PushNormalTask( if (!status.ok()) { failed_tasks_pending_failure_cause_.insert(task_id); RAY_LOG(DEBUG) << "Getting error from raylet for task " << task_id; - const ray::rpc::ClientCallback callback = - [this, status, task_id, addr]( - const Status &get_task_failure_cause_reply_status, - const rpc::GetTaskFailureCauseReply &get_task_failure_cause_reply) { + const ray::rpc::ClientCallback + callback = [this, status, task_id, addr]( + const Status &get_task_failure_cause_reply_status, + const rpc::GetWorkerFailureCauseReply + &get_task_failure_cause_reply) { bool will_retry = - HandleGetTaskFailureCause(status, - task_id, - addr, - get_task_failure_cause_reply_status, - get_task_failure_cause_reply); - absl::MutexLock lock(&mu_); + HandleGetWorkerFailureCause(status, + task_id, + addr, + get_task_failure_cause_reply_status, + get_task_failure_cause_reply); + absl::MutexLock task_submission_state_lock(&mu_); if (!will_retry) { // Task submission and task cancellation are the only two other code // paths that clean up the cancelled_tasks_ map. 
If the task is not @@ -607,9 +571,9 @@ void NormalTaskSubmitter::PushNormalTask( failed_tasks_pending_failure_cause_.erase(task_id); }; auto &cur_lease_entry = worker_to_lease_entry_[addr]; - RAY_CHECK(cur_lease_entry.raylet_client); - cur_lease_entry.raylet_client->GetTaskFailureCause(cur_lease_entry.task_id, - callback); + auto raylet_client = raylet_client_pool_->GetByID(cur_lease_entry.node_id); + RAY_CHECK(raylet_client); + raylet_client->GetWorkerFailureCause(cur_lease_entry.lease_id, callback); } OnWorkerIdle(addr, scheduling_key, @@ -640,43 +604,44 @@ void NormalTaskSubmitter::PushNormalTask( }); } -bool NormalTaskSubmitter::HandleGetTaskFailureCause( +bool NormalTaskSubmitter::HandleGetWorkerFailureCause( const Status &task_execution_status, const TaskID &task_id, const rpc::Address &addr, - const Status &get_task_failure_cause_reply_status, - const rpc::GetTaskFailureCauseReply &get_task_failure_cause_reply) { + const Status &get_worker_failure_cause_reply_status, + const rpc::GetWorkerFailureCauseReply &get_worker_failure_cause_reply) { rpc::ErrorType task_error_type = rpc::ErrorType::WORKER_DIED; std::unique_ptr error_info; bool fail_immediately = false; - if (get_task_failure_cause_reply_status.ok()) { - RAY_LOG(WARNING) << "Task failure cause for task " << task_id << ": " + if (get_worker_failure_cause_reply_status.ok()) { + RAY_LOG(WARNING) << "Worker failure cause for task " << task_id << ": " << ray::gcs::RayErrorInfoToString( - get_task_failure_cause_reply.failure_cause()) + get_worker_failure_cause_reply.failure_cause()) << " fail immedediately: " - << get_task_failure_cause_reply.fail_task_immediately(); - if (get_task_failure_cause_reply.has_failure_cause()) { - task_error_type = get_task_failure_cause_reply.failure_cause().error_type(); + << get_worker_failure_cause_reply.fail_task_immediately(); + if (get_worker_failure_cause_reply.has_failure_cause()) { + task_error_type = get_worker_failure_cause_reply.failure_cause().error_type(); 
error_info = std::make_unique( - get_task_failure_cause_reply.failure_cause()); + get_worker_failure_cause_reply.failure_cause()); // TODO(clarng): track and append task retry history to the error message. } - fail_immediately = get_task_failure_cause_reply.fail_task_immediately(); + fail_immediately = get_worker_failure_cause_reply.fail_task_immediately(); } else { - RAY_LOG(WARNING) << "Failed to fetch task result with status " - << get_task_failure_cause_reply_status.ToString() - << " node id: " << NodeID::FromBinary(addr.raylet_id()) + RAY_LOG(WARNING) << "Failed to fetch worker failure cause with status " + << get_worker_failure_cause_reply_status.ToString() + << " worker id: " << WorkerID::FromBinary(addr.worker_id()) + << " node id: " << NodeID::FromBinary(addr.node_id()) << " ip: " << addr.ip_address(); task_error_type = rpc::ErrorType::NODE_DIED; std::stringstream buffer; buffer << "Task failed due to the node (where this task was running) " << " was dead or unavailable.\n\nThe node IP: " << addr.ip_address() - << ", node ID: " << NodeID::FromBinary(addr.raylet_id()) << "\n\n" + << ", node ID: " << NodeID::FromBinary(addr.node_id()) << "\n\n" << "This can happen if the instance where the node was running failed, " << "the node was preempted, or raylet crashed unexpectedly " << "(e.g., due to OOM) etc.\n\n" << "To see node death information, use `ray list nodes --filter \"node_id=" - << NodeID::FromBinary(addr.raylet_id()) << "\"`, " + << NodeID::FromBinary(addr.node_id()) << "\"`, " << "or check Ray dashboard cluster page, or search the node ID in GCS log, " << "or use `ray logs raylet.out -ip " << addr.ip_address() << "`"; error_info = std::make_unique(); @@ -691,9 +656,9 @@ bool NormalTaskSubmitter::HandleGetTaskFailureCause( fail_immediately); } -Status NormalTaskSubmitter::CancelTask(TaskSpecification task_spec, - bool force_kill, - bool recursive) { +void NormalTaskSubmitter::CancelTask(TaskSpecification task_spec, + bool force_kill, + bool recursive) 
{ const auto task_id = task_spec.TaskId(); RAY_LOG(INFO) << "Cancelling a task: " << task_id << " force_kill: " << force_kill << " recursive: " << recursive; @@ -708,13 +673,13 @@ Status NormalTaskSubmitter::CancelTask(TaskSpecification task_spec, // For idempotency. if (cancelled_tasks_.contains(task_id)) { // The task cancel is already in progress. We don't need to do anything. - return Status::OK(); + return; } task_manager_.MarkTaskCanceled(task_id); if (!task_manager_.IsTaskPending(task_id)) { // The task is finished or failed so marking the task as cancelled is sufficient. - return Status::OK(); + return; } auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key]; @@ -727,7 +692,7 @@ Status NormalTaskSubmitter::CancelTask(TaskSpecification task_spec, scheduling_tasks.erase(spec); CancelWorkerLeaseIfNeeded(scheduling_key); task_manager_.FailPendingTask(task_id, rpc::ErrorType::TASK_CANCELLED); - return Status::OK(); + return; } } } @@ -755,7 +720,7 @@ Status NormalTaskSubmitter::CancelTask(TaskSpecification task_spec, // scheduling_key_entries_ hashmap. scheduling_key_entries_.erase(scheduling_key); } - return Status::OK(); + return; } // Looks for an RPC handle for the worker executing the task. 
client = core_worker_client_pool_->GetOrConnect(rpc_client->second); @@ -805,20 +770,18 @@ Status NormalTaskSubmitter::CancelTask(TaskSpecification task_spec, } } }); - return Status::OK(); } -Status NormalTaskSubmitter::CancelRemoteTask(const ObjectID &object_id, - const rpc::Address &worker_addr, - bool force_kill, - bool recursive) { +void NormalTaskSubmitter::CancelRemoteTask(const ObjectID &object_id, + const rpc::Address &worker_addr, + bool force_kill, + bool recursive) { auto client = core_worker_client_pool_->GetOrConnect(worker_addr); auto request = rpc::RemoteCancelTaskRequest(); request.set_force_kill(force_kill); request.set_recursive(recursive); request.set_remote_object_id(object_id.Binary()); client->RemoteCancelTask(request, nullptr); - return Status::OK(); } bool NormalTaskSubmitter::QueueGeneratorForResubmit(const TaskSpecification &spec) { @@ -831,5 +794,29 @@ bool NormalTaskSubmitter::QueueGeneratorForResubmit(const TaskSpecification &spe return true; } +ClusterSizeBasedLeaseRequestRateLimiter::ClusterSizeBasedLeaseRequestRateLimiter( + size_t min_concurrent_lease_limit) + : min_concurrent_lease_cap_(min_concurrent_lease_limit), num_alive_nodes_(0) {} + +size_t ClusterSizeBasedLeaseRequestRateLimiter:: + GetMaxPendingLeaseRequestsPerSchedulingCategory() { + return std::max(min_concurrent_lease_cap_, num_alive_nodes_.load()); +} + +void ClusterSizeBasedLeaseRequestRateLimiter::OnNodeChanges( + const rpc::GcsNodeInfo &data) { + if (data.state() == rpc::GcsNodeInfo::DEAD) { + if (num_alive_nodes_ != 0) { + num_alive_nodes_--; + } else { + RAY_LOG(WARNING) << "Node" << data.node_manager_address() + << " change state to DEAD but num_alive_node is 0."; + } + } else { + num_alive_nodes_++; + } + RAY_LOG_EVERY_MS(INFO, 60000) << "Number of alive nodes:" << num_alive_nodes_.load(); +} + } // namespace core } // namespace ray diff --git a/src/ray/core_worker/transport/normal_task_submitter.h b/src/ray/core_worker/task_submission/normal_task_submitter.h 
similarity index 84% rename from src/ray/core_worker/transport/normal_task_submitter.h rename to src/ray/core_worker/task_submission/normal_task_submitter.h index f7fb0bcbe691..86dfe685689f 100644 --- a/src/ray/core_worker/transport/normal_task_submitter.h +++ b/src/ray/core_worker/task_submission/normal_task_submitter.h @@ -25,14 +25,12 @@ #include "absl/base/thread_annotations.h" #include "ray/common/id.h" -#include "ray/core_worker/actor_manager.h" -#include "ray/core_worker/context.h" #include "ray/core_worker/lease_policy.h" #include "ray/core_worker/store_provider/memory_store/memory_store.h" -#include "ray/core_worker/task_manager.h" -#include "ray/core_worker/transport/dependency_resolver.h" -#include "ray/raylet_client/raylet_client.h" -#include "ray/rpc/node_manager/raylet_client_pool.h" +#include "ray/core_worker/task_manager_interface.h" +#include "ray/core_worker/task_submission/dependency_resolver.h" +#include "ray/rpc/raylet/raylet_client_interface.h" +#include "ray/rpc/raylet/raylet_client_pool.h" #include "ray/rpc/worker/core_worker_client.h" #include "ray/rpc/worker/core_worker_client_pool.h" @@ -67,6 +65,19 @@ class StaticLeaseRequestRateLimiter : public LeaseRequestRateLimiter { const size_t kLimit; }; +// Lease request rate-limiter based on cluster node size. +// It returns max(num_nodes_in_cluster, min_concurrent_lease_limit) +class ClusterSizeBasedLeaseRequestRateLimiter : public LeaseRequestRateLimiter { + public: + explicit ClusterSizeBasedLeaseRequestRateLimiter(size_t min_concurrent_lease_limit); + size_t GetMaxPendingLeaseRequestsPerSchedulingCategory() override; + void OnNodeChanges(const rpc::GcsNodeInfo &data); + + private: + const size_t min_concurrent_lease_cap_; + std::atomic num_alive_nodes_; +}; + // This class is thread-safe. 
class NormalTaskSubmitter { public: @@ -78,7 +89,7 @@ class NormalTaskSubmitter { std::unique_ptr lease_policy, std::shared_ptr store, TaskManagerInterface &task_manager, - NodeID local_raylet_id, + NodeID local_node_id, WorkerType worker_type, int64_t lease_timeout_ms, std::shared_ptr actor_creator, @@ -93,7 +104,8 @@ class NormalTaskSubmitter { resolver_(*store, task_manager, *actor_creator, tensor_transport_getter), task_manager_(task_manager), lease_timeout_ms_(lease_timeout_ms), - local_raylet_id_(local_raylet_id), + local_node_id_(local_node_id), + worker_id_(WorkerID::FromBinary(rpc_address_.worker_id())), worker_type_(worker_type), core_worker_client_pool_(std::move(core_worker_client_pool)), job_id_(job_id), @@ -101,24 +113,22 @@ class NormalTaskSubmitter { cancel_retry_timer_(std::move(cancel_timer)) {} /// Schedule a task for direct submission to a worker. - /// - /// \param[in] task_spec The task to schedule. - Status SubmitTask(TaskSpecification task_spec); + void SubmitTask(TaskSpecification task_spec); /// Either remove a pending task or send an RPC to kill a running task /// /// \param[in] task_spec The task to kill. /// \param[in] force_kill Whether to kill the worker executing the task. - Status CancelTask(TaskSpecification task_spec, bool force_kill, bool recursive); + void CancelTask(TaskSpecification task_spec, bool force_kill, bool recursive); /// Request the owner of the object ID to cancel a request. /// It is used when a object ID is not owned by the current process. /// We cannot cancel the task in this case because we don't have enough /// information to cancel a task. - Status CancelRemoteTask(const ObjectID &object_id, - const rpc::Address &worker_addr, - bool force_kill, - bool recursive); + void CancelRemoteTask(const ObjectID &object_id, + const rpc::Address &worker_addr, + bool force_kill, + bool recursive); /// Queue the streaming generator up for resubmission. 
/// \return true if the task is still executing and the submitter agrees to resubmit @@ -158,12 +168,6 @@ class NormalTaskSubmitter { const google::protobuf::RepeatedPtrField &assigned_resources) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); - /// Get an existing lease client or connect a new one. If a raylet_address is - /// provided, this connects to a remote raylet. Else, this connects to the - /// local raylet. - std::shared_ptr GetOrConnectRayletClient( - const rpc::Address *raylet_address) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); - /// Report worker backlog information to the local raylet void ReportWorkerBacklogInternal() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); @@ -190,10 +194,10 @@ class NormalTaskSubmitter { /// Set up client state for newly granted worker lease. void AddWorkerLeaseClient( const rpc::Address &addr, - std::shared_ptr raylet_client, + const NodeID &node_id, const google::protobuf::RepeatedPtrField &assigned_resources, const SchedulingKey &scheduling_key, - const TaskID &task_id) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + const LeaseID &lease_id) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); /// This function takes care of returning a worker to the Raylet. /// \param[in] addr The address of the worker. @@ -201,11 +205,11 @@ class NormalTaskSubmitter { /// \param[in] error_detail The reason why it was errored. /// it is unused if was_error is false. /// \param[in] worker_exiting Whether the worker is exiting. - void ReturnWorker(const rpc::Address &addr, - bool was_error, - const std::string &error_detail, - bool worker_exiting, - const SchedulingKey &scheduling_key) + void ReturnWorkerLease(const rpc::Address &addr, + bool was_error, + const std::string &error_detail, + bool worker_exiting, + const SchedulingKey &scheduling_key) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); /// Check that the scheduling_key_entries_ hashmap is empty. 
@@ -221,25 +225,21 @@ class NormalTaskSubmitter { const google::protobuf::RepeatedPtrField &assigned_resources); - /// Handles result from GetTaskFailureCause. - /// \return true if the task should be retried, false otherwise. - bool HandleGetTaskFailureCause( + /// Handles result from GetWorkerFailureCause. + /// \return true if the task executing on the worker should be retried, false otherwise. + bool HandleGetWorkerFailureCause( const Status &task_execution_status, const TaskID &task_id, const rpc::Address &addr, - const Status &get_task_failure_cause_reply_status, - const rpc::GetTaskFailureCauseReply &get_task_failure_cause_reply); + const Status &get_worker_failure_cause_reply_status, + const rpc::GetWorkerFailureCauseReply &get_worker_failure_cause_reply); /// Address of our RPC server. rpc::Address rpc_address_; - // Client that can be used to lease and return workers from the local raylet. + /// Client that can be used to lease and return workers from the local raylet. std::shared_ptr local_raylet_client_; - /// Cache of gRPC clients to remote raylets. - absl::flat_hash_map> - remote_raylet_clients_ ABSL_GUARDED_BY(mu_); - /// Raylet client pool for producing new clients to request leases from remote nodes. std::shared_ptr raylet_client_pool_; @@ -257,9 +257,12 @@ class NormalTaskSubmitter { /// to the raylet. int64_t lease_timeout_ms_; - /// The local raylet ID. Used to make sure that we use the local lease client + /// The local node ID. Used to make sure that we use the local lease client /// if a remote raylet tells us to spill the task back to the local raylet. - const NodeID local_raylet_id_; + const NodeID local_node_id_; + + /// The local worker ID. + const WorkerID worker_id_; /// The type of this core worker process. 
const WorkerType worker_type_; @@ -273,18 +276,18 @@ class NormalTaskSubmitter { const JobID job_id_; /// A LeaseEntry struct is used to condense the metadata about a single executor: - /// (1) The lease client through which the worker should be returned + /// (1) The node id of the leased worker. /// (2) The expiration time of a worker's lease. /// (3) Whether the worker has assigned task to do. - /// (5) The resources assigned to the worker - /// (6) The SchedulingKey assigned to tasks that will be sent to the worker - /// (7) The task id used to obtain the worker lease. + /// (4) The resources assigned to the worker + /// (5) The SchedulingKey assigned to tasks that will be sent to the worker + /// (6) The lease id used to obtain the worker lease. struct LeaseEntry { - std::shared_ptr raylet_client; + NodeID node_id; int64_t lease_expiration_time; google::protobuf::RepeatedPtrField assigned_resources; SchedulingKey scheduling_key; - TaskID task_id; + LeaseID lease_id; bool is_busy = false; }; @@ -294,8 +297,9 @@ class NormalTaskSubmitter { struct SchedulingKeyEntry { // Keep track of pending worker lease requests to the raylet. - absl::flat_hash_map pending_lease_requests; - TaskSpecification resource_spec; + absl::flat_hash_map pending_lease_requests; + + LeaseSpecification lease_spec; // Tasks that are queued for execution. We keep an individual queue per // scheduling class to ensure fairness.
std::deque task_queue; diff --git a/src/ray/core_worker/transport/out_of_order_actor_submit_queue.cc b/src/ray/core_worker/task_submission/out_of_order_actor_submit_queue.cc similarity index 94% rename from src/ray/core_worker/transport/out_of_order_actor_submit_queue.cc rename to src/ray/core_worker/task_submission/out_of_order_actor_submit_queue.cc index 32a8712e1682..61541d513624 100644 --- a/src/ray/core_worker/transport/out_of_order_actor_submit_queue.cc +++ b/src/ray/core_worker/task_submission/out_of_order_actor_submit_queue.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/core_worker/transport/out_of_order_actor_submit_queue.h" +#include "ray/core_worker/task_submission/out_of_order_actor_submit_queue.h" #include #include @@ -20,8 +20,7 @@ namespace ray { namespace core { -OutofOrderActorSubmitQueue::OutofOrderActorSubmitQueue(ActorID actor_id) - : kActorId(actor_id) {} +OutofOrderActorSubmitQueue::OutofOrderActorSubmitQueue() {} void OutofOrderActorSubmitQueue::Emplace(uint64_t position, const TaskSpecification &spec) { diff --git a/src/ray/core_worker/transport/out_of_order_actor_submit_queue.h b/src/ray/core_worker/task_submission/out_of_order_actor_submit_queue.h similarity index 95% rename from src/ray/core_worker/transport/out_of_order_actor_submit_queue.h rename to src/ray/core_worker/task_submission/out_of_order_actor_submit_queue.h index facbb456775a..3af1acba54d4 100644 --- a/src/ray/core_worker/transport/out_of_order_actor_submit_queue.h +++ b/src/ray/core_worker/task_submission/out_of_order_actor_submit_queue.h @@ -20,7 +20,7 @@ #include "absl/container/btree_map.h" #include "absl/types/optional.h" #include "ray/common/id.h" -#include "ray/core_worker/transport/actor_submit_queue.h" +#include "ray/core_worker/task_submission/actor_submit_queue.h" namespace ray { namespace core { @@ -34,7 +34,7 @@ namespace core { */ class OutofOrderActorSubmitQueue : 
public IActorSubmitQueue { public: - explicit OutofOrderActorSubmitQueue(ActorID actor_id); + OutofOrderActorSubmitQueue(); /// Add a task into the queue. void Emplace(uint64_t position, const TaskSpecification &spec) override; /// If a task exists. @@ -60,7 +60,6 @@ class OutofOrderActorSubmitQueue : public IActorSubmitQueue { bool Empty() override; private: - ActorID kActorId; absl::btree_map> pending_queue_; absl::btree_map> sending_queue_; }; diff --git a/src/ray/core_worker/transport/sequential_actor_submit_queue.cc b/src/ray/core_worker/task_submission/sequential_actor_submit_queue.cc similarity index 95% rename from src/ray/core_worker/transport/sequential_actor_submit_queue.cc rename to src/ray/core_worker/task_submission/sequential_actor_submit_queue.cc index e5c676e21258..773df5c22f6b 100644 --- a/src/ray/core_worker/transport/sequential_actor_submit_queue.cc +++ b/src/ray/core_worker/task_submission/sequential_actor_submit_queue.cc @@ -12,15 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/core_worker/transport/sequential_actor_submit_queue.h" +#include "ray/core_worker/task_submission/sequential_actor_submit_queue.h" #include #include namespace ray { namespace core { -SequentialActorSubmitQueue::SequentialActorSubmitQueue(ActorID actor_id) - : actor_id(actor_id) {} +SequentialActorSubmitQueue::SequentialActorSubmitQueue() {} void SequentialActorSubmitQueue::Emplace(uint64_t sequence_no, const TaskSpecification &spec) { diff --git a/src/ray/core_worker/transport/sequential_actor_submit_queue.h b/src/ray/core_worker/task_submission/sequential_actor_submit_queue.h similarity index 94% rename from src/ray/core_worker/transport/sequential_actor_submit_queue.h rename to src/ray/core_worker/task_submission/sequential_actor_submit_queue.h index 5559bed185d5..f54c7f9a75be 100644 --- a/src/ray/core_worker/transport/sequential_actor_submit_queue.h +++ b/src/ray/core_worker/task_submission/sequential_actor_submit_queue.h @@ -20,7 +20,7 @@ #include "absl/container/btree_map.h" #include "absl/types/optional.h" #include "ray/common/id.h" -#include "ray/core_worker/transport/actor_submit_queue.h" +#include "ray/core_worker/task_submission/actor_submit_queue.h" namespace ray { namespace core { @@ -31,7 +31,7 @@ namespace core { */ class SequentialActorSubmitQueue : public IActorSubmitQueue { public: - explicit SequentialActorSubmitQueue(ActorID actor_id); + SequentialActorSubmitQueue(); /// Add a task into the queue. void Emplace(uint64_t sequence_no, const TaskSpecification &task_spec) override; /// If a task exists. @@ -57,9 +57,6 @@ class SequentialActorSubmitQueue : public IActorSubmitQueue { bool Empty() override; private: - /// The ID of the actor. - ActorID actor_id; - /// The actor's pending requests, ordered by the sequence number in the request. /// The bool indicates whether the dependencies for that task have been resolved yet. /// A task will be sent after its dependencies are resolved. 
diff --git a/src/ray/core_worker/task_submission/tests/BUILD.bazel b/src/ray/core_worker/task_submission/tests/BUILD.bazel new file mode 100644 index 000000000000..b2cf928329af --- /dev/null +++ b/src/ray/core_worker/task_submission/tests/BUILD.bazel @@ -0,0 +1,81 @@ +load("//bazel:ray.bzl", "ray_cc_test") + +ray_cc_test( + name = "dependency_resolver_test", + size = "small", + srcs = ["dependency_resolver_test.cc"], + tags = ["team:core"], + deps = [ + "//:ray_mock", + "//src/ray/common:task_common", + "//src/ray/common:test_utils", + "//src/ray/core_worker:fake_actor_creator", + "//src/ray/core_worker/task_submission:dependency_resolver", + "@com_google_googletest//:gtest", + ], +) + +ray_cc_test( + name = "out_of_order_actor_submit_queue_test", + size = "small", + srcs = ["out_of_order_actor_submit_queue_test.cc"], + tags = ["team:core"], + deps = [ + "//src/ray/common:asio", + "//src/ray/core_worker/task_submission:out_of_order_actor_submit_queue", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "direct_actor_transport_test", + srcs = ["direct_actor_transport_test.cc"], + tags = ["team:core"], + deps = [ + "//:ray_mock", + "//src/ray/core_worker/task_submission:actor_task_submitter", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "actor_task_submitter_test", + srcs = ["actor_task_submitter_test.cc"], + tags = ["team:core"], + deps = [ + "//:ray_mock", + "//src/ray/common:asio", + "//src/ray/common:task_common", + "//src/ray/common:test_utils", + "//src/ray/core_worker:actor_creator", + "//src/ray/core_worker:fake_actor_creator", + "//src/ray/core_worker:reference_count", + "//src/ray/core_worker:task_manager", + "//src/ray/rpc:core_worker_client", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "normal_task_submitter_test", + size = "small", + srcs = 
["normal_task_submitter_test.cc"], + tags = ["team:core"], + deps = [ + "//:ray_fakes", + "//:ray_mock", + "//src/fakes/ray/rpc/raylet:fake_raylet_client", + "//src/ray/common:task_common", + "//src/ray/common:test_utils", + "//src/ray/core_worker:fake_actor_creator", + "//src/ray/core_worker:memory_store", + "//src/ray/core_worker/task_submission:normal_task_submitter", + "//src/ray/rpc:core_worker_client", + "//src/ray/rpc:raylet_client_interface", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/src/ray/core_worker/test/actor_task_submitter_test.cc b/src/ray/core_worker/task_submission/tests/actor_task_submitter_test.cc similarity index 92% rename from src/ray/core_worker/test/actor_task_submitter_test.cc rename to src/ray/core_worker/task_submission/tests/actor_task_submitter_test.cc index d4bd062a551e..0d49e7f8369b 100644 --- a/src/ray/core_worker/test/actor_task_submitter_test.cc +++ b/src/ray/core_worker/task_submission/tests/actor_task_submitter_test.cc @@ -12,17 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/core_worker/transport/actor_task_submitter.h" +#include "ray/core_worker/task_submission/actor_task_submitter.h" #include #include #include #include "gtest/gtest.h" -#include "mock/ray/core_worker/actor_creator.h" #include "mock/ray/core_worker/reference_count.h" #include "mock/ray/core_worker/task_manager_interface.h" -#include "ray/common/test_util.h" +#include "ray/common/test_utils.h" +#include "ray/core_worker/fake_actor_creator.h" #include "ray/rpc/worker/core_worker_client.h" namespace ray::core { @@ -85,11 +85,8 @@ class MockWorkerClient : public rpc::CoreWorkerClientInterface { class ActorTaskSubmitterTest : public ::testing::TestWithParam { public: ActorTaskSubmitterTest() - : client_pool_( - std::make_shared([&](const rpc::Address &addr) { - num_clients_connected_++; - return worker_client_; - })), + : client_pool_(std::make_shared( + [&](const rpc::Address &addr) { return worker_client_; })), worker_client_(std::make_shared()), store_(std::make_shared(io_context)), task_manager_(std::make_shared()), @@ -109,9 +106,8 @@ class ActorTaskSubmitterTest : public ::testing::TestWithParam { void TearDown() override { io_context.stop(); } - int num_clients_connected_ = 0; int64_t last_queue_warning_ = 0; - MockActorCreatorInterface actor_creator_; + FakeActorCreator actor_creator_; std::shared_ptr client_pool_; std::shared_ptr worker_client_; std::shared_ptr store_; @@ -135,7 +131,7 @@ TEST_P(ActorTaskSubmitterTest, TestSubmitTask) { /*owned*/ false); auto task1 = CreateActorTaskHelper(actor_id, worker_id, 0); - ASSERT_TRUE(submitter_.SubmitTask(task1).ok()); + submitter_.SubmitTask(task1); ASSERT_EQ(io_context.poll_one(), 1); ASSERT_EQ(worker_client_->callbacks.size(), 0); @@ -143,7 +139,7 @@ TEST_P(ActorTaskSubmitterTest, TestSubmitTask) { ASSERT_EQ(worker_client_->callbacks.size(), 1); auto task2 = CreateActorTaskHelper(actor_id, worker_id, 1); - ASSERT_TRUE(submitter_.SubmitTask(task2).ok()); + submitter_.SubmitTask(task2); 
ASSERT_EQ(io_context.poll_one(), 1); ASSERT_EQ(worker_client_->callbacks.size(), 2); @@ -176,7 +172,7 @@ TEST_P(ActorTaskSubmitterTest, TestQueueingWarning) { for (int i = 0; i < 7500; i++) { auto task = CreateActorTaskHelper(actor_id, worker_id, i); - ASSERT_TRUE(submitter_.SubmitTask(task).ok()); + submitter_.SubmitTask(task); ASSERT_EQ(io_context.poll_one(), 1); ASSERT_TRUE(worker_client_->ReplyPushTask(task.GetTaskAttempt(), Status::OK())); } @@ -184,7 +180,7 @@ TEST_P(ActorTaskSubmitterTest, TestQueueingWarning) { for (int i = 7500; i < 15000; i++) { auto task = CreateActorTaskHelper(actor_id, worker_id, i); - ASSERT_TRUE(submitter_.SubmitTask(task).ok()); + submitter_.SubmitTask(task); ASSERT_EQ(io_context.poll_one(), 1); /* no ack */ } @@ -192,7 +188,7 @@ TEST_P(ActorTaskSubmitterTest, TestQueueingWarning) { for (int i = 15000; i < 35000; i++) { auto task = CreateActorTaskHelper(actor_id, worker_id, i); - ASSERT_TRUE(submitter_.SubmitTask(task).ok()); + submitter_.SubmitTask(task); ASSERT_EQ(io_context.poll_one(), 1); /* no ack */ } @@ -225,9 +221,9 @@ TEST_P(ActorTaskSubmitterTest, TestDependencies) { // Neither task can be submitted yet because they are still waiting on // dependencies. - ASSERT_TRUE(submitter_.SubmitTask(task1).ok()); + submitter_.SubmitTask(task1); ASSERT_EQ(io_context.poll_one(), 1); - ASSERT_TRUE(submitter_.SubmitTask(task2).ok()); + submitter_.SubmitTask(task2); ASSERT_EQ(io_context.poll_one(), 1); ASSERT_EQ(worker_client_->callbacks.size(), 0); @@ -272,9 +268,9 @@ TEST_P(ActorTaskSubmitterTest, TestOutOfOrderDependencies) { // Neither task can be submitted yet because they are still waiting on // dependencies. 
- ASSERT_TRUE(submitter_.SubmitTask(task1).ok()); + submitter_.SubmitTask(task1); ASSERT_EQ(io_context.poll_one(), 1); - ASSERT_TRUE(submitter_.SubmitTask(task2).ok()); + submitter_.SubmitTask(task2); ASSERT_EQ(io_context.poll_one(), 1); ASSERT_EQ(worker_client_->callbacks.size(), 0); @@ -325,9 +321,9 @@ TEST_P(ActorTaskSubmitterTest, TestActorDead) { ObjectID obj = ObjectID::FromRandom(); auto task2 = CreateActorTaskHelper(actor_id, worker_id, 1); task2.GetMutableMessage().add_args()->mutable_object_ref()->set_object_id(obj.Binary()); - ASSERT_TRUE(submitter_.SubmitTask(task1).ok()); + submitter_.SubmitTask(task1); ASSERT_EQ(io_context.poll_one(), 1); - ASSERT_TRUE(submitter_.SubmitTask(task2).ok()); + submitter_.SubmitTask(task2); ASSERT_EQ(io_context.poll_one(), 1); ASSERT_EQ(worker_client_->callbacks.size(), 1); @@ -369,11 +365,11 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartNoRetry) { auto task3 = CreateActorTaskHelper(actor_id, worker_id, 2); auto task4 = CreateActorTaskHelper(actor_id, worker_id, 3); // Submit three tasks. - ASSERT_TRUE(submitter_.SubmitTask(task1).ok()); + submitter_.SubmitTask(task1); ASSERT_EQ(io_context.poll_one(), 1); - ASSERT_TRUE(submitter_.SubmitTask(task2).ok()); + submitter_.SubmitTask(task2); ASSERT_EQ(io_context.poll_one(), 1); - ASSERT_TRUE(submitter_.SubmitTask(task3).ok()); + submitter_.SubmitTask(task3); ASSERT_EQ(io_context.poll_one(), 1); EXPECT_CALL(*task_manager_, CompletePendingTask(task1.TaskId(), _, _, _)).Times(1); @@ -397,7 +393,7 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartNoRetry) { // Actor gets restarted. 
addr.set_port(1); submitter_.ConnectActor(actor_id, addr, 1); - ASSERT_TRUE(submitter_.SubmitTask(task4).ok()); + submitter_.SubmitTask(task4); ASSERT_EQ(io_context.poll_one(), 1); ASSERT_TRUE(worker_client_->ReplyPushTask(task4.GetTaskAttempt(), Status::OK())); ASSERT_TRUE(worker_client_->callbacks.empty()); @@ -426,11 +422,11 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartRetry) { auto task3 = CreateActorTaskHelper(actor_id, worker_id, 2); auto task4 = CreateActorTaskHelper(actor_id, worker_id, 3); // Submit three tasks. - ASSERT_TRUE(submitter_.SubmitTask(task1).ok()); + submitter_.SubmitTask(task1); ASSERT_EQ(io_context.poll_one(), 1); - ASSERT_TRUE(submitter_.SubmitTask(task2).ok()); + submitter_.SubmitTask(task2); ASSERT_EQ(io_context.poll_one(), 1); - ASSERT_TRUE(submitter_.SubmitTask(task3).ok()); + submitter_.SubmitTask(task3); ASSERT_EQ(io_context.poll_one(), 1); // All tasks will eventually finish. @@ -457,17 +453,17 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartRetry) { addr.set_port(1); submitter_.ConnectActor(actor_id, addr, 1); // A new task is submitted. - ASSERT_TRUE(submitter_.SubmitTask(task4).ok()); + submitter_.SubmitTask(task4); ASSERT_EQ(io_context.poll_one(), 1); // Tasks 2 and 3 get retried. In the real world, the seq_no of these two tasks should be // updated to 4 and 5 by `CoreWorker::InternalHeartbeat`. 
task2.GetMutableMessage().set_attempt_number(task2.AttemptNumber() + 1); task2.GetMutableMessage().mutable_actor_task_spec()->set_sequence_number(4); - ASSERT_TRUE(submitter_.SubmitTask(task2).ok()); + submitter_.SubmitTask(task2); ASSERT_EQ(io_context.poll_one(), 1); task3.GetMutableMessage().set_attempt_number(task2.AttemptNumber() + 1); task3.GetMutableMessage().mutable_actor_task_spec()->set_sequence_number(5); - ASSERT_TRUE(submitter_.SubmitTask(task3).ok()); + submitter_.SubmitTask(task3); ASSERT_EQ(io_context.poll_one(), 1); ASSERT_TRUE(worker_client_->ReplyPushTask(task4.GetTaskAttempt(), Status::OK())); ASSERT_TRUE(worker_client_->ReplyPushTask(task2.GetTaskAttempt(), Status::OK())); @@ -496,11 +492,11 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartOutOfOrderRetry) { auto task2 = CreateActorTaskHelper(actor_id, worker_id, 1); auto task3 = CreateActorTaskHelper(actor_id, worker_id, 2); // Submit three tasks. - ASSERT_TRUE(submitter_.SubmitTask(task1).ok()); + submitter_.SubmitTask(task1); ASSERT_EQ(io_context.poll_one(), 1); - ASSERT_TRUE(submitter_.SubmitTask(task2).ok()); + submitter_.SubmitTask(task2); ASSERT_EQ(io_context.poll_one(), 1); - ASSERT_TRUE(submitter_.SubmitTask(task3).ok()); + submitter_.SubmitTask(task3); ASSERT_EQ(io_context.poll_one(), 1); // All tasks will eventually finish. EXPECT_CALL(*task_manager_, CompletePendingTask(_, _, _, _)).Times(3); @@ -526,7 +522,7 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartOutOfOrderRetry) { // Retry task 2 manually (simulating task_manager and SendPendingTask's behavior) task2.GetMutableMessage().set_attempt_number(task2.AttemptNumber() + 1); task2.GetMutableMessage().mutable_actor_task_spec()->set_sequence_number(3); - ASSERT_TRUE(submitter_.SubmitTask(task2).ok()); + submitter_.SubmitTask(task2); ASSERT_EQ(io_context.poll_one(), 1); // Only task2 should be submitted. task 3 (completed) should not be retried. 
@@ -548,12 +544,11 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartOutOfOrderGcs) { addr.set_port(0); submitter_.ConnectActor(actor_id, addr, 0); ASSERT_EQ(worker_client_->callbacks.size(), 0); - ASSERT_EQ(num_clients_connected_, 1); // Create four tasks for the actor. auto task1 = CreateActorTaskHelper(actor_id, worker_id, 0); // Submit a task. - ASSERT_TRUE(submitter_.SubmitTask(task1).ok()); + submitter_.SubmitTask(task1); ASSERT_EQ(io_context.poll_one(), 1); EXPECT_CALL(*task_manager_, CompletePendingTask(task1.TaskId(), _, _, _)).Times(1); ASSERT_TRUE(worker_client_->ReplyPushTask(task1.GetTaskAttempt(), Status::OK())); @@ -561,10 +556,9 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartOutOfOrderGcs) { // Actor restarts, but we don't receive the disconnect message until later. addr.set_port(1); submitter_.ConnectActor(actor_id, addr, 1); - ASSERT_EQ(num_clients_connected_, 2); // Submit a task. auto task2 = CreateActorTaskHelper(actor_id, worker_id, 1); - ASSERT_TRUE(submitter_.SubmitTask(task2).ok()); + submitter_.SubmitTask(task2); ASSERT_EQ(io_context.poll_one(), 1); EXPECT_CALL(*task_manager_, CompletePendingTask(task2.TaskId(), _, _, _)).Times(1); ASSERT_TRUE(worker_client_->ReplyPushTask(task2.GetTaskAttempt(), Status::OK())); @@ -573,10 +567,9 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartOutOfOrderGcs) { const auto death_cause = CreateMockDeathCause(); submitter_.DisconnectActor( actor_id, 1, /*dead=*/false, death_cause, /*is_restartable=*/true); - ASSERT_EQ(num_clients_connected_, 2); // Submit a task. 
auto task3 = CreateActorTaskHelper(actor_id, worker_id, 2); - ASSERT_TRUE(submitter_.SubmitTask(task3).ok()); + submitter_.SubmitTask(task3); ASSERT_EQ(io_context.poll_one(), 1); EXPECT_CALL(*task_manager_, CompletePendingTask(task3.TaskId(), _, _, _)).Times(1); ASSERT_TRUE(worker_client_->ReplyPushTask(task3.GetTaskAttempt(), Status::OK())); @@ -584,10 +577,9 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartOutOfOrderGcs) { // The actor dies twice. We receive the last RESTART message first. submitter_.DisconnectActor( actor_id, 3, /*dead=*/false, death_cause, /*is_restartable=*/true); - ASSERT_EQ(num_clients_connected_, 2); // Submit a task. auto task4 = CreateActorTaskHelper(actor_id, worker_id, 3); - ASSERT_TRUE(submitter_.SubmitTask(task4).ok()); + submitter_.SubmitTask(task4); ASSERT_EQ(io_context.poll_one(), 1); // Tasks submitted when the actor is in RESTARTING state will fail immediately. // This happens in an io_service.post. Search `SendPendingTasks_ForceFail` to locate @@ -601,24 +593,21 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartOutOfOrderGcs) { submitter_.ConnectActor(actor_id, addr, 2); submitter_.DisconnectActor( actor_id, 2, /*dead=*/false, death_cause, /*is_restartable=*/true); - ASSERT_EQ(num_clients_connected_, 2); // The actor dies permanently. submitter_.DisconnectActor( actor_id, 3, /*dead=*/true, death_cause, /*is_restartable=*/false); - ASSERT_EQ(num_clients_connected_, 2); // We receive more late messages. Nothing happens because the actor is dead. submitter_.DisconnectActor( actor_id, 4, /*dead=*/false, death_cause, /*is_restartable=*/true); addr.set_port(3); submitter_.ConnectActor(actor_id, addr, 4); - ASSERT_EQ(num_clients_connected_, 2); // Submit a task. 
auto task5 = CreateActorTaskHelper(actor_id, worker_id, 4); EXPECT_CALL(*task_manager_, FailOrRetryPendingTask(task5.TaskId(), _, _, _, _, _)) .Times(1); - ASSERT_TRUE(submitter_.SubmitTask(task5).ok()); + submitter_.SubmitTask(task5); ASSERT_EQ(io_context.poll_one(), 0); } @@ -635,14 +624,13 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartFailInflightTasks) { /*owned*/ false); submitter_.ConnectActor(actor_id, actor_addr1, 0); ASSERT_EQ(worker_client_->callbacks.size(), 0); - ASSERT_EQ(num_clients_connected_, 1); // Create 3 tasks for the actor. auto task1_first_attempt = CreateActorTaskHelper(actor_id, caller_worker_id, 0); auto task2_first_attempt = CreateActorTaskHelper(actor_id, caller_worker_id, 1); auto task3_first_attempt = CreateActorTaskHelper(actor_id, caller_worker_id, 2); // Submit a task. - ASSERT_TRUE(submitter_.SubmitTask(task1_first_attempt).ok()); + submitter_.SubmitTask(task1_first_attempt); ASSERT_EQ(io_context.poll_one(), 1); EXPECT_CALL(*task_manager_, CompletePendingTask(task1_first_attempt.TaskId(), _, _, _)) .Times(1); @@ -651,9 +639,9 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartFailInflightTasks) { ASSERT_EQ(worker_client_->callbacks.size(), 0); // Submit 2 tasks. - ASSERT_TRUE(submitter_.SubmitTask(task2_first_attempt).ok()); + submitter_.SubmitTask(task2_first_attempt); ASSERT_EQ(io_context.poll_one(), 1); - ASSERT_TRUE(submitter_.SubmitTask(task3_first_attempt).ok()); + submitter_.SubmitTask(task3_first_attempt); ASSERT_EQ(io_context.poll_one(), 1); // Actor failed, but the task replies are delayed (or in some scenarios, lost). // We should still be able to fail the inflight tasks. 
@@ -681,9 +669,9 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartFailInflightTasks) { task3_first_attempt.TaskIdBinary()); task3_second_attempt.GetMutableMessage().set_attempt_number( task3_first_attempt.AttemptNumber() + 1); - ASSERT_TRUE(submitter_.SubmitTask(task2_second_attempt).ok()); + submitter_.SubmitTask(task2_second_attempt); ASSERT_EQ(io_context.poll_one(), 1); - ASSERT_TRUE(submitter_.SubmitTask(task3_second_attempt).ok()); + submitter_.SubmitTask(task3_second_attempt); ASSERT_EQ(io_context.poll_one(), 1); // Restart the actor. @@ -748,11 +736,10 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartFastFail) { addr.set_port(0); submitter_.ConnectActor(actor_id, addr, 0); ASSERT_EQ(worker_client_->callbacks.size(), 0); - ASSERT_EQ(num_clients_connected_, 1); auto task1 = CreateActorTaskHelper(actor_id, worker_id, 0); // Submit a task. - ASSERT_TRUE(submitter_.SubmitTask(task1).ok()); + submitter_.SubmitTask(task1); ASSERT_EQ(io_context.poll_one(), 1); EXPECT_CALL(*task_manager_, CompletePendingTask(task1.TaskId(), _, _, _)).Times(1); ASSERT_TRUE(worker_client_->ReplyPushTask(task1.GetTaskAttempt(), Status::OK())); @@ -764,7 +751,7 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartFastFail) { // Submit a new task. This task should fail immediately because "max_task_retries" is 0. 
auto task2 = CreateActorTaskHelper(actor_id, worker_id, 1); - ASSERT_TRUE(submitter_.SubmitTask(task2).ok()); + submitter_.SubmitTask(task2); ASSERT_EQ(io_context.poll_one(), 1); EXPECT_CALL(*task_manager_, CompletePendingTask(task2.TaskId(), _, _, _)).Times(0); EXPECT_CALL(*task_manager_, FailOrRetryPendingTask(task2.TaskId(), _, _, _, _, _)) @@ -792,7 +779,7 @@ TEST_P(ActorTaskSubmitterTest, TestPendingTasks) { ASSERT_FALSE(submitter_.PendingTasksFull(actor_id)); auto task = CreateActorTaskHelper(actor_id, worker_id, i); tasks.push_back(task); - ASSERT_TRUE(submitter_.SubmitTask(task).ok()); + submitter_.SubmitTask(task); ASSERT_EQ(io_context.poll_one(), 1); } @@ -811,13 +798,13 @@ TEST_P(ActorTaskSubmitterTest, TestPendingTasks) { // We can submit task 10, but after that the queue is full. auto task = CreateActorTaskHelper(actor_id, worker_id, 10); tasks.push_back(task); - ASSERT_TRUE(submitter_.SubmitTask(task).ok()); + submitter_.SubmitTask(task); ASSERT_EQ(io_context.poll_one(), 1); ASSERT_TRUE(submitter_.PendingTasksFull(actor_id)); // All the replies comes, the queue shouble be empty. - for (auto &task : tasks) { - ASSERT_TRUE(worker_client_->ReplyPushTask(task.GetTaskAttempt(), Status::OK())); + for (auto &task_spec : tasks) { + ASSERT_TRUE(worker_client_->ReplyPushTask(task_spec.GetTaskAttempt(), Status::OK())); } ASSERT_FALSE(submitter_.PendingTasksFull(actor_id)); } @@ -837,7 +824,7 @@ TEST_P(ActorTaskSubmitterTest, TestActorRestartResubmit) { // Generator is pushed to worker -> generator queued for resubmit -> comes back from // worker -> resubmit happens. 
auto task1 = CreateActorTaskHelper(actor_id, worker_id, 0); - ASSERT_TRUE(submitter_.SubmitTask(task1).ok()); + submitter_.SubmitTask(task1); io_context.run_one(); submitter_.ConnectActor(actor_id, addr, 0); ASSERT_EQ(worker_client_->callbacks.size(), 1); diff --git a/src/ray/core_worker/test/dependency_resolver_test.cc b/src/ray/core_worker/task_submission/tests/dependency_resolver_test.cc similarity index 91% rename from src/ray/core_worker/test/dependency_resolver_test.cc rename to src/ray/core_worker/task_submission/tests/dependency_resolver_test.cc index e36d721f53d4..e9766aec1281 100644 --- a/src/ray/core_worker/test/dependency_resolver_test.cc +++ b/src/ray/core_worker/task_submission/tests/dependency_resolver_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/core_worker/transport/dependency_resolver.h" +#include "ray/core_worker/task_submission/dependency_resolver.h" #include #include @@ -26,10 +26,8 @@ #include "mock/ray/core_worker/task_manager_interface.h" #include "ray/common/task/task_spec.h" #include "ray/common/task/task_util.h" -#include "ray/common/test_util.h" -#include "ray/core_worker/store_provider/memory_store/memory_store.h" -#include "ray/raylet_client/raylet_client.h" -#include "ray/rpc/worker/core_worker_client.h" +#include "ray/common/test_utils.h" +#include "ray/core_worker/fake_actor_creator.h" namespace ray { namespace core { @@ -137,49 +135,10 @@ class MockTaskManager : public MockTaskManagerInterface { int num_fail_pending_task_calls = 0; }; -class MockActorCreator : public ActorCreatorInterface { - public: - MockActorCreator() = default; - - Status RegisterActor(const TaskSpecification &task_spec) const override { - return Status::OK(); - }; - - void AsyncRegisterActor(const TaskSpecification &task_spec, - gcs::StatusCallback callback) override {} - - void AsyncCreateActor( - const TaskSpecification &task_spec, - const rpc::ClientCallback 
&callback) override {} - - void AsyncRestartActorForLineageReconstruction( - const ActorID &actor_id, - uint64_t num_restarts_due_to_lineage_reconstructions, - gcs::StatusCallback callback) override {} - - void AsyncReportActorOutOfScope(const ActorID &actor_id, - uint64_t num_restarts_due_to_lineage_reconstruction, - gcs::StatusCallback callback) override {} - - void AsyncWaitForActorRegisterFinish(const ActorID &, - gcs::StatusCallback callback) override { - callbacks.push_back(callback); - } - - [[nodiscard]] bool IsActorInRegistering(const ActorID &actor_id) const override { - return actor_pending; - } - - ~MockActorCreator() {} - - std::list callbacks; - bool actor_pending = false; -}; - TEST(LocalDependencyResolverTest, TestNoDependencies) { auto store = DefaultCoreWorkerMemoryStoreWithThread::Create(); auto task_manager = std::make_shared(); - MockActorCreator actor_creator; + FakeActorCreator actor_creator; LocalDependencyResolver resolver( *store, *task_manager, actor_creator, [](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; @@ -195,7 +154,7 @@ TEST(LocalDependencyResolverTest, TestActorAndObjectDependencies1) { // Actor dependency resolved first. auto store = DefaultCoreWorkerMemoryStoreWithThread::Create(); auto task_manager = std::make_shared(); - MockActorCreator actor_creator; + FakeActorCreator actor_creator; LocalDependencyResolver resolver( *store, *task_manager, actor_creator, [](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; @@ -240,7 +199,7 @@ TEST(LocalDependencyResolverTest, TestActorAndObjectDependencies2) { // Object dependency resolved first. 
auto store = DefaultCoreWorkerMemoryStoreWithThread::Create(); auto task_manager = std::make_shared(); - MockActorCreator actor_creator; + FakeActorCreator actor_creator; LocalDependencyResolver resolver( *store, *task_manager, actor_creator, [](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; @@ -284,7 +243,7 @@ TEST(LocalDependencyResolverTest, TestActorAndObjectDependencies2) { TEST(LocalDependencyResolverTest, TestHandlePlasmaPromotion) { auto store = DefaultCoreWorkerMemoryStoreWithThread::Create(); auto task_manager = std::make_shared(); - MockActorCreator actor_creator; + FakeActorCreator actor_creator; LocalDependencyResolver resolver( *store, *task_manager, actor_creator, [](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; @@ -314,7 +273,7 @@ TEST(LocalDependencyResolverTest, TestHandlePlasmaPromotion) { TEST(LocalDependencyResolverTest, TestInlineLocalDependencies) { auto store = DefaultCoreWorkerMemoryStoreWithThread::Create(); auto task_manager = std::make_shared(); - MockActorCreator actor_creator; + FakeActorCreator actor_creator; LocalDependencyResolver resolver( *store, *task_manager, actor_creator, [](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; @@ -348,7 +307,7 @@ TEST(LocalDependencyResolverTest, TestInlineLocalDependencies) { TEST(LocalDependencyResolverTest, TestInlinePendingDependencies) { auto store = DefaultCoreWorkerMemoryStoreWithThread::Create(); auto task_manager = std::make_shared(); - MockActorCreator actor_creator; + FakeActorCreator actor_creator; LocalDependencyResolver resolver( *store, *task_manager, actor_creator, [](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; @@ -386,7 +345,7 @@ TEST(LocalDependencyResolverTest, TestInlinePendingDependencies) { TEST(LocalDependencyResolverTest, TestInlinedObjectIds) { auto store = DefaultCoreWorkerMemoryStoreWithThread::Create(); auto task_manager = std::make_shared(); - MockActorCreator 
actor_creator; + FakeActorCreator actor_creator; LocalDependencyResolver resolver( *store, *task_manager, actor_creator, [](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; @@ -426,7 +385,7 @@ TEST(LocalDependencyResolverTest, TestCancelDependencyResolution) { InstrumentedIOContextWithThread io_context("TestCancelDependencyResolution"); auto store = std::make_shared(io_context.GetIoService()); auto task_manager = std::make_shared(); - MockActorCreator actor_creator; + FakeActorCreator actor_creator; LocalDependencyResolver resolver( *store, *task_manager, actor_creator, [](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; @@ -461,7 +420,7 @@ TEST(LocalDependencyResolverTest, TestCancelDependencyResolution) { TEST(LocalDependencyResolverTest, TestDependenciesAlreadyLocal) { auto store = DefaultCoreWorkerMemoryStoreWithThread::Create(); auto task_manager = std::make_shared(); - MockActorCreator actor_creator; + FakeActorCreator actor_creator; LocalDependencyResolver resolver( *store, *task_manager, actor_creator, [](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; @@ -497,7 +456,7 @@ TEST(LocalDependencyResolverTest, TestMixedTensorTransport) { // there will be performance regression in some edge cases. auto store = DefaultCoreWorkerMemoryStoreWithThread::Create(); auto task_manager = std::make_shared(); - MockActorCreator actor_creator; + FakeActorCreator actor_creator; // `obj1` is a GPU object, and `obj2` is a normal object. 
ObjectID obj1 = ObjectID::FromRandom(); diff --git a/src/ray/core_worker/test/direct_actor_transport_mock_test.cc b/src/ray/core_worker/task_submission/tests/direct_actor_transport_test.cc similarity index 93% rename from src/ray/core_worker/test/direct_actor_transport_mock_test.cc rename to src/ray/core_worker/task_submission/tests/direct_actor_transport_test.cc index 8ea082cfa779..a4d50e583a61 100644 --- a/src/ray/core_worker/test/direct_actor_transport_mock_test.cc +++ b/src/ray/core_worker/task_submission/tests/direct_actor_transport_test.cc @@ -11,20 +11,17 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/core_worker/transport/actor_task_submitter.h" -// clang-format off #include #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "ray/core_worker/actor_creator.h" -#include "mock/ray/core_worker/task_manager_interface.h" -#include "mock/ray/gcs/gcs_client/gcs_client.h" -#include "mock/ray/core_worker/reference_count.h" #include "mock/ray/core_worker/memory_store.h" - -// clang-format on +#include "mock/ray/core_worker/reference_count.h" +#include "mock/ray/core_worker/task_manager_interface.h" +#include "mock/ray/gcs_client/gcs_client.h" +#include "ray/core_worker/actor_creator.h" +#include "ray/core_worker/task_submission/actor_task_submitter.h" namespace ray { namespace core { @@ -36,7 +33,7 @@ class DirectTaskTransportTest : public ::testing::Test { void SetUp() override { gcs_client = std::make_shared(); - actor_creator = std::make_unique(gcs_client); + actor_creator = std::make_unique(gcs_client->Actors()); task_manager = std::make_shared(); client_pool = std::make_shared( @@ -75,7 +72,7 @@ class DirectTaskTransportTest : public ::testing::Test { protected: bool CheckSubmitTask(TaskSpecification task) { - EXPECT_TRUE(actor_task_submitter->SubmitTask(task).ok()); + 
actor_task_submitter->SubmitTask(task); return 1 == io_context.poll_one(); } @@ -86,7 +83,7 @@ class DirectTaskTransportTest : public ::testing::Test { std::shared_ptr client_pool; std::unique_ptr memory_store; std::shared_ptr task_manager; - std::unique_ptr actor_creator; + std::unique_ptr actor_creator; std::shared_ptr gcs_client; std::shared_ptr reference_counter; }; @@ -99,7 +96,7 @@ TEST_F(DirectTaskTransportTest, ActorCreationOk) { EXPECT_CALL(*gcs_client->mock_actor_accessor, AsyncCreateActor(creation_task_spec, ::testing::_)) .WillOnce(::testing::DoAll(::testing::SaveArg<1>(&create_cb))); - ASSERT_TRUE(actor_task_submitter->SubmitActorCreationTask(creation_task_spec).ok()); + actor_task_submitter->SubmitActorCreationTask(creation_task_spec); create_cb(Status::OK(), rpc::CreateActorReply()); } @@ -115,7 +112,7 @@ TEST_F(DirectTaskTransportTest, ActorCreationFail) { EXPECT_CALL(*gcs_client->mock_actor_accessor, AsyncCreateActor(creation_task_spec, ::testing::_)) .WillOnce(::testing::DoAll(::testing::SaveArg<1>(&create_cb))); - ASSERT_TRUE(actor_task_submitter->SubmitActorCreationTask(creation_task_spec).ok()); + actor_task_submitter->SubmitActorCreationTask(creation_task_spec); create_cb(Status::IOError(""), rpc::CreateActorReply()); } diff --git a/src/ray/core_worker/test/normal_task_submitter_test.cc b/src/ray/core_worker/task_submission/tests/normal_task_submitter_test.cc similarity index 87% rename from src/ray/core_worker/test/normal_task_submitter_test.cc rename to src/ray/core_worker/task_submission/tests/normal_task_submitter_test.cc index cb1ae15f7791..e3eeab9a0a3b 100644 --- a/src/ray/core_worker/test/normal_task_submitter_test.cc +++ b/src/ray/core_worker/task_submission/tests/normal_task_submitter_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/core_worker/transport/normal_task_submitter.h" +#include "ray/core_worker/task_submission/normal_task_submitter.h" #include #include @@ -28,10 +28,10 @@ #include "mock/ray/core_worker/task_manager_interface.h" #include "ray/common/task/task_spec.h" #include "ray/common/task/task_util.h" -#include "ray/common/test_util.h" -#include "ray/core_worker/core_worker.h" +#include "ray/common/test_utils.h" +#include "ray/core_worker/fake_actor_creator.h" #include "ray/core_worker/store_provider/memory_store/memory_store.h" -#include "ray/raylet_client/raylet_client.h" +#include "ray/rpc/raylet/raylet_client_interface.h" #include "ray/rpc/worker/core_worker_client.h" namespace ray { @@ -40,11 +40,10 @@ namespace { class DynamicRateLimiter : public LeaseRequestRateLimiter { public: - explicit DynamicRateLimiter(size_t limit) : limit(limit) {} - size_t GetMaxPendingLeaseRequestsPerSchedulingCategory() override { return limit; } + explicit DynamicRateLimiter(size_t limit) : limit_(limit) {} + size_t GetMaxPendingLeaseRequestsPerSchedulingCategory() override { return limit_; } - public: - size_t limit; + size_t limit_; }; // Wait (and halt the thread) until object_id appears in memory_store. 
@@ -225,11 +224,11 @@ class MockTaskManager : public MockTaskManagerInterface { class MockRayletClient : public FakeRayletClient { public: - Status ReturnWorker(int worker_port, - const WorkerID &worker_id, - bool disconnect_worker, - const std::string &disconnect_worker_error_detail, - bool worker_exiting) override { + void ReturnWorkerLease(int worker_port, + const LeaseID &lease_id, + bool disconnect_worker, + const std::string &disconnect_worker_error_detail, + bool worker_exiting) override { std::lock_guard lock(mu_); if (disconnect_worker) { num_workers_disconnected++; @@ -239,25 +238,24 @@ class MockRayletClient : public FakeRayletClient { num_workers_returned_exiting++; } } - return Status::OK(); } - void GetTaskFailureCause( - const TaskID &task_id, - const ray::rpc::ClientCallback &callback) + void GetWorkerFailureCause( + const LeaseID &lease_id, + const ray::rpc::ClientCallback &callback) override { std::lock_guard lock(mu_); get_task_failure_cause_callbacks.push_back(callback); num_get_task_failure_causes += 1; } - bool ReplyGetTaskFailureCause() { + bool ReplyGetWorkerFailureCause() { if (get_task_failure_cause_callbacks.size() == 0) { return false; } auto callback = std::move(get_task_failure_cause_callbacks.front()); get_task_failure_cause_callbacks.pop_front(); - rpc::GetTaskFailureCauseReply reply; + rpc::GetWorkerFailureCauseReply reply; callback(Status::OK(), std::move(reply)); return true; } @@ -270,14 +268,14 @@ class MockRayletClient : public FakeRayletClient { reported_backlogs.clear(); for (const auto &backlog_report : backlog_reports) { reported_backlog_size += backlog_report.backlog_size(); - const TaskSpecification resource_spec(backlog_report.resource_spec()); - const SchedulingClass scheduling_class = resource_spec.GetSchedulingClass(); + const LeaseSpecification lease_spec(backlog_report.lease_spec()); + const SchedulingClass scheduling_class = lease_spec.GetSchedulingClass(); reported_backlogs[scheduling_class] = 
backlog_report.backlog_size(); } } void RequestWorkerLease( - const rpc::TaskSpec &task_spec, + const rpc::LeaseSpec &lease_spec, bool grant_or_reject, const ray::rpc::ClientCallback &callback, const int64_t backlog_size, @@ -303,7 +301,7 @@ class MockRayletClient : public FakeRayletClient { } void CancelWorkerLease( - const TaskID &task_id, + const LeaseID &lease_id, const rpc::ClientCallback &callback) override { std::lock_guard lock(mu_); num_leases_canceled += 1; @@ -314,7 +312,8 @@ class MockRayletClient : public FakeRayletClient { bool GrantWorkerLease( const std::string &address, int port, - const NodeID &retry_at_raylet_id, + const NodeID &granted_node_id, + const NodeID &retry_at_node_id = NodeID::Nil(), bool cancel = false, std::string worker_id = WorkerID::FromRandom().Binary(), bool reject = false, @@ -326,14 +325,14 @@ class MockRayletClient : public FakeRayletClient { reply.set_failure_type(failure_type); } else if (reject) { reply.set_rejected(true); - } else if (!retry_at_raylet_id.IsNil()) { + } else if (!retry_at_node_id.IsNil()) { reply.mutable_retry_at_raylet_address()->set_ip_address(address); reply.mutable_retry_at_raylet_address()->set_port(port); - reply.mutable_retry_at_raylet_address()->set_raylet_id(retry_at_raylet_id.Binary()); + reply.mutable_retry_at_raylet_address()->set_node_id(retry_at_node_id.Binary()); } else { reply.mutable_worker_address()->set_ip_address(address); reply.mutable_worker_address()->set_port(port); - reply.mutable_worker_address()->set_raylet_id(retry_at_raylet_id.Binary()); + reply.mutable_worker_address()->set_node_id(granted_node_id.Binary()); reply.mutable_worker_address()->set_worker_id(worker_id); } rpc::ClientCallback callback = PopCallbackInLock(); @@ -402,56 +401,15 @@ class MockRayletClient : public FakeRayletClient { std::map reported_backlogs; std::list> callbacks = {}; std::list> cancel_callbacks = {}; - std::list> + std::list> get_task_failure_cause_callbacks = {}; }; -class MockActorCreator : public 
ActorCreatorInterface { - public: - MockActorCreator() {} - - Status RegisterActor(const TaskSpecification &task_spec) const override { - return Status::OK(); - }; - - void AsyncRegisterActor(const TaskSpecification &task_spec, - gcs::StatusCallback callback) override {} - - void AsyncRestartActorForLineageReconstruction( - const ActorID &actor_id, - uint64_t num_restarts_due_to_lineage_reconstructions, - gcs::StatusCallback callback) override {} - - void AsyncReportActorOutOfScope(const ActorID &actor_id, - uint64_t num_restarts_due_to_lineage_reconstruction, - gcs::StatusCallback callback) override {} - - void AsyncCreateActor( - const TaskSpecification &task_spec, - const rpc::ClientCallback &callback) override {} - - void AsyncWaitForActorRegisterFinish(const ActorID &, - gcs::StatusCallback callback) override { - callbacks.push_back(callback); - } - - [[nodiscard]] bool IsActorInRegistering(const ActorID &actor_id) const override { - return actor_pending; - } - - ~MockActorCreator() {} - - std::list callbacks; - bool actor_pending = false; -}; - class MockLeasePolicy : public LeasePolicyInterface { public: - void SetNodeID(NodeID node_id) { - fallback_rpc_address_.set_raylet_id(node_id.Binary()); - } + void SetNodeID(NodeID node_id) { fallback_rpc_address_.set_node_id(node_id.Binary()); } - std::pair GetBestNodeForTask(const TaskSpecification &spec) { + std::pair GetBestNodeForLease(const LeaseSpecification &spec) { num_lease_policy_consults++; return std::make_pair(fallback_rpc_address_, is_locality_aware); }; @@ -479,7 +437,8 @@ TaskSpecification WithRandomTaskId(const TaskSpecification &task_spec) { class NormalTaskSubmitterTest : public testing::Test { public: NormalTaskSubmitterTest() - : raylet_client_pool(std::make_shared( + : local_node_id(NodeID::FromRandom()), + raylet_client_pool(std::make_shared( [](const rpc::Address &) { return std::make_shared(); })), raylet_client(std::make_shared()), worker_client(std::make_shared()), @@ -487,9 +446,12 @@ 
class NormalTaskSubmitterTest : public testing::Test { client_pool(std::make_shared( [&](const rpc::Address &) { return worker_client; })), task_manager(std::make_unique()), - actor_creator(std::make_shared()), + actor_creator(std::make_shared()), lease_policy(std::make_unique()), - lease_policy_ptr(lease_policy.get()) {} + lease_policy_ptr(lease_policy.get()) { + address.set_node_id(local_node_id.Binary()); + lease_policy_ptr->SetNodeID(local_node_id); + } NormalTaskSubmitter CreateNormalTaskSubmitter( std::shared_ptr rate_limiter, @@ -497,16 +459,24 @@ class NormalTaskSubmitterTest : public testing::Test { std::function(const rpc::Address &)> raylet_client_factory = nullptr, std::shared_ptr custom_memory_store = nullptr, - int64_t lease_timeout_ms = kLongTimeout, - NodeID local_raylet_id = NodeID::Nil()) { + int64_t lease_timeout_ms = kLongTimeout) { if (custom_memory_store != nullptr) { store = custom_memory_store; } if (raylet_client_factory == nullptr) { raylet_client_pool = std::make_shared( - [](const rpc::Address &) { return std::make_shared(); }); + [this](const rpc::Address &) { return this->raylet_client; }); } else { - raylet_client_pool = std::make_shared(raylet_client_factory); + raylet_client_pool = std::make_shared( + [this, raylet_client_factory]( + const rpc::Address &addr) -> std::shared_ptr { + NodeID addr_node_id = NodeID::FromBinary(addr.node_id()); + if (addr_node_id == local_node_id) { + return this->raylet_client; + } else { + return raylet_client_factory(addr); + } + }); } return NormalTaskSubmitter( address, @@ -516,7 +486,7 @@ class NormalTaskSubmitterTest : public testing::Test { std::move(lease_policy), store, *task_manager, - local_raylet_id, + local_node_id, worker_type, lease_timeout_ms, actor_creator, @@ -526,6 +496,7 @@ class NormalTaskSubmitterTest : public testing::Test { boost::asio::steady_timer(io_context)); } + NodeID local_node_id; rpc::Address address; std::shared_ptr raylet_client_pool; std::shared_ptr raylet_client; @@ 
-533,7 +504,7 @@ class NormalTaskSubmitterTest : public testing::Test { std::shared_ptr store; std::shared_ptr client_pool; std::unique_ptr task_manager; - std::shared_ptr actor_creator; + std::shared_ptr actor_creator; // Note: Use lease_policy_ptr in tests, not lease_policy since it has to be moved into // the submitter. std::unique_ptr lease_policy; @@ -548,14 +519,14 @@ TEST_F(NormalTaskSubmitterTest, TestLocalityAwareSubmitOneTask) { TaskSpecification task = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); + submitter.SubmitTask(task); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 1); ASSERT_EQ(raylet_client->num_is_selected_based_on_locality_leases_requested, 1); ASSERT_EQ(raylet_client->num_workers_requested, 1); ASSERT_EQ(raylet_client->num_workers_returned, 0); ASSERT_EQ(worker_client->callbacks.size(), 0); - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, local_node_id)); ASSERT_EQ(worker_client->callbacks.size(), 1); ASSERT_EQ(task_manager->num_tasks_complete, 0); ASSERT_EQ(task_manager->num_tasks_failed, 0); @@ -579,14 +550,14 @@ TEST_F(NormalTaskSubmitterTest, TestSubmitOneTask) { CreateNormalTaskSubmitter(std::make_shared(1)); TaskSpecification task = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); + submitter.SubmitTask(task); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 1); ASSERT_EQ(raylet_client->num_is_selected_based_on_locality_leases_requested, 0); ASSERT_EQ(raylet_client->num_workers_requested, 1); ASSERT_EQ(raylet_client->num_workers_returned, 0); ASSERT_EQ(worker_client->callbacks.size(), 0); - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, local_node_id)); ASSERT_EQ(worker_client->callbacks.size(), 1); ASSERT_EQ(task_manager->num_tasks_complete, 0); 
ASSERT_EQ(task_manager->num_tasks_failed, 0); @@ -611,8 +582,8 @@ TEST_F(NormalTaskSubmitterTest, TestRetryTaskApplicationLevelError) { TaskSpecification task = BuildEmptyTaskSpec(); task.GetMutableMessage().set_retry_exceptions(true); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, NodeID::Nil())); + submitter.SubmitTask(task); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, local_node_id)); // Simulate an application-level error. ASSERT_TRUE(worker_client->ReplyPushTask(Status::OK(), false, true)); ASSERT_EQ(raylet_client->num_workers_returned, 1); @@ -625,8 +596,8 @@ TEST_F(NormalTaskSubmitterTest, TestRetryTaskApplicationLevelError) { task.GetMutableMessage().set_retry_exceptions(false); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, NodeID::Nil())); + submitter.SubmitTask(task); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, local_node_id)); // Simulate an application-level error. ASSERT_TRUE(worker_client->ReplyPushTask(Status::OK(), false, true)); ASSERT_EQ(raylet_client->num_workers_returned, 2); @@ -647,11 +618,11 @@ TEST_F(NormalTaskSubmitterTest, TestHandleTaskFailure) { CreateNormalTaskSubmitter(std::make_shared(1)); TaskSpecification task = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, NodeID::Nil())); + submitter.SubmitTask(task); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, local_node_id)); // Simulate a system failure, i.e., worker died unexpectedly. 
ASSERT_TRUE(worker_client->ReplyPushTask(Status::IOError("oops"))); - ASSERT_TRUE(raylet_client->ReplyGetTaskFailureCause()); + ASSERT_TRUE(raylet_client->ReplyGetWorkerFailureCause()); ASSERT_EQ(worker_client->callbacks.size(), 0); ASSERT_EQ(raylet_client->num_workers_returned, 0); ASSERT_EQ(raylet_client->num_workers_disconnected, 1); @@ -668,23 +639,23 @@ TEST_F(NormalTaskSubmitterTest, TestHandleTaskFailure) { TEST_F(NormalTaskSubmitterTest, TestCancellationWhileHandlingTaskFailure) { // This test is a regression test for a bug where a crash happens when - // the task cancellation races between ReplyPushTask and ReplyGetTaskFailureCause. + // the task cancellation races between ReplyPushTask and ReplyGetWorkerFailureCause. // For an example of a python integration test, see // https://github.com/ray-project/ray/blob/2b6807f4d9c4572e6309f57bc404aa641bc4b185/python/ray/tests/test_cancel.py#L35 auto submitter = CreateNormalTaskSubmitter(std::make_shared(1)); TaskSpecification task = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, NodeID::Nil())); + submitter.SubmitTask(task); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, local_node_id)); // Simulate a system failure, i.e., worker died unexpectedly so that - // GetTaskFailureCause is called. + // GetWorkerFailureCause is called. ASSERT_TRUE(worker_client->ReplyPushTask(Status::IOError("oops"))); - // Cancel the task while GetTaskFailureCause has not been completed. - ASSERT_TRUE(submitter.CancelTask(task, true, false).ok()); - // Completing the GetTaskFailureCause call. Check that the reply runs without error + // Cancel the task while GetWorkerFailureCause has not been completed. + submitter.CancelTask(task, true, false); + // Completing the GetWorkerFailureCause call. Check that the reply runs without error // and FailPendingTask is not called. 
- ASSERT_TRUE(raylet_client->ReplyGetTaskFailureCause()); + ASSERT_TRUE(raylet_client->ReplyGetWorkerFailureCause()); ASSERT_EQ(task_manager->num_fail_pending_task_calls, 0); } @@ -695,9 +666,9 @@ TEST_F(NormalTaskSubmitterTest, TestHandleUnschedulableTask) { TaskSpecification task2 = BuildEmptyTaskSpec(); TaskSpecification task3 = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task1).ok()); - ASSERT_TRUE(submitter.SubmitTask(task2).ok()); - ASSERT_TRUE(submitter.SubmitTask(task3).ok()); + submitter.SubmitTask(task1); + submitter.SubmitTask(task2); + submitter.SubmitTask(task3); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 2); ASSERT_EQ(raylet_client->num_workers_requested, 2); ASSERT_EQ(raylet_client->num_workers_returned, 0); @@ -708,6 +679,7 @@ TEST_F(NormalTaskSubmitterTest, TestHandleUnschedulableTask) { ASSERT_TRUE(raylet_client->GrantWorkerLease( "", 0, + local_node_id, NodeID::Nil(), true, "", @@ -722,6 +694,7 @@ TEST_F(NormalTaskSubmitterTest, TestHandleUnschedulableTask) { ASSERT_TRUE(raylet_client->GrantWorkerLease( "", 0, + local_node_id, NodeID::Nil(), true, "", @@ -745,9 +718,9 @@ TEST_F(NormalTaskSubmitterTest, TestHandleRuntimeEnvSetupFailed) { TaskSpecification task2 = BuildEmptyTaskSpec(); TaskSpecification task3 = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task1).ok()); - ASSERT_TRUE(submitter.SubmitTask(task2).ok()); - ASSERT_TRUE(submitter.SubmitTask(task3).ok()); + submitter.SubmitTask(task1); + submitter.SubmitTask(task2); + submitter.SubmitTask(task3); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 2); ASSERT_EQ(raylet_client->num_workers_requested, 2); ASSERT_EQ(raylet_client->num_workers_returned, 0); @@ -758,6 +731,7 @@ TEST_F(NormalTaskSubmitterTest, TestHandleRuntimeEnvSetupFailed) { ASSERT_TRUE(raylet_client->GrantWorkerLease( "", 0, + local_node_id, NodeID::Nil(), true, "", @@ -772,6 +746,7 @@ TEST_F(NormalTaskSubmitterTest, TestHandleRuntimeEnvSetupFailed) { 
ASSERT_TRUE(raylet_client->GrantWorkerLease( "", 0, + local_node_id, NodeID::Nil(), true, "", @@ -792,7 +767,7 @@ TEST_F(NormalTaskSubmitterTest, TestWorkerHandleLocalRayletDied) { CreateNormalTaskSubmitter(std::make_shared(2)); TaskSpecification task1 = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task1).ok()); + submitter.SubmitTask(task1); ASSERT_DEATH(raylet_client->FailWorkerLeaseDueToGrpcUnavailable(), ""); } @@ -804,9 +779,9 @@ TEST_F(NormalTaskSubmitterTest, TestDriverHandleLocalRayletDied) { TaskSpecification task2 = BuildEmptyTaskSpec(); TaskSpecification task3 = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task1).ok()); - ASSERT_TRUE(submitter.SubmitTask(task2).ok()); - ASSERT_TRUE(submitter.SubmitTask(task3).ok()); + submitter.SubmitTask(task1); + submitter.SubmitTask(task2); + submitter.SubmitTask(task3); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 2); ASSERT_EQ(raylet_client->num_workers_requested, 2); ASSERT_EQ(raylet_client->num_workers_returned, 0); @@ -839,7 +814,7 @@ TEST_F(NormalTaskSubmitterTest, TestConcurrentWorkerLeases) { for (int i = 0; i < 2 * concurrency; i++) { auto task = BuildEmptyTaskSpec(); tasks.push_back(task); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); + submitter.SubmitTask(task); } ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, concurrency); @@ -854,7 +829,7 @@ TEST_F(NormalTaskSubmitterTest, TestConcurrentWorkerLeases) { // Grant the first round of leases. 
for (int i = 0; i < concurrency; i++) { - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", i, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", i, local_node_id)); ASSERT_EQ(worker_client->callbacks.size(), i + 1); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, concurrency + i + 1); ASSERT_EQ(raylet_client->num_workers_requested, concurrency + i + 1); @@ -862,7 +837,7 @@ TEST_F(NormalTaskSubmitterTest, TestConcurrentWorkerLeases) { } for (int i = 0; i < concurrency; i++) { ASSERT_TRUE( - raylet_client->GrantWorkerLease("localhost", concurrency + i, NodeID::Nil())); + raylet_client->GrantWorkerLease("localhost", concurrency + i, local_node_id)); ASSERT_EQ(worker_client->callbacks.size(), concurrency + i + 1); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, tasks.size()); ASSERT_EQ(raylet_client->num_workers_requested, tasks.size()); @@ -895,7 +870,7 @@ TEST_F(NormalTaskSubmitterTest, TestConcurrentWorkerLeasesDynamic) { for (int i = 0; i < 2 * concurrency; i++) { auto task = BuildEmptyTaskSpec(); tasks.push_back(task); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); + submitter.SubmitTask(task); } ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 1); @@ -909,14 +884,14 @@ TEST_F(NormalTaskSubmitterTest, TestConcurrentWorkerLeasesDynamic) { ASSERT_EQ(raylet_client->reported_backlog_size, tasks.size() - 1); // Max concurrency is still 1. - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, local_node_id)); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 2); ASSERT_EQ(raylet_client->num_workers_requested, 2); ASSERT_EQ(raylet_client->reported_backlog_size, tasks.size() - 2); // Increase max concurrency. Should request leases up to the max concurrency. 
- rateLimiter->limit = concurrency; - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, NodeID::Nil())); + rateLimiter->limit_ = concurrency; + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, local_node_id)); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 2 + concurrency); ASSERT_EQ(raylet_client->num_workers_requested, 2 + concurrency); ASSERT_EQ(raylet_client->reported_backlog_size, @@ -925,9 +900,9 @@ TEST_F(NormalTaskSubmitterTest, TestConcurrentWorkerLeasesDynamic) { // Decrease max concurrency again. Should not request any more leases even as // previous requests are granted, since we are still over the current // concurrency. - rateLimiter->limit = 1; + rateLimiter->limit_ = 1; for (int i = 0; i < concurrency - 1; i++) { - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", i, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", i, local_node_id)); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 2 + concurrency); ASSERT_EQ(raylet_client->num_workers_requested, 2 + concurrency); ASSERT_EQ(raylet_client->reported_backlog_size, @@ -940,14 +915,14 @@ TEST_F(NormalTaskSubmitterTest, TestConcurrentWorkerLeasesDynamic) { raylet_client->num_workers_requested = 0; for (int i = 0; i < num_tasks_remaining; i++) { ASSERT_TRUE( - raylet_client->GrantWorkerLease("localhost", concurrency + i, NodeID::Nil())); + raylet_client->GrantWorkerLease("localhost", concurrency + i, local_node_id)); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, i + 1); ASSERT_EQ(raylet_client->num_workers_requested, i + 1); } lease_policy_ptr->num_lease_policy_consults = 0; raylet_client->num_workers_requested = 0; - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 2000, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 2000, local_node_id)); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 0); ASSERT_EQ(raylet_client->num_workers_requested, 0); @@ -980,7 +955,7 @@ 
TEST_F(NormalTaskSubmitterTest, TestConcurrentWorkerLeasesDynamicWithSpillback) for (int i = 0; i < 2 * concurrency; i++) { auto task = BuildEmptyTaskSpec(); tasks.push_back(task); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); + submitter.SubmitTask(task); } ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 1); @@ -994,15 +969,17 @@ TEST_F(NormalTaskSubmitterTest, TestConcurrentWorkerLeasesDynamicWithSpillback) ASSERT_EQ(raylet_client->reported_backlog_size, tasks.size() - 1); // Max concurrency is still 1. - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, local_node_id)); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 2); ASSERT_EQ(raylet_client->num_workers_requested, 2); ASSERT_EQ(raylet_client->reported_backlog_size, tasks.size() - 2); // Increase max concurrency. - rateLimiter->limit = concurrency; + rateLimiter->limit_ = concurrency; // The outstanding lease request is spilled back to a remote raylet. - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, NodeID::FromRandom())); + auto remote_node_id = NodeID::FromRandom(); + ASSERT_TRUE( + raylet_client->GrantWorkerLease("localhost", 1001, NodeID::Nil(), remote_node_id)); // We should request one lease request from the spillback raylet and then the // rest from the raylet returned by the lease policy. ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, concurrency + 1); @@ -1013,9 +990,9 @@ TEST_F(NormalTaskSubmitterTest, TestConcurrentWorkerLeasesDynamicWithSpillback) // Decrease max concurrency again. Should not request any more leases even as // previous requests are granted, since we are still over the current // concurrency. 
- rateLimiter->limit = 1; + rateLimiter->limit_ = 1; for (int i = 0; i < concurrency - 1; i++) { - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", i, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", i, local_node_id)); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, concurrency + 1); ASSERT_EQ(raylet_client->num_workers_requested, 2 + concurrency); ASSERT_EQ(raylet_client->reported_backlog_size, @@ -1028,14 +1005,14 @@ TEST_F(NormalTaskSubmitterTest, TestConcurrentWorkerLeasesDynamicWithSpillback) raylet_client->num_workers_requested = 0; for (int i = 0; i < num_tasks_remaining; i++) { ASSERT_TRUE( - raylet_client->GrantWorkerLease("localhost", concurrency + i, NodeID::Nil())); + raylet_client->GrantWorkerLease("localhost", concurrency + i, local_node_id)); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, i + 1); ASSERT_EQ(raylet_client->num_workers_requested, i + 1); } lease_policy_ptr->num_lease_policy_consults = 0; raylet_client->num_workers_requested = 0; - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 2000, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 2000, local_node_id)); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 0); ASSERT_EQ(raylet_client->num_workers_requested, 0); @@ -1063,29 +1040,29 @@ TEST_F(NormalTaskSubmitterTest, TestSubmitMultipleTasks) { TaskSpecification task2 = BuildEmptyTaskSpec(); TaskSpecification task3 = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task1).ok()); - ASSERT_TRUE(submitter.SubmitTask(task2).ok()); - ASSERT_TRUE(submitter.SubmitTask(task3).ok()); + submitter.SubmitTask(task1); + submitter.SubmitTask(task2); + submitter.SubmitTask(task3); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 1); ASSERT_EQ(raylet_client->num_workers_requested, 1); ASSERT_EQ(raylet_client->reported_backlog_size, 0); // Task 1 is pushed; worker 2 is requested. 
- ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, local_node_id)); ASSERT_EQ(worker_client->callbacks.size(), 1); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 2); ASSERT_EQ(raylet_client->num_workers_requested, 2); ASSERT_EQ(raylet_client->reported_backlog_size, 1); // Task 2 is pushed; worker 3 is requested. - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, local_node_id)); ASSERT_EQ(worker_client->callbacks.size(), 2); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 3); ASSERT_EQ(raylet_client->num_workers_requested, 3); ASSERT_EQ(raylet_client->reported_backlog_size, 0); // Task 3 is pushed; no more workers requested. - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1002, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1002, local_node_id)); ASSERT_EQ(worker_client->callbacks.size(), 3); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 3); ASSERT_EQ(raylet_client->num_workers_requested, 3); @@ -1114,14 +1091,14 @@ TEST_F(NormalTaskSubmitterTest, TestReuseWorkerLease) { TaskSpecification task2 = BuildEmptyTaskSpec(); TaskSpecification task3 = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task1).ok()); - ASSERT_TRUE(submitter.SubmitTask(task2).ok()); - ASSERT_TRUE(submitter.SubmitTask(task3).ok()); + submitter.SubmitTask(task1); + submitter.SubmitTask(task2); + submitter.SubmitTask(task3); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 1); ASSERT_EQ(raylet_client->num_workers_requested, 1); // Task 1 is pushed. 
- ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, local_node_id)); ASSERT_EQ(worker_client->callbacks.size(), 1); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 2); ASSERT_EQ(raylet_client->num_workers_requested, 2); @@ -1144,7 +1121,7 @@ TEST_F(NormalTaskSubmitterTest, TestReuseWorkerLease) { ASSERT_EQ(raylet_client->num_workers_returned, 1); // The second lease request is returned immediately. - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, local_node_id)); ASSERT_EQ(worker_client->callbacks.size(), 0); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 2); ASSERT_EQ(raylet_client->num_workers_returned, 2); @@ -1166,13 +1143,13 @@ TEST_F(NormalTaskSubmitterTest, TestRetryLeaseCancellation) { TaskSpecification task2 = BuildEmptyTaskSpec(); TaskSpecification task3 = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task1).ok()); - ASSERT_TRUE(submitter.SubmitTask(task2).ok()); - ASSERT_TRUE(submitter.SubmitTask(task3).ok()); + submitter.SubmitTask(task1); + submitter.SubmitTask(task2); + submitter.SubmitTask(task3); ASSERT_EQ(raylet_client->num_workers_requested, 1); // Task 1 is pushed. - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, local_node_id)); // Task 1 finishes, Task 2 is scheduled on the same worker. ASSERT_TRUE(worker_client->ReplyPushTask()); // Task 2 finishes, Task 3 is scheduled on the same worker. 
@@ -1194,7 +1171,8 @@ TEST_F(NormalTaskSubmitterTest, TestRetryLeaseCancellation) { ASSERT_EQ(raylet_client->num_leases_canceled, i); ASSERT_FALSE(raylet_client->ReplyCancelWorkerLease()); ASSERT_EQ(raylet_client->num_leases_canceled, i); - ASSERT_TRUE(raylet_client->GrantWorkerLease("", 0, NodeID::Nil(), /*cancel=*/true)); + ASSERT_TRUE(raylet_client->GrantWorkerLease( + "", 0, local_node_id, NodeID::Nil(), /*cancel=*/true)); ASSERT_EQ(worker_client->callbacks.size(), 0); // The canceled lease is not returned. ASSERT_EQ(raylet_client->num_workers_returned, 1); @@ -1214,11 +1192,11 @@ TEST_F(NormalTaskSubmitterTest, TestConcurrentCancellationAndSubmission) { TaskSpecification task2 = BuildEmptyTaskSpec(); TaskSpecification task3 = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task1).ok()); - ASSERT_TRUE(submitter.SubmitTask(task2).ok()); + submitter.SubmitTask(task1); + submitter.SubmitTask(task2); // Task 1 is pushed. - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, local_node_id)); ASSERT_EQ(raylet_client->num_workers_requested, 2); // Task 1 finishes, Task 2 is scheduled on the same worker. ASSERT_TRUE(worker_client->ReplyPushTask()); @@ -1231,17 +1209,18 @@ TEST_F(NormalTaskSubmitterTest, TestConcurrentCancellationAndSubmission) { ASSERT_EQ(raylet_client->num_workers_returned, 1); // Another task is submitted while task 2's lease request is being canceled. - ASSERT_TRUE(submitter.SubmitTask(task3).ok()); + submitter.SubmitTask(task3); ASSERT_EQ(raylet_client->num_workers_requested, 2); // Task 2's lease request is canceled, a new worker is requested for task 3. 
ASSERT_TRUE(raylet_client->ReplyCancelWorkerLease()); ASSERT_EQ(raylet_client->num_workers_requested, 2); - ASSERT_TRUE(raylet_client->GrantWorkerLease("", 0, NodeID::Nil(), /*cancel=*/true)); + ASSERT_TRUE(raylet_client->GrantWorkerLease( + "", 0, local_node_id, NodeID::Nil(), /*cancel=*/true)); ASSERT_EQ(raylet_client->num_workers_requested, 3); // Task 3 finishes, all workers returned. - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, local_node_id)); ASSERT_TRUE(worker_client->ReplyPushTask()); ASSERT_EQ(raylet_client->num_workers_returned, 2); ASSERT_FALSE(raylet_client->ReplyCancelWorkerLease()); @@ -1258,24 +1237,24 @@ TEST_F(NormalTaskSubmitterTest, TestWorkerNotReusedOnError) { TaskSpecification task1 = BuildEmptyTaskSpec(); TaskSpecification task2 = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task1).ok()); - ASSERT_TRUE(submitter.SubmitTask(task2).ok()); + submitter.SubmitTask(task1); + submitter.SubmitTask(task2); ASSERT_EQ(raylet_client->num_workers_requested, 1); // Task 1 is pushed. - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, local_node_id)); ASSERT_EQ(worker_client->callbacks.size(), 1); ASSERT_EQ(raylet_client->num_workers_requested, 2); // Task 1 finishes with failure; the worker is returned. ASSERT_TRUE(worker_client->ReplyPushTask(Status::IOError("worker dead"))); - ASSERT_TRUE(raylet_client->ReplyGetTaskFailureCause()); + ASSERT_TRUE(raylet_client->ReplyGetWorkerFailureCause()); ASSERT_EQ(worker_client->callbacks.size(), 0); ASSERT_EQ(raylet_client->num_workers_returned, 0); ASSERT_EQ(raylet_client->num_workers_disconnected, 1); // Task 2 runs successfully on the second worker. 
- ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, local_node_id)); ASSERT_TRUE(worker_client->ReplyPushTask()); ASSERT_EQ(raylet_client->num_workers_returned, 1); ASSERT_EQ(raylet_client->num_workers_disconnected, 1); @@ -1294,11 +1273,11 @@ TEST_F(NormalTaskSubmitterTest, TestWorkerNotReturnedOnExit) { CreateNormalTaskSubmitter(std::make_shared(1)); TaskSpecification task1 = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task1).ok()); + submitter.SubmitTask(task1); ASSERT_EQ(raylet_client->num_workers_requested, 1); // Task 1 is pushed. - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, local_node_id)); ASSERT_EQ(worker_client->callbacks.size(), 1); // Task 1 finishes with exit status; the worker is not returned. @@ -1330,7 +1309,7 @@ TEST_F(NormalTaskSubmitterTest, TestSpillback) { raylet_client_factory); TaskSpecification task = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); + submitter.SubmitTask(task); ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 1); ASSERT_EQ(raylet_client->num_workers_requested, 1); ASSERT_EQ(raylet_client->num_workers_returned, 0); @@ -1338,16 +1317,17 @@ TEST_F(NormalTaskSubmitterTest, TestSpillback) { ASSERT_EQ(remote_raylet_clients.size(), 0); // Spillback to a remote node. - auto remote_raylet_id = NodeID::FromRandom(); - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 7777, remote_raylet_id)); + auto remote_node_id = NodeID::FromRandom(); + ASSERT_TRUE( + raylet_client->GrantWorkerLease("localhost", 7777, NodeID::Nil(), remote_node_id)); ASSERT_EQ(remote_raylet_clients.count(7777), 1); // Confirm that lease policy is not consulted on spillback. ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 1); // There should be no more callbacks on the local client. 
- ASSERT_FALSE(raylet_client->GrantWorkerLease("remote", 1234, NodeID::Nil())); + ASSERT_FALSE(raylet_client->GrantWorkerLease("remote", 1234, local_node_id)); // Trigger retry at the remote node. ASSERT_TRUE( - remote_raylet_clients[7777]->GrantWorkerLease("remote", 1234, NodeID::Nil())); + remote_raylet_clients[7777]->GrantWorkerLease("remote", 1234, remote_node_id)); // The worker is returned to the remote node, not the local one. ASSERT_TRUE(worker_client->ReplyPushTask()); @@ -1378,19 +1358,16 @@ TEST_F(NormalTaskSubmitterTest, TestSpillbackRoundTrip) { remote_raylet_clients[addr.port()] = client; return client; }; - auto local_raylet_id = NodeID::FromRandom(); - lease_policy_ptr->SetNodeID(local_raylet_id); - auto store = DefaultCoreWorkerMemoryStoreWithThread::CreateShared(); + auto memory_store = DefaultCoreWorkerMemoryStoreWithThread::CreateShared(); auto submitter = CreateNormalTaskSubmitter(std::make_shared(1), WorkerType::WORKER, raylet_client_factory, - store, - kLongTimeout, - local_raylet_id); + memory_store, + kLongTimeout); TaskSpecification task = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); + submitter.SubmitTask(task); ASSERT_EQ(raylet_client->num_grant_or_reject_leases_requested, 0); ASSERT_EQ(raylet_client->num_workers_requested, 1); ASSERT_EQ(raylet_client->num_workers_returned, 0); @@ -1398,28 +1375,34 @@ TEST_F(NormalTaskSubmitterTest, TestSpillbackRoundTrip) { ASSERT_EQ(remote_raylet_clients.size(), 0); // Spillback to a remote node. 
- auto remote_raylet_id = NodeID::FromRandom(); - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 7777, remote_raylet_id)); + auto remote_node_id = NodeID::FromRandom(); + rpc::Address remote_address; + remote_address.set_node_id(remote_node_id.Binary()); + remote_address.set_ip_address("localhost"); + remote_address.set_port(7777); + raylet_client_pool->GetOrConnectByAddress(remote_address); + ASSERT_TRUE( + raylet_client->GrantWorkerLease("localhost", 7777, NodeID::Nil(), remote_node_id)); ASSERT_EQ(remote_raylet_clients.count(7777), 1); ASSERT_EQ(remote_raylet_clients[7777]->num_workers_requested, 1); // Confirm that the spillback lease request has grant_or_reject set to true. ASSERT_EQ(remote_raylet_clients[7777]->num_grant_or_reject_leases_requested, 1); // Confirm that lease policy is not consulted on spillback. ASSERT_EQ(lease_policy_ptr->num_lease_policy_consults, 1); - ASSERT_FALSE(raylet_client->GrantWorkerLease("remote", 1234, NodeID::Nil())); + ASSERT_FALSE(raylet_client->GrantWorkerLease("remote", 1234, local_node_id)); // Trigger a rejection back to the local node. ASSERT_TRUE(remote_raylet_clients[7777]->GrantWorkerLease( - "local", 1234, local_raylet_id, false, "", /*reject=*/true)); + "local", 1234, remote_node_id, NodeID::Nil(), false, "", /*reject=*/true)); // We should not have created another lease client to the local raylet. ASSERT_EQ(remote_raylet_clients.size(), 1); // There should be no more callbacks on the remote node. ASSERT_FALSE( - remote_raylet_clients[7777]->GrantWorkerLease("remote", 1234, NodeID::Nil())); + remote_raylet_clients[7777]->GrantWorkerLease("remote", 1234, remote_node_id)); // The worker is returned to the local node. 
ASSERT_EQ(raylet_client->num_grant_or_reject_leases_requested, 0); ASSERT_EQ(raylet_client->num_workers_requested, 2); - ASSERT_TRUE(raylet_client->GrantWorkerLease("local", 1234, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("local", 1234, local_node_id)); ASSERT_TRUE(worker_client->ReplyPushTask()); ASSERT_EQ(raylet_client->num_workers_returned, 1); ASSERT_EQ(remote_raylet_clients[7777]->num_workers_returned, 0); @@ -1446,23 +1429,27 @@ void TestSchedulingKey(const std::shared_ptr store, const TaskSpecification &same2, const TaskSpecification &different) { rpc::Address address; + auto local_node_id = NodeID::FromRandom(); auto raylet_client = std::make_shared(); + auto raylet_client_pool = std::make_shared( + [&](const rpc::Address &addr) { return raylet_client; }); auto worker_client = std::make_shared(); auto client_pool = std::make_shared( [&](const rpc::Address &addr) { return worker_client; }); auto task_manager = std::make_unique(); - auto actor_creator = std::make_shared(); + auto actor_creator = std::make_shared(); auto lease_policy = std::make_unique(); + lease_policy->SetNodeID(local_node_id); instrumented_io_context io_context; NormalTaskSubmitter submitter( address, raylet_client, client_pool, - nullptr, + raylet_client_pool, std::move(lease_policy), store, *task_manager, - NodeID::Nil(), + local_node_id, WorkerType::WORKER, kLongTimeout, actor_creator, @@ -1471,16 +1458,16 @@ void TestSchedulingKey(const std::shared_ptr store, [](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; }, boost::asio::steady_timer(io_context)); - ASSERT_TRUE(submitter.SubmitTask(same1).ok()); - ASSERT_TRUE(submitter.SubmitTask(same2).ok()); - ASSERT_TRUE(submitter.SubmitTask(different).ok()); + submitter.SubmitTask(same1); + submitter.SubmitTask(same2); + submitter.SubmitTask(different); WaitForCondition( [&raylet_client]() { return raylet_client->num_workers_returned == 2; }, /*timeout_ms=*/1000); // same1 is pushed. 
- ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, local_node_id)); ASSERT_EQ(worker_client->callbacks.size(), 1); // Another worker is requested because same2 is pending. ASSERT_EQ(raylet_client->num_workers_requested, 3); @@ -1496,7 +1483,7 @@ void TestSchedulingKey(const std::shared_ptr store, ASSERT_TRUE(raylet_client->ReplyCancelWorkerLease()); // different is pushed. - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, local_node_id)); ASSERT_EQ(worker_client->callbacks.size(), 2); ASSERT_EQ(raylet_client->num_workers_requested, 3); @@ -1513,7 +1500,8 @@ void TestSchedulingKey(const std::shared_ptr store, ASSERT_EQ(raylet_client->num_leases_canceled, 1); // Trigger reply to RequestWorkerLease to remove the canceled pending lease request - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1002, NodeID::Nil(), true)); + ASSERT_TRUE(raylet_client->GrantWorkerLease( + "localhost", 1002, local_node_id, NodeID::Nil(), true)); ASSERT_EQ(raylet_client->num_workers_returned, 2); // Check that there are no entries left in the scheduling_key_entries_ hashmap. These @@ -1523,7 +1511,7 @@ void TestSchedulingKey(const std::shared_ptr store, TEST(NormalTaskSubmitterSchedulingKeyTest, TestSchedulingKeys) { InstrumentedIOContextWithThread io_context("TestSchedulingKeys"); - auto store = std::make_shared(io_context.GetIoService()); + auto memory_store = std::make_shared(io_context.GetIoService()); std::unordered_map resources1({{"a", 1.0}}); std::unordered_map resources2({{"b", 2.0}}); @@ -1534,28 +1522,28 @@ TEST(NormalTaskSubmitterSchedulingKeyTest, TestSchedulingKeys) { // Tasks with different resources should request different worker leases. 
RAY_LOG(INFO) << "Test different resources"; - TestSchedulingKey(store, + TestSchedulingKey(memory_store, BuildTaskSpec(resources1, descriptor1), BuildTaskSpec(resources1, descriptor1), BuildTaskSpec(resources2, descriptor1)); // Tasks with different functions should request different worker leases. RAY_LOG(INFO) << "Test different functions"; - TestSchedulingKey(store, + TestSchedulingKey(memory_store, BuildTaskSpec(resources1, descriptor1), BuildTaskSpec(resources1, descriptor1), BuildTaskSpec(resources1, descriptor2)); // Tasks with different depths should request different worker leases. RAY_LOG(INFO) << "Test different depths"; - TestSchedulingKey(store, + TestSchedulingKey(memory_store, BuildTaskSpec(resources1, descriptor1, 0), BuildTaskSpec(resources1, descriptor1, 0), BuildTaskSpec(resources1, descriptor1, 1)); // Tasks with different runtime envs do not request different workers. RAY_LOG(INFO) << "Test different runtimes"; - TestSchedulingKey(store, + TestSchedulingKey(memory_store, BuildTaskSpec(resources1, descriptor1, 0, "a"), BuildTaskSpec(resources1, descriptor1, 0, "b"), BuildTaskSpec(resources1, descriptor1, 1, "a")); @@ -1566,16 +1554,16 @@ TEST(NormalTaskSubmitterSchedulingKeyTest, TestSchedulingKeys) { ObjectID plasma2 = ObjectID::FromRandom(); // Ensure the data is already present in the local store for direct call objects. auto data = GenerateRandomObject(); - store->Put(*data, direct1); - store->Put(*data, direct2); + memory_store->Put(*data, direct1); + memory_store->Put(*data, direct2); // Force plasma objects to be promoted. 
std::string meta = std::to_string(static_cast(rpc::ErrorType::OBJECT_IN_PLASMA)); auto metadata = const_cast(reinterpret_cast(meta.data())); auto meta_buffer = std::make_shared(metadata, meta.size()); auto plasma_data = RayObject(nullptr, meta_buffer, std::vector()); - store->Put(plasma_data, plasma1); - store->Put(plasma_data, plasma2); + memory_store->Put(plasma_data, plasma1); + memory_store->Put(plasma_data, plasma2); TaskSpecification same_deps_1 = BuildTaskSpec(resources1, descriptor1); same_deps_1.GetMutableMessage().add_args()->mutable_object_ref()->set_object_id( @@ -1601,17 +1589,18 @@ TEST(NormalTaskSubmitterSchedulingKeyTest, TestSchedulingKeys) { // Tasks with different plasma dependencies should request different worker leases, // but direct call dependencies shouldn't be considered. RAY_LOG(INFO) << "Test different dependencies"; - TestSchedulingKey(store, same_deps_1, same_deps_2, different_deps); + TestSchedulingKey(memory_store, same_deps_1, same_deps_2, different_deps); } TEST_F(NormalTaskSubmitterTest, TestBacklogReport) { InstrumentedIOContextWithThread store_io_context("TestBacklogReport"); - auto store = std::make_shared(store_io_context.GetIoService()); + auto memory_store = + std::make_shared(store_io_context.GetIoService()); auto submitter = CreateNormalTaskSubmitter(std::make_shared(1), WorkerType::WORKER, /*raylet_client_factory=*/nullptr, - store); + memory_store); TaskSpecification task1 = BuildEmptyTaskSpec(); @@ -1628,8 +1617,8 @@ TEST_F(NormalTaskSubmitterTest, TestBacklogReport) { auto metadata = const_cast(reinterpret_cast(meta.data())); auto meta_buffer = std::make_shared(metadata, meta.size()); auto plasma_data = RayObject(nullptr, meta_buffer, std::vector()); - store->Put(plasma_data, plasma1); - store->Put(plasma_data, plasma2); + memory_store->Put(plasma_data, plasma1); + memory_store->Put(plasma_data, plasma2); // Same SchedulingClass, different SchedulingKey TaskSpecification task2 = BuildTaskSpec(resources1, descriptor1); 
@@ -1638,18 +1627,19 @@ TEST_F(NormalTaskSubmitterTest, TestBacklogReport) { TaskSpecification task3 = BuildTaskSpec(resources1, descriptor1); task3.GetMutableMessage().add_args()->mutable_object_ref()->set_object_id( plasma2.Binary()); - TestSchedulingKey(store, WithRandomTaskId(task2), WithRandomTaskId(task2), task3); + TestSchedulingKey( + memory_store, WithRandomTaskId(task2), WithRandomTaskId(task2), task3); TaskSpecification task4 = BuildTaskSpec(resources2, descriptor2); - ASSERT_TRUE(submitter.SubmitTask(task1).ok()); + submitter.SubmitTask(task1); // One is requested and one is in the backlog for each SchedulingKey - ASSERT_TRUE(submitter.SubmitTask(WithRandomTaskId(task2)).ok()); - ASSERT_TRUE(submitter.SubmitTask(WithRandomTaskId(task2)).ok()); - ASSERT_TRUE(submitter.SubmitTask(WithRandomTaskId(task3)).ok()); - ASSERT_TRUE(submitter.SubmitTask(WithRandomTaskId(task3)).ok()); - ASSERT_TRUE(submitter.SubmitTask(WithRandomTaskId(task4)).ok()); - ASSERT_TRUE(submitter.SubmitTask(WithRandomTaskId(task4)).ok()); + submitter.SubmitTask(WithRandomTaskId(task2)); + submitter.SubmitTask(WithRandomTaskId(task2)); + submitter.SubmitTask(WithRandomTaskId(task3)); + submitter.SubmitTask(WithRandomTaskId(task3)); + submitter.SubmitTask(WithRandomTaskId(task4)); + submitter.SubmitTask(WithRandomTaskId(task4)); // Waits for the async callbacks in submitter.SubmitTask to finish, before we call // ReportWorkerBacklog. 
@@ -1667,36 +1657,36 @@ TEST_F(NormalTaskSubmitterTest, TestBacklogReport) { } TEST_F(NormalTaskSubmitterTest, TestWorkerLeaseTimeout) { - auto store = DefaultCoreWorkerMemoryStoreWithThread::CreateShared(); + auto memory_store = DefaultCoreWorkerMemoryStoreWithThread::CreateShared(); auto submitter = CreateNormalTaskSubmitter(std::make_shared(1), WorkerType::WORKER, /*raylet_client_factory=*/nullptr, - store, + memory_store, /*lease_timeout_ms=*/5); TaskSpecification task1 = BuildEmptyTaskSpec(); TaskSpecification task2 = BuildEmptyTaskSpec(); TaskSpecification task3 = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task1).ok()); - ASSERT_TRUE(submitter.SubmitTask(task2).ok()); - ASSERT_TRUE(submitter.SubmitTask(task3).ok()); + submitter.SubmitTask(task1); + submitter.SubmitTask(task2); + submitter.SubmitTask(task3); ASSERT_EQ(raylet_client->num_workers_requested, 1); // Task 1 is pushed. - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, local_node_id)); ASSERT_EQ(raylet_client->num_workers_requested, 2); // Task 1 finishes with failure; the worker is returned due to the error even though // it hasn't timed out. ASSERT_TRUE(worker_client->ReplyPushTask(Status::IOError("worker dead"))); - ASSERT_TRUE(raylet_client->ReplyGetTaskFailureCause()); + ASSERT_TRUE(raylet_client->ReplyGetWorkerFailureCause()); ASSERT_EQ(raylet_client->num_workers_returned, 0); ASSERT_EQ(raylet_client->num_workers_disconnected, 1); // Task 2 runs successfully on the second worker; the worker is returned due to the // timeout. - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, local_node_id)); std::this_thread::sleep_for( std::chrono::milliseconds(10)); // Sleep for 10ms, causing the lease to time out. 
ASSERT_TRUE(worker_client->ReplyPushTask()); @@ -1705,7 +1695,7 @@ TEST_F(NormalTaskSubmitterTest, TestWorkerLeaseTimeout) { // Task 3 runs successfully on the third worker; the worker is returned even though it // hasn't timed out. - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1002, NodeID::Nil())); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1002, local_node_id)); ASSERT_TRUE(worker_client->ReplyPushTask()); ASSERT_EQ(worker_client->callbacks.size(), 0); ASSERT_EQ(raylet_client->num_workers_returned, 2); @@ -1723,14 +1713,14 @@ TEST_F(NormalTaskSubmitterTest, TestKillExecutingTask) { CreateNormalTaskSubmitter(std::make_shared(1)); TaskSpecification task = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, NodeID::Nil())); + submitter.SubmitTask(task); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, local_node_id)); // Try force kill, exiting the worker - ASSERT_TRUE(submitter.CancelTask(task, true, false).ok()); + submitter.CancelTask(task, true, false); ASSERT_EQ(worker_client->kill_requests.front().intended_task_id(), task.TaskIdBinary()); ASSERT_TRUE(worker_client->ReplyPushTask(Status::IOError("workerdying"), true)); - ASSERT_TRUE(raylet_client->ReplyGetTaskFailureCause()); + ASSERT_TRUE(raylet_client->ReplyGetWorkerFailureCause()); ASSERT_EQ(worker_client->callbacks.size(), 0); ASSERT_EQ(raylet_client->num_workers_returned, 0); ASSERT_EQ(raylet_client->num_workers_returned_exiting, 0); @@ -1740,11 +1730,11 @@ TEST_F(NormalTaskSubmitterTest, TestKillExecutingTask) { task.GetMutableMessage().set_task_id( TaskID::ForNormalTask(JobID::Nil(), TaskID::Nil(), 1).Binary()); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, NodeID::Nil())); + submitter.SubmitTask(task); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, local_node_id)); // Try non-force kill, 
worker returns normally - ASSERT_TRUE(submitter.CancelTask(task, false, false).ok()); + submitter.CancelTask(task, false, false); ASSERT_TRUE(worker_client->ReplyPushTask()); ASSERT_EQ(worker_client->kill_requests.front().intended_task_id(), task.TaskIdBinary()); ASSERT_EQ(worker_client->callbacks.size(), 0); @@ -1764,8 +1754,8 @@ TEST_F(NormalTaskSubmitterTest, TestKillPendingTask) { CreateNormalTaskSubmitter(std::make_shared(1)); TaskSpecification task = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); - ASSERT_TRUE(submitter.CancelTask(task, true, false).ok()); + submitter.SubmitTask(task); + submitter.CancelTask(task, true, false); ASSERT_EQ(worker_client->kill_requests.size(), 0); ASSERT_EQ(worker_client->callbacks.size(), 0); ASSERT_EQ(raylet_client->num_workers_returned, 0); @@ -1777,7 +1767,8 @@ TEST_F(NormalTaskSubmitterTest, TestKillPendingTask) { ASSERT_TRUE(raylet_client->ReplyCancelWorkerLease()); // Trigger reply to RequestWorkerLease to remove the canceled pending lease request - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, NodeID::Nil(), true)); + ASSERT_TRUE(raylet_client->GrantWorkerLease( + "localhost", 1000, local_node_id, NodeID::Nil(), true)); // Check that there are no entries left in the scheduling_key_entries_ hashmap. These // would otherwise cause a memory leak. 
@@ -1790,9 +1781,9 @@ TEST_F(NormalTaskSubmitterTest, TestKillResolvingTask) { TaskSpecification task = BuildEmptyTaskSpec(); ObjectID obj1 = ObjectID::FromRandom(); task.GetMutableMessage().add_args()->mutable_object_ref()->set_object_id(obj1.Binary()); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); + submitter.SubmitTask(task); ASSERT_EQ(task_manager->num_inlined_dependencies, 0); - ASSERT_TRUE(submitter.CancelTask(task, true, false).ok()); + submitter.CancelTask(task, true, false); auto data = GenerateRandomObject(); store->Put(*data, obj1); WaitForObjectIdInMemoryStore(*store, obj1); @@ -1813,8 +1804,8 @@ TEST_F(NormalTaskSubmitterTest, TestQueueGeneratorForResubmit) { auto submitter = CreateNormalTaskSubmitter(std::make_shared(1)); TaskSpecification task = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, NodeID::Nil())); + submitter.SubmitTask(task); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, local_node_id)); ASSERT_TRUE(submitter.QueueGeneratorForResubmit(task)); ASSERT_TRUE(worker_client->ReplyPushTask()); ASSERT_EQ(task_manager->num_tasks_complete, 0); @@ -1828,9 +1819,9 @@ TEST_F(NormalTaskSubmitterTest, TestCancelBeforeAfterQueueGeneratorForResubmit) auto submitter = CreateNormalTaskSubmitter(std::make_shared(1)); TaskSpecification task = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task).ok()); - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, NodeID::Nil())); - ASSERT_TRUE(submitter.CancelTask(task, /*force_kill=*/false, /*recursive=*/true).ok()); + submitter.SubmitTask(task); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, local_node_id)); + submitter.CancelTask(task, /*force_kill=*/false, /*recursive=*/true); ASSERT_FALSE(submitter.QueueGeneratorForResubmit(task)); worker_client->ReplyCancelTask(); ASSERT_TRUE(submitter.QueueGeneratorForResubmit(task)); @@ -1845,10 +1836,10 @@ 
TEST_F(NormalTaskSubmitterTest, TestCancelBeforeAfterQueueGeneratorForResubmit) // Succesful queue generator for resubmit -> cancel -> successful execution -> no // resubmit. TaskSpecification task2 = BuildEmptyTaskSpec(); - ASSERT_TRUE(submitter.SubmitTask(task2).ok()); - ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, NodeID::Nil())); + submitter.SubmitTask(task2); + ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, local_node_id)); ASSERT_TRUE(submitter.QueueGeneratorForResubmit(task2)); - ASSERT_TRUE(submitter.CancelTask(task2, /*force_kill=*/false, /*recursive=*/true).ok()); + submitter.CancelTask(task2, /*force_kill=*/false, /*recursive=*/true); ASSERT_TRUE(worker_client->ReplyPushTask()); worker_client->ReplyCancelTask(Status::OK(), /*attempt_succeeded=*/true, diff --git a/src/ray/core_worker/test/actor_submit_queue_test.cc b/src/ray/core_worker/task_submission/tests/out_of_order_actor_submit_queue_test.cc similarity index 95% rename from src/ray/core_worker/test/actor_submit_queue_test.cc rename to src/ray/core_worker/task_submission/tests/out_of_order_actor_submit_queue_test.cc index d12f38ebd0a9..bbaefd7b780f 100644 --- a/src/ray/core_worker/test/actor_submit_queue_test.cc +++ b/src/ray/core_worker/task_submission/tests/out_of_order_actor_submit_queue_test.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "ray/core_worker/task_submission/out_of_order_actor_submit_queue.h" + #include #include #include #include "gtest/gtest.h" -#include "ray/common/test_util.h" -#include "ray/core_worker/transport/out_of_order_actor_submit_queue.h" namespace ray { namespace core { @@ -35,7 +35,7 @@ TaskSpecification BuildTaskSpec(uint64_t seq) { } // namespace TEST(OutofOrderActorSubmitQueueTest, PassThroughTest) { - OutofOrderActorSubmitQueue queue(ActorID{}); + OutofOrderActorSubmitQueue queue; // insert request 0 1 2 3 4 std::vector task_ids; for (uint64_t i = 0; i < 5; i++) { diff --git a/src/ray/core_worker/test/BUILD.bazel b/src/ray/core_worker/tests/BUILD.bazel similarity index 64% rename from src/ray/core_worker/test/BUILD.bazel rename to src/ray/core_worker/tests/BUILD.bazel index c2b327301160..c4366aa3297c 100644 --- a/src/ray/core_worker/test/BUILD.bazel +++ b/src/ray/core_worker/tests/BUILD.bazel @@ -12,16 +12,12 @@ ray_cc_test( ) ray_cc_test( - name = "memory_store_test", - size = "small", - srcs = ["memory_store_test.cc"], + name = "shutdown_coordinator_test", + size = "medium", + srcs = ["shutdown_coordinator_test.cc"], tags = ["team:core"], deps = [ - "//:ray_mock", - "//src/ray/common:status", - "//src/ray/common:status_or", - "//src/ray/common:test_util", - "//src/ray/core_worker:memory_store", + "//src/ray/core_worker:shutdown_coordinator", "@com_google_absl//absl/synchronization", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", @@ -29,70 +25,17 @@ ray_cc_test( ) ray_cc_test( - name = "actor_task_submitter_test", - srcs = ["actor_task_submitter_test.cc"], - tags = ["team:core"], - deps = [ - "//:ray_mock", - "//src/ray/common:asio", - "//src/ray/common:task_common", - "//src/ray/common:test_util", - "//src/ray/core_worker:actor_creator", - "//src/ray/core_worker:reference_count", - "//src/ray/core_worker:task_manager", - "//src/ray/rpc:core_worker_client", - "@com_google_googletest//:gtest", - "@com_google_googletest//:gtest_main", 
- ], -) - -ray_cc_test( - name = "direct_actor_transport_mock_test", - srcs = ["direct_actor_transport_mock_test.cc"], - tags = ["team:core"], - deps = [ - "//:ray_mock", - "//src/ray/core_worker:memory_store", - "//src/ray/core_worker:reference_count", - "//src/ray/core_worker:task_manager", - "//src/ray/gcs/gcs_client:gcs_client_lib", - "@com_google_googletest//:gtest", - "@com_google_googletest//:gtest_main", - ], -) - -ray_cc_test( - name = "dependency_resolver_test", - size = "small", - srcs = ["dependency_resolver_test.cc"], - tags = ["team:core"], - deps = [ - "//:ray_mock", - "//src/ray/common:task_common", - "//src/ray/common:test_util", - "//src/ray/core_worker:dependency_resolver", - "//src/ray/core_worker:memory_store", - "//src/ray/raylet_client:raylet_client_lib", - "//src/ray/rpc:core_worker_client", - "@com_google_googletest//:gtest", - "@com_google_googletest//:gtest_main", - ], -) - -ray_cc_test( - name = "normal_task_submitter_test", + name = "memory_store_test", size = "small", - srcs = ["normal_task_submitter_test.cc"], + srcs = ["memory_store_test.cc"], tags = ["team:core"], deps = [ - "//:ray_fakes", "//:ray_mock", - "//src/ray/common:task_common", - "//src/ray/common:test_util", - "//src/ray/core_worker:core_worker_lib", + "//src/ray/common:status", + "//src/ray/common:status_or", + "//src/ray/common:test_utils", "//src/ray/core_worker:memory_store", - "//src/ray/raylet_client:raylet_client_lib", - "//src/ray/rpc:core_worker_client", + "@com_google_absl//absl/synchronization", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", ], @@ -104,12 +47,14 @@ ray_cc_test( srcs = ["reference_count_test.cc"], tags = ["team:core"], deps = [ - "//:ray_mock", + "//:ray_fakes", + "//src/mock/ray/pubsub:mock_publisher", "//src/ray/common:asio", "//src/ray/common:ray_object", "//src/ray/core_worker:memory_store", "//src/ray/pubsub:publisher", - "//src/ray/pubsub:subscriber", + "//src/ray/pubsub:publisher_interface", + 
"//src/ray/pubsub:subscriber_interface", "@com_google_absl//absl/functional:bind_front", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", @@ -124,27 +69,13 @@ ray_cc_test( deps = [ "//:ray_fakes", "//:ray_mock", - "//src/ray/common:task_common", - "//src/ray/common:test_util", + "//src/fakes/ray/rpc/raylet:fake_raylet_client", + "//src/mock/ray/pubsub:mock_publisher", + "//src/ray/common:test_utils", "//src/ray/core_worker:memory_store", - "//src/ray/core_worker:normal_task_submitter", "//src/ray/core_worker:object_recovery_manager", "//src/ray/object_manager:object_manager_common", - "//src/ray/raylet_client:raylet_client_lib", - "@com_google_googletest//:gtest", - "@com_google_googletest//:gtest_main", - ], -) - -ray_cc_test( - name = "actor_submit_queue_test", - size = "small", - srcs = ["actor_submit_queue_test.cc"], - tags = ["team:core"], - deps = [ - "//src/ray/common:asio", - "//src/ray/common:test_util", - "//src/ray/core_worker:out_of_order_actor_submit_queue", + "//src/ray/rpc:raylet_client_interface", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", ], @@ -156,14 +87,17 @@ ray_cc_test( srcs = ["task_manager_test.cc"], tags = ["team:core"], deps = [ + "//:ray_fakes", "//:ray_mock", + "//src/mock/ray/pubsub:mock_publisher", "//src/ray/common:task_common", - "//src/ray/common:test_util", + "//src/ray/common:test_utils", "//src/ray/core_worker:memory_store", "//src/ray/core_worker:reference_count", "//src/ray/core_worker:task_event_buffer", "//src/ray/core_worker:task_manager", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/gcs_client", + "//src/ray/observability:fake_metric", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", ], @@ -177,9 +111,9 @@ ray_cc_test( deps = [ "//:ray_mock", "//src/ray/common:task_common", - "//src/ray/common:test_util", + "//src/ray/common:test_utils", "//src/ray/core_worker:task_event_buffer", - "//src/ray/gcs/gcs_client:gcs_client_lib", + 
"//src/ray/gcs_client", "//src/ray/util:event", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/synchronization", @@ -199,13 +133,11 @@ ray_cc_test( ], deps = [ "//:ray_mock", - "//src/ray/common:task_common", - "//src/ray/common:test_util", + "//src/ray/common:test_utils", "//src/ray/core_worker:task_event_buffer", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/gcs_client", "//src/ray/util:event", "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/synchronization", "@com_google_absl//absl/types:optional", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", @@ -219,10 +151,11 @@ ray_cc_test( tags = ["team:core"], deps = [ "//:ray_mock", - "//src/ray/common:test_util", + "//src/ray/common:test_utils", "//src/ray/core_worker:actor_creator", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/gcs_client", "//src/ray/util:path_utils", + "//src/ray/util:raii", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", ], @@ -235,10 +168,10 @@ ray_cc_test( tags = ["team:core"], deps = [ "//:ray_mock", - "//src/ray/common:test_util", + "//src/ray/common:test_utils", "//src/ray/core_worker:common", "//src/ray/core_worker:generator_waiter", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/gcs_client", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", ], @@ -251,10 +184,9 @@ ray_cc_test( tags = ["team:core"], deps = [ "//:ray_mock", - "//src/ray/common:task_common", - "//src/ray/common:test_util", + "//src/ray/common:test_utils", "//src/ray/core_worker:actor_manager", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/gcs_client", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", ], @@ -266,7 +198,6 @@ ray_cc_test( srcs = ["lease_policy_test.cc"], tags = ["team:core"], deps = [ - "//src/ray/common:task_common", "//src/ray/core_worker:lease_policy", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", @@ 
-291,6 +222,7 @@ ray_cc_test( deps = [ "//:ray_fakes", "//:ray_mock", + "//src/fakes/ray/rpc/raylet:fake_raylet_client", "//src/ray/core_worker:experimental_mutable_object_provider", "//src/ray/object_manager:object_manager_common", "//src/ray/object_manager/plasma:plasma_client", @@ -311,10 +243,15 @@ ray_cc_test( deps = [ "//:ray_fakes", "//:ray_mock", - "//src/ray/common:test_util", + "//src/fakes/ray/object_manager/plasma:fake_plasma_client", + "//src/fakes/ray/rpc/raylet:fake_raylet_client", + "//src/ray/common:test_utils", "//src/ray/core_worker:core_worker_lib", + "//src/ray/core_worker:grpc_service", "//src/ray/core_worker:memory_store", "//src/ray/core_worker:reference_count", + "//src/ray/ipc:fake_raylet_ipc_client", + "//src/ray/observability:fake_metric", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", ], diff --git a/src/ray/core_worker/test/actor_creator_test.cc b/src/ray/core_worker/tests/actor_creator_test.cc similarity index 90% rename from src/ray/core_worker/test/actor_creator_test.cc rename to src/ray/core_worker/tests/actor_creator_test.cc index 9e50e896d151..10d3b3574c3e 100644 --- a/src/ray/core_worker/test/actor_creator_test.cc +++ b/src/ray/core_worker/tests/actor_creator_test.cc @@ -12,16 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// clang-format off +#include "ray/core_worker/actor_creator.h" + #include #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "ray/core_worker/actor_creator.h" -#include "ray/common/test_util.h" +#include "mock/ray/gcs_client/gcs_client.h" +#include "ray/common/test_utils.h" #include "ray/util/path_utils.h" -#include "mock/ray/gcs/gcs_client/gcs_client.h" -// clang-format on +#include "ray/util/raii.h" namespace ray { namespace core { @@ -31,7 +31,7 @@ class ActorCreatorTest : public ::testing::Test { ActorCreatorTest() {} void SetUp() override { gcs_client = std::make_shared(); - actor_creator = std::make_unique(gcs_client); + actor_creator = std::make_unique(gcs_client->Actors()); } TaskSpecification GetTaskSpec(const ActorID &actor_id) { rpc::TaskSpec task_spec; @@ -42,7 +42,7 @@ class ActorCreatorTest : public ::testing::Test { return TaskSpecification(task_spec); } std::shared_ptr gcs_client; - std::unique_ptr actor_creator; + std::unique_ptr actor_creator; }; TEST_F(ActorCreatorTest, IsRegister) { @@ -66,19 +66,19 @@ TEST_F(ActorCreatorTest, AsyncWaitForFinish) { EXPECT_CALL(*gcs_client->mock_actor_accessor, AsyncRegisterActor(::testing::_, ::testing::_, ::testing::_)) .WillRepeatedly(::testing::DoAll(::testing::SaveArg<1>(&cb))); - int cnt = 0; - auto per_finish_cb = [&cnt](Status status) { + int count = 0; + auto per_finish_cb = [&count](Status status) { ASSERT_TRUE(status.ok()); - cnt++; + count++; }; actor_creator->AsyncRegisterActor(task_spec, per_finish_cb); ASSERT_TRUE(actor_creator->IsActorInRegistering(actor_id)); - for (int i = 0; i < 100; ++i) { + for (int i = 0; i < 10; ++i) { actor_creator->AsyncWaitForActorRegisterFinish(actor_id, per_finish_cb); } cb(Status::OK()); ASSERT_FALSE(actor_creator->IsActorInRegistering(actor_id)); - ASSERT_EQ(101, cnt); + ASSERT_EQ(11, count); } } // namespace core diff --git a/src/ray/core_worker/test/actor_manager_test.cc b/src/ray/core_worker/tests/actor_manager_test.cc similarity index 99% rename from 
src/ray/core_worker/test/actor_manager_test.cc rename to src/ray/core_worker/tests/actor_manager_test.cc index 4856e3b6c214..1e2c644064ff 100644 --- a/src/ray/core_worker/test/actor_manager_test.cc +++ b/src/ray/core_worker/tests/actor_manager_test.cc @@ -21,10 +21,9 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "mock/ray/core_worker/reference_count.h" -#include "ray/common/task/task_spec.h" -#include "ray/common/test_util.h" -#include "ray/gcs/gcs_client/accessor.h" -#include "ray/gcs/gcs_client/gcs_client.h" +#include "ray/common/test_utils.h" +#include "ray/gcs_client/accessor.h" +#include "ray/gcs_client/gcs_client.h" namespace ray { namespace core { diff --git a/src/ray/core_worker/test/core_worker_resubmit_queue_test.cc b/src/ray/core_worker/tests/core_worker_resubmit_queue_test.cc similarity index 100% rename from src/ray/core_worker/test/core_worker_resubmit_queue_test.cc rename to src/ray/core_worker/tests/core_worker_resubmit_queue_test.cc diff --git a/src/ray/core_worker/test/core_worker_test.cc b/src/ray/core_worker/tests/core_worker_test.cc similarity index 67% rename from src/ray/core_worker/test/core_worker_test.cc rename to src/ray/core_worker/tests/core_worker_test.cc index 1979d217aeac..a972d19a47a5 100644 --- a/src/ray/core_worker/test/core_worker_test.cc +++ b/src/ray/core_worker/tests/core_worker_test.cc @@ -24,22 +24,30 @@ #include #include +#include "absl/container/flat_hash_set.h" #include "fakes/ray/common/asio/fake_periodical_runner.h" -#include "fakes/ray/ipc/raylet_ipc_client.h" +#include "fakes/ray/object_manager/plasma/fake_plasma_client.h" #include "fakes/ray/pubsub/publisher.h" #include "fakes/ray/pubsub/subscriber.h" #include "fakes/ray/rpc/raylet/raylet_client.h" -#include "mock/ray/gcs/gcs_client/gcs_client.h" +#include "mock/ray/gcs_client/gcs_client.h" +#include "mock/ray/object_manager/plasma/client.h" +#include "ray/common/buffer.h" +#include "ray/common/ray_config.h" #include "ray/core_worker/actor_creator.h" 
#include "ray/core_worker/actor_manager.h" #include "ray/core_worker/context.h" #include "ray/core_worker/core_worker_rpc_proxy.h" #include "ray/core_worker/future_resolver.h" +#include "ray/core_worker/grpc_service.h" #include "ray/core_worker/object_recovery_manager.h" #include "ray/core_worker/reference_count.h" #include "ray/core_worker/store_provider/memory_store/memory_store.h" -#include "ray/core_worker/transport/actor_task_submitter.h" -#include "ray/core_worker/transport/normal_task_submitter.h" +#include "ray/core_worker/store_provider/plasma_store_provider.h" +#include "ray/core_worker/task_submission/actor_task_submitter.h" +#include "ray/core_worker/task_submission/normal_task_submitter.h" +#include "ray/ipc/fake_raylet_ipc_client.h" +#include "ray/observability/fake_metric.h" #include "ray/rpc/worker/core_worker_client_pool.h" namespace ray { @@ -49,16 +57,15 @@ using ::testing::_; using ::testing::InvokeWithoutArgs; using ::testing::Return; -class CoreWorkerHandleGetObjectStatusTest : public ::testing::Test { +class CoreWorkerTest : public ::testing::Test { public: - CoreWorkerHandleGetObjectStatusTest() + CoreWorkerTest() : io_work_(io_service_.get_executor()), task_execution_service_work_(task_execution_service_.get_executor()) { CoreWorkerOptions options; options.worker_type = WorkerType::WORKER; options.language = Language::PYTHON; options.node_ip_address = "127.0.0.1"; - options.raylet_ip_address = "127.0.0.1"; options.task_execution_callback = [](const rpc::Address &caller_address, TaskType task_type, @@ -108,21 +115,21 @@ class CoreWorkerHandleGetObjectStatusTest : public ::testing::Test { auto core_worker_server = std::make_unique(WorkerTypeString(options.worker_type), 0, true); core_worker_server->RegisterService( - std::make_unique(io_service_, *service_handler), + std::make_unique( + io_service_, *service_handler, /*max_active_rpcs_per_handler_=*/-1), false /* token_auth */); core_worker_server->Run(); - rpc::Address rpc_address; - 
rpc_address.set_ip_address(options.node_ip_address); - rpc_address.set_port(core_worker_server->GetPort()); - rpc_address.set_raylet_id(NodeID::FromRandom().Binary()); - rpc_address.set_worker_id(worker_context->GetWorkerID().Binary()); + rpc_address_.set_ip_address(options.node_ip_address); + rpc_address_.set_port(core_worker_server->GetPort()); + rpc_address_.set_node_id(NodeID::FromRandom().Binary()); + rpc_address_.set_worker_id(worker_context->GetWorkerID().Binary()); auto fake_object_info_publisher = std::make_unique(); auto fake_object_info_subscriber = std::make_unique(); reference_counter_ = std::make_shared( - rpc_address, + rpc_address_, fake_object_info_publisher.get(), fake_object_info_subscriber.get(), [](const NodeID &) { return false; }, @@ -138,17 +145,18 @@ class CoreWorkerHandleGetObjectStatusTest : public ::testing::Test { const absl::flat_hash_set &locations, uint64_t object_size) {}, core_worker_client_pool, - rpc_address); + rpc_address_); auto task_event_buffer = std::make_unique( std::make_unique(), - std::make_unique(0, *client_call_manager)); + std::make_unique(0, *client_call_manager), + "test_session"); - auto task_manager = std::make_shared( + task_manager_ = std::make_shared( *memory_store_, *reference_counter_, [](const RayObject &object, const ObjectID &object_id) { return Status::OK(); }, - [](TaskSpecification &spec, bool object_recovery, uint32_t delay_ms) {}, + [](TaskSpecification &spec, uint32_t delay_ms) {}, [](const TaskSpecification &spec) { return false; }, [](const JobID &job_id, const std::string &type, @@ -159,34 +167,35 @@ class CoreWorkerHandleGetObjectStatusTest : public ::testing::Test { [](const ActorID &actor_id) { return std::make_shared(); }, - mock_gcs_client); + mock_gcs_client, + fake_task_by_state_counter_); auto object_recovery_manager = std::make_unique( - rpc_address, + rpc_address_, raylet_client_pool, [](const ObjectID &object_id, const ObjectLookupCallback &callback) { return Status::OK(); }, - 
*task_manager, + *task_manager_, *reference_counter_, *memory_store_, [](const ObjectID &object_id, rpc::ErrorType reason, bool pin_object) {}); auto lease_policy = std::unique_ptr( - std::make_unique(rpc_address)); + std::make_unique(rpc_address_)); auto lease_request_rate_limiter = std::make_shared(10); - auto actor_creator = std::make_shared(mock_gcs_client); + auto actor_creator = std::make_shared(mock_gcs_client->Actors()); auto normal_task_submitter = std::make_unique( - rpc_address, + rpc_address_, fake_local_raylet_rpc_client, core_worker_client_pool, raylet_client_pool, std::move(lease_policy), memory_store_, - *task_manager, + *task_manager_, NodeID::Nil(), WorkerType::WORKER, 10000, @@ -199,13 +208,14 @@ class CoreWorkerHandleGetObjectStatusTest : public ::testing::Test { auto actor_task_submitter = std::make_unique( *core_worker_client_pool, *memory_store_, - *task_manager, + *task_manager_, *actor_creator, /*tensor_transport_getter=*/ [](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; }, [](const ActorID &actor_id, uint64_t num_queued) { return Status::OK(); }, io_service_, reference_counter_); + actor_task_submitter_ = actor_task_submitter.get(); auto actor_manager = std::make_unique( mock_gcs_client, *actor_task_submitter, *reference_counter_); @@ -222,7 +232,7 @@ class CoreWorkerHandleGetObjectStatusTest : public ::testing::Test { std::move(raylet_client_pool), std::move(periodical_runner), std::move(core_worker_server), - std::move(rpc_address), + std::move(rpc_address_), std::move(mock_gcs_client), std::move(fake_raylet_ipc_client), std::move(fake_local_raylet_rpc_client), @@ -232,7 +242,7 @@ class CoreWorkerHandleGetObjectStatusTest : public ::testing::Test { nullptr, // plasma_store_provider_ nullptr, // mutable_object_provider_ std::move(future_resolver), - std::move(task_manager), + task_manager_, std::move(actor_creator), std::move(actor_task_submitter), std::move(fake_object_info_publisher), @@ -243,23 +253,26 @@ 
class CoreWorkerHandleGetObjectStatusTest : public ::testing::Test { std::move(actor_manager), task_execution_service_, std::move(task_event_buffer), - getpid()); + getpid(), + fake_task_by_state_counter_); } protected: - instrumented_io_context io_service_{/*enable_lag_probe=*/false, - /*running_on_single_thread=*/true}; - instrumented_io_context task_execution_service_{/*enable_lag_probe=*/false, - /*running_on_single_thread=*/true}; + instrumented_io_context io_service_; + instrumented_io_context task_execution_service_; boost::asio::executor_work_guard io_work_; boost::asio::executor_work_guard task_execution_service_work_; boost::thread io_thread_; + rpc::Address rpc_address_; std::shared_ptr reference_counter_; std::shared_ptr memory_store_; + ActorTaskSubmitter *actor_task_submitter_; + std::shared_ptr task_manager_; std::shared_ptr core_worker_; + ray::observability::FakeMetric fake_task_by_state_counter_; }; std::shared_ptr MakeRayObject(const std::string &data_str, @@ -275,7 +288,23 @@ std::shared_ptr MakeRayObject(const std::string &data_str, return std::make_shared(data, metadata, std::vector()); } -TEST_F(CoreWorkerHandleGetObjectStatusTest, IdempotencyTest) { +TEST_F(CoreWorkerTest, RecordMetrics) { + std::vector> results; + auto status = core_worker_->Get({}, -1, results); + ASSERT_TRUE(status.ok()); + // disconnect to trigger metric recording + core_worker_->Disconnect(rpc::WorkerExitType::SYSTEM_ERROR, "test", nullptr); + auto tag_to_value = fake_task_by_state_counter_.GetTagToValue(); + // 4 states: RUNNING, SUBMITTED_TO_WORKER, RUNNING_IN_RAY_GET and RUNNING_IN_RAY_WAIT + ASSERT_EQ(tag_to_value.size(), 4); + for (auto &[key, value] : tag_to_value) { + ASSERT_EQ(key.at("Name"), "Unknown task"); + ASSERT_EQ(key.at("Source"), "executor"); + ASSERT_EQ(key.at("IsRetry"), "0"); + } +} + +TEST_F(CoreWorkerTest, HandleGetObjectStatusIdempotency) { auto object_id = ObjectID::FromRandom(); auto ray_object = MakeRayObject("test_data", "meta"); @@ -325,7 
+354,7 @@ TEST_F(CoreWorkerHandleGetObjectStatusTest, IdempotencyTest) { EXPECT_EQ("meta", reply2.object().metadata()); } -TEST_F(CoreWorkerHandleGetObjectStatusTest, ObjectPutAfterFirstRequest) { +TEST_F(CoreWorkerTest, HandleGetObjectStatusObjectPutAfterFirstRequest) { auto object_id = ObjectID::FromRandom(); auto ray_object = MakeRayObject("test_data", "meta"); @@ -380,7 +409,7 @@ TEST_F(CoreWorkerHandleGetObjectStatusTest, ObjectPutAfterFirstRequest) { EXPECT_EQ("meta", reply2.object().metadata()); } -TEST_F(CoreWorkerHandleGetObjectStatusTest, ObjectFreedBetweenRequests) { +TEST_F(CoreWorkerTest, HandleGetObjectStatusObjectFreedBetweenRequests) { auto object_id = ObjectID::FromRandom(); auto ray_object = MakeRayObject("test_data", "meta"); @@ -430,7 +459,7 @@ TEST_F(CoreWorkerHandleGetObjectStatusTest, ObjectFreedBetweenRequests) { ASSERT_FALSE(io_service_.poll_one()); } -TEST_F(CoreWorkerHandleGetObjectStatusTest, ObjectOutOfScope) { +TEST_F(CoreWorkerTest, HandleGetObjectStatusObjectOutOfScope) { auto object_id = ObjectID::FromRandom(); auto ray_object = MakeRayObject("test_data", "meta"); @@ -481,5 +510,136 @@ TEST_F(CoreWorkerHandleGetObjectStatusTest, ObjectOutOfScope) { EXPECT_EQ(reply2.status(), rpc::GetObjectStatusReply::OUT_OF_SCOPE); } +namespace { + +ObjectID CreateInlineObjectInMemoryStoreAndRefCounter(CoreWorkerMemoryStore &memory_store, + ReferenceCounter &reference_counter, + rpc::Address &rpc_address) { + auto inlined_dependency_id = ObjectID::FromRandom(); + std::string data = "hello"; + auto data_ptr = const_cast(reinterpret_cast(data.data())); + auto data_buffer = + std::make_shared(data_ptr, data.size(), /*copy_data=*/true); + RayObject memory_store_object(data_buffer, + /*metadata=*/nullptr, + std::vector(), + /*copy_data=*/true); + reference_counter.AddOwnedObject(inlined_dependency_id, + /*contained_ids=*/{}, + rpc_address, + "call_site", + /*object_size=*/100, + /*is_reconstructable=*/false, + /*add_local_ref=*/true); + 
memory_store.Put(memory_store_object, inlined_dependency_id); + return inlined_dependency_id; +} + +} // namespace + +TEST_F(CoreWorkerTest, ActorTaskCancelDuringDepResolution) { + /* + See https://github.com/ray-project/ray/pull/56123 for context. + 1. Put an inline object in the memory store + ref counter. + 2. Create an actor (just creating an actor queue in the submitter). + 3. Submit an actor task with the inline objects as dependencies. + 4. Cancel the actor task. + 5. Run the io context to completion to run the actual submission + dependency + resolution logic. + */ + + auto inlined_dependency_id = CreateInlineObjectInMemoryStoreAndRefCounter( + *memory_store_, *reference_counter_, rpc_address_); + + auto actor_id = ActorID::Of(JobID::FromInt(0), TaskID::Nil(), 0); + actor_task_submitter_->AddActorQueueIfNotExists(actor_id, + /*max_pending_calls=*/-1, + /*allow_out_of_order_execution=*/false, + /*fail_if_actor_unreachable=*/true, + /*owned=*/false); + + TaskSpecification task; + auto &task_message = task.GetMutableMessage(); + task_message.set_task_id(TaskID::FromRandom(actor_id.JobId()).Binary()); + task_message.set_type(TaskType::ACTOR_TASK); + task_message.mutable_actor_task_spec()->set_actor_id(actor_id.Binary()); + task_message.add_args()->mutable_object_ref()->set_object_id( + inlined_dependency_id.Binary()); + task_manager_->AddPendingTask(rpc_address_, task, "call_site"); + actor_task_submitter_->SubmitTask(task); + + actor_task_submitter_->CancelTask(task, /*recursive=*/false); + + while (io_service_.poll_one() > 0) { + } +} + +TEST(BatchingPassesTwoTwoOneIntoPlasmaGet, CallsPlasmaGetInCorrectBatches) { + auto fake_raylet = std::make_shared(); + // Build a ReferenceCounter with minimal dependencies. 
+ rpc::Address addr; + addr.set_ip_address("127.0.0.1"); + auto is_node_dead = [](const NodeID &) { return false; }; + ReferenceCounter ref_counter(addr, + /*object_info_publisher=*/nullptr, + /*object_info_subscriber=*/nullptr, + is_node_dead); + + // Fake plasma client that records Get calls. + std::vector> observed_batches; + class RecordingPlasmaGetClient : public plasma::FakePlasmaClient { + public: + explicit RecordingPlasmaGetClient(std::vector> *observed) + : observed_(observed) {} + Status Get(const std::vector &object_ids, + int64_t timeout_ms, + std::vector *object_buffers) override { + if (observed_ != nullptr) { + observed_->push_back(object_ids); + } + object_buffers->resize(object_ids.size()); + for (size_t i = 0; i < object_ids.size(); i++) { + uint8_t byte = 0; + auto parent = std::make_shared(&byte, 1, /*copy_data=*/true); + (*object_buffers)[i].data = SharedMemoryBuffer::Slice(parent, 0, 1); + (*object_buffers)[i].metadata = SharedMemoryBuffer::Slice(parent, 0, 1); + } + return Status::OK(); + } + + private: + std::vector> *observed_; + }; + + auto fake_plasma = std::make_shared(&observed_batches); + + CoreWorkerPlasmaStoreProvider provider( + /*store_socket=*/"", + fake_raylet, + ref_counter, + /*check_signals=*/[] { return Status::OK(); }, + /*warmup=*/false, + /*store_client=*/fake_plasma, + /*fetch_batch_size=*/2, + /*get_current_call_site=*/nullptr); + + // Build a set of 5 object ids. + std::vector ids; + for (int i = 0; i < 5; i++) ids.push_back(ObjectID::FromRandom()); + absl::flat_hash_set idset(ids.begin(), ids.end()); + + absl::flat_hash_map> results; + bool got_exception = false; + WorkerContext ctx(WorkerType::WORKER, WorkerID::FromRandom(), JobID::FromInt(0)); + + ASSERT_TRUE(provider.Get(idset, /*timeout_ms=*/-1, ctx, &results, &got_exception).ok()); + + // Assert: batches seen by plasma Get are [2,2,1]. 
+ ASSERT_EQ(observed_batches.size(), 3U); + EXPECT_EQ(observed_batches[0].size(), 2U); + EXPECT_EQ(observed_batches[1].size(), 2U); + EXPECT_EQ(observed_batches[2].size(), 1U); +} + } // namespace core } // namespace ray diff --git a/src/ray/core_worker/test/generator_waiter_test.cc b/src/ray/core_worker/tests/generator_waiter_test.cc similarity index 100% rename from src/ray/core_worker/test/generator_waiter_test.cc rename to src/ray/core_worker/tests/generator_waiter_test.cc diff --git a/src/ray/core_worker/test/lease_policy_test.cc b/src/ray/core_worker/tests/lease_policy_test.cc similarity index 83% rename from src/ray/core_worker/test/lease_policy_test.cc rename to src/ray/core_worker/tests/lease_policy_test.cc index 3bdd17bb5000..b1219bcf375a 100644 --- a/src/ray/core_worker/test/lease_policy_test.cc +++ b/src/ray/core_worker/tests/lease_policy_test.cc @@ -18,22 +18,19 @@ #include #include "gtest/gtest.h" -#include "ray/common/task/task_spec.h" +#include "ray/common/lease/lease_spec.h" namespace ray { namespace core { -TaskSpecification CreateFakeTask(std::vector deps) { - TaskSpecification spec; - spec.GetMutableMessage().set_task_id(TaskID::FromRandom(JobID::FromInt(1)).Binary()); +LeaseSpecification CreateFakeLease(std::vector deps) { + rpc::LeaseSpec spec; for (auto &dep : deps) { - spec.GetMutableMessage().add_args()->mutable_object_ref()->set_object_id( - dep.Binary()); + spec.add_dependencies()->set_object_id(dep.Binary()); } - spec.GetMutableMessage() - .mutable_scheduling_strategy() - ->mutable_default_scheduling_strategy(); - return spec; + spec.set_lease_id(LeaseID::FromRandom().Binary()); + spec.mutable_scheduling_strategy()->mutable_default_scheduling_strategy(); + return LeaseSpecification(spec); } class MockLocalityDataProvider : public LocalityDataProviderInterface { @@ -57,7 +54,7 @@ class MockLocalityDataProvider : public LocalityDataProviderInterface { std::optional MockNodeAddrFactory(const NodeID &node_id) { rpc::Address mock_rpc_address; 
- mock_rpc_address.set_raylet_id(node_id.Binary()); + mock_rpc_address.set_node_id(node_id.Binary()); std::optional opt_mock_rpc_address = mock_rpc_address; return opt_mock_rpc_address; } @@ -73,11 +70,11 @@ TEST(LocalLeasePolicyTest, TestReturnFallback) { ObjectID obj1 = ObjectID::FromRandom(); ObjectID obj2 = ObjectID::FromRandom(); std::vector deps{obj1, obj2}; - auto task_spec = CreateFakeTask(deps); + auto lease_spec = CreateFakeLease(deps); auto [best_node_address, is_selected_based_on_locality] = - local_lease_policy.GetBestNodeForTask(task_spec); + local_lease_policy.GetBestNodeForLease(lease_spec); // Test that fallback node was chosen. - ASSERT_EQ(NodeID::FromBinary(best_node_address.raylet_id()), fallback_node); + ASSERT_EQ(NodeID::FromBinary(best_node_address.node_id()), fallback_node); ASSERT_FALSE(is_selected_based_on_locality); } @@ -96,16 +93,16 @@ TEST(LocalityAwareLeasePolicyTest, TestBestLocalityFallbackSpreadSchedulingStrat LocalityAwareLeasePolicy locality_lease_policy( *mock_locality_data_provider, MockNodeAddrFactory, fallback_rpc_address); std::vector deps{obj1, obj2}; - auto task_spec = CreateFakeTask(deps); - task_spec.GetMutableMessage() + auto lease_spec = CreateFakeLease(deps); + lease_spec.GetMutableMessage() .mutable_scheduling_strategy() ->mutable_spread_scheduling_strategy(); auto [best_node_address, is_selected_based_on_locality] = - locality_lease_policy.GetBestNodeForTask(task_spec); + locality_lease_policy.GetBestNodeForLease(lease_spec); // Locality logic is not run since it's a spread scheduling strategy. ASSERT_EQ(mock_locality_data_provider->num_locality_data_fetches, 0); // Test that fallback node was chosen. 
- ASSERT_EQ(NodeID::FromBinary(best_node_address.raylet_id()), fallback_node); + ASSERT_EQ(NodeID::FromBinary(best_node_address.node_id()), fallback_node); ASSERT_FALSE(is_selected_based_on_locality); } @@ -125,18 +122,18 @@ TEST(LocalityAwareLeasePolicyTest, LocalityAwareLeasePolicy locality_lease_policy( *mock_locality_data_provider, MockNodeAddrFactory, fallback_rpc_address); std::vector deps{obj1, obj2}; - auto task_spec = CreateFakeTask(deps); + auto lease_spec = CreateFakeLease(deps); NodeID node_affinity_node = NodeID::FromRandom(); - task_spec.GetMutableMessage() + lease_spec.GetMutableMessage() .mutable_scheduling_strategy() ->mutable_node_affinity_scheduling_strategy() ->set_node_id(node_affinity_node.Binary()); auto [best_node_address, is_selected_based_on_locality] = - locality_lease_policy.GetBestNodeForTask(task_spec); + locality_lease_policy.GetBestNodeForLease(lease_spec); // Locality logic is not run since it's a node affinity scheduling strategy. ASSERT_EQ(mock_locality_data_provider->num_locality_data_fetches, 0); // Test that node affinity node was chosen. - ASSERT_EQ(NodeID::FromBinary(best_node_address.raylet_id()), node_affinity_node); + ASSERT_EQ(NodeID::FromBinary(best_node_address.node_id()), node_affinity_node); ASSERT_FALSE(is_selected_based_on_locality); } @@ -155,13 +152,13 @@ TEST(LocalityAwareLeasePolicyTest, TestBestLocalityDominatingNode) { LocalityAwareLeasePolicy locality_lease_policy( *mock_locality_data_provider, MockNodeAddrFactory, fallback_rpc_address); std::vector deps{obj1, obj2}; - auto task_spec = CreateFakeTask(deps); + auto lease_spec = CreateFakeLease(deps); auto [best_node_address, is_selected_based_on_locality] = - locality_lease_policy.GetBestNodeForTask(task_spec); + locality_lease_policy.GetBestNodeForLease(lease_spec); // Locality data provider should be called once for each dependency. ASSERT_EQ(mock_locality_data_provider->num_locality_data_fetches, deps.size()); // Test that best node was chosen. 
- ASSERT_EQ(NodeID::FromBinary(best_node_address.raylet_id()), best_node); + ASSERT_EQ(NodeID::FromBinary(best_node_address.node_id()), best_node); ASSERT_TRUE(is_selected_based_on_locality); } @@ -181,13 +178,13 @@ TEST(LocalityAwareLeasePolicyTest, TestBestLocalityBiggerObject) { LocalityAwareLeasePolicy locality_lease_policy( *mock_locality_data_provider, MockNodeAddrFactory, fallback_rpc_address); std::vector deps{obj1, obj2}; - auto task_spec = CreateFakeTask(deps); + auto lease_spec = CreateFakeLease(deps); auto [best_node_address, is_selected_based_on_locality] = - locality_lease_policy.GetBestNodeForTask(task_spec); + locality_lease_policy.GetBestNodeForLease(lease_spec); // Locality data provider should be called once for each dependency. ASSERT_EQ(mock_locality_data_provider->num_locality_data_fetches, deps.size()); // Test that best node was chosen. - ASSERT_EQ(NodeID::FromBinary(best_node_address.raylet_id()), best_node); + ASSERT_EQ(NodeID::FromBinary(best_node_address.node_id()), best_node); ASSERT_TRUE(is_selected_based_on_locality); } @@ -211,13 +208,13 @@ TEST(LocalityAwareLeasePolicyTest, TestBestLocalityBetterNode) { LocalityAwareLeasePolicy locality_lease_policy( *mock_locality_data_provider, MockNodeAddrFactory, fallback_rpc_address); std::vector deps{obj1, obj2, obj3}; - auto task_spec = CreateFakeTask(deps); + auto lease_spec = CreateFakeLease(deps); auto [best_node_address, is_selected_based_on_locality] = - locality_lease_policy.GetBestNodeForTask(task_spec); + locality_lease_policy.GetBestNodeForLease(lease_spec); // Locality data provider should be called once for each dependency. ASSERT_EQ(mock_locality_data_provider->num_locality_data_fetches, deps.size()); // Test that best node was chosen. 
- ASSERT_EQ(NodeID::FromBinary(best_node_address.raylet_id()), best_node); + ASSERT_EQ(NodeID::FromBinary(best_node_address.node_id()), best_node); ASSERT_TRUE(is_selected_based_on_locality); } @@ -235,13 +232,13 @@ TEST(LocalityAwareLeasePolicyTest, TestBestLocalityFallbackNoLocations) { LocalityAwareLeasePolicy locality_lease_policy( *mock_locality_data_provider, MockNodeAddrFactory, fallback_rpc_address); std::vector deps{obj1, obj2}; - auto task_spec = CreateFakeTask(deps); + auto lease_spec = CreateFakeLease(deps); auto [best_node_address, is_selected_based_on_locality] = - locality_lease_policy.GetBestNodeForTask(task_spec); + locality_lease_policy.GetBestNodeForLease(lease_spec); // Locality data provider should be called once for each dependency. ASSERT_EQ(mock_locality_data_provider->num_locality_data_fetches, deps.size()); // Test that fallback node was chosen. - ASSERT_EQ(NodeID::FromBinary(best_node_address.raylet_id()), fallback_node); + ASSERT_EQ(NodeID::FromBinary(best_node_address.node_id()), fallback_node); ASSERT_FALSE(is_selected_based_on_locality); } @@ -252,15 +249,15 @@ TEST(LocalityAwareLeasePolicyTest, TestBestLocalityFallbackNoDeps) { auto mock_locality_data_provider = std::make_shared(); LocalityAwareLeasePolicy locality_lease_policy( *mock_locality_data_provider, MockNodeAddrFactory, fallback_rpc_address); - // No task dependencies. + // No lease dependencies. std::vector deps; - auto task_spec = CreateFakeTask(deps); + auto lease_spec = CreateFakeLease(deps); auto [best_node_address, is_selected_based_on_locality] = - locality_lease_policy.GetBestNodeForTask(task_spec); + locality_lease_policy.GetBestNodeForLease(lease_spec); // Locality data provider should be called once for each dependency. ASSERT_EQ(mock_locality_data_provider->num_locality_data_fetches, deps.size()); // Test that fallback node was chosen. 
- ASSERT_EQ(NodeID::FromBinary(best_node_address.raylet_id()), fallback_node); + ASSERT_EQ(NodeID::FromBinary(best_node_address.node_id()), fallback_node); ASSERT_FALSE(is_selected_based_on_locality); } @@ -279,13 +276,13 @@ TEST(LocalityAwareLeasePolicyTest, TestBestLocalityFallbackAddrFetchFail) { LocalityAwareLeasePolicy locality_lease_policy( *mock_locality_data_provider, MockNodeAddrFactoryAlwaysNull, fallback_rpc_address); std::vector deps{obj1, obj2}; - auto task_spec = CreateFakeTask(deps); + auto lease_spec = CreateFakeLease(deps); auto [best_node_address, is_selected_based_on_locality] = - locality_lease_policy.GetBestNodeForTask(task_spec); + locality_lease_policy.GetBestNodeForLease(lease_spec); // Locality data provider should be called once for each dependency. ASSERT_EQ(mock_locality_data_provider->num_locality_data_fetches, deps.size()); // Test that fallback node was chosen. - ASSERT_EQ(NodeID::FromBinary(best_node_address.raylet_id()), fallback_node); + ASSERT_EQ(NodeID::FromBinary(best_node_address.node_id()), fallback_node); ASSERT_FALSE(is_selected_based_on_locality); } diff --git a/src/ray/core_worker/test/memory_store_test.cc b/src/ray/core_worker/tests/memory_store_test.cc similarity index 95% rename from src/ray/core_worker/test/memory_store_test.cc rename to src/ray/core_worker/tests/memory_store_test.cc index 257330888afe..5a90b26af481 100644 --- a/src/ray/core_worker/test/memory_store_test.cc +++ b/src/ray/core_worker/tests/memory_store_test.cc @@ -26,23 +26,22 @@ #include "mock/ray/core_worker/memory_store.h" #include "ray/common/status.h" #include "ray/common/status_or.h" -#include "ray/common/test_util.h" +#include "ray/common/test_utils.h" namespace ray { namespace core { -inline std::shared_ptr MakeBufferFromString(const uint8_t *data, - size_t data_size) { - auto metadata = const_cast(data); +namespace { + +std::shared_ptr MakeLocalMemoryBufferFromString( + const std::string &str) { + auto metadata = 
const_cast(reinterpret_cast(str.data())); auto meta_buffer = - std::make_shared(metadata, data_size, /*copy_data=*/true); + std::make_shared(metadata, str.size(), /*copy_data=*/true); return meta_buffer; } -inline std::shared_ptr MakeLocalMemoryBufferFromString( - const std::string &str) { - return MakeBufferFromString(reinterpret_cast(str.data()), str.size()); -} +} // namespace TEST(TestMemoryStore, TestReportUnhandledErrors) { std::vector> results; @@ -195,8 +194,8 @@ TEST(TestMemoryStore, TestObjectAllocator) { auto buf = object.GetData(); mock_buffer_manager.AcquireMemory(buf->Size()); auto data_factory = [&mock_buffer_manager, object]() -> std::shared_ptr { - auto buf = object.GetData(); - std::string data(reinterpret_cast(buf->Data()), buf->Size()); + auto inner_buf = object.GetData(); + std::string data(reinterpret_cast(inner_buf->Data()), inner_buf->Size()); return std::make_shared(mock_buffer_manager, data); }; diff --git a/src/ray/core_worker/test/mutable_object_provider_test.cc b/src/ray/core_worker/tests/mutable_object_provider_test.cc similarity index 100% rename from src/ray/core_worker/test/mutable_object_provider_test.cc rename to src/ray/core_worker/tests/mutable_object_provider_test.cc diff --git a/src/ray/core_worker/test/object_recovery_manager_test.cc b/src/ray/core_worker/tests/object_recovery_manager_test.cc similarity index 96% rename from src/ray/core_worker/test/object_recovery_manager_test.cc rename to src/ray/core_worker/tests/object_recovery_manager_test.cc index 12317e359692..68ecc7bedf6d 100644 --- a/src/ray/core_worker/test/object_recovery_manager_test.cc +++ b/src/ray/core_worker/tests/object_recovery_manager_test.cc @@ -20,18 +20,15 @@ #include #include +#include "fakes/ray/pubsub/subscriber.h" #include "fakes/ray/rpc/raylet/raylet_client.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include "mock/ray/core_worker/task_manager_interface.h" #include "mock/ray/pubsub/publisher.h" -#include "mock/ray/pubsub/subscriber.h" 
-#include "ray/common/task/task_spec.h" -#include "ray/common/task/task_util.h" -#include "ray/common/test_util.h" +#include "ray/common/test_utils.h" #include "ray/core_worker/store_provider/memory_store/memory_store.h" -#include "ray/core_worker/transport/normal_task_submitter.h" -#include "ray/raylet_client/raylet_client.h" +#include "ray/rpc/raylet/raylet_client_interface.h" namespace ray { namespace core { @@ -122,10 +119,10 @@ class MockObjectDirectory { class ObjectRecoveryManagerTestBase : public ::testing::Test { public: explicit ObjectRecoveryManagerTestBase(bool lineage_enabled) - : local_raylet_id_(NodeID::FromRandom()), + : local_node_id_(NodeID::FromRandom()), io_context_("TestOnly.ObjectRecoveryManagerTestBase"), publisher_(std::make_shared()), - subscriber_(std::make_shared()), + subscriber_(std::make_shared()), object_directory_(std::make_shared()), memory_store_( std::make_shared(io_context_.GetIoService())), @@ -172,13 +169,13 @@ class ObjectRecoveryManagerTestBase : public ::testing::Test { io_context_.Stop(); } - NodeID local_raylet_id_; + NodeID local_node_id_; absl::flat_hash_map failed_reconstructions_; // Used by memory_store_. 
InstrumentedIOContextWithThread io_context_; std::shared_ptr publisher_; - std::shared_ptr subscriber_; + std::shared_ptr subscriber_; std::shared_ptr object_directory_; std::shared_ptr memory_store_; std::shared_ptr raylet_client_pool_; @@ -237,7 +234,7 @@ TEST_F(ObjectRecoveryLineageDisabledTest, TestPinNewCopy) { true, /*add_local_ref=*/true); rpc::Address address; - address.set_raylet_id(NodeID::FromRandom().Binary()); + address.set_node_id(NodeID::FromRandom().Binary()); object_directory_->SetLocations(object_id, {address}); ASSERT_TRUE(manager_.RecoverObject(object_id)); @@ -257,9 +254,9 @@ TEST_F(ObjectRecoveryManagerTest, TestPinNewCopy) { true, /*add_local_ref=*/true); rpc::Address address1; - address1.set_raylet_id(NodeID::FromRandom().Binary()); + address1.set_node_id(NodeID::FromRandom().Binary()); rpc::Address address2; - address2.set_raylet_id(NodeID::FromRandom().Binary()); + address2.set_node_id(NodeID::FromRandom().Binary()); object_directory_->SetLocations(object_id, {address1, address2}); ASSERT_TRUE(manager_.RecoverObject(object_id)); @@ -309,7 +306,7 @@ TEST_F(ObjectRecoveryManagerTest, TestReconstructionSuppression) { // A new copy of the object is pinned. 
NodeID remote_node_id = NodeID::FromRandom(); rpc::Address address; - address.set_raylet_id(remote_node_id.Binary()); + address.set_node_id(remote_node_id.Binary()); object_directory_->SetLocations(object_id, {address}); ASSERT_EQ(object_directory_->Flush(), 1); ASSERT_EQ(raylet_client_->Flush(), 1); diff --git a/src/ray/core_worker/test/reference_count_test.cc b/src/ray/core_worker/tests/reference_count_test.cc similarity index 97% rename from src/ray/core_worker/test/reference_count_test.cc rename to src/ray/core_worker/tests/reference_count_test.cc index 31fb503fab9e..0686db4837f9 100644 --- a/src/ray/core_worker/test/reference_count_test.cc +++ b/src/ray/core_worker/tests/reference_count_test.cc @@ -20,16 +20,17 @@ #include #include "absl/functional/bind_front.h" +#include "fakes/ray/pubsub/subscriber.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include "mock/ray/pubsub/publisher.h" -#include "mock/ray/pubsub/subscriber.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/asio/periodical_runner.h" #include "ray/common/ray_object.h" #include "ray/core_worker/store_provider/memory_store/memory_store.h" #include "ray/pubsub/publisher.h" -#include "ray/pubsub/subscriber.h" +#include "ray/pubsub/publisher_interface.h" +#include "ray/pubsub/subscriber_interface.h" namespace ray { namespace core { @@ -43,7 +44,7 @@ class ReferenceCountTest : public ::testing::Test { virtual void SetUp() { rpc::Address addr; publisher_ = std::make_shared(); - subscriber_ = std::make_shared(); + subscriber_ = std::make_shared(); rc = std::make_unique( addr, publisher_.get(), subscriber_.get(), [](const NodeID &node_id) { return false; @@ -60,7 +61,7 @@ class ReferenceCountTest : public ::testing::Test { void AssertNoLeaks() { ASSERT_EQ(rc->NumObjectIDsInScope(), 0); } std::shared_ptr publisher_; - std::shared_ptr subscriber_; + std::shared_ptr subscriber_; }; class ReferenceCountLineageEnabledTest : public ::testing::Test { @@ -69,7 +70,7 @@ class 
ReferenceCountLineageEnabledTest : public ::testing::Test { virtual void SetUp() { rpc::Address addr; publisher_ = std::make_shared(); - subscriber_ = std::make_shared(); + subscriber_ = std::make_shared(); rc = std::make_unique( addr, publisher_.get(), @@ -85,7 +86,7 @@ class ReferenceCountLineageEnabledTest : public ::testing::Test { } std::shared_ptr publisher_; - std::shared_ptr subscriber_; + std::shared_ptr subscriber_; }; /// The 2 classes below are implemented to support distributed mock test using @@ -106,7 +107,7 @@ using SubscriptionFailureCallbackMap = // static maps are used to simulate distirubted environment. static SubscriptionCallbackMap subscription_callback_map; static SubscriptionFailureCallbackMap subscription_failure_callback_map; -static pubsub::pub_internal::SubscriptionIndex directory( +static pubsub::SubscriptionIndex directory( rpc::ChannelType::WORKER_OBJECT_LOCATIONS_CHANNEL); static std::string GenerateID(UniqueID publisher_id, UniqueID subscriber_id) { @@ -126,16 +127,16 @@ using PublisherFactoryFn = class MockDistributedSubscriber : public pubsub::SubscriberInterface { public: - MockDistributedSubscriber(pubsub::pub_internal::SubscriptionIndex *dict, + MockDistributedSubscriber(pubsub::SubscriptionIndex *dict, SubscriptionCallbackMap *sub_callback_map, SubscriptionFailureCallbackMap *sub_failure_callback_map, - pubsub::SubscriberID subscriber_id, + UniqueID subscriber_id, PublisherFactoryFn client_factory) : directory_(dict), subscription_callback_map_(sub_callback_map), subscription_failure_callback_map_(sub_failure_callback_map), subscriber_id_(subscriber_id), - subscriber_(std::make_unique( + subscriber_(std::make_unique( subscriber_id, /*get_time_ms=*/[]() { return 1.0; }, /*subscriber_timeout_ms=*/1000, @@ -145,11 +146,11 @@ class MockDistributedSubscriber : public pubsub::SubscriberInterface { ~MockDistributedSubscriber() = default; - bool Subscribe( - const std::unique_ptr sub_message, - const rpc::ChannelType channel_type, + 
void Subscribe( + std::unique_ptr sub_message, + rpc::ChannelType channel_type, const rpc::Address &publisher_address, - const std::string &key_id_binary, + const std::optional &key_id_binary, pubsub::SubscribeDoneCallback subscribe_done_callback, pubsub::SubscriptionItemCallback subscription_callback, pubsub::SubscriptionFailureCallback subscription_failure_callback) override { @@ -165,9 +166,9 @@ class MockDistributedSubscriber : public pubsub::SubscriberInterface { } // Due to the test env, there are times that the same message id from the same // subscriber is subscribed twice. We should just no-op in this case. - if (!(directory_->HasKeyId(key_id_binary) && + if (!(directory_->HasKeyId(*key_id_binary) && directory_->HasSubscriber(subscriber_id_))) { - directory_->AddEntry(key_id_binary, subscriber_.get()); + directory_->AddEntry(*key_id_binary, subscriber_.get()); } const auto publisher_id = UniqueID::FromBinary(publisher_address.worker_id()); const auto id = GenerateID(publisher_id, subscriber_id_); @@ -183,34 +184,18 @@ class MockDistributedSubscriber : public pubsub::SubscriberInterface { .first; } - const auto oid = ObjectID::FromBinary(key_id_binary); + const auto oid = ObjectID::FromBinary(*key_id_binary); callback_it->second.emplace(oid, subscription_callback); - return failure_callback_it->second.emplace(oid, subscription_failure_callback).second; + failure_callback_it->second.emplace(oid, subscription_failure_callback); } - bool SubscribeChannel( - const std::unique_ptr sub_message, - const rpc::ChannelType channel_type, - const rpc::Address &publisher_address, - pubsub::SubscribeDoneCallback subscribe_done_callback, - pubsub::SubscriptionItemCallback subscription_callback, - pubsub::SubscriptionFailureCallback subscription_failure_callback) override { - RAY_LOG(FATAL) << "Unimplemented!"; - return false; - } - - bool Unsubscribe(const rpc::ChannelType channel_type, + bool Unsubscribe(rpc::ChannelType channel_type, const rpc::Address 
&publisher_address, - const std::string &key_id_binary) override { - return true; - } - - bool UnsubscribeChannel(const rpc::ChannelType channel_type, - const rpc::Address &publisher_address) override { + const std::optional &key_id_binary) override { return true; } - bool IsSubscribed(const rpc::ChannelType channel_type, + bool IsSubscribed(rpc::ChannelType channel_type, const rpc::Address &publisher_address, const std::string &key_id_binary) const override { return directory_->HasKeyId(key_id_binary) && @@ -222,17 +207,17 @@ class MockDistributedSubscriber : public pubsub::SubscriberInterface { return ""; } - pubsub::pub_internal::SubscriptionIndex *directory_; + pubsub::SubscriptionIndex *directory_; SubscriptionCallbackMap *subscription_callback_map_; SubscriptionFailureCallbackMap *subscription_failure_callback_map_; - pubsub::SubscriberID subscriber_id_; - std::unique_ptr subscriber_; + UniqueID subscriber_id_; + std::unique_ptr subscriber_; PublisherFactoryFn client_factory_; }; class MockDistributedPublisher : public pubsub::PublisherInterface { public: - MockDistributedPublisher(pubsub::pub_internal::SubscriptionIndex *dict, + MockDistributedPublisher(pubsub::SubscriptionIndex *dict, SubscriptionCallbackMap *sub_callback_map, SubscriptionFailureCallbackMap *sub_failure_callback_map, WorkerID publisher_id) @@ -242,11 +227,10 @@ class MockDistributedPublisher : public pubsub::PublisherInterface { publisher_id_(publisher_id) {} ~MockDistributedPublisher() = default; - bool RegisterSubscription(const rpc::ChannelType channel_type, - const pubsub::SubscriberID &subscriber_id, + void RegisterSubscription(const rpc::ChannelType channel_type, + const UniqueID &subscriber_id, const std::optional &key_id_binary) override { RAY_CHECK(false) << "No need to implement it for testing."; - return false; } void PublishFailure(const rpc::ChannelType channel_type, @@ -271,17 +255,21 @@ class MockDistributedPublisher : public pubsub::PublisherInterface { } } - bool 
UnregisterSubscription(const rpc::ChannelType channel_type, - const pubsub::SubscriberID &subscriber_id, - const std::optional &key_id_binary) override { - return true; - } + void UnregisterSubscription(const rpc::ChannelType channel_type, + const UniqueID &subscriber_id, + const std::optional &key_id_binary) override {} - void ConnectToSubscriber(const rpc::PubsubLongPollingRequest &request, - rpc::PubsubLongPollingReply *reply, - rpc::SendReplyCallback send_reply_callback) override {} + void UnregisterSubscriber(const UniqueID &subscriber_id) override {} - pubsub::pub_internal::SubscriptionIndex *directory_; + void ConnectToSubscriber( + const rpc::PubsubLongPollingRequest &request, + std::string *publisher_id, + google::protobuf::RepeatedPtrField *pub_messages, + rpc::SendReplyCallback send_reply_callback) override {} + + std::string DebugString() const override { return ""; } + + pubsub::SubscriptionIndex *directory_; SubscriptionCallbackMap *subscription_callback_map_; SubscriptionFailureCallbackMap *subscription_failure_callback_map_; WorkerID publisher_id_; @@ -293,7 +281,7 @@ class MockWorkerClient : public MockCoreWorkerClientInterface { static rpc::Address CreateRandomAddress(const std::string &addr) { rpc::Address address; address.set_ip_address(addr); - address.set_raylet_id(NodeID::FromRandom().Binary()); + address.set_node_id(NodeID::FromRandom().Binary()); address.set_worker_id(WorkerID::FromRandom().Binary()); return address; } @@ -331,16 +319,15 @@ class MockWorkerClient : public MockCoreWorkerClientInterface { auto r = num_requests_; auto borrower_callback = [=]() { - auto ref_removed_callback = - absl::bind_front(&ReferenceCounter::HandleRefRemoved, &rc_); - rc_.SetRefRemovedCallback( - object_id, contained_in_id, owner_address, ref_removed_callback); + rc_.SubscribeRefRemoved(object_id, contained_in_id, owner_address); }; borrower_callbacks_[r] = borrower_callback; num_requests_++; } + std::string DebugString() const override { return ""; } + 
bool FlushBorrowerCallbacks() { // Flush all the borrower callbacks. This means that after this function is invoked, // all of ref_counts will be tracked. @@ -828,7 +815,7 @@ TEST(MemoryStoreIntegrationTest, TestSimple) { RayObject buffer(std::make_shared(data, sizeof(data)), nullptr, {}); auto publisher = std::make_shared(); - auto subscriber = std::make_shared(); + auto subscriber = std::make_shared(); auto rc = std::make_shared( rpc::Address(), publisher.get(), diff --git a/src/ray/core_worker/tests/shutdown_coordinator_test.cc b/src/ray/core_worker/tests/shutdown_coordinator_test.cc new file mode 100644 index 000000000000..9d18f8f7030d --- /dev/null +++ b/src/ray/core_worker/tests/shutdown_coordinator_test.cc @@ -0,0 +1,401 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/core_worker/shutdown_coordinator.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/synchronization/mutex.h" +#include "ray/common/buffer.h" +#include "src/ray/protobuf/common.pb.h" + +namespace ray { +namespace core { + +// Simple fake executor for tests without gmock. 
+class FakeShutdownExecutor : public ShutdownExecutorInterface { + public: + std::atomic graceful_calls{0}; + std::atomic force_calls{0}; + std::atomic worker_exit_calls{0}; + std::atomic handle_exit_calls{0}; + std::atomic idle_exit_allowed{false}; + + std::string last_exit_type; + std::string last_detail; + mutable absl::Mutex mu_; + + std::string GetLastExitType() const { + absl::MutexLock lk(&mu_); + return last_exit_type; + } + + std::string GetLastDetail() const { + absl::MutexLock lk(&mu_); + return last_detail; + } + + void ExecuteGracefulShutdown(std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms) override { + graceful_calls++; + { + absl::MutexLock lk(&mu_); + last_exit_type = std::string(exit_type); + last_detail = std::string(detail); + } + } + void ExecuteForceShutdown(std::string_view exit_type, + std::string_view detail) override { + force_calls++; + { + absl::MutexLock lk(&mu_); + last_exit_type = std::string(exit_type); + last_detail = std::string(detail); + } + } + void ExecuteWorkerExit(std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms) override { + worker_exit_calls++; + { + absl::MutexLock lk(&mu_); + last_exit_type = std::string(exit_type); + last_detail = std::string(detail); + } + } + void ExecuteExit(std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms, + const std::shared_ptr<::ray::LocalMemoryBuffer> + &creation_task_exception_pb_bytes) override { + worker_exit_calls++; + { + absl::MutexLock lk(&mu_); + last_exit_type = std::string(exit_type); + last_detail = std::string(detail); + } + } + void ExecuteHandleExit(std::string_view exit_type, + std::string_view detail, + std::chrono::milliseconds timeout_ms) override { + handle_exit_calls++; + { + absl::MutexLock lk(&mu_); + last_exit_type = std::string(exit_type); + last_detail = std::string(detail); + } + } + void KillChildProcessesImmediately() override {} + bool 
ShouldWorkerIdleExit() const override { return idle_exit_allowed.load(); } +}; + +// No-op executor used in disabled/manual-transition tests. +class NoOpShutdownExecutor : public ShutdownExecutorInterface { + public: + void ExecuteGracefulShutdown(std::string_view, + std::string_view, + std::chrono::milliseconds) override {} + void ExecuteForceShutdown(std::string_view, std::string_view) override {} + void ExecuteWorkerExit(std::string_view, + std::string_view, + std::chrono::milliseconds) override {} + void ExecuteExit(std::string_view, + std::string_view, + std::chrono::milliseconds, + const std::shared_ptr<::ray::LocalMemoryBuffer> &) override {} + void ExecuteHandleExit(std::string_view, + std::string_view, + std::chrono::milliseconds) override {} + void KillChildProcessesImmediately() override {} + bool ShouldWorkerIdleExit() const override { return false; } +}; + +class ShutdownCoordinatorTest : public ::testing::Test { + protected: + // Helper to create coordinator with specific worker type + std::unique_ptr CreateCoordinator( + rpc::WorkerType worker_type = rpc::WorkerType::WORKER) { + auto fake = std::make_unique(); + return std::make_unique(std::move(fake), worker_type); + } +}; + +TEST_F(ShutdownCoordinatorTest, InitialStateWithNoTransitions_IsRunning) { + auto coordinator = CreateCoordinator(); + + EXPECT_EQ(coordinator->GetState(), ShutdownState::kRunning); + EXPECT_EQ(coordinator->GetReason(), ShutdownReason::kNone); + EXPECT_TRUE(coordinator->IsRunning()); + EXPECT_FALSE(coordinator->IsShuttingDown()); + EXPECT_FALSE(coordinator->IsShutdown()); + EXPECT_FALSE(coordinator->ShouldEarlyExit()); +} + +TEST_F(ShutdownCoordinatorTest, RequestShutdown_IdempotentBehavior) { + auto coordinator = CreateCoordinator(); + + // First graceful request should succeed + EXPECT_TRUE(coordinator->RequestShutdown( + false, ShutdownReason::kGracefulExit, "test_graceful")); + const auto state = coordinator->GetState(); + EXPECT_TRUE(state == ShutdownState::kDisconnecting 
|| + state == ShutdownState::kShutdown); + EXPECT_EQ(coordinator->GetReason(), ShutdownReason::kGracefulExit); + + // A second graceful request should be ignored + EXPECT_FALSE( + coordinator->RequestShutdown(false, ShutdownReason::kUserError, "test_graceful2")); + EXPECT_EQ(coordinator->GetReason(), + ShutdownReason::kGracefulExit); // Reason is unchanged + + // A force-kill request should succeed and override the graceful one + EXPECT_TRUE( + coordinator->RequestShutdown(true, ShutdownReason::kForcedExit, "test_force")); + EXPECT_EQ(coordinator->GetState(), ShutdownState::kShutdown); + EXPECT_EQ(coordinator->GetReason(), ShutdownReason::kForcedExit); // Reason is updated +} + +TEST_F(ShutdownCoordinatorTest, RequestShutdown_DelegatesToGraceful_OnlyFirstSucceeds) { + auto coordinator = CreateCoordinator(); + + EXPECT_TRUE(coordinator->RequestShutdown(false, ShutdownReason::kUserError)); + const auto state = coordinator->GetState(); + EXPECT_TRUE(state == ShutdownState::kShuttingDown || + state == ShutdownState::kDisconnecting); + EXPECT_EQ(coordinator->GetReason(), ShutdownReason::kUserError); + + // Second call should fail + EXPECT_FALSE(coordinator->RequestShutdown(false, ShutdownReason::kForcedExit)); + EXPECT_EQ(coordinator->GetReason(), ShutdownReason::kUserError); // unchanged +} + +TEST_F(ShutdownCoordinatorTest, + RequestShutdown_Graceful_SetsDisconnecting_ThenTryTransitionToShutdown_Succeeds) { + auto coordinator = std::make_unique( + std::make_unique(), rpc::WorkerType::WORKER); + + // Running -> ShuttingDown -> Disconnecting + EXPECT_TRUE( + coordinator->RequestShutdown(false /*graceful*/, ShutdownReason::kGracefulExit)); + + // worker path enters Disconnecting and requires explicit final step. 
+ EXPECT_EQ(coordinator->GetState(), ShutdownState::kDisconnecting); + EXPECT_EQ(coordinator->GetReason(), ShutdownReason::kGracefulExit); + + // Disconnecting -> Shutdown + EXPECT_TRUE(coordinator->RequestShutdown(true, ShutdownReason::kForcedExit)); + EXPECT_EQ(coordinator->GetState(), ShutdownState::kShutdown); + + // Further transitions are no-ops. + EXPECT_FALSE(coordinator->RequestShutdown(false, ShutdownReason::kGracefulExit)); + EXPECT_FALSE(coordinator->RequestShutdown(true, ShutdownReason::kForcedExit)); +} + +TEST_F(ShutdownCoordinatorTest, ForceShutdown_TransitionsDirectlyToShutdown) { + auto coordinator = CreateCoordinator(); + + // Running -> Shutdown (completes immediately with mocked dependencies) + EXPECT_TRUE(coordinator->RequestShutdown(true, // force + ShutdownReason::kForcedExit)); + + // Already in shutdown state, manual transition should fail + EXPECT_FALSE(coordinator->RequestShutdown(true, ShutdownReason::kForcedExit)); + EXPECT_EQ(coordinator->GetState(), ShutdownState::kShutdown); +} + +TEST_F(ShutdownCoordinatorTest, + RequestShutdown_Graceful_OnlyOneInitiatorUnderConcurrency) { + auto coordinator = CreateCoordinator(); + + constexpr int num_threads = 10; + std::atomic success_count{0}; + std::vector threads; + + // Launch multiple threads trying to initiate shutdown + for (int i = 0; i < num_threads; ++i) { + threads.emplace_back([&coordinator, &success_count, i]() { + if (coordinator->RequestShutdown(false, // graceful + ShutdownReason::kGracefulExit, + "thread_" + std::to_string(i))) { + success_count.fetch_add(1); + } + }); + } + + // Wait for all threads + for (auto &thread : threads) { + thread.join(); + } + + // Only one thread should have succeeded + EXPECT_EQ(success_count.load(), 1); + const auto state = coordinator->GetState(); + EXPECT_TRUE(state == ShutdownState::kShuttingDown || + state == ShutdownState::kDisconnecting); + EXPECT_EQ(coordinator->GetReason(), ShutdownReason::kGracefulExit); +} + 
+TEST_F(ShutdownCoordinatorTest, Driver_GracefulReasonRecorded) { + auto coordinator = CreateCoordinator(rpc::WorkerType::DRIVER); + + EXPECT_TRUE(coordinator->RequestShutdown(false, // graceful + ShutdownReason::kGracefulExit)); + + EXPECT_EQ(coordinator->GetReason(), ShutdownReason::kGracefulExit); +} + +TEST_F(ShutdownCoordinatorTest, Driver_ForceReasonRecorded) { + auto coordinator = CreateCoordinator(rpc::WorkerType::DRIVER); + + EXPECT_TRUE(coordinator->RequestShutdown(true, // force + ShutdownReason::kForcedExit)); + + EXPECT_EQ(coordinator->GetReason(), ShutdownReason::kForcedExit); +} + +TEST_F(ShutdownCoordinatorTest, Worker_GracefulInitiates) { + auto coordinator = CreateCoordinator(rpc::WorkerType::WORKER); + + EXPECT_TRUE(coordinator->RequestShutdown(false, // graceful + ShutdownReason::kGracefulExit)); +} + +TEST_F(ShutdownCoordinatorTest, Worker_ExecuteWorkerExit_OnUserError) { + auto coordinator = CreateCoordinator(rpc::WorkerType::WORKER); + + EXPECT_TRUE(coordinator->RequestShutdown(false, // graceful + ShutdownReason::kUserError)); +} + +TEST_F(ShutdownCoordinatorTest, Worker_HandleExit_OnIdleTimeout) { + auto coordinator = CreateCoordinator(rpc::WorkerType::WORKER); + + EXPECT_TRUE(coordinator->RequestShutdown(false, // graceful + ShutdownReason::kIdleTimeout)); +} + +TEST_F(ShutdownCoordinatorTest, StringRepresentations_StateAndReason_AreReadable) { + auto coordinator = CreateCoordinator(); + + EXPECT_EQ(coordinator->GetStateString(), "Running"); + EXPECT_EQ(coordinator->GetReasonString(), "None"); + + coordinator->RequestShutdown(false, ShutdownReason::kGracefulExit); // graceful + + EXPECT_EQ(coordinator->GetStateString(), "Disconnecting"); + EXPECT_EQ(coordinator->GetReasonString(), "GracefulExit"); + + coordinator->RequestShutdown(true, ShutdownReason::kForcedExit); + EXPECT_EQ(coordinator->GetStateString(), "Shutdown"); +} + +TEST_F(ShutdownCoordinatorTest, ExitTypeStringMapping_UserError_IsUSER_ERROR) { + auto coordinator = 
CreateCoordinator(); + coordinator->RequestShutdown(false, ShutdownReason::kUserError); + EXPECT_EQ(coordinator->GetExitTypeString(), "USER_ERROR"); +} + +TEST_F(ShutdownCoordinatorTest, ExitTypeStringMapping_OOM_IsNODE_OUT_OF_MEMORY) { + auto coordinator = CreateCoordinator(); + coordinator->RequestShutdown(false, ShutdownReason::kOutOfMemory); + EXPECT_EQ(coordinator->GetExitTypeString(), "NODE_OUT_OF_MEMORY"); +} + +TEST_F(ShutdownCoordinatorTest, + ExitTypeStringMapping_IdleTimeout_IsINTENDED_SYSTEM_EXIT) { + auto coordinator = CreateCoordinator(); + coordinator->RequestShutdown(false, ShutdownReason::kIdleTimeout); + EXPECT_EQ(coordinator->GetExitTypeString(), "INTENDED_SYSTEM_EXIT"); +} + +TEST_F(ShutdownCoordinatorTest, ShouldEarlyExit_MemoryOrdering_ConcurrentVisibility) { + auto coordinator = CreateCoordinator(); + + std::atomic thread1_saw_shutdown{false}; + std::atomic thread2_saw_shutdown{false}; + + std::thread thread1([&coordinator, &thread1_saw_shutdown]() { + coordinator->RequestShutdown(false, ShutdownReason::kGracefulExit); // graceful + thread1_saw_shutdown.store(true); + }); + + std::thread thread2([&coordinator, &thread2_saw_shutdown]() { + while (!coordinator->ShouldEarlyExit()) { + std::this_thread::yield(); + } + thread2_saw_shutdown.store(true); + }); + + thread1.join(); + thread2.join(); + + // Both threads should have seen the shutdown state + EXPECT_TRUE(thread1_saw_shutdown.load()); + EXPECT_TRUE(thread2_saw_shutdown.load()); + EXPECT_TRUE(coordinator->ShouldEarlyExit()); +} + +TEST_F(ShutdownCoordinatorTest, Concurrent_GracefulVsForce_ForceExecutesOnce) { + auto fake = std::make_unique(); + auto *fake_ptr = fake.get(); + auto coordinator = + std::make_unique(std::move(fake), rpc::WorkerType::WORKER); + + std::thread t1([&] { + coordinator->RequestShutdown(false, ShutdownReason::kGracefulExit, "graceful"); + }); + std::thread t2( + [&] { coordinator->RequestShutdown(true, ShutdownReason::kForcedExit, "force"); }); + t1.join(); + 
t2.join(); + + EXPECT_EQ(coordinator->GetState(), ShutdownState::kShutdown); + EXPECT_EQ(coordinator->GetReason(), ShutdownReason::kForcedExit); + EXPECT_EQ(fake_ptr->force_calls.load(), 1); + EXPECT_LE(fake_ptr->graceful_calls.load(), 1); +} + +TEST_F(ShutdownCoordinatorTest, Concurrent_DoubleForce_ForceExecutesOnce) { + auto fake = std::make_unique(); + auto *fake_ptr = fake.get(); + auto coordinator = + std::make_unique(std::move(fake), rpc::WorkerType::WORKER); + + std::thread t1( + [&] { coordinator->RequestShutdown(true, ShutdownReason::kForcedExit, "force1"); }); + std::thread t2( + [&] { coordinator->RequestShutdown(true, ShutdownReason::kForcedExit, "force2"); }); + t1.join(); + t2.join(); + + EXPECT_EQ(coordinator->GetState(), ShutdownState::kShutdown); + EXPECT_EQ(coordinator->GetReason(), ShutdownReason::kForcedExit); + // Verify that only one forced shutdown was called + EXPECT_EQ(fake_ptr->force_calls.load(), 1); + EXPECT_EQ(fake_ptr->graceful_calls.load(), 0); + EXPECT_TRUE(fake_ptr->GetLastDetail() == "force1" || + fake_ptr->GetLastDetail() == "force2"); +} + +} // namespace core +} // namespace ray diff --git a/src/ray/core_worker/test/task_event_buffer_export_event_test.cc b/src/ray/core_worker/tests/task_event_buffer_export_event_test.cc similarity index 96% rename from src/ray/core_worker/test/task_event_buffer_export_event_test.cc rename to src/ray/core_worker/tests/task_event_buffer_export_event_test.cc index e3f76d3a1e83..cf2e6e7203f2 100644 --- a/src/ray/core_worker/test/task_event_buffer_export_event_test.cc +++ b/src/ray/core_worker/tests/task_event_buffer_export_event_test.cc @@ -21,14 +21,11 @@ #include #include -#include "absl/base/thread_annotations.h" -#include "absl/synchronization/mutex.h" #include "absl/types/optional.h" #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "mock/ray/gcs/gcs_client/gcs_client.h" -#include "ray/common/task/task_spec.h" -#include "ray/common/test_util.h" +#include 
"mock/ray/gcs_client/gcs_client.h" +#include "ray/common/test_utils.h" #include "ray/core_worker/task_event_buffer.h" #include "ray/util/event.h" @@ -63,13 +60,15 @@ class TaskEventTestWriteExport : public ::testing::Test { "task_events_send_batch_size": 100, "export_task_events_write_batch_size": 1, "task_events_max_num_export_status_events_buffer_on_worker": 15, - "enable_export_api_write": true + "enable_export_api_write": true, + "enable_core_worker_ray_event_to_aggregator": false } )"); task_event_buffer_ = std::make_unique( std::make_unique(), - std::make_unique()); + std::make_unique(), + "test_session_name"); } virtual void SetUp() { RAY_CHECK_OK(task_event_buffer_->Start(/*auto_flush*/ false)); } @@ -99,6 +98,7 @@ class TaskEventTestWriteExport : public ::testing::Test { rpc::TaskStatus::RUNNING, running_ts, /*is_actor_task_event=*/false, + "test_session_name", nullptr, state_update); } diff --git a/src/ray/core_worker/test/task_event_buffer_test.cc b/src/ray/core_worker/tests/task_event_buffer_test.cc similarity index 74% rename from src/ray/core_worker/test/task_event_buffer_test.cc rename to src/ray/core_worker/tests/task_event_buffer_test.cc index 462a940c3dab..58edefd13604 100644 --- a/src/ray/core_worker/test/task_event_buffer_test.cc +++ b/src/ray/core_worker/tests/task_event_buffer_test.cc @@ -31,10 +31,10 @@ #include "absl/types/optional.h" #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "mock/ray/gcs/gcs_client/gcs_client.h" +#include "mock/ray/gcs_client/gcs_client.h" #include "ray/common/task/task_spec.h" #include "ray/common/task/task_util.h" -#include "ray/common/test_util.h" +#include "ray/common/test_utils.h" #include "ray/util/event.h" using ::testing::_; @@ -91,7 +91,8 @@ class TaskEventBufferTest : public ::testing::Test { task_event_buffer_ = std::make_unique( std::make_unique(), - std::make_unique()); + std::make_unique(), + "test_session_name"); } virtual void SetUp() { RAY_CHECK_OK(task_event_buffer_->Start(/*auto_flush*/ 
false)); } @@ -156,6 +157,7 @@ class TaskEventBufferTest : public ::testing::Test { rpc::TaskStatus::RUNNING, 1, /*is_actor_task_event=*/false, + "test_session_name", std::make_shared(task_spec), status_update); } @@ -172,13 +174,21 @@ class TaskEventBufferTest : public ::testing::Test { rpc::TaskStatus::RUNNING, running_ts, /*is_actor_task_event=*/false, + "test_session_name", nullptr, state_update); } std::unique_ptr GenProfileTaskEvent(TaskID task_id, int32_t attempt_num) { - return std::make_unique( - task_id, JobID::FromInt(0), attempt_num, "", "", "", "test_event", 1); + return std::make_unique(task_id, + JobID::FromInt(0), + attempt_num, + "", + "", + "", + "test_event", + 1, + "test_session_name"); } static void CompareTaskEventData(const rpc::TaskEventData &actual_data, @@ -423,16 +433,21 @@ TEST_P(TaskEventBufferTestDifferentDestination, TestFlushEvents) { auto event = expected_task_event_data.add_events_by_task(); task_event->ToRpcTaskEvents(event); - RayEventsPair ray_events_pair; - task_event->ToRpcRayEvents(ray_events_pair); - auto [task_definition_event, task_execution_event] = ray_events_pair; + RayEventsTuple ray_events_tuple; + task_event->ToRpcRayEvents(ray_events_tuple); + auto [task_definition_event, task_execution_event, task_profile_event] = + ray_events_tuple; if (task_definition_event) { - auto event = expected_ray_events_data.add_events(); - *event = std::move(task_definition_event.value()); + auto new_event = expected_ray_events_data.add_events(); + *new_event = std::move(task_definition_event.value()); } if (task_execution_event) { - auto event = expected_ray_events_data.add_events(); - *event = std::move(task_execution_event.value()); + auto new_event = expected_ray_events_data.add_events(); + *new_event = std::move(task_execution_event.value()); + } + if (task_profile_event) { + auto new_event = expected_ray_events_data.add_events(); + *new_event = std::move(task_profile_event.value()); } } @@ -738,16 +753,21 @@ 
TEST_P(TaskEventBufferTestLimitBufferDifferentDestination, *static_cast(event_ptr.get())); event->ToRpcTaskEvents(expect_event); - RayEventsPair ray_events_pair; - event->ToRpcRayEvents(ray_events_pair); - auto [task_definition_event, task_execution_event] = ray_events_pair; + RayEventsTuple ray_events_tuple; + event->ToRpcRayEvents(ray_events_tuple); + auto [task_definition_event, task_execution_event, task_profile_event] = + ray_events_tuple; if (task_definition_event) { - auto event = expected_ray_events_data.add_events(); - *event = std::move(task_definition_event.value()); + auto new_event = expected_ray_events_data.add_events(); + *new_event = std::move(task_definition_event.value()); } if (task_execution_event) { - auto event = expected_ray_events_data.add_events(); - *event = std::move(task_execution_event.value()); + auto new_event = expected_ray_events_data.add_events(); + *new_event = std::move(task_execution_event.value()); + } + if (task_profile_event) { + auto new_event = expected_ray_events_data.add_events(); + *new_event = std::move(task_profile_event.value()); } } @@ -914,6 +934,222 @@ TEST_F(TaskEventBufferTest, TestGracefulDestruction) { delete task_event_buffer_.release(); } +TEST_F(TaskEventBufferTest, TestTaskProfileEventToRpcRayEvents) { + auto task_id = RandomTaskId(); + auto job_id = JobID::FromInt(123); + int32_t attempt_number = 1; + std::string component_type = "core_worker"; + std::string component_id = "worker_123"; + std::string node_ip = "192.168.1.1"; + std::string event_name = "test_profile_event"; + int64_t start_time = 1000; + + auto profile_event = std::make_unique(task_id, + job_id, + attempt_number, + component_type, + component_id, + node_ip, + event_name, + start_time, + "test_session_name"); + + // Set end time and extra data to test full population + profile_event->SetEndTime(2000); + profile_event->SetExtraData("test_extra_data"); + + RayEventsTuple ray_events_tuple; + profile_event->ToRpcRayEvents(ray_events_tuple); + + 
auto &[task_definition_event, task_execution_event, task_profile_event] = + ray_events_tuple; + + // Verify that the second event is nullopt (empty) + EXPECT_FALSE(task_definition_event.has_value()) + << "TaskProfileEvent should be populated at the third element of RayEventsTuple"; + EXPECT_FALSE(task_execution_event.has_value()) + << "TaskProfileEvent should be populated at the third element of RayEventsTuple"; + + // Verify that the first event contains the profile event + ASSERT_TRUE(task_profile_event.has_value()) + << "TaskProfileEvent should populate third element of RayEventsTuple"; + + const auto &ray_event = task_profile_event.value(); + + // Verify base fields + EXPECT_EQ(ray_event.source_type(), rpc::events::RayEvent::CORE_WORKER); + EXPECT_EQ(ray_event.event_type(), rpc::events::RayEvent::TASK_PROFILE_EVENT); + EXPECT_EQ(ray_event.severity(), rpc::events::RayEvent::INFO); + EXPECT_FALSE(ray_event.event_id().empty()); + EXPECT_EQ(ray_event.session_name(), "test_session_name"); + + // Verify task profile events are populated + ASSERT_TRUE(ray_event.has_task_profile_events()); + const auto &task_profile_events = ray_event.task_profile_events(); + + EXPECT_EQ(task_profile_events.task_id(), task_id.Binary()); + EXPECT_EQ(task_profile_events.job_id(), job_id.Binary()); + EXPECT_EQ(task_profile_events.attempt_number(), attempt_number); + + // Verify profile event + ASSERT_TRUE(task_profile_events.has_profile_events()); + const auto &profile_events = task_profile_events.profile_events(); + + EXPECT_EQ(profile_events.component_type(), component_type); + EXPECT_EQ(profile_events.component_id(), component_id); + EXPECT_EQ(profile_events.node_ip_address(), node_ip); + + // Verify event entry + ASSERT_EQ(profile_events.events_size(), 1); + const auto &event_entry = profile_events.events(0); + + EXPECT_EQ(event_entry.event_name(), event_name); + EXPECT_EQ(event_entry.start_time(), start_time); + EXPECT_EQ(event_entry.end_time(), 2000); + 
EXPECT_EQ(event_entry.extra_data(), "test_extra_data"); +} + +TEST_F(TaskEventBufferTest, TestCreateRayEventsDataWithProfileEvents) { + // Test that CreateRayEventsDataToSend correctly handles profile events + // by only including the first element of RayEventsPair + + auto task_id = RandomTaskId(); + auto job_id = JobID::FromInt(456); + int32_t attempt_number = 2; + + // Create a profile event + auto profile_event = std::make_unique(task_id, + job_id, + attempt_number, + "core_worker", + "worker_456", + "192.168.1.2", + "profile_test", + 5000, + "test_session_name"); + profile_event->SetEndTime(6000); + + absl::flat_hash_map agg_ray_events; + TaskAttempt task_attempt = std::make_pair(task_id, attempt_number); + + // Populate the ray events pair + RayEventsTuple ray_events_tuple; + profile_event->ToRpcRayEvents(ray_events_tuple); + agg_ray_events[task_attempt] = std::move(ray_events_tuple); + + // Create the data using the real implementation + absl::flat_hash_set dropped_task_attempts; + auto ray_events_data = task_event_buffer_->CreateRayEventsDataToSend( + std::move(agg_ray_events), dropped_task_attempts); + + // Verify that exactly one event was added (only the profile event, not the nullopt + // second) + ASSERT_EQ(ray_events_data->events_size(), 1); + + const auto &event = ray_events_data->events(0); + EXPECT_EQ(event.event_type(), rpc::events::RayEvent::TASK_PROFILE_EVENT); + EXPECT_EQ(event.session_name(), "test_session_name"); + EXPECT_TRUE(event.has_task_profile_events()); + + const auto &task_profile_events = event.task_profile_events(); + EXPECT_EQ(task_profile_events.task_id(), task_id.Binary()); + EXPECT_EQ(task_profile_events.job_id(), job_id.Binary()); + EXPECT_EQ(task_profile_events.attempt_number(), attempt_number); +} + +TEST_P(TaskEventBufferTestDifferentDestination, + TestMixedStatusAndProfileEventsToRayEvents) { + // Test that a mix of status events and profile events are correctly handled + const auto [to_gcs, to_aggregator] = GetParam(); + + 
// Generate the task id and job id + auto task_id = RandomTaskId(); + auto job_id = JobID::FromInt(789); + + // Create a status event (should populate both elements of RayEventsPair) + auto status_event = GenStatusTaskEvent(task_id, 1, 1000); + + // Create a profile event (should populate only first element) + auto profile_event = std::make_unique(task_id, + job_id, + 1, + "core_worker", + "worker_789", + "192.168.1.3", + "mixed_test", + 7000, + "test_session_name"); + // Expect data flushed match. Generate the expected data + rpc::TaskEventData expected_task_event_data; + rpc::events::RayEventsData expected_ray_events_data; + auto event = expected_task_event_data.add_events_by_task(); + status_event->ToRpcTaskEvents(event); + profile_event->ToRpcTaskEvents(event); + + RayEventsTuple ray_events_tuple; + status_event->ToRpcRayEvents(ray_events_tuple); + profile_event->ToRpcRayEvents(ray_events_tuple); + auto [task_definition_event, task_execution_event, task_profile_event] = + ray_events_tuple; + if (task_definition_event) { + auto new_event = expected_ray_events_data.add_events(); + *new_event = std::move(task_definition_event.value()); + } + if (task_execution_event) { + auto new_event = expected_ray_events_data.add_events(); + *new_event = std::move(task_execution_event.value()); + } + if (task_profile_event) { + auto new_event = expected_ray_events_data.add_events(); + *new_event = std::move(task_profile_event.value()); + } + + // Add Events to the task event buffer + task_event_buffer_->AddTaskEvent(std::move(status_event)); + task_event_buffer_->AddTaskEvent(std::move(profile_event)); + ASSERT_EQ(task_event_buffer_->GetNumTaskEventsStored(), 2); + + // Manually call flush should call GCS client's flushing grpc. 
+ auto task_gcs_accessor = + static_cast(task_event_buffer_->GetGcsClient()) + ->mock_task_accessor; + if (to_gcs) { + EXPECT_CALL(*task_gcs_accessor, AsyncAddTaskEventData(_, _)) + .WillOnce([&](std::unique_ptr actual_data, + ray::gcs::StatusCallback callback) { + CompareTaskEventData(*actual_data, expected_task_event_data); + return Status::OK(); + }); + } else { + EXPECT_CALL(*task_gcs_accessor, AsyncAddTaskEventData(_, _)).Times(0); + } + + // If ray events to aggregator is enabled, expect to call AddEvents grpc. + auto event_aggregator_client = static_cast( + task_event_buffer_->event_aggregator_client_.get()); + rpc::events::AddEventsRequest add_events_request; + if (to_aggregator) { + rpc::events::AddEventsReply reply; + Status status = Status::OK(); + EXPECT_CALL(*event_aggregator_client, AddEvents(_, _)) + .WillOnce(DoAll( + Invoke([&](const rpc::events::AddEventsRequest &request, + const rpc::ClientCallback &callback) { + CompareRayEventsData(request.events_data(), expected_ray_events_data); + }), + MakeAction( + new MockEventAggregatorAddEvents(std::move(status), std::move(reply))))); + } else { + EXPECT_CALL(*event_aggregator_client, AddEvents(_, _)).Times(0); + } + + // Flush events + task_event_buffer_->FlushEvents(false); + + // Expect no more events. 
+ ASSERT_EQ(task_event_buffer_->GetNumTaskEventsStored(), 0); +} + INSTANTIATE_TEST_SUITE_P(TaskEventBufferTest, TaskEventBufferTestDifferentDestination, ::testing::Values(DifferentDestination{true, true}, diff --git a/src/ray/core_worker/test/task_manager_test.cc b/src/ray/core_worker/tests/task_manager_test.cc similarity index 90% rename from src/ray/core_worker/test/task_manager_test.cc rename to src/ray/core_worker/tests/task_manager_test.cc index 9be48562a797..b68c1acbe6c6 100644 --- a/src/ray/core_worker/test/task_manager_test.cc +++ b/src/ray/core_worker/tests/task_manager_test.cc @@ -20,17 +20,18 @@ #include #include +#include "fakes/ray/pubsub/subscriber.h" #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "mock/ray/gcs/gcs_client/gcs_client.h" +#include "mock/ray/gcs_client/gcs_client.h" #include "mock/ray/pubsub/publisher.h" -#include "mock/ray/pubsub/subscriber.h" #include "ray/common/task/task_spec.h" #include "ray/common/task/task_util.h" -#include "ray/common/test_util.h" +#include "ray/common/test_utils.h" #include "ray/core_worker/reference_count.h" #include "ray/core_worker/store_provider/memory_store/memory_store.h" #include "ray/core_worker/task_event_buffer.h" +#include "ray/observability/fake_metric.h" namespace ray { namespace core { @@ -120,6 +121,20 @@ class MockTaskEventBuffer : public worker::TaskEventBuffer { MOCK_METHOD(bool, Enabled, (), (const, override)); MOCK_METHOD(std::string, DebugString, (), (override)); + + MOCK_METHOD( + bool, + RecordTaskStatusEventIfNeeded, + (const TaskID &task_id, + const JobID &job_id, + int32_t attempt_number, + const TaskSpecification &spec, + rpc::TaskStatus status, + bool include_task_info, + std::optional state_update), + (override)); + + MOCK_METHOD(std::string, GetSessionName, (), (const, override)); }; class TaskManagerTest : public ::testing::Test { @@ -129,7 +144,7 @@ class TaskManagerTest : public ::testing::Test { : lineage_pinning_enabled_(lineage_pinning_enabled), 
addr_(GetRandomWorkerAddr()), publisher_(std::make_shared()), - subscriber_(std::make_shared()), + subscriber_(std::make_shared()), task_event_buffer_mock_(std::make_unique()), mock_gcs_client_(std::make_shared()), reference_counter_(std::make_shared( @@ -146,12 +161,11 @@ class TaskManagerTest : public ::testing::Test { *reference_counter_, [this](const RayObject &object, const ObjectID &object_id) { stored_in_plasma.insert(object_id); + return Status::OK(); }, - [this](TaskSpecification &spec, bool object_recovery, uint32_t delay_ms) { + [this](TaskSpecification &spec, uint32_t delay_ms) { num_retries_++; last_delay_ms_ = delay_ms; - last_object_recovery_ = object_recovery; - return Status::OK(); }, [this](const TaskSpecification &spec) { return this->did_queue_generator_resubmit_; @@ -166,7 +180,8 @@ class TaskManagerTest : public ::testing::Test { -> std::shared_ptr { return nullptr; }, - mock_gcs_client_) {} + mock_gcs_client_, + fake_task_by_state_counter_) {} virtual void TearDown() { AssertNoLeaks(); } @@ -201,7 +216,7 @@ class TaskManagerTest : public ::testing::Test { bool did_queue_generator_resubmit_ = false; rpc::Address addr_; std::shared_ptr publisher_; - std::shared_ptr subscriber_; + std::shared_ptr subscriber_; std::unique_ptr task_event_buffer_mock_; std::shared_ptr mock_gcs_client_; std::shared_ptr reference_counter_; @@ -211,8 +226,8 @@ class TaskManagerTest : public ::testing::Test { TaskManager manager_; int num_retries_ = 0; uint32_t last_delay_ms_ = 0; - bool last_object_recovery_ = false; std::unordered_set stored_in_plasma; + ray::observability::FakeMetric fake_task_by_state_counter_; }; class TaskManagerLineageTest : public TaskManagerTest { @@ -220,6 +235,19 @@ class TaskManagerLineageTest : public TaskManagerTest { TaskManagerLineageTest() : TaskManagerTest(true, /*max_lineage_bytes=*/10000) {} }; +TEST_F(TaskManagerTest, TestRecordMetrics) { + rpc::Address caller_address; + auto spec = CreateTaskHelper(1, {}); + 
manager_.AddPendingTask(caller_address, spec, ""); + manager_.RecordMetrics(); + auto tag_to_value = fake_task_by_state_counter_.GetTagToValue(); + ASSERT_EQ(tag_to_value.size(), 1); // one task state data point + ASSERT_EQ(tag_to_value.begin()->first.at("State"), + rpc::TaskStatus_Name(rpc::TaskStatus::PENDING_ARGS_AVAIL)); + ASSERT_EQ(tag_to_value.begin()->second, 1); // one task in the PENDING_ARGS_AVAIL state + manager_.FailPendingTask(spec.TaskId(), rpc::ErrorType::WORKER_DIED); +} + TEST_F(TaskManagerTest, TestTaskSuccess) { rpc::Address caller_address; ObjectID dep1 = ObjectID::FromRandom(); @@ -415,7 +443,6 @@ TEST_F(TaskManagerTest, TestTaskReconstruction) { ASSERT_FALSE(store_->Get({return_id}, 1, 0, ctx, false, &results).ok()); ASSERT_EQ(num_retries_, i + 1); ASSERT_EQ(last_delay_ms_, RayConfig::instance().task_retry_delay_ms()); - ASSERT_EQ(last_object_recovery_, false); } manager_.FailOrRetryPendingTask(spec.TaskId(), error); @@ -550,13 +577,11 @@ TEST_F(TaskManagerTest, TestTaskOomAndNonOomKillReturnsLastError) { manager_.FailOrRetryPendingTask(spec.TaskId(), error); ASSERT_EQ(num_retries_, 1); ASSERT_EQ(last_delay_ms_, RayConfig::instance().task_oom_retry_delay_base_ms()); - ASSERT_EQ(last_object_recovery_, false); error = rpc::ErrorType::WORKER_DIED; manager_.FailOrRetryPendingTask(spec.TaskId(), error); ASSERT_EQ(num_retries_, 2); ASSERT_EQ(last_delay_ms_, RayConfig::instance().task_retry_delay_ms()); - ASSERT_EQ(last_object_recovery_, false); error = rpc::ErrorType::WORKER_DIED; manager_.FailOrRetryPendingTask(spec.TaskId(), error); @@ -725,7 +750,7 @@ TEST_F(TaskManagerTest, TestLocalityDataAdded) { return_object->set_in_plasma(true); return_object->set_size(object_size); rpc::Address worker_addr; - worker_addr.set_raylet_id(node_id.Binary()); + worker_addr.set_node_id(node_id.Binary()); manager_.AddPendingTask(rpc::Address(), spec, "", 0); manager_.CompletePendingTask(spec.TaskId(), reply, worker_addr, false); } @@ -1058,7 +1083,6 @@ 
TEST_F(TaskManagerLineageTest, TestResubmitTask) { ASSERT_EQ(resubmitted_task_deps, spec.GetDependencyIds()); ASSERT_EQ(num_retries_, 1); ASSERT_EQ(last_delay_ms_, 0); - ASSERT_EQ(last_object_recovery_, true); resubmitted_task_deps.clear(); // The return ID goes out of scope. @@ -1122,7 +1146,6 @@ TEST_F(TaskManagerLineageTest, TestResubmittedTaskNondeterministicReturns) { ASSERT_EQ(manager_.ResubmitTask(spec.TaskId(), &resubmitted_task_deps), std::nullopt); ASSERT_EQ(num_retries_, 1); ASSERT_EQ(last_delay_ms_, 0); - ASSERT_EQ(last_object_recovery_, true); // The re-executed task completes again. One of the return objects is now // returned directly. @@ -1187,7 +1210,6 @@ TEST_F(TaskManagerLineageTest, TestResubmittedTaskFails) { ASSERT_EQ(manager_.ResubmitTask(spec.TaskId(), &resubmitted_task_deps), std::nullopt); ASSERT_EQ(num_retries_, 1); ASSERT_EQ(last_delay_ms_, 0); - ASSERT_EQ(last_object_recovery_, true); // The re-executed task fails due to worker crashed. { @@ -1308,7 +1330,6 @@ TEST_F(TaskManagerLineageTest, TestResubmittedDynamicReturnsTaskFails) { ASSERT_EQ(manager_.ResubmitTask(spec.TaskId(), &resubmitted_task_deps), std::nullopt); ASSERT_EQ(num_retries_, 1); ASSERT_EQ(last_delay_ms_, 0); - ASSERT_EQ(last_object_recovery_, true); // Dereference the generator to a list of its internal ObjectRefs. 
for (const auto &dynamic_return_id : dynamic_return_ids) { @@ -1346,6 +1367,188 @@ TEST_F(TaskManagerLineageTest, TestResubmittedDynamicReturnsTaskFails) { ASSERT_EQ(stored_in_plasma.size(), 3); } +// High-level tests around plasma put failures and retries using a real memory store +TEST_F(TaskManagerTest, PlasmaPut_ObjectStoreFull_FailsTaskAndWritesError) { + auto local_ref_counter = std::make_shared( + addr_, + publisher_.get(), + subscriber_.get(), + /*is_node_dead=*/[this](const NodeID &) { return node_died_; }, + lineage_pinning_enabled_); + auto local_store = std::make_shared(io_context_.GetIoService(), + local_ref_counter.get()); + + TaskManager failing_mgr( + *local_store, + *local_ref_counter, + /*put_in_local_plasma_callback=*/ + [](const RayObject &, const ObjectID &) { + return Status::ObjectStoreFull("simulated"); + }, + [this](TaskSpecification &spec, uint32_t delay_ms) { + num_retries_++; + last_delay_ms_ = delay_ms; + }, + [this](const TaskSpecification &spec) { + return this->did_queue_generator_resubmit_; + }, + [](const JobID &, const std::string &, const std::string &, double) { + return Status::OK(); + }, + /*max_lineage_bytes*/ 1024 * 1024, + *task_event_buffer_mock_.get(), + [](const ActorID &) -> std::shared_ptr { + return nullptr; + }, + mock_gcs_client_, + fake_task_by_state_counter_); + + rpc::Address caller_address; + auto spec = CreateTaskHelper(1, {}); + failing_mgr.AddPendingTask(caller_address, spec, ""); + failing_mgr.MarkDependenciesResolved(spec.TaskId()); + failing_mgr.MarkTaskWaitingForExecution( + spec.TaskId(), NodeID::FromRandom(), WorkerID::FromRandom()); + + rpc::PushTaskReply reply; + auto return_object = reply.add_return_objects(); + auto return_id = spec.ReturnId(0); + return_object->set_object_id(return_id.Binary()); + return_object->set_in_plasma(true); + failing_mgr.CompletePendingTask( + spec.TaskId(), reply, rpc::Address(), /*app_err=*/false); + + ASSERT_FALSE(failing_mgr.IsTaskPending(spec.TaskId())); + 
std::vector> results; + WorkerContext ctx(WorkerType::WORKER, WorkerID::FromRandom(), JobID::FromInt(0)); + RAY_CHECK_OK(local_store->Get({return_id}, 1, 0, ctx, false, &results)); + ASSERT_EQ(results.size(), 1); + ASSERT_TRUE(results[0]->IsException()); +} + +TEST_F(TaskManagerTest, PlasmaPut_TransientFull_RetriesThenSucceeds) { + std::shared_ptr> attempts = std::make_shared>(0); + auto local_ref_counter = std::make_shared( + addr_, + publisher_.get(), + subscriber_.get(), + /*is_node_dead=*/[this](const NodeID &) { return node_died_; }, + lineage_pinning_enabled_); + auto local_store = std::make_shared(io_context_.GetIoService(), + local_ref_counter.get()); + TaskManager retry_mgr( + *local_store, + *local_ref_counter, + /*put_in_local_plasma_callback=*/ + [attempts](const RayObject &, const ObjectID &) { + int n = ++(*attempts); + if (n < 3) { + return Status::TransientObjectStoreFull("retry"); + } + return Status::OK(); + }, + [this](TaskSpecification &spec, uint32_t delay_ms) { + num_retries_++; + last_delay_ms_ = delay_ms; + }, + [this](const TaskSpecification &spec) { + return this->did_queue_generator_resubmit_; + }, + [](const JobID &, const std::string &, const std::string &, double) { + return Status::OK(); + }, + /*max_lineage_bytes*/ 1024 * 1024, + *task_event_buffer_mock_.get(), + [](const ActorID &) -> std::shared_ptr { + return nullptr; + }, + mock_gcs_client_, + fake_task_by_state_counter_); + + rpc::Address caller_address; + auto spec = CreateTaskHelper(1, {}); + retry_mgr.AddPendingTask(caller_address, spec, ""); + retry_mgr.MarkDependenciesResolved(spec.TaskId()); + retry_mgr.MarkTaskWaitingForExecution( + spec.TaskId(), NodeID::FromRandom(), WorkerID::FromRandom()); + + rpc::PushTaskReply reply; + auto return_object = reply.add_return_objects(); + auto return_id = spec.ReturnId(0); + return_object->set_object_id(return_id.Binary()); + return_object->set_in_plasma(true); + retry_mgr.CompletePendingTask(spec.TaskId(), reply, rpc::Address(), 
/*app_err=*/false); + + std::vector> results; + WorkerContext ctx(WorkerType::WORKER, WorkerID::FromRandom(), JobID::FromInt(0)); + RAY_CHECK_OK(local_store->Get({return_id}, 1, 0, ctx, false, &results)); + ASSERT_EQ(results.size(), 1); + ASSERT_TRUE(results[0]->IsInPlasmaError()); +} + +TEST_F(TaskManagerTest, DynamicReturn_PlasmaPutFailure_FailsTaskImmediately) { + bool first_fail_done = false; + auto local_ref_counter = std::make_shared( + addr_, + publisher_.get(), + subscriber_.get(), + /*is_node_dead=*/[this](const NodeID &) { return node_died_; }, + lineage_pinning_enabled_); + auto local_store = std::make_shared(io_context_.GetIoService(), + local_ref_counter.get()); + TaskManager dyn_mgr( + *local_store, + *local_ref_counter, + /*put_in_local_plasma_callback=*/ + [&first_fail_done](const RayObject &, const ObjectID &) { + if (!first_fail_done) { + first_fail_done = true; + return Status::IOError("broken pipe"); + } + return Status::OK(); + }, + [this](TaskSpecification &spec, uint32_t delay_ms) { + num_retries_++; + last_delay_ms_ = delay_ms; + }, + [this](const TaskSpecification &spec) { + return this->did_queue_generator_resubmit_; + }, + [](const JobID &, const std::string &, const std::string &, double) { + return Status::OK(); + }, + /*max_lineage_bytes*/ 1024 * 1024, + *task_event_buffer_mock_.get(), + [](const ActorID &) -> std::shared_ptr { + return nullptr; + }, + mock_gcs_client_, + fake_task_by_state_counter_); + + auto spec = CreateTaskHelper(1, {}, /*dynamic_returns=*/true); + dyn_mgr.AddPendingTask(addr_, spec, "", /*num_retries=*/0); + dyn_mgr.MarkDependenciesResolved(spec.TaskId()); + dyn_mgr.MarkTaskWaitingForExecution( + spec.TaskId(), NodeID::FromRandom(), WorkerID::FromRandom()); + + rpc::PushTaskReply reply; + auto generator_id = spec.ReturnId(0); + auto gen_obj = reply.add_return_objects(); + gen_obj->set_object_id(generator_id.Binary()); + auto data = GenerateRandomBuffer(); + gen_obj->set_data(data->Data(), data->Size()); + for (int 
i = 0; i < 2; i++) { + auto dyn_id = ObjectID::FromIndex(spec.TaskId(), i + 2); + auto dyn_obj = reply.add_dynamic_return_objects(); + dyn_obj->set_object_id(dyn_id.Binary()); + dyn_obj->set_data(data->Data(), data->Size()); + dyn_obj->set_in_plasma(true); + } + + dyn_mgr.CompletePendingTask(spec.TaskId(), reply, rpc::Address(), /*app_err=*/false); + ASSERT_FALSE(dyn_mgr.IsTaskPending(spec.TaskId())); +} + TEST_F(TaskManagerTest, TestObjectRefStreamCreateDelete) { /** * Test create and deletion of stream works. @@ -2360,10 +2563,10 @@ TEST_F(TaskManagerTest, TestObjectRefStreamBackpressure) { bool signal_called = false; ASSERT_TRUE(manager_.HandleReportGeneratorItemReturns( req, - /*execution_signal_callback*/ [&signal_called](Status status, + /*execution_signal_callback*/ [&signal_called](Status callback_status, int64_t num_objects_consumed) { signal_called = true; - ASSERT_TRUE(status.ok()); + ASSERT_TRUE(callback_status.ok()); ASSERT_EQ(num_objects_consumed, 0); })); ASSERT_TRUE(signal_called); @@ -2692,6 +2895,74 @@ TEST_F(TaskManagerTest, TestTaskRetriedOnNodePreemption) { // Cleanup manager_.FailPendingTask(spec.TaskId(), rpc::ErrorType::WORKER_DIED); } + +class PlasmaShutdownRaceTest : public ::testing::Test { + public: + PlasmaShutdownRaceTest() : is_shutting_down_(false) {} + + Status SimulatePlasmaCallback(const ObjectID &object_id, bool simulate_failure) { + if (is_shutting_down_) { + skipped_operations_.insert(object_id); + return Status::OK(); + } + + if (simulate_failure) { + auto status = Status::IOError("Broken pipe"); + if (status.IsIOError() && is_shutting_down_) { + tolerated_operations_.insert(object_id); + return Status::OK(); + } else { + failed_operations_.insert(object_id); + return status; + } + } + + successful_operations_.insert(object_id); + return Status::OK(); + } + + void SetShuttingDown(bool shutting_down) { is_shutting_down_ = shutting_down; } + + protected: + bool is_shutting_down_; + std::unordered_set skipped_operations_; + 
std::unordered_set tolerated_operations_; + std::unordered_set successful_operations_; + std::unordered_set failed_operations_; +}; + +// Test plasma callback behavior during shutdown to prevent RAY_CHECK crashes +TEST_F(PlasmaShutdownRaceTest, PlasmaCallbackHandlesShutdownRaceCondition) { + auto object_id = ObjectID::FromRandom(); + + SetShuttingDown(false); + ASSERT_TRUE(SimulatePlasmaCallback(object_id, false).ok()); + ASSERT_EQ(successful_operations_.count(object_id), 1); + + auto object_id2 = ObjectID::FromRandom(); + auto status = SimulatePlasmaCallback(object_id2, true); + ASSERT_FALSE(status.ok()); + ASSERT_TRUE(status.IsIOError()); + ASSERT_EQ(failed_operations_.count(object_id2), 1); + + auto object_id3 = ObjectID::FromRandom(); + SetShuttingDown(true); + ASSERT_TRUE(SimulatePlasmaCallback(object_id3, false).ok()); + ASSERT_EQ(skipped_operations_.count(object_id3), 1); + + auto object_id4 = ObjectID::FromRandom(); + SetShuttingDown(false); + auto status4 = Status::IOError("Broken pipe"); + SetShuttingDown(true); + + if (status4.IsIOError() && is_shutting_down_) { + tolerated_operations_.insert(object_id4); + } else { + failed_operations_.insert(object_id4); + } + ASSERT_EQ(tolerated_operations_.count(object_id4), 1); +} + } // namespace core } // namespace ray diff --git a/src/ray/design_docs/id_specification.md b/src/ray/design_docs/id_specification.md index e5a4e52368bb..8c56400f7a08 100644 --- a/src/ray/design_docs/id_specification.md +++ b/src/ray/design_docs/id_specification.md @@ -25,14 +25,19 @@ Ray ID Specification | TaskID | index bytes | ObjectID 28B +-----------------------------------------------------------------------+-----------------+ + 4B 28B ++-----------------+-----------------------------------------------------------------------+ +| unique bytes | WorkerID | LeaseID 32B ++-----------------+-----------------------------------------------------------------------+ + ``` #### JobID (4 bytes) `JobID` is generated by `GCS` to ensure 
uniqueness. Its length is 4 bytes. -#### ActorID (8 bytes) +#### ActorID (16 bytes) An `ActorID` contains two parts: 1) 12 unique bytes, and 2) its `JobID`. -#### TaskID (16 bytes) +#### TaskID (24 bytes) A `TaskID` contains two parts: 1) 8 unique bytes, and 2) its `ActorID`. If the task is a normal task or a driver task, the part 2 is its dummy actor id. @@ -58,3 +63,11 @@ An `ObjectID` contains 2 parts: and `n` is added to the `TaskID`'s unique bytes, where `n` is the number of times that task has executed so far. For task returns, the unique bytes are identical to the parent task. + +#### LeaseID (32 bytes) +A `LeaseID` contains 2 parts: +- `unique bytes`: 4 bytes generated via a counter unique to the lease requester +(worker or gcs). +- `WorkerID`: 28 bytes that represent the WorkerID of the lease requester. +In the case of the gcs it's randomly generated. Due to the possibility of GCS +restarts, we can't simply nil them out. diff --git a/src/ray/flatbuffers/node_manager.fbs b/src/ray/flatbuffers/node_manager.fbs index dde5dfac89d6..c6399c0d2aa2 100644 --- a/src/ray/flatbuffers/node_manager.fbs +++ b/src/ray/flatbuffers/node_manager.fbs @@ -103,7 +103,7 @@ table RegisterClientRequest { table RegisterClientReply { success: bool; failure_reason: string; - raylet_id: string; + node_id: string; port: int; } @@ -122,7 +122,7 @@ table AnnounceWorkerPortReply { // Mimics the Address protobuf. table Address { - raylet_id: string; + node_id: string; ip_address: string; port: int; // Optional unique id for the worker. 
diff --git a/src/ray/gcs/BUILD.bazel b/src/ray/gcs/BUILD.bazel index 8c2d190bd421..5493169d3f6b 100644 --- a/src/ray/gcs/BUILD.bazel +++ b/src/ray/gcs/BUILD.bazel @@ -1,57 +1,544 @@ -load("//bazel:ray.bzl", "ray_cc_library") +load("//bazel:ray.bzl", "ray_cc_binary", "ray_cc_library") ray_cc_library( - name = "gcs_redis_client", - srcs = [ - "redis_async_context.cc", - "redis_client.cc", - "redis_context.cc", + name = "gcs_state_util", + srcs = ["state_util.cc"], + hdrs = ["state_util.h"], + deps = [ + "//src/ray/protobuf:gcs_cc_proto", + "@com_google_absl//absl/container:flat_hash_map", ], - hdrs = [ - "redis_async_context.h", - "redis_client.h", - "redis_context.h", +) + +ray_cc_library( + name = "gcs_table_storage", + srcs = ["gcs_table_storage.cc"], + hdrs = ["gcs_table_storage.h"], + deps = [ + "//src/ray/common:asio", + "//src/ray/common:id", + "//src/ray/common:status", + "//src/ray/gcs/store_client", + "//src/ray/protobuf:gcs_cc_proto", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + ], +) + +ray_cc_library( + name = "gcs_init_data", + srcs = ["gcs_init_data.cc"], + hdrs = ["gcs_init_data.h"], + deps = [ + ":gcs_table_storage", + "//src/ray/common:asio", + "//src/ray/common:id", + "//src/ray/protobuf:gcs_cc_proto", + "@com_google_absl//absl/container:flat_hash_map", + ], +) + +ray_cc_library( + name = "gcs_kv_manager", + srcs = ["gcs_kv_manager.cc"], + hdrs = ["gcs_kv_manager.h"], + deps = [ + "//src/ray/common:asio", + "//src/ray/common:status", + "//src/ray/gcs:grpc_service_interfaces", + "//src/ray/protobuf:gcs_cc_proto", + ], +) + +ray_cc_library( + name = "gcs_function_manager", + hdrs = ["gcs_function_manager.h"], + deps = [ + ":gcs_kv_manager", + "//src/ray/common:asio", + "//src/ray/common:constants", + "//src/ray/common:id", + "@com_google_absl//absl/container:flat_hash_map", + ], +) + +ray_cc_library( + name = "gcs_node_manager", + srcs = ["gcs_node_manager.cc"], + hdrs = 
["gcs_node_manager.h"], + deps = [ + ":gcs_init_data", + ":gcs_table_storage", + ":grpc_service_interfaces", + "//src/ray/common:asio", + "//src/ray/common:id", + "//src/ray/common:protobuf_utils", + "//src/ray/common:ray_config", + "//src/ray/protobuf:autoscaler_cc_proto", + "//src/ray/protobuf:gcs_service_cc_proto", + "//src/ray/protobuf:ray_syncer_cc_proto", + "//src/ray/pubsub:gcs_publisher", + "//src/ray/rpc:raylet_client_pool", + "//src/ray/stats:stats_metric", + "//src/ray/util:event", + "//src/ray/util:logging", + "//src/ray/util:time", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + ], +) + +ray_cc_library( + name = "gcs_resource_manager", + srcs = ["gcs_resource_manager.cc"], + hdrs = ["gcs_resource_manager.h"], + deps = [ + ":gcs_init_data", + ":gcs_node_manager", + ":gcs_state_util", + ":grpc_service_interfaces", + "//src/ray/common:asio", + "//src/ray/common:id", + "//src/ray/common:ray_config", + "//src/ray/common:ray_syncer", + "//src/ray/protobuf:gcs_service_cc_proto", + "//src/ray/protobuf:ray_syncer_cc_proto", + "//src/ray/raylet/scheduling:cluster_lease_manager", + "//src/ray/raylet/scheduling:cluster_resource_manager", + "//src/ray/util:logging", + "@com_google_absl//absl/container:flat_hash_map", + ], +) + +ray_cc_library( + name = "gcs_usage_stats_client", + srcs = ["usage_stats_client.cc"], + hdrs = ["usage_stats_client.h"], + deps = [ + ":gcs_kv_manager", + "//src/ray/common:asio", + "//src/ray/protobuf:usage_cc_proto", + ], +) + +ray_cc_library( + name = "gcs_store_client_kv", + srcs = ["store_client_kv.cc"], + hdrs = ["store_client_kv.h"], + deps = [ + ":gcs_kv_manager", + "//src/ray/gcs/store_client", + ], +) + +ray_cc_library( + name = "gcs_pubsub_handler", + srcs = ["pubsub_handler.cc"], + hdrs = ["pubsub_handler.h"], + deps = [ + "//src/ray/gcs:grpc_service_interfaces", + "//src/ray/protobuf:gcs_service_cc_proto", + "//src/ray/pubsub:gcs_publisher", + 
"@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + ], +) + +ray_cc_library( + name = "gcs_runtime_env_handler", + srcs = ["runtime_env_handler.cc"], + hdrs = ["runtime_env_handler.h"], + deps = [ + ":grpc_service_interfaces", + "//src/ray/common:asio", + "//src/ray/common:runtime_env", + "//src/ray/protobuf:gcs_cc_proto", + "//src/ray/util:thread_checker", + "@boost//:asio", + ], +) + +ray_cc_library( + name = "gcs_worker_manager", + srcs = ["gcs_worker_manager.cc"], + hdrs = ["gcs_worker_manager.h"], + deps = [ + ":gcs_kv_manager", + ":gcs_table_storage", + ":gcs_usage_stats_client", + ":grpc_service_interfaces", + "//src/ray/pubsub:gcs_publisher", + "//src/ray/stats:stats_metric", ], +) + +ray_cc_library( + name = "gcs_health_check_manager", + srcs = ["gcs_health_check_manager.cc"], + hdrs = ["gcs_health_check_manager.h"], deps = [ - "//:hiredis", "//src/ray/common:asio", + "//src/ray/common:id", + "//src/ray/common:ray_config", + "//src/ray/stats:stats_metric", + "//src/ray/util:thread_checker", + "@com_github_grpc_grpc//:grpc++", + "@com_github_grpc_grpc//src/proto/grpc/health/v1:health_proto", + "@com_google_absl//absl/container:flat_hash_map", + ], +) + +ray_cc_library( + name = "gcs_ray_event_converter", + srcs = ["gcs_ray_event_converter.cc"], + hdrs = ["gcs_ray_event_converter.h"], + deps = [ + "//src/ray/common:grpc_util", + "//src/ray/common:id", + "//src/ray/protobuf:events_event_aggregator_service_cc_proto", + "//src/ray/protobuf:gcs_service_cc_proto", + "//src/ray/util:logging", + ], +) + +ray_cc_library( + name = "gcs_task_manager", + srcs = ["gcs_task_manager.cc"], + hdrs = ["gcs_task_manager.h"], + deps = [ + ":gcs_ray_event_converter", + ":gcs_usage_stats_client", + ":grpc_service_interfaces", + "//src/ray/common:asio", + "//src/ray/common:id", + "//src/ray/common:protobuf_utils", "//src/ray/common:ray_config", "//src/ray/common:status", + 
"//src/ray/protobuf:events_event_aggregator_service_cc_proto", + "//src/ray/protobuf:gcs_cc_proto", + "//src/ray/stats:stats_metric", + "//src/ray/util:counter_map", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", + ], +) + +ray_cc_library( + name = "gcs_server_io_context_policy", + hdrs = ["gcs_server_io_context_policy.h"], + deps = [ + ":gcs_task_manager", + "//src/ray/common:ray_syncer", + "//src/ray/observability:ray_event_recorder", + "//src/ray/pubsub:gcs_publisher", + "//src/ray/util:array", + "//src/ray/util:type_traits", + ], +) + +ray_cc_library( + name = "gcs_job_manager", + srcs = ["gcs_job_manager.cc"], + hdrs = ["gcs_job_manager.h"], + deps = [ + ":gcs_function_manager", + ":gcs_init_data", + ":gcs_table_storage", + ":grpc_service_interfaces", + "//src/ray/common:protobuf_utils", + "//src/ray/common:runtime_env", + "//src/ray/observability:ray_driver_job_definition_event", + "//src/ray/observability:ray_driver_job_execution_event", + "//src/ray/observability:ray_event_recorder_interface", + "//src/ray/pubsub:gcs_publisher", + "//src/ray/rpc:core_worker_client", + "//src/ray/stats:stats_metric", + "//src/ray/util:event", + "//src/ray/util:thread_checker", + "@com_google_absl//absl/container:flat_hash_map", + ], +) + +ray_cc_library( + name = "gcs_placement_group", + srcs = ["gcs_placement_group.cc"], + hdrs = ["gcs_placement_group.h"], + deps = [ + "//src/ray/common:bundle_spec", + "//src/ray/common:id", + "//src/ray/protobuf:gcs_service_cc_proto", + "//src/ray/stats:stats_lib", + "//src/ray/util:counter_map", + "//src/ray/util:time", + ], +) + +ray_cc_library( + name = "gcs_placement_group_scheduler", + srcs = ["gcs_placement_group_scheduler.cc"], + hdrs = ["gcs_placement_group_scheduler.h"], + deps = [ + ":gcs_node_manager", + ":gcs_placement_group", + ":gcs_table_storage", + "//src/ray/common:asio", + 
"//src/ray/common:id", + "//src/ray/raylet/scheduling:cluster_resource_scheduler", + "//src/ray/raylet/scheduling:scheduling_context", + "//src/ray/rpc:raylet_client_interface", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + ], +) + +ray_cc_library( + name = "gcs_placement_group_manager", + srcs = ["gcs_placement_group_manager.cc"], + hdrs = ["gcs_placement_group_manager.h"], + deps = [ + ":gcs_init_data", + ":gcs_placement_group", + ":gcs_placement_group_scheduler", + ":gcs_resource_manager", + ":gcs_table_storage", + ":gcs_usage_stats_client", + ":grpc_service_interfaces", + "//src/ray/common:asio", + "//src/ray/common:bundle_spec", + "//src/ray/common:id", + "//src/ray/common:ray_config", + "//src/ray/protobuf:gcs_cc_proto", "//src/ray/stats:stats_lib", + "//src/ray/util:counter_map", "//src/ray/util:exponential_backoff", - "//src/ray/util:network_util", - "@boost//:asio", + "@com_google_absl//absl/container:flat_hash_map", ], ) ray_cc_library( - name = "gcs_pb_util", - srcs = ["pb_utils.cc"], - hdrs = ["pb_util.h"], + name = "grpc_service_interfaces", + hdrs = [ + "grpc_service_interfaces.h", + ], + visibility = ["//visibility:private"], deps = [ - "//src/ray/common:constants", + "//src/ray/common:status", + "//src/ray/protobuf:autoscaler_cc_grpc", + "//src/ray/protobuf:gcs_service_cc_grpc", + ], +) + +ray_cc_library( + name = "grpc_services", + srcs = [ + "grpc_services.cc", + ], + hdrs = [ + "grpc_services.h", + ], + visibility = ["//visibility:private"], + deps = [ + ":grpc_service_interfaces", + "//src/ray/common:asio", + "//src/ray/common:id", + "//src/ray/protobuf:autoscaler_cc_grpc", + "//src/ray/protobuf:gcs_service_cc_grpc", + "//src/ray/rpc:grpc_server", + "//src/ray/rpc:server_call", + "@com_github_grpc_grpc//:grpc++", + ], +) + +ray_cc_library( + name = "gcs_actor", + srcs = [ + "gcs_actor.cc", + ], + hdrs = [ + "gcs_actor.h", + ], + deps = [ + "//src/ray/common:id", + 
"//src/ray/common:lease", + "//src/ray/common:task_common", + "//src/ray/common/scheduling:cluster_resource_data", + "//src/ray/protobuf:core_worker_cc_proto", + "//src/ray/protobuf:export_event_cc_proto", + "//src/ray/protobuf:gcs_service_cc_proto", + "//src/ray/util:counter_map", + "//src/ray/util:event", + "//src/ray/util:logging", + ], +) + +ray_cc_library( + name = "gcs_actor_scheduler", + srcs = [ + "gcs_actor_scheduler.cc", + ], + hdrs = [ + "gcs_actor_scheduler.h", + ], + deps = [ + ":gcs_actor", + ":gcs_node_manager", + ":gcs_table_storage", + "//src/ray/common:asio", + "//src/ray/common:id", + "//src/ray/common:ray_config", + "//src/ray/raylet/scheduling:cluster_lease_manager", + "//src/ray/rpc:core_worker_client", + "//src/ray/rpc:raylet_client_interface", + "//src/ray/rpc:raylet_client_pool", + "//src/ray/util:logging", + "//src/ray/util:time", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_googletest//:gtest", + ], +) + +ray_cc_library( + name = "gcs_actor_manager", + srcs = [ + "gcs_actor_manager.cc", + ], + hdrs = [ + "gcs_actor_manager.h", + ], + deps = [ + ":gcs_actor", + ":gcs_actor_scheduler", + ":gcs_function_manager", + ":gcs_init_data", + ":gcs_table_storage", + ":gcs_usage_stats_client", + ":grpc_service_interfaces", + "//src/ray/common:asio", "//src/ray/common:id", + "//src/ray/common:protobuf_utils", "//src/ray/common:ray_config", "//src/ray/common:task_common", - "//src/ray/protobuf:autoscaler_cc_proto", - "//src/ray/protobuf:export_task_event_cc_proto", + "//src/ray/protobuf:gcs_service_cc_proto", + "//src/ray/pubsub:gcs_publisher", + "//src/ray/rpc:core_worker_client", + "//src/ray/stats:stats_lib", + "//src/ray/util:counter_map", + "//src/ray/util:logging", + "//src/ray/util:thread_checker", + "//src/ray/util:time", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_googletest//:gtest", ], ) 
ray_cc_library( - name = "gcs_callback", - hdrs = ["callback.h"], + name = "gcs_autoscaler_state_manager", + srcs = [ + "gcs_autoscaler_state_manager.cc", + ], + hdrs = [ + "gcs_autoscaler_state_manager.h", + ], deps = [ - "//src/ray/common:status", + ":gcs_actor_manager", + ":gcs_init_data", + ":gcs_kv_manager", + ":gcs_node_manager", + ":gcs_placement_group_manager", + ":gcs_state_util", + ":grpc_service_interfaces", + "//src/ray/common:asio", + "//src/ray/common:id", + "//src/ray/common:protobuf_utils", + "//src/ray/common:ray_config", + "//src/ray/protobuf:gcs_cc_proto", + "//src/ray/pubsub:gcs_publisher", + "//src/ray/util:logging", + "//src/ray/util:string_utils", + "//src/ray/util:thread_checker", + "//src/ray/util:time", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_googletest//:gtest", ], ) ray_cc_library( - name = "gcs", + name = "gcs_server_lib", + srcs = [ + "gcs_server.cc", + ], + hdrs = [ + "gcs_server.h", + ], + deps = [ + ":gcs_actor", + ":gcs_actor_manager", + ":gcs_actor_scheduler", + ":gcs_autoscaler_state_manager", + ":gcs_function_manager", + ":gcs_health_check_manager", + ":gcs_init_data", + ":gcs_job_manager", + ":gcs_kv_manager", + ":gcs_node_manager", + ":gcs_placement_group", + ":gcs_placement_group_manager", + ":gcs_placement_group_scheduler", + ":gcs_pubsub_handler", + ":gcs_resource_manager", + ":gcs_runtime_env_handler", + ":gcs_server_io_context_policy", + ":gcs_state_util", + ":gcs_store_client_kv", + ":gcs_table_storage", + ":gcs_task_manager", + ":gcs_usage_stats_client", + ":gcs_worker_manager", + ":grpc_service_interfaces", + ":grpc_services", + "//src/ray/gcs/store_client", + "//src/ray/gcs/store_client:in_memory_store_client", + "//src/ray/gcs/store_client:observable_store_client", + "//src/ray/gcs/store_client:redis_store_client", + "//src/ray/protobuf:autoscaler_cc_grpc", + "//src/ray/protobuf:gcs_service_cc_grpc", + "//src/ray/pubsub:gcs_publisher", + "//src/ray/pubsub:publisher", + 
"//src/ray/raylet/scheduling:scheduler", + "//src/ray/rpc:core_worker_client", + "//src/ray/rpc:grpc_server", + "//src/ray/rpc:metrics_agent_client", + "//src/ray/rpc:raylet_client_lib", + "//src/ray/rpc:raylet_client_pool", + "//src/ray/util:counter_map", + "//src/ray/util:exponential_backoff", + "//src/ray/util:network_util", + "//src/ray/util:thread_checker", + "//src/ray/util:throttler", + "//src/ray/util:time", + "//src/ray/util:type_traits", + "@boost//:bimap", + "@com_google_absl//absl/container:btree", + ], +) + +ray_cc_binary( + name = "gcs_server", + srcs = [ + "gcs_server_main.cc", + ], + visibility = ["//visibility:public"], deps = [ - ":gcs_callback", - ":gcs_pb_util", - ":gcs_redis_client", - "//src/ray/rpc:node_manager_client", + ":gcs_server_lib", + "//src/ray/stats:stats_lib", + "//src/ray/util:event", + "//src/ray/util:raii", + "//src/ray/util:stream_redirection", + "//src/ray/util:stream_redirection_options", + "@com_github_gflags_gflags//:gflags", ], ) diff --git a/src/ray/gcs/gcs_actor.cc b/src/ray/gcs/gcs_actor.cc new file mode 100644 index 000000000000..152294b05a94 --- /dev/null +++ b/src/ray/gcs/gcs_actor.cc @@ -0,0 +1,149 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/gcs/gcs_actor.h" + +#include +#include + +#include "ray/util/logging.h" + +namespace ray { +namespace gcs { + +NodeID GcsActor::GetNodeID() const { + const auto &node_id_binary = actor_table_data_.address().node_id(); + if (node_id_binary.empty()) { + return NodeID::Nil(); + } + return NodeID::FromBinary(node_id_binary); +} + +void GcsActor::UpdateAddress(const rpc::Address &address) { + actor_table_data_.mutable_address()->CopyFrom(address); +} + +const rpc::Address &GcsActor::GetAddress() const { return actor_table_data_.address(); } + +WorkerID GcsActor::GetWorkerID() const { + const auto &address = actor_table_data_.address(); + if (address.worker_id().empty()) { + return WorkerID::Nil(); + } + return WorkerID::FromBinary(address.worker_id()); +} + +WorkerID GcsActor::GetOwnerID() const { + return WorkerID::FromBinary(GetOwnerAddress().worker_id()); +} + +NodeID GcsActor::GetOwnerNodeID() const { + return NodeID::FromBinary(GetOwnerAddress().node_id()); +} + +const rpc::Address &GcsActor::GetOwnerAddress() const { + return actor_table_data_.owner_address(); +} + +void GcsActor::UpdateState(rpc::ActorTableData::ActorState state) { + actor_table_data_.set_state(state); + RefreshMetrics(); +} + +rpc::ActorTableData::ActorState GcsActor::GetState() const { + return actor_table_data_.state(); +} + +ActorID GcsActor::GetActorID() const { + return ActorID::FromBinary(actor_table_data_.actor_id()); +} + +bool GcsActor::IsDetached() const { return actor_table_data_.is_detached(); } + +std::string GcsActor::GetName() const { return actor_table_data_.name(); } + +std::string GcsActor::GetRayNamespace() const { + return actor_table_data_.ray_namespace(); +} + +TaskSpecification GcsActor::GetCreationTaskSpecification() const { + // The task spec is not available when the actor is dead. 
+ RAY_CHECK(actor_table_data_.state() != rpc::ActorTableData::DEAD); + return TaskSpecification(*task_spec_); +} + +const rpc::ActorTableData &GcsActor::GetActorTableData() const { + return actor_table_data_; +} + +rpc::ActorTableData *GcsActor::GetMutableActorTableData() { return &actor_table_data_; } + +void GcsActor::WriteActorExportEvent() const { + /// Verify actor export events should be written to file + /// and then write actor_table_data_ as an export event. + if (!export_event_write_enabled_) { + return; + } + std::shared_ptr export_actor_data_ptr = + std::make_shared(); + + export_actor_data_ptr->set_actor_id(actor_table_data_.actor_id()); + export_actor_data_ptr->set_job_id(actor_table_data_.job_id()); + export_actor_data_ptr->set_state(ConvertActorStateToExport(actor_table_data_.state())); + export_actor_data_ptr->set_is_detached(actor_table_data_.is_detached()); + export_actor_data_ptr->set_name(actor_table_data_.name()); + export_actor_data_ptr->set_pid(actor_table_data_.pid()); + export_actor_data_ptr->set_ray_namespace(actor_table_data_.ray_namespace()); + export_actor_data_ptr->set_serialized_runtime_env( + actor_table_data_.serialized_runtime_env()); + export_actor_data_ptr->set_class_name(actor_table_data_.class_name()); + export_actor_data_ptr->mutable_death_cause()->CopyFrom(actor_table_data_.death_cause()); + export_actor_data_ptr->mutable_required_resources()->insert( + actor_table_data_.required_resources().begin(), + actor_table_data_.required_resources().end()); + export_actor_data_ptr->set_node_id(actor_table_data_.node_id()); + export_actor_data_ptr->set_placement_group_id(actor_table_data_.placement_group_id()); + export_actor_data_ptr->set_repr_name(actor_table_data_.repr_name()); + export_actor_data_ptr->mutable_labels()->insert(task_spec_.get()->labels().begin(), + task_spec_.get()->labels().end()); + export_actor_data_ptr->mutable_label_selector()->insert( + actor_table_data_.label_selector().begin(), + 
actor_table_data_.label_selector().end()); + + RayExportEvent(export_actor_data_ptr).SendEvent(); +} + +rpc::TaskSpec *GcsActor::GetMutableTaskSpec() { return task_spec_.get(); } + +rpc::LeaseSpec *GcsActor::GetMutableLeaseSpec() { + return &lease_spec_->GetMutableMessage(); +} + +const LeaseSpecification &GcsActor::GetLeaseSpecification() const { return *lease_spec_; } + +const ResourceRequest &GcsActor::GetAcquiredResources() const { + return acquired_resources_; +} +void GcsActor::SetAcquiredResources(ResourceRequest &&resource_request) { + acquired_resources_ = std::move(resource_request); +} + +bool GcsActor::GetGrantOrReject() const { return grant_or_reject_; } + +void GcsActor::SetGrantOrReject(bool grant_or_reject) { + grant_or_reject_ = grant_or_reject; +} + +} // namespace gcs +} // namespace ray diff --git a/src/ray/gcs/gcs_actor.h b/src/ray/gcs/gcs_actor.h new file mode 100644 index 000000000000..4ea57bdfebfb --- /dev/null +++ b/src/ray/gcs/gcs_actor.h @@ -0,0 +1,282 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include + +#include "ray/common/id.h" +#include "ray/common/lease/lease_spec.h" +#include "ray/common/scheduling/cluster_resource_data.h" +#include "ray/common/task/task_spec.h" +#include "ray/util/counter_map.h" +#include "ray/util/event.h" +#include "src/ray/protobuf/core_worker.pb.h" +#include "src/ray/protobuf/export_actor_data.pb.h" +#include "src/ray/protobuf/gcs_service.pb.h" + +namespace ray { +namespace gcs { + +/// GcsActor just wraps `ActorTableData` and provides some convenient interfaces to access +/// the fields inside `ActorTableData`. +/// This class is not thread-safe. +class GcsActor { + public: + /// Create a GcsActor by actor_table_data. + /// + /// \param actor_table_data Data of the actor (see gcs.proto). + /// \param counter The counter to report metrics to. + explicit GcsActor( + rpc::ActorTableData actor_table_data, + std::shared_ptr>> + counter) + : actor_table_data_(std::move(actor_table_data)), + counter_(std::move(counter)), + export_event_write_enabled_(IsExportAPIEnabledActor()) { + RefreshMetrics(); + } + + /// Create a GcsActor by actor_table_data and task_spec. + /// This is only for ALIVE actors. + /// + /// \param actor_table_data Data of the actor (see gcs.proto). + /// \param task_spec Task spec of the actor. + /// \param counter The counter to report metrics to. + explicit GcsActor( + rpc::ActorTableData actor_table_data, + rpc::TaskSpec task_spec, + std::shared_ptr>> + counter) + : actor_table_data_(std::move(actor_table_data)), + task_spec_(std::make_unique(std::move(task_spec))), + counter_(std::move(counter)), + export_event_write_enabled_(IsExportAPIEnabledActor()) { + lease_spec_ = std::make_unique(*task_spec_); + RAY_CHECK(actor_table_data_.state() != rpc::ActorTableData::DEAD); + RefreshMetrics(); + } + + /// Create a GcsActor by TaskSpec. + /// + /// \param task_spec Contains the actor creation task specification. + /// \param ray_namespace Namespace of the actor. 
+ /// \param counter The counter to report metrics to. + explicit GcsActor( + rpc::TaskSpec task_spec, + std::string ray_namespace, + std::shared_ptr>> + counter) + : task_spec_(std::make_unique(std::move(task_spec))), + counter_(std::move(counter)), + export_event_write_enabled_(IsExportAPIEnabledActor()) { + RAY_CHECK(task_spec_->type() == TaskType::ACTOR_CREATION_TASK); + const auto &actor_creation_task_spec = task_spec_->actor_creation_task_spec(); + actor_table_data_.set_actor_id(actor_creation_task_spec.actor_id()); + actor_table_data_.set_job_id(task_spec_->job_id()); + actor_table_data_.set_max_restarts(actor_creation_task_spec.max_actor_restarts()); + actor_table_data_.set_num_restarts(0); + actor_table_data_.set_num_restarts_due_to_lineage_reconstruction(0); + + actor_table_data_.mutable_function_descriptor()->CopyFrom( + task_spec_->function_descriptor()); + + actor_table_data_.set_is_detached(actor_creation_task_spec.is_detached()); + actor_table_data_.set_name(actor_creation_task_spec.name()); + actor_table_data_.mutable_owner_address()->CopyFrom(task_spec_->caller_address()); + + actor_table_data_.set_state(rpc::ActorTableData::DEPENDENCIES_UNREADY); + + actor_table_data_.mutable_address()->set_node_id(NodeID::Nil().Binary()); + actor_table_data_.mutable_address()->set_worker_id(WorkerID::Nil().Binary()); + + actor_table_data_.set_ray_namespace(ray_namespace); + if (task_spec_->scheduling_strategy().scheduling_strategy_case() == + rpc::SchedulingStrategy::SchedulingStrategyCase:: + kPlacementGroupSchedulingStrategy) { + actor_table_data_.set_placement_group_id(task_spec_->scheduling_strategy() + .placement_group_scheduling_strategy() + .placement_group_id()); + } + + // Set required resources. 
+ auto resource_map = + GetCreationTaskSpecification().GetRequiredResources().GetResourceMap(); + actor_table_data_.mutable_required_resources()->insert(resource_map.begin(), + resource_map.end()); + + const auto &function_descriptor = task_spec_->function_descriptor(); + switch (function_descriptor.function_descriptor_case()) { + case rpc::FunctionDescriptor::FunctionDescriptorCase::kJavaFunctionDescriptor: + actor_table_data_.set_class_name( + function_descriptor.java_function_descriptor().class_name()); + break; + case rpc::FunctionDescriptor::FunctionDescriptorCase::kPythonFunctionDescriptor: + actor_table_data_.set_class_name( + function_descriptor.python_function_descriptor().class_name()); + break; + default: + // TODO(Alex): Handle the C++ case, which we currently don't have an + // easy equivalent to class_name for. + break; + } + + actor_table_data_.set_serialized_runtime_env( + task_spec_->runtime_env_info().serialized_runtime_env()); + if (task_spec_->call_site().size() > 0) { + actor_table_data_.set_call_site(task_spec_->call_site()); + } + if (task_spec_->label_selector().size() > 0) { + actor_table_data_.mutable_label_selector()->insert( + task_spec_->label_selector().begin(), task_spec_->label_selector().end()); + } + lease_spec_ = std::make_unique(*task_spec_); + RefreshMetrics(); + } + + ~GcsActor() { + // We don't decrement the value when it becomes DEAD because we don't want to + // lose the # of dead actors count when this class is GC'ed. + if (last_metric_state_ && last_metric_state_.value() != rpc::ActorTableData::DEAD) { + RAY_LOG(DEBUG) << "Decrementing state at " + << rpc::ActorTableData::ActorState_Name(last_metric_state_.value()) + << " " << GetActorTableData().class_name(); + counter_->Decrement( + std::make_pair(last_metric_state_.value(), GetActorTableData().class_name())); + } + } + + /// Get the node id on which this actor is created. + NodeID GetNodeID() const; + /// Get the id of the worker on which this actor is created. 
+ WorkerID GetWorkerID() const; + /// Get the actor's owner ID. + WorkerID GetOwnerID() const; + /// Get the node ID of the actor's owner. + NodeID GetOwnerNodeID() const; + /// Get the address of the actor's owner. + const rpc::Address &GetOwnerAddress() const; + + /// Update the `Address` of this actor (see gcs.proto). + void UpdateAddress(const rpc::Address &address); + /// Get the `Address` of this actor. + const rpc::Address &GetAddress() const; + + /// Update the state of this actor and refreshes metrics. Do not update the + /// state of the underlying proto directly via set_state(), otherwise metrics + /// will get out of sync. + void UpdateState(rpc::ActorTableData::ActorState state); + /// Get the state of this gcs actor. + rpc::ActorTableData::ActorState GetState() const; + + /// Get the id of this actor. + ActorID GetActorID() const; + /// Returns whether or not this is a detached actor. + bool IsDetached() const; + /// Get the name of this actor. + std::string GetName() const; + /// Get the namespace of this actor. + std::string GetRayNamespace() const; + /// Get the task specification of this actor. + TaskSpecification GetCreationTaskSpecification() const; + const LeaseSpecification &GetLeaseSpecification() const; + + /// Get the immutable ActorTableData of this actor. + const rpc::ActorTableData &GetActorTableData() const; + /// Get the mutable ActorTableData of this actor. + rpc::ActorTableData *GetMutableActorTableData(); + rpc::TaskSpec *GetMutableTaskSpec(); + rpc::LeaseSpec *GetMutableLeaseSpec(); + /// Write an event containing this actor's ActorTableData + /// to file for the Export API. 
+ void WriteActorExportEvent() const; + // Verify if export events should be written for EXPORT_ACTOR source types + bool IsExportAPIEnabledActor() const { + return IsExportAPIEnabledSourceType( + "EXPORT_ACTOR", + RayConfig::instance().enable_export_api_write(), + RayConfig::instance().enable_export_api_write_config()); + } + + const ResourceRequest &GetAcquiredResources() const; + void SetAcquiredResources(ResourceRequest &&resource_request); + bool GetGrantOrReject() const; + void SetGrantOrReject(bool grant_or_reject); + + private: + void RefreshMetrics() { + auto cur_state = GetState(); + if (last_metric_state_) { + RAY_LOG(DEBUG) << "Swapping state from " + << rpc::ActorTableData::ActorState_Name(last_metric_state_.value()) + << " to " << rpc::ActorTableData::ActorState_Name(cur_state) + << " for : " << GetActorID(); + counter_->Swap( + std::make_pair(last_metric_state_.value(), GetActorTableData().class_name()), + std::make_pair(cur_state, GetActorTableData().class_name())); + } else { + RAY_LOG(DEBUG) << "Incrementing state at " + << rpc::ActorTableData::ActorState_Name(cur_state) << " " + << GetActorTableData().class_name(); + counter_->Increment(std::make_pair(cur_state, GetActorTableData().class_name())); + } + last_metric_state_ = cur_state; + } + + rpc::ExportActorData::ActorState ConvertActorStateToExport( + rpc::ActorTableData::ActorState actor_state) const { + switch (actor_state) { + case rpc::ActorTableData::DEPENDENCIES_UNREADY: + return rpc::ExportActorData::DEPENDENCIES_UNREADY; + case rpc::ActorTableData::PENDING_CREATION: + return rpc::ExportActorData::PENDING_CREATION; + case rpc::ActorTableData::ALIVE: + return rpc::ExportActorData::ALIVE; + case rpc::ActorTableData::RESTARTING: + return rpc::ExportActorData::RESTARTING; + case rpc::ActorTableData::DEAD: + return rpc::ExportActorData::DEAD; + default: + // Unknown rpc::ActorTableData::ActorState value + RAY_LOG(FATAL) << "Invalid value for rpc::ActorTableData::ActorState" + << 
rpc::ActorTableData::ActorState_Name(actor_state); + return rpc::ExportActorData::DEAD; + } + } + + /// The actor meta data which contains the task specification as well as the state of + /// the gcs actor and so on (see gcs.proto). + rpc::ActorTableData actor_table_data_; + const std::unique_ptr task_spec_; + /// Resources acquired by this actor. + ResourceRequest acquired_resources_; + /// Reference to the counter to use for actor state metrics tracking. + std::shared_ptr>> + counter_; + /// Whether the actor's target node only grants or rejects the lease request. + bool grant_or_reject_ = false; + /// The last recorded metric state. + std::optional last_metric_state_; + /// If true, actor events are exported for Export API + bool export_event_write_enabled_ = false; + std::unique_ptr lease_spec_; +}; + +using RestartActorForLineageReconstructionCallback = + std::function)>; +using CreateActorCallback = std::function, const rpc::PushTaskReply &reply, const Status &status)>; + +} // namespace gcs +} // namespace ray diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.cc b/src/ray/gcs/gcs_actor_manager.cc similarity index 89% rename from src/ray/gcs/gcs_server/gcs_actor_manager.cc rename to src/ray/gcs/gcs_actor_manager.cc index c1955578fedf..8c5e69f0853e 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.cc +++ b/src/ray/gcs/gcs_actor_manager.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/gcs_server/gcs_actor_manager.h" +#include "ray/gcs/gcs_actor_manager.h" #include #include @@ -22,9 +22,12 @@ #include #include +#include "ray/common/protobuf_utils.h" #include "ray/common/ray_config.h" -#include "ray/gcs/pb_util.h" +#include "ray/common/task/task_spec.h" #include "ray/stats/metric_defs.h" +#include "ray/util/logging.h" +#include "ray/util/time.h" namespace { /// The error message constructed from below methods is user-facing, so please avoid @@ -77,7 +80,7 @@ const ray::rpc::ActorDeathCause GenWorkerDiedCause( const ray::rpc::ActorDeathCause GenOwnerDiedCause( const ray::gcs::GcsActor *actor, - const WorkerID &owner_id, + const ray::WorkerID &owner_id, const ray::rpc::WorkerExitType disconnect_type, const std::string &disconnect_detail, const std::string &owner_ip_address) { @@ -150,7 +153,7 @@ bool OnInitializeActorShouldLoad(const ray::gcs::GcsInitData &gcs_init_data, } const auto &actor_task_spec = ray::map_find_or_die(actor_task_specs, actor_id); - ActorID root_detached_actor_id = + ray::ActorID root_detached_actor_id = ray::TaskSpecification(actor_task_spec).RootDetachedActorId(); if (root_detached_actor_id.IsNil()) { // owner is job, NOT detached actor, should die with job @@ -179,122 +182,6 @@ bool is_uuid(const std::string &str) { return regex_match(str, e); // note: case sensitive now } -NodeID GcsActor::GetNodeID() const { - const auto &raylet_id_binary = actor_table_data_.address().raylet_id(); - if (raylet_id_binary.empty()) { - return NodeID::Nil(); - } - return NodeID::FromBinary(raylet_id_binary); -} - -void GcsActor::UpdateAddress(const rpc::Address &address) { - actor_table_data_.mutable_address()->CopyFrom(address); -} - -const rpc::Address &GcsActor::GetAddress() const { return actor_table_data_.address(); } - -WorkerID GcsActor::GetWorkerID() const { - const auto &address = actor_table_data_.address(); - if (address.worker_id().empty()) { - return WorkerID::Nil(); - } - return 
WorkerID::FromBinary(address.worker_id()); -} - -WorkerID GcsActor::GetOwnerID() const { - return WorkerID::FromBinary(GetOwnerAddress().worker_id()); -} - -NodeID GcsActor::GetOwnerNodeID() const { - return NodeID::FromBinary(GetOwnerAddress().raylet_id()); -} - -const rpc::Address &GcsActor::GetOwnerAddress() const { - return actor_table_data_.owner_address(); -} - -void GcsActor::UpdateState(rpc::ActorTableData::ActorState state) { - actor_table_data_.set_state(state); - RefreshMetrics(); -} - -rpc::ActorTableData::ActorState GcsActor::GetState() const { - return actor_table_data_.state(); -} - -ActorID GcsActor::GetActorID() const { - return ActorID::FromBinary(actor_table_data_.actor_id()); -} - -bool GcsActor::IsDetached() const { return actor_table_data_.is_detached(); } - -std::string GcsActor::GetName() const { return actor_table_data_.name(); } - -std::string GcsActor::GetRayNamespace() const { - return actor_table_data_.ray_namespace(); -} - -TaskSpecification GcsActor::GetCreationTaskSpecification() const { - // The task spec is not available when the actor is dead. - RAY_CHECK(actor_table_data_.state() != rpc::ActorTableData::DEAD); - return TaskSpecification(*task_spec_); -} - -const rpc::ActorTableData &GcsActor::GetActorTableData() const { - return actor_table_data_; -} - -rpc::ActorTableData *GcsActor::GetMutableActorTableData() { return &actor_table_data_; } - -void GcsActor::WriteActorExportEvent() const { - /// Verify actor export events should be written to file - /// and then write actor_table_data_ as an export event. 
- if (!export_event_write_enabled_) { - return; - } - std::shared_ptr export_actor_data_ptr = - std::make_shared(); - - export_actor_data_ptr->set_actor_id(actor_table_data_.actor_id()); - export_actor_data_ptr->set_job_id(actor_table_data_.job_id()); - export_actor_data_ptr->set_state(ConvertActorStateToExport(actor_table_data_.state())); - export_actor_data_ptr->set_is_detached(actor_table_data_.is_detached()); - export_actor_data_ptr->set_name(actor_table_data_.name()); - export_actor_data_ptr->set_pid(actor_table_data_.pid()); - export_actor_data_ptr->set_ray_namespace(actor_table_data_.ray_namespace()); - export_actor_data_ptr->set_serialized_runtime_env( - actor_table_data_.serialized_runtime_env()); - export_actor_data_ptr->set_class_name(actor_table_data_.class_name()); - export_actor_data_ptr->mutable_death_cause()->CopyFrom(actor_table_data_.death_cause()); - export_actor_data_ptr->mutable_required_resources()->insert( - actor_table_data_.required_resources().begin(), - actor_table_data_.required_resources().end()); - export_actor_data_ptr->set_node_id(actor_table_data_.node_id()); - export_actor_data_ptr->set_placement_group_id(actor_table_data_.placement_group_id()); - export_actor_data_ptr->set_repr_name(actor_table_data_.repr_name()); - export_actor_data_ptr->mutable_labels()->insert(task_spec_.get()->labels().begin(), - task_spec_.get()->labels().end()); - export_actor_data_ptr->mutable_label_selector()->insert( - actor_table_data_.label_selector().begin(), - actor_table_data_.label_selector().end()); - - RayExportEvent(export_actor_data_ptr).SendEvent(); -} - -rpc::TaskSpec *GcsActor::GetMutableTaskSpec() { return task_spec_.get(); } - -const ResourceRequest &GcsActor::GetAcquiredResources() const { - return acquired_resources_; -} -void GcsActor::SetAcquiredResources(ResourceRequest &&resource_request) { - acquired_resources_ = std::move(resource_request); -} - -bool GcsActor::GetGrantOrReject() const { return grant_or_reject_; } -void 
GcsActor::SetGrantOrReject(bool grant_or_reject) { - grant_or_reject_ = grant_or_reject; -} - const ray::rpc::ActorDeathCause GcsActorManager::GenNodeDiedCause( const ray::gcs::GcsActor *actor, std::shared_ptr node) { ray::rpc::ActorDeathCause death_cause; @@ -331,12 +218,11 @@ const ray::rpc::ActorDeathCause GcsActorManager::GenNodeDiedCause( return death_cause; } -///////////////////////////////////////////////////////////////////////////////////////// GcsActorManager::GcsActorManager( std::unique_ptr scheduler, GcsTableStorage *gcs_table_storage, instrumented_io_context &io_context, - GcsPublisher *gcs_publisher, + pubsub::GcsPublisher *gcs_publisher, RuntimeEnvManager &runtime_env_manager, GCSFunctionManager &function_manager, std::function destroy_owned_placement_group_if_needed, @@ -404,15 +290,15 @@ void GcsActorManager::HandleRegisterActor(rpc::RegisterActorRequest request, RAY_LOG(INFO).WithField(actor_id.JobId()).WithField(actor_id) << "Registering actor"; Status status = RegisterActor( - request, [reply, send_reply_callback, actor_id](const Status &status) { - if (status.ok()) { + request, [reply, send_reply_callback, actor_id](const Status &register_status) { + if (register_status.ok()) { RAY_LOG(INFO).WithField(actor_id.JobId()).WithField(actor_id) << "Registered actor"; } else { RAY_LOG(WARNING).WithField(actor_id.JobId()).WithField(actor_id) - << "Failed to register actor: " << status.ToString(); + << "Failed to register actor: " << register_status.ToString(); } - GCS_RPC_SEND_REPLY(send_reply_callback, reply, status); + GCS_RPC_SEND_REPLY(send_reply_callback, reply, register_status); }); if (!status.ok()) { RAY_LOG(WARNING).WithField(actor_id.JobId()).WithField(actor_id) @@ -498,12 +384,15 @@ void GcsActorManager::HandleRestartActorForLineageReconstruction( // should overwrite the actor state to DEAD to avoid race condition.
return; } - auto iter = actor_to_restart_for_lineage_reconstruction_callbacks_.find( - actor->GetActorID()); - RAY_CHECK(iter != actor_to_restart_for_lineage_reconstruction_callbacks_.end() && - !iter->second.empty()); - auto callbacks = std::move(iter->second); - actor_to_restart_for_lineage_reconstruction_callbacks_.erase(iter); + auto restart_callback_iter = + actor_to_restart_for_lineage_reconstruction_callbacks_.find( + actor->GetActorID()); + RAY_CHECK(restart_callback_iter != + actor_to_restart_for_lineage_reconstruction_callbacks_.end() && + !restart_callback_iter->second.empty()); + auto callbacks = std::move(restart_callback_iter->second); + actor_to_restart_for_lineage_reconstruction_callbacks_.erase( + restart_callback_iter); for (auto &callback : callbacks) { callback(actor); } @@ -640,7 +529,7 @@ void GcsActorManager::HandleGetAllActorInfo(rpc::GetAllActorInfoRequest request, RAY_CHECK(request.show_dead_jobs()); // We don't maintain an in-memory cache of all actors which belong to dead // jobs, so fetch it from redis. - Status status = gcs_table_storage_->ActorTable().GetAll( + gcs_table_storage_->ActorTable().GetAll( {[reply, send_reply_callback, limit, request = std::move(request), filter_fn]( absl::flat_hash_map &&result) { auto total_actors = result.size(); @@ -670,10 +559,6 @@ void GcsActorManager::HandleGetAllActorInfo(rpc::GetAllActorInfoRequest request, RAY_LOG(DEBUG) << "Finished getting all actor info."; }, io_context_}); - if (!status.ok()) { - // Send the response to unblock the sender and free the request. 
- GCS_RPC_SEND_REPLY(send_reply_callback, reply, status); - } } void GcsActorManager::HandleGetNamedActorInfo( @@ -798,12 +683,11 @@ Status GcsActorManager::RegisterActor(const ray::rpc::RegisterActorRequest &requ "explicitly connect to this namespace with ray.init(namespace=\"" << actor->GetRayNamespace() << "\", ...)"; - auto error_data_ptr = gcs::CreateErrorTableData( + auto error_data = CreateErrorTableData( "detached_actor_anonymous_namespace", stream.str(), absl::Now(), job_id); - RAY_LOG(WARNING) << error_data_ptr->SerializeAsString(); - RAY_CHECK_OK( - gcs_publisher_->PublishError(job_id.Hex(), *error_data_ptr, nullptr)); + RAY_LOG(WARNING) << error_data.SerializeAsString(); + gcs_publisher_->PublishError(job_id.Hex(), std::move(error_data)); } actors_in_namespace.emplace(actor->GetName(), actor->GetActorID()); } else { @@ -819,7 +703,7 @@ Status GcsActorManager::RegisterActor(const ray::rpc::RegisterActorRequest &requ function_manager_.AddJobReference(actor_id.JobId()); const auto &owner_address = actor->GetOwnerAddress(); - auto node_id = NodeID::FromBinary(owner_address.raylet_id()); + auto node_id = NodeID::FromBinary(owner_address.node_id()); auto worker_id = WorkerID::FromBinary(owner_address.worker_id()); RAY_CHECK(unresolved_actors_[node_id][worker_id].emplace(actor->GetActorID()).second); @@ -834,18 +718,18 @@ Status GcsActorManager::RegisterActor(const ray::rpc::RegisterActorRequest &requ } // The backend storage is supposed to be reliable, so the status must be ok. 
- RAY_CHECK_OK(gcs_table_storage_->ActorTaskSpecTable().Put( + gcs_table_storage_->ActorTaskSpecTable().Put( actor_id, request.task_spec(), {[this, actor](Status status) { - RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( + gcs_table_storage_->ActorTable().Put( actor->GetActorID(), *actor->GetMutableActorTableData(), - {[this, actor](Status status) { + {[this, actor](Status put_status) { RAY_CHECK(thread_checker_.IsOnSameThread()); // The backend storage is supposed to be reliable, so the status must be // ok. - RAY_CHECK_OK(status); + RAY_CHECK_OK(put_status); actor->WriteActorExportEvent(); auto registered_actor_it = registered_actors_.find(actor->GetActorID()); auto callback_iter = @@ -866,8 +750,8 @@ Status GcsActorManager::RegisterActor(const ray::rpc::RegisterActorRequest &requ return; } - RAY_CHECK_OK(gcs_publisher_->PublishActor( - actor->GetActorID(), actor->GetActorTableData(), nullptr)); + gcs_publisher_->PublishActor(actor->GetActorID(), + actor->GetActorTableData()); // Invoke all callbacks for all registration requests of this actor // (duplicated requests are included) and remove all of them from // actor_to_register_callbacks_. @@ -877,9 +761,9 @@ Status GcsActorManager::RegisterActor(const ray::rpc::RegisterActorRequest &requ } actor_to_register_callbacks_.erase(callback_iter); }, - io_context_})); + io_context_}); }, - io_context_})); + io_context_}); return Status::OK(); } @@ -948,7 +832,7 @@ Status GcsActorManager::CreateActor(const ray::rpc::CreateActorRequest &request, current_sys_time_ms()); // Pub this state for dashboard showing. 
- RAY_CHECK_OK(gcs_publisher_->PublishActor(actor_id, actor_table_data, nullptr)); + gcs_publisher_->PublishActor(actor_id, actor_table_data); actor->WriteActorExportEvent(); RemoveUnresolvedActor(actor); @@ -1034,12 +918,12 @@ void GcsActorManager::PollOwnerForActorRefDeleted( auto client = worker_client_pool_.GetOrConnect(actor->GetOwnerAddress()); it = workers.emplace(owner_id, Owner(std::move(client))).first; } - it->second.children_actor_ids.insert(actor_id); + it->second.children_actor_ids_.insert(actor_id); rpc::WaitForActorRefDeletedRequest wait_request; wait_request.set_intended_worker_id(owner_id.Binary()); wait_request.set_actor_id(actor_id.Binary()); - it->second.client->WaitForActorRefDeleted( + it->second.client_->WaitForActorRefDeleted( wait_request, [this, owner_node_id, owner_id, actor_id]( Status status, const rpc::WaitForActorRefDeletedReply &reply) { @@ -1112,7 +996,7 @@ void GcsActorManager::DestroyActor(const ActorID &actor_id, // worker exit to avoid process and resource leak. NotifyCoreWorkerToKillActor(actor, death_cause, force_kill); } - CancelActorInScheduling(actor, TaskID::ForActorCreationTask(actor_id)); + CancelActorInScheduling(actor); } } @@ -1156,7 +1040,7 @@ void GcsActorManager::DestroyActor(const ActorID &actor_id, auto actor_table_data = std::make_shared(*mutable_actor_table_data); // The backend storage is reliable in the future, so the status must be ok. 
- RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( + gcs_table_storage_->ActorTable().Put( actor->GetActorID(), *actor_table_data, {[this, @@ -1168,17 +1052,17 @@ void GcsActorManager::DestroyActor(const ActorID &actor_id, if (done_callback) { done_callback(); } - RAY_CHECK_OK(gcs_publisher_->PublishActor( - actor_id, GenActorDataOnlyWithStates(*actor_table_data), nullptr)); + gcs_publisher_->PublishActor(actor_id, + GenActorDataOnlyWithStates(*actor_table_data)); if (!is_restartable) { - RAY_CHECK_OK(gcs_table_storage_->ActorTaskSpecTable().Delete( - actor_id, {[](auto) {}, io_context_})); + gcs_table_storage_->ActorTaskSpecTable().Delete(actor_id, + {[](auto) {}, io_context_}); } actor->WriteActorExportEvent(); // Destroy placement group owned by this actor. destroy_owned_placement_group_if_needed_(actor_id); }, - io_context_})); + io_context_}); // Inform all creation callbacks that the actor was cancelled, not created. RunAndClearActorCreationCallbacks( @@ -1254,7 +1138,7 @@ void GcsActorManager::OnWorkerDead(const ray::NodeID &node_id, auto owner = it->second.find(worker_id); // Make a copy of the children actor IDs since we will delete from the // list. - const auto children_ids = owner->second.children_actor_ids; + const auto children_ids = owner->second.children_actor_ids_; for (const auto &child_id : children_ids) { DestroyActor(child_id, GenOwnerDiedCause(GetActor(child_id), @@ -1328,7 +1212,7 @@ void GcsActorManager::OnNodeDead(std::shared_ptr node, absl::flat_hash_map children_ids; // Make a copy of all the actor IDs owned by workers on the dead node. 
for (const auto &owner : it->second) { - for (const auto &child_id : owner.second.children_actor_ids) { + for (const auto &child_id : owner.second.children_actor_ids_) { children_ids.emplace(owner.first, child_id); } } @@ -1400,14 +1284,14 @@ void GcsActorManager::SetPreemptedAndPublish(const NodeID &node_id) { const auto &actor_id = id_iter.second; const auto &actor_table_data = actor_iter->second->GetActorTableData(); - RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( + gcs_table_storage_->ActorTable().Put( actor_id, actor_table_data, {[this, actor_id, actor_table_data](Status status) { - RAY_CHECK_OK(gcs_publisher_->PublishActor( - actor_id, GenActorDataOnlyWithStates(actor_table_data), nullptr)); + gcs_publisher_->PublishActor(actor_id, + GenActorDataOnlyWithStates(actor_table_data)); }, - io_context_})); + io_context_}); } } @@ -1476,21 +1360,20 @@ void GcsActorManager::RestartActor(const ActorID &actor_id, actor->UpdateAddress(rpc::Address()); mutable_actor_table_data->clear_resource_mapping(); // The backend storage is reliable in the future, so the status must be ok. 
- RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( + gcs_table_storage_->ActorTable().Put( actor_id, *mutable_actor_table_data, {[this, actor, actor_id, mutable_actor_table_data, done_callback](Status status) { if (done_callback) { done_callback(); } - RAY_CHECK_OK(gcs_publisher_->PublishActor( - actor_id, GenActorDataOnlyWithStates(*mutable_actor_table_data), nullptr)); + gcs_publisher_->PublishActor( + actor_id, GenActorDataOnlyWithStates(*mutable_actor_table_data)); actor->WriteActorExportEvent(); }, - io_context_})); + io_context_}); gcs_actor_scheduler_->Schedule(actor); } else { - RemoveActorNameFromRegistry(actor); actor->UpdateState(rpc::ActorTableData::DEAD); mutable_actor_table_data->mutable_death_cause()->CopyFrom(death_cause); auto time = current_sys_time_ms(); @@ -1498,7 +1381,7 @@ void GcsActorManager::RestartActor(const ActorID &actor_id, mutable_actor_table_data->set_timestamp(time); // The backend storage is reliable in the future, so the status must be ok. - RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( + gcs_table_storage_->ActorTable().Put( actor_id, *mutable_actor_table_data, {[this, actor, actor_id, mutable_actor_table_data, death_cause, done_callback]( @@ -1512,13 +1395,13 @@ void GcsActorManager::RestartActor(const ActorID &actor_id, if (done_callback) { done_callback(); } - RAY_CHECK_OK(gcs_publisher_->PublishActor( - actor_id, GenActorDataOnlyWithStates(*mutable_actor_table_data), nullptr)); - RAY_CHECK_OK(gcs_table_storage_->ActorTaskSpecTable().Delete( - actor_id, {[](auto) {}, io_context_})); + gcs_publisher_->PublishActor( + actor_id, GenActorDataOnlyWithStates(*mutable_actor_table_data)); + gcs_table_storage_->ActorTaskSpecTable().Delete(actor_id, + {[](auto) {}, io_context_}); actor->WriteActorExportEvent(); }, - io_context_})); + io_context_}); // The actor is dead, but we should not remove the entry from the // registered actors yet. 
If the actor is owned, we will destroy the actor // once the owner fails or notifies us that the actor has no references. @@ -1618,21 +1501,25 @@ void GcsActorManager::OnActorCreationSuccess(const std::shared_ptr &ac RAY_CHECK(!node_id.IsNil()); RAY_CHECK(created_actors_[node_id].emplace(worker_id, actor_id).second); - auto actor_table_data = *mutable_actor_table_data; + auto actor_data_only_with_states = + GenActorDataOnlyWithStates(*mutable_actor_table_data); // The backend storage is reliable in the future, so the status must be ok. - RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( + gcs_table_storage_->ActorTable().Put( actor_id, - actor_table_data, - {[this, actor_id, actor_table_data, actor, reply](Status status) { - RAY_CHECK_OK(gcs_publisher_->PublishActor( - actor_id, GenActorDataOnlyWithStates(actor_table_data), nullptr)); + *mutable_actor_table_data, + {[this, + actor_id, + actor_data_only_with_states = std::move(actor_data_only_with_states), + actor, + reply](Status status) mutable { + gcs_publisher_->PublishActor(actor_id, std::move(actor_data_only_with_states)); actor->WriteActorExportEvent(); // Invoke all callbacks for all registration requests of this actor (duplicated // requests are included) and remove all of them from // actor_to_create_callbacks_. 
RunAndClearActorCreationCallbacks(actor, reply, Status::OK()); }, - io_context_})); + io_context_}); } void GcsActorManager::SchedulePendingActors() { @@ -1669,7 +1556,7 @@ void GcsActorManager::Initialize(const GcsInitData &gcs_init_data) { if (actor_table_data.state() == ray::rpc::ActorTableData::DEPENDENCIES_UNREADY) { const auto &owner = actor->GetOwnerAddress(); - const auto &owner_node = NodeID::FromBinary(owner.raylet_id()); + const auto &owner_node = NodeID::FromBinary(owner.node_id()); const auto &owner_worker = WorkerID::FromBinary(owner.worker_id()); RAY_CHECK(unresolved_actors_[owner_node][owner_worker] .emplace(actor->GetActorID()) @@ -1698,8 +1585,8 @@ void GcsActorManager::Initialize(const GcsInitData &gcs_init_data) { } } if (!dead_actors.empty()) { - RAY_CHECK_OK(gcs_table_storage_->ActorTaskSpecTable().BatchDelete( - dead_actors, {[](auto) {}, io_context_})); + gcs_table_storage_->ActorTaskSpecTable().BatchDelete(dead_actors, + {[](auto) {}, io_context_}); } sorted_destroyed_actor_list_.sort([](const std::pair &left, const std::pair &right) { @@ -1738,7 +1625,7 @@ const absl::flat_hash_map> void GcsActorManager::RemoveUnresolvedActor(const std::shared_ptr &actor) { const auto &owner_address = actor->GetOwnerAddress(); - auto node_id = NodeID::FromBinary(owner_address.raylet_id()); + auto node_id = NodeID::FromBinary(owner_address.node_id()); auto worker_id = WorkerID::FromBinary(owner_address.worker_id()); auto iter = unresolved_actors_.find(node_id); if (iter != unresolved_actors_.end()) { @@ -1765,8 +1652,8 @@ void GcsActorManager::RemoveActorFromOwner(const std::shared_ptr &acto auto worker_it = node.find(owner_id); RAY_CHECK(worker_it != node.end()); auto &owner = worker_it->second; - RAY_CHECK(owner.children_actor_ids.erase(actor_id)); - if (owner.children_actor_ids.empty()) { + RAY_CHECK(owner.children_actor_ids_.erase(actor_id)); + if (owner.children_actor_ids_.empty()) { node.erase(worker_it); if (node.empty()) { 
owners_.erase(owner_node_id); @@ -1819,16 +1706,16 @@ void GcsActorManager::KillActor(const ActorID &actor_id, bool force_kill) { NotifyCoreWorkerToKillActor( actor, GenKilledByApplicationCause(GetActor(actor_id)), force_kill); } else { - const auto &task_id = actor->GetCreationTaskSpecification().TaskId(); - RAY_LOG(DEBUG).WithField(actor->GetActorID()).WithField(task_id) - << "The actor hasn't been created yet, cancel scheduling task"; + const auto &lease_id = actor->GetLeaseSpecification().LeaseId(); + RAY_LOG(DEBUG).WithField(actor->GetActorID()).WithField(lease_id) + << "The actor hasn't been created yet, cancel scheduling lease"; if (!worker_id.IsNil()) { // The actor is in phase of creating, so we need to notify the core // worker exit to avoid process and resource leak. NotifyCoreWorkerToKillActor( actor, GenKilledByApplicationCause(GetActor(actor_id)), force_kill); } - CancelActorInScheduling(actor, task_id); + CancelActorInScheduling(actor); RestartActor(actor_id, /*need_reschedule=*/true, GenKilledByApplicationCause(GetActor(actor_id))); @@ -1839,8 +1726,7 @@ void GcsActorManager::AddDestroyedActorToCache(const std::shared_ptr & if (destroyed_actors_.size() >= RayConfig::instance().maximum_gcs_destroyed_actor_cached_count()) { const auto &actor_id = sorted_destroyed_actor_list_.front().first; - RAY_CHECK_OK( - gcs_table_storage_->ActorTable().Delete(actor_id, {[](auto) {}, io_context_})); + gcs_table_storage_->ActorTable().Delete(actor_id, {[](auto) {}, io_context_}); destroyed_actors_.erase(actor_id); sorted_destroyed_actor_list_.pop_front(); } @@ -1852,10 +1738,10 @@ void GcsActorManager::AddDestroyedActorToCache(const std::shared_ptr & } } -void GcsActorManager::CancelActorInScheduling(const std::shared_ptr &actor, - const TaskID &task_id) { - RAY_LOG(DEBUG).WithField(actor->GetActorID()).WithField(task_id) - << "Cancel actor in scheduling"; +void GcsActorManager::CancelActorInScheduling(const std::shared_ptr &actor) { + auto lease_id = 
actor->GetLeaseSpecification().LeaseId(); + RAY_LOG(DEBUG).WithField(actor->GetActorID()).WithField(lease_id) + << "Cancel actor in scheduling, this may be due to resource re-eviction"; const auto &actor_id = actor->GetActorID(); const auto &node_id = actor->GetNodeID(); // The actor has not been created yet. It is either being scheduled or is @@ -1872,7 +1758,7 @@ void GcsActorManager::CancelActorInScheduling(const std::shared_ptr &a // it doesn't responds, and the actor should be still in leasing state. // NOTE: We will cancel outstanding lease request by calling // `raylet_client->CancelWorkerLease`. - gcs_actor_scheduler_->CancelOnLeasing(node_id, actor_id, task_id); + gcs_actor_scheduler_->CancelOnLeasing(node_id, actor_id, lease_id); // Return the actor's acquired resources (if any). gcs_actor_scheduler_->OnActorDestruction(actor); } @@ -1896,8 +1782,8 @@ bool GcsActorManager::RemovePendingActor(std::shared_ptr actor) { const auto &actor_id = actor->GetActorID(); auto pending_it = std::find_if(pending_actors_.begin(), pending_actors_.end(), - [actor_id](const std::shared_ptr &actor) { - return actor->GetActorID() == actor_id; + [actor_id](const std::shared_ptr &this_actor) { + return this_actor->GetActorID() == actor_id; }); // The actor was pending scheduling. Remove it from the queue. diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.h b/src/ray/gcs/gcs_actor_manager.h similarity index 67% rename from src/ray/gcs/gcs_server/gcs_actor_manager.h rename to src/ray/gcs/gcs_actor_manager.h index ddd788579ea7..947c52f107bd 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.h +++ b/src/ray/gcs/gcs_actor_manager.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once + #include #include @@ -22,269 +23,27 @@ #include #include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "ray/common/asio/instrumented_io_context.h" #include "ray/common/id.h" #include "ray/common/runtime_env_manager.h" -#include "ray/common/task/task_spec.h" -#include "ray/gcs/gcs_server/gcs_actor_scheduler.h" -#include "ray/gcs/gcs_server/gcs_function_manager.h" -#include "ray/gcs/gcs_server/gcs_init_data.h" -#include "ray/gcs/gcs_server/gcs_table_storage.h" -#include "ray/gcs/gcs_server/usage_stats_client.h" -#include "ray/gcs/pubsub/gcs_pub_sub.h" -#include "ray/rpc/gcs/gcs_rpc_server.h" +#include "ray/gcs/gcs_actor.h" +#include "ray/gcs/gcs_actor_scheduler.h" +#include "ray/gcs/gcs_function_manager.h" +#include "ray/gcs/gcs_init_data.h" +#include "ray/gcs/gcs_table_storage.h" +#include "ray/gcs/grpc_service_interfaces.h" +#include "ray/gcs/usage_stats_client.h" +#include "ray/pubsub/gcs_publisher.h" #include "ray/rpc/worker/core_worker_client.h" +#include "ray/rpc/worker/core_worker_client_pool.h" #include "ray/util/counter_map.h" -#include "ray/util/event.h" #include "ray/util/thread_checker.h" #include "src/ray/protobuf/gcs_service.pb.h" namespace ray { namespace gcs { -/// GcsActor just wraps `ActorTableData` and provides some convenient interfaces to access -/// the fields inside `ActorTableData`. -/// This class is not thread-safe. -class GcsActor { - public: - /// Create a GcsActor by actor_table_data. - /// - /// \param actor_table_data Data of the actor (see gcs.proto). - /// \param counter The counter to report metrics to. - explicit GcsActor( - rpc::ActorTableData actor_table_data, - std::shared_ptr>> - counter) - : actor_table_data_(std::move(actor_table_data)), - counter_(std::move(counter)), - export_event_write_enabled_(IsExportAPIEnabledActor()) { - RefreshMetrics(); - } - - /// Create a GcsActor by actor_table_data and task_spec. - /// This is only for ALIVE actors. 
- /// - /// \param actor_table_data Data of the actor (see gcs.proto). - /// \param task_spec Task spec of the actor. - /// \param counter The counter to report metrics to. - explicit GcsActor( - rpc::ActorTableData actor_table_data, - rpc::TaskSpec task_spec, - std::shared_ptr>> - counter) - : actor_table_data_(std::move(actor_table_data)), - task_spec_(std::make_unique(std::move(task_spec))), - counter_(std::move(counter)), - export_event_write_enabled_(IsExportAPIEnabledActor()) { - RAY_CHECK(actor_table_data_.state() != rpc::ActorTableData::DEAD); - RefreshMetrics(); - } - - /// Create a GcsActor by TaskSpec. - /// - /// \param task_spec Contains the actor creation task specification. - /// \param ray_namespace Namespace of the actor. - /// \param counter The counter to report metrics to. - explicit GcsActor( - rpc::TaskSpec task_spec, - std::string ray_namespace, - std::shared_ptr>> - counter) - : task_spec_(std::make_unique(std::move(task_spec))), - counter_(std::move(counter)), - export_event_write_enabled_(IsExportAPIEnabledActor()) { - RAY_CHECK(task_spec_->type() == TaskType::ACTOR_CREATION_TASK); - const auto &actor_creation_task_spec = task_spec_->actor_creation_task_spec(); - actor_table_data_.set_actor_id(actor_creation_task_spec.actor_id()); - actor_table_data_.set_job_id(task_spec_->job_id()); - actor_table_data_.set_max_restarts(actor_creation_task_spec.max_actor_restarts()); - actor_table_data_.set_num_restarts(0); - actor_table_data_.set_num_restarts_due_to_lineage_reconstruction(0); - - actor_table_data_.mutable_function_descriptor()->CopyFrom( - task_spec_->function_descriptor()); - - actor_table_data_.set_is_detached(actor_creation_task_spec.is_detached()); - actor_table_data_.set_name(actor_creation_task_spec.name()); - actor_table_data_.mutable_owner_address()->CopyFrom(task_spec_->caller_address()); - - actor_table_data_.set_state(rpc::ActorTableData::DEPENDENCIES_UNREADY); - - 
actor_table_data_.mutable_address()->set_raylet_id(NodeID::Nil().Binary()); - actor_table_data_.mutable_address()->set_worker_id(WorkerID::Nil().Binary()); - - actor_table_data_.set_ray_namespace(ray_namespace); - if (task_spec_->scheduling_strategy().scheduling_strategy_case() == - rpc::SchedulingStrategy::SchedulingStrategyCase:: - kPlacementGroupSchedulingStrategy) { - actor_table_data_.set_placement_group_id(task_spec_->scheduling_strategy() - .placement_group_scheduling_strategy() - .placement_group_id()); - } - - // Set required resources. - auto resource_map = - GetCreationTaskSpecification().GetRequiredResources().GetResourceMap(); - actor_table_data_.mutable_required_resources()->insert(resource_map.begin(), - resource_map.end()); - - const auto &function_descriptor = task_spec_->function_descriptor(); - switch (function_descriptor.function_descriptor_case()) { - case rpc::FunctionDescriptor::FunctionDescriptorCase::kJavaFunctionDescriptor: - actor_table_data_.set_class_name( - function_descriptor.java_function_descriptor().class_name()); - break; - case rpc::FunctionDescriptor::FunctionDescriptorCase::kPythonFunctionDescriptor: - actor_table_data_.set_class_name( - function_descriptor.python_function_descriptor().class_name()); - break; - default: - // TODO(Alex): Handle the C++ case, which we currently don't have an - // easy equivalent to class_name for. - break; - } - - actor_table_data_.set_serialized_runtime_env( - task_spec_->runtime_env_info().serialized_runtime_env()); - if (task_spec_->call_site().size() > 0) { - actor_table_data_.set_call_site(task_spec_->call_site()); - } - if (task_spec_->label_selector().size() > 0) { - actor_table_data_.mutable_label_selector()->insert( - task_spec_->label_selector().begin(), task_spec_->label_selector().end()); - } - RefreshMetrics(); - } - - ~GcsActor() { - // We don't decrement the value when it becomes DEAD because we don't want to - // lose the # of dead actors count when this class is GC'ed. 
- if (last_metric_state_ && last_metric_state_.value() != rpc::ActorTableData::DEAD) { - RAY_LOG(DEBUG) << "Decrementing state at " - << rpc::ActorTableData::ActorState_Name(last_metric_state_.value()) - << " " << GetActorTableData().class_name(); - counter_->Decrement( - std::make_pair(last_metric_state_.value(), GetActorTableData().class_name())); - } - } - - /// Get the node id on which this actor is created. - NodeID GetNodeID() const; - /// Get the id of the worker on which this actor is created. - WorkerID GetWorkerID() const; - /// Get the actor's owner ID. - WorkerID GetOwnerID() const; - /// Get the node ID of the actor's owner. - NodeID GetOwnerNodeID() const; - /// Get the address of the actor's owner. - const rpc::Address &GetOwnerAddress() const; - - /// Update the `Address` of this actor (see gcs.proto). - void UpdateAddress(const rpc::Address &address); - /// Get the `Address` of this actor. - const rpc::Address &GetAddress() const; - - /// Update the state of this actor and refreshes metrics. Do not update the - /// state of the underlying proto directly via set_state(), otherwise metrics - /// will get out of sync. - void UpdateState(rpc::ActorTableData::ActorState state); - /// Get the state of this gcs actor. - rpc::ActorTableData::ActorState GetState() const; - - /// Get the id of this actor. - ActorID GetActorID() const; - /// Returns whether or not this is a detached actor. - bool IsDetached() const; - /// Get the name of this actor. - std::string GetName() const; - /// Get the namespace of this actor. - std::string GetRayNamespace() const; - /// Get the task specification of this actor. - TaskSpecification GetCreationTaskSpecification() const; - - /// Get the immutable ActorTableData of this actor. - const rpc::ActorTableData &GetActorTableData() const; - /// Get the mutable ActorTableData of this actor. 
- rpc::ActorTableData *GetMutableActorTableData(); - rpc::TaskSpec *GetMutableTaskSpec(); - /// Write an event containing this actor's ActorTableData - /// to file for the Export API. - void WriteActorExportEvent() const; - // Verify if export events should be written for EXPORT_ACTOR source types - bool IsExportAPIEnabledActor() const { - return IsExportAPIEnabledSourceType( - "EXPORT_ACTOR", - RayConfig::instance().enable_export_api_write(), - RayConfig::instance().enable_export_api_write_config()); - } - - const ResourceRequest &GetAcquiredResources() const; - void SetAcquiredResources(ResourceRequest &&resource_request); - bool GetGrantOrReject() const; - void SetGrantOrReject(bool grant_or_reject); - - private: - void RefreshMetrics() { - auto cur_state = GetState(); - if (last_metric_state_) { - RAY_LOG(DEBUG) << "Swapping state from " - << rpc::ActorTableData::ActorState_Name(last_metric_state_.value()) - << " to " << rpc::ActorTableData::ActorState_Name(cur_state) - << " for : " << GetActorID(); - counter_->Swap( - std::make_pair(last_metric_state_.value(), GetActorTableData().class_name()), - std::make_pair(cur_state, GetActorTableData().class_name())); - } else { - RAY_LOG(DEBUG) << "Incrementing state at " - << rpc::ActorTableData::ActorState_Name(cur_state) << " " - << GetActorTableData().class_name(); - counter_->Increment(std::make_pair(cur_state, GetActorTableData().class_name())); - } - last_metric_state_ = cur_state; - } - - rpc::ExportActorData::ActorState ConvertActorStateToExport( - rpc::ActorTableData::ActorState actor_state) const { - switch (actor_state) { - case rpc::ActorTableData::DEPENDENCIES_UNREADY: - return rpc::ExportActorData::DEPENDENCIES_UNREADY; - case rpc::ActorTableData::PENDING_CREATION: - return rpc::ExportActorData::PENDING_CREATION; - case rpc::ActorTableData::ALIVE: - return rpc::ExportActorData::ALIVE; - case rpc::ActorTableData::RESTARTING: - return rpc::ExportActorData::RESTARTING; - case rpc::ActorTableData::DEAD: - 
return rpc::ExportActorData::DEAD; - default: - // Unknown rpc::ActorTableData::ActorState value - RAY_LOG(FATAL) << "Invalid value for rpc::ActorTableData::ActorState" - << rpc::ActorTableData::ActorState_Name(actor_state); - return rpc::ExportActorData::DEAD; - } - } - - /// The actor meta data which contains the task specification as well as the state of - /// the gcs actor and so on (see gcs.proto). - rpc::ActorTableData actor_table_data_; - const std::unique_ptr task_spec_; - /// Resources acquired by this actor. - ResourceRequest acquired_resources_; - /// Reference to the counter to use for actor state metrics tracking. - std::shared_ptr>> - counter_; - /// Whether the actor's target node only grants or rejects the lease request. - bool grant_or_reject_ = false; - /// The last recorded metric state. - std::optional last_metric_state_; - /// If true, actor events are exported for Export API - bool export_event_write_enabled_ = false; -}; - -using RestartActorForLineageReconstructionCallback = - std::function)>; -using CreateActorCallback = std::function, const rpc::PushTaskReply &reply, const Status &status)>; - /// GcsActorManager is responsible for managing the lifecycle of all actors. /// This class is not thread-safe. /// Actor State Transition Diagram: @@ -330,7 +89,7 @@ using CreateActorCallback = std::function scheduler, GcsTableStorage *gcs_table_storage, instrumented_io_context &io_context, - GcsPublisher *gcs_publisher, + pubsub::GcsPublisher *gcs_publisher, RuntimeEnvManager &runtime_env_manager, GCSFunctionManager &function_manager, std::function destroy_owned_placement_group_if_needed, @@ -525,11 +284,11 @@ class GcsActorManager : public rpc::ActorInfoHandler { /// A data structure representing an actor's owner. struct Owner { explicit Owner(std::shared_ptr client) - : client(std::move(client)) {} + : client_(std::move(client)) {} /// A client that can be used to contact the owner. 
- std::shared_ptr client; + std::shared_ptr client_; /// The IDs of actors owned by this worker. - absl::flat_hash_set children_actor_ids; + absl::flat_hash_set children_actor_ids_; }; /// Poll an actor's owner so that we will receive a notification when the @@ -633,9 +392,8 @@ class GcsActorManager : public rpc::ActorInfoHandler { /// Cancel actor which is either being scheduled or is pending scheduling. /// /// \param actor The actor to be cancelled. - /// \param task_id The id of actor creation task to be cancelled. - void CancelActorInScheduling(const std::shared_ptr &actor, - const TaskID &task_id); + /// \param lease_id The lease id of actor creation task to be cancelled. + void CancelActorInScheduling(const std::shared_ptr &actor); /// Get the alive or dead actor of the actor id. /// NOTE: The return value is not meant to be passed to other scope. @@ -716,7 +474,7 @@ class GcsActorManager : public rpc::ActorInfoHandler { GcsTableStorage *gcs_table_storage_; instrumented_io_context &io_context_; /// A publisher for publishing gcs messages. - GcsPublisher *gcs_publisher_; + pubsub::GcsPublisher *gcs_publisher_; /// This is used to communicate with actors and their owners. rpc::CoreWorkerClientPool &worker_client_pool_; /// A callback that is used to destroy placemenet group owned by the actor. diff --git a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc b/src/ray/gcs/gcs_actor_scheduler.cc similarity index 88% rename from src/ray/gcs/gcs_server/gcs_actor_scheduler.cc rename to src/ray/gcs/gcs_actor_scheduler.cc index 5eb2948d11f1..79bfd2930770 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc +++ b/src/ray/gcs/gcs_actor_scheduler.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/gcs_server/gcs_actor_scheduler.h" +#include "ray/gcs/gcs_actor_scheduler.h" #include #include @@ -20,10 +20,8 @@ #include #include "ray/common/asio/asio_util.h" -#include "ray/common/asio/instrumented_io_context.h" #include "ray/common/ray_config.h" -#include "ray/gcs/gcs_server/gcs_actor_manager.h" -#include "src/ray/protobuf/node_manager.pb.h" +#include "ray/util/time.h" namespace ray { namespace gcs { @@ -32,7 +30,7 @@ GcsActorScheduler::GcsActorScheduler( instrumented_io_context &io_context, GcsActorTable &gcs_actor_table, const GcsNodeManager &gcs_node_manager, - ClusterTaskManager &cluster_task_manager, + ClusterLeaseManager &cluster_lease_manager, GcsActorSchedulerFailureCallback schedule_failure_handler, GcsActorSchedulerSuccessCallback schedule_success_handler, rpc::RayletClientPool &raylet_client_pool, @@ -42,7 +40,7 @@ GcsActorScheduler::GcsActorScheduler( : io_context_(io_context), gcs_actor_table_(gcs_actor_table), gcs_node_manager_(gcs_node_manager), - cluster_task_manager_(cluster_task_manager), + cluster_lease_manager_(cluster_lease_manager), schedule_failure_handler_(std::move(schedule_failure_handler)), schedule_success_handler_(std::move(schedule_success_handler)), raylet_client_pool_(raylet_client_pool), @@ -56,7 +54,7 @@ void GcsActorScheduler::Schedule(std::shared_ptr actor) { RAY_CHECK(actor->GetNodeID().IsNil() && actor->GetWorkerID().IsNil()); if (RayConfig::instance().gcs_actor_scheduling_enabled() && - !actor->GetCreationTaskSpecification().GetRequiredResources().IsEmpty()) { + !actor->GetLeaseSpecification().GetRequiredResources().IsEmpty()) { ScheduleByGcs(actor); } else { ScheduleByRaylet(actor); @@ -77,14 +75,14 @@ void GcsActorScheduler::ScheduleByGcs(std::shared_ptr actor) { return; } const auto &retry_at_raylet_address = reply->retry_at_raylet_address(); - RAY_CHECK(!retry_at_raylet_address.raylet_id().empty()); - auto node_id = NodeID::FromBinary(retry_at_raylet_address.raylet_id()); + 
RAY_CHECK(!retry_at_raylet_address.node_id().empty()); + auto node_id = NodeID::FromBinary(retry_at_raylet_address.node_id()); auto node = gcs_node_manager_.GetAliveNode(node_id); RAY_CHECK(node.has_value()); // Update the address of the actor as it is tied to a node. rpc::Address address; - address.set_raylet_id(node.value()->node_id()); + address.set_node_id(node.value()->node_id()); actor->UpdateAddress(address); RAY_CHECK(node_to_actors_when_leasing_[actor->GetNodeID()] @@ -92,8 +90,7 @@ void GcsActorScheduler::ScheduleByGcs(std::shared_ptr actor) { .second); actor->SetAcquiredResources(ResourceMapToResourceRequest( - actor->GetCreationTaskSpecification().GetRequiredResources().GetResourceMap(), - false)); + actor->GetLeaseSpecification().GetRequiredResources().GetResourceMap(), false)); // Lease worker directly from the node. actor->SetGrantOrReject(true); LeaseWorkerFromNode(actor, node.value()); @@ -101,13 +98,14 @@ void GcsActorScheduler::ScheduleByGcs(std::shared_ptr actor) { // Queue and schedule the actor locally (gcs). const auto &owner_node = gcs_node_manager_.GetAliveNode(actor->GetOwnerNodeID()); - RayTask task(actor->GetCreationTaskSpecification(), - owner_node.has_value() ? actor->GetOwnerNodeID().Binary() : std::string()); - cluster_task_manager_.QueueAndScheduleTask(std::move(task), - /*grant_or_reject*/ false, - /*is_selected_based_on_locality*/ false, - /*reply*/ reply.get(), - send_reply_callback); + RayLease lease( + actor->GetLeaseSpecification(), + owner_node.has_value() ? actor->GetOwnerNodeID().Binary() : std::string()); + cluster_lease_manager_.QueueAndScheduleLease(std::move(lease), + /*grant_or_reject=*/false, + /*is_selected_based_on_locality=*/false, + /*reply=*/reply.get(), + send_reply_callback); } void GcsActorScheduler::ScheduleByRaylet(std::shared_ptr actor) { @@ -126,7 +124,7 @@ void GcsActorScheduler::ScheduleByRaylet(std::shared_ptr actor) { // Update the address of the actor as it is tied to a node. 
rpc::Address address; - address.set_raylet_id(node.value()->node_id()); + address.set_node_id(node.value()->node_id()); actor->UpdateAddress(address); RAY_CHECK(node_to_actors_when_leasing_[actor->GetNodeID()] @@ -144,8 +142,8 @@ NodeID GcsActorScheduler::SelectForwardingNode(std::shared_ptr actor) // If an actor has resource requirements, we will try to schedule it on the same node as // the owner if possible. - const auto &task_spec = actor->GetCreationTaskSpecification(); - if (!task_spec.GetRequiredResources().IsEmpty()) { + const auto &lease_spec = actor->GetLeaseSpecification(); + if (!lease_spec.GetRequiredResources().IsEmpty()) { auto maybe_node = gcs_node_manager_.GetAliveNode(actor->GetOwnerNodeID()); node = maybe_node.has_value() ? maybe_node.value() : SelectNodeRandomly(); } else { @@ -229,10 +227,10 @@ std::vector GcsActorScheduler::CancelOnNode(const NodeID &node_id) { void GcsActorScheduler::CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id) { + const LeaseID &lease_id) { // NOTE: This method will cancel the outstanding lease request and remove leasing // information from the internal state. 
- RAY_LOG(DEBUG) << "Canceling worker leasing of task " << task_id; + RAY_LOG(DEBUG) << "Canceling worker lease request " << lease_id; auto node_it = node_to_actors_when_leasing_.find(node_id); RAY_CHECK(node_it != node_to_actors_when_leasing_.end()); node_it->second.erase(actor_id); @@ -245,12 +243,12 @@ void GcsActorScheduler::CancelOnLeasing(const NodeID &node_id, if (iter != alive_nodes.end()) { const auto &node_info = iter->second; rpc::Address address; - address.set_raylet_id(node_info->node_id()); + address.set_node_id(node_info->node_id()); address.set_ip_address(node_info->node_manager_address()); address.set_port(node_info->node_manager_port()); auto raylet_client = GetOrConnectRayletClient(address); raylet_client->CancelWorkerLease( - task_id, [](const Status &status, const rpc::CancelWorkerLeaseReply &reply) {}); + lease_id, [](const Status &status, const rpc::CancelWorkerLeaseReply &reply) {}); } } @@ -287,7 +285,7 @@ void GcsActorScheduler::ReleaseUnusedActorWorkers( nodes_of_releasing_unused_workers_.insert(node_id); rpc::Address address; - address.set_raylet_id(alive_node.second->node_id()); + address.set_node_id(alive_node.second->node_id()); address.set_ip_address(alive_node.second->node_manager_address()); address.set_port(alive_node.second->node_manager_port()); auto raylet_client = GetOrConnectRayletClient(address); @@ -323,14 +321,18 @@ void GcsActorScheduler::LeaseWorkerFromNode(std::shared_ptr actor, } rpc::Address remote_address; - remote_address.set_raylet_id(node->node_id()); + remote_address.set_node_id(node->node_id()); remote_address.set_ip_address(node->node_manager_address()); remote_address.set_port(node->node_manager_port()); auto raylet_client = GetOrConnectRayletClient(remote_address); // Actor leases should be sent to the raylet immediately, so we should never build up a // backlog in GCS. + // Counter for generating unique lease IDs. 
+ static uint32_t lease_id_counter = 0; + actor->GetMutableLeaseSpec()->set_lease_id( + LeaseID::FromWorker(WorkerID::FromRandom(), lease_id_counter++).Binary()); raylet_client->RequestWorkerLease( - actor->GetCreationTaskSpecification().GetMessage(), + actor->GetLeaseSpecification().GetMessage(), actor->GetGrantOrReject(), [this, actor, node](const Status &status, const rpc::RequestWorkerLeaseReply &reply) { @@ -369,11 +371,11 @@ void GcsActorScheduler::HandleWorkerLeaseGrantedReply( std::shared_ptr actor, const ray::rpc::RequestWorkerLeaseReply &reply) { const auto &retry_at_raylet_address = reply.retry_at_raylet_address(); const auto &worker_address = reply.worker_address(); - if (worker_address.raylet_id().empty()) { + if (worker_address.node_id().empty()) { // The worker did not succeed in the lease, but the specified node returned a new // node, and then try again on the new node. - RAY_CHECK(!retry_at_raylet_address.raylet_id().empty()); - auto spill_back_node_id = NodeID::FromBinary(retry_at_raylet_address.raylet_id()); + RAY_CHECK(!retry_at_raylet_address.node_id().empty()); + auto spill_back_node_id = NodeID::FromBinary(retry_at_raylet_address.node_id()); auto maybe_spill_back_node = gcs_node_manager_.GetAliveNode(spill_back_node_id); if (maybe_spill_back_node.has_value()) { auto spill_back_node = maybe_spill_back_node.value(); @@ -414,18 +416,17 @@ void GcsActorScheduler::HandleWorkerLeaseGrantedReply( // Without this, there could be a possible race condition. Related issues: // https://github.com/ray-project/ray/pull/9215/files#r449469320 worker_client_pool_.GetOrConnect(leased_worker->GetAddress()); - RAY_CHECK_OK(gcs_actor_table_.Put(actor->GetActorID(), - actor->GetActorTableData(), - {[this, actor, leased_worker](Status status) { - RAY_CHECK_OK(status); - if (actor->GetState() == - rpc::ActorTableData::DEAD) { - // Actor has already been killed. 
- return; - } - CreateActorOnWorker(actor, leased_worker); - }, - io_context_})); + gcs_actor_table_.Put(actor->GetActorID(), + actor->GetActorTableData(), + {[this, actor, leased_worker](Status status) { + RAY_CHECK_OK(status); + if (actor->GetState() == rpc::ActorTableData::DEAD) { + // Actor has already been killed. + return; + } + CreateActorOnWorker(actor, leased_worker); + }, + io_context_}); } } @@ -434,13 +435,13 @@ void GcsActorScheduler::HandleRequestWorkerLeaseCanceled( const NodeID &node_id, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, const std::string &scheduling_failure_message) { - RAY_LOG(INFO) - << "The lease worker request from node " << node_id << " for actor " - << actor->GetActorID() << "(" - << actor->GetCreationTaskSpecification().FunctionDescriptor()->CallString() << ")" - << " has been canceled, job id = " << actor->GetActorID().JobId() - << ", cancel type: " - << rpc::RequestWorkerLeaseReply::SchedulingFailureType_Name(failure_type); + RAY_LOG(INFO) << "The lease worker request from node " << node_id << " for actor " + << actor->GetActorID() << "(" + << actor->GetLeaseSpecification().FunctionDescriptor()->CallString() + << ")" + << " has been canceled, job id = " << actor->GetActorID().JobId() + << ", cancel type: " + << rpc::RequestWorkerLeaseReply::SchedulingFailureType_Name(failure_type); schedule_failure_handler_(actor, failure_type, scheduling_failure_message); } @@ -537,7 +538,7 @@ std::shared_ptr GcsActorScheduler::GetOrConnectRayletClie bool GcsActorScheduler::KillActorOnWorker(const rpc::Address &worker_address, ActorID actor_id) { - if (worker_address.raylet_id().empty()) { + if (worker_address.node_id().empty()) { RAY_LOG(DEBUG) << "Invalid worker address, skip the killing of actor " << actor_id; return false; } @@ -607,8 +608,8 @@ void GcsActorScheduler::HandleWorkerLeaseReply( return; } - if (reply.worker_address().raylet_id().empty() && - reply.retry_at_raylet_address().raylet_id().empty() && 
!reply.rejected()) { + if (reply.worker_address().node_id().empty() && + reply.retry_at_raylet_address().node_id().empty() && !reply.rejected()) { // Actor creation task has been cancelled. It is triggered by `ray.kill`. If // the number of remaining restarts of the actor is not equal to 0, GCS will // reschedule the actor, so it return directly here. @@ -665,13 +666,13 @@ void GcsActorScheduler::HandleWorkerLeaseRejectedReply( void GcsActorScheduler::OnActorDestruction(std::shared_ptr actor) { if (!actor->GetAcquiredResources().IsEmpty()) { ReturnActorAcquiredResources(actor); - cluster_task_manager_.ScheduleAndDispatchTasks(); + cluster_lease_manager_.ScheduleAndGrantLeases(); } } void GcsActorScheduler::ReturnActorAcquiredResources(std::shared_ptr actor) { auto &cluster_resource_manager = - cluster_task_manager_.GetClusterResourceScheduler().GetClusterResourceManager(); + cluster_lease_manager_.GetClusterResourceScheduler().GetClusterResourceManager(); cluster_resource_manager.AddNodeAvailableResources( scheduling::NodeID(actor->GetNodeID().Binary()), actor->GetAcquiredResources().GetResourceSet()); @@ -679,13 +680,13 @@ void GcsActorScheduler::ReturnActorAcquiredResources(std::shared_ptr a } size_t GcsActorScheduler::GetPendingActorsCount() const { - return cluster_task_manager_.GetInfeasibleQueueSize() + - cluster_task_manager_.GetPendingQueueSize(); + return cluster_lease_manager_.GetInfeasibleQueueSize() + + cluster_lease_manager_.GetPendingQueueSize(); } bool GcsActorScheduler::CancelInFlightActorScheduling( const std::shared_ptr &actor) { - return cluster_task_manager_.CancelTask(actor->GetCreationTaskSpecification().TaskId()); + return cluster_lease_manager_.CancelLease(actor->GetLeaseSpecification().LeaseId()); } } // namespace gcs diff --git a/src/ray/gcs/gcs_server/gcs_actor_scheduler.h b/src/ray/gcs/gcs_actor_scheduler.h similarity index 95% rename from src/ray/gcs/gcs_server/gcs_actor_scheduler.h rename to src/ray/gcs/gcs_actor_scheduler.h index 
1cd9a0907e05..432b93cf4c9a 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_scheduler.h +++ b/src/ray/gcs/gcs_actor_scheduler.h @@ -16,7 +16,6 @@ #include #include -#include #include #include #include @@ -25,24 +24,18 @@ #include "absl/container/flat_hash_set.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/id.h" -#include "ray/common/scheduling/scheduling_ids.h" -#include "ray/common/task/task_spec.h" -#include "ray/gcs/gcs_server/gcs_node_manager.h" -#include "ray/gcs/gcs_server/gcs_table_storage.h" -#include "ray/raylet/scheduling/cluster_task_manager.h" -#include "ray/raylet_client/raylet_client.h" -#include "ray/rpc/node_manager/node_manager_client.h" -#include "ray/rpc/node_manager/raylet_client_pool.h" -#include "ray/rpc/worker/core_worker_client.h" +#include "ray/gcs/gcs_actor.h" +#include "ray/gcs/gcs_node_manager.h" +#include "ray/gcs/gcs_table_storage.h" +#include "ray/raylet/scheduling/cluster_lease_manager.h" +#include "ray/rpc/raylet/raylet_client_interface.h" +#include "ray/rpc/raylet/raylet_client_pool.h" #include "ray/rpc/worker/core_worker_client_pool.h" -#include "src/ray/protobuf/gcs_service.pb.h" namespace ray { -using raylet::ClusterTaskManager; +using raylet::ClusterLeaseManager; namespace gcs { -class GcsActor; - using GcsActorSchedulerFailureCallback = std::function, rpc::RequestWorkerLeaseReply::SchedulingFailureType, @@ -74,7 +67,7 @@ class GcsActorSchedulerInterface { /// \param actor_id ID of an actor. virtual void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id) = 0; + const LeaseID &lease_id) = 0; /// Cancel the actor that is being scheduled to the specified worker. /// @@ -119,7 +112,7 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { /// \param io_context The main event loop. /// \param gcs_actor_table Used to flush actor info to storage. /// \param gcs_node_manager The node manager which is used when scheduling. 
- /// \param cluster_task_manager The task manager that queues and schedules actor. + /// \param cluster_lease_manager The task manager that queues and schedules actor. /// creation tasks. /// \param schedule_failure_handler Invoked when there are no available /// nodes to schedule actors. @@ -132,7 +125,7 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { instrumented_io_context &io_context, GcsActorTable &gcs_actor_table, const GcsNodeManager &gcs_node_manager, - ClusterTaskManager &cluster_task_manager_, + ClusterLeaseManager &cluster_lease_manager_, GcsActorSchedulerFailureCallback schedule_failure_handler, GcsActorSchedulerSuccessCallback schedule_success_handler, rpc::RayletClientPool &raylet_client_pool, @@ -144,7 +137,7 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { /// Schedule the specified actor. /// If there is no available nodes then the actor would be queued in the - /// `cluster_task_manager_`. + /// `cluster_lease_manager_`. /// /// \param actor to be scheduled. void Schedule(std::shared_ptr actor) override; @@ -170,7 +163,7 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { /// \param actor_id ID of an actor. void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id) override; + const LeaseID &lease_id) override; /// Cancel the actor that is being scheduled to the specified worker. /// @@ -236,7 +229,7 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { WorkerID GetWorkerID() const { return WorkerID::FromBinary(address_.worker_id()); } /// Get the NodeID of this leased worker. - NodeID GetNodeID() const { return NodeID::FromBinary(address_.raylet_id()); } + NodeID GetNodeID() const { return NodeID::FromBinary(address_.node_id()); } /// Get the id of the actor which is assigned to this leased worker. 
ActorID GetAssignedActorID() const { return assigned_actor_id_; } @@ -380,8 +373,8 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { node_to_workers_when_creating_; /// Reference of GcsNodeManager. const GcsNodeManager &gcs_node_manager_; - /// The cluster task manager. - ClusterTaskManager &cluster_task_manager_; + /// The cluster lease manager. + ClusterLeaseManager &cluster_lease_manager_; /// The handler to handle the scheduling failures. GcsActorSchedulerFailureCallback schedule_failure_handler_; /// The handler to handle the successful scheduling. diff --git a/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc b/src/ray/gcs/gcs_autoscaler_state_manager.cc similarity index 97% rename from src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc rename to src/ray/gcs/gcs_autoscaler_state_manager.cc index e3873fe82ff9..6d0841745282 100644 --- a/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc +++ b/src/ray/gcs/gcs_autoscaler_state_manager.cc @@ -12,19 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/gcs_server/gcs_autoscaler_state_manager.h" +#include "ray/gcs/gcs_autoscaler_state_manager.h" #include #include #include #include -#include "ray/gcs/gcs_server/gcs_actor_manager.h" -#include "ray/gcs/gcs_server/gcs_node_manager.h" -#include "ray/gcs/gcs_server/gcs_placement_group_mgr.h" -#include "ray/gcs/gcs_server/state_util.h" -#include "ray/gcs/pb_util.h" +#include "ray/common/protobuf_utils.h" #include "ray/util/string_utils.h" +#include "ray/util/time.h" namespace ray { namespace gcs { @@ -37,7 +34,7 @@ GcsAutoscalerStateManager::GcsAutoscalerStateManager( rpc::RayletClientPool &raylet_client_pool, InternalKVInterface &kv, instrumented_io_context &io_context, - GcsPublisher *gcs_publisher) + pubsub::GcsPublisher *gcs_publisher) : session_name_(std::move(session_name)), gcs_node_manager_(gcs_node_manager), gcs_actor_manager_(gcs_actor_manager), @@ -93,10 +90,9 @@ void GcsAutoscalerStateManager::HandleReportAutoscalingState( if (gcs_publisher_ != nullptr) { std::string error_type = "infeasible_resource_requests"; - auto error_data_ptr = gcs::CreateErrorTableData( + auto error_data = CreateErrorTableData( error_type, error_message, absl::FromUnixMillis(current_time_ms())); - RAY_CHECK_OK( - gcs_publisher_->PublishError(session_name_, *error_data_ptr, nullptr)); + gcs_publisher_->PublishError(session_name_, std::move(error_data)); } } }; @@ -413,6 +409,10 @@ void GcsAutoscalerStateManager::GetNodeStates( node_state_proto->mutable_total_resources()->insert(total.begin(), total.end()); // Add dynamic PG labels. + // DEPRECATED: Dynamic labels feature is deprecated. Do not introduce new usages. + // This assignment is kept only for backward compatibility in the autoscaler, where + // the placement group ID is needed to enforce antiaffinity constraints for + // strict-spread placement group scheduling. 
const auto &pgs_on_node = gcs_placement_group_manager_.GetBundlesOnNode(node_id); for (const auto &[pg_id, _bundle_indices] : pgs_on_node) { node_state_proto->mutable_dynamic_labels()->insert( @@ -617,10 +617,10 @@ void GcsAutoscalerStateManager::CancelInfeasibleRequests() const { RAY_LOG(WARNING) << "Canceling infeasible requests on node " << node_id << " with infeasible_shapes=" << resource_shapes_str; - raylet_client->CancelTasksWithResourceShapes( + raylet_client->CancelLeasesWithResourceShapes( infeasible_shapes, [node_id](const Status &status, - const rpc::CancelTasksWithResourceShapesReply &) { + const rpc::CancelLeasesWithResourceShapesReply &) { if (status.ok()) { RAY_LOG(INFO) << "Infeasible tasks cancelled on node " << node_id; } else { diff --git a/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.h b/src/ray/gcs/gcs_autoscaler_state_manager.h similarity index 93% rename from src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.h rename to src/ray/gcs/gcs_autoscaler_state_manager.h index e37d582fb630..2a459a7e51fe 100644 --- a/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.h +++ b/src/ray/gcs/gcs_autoscaler_state_manager.h @@ -14,29 +14,31 @@ #pragma once +#include + #include #include #include #include -#include "ray/gcs/gcs_server/gcs_init_data.h" -#include "ray/gcs/gcs_server/gcs_kv_manager.h" -#include "ray/gcs/gcs_server/state_util.h" -#include "ray/gcs/pubsub/gcs_pub_sub.h" -#include "ray/rpc/gcs/gcs_rpc_server.h" -#include "ray/rpc/node_manager/raylet_client_pool.h" +#include "absl/container/flat_hash_map.h" +#include "ray/common/asio/instrumented_io_context.h" +#include "ray/gcs/gcs_actor_manager.h" +#include "ray/gcs/gcs_init_data.h" +#include "ray/gcs/gcs_kv_manager.h" +#include "ray/gcs/gcs_node_manager.h" +#include "ray/gcs/gcs_placement_group_manager.h" +#include "ray/gcs/grpc_service_interfaces.h" +#include "ray/gcs/state_util.h" +#include "ray/pubsub/gcs_publisher.h" +#include "ray/rpc/raylet/raylet_client_pool.h" #include 
"ray/util/thread_checker.h" #include "src/ray/protobuf/gcs.pb.h" namespace ray { namespace gcs { -class GcsActorManager; -class GcsNodeManager; -class GcsPlacementGroupManager; -class GcsResourceManager; - -class GcsAutoscalerStateManager : public rpc::autoscaler::AutoscalerStateHandler { +class GcsAutoscalerStateManager : public rpc::autoscaler::AutoscalerStateServiceHandler { public: GcsAutoscalerStateManager(std::string session_name, GcsNodeManager &gcs_node_manager, @@ -45,7 +47,7 @@ class GcsAutoscalerStateManager : public rpc::autoscaler::AutoscalerStateHandler rpc::RayletClientPool &raylet_client_pool, InternalKVInterface &kv, instrumented_io_context &io_context, - GcsPublisher *gcs_publisher); + pubsub::GcsPublisher *gcs_publisher); void HandleGetClusterResourceState( rpc::autoscaler::GetClusterResourceStateRequest request, @@ -172,7 +174,7 @@ class GcsAutoscalerStateManager : public rpc::autoscaler::AutoscalerStateHandler /// TODO: Implement the function void CancelInfeasibleRequests() const; - // Ray cluster session name. + // The current Ray session name. const std::string session_name_; /// Gcs node manager that provides node status information. @@ -192,7 +194,7 @@ class GcsAutoscalerStateManager : public rpc::autoscaler::AutoscalerStateHandler instrumented_io_context &io_context_; // A publisher for publishing gcs messages. - GcsPublisher *gcs_publisher_; + pubsub::GcsPublisher *gcs_publisher_; // The default value of the last seen version for the request is 0, which indicates // no version has been reported. So the first reported version should be 1. 
diff --git a/src/ray/gcs/gcs_client/BUILD.bazel b/src/ray/gcs/gcs_client/BUILD.bazel deleted file mode 100644 index 3d78c1f66247..000000000000 --- a/src/ray/gcs/gcs_client/BUILD.bazel +++ /dev/null @@ -1,42 +0,0 @@ -load("//bazel:ray.bzl", "ray_cc_library") - -ray_cc_library( - name = "gcs_client_lib", - srcs = [ - "accessor.cc", - "gcs_client.cc", - ], - hdrs = [ - "accessor.h", - "gcs_client.h", - ], - deps = [ - "//src/ray/common:asio", - "//src/ray/common:id", - "//src/ray/gcs:gcs_pb_util", - "//src/ray/gcs/pubsub:gcs_pub_sub_lib", - "//src/ray/gcs/store_client:gcs_redis_store_client", - "//src/ray/protobuf:usage_cc_proto", - "//src/ray/pubsub:subscriber", - "//src/ray/rpc:gcs_client", - "//src/ray/util:container_util", - "//src/ray/util:network_util", - "//src/ray/util:sequencer", - ], -) - -ray_cc_library( - name = "global_state_accessor_lib", - srcs = ["global_state_accessor.cc"], - hdrs = ["global_state_accessor.h"], - deps = [ - ":gcs_client_lib", - ], -) - -ray_cc_library( - name = "gcs_python_callbacks", - hdrs = [ - "python_callbacks.h", - ], -) diff --git a/src/ray/gcs/gcs_server/gcs_function_manager.h b/src/ray/gcs/gcs_function_manager.h similarity index 97% rename from src/ray/gcs/gcs_server/gcs_function_manager.h rename to src/ray/gcs/gcs_function_manager.h index 10eac00744e3..3c861fe83f98 100644 --- a/src/ray/gcs/gcs_server/gcs_function_manager.h +++ b/src/ray/gcs/gcs_function_manager.h @@ -17,7 +17,8 @@ #include "absl/container/flat_hash_map.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/constants.h" -#include "ray/gcs/gcs_server/gcs_kv_manager.h" +#include "ray/common/id.h" +#include "ray/gcs/gcs_kv_manager.h" namespace ray { namespace gcs { diff --git a/src/ray/gcs/gcs_server/gcs_health_check_manager.cc b/src/ray/gcs/gcs_health_check_manager.cc similarity index 94% rename from src/ray/gcs/gcs_server/gcs_health_check_manager.cc rename to src/ray/gcs/gcs_health_check_manager.cc index 239a18c2aa4c..9cc54c945304 100644 
--- a/src/ray/gcs/gcs_server/gcs_health_check_manager.cc +++ b/src/ray/gcs/gcs_health_check_manager.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_server/gcs_health_check_manager.h" +#include "ray/gcs/gcs_health_check_manager.h" #include #include @@ -173,8 +173,8 @@ void GcsHealthCheckManager::HealthCheckContext::StartHealthCheck() { response_ptr, [this, start = now, context = std::move(context), response = std::move(response)]( ::grpc::Status status) { - auto manager = manager_.lock(); - if (manager == nullptr) { + auto gcs_health_check_manager = manager_.lock(); + if (gcs_health_check_manager == nullptr) { delete this; return; } @@ -183,14 +183,14 @@ void GcsHealthCheckManager::HealthCheckContext::StartHealthCheck() { STATS_health_check_rpc_latency_ms.Record( absl::ToInt64Milliseconds(absl::Now() - start)); - manager->io_service_.post( + gcs_health_check_manager->io_service_.post( [this, status, response = std::move(response)]() { if (stopped_) { delete this; return; } - auto manager = manager_.lock(); - if (manager == nullptr) { + auto mgr = manager_.lock(); + if (mgr == nullptr) { delete this; return; } @@ -201,7 +201,7 @@ void GcsHealthCheckManager::HealthCheckContext::StartHealthCheck() { if (status.ok() && response->status() == HealthCheckResponse::SERVING) { // Health check passed. - health_check_remaining_ = manager->failure_threshold_; + health_check_remaining_ = mgr->failure_threshold_; } else { --health_check_remaining_; RAY_LOG(WARNING) @@ -213,15 +213,14 @@ void GcsHealthCheckManager::HealthCheckContext::StartHealthCheck() { } if (health_check_remaining_ == 0) { - manager->FailNode(node_id_); + mgr->FailNode(node_id_); delete this; } else { // Do another health check. // // TODO(hjiang): Able to reduce a few health check based on know resource // usage communication between GCS and raylet. 
- timer_.expires_from_now( - boost::posix_time::milliseconds(manager->period_ms_)); + timer_.expires_from_now(boost::posix_time::milliseconds(mgr->period_ms_)); timer_.async_wait([this](auto) { StartHealthCheck(); }); } }, diff --git a/src/ray/gcs/gcs_server/gcs_health_check_manager.h b/src/ray/gcs/gcs_health_check_manager.h similarity index 100% rename from src/ray/gcs/gcs_server/gcs_health_check_manager.h rename to src/ray/gcs/gcs_health_check_manager.h diff --git a/src/ray/gcs/gcs_server/gcs_init_data.cc b/src/ray/gcs/gcs_init_data.cc similarity index 69% rename from src/ray/gcs/gcs_server/gcs_init_data.cc rename to src/ray/gcs/gcs_init_data.cc index 1e33538bf521..2f695f5e9188 100644 --- a/src/ray/gcs/gcs_server/gcs_init_data.cc +++ b/src/ray/gcs/gcs_init_data.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_server/gcs_init_data.h" +#include "ray/gcs/gcs_init_data.h" #include #include @@ -44,56 +44,53 @@ void GcsInitData::AsyncLoad(Postable on_done) { void GcsInitData::AsyncLoadJobTableData(Postable on_done) { RAY_LOG(INFO) << "Loading job table data."; - RAY_CHECK_OK(gcs_table_storage_.JobTable().GetAll(std::move(on_done).TransformArg( + gcs_table_storage_.JobTable().GetAll(std::move(on_done).TransformArg( [this](absl::flat_hash_map result) { job_table_data_ = std::move(result); RAY_LOG(INFO) << "Finished loading job table data, size = " << job_table_data_.size(); - }))); + })); } void GcsInitData::AsyncLoadNodeTableData(Postable on_done) { RAY_LOG(INFO) << "Loading node table data."; - RAY_CHECK_OK(gcs_table_storage_.NodeTable().GetAll(std::move(on_done).TransformArg( + gcs_table_storage_.NodeTable().GetAll(std::move(on_done).TransformArg( [this](absl::flat_hash_map result) { node_table_data_ = std::move(result); RAY_LOG(INFO) << "Finished loading node table data, size = " << node_table_data_.size(); - }))); + })); } void 
GcsInitData::AsyncLoadPlacementGroupTableData(Postable on_done) { RAY_LOG(INFO) << "Loading placement group table data."; - RAY_CHECK_OK( - gcs_table_storage_.PlacementGroupTable().GetAll(std::move(on_done).TransformArg( - [this](absl::flat_hash_map - result) { - placement_group_table_data_ = std::move(result); - RAY_LOG(INFO) << "Finished loading placement group table data, size = " - << placement_group_table_data_.size(); - }))); + gcs_table_storage_.PlacementGroupTable().GetAll(std::move(on_done).TransformArg( + [this](absl::flat_hash_map result) { + placement_group_table_data_ = std::move(result); + RAY_LOG(INFO) << "Finished loading placement group table data, size = " + << placement_group_table_data_.size(); + })); } void GcsInitData::AsyncLoadActorTableData(Postable on_done) { RAY_LOG(INFO) << "Loading actor table data."; - RAY_CHECK_OK(gcs_table_storage_.ActorTable().AsyncRebuildIndexAndGetAll( + gcs_table_storage_.ActorTable().AsyncRebuildIndexAndGetAll( std::move(on_done).TransformArg( [this](absl::flat_hash_map result) { actor_table_data_ = std::move(result); RAY_LOG(INFO) << "Finished loading actor table data, size = " << actor_table_data_.size(); - }))); + })); } void GcsInitData::AsyncLoadActorTaskSpecTableData(Postable on_done) { RAY_LOG(INFO) << "Loading actor task spec table data."; - RAY_CHECK_OK( - gcs_table_storage_.ActorTaskSpecTable().GetAll(std::move(on_done).TransformArg( - [this](absl::flat_hash_map result) -> void { - actor_task_spec_table_data_ = std::move(result); - RAY_LOG(INFO) << "Finished loading actor task spec table data, size = " - << actor_task_spec_table_data_.size(); - }))); + gcs_table_storage_.ActorTaskSpecTable().GetAll(std::move(on_done).TransformArg( + [this](absl::flat_hash_map result) -> void { + actor_task_spec_table_data_ = std::move(result); + RAY_LOG(INFO) << "Finished loading actor task spec table data, size = " + << actor_task_spec_table_data_.size(); + })); } } // namespace gcs diff --git 
a/src/ray/gcs/gcs_server/gcs_init_data.h b/src/ray/gcs/gcs_init_data.h similarity index 97% rename from src/ray/gcs/gcs_server/gcs_init_data.h rename to src/ray/gcs/gcs_init_data.h index f6627499cbfd..1fcd02897346 100644 --- a/src/ray/gcs/gcs_server/gcs_init_data.h +++ b/src/ray/gcs/gcs_init_data.h @@ -17,8 +17,7 @@ #include "absl/container/flat_hash_map.h" #include "ray/common/asio/postable.h" #include "ray/common/id.h" -#include "ray/gcs/callback.h" -#include "ray/gcs/gcs_server/gcs_table_storage.h" +#include "ray/gcs/gcs_table_storage.h" #include "src/ray/protobuf/gcs.pb.h" namespace ray { diff --git a/src/ray/gcs/gcs_server/gcs_job_manager.cc b/src/ray/gcs/gcs_job_manager.cc similarity index 88% rename from src/ray/gcs/gcs_server/gcs_job_manager.cc rename to src/ray/gcs/gcs_job_manager.cc index 5400b353dc06..a2eee9004c80 100644 --- a/src/ray/gcs/gcs_server/gcs_job_manager.cc +++ b/src/ray/gcs/gcs_job_manager.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_server/gcs_job_manager.h" +#include "ray/gcs/gcs_job_manager.h" #include #include @@ -21,8 +21,12 @@ #include #include -#include "ray/gcs/pb_util.h" +#include "absl/strings/match.h" +#include "ray/common/protobuf_utils.h" +#include "ray/observability/ray_driver_job_definition_event.h" +#include "ray/observability/ray_driver_job_execution_event.h" #include "ray/stats/metric.h" +#include "ray/util/time.h" namespace ray { namespace gcs { @@ -40,18 +44,33 @@ void GcsJobManager::Initialize(const GcsInitData &gcs_init_data) { } } -void GcsJobManager::WriteDriverJobExportEvent(rpc::JobTableData job_data) const { +void GcsJobManager::WriteDriverJobExportEvent( + rpc::JobTableData job_data, rpc::events::DriverJobExecutionEvent::State state) const { /// Write job_data as a export driver job event if /// enable_export_api_write() is enabled and if this job is /// not in the _ray_internal_ namespace. 
- if (!export_event_write_enabled_) { - return; - } - if (job_data.config().ray_namespace().find(kRayInternalNamespacePrefix) == 0) { + if (absl::StartsWith(job_data.config().ray_namespace(), kRayInternalNamespacePrefix)) { // Namespace of this job starts with _ray_internal_ so // don't write export event. return; } + if (RayConfig::instance().enable_ray_event()) { + std::vector> events; + if (state == rpc::events::DriverJobExecutionEvent::CREATED) { + // Job definition event is emitted once when the job is created. + events.push_back(std::make_unique( + job_data, session_name_)); + } + events.push_back(std::make_unique( + job_data, state, session_name_)); + ray_event_recorder_.AddEvents(std::move(events)); + return; + } + + // TODO(#56391): to be deprecated once the Ray Event system is stable. + if (!export_event_write_enabled_) { + return; + } std::shared_ptr export_driver_job_data_ptr = std::make_shared(); export_driver_job_data_ptr->set_job_id(job_data.job_id()); @@ -102,15 +121,14 @@ void GcsJobManager::HandleAddJob(rpc::AddJobRequest request, job_table_data = mutable_job_table_data, reply, send_reply_callback = - std::move(send_reply_callback)](const Status &status) { - RAY_CHECK(thread_checker_.IsOnSameThread()); - + std::move(send_reply_callback)](const Status &status) mutable { + WriteDriverJobExportEvent(job_table_data, + rpc::events::DriverJobExecutionEvent::CREATED); if (!status.ok()) { RAY_LOG(ERROR).WithField(job_id).WithField("driver_pid", job_table_data.driver_pid()) << "Failed to register job."; } else { - RAY_CHECK_OK(gcs_publisher_.PublishJob(job_id, job_table_data, /*done=*/nullptr)); if (job_table_data.config().has_runtime_env_info()) { runtime_env_manager_.AddURIReference(job_id.Hex(), job_table_data.config().runtime_env_info()); @@ -123,16 +141,14 @@ void GcsJobManager::HandleAddJob(rpc::AddJobRequest request, // Intentionally not checking return value, since the function could be invoked for // multiple times and requires idempotency (i.e. 
due to retry). running_job_start_times_.insert({job_id, job_table_data.start_time()}); + gcs_publisher_.PublishJob(job_id, std::move(job_table_data)); } - WriteDriverJobExportEvent(job_table_data); + GCS_RPC_SEND_REPLY(send_reply_callback, reply, status); }; - Status status = gcs_table_storage_.JobTable().Put( - job_id, mutable_job_table_data, {on_done, io_context_}); - if (!status.ok()) { - on_done(status); - } + gcs_table_storage_.JobTable().Put( + job_id, mutable_job_table_data, {std::move(on_done), io_context_}); } void GcsJobManager::MarkJobAsFinished(rpc::JobTableData job_table_data, @@ -151,13 +167,14 @@ void GcsJobManager::MarkJobAsFinished(rpc::JobTableData job_table_data, if (!status.ok()) { RAY_LOG(ERROR).WithField(job_id) << "Failed to mark job as finished."; } else { - RAY_CHECK_OK(gcs_publisher_.PublishJob(job_id, job_table_data, nullptr)); + gcs_publisher_.PublishJob(job_id, job_table_data); runtime_env_manager_.RemoveURIReference(job_id.Hex()); ClearJobInfos(job_table_data); RAY_LOG(DEBUG).WithField(job_id) << "Marked job as finished."; } function_manager_.RemoveJobReference(job_id); - WriteDriverJobExportEvent(job_table_data); + WriteDriverJobExportEvent(job_table_data, + rpc::events::DriverJobExecutionEvent::FINISHED); // Update running job status. 
// Note: This operation must be idempotent since MarkJobFinished can be called @@ -174,11 +191,8 @@ void GcsJobManager::MarkJobAsFinished(rpc::JobTableData job_table_data, done_callback(status); }; - Status status = - gcs_table_storage_.JobTable().Put(job_id, job_table_data, {on_done, io_context_}); - if (!status.ok()) { - on_done(status); - } + gcs_table_storage_.JobTable().Put( + job_id, job_table_data, {std::move(on_done), io_context_}); } void GcsJobManager::HandleMarkJobFinished(rpc::MarkJobFinishedRequest request, @@ -191,13 +205,13 @@ void GcsJobManager::HandleMarkJobFinished(rpc::MarkJobFinishedRequest request, GCS_RPC_SEND_REPLY(send_reply_callback, reply, status); }; - Status status = gcs_table_storage_.JobTable().Get( + gcs_table_storage_.JobTable().Get( job_id, - {[this, job_id, send_reply](Status status, + {[this, job_id, send_reply](Status get_status, std::optional result) { RAY_CHECK(thread_checker_.IsOnSameThread()); - if (status.ok() && result) { + if (get_status.ok() && result) { MarkJobAsFinished(*result, send_reply); return; } @@ -205,16 +219,13 @@ void GcsJobManager::HandleMarkJobFinished(rpc::MarkJobFinishedRequest request, if (!result.has_value()) { RAY_LOG(ERROR).WithField(job_id) << "Tried to mark job as finished, but no job table entry was found."; - } else if (!status.ok()) { + } else if (!get_status.ok()) { RAY_LOG(ERROR).WithField(job_id) - << "Failed to mark job as finished: " << status; + << "Failed to mark job as finished: " << get_status; } - send_reply(status); + send_reply(get_status); }, io_context_}); - if (!status.ok()) { - send_reply(status); - } } void GcsJobManager::ClearJobInfos(const rpc::JobTableData &job_data) { @@ -417,8 +428,8 @@ void GcsJobManager::HandleGetAllJobInfo(rpc::GetAllJobInfoRequest request, send_reply_callback, job_data_key_to_indices, num_finished_tasks, - try_send_reply](const auto &result) { - for (const auto &data : result) { + try_send_reply](const auto &job_info_result) { + for (const auto &data : 
job_info_result) { const std::string &job_data_key = data.first; // The JobInfo stored by the Ray Job API. const std::string &job_info_json = data.second; @@ -433,8 +444,8 @@ void GcsJobManager::HandleGetAllJobInfo(rpc::GetAllJobInfoRequest request, << job_info_json << " Error: " << status.message(); } // Add the JobInfo to the correct indices in the reply. - for (int i : job_data_key_to_indices.at(job_data_key)) { - reply->mutable_job_info_list(i)->mutable_job_info()->CopyFrom( + for (int j : job_data_key_to_indices.at(job_data_key)) { + reply->mutable_job_info_list(j)->mutable_job_info()->CopyFrom( jobs_api_info); } } @@ -446,17 +457,14 @@ void GcsJobManager::HandleGetAllJobInfo(rpc::GetAllJobInfoRequest request, "job", job_api_data_keys, {kv_multi_get_callback, io_context_}); } }; - Status status = gcs_table_storage_.JobTable().GetAll({on_done, io_context_}); - if (!status.ok()) { - on_done(absl::flat_hash_map()); - } + gcs_table_storage_.JobTable().GetAll({std::move(on_done), io_context_}); } void GcsJobManager::HandleReportJobError(rpc::ReportJobErrorRequest request, rpc::ReportJobErrorReply *reply, rpc::SendReplyCallback send_reply_callback) { auto job_id = JobID::FromBinary(request.job_error().job_id()); - RAY_CHECK_OK(gcs_publisher_.PublishError(job_id.Hex(), request.job_error(), nullptr)); + gcs_publisher_.PublishError(job_id.Hex(), std::move(*request.mutable_job_error())); GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK()); } @@ -468,7 +476,7 @@ void GcsJobManager::HandleGetNextJobID(rpc::GetNextJobIDRequest request, reply->set_job_id(job_id); GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK()); }; - RAY_CHECK_OK(gcs_table_storage_.AsyncGetNextJobID({std::move(callback), io_context_})); + gcs_table_storage_.AsyncGetNextJobID({std::move(callback), io_context_}); } std::shared_ptr GcsJobManager::GetJobConfig(const JobID &job_id) const { @@ -489,7 +497,7 @@ void GcsJobManager::OnNodeDead(const NodeID &node_id) { // - (1) are not already 
dead. // - (2) have their driver running on the dead node. for (auto &data : result) { - auto driver_node_id = NodeID::FromBinary(data.second.driver_address().raylet_id()); + auto driver_node_id = NodeID::FromBinary(data.second.driver_address().node_id()); if (!data.second.is_dead() && driver_node_id == node_id) { MarkJobAsFinished(data.second, [data](Status status) { if (!status.ok()) { @@ -500,7 +508,7 @@ void GcsJobManager::OnNodeDead(const NodeID &node_id) { } }; - RAY_CHECK_OK(gcs_table_storage_.JobTable().GetAll({on_done, io_context_})); + gcs_table_storage_.JobTable().GetAll({std::move(on_done), io_context_}); } void GcsJobManager::RecordMetrics() { diff --git a/src/ray/gcs/gcs_server/gcs_job_manager.h b/src/ray/gcs/gcs_job_manager.h similarity index 83% rename from src/ray/gcs/gcs_server/gcs_job_manager.h rename to src/ray/gcs/gcs_job_manager.h index a9484e7c0277..9a6db62a494b 100644 --- a/src/ray/gcs/gcs_server/gcs_job_manager.h +++ b/src/ray/gcs/gcs_job_manager.h @@ -21,13 +21,14 @@ #include #include "absl/container/flat_hash_map.h" -#include "absl/container/flat_hash_set.h" #include "ray/common/runtime_env_manager.h" -#include "ray/gcs/gcs_server/gcs_function_manager.h" -#include "ray/gcs/gcs_server/gcs_init_data.h" -#include "ray/gcs/gcs_server/gcs_table_storage.h" -#include "ray/gcs/pubsub/gcs_pub_sub.h" -#include "ray/rpc/gcs/gcs_rpc_server.h" +#include "ray/gcs/gcs_function_manager.h" +#include "ray/gcs/gcs_init_data.h" +#include "ray/gcs/gcs_kv_manager.h" +#include "ray/gcs/gcs_table_storage.h" +#include "ray/gcs/grpc_service_interfaces.h" +#include "ray/observability/ray_event_recorder_interface.h" +#include "ray/pubsub/gcs_publisher.h" #include "ray/rpc/worker/core_worker_client.h" #include "ray/rpc/worker/core_worker_client_pool.h" #include "ray/util/event.h" @@ -46,18 +47,20 @@ inline std::string JobDataKey(const std::string &submission_id) { return kJobDataKeyPrefix + submission_id; } -using JobFinishListenerCallback = 
rpc::JobInfoHandler::JobFinishListenerCallback; +using JobFinishListenerCallback = + rpc::JobInfoGcsServiceHandler::JobFinishListenerCallback; -/// This implementation class of `JobInfoHandler`. -class GcsJobManager : public rpc::JobInfoHandler { +class GcsJobManager : public rpc::JobInfoGcsServiceHandler { public: explicit GcsJobManager(GcsTableStorage &gcs_table_storage, - GcsPublisher &gcs_publisher, + pubsub::GcsPublisher &gcs_publisher, RuntimeEnvManager &runtime_env_manager, GCSFunctionManager &function_manager, InternalKVInterface &internal_kv, instrumented_io_context &io_context, - rpc::CoreWorkerClientPool &worker_client_pool) + rpc::CoreWorkerClientPool &worker_client_pool, + observability::RayEventRecorderInterface &ray_event_recorder, + const std::string &session_name) : gcs_table_storage_(gcs_table_storage), gcs_publisher_(gcs_publisher), runtime_env_manager_(runtime_env_manager), @@ -65,6 +68,8 @@ class GcsJobManager : public rpc::JobInfoHandler { internal_kv_(internal_kv), io_context_(io_context), worker_client_pool_(worker_client_pool), + ray_event_recorder_(ray_event_recorder), + session_name_(session_name), export_event_write_enabled_(IsExportAPIEnabledDriverJob()) {} void Initialize(const GcsInitData &gcs_init_data); @@ -99,7 +104,8 @@ class GcsJobManager : public rpc::JobInfoHandler { /// \param node_id The specified node id. void OnNodeDead(const NodeID &node_id); - void WriteDriverJobExportEvent(rpc::JobTableData job_data) const; + void WriteDriverJobExportEvent(rpc::JobTableData job_data, + rpc::events::DriverJobExecutionEvent::State state) const; // Verify if export events should be written for EXPORT_DRIVER_JOB source types bool IsExportAPIEnabledDriverJob() const { @@ -132,7 +138,7 @@ class GcsJobManager : public rpc::JobInfoHandler { int64_t finished_jobs_count_ = 0; GcsTableStorage &gcs_table_storage_; - GcsPublisher &gcs_publisher_; + pubsub::GcsPublisher &gcs_publisher_; /// Listeners which monitors the finish of jobs. 
std::vector job_finished_listeners_; @@ -145,6 +151,8 @@ class GcsJobManager : public rpc::JobInfoHandler { InternalKVInterface &internal_kv_; instrumented_io_context &io_context_; rpc::CoreWorkerClientPool &worker_client_pool_; + observability::RayEventRecorderInterface &ray_event_recorder_; + std::string session_name_; /// If true, driver job events are exported for Export API bool export_event_write_enabled_ = false; diff --git a/src/ray/gcs/gcs_server/gcs_kv_manager.cc b/src/ray/gcs/gcs_kv_manager.cc similarity index 99% rename from src/ray/gcs/gcs_server/gcs_kv_manager.cc rename to src/ray/gcs/gcs_kv_manager.cc index 26e77cf3bf1f..988604021f68 100644 --- a/src/ray/gcs/gcs_server/gcs_kv_manager.cc +++ b/src/ray/gcs/gcs_kv_manager.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_server/gcs_kv_manager.h" +#include "ray/gcs/gcs_kv_manager.h" #include #include diff --git a/src/ray/gcs/gcs_server/gcs_kv_manager.h b/src/ray/gcs/gcs_kv_manager.h similarity index 97% rename from src/ray/gcs/gcs_server/gcs_kv_manager.h rename to src/ray/gcs/gcs_kv_manager.h index eb1ca3b302f7..6814b593e92e 100644 --- a/src/ray/gcs/gcs_server/gcs_kv_manager.h +++ b/src/ray/gcs/gcs_kv_manager.h @@ -22,7 +22,7 @@ #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/asio/postable.h" #include "ray/common/status.h" -#include "ray/rpc/gcs/gcs_rpc_server.h" +#include "ray/gcs/grpc_service_interfaces.h" namespace ray { namespace gcs { @@ -100,8 +100,7 @@ class InternalKVInterface { virtual ~InternalKVInterface() = default; }; -/// This implementation class of `InternalKVHandler`. 
-class GcsInternalKVManager : public rpc::InternalKVHandler { +class GcsInternalKVManager : public rpc::InternalKVGcsServiceHandler { public: explicit GcsInternalKVManager(std::unique_ptr kv_instance, std::string raylet_config_list, diff --git a/src/ray/gcs/gcs_server/gcs_node_manager.cc b/src/ray/gcs/gcs_node_manager.cc similarity index 88% rename from src/ray/gcs/gcs_server/gcs_node_manager.cc rename to src/ray/gcs/gcs_node_manager.cc index a9a54756cf85..ffa6a7fee49a 100644 --- a/src/ray/gcs/gcs_server/gcs_node_manager.cc +++ b/src/ray/gcs/gcs_node_manager.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_server/gcs_node_manager.h" +#include "ray/gcs/gcs_node_manager.h" #include #include @@ -21,17 +21,16 @@ #include #include -#include "ray/common/ray_config.h" -#include "ray/gcs/pb_util.h" -#include "ray/util/event.h" +#include "absl/container/flat_hash_set.h" +#include "ray/common/protobuf_utils.h" #include "ray/util/logging.h" +#include "ray/util/time.h" #include "src/ray/protobuf/gcs.pb.h" namespace ray { namespace gcs { -////////////////////////////////////////////////////////////////////////////////////////// -GcsNodeManager::GcsNodeManager(GcsPublisher *gcs_publisher, +GcsNodeManager::GcsNodeManager(pubsub::GcsPublisher *gcs_publisher, gcs::GcsTableStorage *gcs_table_storage, instrumented_io_context &io_context, rpc::RayletClientPool *raylet_client_pool, @@ -90,15 +89,15 @@ void GcsNodeManager::HandleRegisterNode(rpc::RegisterNodeRequest request, .WithField("node_address", node_info.node_manager_address()) << "Registering new node."; - auto on_done = - [this, node_id, node_info, reply, send_reply_callback](const Status &status) { - RAY_CHECK_OK(status) << "Failed to register node '" << node_id << "'."; - RAY_LOG(DEBUG).WithField(node_id) << "Finished registering node."; - RAY_CHECK_OK(gcs_publisher_->PublishNodeInfo(node_id, node_info, nullptr)); - 
AddNode(std::make_shared(node_info)); - WriteNodeExportEvent(node_info); - GCS_RPC_SEND_REPLY(send_reply_callback, reply, status); - }; + auto on_done = [this, node_id, node_info_copy = node_info, reply, send_reply_callback]( + const Status &status) mutable { + RAY_CHECK_OK(status) << "Failed to register node '" << node_id << "'."; + RAY_LOG(DEBUG).WithField(node_id) << "Finished registering node."; + AddNode(std::make_shared(node_info_copy)); + WriteNodeExportEvent(node_info_copy); + gcs_publisher_->PublishNodeInfo(node_id, std::move(node_info_copy)); + GCS_RPC_SEND_REPLY(send_reply_callback, reply, status); + }; if (node_info.is_head_node()) { // mark all old head nodes as dead if exists: // 1. should never happen when HA is not used @@ -114,18 +113,17 @@ void GcsNodeManager::HandleRegisterNode(rpc::RegisterNodeRequest request, RAY_CHECK_LE(head_nodes.size(), 1UL); if (head_nodes.size() == 1) { OnNodeFailure(head_nodes[0], - [this, node_id, node_info, on_done](const Status &status) { - RAY_CHECK_OK(status); - RAY_CHECK_OK(gcs_table_storage_->NodeTable().Put( - node_id, node_info, {on_done, io_context_})); + [this, node_id, node_info, on_done = std::move(on_done)]() { + gcs_table_storage_->NodeTable().Put( + node_id, node_info, {on_done, io_context_}); }); } else { - RAY_CHECK_OK(gcs_table_storage_->NodeTable().Put( - node_id, node_info, {on_done, io_context_})); + gcs_table_storage_->NodeTable().Put( + node_id, node_info, {std::move(on_done), io_context_}); } } else { - RAY_CHECK_OK( - gcs_table_storage_->NodeTable().Put(node_id, node_info, {on_done, io_context_})); + gcs_table_storage_->NodeTable().Put( + node_id, node_info, {std::move(on_done), io_context_}); } ++counts_[CountType::REGISTER_NODE_REQUEST]; } @@ -166,11 +164,10 @@ void GcsNodeManager::HandleUnregisterNode(rpc::UnregisterNodeRequest request, node_info_delta->set_end_time_ms(node->end_time_ms()); auto on_put_done = [this, node_id, node_info_delta, node](const Status &status) { - 
RAY_CHECK_OK(gcs_publisher_->PublishNodeInfo(node_id, *node_info_delta, nullptr)); + gcs_publisher_->PublishNodeInfo(node_id, *node_info_delta); WriteNodeExportEvent(*node); }; - RAY_CHECK_OK( - gcs_table_storage_->NodeTable().Put(node_id, *node, {on_put_done, io_context_})); + gcs_table_storage_->NodeTable().Put(node_id, *node, {on_put_done, io_context_}); GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK()); } @@ -406,7 +403,7 @@ std::shared_ptr GcsNodeManager::RemoveNode( << ", death reason = " << rpc::NodeDeathInfo_Reason_Name(death_info->reason()) << ", death message = " << death_info->reason_message(); // Record stats that there's a new removed node. - stats::NodeFailureTotal.Record(1); + ray_metric_node_failures_total_.Record(1); // Remove from alive nodes. alive_nodes_.erase(iter); // Remove from draining nodes if present. @@ -430,9 +427,9 @@ std::shared_ptr GcsNodeManager::RemoveNode( .WithField("ip", removed_node->node_manager_address()) << error_message.str(); RAY_LOG(WARNING) << error_message.str(); - auto error_data_ptr = gcs::CreateErrorTableData( + auto error_data = CreateErrorTableData( type, error_message.str(), absl::FromUnixMillis(current_time_ms())); - RAY_CHECK_OK(gcs_publisher_->PublishError(node_id.Hex(), *error_data_ptr, nullptr)); + gcs_publisher_->PublishError(node_id.Hex(), std::move(error_data)); } // Notify all listeners. 
@@ -443,8 +440,8 @@ std::shared_ptr GcsNodeManager::RemoveNode( return removed_node; } -void GcsNodeManager::OnNodeFailure(const NodeID &node_id, - const StatusCallback &node_table_updated_callback) { +void GcsNodeManager::OnNodeFailure( + const NodeID &node_id, const std::function &node_table_updated_callback) { auto maybe_node = GetAliveNode(node_id); if (maybe_node.has_value()) { rpc::NodeDeathInfo death_info = InferDeathInfo(node_id); @@ -453,24 +450,27 @@ void GcsNodeManager::OnNodeFailure(const NodeID &node_id, node->set_end_time_ms(current_sys_time_ms()); AddDeadNodeToCache(node); - auto node_info_delta = std::make_shared(); - node_info_delta->set_node_id(node->node_id()); - node_info_delta->set_state(node->state()); - node_info_delta->set_end_time_ms(node->end_time_ms()); - node_info_delta->mutable_death_info()->CopyFrom(node->death_info()); - - auto on_done = [this, node_id, node_table_updated_callback, node_info_delta, node]( - const Status &status) { + rpc::GcsNodeInfo node_info_delta; + node_info_delta.set_node_id(node->node_id()); + node_info_delta.set_state(node->state()); + node_info_delta.set_end_time_ms(node->end_time_ms()); + node_info_delta.mutable_death_info()->CopyFrom(node->death_info()); + + auto on_done = [this, + node_id, + node_table_updated_callback, + node_info_delta = std::move(node_info_delta), + node](const Status &status) mutable { WriteNodeExportEvent(*node); if (node_table_updated_callback != nullptr) { - node_table_updated_callback(Status::OK()); + node_table_updated_callback(); } - RAY_CHECK_OK(gcs_publisher_->PublishNodeInfo(node_id, *node_info_delta, nullptr)); + gcs_publisher_->PublishNodeInfo(node_id, std::move(node_info_delta)); }; - RAY_CHECK_OK( - gcs_table_storage_->NodeTable().Put(node_id, *node, {on_done, io_context_})); + gcs_table_storage_->NodeTable().Put( + node_id, *node, {std::move(on_done), io_context_}); } else if (node_table_updated_callback != nullptr) { - node_table_updated_callback(Status::OK()); + 
node_table_updated_callback(); } } @@ -504,8 +504,7 @@ void GcsNodeManager::Initialize(const GcsInitData &gcs_init_data) { void GcsNodeManager::AddDeadNodeToCache(std::shared_ptr node) { if (dead_nodes_.size() >= RayConfig::instance().maximum_gcs_dead_node_cached_count()) { const auto &node_id = sorted_dead_node_list_.front().first; - RAY_CHECK_OK(gcs_table_storage_->NodeTable().Delete( - node_id, {[](const auto &) {}, io_context_})); + gcs_table_storage_->NodeTable().Delete(node_id, {[](const auto &) {}, io_context_}); dead_nodes_.erase(sorted_dead_node_list_.front().first); sorted_dead_node_list_.pop_front(); } @@ -527,7 +526,7 @@ std::string GcsNodeManager::DebugString() const { void GcsNodeManager::UpdateAliveNode( const NodeID &node_id, - const syncer::ResourceViewSyncMessage &resource_view_sync_message) { + const rpc::syncer::ResourceViewSyncMessage &resource_view_sync_message) { auto maybe_node_info = GetAliveNode(node_id); if (maybe_node_info == absl::nullopt) { return; diff --git a/src/ray/gcs/gcs_server/gcs_node_manager.h b/src/ray/gcs/gcs_node_manager.h similarity index 90% rename from src/ray/gcs/gcs_server/gcs_node_manager.h rename to src/ray/gcs/gcs_node_manager.h index 972ecba9c4e6..fe463d6adf8d 100644 --- a/src/ray/gcs/gcs_server/gcs_node_manager.h +++ b/src/ray/gcs/gcs_node_manager.h @@ -14,11 +14,6 @@ #pragma once -#include - -#include -#include -#include #include #include #include @@ -26,35 +21,36 @@ #include #include "absl/container/flat_hash_map.h" -#include "absl/container/flat_hash_set.h" +#include "ray/common/asio/instrumented_io_context.h" #include "ray/common/id.h" -#include "ray/common/ray_syncer/ray_syncer.h" -#include "ray/gcs/gcs_server/gcs_init_data.h" -#include "ray/gcs/gcs_server/gcs_resource_manager.h" -#include "ray/gcs/gcs_server/gcs_table_storage.h" -#include "ray/gcs/pubsub/gcs_pub_sub.h" -#include "ray/rpc/client_call.h" -#include "ray/rpc/gcs/gcs_rpc_server.h" -#include "ray/rpc/node_manager/node_manager_client.h" -#include 
"ray/rpc/node_manager/raylet_client_pool.h" +#include "ray/gcs/gcs_init_data.h" +#include "ray/gcs/gcs_table_storage.h" +#include "ray/gcs/grpc_service_interfaces.h" +#include "ray/pubsub/gcs_publisher.h" +#include "ray/rpc/raylet/raylet_client_pool.h" +#include "ray/stats/metric_defs.h" #include "ray/util/event.h" +#include "src/ray/protobuf/autoscaler.pb.h" #include "src/ray/protobuf/gcs.pb.h" +#include "src/ray/protobuf/ray_syncer.pb.h" -namespace ray::gcs { +namespace ray { +namespace gcs { class GcsAutoscalerStateManagerTest; class GcsStateTest; + /// GcsNodeManager is responsible for managing and monitoring nodes as well as handing /// node and resource related rpc requests. /// This class is not thread-safe. -class GcsNodeManager : public rpc::NodeInfoHandler { +class GcsNodeManager : public rpc::NodeInfoGcsServiceHandler { public: /// Create a GcsNodeManager. /// /// \param gcs_publisher GCS message publisher. /// \param gcs_table_storage GCS table external storage accessor. - GcsNodeManager(GcsPublisher *gcs_publisher, - gcs::GcsTableStorage *gcs_table_storage, + GcsNodeManager(pubsub::GcsPublisher *gcs_publisher, + GcsTableStorage *gcs_table_storage, instrumented_io_context &io_context, rpc::RayletClientPool *raylet_client_pool, const ClusterID &cluster_id); @@ -96,7 +92,7 @@ class GcsNodeManager : public rpc::NodeInfoHandler { /// \param node_table_updated_callback The status callback function after /// faled node info is updated to gcs node table. void OnNodeFailure(const NodeID &node_id, - const StatusCallback &node_table_updated_callback); + const std::function &node_table_updated_callback); /// Add an alive node. /// @@ -176,8 +172,9 @@ class GcsNodeManager : public rpc::NodeInfoHandler { /// /// \param node_id The ID of the node to update. /// \param resource_view_sync_message The sync message containing the new state. 
- void UpdateAliveNode(const NodeID &node_id, - const syncer::ResourceViewSyncMessage &resource_view_sync_message); + void UpdateAliveNode( + const NodeID &node_id, + const rpc::syncer::ResourceViewSyncMessage &resource_view_sync_message); private: /// Add the dead node to the cache. If the cache is full, the earliest dead node is @@ -265,9 +262,9 @@ class GcsNodeManager : public rpc::NodeInfoHandler { std::vector)>> node_removed_listeners_; /// A publisher for publishing gcs messages. - GcsPublisher *gcs_publisher_; + pubsub::GcsPublisher *gcs_publisher_; /// Storage for GCS tables. - gcs::GcsTableStorage *gcs_table_storage_; + GcsTableStorage *gcs_table_storage_; instrumented_io_context &io_context_; /// Raylet client pool. rpc::RayletClientPool *raylet_client_pool_; @@ -286,8 +283,15 @@ class GcsNodeManager : public rpc::NodeInfoHandler { /// If true, node events are exported for Export API bool export_event_write_enabled_ = false; + /// Ray metrics + ray::stats::Count ray_metric_node_failures_total_{ + /*name=*/"node_failure_total", + /*description=*/"Number of node failures that have happened in the cluster.", + /*unit=*/""}; + friend GcsAutoscalerStateManagerTest; friend GcsStateTest; }; -} // namespace ray::gcs +} // namespace gcs +} // namespace ray diff --git a/src/ray/gcs/gcs_placement_group.cc b/src/ray/gcs/gcs_placement_group.cc new file mode 100644 index 000000000000..4de5b8fa229b --- /dev/null +++ b/src/ray/gcs/gcs_placement_group.cc @@ -0,0 +1,151 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/gcs/gcs_placement_group.h" + +#include +#include +#include + +#include "ray/stats/metric_defs.h" + +namespace ray { +namespace gcs { + +void GcsPlacementGroup::UpdateState( + rpc::PlacementGroupTableData::PlacementGroupState state) { + if (state == rpc::PlacementGroupTableData::CREATED) { + RAY_CHECK_EQ(placement_group_table_data_.state(), + rpc::PlacementGroupTableData::PREPARED); + placement_group_table_data_.set_placement_group_final_bundle_placement_timestamp_ms( + current_sys_time_ms()); + + double duration_s = + (placement_group_table_data_ + .placement_group_final_bundle_placement_timestamp_ms() - + placement_group_table_data_.placement_group_creation_timestamp_ms()) / + 1000; + stats::STATS_scheduler_placement_time_s.Record(duration_s, + {{"WorkloadType", "PlacementGroup"}}); + } + placement_group_table_data_.set_state(state); + RefreshMetrics(); +} + +rpc::PlacementGroupTableData::PlacementGroupState GcsPlacementGroup::GetState() const { + return placement_group_table_data_.state(); +} + +PlacementGroupID GcsPlacementGroup::GetPlacementGroupID() const { + return PlacementGroupID::FromBinary(placement_group_table_data_.placement_group_id()); +} + +std::string GcsPlacementGroup::GetName() const { + return placement_group_table_data_.name(); +} + +std::string GcsPlacementGroup::GetRayNamespace() const { + return placement_group_table_data_.ray_namespace(); +} + +std::vector> &GcsPlacementGroup::GetBundles() + const { + // Fill the cache if it wasn't. 
+ if (cached_bundle_specs_.empty()) { + const auto &bundles = placement_group_table_data_.bundles(); + for (const auto &bundle : bundles) { + cached_bundle_specs_.push_back(std::make_shared(bundle)); + } + } + return cached_bundle_specs_; +} + +std::vector> +GcsPlacementGroup::GetUnplacedBundles() const { + const auto &bundle_specs = GetBundles(); + + std::vector> unplaced_bundles; + for (const auto &bundle : bundle_specs) { + if (bundle->NodeId().IsNil()) { + unplaced_bundles.push_back(bundle); + } + } + return unplaced_bundles; +} + +bool GcsPlacementGroup::HasUnplacedBundles() const { + return !GetUnplacedBundles().empty(); +} + +rpc::PlacementStrategy GcsPlacementGroup::GetStrategy() const { + return placement_group_table_data_.strategy(); +} + +const rpc::PlacementGroupTableData &GcsPlacementGroup::GetPlacementGroupTableData() + const { + return placement_group_table_data_; +} + +std::string GcsPlacementGroup::DebugString() const { + std::stringstream stream; + stream << "placement group id = " << GetPlacementGroupID() << ", name = " << GetName() + << ", strategy = " << GetStrategy(); + return stream.str(); +} + +rpc::Bundle *GcsPlacementGroup::GetMutableBundle(int bundle_index) { + // Invalidate the cache. 
+ cached_bundle_specs_.clear(); + return placement_group_table_data_.mutable_bundles(bundle_index); +} + +const ActorID GcsPlacementGroup::GetCreatorActorId() const { + return ActorID::FromBinary(placement_group_table_data_.creator_actor_id()); +} + +const JobID GcsPlacementGroup::GetCreatorJobId() const { + return JobID::FromBinary(placement_group_table_data_.creator_job_id()); +} + +void GcsPlacementGroup::MarkCreatorJobDead() { + placement_group_table_data_.set_creator_job_dead(true); +} + +void GcsPlacementGroup::MarkCreatorActorDead() { + placement_group_table_data_.set_creator_actor_dead(true); +} + +bool GcsPlacementGroup::IsPlacementGroupLifetimeDone() const { + return !IsDetached() && placement_group_table_data_.creator_job_dead() && + placement_group_table_data_.creator_actor_dead(); +} + +bool GcsPlacementGroup::IsDetached() const { + return placement_group_table_data_.is_detached(); +} + +NodeID GcsPlacementGroup::GetSoftTargetNodeID() const { + return NodeID::FromBinary(placement_group_table_data_.soft_target_node_id()); +} + +const rpc::PlacementGroupStats &GcsPlacementGroup::GetStats() const { + return placement_group_table_data_.stats(); +} + +rpc::PlacementGroupStats *GcsPlacementGroup::GetMutableStats() { + return placement_group_table_data_.mutable_stats(); +} + +} // namespace gcs +} // namespace ray diff --git a/src/ray/gcs/gcs_placement_group.h b/src/ray/gcs/gcs_placement_group.h new file mode 100644 index 000000000000..61f41fcbbcc4 --- /dev/null +++ b/src/ray/gcs/gcs_placement_group.h @@ -0,0 +1,212 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include +#include +#include +#include + +#include "ray/common/bundle_spec.h" +#include "ray/common/id.h" +#include "ray/util/counter_map.h" +#include "ray/util/time.h" +#include "src/ray/protobuf/gcs_service.pb.h" + +namespace ray { +namespace gcs { + +/// GcsPlacementGroup just wraps `PlacementGroupTableData` and provides some convenient +/// interfaces to access the fields inside `PlacementGroupTableData`. This class is not +/// thread-safe. +class GcsPlacementGroup { + public: + /// Create a GcsPlacementGroup by placement_group_table_data. + /// + /// \param placement_group_table_data Data of the placement_group (see gcs.proto). + explicit GcsPlacementGroup( + rpc::PlacementGroupTableData placement_group_table_data, + std::shared_ptr> + counter) + : placement_group_table_data_(std::move(placement_group_table_data)), + counter_(counter) { + SetupStates(); + } + + /// Create a GcsPlacementGroup by CreatePlacementGroupRequest. + /// + /// \param request Contains the placement group creation task specification. 
+ explicit GcsPlacementGroup( + const ray::rpc::CreatePlacementGroupRequest &request, + std::string ray_namespace, + std::shared_ptr> + counter) + : counter_(counter) { + const auto &placement_group_spec = request.placement_group_spec(); + placement_group_table_data_.set_placement_group_id( + placement_group_spec.placement_group_id()); + placement_group_table_data_.set_name(placement_group_spec.name()); + placement_group_table_data_.set_state(rpc::PlacementGroupTableData::PENDING); + placement_group_table_data_.mutable_bundles()->CopyFrom( + placement_group_spec.bundles()); + placement_group_table_data_.set_strategy(placement_group_spec.strategy()); + placement_group_table_data_.set_creator_job_id(placement_group_spec.creator_job_id()); + placement_group_table_data_.set_creator_actor_id( + placement_group_spec.creator_actor_id()); + placement_group_table_data_.set_creator_job_dead( + placement_group_spec.creator_job_dead()); + placement_group_table_data_.set_creator_actor_dead( + placement_group_spec.creator_actor_dead()); + placement_group_table_data_.set_is_detached(placement_group_spec.is_detached()); + placement_group_table_data_.set_soft_target_node_id( + placement_group_spec.soft_target_node_id()); + placement_group_table_data_.set_ray_namespace(ray_namespace); + placement_group_table_data_.set_placement_group_creation_timestamp_ms( + current_sys_time_ms()); + SetupStates(); + } + + ~GcsPlacementGroup() { + if (last_metric_state_ && + last_metric_state_.value() != rpc::PlacementGroupTableData::REMOVED) { + RAY_LOG(DEBUG) << "Decrementing state at " + << rpc::PlacementGroupTableData::PlacementGroupState_Name( + last_metric_state_.value()); + // Retain groups in the REMOVED state so we have a history of past groups. + counter_->Decrement(last_metric_state_.value()); + } + } + + /// Get the immutable PlacementGroupTableData of this placement group. 
+ const rpc::PlacementGroupTableData &GetPlacementGroupTableData() const; + + /// Get the mutable bundle of this placement group. + rpc::Bundle *GetMutableBundle(int bundle_index); + + /// Update the state of this placement_group. + void UpdateState(rpc::PlacementGroupTableData::PlacementGroupState state); + + /// Get the state of this gcs placement_group. + rpc::PlacementGroupTableData::PlacementGroupState GetState() const; + + /// Get the id of this placement_group. + PlacementGroupID GetPlacementGroupID() const; + + /// Get the name of this placement_group. + std::string GetName() const; + + /// Get the name of this placement_group. + std::string GetRayNamespace() const; + + /// Get the bundles of this placement_group (including unplaced). + std::vector> &GetBundles() const; + + /// Get the unplaced bundles of this placement group. + std::vector> GetUnplacedBundles() const; + + /// Check if there are unplaced bundles. + bool HasUnplacedBundles() const; + + /// Get the Strategy + rpc::PlacementStrategy GetStrategy() const; + + /// Get debug string for the placement group. + std::string DebugString() const; + + /// Below fields are used for automatic cleanup of placement groups. + + /// Get the actor id that created the placement group. + const ActorID GetCreatorActorId() const; + + /// Get the job id that created the placement group. + const JobID GetCreatorJobId() const; + + /// Mark that the creator job of this placement group is dead. + void MarkCreatorJobDead(); + + /// Mark that the creator actor of this placement group is dead. + void MarkCreatorActorDead(); + + /// Return True if the placement group lifetime is done. False otherwise. + bool IsPlacementGroupLifetimeDone() const; + + /// Returns whether or not this is a detached placement group. + bool IsDetached() const; + + /// Return the target node ID where bundles of this placement group should be placed. + /// Only works for STRICT_PACK placement group. 
+ NodeID GetSoftTargetNodeID() const; + + const rpc::PlacementGroupStats &GetStats() const; + + rpc::PlacementGroupStats *GetMutableStats(); + + private: + // Lets the bundle-cache test reach private members of this class. + FRIEND_TEST(GcsPlacementGroupManagerTest, TestPlacementGroupBundleCache); + + /// Setup states other than placement_group_table_data_. + void SetupStates() { + auto stats = placement_group_table_data_.mutable_stats(); + // The default value for the field is 0 + if (stats->creation_request_received_ns() == 0) { + auto now = absl::GetCurrentTimeNanos(); + stats->set_creation_request_received_ns(now); + } + // The default value for the field is 0 + // Only set the state to the QUEUED when the state wasn't persisted before. + if (stats->scheduling_state() == 0) { + stats->set_scheduling_state(rpc::PlacementGroupStats::QUEUED); + } + RefreshMetrics(); + } + + /// Record metric updates if there have been any state changes. + void RefreshMetrics() { + auto cur_state = GetState(); + if (last_metric_state_) { + RAY_LOG(DEBUG) << "Swapping state from " + << rpc::PlacementGroupTableData::PlacementGroupState_Name( + last_metric_state_.value()) + << " to " + << rpc::PlacementGroupTableData::PlacementGroupState_Name(cur_state); + counter_->Swap(last_metric_state_.value(), cur_state); + } else { + RAY_LOG(DEBUG) << "Incrementing state at " + << rpc::PlacementGroupTableData::PlacementGroupState_Name(cur_state); + counter_->Increment(cur_state); + } + last_metric_state_ = cur_state; + } + + /// The placement_group meta data which contains the task specification as well as the + /// state of the gcs placement_group and so on (see gcs.proto). + rpc::PlacementGroupTableData placement_group_table_data_; + /// Creating bundle specification requires heavy computation because it needs to compute + /// formatted strings for all resources (heavy string operations). To optimize the CPU + /// usage, we cache bundle specs.
+ mutable std::vector> cached_bundle_specs_; + + /// Reference to the counter to use for placement group state metrics tracking. + std::shared_ptr> counter_; + + /// The last recorded metric state. + std::optional last_metric_state_; +}; + +} // namespace gcs +} // namespace ray diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_mgr.cc b/src/ray/gcs/gcs_placement_group_manager.cc similarity index 86% rename from src/ray/gcs/gcs_server/gcs_placement_group_mgr.cc rename to src/ray/gcs/gcs_placement_group_manager.cc index 9e9694801586..be3c8d21853a 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_mgr.cc +++ b/src/ray/gcs/gcs_placement_group_manager.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_server/gcs_placement_group_mgr.h" +#include "ray/gcs/gcs_placement_group_manager.h" #include #include @@ -20,9 +20,8 @@ #include #include "ray/common/asio/asio_util.h" -#include "ray/common/asio/instrumented_io_context.h" +#include "ray/common/bundle_spec.h" #include "ray/common/ray_config.h" -#include "ray/gcs/pb_util.h" #include "ray/stats/metric_defs.h" #include "src/ray/protobuf/gcs.pb.h" @@ -54,136 +53,6 @@ ExponentialBackoff CreateDefaultBackoff() { } } // namespace -void GcsPlacementGroup::UpdateState( - rpc::PlacementGroupTableData::PlacementGroupState state) { - if (state == rpc::PlacementGroupTableData::CREATED) { - RAY_CHECK_EQ(placement_group_table_data_.state(), - rpc::PlacementGroupTableData::PREPARED); - placement_group_table_data_.set_placement_group_final_bundle_placement_timestamp_ms( - current_sys_time_ms()); - - double duration_s = - (placement_group_table_data_ - .placement_group_final_bundle_placement_timestamp_ms() - - placement_group_table_data_.placement_group_creation_timestamp_ms()) / - 1000; - stats::STATS_scheduler_placement_time_s.Record(duration_s, - {{"WorkloadType", "PlacementGroup"}}); - } - 
placement_group_table_data_.set_state(state); - RefreshMetrics(); -} - -rpc::PlacementGroupTableData::PlacementGroupState GcsPlacementGroup::GetState() const { - return placement_group_table_data_.state(); -} - -PlacementGroupID GcsPlacementGroup::GetPlacementGroupID() const { - return PlacementGroupID::FromBinary(placement_group_table_data_.placement_group_id()); -} - -std::string GcsPlacementGroup::GetName() const { - return placement_group_table_data_.name(); -} - -std::string GcsPlacementGroup::GetRayNamespace() const { - return placement_group_table_data_.ray_namespace(); -} - -std::vector> &GcsPlacementGroup::GetBundles() - const { - // Fill the cache if it wasn't. - if (cached_bundle_specs_.empty()) { - const auto &bundles = placement_group_table_data_.bundles(); - for (const auto &bundle : bundles) { - cached_bundle_specs_.push_back(std::make_shared(bundle)); - } - } - return cached_bundle_specs_; -} - -std::vector> -GcsPlacementGroup::GetUnplacedBundles() const { - const auto &bundle_specs = GetBundles(); - - std::vector> unplaced_bundles; - for (const auto &bundle : bundle_specs) { - if (bundle->NodeId().IsNil()) { - unplaced_bundles.push_back(bundle); - } - } - return unplaced_bundles; -} - -bool GcsPlacementGroup::HasUnplacedBundles() const { - return !GetUnplacedBundles().empty(); -} - -rpc::PlacementStrategy GcsPlacementGroup::GetStrategy() const { - return placement_group_table_data_.strategy(); -} - -const rpc::PlacementGroupTableData &GcsPlacementGroup::GetPlacementGroupTableData() - const { - return placement_group_table_data_; -} - -std::string GcsPlacementGroup::DebugString() const { - std::stringstream stream; - stream << "placement group id = " << GetPlacementGroupID() << ", name = " << GetName() - << ", strategy = " << GetStrategy(); - return stream.str(); -} - -rpc::Bundle *GcsPlacementGroup::GetMutableBundle(int bundle_index) { - // Invalidate the cache. 
- cached_bundle_specs_.clear(); - return placement_group_table_data_.mutable_bundles(bundle_index); -} - -const ActorID GcsPlacementGroup::GetCreatorActorId() const { - return ActorID::FromBinary(placement_group_table_data_.creator_actor_id()); -} - -const JobID GcsPlacementGroup::GetCreatorJobId() const { - return JobID::FromBinary(placement_group_table_data_.creator_job_id()); -} - -void GcsPlacementGroup::MarkCreatorJobDead() { - placement_group_table_data_.set_creator_job_dead(true); -} - -void GcsPlacementGroup::MarkCreatorActorDead() { - placement_group_table_data_.set_creator_actor_dead(true); -} - -bool GcsPlacementGroup::IsPlacementGroupLifetimeDone() const { - return !IsDetached() && placement_group_table_data_.creator_job_dead() && - placement_group_table_data_.creator_actor_dead(); -} - -bool GcsPlacementGroup::IsDetached() const { - return placement_group_table_data_.is_detached(); -} - -double GcsPlacementGroup::GetMaxCpuFractionPerNode() const { - return placement_group_table_data_.max_cpu_fraction_per_node(); -} - -NodeID GcsPlacementGroup::GetSoftTargetNodeID() const { - return NodeID::FromBinary(placement_group_table_data_.soft_target_node_id()); -} - -const rpc::PlacementGroupStats &GcsPlacementGroup::GetStats() const { - return placement_group_table_data_.stats(); -} - -rpc::PlacementGroupStats *GcsPlacementGroup::GetMutableStats() { - return placement_group_table_data_.mutable_stats(); -} - -///////////////////////////////////////////////////////////////////////////////////////// - GcsPlacementGroupManager::GcsPlacementGroupManager( instrumented_io_context &io_context, GcsResourceManager &gcs_resource_manager) : io_context_(io_context), gcs_resource_manager_(gcs_resource_manager) {} @@ -263,18 +132,19 @@ void GcsPlacementGroupManager::RegisterPlacementGroup( placement_group); AddToPendingQueue(placement_group); - RAY_CHECK_OK(gcs_table_storage_->PlacementGroupTable().Put( + gcs_table_storage_->PlacementGroupTable().Put( placement_group_id, 
placement_group->GetPlacementGroupTableData(), {[this, placement_group_id, placement_group](Status status) { // The backend storage is supposed to be reliable, so the status must be ok. RAY_CHECK_OK(status); if (registered_placement_groups_.contains(placement_group_id)) { - auto iter = placement_group_to_register_callbacks_.find(placement_group_id); - auto callbacks = std::move(iter->second); - placement_group_to_register_callbacks_.erase(iter); - for (const auto &callback : callbacks) { - callback(status); + auto register_callback_iter = + placement_group_to_register_callbacks_.find(placement_group_id); + auto callbacks = std::move(register_callback_iter->second); + placement_group_to_register_callbacks_.erase(register_callback_iter); + for (const auto &register_callback : callbacks) { + register_callback(status); } SchedulePendingPlacementGroups(); } else { @@ -292,7 +162,7 @@ void GcsPlacementGroupManager::RegisterPlacementGroup( return; } }, - io_context_})); + io_context_}); } PlacementGroupID GcsPlacementGroupManager::GetPlacementGroupIDByName( @@ -374,7 +244,7 @@ void GcsPlacementGroupManager::OnPlacementGroupCreationSuccess( // Update states and persists the information.
placement_group->UpdateState(rpc::PlacementGroupTableData::CREATED); auto placement_group_id = placement_group->GetPlacementGroupID(); - RAY_CHECK_OK(gcs_table_storage_->PlacementGroupTable().Put( + gcs_table_storage_->PlacementGroupTable().Put( placement_group_id, placement_group->GetPlacementGroupTableData(), {[this, placement_group_id](Status status) { @@ -398,7 +268,7 @@ void GcsPlacementGroupManager::OnPlacementGroupCreationSuccess( placement_group_to_create_callbacks_.erase(pg_to_create_iter); } }, - io_context_})); + io_context_}); lifetime_num_placement_groups_created_++; io_context_.post([this] { SchedulePendingPlacementGroups(); }, "GcsPlacementGroupManager.SchedulePendingPlacementGroups"); @@ -442,14 +312,14 @@ void GcsPlacementGroupManager::SchedulePendingPlacementGroups() { gcs_placement_group_scheduler_->ScheduleUnplacedBundles(SchedulePgRequest{ /*placement_group=*/placement_group, /*failure_callback=*/ - [this, backoff](std::shared_ptr placement_group, + [this, backoff](std::shared_ptr failure_placement_group, bool is_feasible) { OnPlacementGroupCreationFailed( - std::move(placement_group), backoff, is_feasible); + std::move(failure_placement_group), backoff, is_feasible); }, /*success_callback=*/ - [this](std::shared_ptr placement_group) { - OnPlacementGroupCreationSuccess(placement_group); + [this](std::shared_ptr success_placement_group) { + OnPlacementGroupCreationSuccess(success_placement_group); }}); is_new_placement_group_scheduled = true; } @@ -550,8 +420,9 @@ void GcsPlacementGroupManager::RemovePlacementGroup( auto pending_it = std::find_if( infeasible_placement_groups_.begin(), infeasible_placement_groups_.end(), - [placement_group_id](const std::shared_ptr &placement_group) { - return placement_group->GetPlacementGroupID() == placement_group_id; + [placement_group_id]( + const std::shared_ptr &this_placement_group) { + return this_placement_group->GetPlacementGroupID() == placement_group_id; }); if (pending_it != 
infeasible_placement_groups_.end()) { // The placement group is infeasible now, remove it from the queue. @@ -562,7 +433,7 @@ void GcsPlacementGroupManager::RemovePlacementGroup( placement_group->UpdateState(rpc::PlacementGroupTableData::REMOVED); placement_group->GetMutableStats()->set_scheduling_state( rpc::PlacementGroupStats::REMOVED); - RAY_CHECK_OK(gcs_table_storage_->PlacementGroupTable().Put( + gcs_table_storage_->PlacementGroupTable().Put( placement_group->GetPlacementGroupID(), placement_group->GetPlacementGroupTableData(), {[this, on_placement_group_removed, placement_group_id](Status status) { @@ -579,7 +450,7 @@ void GcsPlacementGroupManager::RemovePlacementGroup( } on_placement_group_removed(status); }, - io_context_})); + io_context_}); } void GcsPlacementGroupManager::HandleGetPlacementGroup( @@ -606,11 +477,8 @@ void GcsPlacementGroupManager::HandleGetPlacementGroup( if (it != registered_placement_groups_.end()) { on_done(Status::OK(), it->second->GetPlacementGroupTableData()); } else { - Status status = gcs_table_storage_->PlacementGroupTable().Get( - placement_group_id, {std::move(on_done), io_context_}); - if (!status.ok()) { - on_done(status, std::nullopt); - } + gcs_table_storage_->PlacementGroupTable().Get(placement_group_id, + {std::move(on_done), io_context_}); } ++counts_[CountType::GET_PLACEMENT_GROUP_REQUEST]; } @@ -677,11 +545,7 @@ void GcsPlacementGroupManager::HandleGetAllPlacementGroup( RAY_LOG(DEBUG) << "Finished getting all placement group info."; GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK()); }; - Status status = - gcs_table_storage_->PlacementGroupTable().GetAll({std::move(on_done), io_context_}); - if (!status.ok()) { - on_done(absl::flat_hash_map()); - } + gcs_table_storage_->PlacementGroupTable().GetAll({std::move(on_done), io_context_}); ++counts_[CountType::GET_ALL_PLACEMENT_GROUP_REQUEST]; } @@ -742,11 +606,8 @@ void GcsPlacementGroupManager::WaitPlacementGroup( } }; - Status status = 
gcs_table_storage_->PlacementGroupTable().Get( - placement_group_id, {std::move(on_done), io_context_}); - if (!status.ok()) { - on_done(status, std::nullopt); - } + gcs_table_storage_->PlacementGroupTable().Get(placement_group_id, + {std::move(on_done), io_context_}); } else if (iter->second->GetState() == rpc::PlacementGroupTableData::CREATED) { RAY_LOG(DEBUG) << "Placement group is created, placement group id = " << placement_group_id; @@ -838,10 +699,10 @@ void GcsPlacementGroupManager::OnNodeDead(const NodeID &node_id) { iter->second->GetMutableStats()->set_scheduling_state( rpc::PlacementGroupStats::QUEUED); AddToPendingQueue(iter->second, 0); - RAY_CHECK_OK(gcs_table_storage_->PlacementGroupTable().Put( + gcs_table_storage_->PlacementGroupTable().Put( iter->second->GetPlacementGroupID(), iter->second->GetPlacementGroupTableData(), - {[this](Status status) { SchedulePendingPlacementGroups(); }, io_context_})); + {[this](Status status) { SchedulePendingPlacementGroups(); }, io_context_}); } } } @@ -1020,13 +881,14 @@ void GcsPlacementGroupManager::Initialize(const GcsInitData &gcs_init_data) { prepared_pgs.emplace_back(SchedulePgRequest{ placement_group, /*failure_callback=*/ - [this](std::shared_ptr placement_group, bool is_feasible) { + [this](std::shared_ptr failure_placement_group, + bool is_feasible) { OnPlacementGroupCreationFailed( - std::move(placement_group), CreateDefaultBackoff(), is_feasible); + std::move(failure_placement_group), CreateDefaultBackoff(), is_feasible); }, /*success_callback=*/ - [this](std::shared_ptr placement_group) { - OnPlacementGroupCreationSuccess(placement_group); + [this](std::shared_ptr success_placement_group) { + OnPlacementGroupCreationSuccess(success_placement_group); }, }); } @@ -1147,10 +1009,10 @@ bool GcsPlacementGroupManager::RescheduleIfStillHasUnplacedBundles( << placement_group->GetPlacementGroupID(); placement_group->UpdateState(rpc::PlacementGroupTableData::RESCHEDULING); AddToPendingQueue(placement_group, 0); 
- RAY_CHECK_OK(gcs_table_storage_->PlacementGroupTable().Put( + gcs_table_storage_->PlacementGroupTable().Put( placement_group->GetPlacementGroupID(), placement_group->GetPlacementGroupTableData(), - {[this](Status status) { SchedulePendingPlacementGroups(); }, io_context_})); + {[this](Status status) { SchedulePendingPlacementGroups(); }, io_context_}); return true; } } diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_mgr.h b/src/ray/gcs/gcs_placement_group_manager.h similarity index 65% rename from src/ray/gcs/gcs_server/gcs_placement_group_mgr.h rename to src/ray/gcs/gcs_placement_group_manager.h index 36d6e1b8f0b8..aa3bfae4c6f4 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_mgr.h +++ b/src/ray/gcs/gcs_placement_group_manager.h @@ -24,204 +24,21 @@ #include "absl/container/flat_hash_map.h" #include "ray/common/asio/instrumented_io_context.h" -#include "ray/common/bundle_spec.h" #include "ray/common/id.h" -#include "ray/common/task/task_spec.h" -#include "ray/gcs/gcs_server/gcs_init_data.h" -#include "ray/gcs/gcs_server/gcs_node_manager.h" -#include "ray/gcs/gcs_server/gcs_placement_group_scheduler.h" -#include "ray/gcs/gcs_server/gcs_table_storage.h" -#include "ray/gcs/gcs_server/usage_stats_client.h" -#include "ray/gcs/pubsub/gcs_pub_sub.h" -#include "ray/rpc/worker/core_worker_client.h" +#include "ray/gcs/gcs_init_data.h" +#include "ray/gcs/gcs_placement_group.h" +#include "ray/gcs/gcs_placement_group_scheduler.h" +#include "ray/gcs/gcs_resource_manager.h" +#include "ray/gcs/gcs_table_storage.h" +#include "ray/gcs/grpc_service_interfaces.h" +#include "ray/gcs/usage_stats_client.h" #include "ray/util/counter_map.h" +#include "ray/util/exponential_backoff.h" #include "src/ray/protobuf/gcs_service.pb.h" namespace ray { namespace gcs { -/// GcsPlacementGroup just wraps `PlacementGroupTableData` and provides some convenient -/// interfaces to access the fields inside `PlacementGroupTableData`. This class is not -/// thread-safe. 
-class GcsPlacementGroup { - public: - /// Create a GcsPlacementGroup by placement_group_table_data. - /// - /// \param placement_group_table_data Data of the placement_group (see gcs.proto). - explicit GcsPlacementGroup( - rpc::PlacementGroupTableData placement_group_table_data, - std::shared_ptr> - counter) - : placement_group_table_data_(std::move(placement_group_table_data)), - counter_(counter) { - SetupStates(); - } - - /// Create a GcsPlacementGroup by CreatePlacementGroupRequest. - /// - /// \param request Contains the placement group creation task specification. - explicit GcsPlacementGroup( - const ray::rpc::CreatePlacementGroupRequest &request, - std::string ray_namespace, - std::shared_ptr> - counter) - : counter_(counter) { - const auto &placement_group_spec = request.placement_group_spec(); - placement_group_table_data_.set_placement_group_id( - placement_group_spec.placement_group_id()); - placement_group_table_data_.set_name(placement_group_spec.name()); - placement_group_table_data_.set_state(rpc::PlacementGroupTableData::PENDING); - placement_group_table_data_.mutable_bundles()->CopyFrom( - placement_group_spec.bundles()); - placement_group_table_data_.set_strategy(placement_group_spec.strategy()); - placement_group_table_data_.set_creator_job_id(placement_group_spec.creator_job_id()); - placement_group_table_data_.set_creator_actor_id( - placement_group_spec.creator_actor_id()); - placement_group_table_data_.set_creator_job_dead( - placement_group_spec.creator_job_dead()); - placement_group_table_data_.set_creator_actor_dead( - placement_group_spec.creator_actor_dead()); - placement_group_table_data_.set_is_detached(placement_group_spec.is_detached()); - placement_group_table_data_.set_max_cpu_fraction_per_node( - placement_group_spec.max_cpu_fraction_per_node()); - placement_group_table_data_.set_soft_target_node_id( - placement_group_spec.soft_target_node_id()); - placement_group_table_data_.set_ray_namespace(ray_namespace); - 
placement_group_table_data_.set_placement_group_creation_timestamp_ms( - current_sys_time_ms()); - SetupStates(); - } - - ~GcsPlacementGroup() { - if (last_metric_state_ && - last_metric_state_.value() != rpc::PlacementGroupTableData::REMOVED) { - RAY_LOG(DEBUG) << "Decrementing state at " - << rpc::PlacementGroupTableData::PlacementGroupState_Name( - last_metric_state_.value()); - // Retain groups in the REMOVED state so we have a history of past groups. - counter_->Decrement(last_metric_state_.value()); - } - } - - /// Get the immutable PlacementGroupTableData of this placement group. - const rpc::PlacementGroupTableData &GetPlacementGroupTableData() const; - - /// Get the mutable bundle of this placement group. - rpc::Bundle *GetMutableBundle(int bundle_index); - - /// Update the state of this placement_group. - void UpdateState(rpc::PlacementGroupTableData::PlacementGroupState state); - - /// Get the state of this gcs placement_group. - rpc::PlacementGroupTableData::PlacementGroupState GetState() const; - - /// Get the id of this placement_group. - PlacementGroupID GetPlacementGroupID() const; - - /// Get the name of this placement_group. - std::string GetName() const; - - /// Get the name of this placement_group. - std::string GetRayNamespace() const; - - /// Get the bundles of this placement_group (including unplaced). - std::vector> &GetBundles() const; - - /// Get the unplaced bundles of this placement group. - std::vector> GetUnplacedBundles() const; - - /// Check if there are unplaced bundles. - bool HasUnplacedBundles() const; - - /// Get the Strategy - rpc::PlacementStrategy GetStrategy() const; - - /// Get debug string for the placement group. - std::string DebugString() const; - - /// Below fields are used for automatic cleanup of placement groups. - - /// Get the actor id that created the placement group. - const ActorID GetCreatorActorId() const; - - /// Get the job id that created the placement group. 
- const JobID GetCreatorJobId() const; - - /// Mark that the creator job of this placement group is dead. - void MarkCreatorJobDead(); - - /// Mark that the creator actor of this placement group is dead. - void MarkCreatorActorDead(); - - /// Return True if the placement group lifetime is done. False otherwise. - bool IsPlacementGroupLifetimeDone() const; - - /// Returns whether or not this is a detached placement group. - bool IsDetached() const; - - /// Returns the maximum CPU fraction per node for this placement group. - double GetMaxCpuFractionPerNode() const; - - /// Return the target node ID where bundles of this placement group should be placed. - /// Only works for STRICT_PACK placement group. - NodeID GetSoftTargetNodeID() const; - - const rpc::PlacementGroupStats &GetStats() const; - - rpc::PlacementGroupStats *GetMutableStats(); - - private: - FRIEND_TEST(GcsPlacementGroupManagerTest, TestPlacementGroupBundleCache); - - /// Setup states other than placement_group_table_data_. - void SetupStates() { - auto stats = placement_group_table_data_.mutable_stats(); - // The default value for the field is 0 - if (stats->creation_request_received_ns() == 0) { - auto now = absl::GetCurrentTimeNanos(); - stats->set_creation_request_received_ns(now); - } - // The default value for the field is 0 - // Only set the state to the QUEUED when the state wasn't persisted before. - if (stats->scheduling_state() == 0) { - stats->set_scheduling_state(rpc::PlacementGroupStats::QUEUED); - } - RefreshMetrics(); - } - - /// Record metric updates if there have been any state changes. 
- void RefreshMetrics() { - auto cur_state = GetState(); - if (last_metric_state_) { - RAY_LOG(DEBUG) << "Swapping state from " - << rpc::PlacementGroupTableData::PlacementGroupState_Name( - last_metric_state_.value()) - << " to " - << rpc::PlacementGroupTableData::PlacementGroupState_Name(cur_state); - counter_->Swap(last_metric_state_.value(), cur_state); - } else { - RAY_LOG(DEBUG) << "Incrementing state at " - << rpc::PlacementGroupTableData::PlacementGroupState_Name(cur_state); - counter_->Increment(cur_state); - } - last_metric_state_ = cur_state; - } - - /// The placement_group meta data which contains the task specification as well as the - /// state of the gcs placement_group and so on (see gcs.proto). - rpc::PlacementGroupTableData placement_group_table_data_; - /// Creating bundle specification requires heavy computation because it needs to compute - /// formatted strings for all resources (heavy string operations). To optimize the CPU - /// usage, we cache bundle specs. - mutable std::vector> cached_bundle_specs_; - - /// Reference to the counter to use for placement group state metrics tracking. - std::shared_ptr> counter_; - - /// The last recorded metric state. - std::optional last_metric_state_; -}; - /// GcsPlacementGroupManager is responsible for managing the lifecycle of all placement /// group. This class is not thread-safe. /// The placementGroup will be added into queue and set the status as pending first and @@ -229,7 +46,7 @@ class GcsPlacementGroup { /// the head of the queue and schedule it. If schedule success, using the /// SchedulePendingPlacementGroups() Immediately. else wait for a short time beforw using /// SchedulePendingPlacementGroups() next time. 
-class GcsPlacementGroupManager : public rpc::PlacementGroupInfoHandler { +class GcsPlacementGroupManager : public rpc::PlacementGroupInfoGcsServiceHandler { public: /// Create a GcsPlacementGroupManager /// diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.cc b/src/ray/gcs/gcs_placement_group_scheduler.cc similarity index 98% rename from src/ray/gcs/gcs_server/gcs_placement_group_scheduler.cc rename to src/ray/gcs/gcs_placement_group_scheduler.cc index 0ca35f1765ed..f14f380d6018 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.cc +++ b/src/ray/gcs/gcs_placement_group_scheduler.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_server/gcs_placement_group_scheduler.h" +#include "ray/gcs/gcs_placement_group_scheduler.h" #include #include @@ -21,8 +21,6 @@ #include #include "ray/common/asio/asio_util.h" -#include "ray/gcs/gcs_server/gcs_placement_group_mgr.h" -#include "src/ray/protobuf/gcs.pb.h" namespace ray { namespace gcs { @@ -72,7 +70,6 @@ void GcsPlacementGroupScheduler::ScheduleUnplacedBundles( auto scheduling_options = CreateSchedulingOptions(placement_group->GetPlacementGroupID(), strategy, - placement_group->GetMaxCpuFractionPerNode(), placement_group->GetSoftTargetNodeID()); auto scheduling_result = cluster_resource_scheduler_.Schedule(resource_request_list, scheduling_options); @@ -292,7 +289,7 @@ std::shared_ptr GcsPlacementGroupScheduler::GetRayletClientFromNode( const std::shared_ptr &node) { rpc::Address remote_address; - remote_address.set_raylet_id(node->node_id()); + remote_address.set_node_id(node->node_id()); remote_address.set_ip_address(node->node_manager_address()); remote_address.set_port(node->node_manager_port()); return GetOrConnectRayletClient(remote_address); @@ -401,16 +398,16 @@ void GcsPlacementGroupScheduler::OnAllBundlePrepareRequestReturned( 
placement_group->UpdateState(rpc::PlacementGroupTableData::PREPARED); - RAY_CHECK_OK(gcs_table_storage_.PlacementGroupTable().Put( + gcs_table_storage_.PlacementGroupTable().Put( placement_group_id, placement_group->GetPlacementGroupTableData(), {[this, lease_status_tracker, schedule_failure_handler, schedule_success_handler]( - Status status) { + const ray::Status &status) { RAY_CHECK_OK(status); CommitAllBundles( lease_status_tracker, schedule_failure_handler, schedule_success_handler); }, - io_context_})); + io_context_}); } void GcsPlacementGroupScheduler::OnAllBundleCommitRequestReturned( @@ -475,22 +472,20 @@ GcsPlacementGroupScheduler::CreateSchedulingContext( SchedulingOptions GcsPlacementGroupScheduler::CreateSchedulingOptions( const PlacementGroupID &placement_group_id, rpc::PlacementStrategy strategy, - double max_cpu_fraction_per_node, NodeID soft_target_node_id) { switch (strategy) { case rpc::PlacementStrategy::PACK: - return SchedulingOptions::BundlePack(max_cpu_fraction_per_node); + return SchedulingOptions::BundlePack(); case rpc::PlacementStrategy::SPREAD: - return SchedulingOptions::BundleSpread(max_cpu_fraction_per_node); + return SchedulingOptions::BundleSpread(); case rpc::PlacementStrategy::STRICT_PACK: return SchedulingOptions::BundleStrictPack( - max_cpu_fraction_per_node, soft_target_node_id.IsNil() ? 
scheduling::NodeID::Nil() : scheduling::NodeID(soft_target_node_id.Binary())); case rpc::PlacementStrategy::STRICT_SPREAD: return SchedulingOptions::BundleStrictSpread( - max_cpu_fraction_per_node, CreateSchedulingContext(placement_group_id)); + CreateSchedulingContext(placement_group_id)); default: RAY_LOG(FATAL) << "Unsupported scheduling type: " << rpc::PlacementStrategy_Name(strategy); diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.h b/src/ray/gcs/gcs_placement_group_scheduler.h similarity index 97% rename from src/ray/gcs/gcs_server/gcs_placement_group_scheduler.h rename to src/ray/gcs/gcs_placement_group_scheduler.h index ebd5e9e460e9..6e43239d7967 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.h +++ b/src/ray/gcs/gcs_placement_group_scheduler.h @@ -24,22 +24,18 @@ #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/bundle_location_index.h" #include "ray/common/id.h" -#include "ray/common/scheduling/scheduling_ids.h" -#include "ray/gcs/gcs_server/gcs_node_manager.h" -#include "ray/gcs/gcs_server/gcs_table_storage.h" +#include "ray/gcs/gcs_node_manager.h" +#include "ray/gcs/gcs_placement_group.h" +#include "ray/gcs/gcs_table_storage.h" #include "ray/raylet/scheduling/cluster_resource_scheduler.h" #include "ray/raylet/scheduling/policy/scheduling_context.h" -#include "ray/raylet_client/raylet_client.h" -#include "ray/rpc/node_manager/node_manager_client.h" -#include "ray/rpc/node_manager/raylet_client_pool.h" -#include "ray/rpc/worker/core_worker_client.h" +#include "ray/rpc/raylet/raylet_client_interface.h" +#include "ray/rpc/raylet/raylet_client_pool.h" #include "src/ray/protobuf/gcs_service.pb.h" namespace ray { namespace gcs { -class GcsPlacementGroup; - using PGSchedulingFailureCallback = std::function, bool)>; using PGSchedulingSuccessfulCallback = @@ -465,7 +461,6 @@ class GcsPlacementGroupScheduler : public GcsPlacementGroupSchedulerInterface { /// Create scheduling options. 
SchedulingOptions CreateSchedulingOptions(const PlacementGroupID &placement_group_id, rpc::PlacementStrategy strategy, - double max_cpu_fraction_per_node, NodeID soft_target_node_id); /// Try to release bundle resource to cluster resource manager. @@ -514,6 +509,11 @@ class GcsPlacementGroupScheduler : public GcsPlacementGroupSchedulerInterface { /// The bundles that waiting to be destroyed and release resources. std::list>> waiting_removed_bundles_; + + friend class GcsPlacementGroupSchedulerTest; + FRIEND_TEST(GcsPlacementGroupSchedulerTest, TestCheckingWildcardResource); + FRIEND_TEST(GcsPlacementGroupSchedulerTest, TestWaitingRemovedBundles); + FRIEND_TEST(GcsPlacementGroupSchedulerTest, TestBundlesRemovedWhenNodeDead); }; } // namespace gcs diff --git a/src/ray/gcs/gcs_ray_event_converter.cc b/src/ray/gcs/gcs_ray_event_converter.cc new file mode 100644 index 000000000000..d07a51ca3557 --- /dev/null +++ b/src/ray/gcs/gcs_ray_event_converter.cc @@ -0,0 +1,239 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/gcs/gcs_ray_event_converter.h" + +#include + +#include "absl/container/flat_hash_map.h" +#include "ray/common/grpc_util.h" +#include "ray/common/id.h" +#include "ray/util/logging.h" + +namespace ray { +namespace gcs { + +std::vector +GcsRayEventConverter::ConvertToTaskEventDataRequests( + rpc::events::AddEventsRequest &&request) { + std::vector requests_per_job_id; + absl::flat_hash_map job_id_to_index; + // convert RayEvents to TaskEvents and group by job id. + for (auto &event : *request.mutable_events_data()->mutable_events()) { + std::optional task_event = std::nullopt; + switch (event.event_type()) { + case rpc::events::RayEvent::TASK_DEFINITION_EVENT: { + task_event = ConvertToTaskEvents(std::move(*event.mutable_task_definition_event())); + break; + } + case rpc::events::RayEvent::TASK_EXECUTION_EVENT: { + task_event = ConvertToTaskEvents(std::move(*event.mutable_task_execution_event())); + break; + } + case rpc::events::RayEvent::TASK_PROFILE_EVENT: { + task_event = ConvertToTaskEvents(std::move(*event.mutable_task_profile_events())); + break; + } + case rpc::events::RayEvent::ACTOR_TASK_DEFINITION_EVENT: { + task_event = + ConvertToTaskEvents(std::move(*event.mutable_actor_task_definition_event())); + break; + } + default: + // TODO(can-anyscale): Handle other event types + break; + } + + // Groups all taskEvents belonging to same jobId into one AddTaskEventDataRequest + if (task_event) { + AddTaskEventToRequest(std::move(*task_event), requests_per_job_id, job_id_to_index); + } + } + + // Groups all taskEventMetadata belonging to same jobId into one + // AddTaskEventDataRequest + auto *metadata = request.mutable_events_data()->mutable_task_events_metadata(); + if (metadata->dropped_task_attempts_size() > 0) { + AddDroppedTaskAttemptsToRequest( + std::move(*metadata), requests_per_job_id, job_id_to_index); + } + return requests_per_job_id; +} + +void GcsRayEventConverter::AddTaskEventToRequest( + rpc::TaskEvents &&task_event, + std::vector 
&requests_per_job_id, + absl::flat_hash_map &job_id_to_index) { + const std::string job_id_key = task_event.job_id(); + auto it = job_id_to_index.find(job_id_key); + if (it == job_id_to_index.end()) { + // Create new AddTaskEventDataRequest entry and add index to map + size_t idx = requests_per_job_id.size(); + requests_per_job_id.emplace_back(); + auto *data = requests_per_job_id.back().mutable_data(); + data->set_job_id(job_id_key); + *data->add_events_by_task() = std::move(task_event); + job_id_to_index.emplace(job_id_key, idx); + } else { + // add taskEvent to existing AddTaskEventDataRequest with same job id + auto *data = requests_per_job_id[it->second].mutable_data(); + *data->add_events_by_task() = std::move(task_event); + } +} + +void GcsRayEventConverter::AddDroppedTaskAttemptsToRequest( + rpc::events::TaskEventsMetadata &&metadata, + std::vector &requests_per_job_id, + absl::flat_hash_map &job_id_to_index) { + // Process each dropped task attempt individually and route to the correct job ID + for (auto &dropped_attempt : *metadata.mutable_dropped_task_attempts()) { + const auto task_id = TaskID::FromBinary(dropped_attempt.task_id()); + const auto job_id_key = task_id.JobId().Binary(); + + auto it = job_id_to_index.find(job_id_key); + if (it == job_id_to_index.end()) { + // Create new request if job_id not found + size_t idx = requests_per_job_id.size(); + requests_per_job_id.emplace_back(); + auto *data = requests_per_job_id.back().mutable_data(); + data->set_job_id(job_id_key); + *data->add_dropped_task_attempts() = std::move(dropped_attempt); + job_id_to_index.emplace(job_id_key, idx); + } else { + // Add to existing request with same job_id + auto *data = requests_per_job_id[it->second].mutable_data(); + *data->add_dropped_task_attempts() = std::move(dropped_attempt); + } + } +} + +rpc::TaskEvents GcsRayEventConverter::ConvertToTaskEvents( + rpc::events::TaskDefinitionEvent &&event) { + rpc::TaskEvents task_event; + 
task_event.set_task_id(event.task_id()); + task_event.set_attempt_number(event.task_attempt()); + task_event.set_job_id(event.job_id()); + + rpc::TaskInfoEntry *task_info = task_event.mutable_task_info(); + task_info->set_type(event.task_type()); + task_info->set_name(event.task_name()); + task_info->set_task_id(event.task_id()); + task_info->set_job_id(event.job_id()); + task_info->set_parent_task_id(event.parent_task_id()); + if (!event.placement_group_id().empty()) { + task_info->set_placement_group_id(event.placement_group_id()); + } + + PopulateTaskRuntimeAndFunctionInfo(std::move(*event.mutable_runtime_env_info()), + std::move(*event.mutable_task_func()), + std::move(*event.mutable_required_resources()), + event.language(), + task_info); + return task_event; +} + +rpc::TaskEvents GcsRayEventConverter::ConvertToTaskEvents( + rpc::events::TaskExecutionEvent &&event) { + rpc::TaskEvents task_event; + task_event.set_task_id(event.task_id()); + task_event.set_attempt_number(event.task_attempt()); + task_event.set_job_id(event.job_id()); + + rpc::TaskStateUpdate *task_state_update = task_event.mutable_state_updates(); + task_state_update->set_node_id(event.node_id()); + task_state_update->set_worker_id(event.worker_id()); + task_state_update->set_worker_pid(event.worker_pid()); + task_state_update->mutable_error_info()->Swap(event.mutable_ray_error_info()); + + for (const auto &[state, timestamp] : event.task_state()) { + int64_t ns = ProtoTimestampToAbslTimeNanos(timestamp); + (*task_state_update->mutable_state_ts_ns())[state] = ns; + } + return task_event; +} + +rpc::TaskEvents GcsRayEventConverter::ConvertToTaskEvents( + rpc::events::ActorTaskDefinitionEvent &&event) { + rpc::TaskEvents task_event; + task_event.set_task_id(event.task_id()); + task_event.set_attempt_number(event.task_attempt()); + task_event.set_job_id(event.job_id()); + + rpc::TaskInfoEntry *task_info = task_event.mutable_task_info(); + task_info->set_type(rpc::TaskType::ACTOR_TASK); + 
task_info->set_name(event.actor_task_name()); + task_info->set_task_id(event.task_id()); + task_info->set_job_id(event.job_id()); + task_info->set_parent_task_id(event.parent_task_id()); + if (!event.placement_group_id().empty()) { + task_info->set_placement_group_id(event.placement_group_id()); + } + if (!event.actor_id().empty()) { + task_info->set_actor_id(event.actor_id()); + } + PopulateTaskRuntimeAndFunctionInfo(std::move(*event.mutable_runtime_env_info()), + std::move(*event.mutable_actor_func()), + std::move(*event.mutable_required_resources()), + event.language(), + task_info); + return task_event; +} + +rpc::TaskEvents GcsRayEventConverter::ConvertToTaskEvents( + rpc::events::TaskProfileEvents &&event) { + rpc::TaskEvents task_event; + task_event.set_task_id(event.task_id()); + task_event.set_attempt_number(event.attempt_number()); + task_event.set_job_id(event.job_id()); + + task_event.mutable_profile_events()->Swap(event.mutable_profile_events()); + return task_event; +} + +void GcsRayEventConverter::PopulateTaskRuntimeAndFunctionInfo( + rpc::RuntimeEnvInfo &&runtime_env_info, + rpc::FunctionDescriptor &&function_descriptor, + ::google::protobuf::Map &&required_resources, + rpc::Language language, + rpc::TaskInfoEntry *task_info) { + task_info->set_language(language); + task_info->mutable_runtime_env_info()->Swap(&runtime_env_info); + switch (language) { + case rpc::Language::CPP: + if (function_descriptor.has_cpp_function_descriptor()) { + task_info->set_func_or_class_name( + function_descriptor.cpp_function_descriptor().function_name()); + } + break; + case rpc::Language::PYTHON: + if (function_descriptor.has_python_function_descriptor()) { + task_info->set_func_or_class_name( + function_descriptor.python_function_descriptor().function_name()); + } + break; + case rpc::Language::JAVA: + if (function_descriptor.has_java_function_descriptor()) { + task_info->set_func_or_class_name( + function_descriptor.java_function_descriptor().function_name()); + } + 
break; + default: + // Other languages are not handled. + break; + } + task_info->mutable_required_resources()->swap(required_resources); +} + +} // namespace gcs +} // namespace ray diff --git a/src/ray/gcs/gcs_ray_event_converter.h b/src/ray/gcs/gcs_ray_event_converter.h new file mode 100644 index 000000000000..950d77c00dcb --- /dev/null +++ b/src/ray/gcs/gcs_ray_event_converter.h @@ -0,0 +1,107 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "absl/container/flat_hash_map.h" +#include "gtest/gtest_prod.h" +#include "src/ray/protobuf/events_event_aggregator_service.pb.h" +#include "src/ray/protobuf/gcs_service.pb.h" + +namespace ray { +namespace gcs { + +/// GcsRayEventConverter converts RayEvents to TaskEvents. +class GcsRayEventConverter { + public: + GcsRayEventConverter() = default; + ~GcsRayEventConverter() = default; + + /// Convert an AddEventsRequest to a list of AddTaskEventDataRequest objects, + /// grouping entries by job id. + /// + /// \param request The AddEventsRequest to convert. + /// \return A list of AddTaskEventDataRequest grouped by job id. + std::vector ConvertToTaskEventDataRequests( + rpc::events::AddEventsRequest &&request); + + private: + /// Convert a TaskDefinitionEvent to a TaskEvents. + /// + /// \param event The TaskDefinitionEvent to convert. + /// \return The output TaskEvents to populate. 
+ rpc::TaskEvents ConvertToTaskEvents(rpc::events::TaskDefinitionEvent &&event); + + /// Convert ProfileEvents to a TaskEvents. + /// + /// \param event TaskProfileEvents object to convert. + /// \return The output TaskEvents to populate. + rpc::TaskEvents ConvertToTaskEvents(rpc::events::TaskProfileEvents &&event); + + /// Convert a TaskExecutionEvent to a TaskEvents. + /// + /// \param event The TaskExecutionEvent to convert. + /// \return The output TaskEvents to populate. + rpc::TaskEvents ConvertToTaskEvents(rpc::events::TaskExecutionEvent &&event); + + /// Convert an ActorTaskDefinitionEvent to a TaskEvents. + /// + /// \param event The ActorTaskDefinitionEvent to convert. + /// \return The output TaskEvents to populate. + rpc::TaskEvents ConvertToTaskEvents(rpc::events::ActorTaskDefinitionEvent &&event); + + /// Populate the TaskInfoEntry with the given runtime env info, function descriptor, + /// and required resources. This function is commonly used to convert the task + /// and actor task definition events to TaskEvents. + /// + /// \param runtime_env_info The runtime env info. + /// \param function_descriptor The function descriptor. + /// \param required_resources The required resources. + /// \param language The language of the task. + /// \param task_info The output TaskInfoEntry to populate. + void PopulateTaskRuntimeAndFunctionInfo( + rpc::RuntimeEnvInfo &&runtime_env_info, + rpc::FunctionDescriptor &&function_descriptor, + ::google::protobuf::Map &&required_resources, + rpc::Language language, + rpc::TaskInfoEntry *task_info); + + /// Add a task event to the appropriate job-grouped request. + /// + /// \param task_event The TaskEvents to add. + /// \param requests_per_job_id The list of requests grouped by job id. + /// \param job_id_to_index The map from job id to index in requests_per_job_id. 
+ void AddTaskEventToRequest( + rpc::TaskEvents &&task_event, + std::vector &requests_per_job_id, + absl::flat_hash_map &job_id_to_index); + + /// Add dropped task attempts to the appropriate job-grouped request. + /// + /// \param metadata The task events metadata containing dropped task attempts. + /// \param requests_per_job_id The list of requests grouped by job id. + /// \param job_id_to_index The map from job id to index in requests_per_job_id. + void AddDroppedTaskAttemptsToRequest( + rpc::events::TaskEventsMetadata &&metadata, + std::vector &requests_per_job_id, + absl::flat_hash_map &job_id_to_index); + + FRIEND_TEST(GcsRayEventConverterTest, TestConvertTaskExecutionEvent); + FRIEND_TEST(GcsRayEventConverterTest, TestConvertActorTaskDefinitionEvent); +}; + +} // namespace gcs +} // namespace ray diff --git a/src/ray/gcs/gcs_server/gcs_resource_manager.cc b/src/ray/gcs/gcs_resource_manager.cc similarity index 96% rename from src/ray/gcs/gcs_server/gcs_resource_manager.cc rename to src/ray/gcs/gcs_resource_manager.cc index 088c32cc2b15..b9303b58ee55 100644 --- a/src/ray/gcs/gcs_server/gcs_resource_manager.cc +++ b/src/ray/gcs/gcs_resource_manager.cc @@ -12,14 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/gcs_server/gcs_resource_manager.h" +#include "ray/gcs/gcs_resource_manager.h" #include #include #include #include "ray/common/ray_config.h" -#include "ray/stats/metric_defs.h" +#include "ray/gcs/state_util.h" +#include "ray/util/logging.h" namespace ray { namespace gcs { @@ -28,15 +29,15 @@ GcsResourceManager::GcsResourceManager(instrumented_io_context &io_context, ClusterResourceManager &cluster_resource_manager, GcsNodeManager &gcs_node_manager, NodeID local_node_id, - ClusterTaskManager *cluster_task_manager) + raylet::ClusterLeaseManager *cluster_lease_manager) : io_context_(io_context), cluster_resource_manager_(cluster_resource_manager), gcs_node_manager_(gcs_node_manager), local_node_id_(std::move(local_node_id)), - cluster_task_manager_(cluster_task_manager) {} + cluster_lease_manager_(cluster_lease_manager) {} void GcsResourceManager::ConsumeSyncMessage( - std::shared_ptr message) { + std::shared_ptr message) { // ConsumeSyncMessage is called by ray_syncer which might not run // in a dedicated thread for performance. // GcsResourceManager is a module always run in the main thread, so we just @@ -199,10 +200,10 @@ void GcsResourceManager::HandleGetAllResourceUsage( batch.add_batch()->CopyFrom(usage.second); } - if (cluster_task_manager_ != nullptr) { + if (cluster_lease_manager_ != nullptr) { // Fill the gcs info when gcs actor scheduler is enabled. rpc::ResourcesData gcs_resources_data; - cluster_task_manager_->FillPendingActorInfo(gcs_resources_data); + cluster_lease_manager_->FillPendingActorInfo(gcs_resources_data); // Aggregate the load (pending actor info) of gcs. 
FillAggregateLoad(gcs_resources_data, &aggregate_load); // We only export gcs's pending info without adding the corresponding @@ -301,7 +302,7 @@ void GcsResourceManager::OnNodeAdd(const rpc::GcsNodeInfo &node) { absl::flat_hash_map labels(node.labels().begin(), node.labels().end()); - cluster_resource_manager_.SetNodeLabels(scheduling_node_id, labels); + cluster_resource_manager_.SetNodeLabels(scheduling_node_id, std::move(labels)); rpc::ResourcesData data; data.set_node_id(node_id.Binary()); diff --git a/src/ray/gcs/gcs_server/gcs_resource_manager.h b/src/ray/gcs/gcs_resource_manager.h similarity index 87% rename from src/ray/gcs/gcs_server/gcs_resource_manager.h rename to src/ray/gcs/gcs_resource_manager.h index ab523b33bd23..4d477ba1cb1f 100644 --- a/src/ray/gcs/gcs_server/gcs_resource_manager.h +++ b/src/ray/gcs/gcs_resource_manager.h @@ -20,27 +20,19 @@ #include #include "absl/container/flat_hash_map.h" -#include "absl/container/flat_hash_set.h" +#include "ray/common/asio/instrumented_io_context.h" #include "ray/common/id.h" #include "ray/common/ray_syncer/ray_syncer.h" -#include "ray/common/scheduling/cluster_resource_data.h" -#include "ray/gcs/gcs_server/gcs_init_data.h" -#include "ray/gcs/gcs_server/gcs_node_manager.h" -#include "ray/gcs/gcs_server/gcs_table_storage.h" -#include "ray/gcs/gcs_server/state_util.h" +#include "ray/gcs/gcs_init_data.h" +#include "ray/gcs/gcs_node_manager.h" +#include "ray/gcs/grpc_service_interfaces.h" +#include "ray/raylet/scheduling/cluster_lease_manager.h" #include "ray/raylet/scheduling/cluster_resource_manager.h" -#include "ray/raylet/scheduling/cluster_task_manager.h" -#include "ray/rpc/client_call.h" -#include "ray/rpc/gcs/gcs_rpc_server.h" #include "src/ray/protobuf/gcs.pb.h" +#include "src/ray/protobuf/ray_syncer.pb.h" namespace ray { - -using raylet::ClusterTaskManager; - namespace gcs { -class GcsNodeManager; -class GcsServer; /// Ideally, the logic related to resource calculation should be moved from /// 
`gcs_resource_manager` to `cluster_resource_manager`, and all logic related to @@ -60,20 +52,22 @@ class GcsServer; /// It is responsible for handing node resource related rpc requests and it is used for /// actor and placement group scheduling. It obtains the available resources of nodes /// through heartbeat reporting. Non-thread safe. -class GcsResourceManager : public rpc::NodeResourceInfoHandler, +class GcsResourceManager : public rpc::NodeResourceInfoGcsServiceHandler, public syncer::ReceiverInterface { public: /// Create a GcsResourceManager. - explicit GcsResourceManager(instrumented_io_context &io_context, - ClusterResourceManager &cluster_resource_manager, - GcsNodeManager &gcs_node_manager, - NodeID local_node_id, - ClusterTaskManager *cluster_task_manager = nullptr); + explicit GcsResourceManager( + instrumented_io_context &io_context, + ClusterResourceManager &cluster_resource_manager, + GcsNodeManager &gcs_node_manager, + NodeID local_node_id, + raylet::ClusterLeaseManager *cluster_lease_manager = nullptr); virtual ~GcsResourceManager() = default; /// Handle the resource update. - void ConsumeSyncMessage(std::shared_ptr message) override; + void ConsumeSyncMessage( + std::shared_ptr message) override; /// Handle get available resources of all nodes. /// Autoscaler-specific RPC called from Python. @@ -202,7 +196,7 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler, ClusterResourceManager &cluster_resource_manager_; GcsNodeManager &gcs_node_manager_; NodeID local_node_id_; - ClusterTaskManager *cluster_task_manager_; + raylet::ClusterLeaseManager *cluster_lease_manager_; /// Num of alive nodes in the cluster. 
size_t num_alive_nodes_ = 0; }; diff --git a/src/ray/gcs/gcs_server/gcs_server.cc b/src/ray/gcs/gcs_server.cc similarity index 83% rename from src/ray/gcs/gcs_server/gcs_server.cc rename to src/ray/gcs/gcs_server.cc index 81058297b98c..6dcd258465f2 100644 --- a/src/ray/gcs/gcs_server/gcs_server.cc +++ b/src/ray/gcs/gcs_server.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_server/gcs_server.h" +#include "ray/gcs/gcs_server.h" #include #include @@ -23,16 +23,22 @@ #include "ray/common/asio/asio_util.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/ray_config.h" -#include "ray/gcs/gcs_server/gcs_actor_manager.h" -#include "ray/gcs/gcs_server/gcs_autoscaler_state_manager.h" -#include "ray/gcs/gcs_server/gcs_job_manager.h" -#include "ray/gcs/gcs_server/gcs_placement_group_mgr.h" -#include "ray/gcs/gcs_server/gcs_resource_manager.h" -#include "ray/gcs/gcs_server/gcs_worker_manager.h" -#include "ray/gcs/gcs_server/store_client_kv.h" +#include "ray/gcs/gcs_actor_manager.h" +#include "ray/gcs/gcs_autoscaler_state_manager.h" +#include "ray/gcs/gcs_job_manager.h" +#include "ray/gcs/gcs_placement_group_manager.h" +#include "ray/gcs/gcs_resource_manager.h" +#include "ray/gcs/gcs_worker_manager.h" +#include "ray/gcs/grpc_services.h" +#include "ray/gcs/store_client/in_memory_store_client.h" +#include "ray/gcs/store_client/observable_store_client.h" +#include "ray/gcs/store_client/redis_store_client.h" +#include "ray/gcs/store_client/store_client.h" +#include "ray/gcs/store_client_kv.h" #include "ray/pubsub/publisher.h" +#include "ray/rpc/raylet/raylet_client.h" +#include "ray/stats/stats.h" #include "ray/util/network_util.h" -#include "ray/util/util.h" namespace ray { namespace gcs { @@ -58,7 +64,6 @@ GcsServer::GcsServer(const ray::gcs::GcsServerConfig &config, rpc_server_(config.grpc_server_name, config.grpc_server_port, config.node_ip_address == 
"127.0.0.1", - ClusterID::Nil(), config.grpc_server_thread_num, /*keepalive_time_ms=*/RayConfig::instance().grpc_keepalive_time_ms()), client_call_manager_(main_service, @@ -66,11 +71,11 @@ GcsServer::GcsServer(const ray::gcs::GcsServerConfig &config, ClusterID::Nil(), RayConfig::instance().gcs_server_rpc_client_thread_num()), raylet_client_pool_([this](const rpc::Address &addr) { - return std::make_shared( + return std::make_shared( addr, this->client_call_manager_, /*raylet_unavailable_timeout_callback=*/[this, addr]() { - const NodeID node_id = NodeID::FromBinary(addr.raylet_id()); + const NodeID node_id = NodeID::FromBinary(addr.node_id()); auto alive_node = this->gcs_node_manager_->GetAliveNode(node_id); if (!alive_node.has_value()) { this->raylet_client_pool_.Disconnect(node_id); @@ -82,7 +87,7 @@ GcsServer::GcsServer(const ray::gcs::GcsServerConfig &config, addr, this->client_call_manager_, /*core_worker_unavailable_timeout_callback*/ [this, addr]() { - const NodeID node_id = NodeID::FromBinary(addr.raylet_id()); + const NodeID node_id = NodeID::FromBinary(addr.node_id()); const WorkerID worker_id = WorkerID::FromBinary(addr.worker_id()); auto alive_node = this->gcs_node_manager_->GetAliveNode(node_id); if (!alive_node.has_value()) { @@ -108,8 +113,18 @@ GcsServer::GcsServer(const ray::gcs::GcsServerConfig &config, }); }); }), - pubsub_periodical_runner_( - PeriodicalRunner::Create(io_context_provider_.GetIOContext())), + event_aggregator_client_call_manager_( + io_context_provider_.GetIOContext(), + /*record_stats=*/true, + ClusterID::Nil(), + RayConfig::instance().gcs_server_rpc_client_thread_num()), + event_aggregator_client_(std::make_unique( + config_.metrics_agent_port, event_aggregator_client_call_manager_)), + ray_event_recorder_(std::make_unique( + *event_aggregator_client_, + io_context_provider_.GetIOContext())), + pubsub_periodical_runner_(PeriodicalRunner::Create( + io_context_provider_.GetIOContext())), periodical_runner_( 
PeriodicalRunner::Create(io_context_provider_.GetDefaultIOContext())), is_started_(false), @@ -118,30 +133,38 @@ GcsServer::GcsServer(const ray::gcs::GcsServerConfig &config, // GcsInternalKVManager, to avoid congestion on the latter. RAY_LOG(INFO) << "GCS storage type is " << storage_type_; auto &io_context = io_context_provider_.GetDefaultIOContext(); + std::shared_ptr store_client; switch (storage_type_) { case StorageType::IN_MEMORY: - gcs_table_storage_ = std::make_unique(); + store_client = + std::make_shared(std::make_unique()); break; case StorageType::REDIS_PERSIST: { - auto redis_client = CreateRedisClient(io_context); - gcs_table_storage_ = std::make_unique(redis_client); - // Init redis failure detector. - gcs_redis_failure_detector_ = - std::make_unique(io_context, redis_client, []() { - RAY_LOG(FATAL) << "Redis connection failed. Shutdown GCS."; - }); - gcs_redis_failure_detector_->Start(); + auto redis_store_client = + std::make_shared(io_context, GetRedisClientOptions()); + // Health check Redis periodically and crash if it becomes unavailable. + // NOTE: periodical_runner_ must run on the same IO context as the Redis client. + periodical_runner_->RunFnPeriodically( + [redis_store_client, &io_context] { + redis_store_client->AsyncCheckHealth( + {[](const Status &status) { + RAY_CHECK_OK(status) << "Redis connection failed unexpectedly."; + }, + io_context}); + }, + RayConfig::instance().gcs_redis_heartbeat_interval_milliseconds(), + "GCSServer.redis_health_check"); + + store_client = redis_store_client; break; } default: RAY_LOG(FATAL) << "Unexpected storage type: " << storage_type_; } - // Init GCS publisher instance. - std::unique_ptr inner_publisher; - // Init grpc based pubsub on GCS. - // TODO(yic): Move this into GcsPublisher. 
- inner_publisher = std::make_unique( + gcs_table_storage_ = std::make_unique(std::move(store_client)); + + auto inner_publisher = std::make_unique( /*channels=*/ std::vector{ rpc::ChannelType::GCS_ACTOR_CHANNEL, @@ -158,19 +181,16 @@ GcsServer::GcsServer(const ray::gcs::GcsServerConfig &config, /*publish_batch_size_=*/RayConfig::instance().publish_batch_size(), /*publisher_id=*/NodeID::FromRandom()); - gcs_publisher_ = std::make_unique(std::move(inner_publisher)); + gcs_publisher_ = std::make_unique(std::move(inner_publisher)); + metrics_agent_client_ = std::make_unique( + "127.0.0.1", + config_.metrics_agent_port, + io_context_provider_.GetDefaultIOContext(), + client_call_manager_); } GcsServer::~GcsServer() { Stop(); } -RedisClientOptions GcsServer::GetRedisClientOptions() const { - return RedisClientOptions(config_.redis_address, - config_.redis_port, - config_.redis_username, - config_.redis_password, - config_.enable_redis_ssl); -} - void GcsServer::Start() { // Load gcs tables data asynchronously. auto gcs_init_data = std::make_shared(*gcs_table_storage_); @@ -198,7 +218,7 @@ void GcsServer::GetOrGenerateClusterId( {[this, continuation = std::move(continuation)]( std::optional provided_cluster_id) mutable { if (!provided_cluster_id.has_value()) { - instrumented_io_context &io_context = continuation.io_context(); + instrumented_io_context &io_ctx = continuation.io_context(); ClusterID cluster_id = ClusterID::FromRandom(); RAY_LOG(INFO).WithField(cluster_id) << "Generated new cluster ID."; kv_manager_->GetInstance().Put( @@ -213,7 +233,7 @@ void GcsServer::GetOrGenerateClusterId( .Dispatch("GcsServer.GetOrGenerateClusterId.continuation", cluster_id); }, - io_context}); + io_ctx}); } else { ClusterID cluster_id = ClusterID::FromBinary(provided_cluster_id.value()); RAY_LOG(INFO).WithField(cluster_id) @@ -226,60 +246,31 @@ void GcsServer::GetOrGenerateClusterId( } void GcsServer::DoStart(const GcsInitData &gcs_init_data) { - // Init cluster resource scheduler. 
InitClusterResourceScheduler(); - - // Init gcs node manager. InitGcsNodeManager(gcs_init_data); - - // Init cluster task manager. - InitClusterTaskManager(); - - // Init gcs resource manager. + InitClusterLeaseManager(); InitGcsResourceManager(gcs_init_data); - - // Init gcs health check manager. InitGcsHealthCheckManager(gcs_init_data); - - // Init synchronization service InitRaySyncer(gcs_init_data); - - // Init KV service. InitKVService(); - - // Init function manager InitFunctionManager(); - - // Init Pub/Sub handler InitPubSubHandler(); - - // Init RuntimeEnv manager InitRuntimeEnvManager(); - - // Init gcs job manager. InitGcsJobManager(gcs_init_data); - - // Init gcs placement group manager. InitGcsPlacementGroupManager(gcs_init_data); - - // Init gcs actor manager. InitGcsActorManager(gcs_init_data); - - // Init gcs worker manager. InitGcsWorkerManager(); - - // Init GCS task manager. InitGcsTaskManager(); - - // Install event listeners. InstallEventListeners(); - - // Init autoscaling manager InitGcsAutoscalerStateManager(gcs_init_data); - - // Init usage stats client. InitUsageStatsClient(); + // Init metrics and event exporter. + metrics_agent_client_->WaitForServerReady([this](const Status &server_status) { + stats::InitOpenTelemetryExporter(config_.metrics_agent_port, server_status); + ray_event_recorder_->StartExportingEvents(); + }); + // Start RPC server when all tables have finished loading initial // data. rpc_server_.Run(); @@ -326,9 +317,6 @@ void GcsServer::Stop() { kv_manager_.reset(); is_stopped_ = true; - if (gcs_redis_failure_detector_) { - gcs_redis_failure_detector_->Stop(); - } RAY_LOG(INFO) << "GCS server stopped."; } @@ -345,7 +333,9 @@ void GcsServer::InitGcsNodeManager(const GcsInitData &gcs_init_data) { // Initialize by gcs tables data. 
gcs_node_manager_->Initialize(gcs_init_data); rpc_server_.RegisterService(std::make_unique( - io_context_provider_.GetDefaultIOContext(), *gcs_node_manager_)); + io_context_provider_.GetDefaultIOContext(), + *gcs_node_manager_, + RayConfig::instance().gcs_max_active_rpcs_per_handler())); } void GcsServer::InitGcsHealthCheckManager(const GcsInitData &gcs_init_data) { @@ -372,18 +362,20 @@ void GcsServer::InitGcsHealthCheckManager(const GcsInitData &gcs_init_data) { } void GcsServer::InitGcsResourceManager(const GcsInitData &gcs_init_data) { - RAY_CHECK(cluster_resource_scheduler_ && cluster_task_manager_); + RAY_CHECK(cluster_resource_scheduler_ && cluster_lease_manager_); gcs_resource_manager_ = std::make_unique( io_context_provider_.GetDefaultIOContext(), cluster_resource_scheduler_->GetClusterResourceManager(), *gcs_node_manager_, kGCSNodeID, - cluster_task_manager_.get()); + cluster_lease_manager_.get()); // Initialize by gcs tables data. gcs_resource_manager_->Initialize(gcs_init_data); rpc_server_.RegisterService(std::make_unique( - io_context_provider_.GetDefaultIOContext(), *gcs_resource_manager_)); + io_context_provider_.GetDefaultIOContext(), + *gcs_resource_manager_, + RayConfig::instance().gcs_max_active_rpcs_per_handler())); periodical_runner_->RunFnPeriodically( [this] { @@ -442,9 +434,9 @@ void GcsServer::InitClusterResourceScheduler() { /*is_local_node_with_raylet=*/false); } -void GcsServer::InitClusterTaskManager() { +void GcsServer::InitClusterLeaseManager() { RAY_CHECK(cluster_resource_scheduler_); - cluster_task_manager_ = std::make_unique( + cluster_lease_manager_ = std::make_unique( kGCSNodeID, *cluster_resource_scheduler_, /*get_node_info=*/ @@ -453,7 +445,7 @@ void GcsServer::InitClusterTaskManager() { return node.has_value() ? 
node.value().get() : nullptr; }, /*announce_infeasible_task=*/nullptr, - /*local_task_manager=*/local_task_manager_); + /*local_lease_manager=*/local_lease_manager_); } void GcsServer::InitGcsJobManager(const GcsInitData &gcs_init_data) { @@ -465,11 +457,15 @@ void GcsServer::InitGcsJobManager(const GcsInitData &gcs_init_data) { *function_manager_, kv_manager_->GetInstance(), io_context_provider_.GetDefaultIOContext(), - worker_client_pool_); + worker_client_pool_, + *ray_event_recorder_, + config_.session_name); gcs_job_manager_->Initialize(gcs_init_data); rpc_server_.RegisterService(std::make_unique( - io_context_provider_.GetDefaultIOContext(), *gcs_job_manager_)); + io_context_provider_.GetDefaultIOContext(), + *gcs_job_manager_, + RayConfig::instance().gcs_max_active_rpcs_per_handler())); } void GcsServer::InitGcsActorManager(const GcsInitData &gcs_init_data) { @@ -491,12 +487,12 @@ void GcsServer::InitGcsActorManager(const GcsInitData &gcs_init_data) { gcs_actor_manager_->OnActorCreationSuccess(actor, reply); }; - RAY_CHECK(gcs_resource_manager_ && cluster_task_manager_); + RAY_CHECK(gcs_resource_manager_ && cluster_lease_manager_); scheduler = std::make_unique( io_context_provider_.GetDefaultIOContext(), gcs_table_storage_->ActorTable(), *gcs_node_manager_, - *cluster_task_manager_, + *cluster_lease_manager_, schedule_failure_handler, schedule_success_handler, raylet_client_pool_, @@ -517,10 +513,11 @@ void GcsServer::InitGcsActorManager(const GcsInitData &gcs_init_data) { }, worker_client_pool_); - // Initialize by gcs tables data. 
gcs_actor_manager_->Initialize(gcs_init_data); rpc_server_.RegisterService(std::make_unique( - io_context_provider_.GetDefaultIOContext(), *gcs_actor_manager_)); + io_context_provider_.GetDefaultIOContext(), + *gcs_actor_manager_, + RayConfig::instance().gcs_max_active_rpcs_per_handler())); } void GcsServer::InitGcsPlacementGroupManager(const GcsInitData &gcs_init_data) { @@ -540,10 +537,12 @@ void GcsServer::InitGcsPlacementGroupManager(const GcsInitData &gcs_init_data) { [this](const JobID &job_id) { return gcs_job_manager_->GetJobConfig(job_id)->ray_namespace(); }); - // Initialize by gcs tables data. + gcs_placement_group_manager_->Initialize(gcs_init_data); rpc_server_.RegisterService(std::make_unique( - io_context_provider_.GetDefaultIOContext(), *gcs_placement_group_manager_)); + io_context_provider_.GetDefaultIOContext(), + *gcs_placement_group_manager_, + RayConfig::instance().gcs_max_active_rpcs_per_handler())); } GcsServer::StorageType GcsServer::GetStorageType() const { @@ -594,24 +593,25 @@ void GcsServer::InitUsageStatsClient() { } void GcsServer::InitKVManager() { - // TODO(yic): Use a factory with configs - std::unique_ptr instance; auto &io_context = io_context_provider_.GetIOContext(); + std::unique_ptr store_client; switch (storage_type_) { case (StorageType::REDIS_PERSIST): - instance = std::make_unique( - std::make_unique(CreateRedisClient(io_context))); + store_client = + std::make_unique(io_context, GetRedisClientOptions()); break; case (StorageType::IN_MEMORY): - instance = std::make_unique( - std::make_unique(std::make_unique())); + store_client = + std::make_unique(std::make_unique()); break; default: RAY_LOG(FATAL) << "Unexpected storage type! 
" << storage_type_; } kv_manager_ = std::make_unique( - std::move(instance), config_.raylet_config_list, io_context); + std::make_unique(std::move(store_client)), + config_.raylet_config_list, + io_context); kv_manager_->GetInstance().Put( "", @@ -632,15 +632,19 @@ void GcsServer::InitKVService() { RAY_CHECK(kv_manager_); rpc_server_.RegisterService( std::make_unique( - io_context_provider_.GetIOContext(), *kv_manager_), + io_context_provider_.GetIOContext(), + *kv_manager_, + /*max_active_rpcs_per_handler_=*/-1), false /* token_auth */); } void GcsServer::InitPubSubHandler() { - auto &io_context = io_context_provider_.GetIOContext(); + auto &io_context = io_context_provider_.GetIOContext(); pubsub_handler_ = std::make_unique(io_context, *gcs_publisher_); - rpc_server_.RegisterService( - std::make_unique(io_context, *pubsub_handler_)); + + // This service is used to handle long poll requests, so we don't limit active RPCs. + rpc_server_.RegisterService(std::make_unique( + io_context, *pubsub_handler_, /*max_active_rpcs_per_handler_=*/-1)); } void GcsServer::InitRuntimeEnvManager() { @@ -682,14 +686,18 @@ void GcsServer::InitRuntimeEnvManager() { std::chrono::milliseconds(delay_ms)); }); rpc_server_.RegisterService(std::make_unique( - io_context_provider_.GetDefaultIOContext(), *runtime_env_handler_)); + io_context_provider_.GetDefaultIOContext(), + *runtime_env_handler_, + /*max_active_rpcs_per_handler=*/-1)); } void GcsServer::InitGcsWorkerManager() { gcs_worker_manager_ = std::make_unique( *gcs_table_storage_, io_context_provider_.GetDefaultIOContext(), *gcs_publisher_); rpc_server_.RegisterService(std::make_unique( - io_context_provider_.GetDefaultIOContext(), *gcs_worker_manager_)); + io_context_provider_.GetDefaultIOContext(), + *gcs_worker_manager_, + RayConfig::instance().gcs_max_active_rpcs_per_handler())); } void GcsServer::InitGcsAutoscalerStateManager(const GcsInitData &gcs_init_data) { @@ -737,17 +745,23 @@ void 
GcsServer::InitGcsAutoscalerStateManager(const GcsInitData &gcs_init_data) gcs_autoscaler_state_manager_->Initialize(gcs_init_data); rpc_server_.RegisterService( std::make_unique( - io_context_provider_.GetDefaultIOContext(), *gcs_autoscaler_state_manager_)); + io_context_provider_.GetDefaultIOContext(), + *gcs_autoscaler_state_manager_, + RayConfig::instance().gcs_max_active_rpcs_per_handler())); } void GcsServer::InitGcsTaskManager() { auto &io_context = io_context_provider_.GetIOContext(); gcs_task_manager_ = std::make_unique(io_context); // Register service. - rpc_server_.RegisterService( - std::make_unique(io_context, *gcs_task_manager_)); - rpc_server_.RegisterService( - std::make_unique(io_context, *gcs_task_manager_)); + rpc_server_.RegisterService(std::make_unique( + io_context, + *gcs_task_manager_, + RayConfig::instance().gcs_max_active_rpcs_per_handler())); + rpc_server_.RegisterService(std::make_unique( + io_context, + *gcs_task_manager_, + RayConfig::instance().gcs_max_active_rpcs_per_handler())); } void GcsServer::InstallEventListeners() { @@ -772,7 +786,7 @@ void GcsServer::InstallEventListeners() { RAY_CHECK(channel != nullptr); gcs_healthcheck_manager_->AddNode(node_id, channel); } - cluster_task_manager_->ScheduleAndDispatchTasks(); + cluster_lease_manager_->ScheduleAndGrantLeases(); }); gcs_node_manager_->AddNodeRemovedListener( [this](const std::shared_ptr &node) { @@ -797,7 +811,7 @@ void GcsServer::InstallEventListeners() { auto &worker_address = worker_failure_data->worker_address(); auto worker_id = WorkerID::FromBinary(worker_address.worker_id()); worker_client_pool_.Disconnect(worker_id); - auto node_id = NodeID::FromBinary(worker_address.raylet_id()); + auto node_id = NodeID::FromBinary(worker_address.node_id()); auto worker_ip = worker_address.ip_address(); const rpc::RayException *creation_task_exception = nullptr; if (worker_failure_data->has_creation_task_exception()) { @@ -829,7 +843,7 @@ void GcsServer::InstallEventListeners() { // 
Because resources have been changed, we need to try to schedule the // pending placement groups and actors. gcs_placement_group_manager_->SchedulePendingPlacementGroups(); - cluster_task_manager_->ScheduleAndDispatchTasks(); + cluster_lease_manager_->ScheduleAndGrantLeases(); }, "GcsServer.SchedulePendingActors"); }); @@ -840,7 +854,7 @@ void GcsServer::InstallEventListeners() { // Because some placement group resources have been committed or deleted, we // need to try to schedule the pending placement groups and actors. gcs_placement_group_manager_->SchedulePendingPlacementGroups(); - cluster_task_manager_->ScheduleAndDispatchTasks(); + cluster_lease_manager_->ScheduleAndGrantLeases(); }, "GcsServer.SchedulePendingPGActors"); }); @@ -877,12 +891,12 @@ std::string GcsServer::GetDebugState() const { return stream.str(); } -std::shared_ptr GcsServer::CreateRedisClient( - instrumented_io_context &io_service) { - auto redis_client = std::make_shared(GetRedisClientOptions()); - auto status = redis_client->Connect(io_service); - RAY_CHECK_OK(status) << "Failed to init redis gcs client"; - return redis_client; +RedisClientOptions GcsServer::GetRedisClientOptions() { + return RedisClientOptions{config_.redis_address, + config_.redis_port, + config_.redis_username, + config_.redis_password, + config_.enable_redis_ssl}; } void GcsServer::PrintAsioStats() { @@ -901,7 +915,7 @@ void GcsServer::PrintAsioStats() { } void GcsServer::TryGlobalGC() { - if (cluster_task_manager_->GetPendingQueueSize() == 0) { + if (cluster_lease_manager_->GetPendingQueueSize() == 0) { task_pending_schedule_detected_ = 0; return; } diff --git a/src/ray/gcs/gcs_server/gcs_server.h b/src/ray/gcs/gcs_server.h similarity index 83% rename from src/ray/gcs/gcs_server/gcs_server.h rename to src/ray/gcs/gcs_server.h index 755a466b172f..c9541defe3c1 100644 --- a/src/ray/gcs/gcs_server/gcs_server.h +++ b/src/ray/gcs/gcs_server.h @@ -22,31 +22,34 @@ #include "ray/common/asio/postable.h" #include 
"ray/common/ray_syncer/ray_syncer.h" #include "ray/common/runtime_env_manager.h" -#include "ray/gcs/gcs_server/gcs_function_manager.h" -#include "ray/gcs/gcs_server/gcs_health_check_manager.h" -#include "ray/gcs/gcs_server/gcs_init_data.h" -#include "ray/gcs/gcs_server/gcs_kv_manager.h" -#include "ray/gcs/gcs_server/gcs_redis_failure_detector.h" -#include "ray/gcs/gcs_server/gcs_resource_manager.h" -#include "ray/gcs/gcs_server/gcs_server_io_context_policy.h" -#include "ray/gcs/gcs_server/gcs_table_storage.h" -#include "ray/gcs/gcs_server/gcs_task_manager.h" -#include "ray/gcs/gcs_server/pubsub_handler.h" -#include "ray/gcs/gcs_server/runtime_env_handler.h" -#include "ray/gcs/gcs_server/usage_stats_client.h" -#include "ray/gcs/pubsub/gcs_pub_sub.h" -#include "ray/gcs/redis_client.h" +#include "ray/gcs/gcs_function_manager.h" +#include "ray/gcs/gcs_health_check_manager.h" +#include "ray/gcs/gcs_init_data.h" +#include "ray/gcs/gcs_kv_manager.h" +#include "ray/gcs/gcs_resource_manager.h" +#include "ray/gcs/gcs_server_io_context_policy.h" +#include "ray/gcs/gcs_table_storage.h" +#include "ray/gcs/gcs_task_manager.h" +#include "ray/gcs/pubsub_handler.h" +#include "ray/gcs/runtime_env_handler.h" +#include "ray/gcs/store_client/in_memory_store_client.h" +#include "ray/gcs/store_client/observable_store_client.h" +#include "ray/gcs/store_client/redis_store_client.h" +#include "ray/gcs/usage_stats_client.h" +#include "ray/observability/ray_event_recorder.h" +#include "ray/pubsub/gcs_publisher.h" +#include "ray/raylet/scheduling/cluster_lease_manager.h" #include "ray/raylet/scheduling/cluster_resource_scheduler.h" -#include "ray/raylet/scheduling/cluster_task_manager.h" #include "ray/rpc/client_call.h" -#include "ray/rpc/gcs/gcs_rpc_server.h" -#include "ray/rpc/node_manager/raylet_client_pool.h" +#include "ray/rpc/grpc_server.h" +#include "ray/rpc/metrics_agent_client.h" +#include "ray/rpc/raylet/raylet_client_pool.h" #include "ray/rpc/worker/core_worker_client_pool.h" 
#include "ray/util/throttler.h" namespace ray { -using raylet::ClusterTaskManager; -using raylet::NoopLocalTaskManager; +using raylet::ClusterLeaseManager; +using raylet::NoopLocalLeaseManager; namespace gcs { @@ -54,6 +57,7 @@ struct GcsServerConfig { std::string grpc_server_name = "GcsServer"; uint16_t grpc_server_port = 0; uint16_t grpc_server_thread_num = 1; + uint16_t metrics_agent_port = 0; std::string redis_username; std::string redis_password; std::string redis_address; @@ -129,9 +133,6 @@ class GcsServer { } protected: - /// Generate the redis client options - RedisClientOptions GetRedisClientOptions() const; - void DoStart(const GcsInitData &gcs_init_data); /// Initialize gcs node manager. @@ -149,8 +150,8 @@ class GcsServer { /// Initialize cluster resource scheduler. void InitClusterResourceScheduler(); - /// Initialize cluster task manager. - void InitClusterTaskManager(); + /// Initialize cluster lease manager. + void InitClusterLeaseManager(); /// Initialize gcs job manager. void InitGcsJobManager(const GcsInitData &gcs_init_data); @@ -213,8 +214,7 @@ class GcsServer { /// Print the asio event loop stats for debugging. void PrintAsioStats(); - /// Get or connect to a redis server - std::shared_ptr CreateRedisClient(instrumented_io_context &io_service); + RedisClientOptions GetRedisClientOptions(); void TryGlobalGC(); @@ -236,25 +236,23 @@ class GcsServer { rpc::CoreWorkerClientPool worker_client_pool_; /// The cluster resource scheduler. std::shared_ptr cluster_resource_scheduler_; - /// Local task manager. - NoopLocalTaskManager local_task_manager_; + /// Local lease manager. + NoopLocalLeaseManager local_lease_manager_; /// The gcs table storage. std::unique_ptr gcs_table_storage_; - /// The cluster task manager. - std::unique_ptr cluster_task_manager_; - /// [gcs_resource_manager_] depends on [cluster_task_manager_]. + /// The cluster lease manager. 
+ std::unique_ptr cluster_lease_manager_; + /// [gcs_resource_manager_] depends on [cluster_lease_manager_]. /// The gcs resource manager. std::unique_ptr gcs_resource_manager_; /// The autoscaler state manager. std::unique_ptr gcs_autoscaler_state_manager_; /// A publisher for publishing gcs messages. - std::unique_ptr gcs_publisher_; + std::unique_ptr gcs_publisher_; /// The gcs node manager. std::unique_ptr gcs_node_manager_; /// The health check manager. std::shared_ptr gcs_healthcheck_manager_; - /// The gcs redis failure detector. - std::unique_ptr gcs_redis_failure_detector_; /// The gcs placement group manager. std::unique_ptr gcs_placement_group_manager_; /// The gcs actor manager. @@ -270,6 +268,11 @@ class GcsServer { std::unique_ptr kv_manager_; /// Job info handler. std::unique_ptr gcs_job_manager_; + /// The Ray event recorder that is used to record events (e.g. job events, node events, + /// etc.). + rpc::ClientCallManager event_aggregator_client_call_manager_; + std::unique_ptr event_aggregator_client_; + std::unique_ptr ray_event_recorder_; /// Ray Syncer related fields. std::unique_ptr ray_syncer_; @@ -298,6 +301,8 @@ class GcsServer { int task_pending_schedule_detected_ = 0; /// Throttler for global gc std::unique_ptr global_gc_throttler_; + /// Client to call a metrics agent gRPC server. 
+ std::unique_ptr metrics_agent_client_; }; } // namespace gcs diff --git a/src/ray/gcs/gcs_server/BUILD.bazel b/src/ray/gcs/gcs_server/BUILD.bazel deleted file mode 100644 index 179b54c35b3d..000000000000 --- a/src/ray/gcs/gcs_server/BUILD.bazel +++ /dev/null @@ -1,278 +0,0 @@ -load("//bazel:ray.bzl", "ray_cc_binary", "ray_cc_library") - -ray_cc_library( - name = "gcs_state_util", - srcs = ["state_util.cc"], - hdrs = ["state_util.h"], - deps = [ - "//src/ray/protobuf:gcs_cc_proto", - "@com_google_absl//absl/container:flat_hash_map", - ], -) - -ray_cc_library( - name = "gcs_table_storage", - srcs = ["gcs_table_storage.cc"], - hdrs = ["gcs_table_storage.h"], - deps = [ - "//src/ray/common:asio", - "//src/ray/common:id", - "//src/ray/common:status", - "//src/ray/gcs:gcs_callback", - "//src/ray/gcs/store_client:gcs_in_memory_store_client", - "//src/ray/gcs/store_client:gcs_observable_store_client", - "//src/ray/gcs/store_client:gcs_redis_store_client", - "//src/ray/protobuf:gcs_cc_proto", - ], -) - -ray_cc_library( - name = "gcs_init_data", - srcs = ["gcs_init_data.cc"], - hdrs = ["gcs_init_data.h"], - deps = [ - ":gcs_table_storage", - "//src/ray/common:asio", - "//src/ray/common:id", - "//src/ray/gcs:gcs_callback", - "//src/ray/protobuf:gcs_cc_proto", - "@com_google_absl//absl/container:flat_hash_map", - ], -) - -ray_cc_library( - name = "gcs_kv_manager", - srcs = ["gcs_kv_manager.cc"], - hdrs = ["gcs_kv_manager.h"], - deps = [ - "//src/ray/common:asio", - "//src/ray/common:status", - "//src/ray/protobuf:gcs_cc_proto", - "//src/ray/rpc:gcs_server", - ], -) - -ray_cc_library( - name = "gcs_function_manager", - hdrs = ["gcs_function_manager.h"], - deps = [ - ":gcs_kv_manager", - "//src/ray/common:asio", - "//src/ray/common:constants", - "@com_google_absl//absl/container:flat_hash_map", - ], -) - -ray_cc_library( - name = "gcs_usage_stats_client", - srcs = ["usage_stats_client.cc"], - hdrs = ["usage_stats_client.h"], - deps = [ - ":gcs_kv_manager", - 
"//src/ray/common:asio", - "//src/ray/protobuf:usage_cc_proto", - ], -) - -ray_cc_library( - name = "gcs_store_client_kv", - srcs = ["store_client_kv.cc"], - hdrs = ["store_client_kv.h"], - deps = [ - ":gcs_kv_manager", - "//src/ray/gcs/store_client:gcs_store_client", - ], -) - -ray_cc_library( - name = "gcs_pubsub_handler", - srcs = ["pubsub_handler.cc"], - hdrs = ["pubsub_handler.h"], - deps = [ - "//src/ray/gcs/pubsub:gcs_pub_sub_lib", - "//src/ray/protobuf:gcs_service_cc_proto", - "//src/ray/rpc:gcs_server", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - ], -) - -ray_cc_library( - name = "gcs_runtime_env_handler", - srcs = ["runtime_env_handler.cc"], - hdrs = ["runtime_env_handler.h"], - deps = [ - "//src/ray/common:runtime_env", - "//src/ray/protobuf:gcs_cc_proto", - "//src/ray/rpc:gcs_server", - "//src/ray/util:thread_checker", - ], -) - -ray_cc_library( - name = "gcs_redis_failure_detector", - srcs = ["gcs_redis_failure_detector.cc"], - hdrs = ["gcs_redis_failure_detector.h"], - deps = [ - "//src/ray/common:asio", - "//src/ray/common:ray_config", - "//src/ray/gcs:gcs_redis_client", - ], -) - -ray_cc_library( - name = "gcs_worker_manager", - srcs = ["gcs_worker_manager.cc"], - hdrs = ["gcs_worker_manager.h"], - deps = [ - ":gcs_kv_manager", - ":gcs_table_storage", - ":gcs_usage_stats_client", - "//src/ray/gcs/pubsub:gcs_pub_sub_lib", - "//src/ray/rpc:gcs_server", - "//src/ray/stats:stats_metric", - ], -) - -ray_cc_library( - name = "gcs_health_check_manager", - srcs = ["gcs_health_check_manager.cc"], - hdrs = ["gcs_health_check_manager.h"], - deps = [ - "//src/ray/common:asio", - "//src/ray/common:id", - "//src/ray/common:ray_config", - "//src/ray/stats:stats_metric", - "//src/ray/util:thread_checker", - "@com_github_grpc_grpc//:grpc++", - "@com_github_grpc_grpc//src/proto/grpc/health/v1:health_proto", - "@com_google_absl//absl/container:flat_hash_map", - ], -) - -ray_cc_library( - name = 
"gcs_task_manager", - srcs = ["gcs_task_manager.cc"], - hdrs = ["gcs_task_manager.h"], - deps = [ - ":gcs_usage_stats_client", - "//src/ray/common:asio", - "//src/ray/common:id", - "//src/ray/common:ray_config", - "//src/ray/common:status", - "//src/ray/gcs:gcs_pb_util", - "//src/ray/protobuf:events_event_aggregator_service_cc_proto", - "//src/ray/protobuf:gcs_cc_proto", - "//src/ray/util:counter_map", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/synchronization", - ], -) - -ray_cc_library( - name = "gcs_server_io_context_policy", - hdrs = ["gcs_server_io_context_policy.h"], - deps = [ - ":gcs_task_manager", - "//src/ray/common:ray_syncer", - "//src/ray/gcs/pubsub:gcs_pub_sub_lib", - "//src/ray/util:array", - "//src/ray/util:type_traits", - ], -) - -ray_cc_library( - name = "gcs_job_manager", - srcs = ["gcs_job_manager.cc"], - hdrs = ["gcs_job_manager.h"], - deps = [ - ":gcs_function_manager", - ":gcs_init_data", - ":gcs_table_storage", - "//src/ray/common:runtime_env", - "//src/ray/gcs:gcs_pb_util", - "//src/ray/gcs/pubsub:gcs_pub_sub_lib", - "//src/ray/rpc:core_worker_client", - "//src/ray/rpc:gcs_server", - "//src/ray/stats:stats_metric", - "//src/ray/util:event", - "//src/ray/util:thread_checker", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - ], -) - -ray_cc_library( - name = "gcs_server_lib", - srcs = [ - "gcs_actor_manager.cc", - "gcs_actor_scheduler.cc", - "gcs_autoscaler_state_manager.cc", - "gcs_node_manager.cc", - "gcs_placement_group_mgr.cc", - "gcs_placement_group_scheduler.cc", - "gcs_resource_manager.cc", - "gcs_server.cc", - ], - hdrs = [ - "gcs_actor_manager.h", - "gcs_actor_scheduler.h", - "gcs_autoscaler_state_manager.h", - "gcs_node_manager.h", - "gcs_placement_group_mgr.h", - "gcs_placement_group_scheduler.h", - "gcs_resource_manager.h", - "gcs_server.h", - ], - deps 
= [ - ":gcs_function_manager", - ":gcs_health_check_manager", - ":gcs_init_data", - ":gcs_job_manager", - ":gcs_kv_manager", - ":gcs_pubsub_handler", - ":gcs_redis_failure_detector", - ":gcs_runtime_env_handler", - ":gcs_server_io_context_policy", - ":gcs_state_util", - ":gcs_store_client_kv", - ":gcs_table_storage", - ":gcs_task_manager", - ":gcs_usage_stats_client", - ":gcs_worker_manager", - "//src/ray/gcs/pubsub:gcs_pub_sub_lib", - "//src/ray/gcs/store_client:gcs_observable_store_client", - "//src/ray/protobuf:autoscaler_cc_grpc", - "//src/ray/protobuf:gcs_service_cc_grpc", - "//src/ray/pubsub:publisher", - "//src/ray/raylet/scheduling:scheduler", - "//src/ray/raylet_client:raylet_client_lib", - "//src/ray/rpc:core_worker_client", - "//src/ray/rpc:gcs_server", - "//src/ray/rpc:node_manager_client", - "//src/ray/util:counter_map", - "//src/ray/util:network_util", - "//src/ray/util:thread_checker", - "//src/ray/util:throttler", - "//src/ray/util:type_traits", - "@boost//:bimap", - "@com_google_absl//absl/container:btree", - ], -) - -ray_cc_binary( - name = "gcs_server", - srcs = [ - "gcs_server_main.cc", - ], - visibility = ["//visibility:public"], - deps = [ - ":gcs_server_lib", - "//src/ray/stats:stats_lib", - "//src/ray/util:stream_redirection", - "//src/ray/util:stream_redirection_options", - "@com_github_gflags_gflags//:gflags", - ], -) diff --git a/src/ray/gcs/gcs_server/gcs_redis_failure_detector.cc b/src/ray/gcs/gcs_server/gcs_redis_failure_detector.cc deleted file mode 100644 index 79ba225b4202..000000000000 --- a/src/ray/gcs/gcs_server/gcs_redis_failure_detector.cc +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ray/gcs/gcs_server/gcs_redis_failure_detector.h" - -#include -#include - -#include "ray/common/ray_config.h" -#include "ray/gcs/redis_client.h" - -namespace ray { -namespace gcs { - -GcsRedisFailureDetector::GcsRedisFailureDetector( - instrumented_io_context &io_service, - std::shared_ptr redis_client, - std::function callback) - : io_service_(io_service), - redis_client_(std::move(redis_client)), - callback_(std::move(callback)) {} - -void GcsRedisFailureDetector::Start() { - RAY_LOG(INFO) << "Starting redis failure detector."; - periodical_runner_ = PeriodicalRunner::Create(io_service_); - periodical_runner_->RunFnPeriodically( - [this] { DetectRedis(); }, - RayConfig::instance().gcs_redis_heartbeat_interval_milliseconds(), - "GcsRedisFailureDetector.deadline_timer.detect_redis_failure"); -} - -void GcsRedisFailureDetector::Stop() { - RAY_LOG(INFO) << "Stopping redis failure detector."; - periodical_runner_.reset(); -} - -void GcsRedisFailureDetector::DetectRedis() { - auto redis_callback = [this](const std::shared_ptr &reply) { - if (reply->IsNil()) { - RAY_LOG(ERROR) << "Redis is inactive."; - this->io_service_.dispatch(this->callback_, "GcsRedisFailureDetector.DetectRedis"); - } - }; - auto *cxt = redis_client_->GetPrimaryContext(); - cxt->RunArgvAsync({"PING"}, redis_callback); -} - -} // namespace gcs -} // namespace ray diff --git a/src/ray/gcs/gcs_server/gcs_redis_failure_detector.h b/src/ray/gcs/gcs_server/gcs_redis_failure_detector.h deleted file mode 100644 index 1928226a33b7..000000000000 --- 
a/src/ray/gcs/gcs_server/gcs_redis_failure_detector.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "ray/common/asio/instrumented_io_context.h" -#include "ray/common/asio/periodical_runner.h" - -namespace ray { -namespace gcs { - -// Forward declaration. -class RedisClient; - -/// GcsRedisFailureDetector is responsible for monitoring redis and binding GCS server and -/// redis life cycle together. GCS client subscribes to redis messages and it cannot sense -/// whether the redis is inactive unless we go to ping redis voluntarily. But there are -/// many GCS clients, if they all Ping redis, the redis load will be high. So we ping -/// redis on GCS server and GCS client can sense whether redis is normal through RPC -/// connection with GCS server. -class GcsRedisFailureDetector { - public: - /// Create a GcsRedisFailureDetector. - /// - /// \param io_service The event loop to run the monitor on. - /// \param redis_context The redis context is used to ping redis. - /// \param callback Callback that will be called when redis is detected as not alive. - explicit GcsRedisFailureDetector(instrumented_io_context &io_service, - std::shared_ptr redis_client, - std::function callback); - - /// Start detecting redis. - void Start(); - - /// Stop detecting redis. - void Stop(); - - protected: - /// Check that if redis is inactive. 
- void DetectRedis(); - - private: - instrumented_io_context &io_service_; - - std::shared_ptr redis_client_; - - /// The runner to run function periodically. - std::shared_ptr periodical_runner_; - - /// A function is called when redis is detected to be unavailable. - std::function callback_; -}; - -} // namespace gcs -} // namespace ray diff --git a/src/ray/gcs/gcs_server/gcs_server_io_context_policy.h b/src/ray/gcs/gcs_server_io_context_policy.h similarity index 78% rename from src/ray/gcs/gcs_server/gcs_server_io_context_policy.h rename to src/ray/gcs/gcs_server_io_context_policy.h index 5fcc02400a1a..f8a504762162 100644 --- a/src/ray/gcs/gcs_server/gcs_server_io_context_policy.h +++ b/src/ray/gcs/gcs_server_io_context_policy.h @@ -19,8 +19,9 @@ #include #include "ray/common/ray_syncer/ray_syncer.h" -#include "ray/gcs/gcs_server/gcs_task_manager.h" -#include "ray/gcs/pubsub/gcs_pub_sub.h" +#include "ray/gcs/gcs_task_manager.h" +#include "ray/observability/ray_event_recorder.h" +#include "ray/pubsub/gcs_publisher.h" #include "ray/util/array.h" #include "ray/util/type_traits.h" @@ -37,10 +38,12 @@ struct GcsServerIOContextPolicy { static constexpr int GetDedicatedIOContextIndex() { if constexpr (std::is_same_v) { return IndexOf("task_io_context"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { return IndexOf("pubsub_io_context"); } else if constexpr (std::is_same_v) { return IndexOf("ray_syncer_io_context"); + } else if constexpr (std::is_same_v) { + return IndexOf("ray_event_io_context"); } else if constexpr (std::is_same_v) { // default io context return -1; @@ -54,10 +57,13 @@ struct GcsServerIOContextPolicy { // This list must be unique and complete set of names returned from // GetDedicatedIOContextIndex. Or you can get runtime crashes when accessing a missing // name, or get leaks by creating unused threads. 
- constexpr static std::array kAllDedicatedIOContextNames{ - "task_io_context", "pubsub_io_context", "ray_syncer_io_context"}; - constexpr static std::array kAllDedicatedIOContextEnableLagProbe{ - true, true, true}; + constexpr static std::array kAllDedicatedIOContextNames{ + "task_io_context", + "pubsub_io_context", + "ray_syncer_io_context", + "ray_event_io_context"}; + constexpr static std::array kAllDedicatedIOContextEnableLagProbe{ + true, true, true, true}; constexpr static size_t IndexOf(std::string_view name) { return ray::IndexOf(kAllDedicatedIOContextNames, name); diff --git a/src/ray/gcs/gcs_server/gcs_server_main.cc b/src/ray/gcs/gcs_server_main.cc similarity index 93% rename from src/ray/gcs/gcs_server/gcs_server_main.cc rename to src/ray/gcs/gcs_server_main.cc index 2bf2366a2626..19d155e407b9 100644 --- a/src/ray/gcs/gcs_server/gcs_server_main.cc +++ b/src/ray/gcs/gcs_server_main.cc @@ -20,13 +20,13 @@ #include "gflags/gflags.h" #include "ray/common/ray_config.h" -#include "ray/gcs/gcs_server/gcs_server.h" +#include "ray/gcs/gcs_server.h" #include "ray/gcs/store_client/redis_store_client.h" #include "ray/stats/stats.h" #include "ray/util/event.h" +#include "ray/util/raii.h" #include "ray/util/stream_redirection.h" #include "ray/util/stream_redirection_options.h" -#include "ray/util/util.h" #include "src/ray/protobuf/gcs_service.pb.h" DEFINE_string(redis_address, "", "The ip address of redis."); @@ -37,14 +37,12 @@ DEFINE_string(stdout_filepath, "", "The filepath to dump gcs server stdout."); DEFINE_string(stderr_filepath, "", "The filepath to dump gcs server stderr."); DEFINE_int32(gcs_server_port, 0, "The port of gcs server."); DEFINE_int32(metrics_agent_port, -1, "The port of metrics agent."); -DEFINE_string(config_list, "", "The config list of raylet."); +DEFINE_string(config_list, "", "The config list of gcs."); DEFINE_string(redis_username, "", "The username of Redis."); DEFINE_string(redis_password, "", "The password of Redis."); 
DEFINE_bool(retry_redis, false, "Whether to retry to connect to Redis."); DEFINE_string(node_ip_address, "", "The IP address of the node."); -DEFINE_string(session_name, - "", - "session_name: The session name (ClusterID) of the cluster."); +DEFINE_string(session_name, "", "session_name: The current Ray session name."); DEFINE_string(ray_commit, "", "The commit hash of Ray."); int main(int argc, char *argv[]) { @@ -111,8 +109,10 @@ int main(int argc, char *argv[]) { // IO Service for main loop. SetThreadName("gcs_server"); - instrumented_io_context main_service(/*enable_lag_probe=*/true, - /*running_on_single_thread=*/true); + instrumented_io_context main_service( + /*enable_metrics=*/RayConfig::instance().emit_main_service_metrics(), + /*running_on_single_thread=*/true, + "gcs_server_main_io_context"); // Ensure that the IO service keeps running. Without this, the main_service will exit // as soon as there is no more work to be processed. boost::asio::executor_work_guard work( @@ -125,7 +125,7 @@ int main(int argc, char *argv[]) { {ray::stats::VersionKey, kRayVersion}, {ray::stats::NodeAddressKey, node_ip_address}, {ray::stats::SessionNameKey, session_name}}; - ray::stats::Init(global_tags, metrics_agent_port, WorkerID::Nil()); + ray::stats::Init(global_tags, metrics_agent_port, ray::WorkerID::Nil()); // Initialize event framework. 
if (RayConfig::instance().event_log_reporter_enabled() && !log_dir.empty()) { @@ -150,6 +150,7 @@ int main(int argc, char *argv[]) { gcs_server_config.grpc_server_port = gcs_server_port; gcs_server_config.grpc_server_thread_num = RayConfig::instance().gcs_server_rpc_server_thread_num(); + gcs_server_config.metrics_agent_port = metrics_agent_port; gcs_server_config.redis_address = redis_address; gcs_server_config.redis_port = redis_port; gcs_server_config.enable_redis_ssl = FLAGS_redis_enable_ssl; @@ -157,6 +158,7 @@ int main(int argc, char *argv[]) { gcs_server_config.redis_username = redis_username; gcs_server_config.retry_redis = retry_redis; gcs_server_config.node_ip_address = node_ip_address; + gcs_server_config.metrics_agent_port = metrics_agent_port; gcs_server_config.log_dir = log_dir; gcs_server_config.raylet_config_list = config_list; gcs_server_config.session_name = session_name; diff --git a/src/ray/gcs/gcs_server/gcs_table_storage.cc b/src/ray/gcs/gcs_table_storage.cc similarity index 67% rename from src/ray/gcs/gcs_server/gcs_table_storage.cc rename to src/ray/gcs/gcs_table_storage.cc index df636bee6a8a..549a0c1733fc 100644 --- a/src/ray/gcs/gcs_server/gcs_table_storage.cc +++ b/src/ray/gcs/gcs_table_storage.cc @@ -12,16 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/gcs_server/gcs_table_storage.h" +#include "ray/gcs/gcs_table_storage.h" #include #include #include +#include "absl/container/flat_hash_map.h" #include "ray/common/asio/postable.h" #include "ray/common/id.h" #include "ray/common/status.h" -#include "ray/gcs/callback.h" namespace ray { namespace gcs { @@ -36,39 +36,38 @@ Postable JustOk(Postable callback) { } // namespace template -Status GcsTable::Put(const Key &key, - const Data &value, - Postable callback) { - return store_client_->AsyncPut(table_name_, - key.Binary(), - value.SerializeAsString(), - /*overwrite*/ true, - JustOk(std::move(callback))); +void GcsTable::Put(const Key &key, + const Data &value, + Postable callback) { + store_client_->AsyncPut(table_name_, + key.Binary(), + value.SerializeAsString(), + /*overwrite*/ true, + JustOk(std::move(callback))); } template -Status GcsTable::Get(const Key &key, - Postable)> callback) { +void GcsTable::Get(const Key &key, + Postable)> callback) { // We can't use TransformArg here because we need to return 2 arguments. 
- return store_client_->AsyncGet( - table_name_, key.Binary(), std::move(callback).Rebind([](auto callback) { - return [callback = std::move(callback)](Status status, - std::optional result) { + store_client_->AsyncGet( + table_name_, key.Binary(), std::move(callback).Rebind([](auto cb) { + return [cb = std::move(cb)](Status status, std::optional result) { std::optional value; if (result) { Data data; data.ParseFromString(*result); value = std::move(data); } - callback(status, std::move(value)); + cb(status, std::move(value)); }; })); } template -Status GcsTable::GetAll( +void GcsTable::GetAll( Postable)> callback) { - return store_client_->AsyncGetAll( + store_client_->AsyncGetAll( table_name_, std::move(callback).TransformArg( [](absl::flat_hash_map result) { @@ -84,40 +83,40 @@ Status GcsTable::GetAll( } template -Status GcsTable::Delete(const Key &key, Postable callback) { - return store_client_->AsyncDelete( +void GcsTable::Delete(const Key &key, Postable callback) { + store_client_->AsyncDelete( table_name_, key.Binary(), JustOk(std::move(callback))); } template -Status GcsTable::BatchDelete(const std::vector &keys, - Postable callback) { +void GcsTable::BatchDelete(const std::vector &keys, + Postable callback) { std::vector keys_to_delete; keys_to_delete.reserve(keys.size()); for (auto &key : keys) { keys_to_delete.emplace_back(std::move(key.Binary())); } - return this->store_client_->AsyncBatchDelete( + this->store_client_->AsyncBatchDelete( this->table_name_, keys_to_delete, JustOk(std::move(callback))); } template -Status GcsTableWithJobId::Put(const Key &key, - const Data &value, - Postable callback) { +void GcsTableWithJobId::Put(const Key &key, + const Data &value, + Postable callback) { { absl::MutexLock lock(&mutex_); index_[GetJobIdFromKey(key)].insert(key); } - return this->store_client_->AsyncPut(this->table_name_, - key.Binary(), - value.SerializeAsString(), - /*overwrite*/ true, - JustOk(std::move(callback))); + 
this->store_client_->AsyncPut(this->table_name_, + key.Binary(), + value.SerializeAsString(), + /*overwrite*/ true, + JustOk(std::move(callback))); } template -Status GcsTableWithJobId::GetByJobId( +void GcsTableWithJobId::GetByJobId( const JobID &job_id, Postable)> callback) { std::vector keys; { @@ -127,7 +126,7 @@ Status GcsTableWithJobId::GetByJobId( keys.push_back(key.Binary()); } } - return this->store_client_->AsyncMultiGet( + this->store_client_->AsyncMultiGet( this->table_name_, keys, std::move(callback).TransformArg( @@ -143,8 +142,8 @@ Status GcsTableWithJobId::GetByJobId( } template -Status GcsTableWithJobId::DeleteByJobId(const JobID &job_id, - Postable callback) { +void GcsTableWithJobId::DeleteByJobId(const JobID &job_id, + Postable callback) { std::vector keys; { absl::MutexLock lock(&mutex_); @@ -153,24 +152,24 @@ Status GcsTableWithJobId::DeleteByJobId(const JobID &job_id, keys.push_back(key); } } - return BatchDelete(keys, std::move(callback)); + BatchDelete(keys, std::move(callback)); } template -Status GcsTableWithJobId::Delete(const Key &key, - Postable callback) { - return BatchDelete({key}, std::move(callback)); +void GcsTableWithJobId::Delete(const Key &key, + Postable callback) { + BatchDelete({key}, std::move(callback)); } template -Status GcsTableWithJobId::BatchDelete(const std::vector &keys, - Postable callback) { +void GcsTableWithJobId::BatchDelete(const std::vector &keys, + Postable callback) { std::vector keys_to_delete; keys_to_delete.reserve(keys.size()); for (auto &key : keys) { keys_to_delete.push_back(key.Binary()); } - return this->store_client_->AsyncBatchDelete( + this->store_client_->AsyncBatchDelete( this->table_name_, keys_to_delete, std::move(callback).TransformArg([this, callback, keys](int64_t) { @@ -185,9 +184,9 @@ Status GcsTableWithJobId::BatchDelete(const std::vector &keys, } template -Status GcsTableWithJobId::AsyncRebuildIndexAndGetAll( +void GcsTableWithJobId::AsyncRebuildIndexAndGetAll( Postable)> callback) { 
- return this->GetAll(std::move(callback).TransformArg( + this->GetAll(std::move(callback).TransformArg( [this](absl::flat_hash_map result) mutable { absl::MutexLock lock(&this->mutex_); this->index_.clear(); diff --git a/src/ray/gcs/gcs_server/gcs_table_storage.h b/src/ray/gcs/gcs_table_storage.h similarity index 76% rename from src/ray/gcs/gcs_server/gcs_table_storage.h rename to src/ray/gcs/gcs_table_storage.h index af0992cbf4e1..46f1e8b746dc 100644 --- a/src/ray/gcs/gcs_server/gcs_table_storage.h +++ b/src/ray/gcs/gcs_table_storage.h @@ -19,9 +19,9 @@ #include #include -#include "ray/gcs/store_client/in_memory_store_client.h" -#include "ray/gcs/store_client/observable_store_client.h" -#include "ray/gcs/store_client/redis_store_client.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "ray/gcs/store_client/store_client.h" #include "src/ray/protobuf/gcs.pb.h" namespace ray { @@ -46,38 +46,33 @@ class GcsTable { /// \param key The key that will be written to the table. /// \param value The value of the key that will be written to the table. /// \param callback Callback that will be called after write finishes. - /// \return Status - virtual Status Put(const Key &key, - const Data &value, - Postable callback); + virtual void Put(const Key &key, + const Data &value, + Postable callback); /// Get data from the table asynchronously. /// /// \param key The key to lookup from the table. /// \param callback Callback that will be called after read finishes. - /// \return Status - Status Get(const Key &key, Postable)> callback); + void Get(const Key &key, Postable)> callback); /// Get all data from the table asynchronously. /// /// \param callback Callback that will be called after data has been received. - /// \return Status - Status GetAll(Postable)> callback); + void GetAll(Postable)> callback); /// Delete data from the table asynchronously. /// /// \param key The key that will be deleted from the table. 
/// \param callback Callback that will be called after delete finishes. - /// \return Status - virtual Status Delete(const Key &key, Postable callback); + virtual void Delete(const Key &key, Postable callback); /// Delete a batch of data from the table asynchronously. /// /// \param keys The batch key that will be deleted from the table. /// \param callback Callback that will be called after delete finishes. - /// \return Status - virtual Status BatchDelete(const std::vector &keys, - Postable callback); + virtual void BatchDelete(const std::vector &keys, + Postable callback); protected: std::string table_name_; @@ -105,43 +100,39 @@ class GcsTableWithJobId : public GcsTable { /// from the key. /// \param value The value of the key that will be written to the table. /// \param callback Callback that will be called after write finishes, whether it - /// succeeds or not. \return Status for issuing the asynchronous write operation. - Status Put(const Key &key, - const Data &value, - Postable callback) override; + /// succeeds or not. + void Put(const Key &key, + const Data &value, + Postable callback) override; /// Get all the data of the specified job id from the table asynchronously. /// /// \param job_id The key to lookup from the table. /// \param callback Callback that will be called after read finishes. - /// \return Status - Status GetByJobId(const JobID &job_id, - Postable)> callback); + void GetByJobId(const JobID &job_id, + Postable)> callback); /// Delete all the data of the specified job id from the table asynchronously. /// /// \param job_id The key that will be deleted from the table. /// \param callback Callback that will be called after delete finishes. - /// \return Status - Status DeleteByJobId(const JobID &job_id, Postable callback); + void DeleteByJobId(const JobID &job_id, Postable callback); /// Delete data and index from the table asynchronously. /// /// \param key The key that will be deleted from the table. 
/// \param callback Callback that will be called after delete finishes. - /// \return Status - Status Delete(const Key &key, Postable callback) override; + void Delete(const Key &key, Postable callback) override; /// Delete a batch of data and index from the table asynchronously. /// /// \param keys The batch key that will be deleted from the table. /// \param callback Callback that will be called after delete finishes. - /// \return Status - Status BatchDelete(const std::vector &keys, - Postable callback) override; + void BatchDelete(const std::vector &keys, + Postable callback) override; /// Rebuild the index during startup. - Status AsyncRebuildIndexAndGetAll( + void AsyncRebuildIndexAndGetAll( Postable)> callback); protected: @@ -206,10 +197,6 @@ class GcsWorkerTable : public GcsTable { } }; -/// \class GcsTableStorage -/// -/// This class is not meant to be used directly. All gcs table storage classes should -/// derive from this class and override class member variables. class GcsTableStorage { public: explicit GcsTableStorage(std::shared_ptr store_client) @@ -254,9 +241,9 @@ class GcsTableStorage { return *worker_table_; } - Status AsyncGetNextJobID(Postable callback) { + void AsyncGetNextJobID(Postable callback) { RAY_CHECK(store_client_); - return store_client_->AsyncGetNextJobID(std::move(callback)); + store_client_->AsyncGetNextJobID(std::move(callback)); } protected: @@ -269,24 +256,5 @@ class GcsTableStorage { std::unique_ptr worker_table_; }; -/// \class RedisGcsTableStorage -/// RedisGcsTableStorage is an implementation of `GcsTableStorage` -/// that uses redis as storage. -class RedisGcsTableStorage : public GcsTableStorage { - public: - explicit RedisGcsTableStorage(std::shared_ptr redis_client) - : GcsTableStorage(std::make_shared(std::move(redis_client))) {} -}; - -/// \class InMemoryGcsTableStorage -/// InMemoryGcsTableStorage is an implementation of `GcsTableStorage` -/// that uses memory as storage. 
-class InMemoryGcsTableStorage : public GcsTableStorage { - public: - explicit InMemoryGcsTableStorage() - : GcsTableStorage(std::make_shared( - std::make_unique())) {} -}; - } // namespace gcs } // namespace ray diff --git a/src/ray/gcs/gcs_server/gcs_task_manager.cc b/src/ray/gcs/gcs_task_manager.cc similarity index 98% rename from src/ray/gcs/gcs_server/gcs_task_manager.cc rename to src/ray/gcs/gcs_task_manager.cc index 271f704322ac..250a88c9fe40 100644 --- a/src/ray/gcs/gcs_server/gcs_task_manager.cc +++ b/src/ray/gcs/gcs_task_manager.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_server/gcs_task_manager.h" +#include "ray/gcs/gcs_task_manager.h" #include #include @@ -38,6 +38,7 @@ GcsTaskManager::GcsTaskManager(instrumented_io_context &io_service) RayConfig::instance().task_events_max_num_task_in_gcs(), stats_counter_, std::make_unique())), + ray_event_converter_(std::make_unique()), periodical_runner_(PeriodicalRunner::Create(io_service_)) { periodical_runner_->RunFnPeriodically([this] { task_event_storage_->GcJobSummary(); }, 5 * 1000, @@ -638,9 +639,7 @@ void GcsTaskManager::GcsTaskManagerStorage::RecordDataLossFromWorker( } } -void GcsTaskManager::HandleAddTaskEventData(rpc::AddTaskEventDataRequest request, - rpc::AddTaskEventDataReply *reply, - rpc::SendReplyCallback send_reply_callback) { +void GcsTaskManager::RecordTaskEventData(rpc::AddTaskEventDataRequest &request) { auto data = std::move(*request.mutable_data()); task_event_storage_->RecordDataLossFromWorker(data); @@ -648,6 +647,12 @@ void GcsTaskManager::HandleAddTaskEventData(rpc::AddTaskEventDataRequest request stats_counter_.Increment(kTotalNumTaskEventsReported); task_event_storage_->AddOrReplaceTaskEvent(std::move(events_by_task)); } +} + +void GcsTaskManager::HandleAddTaskEventData(rpc::AddTaskEventDataRequest request, + rpc::AddTaskEventDataReply *reply, + rpc::SendReplyCallback 
send_reply_callback) { + RecordTaskEventData(request); // Processed all the task events GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK()); @@ -656,7 +661,14 @@ void GcsTaskManager::HandleAddTaskEventData(rpc::AddTaskEventDataRequest request void GcsTaskManager::HandleAddEvents(rpc::events::AddEventsRequest request, rpc::events::AddEventsReply *reply, rpc::SendReplyCallback send_reply_callback) { - // TODO(can-anyscale): Implement this. + auto task_event_data_requests = + ray_event_converter_->ConvertToTaskEventDataRequests(std::move(request)); + + for (auto &task_event_data : task_event_data_requests) { + RecordTaskEventData(task_event_data); + } + + // Processed all the task events GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK()); } diff --git a/src/ray/gcs/gcs_server/gcs_task_manager.h b/src/ray/gcs/gcs_task_manager.h similarity index 96% rename from src/ray/gcs/gcs_server/gcs_task_manager.h rename to src/ray/gcs/gcs_task_manager.h index 89ca5c8b611c..ee0ebe3110f3 100644 --- a/src/ray/gcs/gcs_server/gcs_task_manager.h +++ b/src/ray/gcs/gcs_task_manager.h @@ -24,8 +24,11 @@ #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "absl/synchronization/mutex.h" -#include "ray/gcs/gcs_server/usage_stats_client.h" -#include "ray/gcs/pb_util.h" +#include "ray/common/protobuf_utils.h" +#include "ray/gcs/gcs_ray_event_converter.h" +#include "ray/gcs/grpc_service_interfaces.h" +#include "ray/gcs/usage_stats_client.h" +#include "ray/stats/metric_defs.h" #include "ray/util/counter_map.h" #include "src/ray/protobuf/gcs.pb.h" @@ -91,7 +94,8 @@ class FinishedTaskActorTaskGcPolicy : public TaskEventsGcPolicyInterface { /// /// This class has its own io_context and io_thread, that's separate from other GCS /// services. All handling of all rpc should be posted to the single thread it owns. 
-class GcsTaskManager : public rpc::TaskInfoHandler, public rpc::RayEventExportHandler { +class GcsTaskManager : public rpc::TaskInfoGcsServiceHandler, + public rpc::events::RayEventExportGcsServiceHandler { public: /// Create a GcsTaskManager. explicit GcsTaskManager(instrumented_io_context &io_service); @@ -469,6 +473,7 @@ class GcsTaskManager : public rpc::TaskInfoHandler, public rpc::RayEventExportHa std::vector> task_events_list_; friend class GcsTaskManager; + FRIEND_TEST(GcsTaskManagerTest, TestHandleAddEventBasic); FRIEND_TEST(GcsTaskManagerTest, TestHandleAddTaskEventBasic); FRIEND_TEST(GcsTaskManagerTest, TestMergeTaskEventsSameTaskAttempt); FRIEND_TEST(GcsTaskManagerMemoryLimitedTest, TestLimitTaskEvents); @@ -479,6 +484,8 @@ class GcsTaskManager : public rpc::TaskInfoHandler, public rpc::RayEventExportHa }; private: + void RecordTaskEventData(rpc::AddTaskEventDataRequest &request); + /// Record data loss from worker. /// /// TODO(rickyx): This will be updated to record task attempt loss properly. @@ -519,10 +526,15 @@ class GcsTaskManager : public rpc::TaskInfoHandler, public rpc::RayEventExportHa // the io_service_thread_. Access to it is *not* thread safe. std::unique_ptr task_event_storage_; + // Converter for converting RayEvents to TaskEvents. + std::unique_ptr ray_event_converter_; + /// The runner to run function periodically. 
std::shared_ptr periodical_runner_; + FRIEND_TEST(GcsTaskManagerTest, TestHandleAddEventBasic); FRIEND_TEST(GcsTaskManagerTest, TestHandleAddTaskEventBasic); + FRIEND_TEST(GcsTaskManagerTest, TestHandleAddEventsMultiJobGrouping); FRIEND_TEST(GcsTaskManagerTest, TestMergeTaskEventsSameTaskAttempt); FRIEND_TEST(GcsTaskManagerMemoryLimitedTest, TestLimitTaskEvents); FRIEND_TEST(GcsTaskManagerMemoryLimitedTest, TestIndexNoLeak); diff --git a/src/ray/gcs/gcs_server/gcs_worker_manager.cc b/src/ray/gcs/gcs_worker_manager.cc similarity index 84% rename from src/ray/gcs/gcs_server/gcs_worker_manager.cc rename to src/ray/gcs/gcs_worker_manager.cc index 42e8709d5f8d..3819112f75c6 100644 --- a/src/ray/gcs/gcs_server/gcs_worker_manager.cc +++ b/src/ray/gcs/gcs_worker_manager.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_server/gcs_worker_manager.h" +#include "ray/gcs/gcs_worker_manager.h" #include #include @@ -40,9 +40,9 @@ void GcsWorkerManager::HandleReportWorkerFailure( GetWorkerInfo( worker_id, {[this, reply, send_reply_callback, worker_id, request = std::move(request)]( - const std::optional &result) { + std::optional result) { const auto &worker_address = request.worker_failure().worker_address(); - const auto node_id = NodeID::FromBinary(worker_address.raylet_id()); + const auto node_id = NodeID::FromBinary(worker_address.node_id()); std::string message = absl::StrCat("Reporting worker exit, worker id = ", worker_id.Hex(), @@ -63,10 +63,10 @@ void GcsWorkerManager::HandleReportWorkerFailure( "are lots of this logs, that might indicate there are " "unexpected failures in the cluster."; } - auto worker_failure_data = std::make_shared(); - if (result) { - worker_failure_data->CopyFrom(*result); - } + auto worker_failure_data = + result.has_value() + ? 
std::make_shared(std::move(*result)) + : std::make_shared(); worker_failure_data->MergeFrom(request.worker_failure()); worker_failure_data->set_is_alive(false); @@ -75,29 +75,28 @@ void GcsWorkerManager::HandleReportWorkerFailure( } auto on_done = [this, - worker_address, - worker_id, node_id, + worker_id, worker_failure_data, reply, - send_reply_callback](const Status &status) { + send_reply_callback, + worker_ip_address = + worker_address.ip_address()](const Status &status) { if (!status.ok()) { RAY_LOG(ERROR).WithField(worker_id).WithField(node_id).WithField( - "worker_address", worker_address.ip_address()) + "worker_address", worker_ip_address) << "Failed to report worker failure"; } else { if (!IsIntentionalWorkerFailure(worker_failure_data->exit_type())) { - stats::UnintentionalWorkerFailures.Record(1); + ray_metric_unintentional_worker_failures_.Record(1); } - // Only publish worker_id and raylet_id in address as they are the only + // Only publish worker_id and node_id in address as they are the only // fields used by sub clients. rpc::WorkerDeltaData worker_failure; worker_failure.set_worker_id( worker_failure_data->worker_address().worker_id()); - worker_failure.set_raylet_id( - worker_failure_data->worker_address().raylet_id()); - RAY_CHECK_OK( - gcs_publisher_.PublishWorkerFailure(worker_id, worker_failure, nullptr)); + worker_failure.set_node_id(worker_failure_data->worker_address().node_id()); + gcs_publisher_.PublishWorkerFailure(worker_id, std::move(worker_failure)); } GCS_RPC_SEND_REPLY(send_reply_callback, reply, status); }; @@ -106,11 +105,8 @@ void GcsWorkerManager::HandleReportWorkerFailure( // receives the worker registration information first and then the worker failure // message, so we delete the get operation. 
Related issues: // https://github.com/ray-project/ray/pull/11599 - Status status = gcs_table_storage_.WorkerTable().Put( - worker_id, *worker_failure_data, {on_done, io_context_}); - if (!status.ok()) { - on_done(status); - } + gcs_table_storage_.WorkerTable().Put( + worker_id, *worker_failure_data, {std::move(on_done), io_context_}); if (request.worker_failure().exit_type() == rpc::WorkerExitType::SYSTEM_ERROR || request.worker_failure().exit_type() == @@ -201,10 +197,7 @@ void GcsWorkerManager::HandleGetAllWorkerInfo( RAY_LOG(DEBUG) << "Finished getting all worker info."; GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK()); }; - Status status = gcs_table_storage_.WorkerTable().GetAll({on_done, io_context_}); - if (!status.ok()) { - on_done(absl::flat_hash_map()); - } + gcs_table_storage_.WorkerTable().GetAll({std::move(on_done), io_context_}); } void GcsWorkerManager::HandleAddWorkerInfo(rpc::AddWorkerInfoRequest request, @@ -225,11 +218,7 @@ void GcsWorkerManager::HandleAddWorkerInfo(rpc::AddWorkerInfoRequest request, GCS_RPC_SEND_REPLY(send_reply_callback, reply, status); }; - Status status = gcs_table_storage_.WorkerTable().Put( - worker_id, *worker_data, {on_done, io_context_}); - if (!status.ok()) { - on_done(status); - } + gcs_table_storage_.WorkerTable().Put(worker_id, *worker_data, {on_done, io_context_}); } void GcsWorkerManager::HandleUpdateWorkerDebuggerPort( @@ -263,19 +252,13 @@ void GcsWorkerManager::HandleUpdateWorkerDebuggerPort( auto worker_data = std::make_shared(); worker_data->CopyFrom(*result); worker_data->set_debugger_port(debugger_port); - Status put_status = gcs_table_storage_.WorkerTable().Put( - worker_id, *worker_data, {on_worker_update_done, io_context_}); - if (!put_status.ok()) { - GCS_RPC_SEND_REPLY(send_reply_callback, reply, put_status); - } + gcs_table_storage_.WorkerTable().Put( + worker_id, *worker_data, {std::move(on_worker_update_done), io_context_}); } }; - Status status = - 
gcs_table_storage_.WorkerTable().Get(worker_id, {on_worker_get_done, io_context_}); - if (!status.ok()) { - GCS_RPC_SEND_REPLY(send_reply_callback, reply, status); - } + gcs_table_storage_.WorkerTable().Get(worker_id, + {std::move(on_worker_get_done), io_context_}); } void GcsWorkerManager::HandleUpdateWorkerNumPausedThreads( @@ -321,19 +304,13 @@ void GcsWorkerManager::HandleUpdateWorkerNumPausedThreads( worker_data->has_num_paused_threads() ? worker_data->num_paused_threads() : 0; worker_data->set_num_paused_threads(current_num_paused_threads + num_paused_threads_delta); - Status put_status = gcs_table_storage_.WorkerTable().Put( - worker_id, *worker_data, {on_worker_update_done, io_context_}); - if (!put_status.ok()) { - GCS_RPC_SEND_REPLY(send_reply_callback, reply, put_status); - } + gcs_table_storage_.WorkerTable().Put( + worker_id, *worker_data, {std::move(on_worker_update_done), io_context_}); } }; - Status status = - gcs_table_storage_.WorkerTable().Get(worker_id, {on_worker_get_done, io_context_}); - if (!status.ok()) { - GCS_RPC_SEND_REPLY(send_reply_callback, reply, status); - } + gcs_table_storage_.WorkerTable().Get(worker_id, + {std::move(on_worker_get_done), io_context_}); } void GcsWorkerManager::AddWorkerDeadListener( @@ -345,7 +322,7 @@ void GcsWorkerManager::AddWorkerDeadListener( void GcsWorkerManager::GetWorkerInfo( const WorkerID &worker_id, Postable)> callback) const { - RAY_CHECK_OK(gcs_table_storage_.WorkerTable().Get( + gcs_table_storage_.WorkerTable().Get( worker_id, std::move(callback).TransformArg( [worker_id](Status status, std::optional data) { @@ -354,7 +331,7 @@ void GcsWorkerManager::GetWorkerInfo( << "Failed to get worker info, status = " << status; } return data; - }))); + })); } } // namespace gcs diff --git a/src/ray/gcs/gcs_server/gcs_worker_manager.h b/src/ray/gcs/gcs_worker_manager.h similarity index 82% rename from src/ray/gcs/gcs_server/gcs_worker_manager.h rename to src/ray/gcs/gcs_worker_manager.h index 
d4858efcbd6e..062fb25d1d3d 100644 --- a/src/ray/gcs/gcs_server/gcs_worker_manager.h +++ b/src/ray/gcs/gcs_worker_manager.h @@ -16,21 +16,20 @@ #include -#include "ray/gcs/gcs_server/gcs_kv_manager.h" -#include "ray/gcs/gcs_server/gcs_table_storage.h" -#include "ray/gcs/gcs_server/usage_stats_client.h" -#include "ray/gcs/pubsub/gcs_pub_sub.h" -#include "ray/rpc/gcs/gcs_rpc_server.h" +#include "ray/gcs/gcs_kv_manager.h" +#include "ray/gcs/gcs_table_storage.h" +#include "ray/gcs/grpc_service_interfaces.h" +#include "ray/gcs/usage_stats_client.h" +#include "ray/pubsub/gcs_publisher.h" namespace ray { namespace gcs { -/// This implementation class of `WorkerInfoHandler`. -class GcsWorkerManager : public rpc::WorkerInfoHandler { +class GcsWorkerManager : public rpc::WorkerInfoGcsServiceHandler { public: GcsWorkerManager(gcs::GcsTableStorage &gcs_table_storage, instrumented_io_context &io_context, - GcsPublisher &gcs_publisher) + pubsub::GcsPublisher &gcs_publisher) : gcs_table_storage_(gcs_table_storage), io_context_(io_context), gcs_publisher_(gcs_publisher) {} @@ -74,7 +73,7 @@ class GcsWorkerManager : public rpc::WorkerInfoHandler { gcs::GcsTableStorage &gcs_table_storage_; instrumented_io_context &io_context_; - GcsPublisher &gcs_publisher_; + pubsub::GcsPublisher &gcs_publisher_; UsageStatsClient *usage_stats_client_; /// Only listens for unexpected worker deaths not expected like node death. @@ -86,6 +85,14 @@ class GcsWorkerManager : public rpc::WorkerInfoHandler { /// Tracks the number of occurences of worker crash due to OOM int32_t worker_crash_oom_count_ = 0; + + /// Ray metrics + ray::stats::Count ray_metric_unintentional_worker_failures_{ + /*name=*/"unintentional_worker_failures_total", + /*description=*/ + "Number of worker failures that are not intentional. 
For example, worker failures " + "due to system related errors.", + /*unit=*/""}; }; } // namespace gcs diff --git a/src/ray/gcs/grpc_service_interfaces.h b/src/ray/gcs/grpc_service_interfaces.h new file mode 100644 index 000000000000..e7d8dedf5a61 --- /dev/null +++ b/src/ray/gcs/grpc_service_interfaces.h @@ -0,0 +1,338 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines the gRPC service *INTERFACES* only. + * The subcomponent that handles a given interface should inherit from the relevant + * class. The target for the subcomponent should depend only on this file, not on + * grpc_services.h. 
+ */ + +#pragma once + +#include "ray/common/status.h" +#include "src/ray/protobuf/autoscaler.grpc.pb.h" +#include "src/ray/protobuf/gcs_service.grpc.pb.h" + +namespace ray { +namespace rpc { + +using SendReplyCallback = std::function success, std::function failure)>; + +#define GCS_RPC_SEND_REPLY(send_reply_callback, reply, status) \ + reply->mutable_status()->set_code(static_cast(status.code())); \ + reply->mutable_status()->set_message(status.message()); \ + send_reply_callback(ray::Status::OK(), nullptr, nullptr) + +class ActorInfoGcsServiceHandler { + public: + virtual ~ActorInfoGcsServiceHandler() = default; + + virtual void HandleRegisterActor(RegisterActorRequest request, + RegisterActorReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleRestartActorForLineageReconstruction( + RestartActorForLineageReconstructionRequest request, + RestartActorForLineageReconstructionReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleCreateActor(CreateActorRequest request, + CreateActorReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetActorInfo(GetActorInfoRequest request, + GetActorInfoReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetNamedActorInfo(GetNamedActorInfoRequest request, + GetNamedActorInfoReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleListNamedActors(rpc::ListNamedActorsRequest request, + rpc::ListNamedActorsReply *reply, + rpc::SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetAllActorInfo(GetAllActorInfoRequest request, + GetAllActorInfoReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleKillActorViaGcs(KillActorViaGcsRequest request, + KillActorViaGcsReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleReportActorOutOfScope(ReportActorOutOfScopeRequest request, + ReportActorOutOfScopeReply *reply, + 
SendReplyCallback send_reply_callback) = 0; +}; + +class NodeInfoGcsServiceHandler { + public: + virtual ~NodeInfoGcsServiceHandler() = default; + + virtual void HandleGetClusterId(GetClusterIdRequest request, + GetClusterIdReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleRegisterNode(RegisterNodeRequest request, + RegisterNodeReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleUnregisterNode(UnregisterNodeRequest request, + UnregisterNodeReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleCheckAlive(CheckAliveRequest request, + CheckAliveReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleDrainNode(DrainNodeRequest request, + DrainNodeReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetAllNodeInfo(GetAllNodeInfoRequest request, + GetAllNodeInfoReply *reply, + SendReplyCallback send_reply_callback) = 0; +}; + +class NodeResourceInfoGcsServiceHandler { + public: + virtual ~NodeResourceInfoGcsServiceHandler() = default; + + virtual void HandleGetAllAvailableResources(GetAllAvailableResourcesRequest request, + GetAllAvailableResourcesReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetAllTotalResources(GetAllTotalResourcesRequest request, + GetAllTotalResourcesReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetDrainingNodes(GetDrainingNodesRequest request, + GetDrainingNodesReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetAllResourceUsage(GetAllResourceUsageRequest request, + GetAllResourceUsageReply *reply, + SendReplyCallback send_reply_callback) = 0; +}; + +class InternalPubSubGcsServiceHandler { + public: + virtual ~InternalPubSubGcsServiceHandler() = default; + + virtual void HandleGcsPublish(GcsPublishRequest request, + GcsPublishReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual 
void HandleGcsSubscriberPoll(GcsSubscriberPollRequest request, + GcsSubscriberPollReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGcsSubscriberCommandBatch(GcsSubscriberCommandBatchRequest request, + GcsSubscriberCommandBatchReply *reply, + SendReplyCallback send_reply_callback) = 0; +}; + +class JobInfoGcsServiceHandler { + public: + using JobFinishListenerCallback = std::function; + + virtual ~JobInfoGcsServiceHandler() = default; + + virtual void HandleAddJob(AddJobRequest request, + AddJobReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleMarkJobFinished(MarkJobFinishedRequest request, + MarkJobFinishedReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetAllJobInfo(GetAllJobInfoRequest request, + GetAllJobInfoReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void AddJobFinishedListener(JobFinishListenerCallback listener) = 0; + + virtual void HandleReportJobError(ReportJobErrorRequest request, + ReportJobErrorReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetNextJobID(GetNextJobIDRequest request, + GetNextJobIDReply *reply, + SendReplyCallback send_reply_callback) = 0; +}; + +class RuntimeEnvGcsServiceHandler { + public: + virtual ~RuntimeEnvGcsServiceHandler() = default; + + virtual void HandlePinRuntimeEnvURI(PinRuntimeEnvURIRequest request, + PinRuntimeEnvURIReply *reply, + SendReplyCallback send_reply_callback) = 0; +}; + +class WorkerInfoGcsServiceHandler { + public: + virtual ~WorkerInfoGcsServiceHandler() = default; + + virtual void HandleReportWorkerFailure(ReportWorkerFailureRequest request, + ReportWorkerFailureReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetWorkerInfo(GetWorkerInfoRequest request, + GetWorkerInfoReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetAllWorkerInfo(GetAllWorkerInfoRequest request, + 
GetAllWorkerInfoReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleAddWorkerInfo(AddWorkerInfoRequest request, + AddWorkerInfoReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleUpdateWorkerDebuggerPort(UpdateWorkerDebuggerPortRequest request, + UpdateWorkerDebuggerPortReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleUpdateWorkerNumPausedThreads( + UpdateWorkerNumPausedThreadsRequest request, + UpdateWorkerNumPausedThreadsReply *reply, + SendReplyCallback send_reply_callback) = 0; +}; + +class InternalKVGcsServiceHandler { + public: + virtual ~InternalKVGcsServiceHandler() = default; + virtual void HandleInternalKVKeys(InternalKVKeysRequest request, + InternalKVKeysReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleInternalKVGet(InternalKVGetRequest request, + InternalKVGetReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleInternalKVMultiGet(InternalKVMultiGetRequest request, + InternalKVMultiGetReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleInternalKVPut(InternalKVPutRequest request, + InternalKVPutReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleInternalKVDel(InternalKVDelRequest request, + InternalKVDelReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleInternalKVExists(InternalKVExistsRequest request, + InternalKVExistsReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetInternalConfig(GetInternalConfigRequest request, + GetInternalConfigReply *reply, + SendReplyCallback send_reply_callback) = 0; +}; + +class TaskInfoGcsServiceHandler { + public: + virtual ~TaskInfoGcsServiceHandler() = default; + + virtual void HandleAddTaskEventData(AddTaskEventDataRequest request, + AddTaskEventDataReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void 
HandleGetTaskEvents(GetTaskEventsRequest request, + GetTaskEventsReply *reply, + SendReplyCallback send_reply_callback) = 0; +}; + +class PlacementGroupInfoGcsServiceHandler { + public: + virtual ~PlacementGroupInfoGcsServiceHandler() = default; + + virtual void HandleCreatePlacementGroup(CreatePlacementGroupRequest request, + CreatePlacementGroupReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleRemovePlacementGroup(RemovePlacementGroupRequest request, + RemovePlacementGroupReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetPlacementGroup(GetPlacementGroupRequest request, + GetPlacementGroupReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetAllPlacementGroup(GetAllPlacementGroupRequest request, + GetAllPlacementGroupReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleWaitPlacementGroupUntilReady( + WaitPlacementGroupUntilReadyRequest request, + WaitPlacementGroupUntilReadyReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetNamedPlacementGroup(GetNamedPlacementGroupRequest request, + GetNamedPlacementGroupReply *reply, + SendReplyCallback send_reply_callback) = 0; +}; + +namespace autoscaler { + +class AutoscalerStateServiceHandler { + public: + virtual ~AutoscalerStateServiceHandler() = default; + + virtual void HandleGetClusterResourceState(GetClusterResourceStateRequest request, + GetClusterResourceStateReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleReportAutoscalingState(ReportAutoscalingStateRequest request, + ReportAutoscalingStateReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleRequestClusterResourceConstraint( + RequestClusterResourceConstraintRequest request, + RequestClusterResourceConstraintReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleGetClusterStatus(GetClusterStatusRequest request, 
+ GetClusterStatusReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleDrainNode(DrainNodeRequest request, + DrainNodeReply *reply, + SendReplyCallback send_reply_callback) = 0; + + virtual void HandleReportClusterConfig(ReportClusterConfigRequest request, + ReportClusterConfigReply *reply, + SendReplyCallback send_reply_callback) = 0; +}; + +} // namespace autoscaler + +namespace events { + +class RayEventExportGcsServiceHandler { + public: + virtual ~RayEventExportGcsServiceHandler() = default; + virtual void HandleAddEvents(events::AddEventsRequest request, + events::AddEventsReply *reply, + SendReplyCallback send_reply_callback) = 0; +}; + +} // namespace events + +} // namespace rpc +} // namespace ray diff --git a/src/ray/gcs/grpc_services.cc b/src/ray/gcs/grpc_services.cc new file mode 100644 index 000000000000..012f81537a55 --- /dev/null +++ b/src/ray/gcs/grpc_services.cc @@ -0,0 +1,196 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "ray/gcs/grpc_services.h" + +#include +#include + +namespace ray { +namespace rpc { + +void ActorInfoGrpcService::InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) { + /// The register & create actor RPCs take a long time, so we shouldn't limit their + /// concurrency to avoid distributed deadlock. 
+ RPC_SERVICE_HANDLER(ActorInfoGcsService, RegisterActor, -1) + RPC_SERVICE_HANDLER(ActorInfoGcsService, CreateActor, -1) + RPC_SERVICE_HANDLER(ActorInfoGcsService, RestartActorForLineageReconstruction, -1) + + RPC_SERVICE_HANDLER(ActorInfoGcsService, GetActorInfo, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(ActorInfoGcsService, GetAllActorInfo, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + ActorInfoGcsService, GetNamedActorInfo, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(ActorInfoGcsService, ListNamedActors, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(ActorInfoGcsService, KillActorViaGcs, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + ActorInfoGcsService, ReportActorOutOfScope, max_active_rpcs_per_handler_) +} + +void NodeInfoGrpcService::InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) { + // We only allow one cluster ID in the lifetime of a client. + // So, if a client connects, it should not have a pre-existing different ID. 
+ RPC_SERVICE_HANDLER_CUSTOM_AUTH(NodeInfoGcsService, + GetClusterId, + max_active_rpcs_per_handler_, + AuthType::EMPTY_AUTH); + RPC_SERVICE_HANDLER(NodeInfoGcsService, RegisterNode, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(NodeInfoGcsService, UnregisterNode, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(NodeInfoGcsService, DrainNode, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(NodeInfoGcsService, GetAllNodeInfo, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(NodeInfoGcsService, CheckAlive, max_active_rpcs_per_handler_) +} + +void NodeResourceInfoGrpcService::InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) { + RPC_SERVICE_HANDLER( + NodeResourceInfoGcsService, GetAllAvailableResources, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + NodeResourceInfoGcsService, GetAllTotalResources, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + NodeResourceInfoGcsService, GetDrainingNodes, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + NodeResourceInfoGcsService, GetAllResourceUsage, max_active_rpcs_per_handler_) +} + +void InternalPubSubGrpcService::InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) { + RPC_SERVICE_HANDLER(InternalPubSubGcsService, GcsPublish, max_active_rpcs_per_handler_); + RPC_SERVICE_HANDLER( + InternalPubSubGcsService, GcsSubscriberPoll, max_active_rpcs_per_handler_); + RPC_SERVICE_HANDLER( + InternalPubSubGcsService, GcsSubscriberCommandBatch, max_active_rpcs_per_handler_); +} + +void JobInfoGrpcService::InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) { + RPC_SERVICE_HANDLER(JobInfoGcsService, AddJob, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(JobInfoGcsService, MarkJobFinished, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(JobInfoGcsService, GetAllJobInfo, 
max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(JobInfoGcsService, ReportJobError, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(JobInfoGcsService, GetNextJobID, max_active_rpcs_per_handler_) +} + +void RuntimeEnvGrpcService::InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) { + RPC_SERVICE_HANDLER( + RuntimeEnvGcsService, PinRuntimeEnvURI, max_active_rpcs_per_handler_) +} + +void WorkerInfoGrpcService::InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) { + RPC_SERVICE_HANDLER( + WorkerInfoGcsService, ReportWorkerFailure, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(WorkerInfoGcsService, GetWorkerInfo, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + WorkerInfoGcsService, GetAllWorkerInfo, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(WorkerInfoGcsService, AddWorkerInfo, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + WorkerInfoGcsService, UpdateWorkerDebuggerPort, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + WorkerInfoGcsService, UpdateWorkerNumPausedThreads, max_active_rpcs_per_handler_) +} + +void InternalKVGrpcService::InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) { + RPC_SERVICE_HANDLER(InternalKVGcsService, InternalKVGet, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + InternalKVGcsService, InternalKVMultiGet, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(InternalKVGcsService, InternalKVPut, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(InternalKVGcsService, InternalKVDel, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + InternalKVGcsService, InternalKVExists, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(InternalKVGcsService, InternalKVKeys, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + InternalKVGcsService, GetInternalConfig, 
max_active_rpcs_per_handler_) +} + +void TaskInfoGrpcService::InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) { + RPC_SERVICE_HANDLER(TaskInfoGcsService, AddTaskEventData, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(TaskInfoGcsService, GetTaskEvents, max_active_rpcs_per_handler_) +} + +void PlacementGroupInfoGrpcService::InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) { + RPC_SERVICE_HANDLER( + PlacementGroupInfoGcsService, CreatePlacementGroup, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + PlacementGroupInfoGcsService, RemovePlacementGroup, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + PlacementGroupInfoGcsService, GetPlacementGroup, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + PlacementGroupInfoGcsService, GetNamedPlacementGroup, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + PlacementGroupInfoGcsService, GetAllPlacementGroup, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(PlacementGroupInfoGcsService, + WaitPlacementGroupUntilReady, + max_active_rpcs_per_handler_) +} + +namespace autoscaler { + +void AutoscalerStateGrpcService::InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) { + RPC_SERVICE_HANDLER( + AutoscalerStateService, GetClusterResourceState, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + AutoscalerStateService, ReportAutoscalingState, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + AutoscalerStateService, ReportClusterConfig, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(AutoscalerStateService, + RequestClusterResourceConstraint, + max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER( + AutoscalerStateService, GetClusterStatus, max_active_rpcs_per_handler_) + RPC_SERVICE_HANDLER(AutoscalerStateService, DrainNode, max_active_rpcs_per_handler_) +} + +} // namespace 
autoscaler + +namespace events { + +void RayEventExportGrpcService::InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) { + RPC_SERVICE_HANDLER(RayEventExportGcsService, AddEvents, max_active_rpcs_per_handler_) +} + +} // namespace events + +} // namespace rpc +} // namespace ray diff --git a/src/ray/gcs/grpc_services.h b/src/ray/gcs/grpc_services.h new file mode 100644 index 000000000000..9a7862d334b5 --- /dev/null +++ b/src/ray/gcs/grpc_services.h @@ -0,0 +1,324 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines the gRPC service handlers for the GCS server binary. + * Subcomponents that implement a given interface should inherit from the relevant + * class in grpc_service_interfaces.h. + * + * The GCS server main binary should be the only user of this target. 
+ */ + +#pragma once + +#include +#include + +#include "ray/common/asio/instrumented_io_context.h" +#include "ray/common/id.h" +#include "ray/gcs/grpc_service_interfaces.h" +#include "ray/rpc/grpc_server.h" +#include "ray/rpc/server_call.h" +#include "src/ray/protobuf/autoscaler.grpc.pb.h" +#include "src/ray/protobuf/gcs_service.grpc.pb.h" + +namespace ray { +namespace rpc { + +class ActorInfoGrpcService : public GrpcService { + public: + explicit ActorInfoGrpcService(instrumented_io_context &io_service, + ActorInfoGcsServiceHandler &service_handler, + int64_t max_active_rpcs_per_handler) + : GrpcService(io_service), + service_handler_(service_handler), + max_active_rpcs_per_handler_(max_active_rpcs_per_handler) {} + + protected: + grpc::Service &GetGrpcService() override { return service_; } + + void InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) override; + + private: + ActorInfoGcsService::AsyncService service_; + ActorInfoGcsServiceHandler &service_handler_; + int64_t max_active_rpcs_per_handler_; +}; + +class NodeInfoGrpcService : public GrpcService { + public: + explicit NodeInfoGrpcService(instrumented_io_context &io_service, + NodeInfoGcsServiceHandler &service_handler, + int64_t max_active_rpcs_per_handler) + : GrpcService(io_service), + service_handler_(service_handler), + max_active_rpcs_per_handler_(max_active_rpcs_per_handler) {} + + protected: + grpc::Service &GetGrpcService() override { return service_; } + + void InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) override; + + private: + NodeInfoGcsService::AsyncService service_; + NodeInfoGcsServiceHandler &service_handler_; + int64_t max_active_rpcs_per_handler_; +}; + +class NodeResourceInfoGrpcService : public GrpcService { + public: + explicit NodeResourceInfoGrpcService(instrumented_io_context &io_service, + NodeResourceInfoGcsServiceHandler 
&handler, + int64_t max_active_rpcs_per_handler) + : GrpcService(io_service), + service_handler_(handler), + max_active_rpcs_per_handler_(max_active_rpcs_per_handler){}; + + protected: + grpc::Service &GetGrpcService() override { return service_; } + + void InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) override; + + private: + NodeResourceInfoGcsService::AsyncService service_; + NodeResourceInfoGcsServiceHandler &service_handler_; + int64_t max_active_rpcs_per_handler_; +}; + +class InternalPubSubGrpcService : public GrpcService { + public: + InternalPubSubGrpcService(instrumented_io_context &io_service, + InternalPubSubGcsServiceHandler &handler, + int64_t max_active_rpcs_per_handler) + : GrpcService(io_service), + service_handler_(handler), + max_active_rpcs_per_handler_(max_active_rpcs_per_handler) {} + + protected: + grpc::Service &GetGrpcService() override { return service_; } + + void InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) override; + + private: + InternalPubSubGcsService::AsyncService service_; + InternalPubSubGcsServiceHandler &service_handler_; + int64_t max_active_rpcs_per_handler_; +}; + +class JobInfoGrpcService : public GrpcService { + public: + explicit JobInfoGrpcService(instrumented_io_context &io_service, + JobInfoGcsServiceHandler &handler, + int64_t max_active_rpcs_per_handler) + : GrpcService(io_service), + service_handler_(handler), + max_active_rpcs_per_handler_(max_active_rpcs_per_handler){}; + + protected: + grpc::Service &GetGrpcService() override { return service_; } + + void InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) override; + + private: + JobInfoGcsService::AsyncService service_; + JobInfoGcsServiceHandler &service_handler_; + int64_t max_active_rpcs_per_handler_; +}; + +class 
RuntimeEnvGrpcService : public GrpcService { + public: + explicit RuntimeEnvGrpcService(instrumented_io_context &io_service, + RuntimeEnvGcsServiceHandler &handler, + int64_t max_active_rpcs_per_handler) + : GrpcService(io_service), + service_handler_(handler), + max_active_rpcs_per_handler_(max_active_rpcs_per_handler) {} + + protected: + grpc::Service &GetGrpcService() override { return service_; } + + void InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) override; + + private: + RuntimeEnvGcsService::AsyncService service_; + RuntimeEnvGcsServiceHandler &service_handler_; + int64_t max_active_rpcs_per_handler_; +}; + +class WorkerInfoGrpcService : public GrpcService { + public: + explicit WorkerInfoGrpcService(instrumented_io_context &io_service, + WorkerInfoGcsServiceHandler &handler, + int64_t max_active_rpcs_per_handler) + : GrpcService(io_service), + service_handler_(handler), + max_active_rpcs_per_handler_(max_active_rpcs_per_handler){}; + + protected: + grpc::Service &GetGrpcService() override { return service_; } + + void InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) override; + + private: + WorkerInfoGcsService::AsyncService service_; + WorkerInfoGcsServiceHandler &service_handler_; + int64_t max_active_rpcs_per_handler_; +}; + +class InternalKVGrpcService : public GrpcService { + public: + explicit InternalKVGrpcService(instrumented_io_context &io_service, + InternalKVGcsServiceHandler &handler, + int64_t max_active_rpcs_per_handler) + : GrpcService(io_service), + service_handler_(handler), + max_active_rpcs_per_handler_(max_active_rpcs_per_handler){}; + + protected: + grpc::Service &GetGrpcService() override { return service_; } + + void InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) override; + + private: + 
InternalKVGcsService::AsyncService service_; + InternalKVGcsServiceHandler &service_handler_; + int64_t max_active_rpcs_per_handler_; +}; + +class TaskInfoGrpcService : public GrpcService { + public: + explicit TaskInfoGrpcService(instrumented_io_context &io_service, + TaskInfoGcsServiceHandler &handler, + int64_t max_active_rpcs_per_handler) + : GrpcService(io_service), + service_handler_(handler), + max_active_rpcs_per_handler_(max_active_rpcs_per_handler){}; + + protected: + grpc::Service &GetGrpcService() override { return service_; } + + void InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) override; + + private: + TaskInfoGcsService::AsyncService service_; + TaskInfoGcsServiceHandler &service_handler_; + int64_t max_active_rpcs_per_handler_; +}; + +class PlacementGroupInfoGrpcService : public GrpcService { + public: + explicit PlacementGroupInfoGrpcService(instrumented_io_context &io_service, + PlacementGroupInfoGcsServiceHandler &handler, + int64_t max_active_rpcs_per_handler) + : GrpcService(io_service), + service_handler_(handler), + max_active_rpcs_per_handler_(max_active_rpcs_per_handler) {} + + protected: + grpc::Service &GetGrpcService() override { return service_; } + + void InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) override; + + private: + PlacementGroupInfoGcsService::AsyncService service_; + PlacementGroupInfoGcsServiceHandler &service_handler_; + int64_t max_active_rpcs_per_handler_; +}; + +namespace autoscaler { + +class AutoscalerStateGrpcService : public GrpcService { + public: + explicit AutoscalerStateGrpcService(instrumented_io_context &io_service, + AutoscalerStateServiceHandler &handler, + int64_t max_active_rpcs_per_handler) + : GrpcService(io_service), + service_handler_(handler), + max_active_rpcs_per_handler_(max_active_rpcs_per_handler){}; + + protected: + grpc::Service 
&GetGrpcService() override { return service_; } + + void InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) override; + + private: + AutoscalerStateService::AsyncService service_; + AutoscalerStateServiceHandler &service_handler_; + int64_t max_active_rpcs_per_handler_; +}; + +} // namespace autoscaler + +namespace events { + +class RayEventExportGrpcService : public GrpcService { + public: + explicit RayEventExportGrpcService(instrumented_io_context &io_service, + RayEventExportGcsServiceHandler &handler, + int64_t max_active_rpcs_per_handler) + : GrpcService(io_service), + service_handler_(handler), + max_active_rpcs_per_handler_(max_active_rpcs_per_handler){}; + + protected: + grpc::Service &GetGrpcService() override { return service_; } + + void InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id) override; + + private: + RayEventExportGcsService::AsyncService service_; + RayEventExportGcsServiceHandler &service_handler_; + int64_t max_active_rpcs_per_handler_; +}; + +} // namespace events + +} // namespace rpc +} // namespace ray diff --git a/src/ray/gcs/pb_utils.cc b/src/ray/gcs/pb_utils.cc deleted file mode 100644 index 1c510b9b5902..000000000000 --- a/src/ray/gcs/pb_utils.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2024 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// TODO(hjiang): Move all functions from `pb_utils.h` to this implementation file. -#include -#include -#include -#include - -#include "absl/strings/str_format.h" -#include "ray/gcs/pb_util.h" - -namespace ray::gcs { - -std::shared_ptr CreateErrorTableData( - const std::string &error_type, - const std::string &error_msg, - absl::Time timestamp, - const JobID &job_id) { - uint32_t max_error_msg_size_bytes = RayConfig::instance().max_error_msg_size_bytes(); - auto error_info_ptr = std::make_shared(); - error_info_ptr->set_type(error_type); - if (error_msg.length() > max_error_msg_size_bytes) { - std::string formatted_error_message = absl::StrFormat( - "The message size exceeds %d bytes. Find the full log from the log files. Here " - "is abstract: %s", - max_error_msg_size_bytes, - std::string_view{error_msg}.substr(0, max_error_msg_size_bytes)); - error_info_ptr->set_error_message(std::move(formatted_error_message)); - } else { - error_info_ptr->set_error_message(error_msg); - } - error_info_ptr->set_timestamp(absl::ToUnixMillis(timestamp)); - error_info_ptr->set_job_id(job_id.Binary()); - return error_info_ptr; -} - -} // namespace ray::gcs diff --git a/src/ray/gcs/pubsub/BUILD.bazel b/src/ray/gcs/pubsub/BUILD.bazel deleted file mode 100644 index e5a50a771846..000000000000 --- a/src/ray/gcs/pubsub/BUILD.bazel +++ /dev/null @@ -1,15 +0,0 @@ -load("//bazel:ray.bzl", "ray_cc_library") - -ray_cc_library( - name = "gcs_pub_sub_lib", - srcs = ["gcs_pub_sub.cc"], - hdrs = ["gcs_pub_sub.h"], - deps = [ - "//src/ray/common:ray_config", - "//src/ray/gcs:gcs_callback", - "//src/ray/gcs:gcs_redis_client", - "//src/ray/pubsub:publisher", - "//src/ray/pubsub:subscriber", - "//src/ray/rpc:gcs_client", - ], -) diff --git a/src/ray/gcs/pubsub/gcs_pub_sub.cc b/src/ray/gcs/pubsub/gcs_pub_sub.cc deleted file mode 100644 index ea722b03f5e1..000000000000 --- a/src/ray/gcs/pubsub/gcs_pub_sub.cc +++ /dev/null @@ -1,397 +0,0 @@ -// Copyright 2017 The Ray Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ray/gcs/pubsub/gcs_pub_sub.h" - -#include -#include -#include -#include - -#include "ray/rpc/gcs/gcs_rpc_client.h" - -namespace ray { -namespace gcs { - -Status GcsPublisher::PublishActor(const ActorID &id, - rpc::ActorTableData message, - const StatusCallback &done) { - rpc::PubMessage msg; - msg.set_channel_type(rpc::ChannelType::GCS_ACTOR_CHANNEL); - msg.set_key_id(id.Binary()); - *msg.mutable_actor_message() = std::move(message); - publisher_->Publish(std::move(msg)); - if (done != nullptr) { - done(Status::OK()); - } - return Status::OK(); -} - -Status GcsPublisher::PublishJob(const JobID &id, - const rpc::JobTableData &message, - const StatusCallback &done) { - rpc::PubMessage msg; - msg.set_channel_type(rpc::ChannelType::GCS_JOB_CHANNEL); - msg.set_key_id(id.Binary()); - *msg.mutable_job_message() = message; - publisher_->Publish(std::move(msg)); - if (done != nullptr) { - done(Status::OK()); - } - return Status::OK(); -} - -Status GcsPublisher::PublishNodeInfo(const NodeID &id, - const rpc::GcsNodeInfo &message, - const StatusCallback &done) { - rpc::PubMessage msg; - msg.set_channel_type(rpc::ChannelType::GCS_NODE_INFO_CHANNEL); - msg.set_key_id(id.Binary()); - *msg.mutable_node_info_message() = message; - publisher_->Publish(std::move(msg)); - if (done != nullptr) { - done(Status::OK()); - } - return Status::OK(); -} - -Status GcsPublisher::PublishWorkerFailure(const WorkerID &id, - 
const rpc::WorkerDeltaData &message, - const StatusCallback &done) { - rpc::PubMessage msg; - msg.set_channel_type(rpc::ChannelType::GCS_WORKER_DELTA_CHANNEL); - msg.set_key_id(id.Binary()); - *msg.mutable_worker_delta_message() = message; - publisher_->Publish(std::move(msg)); - if (done != nullptr) { - done(Status::OK()); - } - return Status::OK(); -} - -Status GcsPublisher::PublishError(const std::string &id, - const rpc::ErrorTableData &message, - const StatusCallback &done) { - rpc::PubMessage msg; - msg.set_channel_type(rpc::ChannelType::RAY_ERROR_INFO_CHANNEL); - msg.set_key_id(id); - *msg.mutable_error_info_message() = message; - publisher_->Publish(std::move(msg)); - if (done != nullptr) { - done(Status::OK()); - } - return Status::OK(); -} - -std::string GcsPublisher::DebugString() const { return publisher_->DebugString(); } - -Status GcsSubscriber::SubscribeAllJobs( - const SubscribeCallback &subscribe, - const StatusCallback &done) { - // GCS subscriber. - auto subscribe_item_callback = [subscribe](rpc::PubMessage &&msg) { - RAY_CHECK(msg.channel_type() == rpc::ChannelType::GCS_JOB_CHANNEL); - const JobID id = JobID::FromBinary(msg.key_id()); - subscribe(id, std::move(*msg.mutable_job_message())); - }; - auto subscription_failure_callback = [](const std::string &, const Status &status) { - RAY_LOG(WARNING) << "Subscription to Job channel failed: " << status.ToString(); - }; - // Ignore if the subscription already exists, because the resubscription is intentional. - RAY_UNUSED(subscriber_->SubscribeChannel( - std::make_unique(), - rpc::ChannelType::GCS_JOB_CHANNEL, - gcs_address_, - [done](Status status) { - if (done != nullptr) { - done(status); - } - }, - std::move(subscribe_item_callback), - std::move(subscription_failure_callback))); - return Status::OK(); -} - -Status GcsSubscriber::SubscribeActor( - const ActorID &id, - const SubscribeCallback &subscribe, - const StatusCallback &done) { - // GCS subscriber. 
- auto subscription_callback = [id, subscribe](rpc::PubMessage &&msg) { - RAY_CHECK(msg.channel_type() == rpc::ChannelType::GCS_ACTOR_CHANNEL); - RAY_CHECK(msg.key_id() == id.Binary()); - subscribe(id, std::move(*msg.mutable_actor_message())); - }; - auto subscription_failure_callback = [id](const std::string &failed_id, - const Status &status) { - RAY_CHECK(failed_id == id.Binary()); - RAY_LOG(WARNING) << "Subscription to Actor " << id.Hex() - << " failed: " << status.ToString(); - }; - // Ignore if the subscription already exists, because the resubscription is intentional. - RAY_UNUSED(subscriber_->Subscribe( - std::make_unique(), - rpc::ChannelType::GCS_ACTOR_CHANNEL, - gcs_address_, - id.Binary(), - [done](Status status) { - if (done != nullptr) { - done(status); - } - }, - std::move(subscription_callback), - std::move(subscription_failure_callback))); - return Status::OK(); -} - -Status GcsSubscriber::UnsubscribeActor(const ActorID &id) { - subscriber_->Unsubscribe( - rpc::ChannelType::GCS_ACTOR_CHANNEL, gcs_address_, id.Binary()); - return Status::OK(); -} - -bool GcsSubscriber::IsActorUnsubscribed(const ActorID &id) { - return !subscriber_->IsSubscribed( - rpc::ChannelType::GCS_ACTOR_CHANNEL, gcs_address_, id.Binary()); -} - -void GcsSubscriber::SubscribeAllNodeInfo(const ItemCallback &subscribe, - const StatusCallback &done) { - // GCS subscriber. - auto subscribe_item_callback = [subscribe](rpc::PubMessage &&msg) { - RAY_CHECK(msg.channel_type() == rpc::ChannelType::GCS_NODE_INFO_CHANNEL); - subscribe(std::move(*msg.mutable_node_info_message())); - }; - auto subscription_failure_callback = [](const std::string &, const Status &status) { - RAY_LOG(WARNING) << "Subscription to NodeInfo channel failed: " << status.ToString(); - }; - // Ignore if the subscription already exists, because the resubscription is intentional. 
- RAY_UNUSED(subscriber_->SubscribeChannel( - std::make_unique(), - rpc::ChannelType::GCS_NODE_INFO_CHANNEL, - gcs_address_, - [done](Status status) { - if (done != nullptr) { - done(status); - } - }, - std::move(subscribe_item_callback), - std::move(subscription_failure_callback))); -} - -Status GcsSubscriber::SubscribeAllWorkerFailures( - const ItemCallback &subscribe, const StatusCallback &done) { - auto subscribe_item_callback = [subscribe](rpc::PubMessage &&msg) { - RAY_CHECK(msg.channel_type() == rpc::ChannelType::GCS_WORKER_DELTA_CHANNEL); - subscribe(std::move(*msg.mutable_worker_delta_message())); - }; - auto subscription_failure_callback = [](const std::string &, const Status &status) { - RAY_LOG(WARNING) << "Subscription to WorkerDelta channel failed: " - << status.ToString(); - }; - // Ignore if the subscription already exists, because the resubscription is intentional. - RAY_UNUSED(subscriber_->SubscribeChannel( - std::make_unique(), - rpc::ChannelType::GCS_WORKER_DELTA_CHANNEL, - gcs_address_, - /*subscribe_done_callback=*/ - [done](Status status) { - if (done != nullptr) { - done(status); - } - }, - std::move(subscribe_item_callback), - std::move(subscription_failure_callback))); - return Status::OK(); -} - -std::vector PythonGetLogBatchLines(const rpc::LogBatch &log_batch) { - return std::vector(log_batch.lines().begin(), log_batch.lines().end()); -} - -PythonGcsSubscriber::PythonGcsSubscriber(const std::string &gcs_address, - int gcs_port, - rpc::ChannelType channel_type, - const std::string &subscriber_id, - const std::string &worker_id) - : channel_type_(channel_type), - subscriber_id_(subscriber_id), - publisher_id_(""), - worker_id_(worker_id), - max_processed_sequence_id_(0), - closed_(false) { - channel_ = rpc::GcsRpcClient::CreateGcsChannel(gcs_address, gcs_port); - pubsub_stub_ = rpc::InternalPubSubGcsService::NewStub(channel_); -} - -Status PythonGcsSubscriber::Subscribe() { - absl::MutexLock lock(&mu_); - - if (closed_) { - return 
Status::OK(); - } - - grpc::ClientContext context; - - rpc::GcsSubscriberCommandBatchRequest request; - request.set_subscriber_id(subscriber_id_); - request.set_sender_id(worker_id_); - auto *cmd = request.add_commands(); - cmd->set_channel_type(channel_type_); - cmd->mutable_subscribe_message(); - - rpc::GcsSubscriberCommandBatchReply reply; - grpc::Status status = - pubsub_stub_->GcsSubscriberCommandBatch(&context, request, &reply); - - if (status.ok()) { - return Status::OK(); - } else { - return Status::RpcError(status.error_message(), status.error_code()); - } -} - -Status PythonGcsSubscriber::DoPoll(int64_t timeout_ms, rpc::PubMessage *message) { - absl::MutexLock lock(&mu_); - - while (queue_.empty()) { - if (closed_) { - return Status::OK(); - } - current_polling_context_ = std::make_shared(); - if (timeout_ms != -1) { - current_polling_context_->set_deadline(std::chrono::system_clock::now() + - std::chrono::milliseconds(timeout_ms)); - } - rpc::GcsSubscriberPollRequest request; - request.set_subscriber_id(subscriber_id_); - request.set_max_processed_sequence_id(max_processed_sequence_id_); - request.set_publisher_id(publisher_id_); - - rpc::GcsSubscriberPollReply reply; - auto context = current_polling_context_; - // Drop the lock while in RPC - mu_.Unlock(); - grpc::Status status = pubsub_stub_->GcsSubscriberPoll(context.get(), request, &reply); - mu_.Lock(); - - if (status.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED || - status.error_code() == grpc::StatusCode::UNAVAILABLE) { - return Status::OK(); - } - if (status.error_code() == grpc::StatusCode::CANCELLED) { - // This channel was shut down via Close() - return Status::OK(); - } - if (status.error_code() != grpc::StatusCode::OK) { - return Status::Invalid(status.error_message()); - } - - if (publisher_id_ != reply.publisher_id()) { - if (publisher_id_ != "") { - RAY_LOG(DEBUG) << "Replied publisher_id " << reply.publisher_id() - << " different from " << publisher_id_ - << ", this should only 
happen" - << " during GCS failover."; - } - publisher_id_ = reply.publisher_id(); - max_processed_sequence_id_ = 0; - } - last_batch_size_ = reply.pub_messages().size(); - for (auto &cur_pub_msg : reply.pub_messages()) { - if (cur_pub_msg.sequence_id() <= max_processed_sequence_id_) { - RAY_LOG(WARNING) << "Ignoring out of order message " << cur_pub_msg.sequence_id(); - continue; - } - max_processed_sequence_id_ = cur_pub_msg.sequence_id(); - if (cur_pub_msg.channel_type() != channel_type_) { - RAY_LOG(WARNING) << "Ignoring message from unsubscribed channel " - << cur_pub_msg.channel_type(); - continue; - } - queue_.emplace_back(std::move(cur_pub_msg)); - } - } - - *message = queue_.front(); - queue_.pop_front(); - - return Status::OK(); -} - -Status PythonGcsSubscriber::PollError(std::string *key_id, - int64_t timeout_ms, - rpc::ErrorTableData *data) { - rpc::PubMessage message; - RAY_RETURN_NOT_OK(DoPoll(timeout_ms, &message)); - *key_id = message.key_id(); - *data = message.error_info_message(); - return Status::OK(); -} - -Status PythonGcsSubscriber::PollLogs(std::string *key_id, - int64_t timeout_ms, - rpc::LogBatch *data) { - rpc::PubMessage message; - RAY_RETURN_NOT_OK(DoPoll(timeout_ms, &message)); - *key_id = message.key_id(); - *data = message.log_batch_message(); - return Status::OK(); -} - -Status PythonGcsSubscriber::PollActor(std::string *key_id, - int64_t timeout_ms, - rpc::ActorTableData *data) { - rpc::PubMessage message; - RAY_RETURN_NOT_OK(DoPoll(timeout_ms, &message)); - *key_id = message.key_id(); - *data = message.actor_message(); - return Status::OK(); -} - -Status PythonGcsSubscriber::Close() { - std::shared_ptr current_polling_context; - { - absl::MutexLock lock(&mu_); - if (closed_) { - return Status::OK(); - } - closed_ = true; - current_polling_context = current_polling_context_; - } - if (current_polling_context) { - current_polling_context->TryCancel(); - } - - grpc::ClientContext context; - - rpc::GcsUnregisterSubscriberRequest 
request; - request.set_subscriber_id(subscriber_id_); - rpc::GcsUnregisterSubscriberReply reply; - grpc::Status status = pubsub_stub_->GcsUnregisterSubscriber(&context, request, &reply); - - if (!status.ok()) { - RAY_LOG(WARNING) << "Error while unregistering the subscriber: " - << status.error_message() << " [code " << status.error_code() << "]"; - } - return Status::OK(); -} - -int64_t PythonGcsSubscriber::last_batch_size() { - absl::MutexLock lock(&mu_); - return last_batch_size_; -} - -} // namespace gcs -} // namespace ray diff --git a/src/ray/gcs/pubsub/gcs_pub_sub.h b/src/ray/gcs/pubsub/gcs_pub_sub.h deleted file mode 100644 index c9abb5112aa3..000000000000 --- a/src/ray/gcs/pubsub/gcs_pub_sub.h +++ /dev/null @@ -1,194 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "absl/container/flat_hash_map.h" -#include "absl/synchronization/mutex.h" -#include "ray/common/ray_config.h" -#include "ray/gcs/callback.h" -#include "ray/pubsub/publisher.h" -#include "ray/pubsub/subscriber.h" -#include "src/ray/protobuf/gcs.pb.h" -#include "src/ray/protobuf/gcs_service.grpc.pb.h" -#include "src/ray/protobuf/gcs_service.pb.h" - -namespace ray { -namespace gcs { - -/// \class GcsPublisher -/// -/// Supports publishing per-entity data and errors from GCS. Thread safe. 
-class GcsPublisher { - public: - /// Initializes GcsPublisher with GCS based publishers. - /// Publish*() member functions below would be incrementally converted to use the GCS - /// based publisher, if available. - explicit GcsPublisher(std::unique_ptr publisher) - : publisher_(std::move(publisher)) { - RAY_CHECK(publisher_); - } - - virtual ~GcsPublisher() = default; - - /// Returns the underlying pubsub::Publisher. Caller does not take ownership. - pubsub::Publisher &GetPublisher() const { return *publisher_; } - - /// Each publishing method below publishes to a different "channel". - /// ID is the entity which the message is associated with, e.g. ActorID for Actor data. - /// Subscribers receive typed messages for the ID that they subscribe to. - /// - /// The full stream of NodeResource and Error channels are needed by its subscribers. - /// But for other channels, subscribers should only need the latest data. - /// - /// TODO: Verify GCS pubsub satisfies the streaming semantics. - /// TODO: Implement optimization for channels where only latest data per ID is useful. - - Status PublishActor(const ActorID &id, - rpc::ActorTableData message, - const StatusCallback &done); - - // TODO(dayshah): Look at possibility of moving all of these rpc messages - - Status PublishJob(const JobID &id, - const rpc::JobTableData &message, - const StatusCallback &done); - - virtual Status PublishNodeInfo(const NodeID &id, - const rpc::GcsNodeInfo &message, - const StatusCallback &done); - - /// Actually rpc::WorkerDeltaData is not a delta message. - Status PublishWorkerFailure(const WorkerID &id, - const rpc::WorkerDeltaData &message, - const StatusCallback &done); - - virtual Status PublishError(const std::string &id, - const rpc::ErrorTableData &message, - const StatusCallback &done); - - /// TODO: remove once it is converted to GRPC-based push broadcasting. 
- Status PublishResourceBatch(const rpc::ResourceUsageBatchData &message, - const StatusCallback &done); - - /// Prints debugging info for the publisher. - std::string DebugString() const; - - private: - const std::unique_ptr publisher_; -}; - -/// \class GcsSubscriber -/// -/// Supports subscribing to an entity or a channel from GCS. Thread safe. -class GcsSubscriber { - public: - /// Initializes GcsSubscriber with GCS based GcsSubscribers. - // TODO(mwtian): Support restarted GCS publisher, at the same or a different address. - GcsSubscriber(const rpc::Address &gcs_address, - std::unique_ptr subscriber) - : gcs_address_(gcs_address), subscriber_(std::move(subscriber)) {} - - /// Subscribe*() member functions below would be incrementally converted to use the GCS - /// based subscriber, if available. - /// The `subscribe` callbacks must not be empty. The `done` callbacks can optionally be - /// empty. - - /// Uses GCS pubsub when created with `subscriber`. - Status SubscribeActor(const ActorID &id, - const SubscribeCallback &subscribe, - const StatusCallback &done); - Status UnsubscribeActor(const ActorID &id); - - bool IsActorUnsubscribed(const ActorID &id); - - Status SubscribeAllJobs(const SubscribeCallback &subscribe, - const StatusCallback &done); - - void SubscribeAllNodeInfo(const ItemCallback &subscribe, - const StatusCallback &done); - - Status SubscribeAllWorkerFailures(const ItemCallback &subscribe, - const StatusCallback &done); - - /// Prints debugging info for the subscriber. 
- std::string DebugString() const; - - private: - const rpc::Address gcs_address_; - const std::unique_ptr subscriber_; -}; - -// This client is only supposed to be used from Cython / Python -class RAY_EXPORT PythonGcsSubscriber { - public: - explicit PythonGcsSubscriber(const std::string &gcs_address, - int gcs_port, - rpc::ChannelType channel_type, - const std::string &subscriber_id, - const std::string &worker_id); - - /// Register a subscription for the subscriber's channel type. - /// - /// Before the registration, published messages in the channel - /// will not be saved for the subscriber. - Status Subscribe(); - - /// Polls for new error message. - /// Both key_id and data are out parameters. - Status PollError(std::string *key_id, int64_t timeout_ms, rpc::ErrorTableData *data); - - /// Polls for new log messages. - Status PollLogs(std::string *key_id, int64_t timeout_ms, rpc::LogBatch *data); - - /// Polls for actor messages. - Status PollActor(std::string *key_id, int64_t timeout_ms, rpc::ActorTableData *data); - - /// Closes the subscriber and its active subscription. 
- Status Close(); - - int64_t last_batch_size(); - - private: - Status DoPoll(int64_t timeout_ms, rpc::PubMessage *message); - - mutable absl::Mutex mu_; - - std::unique_ptr pubsub_stub_; - std::shared_ptr channel_; - const rpc::ChannelType channel_type_; - const std::string subscriber_id_; - std::string publisher_id_; - const std::string worker_id_; - int64_t max_processed_sequence_id_ ABSL_GUARDED_BY(mu_); - int64_t last_batch_size_ ABSL_GUARDED_BY(mu_); - std::deque queue_ ABSL_GUARDED_BY(mu_); - bool closed_ ABSL_GUARDED_BY(mu_); - std::shared_ptr current_polling_context_ ABSL_GUARDED_BY(mu_); -}; - -/// Get the .lines() attribute of a LogBatch as a std::vector -/// (this is needed so it can be wrapped in Cython) -std::vector PythonGetLogBatchLines(const rpc::LogBatch &log_batch); - -} // namespace gcs -} // namespace ray diff --git a/src/ray/gcs/gcs_server/pubsub_handler.cc b/src/ray/gcs/pubsub_handler.cc similarity index 74% rename from src/ray/gcs/gcs_server/pubsub_handler.cc rename to src/ray/gcs/pubsub_handler.cc index 1bc889c912ef..1e281ba1b56c 100644 --- a/src/ray/gcs/gcs_server/pubsub_handler.cc +++ b/src/ray/gcs/pubsub_handler.cc @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/gcs_server/pubsub_handler.h" +#include "ray/gcs/pubsub_handler.h" -#include #include #include @@ -22,7 +21,7 @@ namespace ray { namespace gcs { InternalPubSubHandler::InternalPubSubHandler(instrumented_io_context &io_service, - gcs::GcsPublisher &gcs_publisher) + pubsub::GcsPublisher &gcs_publisher) : io_service_(io_service), gcs_publisher_(gcs_publisher) {} void InternalPubSubHandler::HandleGcsPublish(rpc::GcsPublishRequest request, @@ -43,23 +42,13 @@ void InternalPubSubHandler::HandleGcsSubscriberPoll( rpc::GcsSubscriberPollReply *reply, rpc::SendReplyCallback send_reply_callback) { rpc::PubsubLongPollingRequest pubsub_req; - pubsub_req.set_subscriber_id(request.subscriber_id()); - pubsub_req.set_publisher_id(request.publisher_id()); + pubsub_req.set_subscriber_id(std::move(*request.mutable_subscriber_id())); + pubsub_req.set_publisher_id(std::move(*request.mutable_publisher_id())); pubsub_req.set_max_processed_sequence_id(request.max_processed_sequence_id()); - auto pubsub_reply = std::make_shared(); - auto pubsub_reply_ptr = pubsub_reply.get(); - gcs_publisher_.GetPublisher().ConnectToSubscriber( - pubsub_req, - pubsub_reply_ptr, - [reply, - reply_cb = std::move(send_reply_callback), - pubsub_reply = std::move(pubsub_reply)](ray::Status status, - std::function success_cb, - std::function failure_cb) { - reply->mutable_pub_messages()->Swap(pubsub_reply->mutable_pub_messages()); - reply->set_publisher_id(std::move(*pubsub_reply->mutable_publisher_id())); - reply_cb(std::move(status), std::move(success_cb), std::move(failure_cb)); - }); + gcs_publisher_.GetPublisher().ConnectToSubscriber(pubsub_req, + reply->mutable_publisher_id(), + reply->mutable_pub_messages(), + std::move(send_reply_callback)); } // Similar for HandleGcsSubscriberPoll() above, needs to use @@ -104,15 +93,6 @@ void InternalPubSubHandler::HandleGcsSubscriberCommandBatch( send_reply_callback(Status::OK(), nullptr, nullptr); } -void 
InternalPubSubHandler::HandleGcsUnregisterSubscriber( - rpc::GcsUnregisterSubscriberRequest request, - rpc::GcsUnregisterSubscriberReply *reply, - rpc::SendReplyCallback send_reply_callback) { - const auto subscriber_id = UniqueID::FromBinary(request.subscriber_id()); - gcs_publisher_.GetPublisher().UnregisterSubscriber(subscriber_id); - send_reply_callback(Status::OK(), nullptr, nullptr); -} - void InternalPubSubHandler::AsyncRemoveSubscriberFrom(const std::string &sender_id) { io_service_.post( [this, sender_id]() { diff --git a/src/ray/gcs/gcs_server/pubsub_handler.h b/src/ray/gcs/pubsub_handler.h similarity index 79% rename from src/ray/gcs/gcs_server/pubsub_handler.h rename to src/ray/gcs/pubsub_handler.h index fc5d92dd9abf..f69b8e2a50f1 100644 --- a/src/ray/gcs/gcs_server/pubsub_handler.h +++ b/src/ray/gcs/pubsub_handler.h @@ -18,9 +18,8 @@ #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" -#include "ray/gcs/pubsub/gcs_pub_sub.h" -#include "ray/rpc/gcs/gcs_rpc_server.h" -#include "src/ray/protobuf/gcs_service.grpc.pb.h" +#include "ray/gcs/grpc_service_interfaces.h" +#include "ray/pubsub/gcs_publisher.h" namespace ray { namespace gcs { @@ -28,10 +27,10 @@ namespace gcs { /// This is the implementation class of `InternalPubsubHandler`. /// It supports subscribing updates from GCS with long poll, and registering / /// de-registering subscribers. 
-class InternalPubSubHandler : public rpc::InternalPubSubHandler { +class InternalPubSubHandler : public rpc::InternalPubSubGcsServiceHandler { public: InternalPubSubHandler(instrumented_io_context &io_service, - gcs::GcsPublisher &gcs_publisher); + pubsub::GcsPublisher &gcs_publisher); void HandleGcsPublish(rpc::GcsPublishRequest request, rpc::GcsPublishReply *reply, @@ -45,10 +44,6 @@ class InternalPubSubHandler : public rpc::InternalPubSubHandler { rpc::GcsSubscriberCommandBatchReply *reply, rpc::SendReplyCallback send_reply_callback) final; - void HandleGcsUnregisterSubscriber(rpc::GcsUnregisterSubscriberRequest request, - rpc::GcsUnregisterSubscriberReply *reply, - rpc::SendReplyCallback send_reply_callback) final; - /// This function is only for external callers. Internally, can just erase from /// sender_to_subscribers_ and everything should be on the Publisher's io_service_. void AsyncRemoveSubscriberFrom(const std::string &sender_id); @@ -56,7 +51,7 @@ class InternalPubSubHandler : public rpc::InternalPubSubHandler { private: /// Not owning the io service, to allow sharing it with pubsub::Publisher. instrumented_io_context &io_service_; - gcs::GcsPublisher &gcs_publisher_; + pubsub::GcsPublisher &gcs_publisher_; absl::flat_hash_map> sender_to_subscribers_; }; diff --git a/src/ray/gcs/redis_client.cc b/src/ray/gcs/redis_client.cc deleted file mode 100644 index 4f547ac9cb1c..000000000000 --- a/src/ray/gcs/redis_client.cc +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ray/gcs/redis_client.h" - -#include - -#include "ray/common/ray_config.h" -#include "ray/gcs/redis_context.h" - -extern "C" { -#include "hiredis/hiredis.h" -} - -namespace ray { -namespace gcs { -RedisClient::RedisClient(const RedisClientOptions &options) : options_(options) {} - -Status RedisClient::Connect(instrumented_io_context &io_service) { - RAY_CHECK(!is_connected_); - - if (options_.server_ip_.empty()) { - RAY_LOG(ERROR) << "Failed to connect, redis server address is empty."; - return Status::Invalid("Redis server address is invalid!"); - } - - primary_context_ = std::make_unique(io_service); - - RAY_CHECK_OK(primary_context_->Connect(options_.server_ip_, - options_.server_port_, - /*username=*/options_.username_, - /*password=*/options_.password_, - /*enable_ssl=*/options_.enable_ssl_)); - - is_connected_ = true; - RAY_LOG(DEBUG) << "RedisClient connected."; - - return Status::OK(); -} - -void RedisClient::Disconnect() { - RAY_CHECK(is_connected_); - is_connected_ = false; - RAY_LOG(DEBUG) << "RedisClient disconnected."; -} -} // namespace gcs -} // namespace ray diff --git a/src/ray/gcs/redis_client.h b/src/ray/gcs/redis_client.h deleted file mode 100644 index d3cfcd655128..000000000000 --- a/src/ray/gcs/redis_client.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "ray/common/asio/instrumented_io_context.h" -#include "ray/common/status.h" -#include "ray/gcs/redis_context.h" -#include "ray/util/logging.h" - -namespace ray { -namespace gcs { -class RedisClientOptions { - public: - RedisClientOptions(const std::string &ip, - int port, - const std::string &username, - const std::string &password, - bool enable_ssl = false) - : server_ip_(ip), - server_port_(port), - username_(username), - password_(password), - enable_ssl_(enable_ssl) {} - - // Redis server address - std::string server_ip_; - int server_port_; - - // Username of Redis. - std::string username_; - - // Password of Redis. - std::string password_; - - // Whether to use tls/ssl for redis connection - bool enable_ssl_ = false; -}; - -/// \class RedisClient -/// This class is used to send commands to Redis. -class RedisClient { - public: - explicit RedisClient(const RedisClientOptions &options); - - /// Connect to Redis. Non-thread safe. - /// Call this function before calling other functions. - /// - /// \param io_service The event loop for this client. - /// This io_service must be single-threaded. Because `RedisAsioClient` is - /// non-thread safe. - /// \return Status - Status Connect(instrumented_io_context &io_service); - - /// Disconnect with Redis. Non-thread safe. - void Disconnect(); - - RedisContext *GetPrimaryContext() { return primary_context_.get(); } - - protected: - RedisClientOptions options_; - - /// Whether this client is connected to redis. 
- bool is_connected_{false}; - - // The following context writes everything to the primary shard - std::unique_ptr primary_context_; -}; -} // namespace gcs -} // namespace ray diff --git a/src/ray/gcs/gcs_server/runtime_env_handler.cc b/src/ray/gcs/runtime_env_handler.cc similarity index 96% rename from src/ray/gcs/gcs_server/runtime_env_handler.cc rename to src/ray/gcs/runtime_env_handler.cc index 83aa0c5c3538..b71604b9cecc 100644 --- a/src/ray/gcs/gcs_server/runtime_env_handler.cc +++ b/src/ray/gcs/runtime_env_handler.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_server/runtime_env_handler.h" +#include "ray/gcs/runtime_env_handler.h" #include diff --git a/src/ray/gcs/gcs_server/runtime_env_handler.h b/src/ray/gcs/runtime_env_handler.h similarity index 88% rename from src/ray/gcs/gcs_server/runtime_env_handler.h rename to src/ray/gcs/runtime_env_handler.h index 946ca0327568..4211fb95030a 100644 --- a/src/ray/gcs/gcs_server/runtime_env_handler.h +++ b/src/ray/gcs/runtime_env_handler.h @@ -13,11 +13,14 @@ // limitations under the License. 
#pragma once +#include #include #include +#include "ray/common/asio/instrumented_io_context.h" #include "ray/common/runtime_env_manager.h" -#include "ray/rpc/gcs/gcs_rpc_server.h" +#include "ray/gcs/grpc_service_interfaces.h" + namespace ray { namespace gcs { @@ -25,7 +28,7 @@ typedef std::function(std::function uint32_t delay_ms)> DelayExecutorFn; -class RuntimeEnvHandler : public rpc::RuntimeEnvHandler { +class RuntimeEnvHandler : public rpc::RuntimeEnvGcsServiceHandler { public: RuntimeEnvHandler(instrumented_io_context &io_service, RuntimeEnvManager &runtime_env_manager, diff --git a/src/ray/gcs/gcs_server/state_util.cc b/src/ray/gcs/state_util.cc similarity index 97% rename from src/ray/gcs/gcs_server/state_util.cc rename to src/ray/gcs/state_util.cc index b1f41b393682..64576c793687 100644 --- a/src/ray/gcs/gcs_server/state_util.cc +++ b/src/ray/gcs/state_util.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/gcs_server/state_util.h" +#include "ray/gcs/state_util.h" #include diff --git a/src/ray/gcs/gcs_server/state_util.h b/src/ray/gcs/state_util.h similarity index 100% rename from src/ray/gcs/gcs_server/state_util.h rename to src/ray/gcs/state_util.h diff --git a/src/ray/gcs/store_client/BUILD.bazel b/src/ray/gcs/store_client/BUILD.bazel index 0be49eb13690..dbf318a7ba6c 100644 --- a/src/ray/gcs/store_client/BUILD.bazel +++ b/src/ray/gcs/store_client/BUILD.bazel @@ -1,50 +1,61 @@ load("//bazel:ray.bzl", "ray_cc_library") ray_cc_library( - name = "gcs_store_client", + name = "store_client", hdrs = ["store_client.h"], deps = [ "//src/ray/common:asio", + "//src/ray/common:gcs_callbacks", "//src/ray/common:id", "//src/ray/common:status", - "//src/ray/gcs:gcs_callback", ], ) ray_cc_library( - name = "gcs_redis_store_client", - srcs = ["redis_store_client.cc"], - hdrs = ["redis_store_client.h"], + name = "redis_store_client", + srcs = [ + "redis_async_context.cc", + "redis_context.cc", + "redis_store_client.cc", + ], + hdrs = [ + "redis_async_context.h", + "redis_context.h", + "redis_store_client.h", + ], deps = [ - ":gcs_store_client", - "//src/ray/gcs:gcs_callback", - "//src/ray/gcs:gcs_redis_client", + ":store_client", + "//:hiredis", + "//src/ray/common:asio", + "//src/ray/common:ray_config", + "//src/ray/common:status", + "//src/ray/stats:stats_lib", "//src/ray/util:container_util", + "//src/ray/util:exponential_backoff", + "//src/ray/util:network_util", + "@boost//:asio", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/synchronization", ], ) ray_cc_library( - name = "gcs_in_memory_store_client", + name = "in_memory_store_client", srcs = ["in_memory_store_client.cc"], hdrs = ["in_memory_store_client.h"], deps = [ - ":gcs_store_client", + ":store_client", "//src/ray/common:asio", - "//src/ray/gcs:gcs_callback", "//src/ray/util:concurrent_flat_map", "@com_google_absl//absl/container:node_hash_map", ], ) ray_cc_library( - name = 
"gcs_observable_store_client", + name = "observable_store_client", srcs = ["observable_store_client.cc"], hdrs = ["observable_store_client.h"], deps = [ - ":gcs_store_client", - "//src/ray/gcs:gcs_callback", - "//src/ray/util", + ":store_client", ], ) diff --git a/src/ray/gcs/store_client/in_memory_store_client.cc b/src/ray/gcs/store_client/in_memory_store_client.cc index d7e64bba9fcd..cea449dd71d5 100644 --- a/src/ray/gcs/store_client/in_memory_store_client.cc +++ b/src/ray/gcs/store_client/in_memory_store_client.cc @@ -20,11 +20,11 @@ namespace ray::gcs { -Status InMemoryStoreClient::AsyncPut(const std::string &table_name, - const std::string &key, - std::string data, - bool overwrite, - Postable callback) { +void InMemoryStoreClient::AsyncPut(const std::string &table_name, + const std::string &key, + std::string data, + bool overwrite, + Postable callback) { auto &table = GetOrCreateMutableTable(table_name); bool inserted = false; if (overwrite) { @@ -33,10 +33,9 @@ Status InMemoryStoreClient::AsyncPut(const std::string &table_name, inserted = table.Emplace(key, std::move(data)); } std::move(callback).Post("GcsInMemoryStore.Put", inserted); - return Status::OK(); } -Status InMemoryStoreClient::AsyncGet( +void InMemoryStoreClient::AsyncGet( const std::string &table_name, const std::string &key, ToPostable> callback) { @@ -46,10 +45,9 @@ Status InMemoryStoreClient::AsyncGet( data = table->Get(key); } std::move(callback).Post("GcsInMemoryStore.Get", Status::OK(), std::move(data)); - return Status::OK(); } -Status InMemoryStoreClient::AsyncGetAll( +void InMemoryStoreClient::AsyncGetAll( const std::string &table_name, Postable)> callback) { auto result = absl::flat_hash_map(); @@ -58,10 +56,9 @@ Status InMemoryStoreClient::AsyncGetAll( result = table->GetMapClone(); } std::move(callback).Post("GcsInMemoryStore.GetAll", std::move(result)); - return Status::OK(); } -Status InMemoryStoreClient::AsyncMultiGet( +void InMemoryStoreClient::AsyncMultiGet( const std::string 
&table_name, const std::vector &keys, Postable)> callback) { @@ -74,31 +71,27 @@ Status InMemoryStoreClient::AsyncMultiGet( }); } std::move(callback).Post("GcsInMemoryStore.GetAll", std::move(result)); - return Status::OK(); } -Status InMemoryStoreClient::AsyncDelete(const std::string &table_name, - const std::string &key, - Postable callback) { +void InMemoryStoreClient::AsyncDelete(const std::string &table_name, + const std::string &key, + Postable callback) { auto &table = GetOrCreateMutableTable(table_name); auto erased = table.Erase(key); std::move(callback).Post("GcsInMemoryStore.Delete", erased); - return Status::OK(); } -Status InMemoryStoreClient::AsyncBatchDelete(const std::string &table_name, - const std::vector &keys, - Postable callback) { +void InMemoryStoreClient::AsyncBatchDelete(const std::string &table_name, + const std::vector &keys, + Postable callback) { auto &table = GetOrCreateMutableTable(table_name); int64_t num_erased = table.EraseKeys(absl::MakeSpan(keys)); std::move(callback).Post("GcsInMemoryStore.BatchDelete", num_erased); - return Status::OK(); } -Status InMemoryStoreClient::AsyncGetNextJobID(Postable callback) { +void InMemoryStoreClient::AsyncGetNextJobID(Postable callback) { auto job_id = job_id_.fetch_add(1, std::memory_order_acq_rel); std::move(callback).Post("GcsInMemoryStore.GetNextJobID", job_id); - return Status::OK(); } ConcurrentFlatMap &InMemoryStoreClient::GetOrCreateMutableTable( @@ -121,7 +114,7 @@ const ConcurrentFlatMap *InMemoryStoreClient::GetTable return nullptr; } -Status InMemoryStoreClient::AsyncGetKeys( +void InMemoryStoreClient::AsyncGetKeys( const std::string &table_name, const std::string &prefix, Postable)> callback) { @@ -135,20 +128,17 @@ Status InMemoryStoreClient::AsyncGetKeys( }); } std::move(callback).Post("GcsInMemoryStore.Keys", std::move(result)); - - return Status::OK(); } -Status InMemoryStoreClient::AsyncExists(const std::string &table_name, - const std::string &key, - Postable callback) { +void 
InMemoryStoreClient::AsyncExists(const std::string &table_name, + const std::string &key, + Postable callback) { bool result = false; auto table = GetTable(table_name); if (table != nullptr) { result = table->Contains(key); } std::move(callback).Post("GcsInMemoryStore.Exists", result); - return Status::OK(); } } // namespace ray::gcs diff --git a/src/ray/gcs/store_client/in_memory_store_client.h b/src/ray/gcs/store_client/in_memory_store_client.h index e95754592857..d956ad40752c 100644 --- a/src/ray/gcs/store_client/in_memory_store_client.h +++ b/src/ray/gcs/store_client/in_memory_store_client.h @@ -22,7 +22,6 @@ #include "absl/synchronization/mutex.h" #include "ray/gcs/store_client/store_client.h" #include "ray/util/concurrent_flat_map.h" -#include "src/ray/protobuf/gcs.pb.h" namespace ray::gcs { @@ -34,42 +33,42 @@ class InMemoryStoreClient : public StoreClient { public: explicit InMemoryStoreClient() = default; - Status AsyncPut(const std::string &table_name, - const std::string &key, - std::string data, - bool overwrite, - Postable callback) override; + void AsyncPut(const std::string &table_name, + const std::string &key, + std::string data, + bool overwrite, + Postable callback) override; - Status AsyncGet(const std::string &table_name, - const std::string &key, - ToPostable> callback) override; + void AsyncGet(const std::string &table_name, + const std::string &key, + ToPostable> callback) override; - Status AsyncGetAll( + void AsyncGetAll( const std::string &table_name, Postable)> callback) override; - Status AsyncMultiGet( + void AsyncMultiGet( const std::string &table_name, const std::vector &keys, Postable)> callback) override; - Status AsyncDelete(const std::string &table_name, - const std::string &key, - Postable callback) override; + void AsyncDelete(const std::string &table_name, + const std::string &key, + Postable callback) override; - Status AsyncBatchDelete(const std::string &table_name, - const std::vector &keys, - Postable callback) override; + 
void AsyncBatchDelete(const std::string &table_name, + const std::vector &keys, + Postable callback) override; - Status AsyncGetNextJobID(Postable callback) override; + void AsyncGetNextJobID(Postable callback) override; - Status AsyncGetKeys(const std::string &table_name, - const std::string &prefix, - Postable)> callback) override; + void AsyncGetKeys(const std::string &table_name, + const std::string &prefix, + Postable)> callback) override; - Status AsyncExists(const std::string &table_name, - const std::string &key, - Postable callback) override; + void AsyncExists(const std::string &table_name, + const std::string &key, + Postable callback) override; private: // The returned reference is valid as long as the InMemoryStoreClient is alive and diff --git a/src/ray/gcs/store_client/observable_store_client.cc b/src/ray/gcs/store_client/observable_store_client.cc index 5243944a9f77..b9c5d84c31dc 100644 --- a/src/ray/gcs/store_client/observable_store_client.cc +++ b/src/ray/gcs/store_client/observable_store_client.cc @@ -24,79 +24,81 @@ namespace ray { namespace gcs { -Status ObservableStoreClient::AsyncPut(const std::string &table_name, - const std::string &key, - std::string data, - bool overwrite, - Postable callback) { +void ObservableStoreClient::AsyncPut(const std::string &table_name, + const std::string &key, + std::string data, + bool overwrite, + Postable callback) { auto start = absl::GetCurrentTimeNanos(); ray::stats::STATS_gcs_storage_operation_count.Record(1, "Put"); - return delegate_->AsyncPut( - table_name, key, data, overwrite, std::move(callback).OnInvocation([start]() { - auto end = absl::GetCurrentTimeNanos(); - ray::stats::STATS_gcs_storage_operation_latency_ms.Record( - absl::ToDoubleMilliseconds(absl::Nanoseconds(end - start)), "Put"); - })); + delegate_->AsyncPut(table_name, + key, + std::move(data), + overwrite, + std::move(callback).OnInvocation([start]() { + auto end = absl::GetCurrentTimeNanos(); + 
ray::stats::STATS_gcs_storage_operation_latency_ms.Record( + absl::ToDoubleMilliseconds(absl::Nanoseconds(end - start)), + "Put"); + })); } -Status ObservableStoreClient::AsyncGet( +void ObservableStoreClient::AsyncGet( const std::string &table_name, const std::string &key, ToPostable> callback) { auto start = absl::GetCurrentTimeNanos(); ray::stats::STATS_gcs_storage_operation_count.Record(1, "Get"); - return delegate_->AsyncGet(table_name, key, std::move(callback).OnInvocation([start]() { + delegate_->AsyncGet(table_name, key, std::move(callback).OnInvocation([start]() { auto end = absl::GetCurrentTimeNanos(); ray::stats::STATS_gcs_storage_operation_latency_ms.Record( absl::ToDoubleMilliseconds(absl::Nanoseconds(end - start)), "Get"); })); } -Status ObservableStoreClient::AsyncGetAll( +void ObservableStoreClient::AsyncGetAll( const std::string &table_name, Postable)> callback) { auto start = absl::GetCurrentTimeNanos(); ray::stats::STATS_gcs_storage_operation_count.Record(1, "GetAll"); - return delegate_->AsyncGetAll(table_name, std::move(callback).OnInvocation([start]() { + delegate_->AsyncGetAll(table_name, std::move(callback).OnInvocation([start]() { auto end = absl::GetCurrentTimeNanos(); ray::stats::STATS_gcs_storage_operation_latency_ms.Record( absl::ToDoubleMilliseconds(absl::Nanoseconds(end - start)), "GetAll"); })); } -Status ObservableStoreClient::AsyncMultiGet( +void ObservableStoreClient::AsyncMultiGet( const std::string &table_name, const std::vector &keys, Postable)> callback) { auto start = absl::GetCurrentTimeNanos(); ray::stats::STATS_gcs_storage_operation_count.Record(1, "MultiGet"); - return delegate_->AsyncMultiGet( - table_name, keys, std::move(callback).OnInvocation([start]() { - auto end = absl::GetCurrentTimeNanos(); - ray::stats::STATS_gcs_storage_operation_latency_ms.Record( - absl::ToDoubleMilliseconds(absl::Nanoseconds(end - start)), "MultiGet"); - })); + delegate_->AsyncMultiGet(table_name, keys, 
std::move(callback).OnInvocation([start]() { + auto end = absl::GetCurrentTimeNanos(); + ray::stats::STATS_gcs_storage_operation_latency_ms.Record( + absl::ToDoubleMilliseconds(absl::Nanoseconds(end - start)), "MultiGet"); + })); } -Status ObservableStoreClient::AsyncDelete(const std::string &table_name, - const std::string &key, - Postable callback) { +void ObservableStoreClient::AsyncDelete(const std::string &table_name, + const std::string &key, + Postable callback) { auto start = absl::GetCurrentTimeNanos(); ray::stats::STATS_gcs_storage_operation_count.Record(1, "Delete"); - return delegate_->AsyncDelete( - table_name, key, std::move(callback).OnInvocation([start]() { - auto end = absl::GetCurrentTimeNanos(); - ray::stats::STATS_gcs_storage_operation_latency_ms.Record( - absl::ToDoubleMilliseconds(absl::Nanoseconds(end - start)), "Delete"); - })); + delegate_->AsyncDelete(table_name, key, std::move(callback).OnInvocation([start]() { + auto end = absl::GetCurrentTimeNanos(); + ray::stats::STATS_gcs_storage_operation_latency_ms.Record( + absl::ToDoubleMilliseconds(absl::Nanoseconds(end - start)), "Delete"); + })); } -Status ObservableStoreClient::AsyncBatchDelete(const std::string &table_name, - const std::vector &keys, - Postable callback) { +void ObservableStoreClient::AsyncBatchDelete(const std::string &table_name, + const std::vector &keys, + Postable callback) { auto start = absl::GetCurrentTimeNanos(); ray::stats::STATS_gcs_storage_operation_count.Record(1, "BatchDelete"); - return delegate_->AsyncBatchDelete( + delegate_->AsyncBatchDelete( table_name, keys, std::move(callback).OnInvocation([start]() { auto end = absl::GetCurrentTimeNanos(); ray::stats::STATS_gcs_storage_operation_latency_ms.Record( @@ -104,35 +106,33 @@ Status ObservableStoreClient::AsyncBatchDelete(const std::string &table_name, })); } -Status ObservableStoreClient::AsyncGetNextJobID(Postable callback) { - return delegate_->AsyncGetNextJobID(std::move(callback)); +void 
ObservableStoreClient::AsyncGetNextJobID(Postable callback) { + delegate_->AsyncGetNextJobID(std::move(callback)); } -Status ObservableStoreClient::AsyncGetKeys( +void ObservableStoreClient::AsyncGetKeys( const std::string &table_name, const std::string &prefix, Postable)> callback) { auto start = absl::GetCurrentTimeNanos(); ray::stats::STATS_gcs_storage_operation_count.Record(1, "GetKeys"); - return delegate_->AsyncGetKeys( - table_name, prefix, std::move(callback).OnInvocation([start]() { - auto end = absl::GetCurrentTimeNanos(); - ray::stats::STATS_gcs_storage_operation_latency_ms.Record( - absl::ToDoubleMilliseconds(absl::Nanoseconds(end - start)), "GetKeys"); - })); + delegate_->AsyncGetKeys(table_name, prefix, std::move(callback).OnInvocation([start]() { + auto end = absl::GetCurrentTimeNanos(); + ray::stats::STATS_gcs_storage_operation_latency_ms.Record( + absl::ToDoubleMilliseconds(absl::Nanoseconds(end - start)), "GetKeys"); + })); } -Status ObservableStoreClient::AsyncExists(const std::string &table_name, - const std::string &key, - Postable callback) { +void ObservableStoreClient::AsyncExists(const std::string &table_name, + const std::string &key, + Postable callback) { auto start = absl::GetCurrentTimeNanos(); ray::stats::STATS_gcs_storage_operation_count.Record(1, "Exists"); - return delegate_->AsyncExists( - table_name, key, std::move(callback).OnInvocation([start]() { - auto end = absl::GetCurrentTimeNanos(); - ray::stats::STATS_gcs_storage_operation_latency_ms.Record( - absl::ToDoubleMilliseconds(absl::Nanoseconds(end - start)), "Exists"); - })); + delegate_->AsyncExists(table_name, key, std::move(callback).OnInvocation([start]() { + auto end = absl::GetCurrentTimeNanos(); + ray::stats::STATS_gcs_storage_operation_latency_ms.Record( + absl::ToDoubleMilliseconds(absl::Nanoseconds(end - start)), "Exists"); + })); } } // namespace gcs diff --git a/src/ray/gcs/store_client/observable_store_client.h b/src/ray/gcs/store_client/observable_store_client.h 
index 1c7bfa9857b6..c483b03b75cd 100644 --- a/src/ray/gcs/store_client/observable_store_client.h +++ b/src/ray/gcs/store_client/observable_store_client.h @@ -31,42 +31,42 @@ class ObservableStoreClient : public StoreClient { explicit ObservableStoreClient(std::unique_ptr delegate) : delegate_(std::move(delegate)) {} - Status AsyncPut(const std::string &table_name, - const std::string &key, - std::string data, - bool overwrite, - Postable callback) override; + void AsyncPut(const std::string &table_name, + const std::string &key, + std::string data, + bool overwrite, + Postable callback) override; - Status AsyncGet(const std::string &table_name, - const std::string &key, - ToPostable> callback) override; + void AsyncGet(const std::string &table_name, + const std::string &key, + ToPostable> callback) override; - Status AsyncGetAll( + void AsyncGetAll( const std::string &table_name, Postable)> callback) override; - Status AsyncMultiGet( + void AsyncMultiGet( const std::string &table_name, const std::vector &keys, Postable)> callback) override; - Status AsyncDelete(const std::string &table_name, - const std::string &key, - Postable callback) override; + void AsyncDelete(const std::string &table_name, + const std::string &key, + Postable callback) override; - Status AsyncBatchDelete(const std::string &table_name, - const std::vector &keys, - Postable callback) override; + void AsyncBatchDelete(const std::string &table_name, + const std::vector &keys, + Postable callback) override; - Status AsyncGetNextJobID(Postable callback) override; + void AsyncGetNextJobID(Postable callback) override; - Status AsyncGetKeys(const std::string &table_name, - const std::string &prefix, - Postable)> callback) override; + void AsyncGetKeys(const std::string &table_name, + const std::string &prefix, + Postable)> callback) override; - Status AsyncExists(const std::string &table_name, - const std::string &key, - Postable callback) override; + void AsyncExists(const std::string &table_name, + 
const std::string &key, + Postable callback) override; private: std::unique_ptr delegate_; diff --git a/src/ray/gcs/redis_async_context.cc b/src/ray/gcs/store_client/redis_async_context.cc similarity index 99% rename from src/ray/gcs/redis_async_context.cc rename to src/ray/gcs/store_client/redis_async_context.cc index 10df58cf5365..d022009b9a10 100644 --- a/src/ray/gcs/redis_async_context.cc +++ b/src/ray/gcs/store_client/redis_async_context.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/redis_async_context.h" +#include "ray/gcs/store_client/redis_async_context.h" #include #include diff --git a/src/ray/gcs/redis_async_context.h b/src/ray/gcs/store_client/redis_async_context.h similarity index 100% rename from src/ray/gcs/redis_async_context.h rename to src/ray/gcs/store_client/redis_async_context.h diff --git a/src/ray/gcs/redis_context.cc b/src/ray/gcs/store_client/redis_context.cc similarity index 99% rename from src/ray/gcs/redis_context.cc rename to src/ray/gcs/store_client/redis_context.cc index fb684bbd8db6..7c40cab5a5ba 100644 --- a/src/ray/gcs/redis_context.cc +++ b/src/ray/gcs/store_client/redis_context.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/redis_context.h" +#include "ray/gcs/store_client/redis_context.h" #include #include @@ -23,7 +23,6 @@ #include "ray/common/asio/asio_util.h" #include "ray/stats/metric_defs.h" #include "ray/util/network_util.h" -#include "ray/util/util.h" extern "C" { #include "hiredis/async.h" @@ -205,7 +204,7 @@ void RedisRequestContext::RedisResponseFn(redisAsyncContext *async_context, }, "RedisRequestContext.Callback"); auto end_time = absl::Now(); - ray::stats::GcsLatency().Record( + request_cxt->ray_metric_gcs_latency_.Record( absl::ToDoubleMilliseconds(end_time - request_cxt->start_time_)); delete request_cxt; } diff --git a/src/ray/gcs/redis_context.h b/src/ray/gcs/store_client/redis_context.h similarity index 92% rename from src/ray/gcs/redis_context.h rename to src/ray/gcs/store_client/redis_context.h index d21331aac550..343465e70565 100644 --- a/src/ray/gcs/redis_context.h +++ b/src/ray/gcs/store_client/redis_context.h @@ -23,9 +23,10 @@ #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/status.h" -#include "ray/gcs/redis_async_context.h" +#include "ray/gcs/store_client/redis_async_context.h" +#include "ray/stats/metric.h" +#include "ray/stats/tag_defs.h" #include "ray/util/exponential_backoff.h" -#include "src/ray/protobuf/gcs.pb.h" extern "C" { #include "hiredis/hiredis.h" @@ -61,7 +62,7 @@ class CallbackReply { const std::string &ReadAsString() const; /// Read this reply data as a string array. - [[nodiscard]] const std::vector> &ReadAsStringArray() const; + const std::vector> &ReadAsStringArray() const; /// Read this reply data as a scan array. 
/// @@ -127,6 +128,14 @@ struct RedisRequestContext { std::vector redis_cmds_; std::vector argv_; std::vector argc_; + + // Ray metrics + ray::stats::Histogram ray_metric_gcs_latency_{ + "gcs_latency", + "The latency of a GCS (by default Redis) operation.", + "us", + {100, 200, 300, 400, 500, 600, 700, 800, 900, 1000}, + {stats::kCustomKey}}; }; class RedisContext { diff --git a/src/ray/gcs/store_client/redis_store_client.cc b/src/ray/gcs/store_client/redis_store_client.cc index 8863fb4f111f..2f58eac6afe9 100644 --- a/src/ray/gcs/store_client/redis_store_client.cc +++ b/src/ray/gcs/store_client/redis_store_client.cc @@ -26,7 +26,7 @@ #include "absl/cleanup/cleanup.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" -#include "ray/gcs/redis_context.h" +#include "ray/common/ray_config.h" #include "ray/util/container_util.h" #include "ray/util/logging.h" @@ -100,7 +100,7 @@ void RedisStoreClient::MGetValues( shared_callback, key_value_map](const std::shared_ptr &reply) { if (!reply->IsNil()) { - auto value = reply->ReadAsStringArray(); + const auto &value = reply->ReadAsStringArray(); for (size_t index = 0; index < value.size(); ++index) { if (value[index].has_value()) { (*key_value_map)[args[index]] = *(value[index]); @@ -118,19 +118,35 @@ void RedisStoreClient::MGetValues( } } -RedisStoreClient::RedisStoreClient(std::shared_ptr redis_client) - : external_storage_namespace_(::RayConfig::instance().external_storage_namespace()), - redis_client_(std::move(redis_client)) { +std::shared_ptr ConnectRedisContext(instrumented_io_context &io_service, + const RedisClientOptions &options) { + RAY_CHECK(!options.ip.empty()) << "Redis IP address cannot be empty."; + auto context = std::make_shared(io_service); + RAY_CHECK_OK(context->Connect(options.ip, + options.port, + /*username=*/options.username, + /*password=*/options.password, + /*enable_ssl=*/options.enable_ssl)) + << "Failed to connect to Redis."; + return context; +} + 
+RedisStoreClient::RedisStoreClient(instrumented_io_context &io_service, + const RedisClientOptions &options) + : io_service_(io_service), + options_(options), + external_storage_namespace_(::RayConfig::instance().external_storage_namespace()), + primary_context_(ConnectRedisContext(io_service, options)) { RAY_CHECK(!absl::StrContains(external_storage_namespace_, kClusterSeparator)) << "Storage namespace (" << external_storage_namespace_ << ") shouldn't contain " << kClusterSeparator << "."; } -Status RedisStoreClient::AsyncPut(const std::string &table_name, - const std::string &key, - std::string data, - bool overwrite, - Postable callback) { +void RedisStoreClient::AsyncPut(const std::string &table_name, + const std::string &key, + std::string data, + bool overwrite, + Postable callback) { RedisCommand command{/*command=*/overwrite ? "HSET" : "HSETNX", RedisKey{external_storage_namespace_, table_name}, /*args=*/{key, std::move(data)}}; @@ -141,13 +157,11 @@ Status RedisStoreClient::AsyncPut(const std::string &table_name, std::move(callback).Dispatch("RedisStoreClient.AsyncPut", added_num != 0); }; SendRedisCmdWithKeys({key}, std::move(command), std::move(write_callback)); - return Status::OK(); } -Status RedisStoreClient::AsyncGet( - const std::string &table_name, - const std::string &key, - ToPostable> callback) { +void RedisStoreClient::AsyncGet(const std::string &table_name, + const std::string &key, + ToPostable> callback) { auto redis_callback = [callback = std::move(callback)]( const std::shared_ptr &reply) mutable { std::optional result; @@ -165,49 +179,45 @@ Status RedisStoreClient::AsyncGet( RedisKey{external_storage_namespace_, table_name}, /*args=*/{key}}; SendRedisCmdArgsAsKeys(std::move(command), std::move(redis_callback)); - return Status::OK(); } -Status RedisStoreClient::AsyncGetAll( +void RedisStoreClient::AsyncGetAll( const std::string &table_name, Postable)> callback) { - RedisScanner::ScanKeysAndValues(redis_client_, + 
RedisScanner::ScanKeysAndValues(primary_context_, RedisKey{external_storage_namespace_, table_name}, RedisMatchPattern::Any(), std::move(callback)); - return Status::OK(); } -Status RedisStoreClient::AsyncDelete(const std::string &table_name, - const std::string &key, - Postable callback) { - return AsyncBatchDelete( - table_name, {key}, std::move(callback).TransformArg([](int64_t cnt) { - return cnt > 0; - })); +void RedisStoreClient::AsyncDelete(const std::string &table_name, + const std::string &key, + Postable callback) { + AsyncBatchDelete(table_name, {key}, std::move(callback).TransformArg([](int64_t cnt) { + return cnt > 0; + })); } -Status RedisStoreClient::AsyncBatchDelete(const std::string &table_name, - const std::vector &keys, - Postable callback) { +void RedisStoreClient::AsyncBatchDelete(const std::string &table_name, + const std::vector &keys, + Postable callback) { if (keys.empty()) { std::move(callback).Dispatch("RedisStoreClient.AsyncBatchDelete", 0); - return Status::OK(); + return; } - return DeleteByKeys(table_name, keys, std::move(callback)); + DeleteByKeys(table_name, keys, std::move(callback)); } -Status RedisStoreClient::AsyncMultiGet( +void RedisStoreClient::AsyncMultiGet( const std::string &table_name, const std::vector &keys, Postable)> callback) { if (keys.empty()) { std::move(callback).Dispatch("RedisStoreClient.AsyncMultiGet", absl::flat_hash_map{}); - return Status::OK(); + return; } MGetValues(table_name, keys, std::move(callback)); - return Status::OK(); } size_t RedisStoreClient::PushToSendingQueue(const std::vector &keys, @@ -275,7 +285,7 @@ void RedisStoreClient::SendRedisCmdWithKeys(std::vector keys, auto num_ready_keys = std::make_shared(0); std::function send_redis = [this, num_ready_keys = num_ready_keys, - concurrency_keys, // Copied! 
+ concurrency_keys, command = std::move(command), redis_callback = std::move(redis_callback)]() mutable { @@ -289,23 +299,23 @@ void RedisStoreClient::SendRedisCmdWithKeys(std::vector keys, } } // Send the actual request - auto *cxt = redis_client_->GetPrimaryContext(); - cxt->RunArgvAsync(command.ToRedisArgs(), - [this, - concurrency_keys, // Copied! - redis_callback = std::move(redis_callback)](auto reply) { - std::vector> requests; - { - absl::MutexLock lock(&mu_); - requests = TakeRequestsFromSendingQueue(concurrency_keys); - } - for (auto &request : requests) { - request(); - } - if (redis_callback) { - redis_callback(reply); - } - }); + primary_context_->RunArgvAsync( + command.ToRedisArgs(), + [this, + concurrency_keys, // Copied! + redis_callback = std::move(redis_callback)](auto reply) { + std::vector> requests; + { + absl::MutexLock lock(&mu_); + requests = TakeRequestsFromSendingQueue(concurrency_keys); + } + for (auto &request : requests) { + request(); + } + if (redis_callback) { + redis_callback(reply); + } + }); }; { @@ -325,9 +335,9 @@ void RedisStoreClient::SendRedisCmdWithKeys(std::vector keys, } } -Status RedisStoreClient::DeleteByKeys(const std::string &table, - const std::vector &keys, - Postable callback) { +void RedisStoreClient::DeleteByKeys(const std::string &table, + const std::vector &keys, + Postable callback) { auto del_cmds = GenCommandsBatched("HDEL", RedisKey{external_storage_namespace_, table}, keys); auto total_count = del_cmds.size(); @@ -348,30 +358,29 @@ Status RedisStoreClient::DeleteByKeys(const std::string &table, }; SendRedisCmdArgsAsKeys(std::move(command), std::move(delete_callback)); } - return Status::OK(); } RedisStoreClient::RedisScanner::RedisScanner( PrivateCtorTag ctor_tag, - std::shared_ptr redis_client, + std::shared_ptr primary_context, RedisKey redis_key, RedisMatchPattern match_pattern, Postable)> callback) : redis_key_(std::move(redis_key)), match_pattern_(std::move(match_pattern)), - 
redis_client_(std::move(redis_client)), + primary_context_(std::move(primary_context)), callback_(std::move(callback)) { cursor_ = 0; pending_request_count_ = 0; } void RedisStoreClient::RedisScanner::ScanKeysAndValues( - std::shared_ptr redis_client, + std::shared_ptr primary_context, RedisKey redis_key, RedisMatchPattern match_pattern, Postable)> callback) { auto scanner = std::make_shared(PrivateCtorTag(), - std::move(redis_client), + std::move(primary_context), std::move(redis_key), std::move(match_pattern), std::move(callback)); @@ -396,14 +405,13 @@ void RedisStoreClient::RedisScanner::Scan() { // Scan by prefix from Redis. RedisCommand command = {"HSCAN", redis_key_, {std::to_string(cursor_.value())}}; - if (match_pattern_.escaped != "*") { + if (match_pattern_.escaped_ != "*") { command.args.push_back("MATCH"); - command.args.push_back(match_pattern_.escaped); + command.args.push_back(match_pattern_.escaped_); } command.args.push_back("COUNT"); command.args.push_back(std::to_string(batch_count)); - auto *primary_context = redis_client_->GetPrimaryContext(); - primary_context->RunArgvAsync( + primary_context_->RunArgvAsync( command.ToRedisArgs(), // self_ref to keep the scanner alive until the callback is called, even if it // releases its self_ref in Scan(). @@ -444,30 +452,27 @@ void RedisStoreClient::RedisScanner::OnScanCallback( } } -Status RedisStoreClient::AsyncGetNextJobID(Postable callback) { +void RedisStoreClient::AsyncGetNextJobID(Postable callback) { // Note: This is not a HASH! It's a simple key-value pair. // Key: "RAYexternal_storage_namespace@JobCounter" // Value: The next job ID. 
RedisCommand command = { "INCRBY", RedisKey{external_storage_namespace_, "JobCounter"}, {"1"}}; - auto *cxt = redis_client_->GetPrimaryContext(); - - cxt->RunArgvAsync(command.ToRedisArgs(), - [callback = std::move(callback)]( - const std::shared_ptr &reply) mutable { - auto job_id = static_cast(reply->ReadAsInteger()); - std::move(callback).Post("GcsStore.GetNextJobID", job_id); - }); - - return Status::OK(); + primary_context_->RunArgvAsync( + command.ToRedisArgs(), + [callback = + std::move(callback)](const std::shared_ptr &reply) mutable { + auto job_id = static_cast(reply->ReadAsInteger()); + std::move(callback).Post("GcsStore.GetNextJobID", job_id); + }); } -Status RedisStoreClient::AsyncGetKeys(const std::string &table_name, - const std::string &prefix, - Postable)> callback) { +void RedisStoreClient::AsyncGetKeys(const std::string &table_name, + const std::string &prefix, + Postable)> callback) { RedisScanner::ScanKeysAndValues( - redis_client_, + primary_context_, RedisKey{external_storage_namespace_, table_name}, RedisMatchPattern::Prefix(prefix), std::move(callback).TransformArg( @@ -479,12 +484,11 @@ Status RedisStoreClient::AsyncGetKeys(const std::string &table_name, } return keys; })); - return Status::OK(); } -Status RedisStoreClient::AsyncExists(const std::string &table_name, - const std::string &key, - Postable callback) { +void RedisStoreClient::AsyncExists(const std::string &table_name, + const std::string &key, + Postable callback) { RedisCommand command = { "HEXISTS", RedisKey{external_storage_namespace_, table_name}, {key}}; SendRedisCmdArgsAsKeys( @@ -494,7 +498,21 @@ Status RedisStoreClient::AsyncExists(const std::string &table_name, bool exists = reply->ReadAsInteger() > 0; std::move(callback).Dispatch("RedisStoreClient.AsyncExists", exists); }); - return Status::OK(); +} + +void RedisStoreClient::AsyncCheckHealth(Postable callback) { + auto redis_callback = [callback = std::move(callback)]( + const std::shared_ptr &reply) mutable { + 
Status status = Status::OK(); + if (reply->IsNil()) { + status = Status::IOError("Unexpected connection error."); + } else if (reply->IsError()) { + status = reply->ReadAsStatus(); + } + std::move(callback).Dispatch("RedisStoreClient.AsyncCheckHealth", status); + }; + + primary_context_->RunArgvAsync({"PING"}, redis_callback); } // Returns True if at least 1 key is deleted, False otherwise. @@ -504,11 +522,10 @@ bool RedisDelKeyPrefixSync(const std::string &host, const std::string &password, bool use_ssl, const std::string &external_storage_namespace) { - RedisClientOptions options(host, port, username, password, use_ssl); - auto cli = std::make_unique(options); - instrumented_io_context io_service{/*enable_lag_probe=*/false, /*running_on_single_thread=*/true}; + RedisClientOptions options{host, port, username, password, use_ssl}; + std::shared_ptr context = ConnectRedisContext(io_service, options); auto thread = std::make_unique([&]() { boost::asio::executor_work_guard work( @@ -521,14 +538,10 @@ bool RedisDelKeyPrefixSync(const std::string &host, thread->join(); }); - auto status = cli->Connect(io_service); - RAY_CHECK_OK(status) << "Failed to connect to redis"; - - auto *context = cli->GetPrimaryContext(); // Delete all such keys by using empty table name. 
RedisKey redis_key{external_storage_namespace, /*table_name=*/""}; std::vector cmd{"KEYS", - RedisMatchPattern::Prefix(redis_key.ToString()).escaped}; + RedisMatchPattern::Prefix(redis_key.ToString()).escaped_}; std::promise> promise; context->RunArgvAsync(cmd, [&promise](const std::shared_ptr &reply) { promise.set_value(reply); @@ -540,14 +553,14 @@ bool RedisDelKeyPrefixSync(const std::string &host, << external_storage_namespace; return true; } - auto delete_one_sync = [context](const std::string &key) { + auto delete_one_sync = [&context](const std::string &key) { auto del_cmd = std::vector{"DEL", key}; - std::promise> promise; + std::promise> prom; context->RunArgvAsync(del_cmd, - [&promise](const std::shared_ptr &reply) { - promise.set_value(reply); + [&prom](const std::shared_ptr &callback_reply) { + prom.set_value(callback_reply); }); - auto del_reply = promise.get_future().get(); + auto del_reply = prom.get_future().get(); return del_reply->ReadAsInteger() > 0; }; size_t num_deleted = 0; diff --git a/src/ray/gcs/store_client/redis_store_client.h b/src/ray/gcs/store_client/redis_store_client.h index 39d73d16bfeb..6ba9c8b4ea3b 100644 --- a/src/ray/gcs/store_client/redis_store_client.h +++ b/src/ray/gcs/store_client/redis_store_client.h @@ -22,14 +22,11 @@ #include #include -#include "absl/container/flat_hash_set.h" #include "absl/synchronization/mutex.h" +#include "ray/common/asio/instrumented_io_context.h" #include "ray/common/asio/postable.h" -#include "ray/common/ray_config.h" -#include "ray/gcs/redis_client.h" -#include "ray/gcs/redis_context.h" +#include "ray/gcs/store_client/redis_context.h" #include "ray/gcs/store_client/store_client.h" -#include "src/ray/protobuf/gcs.pb.h" namespace ray { @@ -48,10 +45,10 @@ struct RedisMatchPattern { static const RedisMatchPattern kAny("*"); return kAny; } - const std::string escaped; + const std::string escaped_; private: - explicit RedisMatchPattern(std::string escaped) : escaped(std::move(escaped)) {} + explicit 
RedisMatchPattern(std::string escaped) : escaped_(std::move(escaped)) {} }; struct RedisCommand { @@ -90,7 +87,25 @@ inline std::ostream &operator<<(std::ostream &os, const RedisConcurrencyKey &key return os; } +struct RedisClientOptions { + // Redis server address. + std::string ip; + int port; + + // Redis username and password. + std::string username; + std::string password; + + // Whether to use TLS/SSL for the connection. + bool enable_ssl = false; +}; + // StoreClient using Redis as persistence backend. +// +// The StoreClient does not currently handle any failures (transient or otherwise) of +// the Redis server. A periodic health check runs in the background and it will crash +// the process if the Redis server cannot be reached. +// // Note in redis term a "key" points to a hash table and a "field" is a key, a "value" // is just a value. We double quote "key" and "field" to avoid confusion. // @@ -110,44 +125,54 @@ inline std::ostream &operator<<(std::ostream &os, const RedisConcurrencyKey &key // [1] https://github.com/ray-project/ray/pull/35123#issuecomment-1546549046 class RedisStoreClient : public StoreClient { public: - explicit RedisStoreClient(std::shared_ptr redis_client); - - Status AsyncPut(const std::string &table_name, - const std::string &key, - std::string data, - bool overwrite, - Postable callback) override; - - Status AsyncGet(const std::string &table_name, - const std::string &key, - ToPostable> callback) override; - - Status AsyncGetAll( + /// Connect to Redis. Not thread safe. + /// + /// \param io_service The event loop for this client. Must be single threaded. + /// \param options The options for connecting to Redis. 
+ explicit RedisStoreClient(instrumented_io_context &io_service, + const RedisClientOptions &options); + + void AsyncPut(const std::string &table_name, + const std::string &key, + std::string data, + bool overwrite, + Postable callback) override; + + void AsyncGet(const std::string &table_name, + const std::string &key, + ToPostable> callback) override; + + void AsyncGetAll( const std::string &table_name, Postable)> callback) override; - Status AsyncMultiGet( + void AsyncMultiGet( const std::string &table_name, const std::vector &keys, Postable)> callback) override; - Status AsyncDelete(const std::string &table_name, - const std::string &key, - Postable callback) override; + void AsyncDelete(const std::string &table_name, + const std::string &key, + Postable callback) override; + + void AsyncBatchDelete(const std::string &table_name, + const std::vector &keys, + Postable callback) override; - Status AsyncBatchDelete(const std::string &table_name, - const std::vector &keys, - Postable callback) override; + void AsyncGetNextJobID(Postable callback) override; - Status AsyncGetNextJobID(Postable callback) override; + void AsyncGetKeys(const std::string &table_name, + const std::string &prefix, + Postable)> callback) override; - Status AsyncGetKeys(const std::string &table_name, - const std::string &prefix, - Postable)> callback) override; + void AsyncExists(const std::string &table_name, + const std::string &key, + Postable callback) override; - Status AsyncExists(const std::string &table_name, - const std::string &key, - Postable callback) override; + // Check if Redis is available. + // + // \param callback The callback that will be called with a Status. OK means healthy. + void AsyncCheckHealth(Postable callback); private: /// \class RedisScanner @@ -167,13 +192,13 @@ class RedisStoreClient : public StoreClient { // Don't call this. Use ScanKeysAndValues instead. 
explicit RedisScanner( PrivateCtorTag tag, - std::shared_ptr redis_client, + std::shared_ptr primary_context, RedisKey redis_key, RedisMatchPattern match_pattern, Postable)> callback); static void ScanKeysAndValues( - std::shared_ptr redis_client, + std::shared_ptr primary_context, RedisKey redis_key, RedisMatchPattern match_pattern, Postable)> callback); @@ -185,6 +210,7 @@ class RedisStoreClient : public StoreClient { void Scan(); void OnScanCallback(const std::shared_ptr &reply); + /// The table name that the scanner will scan. RedisKey redis_key_; @@ -204,7 +230,7 @@ class RedisStoreClient : public StoreClient { /// The pending shard scan count. std::atomic pending_request_count_{0}; - std::shared_ptr redis_client_; + std::shared_ptr primary_context_; Postable)> callback_; @@ -232,9 +258,9 @@ class RedisStoreClient : public StoreClient { std::vector> TakeRequestsFromSendingQueue( const std::vector &keys) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); - Status DeleteByKeys(const std::string &table_name, - const std::vector &keys, - Postable callback); + void DeleteByKeys(const std::string &table_name, + const std::vector &keys, + Postable callback); // Send the redis command to the server. This method will make request to be // serialized for each key in keys. At a given time, only one request for a {table_name, @@ -260,8 +286,15 @@ class RedisStoreClient : public StoreClient { const std::vector &keys, Postable)> callback); + instrumented_io_context &io_service_; + + RedisClientOptions options_; + std::string external_storage_namespace_; - std::shared_ptr redis_client_; + + // The following context writes everything to the primary shard. + std::shared_ptr primary_context_; + absl::Mutex mu_; // The pending redis requests queue for each key. 
diff --git a/src/ray/gcs/store_client/store_client.h b/src/ray/gcs/store_client/store_client.h index 882a5201a9ee..f9c78b0057af 100644 --- a/src/ray/gcs/store_client/store_client.h +++ b/src/ray/gcs/store_client/store_client.h @@ -20,9 +20,9 @@ #include "ray/common/asio/io_service_pool.h" #include "ray/common/asio/postable.h" +#include "ray/common/gcs_callbacks.h" #include "ray/common/id.h" #include "ray/common/status.h" -#include "ray/gcs/callback.h" namespace ray { @@ -43,29 +43,26 @@ class StoreClient { /// will be ignored. /// \param callback WARNING: it returns true if and only if A NEW ENTRY is added. /// Overwritten return false. - /// \return Status - virtual Status AsyncPut(const std::string &table_name, - const std::string &key, - std::string data, - bool overwrite, - Postable callback) = 0; + virtual void AsyncPut(const std::string &table_name, + const std::string &key, + std::string data, + bool overwrite, + Postable callback) = 0; /// Get data from the given table asynchronously. /// /// \param table_name The name of the table to be read. /// \param key The key to lookup from the table. /// \param callback returns the value or null. - /// \return Status - virtual Status AsyncGet(const std::string &table_name, - const std::string &key, - ToPostable> callback) = 0; + virtual void AsyncGet(const std::string &table_name, + const std::string &key, + ToPostable> callback) = 0; /// Get all data from the given table asynchronously. /// /// \param table_name The name of the table to be read. /// \param callback returns the key value pairs in a map. - /// \return Status - virtual Status AsyncGetAll( + virtual void AsyncGetAll( const std::string &table_name, Postable)> callback) = 0; @@ -74,8 +71,7 @@ class StoreClient { /// \param table_name The name of the table to be read. /// \param keys The keys to look up from the table. /// \param callback returns the key value pairs in a map for those keys that exist. 
- /// \return Status - virtual Status AsyncMultiGet( + virtual void AsyncMultiGet( const std::string &table_name, const std::vector &keys, Postable)> callback) = 0; @@ -85,45 +81,41 @@ class StoreClient { /// \param table_name The name of the table from which data is to be deleted. /// \param key The key that will be deleted from the table. /// \param callback returns true if an entry with matching key is deleted. - /// \return Status - virtual Status AsyncDelete(const std::string &table_name, - const std::string &key, - Postable callback) = 0; + virtual void AsyncDelete(const std::string &table_name, + const std::string &key, + Postable callback) = 0; /// Batch delete data from the given table asynchronously. /// /// \param table_name The name of the table from which data is to be deleted. /// \param keys The keys that will be deleted from the table. /// \param callback returns the number of deleted entries. - /// \return Status - virtual Status AsyncBatchDelete(const std::string &table_name, - const std::vector &keys, - Postable callback) = 0; + virtual void AsyncBatchDelete(const std::string &table_name, + const std::vector &keys, + Postable callback) = 0; /// Get next job id by `INCR` "JobCounter" key asynchronously. /// /// \param callback returns the next job id in integer representation. - /// \return Status - virtual Status AsyncGetNextJobID(Postable callback) = 0; + virtual void AsyncGetNextJobID(Postable callback) = 0; /// Get all the keys match the prefix from the given table asynchronously. /// /// \param table_name The name of the table to be read. /// \param prefix The prefix to be scaned. /// \param callback returns all matching keys in a vector. - /// \return Status - virtual Status AsyncGetKeys(const std::string &table_name, - const std::string &prefix, - Postable)> callback) = 0; + virtual void AsyncGetKeys(const std::string &table_name, + const std::string &prefix, + Postable)> callback) = 0; /// Check whether the key exists in the table. 
/// /// \param table_name The name of the table to be read. /// \param key The key to be checked. /// \param callback Returns true if such key exists. - virtual Status AsyncExists(const std::string &table_name, - const std::string &key, - Postable callback) = 0; + virtual void AsyncExists(const std::string &table_name, + const std::string &key, + Postable callback) = 0; protected: StoreClient() = default; diff --git a/src/ray/gcs/store_client/test/redis_store_client_test.cc b/src/ray/gcs/store_client/test/redis_store_client_test.cc deleted file mode 100644 index b473b3167b91..000000000000 --- a/src/ray/gcs/store_client/test/redis_store_client_test.cc +++ /dev/null @@ -1,427 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "ray/gcs/store_client/redis_store_client.h" - -#include -#include -#include -#include -#include -#include -#include - -#include "ray/common/test_util.h" -#include "ray/gcs/redis_client.h" -#include "ray/gcs/store_client/test/store_client_test_base.h" -#include "ray/util/path_utils.h" - -using namespace std::chrono_literals; // NOLINT -namespace ray { - -namespace gcs { - -class RedisStoreClientTest : public StoreClientTestBase { - public: - RedisStoreClientTest() { - if (std::getenv("REDIS_CHAOS") != nullptr) { - ::RayConfig::instance().num_redis_request_retries() = 1000; - ::RayConfig::instance().redis_retry_base_ms() = 10; - ::RayConfig::instance().redis_retry_max_ms() = 100; - } - } - - virtual ~RedisStoreClientTest() {} - - static void SetUpTestCase() { TestSetupUtil::StartUpRedisServers(std::vector()); } - - static void TearDownTestCase() { TestSetupUtil::ShutDownRedisServers(); } - - void SetUp() override { - auto port = TEST_REDIS_SERVER_PORTS.front(); - TestSetupUtil::FlushRedisServer(port); - StoreClientTestBase::SetUp(); - if (std::getenv("REDIS_CHAOS") != nullptr) { - t_ = std::make_unique([this, port]() { - while (!stopped_) { - TestSetupUtil::ExecuteRedisCmd(port, {"REPLICAOF", "localhost", "1234"}); - std::this_thread::sleep_for(50ms); - TestSetupUtil::ExecuteRedisCmd(port, {"REPLICAOF", "NO", "ONE"}); - std::this_thread::sleep_for(200ms); - } - }); - } - } - - void TearDown() override { - stopped_ = true; - if (t_) { - t_->join(); - } - StoreClientTestBase::TearDown(); - } - - void InitStoreClient() override { - RedisClientOptions options("127.0.0.1", TEST_REDIS_SERVER_PORTS.front(), "", ""); - redis_client_ = std::make_shared(options); - RAY_CHECK_OK(redis_client_->Connect(*io_service_pool_->Get())); - - store_client_ = std::make_shared(redis_client_); - } - - void DisconnectStoreClient() override { redis_client_->Disconnect(); } - - protected: - std::shared_ptr redis_client_; - std::unique_ptr t_; - std::atomic stopped_ = false; -}; - 
-TEST_F(RedisStoreClientTest, AsyncPutAndAsyncGetTest) { TestAsyncPutAndAsyncGet(); } - -TEST_F(RedisStoreClientTest, AsyncGetAllAndBatchDeleteTest) { - TestAsyncGetAllAndBatchDelete(); -} - -TEST_F(RedisStoreClientTest, BasicSimple) { - // Send 100 times write and then read - auto cnt = std::make_shared>(0); - for (size_t i = 0; i < 100; ++i) { - for (size_t j = 0; j < 20; ++j) { - ++*cnt; - ASSERT_TRUE(store_client_ - ->AsyncPut("T", - absl::StrCat("A", std::to_string(j)), - std::to_string(i), - true, - {[i, cnt](auto r) { - --*cnt; - ASSERT_TRUE((i == 0 && r) || (i != 0 && !r)); - }, - *io_service_pool_->Get()}) - .ok()); - } - } - for (size_t j = 0; j < 20; ++j) { - ++*cnt; - ASSERT_TRUE(store_client_ - ->AsyncGet("T", - absl::StrCat("A", std::to_string(j)), - {[cnt](auto s, auto r) { - --*cnt; - ASSERT_TRUE(r.has_value()); - ASSERT_EQ(*r, "99"); - }, - *io_service_pool_->Get()}) - .ok()); - } - ASSERT_TRUE(WaitForCondition([cnt]() { return *cnt == 0; }, 5000)); -} - -TEST_F(RedisStoreClientTest, Complicated) { - int window = 10; - std::atomic finished{0}; - std::atomic sent{0}; - - for (int i = 0; i < 1000; i += window) { - std::vector keys; - for (int j = i; j < i + window; ++j) { - ++sent; - RAY_LOG(INFO) << "S AsyncPut: " << ("P_" + std::to_string(j)); - ASSERT_TRUE(store_client_ - ->AsyncPut("N", - "P_" + std::to_string(j), - std::to_string(j), - true, - {[&finished, j](auto r) mutable { - RAY_LOG(INFO) - << "F AsyncPut: " << ("P_" + std::to_string(j)); - ++finished; - ASSERT_TRUE(r); - }, - *io_service_pool_->Get()}) - .ok()); - keys.push_back(std::to_string(j)); - } - - std::vector p_keys; - for (auto &key : keys) { - p_keys.push_back("P_" + key); - } - - std::vector n_keys; - for (auto &key : keys) { - n_keys.push_back("N_" + key); - } - - ++sent; - RAY_LOG(INFO) << "S AsyncMultiGet: " << absl::StrJoin(p_keys, ","); - ASSERT_TRUE( - store_client_ - ->AsyncMultiGet( - "N", - p_keys, - {[&finished, i, keys, window, &sent, p_keys, n_keys, this]( - 
absl::flat_hash_map m) mutable -> void { - RAY_LOG(INFO) << "F SendAsyncMultiGet: " << absl::StrJoin(p_keys, ","); - ++finished; - ASSERT_EQ(keys.size(), m.size()); - for (auto &key : keys) { - ASSERT_EQ(m["P_" + key], key); - } - - if ((i / window) % 2 == 0) { - // Delete non exist keys - for (size_t jj = 0; jj < keys.size(); ++jj) { - ++sent; - RAY_LOG(INFO) << "S AsyncDelete: " << n_keys[jj]; - ASSERT_TRUE( - store_client_ - ->AsyncDelete("N", - n_keys[jj], - {[&finished, n_keys, jj](auto b) mutable { - RAY_LOG(INFO) - << "F AsyncDelete: " << n_keys[jj]; - ++finished; - ASSERT_FALSE(b); - }, - *this->io_service_pool_->Get()}) - .ok()); - - ++sent; - RAY_LOG(INFO) << "S AsyncExists: " << p_keys[jj]; - ASSERT_TRUE( - store_client_ - ->AsyncExists("N", - p_keys[jj], - {[&finished, p_keys, jj](auto b) mutable { - RAY_LOG(INFO) - << "F AsyncExists: " << p_keys[jj]; - ++finished; - ASSERT_TRUE(b); - }, - *this->io_service_pool_->Get()}) - .ok()); - } - } else { - ++sent; - RAY_LOG(INFO) - << "S AsyncBatchDelete: " << absl::StrJoin(p_keys, ","); - ASSERT_TRUE(store_client_ - ->AsyncBatchDelete( - "N", - p_keys, - {[&finished, p_keys, keys](auto n) mutable { - RAY_LOG(INFO) << "F AsyncBatchDelete: " - << absl::StrJoin(p_keys, ","); - ++finished; - ASSERT_EQ(n, keys.size()); - }, - *this->io_service_pool_->Get()}) - .ok()); - - for (auto p_key : p_keys) { - ++sent; - RAY_LOG(INFO) << "S AsyncExists: " << p_key; - ASSERT_TRUE(store_client_ - ->AsyncExists("N", - p_key, - {[&finished, p_key](auto b) mutable { - RAY_LOG(INFO) - << "F AsyncExists: " << p_key; - ++finished; - ASSERT_FALSE(false); - }, - *this->io_service_pool_->Get()}) - .ok()); - } - } - }, - *io_service_pool_->Get()}) - .ok()); - } - ASSERT_TRUE(WaitForCondition( - [&finished, &sent]() { - RAY_LOG(INFO) << finished << "/" << sent; - return finished == sent; - }, - 5000)); -} - -TEST_F(RedisStoreClientTest, Random) { - std::map dict; - auto counter = std::make_shared>(0); - auto m_gen_keys = []() { - auto 
num_keys = static_cast(std::rand() % 10); - std::unordered_set keys; - while (keys.size() < num_keys) { - auto k = std::to_string(std::rand() % 1000); - keys.insert(k); - } - return std::vector(keys.begin(), keys.end()); - }; - - auto m_multi_get = [&, counter, this](size_t idx) { - auto keys = m_gen_keys(); - absl::flat_hash_map result; - for (auto key : keys) { - auto iter = dict.find(key); - if (iter != dict.end()) { - result[key] = iter->second; - } - } - RAY_LOG(INFO) << "m_multi_get Sending: " << idx; - *counter += 1; - RAY_CHECK_OK(store_client_->AsyncMultiGet("N", - keys, - {[result, idx, counter](auto m) mutable { - RAY_LOG(INFO) - << "m_multi_get Finished: " << idx - << " " << m.size(); - *counter -= 1; - ASSERT_TRUE(m == result); - }, - *io_service_pool_->Get()})); - }; - - auto m_batch_delete = [&, counter, this](size_t idx) mutable { - auto keys = m_gen_keys(); - size_t deleted_num = 0; - for (auto key : keys) { - deleted_num += dict.erase(key); - } - RAY_LOG(INFO) << "m_batch_delete Sending: " << idx; - *counter += 1; - RAY_CHECK_OK(store_client_->AsyncBatchDelete( - "N", - keys, - {[&counter, deleted_num, idx](auto v) mutable { - RAY_LOG(INFO) << "m_batch_delete Finished: " << idx << " " << v; - *counter -= 1; - ASSERT_EQ(v, deleted_num); - }, - *io_service_pool_->Get()})); - }; - - auto m_delete = [&, this](size_t idx) mutable { - auto k = std::to_string(std::rand() % 1000); - bool deleted = dict.erase(k) > 0; - RAY_LOG(INFO) << "m_delete Sending: " << idx << " " << k; - *counter += 1; - RAY_CHECK_OK(store_client_->AsyncDelete("N", - k, - {[counter, k, idx, deleted](auto r) { - RAY_LOG(INFO) - << "m_delete Finished: " << idx << " " - << k << " " << deleted; - *counter -= 1; - ASSERT_EQ(deleted, r); - }, - *io_service_pool_->Get()})); - }; - - auto m_get = [&, counter, this](size_t idx) { - auto k = std::to_string(std::rand() % 1000); - std::optional v; - if (dict.count(k)) { - v = dict[k]; - } - RAY_LOG(INFO) << "m_get Sending: " << idx; - *counter 
+= 1; - RAY_CHECK_OK(store_client_->AsyncGet("N", - k, - {[counter, idx, v](auto, auto r) { - RAY_LOG(INFO) - << "m_get Finished: " << idx << " " - << (r ? *r : std::string("-")); - *counter -= 1; - ASSERT_EQ(v, r); - }, - *io_service_pool_->Get()})); - }; - - auto m_exists = [&, counter, this](size_t idx) { - auto k = std::to_string(std::rand() % 1000); - bool existed = dict.count(k); - RAY_LOG(INFO) << "m_exists Sending: " << idx; - *counter += 1; - RAY_CHECK_OK(store_client_->AsyncExists( - "N", - k, - {[k, existed, counter, idx](auto r) mutable { - RAY_LOG(INFO) << "m_exists Finished: " << idx << " " << k << " " << r; - *counter -= 1; - ASSERT_EQ(existed, r) << " exists check " << k; - }, - *io_service_pool_->Get()})); - }; - - auto m_puts = [&, counter, this](size_t idx) mutable { - auto k = std::to_string(std::rand() % 1000); - auto v = std::to_string(std::rand() % 1000); - bool added = false; - if (!dict.count(k)) { - added = true; - } - dict[k] = v; - RAY_LOG(INFO) << "m_put Sending: " << idx << " " << k << " " << v; - *counter += 1; - RAY_CHECK_OK(store_client_->AsyncPut("N", - k, - v, - true, - {[idx, added, k, counter](bool r) mutable { - RAY_LOG(INFO) - << "m_put Finished: " - << " " << idx << " " << k << " " << r; - *counter -= 1; - ASSERT_EQ(r, added); - }, - *io_service_pool_->Get()})); - }; - - std::vector> ops{ - m_batch_delete, m_delete, m_get, m_exists, m_multi_get, m_puts}; - - for (size_t i = 0; i < 10000; ++i) { - auto idx = std::rand() % ops.size(); - ops[idx](i); - } - EXPECT_TRUE(WaitForCondition([&counter]() { return *counter == 0; }, 10000)); - auto redis_store_client_raw_ptr = - reinterpret_cast(store_client_.get()); - absl::MutexLock lock(&redis_store_client_raw_ptr->mu_); - ASSERT_TRUE(redis_store_client_raw_ptr->pending_redis_request_by_key_.empty()); -} - -} // namespace gcs - -} // namespace ray - -int main(int argc, char **argv) { - InitShutdownRAII ray_log_shutdown_raii( - ray::RayLog::StartRayLog, - ray::RayLog::ShutDownRayLog, - 
argv[0], - ray::RayLogLevel::INFO, - ray::GetLogFilepathFromDirectory(/*log_dir=*/"", /*app_name=*/argv[0]), - ray::GetErrLogFilepathFromDirectory(/*log_dir=*/"", /*app_name=*/argv[0]), - ray::RayLog::GetRayLogRotationMaxBytesOrDefault(), - ray::RayLog::GetRayLogRotationBackupCountOrDefault()); - ::testing::InitGoogleTest(&argc, argv); - RAY_CHECK(argc == 3); - ray::TEST_REDIS_SERVER_EXEC_PATH = argv[1]; - ray::TEST_REDIS_CLIENT_EXEC_PATH = argv[2]; - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/store_client/test/BUILD.bazel b/src/ray/gcs/store_client/tests/BUILD.bazel similarity index 55% rename from src/ray/gcs/store_client/test/BUILD.bazel rename to src/ray/gcs/store_client/tests/BUILD.bazel index 6ceedbe51516..b6bbb280fed8 100644 --- a/src/ray/gcs/store_client/test/BUILD.bazel +++ b/src/ray/gcs/store_client/tests/BUILD.bazel @@ -4,8 +4,8 @@ ray_cc_library( name = "store_client_test_lib", hdrs = ["store_client_test_base.h"], deps = [ - "//src/ray/common:test_util", - "//src/ray/gcs/store_client:gcs_redis_store_client", + "//src/ray/common:test_utils", + "//src/ray/gcs/store_client", ], ) @@ -24,7 +24,10 @@ ray_cc_test( tags = ["team:core"], deps = [ ":store_client_test_lib", - "//src/ray/gcs/store_client:gcs_redis_store_client", + "//src/ray/gcs/store_client:redis_store_client", + "//src/ray/util:network_util", + "//src/ray/util:path_utils", + "//src/ray/util:raii", "@boost//:optional", "@com_google_googletest//:gtest_main", ], @@ -52,7 +55,10 @@ ray_cc_test( ], deps = [ ":store_client_test_lib", - "//src/ray/gcs/store_client:gcs_redis_store_client", + "//src/ray/gcs/store_client:redis_store_client", + "//src/ray/util:network_util", + "//src/ray/util:path_utils", + "//src/ray/util:raii", "@boost//:optional", "@com_google_googletest//:gtest_main", ], @@ -65,7 +71,7 @@ ray_cc_test( tags = ["team:core"], deps = [ ":store_client_test_lib", - "//src/ray/gcs/store_client:gcs_in_memory_store_client", + "//src/ray/gcs/store_client:in_memory_store_client", 
"@com_google_googletest//:gtest_main", ], ) @@ -77,8 +83,40 @@ ray_cc_test( tags = ["team:core"], deps = [ ":store_client_test_lib", - "//src/ray/gcs/store_client:gcs_in_memory_store_client", - "//src/ray/gcs/store_client:gcs_observable_store_client", + "//src/ray/gcs/store_client:in_memory_store_client", + "//src/ray/gcs/store_client:observable_store_client", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "redis_callback_reply_test", + size = "small", + srcs = ["redis_callback_reply_test.cc"], + tags = ["team:core"], + deps = [ + "//src/ray/gcs/store_client:redis_store_client", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "redis_async_context_test", + size = "small", + srcs = ["redis_async_context_test.cc"], + args = [ + "$(location //:redis-server)", + "$(location //:redis-cli)", + ], + data = [ + "//:redis-cli", + "//:redis-server", + ], + tags = ["team:core"], + deps = [ + "//src/ray/common:test_utils", + "//src/ray/gcs/store_client:redis_store_client", + "//src/ray/util:raii", "@com_google_googletest//:gtest_main", ], ) diff --git a/src/ray/gcs/store_client/test/in_memory_store_client_test.cc b/src/ray/gcs/store_client/tests/in_memory_store_client_test.cc similarity index 91% rename from src/ray/gcs/store_client/test/in_memory_store_client_test.cc rename to src/ray/gcs/store_client/tests/in_memory_store_client_test.cc index c5f8e81ec284..fdd6d24b4673 100644 --- a/src/ray/gcs/store_client/test/in_memory_store_client_test.cc +++ b/src/ray/gcs/store_client/tests/in_memory_store_client_test.cc @@ -16,7 +16,7 @@ #include -#include "ray/gcs/store_client/test/store_client_test_base.h" +#include "ray/gcs/store_client/tests/store_client_test_base.h" namespace ray { @@ -27,8 +27,6 @@ class InMemoryStoreClientTest : public StoreClientTestBase { void InitStoreClient() override { store_client_ = std::make_shared(); } - - void DisconnectStoreClient() override {} }; TEST_F(InMemoryStoreClientTest, 
AsyncPutAndAsyncGetTest) { TestAsyncPutAndAsyncGet(); } diff --git a/src/ray/gcs/store_client/test/observable_store_client_test.cc b/src/ray/gcs/store_client/tests/observable_store_client_test.cc similarity index 92% rename from src/ray/gcs/store_client/test/observable_store_client_test.cc rename to src/ray/gcs/store_client/tests/observable_store_client_test.cc index 17a1f751aeb9..013601fd8741 100644 --- a/src/ray/gcs/store_client/test/observable_store_client_test.cc +++ b/src/ray/gcs/store_client/tests/observable_store_client_test.cc @@ -17,7 +17,7 @@ #include #include "ray/gcs/store_client/in_memory_store_client.h" -#include "ray/gcs/store_client/test/store_client_test_base.h" +#include "ray/gcs/store_client/tests/store_client_test_base.h" namespace ray { @@ -29,8 +29,6 @@ class ObservableStoreClientTest : public StoreClientTestBase { store_client_ = std::make_shared(std::make_unique()); } - - void DisconnectStoreClient() override {} }; TEST_F(ObservableStoreClientTest, AsyncPutAndAsyncGetTest) { TestAsyncPutAndAsyncGet(); } diff --git a/src/ray/gcs/test/redis_async_context_test.cc b/src/ray/gcs/store_client/tests/redis_async_context_test.cc similarity index 95% rename from src/ray/gcs/test/redis_async_context_test.cc rename to src/ray/gcs/store_client/tests/redis_async_context_test.cc index 6309e867995a..605ded810aa9 100644 --- a/src/ray/gcs/test/redis_async_context_test.cc +++ b/src/ray/gcs/store_client/tests/redis_async_context_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/redis_async_context.h" +#include "ray/gcs/store_client/redis_async_context.h" #include #include @@ -20,10 +20,11 @@ #include "gtest/gtest.h" #include "ray/common/asio/instrumented_io_context.h" -#include "ray/common/test_util.h" -#include "ray/gcs/redis_context.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/store_client/redis_context.h" #include "ray/util/logging.h" #include "ray/util/path_utils.h" +#include "ray/util/raii.h" extern "C" { #include "hiredis/async.h" diff --git a/src/ray/gcs/test/callback_reply_test.cc b/src/ray/gcs/store_client/tests/redis_callback_reply_test.cc similarity index 98% rename from src/ray/gcs/test/callback_reply_test.cc rename to src/ray/gcs/store_client/tests/redis_callback_reply_test.cc index c6221e42d2ec..ad8e5b3ee48c 100644 --- a/src/ray/gcs/test/callback_reply_test.cc +++ b/src/ray/gcs/store_client/tests/redis_callback_reply_test.cc @@ -16,7 +16,7 @@ #include #include "gtest/gtest.h" -#include "ray/gcs/redis_context.h" +#include "ray/gcs/store_client/redis_context.h" extern "C" { #include "hiredis/hiredis.h" diff --git a/src/ray/gcs/store_client/tests/redis_store_client_test.cc b/src/ray/gcs/store_client/tests/redis_store_client_test.cc new file mode 100644 index 000000000000..454a2a7af2dc --- /dev/null +++ b/src/ray/gcs/store_client/tests/redis_store_client_test.cc @@ -0,0 +1,397 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/gcs/store_client/redis_store_client.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "ray/common/test_utils.h" +#include "ray/gcs/store_client/tests/store_client_test_base.h" +#include "ray/util/network_util.h" +#include "ray/util/path_utils.h" +#include "ray/util/raii.h" + +using namespace std::chrono_literals; // NOLINT +namespace ray { + +namespace gcs { + +class RedisStoreClientTest : public StoreClientTestBase { + public: + RedisStoreClientTest() { + if (std::getenv("REDIS_CHAOS") != nullptr) { + ::RayConfig::instance().num_redis_request_retries() = 1000; + ::RayConfig::instance().redis_retry_base_ms() = 10; + ::RayConfig::instance().redis_retry_max_ms() = 100; + } + } + + virtual ~RedisStoreClientTest() {} + + static void SetUpTestCase() { TestSetupUtil::StartUpRedisServers(std::vector()); } + + static void TearDownTestCase() { TestSetupUtil::ShutDownRedisServers(); } + + void SetUp() override { + auto port = TEST_REDIS_SERVER_PORTS.front(); + TestSetupUtil::FlushRedisServer(port); + StoreClientTestBase::SetUp(); + if (std::getenv("REDIS_CHAOS") != nullptr) { + t_ = std::make_unique([this, port]() { + while (!stopped_) { + TestSetupUtil::ExecuteRedisCmd(port, {"REPLICAOF", "localhost", "1234"}); + std::this_thread::sleep_for(50ms); + TestSetupUtil::ExecuteRedisCmd(port, {"REPLICAOF", "NO", "ONE"}); + std::this_thread::sleep_for(200ms); + } + }); + } + } + + void TearDown() override { + stopped_ = true; + if (t_) { + t_->join(); + } + StoreClientTestBase::TearDown(); + } + + void InitStoreClient() override { + auto &io_context = *io_service_pool_->Get(); + RedisClientOptions options{"127.0.0.1", TEST_REDIS_SERVER_PORTS.front()}; + store_client_ = std::make_shared(io_context, options); + } + + protected: + std::unique_ptr t_; + std::atomic stopped_ = false; +}; + +TEST_F(RedisStoreClientTest, AsyncPutAndAsyncGetTest) { TestAsyncPutAndAsyncGet(); } + +TEST_F(RedisStoreClientTest, 
AsyncGetAllAndBatchDeleteTest) { + TestAsyncGetAllAndBatchDelete(); +} + +TEST_F(RedisStoreClientTest, BasicSimple) { + // Send 100 times write and then read + auto cnt = std::make_shared>(0); + for (size_t i = 0; i < 100; ++i) { + for (size_t j = 0; j < 20; ++j) { + ++*cnt; + store_client_->AsyncPut("T", + absl::StrCat("A", std::to_string(j)), + std::to_string(i), + true, + {[i, cnt](auto r) { + --*cnt; + ASSERT_TRUE((i == 0 && r) || (i != 0 && !r)); + }, + *io_service_pool_->Get()}); + } + } + for (size_t j = 0; j < 20; ++j) { + ++*cnt; + store_client_->AsyncGet("T", + absl::StrCat("A", std::to_string(j)), + {[cnt](auto s, auto r) { + --*cnt; + ASSERT_TRUE(r.has_value()); + ASSERT_EQ(*r, "99"); + }, + *io_service_pool_->Get()}); + } + ASSERT_TRUE(WaitForCondition([cnt]() { return *cnt == 0; }, 5000)); +} + +TEST_F(RedisStoreClientTest, Complicated) { + int window = 10; + std::atomic finished{0}; + std::atomic sent{0}; + + for (int i = 0; i < 1000; i += window) { + std::vector keys; + for (int j = i; j < i + window; ++j) { + ++sent; + RAY_LOG(INFO) << "S AsyncPut: " << ("P_" + std::to_string(j)); + store_client_->AsyncPut("N", + "P_" + std::to_string(j), + std::to_string(j), + true, + {[&finished, j](auto r) mutable { + RAY_LOG(INFO) + << "F AsyncPut: " << ("P_" + std::to_string(j)); + ++finished; + ASSERT_TRUE(r); + }, + *io_service_pool_->Get()}); + keys.push_back(std::to_string(j)); + } + + std::vector p_keys; + for (auto &key : keys) { + p_keys.push_back("P_" + key); + } + + std::vector n_keys; + for (auto &key : keys) { + n_keys.push_back("N_" + key); + } + + ++sent; + RAY_LOG(INFO) << "S AsyncMultiGet: " << absl::StrJoin(p_keys, ","); + store_client_->AsyncMultiGet( + "N", + p_keys, + {[&finished, i, keys, window, &sent, p_keys, n_keys, this]( + absl::flat_hash_map m) mutable -> void { + RAY_LOG(INFO) << "F SendAsyncMultiGet: " << absl::StrJoin(p_keys, ","); + ++finished; + ASSERT_EQ(keys.size(), m.size()); + for (auto &key : keys) { + ASSERT_EQ(m["P_" + 
key], key); + } + + if ((i / window) % 2 == 0) { + // Delete non exist keys + for (size_t jj = 0; jj < keys.size(); ++jj) { + ++sent; + RAY_LOG(INFO) << "S AsyncDelete: " << n_keys[jj]; + store_client_->AsyncDelete("N", + n_keys[jj], + {[&finished, n_keys, jj](auto b) mutable { + RAY_LOG(INFO) + << "F AsyncDelete: " << n_keys[jj]; + ++finished; + ASSERT_FALSE(b); + }, + *this->io_service_pool_->Get()}); + + ++sent; + RAY_LOG(INFO) << "S AsyncExists: " << p_keys[jj]; + store_client_->AsyncExists("N", + p_keys[jj], + {[&finished, p_keys, jj](auto b) mutable { + RAY_LOG(INFO) + << "F AsyncExists: " << p_keys[jj]; + ++finished; + ASSERT_TRUE(b); + }, + *this->io_service_pool_->Get()}); + } + } else { + ++sent; + RAY_LOG(INFO) << "S AsyncBatchDelete: " << absl::StrJoin(p_keys, ","); + store_client_->AsyncBatchDelete( + "N", + p_keys, + {[&finished, p_keys, keys](auto n) mutable { + RAY_LOG(INFO) << "F AsyncBatchDelete: " << absl::StrJoin(p_keys, ","); + ++finished; + ASSERT_EQ(n, keys.size()); + }, + *this->io_service_pool_->Get()}); + + for (auto p_key : p_keys) { + ++sent; + RAY_LOG(INFO) << "S AsyncExists: " << p_key; + store_client_->AsyncExists("N", + p_key, + {[&finished, p_key](auto b) mutable { + RAY_LOG(INFO) << "F AsyncExists: " << p_key; + ++finished; + ASSERT_FALSE(false); + }, + *this->io_service_pool_->Get()}); + } + } + }, + *io_service_pool_->Get()}); + } + ASSERT_TRUE(WaitForCondition( + [&finished, &sent]() { + RAY_LOG(INFO) << finished << "/" << sent; + return finished == sent; + }, + 5000)); +} + +TEST_F(RedisStoreClientTest, Random) { + std::map dict; + auto counter = std::make_shared>(0); + auto m_gen_keys = []() { + auto num_keys = static_cast(std::rand() % 10); + std::unordered_set keys; + while (keys.size() < num_keys) { + auto k = std::to_string(std::rand() % 1000); + keys.insert(k); + } + return std::vector(keys.begin(), keys.end()); + }; + + auto m_multi_get = [&, counter, this](size_t idx) { + auto keys = m_gen_keys(); + absl::flat_hash_map 
result; + for (auto key : keys) { + auto iter = dict.find(key); + if (iter != dict.end()) { + result[key] = iter->second; + } + } + RAY_LOG(INFO) << "m_multi_get Sending: " << idx; + *counter += 1; + store_client_->AsyncMultiGet("N", + keys, + {[result, idx, counter](auto m) mutable { + RAY_LOG(INFO) << "m_multi_get Finished: " << idx + << " " << m.size(); + *counter -= 1; + ASSERT_TRUE(m == result); + }, + *io_service_pool_->Get()}); + }; + + auto m_batch_delete = [&, counter, this](size_t idx) mutable { + auto keys = m_gen_keys(); + size_t deleted_num = 0; + for (auto key : keys) { + deleted_num += dict.erase(key); + } + RAY_LOG(INFO) << "m_batch_delete Sending: " << idx; + *counter += 1; + store_client_->AsyncBatchDelete("N", + keys, + {[&counter, deleted_num, idx](auto v) mutable { + RAY_LOG(INFO) << "m_batch_delete Finished: " << idx + << " " << v; + *counter -= 1; + ASSERT_EQ(v, deleted_num); + }, + *io_service_pool_->Get()}); + }; + + auto m_delete = [&, this](size_t idx) mutable { + auto k = std::to_string(std::rand() % 1000); + bool deleted = dict.erase(k) > 0; + RAY_LOG(INFO) << "m_delete Sending: " << idx << " " << k; + *counter += 1; + store_client_->AsyncDelete("N", + k, + {[counter, k, idx, deleted](auto r) { + RAY_LOG(INFO) << "m_delete Finished: " << idx << " " + << k << " " << deleted; + *counter -= 1; + ASSERT_EQ(deleted, r); + }, + *io_service_pool_->Get()}); + }; + + auto m_get = [&, counter, this](size_t idx) { + auto k = std::to_string(std::rand() % 1000); + std::optional v; + if (dict.count(k)) { + v = dict[k]; + } + RAY_LOG(INFO) << "m_get Sending: " << idx; + *counter += 1; + store_client_->AsyncGet("N", + k, + {[counter, idx, v](auto, auto r) { + RAY_LOG(INFO) << "m_get Finished: " << idx << " " + << (r ? 
*r : std::string("-")); + *counter -= 1; + ASSERT_EQ(v, r); + }, + *io_service_pool_->Get()}); + }; + + auto m_exists = [&, counter, this](size_t idx) { + auto k = std::to_string(std::rand() % 1000); + bool existed = dict.count(k); + RAY_LOG(INFO) << "m_exists Sending: " << idx; + *counter += 1; + store_client_->AsyncExists("N", + k, + {[k, existed, counter, idx](auto r) mutable { + RAY_LOG(INFO) << "m_exists Finished: " << idx << " " + << k << " " << r; + *counter -= 1; + ASSERT_EQ(existed, r) << " exists check " << k; + }, + *io_service_pool_->Get()}); + }; + + auto m_puts = [&, counter, this](size_t idx) mutable { + auto k = std::to_string(std::rand() % 1000); + auto v = std::to_string(std::rand() % 1000); + bool added = false; + if (!dict.count(k)) { + added = true; + } + dict[k] = v; + RAY_LOG(INFO) << "m_put Sending: " << idx << " " << k << " " << v; + *counter += 1; + store_client_->AsyncPut("N", + k, + v, + true, + {[idx, added, k, counter](bool r) mutable { + RAY_LOG(INFO) + << "m_put Finished: " << idx << " " << k << " " << r; + *counter -= 1; + ASSERT_EQ(r, added); + }, + *io_service_pool_->Get()}); + }; + + std::vector> ops{ + m_batch_delete, m_delete, m_get, m_exists, m_multi_get, m_puts}; + + for (size_t i = 0; i < 10000; ++i) { + auto idx = std::rand() % ops.size(); + ops[idx](i); + } + EXPECT_TRUE(WaitForCondition([&counter]() { return *counter == 0; }, 10000)); + auto redis_store_client_raw_ptr = + reinterpret_cast(store_client_.get()); + absl::MutexLock lock(&redis_store_client_raw_ptr->mu_); + ASSERT_TRUE(redis_store_client_raw_ptr->pending_redis_request_by_key_.empty()); +} + +} // namespace gcs + +} // namespace ray + +int main(int argc, char **argv) { + InitShutdownRAII ray_log_shutdown_raii( + ray::RayLog::StartRayLog, + ray::RayLog::ShutDownRayLog, + argv[0], + ray::RayLogLevel::INFO, + ray::GetLogFilepathFromDirectory(/*log_dir=*/"", /*app_name=*/argv[0]), + ray::GetErrLogFilepathFromDirectory(/*log_dir=*/"", /*app_name=*/argv[0]), + 
ray::RayLog::GetRayLogRotationMaxBytesOrDefault(), + ray::RayLog::GetRayLogRotationBackupCountOrDefault()); + ::testing::InitGoogleTest(&argc, argv); + RAY_CHECK(argc == 3); + ray::TEST_REDIS_SERVER_EXEC_PATH = argv[1]; + ray::TEST_REDIS_CLIENT_EXEC_PATH = argv[2]; + return RUN_ALL_TESTS(); +} diff --git a/src/ray/gcs/store_client/test/store_client_test_base.h b/src/ray/gcs/store_client/tests/store_client_test_base.h similarity index 87% rename from src/ray/gcs/store_client/test/store_client_test_base.h rename to src/ray/gcs/store_client/tests/store_client_test_base.h index f115ff0292ec..558443e63450 100644 --- a/src/ray/gcs/store_client/test/store_client_test_base.h +++ b/src/ray/gcs/store_client/tests/store_client_test_base.h @@ -26,9 +26,10 @@ #include "absl/container/flat_hash_map.h" #include "ray/common/asio/io_service_pool.h" #include "ray/common/id.h" -#include "ray/common/test_util.h" +#include "ray/common/test_utils.h" #include "ray/gcs/store_client/store_client.h" #include "ray/util/logging.h" +#include "src/ray/protobuf/gcs.pb.h" namespace ray { @@ -48,8 +49,6 @@ class StoreClientTestBase : public ::testing::Test { } void TearDown() override { - DisconnectStoreClient(); - io_service_pool_->Stop(); key_to_value_.clear(); @@ -57,18 +56,16 @@ class StoreClientTestBase : public ::testing::Test { virtual void InitStoreClient() = 0; - virtual void DisconnectStoreClient() = 0; - protected: void Put() { auto put_callback = [this](auto) { --pending_count_; }; for (const auto &[key, value] : key_to_value_) { ++pending_count_; - RAY_CHECK_OK(store_client_->AsyncPut(table_name_, - key.Hex(), - value.SerializeAsString(), - true, - {put_callback, *io_service_pool_->Get()})); + store_client_->AsyncPut(table_name_, + key.Hex(), + value.SerializeAsString(), + true, + {put_callback, *io_service_pool_->Get()}); } WaitPendingDone(); } @@ -77,8 +74,8 @@ class StoreClientTestBase : public ::testing::Test { auto delete_callback = [this](auto) { --pending_count_; }; for (const 
auto &[key, _] : key_to_value_) { ++pending_count_; - RAY_CHECK_OK(store_client_->AsyncDelete( - table_name_, key.Hex(), {delete_callback, *io_service_pool_->Get()})); + store_client_->AsyncDelete( + table_name_, key.Hex(), {delete_callback, *io_service_pool_->Get()}); } WaitPendingDone(); } @@ -97,8 +94,8 @@ class StoreClientTestBase : public ::testing::Test { }; for (const auto &[key, _] : key_to_value_) { ++pending_count_; - RAY_CHECK_OK(store_client_->AsyncGet( - table_name_, key.Hex(), {get_callback, *io_service_pool_->Get()})); + store_client_->AsyncGet( + table_name_, key.Hex(), {get_callback, *io_service_pool_->Get()}); } WaitPendingDone(); } @@ -114,8 +111,7 @@ class StoreClientTestBase : public ::testing::Test { }; ++pending_count_; - RAY_CHECK_OK(store_client_->AsyncGet( - table_name_, key, {get_callback, *io_service_pool_->Get()})); + store_client_->AsyncGet(table_name_, key, {get_callback, *io_service_pool_->Get()}); } WaitPendingDone(); } @@ -139,8 +135,7 @@ class StoreClientTestBase : public ::testing::Test { }; pending_count_ += key_to_value_.size(); - RAY_CHECK_OK(store_client_->AsyncGetAll( - table_name_, {get_all_callback, *io_service_pool_->Get()})); + store_client_->AsyncGetAll(table_name_, {get_all_callback, *io_service_pool_->Get()}); WaitPendingDone(); } @@ -166,8 +161,8 @@ class StoreClientTestBase : public ::testing::Test { pending_count_ += result_set.size(); - RAY_CHECK_OK(store_client_->AsyncGetKeys( - table_name_, prefix, {get_keys_callback, *io_service_pool_->Get()})); + store_client_->AsyncGetKeys( + table_name_, prefix, {get_keys_callback, *io_service_pool_->Get()}); WaitPendingDone(); } } @@ -180,8 +175,8 @@ class StoreClientTestBase : public ::testing::Test { pending_count_ += key_to_value_.size(); for (const auto &item : key_to_value_) { - RAY_CHECK_OK(store_client_->AsyncExists( - table_name_, item.first.Hex(), {exists_callback, *io_service_pool_->Get()})); + store_client_->AsyncExists( + table_name_, item.first.Hex(), 
{exists_callback, *io_service_pool_->Get()}); } WaitPendingDone(); } @@ -193,8 +188,8 @@ class StoreClientTestBase : public ::testing::Test { for (auto &[key, _] : key_to_value_) { keys.push_back(key.Hex()); } - RAY_CHECK_OK(store_client_->AsyncBatchDelete( - table_name_, keys, {delete_callback, *io_service_pool_->Get()})); + store_client_->AsyncBatchDelete( + table_name_, keys, {delete_callback, *io_service_pool_->Get()}); WaitPendingDone(); } diff --git a/src/ray/gcs/gcs_server/store_client_kv.cc b/src/ray/gcs/store_client_kv.cc similarity index 77% rename from src/ray/gcs/gcs_server/store_client_kv.cc rename to src/ray/gcs/store_client_kv.cc index 6c1fc739073d..31297c49536e 100644 --- a/src/ray/gcs/gcs_server/store_client_kv.cc +++ b/src/ray/gcs/store_client_kv.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_server/store_client_kv.h" +#include "ray/gcs/store_client_kv.h" #include #include @@ -56,7 +56,7 @@ StoreClientInternalKV::StoreClientInternalKV(std::unique_ptr store_ void StoreClientInternalKV::Get(const std::string &ns, const std::string &key, Postable)> callback) { - RAY_CHECK_OK(delegate_->AsyncGet( + delegate_->AsyncGet( table_name_, MakeKey(ns, key), std::move(callback).TransformArg( @@ -64,7 +64,7 @@ void StoreClientInternalKV::Get(const std::string &ns, std::optional result) -> std::optional { RAY_CHECK(status.ok()) << "Fails to get key from storage " << status; return result; - }))); + })); } void StoreClientInternalKV::MultiGet( @@ -76,20 +76,18 @@ void StoreClientInternalKV::MultiGet( for (const auto &key : keys) { prefixed_keys.emplace_back(MakeKey(ns, key)); } - RAY_CHECK_OK(delegate_->AsyncMultiGet( + delegate_->AsyncMultiGet( table_name_, prefixed_keys, - std::move(callback).TransformArg // < - // absl::flat_hash_map( - // absl::flat_hash_map)> - ([](absl::flat_hash_map before_extract) { - absl::flat_hash_map ret; - 
ret.reserve(before_extract.size()); - for (auto &&item : std::move(before_extract)) { - ret.emplace(ExtractKey(item.first), std::move(item.second)); - } - return ret; - }))); + std::move(callback).TransformArg( + [](absl::flat_hash_map before_extract) { + absl::flat_hash_map ret; + ret.reserve(before_extract.size()); + for (auto &&item : std::move(before_extract)) { + ret.emplace(ExtractKey(item.first), std::move(item.second)); + } + return ret; + })); } void StoreClientInternalKV::Put(const std::string &ns, @@ -97,8 +95,8 @@ void StoreClientInternalKV::Put(const std::string &ns, std::string value, bool overwrite, Postable callback) { - RAY_CHECK_OK(delegate_->AsyncPut( - table_name_, MakeKey(ns, key), std::move(value), overwrite, std::move(callback))); + delegate_->AsyncPut( + table_name_, MakeKey(ns, key), std::move(value), overwrite, std::move(callback)); } void StoreClientInternalKV::Del(const std::string &ns, @@ -106,17 +104,16 @@ void StoreClientInternalKV::Del(const std::string &ns, bool del_by_prefix, Postable callback) { if (!del_by_prefix) { - RAY_CHECK_OK(delegate_->AsyncDelete( - table_name_, - MakeKey(ns, key), - std::move(callback).TransformArg( - [](bool deleted) -> int64_t { return deleted ? 1 : 0; }))); + delegate_->AsyncDelete(table_name_, + MakeKey(ns, key), + std::move(callback).TransformArg( + [](bool deleted) -> int64_t { return deleted ? 
1 : 0; })); return; } instrumented_io_context &io_context = callback.io_context(); - RAY_CHECK_OK(delegate_->AsyncGetKeys( + delegate_->AsyncGetKeys( table_name_, MakeKey(ns, key), {[this, ns, callback = std::move(callback)](auto keys) mutable { @@ -124,23 +121,21 @@ void StoreClientInternalKV::Del(const std::string &ns, std::move(callback).Dispatch("StoreClientInternalKV.Del", 0); return; } - RAY_CHECK_OK( - delegate_->AsyncBatchDelete(table_name_, keys, std::move(callback))); + delegate_->AsyncBatchDelete(table_name_, keys, std::move(callback)); }, - io_context})); + io_context}); } void StoreClientInternalKV::Exists(const std::string &ns, const std::string &key, Postable callback) { - RAY_CHECK_OK( - delegate_->AsyncExists(table_name_, MakeKey(ns, key), std::move(callback))); + delegate_->AsyncExists(table_name_, MakeKey(ns, key), std::move(callback)); } void StoreClientInternalKV::Keys(const std::string &ns, const std::string &prefix, Postable)> callback) { - RAY_CHECK_OK(delegate_->AsyncGetKeys( + delegate_->AsyncGetKeys( table_name_, MakeKey(ns, prefix), std::move(callback).TransformArg([](std::vector keys) { @@ -150,7 +145,7 @@ void StoreClientInternalKV::Keys(const std::string &ns, true_keys.emplace_back(ExtractKey(key)); } return true_keys; - }))); + })); } } // namespace gcs diff --git a/src/ray/gcs/gcs_server/store_client_kv.h b/src/ray/gcs/store_client_kv.h similarity index 97% rename from src/ray/gcs/gcs_server/store_client_kv.h rename to src/ray/gcs/store_client_kv.h index 9d122b85184e..295ad387a8e6 100644 --- a/src/ray/gcs/gcs_server/store_client_kv.h +++ b/src/ray/gcs/store_client_kv.h @@ -20,7 +20,7 @@ #include #include "ray/common/asio/postable.h" -#include "ray/gcs/gcs_server/gcs_kv_manager.h" +#include "ray/gcs/gcs_kv_manager.h" #include "ray/gcs/store_client/store_client.h" namespace ray { diff --git a/src/ray/gcs/test/BUILD.bazel b/src/ray/gcs/test/BUILD.bazel deleted file mode 100644 index c89258455258..000000000000 --- 
a/src/ray/gcs/test/BUILD.bazel +++ /dev/null @@ -1,46 +0,0 @@ -load("//bazel:ray.bzl", "ray_cc_library", "ray_cc_test") - -ray_cc_library( - name = "gcs_test_util_lib", - hdrs = [ - "gcs_test_util.h", - ], - deps = [ - "//src/ray/common:test_util", - "//src/ray/gcs:gcs_pb_util", - "//src/ray/protobuf:autoscaler_cc_grpc", - "//src/ray/protobuf:gcs_service_cc_grpc", - ], -) - -ray_cc_test( - name = "callback_reply_test", - size = "small", - srcs = ["callback_reply_test.cc"], - tags = ["team:core"], - deps = [ - "//src/ray/gcs:gcs_redis_client", - "@com_google_googletest//:gtest_main", - ], -) - -ray_cc_test( - name = "redis_async_context_test", - size = "small", - srcs = ["redis_async_context_test.cc"], - args = [ - "$(location //:redis-server)", - "$(location //:redis-cli)", - ], - data = [ - "//:redis-cli", - "//:redis-server", - ], - tags = ["team:core"], - deps = [ - "//src/ray/common:test_util", - "//src/ray/gcs:gcs_redis_client", - "//src/ray/util", - "@com_google_googletest//:gtest_main", - ], -) diff --git a/src/ray/gcs/test/gcs_test_util.h b/src/ray/gcs/test/gcs_test_util.h deleted file mode 100644 index 56379b5a1154..000000000000 --- a/src/ray/gcs/test/gcs_test_util.h +++ /dev/null @@ -1,451 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "gmock/gmock.h" -#include "ray/common/asio/instrumented_io_context.h" -#include "ray/common/bundle_spec.h" -#include "ray/common/placement_group.h" -#include "ray/common/task/task.h" -#include "ray/common/task/task_util.h" -#include "ray/common/test_util.h" -#include "ray/gcs/pb_util.h" -#include "src/ray/protobuf/autoscaler.grpc.pb.h" -#include "src/ray/protobuf/gcs_service.grpc.pb.h" - -namespace ray { - -struct Mocker { - static TaskSpecification GenActorCreationTask( - const JobID &job_id, - int max_restarts, - bool detached, - const std::string &name, - const std::string &ray_namespace, - const rpc::Address &owner_address, - std::unordered_map required_resources = - std::unordered_map(), - std::unordered_map required_placement_resources = - std::unordered_map()) { - TaskSpecBuilder builder; - static rpc::JobConfig kJobConfig; - auto actor_id = ActorID::Of(job_id, RandomTaskId(), 0); - auto task_id = TaskID::ForActorCreationTask(actor_id); - FunctionDescriptor function_descriptor; - function_descriptor = FunctionDescriptorBuilder::BuildPython("", "", "", ""); - builder.SetCommonTaskSpec(task_id, - name + ":" + function_descriptor->CallString(), - Language::PYTHON, - function_descriptor, - job_id, - kJobConfig, - TaskID::Nil(), - 0, - TaskID::Nil(), - owner_address, - 1, - false, - false, - -1, - required_resources, - required_placement_resources, - "", - 0, - TaskID::Nil(), - ""); - rpc::SchedulingStrategy scheduling_strategy; - scheduling_strategy.mutable_default_scheduling_strategy(); - builder.SetActorCreationTaskSpec(actor_id, - {}, - scheduling_strategy, - max_restarts, - /*max_task_retries=*/0, - {}, - 1, - detached, - name, - ray_namespace); - return std::move(builder).ConsumeAndBuild(); - } - - static rpc::CreateActorRequest GenCreateActorRequest( - const JobID &job_id, - int max_restarts = 0, - bool detached = false, - const std::string &name = "", - 
const std::string &ray_namespace = "") { - rpc::Address owner_address; - owner_address.set_raylet_id(NodeID::FromRandom().Binary()); - owner_address.set_ip_address("1234"); - owner_address.set_port(5678); - owner_address.set_worker_id(WorkerID::FromRandom().Binary()); - auto actor_creation_task_spec = GenActorCreationTask( - job_id, max_restarts, detached, name, ray_namespace, owner_address); - rpc::CreateActorRequest request; - request.mutable_task_spec()->CopyFrom(actor_creation_task_spec.GetMessage()); - return request; - } - - static rpc::RegisterActorRequest GenRegisterActorRequest( - const JobID &job_id, - int max_restarts = 0, - bool detached = false, - const std::string &name = "", - const std::string &ray_namespace = "test") { - rpc::Address owner_address; - owner_address.set_raylet_id(NodeID::FromRandom().Binary()); - owner_address.set_ip_address("1234"); - owner_address.set_port(5678); - owner_address.set_worker_id(WorkerID::FromRandom().Binary()); - auto actor_creation_task_spec = GenActorCreationTask( - job_id, max_restarts, detached, name, ray_namespace, owner_address); - rpc::RegisterActorRequest request; - request.mutable_task_spec()->CopyFrom(actor_creation_task_spec.GetMessage()); - return request; - } - - static std::vector> GenBundleSpecifications( - const PlacementGroupID &placement_group_id, - absl::flat_hash_map &unit_resource, - int bundles_size = 1) { - std::vector> bundle_specs; - for (int i = 0; i < bundles_size; i++) { - rpc::Bundle bundle; - auto mutable_bundle_id = bundle.mutable_bundle_id(); - // The bundle index is start from 1. 
- mutable_bundle_id->set_bundle_index(i + 1); - mutable_bundle_id->set_placement_group_id(placement_group_id.Binary()); - auto mutable_unit_resources = bundle.mutable_unit_resources(); - for (auto &resource : unit_resource) { - mutable_unit_resources->insert({resource.first, resource.second}); - } - bundle_specs.emplace_back(std::make_shared(bundle)); - } - return bundle_specs; - } - - // TODO(@clay4444): Remove this once we did the batch rpc request refactor. - static BundleSpecification GenBundleCreation( - const PlacementGroupID &placement_group_id, - const int bundle_index, - absl::flat_hash_map &unit_resource) { - rpc::Bundle bundle; - auto mutable_bundle_id = bundle.mutable_bundle_id(); - mutable_bundle_id->set_bundle_index(bundle_index); - mutable_bundle_id->set_placement_group_id(placement_group_id.Binary()); - auto mutable_unit_resources = bundle.mutable_unit_resources(); - for (auto &resource : unit_resource) { - mutable_unit_resources->insert({resource.first, resource.second}); - } - return BundleSpecification(bundle); - } - - static PlacementGroupSpecification GenPlacementGroupCreation( - const std::string &name, - std::vector> &bundles, - rpc::PlacementStrategy strategy, - const JobID &job_id, - const ActorID &actor_id) { - PlacementGroupSpecBuilder builder; - - auto placement_group_id = PlacementGroupID::Of(job_id); - builder.SetPlacementGroupSpec(placement_group_id, - name, - bundles, - strategy, - /* is_detached */ false, - /* max_cpu_fraction_per_node */ 1.0, - /* soft_target_node_id */ NodeID::Nil(), - job_id, - actor_id, - /* is_creator_detached */ false); - return builder.Build(); - } - - static rpc::CreatePlacementGroupRequest GenCreatePlacementGroupRequest( - const std::string name = "", - rpc::PlacementStrategy strategy = rpc::PlacementStrategy::SPREAD, - int bundles_count = 2, - double cpu_num = 1.0, - const JobID job_id = JobID::FromInt(1), - const ActorID &actor_id = ActorID::Nil()) { - rpc::CreatePlacementGroupRequest request; - 
std::vector> bundles; - std::unordered_map bundle; - bundle["CPU"] = cpu_num; - for (int index = 0; index < bundles_count; ++index) { - bundles.push_back(bundle); - } - auto placement_group_creation_spec = - GenPlacementGroupCreation(name, bundles, strategy, job_id, actor_id); - request.mutable_placement_group_spec()->CopyFrom( - placement_group_creation_spec.GetMessage()); - return request; - } - static std::shared_ptr GenNodeInfo( - uint16_t port = 0, - const std::string address = "127.0.0.1", - const std::string node_name = "Mocker_node") { - auto node = std::make_shared(); - node->set_node_id(NodeID::FromRandom().Binary()); - node->set_node_manager_port(port); - node->set_node_manager_address(address); - node->set_node_name(node_name); - node->set_instance_id("instance_x"); - node->set_state(rpc::GcsNodeInfo::ALIVE); - return node; - } - - static std::shared_ptr GenJobTableData(JobID job_id) { - auto job_table_data = std::make_shared(); - job_table_data->set_job_id(job_id.Binary()); - job_table_data->set_is_dead(false); - job_table_data->set_timestamp(current_sys_time_ms()); - job_table_data->set_driver_ip_address("127.0.0.1"); - rpc::Address address; - address.set_ip_address("127.0.0.1"); - address.set_port(1234); - address.set_raylet_id(UniqueID::FromRandom().Binary()); - address.set_worker_id(UniqueID::FromRandom().Binary()); - job_table_data->mutable_driver_address()->CopyFrom(address); - job_table_data->set_driver_pid(5667L); - return job_table_data; - } - - static std::shared_ptr GenActorTableData(const JobID &job_id) { - auto actor_table_data = std::make_shared(); - ActorID actor_id = ActorID::Of(job_id, RandomTaskId(), 0); - actor_table_data->set_actor_id(actor_id.Binary()); - actor_table_data->set_job_id(job_id.Binary()); - actor_table_data->set_state(rpc::ActorTableData::ALIVE); - actor_table_data->set_max_restarts(1); - actor_table_data->set_num_restarts(0); - return actor_table_data; - } - - static std::shared_ptr GenErrorTableData(const JobID 
&job_id) { - auto error_table_data = std::make_shared(); - error_table_data->set_job_id(job_id.Binary()); - return error_table_data; - } - - static std::shared_ptr GenWorkerTableData() { - auto worker_table_data = std::make_shared(); - worker_table_data->set_timestamp(std::time(nullptr)); - return worker_table_data; - } - - static std::shared_ptr GenAddJobRequest( - const JobID &job_id, - const std::string &ray_namespace, - const std::optional &submission_id = std::nullopt, - const std::optional &address = std::nullopt) { - auto job_config_data = std::make_shared(); - job_config_data->set_ray_namespace(ray_namespace); - - auto job_table_data = std::make_shared(); - job_table_data->set_job_id(job_id.Binary()); - job_table_data->mutable_config()->CopyFrom(*job_config_data); - if (address.has_value()) { - job_table_data->mutable_driver_address()->CopyFrom(address.value()); - } else { - rpc::Address dummy_address; - dummy_address.set_port(1234); - dummy_address.set_raylet_id(NodeID::FromRandom().Binary()); - dummy_address.set_ip_address("123.456.7.8"); - dummy_address.set_worker_id(WorkerID::FromRandom().Binary()); - job_table_data->mutable_driver_address()->CopyFrom(dummy_address); - } - if (submission_id.has_value()) { - job_table_data->mutable_config()->mutable_metadata()->insert( - {"job_submission_id", submission_id.value()}); - } - - auto add_job_request = std::make_shared(); - add_job_request->mutable_data()->CopyFrom(*job_table_data); - return add_job_request; - } - - static rpc::TaskEventData GenTaskEventsData( - const std::vector &task_events, - int32_t num_profile_task_events_dropped = 0, - int32_t num_status_task_events_dropped = 0) { - rpc::TaskEventData data; - for (auto &events : task_events) { - auto new_events = data.add_events_by_task(); - new_events->CopyFrom(events); - } - - for (int i = 0; i < num_status_task_events_dropped; ++i) { - rpc::TaskAttempt rpc_task_attempt; - rpc_task_attempt.set_task_id(RandomTaskId().Binary()); - 
rpc_task_attempt.set_attempt_number(0); - *(data.add_dropped_task_attempts()) = rpc_task_attempt; - } - - data.set_num_profile_events_dropped(num_profile_task_events_dropped); - data.set_job_id(JobID::FromInt(0).Binary()); - - return data; - } - - static rpc::TaskEventData GenTaskEventsDataLoss( - const std::vector &drop_tasks, int job_id = 0) { - rpc::TaskEventData data; - for (const auto &task_attempt : drop_tasks) { - rpc::TaskAttempt rpc_task_attempt; - rpc_task_attempt.set_task_id(task_attempt.first.Binary()); - rpc_task_attempt.set_attempt_number(task_attempt.second); - *(data.add_dropped_task_attempts()) = rpc_task_attempt; - } - data.set_job_id(JobID::FromInt(job_id).Binary()); - - return data; - } - - static rpc::ResourceDemand GenResourceDemand( - const absl::flat_hash_map &resource_demands, - int64_t num_ready_queued, - int64_t num_infeasible, - int64_t num_backlog, - const std::vector &label_selectors = {}) { - rpc::ResourceDemand resource_demand; - for (const auto &resource : resource_demands) { - (*resource_demand.mutable_shape())[resource.first] = resource.second; - } - resource_demand.set_num_ready_requests_queued(num_ready_queued); - resource_demand.set_num_infeasible_requests_queued(num_infeasible); - resource_demand.set_backlog_size(num_backlog); - for (const auto &selector : label_selectors) { - *resource_demand.add_label_selectors() = selector; - } - return resource_demand; - } - - static void FillResourcesData( - rpc::ResourcesData &resources_data, - const NodeID &node_id, - const absl::flat_hash_map &available_resources, - const absl::flat_hash_map &total_resources, - int64_t idle_ms = 0, - bool is_draining = false, - int64_t draining_deadline_timestamp_ms = -1) { - resources_data.set_node_id(node_id.Binary()); - for (const auto &resource : available_resources) { - (*resources_data.mutable_resources_available())[resource.first] = resource.second; - } - for (const auto &resource : total_resources) { - 
(*resources_data.mutable_resources_total())[resource.first] = resource.second; - } - resources_data.set_idle_duration_ms(idle_ms); - resources_data.set_is_draining(is_draining); - resources_data.set_draining_deadline_timestamp_ms(draining_deadline_timestamp_ms); - } - - static void FillResourcesData(rpc::ResourcesData &data, - const std::string &node_id, - std::vector demands) { - auto load_by_shape = data.mutable_resource_load_by_shape(); - auto agg_load = data.mutable_resource_load(); - for (const auto &demand : demands) { - load_by_shape->add_resource_demands()->CopyFrom(demand); - for (const auto &resource : demand.shape()) { - (*agg_load)[resource.first] += - (resource.second * (demand.num_ready_requests_queued() + - demand.num_infeasible_requests_queued())); - } - } - data.set_node_id(node_id); - } - - static std::shared_ptr GenPlacementGroupLoad( - std::vector placement_group_table_data_vec) { - auto placement_group_load = std::make_shared(); - for (auto &placement_group_table_data : placement_group_table_data_vec) { - placement_group_load->add_placement_group_data()->CopyFrom( - placement_group_table_data); - } - return placement_group_load; - } - - static rpc::PlacementGroupTableData GenPlacementGroupTableData( - const PlacementGroupID &placement_group_id, - const JobID &job_id, - const std::vector> &bundles, - const std::vector &nodes, - rpc::PlacementStrategy strategy, - const rpc::PlacementGroupTableData::PlacementGroupState state, - const std::string &name = "", - const ActorID &actor_id = ActorID::Nil()) { - rpc::PlacementGroupTableData placement_group_table_data; - placement_group_table_data.set_placement_group_id(placement_group_id.Binary()); - placement_group_table_data.set_state(state); - placement_group_table_data.set_name(name); - placement_group_table_data.set_strategy(strategy); - RAY_CHECK(bundles.size() == nodes.size()); - size_t i = 0; - for (auto &bundle : bundles) { - // Add unit resources - auto bundle_spec = 
placement_group_table_data.add_bundles(); - for (auto &resource : bundle) { - (*bundle_spec->mutable_unit_resources())[resource.first] = resource.second; - } - - // Add node id - const auto &node = nodes[i]; - if (!node.empty()) { - bundle_spec->set_node_id(node); - } - - i++; - } - return placement_group_table_data; - } - static rpc::autoscaler::ClusterResourceConstraint GenClusterResourcesConstraint( - const std::vector> &request_resources, - const std::vector &count_array) { - rpc::autoscaler::ClusterResourceConstraint constraint; - RAY_CHECK(request_resources.size() == count_array.size()); - for (size_t i = 0; i < request_resources.size(); i++) { - auto &resource = request_resources[i]; - auto count = count_array[i]; - auto bundle = constraint.add_resource_requests(); - bundle->set_count(count); - bundle->mutable_request()->mutable_resources_bundle()->insert(resource.begin(), - resource.end()); - } - return constraint; - } - // Read all lines of a file into vector vc - static void ReadContentFromFile(std::vector &vc, std::string log_file) { - std::string line; - std::ifstream read_file; - read_file.open(log_file, std::ios::binary); - while (std::getline(read_file, line)) { - vc.push_back(line); - } - read_file.close(); - } -}; - -} // namespace ray diff --git a/src/ray/gcs/gcs_server/test/BUILD.bazel b/src/ray/gcs/tests/BUILD.bazel similarity index 51% rename from src/ray/gcs/gcs_server/test/BUILD.bazel rename to src/ray/gcs/tests/BUILD.bazel index aa110ed7ae24..012661e15b01 100644 --- a/src/ray/gcs/gcs_server/test/BUILD.bazel +++ b/src/ray/gcs/tests/BUILD.bazel @@ -6,22 +6,23 @@ ray_cc_test( tags = ["team:core"], deps = [ "//:ray_mock", - "//src/ray/gcs/gcs_server:gcs_server_lib", + "//src/ray/gcs:gcs_function_manager", "@com_google_googletest//:gtest_main", ], ) ray_cc_test( - name = "gcs_placement_group_mgr_mock_test", + name = "gcs_placement_group_manager_mock_test", size = "small", srcs = [ - "gcs_placement_group_mgr_mock_test.cc", + 
"gcs_placement_group_manager_mock_test.cc", ], tags = ["team:core"], deps = [ "//:ray_mock", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_placement_group_manager", + "//src/ray/util:counter_map", "@com_google_googletest//:gtest_main", ], ) @@ -46,8 +47,8 @@ ray_cc_test( "team:core", ], deps = [ - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_server_lib", "@com_google_googletest//:gtest", ], ) @@ -68,23 +69,15 @@ ray_cc_test( ], tags = ["team:core"], deps = [ - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_kv_manager", + "//src/ray/gcs:gcs_store_client_kv", + "//src/ray/gcs/store_client:in_memory_store_client", + "//src/ray/gcs/store_client:redis_store_client", "@com_google_googletest//:gtest", ], ) -ray_cc_library( - name = "gcs_server_test_util", - hdrs = [ - "gcs_server_test_util.h", - ], - deps = [ - "//:ray_fakes", - "//src/ray/gcs/gcs_client:gcs_client_lib", - ], -) - ray_cc_test( name = "gcs_health_check_manager_test", size = "medium", @@ -96,7 +89,8 @@ ray_cc_test( "team:core", ], deps = [ - "//src/ray/gcs/gcs_server:gcs_server_lib", + "//src/ray/gcs:gcs_health_check_manager", + "//src/ray/rpc:grpc_server", "//src/ray/util:network_util", "@boost//:thread", "@com_google_googletest//:gtest_main", @@ -111,10 +105,11 @@ ray_cc_test( ], tags = ["team:core"], deps = [ - ":gcs_server_test_util", - "//:ray_mock", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//:ray_fakes", + "//src/fakes/ray/rpc/raylet:fake_raylet_client", + "//src/mock/ray/pubsub:mock_publisher", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_node_manager", "@com_google_googletest//:gtest_main", ], ) @@ -127,10 +122,13 @@ ray_cc_test( ], tags = ["team:core"], deps = [ - 
":gcs_server_test_util", "//:ray_mock", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/mock/ray/pubsub:mock_publisher", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_job_manager", + "//src/ray/gcs:gcs_kv_manager", + "//src/ray/gcs/store_client:in_memory_store_client", + "//src/ray/observability:fake_ray_event_recorder", "@com_google_googletest//:gtest_main", ], ) @@ -143,29 +141,31 @@ ray_cc_test( ], tags = ["team:core"], deps = [ - ":gcs_server_test_util", "//:ray_mock", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/ray/common:protobuf_utils", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_task_manager", "@com_google_googletest//:gtest_main", ], ) ray_cc_test( - name = "gcs_placement_group_mgr_test", + name = "gcs_placement_group_manager_test", size = "small", srcs = [ - "gcs_placement_group_mgr_test.cc", + "gcs_placement_group_manager_test.cc", ], tags = [ "no_tsan", "team:core", ], deps = [ - ":gcs_server_test_util", "//:ray_mock", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/mock/ray/pubsub:mock_publisher", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_placement_group_manager", + "//src/ray/gcs/store_client:in_memory_store_client", + "//src/ray/util:counter_map", "@com_google_googletest//:gtest_main", ], ) @@ -181,10 +181,17 @@ ray_cc_test( "team:core", ], deps = [ - ":gcs_server_test_util", - "//:ray_mock", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/fakes/ray/rpc/raylet:fake_raylet_client", + "//src/fakes/ray/rpc/worker:fake_core_worker_client", + "//src/mock/ray/pubsub:mock_publisher", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_node_manager", + "//src/ray/gcs:gcs_placement_group", + "//src/ray/gcs:gcs_placement_group_scheduler", + "//src/ray/gcs:gcs_resource_manager", + "//src/ray/gcs:gcs_table_storage", + 
"//src/ray/gcs/store_client:in_memory_store_client", + "//src/ray/util:counter_map", "@com_google_googletest//:gtest_main", ], ) @@ -197,11 +204,15 @@ ray_cc_test( ], tags = ["team:core"], deps = [ - ":gcs_server_test_util", - "//:ray_mock", - "//src/ray/gcs/gcs_client:gcs_client_lib", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/fakes/ray/rpc/raylet:fake_raylet_client", + "//src/fakes/ray/rpc/worker:fake_core_worker_client", + "//src/mock/ray/pubsub:mock_publisher", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_actor", + "//src/ray/gcs:gcs_actor_scheduler", + "//src/ray/gcs:gcs_resource_manager", + "//src/ray/gcs/store_client:in_memory_store_client", + "//src/ray/util:counter_map", "@com_google_googletest//:gtest_main", ], ) @@ -215,8 +226,10 @@ ray_cc_test( tags = ["team:core"], deps = [ "//:ray_mock", - "//src/ray/common:test_util", - "//src/ray/gcs/gcs_server:gcs_server_lib", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_actor", + "//src/ray/gcs:gcs_actor_scheduler", + "//src/ray/util:counter_map", "@com_google_googletest//:gtest_main", ], ) @@ -231,10 +244,16 @@ ray_cc_test( "team:core", ], deps = [ - ":gcs_server_test_util", "//:ray_mock", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/ray/common:asio", + "//src/ray/common:runtime_env", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_actor", + "//src/ray/gcs:gcs_actor_manager", + "//src/ray/gcs:gcs_actor_scheduler", + "//src/ray/gcs:gcs_function_manager", + "//src/ray/gcs/store_client:in_memory_store_client", + "//src/ray/pubsub:publisher", "@com_google_googletest//:gtest_main", ], ) @@ -247,10 +266,11 @@ ray_cc_test( ], tags = ["team:core"], deps = [ - ":gcs_server_test_util", - "//:ray_mock", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/mock/ray/pubsub:mock_publisher", + "//src/ray/common:test_utils", + 
"//src/ray/gcs:gcs_store_client_kv", + "//src/ray/gcs:gcs_worker_manager", + "//src/ray/gcs/store_client:in_memory_store_client", "//src/ray/util:process", "@com_google_googletest//:gtest_main", ], @@ -262,7 +282,7 @@ ray_cc_library( "gcs_table_storage_test_base.h", ], deps = [ - "//src/ray/gcs/store_client:gcs_redis_store_client", + "//src/ray/gcs/store_client:redis_store_client", ], ) @@ -283,9 +303,9 @@ ray_cc_test( tags = ["team:core"], deps = [ ":gcs_table_storage_test_lib", - "//src/ray/gcs/gcs_server:gcs_table_storage", - "//src/ray/gcs/store_client/test:store_client_test_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_table_storage", + "//src/ray/gcs/store_client/tests:store_client_test_lib", "@com_google_googletest//:gtest", ], ) @@ -297,10 +317,10 @@ ray_cc_test( tags = ["team:core"], deps = [ ":gcs_table_storage_test_lib", - "//src/ray/common:test_util", - "//src/ray/gcs/gcs_server:gcs_table_storage", - "//src/ray/gcs/store_client/test:store_client_test_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_table_storage", + "//src/ray/gcs/store_client:in_memory_store_client", + "//src/ray/gcs/store_client/tests:store_client_test_lib", "@com_google_googletest//:gtest_main", ], ) @@ -313,10 +333,17 @@ ray_cc_test( ], tags = ["team:core"], deps = [ - ":gcs_server_test_util", + "//:ray_fakes", "//:ray_mock", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/fakes/ray/rpc/raylet:fake_raylet_client", + "//src/ray/common:asio", + "//src/ray/common:protobuf_utils", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_autoscaler_state_manager", + "//src/ray/gcs:gcs_init_data", + "//src/ray/gcs:gcs_resource_manager", + "//src/ray/gcs:gcs_store_client_kv", + "//src/ray/raylet/scheduling:cluster_resource_manager", "@com_google_googletest//:gtest_main", ], ) @@ -330,8 +357,10 @@ ray_cc_test( tags = ["team:core"], deps = 
[ "//:ray_mock", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_node_manager", + "//src/ray/gcs:gcs_resource_manager", + "//src/ray/raylet/scheduling:cluster_resource_manager", "@com_google_googletest//:gtest_main", ], ) @@ -344,10 +373,10 @@ ray_cc_test( ], tags = ["team:core"], deps = [ - ":gcs_server_test_util", "//:ray_mock", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/ray/common:asio", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_usage_stats_client", "@com_google_googletest//:gtest_main", ], ) @@ -361,10 +390,13 @@ ray_cc_test( "team:core", ], deps = [ - ":gcs_server_test_util", "//:ray_mock", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/mock/ray/pubsub:mock_publisher", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_job_manager", + "//src/ray/gcs:gcs_kv_manager", + "//src/ray/gcs/store_client:in_memory_store_client", + "//src/ray/observability:fake_ray_event_recorder", "@com_google_googletest//:gtest_main", ], ) @@ -378,16 +410,24 @@ ray_cc_test( "team:core", ], deps = [ - ":gcs_server_test_util", "//:ray_mock", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/ray/common:asio", + "//src/ray/common:runtime_env", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_actor", + "//src/ray/gcs:gcs_actor_manager", + "//src/ray/gcs:gcs_actor_scheduler", + "//src/ray/gcs:gcs_function_manager", + "//src/ray/gcs/store_client:in_memory_store_client", + "//src/ray/pubsub:publisher", + "//src/ray/rpc:core_worker_client", + "//src/ray/util:event", "@com_google_googletest//:gtest_main", ], ) ray_cc_test( - name = "gcs_node_manager_export_event_test", + name = "node_manager_export_event_test", size = "small", srcs = ["export_api/gcs_node_manager_export_event_test.cc"], tags = [ @@ -395,10 +435,23 @@ ray_cc_test( 
"team:core", ], deps = [ - ":gcs_server_test_util", - "//:ray_mock", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/fakes/ray/rpc/raylet:fake_raylet_client", + "//src/mock/ray/pubsub:mock_publisher", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_node_manager", + "//src/ray/gcs/store_client:in_memory_store_client", + "//src/ray/util:string_utils", + "@com_google_googletest//:gtest", + ], +) + +ray_cc_test( + name = "gcs_ray_event_converter_test", + size = "small", + srcs = ["gcs_ray_event_converter_test.cc"], + tags = ["team:core"], + deps = [ + "//src/ray/gcs:gcs_ray_event_converter", "@com_google_googletest//:gtest_main", ], ) diff --git a/src/ray/gcs/gcs_server/test/export_api/gcs_actor_manager_export_event_test.cc b/src/ray/gcs/tests/export_api/gcs_actor_manager_export_event_test.cc similarity index 89% rename from src/ray/gcs/gcs_server/test/export_api/gcs_actor_manager_export_event_test.cc rename to src/ray/gcs/tests/export_api/gcs_actor_manager_export_event_test.cc index 7f552fc41571..0a154594859c 100644 --- a/src/ray/gcs/gcs_server/test/export_api/gcs_actor_manager_export_event_test.cc +++ b/src/ray/gcs/tests/export_api/gcs_actor_manager_export_event_test.cc @@ -11,8 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include +#include #include #include #include @@ -20,20 +22,21 @@ #include #include -// clang-format off -#include "gtest/gtest.h" +#include "mock/ray/gcs/gcs_kv_manager.h" +#include "mock/ray/gcs/gcs_node_manager.h" #include "ray/common/asio/instrumented_io_context.h" -#include "ray/gcs/gcs_server/test/gcs_server_test_util.h" -#include "ray/gcs/test/gcs_test_util.h" -#include "ray/gcs/gcs_server/gcs_kv_manager.h" -#include "mock/ray/gcs/gcs_server/gcs_kv_manager.h" -#include "mock/ray/gcs/gcs_server/gcs_node_manager.h" -#include "mock/ray/pubsub/publisher.h" +#include "ray/common/runtime_env_manager.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_actor.h" +#include "ray/gcs/gcs_actor_manager.h" +#include "ray/gcs/gcs_function_manager.h" +#include "ray/gcs/store_client/in_memory_store_client.h" +#include "ray/pubsub/publisher.h" +#include "ray/rpc/worker/core_worker_client.h" +#include "ray/rpc/worker/core_worker_client_pool.h" #include "ray/util/event.h" -// clang-format on namespace ray { - namespace gcs { using ::testing::_; @@ -53,8 +56,8 @@ class MockActorScheduler : public gcs::GcsActorSchedulerInterface { auto pending_it = std::find_if(actors.begin(), actors.end(), - [actor_id](const std::shared_ptr &actor) { - return actor->GetActorID() == actor_id; + [actor_id](const std::shared_ptr ¤t_actor) { + return current_actor->GetActorID() == actor_id; }); if (pending_it != actors.end()) { actors.erase(pending_it); @@ -72,7 +75,7 @@ class MockActorScheduler : public gcs::GcsActorSchedulerInterface { MOCK_METHOD3(CancelOnLeasing, void(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id)); + const LeaseID &lease_id)); std::vector> actors; }; @@ -152,8 +155,9 @@ class GcsActorManagerTest : public ::testing::Test { /*subscriber_timeout_ms=*/absl::ToInt64Microseconds(absl::Seconds(30)), /*batch_size=*/100); - gcs_publisher_ = std::make_unique(std::move(publisher)); - gcs_table_storage_ = std::make_unique(); + gcs_publisher_ = 
std::make_unique(std::move(publisher)); + gcs_table_storage_ = + std::make_unique(std::make_unique()); kv_ = std::make_unique(); function_manager_ = std::make_unique(*kv_, io_service_); auto actor_scheduler = std::make_unique(); @@ -212,7 +216,7 @@ class GcsActorManagerTest : public ::testing::Test { rpc::Address address; auto node_id = NodeID::FromRandom(); auto worker_id = WorkerID::FromRandom(); - address.set_raylet_id(node_id.Binary()); + address.set_node_id(node_id.Binary()); address.set_worker_id(worker_id.Binary()); return address; } @@ -224,8 +228,8 @@ class GcsActorManagerTest : public ::testing::Test { const std::string &name = "", const std::string &ray_namespace = "test") { std::promise> promise; - auto request = Mocker::GenRegisterActorRequest( - job_id, max_restarts, detached, name, ray_namespace); + auto request = + GenRegisterActorRequest(job_id, max_restarts, detached, name, ray_namespace); // `DestroyActor` triggers some asynchronous operations. // If we register an actor after destroying an actor, it may result in multithreading // reading and writing the same variable. 
In order to avoid the problem of @@ -233,7 +237,7 @@ class GcsActorManagerTest : public ::testing::Test { io_service_.post( [this, request, &promise]() { auto status = gcs_actor_manager_->RegisterActor( - request, [this, request, &promise](const Status &status) { + request, [this, request, &promise](const Status &) { auto actor_id = ActorID::FromBinary( request.task_spec().actor_creation_task_spec().actor_id()); promise.set_value( @@ -258,7 +262,7 @@ class GcsActorManagerTest : public ::testing::Test { std::unique_ptr worker_client_pool_; absl::flat_hash_map job_namespace_table_; std::unique_ptr gcs_actor_manager_; - std::shared_ptr gcs_publisher_; + std::shared_ptr gcs_publisher_; std::unique_ptr runtime_env_mgr_; const std::chrono::milliseconds timeout_ms_{2000}; absl::Mutex mutex_; @@ -284,9 +288,9 @@ TEST_F(GcsActorManagerTest, TestBasic) { std::vector> finished_actors; Status status = gcs_actor_manager_->CreateActor( create_actor_request, - [&finished_actors](const std::shared_ptr &actor, - const rpc::PushTaskReply &reply, - const Status &status) { finished_actors.emplace_back(actor); }); + [&finished_actors](const std::shared_ptr &result_actor, + const rpc::PushTaskReply &, + const Status &) { finished_actors.emplace_back(result_actor); }); RAY_CHECK_OK(status); RAY_CHECK_EQ(gcs_actor_manager_->CountFor(rpc::ActorTableData::PENDING_CREATION, ""), 1); @@ -315,7 +319,7 @@ TEST_F(GcsActorManagerTest, TestBasic) { "DEPENDENCIES_UNREADY", "PENDING_CREATION", "ALIVE", "DEAD"}; std::vector vc; for (int i = 0; i < num_retry; i++) { - Mocker::ReadContentFromFile(vc, log_dir_ + "/export_events/event_EXPORT_ACTOR.log"); + ReadContentFromFile(vc, log_dir_ + "/export_events/event_EXPORT_ACTOR.log"); if (static_cast(vc.size()) == num_export_events) { for (int event_idx = 0; event_idx < num_export_events; event_idx++) { json export_event_as_json = json::parse(vc[event_idx]); @@ -339,7 +343,7 @@ TEST_F(GcsActorManagerTest, TestBasic) { vc.clear(); } } - 
Mocker::ReadContentFromFile(vc, log_dir_ + "/export_events/event_EXPORT_ACTOR.log"); + ReadContentFromFile(vc, log_dir_ + "/export_events/event_EXPORT_ACTOR.log"); std::ostringstream lines; for (auto line : vc) { lines << line << "\n"; @@ -350,5 +354,4 @@ TEST_F(GcsActorManagerTest, TestBasic) { } } // namespace gcs - } // namespace ray diff --git a/src/ray/gcs/gcs_server/test/export_api/gcs_job_manager_export_event_test.cc b/src/ray/gcs/tests/export_api/gcs_job_manager_export_event_test.cc similarity index 67% rename from src/ray/gcs/gcs_server/test/export_api/gcs_job_manager_export_event_test.cc rename to src/ray/gcs/tests/export_api/gcs_job_manager_export_event_test.cc index ffcbeb7d3676..afaf73eeac39 100644 --- a/src/ray/gcs/gcs_server/test/export_api/gcs_job_manager_export_event_test.cc +++ b/src/ray/gcs/tests/export_api/gcs_job_manager_export_event_test.cc @@ -12,24 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + +#include #include #include #include -#include "ray/gcs/gcs_server/gcs_job_manager.h" - -// clang-format off -#include "gtest/gtest.h" -#include "ray/gcs/gcs_server/test/gcs_server_test_util.h" -#include "ray/gcs/store_client/in_memory_store_client.h" -#include "ray/gcs/test/gcs_test_util.h" -#include "ray/gcs/gcs_server/gcs_kv_manager.h" -#include "mock/ray/gcs/gcs_server/gcs_kv_manager.h" +#include "mock/ray/gcs/gcs_kv_manager.h" #include "mock/ray/pubsub/publisher.h" -#include "mock/ray/pubsub/subscriber.h" #include "mock/ray/rpc/worker/core_worker_client.h" - -// clang-format on +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_job_manager.h" +#include "ray/gcs/gcs_kv_manager.h" +#include "ray/gcs/store_client/in_memory_store_client.h" +#include "ray/observability/fake_ray_event_recorder.h" using json = nlohmann::json; @@ -47,7 +44,7 @@ class GcsJobManagerTest : public ::testing::Test { }); promise.get_future().get(); - gcs_publisher_ = std::make_shared( + 
gcs_publisher_ = std::make_shared( std::make_unique()); store_client_ = std::make_shared(); gcs_table_storage_ = std::make_shared(store_client_); @@ -63,6 +60,7 @@ class GcsJobManagerTest : public ::testing::Test { return std::make_shared( address.port()); }); + fake_ray_event_recorder_ = std::make_unique(); log_dir_ = "event_12345"; } @@ -77,16 +75,55 @@ class GcsJobManagerTest : public ::testing::Test { std::unique_ptr thread_io_service_; std::shared_ptr store_client_; std::shared_ptr gcs_table_storage_; - std::shared_ptr gcs_publisher_; + std::shared_ptr gcs_publisher_; std::unique_ptr function_manager_; std::unique_ptr kv_; std::unique_ptr fake_kv_; std::unique_ptr worker_client_pool_; + std::unique_ptr fake_ray_event_recorder_; RuntimeEnvManager runtime_env_manager_; const std::chrono::milliseconds timeout_ms_{5000}; std::string log_dir_; }; +TEST_F(GcsJobManagerTest, TestRayEventDriverJobEvents) { + RayConfig::instance().initialize( + R"( +{ + "enable_ray_event": true +} + )"); + gcs::GcsJobManager gcs_job_manager(*gcs_table_storage_, + *gcs_publisher_, + runtime_env_manager_, + *function_manager_, + *fake_kv_, + io_service_, + *worker_client_pool_, + *fake_ray_event_recorder_, + "test_session_name"); + gcs::GcsInitData gcs_init_data(*gcs_table_storage_); + gcs_job_manager.Initialize(gcs_init_data); + auto job_api_job_id = JobID::FromInt(100); + std::string submission_id = "submission_id_100"; + auto add_job_request = GenAddJobRequest(job_api_job_id, "namespace_100", submission_id); + rpc::AddJobReply empty_reply; + std::promise promise; + gcs_job_manager.HandleAddJob( + *add_job_request, + &empty_reply, + [&promise](Status, std::function, std::function) { + promise.set_value(true); + }); + promise.get_future().get(); + auto buffer = fake_ray_event_recorder_->FlushBuffer(); + + ASSERT_EQ(buffer.size(), 2); + ASSERT_EQ(buffer[0]->GetEventType(), + rpc::events::RayEvent::DRIVER_JOB_DEFINITION_EVENT); + ASSERT_EQ(buffer[1]->GetEventType(), 
rpc::events::RayEvent::DRIVER_JOB_EXECUTION_EVENT); +} + TEST_F(GcsJobManagerTest, TestExportDriverJobEvents) { // Test adding and marking a driver job as finished, and that corresponding // export events are written. @@ -109,15 +146,16 @@ TEST_F(GcsJobManagerTest, TestExportDriverJobEvents) { *function_manager_, *fake_kv_, io_service_, - *worker_client_pool_); + *worker_client_pool_, + *fake_ray_event_recorder_, + "test_session_name"); gcs::GcsInitData gcs_init_data(*gcs_table_storage_); gcs_job_manager.Initialize(gcs_init_data); auto job_api_job_id = JobID::FromInt(100); std::string submission_id = "submission_id_100"; - auto add_job_request = - Mocker::GenAddJobRequest(job_api_job_id, "namespace_100", submission_id); + auto add_job_request = GenAddJobRequest(job_api_job_id, "namespace_100", submission_id); rpc::AddJobReply empty_reply; std::promise promise; gcs_job_manager.HandleAddJob( @@ -129,8 +167,7 @@ TEST_F(GcsJobManagerTest, TestExportDriverJobEvents) { promise.get_future().get(); std::vector vc; - Mocker::ReadContentFromFile(vc, - log_dir_ + "/export_events/event_EXPORT_DRIVER_JOB.log"); + ReadContentFromFile(vc, log_dir_ + "/export_events/event_EXPORT_DRIVER_JOB.log"); ASSERT_EQ((int)vc.size(), 1); json event_data = json::parse(vc[0])["event_data"].get(); ASSERT_EQ(event_data["is_dead"], false); @@ -149,8 +186,7 @@ TEST_F(GcsJobManagerTest, TestExportDriverJobEvents) { job_finished_promise.get_future().get(); vc.clear(); - Mocker::ReadContentFromFile(vc, - log_dir_ + "/export_events/event_EXPORT_DRIVER_JOB.log"); + ReadContentFromFile(vc, log_dir_ + "/export_events/event_EXPORT_DRIVER_JOB.log"); ASSERT_EQ((int)vc.size(), 2); event_data = json::parse(vc[1])["event_data"].get(); ASSERT_EQ(event_data["is_dead"], true); diff --git a/src/ray/gcs/gcs_server/test/export_api/gcs_node_manager_export_event_test.cc b/src/ray/gcs/tests/export_api/gcs_node_manager_export_event_test.cc similarity index 84% rename from 
src/ray/gcs/gcs_server/test/export_api/gcs_node_manager_export_event_test.cc rename to src/ray/gcs/tests/export_api/gcs_node_manager_export_event_test.cc index 137a1e271c62..5c2ceb17f61b 100644 --- a/src/ray/gcs/gcs_server/test/export_api/gcs_node_manager_export_event_test.cc +++ b/src/ray/gcs/tests/export_api/gcs_node_manager_export_event_test.cc @@ -15,22 +15,20 @@ #include #include +#include #include #include #include #include -#include "ray/gcs/gcs_server/test/gcs_server_test_util.h" -#include "ray/gcs/test/gcs_test_util.h" +#include "fakes/ray/rpc/raylet/raylet_client.h" +#include "mock/ray/pubsub/publisher.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_node_manager.h" +#include "ray/gcs/store_client/in_memory_store_client.h" #include "ray/util/event.h" #include "ray/util/string_utils.h" -// clang-format off -#include "ray/rpc/node_manager/node_manager_client.h" -#include "ray/rpc/node_manager/raylet_client_pool.h" -#include "mock/ray/pubsub/publisher.h" -// clang-format on - using json = nlohmann::json; namespace ray { @@ -45,12 +43,15 @@ std::string GenerateLogDir() { class GcsNodeManagerExportAPITest : public ::testing::Test { public: GcsNodeManagerExportAPITest() { - raylet_client_ = std::make_shared(); + auto raylet_client = std::make_shared(); client_pool_ = std::make_unique( - [this](const rpc::Address &) { return raylet_client_; }); - gcs_publisher_ = std::make_unique( + [raylet_client = std::move(raylet_client)](const rpc::Address &) { + return raylet_client; + }); + gcs_publisher_ = std::make_unique( std::make_unique()); - gcs_table_storage_ = std::make_unique(); + gcs_table_storage_ = std::make_unique( + std::make_unique()); RayConfig::instance().initialize( R"( @@ -76,9 +77,8 @@ class GcsNodeManagerExportAPITest : public ::testing::Test { protected: std::unique_ptr gcs_table_storage_; - std::shared_ptr raylet_client_; std::unique_ptr client_pool_; - std::shared_ptr gcs_publisher_; + std::shared_ptr gcs_publisher_; 
instrumented_io_context io_service_; std::string log_dir_; }; @@ -90,7 +90,7 @@ TEST_F(GcsNodeManagerExportAPITest, TestExportEventRegisterNode) { io_service_, client_pool_.get(), ClusterID::Nil()); - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); rpc::RegisterNodeRequest register_request; register_request.mutable_node_info()->CopyFrom(*node); @@ -102,7 +102,7 @@ TEST_F(GcsNodeManagerExportAPITest, TestExportEventRegisterNode) { io_service_.poll(); std::vector vc; - Mocker::ReadContentFromFile(vc, log_dir_ + "/export_events/event_EXPORT_NODE.log"); + ReadContentFromFile(vc, log_dir_ + "/export_events/event_EXPORT_NODE.log"); ASSERT_EQ((int)vc.size(), 1); json event_data = json::parse(vc[0])["event_data"].get(); ASSERT_EQ(event_data["state"], "ALIVE"); @@ -115,7 +115,7 @@ TEST_F(GcsNodeManagerExportAPITest, TestExportEventUnregisterNode) { io_service_, client_pool_.get(), ClusterID::Nil()); - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); auto node_id = NodeID::FromBinary(node->node_id()); node_manager.AddNode(node); @@ -133,7 +133,7 @@ TEST_F(GcsNodeManagerExportAPITest, TestExportEventUnregisterNode) { io_service_.poll(); std::vector vc; - Mocker::ReadContentFromFile(vc, log_dir_ + "/export_events/event_EXPORT_NODE.log"); + ReadContentFromFile(vc, log_dir_ + "/export_events/event_EXPORT_NODE.log"); ASSERT_EQ((int)vc.size(), 1); json event_data = json::parse(vc[0])["event_data"].get(); ASSERT_EQ(event_data["state"], "DEAD"); diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc b/src/ray/gcs/tests/gcs_actor_manager_test.cc similarity index 84% rename from src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc rename to src/ray/gcs/tests/gcs_actor_manager_test.cc index 5b41f25778ab..bcc7b74b5730 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc +++ b/src/ray/gcs/tests/gcs_actor_manager_test.cc @@ -12,22 +12,26 @@ // See the License for the specific language governing permissions and // limitations under 
the License. +#include "ray/gcs/gcs_actor_manager.h" + +#include + #include #include #include #include #include -// clang-format off -#include "gtest/gtest.h" +#include "mock/ray/gcs/gcs_kv_manager.h" +#include "mock/ray/gcs/gcs_node_manager.h" #include "ray/common/asio/instrumented_io_context.h" -#include "ray/gcs/gcs_server/test/gcs_server_test_util.h" -#include "ray/gcs/test/gcs_test_util.h" -#include "ray/gcs/gcs_server/gcs_kv_manager.h" -#include "mock/ray/gcs/gcs_server/gcs_kv_manager.h" -#include "mock/ray/gcs/gcs_server/gcs_node_manager.h" -#include "mock/ray/pubsub/publisher.h" -// clang-format on +#include "ray/common/runtime_env_manager.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_actor.h" +#include "ray/gcs/gcs_actor_scheduler.h" +#include "ray/gcs/gcs_function_manager.h" +#include "ray/gcs/store_client/in_memory_store_client.h" +#include "ray/pubsub/publisher.h" namespace ray { namespace gcs { @@ -48,8 +52,8 @@ class MockActorScheduler : public gcs::GcsActorSchedulerInterface { auto pending_it = std::find_if(actors.begin(), actors.end(), - [actor_id](const std::shared_ptr &actor) { - return actor->GetActorID() == actor_id; + [actor_id](const std::shared_ptr ¤t_actor) { + return current_actor->GetActorID() == actor_id; }); if (pending_it != actors.end()) { actors.erase(pending_it); @@ -67,7 +71,7 @@ class MockActorScheduler : public gcs::GcsActorSchedulerInterface { MOCK_METHOD3(CancelOnLeasing, void(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id)); + const LeaseID &lease_id)); std::vector> actors; }; @@ -126,9 +130,10 @@ class GcsActorManagerTest : public ::testing::Test { /*subscriber_timeout_ms=*/absl::ToInt64Microseconds(absl::Seconds(30)), /*batch_size=*/100); - gcs_publisher_ = std::make_unique(std::move(publisher)); + gcs_publisher_ = std::make_unique(std::move(publisher)); store_client_ = std::make_shared(); - gcs_table_storage_ = std::make_unique(); + gcs_table_storage_ = + 
std::make_unique(std::make_unique()); kv_ = std::make_unique(); function_manager_ = std::make_unique(*kv_, io_service_); auto scheduler = std::make_unique(); @@ -157,7 +162,7 @@ class GcsActorManagerTest : public ::testing::Test { rpc::Address address; auto node_id = NodeID::FromRandom(); auto worker_id = WorkerID::FromRandom(); - address.set_raylet_id(node_id.Binary()); + address.set_node_id(node_id.Binary()); address.set_worker_id(worker_id.Binary()); return address; } @@ -174,9 +179,9 @@ class GcsActorManagerTest : public ::testing::Test { while (io_service_.poll_one()) { continue; } - auto request = Mocker::GenRegisterActorRequest( - job_id, max_restarts, detached, name, ray_namespace); - auto status = gcs_actor_manager_->RegisterActor(request, [](const Status &status) {}); + auto request = + GenRegisterActorRequest(job_id, max_restarts, detached, name, ray_namespace); + auto status = gcs_actor_manager_->RegisterActor(request, [](const Status &) {}); io_service_.run_one(); io_service_.run_one(); auto actor_id = @@ -219,7 +224,7 @@ class GcsActorManagerTest : public ::testing::Test { std::unique_ptr worker_client_pool_; absl::flat_hash_map job_namespace_table_; std::unique_ptr gcs_actor_manager_; - std::shared_ptr gcs_publisher_; + std::shared_ptr gcs_publisher_; std::unique_ptr runtime_env_mgr_; const std::chrono::milliseconds timeout_ms_{2000}; absl::Mutex mutex_; @@ -242,7 +247,7 @@ TEST_F(GcsActorManagerTest, TestBasic) { create_actor_request, [&finished_actors](const std::shared_ptr &actor, const rpc::PushTaskReply &reply, - const Status &status) { finished_actors.emplace_back(actor); }); + const Status &) { finished_actors.emplace_back(actor); }); RAY_CHECK_OK(status); RAY_CHECK_EQ(gcs_actor_manager_->CountFor(rpc::ActorTableData::PENDING_CREATION, ""), 1); @@ -287,7 +292,7 @@ TEST_F(GcsActorManagerTest, TestDeadCount) { gcs_actor_manager_->CreateActor(create_actor_request, [](const std::shared_ptr &actor, const rpc::PushTaskReply &reply, - const Status 
&status) {}); + const Status &) {}); RAY_CHECK_OK(status); auto actor = mock_actor_scheduler_->actors.back(); mock_actor_scheduler_->actors.pop_back(); @@ -312,9 +317,11 @@ TEST_F(GcsActorManagerTest, TestSchedulingFailed) { std::vector> finished_actors; RAY_CHECK_OK(gcs_actor_manager_->CreateActor( create_actor_request, - [&finished_actors](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) { finished_actors.emplace_back(actor); })); + [&finished_actors](std::shared_ptr result_actor, + const rpc::PushTaskReply &, + const Status &) { + finished_actors.emplace_back(result_actor); + })); ASSERT_EQ(finished_actors.size(), 0); ASSERT_EQ(mock_actor_scheduler_->actors.size(), 1); @@ -339,9 +346,11 @@ TEST_F(GcsActorManagerTest, TestWorkerFailure) { std::vector> finished_actors; RAY_CHECK_OK(gcs_actor_manager_->CreateActor( create_actor_request, - [&finished_actors](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) { finished_actors.emplace_back(actor); })); + [&finished_actors](std::shared_ptr result_actor, + const rpc::PushTaskReply &, + const Status &) { + finished_actors.emplace_back(result_actor); + })); ASSERT_EQ(finished_actors.size(), 0); ASSERT_EQ(mock_actor_scheduler_->actors.size(), 1); @@ -350,7 +359,7 @@ TEST_F(GcsActorManagerTest, TestWorkerFailure) { // Check that the actor is in state `ALIVE`. 
auto address = RandomAddress(); - auto node_id = NodeID::FromBinary(address.raylet_id()); + auto node_id = NodeID::FromBinary(address.node_id()); auto worker_id = WorkerID::FromBinary(address.worker_id()); actor->UpdateAddress(address); gcs_actor_manager_->OnActorCreationSuccess(actor, rpc::PushTaskReply()); @@ -386,8 +395,8 @@ TEST_F(GcsActorManagerTest, TestNodeFailure) { Status status = gcs_actor_manager_->CreateActor( create_actor_request, [&finished_actors](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) { finished_actors.emplace_back(actor); }); + const rpc::PushTaskReply &, + const Status &) { finished_actors.emplace_back(actor); }); RAY_CHECK_OK(status); ASSERT_EQ(finished_actors.size(), 0); @@ -408,7 +417,7 @@ TEST_F(GcsActorManagerTest, TestNodeFailure) { ASSERT_EQ(actor->GetState(), rpc::ActorTableData::ALIVE); // Remove node and then check that the actor is dead. - auto node_id = NodeID::FromBinary(address.raylet_id()); + auto node_id = NodeID::FromBinary(address.node_id()); EXPECT_CALL(*mock_actor_scheduler_, CancelOnNode(node_id)); OnNodeDead(node_id); @@ -436,8 +445,8 @@ TEST_F(GcsActorManagerTest, TestActorReconstruction) { Status status = gcs_actor_manager_->CreateActor( create_actor_request, [&finished_actors](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) { finished_actors.emplace_back(actor); }); + const rpc::PushTaskReply &, + const Status &) { finished_actors.emplace_back(actor); }); RAY_CHECK_OK(status); ASSERT_EQ(finished_actors.size(), 0); @@ -447,7 +456,7 @@ TEST_F(GcsActorManagerTest, TestActorReconstruction) { // Check that the actor is in state `ALIVE`. 
auto address = RandomAddress(); - auto node_id = NodeID::FromBinary(address.raylet_id()); + auto node_id = NodeID::FromBinary(address.node_id()); actor->UpdateAddress(address); gcs_actor_manager_->OnActorCreationSuccess(actor, rpc::PushTaskReply()); io_service_.run_one(); @@ -463,7 +472,7 @@ TEST_F(GcsActorManagerTest, TestActorReconstruction) { mock_actor_scheduler_->actors.clear(); ASSERT_EQ(finished_actors.size(), 1); auto node_id2 = NodeID::FromRandom(); - address.set_raylet_id(node_id2.Binary()); + address.set_node_id(node_id2.Binary()); actor->UpdateAddress(address); gcs_actor_manager_->OnActorCreationSuccess(actor, rpc::PushTaskReply()); io_service_.run_one(); @@ -503,9 +512,11 @@ TEST_F(GcsActorManagerTest, TestActorRestartWhenOwnerDead) { std::vector> finished_actors; RAY_CHECK_OK(gcs_actor_manager_->CreateActor( create_actor_request, - [&finished_actors](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) { finished_actors.emplace_back(actor); })); + [&finished_actors](std::shared_ptr result_actor, + const rpc::PushTaskReply &, + const Status &) { + finished_actors.emplace_back(result_actor); + })); ASSERT_EQ(finished_actors.size(), 0); ASSERT_EQ(mock_actor_scheduler_->actors.size(), 1); @@ -515,7 +526,7 @@ TEST_F(GcsActorManagerTest, TestActorRestartWhenOwnerDead) { // Check that the actor is in state `ALIVE`. 
auto address = RandomAddress(); - auto node_id = NodeID::FromBinary(address.raylet_id()); + auto node_id = NodeID::FromBinary(address.node_id()); actor->UpdateAddress(address); gcs_actor_manager_->OnActorCreationSuccess(actor, rpc::PushTaskReply()); io_service_.run_one(); @@ -553,9 +564,11 @@ TEST_F(GcsActorManagerTest, TestDetachedActorRestartWhenCreatorDead) { std::vector> finished_actors; RAY_CHECK_OK(gcs_actor_manager_->CreateActor( create_actor_request, - [&finished_actors](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) { finished_actors.emplace_back(actor); })); + [&finished_actors](std::shared_ptr result_actor, + const rpc::PushTaskReply &, + const Status &) { + finished_actors.emplace_back(result_actor); + })); ASSERT_EQ(finished_actors.size(), 0); ASSERT_EQ(mock_actor_scheduler_->actors.size(), 1); @@ -582,13 +595,12 @@ TEST_F(GcsActorManagerTest, TestActorWithEmptyName) { // Gen `CreateActorRequest` with an empty name. // (name,actor_id) => ("", actor_id_1) - auto request1 = Mocker::GenRegisterActorRequest(job_id, - /*max_restarts=*/0, - /*detached=*/true, - /*name=*/""); + auto request1 = GenRegisterActorRequest(job_id, + /*max_restarts=*/0, + /*detached=*/true, + /*name=*/""); - Status status = - gcs_actor_manager_->RegisterActor(request1, [](const Status &status) {}); + Status status = gcs_actor_manager_->RegisterActor(request1, [](const Status &) {}); io_service_.run_one(); // Ensure successful registration. @@ -598,11 +610,11 @@ TEST_F(GcsActorManagerTest, TestActorWithEmptyName) { // Gen another `CreateActorRequest` with an empty name. 
// (name,actor_id) => ("", actor_id_2) - auto request2 = Mocker::GenRegisterActorRequest(job_id, - /*max_restarts=*/0, - /*detached=*/true, - /*name=*/""); - status = gcs_actor_manager_->RegisterActor(request2, [](const Status &status) {}); + auto request2 = GenRegisterActorRequest(job_id, + /*max_restarts=*/0, + /*detached=*/true, + /*name=*/""); + status = gcs_actor_manager_->RegisterActor(request2, [](const Status &) {}); io_service_.run_one(); // Ensure successful registration. ASSERT_TRUE(status.ok()); @@ -612,24 +624,23 @@ TEST_F(GcsActorManagerTest, TestNamedActors) { auto job_id_1 = JobID::FromInt(1); auto job_id_2 = JobID::FromInt(2); - auto request1 = Mocker::GenRegisterActorRequest(job_id_1, - /*max_restarts=*/0, - /*detached=*/true, - /*name=*/"actor1", - /*ray_namespace=*/"test_named_actor"); - Status status = - gcs_actor_manager_->RegisterActor(request1, [](const Status &status) {}); + auto request1 = GenRegisterActorRequest(job_id_1, + /*max_restarts=*/0, + /*detached=*/true, + /*name=*/"actor1", + /*ray_namespace=*/"test_named_actor"); + Status status = gcs_actor_manager_->RegisterActor(request1, [](const Status &) {}); io_service_.run_one(); ASSERT_TRUE(status.ok()); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName("actor1", "test_named_actor").Binary(), request1.task_spec().actor_creation_task_spec().actor_id()); - auto request2 = Mocker::GenRegisterActorRequest(job_id_1, - /*max_restarts=*/0, - /*detached=*/true, - /*name=*/"actor2", - /*ray_namesapce=*/"test_named_actor"); - status = gcs_actor_manager_->RegisterActor(request2, [](const Status &status) {}); + auto request2 = GenRegisterActorRequest(job_id_1, + /*max_restarts=*/0, + /*detached=*/true, + /*name=*/"actor2", + /*ray_namesapce=*/"test_named_actor"); + status = gcs_actor_manager_->RegisterActor(request2, [](const Status &) {}); io_service_.run_one(); ASSERT_TRUE(status.ok()); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName("actor2", "test_named_actor").Binary(), @@ -640,24 +651,24 @@ 
TEST_F(GcsActorManagerTest, TestNamedActors) { ActorID::Nil()); // Check that naming collisions return Status::AlreadyExists. - auto request3 = Mocker::GenRegisterActorRequest(job_id_1, - /*max_restarts=*/0, - /*detached=*/true, - /*name=*/"actor2", - /*ray_namesapce=*/"test_named_actor"); - status = gcs_actor_manager_->RegisterActor(request3, [](const Status &status) {}); + auto request3 = GenRegisterActorRequest(job_id_1, + /*max_restarts=*/0, + /*detached=*/true, + /*name=*/"actor2", + /*ray_namesapce=*/"test_named_actor"); + status = gcs_actor_manager_->RegisterActor(request3, [](const Status &) {}); io_service_.run_one(); ASSERT_TRUE(status.IsAlreadyExists()); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName("actor2", "test_named_actor").Binary(), request2.task_spec().actor_creation_task_spec().actor_id()); // Check that naming collisions are enforced across JobIDs. - auto request4 = Mocker::GenRegisterActorRequest(job_id_2, - /*max_restarts=*/0, - /*detached=*/true, - /*name=*/"actor2", - /*ray_namesapce=*/"test_named_actor"); - status = gcs_actor_manager_->RegisterActor(request4, [](const Status &status) {}); + auto request4 = GenRegisterActorRequest(job_id_2, + /*max_restarts=*/0, + /*detached=*/true, + /*name=*/"actor2", + /*ray_namesapce=*/"test_named_actor"); + status = gcs_actor_manager_->RegisterActor(request4, [](const Status &) {}); io_service_.run_one(); ASSERT_TRUE(status.IsAlreadyExists()); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName("actor2", "test_named_actor").Binary(), @@ -676,10 +687,9 @@ TEST_F(GcsActorManagerTest, TestNamedActorDeletionWorkerFailure) { request1.mutable_task_spec()->CopyFrom( registered_actor_1->GetCreationTaskSpecification().GetMessage()); - Status status = gcs_actor_manager_->CreateActor(request1, - [](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) {}); + Status status = gcs_actor_manager_->CreateActor( + request1, + [](std::shared_ptr, const rpc::PushTaskReply &, const Status &) {}); 
ASSERT_TRUE(status.ok()); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName(actor_name, "test").Binary(), request1.task_spec().actor_creation_task_spec().actor_id()); @@ -689,7 +699,7 @@ TEST_F(GcsActorManagerTest, TestNamedActorDeletionWorkerFailure) { // Check that the actor is in state `ALIVE`. auto address = RandomAddress(); - auto node_id = NodeID::FromBinary(address.raylet_id()); + auto node_id = NodeID::FromBinary(address.node_id()); auto worker_id = WorkerID::FromBinary(address.worker_id()); actor->UpdateAddress(address); gcs_actor_manager_->OnActorCreationSuccess(actor, rpc::PushTaskReply()); @@ -702,6 +712,24 @@ TEST_F(GcsActorManagerTest, TestNamedActorDeletionWorkerFailure) { ASSERT_TRUE(absl::StrContains( actor->GetActorTableData().death_cause().actor_died_error_context().error_message(), "worker process has died.")); + ASSERT_EQ(gcs_actor_manager_->GetActorIDByName(actor_name, "test"), + actor->GetActorID()); + + // Detached actor has no reply of WaitForActorRefDeleted request. + ASSERT_FALSE(worker_client_->Reply()); + // Kill this detached actor + rpc::KillActorViaGcsReply reply; + rpc::KillActorViaGcsRequest request; + request.set_actor_id(actor->GetActorID().Binary()); + request.set_force_kill(true); + request.set_no_restart(true); + gcs_actor_manager_->HandleKillActorViaGcs( + request, + &reply, + /*send_reply_callback*/ + [](Status status, std::function success, std::function failure) {}); + io_service_.run_one(); + ASSERT_EQ(gcs_actor_manager_->GetActorIDByName(actor_name, "test"), ActorID::Nil()); // Create an actor with the same name. 
This ensures that the name has been properly @@ -714,10 +742,9 @@ TEST_F(GcsActorManagerTest, TestNamedActorDeletionWorkerFailure) { request2.mutable_task_spec()->CopyFrom( registered_actor_2->GetCreationTaskSpecification().GetMessage()); - status = gcs_actor_manager_->CreateActor(request2, - [](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) {}); + status = gcs_actor_manager_->CreateActor( + request2, + [](std::shared_ptr, const rpc::PushTaskReply &, const Status &) {}); ASSERT_TRUE(status.ok()); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName(actor_name, "test").Binary(), request2.task_spec().actor_creation_task_spec().actor_id()); @@ -734,10 +761,9 @@ TEST_F(GcsActorManagerTest, TestNamedActorDeletionNodeFailure) { request1.mutable_task_spec()->CopyFrom( registered_actor_1->GetCreationTaskSpecification().GetMessage()); - Status status = gcs_actor_manager_->CreateActor(request1, - [](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) {}); + Status status = gcs_actor_manager_->CreateActor( + request1, + [](std::shared_ptr, const rpc::PushTaskReply &, const Status &) {}); ASSERT_TRUE(status.ok()); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName("actor", "test").Binary(), request1.task_spec().actor_creation_task_spec().actor_id()); @@ -747,7 +773,7 @@ TEST_F(GcsActorManagerTest, TestNamedActorDeletionNodeFailure) { // Check that the actor is in state `ALIVE`. 
auto address = RandomAddress(); - auto node_id = NodeID::FromBinary(address.raylet_id()); + auto node_id = NodeID::FromBinary(address.node_id()); actor->UpdateAddress(address); gcs_actor_manager_->OnActorCreationSuccess(actor, rpc::PushTaskReply()); io_service_.run_one(); @@ -771,10 +797,9 @@ TEST_F(GcsActorManagerTest, TestNamedActorDeletionNodeFailure) { request2.mutable_task_spec()->CopyFrom( registered_actor_2->GetCreationTaskSpecification().GetMessage()); - status = gcs_actor_manager_->CreateActor(request2, - [](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) {}); + status = gcs_actor_manager_->CreateActor( + request2, + [](std::shared_ptr, const rpc::PushTaskReply &, const Status &) {}); ASSERT_TRUE(status.ok()); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName("actor", "test").Binary(), request2.task_spec().actor_creation_task_spec().actor_id()); @@ -795,7 +820,7 @@ TEST_F(GcsActorManagerTest, TestNamedActorDeletionNotHappendWhenReconstructed) { Status status = gcs_actor_manager_->CreateActor(request1, [](std::shared_ptr actor, const rpc::PushTaskReply &reply, - const Status &status) {}); + const Status &) {}); ASSERT_TRUE(status.ok()); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName("actor", "test").Binary(), request1.task_spec().actor_creation_task_spec().actor_id()); @@ -805,7 +830,7 @@ TEST_F(GcsActorManagerTest, TestNamedActorDeletionNotHappendWhenReconstructed) { // Check that the actor is in state `ALIVE`. auto address = RandomAddress(); - auto node_id = NodeID::FromBinary(address.raylet_id()); + auto node_id = NodeID::FromBinary(address.node_id()); auto worker_id = WorkerID::FromBinary(address.worker_id()); actor->UpdateAddress(address); gcs_actor_manager_->OnActorCreationSuccess(actor, rpc::PushTaskReply()); @@ -820,11 +845,11 @@ TEST_F(GcsActorManagerTest, TestNamedActorDeletionNotHappendWhenReconstructed) { // It should fail because actor has been reconstructed, and names shouldn't have been // cleaned. 
const auto job_id_2 = JobID::FromInt(2); - auto request2 = Mocker::GenRegisterActorRequest(job_id_2, - /*max_restarts=*/0, - /*detached=*/true, - /*name=*/"actor"); - status = gcs_actor_manager_->RegisterActor(request2, [](const Status &status) {}); + auto request2 = GenRegisterActorRequest(job_id_2, + /*max_restarts=*/0, + /*detached=*/true, + /*name=*/"actor"); + status = gcs_actor_manager_->RegisterActor(request2, [](const Status &) {}); io_service_.run_one(); ASSERT_TRUE(status.IsAlreadyExists()); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName("actor", "test").Binary(), @@ -841,9 +866,11 @@ TEST_F(GcsActorManagerTest, TestDestroyActorBeforeActorCreationCompletes) { std::vector> finished_actors; RAY_CHECK_OK(gcs_actor_manager_->CreateActor( create_actor_request, - [&finished_actors](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) { finished_actors.emplace_back(actor); })); + [&finished_actors](std::shared_ptr result_actor, + const rpc::PushTaskReply &, + const Status &) { + finished_actors.emplace_back(result_actor); + })); ASSERT_EQ(finished_actors.size(), 0); ASSERT_EQ(mock_actor_scheduler_->actors.size(), 1); @@ -876,9 +903,11 @@ TEST_F(GcsActorManagerTest, TestRaceConditionCancelLease) { std::vector> finished_actors; RAY_CHECK_OK(gcs_actor_manager_->CreateActor( create_actor_request, - [&finished_actors](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) { finished_actors.emplace_back(actor); })); + [&finished_actors](std::shared_ptr result_actor, + const rpc::PushTaskReply &, + const Status &) { + finished_actors.emplace_back(result_actor); + })); ASSERT_EQ(finished_actors.size(), 0); ASSERT_EQ(mock_actor_scheduler_->actors.size(), 1); @@ -891,13 +920,12 @@ TEST_F(GcsActorManagerTest, TestRaceConditionCancelLease) { rpc::Address address; auto node_id = NodeID::FromRandom(); auto worker_id = WorkerID::FromRandom(); - address.set_raylet_id(node_id.Binary()); + 
address.set_node_id(node_id.Binary()); address.set_worker_id(worker_id.Binary()); actor->UpdateAddress(address); const auto &actor_id = actor->GetActorID(); - const auto &task_id = TaskID::FromBinary( - registered_actor->GetCreationTaskSpecification().GetMessage().task_id()); - EXPECT_CALL(*mock_actor_scheduler_, CancelOnLeasing(node_id, actor_id, task_id)); + // LeaseID is randomly generated, so we can't check for a specific lease ID. + EXPECT_CALL(*mock_actor_scheduler_, CancelOnLeasing(node_id, actor_id, _)); gcs_actor_manager_->OnWorkerDead(owner_node_id, owner_worker_id); io_service_.run_one(); ASSERT_TRUE(actor->GetActorTableData().death_cause().has_actor_died_error_context()); @@ -920,10 +948,10 @@ TEST_F(GcsActorManagerTest, TestRegisterActor) { registered_actor->GetCreationTaskSpecification().GetMessage()); RAY_CHECK_OK(gcs_actor_manager_->CreateActor( request, - [&finished_actors](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) { - finished_actors.emplace_back(std::move(actor)); + [&finished_actors](std::shared_ptr result_actor, + const rpc::PushTaskReply &, + const Status &) { + finished_actors.emplace_back(result_actor); })); // Make sure the actor is scheduling. 
ASSERT_EQ(mock_actor_scheduler_->actors.size(), 1); @@ -942,7 +970,7 @@ TEST_F(GcsActorManagerTest, TestOwnerWorkerDieBeforeActorDependenciesResolved) { auto job_id = JobID::FromInt(1); auto registered_actor = RegisterActor(job_id); const auto &owner_address = registered_actor->GetOwnerAddress(); - auto node_id = NodeID::FromBinary(owner_address.raylet_id()); + auto node_id = NodeID::FromBinary(owner_address.node_id()); auto worker_id = WorkerID::FromBinary(owner_address.worker_id()); gcs_actor_manager_->OnWorkerDead(node_id, worker_id); io_service_.run_one(); @@ -968,7 +996,7 @@ TEST_F(GcsActorManagerTest, TestOwnerWorkerDieBeforeDetachedActorDependenciesRes auto job_id = JobID::FromInt(1); auto registered_actor = RegisterActor(job_id, /*max_restarts=*/1, /*detached=*/true); const auto &owner_address = registered_actor->GetOwnerAddress(); - auto node_id = NodeID::FromBinary(owner_address.raylet_id()); + auto node_id = NodeID::FromBinary(owner_address.node_id()); auto worker_id = WorkerID::FromBinary(owner_address.worker_id()); gcs_actor_manager_->OnWorkerDead(node_id, worker_id); io_service_.run_one(); @@ -993,7 +1021,7 @@ TEST_F(GcsActorManagerTest, TestOwnerNodeDieBeforeActorDependenciesResolved) { auto job_id = JobID::FromInt(1); auto registered_actor = RegisterActor(job_id); const auto &owner_address = registered_actor->GetOwnerAddress(); - auto node_id = NodeID::FromBinary(owner_address.raylet_id()); + auto node_id = NodeID::FromBinary(owner_address.node_id()); OnNodeDead(node_id); ASSERT_EQ(registered_actor->GetState(), rpc::ActorTableData::DEAD); ASSERT_TRUE( @@ -1015,7 +1043,7 @@ TEST_F(GcsActorManagerTest, TestOwnerNodeDieBeforeDetachedActorDependenciesResol auto job_id = JobID::FromInt(1); auto registered_actor = RegisterActor(job_id, /*max_restarts=*/1, /*detached=*/true); const auto &owner_address = registered_actor->GetOwnerAddress(); - auto node_id = NodeID::FromBinary(owner_address.raylet_id()); + auto node_id = 
NodeID::FromBinary(owner_address.node_id()); OnNodeDead(node_id); ASSERT_EQ(registered_actor->GetState(), rpc::ActorTableData::DEAD); ASSERT_TRUE( @@ -1046,9 +1074,11 @@ TEST_F(GcsActorManagerTest, TestOwnerAndChildDiedAtTheSameTimeRaceCondition) { std::vector> finished_actors; RAY_CHECK_OK(gcs_actor_manager_->CreateActor( create_actor_request, - [&finished_actors](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) { finished_actors.emplace_back(actor); })); + [&finished_actors](std::shared_ptr result_actor, + const rpc::PushTaskReply &, + const Status &) { + finished_actors.emplace_back(result_actor); + })); auto actor = mock_actor_scheduler_->actors.back(); mock_actor_scheduler_->actors.pop_back(); @@ -1078,25 +1108,24 @@ TEST_F(GcsActorManagerTest, TestRayNamespace) { std::string second_namespace = "another_namespace"; job_namespace_table_[job_id_2] = second_namespace; - auto request1 = Mocker::GenRegisterActorRequest(job_id_1, - /*max_restarts=*/0, - /*detached=*/true, - /*name=*/"actor"); - Status status = - gcs_actor_manager_->RegisterActor(request1, [](const Status &status) {}); + auto request1 = GenRegisterActorRequest(job_id_1, + /*max_restarts=*/0, + /*detached=*/true, + /*name=*/"actor"); + Status status = gcs_actor_manager_->RegisterActor(request1, [](const Status &) {}); ASSERT_TRUE(status.ok()); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName("actor", "test").Binary(), request1.task_spec().actor_creation_task_spec().actor_id()); io_service_.run_one(); - auto request2 = Mocker::GenRegisterActorRequest(job_id_2, - /*max_restarts=*/0, - /*detached=*/true, - /*name=*/"actor", - second_namespace); + auto request2 = GenRegisterActorRequest(job_id_2, + /*max_restarts=*/0, + /*detached=*/true, + /*name=*/"actor", + second_namespace); // Create a second actor of the same name. Its job id belongs to a different // namespace though. 
- status = gcs_actor_manager_->RegisterActor(request2, [](const Status &status) {}); + status = gcs_actor_manager_->RegisterActor(request2, [](const Status &) {}); ASSERT_TRUE(status.ok()); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName("actor", second_namespace).Binary(), request2.task_spec().actor_creation_task_spec().actor_id()); @@ -1105,12 +1134,12 @@ TEST_F(GcsActorManagerTest, TestRayNamespace) { request1.task_spec().actor_creation_task_spec().actor_id()); io_service_.run_one(); - auto request3 = Mocker::GenRegisterActorRequest(job_id_3, - /*max_restarts=*/0, - /*detached=*/true, - /*name=*/"actor", - /*ray_namespace=*/"test"); - status = gcs_actor_manager_->RegisterActor(request3, [](const Status &status) {}); + auto request3 = GenRegisterActorRequest(job_id_3, + /*max_restarts=*/0, + /*detached=*/true, + /*name=*/"actor", + /*ray_namespace=*/"test"); + status = gcs_actor_manager_->RegisterActor(request3, [](const Status &) {}); ASSERT_TRUE(status.IsAlreadyExists()); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName("actor", "test").Binary(), request1.task_spec().actor_creation_task_spec().actor_id()); @@ -1122,12 +1151,10 @@ TEST_F(GcsActorManagerTest, TestReuseActorNameInNamespace) { std::string ray_namespace = "actor_namespace"; auto job_id_1 = JobID::FromInt(1); - auto request_1 = - Mocker::GenRegisterActorRequest(job_id_1, 0, true, actor_name, ray_namespace); + auto request_1 = GenRegisterActorRequest(job_id_1, 0, true, actor_name, ray_namespace); auto actor_id_1 = ActorID::FromBinary(request_1.task_spec().actor_creation_task_spec().actor_id()); - Status status = - gcs_actor_manager_->RegisterActor(request_1, [](const Status &status) {}); + Status status = gcs_actor_manager_->RegisterActor(request_1, [](const Status &) {}); ASSERT_TRUE(status.ok()); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName(actor_name, ray_namespace).Binary(), actor_id_1.Binary()); @@ -1135,18 +1162,17 @@ TEST_F(GcsActorManagerTest, TestReuseActorNameInNamespace) { auto owner_address 
= request_1.task_spec().caller_address(); auto node_info = std::make_shared(); - node_info->set_node_id(owner_address.raylet_id()); + node_info->set_node_id(owner_address.node_id()); gcs_actor_manager_->OnNodeDead(node_info, ""); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName(actor_name, ray_namespace).Binary(), ActorID::Nil().Binary()); io_service_.run_one(); auto job_id_2 = JobID::FromInt(2); - auto request_2 = - Mocker::GenRegisterActorRequest(job_id_2, 0, true, actor_name, ray_namespace); + auto request_2 = GenRegisterActorRequest(job_id_2, 0, true, actor_name, ray_namespace); auto actor_id_2 = ActorID::FromBinary(request_2.task_spec().actor_creation_task_spec().actor_id()); - status = gcs_actor_manager_->RegisterActor(request_2, [](const Status &status) {}); + status = gcs_actor_manager_->RegisterActor(request_2, [](const Status &) {}); ASSERT_TRUE(status.ok()); ASSERT_EQ(gcs_actor_manager_->GetActorIDByName(actor_name, ray_namespace).Binary(), actor_id_2.Binary()); @@ -1162,12 +1188,13 @@ TEST_F(GcsActorManagerTest, TestGetAllActorInfoFilters) { create_actor_request.mutable_task_spec()->CopyFrom( registered_actor->GetCreationTaskSpecification().GetMessage()); std::vector> finished_actors; - Status status = gcs_actor_manager_->CreateActor( + Status create_status = gcs_actor_manager_->CreateActor( create_actor_request, - [&finished_actors](const std::shared_ptr &actor, - const rpc::PushTaskReply &reply, - const Status &status) { finished_actors.emplace_back(actor); }); + [&finished_actors](const std::shared_ptr &result_actor, + const rpc::PushTaskReply &, + const Status &) { finished_actors.emplace_back(result_actor); }); + ASSERT_TRUE(create_status.ok()); auto actor = mock_actor_scheduler_->actors.back(); mock_actor_scheduler_->actors.pop_back(); @@ -1182,17 +1209,16 @@ TEST_F(GcsActorManagerTest, TestGetAllActorInfoFilters) { auto job_id_other = JobID::FromInt(2); auto num_other_actors = 3; for (int i = 0; i < num_other_actors; i++) { - auto request1 = 
Mocker::GenRegisterActorRequest(job_id_other, - /*max_restarts=*/0, - /*detached=*/false); - Status status = - gcs_actor_manager_->RegisterActor(request1, [](const Status &status) {}); - ASSERT_TRUE(status.ok()); + auto request1 = GenRegisterActorRequest(job_id_other, + /*max_restarts=*/0, + /*detached=*/false); + Status register_status = + gcs_actor_manager_->RegisterActor(request1, [](const Status &) {}); + ASSERT_TRUE(register_status.ok()); io_service_.run_one(); } - auto callback = - [](Status status, std::function success, std::function failure) {}; + auto callback = [](Status, std::function, std::function) {}; // Filter with actor id { rpc::GetAllActorInfoRequest request; @@ -1260,11 +1286,10 @@ TEST_F(GcsActorManagerTest, TestGetAllActorInfoLimit) { auto job_id_1 = JobID::FromInt(1); auto num_actors = 3; for (int i = 0; i < num_actors; i++) { - auto request1 = Mocker::GenRegisterActorRequest(job_id_1, - /*max_restarts=*/0, - /*detached=*/false); - Status status = - gcs_actor_manager_->RegisterActor(request1, [](const Status &status) {}); + auto request1 = GenRegisterActorRequest(job_id_1, + /*max_restarts=*/0, + /*detached=*/false); + Status status = gcs_actor_manager_->RegisterActor(request1, [](const Status &) {}); ASSERT_TRUE(status.ok()); io_service_.run_one(); } @@ -1273,9 +1298,7 @@ TEST_F(GcsActorManagerTest, TestGetAllActorInfoLimit) { rpc::GetAllActorInfoRequest request; auto &reply = *google::protobuf::Arena::CreateMessage(&arena); - auto callback = [](Status status, - std::function success, - std::function failure) {}; + auto callback = [](Status, std::function, std::function) {}; gcs_actor_manager_->HandleGetAllActorInfo(request, &reply, callback); ASSERT_EQ(reply.actor_table_data().size(), 3); @@ -1298,9 +1321,9 @@ TEST_F(GcsActorManagerTest, TestKillActorWhenActorIsCreating) { std::vector> finished_actors; Status status = gcs_actor_manager_->CreateActor( create_actor_request, - [&finished_actors](const std::shared_ptr &actor, - const 
rpc::PushTaskReply &reply, - const Status &status) { finished_actors.emplace_back(actor); }); + [&finished_actors](const std::shared_ptr &result_actor, + const rpc::PushTaskReply &, + const Status &) { finished_actors.emplace_back(result_actor); }); RAY_CHECK_OK(status); ASSERT_EQ(finished_actors.size(), 0); @@ -1326,7 +1349,7 @@ TEST_F(GcsActorManagerTest, TestKillActorWhenActorIsCreating) { request, &reply, /*send_reply_callback*/ - [](Status status, std::function success, std::function failure) {}); + [](Status, std::function, std::function) {}); io_service_.run_one(); // Make sure the `KillActor` rpc is send. @@ -1347,9 +1370,9 @@ TEST_F(GcsActorManagerTest, TestRestartActorForLineageReconstruction) { std::vector> created_actors; RAY_CHECK_OK(gcs_actor_manager_->CreateActor( create_actor_request, - [&created_actors](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) { created_actors.emplace_back(actor); })); + [&created_actors](std::shared_ptr result_actor, + const rpc::PushTaskReply &, + const Status &) { created_actors.emplace_back(result_actor); })); ASSERT_EQ(created_actors.size(), 0); ASSERT_EQ(mock_actor_scheduler_->actors.size(), 1); @@ -1358,7 +1381,7 @@ TEST_F(GcsActorManagerTest, TestRestartActorForLineageReconstruction) { // Check that the actor is in state `ALIVE`. 
auto address = RandomAddress(); - auto node_id = NodeID::FromBinary(address.raylet_id()); + auto node_id = NodeID::FromBinary(address.node_id()); actor->UpdateAddress(address); gcs_actor_manager_->OnActorCreationSuccess(actor, rpc::PushTaskReply()); io_service_.run_one(); @@ -1375,7 +1398,7 @@ TEST_F(GcsActorManagerTest, TestRestartActorForLineageReconstruction) { mock_actor_scheduler_->actors.clear(); ASSERT_EQ(created_actors.size(), 1); auto node_id2 = NodeID::FromRandom(); - address.set_raylet_id(node_id2.Binary()); + address.set_node_id(node_id2.Binary()); actor->UpdateAddress(address); gcs_actor_manager_->OnActorCreationSuccess(actor, rpc::PushTaskReply()); io_service_.run_one(); @@ -1407,7 +1430,7 @@ TEST_F(GcsActorManagerTest, TestRestartActorForLineageReconstruction) { mock_actor_scheduler_->actors.clear(); ASSERT_EQ(created_actors.size(), 1); auto node_id3 = NodeID::FromRandom(); - address.set_raylet_id(node_id3.Binary()); + address.set_node_id(node_id3.Binary()); actor->UpdateAddress(address); gcs_actor_manager_->OnActorCreationSuccess(actor, rpc::PushTaskReply()); io_service_.run_one(); @@ -1428,9 +1451,9 @@ TEST_F(GcsActorManagerTest, TestRestartPermanentlyDeadActorForLineageReconstruct std::vector> created_actors; RAY_CHECK_OK(gcs_actor_manager_->CreateActor( create_actor_request, - [&created_actors](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) { created_actors.emplace_back(actor); })); + [&created_actors](std::shared_ptr result_actor, + const rpc::PushTaskReply &, + const Status &) { created_actors.emplace_back(result_actor); })); ASSERT_EQ(created_actors.size(), 0); ASSERT_EQ(mock_actor_scheduler_->actors.size(), 1); @@ -1484,9 +1507,9 @@ TEST_F(GcsActorManagerTest, TestIdempotencyOfRestartActorForLineageReconstructio std::vector> created_actors; RAY_CHECK_OK(gcs_actor_manager_->CreateActor( create_actor_request, - [&created_actors](std::shared_ptr actor, - const rpc::PushTaskReply &reply, - const Status &status) 
{ created_actors.emplace_back(actor); })); + [&created_actors](std::shared_ptr result_actor, + const rpc::PushTaskReply &, + const Status &) { created_actors.emplace_back(result_actor); })); ASSERT_EQ(created_actors.size(), 0); ASSERT_EQ(mock_actor_scheduler_->actors.size(), 1); @@ -1517,17 +1540,11 @@ TEST_F(GcsActorManagerTest, TestIdempotencyOfRestartActorForLineageReconstructio rpc::RestartActorForLineageReconstructionReply reply2; gcs_actor_manager_->HandleRestartActorForLineageReconstruction( - request, - &reply1, - [&reply1]( - Status status, std::function success, std::function failure) { + request, &reply1, [&reply1](Status, std::function, std::function) { ASSERT_EQ(reply1.status().code(), static_cast(StatusCode::OK)); }); gcs_actor_manager_->HandleRestartActorForLineageReconstruction( - request, - &reply2, - [&reply2]( - Status status, std::function success, std::function failure) { + request, &reply2, [&reply2](Status, std::function, std::function) { ASSERT_EQ(reply2.status().code(), static_cast(StatusCode::OK)); }); io_service_.run_one(); @@ -1538,7 +1555,7 @@ TEST_F(GcsActorManagerTest, TestIdempotencyOfRestartActorForLineageReconstructio mock_actor_scheduler_->actors.clear(); ASSERT_EQ(created_actors.size(), 1); auto node_id = NodeID::FromRandom(); - address.set_raylet_id(node_id.Binary()); + address.set_node_id(node_id.Binary()); actor->UpdateAddress(address); gcs_actor_manager_->OnActorCreationSuccess(actor, rpc::PushTaskReply()); io_service_.run_one(); @@ -1577,9 +1594,9 @@ TEST_F(GcsActorManagerTest, TestDestroyActorWhenActorIsCreating) { std::vector> finished_actors; Status status = gcs_actor_manager_->CreateActor( create_actor_request, - [&finished_actors](const std::shared_ptr &actor, - const rpc::PushTaskReply &reply, - const Status &status) { finished_actors.emplace_back(actor); }); + [&finished_actors](const std::shared_ptr &result_actor, + const rpc::PushTaskReply &, + const Status &) { finished_actors.emplace_back(result_actor); }); 
RAY_CHECK_OK(status); ASSERT_EQ(finished_actors.size(), 0); @@ -1605,7 +1622,7 @@ TEST_F(GcsActorManagerTest, TestDestroyActorWhenActorIsCreating) { request, &reply, /*send_reply_callback*/ - [](Status status, std::function success, std::function failure) {}); + [](Status, std::function, std::function) {}); io_service_.run_one(); io_service_.run_one(); @@ -1619,7 +1636,7 @@ TEST_F(GcsActorManagerTest, TestDestroyActorWhenActorIsCreating) { TEST_F(GcsActorManagerTest, TestDestroyWhileRegistering) { // Register comes in -> Kill comes in -> Run all kv operations and callbacks - auto register_request = Mocker::GenRegisterActorRequest( + auto register_request = GenRegisterActorRequest( JobID::FromInt(1), /*max_restarts=*/0, /*detached=*/false, "", "test"); rpc::RegisterActorReply register_reply; gcs_actor_manager_->HandleRegisterActor( @@ -1656,9 +1673,9 @@ TEST_F(GcsActorManagerTest, TestRestartPreemptedActor) { Status status = gcs_actor_manager_->CreateActor(create_actor_request, - [](const std::shared_ptr &actor, - const rpc::PushTaskReply &reply, - const Status &status) {}); + [](const std::shared_ptr &, + const rpc::PushTaskReply &, + const Status &) {}); RAY_CHECK_OK(status); ASSERT_EQ(mock_actor_scheduler_->actors.size(), 1); @@ -1667,7 +1684,7 @@ TEST_F(GcsActorManagerTest, TestRestartPreemptedActor) { // Make the actor alive on a specific node auto address = RandomAddress(); - auto node_id = NodeID::FromBinary(address.raylet_id()); + auto node_id = NodeID::FromBinary(address.node_id()); auto worker_id = WorkerID::FromBinary(address.worker_id()); actor->UpdateAddress(address); gcs_actor_manager_->OnActorCreationSuccess(actor, rpc::PushTaskReply()); @@ -1686,7 +1703,7 @@ TEST_F(GcsActorManagerTest, TestRestartPreemptedActor) { // Make the actor alive on a specific node again. 
auto new_address = RandomAddress(); - auto new_node_id = NodeID::FromBinary(new_address.raylet_id()); + auto new_node_id = NodeID::FromBinary(new_address.node_id()); auto new_worker_id = WorkerID::FromBinary(new_address.worker_id()); actor->UpdateAddress(new_address); gcs_actor_manager_->OnActorCreationSuccess(actor, rpc::PushTaskReply()); @@ -1709,7 +1726,7 @@ TEST_F(GcsActorManagerTest, TestRestartPreemptedActor) { // Make the actor alive on another node again auto new_address_2 = RandomAddress(); - auto new_node_id_2 = NodeID::FromBinary(new_address_2.raylet_id()); + auto new_node_id_2 = NodeID::FromBinary(new_address_2.node_id()); auto new_worker_id_2 = WorkerID::FromBinary(new_address_2.worker_id()); actor->UpdateAddress(new_address_2); gcs_actor_manager_->OnActorCreationSuccess(actor, rpc::PushTaskReply()); diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_mock_test.cc b/src/ray/gcs/tests/gcs_actor_scheduler_mock_test.cc similarity index 85% rename from src/ray/gcs/gcs_server/test/gcs_actor_scheduler_mock_test.cc rename to src/ray/gcs/tests/gcs_actor_scheduler_mock_test.cc index 4493b8257bc1..cbe43ab9f513 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_mock_test.cc +++ b/src/ray/gcs/tests/gcs_actor_scheduler_mock_test.cc @@ -15,24 +15,23 @@ #include #include #include -// clang-format off -#include "gtest/gtest.h" + #include "gmock/gmock.h" -#include "ray/gcs/gcs_server/gcs_actor_manager.h" -#include "ray/gcs/gcs_server/gcs_actor_scheduler.h" +#include "gtest/gtest.h" +#include "mock/ray/gcs/gcs_node_manager.h" #include "mock/ray/gcs/store_client/store_client.h" -#include "mock/ray/gcs/gcs_server/gcs_node_manager.h" #include "mock/ray/raylet_client/raylet_client.h" -#include "mock/ray/pubsub/subscriber.h" #include "mock/ray/rpc/worker/core_worker_client.h" -#include "ray/common/test_util.h" -// clang-format on +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_actor.h" +#include "ray/gcs/gcs_actor_scheduler.h" +#include 
"ray/util/counter_map.h" using namespace ::testing; // NOLINT namespace ray { -using raylet::NoopLocalTaskManager; namespace gcs { + struct MockCallback { MOCK_METHOD(void, Call, ((std::shared_ptr))); void operator()(std::shared_ptr a) { return Call(a); } @@ -57,8 +56,8 @@ class GcsActorSchedulerMockTest : public Test { /*is_node_available_fn=*/ [](auto) { return true; }, /*is_local_node_with_raylet=*/false); - local_task_manager_ = std::make_unique(); - cluster_task_manager = std::make_unique( + local_lease_manager_ = std::make_unique(); + cluster_lease_manager = std::make_unique( local_node_id, *cluster_resource_scheduler, /*get_node_info=*/ @@ -66,8 +65,8 @@ class GcsActorSchedulerMockTest : public Test { auto node = gcs_node_manager->GetAliveNode(nid); return node.has_value() ? node.value().get() : nullptr; }, - /*announce_infeasible_task=*/nullptr, - /*local_task_manager=*/*local_task_manager_); + /*announce_infeasible_lease=*/nullptr, + *local_lease_manager_); counter.reset( new CounterMap>()); worker_client_pool_ = std::make_unique( @@ -76,7 +75,7 @@ class GcsActorSchedulerMockTest : public Test { io_context, *actor_table, *gcs_node_manager, - *cluster_task_manager, + *cluster_lease_manager, [this](auto a, auto b, auto c) { schedule_failure_handler(a); }, [this](auto a, const rpc::PushTaskReply) { schedule_success_handler(a); }, *client_pool, @@ -94,8 +93,8 @@ class GcsActorSchedulerMockTest : public Test { std::shared_ptr store_client; std::unique_ptr actor_table; std::unique_ptr gcs_node_manager; - std::unique_ptr local_task_manager_; - std::unique_ptr cluster_task_manager; + std::unique_ptr local_lease_manager_; + std::unique_ptr cluster_lease_manager; std::unique_ptr actor_scheduler; std::shared_ptr core_worker_client; std::unique_ptr worker_client_pool_; @@ -122,7 +121,8 @@ TEST_F(GcsActorSchedulerMockTest, KillWorkerLeak1) { actor_data.set_actor_id(actor_id.Binary()); auto actor = std::make_shared(actor_data, rpc::TaskSpec(), counter); 
rpc::ClientCallback cb; - EXPECT_CALL(*raylet_client, RequestWorkerLease(An(), _, _, _, _)) + EXPECT_CALL(*raylet_client, + RequestWorkerLease(An(), _, _, _, _)) .WillOnce(testing::SaveArg<2>(&cb)); // Ensure actor is killed EXPECT_CALL(*core_worker_client, KillActor(_, _)); @@ -130,7 +130,7 @@ TEST_F(GcsActorSchedulerMockTest, KillWorkerLeak1) { actor->GetMutableActorTableData()->set_state(rpc::ActorTableData::DEAD); actor_scheduler->CancelOnNode(node_id); ray::rpc::RequestWorkerLeaseReply reply; - reply.mutable_worker_address()->set_raylet_id(node_id.Binary()); + reply.mutable_worker_address()->set_node_id(node_id.Binary()); reply.mutable_worker_address()->set_worker_id(worker_id.Binary()); cb(Status::OK(), std::move(reply)); } @@ -139,8 +139,8 @@ TEST_F(GcsActorSchedulerMockTest, KillWorkerLeak2) { // Ensure worker is not leak in the following case: // 1. Actor is in pending creation // 2. Gcs push creation task to run in worker - // 3. Cancel the task - // 4. Task creating reply received + // 3. Cancel the lease + // 4. Lease creating reply received // We'd like to test the worker got released eventually. // Worker is released with actor killing auto actor_id = ActorID::FromHex("f4ce02420592ca68c1738a0d01000000"); @@ -151,18 +151,19 @@ TEST_F(GcsActorSchedulerMockTest, KillWorkerLeak2) { rpc::ClientCallback request_worker_lease_cb; // Ensure actor is killed EXPECT_CALL(*core_worker_client, KillActor(_, _)); - EXPECT_CALL(*raylet_client, RequestWorkerLease(An(), _, _, _, _)) + EXPECT_CALL(*raylet_client, + RequestWorkerLease(An(), _, _, _, _)) .WillOnce(testing::SaveArg<2>(&request_worker_lease_cb)); // Postable is not default constructable, so we use a unique_ptr to hold one. 
std::unique_ptr> async_put_with_index_cb; // Leasing successfully EXPECT_CALL(*store_client, AsyncPut(_, _, _, _, _)) - .WillOnce( - DoAll(SaveArgToUniquePtr<4>(&async_put_with_index_cb), Return(Status::OK()))); + .WillOnce(DoAll(SaveArgToUniquePtr<4>(&async_put_with_index_cb), + InvokeWithoutArgs([]() {}))); actor_scheduler->ScheduleByRaylet(actor); rpc::RequestWorkerLeaseReply reply; - reply.mutable_worker_address()->set_raylet_id(node_id.Binary()); + reply.mutable_worker_address()->set_node_id(node_id.Binary()); reply.mutable_worker_address()->set_worker_id(worker_id.Binary()); request_worker_lease_cb(Status::OK(), std::move(reply)); @@ -177,5 +178,6 @@ TEST_F(GcsActorSchedulerMockTest, KillWorkerLeak2) { actor_scheduler->CancelOnWorker(node_id, worker_id); push_normal_task_cb(Status::OK(), rpc::PushTaskReply()); } + } // namespace gcs } // namespace ray diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc b/src/ray/gcs/tests/gcs_actor_scheduler_test.cc similarity index 89% rename from src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc rename to src/ray/gcs/tests/gcs_actor_scheduler_test.cc index 1834056bbd62..689007ffc67e 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc +++ b/src/ray/gcs/tests/gcs_actor_scheduler_test.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "ray/gcs/gcs_actor_scheduler.h" + #include #include @@ -20,38 +22,82 @@ #include #include -// clang-format off -#include "ray/common/asio/asio_util.h" -#include "ray/gcs/gcs_server/gcs_actor_scheduler.h" -#include "ray/gcs/gcs_server/test/gcs_server_test_util.h" -#include "ray/gcs/test/gcs_test_util.h" +#include "fakes/ray/rpc/raylet/raylet_client.h" +#include "fakes/ray/rpc/worker/core_worker_client.h" #include "mock/ray/pubsub/publisher.h" -// clang-format on +#include "ray/common/asio/asio_util.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_actor.h" +#include "ray/gcs/gcs_actor_scheduler.h" +#include "ray/gcs/gcs_resource_manager.h" +#include "ray/gcs/store_client/in_memory_store_client.h" +#include "ray/util/counter_map.h" namespace ray { -using raylet::NoopLocalTaskManager; +using raylet::NoopLocalLeaseManager; namespace gcs { +class MockedGcsActorScheduler : public gcs::GcsActorScheduler { + public: + using gcs::GcsActorScheduler::GcsActorScheduler; + + protected: + void RetryLeasingWorkerFromNode(std::shared_ptr actor, + std::shared_ptr node) override { + ++num_retry_leasing_count_; + if (num_retry_leasing_count_ <= 1) { + DoRetryLeasingWorkerFromNode(actor, node); + } + } + + void RetryCreatingActorOnWorker(std::shared_ptr actor, + std::shared_ptr worker) override { + ++num_retry_creating_count_; + DoRetryCreatingActorOnWorker(actor, worker); + } + + public: + int num_retry_leasing_count_ = 0; + int num_retry_creating_count_ = 0; +}; + +class FakeGcsActorTable : public gcs::GcsActorTable { + public: + // The store_client and io_context args are NOT used. 
+ explicit FakeGcsActorTable(std::shared_ptr store_client) + : GcsActorTable(store_client) {} + + void Put(const ActorID &key, + const rpc::ActorTableData &value, + Postable callback) override { + std::move(callback).Post("FakeGcsActorTable.Put", Status::OK()); + } + + private: + std::shared_ptr store_client_ = + std::make_shared(); +}; + class GcsActorSchedulerTest : public ::testing::Test { public: void SetUp() override { io_context_ = std::make_unique("GcsActorSchedulerTest"); - raylet_client_ = std::make_shared(); + raylet_client_ = std::make_shared(); raylet_client_pool_ = std::make_shared( [this](const rpc::Address &addr) { return raylet_client_; }); - worker_client_ = std::make_shared(); - gcs_publisher_ = std::make_shared( + worker_client_ = std::make_shared(); + gcs_publisher_ = std::make_shared( std::make_unique()); store_client_ = std::make_shared(); - gcs_table_storage_ = std::make_shared(); + gcs_table_storage_ = + std::make_unique(std::make_unique()); gcs_node_manager_ = std::make_shared(gcs_publisher_.get(), gcs_table_storage_.get(), io_context_->GetIoService(), raylet_client_pool_.get(), ClusterID::Nil()); - gcs_actor_table_ = - std::make_shared(store_client_); + gcs_actor_table_ = std::make_shared(store_client_); local_node_id_ = NodeID::FromRandom(); cluster_resource_scheduler_ = std::make_unique( io_context_->GetIoService(), @@ -62,8 +108,8 @@ class GcsActorSchedulerTest : public ::testing::Test { /*is_local_node_with_raylet=*/false); counter.reset( new CounterMap>()); - local_task_manager_ = std::make_unique(); - cluster_task_manager_ = std::make_unique( + local_lease_manager_ = std::make_unique(); + cluster_lease_manager_ = std::make_unique( local_node_id_, *cluster_resource_scheduler_, /*get_node_info=*/ @@ -72,7 +118,7 @@ class GcsActorSchedulerTest : public ::testing::Test { return node.has_value() ? 
node.value().get() : nullptr; }, /*announce_infeasible_task=*/nullptr, - /*local_task_manager=*/*local_task_manager_); + /*local_lease_manager=*/*local_lease_manager_); auto gcs_resource_manager = std::make_shared( io_context_->GetIoService(), cluster_resource_scheduler_->GetClusterResourceManager(), @@ -80,11 +126,11 @@ class GcsActorSchedulerTest : public ::testing::Test { local_node_id_); worker_client_pool_ = std::make_unique( [this](const rpc::Address &address) { return worker_client_; }); - gcs_actor_scheduler_ = std::make_shared( + gcs_actor_scheduler_ = std::make_shared( io_context_->GetIoService(), *gcs_actor_table_, *gcs_node_manager_, - *cluster_task_manager_, + *cluster_lease_manager_, /*schedule_failure_handler=*/ [this](std::shared_ptr actor, const rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, @@ -120,7 +166,7 @@ class GcsActorSchedulerTest : public ::testing::Test { std::shared_ptr NewGcsActor( const std::unordered_map &required_placement_resources) { rpc::Address owner_address; - owner_address.set_raylet_id(NodeID::FromRandom().Binary()); + owner_address.set_node_id(NodeID::FromRandom().Binary()); owner_address.set_ip_address("127.0.0.1"); owner_address.set_port(5678); owner_address.set_worker_id(WorkerID::FromRandom().Binary()); @@ -130,15 +176,14 @@ class GcsActorSchedulerTest : public ::testing::Test { required_resources.insert(required_placement_resources.begin(), required_placement_resources.end()); - auto actor_creating_task_spec = - Mocker::GenActorCreationTask(job_id, - /*max_restarts=*/1, - /*detached=*/true, - /*name=*/"", - "", - owner_address, - required_resources, - required_placement_resources); + auto actor_creating_task_spec = GenActorCreationTask(job_id, + /*max_restarts=*/1, + /*detached=*/true, + /*name=*/"", + "", + owner_address, + required_resources, + required_placement_resources); return std::make_shared(actor_creating_task_spec.GetMessage(), /*ray_namespace=*/"", counter); @@ -146,7 +191,7 @@ class 
GcsActorSchedulerTest : public ::testing::Test { std::shared_ptr AddNewNode( std::unordered_map node_resources) { - auto node_info = Mocker::GenNodeInfo(); + auto node_info = GenNodeInfo(); node_info->mutable_resources_total()->insert(node_resources.begin(), node_resources.end()); gcs_node_manager_->AddNode(node_info); @@ -155,21 +200,21 @@ class GcsActorSchedulerTest : public ::testing::Test { protected: std::unique_ptr io_context_; - std::shared_ptr store_client_; - std::shared_ptr gcs_actor_table_; - std::shared_ptr raylet_client_; - std::shared_ptr worker_client_; + std::shared_ptr store_client_; + std::shared_ptr gcs_actor_table_; + std::shared_ptr raylet_client_; + std::shared_ptr worker_client_; std::unique_ptr worker_client_pool_; std::shared_ptr gcs_node_manager_; - std::unique_ptr local_task_manager_; + std::unique_ptr local_lease_manager_; std::unique_ptr cluster_resource_scheduler_; - std::shared_ptr cluster_task_manager_; - std::shared_ptr gcs_actor_scheduler_; + std::shared_ptr cluster_lease_manager_; + std::shared_ptr gcs_actor_scheduler_; std::shared_ptr>> counter; std::vector> failure_actors_; std::vector> success_actors_; - std::shared_ptr gcs_publisher_; + std::shared_ptr gcs_publisher_; std::shared_ptr gcs_table_storage_; std::shared_ptr raylet_client_pool_; NodeID local_node_id_; @@ -183,7 +228,7 @@ TEST_F(GcsActorSchedulerTest, TestScheduleFailedWithZeroNode) { ASSERT_EQ(0, gcs_node_manager_->GetAllAliveNodes().size()); auto job_id = JobID::FromInt(1); - auto create_actor_request = Mocker::GenCreateActorRequest(job_id); + auto create_actor_request = GenCreateActorRequest(job_id); auto actor = std::make_shared(create_actor_request.task_spec(), "", counter); @@ -199,13 +244,13 @@ TEST_F(GcsActorSchedulerTest, TestScheduleFailedWithZeroNode) { } TEST_F(GcsActorSchedulerTest, TestScheduleActorSuccess) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); auto node_id = NodeID::FromBinary(node->node_id()); 
gcs_node_manager_->AddNode(node); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); auto job_id = JobID::FromInt(1); - auto create_actor_request = Mocker::GenCreateActorRequest(job_id); + auto create_actor_request = GenCreateActorRequest(job_id); auto actor = std::make_shared(create_actor_request.task_spec(), "", counter); @@ -237,13 +282,13 @@ TEST_F(GcsActorSchedulerTest, TestScheduleActorSuccess) { } TEST_F(GcsActorSchedulerTest, TestScheduleRetryWhenLeasing) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); auto node_id = NodeID::FromBinary(node->node_id()); gcs_node_manager_->AddNode(node); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); auto job_id = JobID::FromInt(1); - auto create_actor_request = Mocker::GenCreateActorRequest(job_id); + auto create_actor_request = GenCreateActorRequest(job_id); auto actor = std::make_shared(create_actor_request.task_spec(), "", counter); @@ -288,13 +333,13 @@ TEST_F(GcsActorSchedulerTest, TestScheduleRetryWhenLeasing) { } TEST_F(GcsActorSchedulerTest, TestScheduleRetryWhenCreating) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); auto node_id = NodeID::FromBinary(node->node_id()); gcs_node_manager_->AddNode(node); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); auto job_id = JobID::FromInt(1); - auto create_actor_request = Mocker::GenCreateActorRequest(job_id); + auto create_actor_request = GenCreateActorRequest(job_id); auto actor = std::make_shared(create_actor_request.task_spec(), "", counter); @@ -332,13 +377,13 @@ TEST_F(GcsActorSchedulerTest, TestScheduleRetryWhenCreating) { } TEST_F(GcsActorSchedulerTest, TestNodeFailedWhenLeasing) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); auto node_id = NodeID::FromBinary(node->node_id()); gcs_node_manager_->AddNode(node); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); auto job_id = JobID::FromInt(1); - auto create_actor_request = Mocker::GenCreateActorRequest(job_id); + 
auto create_actor_request = GenCreateActorRequest(job_id); auto actor = std::make_shared(create_actor_request.task_spec(), "", counter); @@ -374,13 +419,13 @@ TEST_F(GcsActorSchedulerTest, TestNodeFailedWhenLeasing) { } TEST_F(GcsActorSchedulerTest, TestLeasingCancelledWhenLeasing) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); auto node_id = NodeID::FromBinary(node->node_id()); gcs_node_manager_->AddNode(node); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); auto job_id = JobID::FromInt(1); - auto create_actor_request = Mocker::GenCreateActorRequest(job_id); + auto create_actor_request = GenCreateActorRequest(job_id); auto actor = std::make_shared(create_actor_request.task_spec(), "", counter); @@ -391,8 +436,8 @@ TEST_F(GcsActorSchedulerTest, TestLeasingCancelledWhenLeasing) { ASSERT_EQ(1, raylet_client_->callbacks.size()); // Cancel the lease request. - const auto &task_id = TaskID::FromBinary(create_actor_request.task_spec().task_id()); - gcs_actor_scheduler_->CancelOnLeasing(node_id, actor->GetActorID(), task_id); + gcs_actor_scheduler_->CancelOnLeasing( + node_id, actor->GetActorID(), actor->GetLeaseSpecification().LeaseId()); ASSERT_EQ(1, raylet_client_->num_workers_requested); ASSERT_EQ(1, raylet_client_->callbacks.size()); @@ -411,13 +456,13 @@ TEST_F(GcsActorSchedulerTest, TestLeasingCancelledWhenLeasing) { } TEST_F(GcsActorSchedulerTest, TestNodeFailedWhenCreating) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); auto node_id = NodeID::FromBinary(node->node_id()); gcs_node_manager_->AddNode(node); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); auto job_id = JobID::FromInt(1); - auto create_actor_request = Mocker::GenCreateActorRequest(job_id); + auto create_actor_request = GenCreateActorRequest(job_id); auto actor = std::make_shared(create_actor_request.task_spec(), "", counter); @@ -457,13 +502,13 @@ TEST_F(GcsActorSchedulerTest, TestNodeFailedWhenCreating) { } 
TEST_F(GcsActorSchedulerTest, TestWorkerFailedWhenCreating) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); auto node_id = NodeID::FromBinary(node->node_id()); gcs_node_manager_->AddNode(node); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); auto job_id = JobID::FromInt(1); - auto create_actor_request = Mocker::GenCreateActorRequest(job_id); + auto create_actor_request = GenCreateActorRequest(job_id); auto actor = std::make_shared(create_actor_request.task_spec(), "", counter); @@ -499,13 +544,13 @@ TEST_F(GcsActorSchedulerTest, TestWorkerFailedWhenCreating) { } TEST_F(GcsActorSchedulerTest, TestSpillback) { - auto node1 = Mocker::GenNodeInfo(); + auto node1 = GenNodeInfo(); auto node_id_1 = NodeID::FromBinary(node1->node_id()); gcs_node_manager_->AddNode(node1); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); auto job_id = JobID::FromInt(1); - auto create_actor_request = Mocker::GenCreateActorRequest(job_id); + auto create_actor_request = GenCreateActorRequest(job_id); auto actor = std::make_shared(create_actor_request.task_spec(), "", counter); @@ -517,13 +562,13 @@ TEST_F(GcsActorSchedulerTest, TestSpillback) { ASSERT_EQ(0, worker_client_->GetNumCallbacks()); // Add another node. - auto node2 = Mocker::GenNodeInfo(); + auto node2 = GenNodeInfo(); auto node_id_2 = NodeID::FromBinary(node2->node_id()); gcs_node_manager_->AddNode(node2); ASSERT_EQ(2, gcs_node_manager_->GetAllAliveNodes().size()); // Grant with an invalid spillback node, and schedule again. 
- auto invalid_node_id = NodeID::FromBinary(Mocker::GenNodeInfo()->node_id()); + auto invalid_node_id = NodeID::FromBinary(GenNodeInfo()->node_id()); ASSERT_TRUE(raylet_client_->GrantWorkerLease(node2->node_manager_address(), node2->node_manager_port(), WorkerID::Nil(), @@ -566,19 +611,19 @@ TEST_F(GcsActorSchedulerTest, TestSpillback) { } TEST_F(GcsActorSchedulerTest, TestReschedule) { - auto node1 = Mocker::GenNodeInfo(); + auto node1 = GenNodeInfo(); auto node_id_1 = NodeID::FromBinary(node1->node_id()); gcs_node_manager_->AddNode(node1); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); // 1.Actor is already tied to a leased worker. auto job_id = JobID::FromInt(1); - auto create_actor_request = Mocker::GenCreateActorRequest(job_id); + auto create_actor_request = GenCreateActorRequest(job_id); auto actor = std::make_shared(create_actor_request.task_spec(), "", counter); rpc::Address address; WorkerID worker_id = WorkerID::FromRandom(); - address.set_raylet_id(node_id_1.Binary()); + address.set_node_id(node_id_1.Binary()); address.set_worker_id(worker_id.Binary()); actor->UpdateAddress(address); @@ -622,7 +667,7 @@ TEST_F(GcsActorSchedulerTest, TestReleaseUnusedActorWorkers) { // if there is still a pending `ReleaseUnusedActorWorkers` request. // Add a node to the cluster. - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); auto node_id = NodeID::FromBinary(node->node_id()); gcs_node_manager_->AddNode(node); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); @@ -640,7 +685,7 @@ TEST_F(GcsActorSchedulerTest, TestReleaseUnusedActorWorkers) { // `GcsActorScheduler` won't send `RequestWorkerLease` request to node immediately. But // instead, it will invoke the `RetryLeasingWorkerFromNode` to retry later. 
auto job_id = JobID::FromInt(1); - auto request = Mocker::GenCreateActorRequest(job_id); + auto request = GenCreateActorRequest(job_id); auto actor = std::make_shared(request.task_spec(), "", counter); gcs_actor_scheduler_->ScheduleByRaylet(actor); ASSERT_EQ(2, gcs_actor_scheduler_->num_retry_leasing_count_); @@ -649,7 +694,7 @@ TEST_F(GcsActorSchedulerTest, TestReleaseUnusedActorWorkers) { // When `GcsActorScheduler` receives the `ReleaseUnusedActorWorkers` reply, it will send // out the `RequestWorkerLease` request. ASSERT_TRUE(raylet_client_->ReplyReleaseUnusedActorWorkers()); - gcs_actor_scheduler_->TryLeaseWorkerFromNodeAgain(actor, node); + gcs_actor_scheduler_->DoRetryLeasingWorkerFromNode(actor, node); ASSERT_EQ(raylet_client_->num_workers_requested, 1); } @@ -677,7 +722,7 @@ TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestScheduleFailedWithZeroNodeByG // are no available nodes. ASSERT_EQ(raylet_client_->num_workers_requested, 0); ASSERT_EQ(0, success_actors_.size()); - ASSERT_EQ(1, cluster_task_manager_->GetInfeasibleQueueSize()); + ASSERT_EQ(1, cluster_lease_manager_->GetInfeasibleQueueSize()); ASSERT_TRUE(actor->GetNodeID().IsNil()); } @@ -699,7 +744,7 @@ TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestNotEnoughClusterResources) { // are not enough cluster resources. 
ASSERT_EQ(raylet_client_->num_workers_requested, 0); ASSERT_EQ(0, success_actors_.size()); - ASSERT_EQ(1, cluster_task_manager_->GetInfeasibleQueueSize()); + ASSERT_EQ(1, cluster_lease_manager_->GetInfeasibleQueueSize()); ASSERT_TRUE(actor->GetNodeID().IsNil()); } @@ -712,7 +757,7 @@ TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestScheduleAndDestroyOneActor) { scheduling::NodeID scheduling_node_id(node->node_id()); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); const auto &cluster_resource_manager = - cluster_task_manager_->GetClusterResourceScheduler().GetClusterResourceManager(); + cluster_lease_manager_->GetClusterResourceScheduler().GetClusterResourceManager(); auto resource_view_before_scheduling = cluster_resource_manager.GetResourceView(); ASSERT_TRUE(resource_view_before_scheduling.contains(scheduling_node_id)); @@ -740,8 +785,8 @@ TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestScheduleAndDestroyOneActor) { // Reply the actor creation request, then the actor should be scheduled successfully. ASSERT_TRUE(worker_client_->ReplyPushTask()); ASSERT_EQ(0, worker_client_->GetNumCallbacks()); - ASSERT_EQ(0, cluster_task_manager_->GetInfeasibleQueueSize()); - ASSERT_EQ(0, cluster_task_manager_->GetPendingQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetInfeasibleQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetPendingQueueSize()); ASSERT_EQ(1, success_actors_.size()); ASSERT_EQ(actor, success_actors_.front()); ASSERT_EQ(actor->GetNodeID(), node_id); @@ -879,8 +924,8 @@ TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestScheduleRetryWhenLeasingByGcs // Reply the actor creation request, then the actor should be scheduled successfully. 
ASSERT_TRUE(worker_client_->ReplyPushTask()); ASSERT_EQ(0, worker_client_->GetNumCallbacks()); - ASSERT_EQ(0, cluster_task_manager_->GetInfeasibleQueueSize()); - ASSERT_EQ(0, cluster_task_manager_->GetPendingQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetInfeasibleQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetPendingQueueSize()); ASSERT_EQ(1, success_actors_.size()); ASSERT_EQ(actor, success_actors_.front()); ASSERT_EQ(actor->GetNodeID(), node_id); @@ -926,8 +971,8 @@ TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestScheduleRetryWhenCreatingByGc // Reply the actor creation request, then the actor should be scheduled successfully. ASSERT_TRUE(worker_client_->ReplyPushTask()); ASSERT_EQ(0, worker_client_->GetNumCallbacks()); - ASSERT_EQ(0, cluster_task_manager_->GetInfeasibleQueueSize()); - ASSERT_EQ(0, cluster_task_manager_->GetPendingQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetInfeasibleQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetPendingQueueSize()); ASSERT_EQ(1, success_actors_.size()); ASSERT_EQ(actor, success_actors_.front()); ASSERT_EQ(actor->GetNodeID(), node_id); @@ -975,8 +1020,8 @@ TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestNodeFailedWhenLeasingByGcs) { ASSERT_EQ(0, gcs_actor_scheduler_->num_retry_leasing_count_); ASSERT_EQ(0, success_actors_.size()); - ASSERT_EQ(0, cluster_task_manager_->GetInfeasibleQueueSize()); - ASSERT_EQ(0, cluster_task_manager_->GetPendingQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetInfeasibleQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetPendingQueueSize()); } TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestLeasingCancelledWhenLeasingByGcs) { @@ -999,8 +1044,8 @@ TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestLeasingCancelledWhenLeasingBy ASSERT_EQ(1, raylet_client_->callbacks.size()); // Cancel the lease request. 
- const auto &task_id = actor->GetCreationTaskSpecification().TaskId(); - gcs_actor_scheduler_->CancelOnLeasing(node_id, actor->GetActorID(), task_id); + gcs_actor_scheduler_->CancelOnLeasing( + node_id, actor->GetActorID(), actor->GetLeaseSpecification().LeaseId()); ASSERT_EQ(1, raylet_client_->num_workers_requested); ASSERT_EQ(1, raylet_client_->callbacks.size()); @@ -1015,8 +1060,8 @@ TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestLeasingCancelledWhenLeasingBy ASSERT_EQ(0, gcs_actor_scheduler_->num_retry_leasing_count_); ASSERT_EQ(0, success_actors_.size()); - ASSERT_EQ(0, cluster_task_manager_->GetInfeasibleQueueSize()); - ASSERT_EQ(0, cluster_task_manager_->GetPendingQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetInfeasibleQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetPendingQueueSize()); } TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestNodeFailedWhenCreatingByGcs) { @@ -1064,8 +1109,8 @@ TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestNodeFailedWhenCreatingByGcs) ASSERT_EQ(0, gcs_actor_scheduler_->num_retry_creating_count_); ASSERT_EQ(0, success_actors_.size()); - ASSERT_EQ(0, cluster_task_manager_->GetInfeasibleQueueSize()); - ASSERT_EQ(0, cluster_task_manager_->GetPendingQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetInfeasibleQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetPendingQueueSize()); } TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestWorkerFailedWhenCreatingByGcs) { @@ -1109,8 +1154,8 @@ TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestWorkerFailedWhenCreatingByGcs ASSERT_EQ(0, gcs_actor_scheduler_->num_retry_creating_count_); ASSERT_EQ(0, success_actors_.size()); - ASSERT_EQ(0, cluster_task_manager_->GetInfeasibleQueueSize()); - ASSERT_EQ(0, cluster_task_manager_->GetPendingQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetInfeasibleQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetPendingQueueSize()); } TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestRescheduleByGcs) { @@ -1129,7 
+1174,7 @@ TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestRescheduleByGcs) { // 1.Actor is already tied to a leased worker. rpc::Address address; WorkerID worker_id = WorkerID::FromRandom(); - address.set_raylet_id(node_id_1.Binary()); + address.set_node_id(node_id_1.Binary()); address.set_worker_id(worker_id.Binary()); actor->UpdateAddress(address); @@ -1164,8 +1209,8 @@ TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestRescheduleByGcs) { ASSERT_TRUE(worker_client_->ReplyPushTask()); ASSERT_EQ(0, worker_client_->GetNumCallbacks()); - ASSERT_EQ(0, cluster_task_manager_->GetInfeasibleQueueSize()); - ASSERT_EQ(0, cluster_task_manager_->GetPendingQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetInfeasibleQueueSize()); + ASSERT_EQ(0, cluster_lease_manager_->GetPendingQueueSize()); ASSERT_EQ(2, success_actors_.size()); } @@ -1204,7 +1249,7 @@ TEST_F(GcsActorSchedulerTestWithGcsScheduling, TestReleaseUnusedActorWorkersByGc // When `GcsActorScheduler` receives the `ReleaseUnusedActorWorkers` reply, it will send // out the `RequestWorkerLease` request. ASSERT_TRUE(raylet_client_->ReplyReleaseUnusedActorWorkers()); - gcs_actor_scheduler_->TryLeaseWorkerFromNodeAgain(actor, node); + gcs_actor_scheduler_->DoRetryLeasingWorkerFromNode(actor, node); ASSERT_EQ(raylet_client_->num_workers_requested, 1); } diff --git a/src/ray/gcs/gcs_server/test/gcs_autoscaler_state_manager_test.cc b/src/ray/gcs/tests/gcs_autoscaler_state_manager_test.cc similarity index 83% rename from src/ray/gcs/gcs_server/test/gcs_autoscaler_state_manager_test.cc rename to src/ray/gcs/tests/gcs_autoscaler_state_manager_test.cc index a706feba521b..ca07d5a72ab5 100644 --- a/src/ray/gcs/gcs_server/test/gcs_autoscaler_state_manager_test.cc +++ b/src/ray/gcs/tests/gcs_autoscaler_state_manager_test.cc @@ -12,31 +12,32 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// clang-format off -#include -#include -#include +#include "ray/gcs/gcs_autoscaler_state_manager.h" + +#include +#include + #include +#include #include +#include #include -#include +#include +#include -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "ray/common/asio/instrumented_io_context.h" -#include "ray/gcs/gcs_server/test/gcs_server_test_util.h" -#include "ray/gcs/test/gcs_test_util.h" -#include "ray/gcs/gcs_server/store_client_kv.h" -#include "ray/raylet/scheduling/cluster_resource_manager.h" -#include "mock/ray/gcs/gcs_server/gcs_placement_group_mgr.h" -#include "mock/ray/gcs/gcs_server/gcs_node_manager.h" -#include "mock/ray/gcs/gcs_server/gcs_actor_manager.h" +#include "fakes/ray/rpc/raylet/raylet_client.h" +#include "mock/ray/gcs/gcs_actor_manager.h" +#include "mock/ray/gcs/gcs_node_manager.h" +#include "mock/ray/gcs/gcs_placement_group_manager.h" #include "mock/ray/gcs/store_client/store_client.h" -#include "mock/ray/pubsub/subscriber.h" #include "mock/ray/rpc/worker/core_worker_client.h" - -#include "ray/gcs/gcs_server/gcs_autoscaler_state_manager.h" -// clang-format on +#include "ray/common/asio/instrumented_io_context.h" +#include "ray/common/protobuf_utils.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_init_data.h" +#include "ray/gcs/gcs_resource_manager.h" +#include "ray/gcs/store_client_kv.h" +#include "ray/raylet/scheduling/cluster_resource_manager.h" namespace ray { @@ -55,7 +56,7 @@ class GcsAutoscalerStateManagerTest : public ::testing::Test { protected: static constexpr char kRayletConfig[] = R"({"raylet_config":"this is a config"})"; instrumented_io_context io_service_; - std::shared_ptr raylet_client_; + std::shared_ptr raylet_client_; std::shared_ptr client_pool_; std::unique_ptr cluster_resource_manager_; std::shared_ptr gcs_resource_manager_; @@ -69,7 +70,7 @@ class GcsAutoscalerStateManagerTest : public ::testing::Test { std::unique_ptr worker_client_pool_; void SetUp() override { - raylet_client_ = 
std::make_shared(); + raylet_client_ = std::make_shared(); client_pool_ = std::make_unique( [this](const rpc::Address &) { return raylet_client_; }); cluster_resource_manager_ = std::make_unique(io_service_); @@ -201,13 +202,13 @@ class GcsAutoscalerStateManagerTest : public ::testing::Test { bool is_draining = false, int64_t draining_deadline_timestamp_ms = -1) { rpc::ResourcesData resources_data; - Mocker::FillResourcesData(resources_data, - node_id, - available_resources, - total_resources, - idle_ms, - is_draining, - draining_deadline_timestamp_ms); + FillResourcesData(resources_data, + node_id, + available_resources, + total_resources, + idle_ms, + is_draining, + draining_deadline_timestamp_ms); gcs_autoscaler_state_manager_->UpdateResourceLoadAndUsage(resources_data); } @@ -225,7 +226,7 @@ class GcsAutoscalerStateManagerTest : public ::testing::Test { void UpdateResourceLoads(const std::string &node_id, std::vector demands) { rpc::ResourcesData data; - Mocker::FillResourcesData(data, node_id, demands); + FillResourcesData(data, node_id, demands); gcs_autoscaler_state_manager_->UpdateResourceLoadAndUsage(data); } @@ -384,7 +385,7 @@ TEST_F(GcsAutoscalerStateManagerTest, TestGenPlacementConstraintForPlacementGrou } TEST_F(GcsAutoscalerStateManagerTest, TestNodeAddUpdateRemove) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); // Adding a node. { @@ -426,7 +427,7 @@ TEST_F(GcsAutoscalerStateManagerTest, TestNodeAddUpdateRemove) { } TEST_F(GcsAutoscalerStateManagerTest, TestGetClusterStatusBasic) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); // Test basic cluster resource. { @@ -457,7 +458,7 @@ TEST_F(GcsAutoscalerStateManagerTest, TestGetClusterStatusBasic) { TEST_F(GcsAutoscalerStateManagerTest, TestNodeDynamicLabelsWithPG) { /// Check if PGs are created on a node, the node status should include /// the PG labels. - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); // Adding a node. 
node->mutable_resources_total()->insert({"CPU", 2}); @@ -485,7 +486,7 @@ TEST_F(GcsAutoscalerStateManagerTest, TestNodeDynamicLabelsWithPG) { } TEST_F(GcsAutoscalerStateManagerTest, TestBasicResourceRequests) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); node->mutable_resources_total()->insert({"CPU", 2}); node->mutable_resources_total()->insert({"GPU", 1}); node->set_instance_id("instance_1"); @@ -501,16 +502,16 @@ TEST_F(GcsAutoscalerStateManagerTest, TestBasicResourceRequests) { // Update resource usages. { UpdateResourceLoads(node->node_id(), - {Mocker::GenResourceDemand({{"CPU", 1}}, - /* nun_ready_queued */ 1, - /* nun_infeasible */ 1, - /* num_backlog */ 0, - /* label_selectors */ {}), - Mocker::GenResourceDemand({{"CPU", 4}, {"GPU", 2}}, - /* num_ready_queued */ 0, - /* num_infeasible */ 1, - /* num_backlog */ 1, - /* label_selectors */ {})}); + {GenResourceDemand({{"CPU", 1}}, + /* nun_ready_queued */ 1, + /* nun_infeasible */ 1, + /* num_backlog */ 0, + /* label_selectors */ {}), + GenResourceDemand({{"CPU", 4}, {"GPU", 2}}, + /* num_ready_queued */ 0, + /* num_infeasible */ 1, + /* num_backlog */ 1, + /* label_selectors */ {})}); const auto &state = GetClusterResourceStateSync(); // Expect each pending resources shape to be num_infeasible + num_backlog. @@ -526,7 +527,7 @@ TEST_F(GcsAutoscalerStateManagerTest, TestBasicResourceRequests) { } TEST_F(GcsAutoscalerStateManagerTest, TestGangResourceRequestsBasic) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); node->mutable_resources_total()->insert({"CPU", 1}); node->set_instance_id("instance_1"); // Adding a node. 
@@ -543,14 +544,13 @@ TEST_F(GcsAutoscalerStateManagerTest, TestGangResourceRequestsBasic) { { auto pg = PlacementGroupID::Of(job_id); EXPECT_CALL(*gcs_placement_group_manager_, GetPlacementGroupLoad) - .WillOnce( - Return(Mocker::GenPlacementGroupLoad({Mocker::GenPlacementGroupTableData( - pg, - job_id, - {{{"CPU", 1}}, {{"GPU", 1}}}, - {"", ""}, - rpc::PlacementStrategy::STRICT_SPREAD, - rpc::PlacementGroupTableData::PENDING)}))); + .WillOnce(Return(GenPlacementGroupLoad( + {GenPlacementGroupTableData(pg, + job_id, + {{{"CPU", 1}}, {{"GPU", 1}}}, + {"", ""}, + rpc::PlacementStrategy::STRICT_SPREAD, + rpc::PlacementGroupTableData::PENDING)}))); auto state = GetClusterResourceStateSync(); CheckGangResourceRequests(state, @@ -564,14 +564,13 @@ TEST_F(GcsAutoscalerStateManagerTest, TestGangResourceRequestsBasic) { { auto pg = PlacementGroupID::Of(job_id); EXPECT_CALL(*gcs_placement_group_manager_, GetPlacementGroupLoad) - .WillOnce( - Return(Mocker::GenPlacementGroupLoad({Mocker::GenPlacementGroupTableData( - pg, - job_id, - {{{"CPU", 1}}, {{"GPU", 1}}}, - {"", ""}, - rpc::PlacementStrategy::STRICT_PACK, - rpc::PlacementGroupTableData::PENDING)}))); + .WillOnce(Return(GenPlacementGroupLoad( + {GenPlacementGroupTableData(pg, + job_id, + {{{"CPU", 1}}, {{"GPU", 1}}}, + {"", ""}, + rpc::PlacementStrategy::STRICT_PACK, + rpc::PlacementGroupTableData::PENDING)}))); auto state = GetClusterResourceStateSync(); CheckGangResourceRequests(state, @@ -583,7 +582,7 @@ TEST_F(GcsAutoscalerStateManagerTest, TestGangResourceRequestsBasic) { } TEST_F(GcsAutoscalerStateManagerTest, TestGangResourceRequestsNonStrict) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); node->set_instance_id("instance_1"); node->mutable_resources_total()->insert({"CPU", 1}); // Adding a node. 
@@ -597,20 +596,19 @@ TEST_F(GcsAutoscalerStateManagerTest, TestGangResourceRequestsNonStrict) { auto pg1 = PlacementGroupID::Of(job_id1); auto pg2 = PlacementGroupID::Of(job_id2); EXPECT_CALL(*gcs_placement_group_manager_, GetPlacementGroupLoad) - .WillOnce(Return(Mocker::GenPlacementGroupLoad( - {Mocker::GenPlacementGroupTableData(pg1, - job_id1, - {{{"CPU", 1}, {"GPU", 2}}}, - {""}, - rpc::PlacementStrategy::PACK, - rpc::PlacementGroupTableData::PENDING), - Mocker::GenPlacementGroupTableData( - pg2, - job_id2, - {{{"TPU", 1}}}, - {""}, - rpc::PlacementStrategy::SPREAD, - rpc::PlacementGroupTableData::PENDING)}))); + .WillOnce(Return(GenPlacementGroupLoad( + {GenPlacementGroupTableData(pg1, + job_id1, + {{{"CPU", 1}, {"GPU", 2}}}, + {""}, + rpc::PlacementStrategy::PACK, + rpc::PlacementGroupTableData::PENDING), + GenPlacementGroupTableData(pg2, + job_id2, + {{{"TPU", 1}}}, + {""}, + rpc::PlacementStrategy::SPREAD, + rpc::PlacementGroupTableData::PENDING)}))); const auto &state = GetClusterResourceStateSync(); CheckGangResourceRequests(state, @@ -621,7 +619,7 @@ TEST_F(GcsAutoscalerStateManagerTest, TestGangResourceRequestsNonStrict) { } TEST_F(GcsAutoscalerStateManagerTest, TestGangResourceRequestsPartialRescheduling) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); node->set_instance_id("instance_1"); node->mutable_resources_total()->insert({"CPU", 1}); // Adding a node. 
@@ -632,14 +630,13 @@ TEST_F(GcsAutoscalerStateManagerTest, TestGangResourceRequestsPartialReschedulin auto pg1 = PlacementGroupID::Of(job_id1); EXPECT_CALL(*gcs_placement_group_manager_, GetPlacementGroupLoad) - .WillOnce( - Return(Mocker::GenPlacementGroupLoad({Mocker::GenPlacementGroupTableData( - pg1, - job_id1, - {{{"CPU_failed_1", 1}}, {{"CPU_success_2", 2}}}, - {"", node->node_id()}, - rpc::PlacementStrategy::STRICT_SPREAD, - rpc::PlacementGroupTableData::RESCHEDULING)}))); + .WillOnce(Return(GenPlacementGroupLoad( + {GenPlacementGroupTableData(pg1, + job_id1, + {{{"CPU_failed_1", 1}}, {{"CPU_success_2", 2}}}, + {"", node->node_id()}, + rpc::PlacementStrategy::STRICT_SPREAD, + rpc::PlacementGroupTableData::RESCHEDULING)}))); const auto &state = GetClusterResourceStateSync(); @@ -662,7 +659,7 @@ TEST_F(GcsAutoscalerStateManagerTest, TestClusterResourcesConstraint) { // Generate one constraint. { RequestClusterResourceConstraint( - Mocker::GenClusterResourcesConstraint({{{"CPU", 2}, {"GPU", 1}}}, {1})); + GenClusterResourcesConstraint({{{"CPU", 2}, {"GPU", 1}}}, {1})); const auto &state = GetClusterResourceStateSync(); ASSERT_EQ(state.cluster_resource_constraints_size(), 1); ASSERT_EQ(state.cluster_resource_constraints(0).resource_requests_size(), 1); @@ -673,8 +670,8 @@ TEST_F(GcsAutoscalerStateManagerTest, TestClusterResourcesConstraint) { // Override it { - RequestClusterResourceConstraint(Mocker::GenClusterResourcesConstraint( - {{{"CPU", 4}, {"GPU", 5}, {"TPU", 1}}}, {1})); + RequestClusterResourceConstraint( + GenClusterResourcesConstraint({{{"CPU", 4}, {"GPU", 5}, {"TPU", 1}}}, {1})); const auto &state = GetClusterResourceStateSync(); ASSERT_EQ(state.cluster_resource_constraints_size(), 1); ASSERT_EQ(state.cluster_resource_constraints(0).resource_requests_size(), 1); @@ -726,7 +723,7 @@ TEST_F(GcsAutoscalerStateManagerTest, TestReportAutoscalingState) { } TEST_F(GcsAutoscalerStateManagerTest, TestDrainNonAliveNode) { - auto node = Mocker::GenNodeInfo(); 
+ auto node = GenNodeInfo(); // Adding a node. node->mutable_resources_total()->insert({"CPU", 2}); @@ -751,7 +748,7 @@ TEST_F(GcsAutoscalerStateManagerTest, TestDrainNonAliveNode) { } TEST_F(GcsAutoscalerStateManagerTest, TestDrainingStatus) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); // Adding a node. node->mutable_resources_total()->insert({"CPU", 2}); @@ -786,7 +783,7 @@ TEST_F(GcsAutoscalerStateManagerTest, TestDrainingStatus) { } TEST_F(GcsAutoscalerStateManagerTest, TestDrainNodeRaceCondition) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); // Adding a node. node->mutable_resources_total()->insert({"CPU", 2}); @@ -818,7 +815,7 @@ TEST_F(GcsAutoscalerStateManagerTest, TestDrainNodeRaceCondition) { } TEST_F(GcsAutoscalerStateManagerTest, TestIdleTime) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); // Adding a node. node->mutable_resources_total()->insert({"CPU", 2}); @@ -879,8 +876,8 @@ TEST_F(GcsAutoscalerStateManagerTest, TestGcsKvManagerInternalConfig) { TEST_F(GcsAutoscalerStateManagerTest, TestGetPerNodeInfeasibleResourceRequests_NoInfeasibleRequests) { // Prepare - auto node_1 = Mocker::GenNodeInfo(); - auto node_2 = Mocker::GenNodeInfo(); + auto node_1 = GenNodeInfo(); + auto node_2 = GenNodeInfo(); // Add nodes { @@ -895,27 +892,27 @@ TEST_F(GcsAutoscalerStateManagerTest, // Update resource usages { UpdateResourceLoads(node_1->node_id(), - {Mocker::GenResourceDemand({{"GPU", 1}}, - /* nun_ready_queued */ 1, - /* nun_infeasible */ 1, - /* num_backlog */ 0, - /* label_selectors */ {}), - Mocker::GenResourceDemand({{"CPU", 1}}, - /* nun_ready_queued */ 1, - /* nun_infeasible */ 0, - /* num_backlog */ 1, - /* label_selectors */ {}), - Mocker::GenResourceDemand({{"CPU", 3}}, - /* num_ready_queued */ 0, - /* num_infeasible */ 1, - /* num_backlog */ 1, - /* label_selectors */ {})}); + {GenResourceDemand({{"GPU", 1}}, + /* nun_ready_queued */ 1, + /* nun_infeasible */ 1, + /* num_backlog */ 0, + 
/* label_selectors */ {}), + GenResourceDemand({{"CPU", 1}}, + /* nun_ready_queued */ 1, + /* nun_infeasible */ 0, + /* num_backlog */ 1, + /* label_selectors */ {}), + GenResourceDemand({{"CPU", 3}}, + /* num_ready_queued */ 0, + /* num_infeasible */ 1, + /* num_backlog */ 1, + /* label_selectors */ {})}); UpdateResourceLoads(node_2->node_id(), - {Mocker::GenResourceDemand({{"CPU", 2}}, - /* nun_ready_queued */ 1, - /* nun_infeasible */ 0, - /* num_backlog */ 1, - /* label_selectors */ {})}); + {GenResourceDemand({{"CPU", 2}}, + /* nun_ready_queued */ 1, + /* nun_infeasible */ 0, + /* num_backlog */ 1, + /* label_selectors */ {})}); } // Update autoscaling state @@ -942,8 +939,8 @@ TEST_F(GcsAutoscalerStateManagerTest, TEST_F(GcsAutoscalerStateManagerTest, TestGetPerNodeInfeasibleResourceRequests_WithInfeasibleRequests) { // Prepare - auto node_1 = Mocker::GenNodeInfo(); - auto node_2 = Mocker::GenNodeInfo(); + auto node_1 = GenNodeInfo(); + auto node_2 = GenNodeInfo(); // Add nodes { @@ -958,27 +955,27 @@ TEST_F(GcsAutoscalerStateManagerTest, // Update resource usages { UpdateResourceLoads(node_1->node_id(), - {Mocker::GenResourceDemand({{"GPU", 1}}, - /* nun_ready_queued */ 1, - /* nun_infeasible */ 1, - /* num_backlog */ 0), + {GenResourceDemand({{"GPU", 1}}, + /* nun_ready_queued */ 1, + /* nun_infeasible */ 1, + /* num_backlog */ 0), /* label_selectors */ {}, - Mocker::GenResourceDemand({{"CPU", 1}}, - /* nun_ready_queued */ 1, - /* nun_infeasible */ 0, - /* num_backlog */ 1), + GenResourceDemand({{"CPU", 1}}, + /* nun_ready_queued */ 1, + /* nun_infeasible */ 0, + /* num_backlog */ 1), /* label_selectors */ {}, - Mocker::GenResourceDemand({{"CPU", 3}}, - /* num_ready_queued */ 0, - /* num_infeasible */ 1, - /* num_backlog */ 1, - /* label_selectors */ {})}); + GenResourceDemand({{"CPU", 3}}, + /* num_ready_queued */ 0, + /* num_infeasible */ 1, + /* num_backlog */ 1, + /* label_selectors */ {})}); UpdateResourceLoads(node_2->node_id(), - 
{Mocker::GenResourceDemand({{"CPU", 2}}, - /* nun_ready_queued */ 1, - /* nun_infeasible */ 0, - /* num_backlog */ 1, - /* label_selectors */ {})}); + {GenResourceDemand({{"CPU", 2}}, + /* nun_ready_queued */ 1, + /* nun_infeasible */ 0, + /* num_backlog */ 1, + /* label_selectors */ {})}); } // Update autoscaling state @@ -1021,7 +1018,7 @@ TEST_F(GcsAutoscalerStateManagerTest, } TEST_F(GcsAutoscalerStateManagerTest, TestNodeLabelsAdded) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); node->mutable_resources_total()->insert({"CPU", 2}); node->set_instance_id("instance_1"); (*node->mutable_labels())["accelerator-type"] = "TPU"; @@ -1036,7 +1033,7 @@ TEST_F(GcsAutoscalerStateManagerTest, TestNodeLabelsAdded) { } TEST_F(GcsAutoscalerStateManagerTest, TestGetPendingResourceRequestsWithLabelSelectors) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); node->mutable_resources_total()->insert({"CPU", 2}); node->set_instance_id("instance_1"); AddNode(node); @@ -1061,11 +1058,11 @@ TEST_F(GcsAutoscalerStateManagerTest, TestGetPendingResourceRequestsWithLabelSel // Simulate an infeasible request with a label selector UpdateResourceLoads(node->node_id(), - {Mocker::GenResourceDemand({{"CPU", 2}}, - /*ready=*/0, - /*infeasible=*/1, - /*backlog=*/0, - {selector})}); + {GenResourceDemand({{"CPU", 2}}, + /*ready=*/0, + /*infeasible=*/1, + /*backlog=*/0, + {selector})}); } // Validate the cluster state includes the generated pending request diff --git a/src/ray/gcs/gcs_server/test/gcs_function_manager_test.cc b/src/ray/gcs/tests/gcs_function_manager_test.cc similarity index 96% rename from src/ray/gcs/gcs_server/test/gcs_function_manager_test.cc rename to src/ray/gcs/tests/gcs_function_manager_test.cc index 2e82f08330d6..b24eb51a8a21 100644 --- a/src/ray/gcs/gcs_server/test/gcs_function_manager_test.cc +++ b/src/ray/gcs/tests/gcs_function_manager_test.cc @@ -12,14 +12,13 @@ // See the License for the specific language governing permissions and 
// limitations under the License. -#include "ray/gcs/gcs_server/gcs_function_manager.h" +#include "ray/gcs/gcs_function_manager.h" + +#include #include -// clang-format off -#include "gtest/gtest.h" -#include "mock/ray/gcs/gcs_server/gcs_kv_manager.h" -// clang-format on +#include "mock/ray/gcs/gcs_kv_manager.h" namespace ray { diff --git a/src/ray/gcs/gcs_server/test/gcs_health_check_manager_test.cc b/src/ray/gcs/tests/gcs_health_check_manager_test.cc similarity index 99% rename from src/ray/gcs/gcs_server/test/gcs_health_check_manager_test.cc rename to src/ray/gcs/tests/gcs_health_check_manager_test.cc index 8ca66a12522f..8c6d5e485e8d 100644 --- a/src/ray/gcs/gcs_server/test/gcs_health_check_manager_test.cc +++ b/src/ray/gcs/tests/gcs_health_check_manager_test.cc @@ -34,7 +34,7 @@ using namespace boost::asio::ip; // NOLINT #include #include "gtest/gtest.h" -#include "ray/gcs/gcs_server/gcs_health_check_manager.h" +#include "ray/gcs/gcs_health_check_manager.h" #include "ray/util/network_util.h" int GetFreePort() { diff --git a/src/ray/gcs/gcs_server/test/gcs_job_manager_test.cc b/src/ray/gcs/tests/gcs_job_manager_test.cc similarity index 92% rename from src/ray/gcs/gcs_server/test/gcs_job_manager_test.cc rename to src/ray/gcs/tests/gcs_job_manager_test.cc index 683016b5d801..e2aae44e87d1 100644 --- a/src/ray/gcs/gcs_server/test/gcs_job_manager_test.cc +++ b/src/ray/gcs/tests/gcs_job_manager_test.cc @@ -12,23 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/gcs_server/gcs_job_manager.h" +#include "ray/gcs/gcs_job_manager.h" #include #include -// clang-format off #include "gtest/gtest.h" -#include "ray/gcs/gcs_server/test/gcs_server_test_util.h" -#include "ray/gcs/store_client/in_memory_store_client.h" -#include "ray/gcs/test/gcs_test_util.h" -#include "ray/gcs/gcs_server/gcs_kv_manager.h" -#include "mock/ray/gcs/gcs_server/gcs_kv_manager.h" +#include "mock/ray/gcs/gcs_kv_manager.h" #include "mock/ray/pubsub/publisher.h" -#include "mock/ray/pubsub/subscriber.h" #include "mock/ray/rpc/worker/core_worker_client.h" - -// clang-format on +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_kv_manager.h" +#include "ray/gcs/store_client/in_memory_store_client.h" +#include "ray/observability/fake_ray_event_recorder.h" namespace ray { @@ -44,7 +40,7 @@ class GcsJobManagerTest : public ::testing::Test { }); promise.get_future().get(); - gcs_publisher_ = std::make_unique( + gcs_publisher_ = std::make_unique( std::make_unique()); store_client_ = std::make_shared(); gcs_table_storage_ = std::make_shared(store_client_); @@ -60,13 +56,16 @@ class GcsJobManagerTest : public ::testing::Test { return std::make_shared( address.port()); }); + fake_ray_event_recorder_ = std::make_unique(); gcs_job_manager_ = std::make_unique(*gcs_table_storage_, *gcs_publisher_, runtime_env_manager_, *function_manager_, *fake_kv_, io_service_, - *worker_client_pool_); + *worker_client_pool_, + *fake_ray_event_recorder_, + "test_session_name"); } ~GcsJobManagerTest() { @@ -79,7 +78,7 @@ class GcsJobManagerTest : public ::testing::Test { std::unique_ptr thread_io_service_; std::shared_ptr store_client_; std::shared_ptr gcs_table_storage_; - std::shared_ptr gcs_publisher_; + std::shared_ptr gcs_publisher_; std::unique_ptr function_manager_; std::unique_ptr kv_; std::unique_ptr fake_kv_; @@ -87,6 +86,7 @@ class GcsJobManagerTest : public ::testing::Test { RuntimeEnvManager runtime_env_manager_; const std::chrono::milliseconds 
timeout_ms_{5000}; std::unique_ptr gcs_job_manager_; + std::unique_ptr fake_ray_event_recorder_; }; TEST_F(GcsJobManagerTest, TestFakeInternalKV) { @@ -129,12 +129,12 @@ TEST_F(GcsJobManagerTest, TestIsRunningTasks) { address.set_port(num_running_tasks); // Populate other fields, the value is not important. - address.set_raylet_id(NodeID::FromRandom().Binary()); + address.set_node_id(NodeID::FromRandom().Binary()); address.set_ip_address("123.456.7.8"); address.set_worker_id(WorkerID::FromRandom().Binary()); auto add_job_request = - Mocker::GenAddJobRequest(job_id, std::to_string(i), std::to_string(i), address); + GenAddJobRequest(job_id, std::to_string(i), std::to_string(i), address); rpc::AddJobReply empty_reply; std::promise promise; gcs_job_manager_->HandleAddJob( @@ -176,8 +176,7 @@ TEST_F(GcsJobManagerTest, TestGetAllJobInfo) { // Add 100 jobs. for (int i = 0; i < 100; ++i) { auto job_id = JobID::FromInt(i); - auto add_job_request = - Mocker::GenAddJobRequest(job_id, "namespace_" + std::to_string(i)); + auto add_job_request = GenAddJobRequest(job_id, "namespace_" + std::to_string(i)); rpc::AddJobReply empty_reply; std::promise promise; gcs_job_manager_->HandleAddJob( @@ -208,8 +207,7 @@ TEST_F(GcsJobManagerTest, TestGetAllJobInfo) { // API.") auto job_api_job_id = JobID::FromInt(100); std::string submission_id = "submission_id_100"; - auto add_job_request = - Mocker::GenAddJobRequest(job_api_job_id, "namespace_100", submission_id); + auto add_job_request = GenAddJobRequest(job_api_job_id, "namespace_100", submission_id); rpc::AddJobReply empty_reply; std::promise promise; gcs_job_manager_->HandleAddJob( @@ -312,8 +310,7 @@ TEST_F(GcsJobManagerTest, TestGetAllJobInfo) { // Add another job with the *same* submission ID. This can happen if the entrypoint // script calls ray.init() multiple times. 
auto job_id2 = JobID::FromInt(2); - auto add_job_request2 = - Mocker::GenAddJobRequest(job_id2, "namespace_100", submission_id); + auto add_job_request2 = GenAddJobRequest(job_id2, "namespace_100", submission_id); std::promise promise4; gcs_job_manager_->HandleAddJob( *add_job_request2, @@ -349,8 +346,7 @@ TEST_F(GcsJobManagerTest, TestGetAllJobInfoWithFilter) { std::promise promise1; std::promise promise2; - auto add_job_request1 = - Mocker::GenAddJobRequest(job_id1, "namespace_1", "submission_1"); + auto add_job_request1 = GenAddJobRequest(job_id1, "namespace_1", "submission_1"); gcs_job_manager_->HandleAddJob( *add_job_request1, &empty_reply, @@ -359,8 +355,7 @@ TEST_F(GcsJobManagerTest, TestGetAllJobInfoWithFilter) { }); promise1.get_future().get(); - auto add_job_request2 = - Mocker::GenAddJobRequest(job_id2, "namespace_2", "submission_2"); + auto add_job_request2 = GenAddJobRequest(job_id2, "namespace_2", "submission_2"); gcs_job_manager_->HandleAddJob( *add_job_request2, &empty_reply, @@ -427,7 +422,7 @@ TEST_F(GcsJobManagerTest, TestGetAllJobInfoWithLimit) { std::promise promise1; std::promise promise2; - auto add_job_request1 = Mocker::GenAddJobRequest(job_id1, "namespace_1"); + auto add_job_request1 = GenAddJobRequest(job_id1, "namespace_1"); gcs_job_manager_->HandleAddJob( *add_job_request1, &empty_reply, @@ -436,7 +431,7 @@ TEST_F(GcsJobManagerTest, TestGetAllJobInfoWithLimit) { }); promise1.get_future().get(); - auto add_job_request2 = Mocker::GenAddJobRequest(job_id2, "namespace_2"); + auto add_job_request2 = GenAddJobRequest(job_id2, "namespace_2"); gcs_job_manager_->HandleAddJob( *add_job_request2, &empty_reply, @@ -523,7 +518,7 @@ TEST_F(GcsJobManagerTest, TestGetJobConfig) { std::promise promise1; std::promise promise2; - auto add_job_request1 = Mocker::GenAddJobRequest(job_id1, "namespace_1"); + auto add_job_request1 = GenAddJobRequest(job_id1, "namespace_1"); gcs_job_manager_->HandleAddJob( *add_job_request1, &empty_reply, @@ -532,7 +527,7 @@ 
TEST_F(GcsJobManagerTest, TestGetJobConfig) { }); promise1.get_future().get(); - auto add_job_request2 = Mocker::GenAddJobRequest(job_id2, "namespace_2"); + auto add_job_request2 = GenAddJobRequest(job_id2, "namespace_2"); gcs_job_manager_->HandleAddJob( *add_job_request2, &empty_reply, @@ -552,12 +547,12 @@ TEST_F(GcsJobManagerTest, TestPreserveDriverInfo) { auto job_id = JobID::FromInt(1); gcs::GcsInitData gcs_init_data(*gcs_table_storage_); gcs_job_manager_->Initialize(/*init_data=*/gcs_init_data); - auto add_job_request = Mocker::GenAddJobRequest(job_id, "namespace"); + auto add_job_request = GenAddJobRequest(job_id, "namespace"); rpc::Address address; address.set_ip_address("10.0.0.1"); address.set_port(8264); - address.set_raylet_id(NodeID::FromRandom().Binary()); + address.set_node_id(NodeID::FromRandom().Binary()); address.set_worker_id(WorkerID::FromRandom().Binary()); add_job_request->mutable_data()->set_driver_ip_address("10.0.0.1"); add_job_request->mutable_data()->mutable_driver_address()->CopyFrom(address); @@ -617,14 +612,16 @@ TEST_F(GcsJobManagerTest, TestMarkJobFinishedIdempotency) { *function_manager_, *fake_kv_, io_service_, - *worker_client_pool_); + *worker_client_pool_, + *fake_ray_event_recorder_, + "test_session_name"); auto job_id = JobID::FromInt(1); gcs::GcsInitData gcs_init_data(*gcs_table_storage_); gcs_job_manager.Initialize(/*init_data=*/gcs_init_data); // Add a job first - auto add_job_request = Mocker::GenAddJobRequest(job_id, "namespace"); + auto add_job_request = GenAddJobRequest(job_id, "namespace"); rpc::AddJobReply add_job_reply; std::promise add_promise; gcs_job_manager.HandleAddJob( @@ -708,7 +705,7 @@ TEST_F(GcsJobManagerTest, TestNodeFailure) { std::promise promise1; std::promise promise2; - auto add_job_request1 = Mocker::GenAddJobRequest(job_id1, "namespace_1"); + auto add_job_request1 = GenAddJobRequest(job_id1, "namespace_1"); gcs_job_manager_->HandleAddJob( *add_job_request1, &empty_reply, @@ -717,7 +714,7 @@ 
TEST_F(GcsJobManagerTest, TestNodeFailure) { }); promise1.get_future().get(); - auto add_job_request2 = Mocker::GenAddJobRequest(job_id2, "namespace_2"); + auto add_job_request2 = GenAddJobRequest(job_id2, "namespace_2"); gcs_job_manager_->HandleAddJob( *add_job_request2, &empty_reply, @@ -744,7 +741,7 @@ TEST_F(GcsJobManagerTest, TestNodeFailure) { // Remove node and then check that the job is dead. auto address = all_job_info_reply.job_info_list().Get(0).driver_address(); - auto node_id = NodeID::FromBinary(address.raylet_id()); + auto node_id = NodeID::FromBinary(address.node_id()); gcs_job_manager_->OnNodeDead(node_id); // Test get all jobs and check if killed node jobs marked as finished @@ -763,7 +760,7 @@ TEST_F(GcsJobManagerTest, TestNodeFailure) { bool job_condition = true; // job1 from the current node should dead, while job2 is still alive for (auto job_info : all_job_info_reply2.job_info_list()) { - auto job_node_id = NodeID::FromBinary(job_info.driver_address().raylet_id()); + auto job_node_id = NodeID::FromBinary(job_info.driver_address().node_id()); job_condition = job_condition && (job_info.is_dead() == (job_node_id == node_id)); } return job_condition; diff --git a/src/ray/gcs/gcs_server/test/gcs_kv_manager_test.cc b/src/ray/gcs/tests/gcs_kv_manager_test.cc similarity index 91% rename from src/ray/gcs/gcs_server/test/gcs_kv_manager_test.cc rename to src/ray/gcs/tests/gcs_kv_manager_test.cc index 228d446e4947..e33795bc0869 100644 --- a/src/ray/gcs/gcs_server/test/gcs_kv_manager_test.cc +++ b/src/ray/gcs/tests/gcs_kv_manager_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/gcs_server/gcs_kv_manager.h" +#include "ray/gcs/gcs_kv_manager.h" #include #include @@ -21,10 +21,10 @@ #include #include "gtest/gtest.h" -#include "ray/common/test_util.h" -#include "ray/gcs/gcs_server/store_client_kv.h" +#include "ray/common/test_utils.h" #include "ray/gcs/store_client/in_memory_store_client.h" #include "ray/gcs/store_client/redis_store_client.h" +#include "ray/gcs/store_client_kv.h" class GcsKVManagerTest : public ::testing::TestWithParam { public: @@ -36,13 +36,11 @@ class GcsKVManagerTest : public ::testing::TestWithParam { io_service.get_executor()); io_service.run(); }); - ray::gcs::RedisClientOptions redis_client_options( - "127.0.0.1", ray::TEST_REDIS_SERVER_PORTS.front(), "", "", false); + ray::gcs::RedisClientOptions options{"127.0.0.1", + ray::TEST_REDIS_SERVER_PORTS.front()}; if (GetParam() == "redis") { - auto client = std::make_shared(redis_client_options); - RAY_CHECK_OK(client->Connect(io_service)); kv_instance = std::make_unique( - std::make_unique(client)); + std::make_unique(io_service, options)); } else if (GetParam() == "memory") { kv_instance = std::make_unique( std::make_unique()); @@ -52,11 +50,9 @@ class GcsKVManagerTest : public ::testing::TestWithParam { void TearDown() override { io_service.stop(); thread_io_service->join(); - redis_client.reset(); kv_instance.reset(); } - std::unique_ptr redis_client; std::unique_ptr thread_io_service; instrumented_io_context io_service; std::unique_ptr kv_instance; diff --git a/src/ray/gcs/gcs_server/test/gcs_node_manager_test.cc b/src/ray/gcs/tests/gcs_node_manager_test.cc similarity index 87% rename from src/ray/gcs/gcs_server/test/gcs_node_manager_test.cc rename to src/ray/gcs/tests/gcs_node_manager_test.cc index 69bb1aee77d3..0406017f31cb 100644 --- a/src/ray/gcs/gcs_server/test/gcs_node_manager_test.cc +++ b/src/ray/gcs/tests/gcs_node_manager_test.cc @@ -12,38 +12,36 @@ // See the License for the specific language governing permissions and // limitations under 
the License. +#include "ray/gcs/gcs_node_manager.h" + +#include + #include #include #include -// clang-format off -#include "gtest/gtest.h" -#include "ray/gcs/gcs_server/test/gcs_server_test_util.h" -#include "ray/gcs/test/gcs_test_util.h" -#include "ray/rpc/node_manager/node_manager_client.h" -#include "ray/rpc/node_manager/raylet_client_pool.h" +#include "fakes/ray/rpc/raylet/raylet_client.h" #include "mock/ray/pubsub/publisher.h" -#include "ray/common/asio/asio_util.h" -#include "ray/common/ray_syncer/ray_syncer.h" -// clang-format on +#include "ray/common/test_utils.h" namespace ray { class GcsNodeManagerTest : public ::testing::Test { public: GcsNodeManagerTest() { - raylet_client_ = std::make_shared(); + auto raylet_client = std::make_shared(); client_pool_ = std::make_unique( - [this](const rpc::Address &) { return raylet_client_; }); - gcs_publisher_ = std::make_unique( + [raylet_client = std::move(raylet_client)](const rpc::Address &) { + return raylet_client; + }); + gcs_publisher_ = std::make_unique( std::make_unique()); io_context_ = std::make_unique("GcsNodeManagerTest"); } protected: std::unique_ptr gcs_table_storage_; - std::shared_ptr raylet_client_; std::unique_ptr client_pool_; - std::unique_ptr gcs_publisher_; + std::unique_ptr gcs_publisher_; std::unique_ptr io_context_; }; @@ -54,7 +52,7 @@ TEST_F(GcsNodeManagerTest, TestManagement) { client_pool_.get(), ClusterID::Nil()); // Test Add/Get/Remove functionality. 
- auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); auto node_id = NodeID::FromBinary(node->node_id()); node_manager.AddNode(node); @@ -79,7 +77,7 @@ TEST_F(GcsNodeManagerTest, TestListener) { added_nodes.emplace_back(std::move(node)); }); for (int i = 0; i < node_count; ++i) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); node_manager.AddNode(node); } ASSERT_EQ(node_count, added_nodes.size()); @@ -116,7 +114,7 @@ TEST_F(GcsNodeManagerTest, TestUpdateAliveNode) { ClusterID::Nil()); // Create a test node - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); auto node_id = NodeID::FromBinary(node->node_id()); // Add the node to the manager diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_mgr_mock_test.cc b/src/ray/gcs/tests/gcs_placement_group_manager_mock_test.cc similarity index 88% rename from src/ray/gcs/gcs_server/test/gcs_placement_group_mgr_mock_test.cc rename to src/ray/gcs/tests/gcs_placement_group_manager_mock_test.cc index c02924582618..66987d0dae74 100644 --- a/src/ray/gcs/gcs_server/test/gcs_placement_group_mgr_mock_test.cc +++ b/src/ray/gcs/tests/gcs_placement_group_manager_mock_test.cc @@ -12,21 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include +#include + #include #include -// clang-format off -#include "gtest/gtest.h" -#include "gmock/gmock.h" -#include "ray/gcs/gcs_server/gcs_placement_group_mgr.h" -#include "ray/raylet/scheduling/cluster_resource_manager.h" -#include "mock/ray/gcs/gcs_server/gcs_node_manager.h" -#include "mock/ray/gcs/gcs_server/gcs_placement_group_mgr.h" -#include "mock/ray/gcs/gcs_server/gcs_placement_group_scheduler.h" -#include "mock/ray/gcs/gcs_server/gcs_resource_manager.h" + +#include "mock/ray/gcs/gcs_node_manager.h" +#include "mock/ray/gcs/gcs_placement_group_scheduler.h" +#include "mock/ray/gcs/gcs_resource_manager.h" #include "mock/ray/gcs/store_client/store_client.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_placement_group_manager.h" +#include "ray/raylet/scheduling/cluster_resource_manager.h" #include "ray/util/counter_map.h" -#include "ray/gcs/test/gcs_test_util.h" -// clang-format on using namespace ::testing; // NOLINT using namespace ray; // NOLINT @@ -70,14 +69,13 @@ class GcsPlacementGroupManagerMockTest : public Test { TEST_F(GcsPlacementGroupManagerMockTest, PendingQueuePriorityReschedule) { // Test priority works // When return with reschedule, it should be given with the highest pri - auto req = - Mocker::GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::SPREAD, 1); + auto req = GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::SPREAD, 1); auto pg = std::make_shared(req, "", counter_); auto cb = [](Status s) {}; SchedulePgRequest request; std::unique_ptr> put_cb; EXPECT_CALL(*store_client_, AsyncPut(_, _, _, _, _)) - .WillOnce(DoAll(SaveArgToUniquePtr<4>(&put_cb), Return(Status::OK()))); + .WillOnce(DoAll(SaveArgToUniquePtr<4>(&put_cb))); EXPECT_CALL(*gcs_placement_group_scheduler_, ScheduleUnplacedBundles(_)) .WillOnce(DoAll(SaveArg<0>(&request))); auto now = absl::GetCurrentTimeNanos(); @@ -97,14 +95,13 @@ TEST_F(GcsPlacementGroupManagerMockTest, PendingQueuePriorityReschedule) { 
TEST_F(GcsPlacementGroupManagerMockTest, PendingQueuePriorityFailed) { // Test priority works // When return with a failure, exp backoff should work - auto req = - Mocker::GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::SPREAD, 1); + auto req = GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::SPREAD, 1); auto pg = std::make_shared(req, "", counter_); auto cb = [](Status s) {}; SchedulePgRequest request; std::unique_ptr> put_cb; EXPECT_CALL(*store_client_, AsyncPut(_, _, _, _, _)) - .WillOnce(DoAll(SaveArgToUniquePtr<4>(&put_cb), Return(Status::OK()))); + .WillOnce(DoAll(SaveArgToUniquePtr<4>(&put_cb))); EXPECT_CALL(*gcs_placement_group_scheduler_, ScheduleUnplacedBundles(_)) .Times(2) .WillRepeatedly(DoAll(SaveArg<0>(&request))); @@ -151,18 +148,16 @@ TEST_F(GcsPlacementGroupManagerMockTest, PendingQueuePriorityOrder) { // Test priority works // Add two pgs // Fail one and make sure it's scheduled later - auto req1 = - Mocker::GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::SPREAD, 1); + auto req1 = GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::SPREAD, 1); auto pg1 = std::make_shared(req1, "", counter_); - auto req2 = - Mocker::GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::SPREAD, 1); + auto req2 = GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::SPREAD, 1); auto pg2 = std::make_shared(req2, "", counter_); auto cb = [](Status s) {}; SchedulePgRequest request; std::unique_ptr> put_cb; EXPECT_CALL(*store_client_, AsyncPut(_, _, _, _, _)) .Times(2) - .WillRepeatedly(DoAll(SaveArgToUniquePtr<4>(&put_cb), Return(Status::OK()))); + .WillRepeatedly(DoAll(SaveArgToUniquePtr<4>(&put_cb))); EXPECT_CALL(*gcs_placement_group_scheduler_, ScheduleUnplacedBundles(_)) .Times(2) .WillRepeatedly(DoAll(SaveArg<0>(&request))); diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_mgr_test.cc b/src/ray/gcs/tests/gcs_placement_group_manager_test.cc similarity index 95% rename from 
src/ray/gcs/gcs_server/test/gcs_placement_group_mgr_test.cc rename to src/ray/gcs/tests/gcs_placement_group_manager_test.cc index 660c2c320363..bc0126e33e79 100644 --- a/src/ray/gcs/gcs_server/test/gcs_placement_group_mgr_test.cc +++ b/src/ray/gcs/tests/gcs_placement_group_manager_test.cc @@ -12,22 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_server/gcs_placement_group_mgr.h" +#include "ray/gcs/gcs_placement_group_manager.h" + +#include #include #include #include -// clang-format off -#include "gtest/gtest.h" +#include "mock/ray/gcs/gcs_node_manager.h" +#include "mock/ray/pubsub/publisher.h" #include "ray/common/asio/instrumented_io_context.h" -#include "ray/gcs/gcs_server/test/gcs_server_test_util.h" -#include "ray/gcs/test/gcs_test_util.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/store_client/in_memory_store_client.h" #include "ray/raylet/scheduling/cluster_resource_manager.h" #include "ray/util/counter_map.h" -#include "mock/ray/pubsub/publisher.h" -#include "mock/ray/gcs/gcs_server/gcs_node_manager.h" -// clang-format on namespace ray { namespace gcs { @@ -83,9 +82,10 @@ class GcsPlacementGroupManagerTest : public ::testing::Test { GcsPlacementGroupManagerTest() : mock_placement_group_scheduler_(new MockPlacementGroupScheduler()), cluster_resource_manager_(io_service_) { - gcs_publisher_ = - std::make_shared(std::make_unique()); - gcs_table_storage_ = std::make_unique(); + gcs_publisher_ = std::make_shared( + std::make_unique()); + gcs_table_storage_ = + std::make_unique(std::make_unique()); gcs_node_manager_ = std::make_shared(); gcs_resource_manager_ = std::make_shared( io_service_, cluster_resource_manager_, *gcs_node_manager_, NodeID::FromRandom()); @@ -220,11 +220,11 @@ class GcsPlacementGroupManagerTest : public ::testing::Test { ClusterResourceManager cluster_resource_manager_; std::shared_ptr gcs_node_manager_; std::shared_ptr 
gcs_resource_manager_; - std::shared_ptr gcs_publisher_; + std::shared_ptr gcs_publisher_; }; TEST_F(GcsPlacementGroupManagerTest, TestPlacementGroupBundleCache) { - auto request = Mocker::GenCreatePlacementGroupRequest(); + auto request = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request, [®istered_placement_group_count](const Status &status) { @@ -244,7 +244,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestPlacementGroupBundleCache) { } TEST_F(GcsPlacementGroupManagerTest, TestBasic) { - auto request = Mocker::GenCreatePlacementGroupRequest(); + auto request = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request, [®istered_placement_group_count](const Status &status) { @@ -263,7 +263,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestBasic) { } TEST_F(GcsPlacementGroupManagerTest, TestSchedulingFailed) { - auto request = Mocker::GenCreatePlacementGroupRequest(); + auto request = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request, [®istered_placement_group_count](const Status &status) { @@ -296,7 +296,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestSchedulingFailed) { } TEST_F(GcsPlacementGroupManagerTest, TestGetPlacementGroupIDByName) { - auto request = Mocker::GenCreatePlacementGroupRequest("test_name"); + auto request = GenCreatePlacementGroupRequest("test_name"); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request, [®istered_placement_group_count](Status status) { ++registered_placement_group_count; @@ -315,7 +315,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestGetPlacementGroupIDByName) { } TEST_F(GcsPlacementGroupManagerTest, TestRemoveNamedPlacementGroup) { - auto request = Mocker::GenCreatePlacementGroupRequest("test_name"); + auto request = GenCreatePlacementGroupRequest("test_name"); std::atomic registered_placement_group_count(0); 
RegisterPlacementGroup(request, [®istered_placement_group_count](const Status &status) { @@ -339,7 +339,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestRemoveNamedPlacementGroup) { } TEST_F(GcsPlacementGroupManagerTest, TestRemovedPlacementGroupNotReportedAsLoad) { - auto request = Mocker::GenCreatePlacementGroupRequest(); + auto request = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request, [®istered_placement_group_count](Status status) { ++registered_placement_group_count; @@ -367,7 +367,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestRemovedPlacementGroupNotReportedAsLoad) } TEST_F(GcsPlacementGroupManagerTest, TestRescheduleWhenNodeAdd) { - auto request = Mocker::GenCreatePlacementGroupRequest(); + auto request = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request, [®istered_placement_group_count](Status status) { ++registered_placement_group_count; @@ -389,7 +389,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestRescheduleWhenNodeAdd) { } TEST_F(GcsPlacementGroupManagerTest, TestRemovingPendingPlacementGroup) { - auto request = Mocker::GenCreatePlacementGroupRequest(); + auto request = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request, [®istered_placement_group_count](Status status) { ++registered_placement_group_count; @@ -428,7 +428,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestRemovingPendingPlacementGroup) { } TEST_F(GcsPlacementGroupManagerTest, TestRemovingLeasingPlacementGroup) { - auto request = Mocker::GenCreatePlacementGroupRequest(); + auto request = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request, [®istered_placement_group_count](Status status) { ++registered_placement_group_count; @@ -464,7 +464,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestRemovingLeasingPlacementGroup) { } 
TEST_F(GcsPlacementGroupManagerTest, TestRemovingCreatedPlacementGroup) { - auto request = Mocker::GenCreatePlacementGroupRequest(); + auto request = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request, [®istered_placement_group_count](Status status) { ++registered_placement_group_count; @@ -508,7 +508,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestReschedulingRetry) { /// pg scheduled -> pg created -> node dead -> /// pg rescheduled -> rescheduling failed -> retry. /// - auto request1 = Mocker::GenCreatePlacementGroupRequest(); + auto request1 = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request1, [®istered_placement_group_count](Status status) { ++registered_placement_group_count; @@ -551,7 +551,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestRescheduleWhenNodeDead) { /// Test the basic case. /// pg scheduled -> pg created -> node dead -> pg rescheduled. /// - auto request1 = Mocker::GenCreatePlacementGroupRequest(); + auto request1 = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request1, [®istered_placement_group_count](Status status) { ++registered_placement_group_count; @@ -587,7 +587,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestNodeDeadBeforePlacementGroupCreated) { /// Test the case where a node dies before the placement group is created. /// pg scheduled -> node dead -> pg created -> pg rescheduled. 
/// - auto request1 = Mocker::GenCreatePlacementGroupRequest(); + auto request1 = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request1, [®istered_placement_group_count](Status status) { ++registered_placement_group_count; @@ -630,7 +630,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestTwoNodesWithBundlesFromSamePlacementGro /// -> bundles on node1 created -> pg rescheduled (for bundles on node2) -> pg created /// - auto request1 = Mocker::GenCreatePlacementGroupRequest(); + auto request1 = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request1, [®istered_placement_group_count](Status status) { ++registered_placement_group_count; @@ -693,7 +693,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestTwoNodesWithBundlesFromSamePlacementGro /// -> pg prepared -> bundles on node1 created -> pg rescheduled (for bundles on node2) /// -> pg created /// - auto request1 = Mocker::GenCreatePlacementGroupRequest(); + auto request1 = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request1, [®istered_placement_group_count](Status status) { ++registered_placement_group_count; @@ -755,7 +755,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestTwoNodesWithBundlesFromSamePlacementGro /// pg scheduled -> pg created -> node1 dead -> pg rescheduled -> node2 dead /// -> pg still in rescheduled state -> pg prepared -> pg created /// - auto request1 = Mocker::GenCreatePlacementGroupRequest(); + auto request1 = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request1, [®istered_placement_group_count](Status status) { ++registered_placement_group_count; @@ -809,16 +809,15 @@ TEST_F(GcsPlacementGroupManagerTest, TestTwoNodesWithBundlesFromSamePlacementGro TEST_F(GcsPlacementGroupManagerTest, TestSchedulerReinitializeAfterGcsRestart) { // Create a placement group and make 
sure it has been created successfully. auto job_id = JobID::FromInt(1); - auto request = Mocker::GenCreatePlacementGroupRequest( + auto request = GenCreatePlacementGroupRequest( /* name */ "", rpc::PlacementStrategy::SPREAD, /* bundles_count */ 2, /* cpu_num */ 1.0, /* job_id */ job_id); - auto job_table_data = Mocker::GenJobTableData(job_id); - RAY_CHECK_OK(gcs_table_storage_->JobTable().Put( - job_id, *job_table_data, {[](auto) {}, io_service_})); - std::atomic registered_placement_group_count(0); + auto job_table_data = GenJobTableData(job_id); + gcs_table_storage_->JobTable().Put(job_id, *job_table_data, {[](auto) {}, io_service_}); + std::atomic registered_placement_group_count{0}; RegisterPlacementGroup(request, [®istered_placement_group_count](Status status) { ++registered_placement_group_count; }); @@ -851,7 +850,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestAutomaticCleanupWhenActorDeadAndJobDead // Test the scenario where actor dead -> job dead. const auto job_id = JobID::FromInt(1); const auto actor_id = ActorID::Of(job_id, TaskID::Nil(), 0); - auto request = Mocker::GenCreatePlacementGroupRequest( + auto request = GenCreatePlacementGroupRequest( /* name */ "", rpc::PlacementStrategy::SPREAD, /* bundles_count */ 2, @@ -887,7 +886,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestAutomaticCleanupWhenActorAndJobDead) { // Test the scenario where job dead -> actor dead. const auto job_id = JobID::FromInt(1); const auto actor_id = ActorID::Of(job_id, TaskID::Nil(), 0); - auto request = Mocker::GenCreatePlacementGroupRequest( + auto request = GenCreatePlacementGroupRequest( /* name */ "", rpc::PlacementStrategy::SPREAD, /* bundles_count */ 2, @@ -925,7 +924,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestAutomaticCleanupWhenActorAndJobDead) { TEST_F(GcsPlacementGroupManagerTest, TestAutomaticCleanupWhenOnlyJobDead) { // Test placement group is cleaned when both actor & job are dead. 
const auto job_id = JobID::FromInt(1); - auto request = Mocker::GenCreatePlacementGroupRequest( + auto request = GenCreatePlacementGroupRequest( /* name */ "", rpc::PlacementStrategy::SPREAD, /* bundles_count */ 2, @@ -959,7 +958,7 @@ TEST_F(GcsPlacementGroupManagerTest, // Test placement group is cleaned when both actor & job are dead. const auto job_id = JobID::FromInt(1); const auto different_job_id = JobID::FromInt(3); - auto request = Mocker::GenCreatePlacementGroupRequest( + auto request = GenCreatePlacementGroupRequest( /* name */ "", rpc::PlacementStrategy::SPREAD, /* bundles_count */ 2, @@ -990,7 +989,7 @@ TEST_F(GcsPlacementGroupManagerTest, } TEST_F(GcsPlacementGroupManagerTest, TestSchedulingCanceledWhenPgIsInfeasible) { - auto request = Mocker::GenCreatePlacementGroupRequest(); + auto request = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request, [®istered_placement_group_count](const Status &status) { @@ -1029,11 +1028,11 @@ TEST_F(GcsPlacementGroupManagerTest, TestSchedulingCanceledWhenPgIsInfeasible) { } TEST_F(GcsPlacementGroupManagerTest, TestRayNamespace) { - auto request1 = Mocker::GenCreatePlacementGroupRequest("test_name"); + auto request1 = GenCreatePlacementGroupRequest("test_name"); job_namespace_table_[JobID::FromInt(11)] = "another_namespace"; - auto request2 = Mocker::GenCreatePlacementGroupRequest( + auto request2 = GenCreatePlacementGroupRequest( "test_name", rpc::PlacementStrategy::SPREAD, 2, 1.0, JobID::FromInt(11)); - auto request3 = Mocker::GenCreatePlacementGroupRequest("test_name"); + auto request3 = GenCreatePlacementGroupRequest("test_name"); { // Create a placement group in the empty namespace. 
std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request1, [®istered_placement_group_count](Status status) { @@ -1092,7 +1091,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestRayNamespace) { } TEST_F(GcsPlacementGroupManagerTest, TestStats) { - auto request = Mocker::GenCreatePlacementGroupRequest(); + auto request = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); RegisterPlacementGroup(request, [®istered_placement_group_count](const Status &status) { @@ -1152,7 +1151,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestStats) { } TEST_F(GcsPlacementGroupManagerTest, TestStatsCreationTime) { - auto request = Mocker::GenCreatePlacementGroupRequest(); + auto request = GenCreatePlacementGroupRequest(); std::atomic registered_placement_group_count(0); auto request_received_ns = absl::GetCurrentTimeNanos(); RegisterPlacementGroup(request, @@ -1197,7 +1196,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestGetAllPlacementGroupInfoLimit) { auto num_pgs = 3; std::atomic registered_placement_group_count(0); for (int i = 0; i < num_pgs; i++) { - auto request = Mocker::GenCreatePlacementGroupRequest(); + auto request = GenCreatePlacementGroupRequest(); RegisterPlacementGroup(request, [®istered_placement_group_count](const Status &status) { ++registered_placement_group_count; @@ -1236,18 +1235,16 @@ TEST_F(GcsPlacementGroupManagerTest, TestGetAllPlacementGroupInfoLimit) { TEST_F(GcsPlacementGroupManagerTest, TestCheckCreatorJobIsDeadWhenGcsRestart) { auto job_id = JobID::FromInt(1); - auto request = Mocker::GenCreatePlacementGroupRequest( + auto request = GenCreatePlacementGroupRequest( /* name */ "", rpc::PlacementStrategy::SPREAD, /* bundles_count */ 2, /* cpu_num */ 1.0, /* job_id */ job_id); - auto job_table_data = Mocker::GenJobTableData(job_id); + auto job_table_data = GenJobTableData(job_id); job_table_data->set_is_dead(true); - RAY_CHECK_OK(gcs_table_storage_->JobTable().Put( - job_id, *job_table_data, {[](auto) {}, 
io_service_})); - - std::atomic registered_placement_group_count(0); + gcs_table_storage_->JobTable().Put(job_id, *job_table_data, {[](auto) {}, io_service_}); + std::atomic registered_placement_group_count{0}; RegisterPlacementGroup(request, [®istered_placement_group_count](Status status) { ++registered_placement_group_count; }); diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc b/src/ray/gcs/tests/gcs_placement_group_scheduler_test.cc similarity index 76% rename from src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc rename to src/ray/gcs/tests/gcs_placement_group_scheduler_test.cc index 0e1d30a5a357..3a866866d72e 100644 --- a/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc +++ b/src/ray/gcs/tests/gcs_placement_group_scheduler_test.cc @@ -13,22 +13,29 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "ray/gcs/gcs_placement_group_scheduler.h" + +#include + #include #include #include #include -// clang-format off -#include "gtest/gtest.h" +#include "fakes/ray/rpc/raylet/raylet_client.h" +#include "mock/ray/pubsub/publisher.h" #include "ray/common/asio/instrumented_io_context.h" -#include "ray/gcs/gcs_server/test/gcs_server_test_util.h" -#include "ray/gcs/test/gcs_test_util.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_node_manager.h" +#include "ray/gcs/gcs_placement_group.h" +#include "ray/gcs/gcs_resource_manager.h" +#include "ray/gcs/gcs_table_storage.h" +#include "ray/gcs/store_client/in_memory_store_client.h" #include "ray/raylet/scheduling/cluster_resource_scheduler.h" #include "ray/util/counter_map.h" -#include "mock/ray/pubsub/publisher.h" -// clang-format on namespace ray { +namespace gcs { enum class GcsPlacementGroupStatus : int32_t { SUCCESS = 0, @@ -44,10 +51,11 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { io_service_.run(); })); for (int index = 0; index < 3; ++index) { - 
raylet_clients_.push_back(std::make_shared()); + raylet_clients_.push_back(std::make_shared()); } - gcs_table_storage_ = std::make_shared(); - gcs_publisher_ = std::make_shared( + gcs_table_storage_ = + std::make_unique(std::make_unique()); + gcs_publisher_ = std::make_shared( std::make_unique()); auto local_node_id = NodeID::FromRandom(); cluster_resource_scheduler_ = std::make_shared( @@ -57,25 +65,25 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { /*is_node_available_fn=*/ [](auto) { return true; }, /*is_local_node_with_raylet=*/false); - gcs_node_manager_ = std::make_shared(gcs_publisher_.get(), - gcs_table_storage_.get(), - io_service_, - raylet_client_pool_.get(), - ClusterID::Nil()); - gcs_resource_manager_ = std::make_shared( + gcs_node_manager_ = std::make_shared(gcs_publisher_.get(), + gcs_table_storage_.get(), + io_service_, + raylet_client_pool_.get(), + ClusterID::Nil()); + gcs_resource_manager_ = std::make_shared( io_service_, cluster_resource_scheduler_->GetClusterResourceManager(), *gcs_node_manager_, local_node_id); - store_client_ = std::make_shared(); + store_client_ = std::make_shared(); raylet_client_pool_ = std::make_unique( [this](const rpc::Address &addr) { return raylet_clients_[addr.port()]; }); - scheduler_ = std::make_shared( - io_service_, - *gcs_table_storage_, - *gcs_node_manager_, - *cluster_resource_scheduler_, - *raylet_client_pool_); + scheduler_ = + std::make_unique(io_service_, + *gcs_table_storage_, + *gcs_node_manager_, + *cluster_resource_scheduler_, + *raylet_client_pool_); counter_.reset(new CounterMap()); } @@ -104,9 +112,8 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { } } - void CheckEqWithPlacementGroupFront( - std::shared_ptr placement_group, - const GcsPlacementGroupStatus status) { + void CheckEqWithPlacementGroupFront(std::shared_ptr placement_group, + const GcsPlacementGroupStatus status) { absl::MutexLock lock(&placement_group_requests_mutex_); if (status == 
GcsPlacementGroupStatus::SUCCESS) { ASSERT_EQ(placement_group, success_placement_groups_.front()); @@ -138,22 +145,20 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { void ScheduleFailedWithZeroNodeTest(rpc::PlacementStrategy strategy) { ASSERT_EQ(0, gcs_node_manager_->GetAllAliveNodes().size()); - auto request = Mocker::GenCreatePlacementGroupRequest("", strategy); - auto placement_group = - std::make_shared(request, "", counter_); + auto request = GenCreatePlacementGroupRequest("", strategy); + auto placement_group = std::make_shared(request, "", counter_); // Schedule the placement_group with zero node. - scheduler_->ScheduleUnplacedBundles( + scheduler_->ScheduleUnplacedBundles(SchedulePgRequest{ placement_group, - [this](std::shared_ptr placement_group, - bool is_insfeasble) { + [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }, - [this](std::shared_ptr placement_group) { + [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); - }); + }}); // The lease request should not be send and the scheduling of placement_group should // fail as there are no available nodes. 
@@ -164,27 +169,25 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { } void SchedulePlacementGroupSuccessTest(rpc::PlacementStrategy strategy) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); AddNode(node); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); - auto request = Mocker::GenCreatePlacementGroupRequest("", strategy); - auto placement_group = - std::make_shared(request, "", counter_); + auto request = GenCreatePlacementGroupRequest("", strategy); + auto placement_group = std::make_shared(request, "", counter_); // Schedule the placement_group with 1 available node, and the lease request should be // send to the node. - scheduler_->ScheduleUnplacedBundles( + scheduler_->ScheduleUnplacedBundles(SchedulePgRequest{ placement_group, - [this](std::shared_ptr placement_group, - bool is_insfeasble) { + [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }, - [this](std::shared_ptr placement_group) { + [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); - }); + }}); ASSERT_EQ(1, raylet_clients_[0]->num_lease_requested); ASSERT_EQ(1, raylet_clients_[0]->lease_callbacks.size()); @@ -197,31 +200,29 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { } void ReschedulingWhenNodeAddTest(rpc::PlacementStrategy strategy) { - AddNode(Mocker::GenNodeInfo(0), 1); - auto failure_handler = [this](std::shared_ptr placement_group, + AddNode(GenNodeInfo(0), 1); + auto failure_handler = [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }; - auto success_handler = - [this](std::shared_ptr placement_group) { - absl::MutexLock 
lock(&placement_group_requests_mutex_); - success_placement_groups_.emplace_back(std::move(placement_group)); - }; + auto success_handler = [this](std::shared_ptr placement_group) { + absl::MutexLock lock(&placement_group_requests_mutex_); + success_placement_groups_.emplace_back(std::move(placement_group)); + }; // Failed to schedule the placement group, because the node resources is not enough. - auto request = Mocker::GenCreatePlacementGroupRequest("", strategy); - auto placement_group = - std::make_shared(request, "", counter_); + auto request = GenCreatePlacementGroupRequest("", strategy); + auto placement_group = std::make_shared(request, "", counter_); scheduler_->ScheduleUnplacedBundles( - placement_group, failure_handler, success_handler); + SchedulePgRequest{placement_group, failure_handler, success_handler}); WaitPlacementGroupPendingDone(1, GcsPlacementGroupStatus::FAILURE); CheckPlacementGroupSize(0, GcsPlacementGroupStatus::SUCCESS); // A new node is added, and the rescheduling is successful. 
- AddNode(Mocker::GenNodeInfo(0), 2); + AddNode(GenNodeInfo(0), 2); scheduler_->ScheduleUnplacedBundles( - placement_group, failure_handler, success_handler); + SchedulePgRequest{placement_group, failure_handler, success_handler}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); WaitPendingDone(raylet_clients_[0]->commit_callbacks, 1); ASSERT_TRUE(raylet_clients_[0]->GrantCommitBundleResources()); @@ -229,8 +230,8 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { } void AddTwoNodes() { - auto node0 = Mocker::GenNodeInfo(0); - auto node1 = Mocker::GenNodeInfo(1); + auto node0 = GenNodeInfo(0); + auto node1 = GenNodeInfo(1); AddNode(node0); AddNode(node1); } @@ -249,18 +250,17 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { } void ScheduleUnplacedBundles( - const std::shared_ptr &placement_group) { - scheduler_->ScheduleUnplacedBundles( + const std::shared_ptr &placement_group) { + scheduler_->ScheduleUnplacedBundles(SchedulePgRequest{ placement_group, - [this](std::shared_ptr placement_group, - bool is_insfeasble) { + [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }, - [this](std::shared_ptr placement_group) { + [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); - }); + }}); } void GrantPrepareBundleResources(const std::pair &grant0, @@ -289,19 +289,19 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { absl::Mutex placement_group_requests_mutex_; std::unique_ptr thread_io_service_; instrumented_io_context io_service_; - std::shared_ptr store_client_; + std::shared_ptr store_client_; - std::vector> raylet_clients_; - std::shared_ptr gcs_resource_manager_; + std::vector> raylet_clients_; + std::shared_ptr gcs_resource_manager_; std::shared_ptr 
cluster_resource_scheduler_; - std::shared_ptr gcs_node_manager_; - std::shared_ptr scheduler_; - std::vector> success_placement_groups_ + std::shared_ptr gcs_node_manager_; + std::unique_ptr scheduler_; + std::vector> success_placement_groups_ ABSL_GUARDED_BY(placement_group_requests_mutex_); - std::vector> failure_placement_groups_ + std::vector> failure_placement_groups_ ABSL_GUARDED_BY(placement_group_requests_mutex_); - std::shared_ptr gcs_publisher_; - std::shared_ptr gcs_table_storage_; + std::shared_ptr gcs_publisher_; + std::shared_ptr gcs_table_storage_; std::unique_ptr raylet_client_pool_; std::shared_ptr> counter_; }; @@ -335,26 +335,25 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestStrictPackSchedulePlacementGroupSucce } TEST_F(GcsPlacementGroupSchedulerTest, TestSchedulePlacementGroupReplyFailure) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); AddNode(node); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); - auto request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared(request, "", counter_); + auto request = GenCreatePlacementGroupRequest(); + auto placement_group = std::make_shared(request, "", counter_); // Schedule the placement_group with 1 available node, and the lease request should be // send to the node. 
- scheduler_->ScheduleUnplacedBundles( + scheduler_->ScheduleUnplacedBundles(SchedulePgRequest{ placement_group, - [this](std::shared_ptr placement_group, - bool is_insfeasble) { + [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }, - [this](std::shared_ptr placement_group) { + [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); - }); + }}); ASSERT_EQ(1, raylet_clients_[0]->num_lease_requested); ASSERT_EQ(1, raylet_clients_[0]->lease_callbacks.size()); @@ -368,52 +367,52 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestSchedulePlacementGroupReplyFailure) { } TEST_F(GcsPlacementGroupSchedulerTest, TestSpreadStrategyResourceCheck) { - auto node = Mocker::GenNodeInfo(0); + auto node = GenNodeInfo(0); AddNode(node, 2); - auto failure_handler = [this](std::shared_ptr placement_group, + auto failure_handler = [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }; - auto success_handler = [this](std::shared_ptr placement_group) { + auto success_handler = [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); }; - auto request = - Mocker::GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::SPREAD, 3, 2); - auto placement_group = std::make_shared(request, "", counter_); - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + auto request = GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::SPREAD, 3, 2); + auto placement_group = std::make_shared(request, "", counter_); + scheduler_->ScheduleUnplacedBundles( + 
SchedulePgRequest{placement_group, failure_handler, success_handler}); // The node resource is not enough, scheduling failed. WaitPlacementGroupPendingDone(1, GcsPlacementGroupStatus::FAILURE); - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); // The node resource is not enough, scheduling failed. WaitPlacementGroupPendingDone(2, GcsPlacementGroupStatus::FAILURE); } TEST_F(GcsPlacementGroupSchedulerTest, TestSchedulePlacementGroupReturnResource) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); AddNode(node); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); - auto request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared(request, "", counter_); + auto request = GenCreatePlacementGroupRequest(); + auto placement_group = std::make_shared(request, "", counter_); // Schedule the placement_group with 1 available node, and the lease request should be // send to the node. 
- scheduler_->ScheduleUnplacedBundles( + scheduler_->ScheduleUnplacedBundles(SchedulePgRequest{ placement_group, - [this](std::shared_ptr placement_group, - bool is_insfeasble) { + [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }, - [this](std::shared_ptr placement_group) { + [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); - }); + }}); ASSERT_EQ(1, raylet_clients_[0]->num_lease_requested); ASSERT_EQ(1, raylet_clients_[0]->lease_callbacks.size()); @@ -428,14 +427,14 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestSchedulePlacementGroupReturnResource) } TEST_F(GcsPlacementGroupSchedulerTest, TestStrictPackStrategyBalancedScheduling) { - AddNode(Mocker::GenNodeInfo(0)); - AddNode(Mocker::GenNodeInfo(1)); - auto failure_handler = [this](std::shared_ptr placement_group, + AddNode(GenNodeInfo(0)); + AddNode(GenNodeInfo(1)); + auto failure_handler = [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }; - auto success_handler = [this](std::shared_ptr placement_group) { + auto success_handler = [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); }; @@ -446,11 +445,10 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestStrictPackStrategyBalancedScheduling) int node_index = 0; for (int index = 0; index < 10; ++index) { auto request = - Mocker::GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::STRICT_PACK); - auto placement_group = - std::make_shared(request, "", counter_); + GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::STRICT_PACK); + auto placement_group = 
std::make_shared(request, "", counter_); scheduler_->ScheduleUnplacedBundles( - placement_group, failure_handler, success_handler); + SchedulePgRequest{placement_group, failure_handler, success_handler}); node_index = !raylet_clients_[0]->lease_callbacks.empty() ? 0 : 1; ++node_select_count[node_index]; @@ -475,21 +473,21 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestStrictPackStrategyReschedulingWhenNod } TEST_F(GcsPlacementGroupSchedulerTest, TestStrictPackStrategyResourceCheck) { - auto node0 = Mocker::GenNodeInfo(0); + auto node0 = GenNodeInfo(0); AddNode(node0); - auto failure_handler = [this](std::shared_ptr placement_group, + auto failure_handler = [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }; - auto success_handler = [this](std::shared_ptr placement_group) { + auto success_handler = [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); }; - auto request = - Mocker::GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::STRICT_PACK); - auto placement_group = std::make_shared(request, "", counter_); - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + auto request = GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::STRICT_PACK); + auto placement_group = std::make_shared(request, "", counter_); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); WaitPendingDone(raylet_clients_[0]->commit_callbacks, 1); ASSERT_TRUE(raylet_clients_[0]->GrantCommitBundleResources()); @@ -497,13 +495,14 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestStrictPackStrategyResourceCheck) { // Node1 has less number of bundles, but it doesn't satisfy the resource // 
requirement. In this case, the bundles should be scheduled on Node0. - auto node1 = Mocker::GenNodeInfo(1); + auto node1 = GenNodeInfo(1); AddNode(node1, 1); auto create_placement_group_request2 = - Mocker::GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::STRICT_PACK); - auto placement_group2 = std::make_shared( - create_placement_group_request2, "", counter_); - scheduler_->ScheduleUnplacedBundles(placement_group2, failure_handler, success_handler); + GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::STRICT_PACK); + auto placement_group2 = + std::make_shared(create_placement_group_request2, "", counter_); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group2, failure_handler, success_handler}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); WaitPendingDone(raylet_clients_[0]->commit_callbacks, 1); ASSERT_TRUE(raylet_clients_[0]->GrantCommitBundleResources()); @@ -511,27 +510,26 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestStrictPackStrategyResourceCheck) { } TEST_F(GcsPlacementGroupSchedulerTest, DestroyPlacementGroup) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); AddNode(node); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto create_placement_group_request = GenCreatePlacementGroupRequest(); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); // Schedule the placement_group with 1 available node, and the lease request should be // send to the node. 
- scheduler_->ScheduleUnplacedBundles( + scheduler_->ScheduleUnplacedBundles(SchedulePgRequest{ placement_group, - [this](std::shared_ptr placement_group, - bool is_insfeasble) { + [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }, - [this](std::shared_ptr placement_group) { + [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); - }); + }}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); WaitPendingDone(raylet_clients_[0]->commit_callbacks, 1); ASSERT_TRUE(raylet_clients_[0]->GrantCommitBundleResources()); @@ -548,30 +546,29 @@ TEST_F(GcsPlacementGroupSchedulerTest, DestroyPlacementGroup) { } TEST_F(GcsPlacementGroupSchedulerTest, DestroyCancelledPlacementGroup) { - auto node0 = Mocker::GenNodeInfo(0); - auto node1 = Mocker::GenNodeInfo(1); + auto node0 = GenNodeInfo(0); + auto node1 = GenNodeInfo(1); AddNode(node0); AddNode(node1); ASSERT_EQ(2, gcs_node_manager_->GetAllAliveNodes().size()); - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto create_placement_group_request = GenCreatePlacementGroupRequest(); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); const auto &placement_group_id = placement_group->GetPlacementGroupID(); // Schedule the placement_group with 1 available node, and the lease request should be // send to the node. 
- scheduler_->ScheduleUnplacedBundles( + scheduler_->ScheduleUnplacedBundles(SchedulePgRequest{ placement_group, - [this](std::shared_ptr placement_group, - bool is_insfeasble) { + [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }, - [this](std::shared_ptr placement_group) { + [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); - }); + }}); // Now, cancel the schedule request. ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); @@ -583,30 +580,29 @@ TEST_F(GcsPlacementGroupSchedulerTest, DestroyCancelledPlacementGroup) { } TEST_F(GcsPlacementGroupSchedulerTest, PlacementGroupCancelledDuringCommit) { - auto node0 = Mocker::GenNodeInfo(0); - auto node1 = Mocker::GenNodeInfo(1); + auto node0 = GenNodeInfo(0); + auto node1 = GenNodeInfo(1); AddNode(node0); AddNode(node1); ASSERT_EQ(2, gcs_node_manager_->GetAllAliveNodes().size()); - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto create_placement_group_request = GenCreatePlacementGroupRequest(); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); const auto &placement_group_id = placement_group->GetPlacementGroupID(); // Schedule the placement_group with 1 available node, and the lease request should be // send to the node. 
- scheduler_->ScheduleUnplacedBundles( + scheduler_->ScheduleUnplacedBundles(SchedulePgRequest{ placement_group, - [this](std::shared_ptr placement_group, - bool is_insfeasble) { + [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }, - [this](std::shared_ptr placement_group) { + [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); - }); + }}); // Now, cancel the schedule request. ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); @@ -626,28 +622,29 @@ TEST_F(GcsPlacementGroupSchedulerTest, PlacementGroupCancelledDuringPreparedPut) // After a PG is prepared by all nodes, GCS writes to Redis then commit-all. // If a Cancel is happening during prepare, or during the Redis write, i.e. before the // commit-all is called, the PG should be removed and no commits should be sent. - auto node0 = Mocker::GenNodeInfo(0); - auto node1 = Mocker::GenNodeInfo(1); + auto node0 = GenNodeInfo(0); + auto node1 = GenNodeInfo(1); AddNode(node0); AddNode(node1); ASSERT_EQ(2, gcs_node_manager_->GetAllAliveNodes().size()); - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto create_placement_group_request = GenCreatePlacementGroupRequest(); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); // Schedule the placement group successfully. 
- auto failure_handler = [this](std::shared_ptr placement_group, + auto failure_handler = [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }; - auto success_handler = [this](std::shared_ptr placement_group) { + auto success_handler = [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); }; - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); scheduler_->MarkScheduleCancelled(placement_group->GetPlacementGroupID()); ASSERT_TRUE(raylet_clients_[1]->GrantPrepareBundleResources()); @@ -676,24 +673,24 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestPackStrategyReschedulingWhenNodeAdd) } TEST_F(GcsPlacementGroupSchedulerTest, TestPackStrategyLargeBundlesScheduling) { - AddNode(Mocker::GenNodeInfo(0)); - AddNode(Mocker::GenNodeInfo(1)); - auto failure_handler = [this](std::shared_ptr placement_group, + AddNode(GenNodeInfo(0)); + AddNode(GenNodeInfo(1)); + auto failure_handler = [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }; - auto success_handler = [this](std::shared_ptr placement_group) { + auto success_handler = [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); }; // Schedule placement group which has large bundles. // One node does not have enough resources, so we will divide bundles to two nodes. 
- auto request = - Mocker::GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::PACK, 15); - auto placement_group = std::make_shared(request, "", counter_); - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + auto request = GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::PACK, 15); + auto placement_group = std::make_shared(request, "", counter_); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); // Prepared resource is batched! ASSERT_EQ(raylet_clients_[0]->num_lease_requested, 1); ASSERT_EQ(raylet_clients_[1]->num_lease_requested, 1); @@ -712,28 +709,29 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestPackStrategyLargeBundlesScheduling) { TEST_F(GcsPlacementGroupSchedulerTest, TestStrictSpreadRescheduleWhenNodeDead) { int node_count = 3; for (int index = 0; index < node_count; ++index) { - auto node = Mocker::GenNodeInfo(index); + auto node = GenNodeInfo(index); AddNode(node); } ASSERT_EQ(3, gcs_node_manager_->GetAllAliveNodes().size()); - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest( - "pg1", rpc::PlacementStrategy::STRICT_SPREAD); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto create_placement_group_request = + GenCreatePlacementGroupRequest("pg1", rpc::PlacementStrategy::STRICT_SPREAD); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); // Schedule the placement group successfully. 
- auto failure_handler = [this](std::shared_ptr placement_group, + auto failure_handler = [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }; - auto success_handler = [this](std::shared_ptr placement_group) { + auto success_handler = [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); }; - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); // Prepare bundle resources. for (int index = 0; index < node_count; ++index) { @@ -766,7 +764,8 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestStrictSpreadRescheduleWhenNodeDead) { // One node is dead, reschedule the placement group. auto bundle_on_dead_node = placement_group->GetMutableBundle(0); bundle_on_dead_node->clear_node_id(); - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); // Prepare bundle resources. 
for (int index = 0; index < node_count; ++index) { @@ -788,35 +787,38 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestStrictSpreadRescheduleWhenNodeDead) { } TEST_F(GcsPlacementGroupSchedulerTest, TestStrictSpreadStrategyResourceCheck) { - auto node0 = Mocker::GenNodeInfo(0); + auto node0 = GenNodeInfo(0); AddNode(node0); - auto failure_handler = [this](std::shared_ptr placement_group, + auto failure_handler = [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }; - auto success_handler = [this](std::shared_ptr placement_group) { + auto success_handler = [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); }; - auto request = Mocker::GenCreatePlacementGroupRequest( - "", rpc::PlacementStrategy::STRICT_SPREAD, 2, 2); - auto placement_group = std::make_shared(request, "", counter_); - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + auto request = + GenCreatePlacementGroupRequest("", rpc::PlacementStrategy::STRICT_SPREAD, 2, 2); + auto placement_group = std::make_shared(request, "", counter_); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); // The number of nodes is less than the number of bundles, scheduling failed. WaitPlacementGroupPendingDone(1, GcsPlacementGroupStatus::FAILURE); // Node1 resource is insufficient, scheduling failed. 
- auto node1 = Mocker::GenNodeInfo(1); + auto node1 = GenNodeInfo(1); AddNode(node1, 1); - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); WaitPlacementGroupPendingDone(2, GcsPlacementGroupStatus::FAILURE); // The node2 resource is enough and the scheduling is successful. - auto node2 = Mocker::GenNodeInfo(2); + auto node2 = GenNodeInfo(2); AddNode(node2); - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); ASSERT_TRUE(raylet_clients_[2]->GrantPrepareBundleResources()); WaitPendingDone(raylet_clients_[0]->commit_callbacks, 1); @@ -831,8 +833,7 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestBundleLocationIndex) { /// Generate data. 
const auto node1 = NodeID::FromRandom(); const auto node2 = NodeID::FromRandom(); - rpc::CreatePlacementGroupRequest request_pg1 = - Mocker::GenCreatePlacementGroupRequest("pg1"); + rpc::CreatePlacementGroupRequest request_pg1 = GenCreatePlacementGroupRequest("pg1"); const auto pg1_id = PlacementGroupID::FromBinary( request_pg1.placement_group_spec().placement_group_id()); const std::shared_ptr bundle_node1_pg1 = @@ -848,8 +849,7 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestBundleLocationIndex) { (*bundle_locations_pg1) .emplace(bundle_node2_pg1->BundleId(), std::make_pair(node2, bundle_node2_pg1)); - rpc::CreatePlacementGroupRequest request_pg2 = - Mocker::GenCreatePlacementGroupRequest("pg2"); + rpc::CreatePlacementGroupRequest request_pg2 = GenCreatePlacementGroupRequest("pg2"); const auto pg2_id = PlacementGroupID::FromBinary( request_pg2.placement_group_spec().placement_group_id()); const std::shared_ptr bundle_node1_pg2 = @@ -903,30 +903,31 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestBundleLocationIndex) { } TEST_F(GcsPlacementGroupSchedulerTest, TestNodeDeadDuringPreparingResources) { - auto node0 = Mocker::GenNodeInfo(0); - auto node1 = Mocker::GenNodeInfo(1); + auto node0 = GenNodeInfo(0); + auto node1 = GenNodeInfo(1); AddNode(node0); AddNode(node1); ASSERT_EQ(2, gcs_node_manager_->GetAllAliveNodes().size()); - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto create_placement_group_request = GenCreatePlacementGroupRequest(); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); // Schedule the placement group. // One node is dead, so one bundle failed to schedule. 
- auto failure_handler = [this](std::shared_ptr placement_group, + auto failure_handler = [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); ASSERT_EQ(placement_group->GetUnplacedBundles().size(), 2); failure_placement_groups_.emplace_back(std::move(placement_group)); }; - auto success_handler = [this](std::shared_ptr placement_group) { + auto success_handler = [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); }; - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); RemoveNode(node1); // This should fail because the node is dead. @@ -940,30 +941,31 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestNodeDeadDuringPreparingResourcesRaceCondition) { // This covers the scnario where the node is dead right after raylet sends a success // response. - auto node0 = Mocker::GenNodeInfo(0); - auto node1 = Mocker::GenNodeInfo(1); + auto node0 = GenNodeInfo(0); + auto node1 = GenNodeInfo(1); AddNode(node0); AddNode(node1); ASSERT_EQ(2, gcs_node_manager_->GetAllAliveNodes().size()); - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto create_placement_group_request = GenCreatePlacementGroupRequest(); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); // Schedule the placement group. // One node is dead, so one bundle failed to schedule. 
- auto failure_handler = [this](std::shared_ptr placement_group, + auto failure_handler = [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); ASSERT_EQ(placement_group->GetUnplacedBundles().size(), 1); failure_placement_groups_.emplace_back(std::move(placement_group)); }; - auto success_handler = [this](std::shared_ptr placement_group) { + auto success_handler = [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); }; - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); RemoveNode(node1); // If node is dead right after raylet succeds to create a bundle, it will reply that @@ -981,30 +983,31 @@ TEST_F(GcsPlacementGroupSchedulerTest, } TEST_F(GcsPlacementGroupSchedulerTest, TestNodeDeadBeforeCommittingResources) { - auto node0 = Mocker::GenNodeInfo(0); - auto node1 = Mocker::GenNodeInfo(1); + auto node0 = GenNodeInfo(0); + auto node1 = GenNodeInfo(1); AddNode(node0); AddNode(node1); ASSERT_EQ(2, gcs_node_manager_->GetAllAliveNodes().size()); - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto create_placement_group_request = GenCreatePlacementGroupRequest(); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); // Schedule the placement group. // One node is dead, so one bundle failed to schedule. 
- auto failure_handler = [this](std::shared_ptr placement_group, + auto failure_handler = [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); ASSERT_EQ(placement_group->GetUnplacedBundles().size(), 1); failure_placement_groups_.emplace_back(std::move(placement_group)); }; - auto success_handler = [this](std::shared_ptr placement_group) { + auto success_handler = [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); }; - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); // node1 dead right after prepare succeeded. To simulate gcs_placement_group_scheduler // finding the node dead before it tries to commit all nodes, we remove node *before* @@ -1019,30 +1022,31 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestNodeDeadBeforeCommittingResources) { } TEST_F(GcsPlacementGroupSchedulerTest, TestNodeErrorDuringCommittingResources) { - auto node0 = Mocker::GenNodeInfo(0); - auto node1 = Mocker::GenNodeInfo(1); + auto node0 = GenNodeInfo(0); + auto node1 = GenNodeInfo(1); AddNode(node0); AddNode(node1); ASSERT_EQ(2, gcs_node_manager_->GetAllAliveNodes().size()); - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto create_placement_group_request = GenCreatePlacementGroupRequest(); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); // Schedule the placement group. // One node is dead, so one bundle failed to schedule. 
- auto failure_handler = [this](std::shared_ptr placement_group, + auto failure_handler = [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); ASSERT_EQ(placement_group->GetUnplacedBundles().size(), 1); failure_placement_groups_.emplace_back(std::move(placement_group)); }; - auto success_handler = [this](std::shared_ptr placement_group) { + auto success_handler = [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); }; - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); ASSERT_TRUE(raylet_clients_[1]->GrantPrepareBundleResources()); WaitPendingDone(raylet_clients_[0]->commit_callbacks, 1); @@ -1055,28 +1059,29 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestNodeErrorDuringCommittingResources) { } TEST_F(GcsPlacementGroupSchedulerTest, TestNodeDeadDuringRescheduling) { - auto node0 = Mocker::GenNodeInfo(0); - auto node1 = Mocker::GenNodeInfo(1); + auto node0 = GenNodeInfo(0); + auto node1 = GenNodeInfo(1); AddNode(node0); AddNode(node1); ASSERT_EQ(2, gcs_node_manager_->GetAllAliveNodes().size()); - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto create_placement_group_request = GenCreatePlacementGroupRequest(); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); // Schedule the placement group successfully. 
- auto failure_handler = [this](std::shared_ptr placement_group, + auto failure_handler = [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }; - auto success_handler = [this](std::shared_ptr placement_group) { + auto success_handler = [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); }; - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); ASSERT_TRUE(raylet_clients_[1]->GrantPrepareBundleResources()); WaitPendingDone(raylet_clients_[0]->commit_callbacks, 1); @@ -1094,7 +1099,8 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestNodeDeadDuringRescheduling) { // All nodes are dead, reschedule the placement group. placement_group->GetMutableBundle(0)->clear_node_id(); placement_group->GetMutableBundle(1)->clear_node_id(); - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); // Before prepare requests are done, suppose a node is dead. 
@@ -1110,28 +1116,29 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestNodeDeadDuringRescheduling) { } TEST_F(GcsPlacementGroupSchedulerTest, TestPGCancelledDuringReschedulingCommit) { - auto node0 = Mocker::GenNodeInfo(0); - auto node1 = Mocker::GenNodeInfo(1); + auto node0 = GenNodeInfo(0); + auto node1 = GenNodeInfo(1); AddNode(node0); AddNode(node1); ASSERT_EQ(2, gcs_node_manager_->GetAllAliveNodes().size()); - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto create_placement_group_request = GenCreatePlacementGroupRequest(); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); // Schedule the placement group successfully. - auto failure_handler = [this](std::shared_ptr placement_group, + auto failure_handler = [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }; - auto success_handler = [this](std::shared_ptr placement_group) { + auto success_handler = [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); }; - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); ASSERT_TRUE(raylet_clients_[1]->GrantPrepareBundleResources()); WaitPendingDone(raylet_clients_[0]->commit_callbacks, 1); @@ -1149,7 +1156,8 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestPGCancelledDuringReschedulingCommit) // All nodes are dead, reschedule the placement group. 
placement_group->GetMutableBundle(0)->clear_node_id(); placement_group->GetMutableBundle(1)->clear_node_id(); - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); // Rescheduling happening. ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); @@ -1167,28 +1175,29 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestPGCancelledDuringReschedulingCommit) } TEST_F(GcsPlacementGroupSchedulerTest, TestPGCancelledDuringReschedulingCommitPrepare) { - auto node0 = Mocker::GenNodeInfo(0); - auto node1 = Mocker::GenNodeInfo(1); + auto node0 = GenNodeInfo(0); + auto node1 = GenNodeInfo(1); AddNode(node0); AddNode(node1); ASSERT_EQ(2, gcs_node_manager_->GetAllAliveNodes().size()); - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto create_placement_group_request = GenCreatePlacementGroupRequest(); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); // Schedule the placement group successfully. 
- auto failure_handler = [this](std::shared_ptr placement_group, + auto failure_handler = [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }; - auto success_handler = [this](std::shared_ptr placement_group) { + auto success_handler = [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); }; - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); ASSERT_TRUE(raylet_clients_[1]->GrantPrepareBundleResources()); WaitPendingDone(raylet_clients_[0]->commit_callbacks, 1); @@ -1206,7 +1215,8 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestPGCancelledDuringReschedulingCommitPr // All nodes are dead, reschedule the placement group. placement_group->GetMutableBundle(0)->clear_node_id(); placement_group->GetMutableBundle(1)->clear_node_id(); - scheduler_->ScheduleUnplacedBundles(placement_group, failure_handler, success_handler); + scheduler_->ScheduleUnplacedBundles( + SchedulePgRequest{placement_group, failure_handler, success_handler}); // Rescheduling happening. // Cancel the placement group scheduling before prepare requests are granted. 
@@ -1229,15 +1239,15 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestReleaseUnusedBundles) { } TEST_F(GcsPlacementGroupSchedulerTest, TestInitialize) { - auto node0 = Mocker::GenNodeInfo(0); - auto node1 = Mocker::GenNodeInfo(1); + auto node0 = GenNodeInfo(0); + auto node1 = GenNodeInfo(1); AddNode(node0); AddNode(node1); ASSERT_EQ(2, gcs_node_manager_->GetAllAliveNodes().size()); - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto create_placement_group_request = GenCreatePlacementGroupRequest(); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); placement_group->GetMutableBundle(0)->set_node_id(node0->node_id()); placement_group->GetMutableBundle(1)->set_node_id(node1->node_id()); @@ -1269,8 +1279,8 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestPrepareFromDeadNodes) { ASSERT_TRUE(EnsureClusterResourcesAreNotInUse()); // Create a placement group. - auto placement_group = std::make_shared( - Mocker::GenCreatePlacementGroupRequest(), "", counter_); + auto placement_group = + std::make_shared(GenCreatePlacementGroupRequest(), "", counter_); // Schedule the unplaced bundles of the placement_group. ScheduleUnplacedBundles(placement_group); @@ -1297,8 +1307,8 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestPrepareFromNodeWithInsufficientResour ASSERT_TRUE(EnsureClusterResourcesAreNotInUse()); // Create a placement group. - auto placement_group = std::make_shared( - Mocker::GenCreatePlacementGroupRequest(), "", counter_); + auto placement_group = + std::make_shared(GenCreatePlacementGroupRequest(), "", counter_); // Schedule the unplaced bundles of the placement_group. ScheduleUnplacedBundles(placement_group); @@ -1325,8 +1335,8 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestCommitToDeadNodes) { ASSERT_TRUE(EnsureClusterResourcesAreNotInUse()); // Create a placement group. 
- auto placement_group = std::make_shared( - Mocker::GenCreatePlacementGroupRequest(), "", counter_); + auto placement_group = + std::make_shared(GenCreatePlacementGroupRequest(), "", counter_); // Schedule the unplaced bundles of the placement_group. ScheduleUnplacedBundles(placement_group); @@ -1351,10 +1361,10 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestCommitToDeadNodes) { } TEST_F(GcsPlacementGroupSchedulerTest, TestCheckingWildcardResource) { - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest( + auto create_placement_group_request = GenCreatePlacementGroupRequest( /*name=*/"", /*strategy=*/rpc::PlacementStrategy::SPREAD, /*bundles_count=*/1); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); int wildcard_resource_count = 0; for (const auto &bundle_spec : placement_group->GetBundles()) { for (const auto &resource_entry : bundle_spec->GetFormattedResources()) { @@ -1372,27 +1382,26 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestWaitingRemovedBundles) { // This feature is only required by gcs actor scheduler. RayConfig::instance().initialize(R"({"gcs_actor_scheduling_enabled": true})"); - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); AddNode(node); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto create_placement_group_request = GenCreatePlacementGroupRequest(); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); // Schedule the placement_group with 1 available node, and the lease request should be // send to the node. 
- scheduler_->ScheduleUnplacedBundles( + scheduler_->ScheduleUnplacedBundles(SchedulePgRequest{ placement_group, - [this](std::shared_ptr placement_group, - bool is_insfeasble) { + [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }, - [this](std::shared_ptr placement_group) { + [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); - }); + }}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); WaitPendingDone(raylet_clients_[0]->commit_callbacks, 1); ASSERT_TRUE(raylet_clients_[0]->GrantCommitBundleResources()); @@ -1417,7 +1426,7 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestWaitingRemovedBundles) { ASSERT_TRUE(raylet_clients_[0]->GrantCancelResourceReserve()); // Because actors have not released the bundle resources, bundles have to keep waiting. - ASSERT_EQ(scheduler_->GetWaitingRemovedBundlesSize(), 2); + ASSERT_EQ(scheduler_->waiting_removed_bundles_.size(), 2); const auto &node_resources = cluster_resource_scheduler_->GetClusterResourceManager().GetNodeResources( scheduling::NodeID(node->node_id())); @@ -1436,33 +1445,32 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestWaitingRemovedBundles) { scheduler_->HandleWaitingRemovedBundles(); // The waiting bundles are removed, and resources are successfully returned to node. 
- ASSERT_EQ(scheduler_->GetWaitingRemovedBundlesSize(), 0); + ASSERT_EQ(scheduler_->waiting_removed_bundles_.size(), 0); ASSERT_EQ(node_resources.available.Get(scheduling::ResourceID::CPU()), node_resources.total.Get(scheduling::ResourceID::CPU())); } TEST_F(GcsPlacementGroupSchedulerTest, TestBundlesRemovedWhenNodeDead) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); AddNode(node); ASSERT_EQ(1, gcs_node_manager_->GetAllAliveNodes().size()); - auto create_placement_group_request = Mocker::GenCreatePlacementGroupRequest(); - auto placement_group = std::make_shared( - create_placement_group_request, "", counter_); + auto create_placement_group_request = GenCreatePlacementGroupRequest(); + auto placement_group = + std::make_shared(create_placement_group_request, "", counter_); // Schedule the placement_group with 1 available node, and the lease request should be // send to the node. - scheduler_->ScheduleUnplacedBundles( + scheduler_->ScheduleUnplacedBundles(SchedulePgRequest{ placement_group, - [this](std::shared_ptr placement_group, - bool is_insfeasble) { + [this](std::shared_ptr placement_group, bool is_insfeasble) { absl::MutexLock lock(&placement_group_requests_mutex_); failure_placement_groups_.emplace_back(std::move(placement_group)); }, - [this](std::shared_ptr placement_group) { + [this](std::shared_ptr placement_group) { absl::MutexLock lock(&placement_group_requests_mutex_); success_placement_groups_.emplace_back(std::move(placement_group)); - }); + }}); ASSERT_TRUE(raylet_clients_[0]->GrantPrepareBundleResources()); WaitPendingDone(raylet_clients_[0]->commit_callbacks, 1); ASSERT_TRUE(raylet_clients_[0]->GrantCommitBundleResources()); @@ -1478,7 +1486,8 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestBundlesRemovedWhenNodeDead) { // There shouldn't be any remaining bundles to be removed since the node is // already removed. The bundles are already removed when the node is removed. 
- ASSERT_EQ(scheduler_->GetWaitingRemovedBundlesSize(), 0); + ASSERT_EQ(scheduler_->waiting_removed_bundles_.size(), 0); } +} // namespace gcs } // namespace ray diff --git a/src/ray/gcs/tests/gcs_ray_event_converter_test.cc b/src/ray/gcs/tests/gcs_ray_event_converter_test.cc new file mode 100644 index 000000000000..89a10bebe4f2 --- /dev/null +++ b/src/ray/gcs/tests/gcs_ray_event_converter_test.cc @@ -0,0 +1,427 @@ +// Copyright 2022 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/gcs/gcs_ray_event_converter.h" + +#include + +#include "gtest/gtest.h" +#include "ray/common/id.h" +#include "src/ray/protobuf/common.pb.h" +#include "src/ray/protobuf/events_event_aggregator_service.pb.h" +#include "src/ray/protobuf/gcs_service.pb.h" +#include "src/ray/protobuf/public/events_base_event.pb.h" + +namespace ray { +namespace gcs { + +class GcsRayEventConverterTest : public ::testing::Test { + public: + GcsRayEventConverterTest() = default; +}; + +TEST_F(GcsRayEventConverterTest, TestConvertToTaskEventData) { + rpc::events::AddEventsRequest request; + GcsRayEventConverter converter; + + // Convert empty request + auto task_event_data_requests = + converter.ConvertToTaskEventDataRequests(std::move(request)); + + // Test empty request + EXPECT_EQ(task_event_data_requests.size(), 0); +} + +TEST_F(GcsRayEventConverterTest, TestConvertTaskDefinitionEvent) { + rpc::events::AddEventsRequest request; + GcsRayEventConverter converter; + + // Create a task definition event + auto *event = request.mutable_events_data()->add_events(); + event->set_event_id("test_event_id"); + event->set_event_type(rpc::events::RayEvent::TASK_DEFINITION_EVENT); + event->set_source_type(rpc::events::RayEvent::CORE_WORKER); + event->set_severity(rpc::events::RayEvent::INFO); + event->set_message("test message"); + + auto *task_def_event = event->mutable_task_definition_event(); + + task_def_event->set_task_type(rpc::TaskType::NORMAL_TASK); + task_def_event->set_language(rpc::Language::PYTHON); + task_def_event->mutable_task_func() + ->mutable_python_function_descriptor() + ->set_function_name("test_task_name"); + task_def_event->set_task_id("test_task_id"); + task_def_event->set_task_attempt(1); + task_def_event->set_job_id("test_job_id"); + task_def_event->set_task_name("test_task_name"); + + task_def_event->set_parent_task_id("parent_task_id"); + task_def_event->set_placement_group_id("pg_id"); + + // Add some required resources + 
(*task_def_event->mutable_required_resources())["CPU"] = 1.0; + (*task_def_event->mutable_required_resources())["memory"] = 1024.0; + + // Set runtime env info + auto *runtime_env = task_def_event->mutable_runtime_env_info(); + runtime_env->set_serialized_runtime_env("test_env"); + + // Convert + auto task_event_data_requests = + converter.ConvertToTaskEventDataRequests(std::move(request)); + + // Verify conversion + ASSERT_EQ(task_event_data_requests.size(), 1); + const auto &task_event_data = task_event_data_requests[0]; + EXPECT_EQ(task_event_data.data().events_by_task_size(), 1); + const auto &converted_task = task_event_data.data().events_by_task(0); + EXPECT_EQ(converted_task.task_id(), "test_task_id"); + EXPECT_EQ(converted_task.attempt_number(), 1); + EXPECT_EQ(converted_task.job_id(), "test_job_id"); + EXPECT_EQ(task_event_data.data().job_id(), "test_job_id"); + + // Verify task info + ASSERT_TRUE(converted_task.has_task_info()); + const auto &task_info = converted_task.task_info(); + EXPECT_EQ(task_info.name(), "test_task_name"); + EXPECT_EQ(task_info.type(), rpc::TaskType::NORMAL_TASK); + EXPECT_EQ(task_info.language(), rpc::Language::PYTHON); + EXPECT_EQ(task_info.func_or_class_name(), "test_task_name"); + EXPECT_EQ(task_info.runtime_env_info().serialized_runtime_env(), "test_env"); + EXPECT_EQ(task_info.parent_task_id(), "parent_task_id"); + EXPECT_EQ(task_info.placement_group_id(), "pg_id"); + + // Verify required resources + EXPECT_EQ(task_info.required_resources().at("CPU"), 1.0); + EXPECT_EQ(task_info.required_resources().at("memory"), 1024.0); +} + +TEST_F(GcsRayEventConverterTest, TestConvertWithDroppedTaskAttempts) { + rpc::events::AddEventsRequest request; + GcsRayEventConverter converter; + + // Create a proper TaskID for testing + const auto job_id = JobID::FromInt(100); + const auto driver_task_id = TaskID::ForDriverTask(job_id); + const auto test_task_id = TaskID::ForNormalTask(job_id, driver_task_id, 1); + const auto task_id_binary = 
test_task_id.Binary(); + + // Add dropped task attempts to metadata + auto *dropped_attempt = request.mutable_events_data() + ->mutable_task_events_metadata() + ->add_dropped_task_attempts(); + dropped_attempt->set_task_id(task_id_binary); + dropped_attempt->set_attempt_number(2); + + // Convert + auto task_event_data_requests = + converter.ConvertToTaskEventDataRequests(std::move(request)); + + // Verify dropped task attempts are copied + ASSERT_FALSE(task_event_data_requests.empty()); + EXPECT_EQ(task_event_data_requests[0].data().dropped_task_attempts_size(), 1); + const auto &converted_dropped = + task_event_data_requests[0].data().dropped_task_attempts(0); + EXPECT_EQ(converted_dropped.task_id(), task_id_binary); + EXPECT_EQ(converted_dropped.attempt_number(), 2); +} + +TEST_F(GcsRayEventConverterTest, TestMultipleJobIds) { + rpc::events::AddEventsRequest request; + GcsRayEventConverter converter; + + // Create events with different job IDs + const auto job_id_1 = JobID::FromInt(100); + const auto job_id_2 = JobID::FromInt(200); + + // Create first task event + auto *event1 = request.mutable_events_data()->add_events(); + event1->set_event_id("test_event_1"); + event1->set_event_type(rpc::events::RayEvent::TASK_DEFINITION_EVENT); + auto *task_def_event1 = event1->mutable_task_definition_event(); + task_def_event1->set_task_type(rpc::TaskType::NORMAL_TASK); + task_def_event1->set_language(rpc::Language::PYTHON); + task_def_event1->set_task_id("task_1"); + task_def_event1->set_job_id(job_id_1.Binary()); + task_def_event1->set_task_name("task_1_name"); + + // Create second task event with different job ID + auto *event2 = request.mutable_events_data()->add_events(); + event2->set_event_id("test_event_2"); + event2->set_event_type(rpc::events::RayEvent::TASK_DEFINITION_EVENT); + auto *task_def_event2 = event2->mutable_task_definition_event(); + task_def_event2->set_task_type(rpc::TaskType::NORMAL_TASK); + task_def_event2->set_language(rpc::Language::PYTHON); + 
task_def_event2->set_task_id("task_2"); + task_def_event2->set_job_id(job_id_2.Binary()); + task_def_event2->set_task_name("task_2_name"); + + // Add dropped task attempts for both job IDs + const auto driver_task_id_1 = TaskID::ForDriverTask(job_id_1); + const auto test_task_id_1 = TaskID::ForNormalTask(job_id_1, driver_task_id_1, 1); + + const auto driver_task_id_2 = TaskID::ForDriverTask(job_id_2); + const auto test_task_id_2 = TaskID::ForNormalTask(job_id_2, driver_task_id_2, 1); + + // Add dropped task attempt for job_id_1 + auto *dropped_attempt_1 = request.mutable_events_data() + ->mutable_task_events_metadata() + ->add_dropped_task_attempts(); + dropped_attempt_1->set_task_id(test_task_id_1.Binary()); + dropped_attempt_1->set_attempt_number(3); + + // Add dropped task attempt for job_id_2 + auto *dropped_attempt_2 = request.mutable_events_data() + ->mutable_task_events_metadata() + ->add_dropped_task_attempts(); + dropped_attempt_2->set_task_id(test_task_id_2.Binary()); + dropped_attempt_2->set_attempt_number(4); + + // Convert + auto task_event_data_requests = + converter.ConvertToTaskEventDataRequests(std::move(request)); + + // Verify that we get two separate requests (one for each job ID) + ASSERT_EQ(task_event_data_requests.size(), 2); + + // Check that each request has the correct job ID and dropped task attempts + bool found_job_1 = false, found_job_2 = false; + for (const auto &req : task_event_data_requests) { + if (req.data().job_id() == job_id_1.Binary()) { + found_job_1 = true; + EXPECT_EQ(req.data().events_by_task_size(), 1); + EXPECT_EQ(req.data().events_by_task(0).job_id(), job_id_1.Binary()); + + // Verify dropped task attempt for job_id_1 + EXPECT_EQ(req.data().dropped_task_attempts_size(), 1); + const auto &dropped = req.data().dropped_task_attempts(0); + EXPECT_EQ(dropped.task_id(), test_task_id_1.Binary()); + EXPECT_EQ(dropped.attempt_number(), 3); + } else if (req.data().job_id() == job_id_2.Binary()) { + found_job_2 = true; + 
EXPECT_EQ(req.data().events_by_task_size(), 1); + EXPECT_EQ(req.data().events_by_task(0).job_id(), job_id_2.Binary()); + + // Verify dropped task attempt for job_id_2 + EXPECT_EQ(req.data().dropped_task_attempts_size(), 1); + const auto &dropped = req.data().dropped_task_attempts(0); + EXPECT_EQ(dropped.task_id(), test_task_id_2.Binary()); + EXPECT_EQ(dropped.attempt_number(), 4); + } + } + EXPECT_TRUE(found_job_1); + EXPECT_TRUE(found_job_2); +} + +TEST_F(GcsRayEventConverterTest, TestSameJobIdGrouping) { + rpc::events::AddEventsRequest request; + GcsRayEventConverter converter; + + // Create multiple events with the same job ID + const auto job_id = JobID::FromInt(100); + + // Create first task event + auto *event1 = request.mutable_events_data()->add_events(); + event1->set_event_id("test_event_1"); + event1->set_event_type(rpc::events::RayEvent::TASK_DEFINITION_EVENT); + auto *task_def_event1 = event1->mutable_task_definition_event(); + task_def_event1->set_task_type(rpc::TaskType::NORMAL_TASK); + task_def_event1->set_language(rpc::Language::PYTHON); + task_def_event1->set_task_id("task_1"); + task_def_event1->set_job_id(job_id.Binary()); + task_def_event1->set_task_name("task_1_name"); + + // Create second task event with same job ID + auto *event2 = request.mutable_events_data()->add_events(); + event2->set_event_id("test_event_2"); + event2->set_event_type(rpc::events::RayEvent::TASK_DEFINITION_EVENT); + auto *task_def_event2 = event2->mutable_task_definition_event(); + task_def_event2->set_task_type(rpc::TaskType::NORMAL_TASK); + task_def_event2->set_language(rpc::Language::PYTHON); + task_def_event2->set_task_id("task_2"); + task_def_event2->set_job_id(job_id.Binary()); + task_def_event2->set_task_name("task_2_name"); + + // Convert + auto task_event_data_requests = + converter.ConvertToTaskEventDataRequests(std::move(request)); + + // Verify that we get one request with both events grouped together + ASSERT_EQ(task_event_data_requests.size(), 1); + 
EXPECT_EQ(task_event_data_requests[0].data().job_id(), job_id.Binary()); + EXPECT_EQ(task_event_data_requests[0].data().events_by_task_size(), 2); + + // Verify both tasks are present + const auto &events = task_event_data_requests[0].data().events_by_task(); + EXPECT_EQ(events[0].job_id(), job_id.Binary()); + EXPECT_EQ(events[1].job_id(), job_id.Binary()); +} + +TEST_F(GcsRayEventConverterTest, TestConvertTaskProfileEvents) { + rpc::events::AddEventsRequest request; + GcsRayEventConverter converter; + + // Create a task profile event + auto *event = request.mutable_events_data()->add_events(); + event->set_event_id("test_event_id"); + event->set_event_type(rpc::events::RayEvent::TASK_PROFILE_EVENT); + event->set_source_type(rpc::events::RayEvent::CORE_WORKER); + event->set_severity(rpc::events::RayEvent::INFO); + event->set_message("test message"); + + auto *task_profile_events = event->mutable_task_profile_events(); + task_profile_events->set_task_id("test_task_id"); + task_profile_events->set_attempt_number(1); + task_profile_events->set_job_id("test_job_id"); + + // Add a profile event + auto *profile_events = task_profile_events->mutable_profile_events(); + profile_events->set_component_id("test_component_id"); + profile_events->set_component_type("worker"); + profile_events->set_node_ip_address("test_address"); + + // add a profile event entry + auto *ProfileEventEntry = profile_events->add_events(); + ProfileEventEntry->set_start_time(123456789); + ProfileEventEntry->set_end_time(123456799); + ProfileEventEntry->set_extra_data("{\"foo\": \"bar\"}"); + ProfileEventEntry->set_event_name("test_event"); + + // Convert + auto task_event_data_requests = + converter.ConvertToTaskEventDataRequests(std::move(request)); + + // Verify conversion + EXPECT_EQ(task_event_data_requests.size(), 1); + auto task_event_data = task_event_data_requests[0]; + EXPECT_EQ(task_event_data.data().events_by_task_size(), 1); + const auto &converted_task = 
task_event_data.data().events_by_task(0); + + EXPECT_EQ(converted_task.task_id(), "test_task_id"); + EXPECT_EQ(converted_task.attempt_number(), 1); + EXPECT_EQ(converted_task.job_id(), "test_job_id"); + EXPECT_EQ(converted_task.profile_events().events_size(), 1); + EXPECT_EQ(task_event_data.data().job_id(), "test_job_id"); + + // Check profile event fields + EXPECT_TRUE(converted_task.has_profile_events()); + const auto &profile_event = converted_task.profile_events(); + EXPECT_EQ(profile_event.component_id(), "test_component_id"); + EXPECT_EQ(profile_event.component_type(), "worker"); + EXPECT_EQ(profile_event.node_ip_address(), "test_address"); + + // verify that there is one profile event entry and values match our expectations + EXPECT_TRUE(profile_event.events().size() == 1); + const auto &entry = profile_event.events(0); + EXPECT_EQ(entry.start_time(), 123456789); + EXPECT_EQ(entry.end_time(), 123456799); + EXPECT_EQ(entry.extra_data(), "{\"foo\": \"bar\"}"); + EXPECT_EQ(entry.event_name(), "test_event"); +} + +TEST_F(GcsRayEventConverterTest, TestConvertTaskExecutionEvent) { + GcsRayEventConverter converter; + rpc::events::TaskExecutionEvent exec_event; + + // Set basic fields + exec_event.set_task_id("test_task_id"); + exec_event.set_task_attempt(3); + exec_event.set_job_id("test_job_id"); + exec_event.set_node_id("test_node_id"); + exec_event.set_worker_id("test_worker_id"); + exec_event.set_worker_pid(1234); + + // Set a RayErrorInfo + exec_event.mutable_ray_error_info()->set_error_message("error"); + + google::protobuf::Timestamp ts; + ts.set_seconds(42); + ts.set_nanos(123456789); + (*exec_event.mutable_task_state())[rpc::TaskStatus::SUBMITTED_TO_WORKER] = ts; + + // Call the converter + rpc::TaskEvents task_event = converter.ConvertToTaskEvents(std::move(exec_event)); + + // Check basic fields + EXPECT_EQ(task_event.attempt_number(), 3); + EXPECT_EQ(task_event.job_id(), "test_job_id"); + EXPECT_TRUE(task_event.has_state_updates()); + const auto 
&state_updates = task_event.state_updates(); + EXPECT_EQ(state_updates.node_id(), "test_node_id"); + EXPECT_EQ(state_updates.worker_id(), "test_worker_id"); + EXPECT_EQ(state_updates.worker_pid(), 1234); + EXPECT_EQ(state_updates.error_info().error_message(), "error"); + + // Check state_ts_ns + ASSERT_EQ(state_updates.state_ts_ns().size(), 1); + int64_t expected_ns = 42 * 1000000000LL + 123456789; + EXPECT_EQ(state_updates.state_ts_ns().at(5), expected_ns); +} + +TEST_F(GcsRayEventConverterTest, TestConvertActorTaskDefinitionEvent) { + GcsRayEventConverter converter; + rpc::events::ActorTaskDefinitionEvent actor_def_event; + + // Set basic fields + actor_def_event.set_task_id("test_actor_task_id"); + actor_def_event.set_task_attempt(2); + actor_def_event.set_job_id("test_job_id"); + actor_def_event.set_actor_task_name("test_actor_task"); + actor_def_event.set_language(rpc::Language::PYTHON); + actor_def_event.set_actor_id("actor-123"); + actor_def_event.set_parent_task_id("parent-actor-task"); + actor_def_event.set_placement_group_id("pg-actor"); + + // Set runtime env info + auto *runtime_env = actor_def_event.mutable_runtime_env_info(); + runtime_env->set_serialized_runtime_env("test_actor_env"); + + // Set actor function descriptor (Python) + auto *func_desc = actor_def_event.mutable_actor_func(); + auto *python_func = func_desc->mutable_python_function_descriptor(); + python_func->set_function_name("test_actor_function"); + python_func->set_class_name("TestActorClass"); + + // Add required resources + (*actor_def_event.mutable_required_resources())["CPU"] = 2.0; + (*actor_def_event.mutable_required_resources())["GPU"] = 1.0; + + // Call the converter + rpc::TaskEvents task_event = converter.ConvertToTaskEvents(std::move(actor_def_event)); + + // Check basic fields + EXPECT_EQ(task_event.task_id(), "test_actor_task_id"); + EXPECT_EQ(task_event.attempt_number(), 2); + EXPECT_EQ(task_event.job_id(), "test_job_id"); + + // Check task info + 
EXPECT_TRUE(task_event.has_task_info()); + const auto &task_info = task_event.task_info(); + EXPECT_EQ(task_info.type(), rpc::TaskType::ACTOR_TASK); + EXPECT_EQ(task_info.name(), "test_actor_task"); + EXPECT_EQ(task_info.language(), rpc::Language::PYTHON); + EXPECT_EQ(task_info.func_or_class_name(), "test_actor_function"); + EXPECT_EQ(task_info.runtime_env_info().serialized_runtime_env(), "test_actor_env"); + EXPECT_EQ(task_info.actor_id(), "actor-123"); + EXPECT_EQ(task_info.parent_task_id(), "parent-actor-task"); + EXPECT_EQ(task_info.placement_group_id(), "pg-actor"); + + // Check required resources + EXPECT_EQ(task_info.required_resources().at("CPU"), 2.0); + EXPECT_EQ(task_info.required_resources().at("GPU"), 1.0); +} + +} // namespace gcs +} // namespace ray diff --git a/src/ray/gcs/gcs_server/test/gcs_resource_manager_test.cc b/src/ray/gcs/tests/gcs_resource_manager_test.cc similarity index 96% rename from src/ray/gcs/gcs_server/test/gcs_resource_manager_test.cc rename to src/ray/gcs/tests/gcs_resource_manager_test.cc index 23591b264573..a9732014aadd 100644 --- a/src/ray/gcs/gcs_server/test/gcs_resource_manager_test.cc +++ b/src/ray/gcs/tests/gcs_resource_manager_test.cc @@ -12,16 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/gcs_server/gcs_resource_manager.h" +#include "ray/gcs/gcs_resource_manager.h" #include #include #include #include "gtest/gtest.h" -#include "mock/ray/gcs/gcs_server/gcs_node_manager.h" +#include "mock/ray/gcs/gcs_node_manager.h" #include "ray/common/asio/instrumented_io_context.h" -#include "ray/gcs/test/gcs_test_util.h" +#include "ray/common/test_utils.h" #include "ray/raylet/scheduling/cluster_resource_manager.h" namespace ray { @@ -71,7 +71,7 @@ TEST_F(GcsResourceManagerTest, TestBasic) { absl::flat_hash_map resource_map; resource_map[cpu_resource] = 10; - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); node->mutable_resources_total()->insert(resource_map.begin(), resource_map.end()); // Add node resources. gcs_resource_manager_->OnNodeAdd(*node); @@ -103,7 +103,7 @@ TEST_F(GcsResourceManagerTest, TestBasic) { } TEST_F(GcsResourceManagerTest, TestResourceUsageAPI) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); node->mutable_resources_total()->insert({"CPU", 2}); auto node_id = NodeID::FromBinary(node->node_id()); rpc::GetAllResourceUsageRequest get_all_request; @@ -140,7 +140,7 @@ TEST_F(GcsResourceManagerTest, TestResourceUsageAPI) { } TEST_F(GcsResourceManagerTest, TestResourceUsageFromDifferentSyncMsgs) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); node->mutable_resources_total()->insert({"CPU", 10}); gcs_resource_manager_->OnNodeAdd(*node); @@ -188,7 +188,7 @@ TEST_F(GcsResourceManagerTest, TestResourceUsageFromDifferentSyncMsgs) { } TEST_F(GcsResourceManagerTest, TestSetAvailableResourcesWhenNodeDead) { - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); node->mutable_resources_total()->insert({"CPU", 10}); gcs_resource_manager_->OnNodeAdd(*node); @@ -212,7 +212,7 @@ TEST_F(GcsResourceManagerTest, TestNodeLabels) { absl::flat_hash_map labels = {{"key", "value"}, {"gpu_type", "a100"}}; - auto node = Mocker::GenNodeInfo(); + auto node = GenNodeInfo(); 
node->mutable_resources_total()->insert(resource_map.begin(), resource_map.end()); node->mutable_labels()->insert(labels.begin(), labels.end()); // Add node resources. @@ -226,7 +226,7 @@ TEST_F(GcsResourceManagerTest, TestNodeLabels) { } TEST_F(GcsResourceManagerTest, TestGetDrainingNodes) { - auto node1 = Mocker::GenNodeInfo(); + auto node1 = GenNodeInfo(); node1->mutable_resources_total()->insert({"CPU", 10}); gcs_resource_manager_->OnNodeAdd(*node1); UpdateFromResourceViewSync( @@ -237,7 +237,7 @@ TEST_F(GcsResourceManagerTest, TestGetDrainingNodes) { /* is_draining */ true, /* draining_deadline_timestamp_ms */ std::numeric_limits::max()); - auto node2 = Mocker::GenNodeInfo(); + auto node2 = GenNodeInfo(); node2->mutable_resources_total()->insert({"CPU", 1}); gcs_resource_manager_->OnNodeAdd(*node2); UpdateFromResourceViewSync(NodeID::FromBinary(node2->node_id()), diff --git a/src/ray/gcs/gcs_server/test/gcs_server_rpc_test.cc b/src/ray/gcs/tests/gcs_server_rpc_test.cc similarity index 87% rename from src/ray/gcs/gcs_server/test/gcs_server_rpc_test.cc rename to src/ray/gcs/tests/gcs_server_rpc_test.cc index 7929c693f933..fff248c8b0f5 100644 --- a/src/ray/gcs/gcs_server/test/gcs_server_rpc_test.cc +++ b/src/ray/gcs/tests/gcs_server_rpc_test.cc @@ -19,9 +19,9 @@ #include "gtest/gtest.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/ray_config.h" -#include "ray/gcs/gcs_server/gcs_server.h" -#include "ray/gcs/test/gcs_test_util.h" -#include "ray/rpc/gcs/gcs_rpc_client.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_server.h" +#include "ray/gcs_client/rpc_client.h" namespace ray { @@ -69,9 +69,9 @@ class GcsServerTest : public ::testing::Test { rpc::ResetServerCallExecutor(); } - bool AddJob(const rpc::AddJobRequest &request) { + bool AddJob(rpc::AddJobRequest request) { std::promise promise; - client_->AddJob(request, + client_->AddJob(std::move(request), [&promise](const Status &status, const rpc::AddJobReply &reply) { 
RAY_CHECK_OK(status); promise.set_value(true); @@ -79,10 +79,10 @@ class GcsServerTest : public ::testing::Test { return WaitReady(promise.get_future(), timeout_ms_); } - bool MarkJobFinished(const rpc::MarkJobFinishedRequest &request) { + bool MarkJobFinished(rpc::MarkJobFinishedRequest request) { std::promise promise; client_->MarkJobFinished( - request, + std::move(request), [&promise](const Status &status, const rpc::MarkJobFinishedReply &reply) { RAY_CHECK_OK(status); promise.set_value(true); @@ -95,7 +95,7 @@ class GcsServerTest : public ::testing::Test { request.set_actor_id(actor_id); std::optional actor_table_data_opt; std::promise promise; - client_->GetActorInfo(request, + client_->GetActorInfo(std::move(request), [&actor_table_data_opt, &promise]( const Status &status, const rpc::GetActorInfoReply &reply) { RAY_CHECK_OK(status); @@ -110,10 +110,11 @@ class GcsServerTest : public ::testing::Test { return actor_table_data_opt; } - bool RegisterNode(const rpc::RegisterNodeRequest &request) { + bool RegisterNode(rpc::RegisterNodeRequest request) { std::promise promise; client_->RegisterNode( - request, [&promise](const Status &status, const rpc::RegisterNodeReply &reply) { + std::move(request), + [&promise](const Status &status, const rpc::RegisterNodeReply &reply) { RAY_CHECK_OK(status); promise.set_value(true); }); @@ -121,10 +122,11 @@ class GcsServerTest : public ::testing::Test { return WaitReady(promise.get_future(), timeout_ms_); } - bool UnregisterNode(const rpc::UnregisterNodeRequest &request) { + bool UnregisterNode(rpc::UnregisterNodeRequest request) { std::promise promise; client_->UnregisterNode( - request, [&promise](const Status &status, const rpc::UnregisterNodeReply &reply) { + std::move(request), + [&promise](const Status &status, const rpc::UnregisterNodeReply &reply) { RAY_CHECK_OK(status); promise.set_value(true); }); @@ -137,7 +139,7 @@ class GcsServerTest : public ::testing::Test { rpc::GetAllNodeInfoRequest request; std::promise 
promise; client_->GetAllNodeInfo( - request, + std::move(request), [&node_info_list, &promise](const Status &status, const rpc::GetAllNodeInfoReply &reply) { RAY_CHECK_OK(status); @@ -150,10 +152,10 @@ class GcsServerTest : public ::testing::Test { return node_info_list; } - bool ReportWorkerFailure(const rpc::ReportWorkerFailureRequest &request) { + bool ReportWorkerFailure(rpc::ReportWorkerFailureRequest request) { std::promise promise; client_->ReportWorkerFailure( - request, + std::move(request), [&promise](const Status &status, const rpc::ReportWorkerFailureReply &reply) { RAY_CHECK_OK(status); promise.set_value(status.ok()); @@ -167,7 +169,7 @@ class GcsServerTest : public ::testing::Test { std::optional worker_table_data_opt; std::promise promise; client_->GetWorkerInfo( - request, + std::move(request), [&worker_table_data_opt, &promise](const Status &status, const rpc::GetWorkerInfoReply &reply) { RAY_CHECK_OK(status); @@ -187,7 +189,7 @@ class GcsServerTest : public ::testing::Test { rpc::GetAllWorkerInfoRequest request; std::promise promise; client_->GetAllWorkerInfo( - request, + std::move(request), [&worker_table_data, &promise](const Status &status, const rpc::GetAllWorkerInfoReply &reply) { RAY_CHECK_OK(status); @@ -200,10 +202,11 @@ class GcsServerTest : public ::testing::Test { return worker_table_data; } - bool AddWorkerInfo(const rpc::AddWorkerInfoRequest &request) { + bool AddWorkerInfo(rpc::AddWorkerInfoRequest request) { std::promise promise; client_->AddWorkerInfo( - request, [&promise](const Status &status, const rpc::AddWorkerInfoReply &reply) { + std::move(request), + [&promise](const Status &status, const rpc::AddWorkerInfoReply &reply) { RAY_CHECK_OK(status); promise.set_value(true); }); @@ -227,14 +230,14 @@ class GcsServerTest : public ::testing::Test { TEST_F(GcsServerTest, TestActorInfo) { // Create actor_table_data JobID job_id = JobID::FromInt(1); - auto actor_table_data = Mocker::GenActorTableData(job_id); + auto actor_table_data = 
GenActorTableData(job_id); // TODO(sand): Add tests that don't require checkponit. } TEST_F(GcsServerTest, TestJobInfo) { // Create job_table_data JobID job_id = JobID::FromInt(1); - auto job_table_data = Mocker::GenJobTableData(job_id); + auto job_table_data = GenJobTableData(job_id); // Add job rpc::AddJobRequest add_job_request; @@ -250,17 +253,17 @@ TEST_F(GcsServerTest, TestJobInfo) { TEST_F(GcsServerTest, TestJobGarbageCollection) { // Create job_table_data JobID job_id = JobID::FromInt(1); - auto job_table_data = Mocker::GenJobTableData(job_id); + auto job_table_data = GenJobTableData(job_id); // Add job rpc::AddJobRequest add_job_request; add_job_request.mutable_data()->CopyFrom(*job_table_data); ASSERT_TRUE(AddJob(add_job_request)); - auto actor_table_data = Mocker::GenActorTableData(job_id); + auto actor_table_data = GenActorTableData(job_id); // Register detached actor for job - auto detached_actor_table_data = Mocker::GenActorTableData(job_id); + auto detached_actor_table_data = GenActorTableData(job_id); detached_actor_table_data->set_is_detached(true); // Mark job finished @@ -276,7 +279,7 @@ TEST_F(GcsServerTest, TestJobGarbageCollection) { TEST_F(GcsServerTest, TestNodeInfo) { // Create gcs node info - auto gcs_node_info = Mocker::GenNodeInfo(); + auto gcs_node_info = GenNodeInfo(); // Register node info rpc::RegisterNodeRequest register_node_info_request; @@ -305,9 +308,9 @@ TEST_F(GcsServerTest, TestNodeInfo) { TEST_F(GcsServerTest, TestNodeInfoFilters) { // Create gcs node info - auto node1 = Mocker::GenNodeInfo(1, "127.0.0.1", "node1"); - auto node2 = Mocker::GenNodeInfo(2, "127.0.0.2", "node2"); - auto node3 = Mocker::GenNodeInfo(3, "127.0.0.3", "node3"); + auto node1 = GenNodeInfo(1, "127.0.0.1", "node1"); + auto node2 = GenNodeInfo(2, "127.0.0.2", "node2"); + auto node3 = GenNodeInfo(3, "127.0.0.3", "node3"); // Register node infos for (auto &node : {node1, node2, node3}) { @@ -330,7 +333,7 @@ TEST_F(GcsServerTest, TestNodeInfoFilters) { // 
Get all rpc::GetAllNodeInfoRequest request; rpc::GetAllNodeInfoReply reply; - RAY_CHECK_OK(client_->SyncGetAllNodeInfo(request, &reply)); + RAY_CHECK_OK(client_->SyncGetAllNodeInfo(std::move(request), &reply)); ASSERT_EQ(reply.node_info_list_size(), 3); ASSERT_EQ(reply.num_filtered(), 0); @@ -342,7 +345,7 @@ TEST_F(GcsServerTest, TestNodeInfoFilters) { request.add_node_selectors()->set_node_id(node1->node_id()); request.add_node_selectors()->set_node_id(node2->node_id()); rpc::GetAllNodeInfoReply reply; - RAY_CHECK_OK(client_->SyncGetAllNodeInfo(request, &reply)); + RAY_CHECK_OK(client_->SyncGetAllNodeInfo(std::move(request), &reply)); ASSERT_EQ(reply.node_info_list_size(), 2); ASSERT_EQ(reply.num_filtered(), 1); @@ -353,7 +356,7 @@ TEST_F(GcsServerTest, TestNodeInfoFilters) { rpc::GetAllNodeInfoRequest request; request.set_state_filter(rpc::GcsNodeInfo::ALIVE); rpc::GetAllNodeInfoReply reply; - RAY_CHECK_OK(client_->SyncGetAllNodeInfo(request, &reply)); + RAY_CHECK_OK(client_->SyncGetAllNodeInfo(std::move(request), &reply)); ASSERT_EQ(reply.node_info_list_size(), 2); ASSERT_EQ(reply.num_filtered(), 1); @@ -365,7 +368,7 @@ TEST_F(GcsServerTest, TestNodeInfoFilters) { rpc::GetAllNodeInfoRequest request; request.set_state_filter(rpc::GcsNodeInfo::DEAD); rpc::GetAllNodeInfoReply reply; - RAY_CHECK_OK(client_->SyncGetAllNodeInfo(request, &reply)); + RAY_CHECK_OK(client_->SyncGetAllNodeInfo(std::move(request), &reply)); ASSERT_EQ(reply.node_info_list_size(), 1); ASSERT_EQ(reply.num_filtered(), 2); @@ -378,7 +381,7 @@ TEST_F(GcsServerTest, TestNodeInfoFilters) { request.add_node_selectors()->set_node_name("node1"); request.add_node_selectors()->set_node_name("node2"); rpc::GetAllNodeInfoReply reply; - RAY_CHECK_OK(client_->SyncGetAllNodeInfo(request, &reply)); + RAY_CHECK_OK(client_->SyncGetAllNodeInfo(std::move(request), &reply)); ASSERT_EQ(reply.node_info_list_size(), 2); ASSERT_EQ(reply.num_filtered(), 1); @@ -391,7 +394,7 @@ TEST_F(GcsServerTest, TestNodeInfoFilters) 
{ request.add_node_selectors()->set_node_ip_address("127.0.0.1"); request.add_node_selectors()->set_node_ip_address("127.0.0.2"); rpc::GetAllNodeInfoReply reply; - RAY_CHECK_OK(client_->SyncGetAllNodeInfo(request, &reply)); + RAY_CHECK_OK(client_->SyncGetAllNodeInfo(std::move(request), &reply)); ASSERT_EQ(reply.node_info_list_size(), 2); ASSERT_EQ(reply.num_filtered(), 1); @@ -404,7 +407,7 @@ TEST_F(GcsServerTest, TestNodeInfoFilters) { request.add_node_selectors()->set_node_id(node1->node_id()); request.add_node_selectors()->set_node_name("node2"); rpc::GetAllNodeInfoReply reply; - RAY_CHECK_OK(client_->SyncGetAllNodeInfo(request, &reply)); + RAY_CHECK_OK(client_->SyncGetAllNodeInfo(std::move(request), &reply)); ASSERT_EQ(reply.node_info_list_size(), 2); ASSERT_EQ(reply.num_filtered(), 1); ASSERT_EQ(reply.total(), 3); @@ -417,7 +420,7 @@ TEST_F(GcsServerTest, TestNodeInfoFilters) { request.add_node_selectors()->set_node_id(node3->node_id()); request.set_state_filter(rpc::GcsNodeInfo::ALIVE); rpc::GetAllNodeInfoReply reply; - RAY_CHECK_OK(client_->SyncGetAllNodeInfo(request, &reply)); + RAY_CHECK_OK(client_->SyncGetAllNodeInfo(std::move(request), &reply)); ASSERT_EQ(reply.node_info_list_size(), 1); ASSERT_EQ(reply.num_filtered(), 2); ASSERT_EQ(reply.total(), 3); @@ -430,7 +433,7 @@ TEST_F(GcsServerTest, TestNodeInfoFilters) { request.add_node_selectors()->set_node_name("node3"); request.set_state_filter(rpc::GcsNodeInfo::DEAD); rpc::GetAllNodeInfoReply reply; - RAY_CHECK_OK(client_->SyncGetAllNodeInfo(request, &reply)); + RAY_CHECK_OK(client_->SyncGetAllNodeInfo(std::move(request), &reply)); ASSERT_EQ(reply.node_info_list_size(), 1); ASSERT_EQ(reply.num_filtered(), 2); ASSERT_EQ(reply.total(), 3); @@ -439,7 +442,7 @@ TEST_F(GcsServerTest, TestNodeInfoFilters) { TEST_F(GcsServerTest, TestWorkerInfo) { // Report worker failure - auto worker_failure_data = Mocker::GenWorkerTableData(); + auto worker_failure_data = GenWorkerTableData(); 
worker_failure_data->mutable_worker_address()->set_ip_address("127.0.0.1"); worker_failure_data->mutable_worker_address()->set_port(5566); rpc::ReportWorkerFailureRequest report_worker_failure_request; @@ -449,7 +452,7 @@ TEST_F(GcsServerTest, TestWorkerInfo) { ASSERT_EQ(worker_table_data.size(), 1); // Add worker info - auto worker_data = Mocker::GenWorkerTableData(); + auto worker_data = GenWorkerTableData(); worker_data->mutable_worker_address()->set_worker_id(WorkerID::FromRandom().Binary()); rpc::AddWorkerInfoRequest add_worker_request; add_worker_request.mutable_worker_data()->CopyFrom(*worker_data); diff --git a/src/ray/gcs/gcs_server/test/gcs_server_test_util.h b/src/ray/gcs/tests/gcs_server_test_util.h similarity index 80% rename from src/ray/gcs/gcs_server/test/gcs_server_test_util.h rename to src/ray/gcs/tests/gcs_server_test_util.h index c530d76f285e..1b4ff40714a4 100644 --- a/src/ray/gcs/gcs_server/test/gcs_server_test_util.h +++ b/src/ray/gcs/tests/gcs_server_test_util.h @@ -24,16 +24,16 @@ #include "absl/synchronization/mutex.h" #include "fakes/ray/rpc/raylet/raylet_client.h" #include "ray/common/asio/instrumented_io_context.h" -#include "ray/common/task/task.h" +#include "ray/common/lease/lease.h" #include "ray/common/task/task_util.h" -#include "ray/common/test_util.h" -#include "ray/gcs/gcs_client/accessor.h" -#include "ray/gcs/gcs_server/gcs_actor_manager.h" -#include "ray/gcs/gcs_server/gcs_actor_scheduler.h" -#include "ray/gcs/gcs_server/gcs_node_manager.h" -#include "ray/gcs/gcs_server/gcs_placement_group_mgr.h" -#include "ray/gcs/gcs_server/gcs_placement_group_scheduler.h" -#include "ray/gcs/gcs_server/gcs_resource_manager.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_actor_manager.h" +#include "ray/gcs/gcs_actor_scheduler.h" +#include "ray/gcs/gcs_node_manager.h" +#include "ray/gcs/gcs_placement_group_mgr.h" +#include "ray/gcs/gcs_placement_group_scheduler.h" +#include "ray/gcs/gcs_resource_manager.h" +#include 
"ray/gcs/store_client/in_memory_store_client.h" namespace ray { @@ -77,30 +77,29 @@ struct GcsServerMocker { class MockRayletClient : public FakeRayletClient { public: - ray::Status ReturnWorker(int worker_port, - const WorkerID &worker_id, - bool disconnect_worker, - const std::string &disconnect_worker_error_detail, - bool worker_exiting) override { + void ReturnWorkerLease(int worker_port, + const LeaseID &lease_id, + bool disconnect_worker, + const std::string &disconnect_worker_error_detail, + bool worker_exiting) override { if (disconnect_worker) { num_workers_disconnected++; } else { num_workers_returned++; } - return Status::OK(); } - void GetTaskFailureCause( - const TaskID &task_id, - const ray::rpc::ClientCallback &callback) + void GetWorkerFailureCause( + const LeaseID &lease_id, + const ray::rpc::ClientCallback &callback) override { - ray::rpc::GetTaskFailureCauseReply reply; + ray::rpc::GetWorkerFailureCauseReply reply; callback(Status::OK(), std::move(reply)); num_get_task_failure_causes += 1; } void RequestWorkerLease( - const rpc::TaskSpec &spec, + const rpc::LeaseSpec &spec, bool grant_or_reject, const rpc::ClientCallback &callback, const int64_t backlog_size, @@ -124,39 +123,38 @@ struct GcsServerMocker { } void CancelWorkerLease( - const TaskID &task_id, + const LeaseID &lease_id, const rpc::ClientCallback &callback) override { num_leases_canceled += 1; cancel_callbacks.push_back(callback); } bool GrantWorkerLease() { - return GrantWorkerLease("", 0, WorkerID::FromRandom(), node_id, NodeID::Nil()); + return GrantWorkerLease("", 0, WorkerID::FromRandom(), node_id_, NodeID::Nil()); } bool GrantWorkerLease(const std::string &address, int port, const WorkerID &worker_id, - const NodeID &raylet_id, - const NodeID &retry_at_raylet_id, + const NodeID &node_id, + const NodeID &retry_at_node_id, Status status = Status::OK(), bool rejected = false) { rpc::RequestWorkerLeaseReply reply; - if (!retry_at_raylet_id.IsNil()) { + if (!retry_at_node_id.IsNil()) 
{ reply.mutable_retry_at_raylet_address()->set_ip_address(address); reply.mutable_retry_at_raylet_address()->set_port(port); - reply.mutable_retry_at_raylet_address()->set_raylet_id( - retry_at_raylet_id.Binary()); + reply.mutable_retry_at_raylet_address()->set_node_id(retry_at_node_id.Binary()); } else { reply.mutable_worker_address()->set_ip_address(address); reply.mutable_worker_address()->set_port(port); - reply.mutable_worker_address()->set_raylet_id(raylet_id.Binary()); + reply.mutable_worker_address()->set_node_id(node_id.Binary()); reply.mutable_worker_address()->set_worker_id(worker_id.Binary()); } if (rejected) { reply.set_rejected(true); auto resources_data = reply.mutable_resources_data(); - resources_data->set_node_id(raylet_id.Binary()); + resources_data->set_node_id(node_id.Binary()); resources_data->set_resources_normal_task_changed(true); auto &normal_task_map = *(resources_data->mutable_resources_normal_task()); normal_task_map[kMemory_ResourceLabel] = @@ -299,7 +297,7 @@ struct GcsServerMocker { int num_leases_canceled = 0; int num_release_unused_workers = 0; int num_get_task_failure_causes = 0; - NodeID node_id = NodeID::FromRandom(); + NodeID node_id_ = NodeID::FromRandom(); std::list> drain_raylet_callbacks = {}; std::list> callbacks = {}; std::list> cancel_callbacks = {}; @@ -382,55 +380,6 @@ struct GcsServerMocker { std::shared_ptr store_client_ = std::make_shared(); }; - - class MockedNodeInfoAccessor : public gcs::NodeInfoAccessor { - public: - Status RegisterSelf(const rpc::GcsNodeInfo &local_node_info, - const gcs::StatusCallback &callback) override { - return Status::NotImplemented(""); - } - - const NodeID &GetSelfId() const override { - static NodeID node_id; - return node_id; - } - - const rpc::GcsNodeInfo &GetSelfInfo() const override { - static rpc::GcsNodeInfo node_info; - return node_info; - } - - void AsyncRegister(const rpc::GcsNodeInfo &node_info, - const gcs::StatusCallback &callback) override {} - - void AsyncGetAll(const 
gcs::MultiItemCallback &callback, - int64_t timeout_ms, - const std::vector &node_ids = {}) override { - if (callback) { - callback(Status::OK(), {}); - } - } - - void AsyncSubscribeToNodeChange( - std::function subscribe, - gcs::StatusCallback done) override { - RAY_LOG(FATAL) << "Not implemented"; - } - - const rpc::GcsNodeInfo *Get(const NodeID &node_id, - bool filter_dead_nodes = true) const override { - return nullptr; - } - - const absl::flat_hash_map &GetAll() const override { - static absl::flat_hash_map node_info_list; - return node_info_list; - } - - bool IsNodeDead(const NodeID &node_id) const override { return false; } - - void AsyncResubscribe() override {} - }; }; } // namespace ray diff --git a/src/ray/gcs/gcs_server/test/gcs_table_storage_test_base.h b/src/ray/gcs/tests/gcs_table_storage_test_base.h similarity index 86% rename from src/ray/gcs/gcs_server/test/gcs_table_storage_test_base.h rename to src/ray/gcs/tests/gcs_table_storage_test_base.h index 5252b6a99eec..7b5010f8ebaa 100644 --- a/src/ray/gcs/gcs_server/test/gcs_table_storage_test_base.h +++ b/src/ray/gcs/tests/gcs_table_storage_test_base.h @@ -19,9 +19,8 @@ #include "gtest/gtest.h" #include "ray/common/id.h" -#include "ray/common/test_util.h" -#include "ray/gcs/gcs_server/gcs_table_storage.h" -#include "ray/gcs/test/gcs_test_util.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_table_storage.h" namespace ray { @@ -38,11 +37,11 @@ class GcsTableStorageTestBase : public ::testing::Test { protected: void TestGcsTableApi() { - auto table = gcs_table_storage_->JobTable(); + auto &table = gcs_table_storage_->JobTable(); JobID job1_id = JobID::FromInt(1); JobID job2_id = JobID::FromInt(2); - auto job1_table_data = Mocker::GenJobTableData(job1_id); - auto job2_table_data = Mocker::GenJobTableData(job2_id); + auto job1_table_data = GenJobTableData(job1_id); + auto job2_table_data = GenJobTableData(job2_id); // Put. 
Put(table, job1_id, *job1_table_data); @@ -65,9 +64,9 @@ class GcsTableStorageTestBase : public ::testing::Test { JobID job_id1 = JobID::FromInt(1); JobID job_id2 = JobID::FromInt(2); JobID job_id3 = JobID::FromInt(3); - auto actor_table_data1 = Mocker::GenActorTableData(job_id1); - auto actor_table_data2 = Mocker::GenActorTableData(job_id2); - auto actor_table_data3 = Mocker::GenActorTableData(job_id3); + auto actor_table_data1 = GenActorTableData(job_id1); + auto actor_table_data2 = GenActorTableData(job_id2); + auto actor_table_data3 = GenActorTableData(job_id3); ActorID actor_id1 = ActorID::FromBinary(actor_table_data1->actor_id()); ActorID actor_id2 = ActorID::FromBinary(actor_table_data2->actor_id()); ActorID actor_id3 = ActorID::FromBinary(actor_table_data3->actor_id()); @@ -105,7 +104,7 @@ class GcsTableStorageTestBase : public ::testing::Test { void Put(TABLE &table, const KEY &key, const VALUE &value) { auto on_done = [this](const Status &status) { --pending_count_; }; ++pending_count_; - RAY_CHECK_OK(table.Put(key, value, {on_done, *(io_service_pool_->Get())})); + table.Put(key, value, {on_done, *(io_service_pool_->Get())}); WaitPendingDone(); } @@ -124,7 +123,7 @@ class GcsTableStorageTestBase : public ::testing::Test { --pending_count_; }; ++pending_count_; - RAY_CHECK_OK(table.Get(key, {on_done, *(io_service_pool_->Get())})); + table.Get(key, {on_done, *(io_service_pool_->Get())}); WaitPendingDone(); return values.size(); } @@ -147,7 +146,7 @@ class GcsTableStorageTestBase : public ::testing::Test { --pending_count_; }; ++pending_count_; - RAY_CHECK_OK(table.GetByJobId(job_id, {on_done, *(io_service_pool_->Get())})); + table.GetByJobId(job_id, {on_done, *(io_service_pool_->Get())}); WaitPendingDone(); return values.size(); } @@ -159,7 +158,7 @@ class GcsTableStorageTestBase : public ::testing::Test { --pending_count_; }; ++pending_count_; - RAY_CHECK_OK(table.Delete(key, {on_done, *(io_service_pool_->Get())})); + table.Delete(key, {on_done, 
*(io_service_pool_->Get())}); WaitPendingDone(); } @@ -170,7 +169,7 @@ class GcsTableStorageTestBase : public ::testing::Test { --pending_count_; }; ++pending_count_; - RAY_CHECK_OK(table.BatchDelete(keys, {on_done, *(io_service_pool_->Get())})); + table.BatchDelete(keys, {on_done, *(io_service_pool_->Get())}); WaitPendingDone(); } diff --git a/src/ray/gcs/gcs_server/test/gcs_task_manager_test.cc b/src/ray/gcs/tests/gcs_task_manager_test.cc similarity index 93% rename from src/ray/gcs/gcs_server/test/gcs_task_manager_test.cc rename to src/ray/gcs/tests/gcs_task_manager_test.cc index 8349ea04eb22..19249eee30d7 100644 --- a/src/ray/gcs/gcs_server/test/gcs_task_manager_test.cc +++ b/src/ray/gcs/tests/gcs_task_manager_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_server/gcs_task_manager.h" +#include "ray/gcs/gcs_task_manager.h" #include @@ -25,9 +25,9 @@ #include "gtest/gtest.h" #include "ray/common/asio/asio_util.h" #include "ray/common/id.h" +#include "ray/common/protobuf_utils.h" #include "ray/common/status.h" -#include "ray/gcs/pb_util.h" -#include "ray/gcs/test/gcs_test_util.h" +#include "ray/common/test_utils.h" namespace ray { namespace gcs { @@ -117,7 +117,7 @@ class GcsTaskManagerTest : public ::testing::Test { actor_id.IsNil() ? TaskType::NORMAL_TASK : TaskType::ACTOR_TASK, actor_id), error_info); - auto events_data = Mocker::GenTaskEventsData(events); + auto events_data = GenTaskEventsData(events); SyncAddTaskEventData(events_data); } @@ -146,6 +146,32 @@ class GcsTaskManagerTest : public ::testing::Test { return reply; } + rpc::events::AddEventsReply SyncAddEvents( + const rpc::events::RayEventsData &events_data) { + rpc::events::AddEventsRequest request; + rpc::events::AddEventsReply reply; + std::promise promise; + + request.mutable_events_data()->CopyFrom(events_data); + // Dispatch so that it runs in GcsTaskManager's io service. 
+ io_context_->GetIoService().dispatch( + [this, &promise, &request, &reply]() { + task_manager->HandleAddEvents( + request, + &reply, + [&promise](Status, std::function, std::function) { + promise.set_value(true); + }); + }, + "SyncAddEvent"); + + promise.get_future().get(); + + // Assert on RPC reply. + EXPECT_EQ(StatusCode(reply.status().code()), StatusCode::OK); + return reply; + } + rpc::GetTaskEventsReply SyncGetTaskEvents( const std::vector task_ids, const std::vector task_id_predicates, @@ -401,14 +427,29 @@ class GcsTaskManagerDroppedTaskAttemptsLimit : public GcsTaskManagerTest { } }; +TEST_F(GcsTaskManagerTest, TestHandleAddEventBasic) { + size_t num_task_events = 100; + auto task_ids = GenTaskIDs(num_task_events); + auto events = GenTaskEvents(task_ids, 0); + auto events_data = GenRayEventsData(events, {}); + auto reply = SyncAddEvents(events_data); + + // Assert on RPC reply. + EXPECT_EQ(StatusCode(reply.status().code()), StatusCode::OK); + + // Assert on actual data. + EXPECT_EQ(task_manager->task_event_storage_->GetTaskEvents().size(), num_task_events); + EXPECT_EQ(task_manager->GetTotalNumTaskEventsReported(), num_task_events); +} + TEST_F(GcsTaskManagerTest, TestHandleAddTaskEventBasic) { size_t num_task_events = 100; int32_t num_status_events_dropped = 10; int32_t num_profile_events_dropped = 10; auto task_ids = GenTaskIDs(num_task_events); auto events = GenTaskEvents(task_ids, 0); - auto events_data = Mocker::GenTaskEventsData( - events, num_profile_events_dropped, num_status_events_dropped); + auto events_data = + GenTaskEventsData(events, num_profile_events_dropped, num_status_events_dropped); auto reply = SyncAddTaskEventData(events_data); @@ -425,6 +466,50 @@ TEST_F(GcsTaskManagerTest, TestHandleAddTaskEventBasic) { } } +TEST_F(GcsTaskManagerTest, TestHandleAddEventsMultiJobGrouping) { + // Prepare events for two jobs in a single AddEvents request + auto task_ids_job0 = GenTaskIDs(3); + auto task_ids_job1 = GenTaskIDs(2); + + auto events_job0 
= GenTaskEvents(task_ids_job0, /*attempt_number*/ 0, /*job_id*/ 0); + auto events_job1 = GenTaskEvents(task_ids_job1, /*attempt_number*/ 0, /*job_id*/ 1); + + // Build RayEventsData including dropped attempts for each job + std::vector all_events; + all_events.insert(all_events.end(), events_job0.begin(), events_job0.end()); + all_events.insert(all_events.end(), events_job1.begin(), events_job1.end()); + + std::vector dropped_attempts; + dropped_attempts.emplace_back(GenTaskIDForJob(0), 0); + dropped_attempts.emplace_back(GenTaskIDForJob(1), 0); + + auto ray_events_data = GenRayEventsData(all_events, dropped_attempts); + + // Send AddEvents once; converter should group by job id and GCS should record all + auto reply = SyncAddEvents(ray_events_data); + EXPECT_EQ(StatusCode(reply.status().code()), StatusCode::OK); + + // Verify all events stored + EXPECT_EQ(task_manager->task_event_storage_->GetTaskEvents().size(), + task_ids_job0.size() + task_ids_job1.size()); + + // Verify per-job data loss counters populated from dropped attempts + { + auto reply_job0 = SyncGetTaskEvents(/* task_ids */ {}, JobID::FromInt(0)); + EXPECT_EQ(reply_job0.num_status_task_events_dropped(), 1); + } + { + auto reply_job1 = SyncGetTaskEvents(/* task_ids */ {}, JobID::FromInt(1)); + EXPECT_EQ(reply_job1.num_status_task_events_dropped(), 1); + } + + // Verify global counters reflect both drops + { + auto reply_all = SyncGetTaskEvents(/* task_ids */ {}); + EXPECT_EQ(reply_all.num_status_task_events_dropped(), 2); + } +} + TEST_F(GcsTaskManagerTest, TestMergeTaskEventsSameTaskAttempt) { size_t num_task_events = 20; // Same task id and attempt @@ -433,7 +518,7 @@ TEST_F(GcsTaskManagerTest, TestMergeTaskEventsSameTaskAttempt) { for (size_t i = 0; i < num_task_events; ++i) { auto profile_events = GenProfileEvents("event", i, i); auto events = GenTaskEvents(task_ids, attempt_number, 0, profile_events); - auto events_data = Mocker::GenTaskEventsData(events); + auto events_data = 
GenTaskEventsData(events); auto reply = SyncAddTaskEventData(events_data); EXPECT_EQ(StatusCode(reply.status().code()), StatusCode::OK); @@ -487,14 +572,14 @@ TEST_F(GcsTaskManagerTest, TestGetTaskEvents) { auto all_events = {events_with_profile, events_with_status, events_with_both}; for (auto &events : all_events) { - auto data = Mocker::GenTaskEventsData(events); + auto data = GenTaskEventsData(events); SyncAddTaskEventData(data); } } { // Add drop counter. - auto data = Mocker::GenTaskEventsData( + auto data = GenTaskEventsData( {}, num_profile_task_events_dropped, num_status_task_events_dropped); SyncAddTaskEventData(data); } @@ -506,7 +591,7 @@ TEST_F(GcsTaskManagerTest, TestGetTaskEvents) { std::vector expected_events = ConcatTaskEvents({events_with_status, events_with_profile, events_with_both}); - auto expected_data = Mocker::GenTaskEventsData(expected_events); + auto expected_data = GenTaskEventsData(expected_events); // Expect match events ExpectTaskEventsEq(expected_data.mutable_events_by_task(), reply.mutable_events_by_task()); @@ -528,7 +613,7 @@ TEST_F(GcsTaskManagerTest, TestGetTaskEventsWithLimit) { auto profile_events = GenProfileEvents("event", /*start*/ 1, /*end*/ 1); auto status_update = GenStateUpdate(); auto events = GenTaskEvents(task_ids, 0, 0, profile_events, status_update); - auto data = Mocker::GenTaskEventsData(events); + auto data = GenTaskEventsData(events); SyncAddTaskEventData(data); } @@ -576,7 +661,7 @@ TEST_F(GcsTaskManagerTest, TestGetTaskEventsByTaskIDs) { all_events.push_back(GenTaskEvents({task_id1}, attempt_num)); } auto events_task1 = ConcatTaskEvents(all_events); - events_data_task1 = Mocker::GenTaskEventsData(events_task1); + events_data_task1 = GenTaskEventsData(events_task1); SyncAddTaskEventData(events_data_task1); } @@ -588,7 +673,7 @@ TEST_F(GcsTaskManagerTest, TestGetTaskEventsByTaskIDs) { all_events.push_back(GenTaskEvents({task_id2}, attempt_num)); } auto events_task2 = ConcatTaskEvents(all_events); - 
events_data_task2 = Mocker::GenTaskEventsData(events_task2); + events_data_task2 = GenTaskEventsData(events_task2); SyncAddTaskEventData(events_data_task2); } @@ -600,7 +685,7 @@ TEST_F(GcsTaskManagerTest, TestGetTaskEventsByTaskIDs) { all_events.push_back(GenTaskEvents({task_id3}, attempt_num)); } auto events_task3 = ConcatTaskEvents(all_events); - events_data_task3 = Mocker::GenTaskEventsData(events_task3); + events_data_task3 = GenTaskEventsData(events_task3); SyncAddTaskEventData(events_data_task3); } @@ -698,7 +783,7 @@ TEST_F(GcsTaskManagerTest, TestGetTaskEventsByJobs) { absl::nullopt, absl::nullopt, task_info); - events_data_job1 = Mocker::GenTaskEventsData(events); + events_data_job1 = GenTaskEventsData(events); SyncAddTaskEventData(events_data_job1); } @@ -713,7 +798,7 @@ TEST_F(GcsTaskManagerTest, TestGetTaskEventsByJobs) { absl::nullopt, absl::nullopt, task_info); - events_data_job2 = Mocker::GenTaskEventsData(events); + events_data_job2 = GenTaskEventsData(events); SyncAddTaskEventData(events_data_job2); } @@ -728,7 +813,7 @@ TEST_F(GcsTaskManagerTest, TestGetTaskEventsByJobs) { absl::nullopt, absl::nullopt, task_info); - events_data_job3 = Mocker::GenTaskEventsData(events); + events_data_job3 = GenTaskEventsData(events); SyncAddTaskEventData(events_data_job3); } @@ -836,7 +921,7 @@ TEST_F(GcsTaskManagerTest, TestGetTaskEventsFilters) { absl::nullopt, absl::nullopt, task_info_actor_id); - event_data_actor_id_job1 = Mocker::GenTaskEventsData(events); + event_data_actor_id_job1 = GenTaskEventsData(events); SyncAddTaskEventData(event_data_actor_id_job1); } @@ -855,7 +940,7 @@ TEST_F(GcsTaskManagerTest, TestGetTaskEventsFilters) { absl::nullopt, absl::nullopt, task_info_name); - event_data_task_name_job1 = Mocker::GenTaskEventsData(events); + event_data_task_name_job1 = GenTaskEventsData(events); SyncAddTaskEventData(event_data_task_name_job1); } @@ -875,7 +960,7 @@ TEST_F(GcsTaskManagerTest, TestGetTaskEventsFilters) { 
GenStateUpdate({{rpc::TaskStatus::PENDING_NODE_ASSIGNMENT, 1}, {task_status, 5}}, WorkerID::Nil()), task_info); - event_data_task_state_job2 = Mocker::GenTaskEventsData(events); + event_data_task_state_job2 = GenTaskEventsData(events); SyncAddTaskEventData(event_data_task_state_job2); } @@ -1339,7 +1424,7 @@ TEST_F(GcsTaskManagerMemoryLimitedTest, TestIndexNoLeak) { GenProfileEvents("event", 1, 1), GenStateUpdate({}, worker_id), GenTaskInfo(job_id)); - auto events_data = Mocker::GenTaskEventsData(events); + auto events_data = GenTaskEventsData(events); SyncAddTaskEventData(events_data); } @@ -1360,7 +1445,7 @@ TEST_F(GcsTaskManagerMemoryLimitedTest, TestIndexNoLeak) { GenProfileEvents("event", 1, 1), GenStateUpdate(), GenTaskInfo(JobID::FromInt(job_id))); - auto events_data = Mocker::GenTaskEventsData(events); + auto events_data = GenTaskEventsData(events); SyncAddTaskEventData(events_data); } } @@ -1394,8 +1479,7 @@ TEST_F(GcsTaskManagerMemoryLimitedTest, TestLimitTaskEvents) { /* attempt_number */ 0, /* job_id */ 0, GenProfileEvents("event", 1, 1)); - auto events_data = - Mocker::GenTaskEventsData(events, num_profile_events_dropped_on_worker); + auto events_data = GenTaskEventsData(events, num_profile_events_dropped_on_worker); SyncAddTaskEventData(events_data); } { @@ -1405,9 +1489,9 @@ TEST_F(GcsTaskManagerMemoryLimitedTest, TestLimitTaskEvents) { /* job_id */ 0, /* profile_events */ absl::nullopt, GenStateUpdate()); - auto events_data = Mocker::GenTaskEventsData(events, - /*num_profile_task_events_dropped*/ 0, - num_status_events_dropped_on_worker); + auto events_data = GenTaskEventsData(events, + /*num_profile_task_events_dropped*/ 0, + num_status_events_dropped_on_worker); SyncAddTaskEventData(events_data); } @@ -1416,7 +1500,7 @@ TEST_F(GcsTaskManagerMemoryLimitedTest, TestLimitTaskEvents) { { // Add new task events to overwrite the existing ones. 
expected_events = GenTaskEvents(GenTaskIDs(num_batch2), 0); - auto events_data = Mocker::GenTaskEventsData(expected_events); + auto events_data = GenTaskEventsData(expected_events); SyncAddTaskEventData(events_data); } @@ -1457,7 +1541,7 @@ TEST_F(GcsTaskManagerTest, TestGetTaskEventsWithDriver) { /* status_update*/ absl::nullopt, GenTaskInfo( /* job_id */ JobID::FromInt(0), TaskID::Nil(), rpc::TaskType::DRIVER_TASK)); - auto events_data = Mocker::GenTaskEventsData(events); + auto events_data = GenTaskEventsData(events); SyncAddTaskEventData(events_data); } @@ -1498,7 +1582,7 @@ TEST_F(GcsTaskManagerMemoryLimitedTest, TestLimitReturnRecentTasksWhenGetAll) { /* job_id */ 0, /* profile event */ absl::nullopt, GenStateUpdate({{rpc::TaskStatus::RUNNING, 1}}, WorkerID::Nil())); - auto events_data = Mocker::GenTaskEventsData(events); + auto events_data = GenTaskEventsData(events); SyncAddTaskEventData(events_data); } @@ -1531,7 +1615,7 @@ TEST_F(GcsTaskManagerTest, TestTaskDataLossWorker) { EXPECT_EQ(reply.events_by_task_size(), 1); // Report it as data loss. - auto data = Mocker::GenTaskEventsDataLoss({{task_id, 0}}); + auto data = GenTaskEventsDataLoss({{task_id, 0}}); SyncAddTaskEventData(data); // The task attempt should be dropped. @@ -1554,7 +1638,7 @@ TEST_F(GcsTaskManagerTest, TestMultipleJobsDataLoss) { SyncAddTaskEvent({job_task1}, {{rpc::TaskStatus::RUNNING, 1}}, TaskID::Nil(), 1); // Make data loss happens on job 0. 
- auto data = Mocker::GenTaskEventsDataLoss({{job_task0, 0}}, 0); + auto data = GenTaskEventsDataLoss({{job_task0, 0}}, 0); SyncAddTaskEventData(data); // Job 0 has data loss @@ -1634,7 +1718,7 @@ TEST_F(GcsTaskManagerProfileEventsLimitTest, TestProfileEventsNoLeak) { /* attempt_number */ 0, /* job_id */ 0, GenProfileEvents("event", 1, 1)); - auto events_data = Mocker::GenTaskEventsData(events); + auto events_data = GenTaskEventsData(events); SyncAddTaskEventData(events_data); } diff --git a/src/ray/gcs/gcs_server/test/gcs_worker_manager_test.cc b/src/ray/gcs/tests/gcs_worker_manager_test.cc similarity index 94% rename from src/ray/gcs/gcs_server/test/gcs_worker_manager_test.cc rename to src/ray/gcs/tests/gcs_worker_manager_test.cc index 5607919bfb8e..6730cdd29283 100644 --- a/src/ray/gcs/gcs_server/test/gcs_worker_manager_test.cc +++ b/src/ray/gcs/tests/gcs_worker_manager_test.cc @@ -12,34 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/gcs_server/gcs_worker_manager.h" +#include "ray/gcs/gcs_worker_manager.h" #include #include #include -#include "ray/util/process.h" - -// clang-format off -#include "ray/common/asio/instrumented_io_context.h" -#include "ray/gcs/gcs_server/test/gcs_server_test_util.h" -#include "ray/gcs/test/gcs_test_util.h" #include "mock/ray/pubsub/publisher.h" -#include "src/ray/protobuf/gcs.pb.h" +#include "ray/common/asio/instrumented_io_context.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/store_client/in_memory_store_client.h" +#include "ray/gcs/store_client_kv.h" +#include "ray/util/process.h" #include "src/ray/protobuf/common.pb.h" -#include "ray/gcs/gcs_server/store_client_kv.h" -// clang-format on -using namespace ::testing; // NOLINT -using namespace ray::gcs; // NOLINT -using namespace ray; // NOLINT +#include "src/ray/protobuf/gcs.pb.h" + +using namespace ::testing; // NOLINT +using namespace ray::gcs; // NOLINT +using namespace ray::pubsub; // NOLINT +using namespace ray; // NOLINT class GcsWorkerManagerTest : public Test { public: GcsWorkerManagerTest() { - gcs_publisher_ = - std::make_shared(std::make_unique()); - gcs_table_storage_ = std::make_shared(); + gcs_publisher_ = std::make_shared( + std::make_unique()); + gcs_table_storage_ = + std::make_unique(std::make_unique()); } void SetUp() override { @@ -75,7 +75,7 @@ class GcsWorkerManagerTest : public Test { std::unique_ptr thread_io_service_; instrumented_io_context io_service_; std::shared_ptr gcs_table_storage_; - std::shared_ptr gcs_publisher_; + std::shared_ptr gcs_publisher_; std::shared_ptr worker_manager_; }; diff --git a/src/ray/gcs/gcs_server/test/in_memory_gcs_table_storage_test.cc b/src/ray/gcs/tests/in_memory_gcs_table_storage_test.cc similarity index 80% rename from src/ray/gcs/gcs_server/test/in_memory_gcs_table_storage_test.cc rename to src/ray/gcs/tests/in_memory_gcs_table_storage_test.cc index 9142d119b9bb..ac20883d2e85 100644 --- 
a/src/ray/gcs/gcs_server/test/in_memory_gcs_table_storage_test.cc +++ b/src/ray/gcs/tests/in_memory_gcs_table_storage_test.cc @@ -16,17 +16,18 @@ #include -#include "ray/common/test_util.h" -#include "ray/gcs/gcs_server/gcs_table_storage.h" -#include "ray/gcs/gcs_server/test/gcs_table_storage_test_base.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_table_storage.h" #include "ray/gcs/store_client/in_memory_store_client.h" +#include "ray/gcs/tests/gcs_table_storage_test_base.h" namespace ray { class InMemoryGcsTableStorageTest : public gcs::GcsTableStorageTestBase { public: void SetUp() override { - gcs_table_storage_ = std::make_shared(); + gcs_table_storage_ = std::make_shared( + std::make_unique()); } }; diff --git a/src/ray/gcs/gcs_server/test/redis_gcs_table_storage_test.cc b/src/ray/gcs/tests/redis_gcs_table_storage_test.cc similarity index 70% rename from src/ray/gcs/gcs_server/test/redis_gcs_table_storage_test.cc rename to src/ray/gcs/tests/redis_gcs_table_storage_test.cc index 568db9638f11..fd9ec84352f9 100644 --- a/src/ray/gcs/gcs_server/test/redis_gcs_table_storage_test.cc +++ b/src/ray/gcs/tests/redis_gcs_table_storage_test.cc @@ -15,10 +15,10 @@ #include #include "gtest/gtest.h" -#include "ray/common/test_util.h" -#include "ray/gcs/gcs_server/gcs_table_storage.h" -#include "ray/gcs/gcs_server/test/gcs_table_storage_test_base.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_table_storage.h" #include "ray/gcs/store_client/redis_store_client.h" +#include "ray/gcs/tests/gcs_table_storage_test_base.h" namespace ray { @@ -29,17 +29,13 @@ class RedisGcsTableStorageTest : public gcs::GcsTableStorageTestBase { static void TearDownTestCase() { TestSetupUtil::ShutDownRedisServers(); } void SetUp() override { - gcs::RedisClientOptions options("127.0.0.1", TEST_REDIS_SERVER_PORTS.front(), "", ""); - redis_client_ = std::make_shared(options); - RAY_CHECK_OK(redis_client_->Connect(*io_service_pool_->Get())); - - gcs_table_storage_ = 
std::make_shared(redis_client_); + auto &io_service = *io_service_pool_->Get(); + gcs::RedisClientOptions options{"127.0.0.1", TEST_REDIS_SERVER_PORTS.front()}; + gcs_table_storage_ = std::make_shared( + std::make_unique(io_service, options)); } - void TearDown() override { redis_client_->Disconnect(); } - - protected: - std::shared_ptr redis_client_; + void TearDown() override {} }; TEST_F(RedisGcsTableStorageTest, TestGcsTableApi) { TestGcsTableApi(); } diff --git a/src/ray/gcs/gcs_server/test/usage_stats_client_test.cc b/src/ray/gcs/tests/usage_stats_client_test.cc similarity index 90% rename from src/ray/gcs/gcs_server/test/usage_stats_client_test.cc rename to src/ray/gcs/tests/usage_stats_client_test.cc index fbe054f4bebc..15fcaa19674d 100644 --- a/src/ray/gcs/gcs_server/test/usage_stats_client_test.cc +++ b/src/ray/gcs/tests/usage_stats_client_test.cc @@ -12,15 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_server/usage_stats_client.h" +#include "ray/gcs/usage_stats_client.h" + +#include #include #include -#include "gtest/gtest.h" -#include "mock/ray/gcs/gcs_server/gcs_kv_manager.h" -#include "ray/gcs/gcs_server/gcs_kv_manager.h" -#include "ray/gcs/gcs_server/gcs_server.h" +#include "mock/ray/gcs/gcs_kv_manager.h" +#include "ray/common/asio/asio_util.h" +#include "ray/gcs/gcs_kv_manager.h" using namespace ray; // NOLINT diff --git a/src/ray/gcs/gcs_server/usage_stats_client.cc b/src/ray/gcs/usage_stats_client.cc similarity index 96% rename from src/ray/gcs/gcs_server/usage_stats_client.cc rename to src/ray/gcs/usage_stats_client.cc index 8f46eb6b4970..cdd1ae431496 100644 --- a/src/ray/gcs/gcs_server/usage_stats_client.cc +++ b/src/ray/gcs/usage_stats_client.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/gcs_server/usage_stats_client.h" +#include "ray/gcs/usage_stats_client.h" #include diff --git a/src/ray/gcs/gcs_server/usage_stats_client.h b/src/ray/gcs/usage_stats_client.h similarity index 97% rename from src/ray/gcs/gcs_server/usage_stats_client.h rename to src/ray/gcs/usage_stats_client.h index a79cb6bbc4e4..2ff37f70354a 100644 --- a/src/ray/gcs/gcs_server/usage_stats_client.h +++ b/src/ray/gcs/usage_stats_client.h @@ -17,7 +17,7 @@ #include #include -#include "ray/gcs/gcs_server/gcs_kv_manager.h" +#include "ray/gcs/gcs_kv_manager.h" #include "src/ray/protobuf/usage.pb.h" namespace ray { diff --git a/src/ray/gcs_client/BUILD.bazel b/src/ray/gcs_client/BUILD.bazel new file mode 100644 index 000000000000..4b416ebb7026 --- /dev/null +++ b/src/ray/gcs_client/BUILD.bazel @@ -0,0 +1,64 @@ +load("//bazel:ray.bzl", "ray_cc_library") + +ray_cc_library( + name = "gcs_client", + srcs = [ + "accessor.cc", + "gcs_client.cc", + ], + hdrs = [ + "accessor.h", + "gcs_client.h", + ], + deps = [ + ":rpc_client", + "//src/ray/common:asio", + "//src/ray/common:id", + "//src/ray/common:placement_group", + "//src/ray/common:protobuf_utils", + "//src/ray/gcs/store_client:redis_store_client", + "//src/ray/protobuf:usage_cc_proto", + "//src/ray/pubsub:gcs_subscriber", + "//src/ray/pubsub:subscriber", + "//src/ray/util:container_util", + "//src/ray/util:network_util", + "//src/ray/util:sequencer", + ], +) + +ray_cc_library( + name = "global_state_accessor_lib", + srcs = ["global_state_accessor.cc"], + hdrs = ["global_state_accessor.h"], + deps = [ + ":gcs_client", + "//src/ray/util:time", + ], +) + +ray_cc_library( + name = "gcs_python_callbacks", + hdrs = [ + "python_callbacks.h", + ], +) + +ray_cc_library( + name = "rpc_client", + hdrs = [ + "rpc_client.h", + ], + visibility = [ + ":__pkg__", + "//src/ray/pubsub:__pkg__", + ], + deps = [ + "//src/ray/common:ray_config", + "//src/ray/protobuf:autoscaler_cc_grpc", + "//src/ray/protobuf:gcs_service_cc_grpc", + 
"//src/ray/rpc:client_call", + "//src/ray/rpc:retryable_grpc_client", + "//src/ray/util:network_util", + "@com_google_absl//absl/container:btree", + ], +) diff --git a/src/ray/gcs/gcs_client/accessor.cc b/src/ray/gcs_client/accessor.cc similarity index 90% rename from src/ray/gcs/gcs_client/accessor.cc rename to src/ray/gcs_client/accessor.cc index 72440b02d387..b698a6b14bc4 100644 --- a/src/ray/gcs/gcs_client/accessor.cc +++ b/src/ray/gcs_client/accessor.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_client/accessor.h" +#include "ray/gcs_client/accessor.h" #include #include @@ -21,8 +21,7 @@ #include #include -#include "ray/common/common_protocol.h" -#include "ray/gcs/gcs_client/gcs_client.h" +#include "ray/gcs_client/gcs_client.h" #include "ray/util/container_util.h" namespace ray { @@ -43,7 +42,8 @@ void JobInfoAccessor::AsyncAdd(const std::shared_ptr &data_pt rpc::AddJobRequest request; request.mutable_data()->CopyFrom(*data_ptr); client_impl_->GetGcsRpcClient().AddJob( - request, [job_id, data_ptr, callback](const Status &status, rpc::AddJobReply &&) { + std::move(request), + [job_id, data_ptr, callback](const Status &status, rpc::AddJobReply &&) { if (callback) { callback(status); } @@ -58,7 +58,8 @@ void JobInfoAccessor::AsyncMarkFinished(const JobID &job_id, rpc::MarkJobFinishedRequest request; request.set_job_id(job_id.Binary()); client_impl_->GetGcsRpcClient().MarkJobFinished( - request, [job_id, callback](const Status &status, rpc::MarkJobFinishedReply &&) { + std::move(request), + [job_id, callback](const Status &status, rpc::MarkJobFinishedReply &&) { if (callback) { callback(status); } @@ -71,14 +72,15 @@ Status JobInfoAccessor::AsyncSubscribeAll( const SubscribeCallback &subscribe, const StatusCallback &done) { RAY_CHECK(subscribe != nullptr); - fetch_all_data_operation_ = [this, subscribe](const StatusCallback &done) { - auto callback = [subscribe, 
done](const Status &status, - std::vector &&job_info_list) { + fetch_all_data_operation_ = [this, subscribe](const StatusCallback &done_callback) { + auto callback = [subscribe, done_callback]( + const Status &status, + std::vector &&job_info_list) { for (auto &job_info : job_info_list) { subscribe(JobID::FromBinary(job_info.job_id()), std::move(job_info)); } - if (done) { - done(status); + if (done_callback) { + done_callback(status); } }; AsyncGetAll(/*job_or_submission_id=*/std::nullopt, @@ -87,8 +89,8 @@ Status JobInfoAccessor::AsyncSubscribeAll( callback, /*timeout_ms=*/-1); }; - subscribe_operation_ = [this, subscribe](const StatusCallback &done) { - return client_impl_->GetGcsSubscriber().SubscribeAllJobs(subscribe, done); + subscribe_operation_ = [this, subscribe](const StatusCallback &done_callback) { + return client_impl_->GetGcsSubscriber().SubscribeAllJobs(subscribe, done_callback); }; return subscribe_operation_( [this, done](const Status &status) { fetch_all_data_operation_(done); }); @@ -122,7 +124,7 @@ void JobInfoAccessor::AsyncGetAll(const std::optional &job_or_submi request.set_job_or_submission_id(job_or_submission_id.value()); } client_impl_->GetGcsRpcClient().GetAllJobInfo( - request, + std::move(request), [callback](const Status &status, rpc::GetAllJobInfoReply &&reply) { callback(status, VectorFromProtobuf(std::move(*reply.mutable_job_info_list()))); RAY_LOG(DEBUG) << "Finished getting all job info."; @@ -142,8 +144,8 @@ Status JobInfoAccessor::GetAll(const std::optional &job_or_submissi request.set_job_or_submission_id(job_or_submission_id.value()); } rpc::GetAllJobInfoReply reply; - RAY_RETURN_NOT_OK( - client_impl_->GetGcsRpcClient().SyncGetAllJobInfo(request, &reply, timeout_ms)); + RAY_RETURN_NOT_OK(client_impl_->GetGcsRpcClient().SyncGetAllJobInfo( + std::move(request), &reply, timeout_ms)); job_data_list = VectorFromProtobuf(std::move(*reply.mutable_job_info_list())); return Status::OK(); } @@ -152,7 +154,8 @@ void 
JobInfoAccessor::AsyncGetNextJobID(const ItemCallback &callback) { RAY_LOG(DEBUG) << "Getting next job id"; rpc::GetNextJobIDRequest request; client_impl_->GetGcsRpcClient().GetNextJobID( - request, [callback](const Status &status, rpc::GetNextJobIDReply &&reply) { + std::move(request), + [callback](const Status &status, rpc::GetNextJobIDReply &&reply) { RAY_CHECK_OK(status); auto job_id = JobID::FromInt(reply.job_id()); RAY_LOG(DEBUG) << "Finished getting next job id = " << job_id; @@ -169,7 +172,7 @@ void ActorInfoAccessor::AsyncGet( rpc::GetActorInfoRequest request; request.set_actor_id(actor_id.Binary()); client_impl_->GetGcsRpcClient().GetActorInfo( - request, + std::move(request), [actor_id, callback](const Status &status, rpc::GetActorInfoReply &&reply) { if (reply.has_actor_table_data()) { callback(status, reply.actor_table_data()); @@ -196,13 +199,18 @@ void ActorInfoAccessor::AsyncGetAllByFilter( request.mutable_filters()->set_job_id(job_id.value().Binary()); } if (actor_state_name) { - rpc::ActorTableData::ActorState actor_state = - StringToActorState(actor_state_name.value()); - request.mutable_filters()->set_state(actor_state); + static absl::flat_hash_map + actor_state_map = { + {"DEPENDENCIES_UNREADY", rpc::ActorTableData::DEPENDENCIES_UNREADY}, + {"PENDING_CREATION", rpc::ActorTableData::PENDING_CREATION}, + {"ALIVE", rpc::ActorTableData::ALIVE}, + {"RESTARTING", rpc::ActorTableData::RESTARTING}, + {"DEAD", rpc::ActorTableData::DEAD}}; + request.mutable_filters()->set_state(actor_state_map[*actor_state_name]); } client_impl_->GetGcsRpcClient().GetAllActorInfo( - request, + std::move(request), [callback](const Status &status, rpc::GetAllActorInfoReply &&reply) { callback(status, VectorFromProtobuf(std::move(*reply.mutable_actor_table_data()))); @@ -221,7 +229,7 @@ void ActorInfoAccessor::AsyncGetByName( request.set_name(name); request.set_ray_namespace(ray_namespace); client_impl_->GetGcsRpcClient().GetNamedActorInfo( - request, + std::move(request), 
[name, callback](const Status &status, rpc::GetNamedActorInfoReply &&reply) { if (reply.has_actor_table_data()) { callback(status, reply.actor_table_data()); @@ -243,10 +251,10 @@ Status ActorInfoAccessor::SyncGetByName(const std::string &name, request.set_name(name); request.set_ray_namespace(ray_namespace); auto status = client_impl_->GetGcsRpcClient().SyncGetNamedActorInfo( - request, &reply, GetGcsTimeoutMs()); + std::move(request), &reply, GetGcsTimeoutMs()); if (status.ok()) { - actor_table_data = reply.actor_table_data(); - task_spec = reply.task_spec(); + actor_table_data = std::move(*reply.mutable_actor_table_data()); + task_spec = std::move(*reply.mutable_task_spec()); } return status; } @@ -260,14 +268,15 @@ Status ActorInfoAccessor::SyncListNamedActors( request.set_ray_namespace(ray_namespace); rpc::ListNamedActorsReply reply; auto status = client_impl_->GetGcsRpcClient().SyncListNamedActors( - request, &reply, GetGcsTimeoutMs()); + std::move(request), &reply, GetGcsTimeoutMs()); if (!status.ok()) { return status; } actors.reserve(reply.named_actors_list_size()); - for (const auto &actor_info : + for (auto &actor_info : VectorFromProtobuf(std::move(*reply.mutable_named_actors_list()))) { - actors.emplace_back(actor_info.ray_namespace(), actor_info.name()); + actors.emplace_back(std::move(*actor_info.mutable_ray_namespace()), + std::move(*actor_info.mutable_name())); } return status; } @@ -282,7 +291,7 @@ void ActorInfoAccessor::AsyncRestartActorForLineageReconstruction( request.set_num_restarts_due_to_lineage_reconstruction( num_restarts_due_to_lineage_reconstruction); client_impl_->GetGcsRpcClient().RestartActorForLineageReconstruction( - request, + std::move(request), [callback](const Status &status, rpc::RestartActorForLineageReconstructionReply &&reply) { callback(status); @@ -313,7 +322,7 @@ void ActorInfoAccessor::AsyncRegisterActor(const ray::TaskSpecification &task_sp rpc::RegisterActorRequest request; 
request.mutable_task_spec()->CopyFrom(task_spec.GetMessage()); client_impl_->GetGcsRpcClient().RegisterActor( - request, + std::move(request), [callback](const Status &status, rpc::RegisterActorReply &&reply) { callback(ComputeGcsStatus(status, reply.status())); }, @@ -326,7 +335,7 @@ Status ActorInfoAccessor::SyncRegisterActor(const ray::TaskSpecification &task_s rpc::RegisterActorReply reply; request.mutable_task_spec()->CopyFrom(task_spec.GetMessage()); auto status = client_impl_->GetGcsRpcClient().SyncRegisterActor( - request, &reply, GetGcsTimeoutMs()); + std::move(request), &reply, GetGcsTimeoutMs()); return ComputeGcsStatus(status, reply.status()); } @@ -340,7 +349,7 @@ void ActorInfoAccessor::AsyncKillActor(const ActorID &actor_id, request.set_force_kill(force_kill); request.set_no_restart(no_restart); client_impl_->GetGcsRpcClient().KillActorViaGcs( - request, + std::move(request), [callback](const Status &status, rpc::KillActorViaGcsReply &&reply) { if (callback) { callback(status); @@ -356,7 +365,8 @@ void ActorInfoAccessor::AsyncCreateActor( rpc::CreateActorRequest request; request.mutable_task_spec()->CopyFrom(task_spec.GetMessage()); client_impl_->GetGcsRpcClient().CreateActor( - request, [callback](const Status &status, rpc::CreateActorReply &&reply) { + std::move(request), + [callback](const Status &status, rpc::CreateActorReply &&reply) { callback(status, std::move(reply)); }); } @@ -371,7 +381,7 @@ void ActorInfoAccessor::AsyncReportActorOutOfScope( request.set_num_restarts_due_to_lineage_reconstruction( num_restarts_due_to_lineage_reconstruction); client_impl_->GetGcsRpcClient().ReportActorOutOfScope( - request, + std::move(request), [callback](const Status &status, rpc::ReportActorOutOfScopeReply &&reply) { if (callback) { callback(status); @@ -438,9 +448,9 @@ void ActorInfoAccessor::AsyncResubscribe() { // server first, then fetch data from the GCS server. 
absl::MutexLock lock(&mutex_); for (auto &[actor_id, resubscribe_op] : resubscribe_operations_) { - RAY_CHECK_OK(resubscribe_op([this, actor_id = actor_id](const Status &status) { - absl::MutexLock lock(&mutex_); - auto fetch_data_operation = fetch_data_operations_[actor_id]; + RAY_CHECK_OK(resubscribe_op([this, id = actor_id](const Status &status) { + absl::MutexLock callback_lock(&mutex_); + auto fetch_data_operation = fetch_data_operations_[id]; // `fetch_data_operation` is called in the callback function of subscribe. // Before that, if the user calls `AsyncUnsubscribe` function, the corresponding // fetch function will be deleted, so we need to check if it's null. @@ -467,7 +477,7 @@ Status NodeInfoAccessor::RegisterSelf(const rpc::GcsNodeInfo &local_node_info, rpc::RegisterNodeRequest request; request.mutable_node_info()->CopyFrom(local_node_info); client_impl_->GetGcsRpcClient().RegisterNode( - request, + std::move(request), [this, node_id, local_node_info, callback](const Status &status, rpc::RegisterNodeReply &&reply) { if (status.ok()) { @@ -497,7 +507,7 @@ void NodeInfoAccessor::UnregisterSelf(const rpc::NodeDeathInfo &node_death_info, request.set_node_id(local_node_info_.node_id()); request.mutable_node_death_info()->CopyFrom(node_death_info); client_impl_->GetGcsRpcClient().UnregisterNode( - request, + std::move(request), [this, node_id, unregister_done_callback](const Status &status, rpc::UnregisterNodeReply &&reply) { if (status.ok()) { @@ -521,7 +531,8 @@ void NodeInfoAccessor::AsyncRegister(const rpc::GcsNodeInfo &node_info, rpc::RegisterNodeRequest request; request.mutable_node_info()->CopyFrom(node_info); client_impl_->GetGcsRpcClient().RegisterNode( - request, [node_id, callback](const Status &status, rpc::RegisterNodeReply &&reply) { + std::move(request), + [node_id, callback](const Status &status, rpc::RegisterNodeReply &&reply) { if (callback) { callback(status); } @@ -556,7 +567,7 @@ void NodeInfoAccessor::AsyncCheckAlive(const std::vector 
&node_ids, } size_t num_raylets = node_ids.size(); client_impl_->GetGcsRpcClient().CheckAlive( - request, + std::move(request), [num_raylets, callback](const Status &status, rpc::CheckAliveReply &&reply) { if (status.ok()) { RAY_CHECK_EQ(static_cast(reply.raylet_alive().size()), num_raylets); @@ -583,8 +594,8 @@ Status NodeInfoAccessor::DrainNodes(const std::vector &node_ids, auto draining_request = request.add_drain_node_data(); draining_request->set_node_id(node_id.Binary()); } - RAY_RETURN_NOT_OK( - client_impl_->GetGcsRpcClient().SyncDrainNode(request, &reply, timeout_ms)); + RAY_RETURN_NOT_OK(client_impl_->GetGcsRpcClient().SyncDrainNode( + std::move(request), &reply, timeout_ms)); drained_node_ids.clear(); for (const auto &s : reply.drain_node_status()) { drained_node_ids.push_back(s.node_id()); @@ -601,7 +612,7 @@ void NodeInfoAccessor::AsyncGetAll(const MultiItemCallback &ca request.add_node_selectors()->set_node_id(node_id.Binary()); } client_impl_->GetGcsRpcClient().GetAllNodeInfo( - request, + std::move(request), [callback](const Status &status, rpc::GetAllNodeInfoReply &&reply) { std::vector result; result.reserve((reply.node_info_list_size())); @@ -633,15 +644,15 @@ void NodeInfoAccessor::AsyncSubscribeToNodeChange( node_change_callback_ = std::move(subscribe); RAY_CHECK(node_change_callback_ != nullptr); - fetch_node_data_operation_ = [this](const StatusCallback &done) { + fetch_node_data_operation_ = [this](const StatusCallback &done_callback) { AsyncGetAll( - [this, done](const Status &status, - std::vector &&node_info_list) { + [this, done_callback](const Status &status, + std::vector &&node_info_list) { for (auto &node_info : node_info_list) { HandleNotification(std::move(node_info)); } - if (done) { - done(status); + if (done_callback) { + done_callback(status); } }, /*timeout_ms=*/-1); @@ -683,8 +694,8 @@ StatusOr> NodeInfoAccessor::GetAllNoCache( *request.add_node_selectors() = std::move(node_selector.value()); } rpc::GetAllNodeInfoReply reply; 
- RAY_RETURN_NOT_OK( - client_impl_->GetGcsRpcClient().SyncGetAllNodeInfo(request, &reply, timeout_ms)); + RAY_RETURN_NOT_OK(client_impl_->GetGcsRpcClient().SyncGetAllNodeInfo( + std::move(request), &reply, timeout_ms)); return VectorFromProtobuf(std::move(*reply.mutable_node_info_list())); } @@ -783,7 +794,7 @@ void NodeResourceInfoAccessor::AsyncGetAllAvailableResources( const MultiItemCallback &callback) { rpc::GetAllAvailableResourcesRequest request; client_impl_->GetGcsRpcClient().GetAllAvailableResources( - request, + std::move(request), [callback](const Status &status, rpc::GetAllAvailableResourcesReply &&reply) { callback(status, VectorFromProtobuf(std::move(*reply.mutable_resources_list()))); RAY_LOG(DEBUG) << "Finished getting available resources of all nodes, status = " @@ -795,7 +806,8 @@ void NodeResourceInfoAccessor::AsyncGetAllTotalResources( const MultiItemCallback &callback) { rpc::GetAllTotalResourcesRequest request; client_impl_->GetGcsRpcClient().GetAllTotalResources( - request, [callback](const Status &status, rpc::GetAllTotalResourcesReply &&reply) { + std::move(request), + [callback](const Status &status, rpc::GetAllTotalResourcesReply &&reply) { callback(status, VectorFromProtobuf(std::move(*reply.mutable_resources_list()))); RAY_LOG(DEBUG) << "Finished getting total resources of all nodes, status = " << status; @@ -806,7 +818,8 @@ void NodeResourceInfoAccessor::AsyncGetDrainingNodes( const ItemCallback> &callback) { rpc::GetDrainingNodesRequest request; client_impl_->GetGcsRpcClient().GetDrainingNodes( - request, [callback](const Status &status, rpc::GetDrainingNodesReply &&reply) { + std::move(request), + [callback](const Status &status, rpc::GetDrainingNodesReply &&reply) { RAY_CHECK_OK(status); std::unordered_map draining_nodes; for (const auto &draining_node : reply.draining_nodes()) { @@ -831,7 +844,8 @@ void NodeResourceInfoAccessor::AsyncGetAllResourceUsage( const ItemCallback &callback) { rpc::GetAllResourceUsageRequest request; 
client_impl_->GetGcsRpcClient().GetAllResourceUsage( - request, [callback](const Status &status, rpc::GetAllResourceUsageReply &&reply) { + std::move(request), + [callback](const Status &status, rpc::GetAllResourceUsageReply &&reply) { callback(std::move(*reply.mutable_resource_usage_data())); RAY_LOG(DEBUG) << "Finished getting resource usage of all nodes, status = " << status; @@ -842,7 +856,7 @@ Status NodeResourceInfoAccessor::GetAllResourceUsage( int64_t timeout_ms, rpc::GetAllResourceUsageReply &reply) { rpc::GetAllResourceUsageRequest request; return client_impl_->GetGcsRpcClient().SyncGetAllResourceUsage( - request, &reply, timeout_ms); + std::move(request), &reply, timeout_ms); } void TaskInfoAccessor::AsyncAddTaskEventData(std::unique_ptr data_ptr, @@ -851,7 +865,8 @@ void TaskInfoAccessor::AsyncAddTaskEventData(std::unique_ptr // Prevent copy here request.mutable_data()->Swap(data_ptr.get()); client_impl_->GetGcsRpcClient().AddTaskEventData( - request, [callback](const Status &status, rpc::AddTaskEventDataReply &&reply) { + std::move(request), + [callback](const Status &status, rpc::AddTaskEventDataReply &&reply) { if (callback) { callback(status); } @@ -865,7 +880,8 @@ void TaskInfoAccessor::AsyncGetTaskEvents( RAY_CHECK(callback); rpc::GetTaskEventsRequest request; client_impl_->GetGcsRpcClient().GetTaskEvents( - request, [callback](const Status &status, rpc::GetTaskEventsReply &&reply) { + std::move(request), + [callback](const Status &status, rpc::GetTaskEventsReply &&reply) { callback(status, VectorFromProtobuf(std::move(*reply.mutable_events_by_task()))); }); } @@ -873,19 +889,14 @@ void TaskInfoAccessor::AsyncGetTaskEvents( ErrorInfoAccessor::ErrorInfoAccessor(GcsClient *client_impl) : client_impl_(client_impl) {} -void ErrorInfoAccessor::AsyncReportJobError( - const std::shared_ptr &data_ptr, - const StatusCallback &callback) { - auto job_id = JobID::FromBinary(data_ptr->job_id()); +void ErrorInfoAccessor::AsyncReportJobError(rpc::ErrorTableData 
data) { + auto job_id = JobID::FromBinary(data.job_id()); RAY_LOG(DEBUG) << "Publishing job error, job id = " << job_id; rpc::ReportJobErrorRequest request; - request.mutable_job_error()->CopyFrom(*data_ptr); + *request.mutable_job_error() = std::move(data); client_impl_->GetGcsRpcClient().ReportJobError( - request, - [job_id, callback](const Status &status, rpc::ReportJobErrorReply &&reply) { - if (callback) { - callback(status); - } + std::move(request), + [job_id](const Status &status, rpc::ReportJobErrorReply &&reply) { RAY_LOG(DEBUG) << "Finished publishing job error, job id = " << job_id; }); } @@ -896,8 +907,9 @@ WorkerInfoAccessor::WorkerInfoAccessor(GcsClient *client_impl) Status WorkerInfoAccessor::AsyncSubscribeToWorkerFailures( const ItemCallback &subscribe, const StatusCallback &done) { RAY_CHECK(subscribe != nullptr); - subscribe_operation_ = [this, subscribe](const StatusCallback &done) { - return client_impl_->GetGcsSubscriber().SubscribeAllWorkerFailures(subscribe, done); + subscribe_operation_ = [this, subscribe](const StatusCallback &done_callback) { + return client_impl_->GetGcsSubscriber().SubscribeAllWorkerFailures(subscribe, + done_callback); }; return subscribe_operation_(done); } @@ -920,7 +932,7 @@ void WorkerInfoAccessor::AsyncReportWorkerFailure( rpc::ReportWorkerFailureRequest request; request.mutable_worker_failure()->CopyFrom(*data_ptr); client_impl_->GetGcsRpcClient().ReportWorkerFailure( - request, + std::move(request), [worker_address, callback](const Status &status, rpc::ReportWorkerFailureReply &&reply) { if (callback) { @@ -938,7 +950,7 @@ void WorkerInfoAccessor::AsyncGet( rpc::GetWorkerInfoRequest request; request.set_worker_id(worker_id.Binary()); client_impl_->GetGcsRpcClient().GetWorkerInfo( - request, + std::move(request), [worker_id, callback](const Status &status, rpc::GetWorkerInfoReply &&reply) { if (reply.has_worker_table_data()) { callback(status, reply.worker_table_data()); @@ -954,7 +966,8 @@ void 
WorkerInfoAccessor::AsyncGetAll( RAY_LOG(DEBUG) << "Getting all worker info."; rpc::GetAllWorkerInfoRequest request; client_impl_->GetGcsRpcClient().GetAllWorkerInfo( - request, [callback](const Status &status, rpc::GetAllWorkerInfoReply &&reply) { + std::move(request), + [callback](const Status &status, rpc::GetAllWorkerInfoReply &&reply) { callback(status, VectorFromProtobuf(std::move(*reply.mutable_worker_table_data()))); RAY_LOG(DEBUG) << "Finished getting all worker info, status = " << status; @@ -966,7 +979,8 @@ void WorkerInfoAccessor::AsyncAdd(const std::shared_ptr &d rpc::AddWorkerInfoRequest request; request.mutable_worker_data()->CopyFrom(*data_ptr); client_impl_->GetGcsRpcClient().AddWorkerInfo( - request, [callback](const Status &status, rpc::AddWorkerInfoReply &&reply) { + std::move(request), + [callback](const Status &status, rpc::AddWorkerInfoReply &&reply) { if (callback) { callback(status); } @@ -982,7 +996,7 @@ void WorkerInfoAccessor::AsyncUpdateDebuggerPort(const WorkerID &worker_id, RAY_LOG(DEBUG) << "Updating the worker debugger port, worker id = " << worker_id << ", port = " << debugger_port << "."; client_impl_->GetGcsRpcClient().UpdateWorkerDebuggerPort( - request, + std::move(request), [callback](const Status &status, rpc::UpdateWorkerDebuggerPortReply &&reply) { if (callback) { callback(status); @@ -1000,7 +1014,7 @@ void WorkerInfoAccessor::AsyncUpdateWorkerNumPausedThreads( RAY_LOG(DEBUG).WithField(worker_id) << "Update the num paused threads by delta = " << num_paused_threads_delta << "."; client_impl_->GetGcsRpcClient().UpdateWorkerNumPausedThreads( - request, + std::move(request), [callback](const Status &status, rpc::UpdateWorkerNumPausedThreadsReply &&reply) { if (callback) { callback(status); @@ -1017,7 +1031,7 @@ Status PlacementGroupInfoAccessor::SyncCreatePlacementGroup( rpc::CreatePlacementGroupReply reply; request.mutable_placement_group_spec()->CopyFrom(placement_group_spec.GetMessage()); auto status = 
client_impl_->GetGcsRpcClient().SyncCreatePlacementGroup( - request, &reply, GetGcsTimeoutMs()); + std::move(request), &reply, GetGcsTimeoutMs()); if (status.ok()) { RAY_LOG(DEBUG).WithField(placement_group_spec.PlacementGroupId()) << "Finished registering placement group."; @@ -1034,7 +1048,7 @@ Status PlacementGroupInfoAccessor::SyncRemovePlacementGroup( rpc::RemovePlacementGroupReply reply; request.set_placement_group_id(placement_group_id.Binary()); auto status = client_impl_->GetGcsRpcClient().SyncRemovePlacementGroup( - request, &reply, GetGcsTimeoutMs()); + std::move(request), &reply, GetGcsTimeoutMs()); return status; } @@ -1045,7 +1059,7 @@ void PlacementGroupInfoAccessor::AsyncGet( rpc::GetPlacementGroupRequest request; request.set_placement_group_id(placement_group_id.Binary()); client_impl_->GetGcsRpcClient().GetPlacementGroup( - request, + std::move(request), [placement_group_id, callback](const Status &status, rpc::GetPlacementGroupReply &&reply) { if (reply.has_placement_group_table_data()) { @@ -1068,7 +1082,7 @@ void PlacementGroupInfoAccessor::AsyncGetByName( request.set_name(name); request.set_ray_namespace(ray_namespace); client_impl_->GetGcsRpcClient().GetNamedPlacementGroup( - request, + std::move(request), [name, callback](const Status &status, rpc::GetNamedPlacementGroupReply &&reply) { if (reply.has_placement_group_table_data()) { callback(status, reply.placement_group_table_data()); @@ -1086,7 +1100,8 @@ void PlacementGroupInfoAccessor::AsyncGetAll( RAY_LOG(DEBUG) << "Getting all placement group info."; rpc::GetAllPlacementGroupRequest request; client_impl_->GetGcsRpcClient().GetAllPlacementGroup( - request, [callback](const Status &status, rpc::GetAllPlacementGroupReply &&reply) { + std::move(request), + [callback](const Status &status, rpc::GetAllPlacementGroupReply &&reply) { callback( status, VectorFromProtobuf(std::move(*reply.mutable_placement_group_table_data()))); @@ -1101,7 +1116,9 @@ Status 
PlacementGroupInfoAccessor::SyncWaitUntilReady( rpc::WaitPlacementGroupUntilReadyReply reply; request.set_placement_group_id(placement_group_id.Binary()); auto status = client_impl_->GetGcsRpcClient().SyncWaitPlacementGroupUntilReady( - request, &reply, absl::ToInt64Milliseconds(absl::Seconds(timeout_seconds))); + std::move(request), + &reply, + absl::ToInt64Milliseconds(absl::Seconds(timeout_seconds))); RAY_LOG(DEBUG).WithField(placement_group_id) << "Finished waiting placement group until ready"; return status; @@ -1119,7 +1136,7 @@ void InternalKVAccessor::AsyncInternalKVGet( req.set_key(key); req.set_namespace_(ns); client_impl_->GetGcsRpcClient().InternalKVGet( - req, + std::move(req), [callback](const Status &status, rpc::InternalKVGetReply &&reply) { if (reply.status().code() == static_cast(StatusCode::NotFound)) { callback(status, std::nullopt); @@ -1141,14 +1158,14 @@ void InternalKVAccessor::AsyncInternalKVMultiGet( } req.set_namespace_(ns); client_impl_->GetGcsRpcClient().InternalKVMultiGet( - req, + std::move(req), [callback](const Status &status, rpc::InternalKVMultiGetReply &&reply) { std::unordered_map map; if (!status.ok()) { callback(status, map); } else { // TODO(ryw): reply.status() is not examined. It's never populated in - // src/ray/gcs/gcs_server/gcs_kv_manager.cc either anyway so it's ok for now. + // src/ray/gcs/gcs_kv_manager.cc either anyway so it's ok for now. // Investigate if we wanna remove that field. 
for (const auto &entry : reply.results()) { map[entry.key()] = entry.value(); @@ -1171,7 +1188,7 @@ void InternalKVAccessor::AsyncInternalKVPut(const std::string &ns, req.set_value(value); req.set_overwrite(overwrite); client_impl_->GetGcsRpcClient().InternalKVPut( - req, + std::move(req), [callback](const Status &status, rpc::InternalKVPutReply &&reply) { callback(status, reply.added()); }, @@ -1187,7 +1204,7 @@ void InternalKVAccessor::AsyncInternalKVExists( req.set_namespace_(ns); req.set_key(key); client_impl_->GetGcsRpcClient().InternalKVExists( - req, + std::move(req), [callback](const Status &status, rpc::InternalKVExistsReply &&reply) { callback(status, reply.exists()); }, @@ -1204,7 +1221,7 @@ void InternalKVAccessor::AsyncInternalKVDel(const std::string &ns, req.set_key(key); req.set_del_by_prefix(del_by_prefix); client_impl_->GetGcsRpcClient().InternalKVDel( - req, + std::move(req), [callback](const Status &status, rpc::InternalKVDelReply &&reply) { callback(status, reply.deleted_num()); }, @@ -1220,7 +1237,7 @@ void InternalKVAccessor::AsyncInternalKVKeys( req.set_namespace_(ns); req.set_prefix(prefix); client_impl_->GetGcsRpcClient().InternalKVKeys( - req, + std::move(req), [callback](const Status &status, rpc::InternalKVKeysReply &&reply) { if (!status.ok()) { callback(status, std::nullopt); @@ -1352,7 +1369,8 @@ void InternalKVAccessor::AsyncGetInternalConfig( const OptionalItemCallback &callback) { rpc::GetInternalConfigRequest request; client_impl_->GetGcsRpcClient().GetInternalConfig( - request, [callback](const Status &status, rpc::GetInternalConfigReply &&reply) { + std::move(request), + [callback](const Status &status, rpc::GetInternalConfigReply &&reply) { if (status.ok()) { RAY_LOG(DEBUG) << "Fetched internal config: " << reply.config(); } else { @@ -1372,8 +1390,8 @@ Status RuntimeEnvAccessor::PinRuntimeEnvUri(const std::string &uri, request.set_uri(uri); request.set_expiration_s(expiration_s); rpc::PinRuntimeEnvURIReply reply; - auto status 
= - client_impl_->GetGcsRpcClient().SyncPinRuntimeEnvURI(request, &reply, timeout_ms); + auto status = client_impl_->GetGcsRpcClient().SyncPinRuntimeEnvURI( + std::move(request), &reply, timeout_ms); return status; } @@ -1400,7 +1418,7 @@ Status AutoscalerStateAccessor::RequestClusterResourceConstraint( } return client_impl_->GetGcsRpcClient().SyncRequestClusterResourceConstraint( - request, &reply, timeout_ms); + std::move(request), &reply, timeout_ms); } Status AutoscalerStateAccessor::GetClusterResourceState(int64_t timeout_ms, @@ -1409,7 +1427,7 @@ Status AutoscalerStateAccessor::GetClusterResourceState(int64_t timeout_ms, rpc::autoscaler::GetClusterResourceStateReply reply; RAY_RETURN_NOT_OK(client_impl_->GetGcsRpcClient().SyncGetClusterResourceState( - request, &reply, timeout_ms)); + std::move(request), &reply, timeout_ms)); if (!reply.SerializeToString(&serialized_reply)) { return Status::IOError("Failed to serialize GetClusterResourceState"); @@ -1422,8 +1440,8 @@ Status AutoscalerStateAccessor::GetClusterStatus(int64_t timeout_ms, rpc::autoscaler::GetClusterStatusRequest request; rpc::autoscaler::GetClusterStatusReply reply; - RAY_RETURN_NOT_OK( - client_impl_->GetGcsRpcClient().SyncGetClusterStatus(request, &reply, timeout_ms)); + RAY_RETURN_NOT_OK(client_impl_->GetGcsRpcClient().SyncGetClusterStatus( + std::move(request), &reply, timeout_ms)); if (!reply.SerializeToString(&serialized_reply)) { return Status::IOError("Failed to serialize GetClusterStatusReply"); @@ -1435,9 +1453,8 @@ void AutoscalerStateAccessor::AsyncGetClusterStatus( int64_t timeout_ms, const OptionalItemCallback &callback) { rpc::autoscaler::GetClusterStatusRequest request; - rpc::autoscaler::GetClusterStatusRequest reply; client_impl_->GetGcsRpcClient().GetClusterStatus( - request, + std::move(request), [callback](const Status &status, rpc::autoscaler::GetClusterStatusReply &&reply) { if (!status.ok()) { callback(status, std::nullopt); @@ -1457,7 +1474,7 @@ Status 
AutoscalerStateAccessor::ReportAutoscalingState( return Status::IOError("Failed to parse ReportAutoscalingState"); } return client_impl_->GetGcsRpcClient().SyncReportAutoscalingState( - request, &reply, timeout_ms); + std::move(request), &reply, timeout_ms); } Status AutoscalerStateAccessor::ReportClusterConfig( @@ -1469,7 +1486,7 @@ Status AutoscalerStateAccessor::ReportClusterConfig( return Status::IOError("Failed to parse ClusterConfig"); } return client_impl_->GetGcsRpcClient().SyncReportClusterConfig( - request, &reply, timeout_ms); + std::move(request), &reply, timeout_ms); } Status AutoscalerStateAccessor::DrainNode(const std::string &node_id, @@ -1487,8 +1504,8 @@ Status AutoscalerStateAccessor::DrainNode(const std::string &node_id, rpc::autoscaler::DrainNodeReply reply; - RAY_RETURN_NOT_OK( - client_impl_->GetGcsRpcClient().SyncDrainNode(request, &reply, timeout_ms)); + RAY_RETURN_NOT_OK(client_impl_->GetGcsRpcClient().SyncDrainNode( + std::move(request), &reply, timeout_ms)); is_accepted = reply.is_accepted(); if (!is_accepted) { @@ -1509,7 +1526,8 @@ Status PublisherAccessor::PublishError(std::string key_id, pub_message->set_key_id(std::move(key_id)); *(pub_message->mutable_error_info_message()) = std::move(data); rpc::GcsPublishReply reply; - return client_impl_->GetGcsRpcClient().SyncGcsPublish(request, &reply, timeout_ms); + return client_impl_->GetGcsRpcClient().SyncGcsPublish( + std::move(request), &reply, timeout_ms); } Status PublisherAccessor::PublishLogs(std::string key_id, @@ -1521,7 +1539,8 @@ Status PublisherAccessor::PublishLogs(std::string key_id, pub_message->set_key_id(std::move(key_id)); *(pub_message->mutable_log_batch_message()) = std::move(data); rpc::GcsPublishReply reply; - return client_impl_->GetGcsRpcClient().SyncGcsPublish(request, &reply, timeout_ms); + return client_impl_->GetGcsRpcClient().SyncGcsPublish( + std::move(request), &reply, timeout_ms); } void PublisherAccessor::AsyncPublishNodeResourceUsage( @@ -1535,7 +1554,7 @@ 
void PublisherAccessor::AsyncPublishNodeResourceUsage( pub_message->mutable_node_resource_usage_message()->set_json( std::move(node_resource_usage_json)); client_impl_->GetGcsRpcClient().GcsPublish( - request, + std::move(request), [done](const Status &status, rpc::GcsPublishReply &&reply) { done(status); }); } diff --git a/src/ray/gcs/gcs_client/accessor.h b/src/ray/gcs_client/accessor.h similarity index 99% rename from src/ray/gcs/gcs_client/accessor.h rename to src/ray/gcs_client/accessor.h index cd229ad63ce3..973ca76e8824 100644 --- a/src/ray/gcs/gcs_client/accessor.h +++ b/src/ray/gcs_client/accessor.h @@ -20,11 +20,11 @@ #include #include "absl/types/optional.h" +#include "ray/common/gcs_callbacks.h" #include "ray/common/id.h" #include "ray/common/placement_group.h" #include "ray/common/status_or.h" #include "ray/common/task/task_spec.h" -#include "ray/gcs/callback.h" #include "ray/rpc/client_call.h" #include "ray/util/sequencer.h" #include "src/ray/protobuf/autoscaler.pb.h" @@ -540,10 +540,8 @@ class ErrorInfoAccessor { /// duplicate messages currently cause failures (the GCS doesn't allow it). A /// natural way to do this is to have finer-grained time stamps. /// - /// \param data_ptr The error message that will be reported to GCS. - /// \param callback Callback that will be called when report is complete. - virtual void AsyncReportJobError(const std::shared_ptr &data_ptr, - const StatusCallback &callback); + /// \param data The error message that will be reported to GCS. 
+ virtual void AsyncReportJobError(rpc::ErrorTableData data); private: GcsClient *client_impl_; diff --git a/src/ray/gcs/gcs_client/gcs_client.cc b/src/ray/gcs_client/gcs_client.cc similarity index 95% rename from src/ray/gcs/gcs_client/gcs_client.cc rename to src/ray/gcs_client/gcs_client.cc index a26fe77180ca..7d1d9e9bf6f3 100644 --- a/src/ray/gcs/gcs_client/gcs_client.cc +++ b/src/ray/gcs_client/gcs_client.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_client/gcs_client.h" +#include "ray/gcs_client/gcs_client.h" #include #include @@ -24,7 +24,7 @@ #include "ray/common/asio/asio_util.h" #include "ray/common/ray_config.h" -#include "ray/gcs/gcs_client/accessor.h" +#include "ray/gcs_client/accessor.h" #include "ray/pubsub/subscriber.h" #include "ray/util/network_util.h" @@ -38,8 +38,6 @@ class GcsSubscriberClient final : public pubsub::SubscriberClientInterface { explicit GcsSubscriberClient(const std::shared_ptr &rpc_client) : rpc_client_(rpc_client) {} - ~GcsSubscriberClient() final = default; - void PubsubLongPolling( const rpc::PubsubLongPollingRequest &request, const rpc::ClientCallback &callback) final; @@ -60,7 +58,8 @@ void GcsSubscriberClient::PubsubLongPolling( req.set_max_processed_sequence_id(request.max_processed_sequence_id()); req.set_publisher_id(request.publisher_id()); rpc_client_->GcsSubscriberPoll( - req, [callback](const Status &status, rpc::GcsSubscriberPollReply &&poll_reply) { + std::move(req), + [callback](const Status &status, rpc::GcsSubscriberPollReply &&poll_reply) { rpc::PubsubLongPollingReply reply; reply.mutable_pub_messages()->Swap(poll_reply.mutable_pub_messages()); *reply.mutable_publisher_id() = std::move(*poll_reply.mutable_publisher_id()); @@ -75,7 +74,7 @@ void GcsSubscriberClient::PubsubCommandBatch( req.set_subscriber_id(request.subscriber_id()); *req.mutable_commands() = request.commands(); 
rpc_client_->GcsSubscriberCommandBatch( - req, + std::move(req), [callback](const Status &status, rpc::GcsSubscriberCommandBatchReply &&batch_reply) { rpc::PubsubCommandBatchReply reply; @@ -147,7 +146,8 @@ Status GcsClient::Connect(instrumented_io_context &io_service, int64_t timeout_m /*callback_service*/ &io_service); // Init GCS subscriber instance. - gcs_subscriber_ = std::make_unique(gcs_address, std::move(subscriber)); + gcs_subscriber_ = + std::make_unique(gcs_address, std::move(subscriber)); job_accessor_ = std::make_unique(this); actor_accessor_ = std::make_unique(this); @@ -179,7 +179,7 @@ Status GcsClient::FetchClusterId(int64_t timeout_ms) { rpc::GetClusterIdReply reply; RAY_LOG(DEBUG) << "Cluster ID is nil, getting cluster ID from GCS server."; - Status s = gcs_rpc_client_->SyncGetClusterId(request, &reply, timeout_ms); + Status s = gcs_rpc_client_->SyncGetClusterId(std::move(request), &reply, timeout_ms); if (!s.ok()) { RAY_LOG(WARNING) << "Failed to get cluster ID from GCS server: " << s; gcs_rpc_client_.reset(); diff --git a/src/ray/gcs/gcs_client/gcs_client.h b/src/ray/gcs_client/gcs_client.h similarity index 96% rename from src/ray/gcs/gcs_client/gcs_client.h rename to src/ray/gcs_client/gcs_client.h index eb088f3f7564..b13eb5292e5d 100644 --- a/src/ray/gcs/gcs_client/gcs_client.h +++ b/src/ray/gcs_client/gcs_client.h @@ -28,9 +28,9 @@ #include "ray/common/asio/periodical_runner.h" #include "ray/common/id.h" #include "ray/common/status.h" -#include "ray/gcs/gcs_client/accessor.h" -#include "ray/gcs/pubsub/gcs_pub_sub.h" -#include "ray/rpc/gcs/gcs_rpc_client.h" +#include "ray/gcs_client/accessor.h" +#include "ray/gcs_client/rpc_client.h" +#include "ray/pubsub/gcs_subscriber.h" #include "ray/util/logging.h" #include "ray/util/network_util.h" #include "src/ray/protobuf/autoscaler.grpc.pb.h" @@ -87,7 +87,7 @@ class GcsClientOptions { std::string gcs_address_; int gcs_port_ = 0; ClusterID cluster_id_; - bool should_fetch_cluster_id_; + bool 
should_fetch_cluster_id_ = false; }; /// \class GcsClient @@ -223,7 +223,7 @@ class RAY_EXPORT GcsClient : public std::enable_shared_from_this { /// This function is thread safe. virtual InternalKVAccessor &InternalKV() { return *internal_kv_accessor_; } - virtual GcsSubscriber &GetGcsSubscriber() { return *gcs_subscriber_; } + virtual pubsub::GcsSubscriber &GetGcsSubscriber() { return *gcs_subscriber_; } virtual rpc::GcsRpcClient &GetGcsRpcClient() { return *gcs_rpc_client_; } @@ -250,7 +250,7 @@ class RAY_EXPORT GcsClient : public std::enable_shared_from_this { const UniqueID gcs_client_id_ = UniqueID::FromRandom(); - std::unique_ptr gcs_subscriber_; + std::unique_ptr gcs_subscriber_; // Gcs rpc client std::shared_ptr gcs_rpc_client_; diff --git a/src/ray/gcs/gcs_client/global_state_accessor.cc b/src/ray/gcs_client/global_state_accessor.cc similarity index 99% rename from src/ray/gcs/gcs_client/global_state_accessor.cc rename to src/ray/gcs_client/global_state_accessor.cc index f52e24f05967..4e80d9496e5e 100644 --- a/src/ray/gcs/gcs_client/global_state_accessor.cc +++ b/src/ray/gcs_client/global_state_accessor.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_client/global_state_accessor.h" +#include "ray/gcs_client/global_state_accessor.h" #include #include @@ -23,6 +23,7 @@ #include #include "ray/common/asio/instrumented_io_context.h" +#include "ray/util/time.h" namespace ray { namespace gcs { @@ -429,7 +430,7 @@ ray::Status GlobalStateAccessor::GetNode(const std::string &node_id_hex_str, ". The node registration may not be complete yet before the timeout." + " Try increase the RAY_raylet_start_wait_time_s config."); } - RAY_LOG(INFO) << "Retrying to get node with node ID " << node_id_hex_str; + RAY_LOG(DEBUG) << "Retrying to get node with node ID " << node_id_hex_str; // Some of the information may not be in GCS yet, so wait a little bit. 
std::this_thread::sleep_for(std::chrono::seconds(1)); } diff --git a/src/ray/gcs/gcs_client/global_state_accessor.h b/src/ray/gcs_client/global_state_accessor.h similarity index 99% rename from src/ray/gcs/gcs_client/global_state_accessor.h rename to src/ray/gcs_client/global_state_accessor.h index 8ad2af80cb2f..c525ab7c5d2c 100644 --- a/src/ray/gcs/gcs_client/global_state_accessor.h +++ b/src/ray/gcs_client/global_state_accessor.h @@ -23,8 +23,7 @@ #include "absl/base/thread_annotations.h" #include "absl/synchronization/mutex.h" #include "ray/common/asio/instrumented_io_context.h" -#include "ray/gcs/gcs_client/gcs_client.h" -#include "ray/rpc/server_call.h" +#include "ray/gcs_client/gcs_client.h" namespace ray { namespace gcs { diff --git a/src/ray/gcs/gcs_client/python_callbacks.h b/src/ray/gcs_client/python_callbacks.h similarity index 100% rename from src/ray/gcs/gcs_client/python_callbacks.h rename to src/ray/gcs_client/python_callbacks.h diff --git a/src/ray/rpc/gcs/gcs_rpc_client.h b/src/ray/gcs_client/rpc_client.h similarity index 98% rename from src/ray/rpc/gcs/gcs_rpc_client.h rename to src/ray/gcs_client/rpc_client.h index cd804f1fac41..17f023e3c116 100644 --- a/src/ray/rpc/gcs/gcs_rpc_client.h +++ b/src/ray/gcs_client/rpc_client.h @@ -88,7 +88,7 @@ namespace rpc { method_timeout_ms, \ handle_payload_status, \ SPECS) \ - void METHOD(const METHOD_NAMESPACE::METHOD##Request &request, \ + void METHOD(METHOD_NAMESPACE::METHOD##Request &&request, \ const ClientCallback &callback, \ const int64_t timeout_ms = method_timeout_ms) SPECS { \ invoke_async_method promise; \ METHOD( \ - request, \ + std::move(request), \ [&promise, reply_in](const Status &status, \ const METHOD_NAMESPACE::METHOD##Reply &reply) { \ reply_in->CopyFrom(reply); \ @@ -223,14 +223,14 @@ class GcsRpcClient { PrepareAsyncFunction prepare_async_function, std::shared_ptr> grpc_client, const std::string &call_name, - const Request &request, + Request &&request, const ClientCallback &callback, 
const int64_t timeout_ms) { retryable_grpc_client_->template CallMethod( prepare_async_function, std::move(grpc_client), call_name, - request, + std::forward(request), [callback](const Status &status, Reply &&reply) { if (status.ok()) { if constexpr (handle_payload_status) { diff --git a/src/ray/gcs/gcs_client/test/BUILD.bazel b/src/ray/gcs_client/tests/BUILD.bazel similarity index 70% rename from src/ray/gcs/gcs_client/test/BUILD.bazel rename to src/ray/gcs_client/tests/BUILD.bazel index 842853c628cf..f12f17b71480 100644 --- a/src/ray/gcs/gcs_client/test/BUILD.bazel +++ b/src/ray/gcs_client/tests/BUILD.bazel @@ -8,8 +8,8 @@ ray_cc_test( ], tags = ["team:core"], deps = [ - "//src/ray/gcs/gcs_client:gcs_client_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/ray/common:test_utils", + "//src/ray/gcs_client", "@com_google_googletest//:gtest_main", ], ) @@ -30,10 +30,12 @@ ray_cc_test( ], tags = ["team:core"], deps = [ - "//src/ray/gcs/gcs_client:gcs_client_lib", - "//src/ray/gcs/gcs_client:global_state_accessor_lib", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_server_lib", + "//src/ray/gcs_client", + "//src/ray/gcs_client:global_state_accessor_lib", + "//src/ray/util:path_utils", + "//src/ray/util:raii", "@com_google_googletest//:gtest_main", ], ) @@ -58,10 +60,12 @@ ray_cc_test( "team:core", ], deps = [ - "//src/ray/gcs/gcs_client:gcs_client_lib", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/ray/common:test_utils", + "//src/ray/gcs:gcs_server_lib", + "//src/ray/gcs_client", "//src/ray/util:network_util", + "//src/ray/util:raii", + "//src/ray/util:time", "@com_google_googletest//:gtest_main", ], ) @@ -84,10 +88,12 @@ ray_cc_test( "team:core", ], deps = [ - "//src/ray/gcs/gcs_client:gcs_client_lib", - "//src/ray/gcs/gcs_server:gcs_server_lib", - "//src/ray/gcs/test:gcs_test_util_lib", + 
"//src/ray/common:test_utils", + "//src/ray/gcs:gcs_server_lib", + "//src/ray/gcs_client", "//src/ray/util:network_util", + "//src/ray/util:path_utils", + "//src/ray/util:raii", "@com_google_googletest//:gtest_main", ], ) diff --git a/src/ray/gcs/gcs_client/test/accessor_test.cc b/src/ray/gcs_client/tests/accessor_test.cc similarity index 98% rename from src/ray/gcs/gcs_client/test/accessor_test.cc rename to src/ray/gcs_client/tests/accessor_test.cc index ff2c8f78e5c5..b8c1d19c4108 100644 --- a/src/ray/gcs/gcs_client/test/accessor_test.cc +++ b/src/ray/gcs_client/tests/accessor_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_client/accessor.h" +#include "ray/gcs_client/accessor.h" #include "gtest/gtest.h" #include "src/ray/protobuf/gcs.pb.h" diff --git a/src/ray/gcs/gcs_client/test/gcs_client_reconnection_test.cc b/src/ray/gcs_client/tests/gcs_client_reconnection_test.cc similarity index 98% rename from src/ray/gcs/gcs_client/test/gcs_client_reconnection_test.cc rename to src/ray/gcs_client/tests/gcs_client_reconnection_test.cc index fd828cb27aa4..ec02beadf82d 100644 --- a/src/ray/gcs/gcs_client/test/gcs_client_reconnection_test.cc +++ b/src/ray/gcs_client/tests/gcs_client_reconnection_test.cc @@ -22,14 +22,14 @@ #include "absl/strings/substitute.h" #include "gtest/gtest.h" #include "ray/common/asio/instrumented_io_context.h" -#include "ray/gcs/gcs_client/accessor.h" -#include "ray/gcs/gcs_client/gcs_client.h" -#include "ray/gcs/gcs_server/gcs_server.h" -#include "ray/gcs/test/gcs_test_util.h" -#include "ray/rpc/gcs/gcs_rpc_client.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_server.h" +#include "ray/gcs_client/accessor.h" +#include "ray/gcs_client/gcs_client.h" +#include "ray/gcs_client/rpc_client.h" #include "ray/util/network_util.h" #include "ray/util/path_utils.h" -#include "ray/util/util.h" +#include "ray/util/raii.h" using namespace 
std::chrono_literals; // NOLINT using namespace ray; // NOLINT diff --git a/src/ray/gcs/gcs_client/test/gcs_client_test.cc b/src/ray/gcs_client/tests/gcs_client_test.cc similarity index 94% rename from src/ray/gcs/gcs_client/test/gcs_client_test.cc rename to src/ray/gcs_client/tests/gcs_client_test.cc index 7735f1264927..c788f08d2a47 100644 --- a/src/ray/gcs/gcs_client/test/gcs_client_test.cc +++ b/src/ray/gcs_client/tests/gcs_client_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/gcs/gcs_client/gcs_client.h" +#include "ray/gcs_client/gcs_client.h" #include #include @@ -22,13 +22,14 @@ #include "absl/strings/substitute.h" #include "gtest/gtest.h" #include "ray/common/asio/instrumented_io_context.h" -#include "ray/gcs/gcs_client/accessor.h" -#include "ray/gcs/gcs_server/gcs_server.h" -#include "ray/gcs/test/gcs_test_util.h" -#include "ray/rpc/gcs/gcs_rpc_client.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_server.h" +#include "ray/gcs_client/accessor.h" +#include "ray/gcs_client/rpc_client.h" #include "ray/util/network_util.h" #include "ray/util/path_utils.h" -#include "ray/util/util.h" +#include "ray/util/raii.h" +#include "ray/util/time.h" using namespace std::chrono_literals; // NOLINT @@ -371,13 +372,6 @@ class GcsClientTest : public ::testing::TestWithParam { return resources; } - bool ReportJobError(const std::shared_ptr &error_table_data) { - std::promise promise; - gcs_client_->Errors().AsyncReportJobError( - error_table_data, [&promise](Status status) { promise.set_value(status.ok()); }); - return WaitReady(promise.get_future(), timeout_ms_); - } - bool SubscribeToWorkerFailures( const gcs::ItemCallback &subscribe) { std::promise promise; @@ -428,11 +422,11 @@ class GcsClientTest : public ::testing::TestWithParam { INSTANTIATE_TEST_SUITE_P(RedisMigration, GcsClientTest, testing::Bool()); TEST_P(GcsClientTest, TestCheckAlive) { - auto node_info1 
= Mocker::GenNodeInfo(); + auto node_info1 = GenNodeInfo(); node_info1->set_node_manager_address("172.1.2.3"); node_info1->set_node_manager_port(31292); - auto node_info2 = Mocker::GenNodeInfo(); + auto node_info2 = GenNodeInfo(); node_info2->set_node_manager_address("172.1.2.4"); node_info2->set_node_manager_port(31293); @@ -465,11 +459,11 @@ TEST_P(GcsClientTest, TestCheckAlive) { } TEST_P(GcsClientTest, TestGcsClientCheckAlive) { - auto node_info1 = Mocker::GenNodeInfo(); + auto node_info1 = GenNodeInfo(); node_info1->set_node_manager_address("172.1.2.3"); node_info1->set_node_manager_port(31292); - auto node_info2 = Mocker::GenNodeInfo(); + auto node_info2 = GenNodeInfo(); node_info2->set_node_manager_address("172.1.2.4"); node_info2->set_node_manager_port(31293); @@ -498,7 +492,7 @@ TEST_P(GcsClientTest, TestGcsClientCheckAlive) { TEST_P(GcsClientTest, TestJobInfo) { // Create job table data. JobID add_job_id = JobID::FromInt(1); - auto job_table_data = Mocker::GenJobTableData(add_job_id); + auto job_table_data = GenJobTableData(add_job_id); // Subscribe to all jobs. std::atomic job_updates(0); @@ -522,11 +516,11 @@ TEST_P(GcsClientTest, TestActorInfo) { // Create actor table data. JobID job_id = JobID::FromInt(1); AddJob(job_id); - auto actor_table_data = Mocker::GenActorTableData(job_id); + auto actor_table_data = GenActorTableData(job_id); ActorID actor_id = ActorID::FromBinary(actor_table_data->actor_id()); // Subscribe to any update operations of an actor. - auto on_subscribe = [](const ActorID &actor_id, const rpc::ActorTableData &data) {}; + auto on_subscribe = [](const ActorID &, const rpc::ActorTableData &) {}; ASSERT_TRUE(SubscribeActor(actor_id, on_subscribe)); // Register an actor to GCS. @@ -540,7 +534,7 @@ TEST_P(GcsClientTest, TestActorInfo) { TEST_P(GcsClientTest, TestNodeInfo) { // Create gcs node info. 
- auto gcs_node1_info = Mocker::GenNodeInfo(); + auto gcs_node1_info = GenNodeInfo(); NodeID node1_id = NodeID::FromBinary(gcs_node1_info->node_id()); // Subscribe to node addition and removal events from GCS. @@ -564,7 +558,7 @@ TEST_P(GcsClientTest, TestNodeInfo) { EXPECT_EQ(gcs_client_->Nodes().GetSelfInfo().state(), gcs_node1_info->state()); // Register a node to GCS. - auto gcs_node2_info = Mocker::GenNodeInfo(); + auto gcs_node2_info = GenNodeInfo(); NodeID node2_id = NodeID::FromBinary(gcs_node2_info->node_id()); ASSERT_TRUE(RegisterNode(*gcs_node2_info)); WaitForExpectedCount(register_count, 2); @@ -579,7 +573,7 @@ TEST_P(GcsClientTest, TestNodeInfo) { TEST_P(GcsClientTest, TestUnregisterNode) { // Create gcs node info. - auto gcs_node_info = Mocker::GenNodeInfo(); + auto gcs_node_info = GenNodeInfo(); NodeID node_id = NodeID::FromBinary(gcs_node_info->node_id()); // Register local node to GCS. @@ -608,7 +602,7 @@ TEST_P(GcsClientTest, TestUnregisterNode) { TEST_P(GcsClientTest, TestGetAllAvailableResources) { // Register node. - auto node_info = Mocker::GenNodeInfo(); + auto node_info = GenNodeInfo(); node_info->mutable_resources_total()->insert({"CPU", 1.0}); node_info->mutable_resources_total()->insert({"GPU", 10.0}); @@ -641,7 +635,7 @@ TEST_P(GcsClientTest, TestWorkerInfo) { ASSERT_TRUE(SubscribeToWorkerFailures(on_subscribe)); // Report a worker failure to GCS when this worker doesn't exist. - auto worker_data = Mocker::GenWorkerTableData(); + auto worker_data = GenWorkerTableData(); worker_data->mutable_worker_address()->set_worker_id(WorkerID::FromRandom().Binary()); ASSERT_TRUE(ReportWorkerFailure(worker_data)); WaitForExpectedCount(worker_failure_count, 1); @@ -654,20 +648,13 @@ TEST_P(GcsClientTest, TestWorkerInfo) { WaitForExpectedCount(worker_failure_count, 2); } -TEST_P(GcsClientTest, TestErrorInfo) { - // Report a job error to GCS. 
- JobID job_id = JobID::FromInt(1); - auto error_table_data = Mocker::GenErrorTableData(job_id); - ASSERT_TRUE(ReportJobError(error_table_data)); -} - TEST_P(GcsClientTest, TestJobTableResubscribe) { // TODO(mwtian): Support resubscribing with GCS pubsub. GTEST_SKIP(); // Test that subscription of the job table can still work when GCS server restarts. JobID job_id = JobID::FromInt(1); - auto job_table_data = Mocker::GenJobTableData(job_id); + auto job_table_data = GenJobTableData(job_id); // Subscribe to all jobs. std::atomic job_update_count(0); @@ -695,7 +682,7 @@ TEST_P(GcsClientTest, TestActorTableResubscribe) { // Test that subscription of the actor table can still work when GCS server restarts. JobID job_id = JobID::FromInt(1); AddJob(job_id); - auto actor_table_data = Mocker::GenActorTableData(job_id); + auto actor_table_data = GenActorTableData(job_id); auto actor_id = ActorID::FromBinary(actor_table_data->actor_id()); // Number of notifications for the following `SubscribeActor` operation. @@ -703,7 +690,7 @@ TEST_P(GcsClientTest, TestActorTableResubscribe) { // All the notifications for the following `SubscribeActor` operation. 
std::vector subscribe_one_notifications; auto actor_subscribe = [&num_subscribe_one_notifications, &subscribe_one_notifications]( - const ActorID &actor_id, const rpc::ActorTableData &data) { + const ActorID &, const rpc::ActorTableData &data) { subscribe_one_notifications.emplace_back(data); ++num_subscribe_one_notifications; RAY_LOG(INFO) << "The number of actor subscription messages received is " @@ -758,7 +745,7 @@ TEST_P(GcsClientTest, TestNodeTableResubscribe) { }; ASSERT_TRUE(SubscribeToNodeChange(node_subscribe)); - auto node_info = Mocker::GenNodeInfo(1); + auto node_info = GenNodeInfo(1); ASSERT_TRUE(RegisterNode(*node_info)); NodeID node_id = NodeID::FromBinary(node_info->node_id()); std::string key = "CPU"; @@ -767,7 +754,7 @@ TEST_P(GcsClientTest, TestNodeTableResubscribe) { RestartGcsServer(); - node_info = Mocker::GenNodeInfo(1); + node_info = GenNodeInfo(1); ASSERT_TRUE(RegisterNode(*node_info)); node_id = NodeID::FromBinary(node_info->node_id()); gcs_server_->UpdateGcsResourceManagerInTest(node_id, resources); @@ -790,7 +777,7 @@ TEST_P(GcsClientTest, TestWorkerTableResubscribe) { RestartGcsServer(); // Add a worker before report worker failure to GCS. - auto worker_data = Mocker::GenWorkerTableData(); + auto worker_data = GenWorkerTableData(); worker_data->mutable_worker_address()->set_worker_id(WorkerID::FromRandom().Binary()); ASSERT_TRUE(AddWorker(worker_data)); @@ -805,7 +792,7 @@ TEST_P(GcsClientTest, TestGcsTableReload) { return; } // Register node to GCS. - auto node_info = Mocker::GenNodeInfo(); + auto node_info = GenNodeInfo(); ASSERT_TRUE(RegisterNode(*node_info)); // Restart GCS. 
@@ -843,7 +830,7 @@ TEST_P(GcsClientTest, TestMultiThreadSubAndUnsub) { auto job_id = JobID::FromInt(1); for (int index = 0; index < size; ++index) { threads[index].reset(new std::thread([this, sub_and_unsub_loop_count, job_id] { - for (int index = 0; index < sub_and_unsub_loop_count; ++index) { + for (int inner_index = 0; inner_index < sub_and_unsub_loop_count; ++inner_index) { auto actor_id = ActorID::Of(job_id, RandomTaskId(), 0); ASSERT_TRUE(SubscribeActor( actor_id, [](const ActorID &id, const rpc::ActorTableData &result) {})); @@ -872,7 +859,7 @@ TEST_P(GcsClientTest, DISABLED_TestGetActorPerf) { task_spec.add_args()->CopyFrom(task_arg); } for (int index = 0; index < actor_count; ++index) { - auto actor_table_data = Mocker::GenActorTableData(job_id); + auto actor_table_data = GenActorTableData(job_id); RegisterActor(actor_table_data, false, true); } @@ -899,7 +886,7 @@ TEST_P(GcsClientTest, TestEvictExpiredDestroyedActors) { absl::flat_hash_set actor_ids; int actor_count = RayConfig::instance().maximum_gcs_destroyed_actor_cached_count(); for (int index = 0; index < actor_count; ++index) { - auto actor_table_data = Mocker::GenActorTableData(job_id); + auto actor_table_data = GenActorTableData(job_id); RegisterActor(actor_table_data, false); actor_ids.insert(ActorID::FromBinary(actor_table_data->actor_id())); } @@ -909,7 +896,7 @@ TEST_P(GcsClientTest, TestEvictExpiredDestroyedActors) { ReconnectClient(); for (int index = 0; index < actor_count; ++index) { - auto actor_table_data = Mocker::GenActorTableData(job_id); + auto actor_table_data = GenActorTableData(job_id); RegisterActor(actor_table_data, false); actor_ids.insert(ActorID::FromBinary(actor_table_data->actor_id())); } @@ -951,7 +938,7 @@ TEST_P(GcsClientTest, TestGcsAuth) { RayConfig::instance().initialize(R"({"enable_cluster_auth": true})"); // Restart GCS. 
RestartGcsServer(); - auto node_info = Mocker::GenNodeInfo(); + auto node_info = GenNodeInfo(); if (!no_redis_) { // If we are backed by Redis, we can reuse cluster ID, so the RPC passes. EXPECT_TRUE(RegisterNode(*node_info)); @@ -967,14 +954,14 @@ TEST_P(GcsClientTest, TestGcsAuth) { TEST_P(GcsClientTest, TestRegisterHeadNode) { // Test at most only one head node is alive in GCS server - auto head_node_info = Mocker::GenNodeInfo(1); + auto head_node_info = GenNodeInfo(1); head_node_info->set_is_head_node(true); ASSERT_TRUE(RegisterNode(*head_node_info)); - auto worker_node_info = Mocker::GenNodeInfo(1); + auto worker_node_info = GenNodeInfo(1); ASSERT_TRUE(RegisterNode(*worker_node_info)); - auto head_node_info_2 = Mocker::GenNodeInfo(1); + auto head_node_info_2 = GenNodeInfo(1); head_node_info_2->set_is_head_node(true); ASSERT_TRUE(RegisterNode(*head_node_info_2)); diff --git a/src/ray/gcs/gcs_client/test/global_state_accessor_test.cc b/src/ray/gcs_client/tests/global_state_accessor_test.cc similarity index 93% rename from src/ray/gcs/gcs_client/test/global_state_accessor_test.cc rename to src/ray/gcs_client/tests/global_state_accessor_test.cc index c5d186e41ec7..1575c85766fa 100644 --- a/src/ray/gcs/gcs_client/test/global_state_accessor_test.cc +++ b/src/ray/gcs_client/tests/global_state_accessor_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/gcs/gcs_client/global_state_accessor.h" +#include "ray/gcs_client/global_state_accessor.h" #include #include @@ -20,10 +20,11 @@ #include "gtest/gtest.h" #include "ray/common/asio/instrumented_io_context.h" -#include "ray/gcs/gcs_server/gcs_server.h" -#include "ray/gcs/test/gcs_test_util.h" -#include "ray/rpc/gcs/gcs_rpc_client.h" +#include "ray/common/test_utils.h" +#include "ray/gcs/gcs_server.h" +#include "ray/gcs_client/rpc_client.h" #include "ray/util/path_utils.h" +#include "ray/util/raii.h" namespace ray { @@ -131,7 +132,7 @@ TEST_P(GlobalStateAccessorTest, TestJobTable) { ASSERT_EQ(global_state_->GetAllJobInfo().size(), 0); for (int index = 0; index < job_count; ++index) { auto job_id = JobID::FromInt(index); - auto job_table_data = Mocker::GenJobTableData(job_id); + auto job_table_data = GenJobTableData(job_id); std::promise promise; gcs_client_->Jobs().AsyncAdd( job_table_data, [&promise](Status status) { promise.set_value(status.ok()); }); @@ -147,7 +148,7 @@ TEST_P(GlobalStateAccessorTest, TestJobTableWithSubmissionId) { ASSERT_EQ(global_state_->GetAllJobInfo().size(), 0); for (int index = 0; index < job_count; ++index) { auto job_id = JobID::FromInt(index); - auto job_table_data = Mocker::GenJobTableData(job_id); + auto job_table_data = GenJobTableData(job_id); if (index % 2 == 0) { (*job_table_data->mutable_config()->mutable_metadata())["job_submission_id"] = std::to_string(index); @@ -165,10 +166,9 @@ TEST_P(GlobalStateAccessorTest, TestNodeTable) { ASSERT_EQ(global_state_->GetAllNodeInfo().size(), 0); // It's useful to check if index value will be marked as address suffix. 
for (int index = 0; index < node_count; ++index) { - auto node_table_data = - Mocker::GenNodeInfo(index, - std::string("127.0.0.") + std::to_string(index), - "Mocker_node_" + std::to_string(index * 10)); + auto node_table_data = GenNodeInfo(index, + std::string("127.0.0.") + std::to_string(index), + "Mocker_node_" + std::to_string(index * 10)); std::promise promise; gcs_client_->Nodes().AsyncRegister( *node_table_data, [&promise](Status status) { promise.set_value(status.ok()); }); @@ -191,7 +191,7 @@ TEST_P(GlobalStateAccessorTest, TestGetAllTotalResources) { ASSERT_EQ(global_state_->GetAllTotalResources().size(), 0); // Register node - auto node_table_data = Mocker::GenNodeInfo(); + auto node_table_data = GenNodeInfo(); node_table_data->mutable_resources_total()->insert({"CPU", 1}); node_table_data->mutable_resources_total()->insert({"GPU", 10}); @@ -221,7 +221,7 @@ TEST_P(GlobalStateAccessorTest, TestGetAllResourceUsage) { resource_usage_batch_data.ParseFromString(*resources.get()); ASSERT_EQ(resource_usage_batch_data.batch_size(), 0); - auto node_table_data = Mocker::GenNodeInfo(); + auto node_table_data = GenNodeInfo(); node_table_data->mutable_resources_total()->insert({"CPU", 1}); std::promise promise; @@ -266,7 +266,7 @@ TEST_P(GlobalStateAccessorTest, TestGetAllResourceUsage) { TEST_P(GlobalStateAccessorTest, TestWorkerTable) { ASSERT_EQ(global_state_->GetAllWorkerInfo().size(), 0); // Add worker info - auto worker_table_data = Mocker::GenWorkerTableData(); + auto worker_table_data = GenWorkerTableData(); worker_table_data->mutable_worker_address()->set_worker_id( WorkerID::FromRandom().Binary()); ASSERT_TRUE(global_state_->AddWorkerInfo(worker_table_data->SerializeAsString())); @@ -276,7 +276,7 @@ TEST_P(GlobalStateAccessorTest, TestWorkerTable) { ASSERT_TRUE(global_state_->GetWorkerInfo(worker_id)); // Add another worker info - auto another_worker_data = Mocker::GenWorkerTableData(); + auto another_worker_data = GenWorkerTableData(); 
another_worker_data->mutable_worker_address()->set_worker_id( WorkerID::FromRandom().Binary()); ASSERT_TRUE(global_state_->AddWorkerInfo(another_worker_data->SerializeAsString())); @@ -286,7 +286,7 @@ TEST_P(GlobalStateAccessorTest, TestWorkerTable) { TEST_P(GlobalStateAccessorTest, TestUpdateWorkerDebuggerPort) { ASSERT_EQ(global_state_->GetAllWorkerInfo().size(), 0); // Add worker info - auto worker_table_data = Mocker::GenWorkerTableData(); + auto worker_table_data = GenWorkerTableData(); worker_table_data->mutable_worker_address()->set_worker_id( WorkerID::FromRandom().Binary()); ASSERT_TRUE(global_state_->AddWorkerInfo(worker_table_data->SerializeAsString())); @@ -300,7 +300,7 @@ TEST_P(GlobalStateAccessorTest, TestUpdateWorkerDebuggerPort) { ASSERT_TRUE(global_state_->UpdateWorkerDebuggerPort(worker_id, debugger_port)); // Verify the debugger port - auto another_worker_table_data = Mocker::GenWorkerTableData(); + auto another_worker_table_data = GenWorkerTableData(); auto worker_info = global_state_->GetWorkerInfo(worker_id); ASSERT_TRUE(another_worker_table_data->ParseFromString(*worker_info)); ASSERT_EQ(another_worker_table_data->debugger_port(), debugger_port); @@ -309,7 +309,7 @@ TEST_P(GlobalStateAccessorTest, TestUpdateWorkerDebuggerPort) { TEST_P(GlobalStateAccessorTest, TestUpdateWorkerNumPausedThreads) { ASSERT_EQ(global_state_->GetAllWorkerInfo().size(), 0); // Add worker info - auto worker_table_data = Mocker::GenWorkerTableData(); + auto worker_table_data = GenWorkerTableData(); worker_table_data->mutable_worker_address()->set_worker_id( WorkerID::FromRandom().Binary()); ASSERT_TRUE(global_state_->AddWorkerInfo(worker_table_data->SerializeAsString())); @@ -324,7 +324,7 @@ TEST_P(GlobalStateAccessorTest, TestUpdateWorkerNumPausedThreads) { global_state_->UpdateWorkerNumPausedThreads(worker_id, num_paused_threads_delta)); // Verify the num paused threads is equal to num_paused_threads_delta - auto another_worker_table_data = 
Mocker::GenWorkerTableData(); + auto another_worker_table_data = GenWorkerTableData(); auto worker_info = global_state_->GetWorkerInfo(worker_id); ASSERT_TRUE(another_worker_table_data->ParseFromString(*worker_info)); ASSERT_EQ(another_worker_table_data->num_paused_threads(), num_paused_threads_delta); diff --git a/src/ray/internal/internal.h b/src/ray/internal/internal.h index 20c89a4cc6c6..e9353150d998 100644 --- a/src/ray/internal/internal.h +++ b/src/ray/internal/internal.h @@ -20,7 +20,7 @@ #include "ray/common/buffer.h" #include "ray/common/id.h" -#include "ray/core_worker/core_worker.h" +#include "ray/core_worker/common.h" #include "ray/stats/metric.h" // This header is used to warp some internal code so we can reduce suspicious diff --git a/src/ray/ipc/BUILD.bazel b/src/ray/ipc/BUILD.bazel index 08d01f774235..1dcf5778fac3 100644 --- a/src/ray/ipc/BUILD.bazel +++ b/src/ray/ipc/BUILD.bazel @@ -1,19 +1,24 @@ load("//bazel:ray.bzl", "ray_cc_library") ray_cc_library( - name = "client_connection", - srcs = [ - "client_connection.cc", - ], - hdrs = [ - "client_connection.h", - ], + name = "raylet_ipc_client_interface", + hdrs = ["raylet_ipc_client_interface.h"], deps = [ - "//src/ray/common:asio", - "//src/ray/common:event_stats", + "//src/ray/common:buffer", "//src/ray/common:id", "//src/ray/common:status", "//src/ray/flatbuffers:node_manager_generated", + "//src/ray/protobuf:common_cc_proto", + "//src/ray/util:process", + "@com_google_absl//absl/container:flat_hash_set", + ], +) + +ray_cc_library( + name = "fake_raylet_ipc_client", + hdrs = ["fake_raylet_ipc_client.h"], + deps = [ + "//src/ray/ipc:raylet_ipc_client_interface", ], ) @@ -21,16 +26,39 @@ ray_cc_library( name = "raylet_ipc_client", srcs = ["raylet_ipc_client.cc"], hdrs = ["raylet_ipc_client.h"], + visibility = ["//src/ray/core_worker:__pkg__"], deps = [ ":client_connection", "//src/ray/common:asio", "//src/ray/common:buffer", + "//src/ray/common:flatbuf_utils", "//src/ray/common:id", 
"//src/ray/common:status", "//src/ray/flatbuffers:node_manager_generated", + "//src/ray/ipc:raylet_ipc_client_interface", "//src/ray/protobuf:common_cc_proto", "//src/ray/util:logging", "//src/ray/util:process", "@com_google_absl//absl/container:flat_hash_set", ], ) + +ray_cc_library( + name = "client_connection", + srcs = [ + "client_connection.cc", + ], + hdrs = [ + "client_connection.h", + ], + deps = [ + "//src/ray/common:asio", + "//src/ray/common:event_stats", + "//src/ray/common:id", + "//src/ray/common:status", + "//src/ray/flatbuffers:node_manager_generated", + "//src/ray/util:network_util", + "//src/ray/util:process", + "//src/ray/util:time", + ], +) diff --git a/src/ray/ipc/client_connection.cc b/src/ray/ipc/client_connection.cc index 87be9cb3e0af..fd6c7a7dc971 100644 --- a/src/ray/ipc/client_connection.cc +++ b/src/ray/ipc/client_connection.cc @@ -30,8 +30,9 @@ #include "ray/common/event_stats.h" #include "ray/common/ray_config.h" +#include "ray/util/network_util.h" #include "ray/util/process.h" -#include "ray/util/util.h" +#include "ray/util/time.h" #if defined(_WIN32) #include @@ -249,8 +250,8 @@ void ServerConnection::DoAsyncWrites() { } // Helper function to call all handlers with the input status. - auto call_handlers = [this](const ray::Status &status, int num_messages) { - for (int i = 0; i < num_messages; i++) { + auto call_handlers = [this](const ray::Status &status, int num_msgs) { + for (int i = 0; i < num_msgs; i++) { auto write_buffer = std::move(async_write_queue_.front()); write_buffer->handler(status); async_write_queue_.pop_front(); @@ -359,6 +360,12 @@ void ClientConnection::Register() { registered_ = true; } +void ClientConnection::Close() { + closed_ = true; + boost::system::error_code ec; + socket_.close(ec); +} + void ClientConnection::ProcessMessages() { // Wait for a message header from the client. The message header includes the // protocol version, the message type, and the length of the message. 
@@ -398,9 +405,16 @@ void ClientConnection::ProcessMessageHeader(const boost::system::error_code &err return; } - // If there was no error, make sure the ray cookie matches. + if (closed_) { + // In most cases all outstanding reads will have been canceled when the socket was + // closed. However, if the boost async_read call has already received data into its + // buffer from the poll syscall, it may succeed. If this happens, drop the message. + return; + } + if (!CheckRayCookie()) { - ServerConnection::Close(); + RAY_LOG(WARNING) << "Mismatched Ray cookie, closing client connection."; + Close(); return; } @@ -469,6 +483,13 @@ void ClientConnection::ProcessMessage(const boost::system::error_code &error) { return connection_error_handler_(std::move(this_ptr), error); } + if (closed_) { + // In most cases all outstanding reads will have been canceled when the socket was + // closed. However, if the boost async_read call has already received data into its + // buffer from the poll syscall, it may succeed. If this happens, drop the message. + return; + } + int64_t start_ms = current_time_ms(); message_handler_(std::move(this_ptr), read_type_, read_message_); int64_t interval = current_time_ms() - start_ms; diff --git a/src/ray/ipc/client_connection.h b/src/ray/ipc/client_connection.h index 86f7788ba73c..1f03a0863a45 100644 --- a/src/ray/ipc/client_connection.h +++ b/src/ray/ipc/client_connection.h @@ -25,7 +25,6 @@ #include #include "ray/common/asio/instrumented_io_context.h" -#include "ray/common/common_protocol.h" #include "ray/common/id.h" #include "ray/common/status.h" @@ -106,12 +105,6 @@ class ServerConnection : public std::enable_shared_from_this { /// \return Status. virtual Status ReadBuffer(const std::vector &buffer); - /// Shuts down socket for this connection. - void Close() { - boost::system::error_code ec; - socket_.close(ec); - } - /// Get the native handle of the socket. 
int GetNativeHandle() { return socket_.native_handle(); } @@ -227,12 +220,20 @@ class ClientConnection : public ServerConnection { /// Register the client. void Register(); + /// Close the connection forcefully. + /// + /// - Clients will receive an error the next time they interact with the connection. + /// - No further messages will be processed from `ProcessMessages`. + /// - The `ConnectionErrorHandler` may be called with an error indicating that + /// outstanding reads failed. + void Close(); + /// Listen for and process messages from the client connection. Once a /// message has been fully received, the client manager's /// ProcessClientMessage handler will be called. void ProcessMessages(); - const std::string GetDebugLabel() const { return debug_label_; } + std::string GetDebugLabel() const { return debug_label_; } protected: /// A protected constructor for a node client connection. @@ -267,6 +268,8 @@ class ClientConnection : public ServerConnection { /// Whether the client has sent us a registration message yet. bool registered_; + /// Whether the connection has been explicitly closed by the server. + bool closed_ = false; /// The handler for a message from the client. MessageHandler message_handler_; /// The handler for an unexpected connection error from this client. 
diff --git a/src/fakes/ray/ipc/raylet_ipc_client.h b/src/ray/ipc/fake_raylet_ipc_client.h similarity index 95% rename from src/fakes/ray/ipc/raylet_ipc_client.h rename to src/ray/ipc/fake_raylet_ipc_client.h index 29fb02135bff..7f0ba5daac75 100644 --- a/src/fakes/ray/ipc/raylet_ipc_client.h +++ b/src/ray/ipc/fake_raylet_ipc_client.h @@ -14,7 +14,11 @@ #pragma once -#include "ray/ipc/raylet_ipc_client.h" +#include +#include +#include + +#include "ray/ipc/raylet_ipc_client_interface.h" namespace ray { namespace ipc { @@ -29,7 +33,7 @@ class FakeRayletIpcClient : public RayletIpcClientInterface { const std::string &ip_address, const std::string &serialized_job_config, const StartupToken &startup_token, - NodeID *raylet_id, + NodeID *node_id, int *assigned_port) override { return Status::OK(); } diff --git a/src/ray/ipc/raylet_ipc_client.cc b/src/ray/ipc/raylet_ipc_client.cc index bcf3e4409367..07e5dcc305c7 100644 --- a/src/ray/ipc/raylet_ipc_client.cc +++ b/src/ray/ipc/raylet_ipc_client.cc @@ -15,43 +15,65 @@ #include "ray/ipc/raylet_ipc_client.h" #include -#include #include #include #include #include "absl/container/flat_hash_set.h" -#include "ray/common/common_protocol.h" +#include "ray/common/flatbuf_utils.h" #include "ray/common/ray_config.h" #include "ray/flatbuffers/node_manager_generated.h" #include "ray/ipc/client_connection.h" #include "ray/util/logging.h" +#include "ray/util/process.h" + +namespace ray::ipc { namespace { -flatbuffers::Offset to_flatbuf( - flatbuffers::FlatBufferBuilder &fbb, const ray::rpc::Address &address) { - return ray::protocol::CreateAddress(fbb, - fbb.CreateString(address.raylet_id()), - fbb.CreateString(address.ip_address()), - address.port(), - fbb.CreateString(address.worker_id())); +flatbuffers::Offset AddressToFlatbuffer( + flatbuffers::FlatBufferBuilder &fbb, const rpc::Address &address) { + return protocol::CreateAddress(fbb, + fbb.CreateString(address.node_id()), + fbb.CreateString(address.ip_address()), + address.port(), + 
fbb.CreateString(address.worker_id())); } -flatbuffers::Offset>> +flatbuffers::Offset>> AddressesToFlatbuffer(flatbuffers::FlatBufferBuilder &fbb, - const std::vector &addresses) { - std::vector> address_vec; + const std::vector &addresses) { + std::vector> address_vec; address_vec.reserve(addresses.size()); for (const auto &addr : addresses) { - address_vec.push_back(to_flatbuf(fbb, addr)); + address_vec.push_back(AddressToFlatbuffer(fbb, addr)); } return fbb.CreateVector(address_vec); } -} // namespace +void ShutdownIfLocalRayletDisconnected(const Status &status) { + // Check if the Raylet process is still alive. + // If we know the Raylet PID, check using that. + // Else, assume the Raylet is our parent process. + bool raylet_alive = true; + auto raylet_pid = RayConfig::instance().RAYLET_PID(); + if (!raylet_pid.empty()) { + if (!IsProcessAlive(static_cast(std::stoi(raylet_pid)))) { + raylet_alive = false; + } + } else if (!IsParentProcessAlive()) { + raylet_alive = false; + } -namespace ray::ipc { + if (!status.ok() && !raylet_alive) { + RAY_LOG(WARNING) << "Exiting because the Raylet IPC connection failed and the local " + "Raylet is dead. 
Status: " + << status; + QuickExit(); + } +} + +} // namespace RayletIpcClient::RayletIpcClient(instrumented_io_context &io_service, const std::string &address, @@ -74,16 +96,16 @@ ray::Status RayletIpcClient::RegisterClient(const WorkerID &worker_id, const std::string &ip_address, const std::string &serialized_job_config, const StartupToken &startup_token, - NodeID *raylet_id, + NodeID *node_id, int *assigned_port) { flatbuffers::FlatBufferBuilder fbb; auto message = protocol::CreateRegisterClientRequest(fbb, static_cast(worker_type), - to_flatbuf(fbb, worker_id), + flatbuf::to_flatbuf(fbb, worker_id), getpid(), startup_token, - to_flatbuf(fbb, job_id), + flatbuf::to_flatbuf(fbb, job_id), runtime_env_hash, language, fbb.CreateString(ip_address), @@ -98,10 +120,10 @@ ray::Status RayletIpcClient::RegisterClient(const WorkerID &worker_id, auto reply_message = flatbuffers::GetRoot(reply.data()); bool success = reply_message->success(); if (!success) { - return Status::Invalid(string_from_flatbuf(*reply_message->failure_reason())); + return Status::Invalid(reply_message->failure_reason()->str()); } - *raylet_id = NodeID::FromBinary(reply_message->raylet_id()->str()); + *node_id = NodeID::FromBinary(reply_message->node_id()->str()); *assigned_port = reply_message->port(); return Status::OK(); } @@ -164,7 +186,7 @@ Status RayletIpcClient::AnnounceWorkerPortForDriver(int port, if (reply_message->success()) { return Status::OK(); } - return Status::Invalid(string_from_flatbuf(*reply_message->failure_reason())); + return Status::Invalid(reply_message->failure_reason()->str()); } Status RayletIpcClient::ActorCreationTaskDone() { @@ -176,7 +198,7 @@ Status RayletIpcClient::AsyncGetObjects( const std::vector &owner_addresses) { RAY_CHECK(object_ids.size() == owner_addresses.size()); flatbuffers::FlatBufferBuilder fbb; - auto object_ids_message = to_flatbuf(fbb, object_ids); + auto object_ids_message = flatbuf::to_flatbuf(fbb, object_ids); auto message = 
protocol::CreateAsyncGetObjectsRequest( fbb, object_ids_message, AddressesToFlatbuffer(fbb, owner_addresses)); fbb.Finish(message); @@ -212,7 +234,7 @@ StatusOr> RayletIpcClient::Wait( // Write request. flatbuffers::FlatBufferBuilder fbb; auto message = protocol::CreateWaitRequest(fbb, - to_flatbuf(fbb, object_ids), + flatbuf::to_flatbuf(fbb, object_ids), AddressesToFlatbuffer(fbb, owner_addresses), num_returns, timeout_milliseconds); @@ -241,7 +263,10 @@ Status RayletIpcClient::WaitForActorCallArgs( owner_addresses.push_back(ref.owner_address()); } auto message = protocol::CreateWaitForActorCallArgsRequest( - fbb, to_flatbuf(fbb, object_ids), AddressesToFlatbuffer(fbb, owner_addresses), tag); + fbb, + flatbuf::to_flatbuf(fbb, object_ids), + AddressesToFlatbuffer(fbb, owner_addresses), + tag); fbb.Finish(message); return WriteMessage(MessageType::WaitForActorCallArgsRequest, &fbb); } @@ -252,7 +277,7 @@ Status RayletIpcClient::PushError(const JobID &job_id, double timestamp) { flatbuffers::FlatBufferBuilder fbb; auto message = protocol::CreatePushErrorRequest(fbb, - to_flatbuf(fbb, job_id), + flatbuf::to_flatbuf(fbb, job_id), fbb.CreateString(type), fbb.CreateString(error_message), timestamp); @@ -263,8 +288,8 @@ Status RayletIpcClient::PushError(const JobID &job_id, Status RayletIpcClient::FreeObjects(const std::vector &object_ids, bool local_only) { flatbuffers::FlatBufferBuilder fbb; - auto message = - protocol::CreateFreeObjectsRequest(fbb, local_only, to_flatbuf(fbb, object_ids)); + auto message = protocol::CreateFreeObjectsRequest( + fbb, local_only, flatbuf::to_flatbuf(fbb, object_ids)); fbb.Finish(message); return WriteMessage(MessageType::FreeObjectsInObjectStoreRequest, &fbb); } @@ -273,26 +298,17 @@ void RayletIpcClient::SubscribePlasmaReady(const ObjectID &object_id, const rpc::Address &owner_address) { flatbuffers::FlatBufferBuilder fbb; auto message = protocol::CreateSubscribePlasmaReady( - fbb, to_flatbuf(fbb, object_id), to_flatbuf(fbb, 
owner_address)); + fbb, flatbuf::to_flatbuf(fbb, object_id), AddressToFlatbuffer(fbb, owner_address)); fbb.Finish(message); RAY_CHECK_OK(WriteMessage(MessageType::SubscribePlasmaReady, &fbb)); } -void ShutdownIfLocalRayletDisconnected(const Status &status) { - if (!status.ok() && IsRayletFailed(RayConfig::instance().RAYLET_PID())) { - RAY_LOG(WARNING) << "Exiting because the Raylet IPC connection failed and the local " - "Raylet is dead. Status: " - << status; - QuickExit(); - } -} - Status RayletIpcClient::WriteMessage(MessageType type, flatbuffers::FlatBufferBuilder *fbb) { std::unique_lock guard(write_mutex_); - int64_t length = fbb ? fbb->GetSize() : 0; - uint8_t *bytes = fbb ? fbb->GetBufferPointer() : nullptr; + int64_t length = fbb != nullptr ? fbb->GetSize() : 0; + uint8_t *bytes = fbb != nullptr ? fbb->GetBufferPointer() : nullptr; auto status = conn_->WriteMessage(static_cast(type), length, bytes); ShutdownIfLocalRayletDisconnected(status); return status; diff --git a/src/ray/ipc/raylet_ipc_client.h b/src/ray/ipc/raylet_ipc_client.h index 47a1132d6a29..f6bf672cf799 100644 --- a/src/ray/ipc/raylet_ipc_client.h +++ b/src/ray/ipc/raylet_ipc_client.h @@ -17,8 +17,6 @@ #include #include #include -#include -#include #include #include "absl/container/flat_hash_set.h" @@ -28,167 +26,15 @@ #include "ray/common/status_or.h" #include "ray/flatbuffers/node_manager_generated.h" #include "ray/ipc/client_connection.h" +#include "ray/ipc/raylet_ipc_client_interface.h" #include "ray/util/process.h" #include "src/ray/protobuf/common.pb.h" using MessageType = ray::protocol::MessageType; namespace ray { - -class RayletIpcClientInterface { - public: - virtual ~RayletIpcClientInterface() = default; - - /// Register this client (worker) with the local Raylet. - /// - /// \param worker_id The worker_id of the connecting worker. - /// \param worker_type The worker type of the connecting worker. - /// \param job_id The job ID that the connecting worker is associated with. 
- /// \param runtime_env_hash The runtime_env hash of the connecting worker. - /// \param language The language of the connecting worker. - /// \param ip_address The ip_address of the connecting worker. - /// \param serialized_job_config The serialized job config of the connecting worker. - /// \param startup_token The token that was passed to this worker at startup. - /// \param[out] raylet_id The node ID for the local Raylet. - /// \param[out] assigned_port The assigned port for the worker to listen on. If zero, - /// the worker should pick a port randomly. - virtual ray::Status RegisterClient(const WorkerID &worker_id, - rpc::WorkerType worker_type, - const JobID &job_id, - int runtime_env_hash, - const rpc::Language &language, - const std::string &ip_address, - const std::string &serialized_job_config, - const StartupToken &startup_token, - NodeID *raylet_id, - int *assigned_port) = 0; - - /// Notify the raylet that this client is disconnecting gracefully. This - /// is used by actors to exit gracefully so that the raylet doesn't - /// propagate an error message to the driver. - /// - /// It's a blocking call. - /// - /// \param disconnect_type The reason why this worker process is disconnected. - /// \param disconnect_detail The detailed reason for a given exit. - /// \return ray::Status. - virtual ray::Status Disconnect( - const rpc::WorkerExitType &exit_type, - const std::string &exit_detail, - const std::shared_ptr &creation_task_exception_pb_bytes) = 0; - - /// Tell the raylet which port this worker's gRPC server is listening on. - /// - /// \param port The port. - /// \return ray::Status. - virtual Status AnnounceWorkerPortForWorker(int port) = 0; - - /// Tell the raylet this driver and its job is ready to run, with port and entrypoint. - /// - /// \param port The port. - /// \param entrypoint The entrypoint of the driver's job. - /// \return ray::Status. 
- virtual Status AnnounceWorkerPortForDriver(int port, const std::string &entrypoint) = 0; - - /// Tell the raylet that the client has finished executing a task. - /// - /// \return ray::Status. - virtual ray::Status ActorCreationTaskDone() = 0; - - /// Ask the Raylet to pull a set of objects to the local node. - /// - /// This request is asynchronous. - /// - /// \param object_ids The IDs of the objects to pull. - /// \param owner_addresses The owner addresses of the objects. - /// \return ray::Status. - virtual ray::Status AsyncGetObjects( - const std::vector &object_ids, - const std::vector &owner_addresses) = 0; - - /// Wait for the given objects until timeout expires or num_return objects are - /// found. - /// - /// \param object_ids The objects to wait for. - /// \param owner_addresses The addresses of the workers that own the objects. - /// \param num_returns The number of objects to wait for. - /// \param timeout_milliseconds Duration, in milliseconds, to wait before returning. - /// \param result A pair with the first element containing the object ids that were - /// found, and the second element the objects that were not found. - /// \return ray::StatusOr containing error status or the set of object ids that were - /// found. - virtual ray::StatusOr> Wait( - const std::vector &object_ids, - const std::vector &owner_addresses, - int num_returns, - int64_t timeout_milliseconds) = 0; - - /// Tell the Raylet to cancel the get request from this worker. - /// - /// \return ray::Status. - virtual ray::Status CancelGetRequest() = 0; - - /// Notify the raylet that this client is blocked. This is only used for direct task - /// calls. Note that ordering of this with respect to Unblock calls is important. - /// - /// \return ray::Status. - virtual ray::Status NotifyDirectCallTaskBlocked() = 0; - - /// Notify the raylet that this client is unblocked. This is only used for direct task - /// calls. Note that ordering of this with respect to Block calls is important. 
- /// - /// \return ray::Status. - virtual ray::Status NotifyDirectCallTaskUnblocked() = 0; - - /// Wait for the given objects asynchronously. - /// - /// The core worker will be notified over gRPC when the wait completes. - /// - /// \param references The objects to wait for. - /// \param tag Value that will be sent to the core worker via gRPC on completion. - /// \return ray::Status. - virtual ray::Status WaitForActorCallArgs( - const std::vector &references, int64_t tag) = 0; - - /// Push an error to the relevant driver. - /// - /// \param The ID of the job_id that the error is for. - /// \param The type of the error. - /// \param The error message. - /// \param The timestamp of the error. - /// \return ray::Status. - virtual ray::Status PushError(const ray::JobID &job_id, - const std::string &type, - const std::string &error_message, - double timestamp) = 0; - - /// Free a list of objects from object stores. - /// - /// \param object_ids A list of ObjectsIDs to be deleted. - /// \param local_only Whether keep this request with local object store - /// or send it to all the object stores. - /// \return ray::Status. - virtual ray::Status FreeObjects(const std::vector &object_ids, - bool local_only) = 0; - - /// Subscribe this worker to a notification when the provided object is ready in the - /// local object store. - /// - /// The worker will be notified over gRPC when the object is ready. - /// - /// \param object_id The ID of the object to subscribe to. - /// \param owner_address The address of the owner of the object. - virtual void SubscribePlasmaReady(const ObjectID &object_id, - const rpc::Address &owner_address) = 0; -}; - namespace ipc { -/// Interface for interacting with the local Raylet over a socket. -/// -/// Message ordering on the socket is guaranteed. -/// -/// If the socket is broken and the local Raylet is detected to be dead, calling any -/// method on the client will quick exit the process. 
+ class RayletIpcClient : public RayletIpcClientInterface { public: /// Connect to the Raylet over a local socket. @@ -210,7 +56,7 @@ class RayletIpcClient : public RayletIpcClientInterface { const std::string &ip_address, const std::string &serialized_job_config, const StartupToken &startup_token, - NodeID *raylet_id, + NodeID *node_id, int *assigned_port) override; ray::Status Disconnect(const rpc::WorkerExitType &exit_type, @@ -275,5 +121,4 @@ class RayletIpcClient : public RayletIpcClientInterface { }; } // namespace ipc - } // namespace ray diff --git a/src/ray/ipc/raylet_ipc_client_interface.h b/src/ray/ipc/raylet_ipc_client_interface.h new file mode 100644 index 000000000000..4d92c5c5023a --- /dev/null +++ b/src/ray/ipc/raylet_ipc_client_interface.h @@ -0,0 +1,190 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "ray/common/buffer.h" +#include "ray/common/id.h" +#include "ray/common/status.h" +#include "ray/common/status_or.h" +#include "ray/flatbuffers/node_manager_generated.h" +#include "ray/util/process.h" +#include "src/ray/protobuf/common.pb.h" + +using MessageType = ray::protocol::MessageType; + +namespace ray { +namespace ipc { + +/// Interface for interacting with the local Raylet over a socket. +/// +/// Message ordering is guaranteed. 
+/// +/// If the local Raylet is detected to be dead, calling any +/// method on the client will un-gracefully exit the process. +class RayletIpcClientInterface { + public: + virtual ~RayletIpcClientInterface() = default; + + /// Register this client (worker) with the local Raylet. + /// + /// \param worker_id The worker_id of the connecting worker. + /// \param worker_type The worker type of the connecting worker. + /// \param job_id The job ID that the connecting worker is associated with. + /// \param runtime_env_hash The runtime_env hash of the connecting worker. + /// \param language The language of the connecting worker. + /// \param ip_address The ip_address of the connecting worker. + /// \param serialized_job_config The serialized job config of the connecting worker. + /// \param startup_token The token that was passed to this worker at startup. + /// \param[out] node_id The node ID for the local Raylet. + /// \param[out] assigned_port The assigned port for the worker to listen on. If zero, + /// the worker should pick a port randomly. + virtual ray::Status RegisterClient(const WorkerID &worker_id, + rpc::WorkerType worker_type, + const JobID &job_id, + int runtime_env_hash, + const rpc::Language &language, + const std::string &ip_address, + const std::string &serialized_job_config, + const StartupToken &startup_token, + NodeID *node_id, + int *assigned_port) = 0; + + /// Notify the raylet that this client is disconnecting gracefully. This + /// is used by actors to exit gracefully so that the raylet doesn't + /// propagate an error message to the driver. + /// + /// It's a blocking call. + /// + /// \param exit_type The reason why this worker process is disconnected. + /// \param exit_detail The detailed reason for a given exit. + /// \return ray::Status. 
+ virtual ray::Status Disconnect( + const rpc::WorkerExitType &exit_type, + const std::string &exit_detail, + const std::shared_ptr &creation_task_exception_pb_bytes) = 0; + + /// Tell the raylet which port this worker's gRPC server is listening on. + /// + /// \param port The port. + /// \return ray::Status. + virtual Status AnnounceWorkerPortForWorker(int port) = 0; + + /// Tell the raylet this driver and its job is ready to run, with port and entrypoint. + /// + /// \param port The port. + /// \param entrypoint The entrypoint of the driver's job. + /// \return ray::Status. + virtual Status AnnounceWorkerPortForDriver(int port, const std::string &entrypoint) = 0; + + /// Tell the raylet that the client has finished executing a task. + /// + /// \return ray::Status. + virtual ray::Status ActorCreationTaskDone() = 0; + + /// Ask the Raylet to pull a set of objects to the local node. + /// + /// This request is asynchronous. + /// + /// \param object_ids The IDs of the objects to pull. + /// \param owner_addresses The owner addresses of the objects. + /// \return ray::Status. + virtual ray::Status AsyncGetObjects( + const std::vector &object_ids, + const std::vector &owner_addresses) = 0; + + /// Wait for the given objects until timeout expires or num_return objects are + /// found. + /// + /// \param object_ids The objects to wait for. + /// \param owner_addresses The addresses of the workers that own the objects. + /// \param num_returns The number of objects to wait for. + /// \param timeout_milliseconds Duration, in milliseconds, to wait before returning. + /// \param result A pair with the first element containing the object ids that were + /// found, and the second element the objects that were not found. + /// \return ray::StatusOr containing error status or the set of object ids that were + /// found. 
+ virtual ray::StatusOr> Wait( + const std::vector &object_ids, + const std::vector &owner_addresses, + int num_returns, + int64_t timeout_milliseconds) = 0; + + /// Tell the Raylet to cancel the get request from this worker. + /// + /// \return ray::Status. + virtual ray::Status CancelGetRequest() = 0; + + /// Notify the raylet that this client is blocked. This is only used for direct task + /// calls. Note that ordering of this with respect to Unblock calls is important. + /// + /// \return ray::Status. + virtual ray::Status NotifyDirectCallTaskBlocked() = 0; + + /// Notify the raylet that this client is unblocked. This is only used for direct task + /// calls. Note that ordering of this with respect to Block calls is important. + /// + /// \return ray::Status. + virtual ray::Status NotifyDirectCallTaskUnblocked() = 0; + + /// Wait for the given objects asynchronously. + /// + /// The core worker will be notified over gRPC when the wait completes. + /// + /// \param references The objects to wait for. + /// \param tag Value that will be sent to the core worker via gRPC on completion. + /// \return ray::Status. + virtual ray::Status WaitForActorCallArgs( + const std::vector &references, int64_t tag) = 0; + + /// Push an error to the relevant driver. + /// + /// \param job_id The ID of the job that the error is for. + /// \param type The type of the error. + /// \param error_message The error message. + /// \param timestamp The timestamp of the error. + /// \return ray::Status. + virtual ray::Status PushError(const ray::JobID &job_id, + const std::string &type, + const std::string &error_message, + double timestamp) = 0; + + /// Free a list of objects from object stores. + /// + /// \param object_ids A list of ObjectIDs to be deleted. + /// \param local_only Whether to keep this request within the local object store + /// or send it to all the object stores. + /// \return ray::Status. 
+ virtual ray::Status FreeObjects(const std::vector &object_ids, + bool local_only) = 0; + + /// Subscribe this worker to a notification when the provided object is ready in the + /// local object store. + /// + /// The worker will be notified over gRPC when the object is ready. + /// + /// \param object_id The ID of the object to subscribe to. + /// \param owner_address The address of the owner of the object. + virtual void SubscribePlasmaReady(const ObjectID &object_id, + const rpc::Address &owner_address) = 0; +}; + +} // namespace ipc +} // namespace ray diff --git a/src/ray/ipc/test/BUILD.bazel b/src/ray/ipc/tests/BUILD.bazel similarity index 90% rename from src/ray/ipc/test/BUILD.bazel rename to src/ray/ipc/tests/BUILD.bazel index c6a7c9b76078..9a39013a51f9 100644 --- a/src/ray/ipc/test/BUILD.bazel +++ b/src/ray/ipc/tests/BUILD.bazel @@ -9,6 +9,7 @@ ray_cc_test( "//src/ray/common:asio", "//src/ray/common:id", "//src/ray/ipc:client_connection", + "//src/ray/util:network_util", "@boost//:asio", "@com_google_googletest//:gtest_main", ], diff --git a/src/ray/ipc/test/client_connection_test.cc b/src/ray/ipc/tests/client_connection_test.cc similarity index 99% rename from src/ray/ipc/test/client_connection_test.cc rename to src/ray/ipc/tests/client_connection_test.cc index a287518daf73..3839ccf564d9 100644 --- a/src/ray/ipc/test/client_connection_test.cc +++ b/src/ray/ipc/tests/client_connection_test.cc @@ -24,6 +24,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "ray/common/asio/instrumented_io_context.h" +#include "ray/util/network_util.h" namespace ray { namespace raylet { diff --git a/src/ray/object_manager/BUILD.bazel b/src/ray/object_manager/BUILD.bazel index 3013567accad..ab2afb3a2679 100644 --- a/src/ray/object_manager/BUILD.bazel +++ b/src/ray/object_manager/BUILD.bazel @@ -19,8 +19,8 @@ ray_cc_library( "//src/ray/object_manager/plasma:plasma_store_server_lib", "//src/ray/protobuf:common_cc_proto", 
"//src/ray/protobuf:node_manager_cc_proto", - "//src/ray/rpc:object_manager_client", - "//src/ray/rpc:object_manager_server", + "//src/ray/rpc/object_manager:object_manager_client_interface", + "//src/ray/rpc/object_manager:object_manager_server", "@com_google_absl//absl/container:flat_hash_map", ], ) @@ -66,8 +66,8 @@ ray_cc_library( ":object_directory", "//src/ray/common:asio", "//src/ray/common:id", - "//src/ray/gcs/gcs_client:gcs_client_lib", - "//src/ray/pubsub:subscriber", + "//src/ray/gcs_client", + "//src/ray/pubsub:subscriber_interface", "//src/ray/rpc:core_worker_client", "@com_google_absl//absl/container:flat_hash_map", ], @@ -81,7 +81,7 @@ ray_cc_library( "//src/ray/common:asio", "//src/ray/common:id", "//src/ray/common:status", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/gcs_client", ], ) @@ -110,6 +110,7 @@ ray_cc_library( "//src/ray/common:ray_config", "//src/ray/common:status", "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", ], ) diff --git a/src/ray/object_manager/chunk_object_reader.cc b/src/ray/object_manager/chunk_object_reader.cc index 2038033751c5..950a546e1470 100644 --- a/src/ray/object_manager/chunk_object_reader.cc +++ b/src/ray/object_manager/chunk_object_reader.cc @@ -50,7 +50,7 @@ std::optional ChunkObjectReader::GetChunk(uint64_t chunk_index) con auto offset = cur_chunk_offset; auto data_size = std::min(object_->GetDataSize() - cur_chunk_offset, cur_chunk_size); if (!object_->ReadFromDataSection(offset, data_size, result)) { - return std::optional(); + return std::nullopt; } } @@ -61,9 +61,9 @@ std::optional ChunkObjectReader::GetChunk(uint64_t chunk_index) con auto size = std::min(cur_chunk_offset + cur_chunk_size - object_->GetDataSize(), cur_chunk_size); if (!object_->ReadFromMetadataSection(offset, size, result)) { - return std::optional(); + return std::nullopt; } } - return std::optional(std::move(result)); + return result; } }; // namespace ray diff --git a/src/ray/object_manager/common.cc 
b/src/ray/object_manager/common.cc index 349ffbd8881f..d7f205a96668 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -17,6 +17,8 @@ #include #include "absl/strings/str_cat.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" #include "ray/common/ray_config.h" namespace ray { diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 7a790bc91275..709671026e11 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -216,8 +216,8 @@ struct ObjectInfo { bool is_mutable = false; int64_t data_size = 0; int64_t metadata_size = 0; - /// Owner's raylet ID. - NodeID owner_raylet_id; + /// Owner's node ID. + NodeID owner_node_id; /// Owner's IP address. std::string owner_ip_address; /// Owner's port. @@ -232,7 +232,7 @@ struct ObjectInfo { bool operator==(const ObjectInfo &other) const { return ((object_id == other.object_id) && (data_size == other.data_size) && (metadata_size == other.metadata_size) && - (owner_raylet_id == other.owner_raylet_id) && + (owner_node_id == other.owner_node_id) && (owner_ip_address == other.owner_ip_address) && (owner_port == other.owner_port) && (owner_worker_id == other.owner_worker_id)); diff --git a/src/ray/object_manager/object_buffer_pool.cc b/src/ray/object_manager/object_buffer_pool.cc index e5f55b2266bf..00dcbe2b8564 100644 --- a/src/ray/object_manager/object_buffer_pool.cc +++ b/src/ray/object_manager/object_buffer_pool.cc @@ -107,14 +107,14 @@ ray::Status ObjectBufferPool::CreateChunk(const ObjectID &object_id, RAY_RETURN_NOT_OK(EnsureBufferExists( object_id, owner_address, data_size, metadata_size, chunk_index)); auto &state = create_buffer_state_.at(object_id); - if (chunk_index >= state.chunk_state.size()) { + if (chunk_index >= state.chunk_state_.size()) { return ray::Status::IOError("Object size mismatch"); } - if (state.chunk_state[chunk_index] != CreateChunkState::AVAILABLE) { + if (state.chunk_state_[chunk_index] != 
CreateChunkState::AVAILABLE) { // There can be only one reference to this chunk at any given time. return ray::Status::IOError("Chunk already received by a different thread."); } - state.chunk_state[chunk_index] = CreateChunkState::REFERENCED; + state.chunk_state_[chunk_index] = CreateChunkState::REFERENCED; return ray::Status::OK(); } @@ -128,35 +128,36 @@ void ObjectBufferPool::WriteChunk(const ObjectID &object_id, absl::MutexLock lock(&pool_mutex_); auto it = create_buffer_state_.find(object_id); if (it == create_buffer_state_.end() || - chunk_index >= it->second.chunk_state.size() || - it->second.chunk_state.at(chunk_index) != CreateChunkState::REFERENCED) { + chunk_index >= it->second.chunk_state_.size() || + it->second.chunk_state_.at(chunk_index) != CreateChunkState::REFERENCED) { RAY_LOG(DEBUG) << "Object " << object_id << " aborted before chunk " << chunk_index << " could be sealed"; return; } - if (it->second.data_size != data_size || it->second.metadata_size != metadata_size) { + if (it->second.data_size_ != data_size || + it->second.metadata_size_ != metadata_size) { RAY_LOG(DEBUG) << "Object " << object_id << " size mismatch, rejecting chunk"; return; } - RAY_CHECK(it->second.chunk_info.size() > chunk_index); + RAY_CHECK(it->second.chunk_info_.size() > chunk_index); - chunk_info = it->second.chunk_info.at(chunk_index); - RAY_CHECK(data.size() == chunk_info->buffer_length) + chunk_info = it->second.chunk_info_.at(chunk_index); + RAY_CHECK(data.size() == chunk_info->buffer_length_) << "size mismatch! data size: " << data.size() - << " chunk size: " << chunk_info->buffer_length; + << " chunk size: " << chunk_info->buffer_length_; // Update the state from REFERENCED To SEALED before releasing the lock to ensure // that no other thread sees a REFERENCED state. 
- it->second.chunk_state.at(chunk_index) = CreateChunkState::SEALED; + it->second.chunk_state_.at(chunk_index) = CreateChunkState::SEALED; // Increment the number of inflight copies to ensure Abort // does not release the buffer. - it->second.num_inflight_copies++; + it->second.num_inflight_copies_++; } RAY_CHECK(chunk_info.has_value()) << "chunk_info is not set"; // The num_inflight_copies is used to ensure that another thread cannot call Release // on the object_id, which makes the unguarded copy call safe. - std::memcpy(chunk_info->data, data.data(), chunk_info->buffer_length); + std::memcpy(chunk_info->data_, data.data(), chunk_info->buffer_length_); { // Ensure the process of object_id Seal and Release is mutex guarded. @@ -165,9 +166,9 @@ void ObjectBufferPool::WriteChunk(const ObjectID &object_id, // Abort cannot be called during inflight copy operations. RAY_CHECK(it != create_buffer_state_.end()); // Decrement the number of inflight copies to ensure Abort can release the buffer. - it->second.num_inflight_copies--; - it->second.num_seals_remaining--; - if (it->second.num_seals_remaining == 0) { + it->second.num_inflight_copies_--; + it->second.num_seals_remaining_--; + if (it->second.num_seals_remaining_ == 0) { RAY_CHECK_OK(store_client_->Seal(object_id)); RAY_CHECK_OK(store_client_->Release(object_id)); create_buffer_state_.erase(it); @@ -186,7 +187,7 @@ void ObjectBufferPool::AbortCreateInternal(const ObjectID &object_id) { auto no_copy_inflight = [this, object_id]() { pool_mutex_.AssertReaderHeld(); auto it = create_buffer_state_.find(object_id); - return it == create_buffer_state_.end() || it->second.num_inflight_copies == 0; + return it == create_buffer_state_.end() || it->second.num_inflight_copies_ == 0; }; pool_mutex_.Await(absl::Condition(&no_copy_inflight)); @@ -230,8 +231,8 @@ ray::Status ObjectBufferPool::EnsureBufferExists(const ObjectID &object_id, // Buffer for object_id already exists and the size matches ours. 
{ auto it = create_buffer_state_.find(object_id); - if (it != create_buffer_state_.end() && it->second.data_size == data_size && - it->second.metadata_size == metadata_size) { + if (it != create_buffer_state_.end() && it->second.data_size_ == data_size && + it->second.metadata_size_ == metadata_size) { return ray::Status::OK(); } } @@ -258,10 +259,10 @@ ray::Status ObjectBufferPool::EnsureBufferExists(const ObjectID &object_id, { auto it = create_buffer_state_.find(object_id); if (it != create_buffer_state_.end()) { - RAY_CHECK(it->second.data_size != data_size || - it->second.metadata_size != metadata_size); + RAY_CHECK(it->second.data_size_ != data_size || + it->second.metadata_size_ != metadata_size); RAY_LOG(WARNING) << "Object " << object_id << " size (" << data_size - << ") differs from the original (" << it->second.data_size + << ") differs from the original (" << it->second.data_size_ << "). This is likely due to re-execution of a task with a " "nondeterministic output. Recreating object with size " << data_size << "."; @@ -317,7 +318,7 @@ ray::Status ObjectBufferPool::EnsureBufferExists(const ObjectID &object_id, std::forward_as_tuple(metadata_size, data_size, BuildChunks(object_id, mutable_data, data_size, data))); - RAY_CHECK(inserted.first->second.chunk_info.size() == num_chunks); + RAY_CHECK(inserted.first->second.chunk_info_.size() == num_chunks); RAY_LOG(DEBUG) << "Created object " << object_id << " in plasma store, number of chunks: " << num_chunks << ", chunk index: " << chunk_index; diff --git a/src/ray/object_manager/object_buffer_pool.h b/src/ray/object_manager/object_buffer_pool.h index 6951e7ff01fd..108af01340bc 100644 --- a/src/ray/object_manager/object_buffer_pool.h +++ b/src/ray/object_manager/object_buffer_pool.h @@ -39,18 +39,18 @@ class ObjectBufferPool { uint8_t *data, uint64_t buffer_length, std::shared_ptr buffer_ref) - : chunk_index(chunk_index), - data(data), - buffer_length(buffer_length), - buffer_ref(buffer_ref){}; + : 
chunk_index_(chunk_index), + data_(data), + buffer_length_(buffer_length), + buffer_ref_(buffer_ref){}; /// The index of this object chunk within the object, starting with 0. - uint64_t chunk_index; + uint64_t chunk_index_; /// A pointer to the start position of this object chunk. - uint8_t *data; + uint8_t *data_; /// The size of this object chunk. - uint64_t buffer_length; + uint64_t buffer_length_; /// A shared reference to the underlying buffer, keeping it alive. - std::shared_ptr buffer_ref; + std::shared_ptr buffer_ref_; }; /// Constructor. @@ -63,7 +63,8 @@ class ObjectBufferPool { ~ObjectBufferPool(); /// This object cannot be copied due to pool_mutex. - RAY_DISALLOW_COPY_AND_ASSIGN(ObjectBufferPool); + ObjectBufferPool(const ObjectBufferPool &) = delete; + ObjectBufferPool &operator=(const ObjectBufferPool &) = delete; /// Computes the number of chunks needed to transfer an object and its metadata. /// @@ -174,25 +175,25 @@ class ObjectBufferPool { CreateBufferState(uint64_t metadata_size, uint64_t data_size, std::vector chunk_info) - : metadata_size(metadata_size), - data_size(data_size), - chunk_info(chunk_info), - chunk_state(chunk_info.size(), CreateChunkState::AVAILABLE), - num_seals_remaining(chunk_info.size()) {} + : metadata_size_(metadata_size), + data_size_(data_size), + chunk_info_(chunk_info), + chunk_state_(chunk_info.size(), CreateChunkState::AVAILABLE), + num_seals_remaining_(chunk_info.size()) {} /// Total size of the object metadata. - uint64_t metadata_size; + uint64_t metadata_size_; /// Total size of the object data. - uint64_t data_size; + uint64_t data_size_; /// A vector maintaining information about the chunks which comprise /// an object. - std::vector chunk_info; + std::vector chunk_info_; /// The state of each chunk, which is used to enforce strict state /// transitions of each chunk. - std::vector chunk_state; + std::vector chunk_state_; /// The number of chunks left to seal before the buffer is sealed. 
- uint64_t num_seals_remaining; + uint64_t num_seals_remaining_; /// The number of inflight copy operations. - uint64_t num_inflight_copies = 0; + uint64_t num_inflight_copies_ = 0; }; /// Returned when GetChunk or CreateChunk fails. diff --git a/src/ray/object_manager/object_directory.h b/src/ray/object_manager/object_directory.h index 4eb636ec5e06..fa9130111ea0 100644 --- a/src/ray/object_manager/object_directory.h +++ b/src/ray/object_manager/object_directory.h @@ -24,7 +24,7 @@ #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/id.h" #include "ray/common/status.h" -#include "ray/gcs/gcs_client/gcs_client.h" +#include "ray/gcs_client/gcs_client.h" #include "ray/object_manager/common.h" namespace ray { diff --git a/src/ray/object_manager/object_manager.cc b/src/ray/object_manager/object_manager.cc index a6bdead01cfe..4c2dafed127b 100644 --- a/src/ray/object_manager/object_manager.cc +++ b/src/ray/object_manager/object_manager.cc @@ -21,7 +21,6 @@ #include #include -#include "ray/common/common_protocol.h" #include "ray/object_manager/plasma/store_runner.h" #include "ray/object_manager/spilled_object_reader.h" #include "ray/stats/metric_defs.h" @@ -67,63 +66,43 @@ ObjectManager::ObjectManager( IObjectDirectory *object_directory, RestoreSpilledObjectCallback restore_spilled_object, std::function get_spilled_object_url, - SpillObjectsCallback spill_objects_callback, - std::function object_store_full_callback, - AddObjectCallback add_object_callback, - DeleteObjectCallback delete_object_callback, std::function(const ObjectID &object_id)> pin_object, - const std::function fail_pull_request) + std::function fail_pull_request, + const std::shared_ptr &buffer_pool_store_client, + std::unique_ptr object_store_internal, + std::function( + const std::string &address, + const int port, + rpc::ClientCallManager &client_call_manager)> object_manager_client_factory, + instrumented_io_context &rpc_service) : main_service_(&main_service), 
self_node_id_(self_node_id), config_(config), gcs_client_(gcs_client), object_directory_(object_directory), - object_store_internal_(std::make_unique( - config, - spill_objects_callback, - object_store_full_callback, - /*add_object_callback=*/ - [this, add_object_callback = std::move(add_object_callback)]( - const ObjectInfo &object_info) { - main_service_->post( - [this, object_info, &add_object_callback]() { - HandleObjectAdded(object_info); - add_object_callback(object_info); - }, - "ObjectManager.ObjectAdded"); - }, - /*delete_object_callback=*/ - [this, delete_object_callback = std::move(delete_object_callback)]( - const ObjectID &object_id) { - main_service_->post( - [this, object_id, &delete_object_callback]() { - HandleObjectDeleted(object_id); - delete_object_callback(object_id); - }, - "ObjectManager.ObjectDeleted"); - })), - buffer_pool_store_client_(std::make_shared()), + object_store_internal_(std::move(object_store_internal)), + buffer_pool_store_client_(buffer_pool_store_client), buffer_pool_(buffer_pool_store_client_, config_.object_chunk_size), - rpc_work_(rpc_service_.get_executor()), + rpc_service_(rpc_service), object_manager_server_("ObjectManager", config_.object_manager_port, config_.object_manager_address == "127.0.0.1", - ClusterID::Nil(), config_.rpc_service_threads_number), client_call_manager_(main_service, /*record_stats=*/true, ClusterID::Nil(), config_.rpc_service_threads_number), - restore_spilled_object_(restore_spilled_object), + restore_spilled_object_(std::move(restore_spilled_object)), get_spilled_object_url_(std::move(get_spilled_object_url)), pull_retry_timer_(*main_service_, - boost::posix_time::milliseconds(config.timer_freq_ms)) { + boost::posix_time::milliseconds(config.timer_freq_ms)), + push_manager_(std::make_unique(/* max_chunks_in_flight= */ std::max( + static_cast(1L), + static_cast(config_.max_bytes_in_flight / + config_.object_chunk_size)))), + 
object_manager_client_factory_(std::move(object_manager_client_factory)) { RAY_CHECK_GT(config_.rpc_service_threads_number, 0); - push_manager_.reset(new PushManager(/* max_chunks_in_flight= */ std::max( - static_cast(1L), - static_cast(config_.max_bytes_in_flight / config_.object_chunk_size)))); - pull_retry_timer_.async_wait([this](const boost::system::error_code &e) { Tick(e); }); auto object_is_local = [this](const ObjectID &object_id) { @@ -175,16 +154,7 @@ bool ObjectManager::IsPlasmaObjectSpillable(const ObjectID &object_id) { return plasma::plasma_store_runner->IsPlasmaObjectSpillable(object_id); } -void ObjectManager::RunRpcService(int index) { - SetThreadName(absl::StrFormat("rpc.obj.mgr.%d", index)); - rpc_service_.run(); -} - void ObjectManager::StartRpcService() { - rpc_threads_.resize(config_.rpc_service_threads_number); - for (int i = 0; i < config_.rpc_service_threads_number; i++) { - rpc_threads_[i] = std::thread(&ObjectManager::RunRpcService, this, i); - } object_manager_server_.RegisterService( std::make_unique(rpc_service_, *this), false /* token_auth */); @@ -193,11 +163,6 @@ void ObjectManager::StartRpcService() { void ObjectManager::StopRpcService() { rpc_service_.stop(); - for (int i = 0; i < config_.rpc_service_threads_number; i++) { - if (rpc_threads_[i].joinable()) { - rpc_threads_[i].join(); - } - } object_manager_server_.Shutdown(); } @@ -401,7 +366,7 @@ void ObjectManager::PushLocalObject(const ObjectID &object_id, const NodeID &nod uint64_t metadata_size = static_cast(object_info.metadata_size); rpc::Address owner_address; - owner_address.set_raylet_id(object_info.owner_raylet_id.Binary()); + owner_address.set_node_id(object_info.owner_node_id.Binary()); owner_address.set_ip_address(object_info.owner_ip_address); owner_address.set_port(object_info.owner_port); owner_address.set_worker_id(object_info.owner_worker_id.Binary()); @@ -518,14 +483,15 @@ void ObjectManager::PushObjectInternal(const ObjectID &object_id, }); } -void 
ObjectManager::SendObjectChunk(const UniqueID &push_id, - const ObjectID &object_id, - const NodeID &node_id, - uint64_t chunk_index, - std::shared_ptr rpc_client, - std::function on_complete, - std::shared_ptr chunk_reader, - bool from_disk) { +void ObjectManager::SendObjectChunk( + const UniqueID &push_id, + const ObjectID &object_id, + const NodeID &node_id, + uint64_t chunk_index, + std::shared_ptr rpc_client, + std::function on_complete, + std::shared_ptr chunk_reader, + bool from_disk) { double start_time = absl::GetCurrentTimeNanos() / 1e9; rpc::PushRequest push_request; // Set request header @@ -673,9 +639,11 @@ void ObjectManager::FreeObjects(const std::vector &object_ids, bool local_only) { buffer_pool_.FreeObjects(object_ids); if (!local_only) { - std::vector> rpc_clients; + std::vector> rpc_clients; + // TODO(#56414): optimize this so we don't have to send a free objects request for + // every object to every node const auto &node_info_map = gcs_client_.Nodes().GetAll(); - for (const auto &[node_id, node_info] : node_info_map) { + for (const auto &[node_id, _] : node_info_map) { if (node_id == self_node_id_) { continue; } @@ -694,7 +662,7 @@ void ObjectManager::FreeObjects(const std::vector &object_ids, void ObjectManager::SpreadFreeObjectsRequest( const std::vector &object_ids, - const std::vector> &rpc_clients) { + const std::vector> &rpc_clients) { // This code path should be called from node manager. 
rpc::FreeObjectsRequest free_objects_request; for (const auto &e : object_ids) { @@ -713,7 +681,7 @@ void ObjectManager::SpreadFreeObjectsRequest( } } -std::shared_ptr ObjectManager::GetRpcClient( +std::shared_ptr ObjectManager::GetRpcClient( const NodeID &node_id) { auto it = remote_object_manager_clients_.find(node_id); if (it != remote_object_manager_clients_.end()) { @@ -724,9 +692,9 @@ std::shared_ptr ObjectManager::GetRpcClient( return nullptr; } auto object_manager_client = - std::make_shared(node_info->node_manager_address(), - node_info->object_manager_port(), - client_call_manager_); + object_manager_client_factory_(node_info->node_manager_address(), + node_info->object_manager_port(), + client_call_manager_); RAY_LOG(DEBUG) << "Get rpc client, address: " << node_info->node_manager_address() << ", port: " << node_info->object_manager_port() @@ -767,17 +735,17 @@ void ObjectManager::RecordMetrics() { push_manager_->RecordMetrics(); // used_memory_ includes the fallback allocation, so we should add it again here // to calculate the exact available memory. - stats::ObjectStoreAvailableMemory().Record( + ray_metric_object_store_available_memory_.Record( config_.object_store_memory - used_memory_ + plasma::plasma_store_runner->GetFallbackAllocated()); // Subtract fallback allocated memory. It is tracked separately by // `ObjectStoreFallbackMemory`. 
- stats::ObjectStoreUsedMemory().Record( + ray_metric_object_store_used_memory_.Record( used_memory_ - plasma::plasma_store_runner->GetFallbackAllocated()); - stats::ObjectStoreFallbackMemory().Record( + ray_metric_object_store_fallback_memory_.Record( plasma::plasma_store_runner->GetFallbackAllocated()); - stats::ObjectStoreLocalObjects().Record(local_objects_.size()); - stats::ObjectManagerPullRequests().Record(pull_manager_->NumObjectPullRequests()); + ray_metric_object_store_local_objects_.Record(local_objects_.size()); + ray_metric_object_manager_pull_requests_.Record(pull_manager_->NumObjectPullRequests()); ray::stats::STATS_object_manager_bytes.Record(num_bytes_pushed_from_plasma_, "PushedFromLocalPlasma"); @@ -828,7 +796,8 @@ void ObjectManager::Tick(const boost::system::error_code &e) { auto interval = boost::posix_time::milliseconds(config_.timer_freq_ms); pull_retry_timer_.expires_from_now(interval); - pull_retry_timer_.async_wait([this](const boost::system::error_code &e) { Tick(e); }); + pull_retry_timer_.async_wait( + [this](const boost::system::error_code &err) { Tick(err); }); } } // namespace ray diff --git a/src/ray/object_manager/object_manager.h b/src/ray/object_manager/object_manager.h index 8f20893d1d46..3a8658393a12 100644 --- a/src/ray/object_manager/object_manager.h +++ b/src/ray/object_manager/object_manager.h @@ -30,8 +30,9 @@ #include "ray/object_manager/object_directory.h" #include "ray/object_manager/pull_manager.h" #include "ray/object_manager/push_manager.h" -#include "ray/rpc/object_manager/object_manager_client.h" +#include "ray/rpc/object_manager/object_manager_client_interface.h" #include "ray/rpc/object_manager/object_manager_server.h" +#include "ray/stats/metric.h" #include "src/ray/protobuf/common.pb.h" #include "src/ray/protobuf/node_manager.pb.h" @@ -77,6 +78,7 @@ struct LocalObjectInfo { /// Information from the object store about the object. 
ObjectInfo object_info; }; + class ObjectStoreRunner { public: ObjectStoreRunner(const ObjectManagerConfig &config, @@ -107,10 +109,12 @@ class ObjectManagerInterface { virtual bool PullManagerHasPullsQueued() const = 0; virtual int64_t GetMemoryCapacity() const = 0; virtual std::string DebugString() const = 0; - virtual void FillObjectStoreStats(rpc::GetNodeStatsReply *reply) const = 0; + virtual void FillObjectStoreStats(rpc::GetNodeStatsReply *reply) const = 0; virtual double GetUsedMemoryPercentage() const = 0; virtual void Stop() = 0; virtual void RecordMetrics() = 0; + virtual void HandleObjectAdded(const ObjectInfo &object_info) = 0; + virtual void HandleObjectDeleted(const ObjectID &object_id) = 0; virtual ~ObjectManagerInterface() = default; }; @@ -163,7 +167,6 @@ class ObjectManager : public ObjectManagerInterface, return pull_manager_->NumInactivePulls(task_key); } - public: /// Takes user-defined IObjectDirectory implementation. /// When this constructor is used, the ObjectManager assumes ownership of /// the given ObjectDirectory instance. 
@@ -179,12 +182,15 @@ class ObjectManager : public ObjectManagerInterface, IObjectDirectory *object_directory, RestoreSpilledObjectCallback restore_spilled_object, std::function get_spilled_object_url, - SpillObjectsCallback spill_objects_callback, - std::function object_store_full_callback, - AddObjectCallback add_object_callback, - DeleteObjectCallback delete_object_callback, std::function(const ObjectID &object_id)> pin_object, - std::function fail_pull_request); + std::function fail_pull_request, + const std::shared_ptr &buffer_pool_store_client, + std::unique_ptr object_store_internal, + std::function( + const std::string &address, + const int port, + rpc::ClientCallManager &client_call_manager)> object_manager_client_factory, + instrumented_io_context &rpc_service); ~ObjectManager() override; @@ -267,13 +273,14 @@ class ObjectManager : public ObjectManagerInterface, private: friend class TestObjectManager; + friend uint32_t NumRemoteFreeObjectsRequests(const ObjectManager &object_manager); /// Spread the Free request to all objects managers. /// /// \param object_ids the The list of ObjectIDs to be deleted. void SpreadFreeObjectsRequest( const std::vector &object_ids, - const std::vector> &rpc_clients); + const std::vector> &rpc_clients); /// Pushing a known local object to a remote object manager. /// @@ -319,7 +326,7 @@ class ObjectManager : public ObjectManagerInterface, const ObjectID &object_id, const NodeID &node_id, uint64_t chunk_index, - std::shared_ptr rpc_client, + std::shared_ptr rpc_client, std::function on_complete, std::shared_ptr chunk_reader, bool from_disk); @@ -332,12 +339,12 @@ class ObjectManager : public ObjectManagerInterface, /// Handle an object being added to this node. This adds the object to the /// directory, pushes the object to other nodes if necessary, and cancels any /// outstanding Pull requests for the object. 
- void HandleObjectAdded(const ObjectInfo &object_info); + void HandleObjectAdded(const ObjectInfo &object_info) override; /// Handle an object being deleted from this node. This registers object remove /// with directory. This also asks the pull manager to fetch this object again /// as soon as possible. - void HandleObjectDeleted(const ObjectID &object_id); + void HandleObjectDeleted(const ObjectID &object_id) override; /// This is used to notify the main thread that the sending of a chunk has /// completed. @@ -397,7 +404,7 @@ class ObjectManager : public ObjectManagerInterface, /// Get the rpc client according to the node ID /// /// \param node_id Remote node id, will send rpc request to it - std::shared_ptr GetRpcClient(const NodeID &node_id); + std::shared_ptr GetRpcClient(const NodeID &node_id); /// Weak reference to main service. We ensure this object is destroyed before /// main_service_ is stopped. @@ -417,20 +424,13 @@ class ObjectManager : public ObjectManagerInterface, /// Used by the buffer pool to read and write objects in the local store /// during object transfers. - std::shared_ptr buffer_pool_store_client_; + std::shared_ptr buffer_pool_store_client_; /// Manages accesses to local objects for object transfers. ObjectBufferPool buffer_pool_; /// Multi-thread asio service, deal with all outgoing and incoming RPC request. - instrumented_io_context rpc_service_; - - /// Keep rpc service running when no task in rpc service. - boost::asio::executor_work_guard rpc_work_; - - /// The thread pool used for running `rpc_service`. - /// Data copy operations during request are done in this thread pool. - std::vector rpc_threads_; + instrumented_io_context &rpc_service_; /// Mapping from locally available objects to information about those objects /// including when the object was last pushed to other object managers. 
@@ -455,7 +455,7 @@ class ObjectManager : public ObjectManagerInterface, rpc::ClientCallManager client_call_manager_; /// Client id - object manager gRPC client. - absl::flat_hash_map> + absl::flat_hash_map> remote_object_manager_clients_; /// Callback to trigger direct restoration of an object. @@ -474,6 +474,13 @@ class ObjectManager : public ObjectManagerInterface, /// Object pull manager. std::unique_ptr pull_manager_; + /// Factory function to create object manager client. + std::function( + const std::string &address, + const int port, + rpc::ClientCallManager &client_call_manager)> + object_manager_client_factory_; + /// Running sum of the amount of memory used in the object store. int64_t used_memory_ = 0; @@ -497,6 +504,32 @@ class ObjectManager : public ObjectManagerInterface, /// create the object in plasma. This is usually due to out-of-memory in /// plasma. size_t num_chunks_received_failed_due_to_plasma_ = 0; + + /// Metrics + ray::stats::Gauge ray_metric_object_store_available_memory_{ + /*name=*/"object_store_available_memory", + /*description=*/"Amount of memory currently available in the object store.", + /*unit=*/"bytes"}; + + ray::stats::Gauge ray_metric_object_store_used_memory_{ + /*name=*/"object_store_used_memory", + /*description=*/"Amount of memory currently occupied in the object store.", + /*unit=*/"bytes"}; + + ray::stats::Gauge ray_metric_object_store_fallback_memory_{ + /*name=*/"object_store_fallback_memory", + /*description=*/"Amount of memory in fallback allocations in the filesystem.", + /*unit=*/"bytes"}; + + ray::stats::Gauge ray_metric_object_store_local_objects_{ + /*name=*/"object_store_num_local_objects", + /*description=*/"Number of objects currently in the object store.", + /*unit=*/"objects"}; + + ray::stats::Gauge ray_metric_object_manager_pull_requests_{ + /*name=*/"object_manager_num_pull_requests", + /*description=*/"Number of active pull requests for objects.", + /*unit=*/"requests"}; }; } // namespace ray diff --git 
a/src/ray/object_manager/ownership_object_directory.cc b/src/ray/object_manager/ownership_object_directory.cc index bc17c10c8396..431bd14095a2 100644 --- a/src/ray/object_manager/ownership_object_directory.cc +++ b/src/ray/object_manager/ownership_object_directory.cc @@ -104,7 +104,7 @@ bool UpdateObjectLocations(const rpc::WorkerObjectLocationsPubMessage &location_ rpc::Address GetOwnerAddressFromObjectInfo(const ObjectInfo &object_info) { rpc::Address owner_address; - owner_address.set_raylet_id(object_info.owner_raylet_id.Binary()); + owner_address.set_node_id(object_info.owner_node_id.Binary()); owner_address.set_ip_address(object_info.owner_ip_address); owner_address.set_port(object_info.owner_port); owner_address.set_worker_id(object_info.owner_worker_id.Binary()); @@ -246,7 +246,7 @@ void OwnershipBasedObjectDirectory::SendObjectLocationUpdateBatchIfNeeded( in_flight_requests_.emplace(worker_id); auto owner_client = GetClient(owner_address); owner_client->UpdateObjectLocationBatch( - request, + std::move(request), [this, worker_id, node_id, owner_address]( const Status &status, const rpc::UpdateObjectLocationBatchReply &reply) { RAY_CHECK(in_flight_requests_.erase(worker_id) > 0); @@ -345,39 +345,39 @@ ray::Status OwnershipBasedObjectDirectory::SubscribeObjectLocations( auto failure_callback = [this, owner_address](const std::string &object_id_binary, const Status &status) { - const auto object_id = ObjectID::FromBinary(object_id_binary); - rpc::WorkerObjectLocationsPubMessage location_info; + const auto obj_id = ObjectID::FromBinary(object_id_binary); if (!status.ok()) { - RAY_LOG(INFO).WithField(object_id) + RAY_LOG(INFO).WithField(obj_id) << "Failed to get the location: " << status.ToString(); - mark_as_failed_(object_id, rpc::ErrorType::OWNER_DIED); + mark_as_failed_(obj_id, rpc::ErrorType::OWNER_DIED); } else { // Owner is still alive but published a failure because the ref was // deleted. 
- RAY_LOG(INFO).WithField(object_id) + RAY_LOG(INFO).WithField(obj_id) << "Failed to get the location for object, already released by distributed " "reference counting protocol"; - mark_as_failed_(object_id, rpc::ErrorType::OBJECT_DELETED); + mark_as_failed_(obj_id, rpc::ErrorType::OBJECT_DELETED); } // Location lookup can fail if the owner is reachable but no longer has a // record of this ObjectRef, most likely due to an issue with the // distributed reference counting protocol. - ObjectLocationSubscriptionCallback(location_info, - object_id, - /*location_lookup_failed*/ true); + ObjectLocationSubscriptionCallback( + /*location_info=*/rpc::WorkerObjectLocationsPubMessage{}, + obj_id, + /*location_lookup_failed*/ true); }; auto sub_message = std::make_unique(); sub_message->mutable_worker_object_locations_message()->Swap(request.get()); - RAY_CHECK(object_location_subscriber_->Subscribe( + object_location_subscriber_->Subscribe( std::move(sub_message), rpc::ChannelType::WORKER_OBJECT_LOCATIONS_CHANNEL, owner_address, object_id.Binary(), /*subscribe_done_callback=*/nullptr, /*Success callback=*/msg_published_callback, - /*Failure callback=*/failure_callback)); + /*Failure callback=*/failure_callback); auto location_state = LocationListenerState(); location_state.owner_address = owner_address; @@ -472,34 +472,34 @@ void OwnershipBasedObjectDirectory::HandleNodeRemoved(const NodeID &node_id) { } void OwnershipBasedObjectDirectory::RecordMetrics(uint64_t duration_ms) { - stats::ObjectDirectoryLocationSubscriptions.Record(listeners_.size()); + ray_metric_object_directory_location_subscriptions_.Record(listeners_.size()); // Record number of object location updates per second. 
metrics_num_object_location_updates_per_second_ = static_cast(metrics_num_object_location_updates_) * (1000.0 / static_cast(duration_ms)); - stats::ObjectDirectoryLocationUpdates.Record( + ray_metric_object_directory_location_updates_.Record( metrics_num_object_location_updates_per_second_); metrics_num_object_location_updates_ = 0; // Record number of object location lookups per second. metrics_num_object_location_lookups_per_second_ = static_cast(metrics_num_object_location_lookups_) * (1000.0 / static_cast(duration_ms)); - stats::ObjectDirectoryLocationLookups.Record( + ray_metric_object_directory_location_lookups_.Record( metrics_num_object_location_lookups_per_second_); metrics_num_object_location_lookups_ = 0; // Record number of object locations added per second. metrics_num_object_locations_added_per_second_ = static_cast(metrics_num_object_locations_added_) * (1000.0 / static_cast(duration_ms)); - stats::ObjectDirectoryAddedLocations.Record( + ray_metric_object_directory_location_added_.Record( metrics_num_object_locations_added_per_second_); metrics_num_object_locations_added_ = 0; // Record number of object locations removed per second. 
metrics_num_object_locations_removed_per_second_ = static_cast(metrics_num_object_locations_removed_) * (1000.0 / static_cast(duration_ms)); - stats::ObjectDirectoryRemovedLocations.Record( + ray_metric_object_directory_location_removed_.Record( metrics_num_object_locations_removed_per_second_); metrics_num_object_locations_removed_ = 0; } diff --git a/src/ray/object_manager/ownership_object_directory.h b/src/ray/object_manager/ownership_object_directory.h index 556544ff109d..c1da711673cc 100644 --- a/src/ray/object_manager/ownership_object_directory.h +++ b/src/ray/object_manager/ownership_object_directory.h @@ -23,11 +23,12 @@ #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/id.h" #include "ray/common/status.h" -#include "ray/gcs/gcs_client/gcs_client.h" +#include "ray/gcs_client/gcs_client.h" #include "ray/object_manager/object_directory.h" #include "ray/pubsub/subscriber.h" #include "ray/rpc/worker/core_worker_client.h" #include "ray/rpc/worker/core_worker_client_pool.h" +#include "ray/stats/metric.h" namespace ray { @@ -174,6 +175,45 @@ class OwnershipBasedObjectDirectory : public IObjectDirectory { uint64_t cum_metrics_num_object_location_updates_ = 0; + /// Ray metrics + ray::stats::Gauge ray_metric_object_directory_location_subscriptions_{ + /*name=*/"object_directory_subscriptions", + /*description=*/ + "Number of object location subscriptions. If this is high, the raylet is " + "attempting " + "to pull a lot of objects.", + /*unit=*/"subscriptions"}; + + ray::stats::Gauge ray_metric_object_directory_location_updates_{ + /*name=*/"object_directory_updates", + /*description=*/ + "Number of object location updates per second. If this is high, the raylet is " + "attempting to pull a lot of objects and/or the locations for objects are " + "frequently " + "changing (e.g. 
due to many object copies or evictions).", + /*unit=*/"updates"}; + + ray::stats::Gauge ray_metric_object_directory_location_lookups_{ + /*name=*/"object_directory_lookups", + /*description=*/ + "Number of object location lookups per second. If this is high, the raylet is " + "waiting on a lot of objects.", + /*unit=*/"lookups"}; + + ray::stats::Gauge ray_metric_object_directory_location_added_{ + /*name=*/"object_directory_added_locations", + /*description=*/ + "Number of object locations added per second. If this is high, a lot of objects " + "have been added on this node.", + /*unit=*/"additions"}; + + ray::stats::Gauge ray_metric_object_directory_location_removed_{ + /*name=*/"object_directory_removed_locations", + /*description=*/ + "Number of object locations removed per second. If this is high, a lot of objects " + "have been removed from this node.", + /*unit=*/"removals"}; + friend class OwnershipBasedObjectDirectoryTest; }; diff --git a/src/ray/object_manager/plasma/BUILD.bazel b/src/ray/object_manager/plasma/BUILD.bazel index 29aea9b47cd6..ea88fcbe1501 100644 --- a/src/ray/object_manager/plasma/BUILD.bazel +++ b/src/ray/object_manager/plasma/BUILD.bazel @@ -44,7 +44,6 @@ ray_cc_library( "//src/ray/common:status_or", "//src/ray/object_manager:object_manager_common", "//src/ray/protobuf:common_cc_proto", - "//src/ray/util", "//src/ray/util:compat", "//src/ray/util:visibility", "@com_google_absl//absl/container:flat_hash_map", @@ -52,6 +51,19 @@ ray_cc_library( ], ) +ray_cc_library( + name = "plasma_client_interface", + hdrs = ["client.h"], + deps = [ + "//src/ray/common:buffer", + "//src/ray/common:id", + "//src/ray/common:status", + "//src/ray/object_manager:object_manager_common", + "//src/ray/protobuf:common_cc_proto", + "@com_google_absl//absl/container:flat_hash_map", + ], +) + ray_cc_library( name = "plasma_shared_memory", srcs = ["shared_memory.cc"], @@ -108,7 +120,7 @@ ray_cc_library( "//src/ray/ipc:client_connection", 
"//src/ray/object_manager:object_manager_common", "//src/ray/stats:stats_metric", - "//src/ray/util", + "//src/ray/util:network_util", "@boost//:bind", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_set", @@ -243,7 +255,6 @@ ray_cc_library( ray_cc_library( name = "object_manager_plasma_common", - srcs = ["plasma.cc"], hdrs = [ "common.h", "plasma.h", @@ -253,8 +264,6 @@ ray_cc_library( "//src/ray/common:id", "//src/ray/object_manager:object_manager_common", "//src/ray/util:compat", - "//src/ray/util:macros", - "@boost//:asio", "@com_google_googletest//:gtest_prod", ], ) @@ -286,6 +295,7 @@ ray_cc_library( "//src/ray/protobuf:common_cc_proto", "//src/ray/util:compat", "//src/ray/util:logging", + "//src/ray/util:process", "@com_github_google_flatbuffers//:flatbuffers", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", diff --git a/src/ray/object_manager/plasma/client.h b/src/ray/object_manager/plasma/client.h index dcf4bdfef3f1..befba9e60f58 100644 --- a/src/ray/object_manager/plasma/client.h +++ b/src/ray/object_manager/plasma/client.h @@ -229,6 +229,11 @@ class PlasmaClientInterface { /// \param object_ids The list of IDs of the objects to delete. /// \return The return status. If all the objects are non-existent, return OK. virtual Status Delete(const std::vector &object_ids) = 0; + + /// Get the current debug string from the plasma store server. + /// + /// \return the debug string if successful, otherwise return an error status. + virtual StatusOr GetMemoryUsage() = 0; }; class PlasmaClient : public PlasmaClientInterface { @@ -282,7 +287,7 @@ class PlasmaClient : public PlasmaClientInterface { /// Get the current debug string from the plasma store server. /// /// \return the debug string if successful, otherwise return an error status. - StatusOr GetMemoryUsage(); + StatusOr GetMemoryUsage() override; /// Get the memory capacity of the store. 
/// diff --git a/src/ray/object_manager/plasma/common.h b/src/ray/object_manager/plasma/common.h index a9f2128a01e2..669aad1b7c3c 100644 --- a/src/ray/object_manager/plasma/common.h +++ b/src/ray/object_manager/plasma/common.h @@ -18,11 +18,8 @@ #pragma once #include -#include -#include -#include -#include +#include #include #include "ray/common/id.h" @@ -30,7 +27,6 @@ #include "ray/object_manager/plasma/plasma.h" #include "ray/object_manager/plasma/plasma_generated.h" #include "ray/util/compat.h" -#include "ray/util/macros.h" namespace plasma { @@ -58,22 +54,23 @@ inline constexpr std::string_view kCorruptedRequestErrorMessage = // Represents a chunk of allocated memory. struct Allocation { /// Pointer to the allocated memory. - void *address; + void *address_; /// Num bytes of the allocated memory. - int64_t size; + int64_t size_; /// The file descriptor of the memory mapped file where the memory allocated. - MEMFD_TYPE fd; + MEMFD_TYPE fd_; /// The offset in bytes in the memory mapped file of the allocated memory. - ptrdiff_t offset; + ptrdiff_t offset_; /// Device number of the allocated memory. - int device_num; + int device_num_; /// the total size of this mapped memory. - int64_t mmap_size; + int64_t mmap_size_; /// if it was fallback allocated. - bool fallback_allocated; + bool fallback_allocated_; // only allow moves. 
- RAY_DISALLOW_COPY_AND_ASSIGN(Allocation); + Allocation(const Allocation &) = delete; + Allocation &operator=(const Allocation &) = delete; Allocation(Allocation &&) noexcept = default; Allocation &operator=(Allocation &&) noexcept = default; @@ -86,23 +83,23 @@ struct Allocation { int device_num, int64_t mmap_size, bool fallback_allocated) - : address(address), - size(size), - fd(std::move(fd)), - offset(offset), - device_num(device_num), - mmap_size(mmap_size), - fallback_allocated(fallback_allocated) {} + : address_(address), + size_(size), + fd_(std::move(fd)), + offset_(offset), + device_num_(device_num), + mmap_size_(mmap_size), + fallback_allocated_(fallback_allocated) {} // Test only Allocation() - : address(nullptr), - size(0), - fd(), - offset(0), - device_num(0), - mmap_size(0), - fallback_allocated(false) {} + : address_(nullptr), + size_(0), + fd_(), + offset_(0), + device_num_(0), + mmap_size_(0), + fallback_allocated_(false) {} friend class PlasmaAllocator; friend class DummyAllocator; @@ -116,25 +113,27 @@ struct Allocation { /// the eviction policy. 
class LocalObject { public: - explicit LocalObject(Allocation allocation); + explicit LocalObject(Allocation allocation) + : allocation_(std::move(allocation)), ref_count_(0) {} - RAY_DISALLOW_COPY_AND_ASSIGN(LocalObject); + LocalObject(const LocalObject &) = delete; + LocalObject &operator=(const LocalObject &) = delete; - int64_t GetObjectSize() const { return object_info.GetObjectSize(); } + int64_t GetObjectSize() const { return object_info_.GetObjectSize(); } - bool Sealed() const { return state == ObjectState::PLASMA_SEALED; } + bool Sealed() const { return state_ == ObjectState::PLASMA_SEALED; } - int32_t GetRefCount() const { return ref_count; } + int32_t GetRefCount() const { return ref_count_; } - const ray::ObjectInfo &GetObjectInfo() const { return object_info; } + const ray::ObjectInfo &GetObjectInfo() const { return object_info_; } - const Allocation &GetAllocation() const { return allocation; } + const Allocation &GetAllocation() const { return allocation_; } - const plasma::flatbuf::ObjectSource &GetSource() const { return source; } + const plasma::flatbuf::ObjectSource &GetSource() const { return source_; } ray::PlasmaObjectHeader *GetPlasmaObjectHeader() const { - RAY_CHECK(object_info.is_mutable) << "Object is not mutable"; - auto header_ptr = static_cast(allocation.address); + RAY_CHECK(object_info_.is_mutable) << "Object is not mutable"; + auto header_ptr = static_cast(allocation_.address_); return reinterpret_cast(header_ptr); } @@ -143,11 +142,11 @@ class LocalObject { if (check_sealed) { RAY_DCHECK(Sealed()); } - object->store_fd = GetAllocation().fd; - object->header_offset = GetAllocation().offset; - object->data_offset = GetAllocation().offset; - object->metadata_offset = GetAllocation().offset + GetObjectInfo().data_size; - if (object_info.is_mutable) { + object->store_fd = GetAllocation().fd_; + object->header_offset = GetAllocation().offset_; + object->data_offset = GetAllocation().offset_; + object->metadata_offset = 
GetAllocation().offset_ + GetObjectInfo().data_size; + if (object_info_.is_mutable) { object->data_offset += sizeof(ray::PlasmaObjectHeader); object->metadata_offset += sizeof(ray::PlasmaObjectHeader); }; @@ -157,10 +156,10 @@ class LocalObject { // sizes locally depending on what data is written to the channel, but the // plasma store keeps the original data and metadata size. object->allocated_size = object->data_size + object->metadata_size; - object->device_num = GetAllocation().device_num; - object->mmap_size = GetAllocation().mmap_size; - object->fallback_allocated = GetAllocation().fallback_allocated; - object->is_experimental_mutable_object = object_info.is_mutable; + object->device_num = GetAllocation().device_num_; + object->mmap_size = GetAllocation().mmap_size_; + object->fallback_allocated = GetAllocation().fallback_allocated_; + object->is_experimental_mutable_object = object_info_.is_mutable; } private: @@ -174,19 +173,19 @@ class LocalObject { friend struct GetRequestQueueTest; /// Allocation Info; - Allocation allocation; + Allocation allocation_; /// Ray object info; - ray::ObjectInfo object_info; + ray::ObjectInfo object_info_; /// Number of clients currently using this object. /// TODO: ref_count probably shouldn't belong to LocalObject. - mutable int32_t ref_count; + mutable int32_t ref_count_; /// Unix epoch of when this object was created. - int64_t create_time; + int64_t create_time_; /// How long creation of this object took. - int64_t construct_duration; + int64_t construct_duration_; /// The state of the object, e.g., whether it is open or sealed. - ObjectState state; + ObjectState state_; /// The source of the object. Used for debugging purposes. 
- plasma::flatbuf::ObjectSource source; + plasma::flatbuf::ObjectSource source_; }; } // namespace plasma diff --git a/src/ray/object_manager/plasma/connection.cc b/src/ray/object_manager/plasma/connection.cc index b8c7549af91b..6ba8a102488c 100644 --- a/src/ray/object_manager/plasma/connection.cc +++ b/src/ray/object_manager/plasma/connection.cc @@ -25,6 +25,7 @@ #include "ray/object_manager/plasma/plasma_generated.h" #include "ray/object_manager/plasma/protocol.h" #include "ray/util/logging.h" +#include "ray/util/process.h" namespace plasma { diff --git a/src/ray/object_manager/plasma/create_request_queue.cc b/src/ray/object_manager/plasma/create_request_queue.cc index 1ed969efa69d..9a9c0966a9c2 100644 --- a/src/ray/object_manager/plasma/create_request_queue.cc +++ b/src/ray/object_manager/plasma/create_request_queue.cc @@ -22,7 +22,6 @@ #include "ray/common/asio/instrumented_io_context.h" #include "ray/object_manager/plasma/common.h" -#include "ray/util/util.h" namespace plasma { @@ -55,8 +54,8 @@ bool CreateRequestQueue::GetRequestResult(uint64_t req_id, return false; } - *result = it->second->result; - *error = it->second->error; + *result = it->second->result_; + *error = it->second->error_; fulfilled_requests_.erase(it); return true; } @@ -75,8 +74,8 @@ std::pair CreateRequestQueue::TryRequestImmediately( Status CreateRequestQueue::ProcessRequest(bool fallback_allocator, std::unique_ptr &request) { - request->error = request->create_callback(fallback_allocator, &request->result); - if (request->error == PlasmaError::OutOfMemory) { + request->error_ = request->create_callback_(fallback_allocator, &request->result_); + if (request->error_ == PlasmaError::OutOfMemory) { return Status::ObjectStoreFull(""); } else { return Status::OK(); @@ -92,10 +91,11 @@ Status CreateRequestQueue::ProcessRequests() { // if allocation failed due to OOM, and fs_monitor_ indicates the local disk is full, // we should failed the request with out of disk error - if 
((*request_it)->error == PlasmaError::OutOfMemory && fs_monitor_.OverCapacity()) { - (*request_it)->error = PlasmaError::OutOfDisk; - RAY_LOG(INFO) << "Out-of-disk: Failed to create object " << (*request_it)->object_id - << " of size " << (*request_it)->object_size / 1024 / 1024 << "MB\n"; + if ((*request_it)->error_ == PlasmaError::OutOfMemory && fs_monitor_.OverCapacity()) { + (*request_it)->error_ = PlasmaError::OutOfDisk; + RAY_LOG(INFO) << "Out-of-disk: Failed to create object " + << (*request_it)->object_id_ << " of size " + << (*request_it)->object_size_ / 1024 / 1024 << "MB\n"; FinishRequest(request_it); return Status::OutOfDisk("System running out of disk."); } @@ -132,15 +132,15 @@ Status CreateRequestQueue::ProcessRequests() { if (!status.ok()) { // This only happens when an allocation is bigger than available disk space. // We should throw OutOfDisk Error here. - (*request_it)->error = PlasmaError::OutOfDisk; + (*request_it)->error_ = PlasmaError::OutOfDisk; std::string dump = ""; if (dump_debug_info_callback_ && !logged_oom) { dump = dump_debug_info_callback_(); logged_oom = true; } RAY_LOG(INFO) << "Out-of-disk: Failed to create object " - << (*request_it)->object_id << " of size " - << (*request_it)->object_size / 1024 / 1024 << "MB\n" + << (*request_it)->object_id_ << " of size " + << (*request_it)->object_size_ / 1024 / 1024 << "MB\n" << dump; } FinishRequest(request_it); @@ -154,22 +154,22 @@ void CreateRequestQueue::FinishRequest( std::list>::iterator request_it) { // Fulfill the request. 
auto &request = *request_it; - auto it = fulfilled_requests_.find(request->request_id); + auto it = fulfilled_requests_.find(request->request_id_); RAY_CHECK(it != fulfilled_requests_.end()); RAY_CHECK(it->second == nullptr); it->second = std::move(request); - RAY_CHECK(num_bytes_pending_ >= it->second->object_size); - num_bytes_pending_ -= it->second->object_size; + RAY_CHECK(num_bytes_pending_ >= it->second->object_size_); + num_bytes_pending_ -= it->second->object_size_; queue_.erase(request_it); } void CreateRequestQueue::RemoveDisconnectedClientRequests( const std::shared_ptr &client) { for (auto it = queue_.begin(); it != queue_.end();) { - if ((*it)->client == client) { - fulfilled_requests_.erase((*it)->request_id); - RAY_CHECK(num_bytes_pending_ >= (*it)->object_size); - num_bytes_pending_ -= (*it)->object_size; + if ((*it)->client_ == client) { + fulfilled_requests_.erase((*it)->request_id_); + RAY_CHECK(num_bytes_pending_ >= (*it)->object_size_); + num_bytes_pending_ -= (*it)->object_size_; it = queue_.erase(it); } else { it++; @@ -177,7 +177,7 @@ void CreateRequestQueue::RemoveDisconnectedClientRequests( } for (auto it = fulfilled_requests_.begin(); it != fulfilled_requests_.end();) { - if (it->second && it->second->client == client) { + if (it->second && it->second->client_ == client) { fulfilled_requests_.erase(it++); } else { it++; diff --git a/src/ray/object_manager/plasma/create_request_queue.h b/src/ray/object_manager/plasma/create_request_queue.h index 607443e67ba7..80ca6092d54a 100644 --- a/src/ray/object_manager/plasma/create_request_queue.h +++ b/src/ray/object_manager/plasma/create_request_queue.h @@ -126,32 +126,32 @@ class CreateRequestQueue { const std::shared_ptr &client, CreateObjectCallback create_callback, size_t object_size) - : object_id(object_id), - request_id(request_id), - client(client), - create_callback(create_callback), - object_size(object_size) {} + : object_id_(object_id), + request_id_(request_id), + client_(client), + 
create_callback_(create_callback), + object_size_(object_size) {} // The ObjectID to create. - const ObjectID object_id; + const ObjectID object_id_; // A request ID that can be returned to the caller to get the result once // ready. - const uint64_t request_id; + const uint64_t request_id_; // A pointer to the client, used as a key to delete requests that were made // by a client that is now disconnected. - const std::shared_ptr client; + const std::shared_ptr client_; // A callback to attempt to create the object. - const CreateObjectCallback create_callback; + const CreateObjectCallback create_callback_; - const size_t object_size; + const size_t object_size_; // The results of the creation call. These should be sent back to the // client once ready. - PlasmaError error = PlasmaError::OK; - PlasmaObject result = {}; + PlasmaError error_ = PlasmaError::OK; + PlasmaObject result_ = {}; }; /// Process a single request. Sets the request's error result to the error diff --git a/src/ray/object_manager/plasma/get_request_queue.cc b/src/ray/object_manager/plasma/get_request_queue.cc index 286440f3778b..097c71061f68 100644 --- a/src/ray/object_manager/plasma/get_request_queue.cc +++ b/src/ray/object_manager/plasma/get_request_queue.cc @@ -23,11 +23,11 @@ GetRequest::GetRequest(instrumented_io_context &io_context, const std::shared_ptr &client, const std::vector &object_ids, int64_t num_unique_objects_to_wait_for) - : client(client), - object_ids(object_ids.begin(), object_ids.end()), - objects(object_ids.size()), - num_unique_objects_to_wait_for(num_unique_objects_to_wait_for), - num_unique_objects_satisfied(0), + : client_(client), + object_ids_(object_ids.begin(), object_ids.end()), + objects_(object_ids.size()), + num_unique_objects_to_wait_for_(num_unique_objects_to_wait_for), + num_unique_objects_satisfied_(0), timer_(io_context) {} void GetRequest::AsyncWait( @@ -64,20 +64,20 @@ void GetRequestQueue::AddRequest(const std::shared_ptr &client, auto entry = 
object_lifecycle_mgr_.GetObject(object_id); if (entry != nullptr && entry->Sealed()) { // Update the get request to take into account the present object. - auto *plasma_object = &get_request->objects[object_id]; + auto *plasma_object = &get_request->objects_[object_id]; entry->ToPlasmaObject(plasma_object, /* checksealed */ true); - get_request->num_unique_objects_satisfied += 1; + get_request->num_unique_objects_satisfied_ += 1; std::optional fallback_allocated_fd = std::nullopt; - if (entry->GetAllocation().fallback_allocated) { - fallback_allocated_fd = entry->GetAllocation().fd; + if (entry->GetAllocation().fallback_allocated_) { + fallback_allocated_fd = entry->GetAllocation().fd_; } object_satisfied_callback_(object_id, fallback_allocated_fd, get_request); } else { // Add a placeholder plasma object to the get request to indicate that the // object is not present. This will be parsed by the client. We set the // data size to -1 to indicate that the object is not present. - get_request->objects[object_id].data_size = -1; + get_request->objects_[object_id].data_size = -1; // Add the get request to the relevant data structures. object_get_requests_[object_id].push_back(get_request); } @@ -85,8 +85,8 @@ void GetRequestQueue::AddRequest(const std::shared_ptr &client, // If all of the objects are present already or if the timeout is 0, return to // the client. 
- if (get_request->num_unique_objects_satisfied == - get_request->num_unique_objects_to_wait_for || + if (get_request->num_unique_objects_satisfied_ == + get_request->num_unique_objects_to_wait_for_ || timeout_ms == 0) { OnGetRequestCompleted(get_request); } else if (timeout_ms != -1) { @@ -108,7 +108,7 @@ void GetRequestQueue::RemoveGetRequestsForClient( absl::flat_hash_set> get_requests_to_remove; for (auto const &pair : object_get_requests_) { for (const auto &get_request : pair.second) { - if (get_request->client == client) { + if (get_request->client_ == client) { get_requests_to_remove.insert(get_request); } } @@ -133,7 +133,7 @@ void GetRequestQueue::RemoveGetRequest(const std::shared_ptr &get_re // Remove the get request from each of the relevant object_get_requests hash // tables if it is present there. It should only be present there if the get // request timed out or if it was issued by a client that has disconnected. - for (const auto &object_id : get_request->object_ids) { + for (const auto &object_id : get_request->object_ids_) { auto object_request_iter = object_get_requests_.find(object_id); if (object_request_iter != object_get_requests_.end()) { auto &get_requests = object_request_iter->second; @@ -170,18 +170,18 @@ void GetRequestQueue::MarkObjectSealed(const ObjectID &object_id) { auto get_request = get_requests[index]; auto entry = object_lifecycle_mgr_.GetObject(object_id); RAY_CHECK(entry != nullptr); - auto *plasma_object = &get_request->objects[object_id]; + auto *plasma_object = &get_request->objects_[object_id]; entry->ToPlasmaObject(plasma_object, /* check sealed */ true); - get_request->num_unique_objects_satisfied += 1; + get_request->num_unique_objects_satisfied_ += 1; std::optional fallback_allocated_fd = std::nullopt; - if (entry->GetAllocation().fallback_allocated) { - fallback_allocated_fd = entry->GetAllocation().fd; + if (entry->GetAllocation().fallback_allocated_) { + fallback_allocated_fd = entry->GetAllocation().fd_; } 
object_satisfied_callback_(object_id, fallback_allocated_fd, get_request); // If this get request is done, reply to the client. - if (get_request->num_unique_objects_satisfied == - get_request->num_unique_objects_to_wait_for) { + if (get_request->num_unique_objects_satisfied_ == + get_request->num_unique_objects_to_wait_for_) { OnGetRequestCompleted(get_request); } else { // The call to ReturnFromGet will remove the current element in the diff --git a/src/ray/object_manager/plasma/get_request_queue.h b/src/ray/object_manager/plasma/get_request_queue.h index 96387999d2d8..c9585f279a62 100644 --- a/src/ray/object_manager/plasma/get_request_queue.h +++ b/src/ray/object_manager/plasma/get_request_queue.h @@ -37,17 +37,17 @@ struct GetRequest { const std::vector &object_ids, int64_t num_unique_objects_to_wait_for); /// The client that called get. - std::shared_ptr client; + std::shared_ptr client_; /// The object IDs involved in this request. This is used in the reply. - std::vector object_ids; + std::vector object_ids_; /// The object information for the objects in this request. This is used in /// the reply. - absl::flat_hash_map objects; + absl::flat_hash_map objects_; /// The minimum number of objects to wait for in this request. - const int64_t num_unique_objects_to_wait_for; + const int64_t num_unique_objects_to_wait_for_; /// The number of object requests in this wait request that are already /// satisfied. 
- int64_t num_unique_objects_satisfied; + int64_t num_unique_objects_satisfied_; void AsyncWait(int64_t timeout_ms, std::function on_timeout); diff --git a/src/ray/object_manager/plasma/obj_lifecycle_mgr.cc b/src/ray/object_manager/plasma/obj_lifecycle_mgr.cc index c49eec1e473d..66b426365265 100644 --- a/src/ray/object_manager/plasma/obj_lifecycle_mgr.cc +++ b/src/ray/object_manager/plasma/obj_lifecycle_mgr.cc @@ -76,12 +76,12 @@ flatbuf::PlasmaError ObjectLifecycleManager::AbortObject(const ObjectID &object_ RAY_LOG(ERROR) << "To abort an object it must be in the object table."; return PlasmaError::ObjectNonexistent; } - if (entry->state == ObjectState::PLASMA_SEALED) { + if (entry->state_ == ObjectState::PLASMA_SEALED) { RAY_LOG(ERROR) << "To abort an object it must not have been sealed."; return PlasmaError::ObjectSealed; } - bool abort_while_using = entry->ref_count > 0; + bool abort_while_using = entry->ref_count_ > 0; DeleteObjectInternal(object_id); if (abort_while_using) { @@ -98,7 +98,7 @@ PlasmaError ObjectLifecycleManager::DeleteObject(const ObjectID &object_id) { } // TODO(scv119): should we delete unsealed with ref_count 0? - if (entry->state != ObjectState::PLASMA_SEALED) { + if (entry->state_ != ObjectState::PLASMA_SEALED) { // To delete an object it must have been sealed, // otherwise there might be memeory corruption. // Put it into deletion cache, it will be deleted later. @@ -106,7 +106,7 @@ PlasmaError ObjectLifecycleManager::DeleteObject(const ObjectID &object_id) { return PlasmaError::ObjectNotSealed; } - if (entry->ref_count != 0) { + if (entry->ref_count_ != 0) { // To delete an object, there must be no clients currently using it. // Put it into deletion cache, it will be deleted later. earger_deletion_objects_.emplace(object_id); @@ -133,12 +133,12 @@ bool ObjectLifecycleManager::AddReference(const ObjectID &object_id) { } // If there are no other clients using this object, notify the eviction policy // that the object is being used. 
- if (entry->ref_count == 0) { + if (entry->ref_count_ == 0) { // Tell the eviction policy that this object is being used. eviction_policy_->BeginObjectAccess(object_id); } // Increase reference count. - entry->ref_count++; + entry->ref_count_++; stats_collector_->OnObjectRefIncreased(*entry); RAY_LOG(DEBUG) << "Object " << object_id << " reference has incremented" << ", num bytes in use is now " << GetNumBytesInUse(); @@ -147,17 +147,17 @@ bool ObjectLifecycleManager::AddReference(const ObjectID &object_id) { bool ObjectLifecycleManager::RemoveReference(const ObjectID &object_id) { auto entry = object_store_->GetObject(object_id); - if (!entry || entry->ref_count == 0) { + if (!entry || entry->ref_count_ == 0) { RAY_LOG(ERROR) << object_id << " doesn't exist, or its ref count is already 0, remove reference failed."; return false; } - entry->ref_count--; + entry->ref_count_--; stats_collector_->OnObjectRefDecreased(*entry); - if (entry->ref_count > 0) { + if (entry->ref_count_ > 0) { return true; } @@ -231,9 +231,9 @@ void ObjectLifecycleManager::EvictObjects(const std::vector &object_id // error. Maybe we should also support deleting objects that have been // created but not sealed. 
RAY_CHECK(entry != nullptr) << "To evict an object it must be in the object table."; - RAY_CHECK(entry->state == ObjectState::PLASMA_SEALED) + RAY_CHECK(entry->state_ == ObjectState::PLASMA_SEALED) << "To evict an object it must have been sealed."; - RAY_CHECK(entry->ref_count == 0) + RAY_CHECK(entry->ref_count_ == 0) << "To evict an object, there must be no clients currently using it."; DeleteObjectInternal(object_id); @@ -244,7 +244,7 @@ void ObjectLifecycleManager::DeleteObjectInternal(const ObjectID &object_id) { auto entry = object_store_->GetObject(object_id); RAY_CHECK(entry != nullptr); - bool aborted = entry->state == ObjectState::PLASMA_CREATED; + bool aborted = entry->state_ == ObjectState::PLASMA_CREATED; stats_collector_->OnObjectDeleting(*entry); earger_deletion_objects_.erase(object_id); @@ -263,7 +263,7 @@ int64_t ObjectLifecycleManager::GetNumBytesInUse() const { bool ObjectLifecycleManager::IsObjectSealed(const ObjectID &object_id) const { auto entry = GetObject(object_id); - return entry && entry->state == ObjectState::PLASMA_SEALED; + return entry && entry->state_ == ObjectState::PLASMA_SEALED; } int64_t ObjectLifecycleManager::GetNumObjectsCreatedTotal() const { diff --git a/src/ray/object_manager/plasma/object_store.cc b/src/ray/object_manager/plasma/object_store.cc index a2f952ffc6e4..ee7639c48bc1 100644 --- a/src/ray/object_manager/plasma/object_store.cc +++ b/src/ray/object_manager/plasma/object_store.cc @@ -43,11 +43,11 @@ const LocalObject *ObjectStore::CreateObject(const ray::ObjectInfo &object_info, auto ptr = std::make_unique(std::move(allocation.value())); auto entry = object_table_.emplace(object_info.object_id, std::move(ptr)).first->second.get(); - entry->object_info = object_info; - entry->state = ObjectState::PLASMA_CREATED; - entry->create_time = std::time(nullptr); - entry->construct_duration = -1; - entry->source = source; + entry->object_info_ = object_info; + entry->state_ = ObjectState::PLASMA_CREATED; + entry->create_time_ 
= std::time(nullptr); + entry->construct_duration_ = -1; + entry->source_ = source; #if defined(__APPLE__) || defined(__linux__) if (object_info.is_mutable) { @@ -70,11 +70,11 @@ const LocalObject *ObjectStore::GetObject(const ObjectID &object_id) const { const LocalObject *ObjectStore::SealObject(const ObjectID &object_id) { auto entry = GetMutableObject(object_id); - if (entry == nullptr || entry->state == ObjectState::PLASMA_SEALED) { + if (entry == nullptr || entry->state_ == ObjectState::PLASMA_SEALED) { return nullptr; } - entry->state = ObjectState::PLASMA_SEALED; - entry->construct_duration = std::time(nullptr) - entry->create_time; + entry->state_ = ObjectState::PLASMA_SEALED; + entry->construct_duration_ = std::time(nullptr) - entry->create_time_; return entry; } @@ -83,7 +83,7 @@ bool ObjectStore::DeleteObject(const ObjectID &object_id) { if (entry == nullptr) { return false; } - allocator_.Free(std::move(entry->allocation)); + allocator_.Free(std::move(entry->allocation_)); object_table_.erase(object_id); return true; } diff --git a/src/ray/object_manager/plasma/plasma.cc b/src/ray/object_manager/plasma/plasma.cc deleted file mode 100644 index 84182fc5e0f3..000000000000 --- a/src/ray/object_manager/plasma/plasma.cc +++ /dev/null @@ -1,28 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "ray/object_manager/plasma/plasma.h" - -#include - -#include "ray/object_manager/plasma/common.h" - -namespace plasma { - -LocalObject::LocalObject(Allocation allocation) - : allocation(std::move(allocation)), ref_count(0) {} -} // namespace plasma diff --git a/src/ray/object_manager/plasma/plasma.fbs b/src/ray/object_manager/plasma/plasma.fbs index f64e450da7bc..0211916f0513 100644 --- a/src/ray/object_manager/plasma/plasma.fbs +++ b/src/ray/object_manager/plasma/plasma.fbs @@ -127,8 +127,8 @@ table PlasmaGetDebugStringReply { table PlasmaCreateRequest { // ID of the object to be created. object_id: string; - // Owner raylet ID of this object. - owner_raylet_id: string; + // Owner node ID of this object. + owner_node_id: string; // Owner IP address of this object. owner_ip_address: string; // Owner port address of this object. diff --git a/src/ray/object_manager/plasma/plasma.h b/src/ray/object_manager/plasma/plasma.h index 6b2eecbf805f..3f162396dc90 100644 --- a/src/ray/object_manager/plasma/plasma.h +++ b/src/ray/object_manager/plasma/plasma.h @@ -17,14 +17,8 @@ #pragma once -#include -#include - -#include -#include -#include -#include -#include +#include +#include #include "ray/util/compat.h" diff --git a/src/ray/object_manager/plasma/plasma_allocator.cc b/src/ray/object_manager/plasma/plasma_allocator.cc index 840b3c4599d2..6127fd146fb1 100644 --- a/src/ray/object_manager/plasma/plasma_allocator.cc +++ b/src/ray/object_manager/plasma/plasma_allocator.cc @@ -120,12 +120,12 @@ std::optional PlasmaAllocator::FallbackAllocate(size_t bytes) { } void PlasmaAllocator::Free(Allocation allocation) { - RAY_CHECK(allocation.address != nullptr) << "Cannot free the nullptr"; - RAY_LOG(DEBUG) << "deallocating " << allocation.size << " at " << allocation.address; - dlfree(allocation.address); - allocated_ -= allocation.size; - if 
(internal::IsOutsideInitialAllocation(allocation.address)) { - fallback_allocated_ -= allocation.size; + RAY_CHECK(allocation.address_ != nullptr) << "Cannot free the nullptr"; + RAY_LOG(DEBUG) << "deallocating " << allocation.size_ << " at " << allocation.address_; + dlfree(allocation.address_); + allocated_ -= allocation.size_; + if (internal::IsOutsideInitialAllocation(allocation.address_)) { + fallback_allocated_ -= allocation.size_; } } diff --git a/src/ray/object_manager/plasma/protocol.cc b/src/ray/object_manager/plasma/protocol.cc index 153de7181d28..8d589efeea01 100644 --- a/src/ray/object_manager/plasma/protocol.cc +++ b/src/ray/object_manager/plasma/protocol.cc @@ -40,7 +40,7 @@ using flatbuffers::uoffset_t; inline constexpr std::string_view kDebugString = "debug_string"; inline constexpr std::string_view kObjectId = "object_id"; inline constexpr std::string_view kObjectIds = "object_ids"; -inline constexpr std::string_view kOwnerRayletId = "owner_raylet_id"; +inline constexpr std::string_view kOwnerNodeId = "owner_node_id"; inline constexpr std::string_view kOwnerIpAddress = "owner_ip_address"; inline constexpr std::string_view kOnwerWorkerId = "owner_worker_id"; @@ -222,7 +222,7 @@ Status SendCreateRequest(const std::shared_ptr &store_conn, auto message = fb::CreatePlasmaCreateRequest(fbb, fbb.CreateString(object_id.Binary()), - fbb.CreateString(owner_address.raylet_id()), + fbb.CreateString(owner_address.node_id()), fbb.CreateString(owner_address.ip_address()), owner_address.port(), fbb.CreateString(owner_address.worker_id()), @@ -249,8 +249,8 @@ void ReadCreateRequest(const uint8_t *data, VerifyNotNullPtr(message->object_id(), kObjectId, MessageType::PlasmaCreateRequest); object_info->object_id = ObjectID::FromBinary(message->object_id()->str()); VerifyNotNullPtr( - message->owner_raylet_id(), kOwnerRayletId, MessageType::PlasmaCreateRequest); - object_info->owner_raylet_id = NodeID::FromBinary(message->owner_raylet_id()->str()); + 
message->owner_node_id(), kOwnerNodeId, MessageType::PlasmaCreateRequest); + object_info->owner_node_id = NodeID::FromBinary(message->owner_node_id()->str()); VerifyNotNullPtr( message->owner_ip_address(), kOwnerIpAddress, MessageType::PlasmaCreateRequest); object_info->owner_ip_address = message->owner_ip_address()->str(); diff --git a/src/ray/object_manager/plasma/shared_memory.h b/src/ray/object_manager/plasma/shared_memory.h index 8d597d538e1a..6623f25970b9 100644 --- a/src/ray/object_manager/plasma/shared_memory.h +++ b/src/ray/object_manager/plasma/shared_memory.h @@ -27,6 +27,9 @@ class ClientMmapTableEntry { public: ClientMmapTableEntry(MEMFD_TYPE fd, int64_t map_size); + ClientMmapTableEntry(const ClientMmapTableEntry &) = delete; + ClientMmapTableEntry &operator=(const ClientMmapTableEntry &) = delete; + ~ClientMmapTableEntry(); uint8_t *pointer() const { return reinterpret_cast(pointer_); } @@ -42,8 +45,6 @@ class ClientMmapTableEntry { size_t length_; void MaybeMadviseDontdump(); - - RAY_DISALLOW_COPY_AND_ASSIGN(ClientMmapTableEntry); }; } // namespace plasma diff --git a/src/ray/object_manager/plasma/stats_collector.cc b/src/ray/object_manager/plasma/stats_collector.cc index aa3b95a617b1..dd9584b85727 100644 --- a/src/ray/object_manager/plasma/stats_collector.cc +++ b/src/ray/object_manager/plasma/stats_collector.cc @@ -27,7 +27,7 @@ void ObjectStatsCollector::OnObjectCreated(const LocalObject &obj) { const auto &kAllocation = obj.GetAllocation(); bytes_by_loc_seal_.Increment( - {/*fallback_allocated*/ kAllocation.fallback_allocated, /*sealed*/ false}, + {/*fallback_allocated*/ kAllocation.fallback_allocated_, /*sealed*/ false}, kObjectSize); num_objects_created_total_ += 1; @@ -65,8 +65,8 @@ void ObjectStatsCollector::OnObjectSealed(const LocalObject &obj) { const auto kObjectSize = obj.GetObjectInfo().GetObjectSize(); const auto &kAllocation = obj.GetAllocation(); - bytes_by_loc_seal_.Swap({kAllocation.fallback_allocated, /* sealed */ false}, - 
{kAllocation.fallback_allocated, /* sealed */ true}, + bytes_by_loc_seal_.Swap({kAllocation.fallback_allocated_, /* sealed */ false}, + {kAllocation.fallback_allocated_, /* sealed */ true}, kObjectSize); num_objects_unsealed_--; @@ -91,7 +91,7 @@ void ObjectStatsCollector::OnObjectDeleting(const LocalObject &obj) { const auto kSource = obj.GetSource(); const auto &kAllocation = obj.GetAllocation(); - bytes_by_loc_seal_.Decrement({kAllocation.fallback_allocated, obj.Sealed()}, + bytes_by_loc_seal_.Decrement({kAllocation.fallback_allocated_, obj.Sealed()}, kObjectSize); if (kSource == plasma::flatbuf::ObjectSource::CreatedByWorker) { diff --git a/src/ray/object_manager/plasma/store.cc b/src/ray/object_manager/plasma/store.cc index 976189682ae5..790dbdf5b33a 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -50,7 +50,7 @@ #include "ray/object_manager/plasma/plasma_allocator.h" #include "ray/object_manager/plasma/protocol.h" #include "ray/stats/metric_defs.h" -#include "ray/util/util.h" +#include "ray/util/network_util.h" namespace ph = boost::placeholders; namespace fb = plasma::flatbuf; @@ -78,7 +78,7 @@ PlasmaStore::PlasmaStore(instrumented_io_context &main_service, ray::DeleteObjectCallback delete_object_callback) : io_context_(main_service), socket_name_(socket_name), - acceptor_(main_service, ParseUrlEndpoint(socket_name)), + acceptor_(main_service, ray::ParseUrlEndpoint(socket_name)), socket_(main_service), allocator_(allocator), fs_monitor_(fs_monitor), @@ -106,7 +106,8 @@ PlasmaStore::PlasmaStore(instrumented_io_context &main_service, std::optional fallback_allocated_fd, const auto &request) ABSL_NO_THREAD_SAFETY_ANALYSIS { mutex_.AssertHeld(); - this->AddToClientObjectIds(object_id, fallback_allocated_fd, request->client); + this->AddToClientObjectIds( + object_id, fallback_allocated_fd, request->client_); }, [this](const auto &request) { this->ReturnFromGet(request); }) { ray::SetCloseOnExec(acceptor_); @@ 
-185,8 +186,8 @@ PlasmaError PlasmaStore::CreateObject(const ray::ObjectInfo &object_info, entry->ToPlasmaObject(result, /* check sealed */ false); // Record that this client is using this object. std::optional fallback_allocated_fd = std::nullopt; - if (entry->GetAllocation().fallback_allocated) { - fallback_allocated_fd = entry->GetAllocation().fd; + if (entry->GetAllocation().fallback_allocated_) { + fallback_allocated_fd = entry->GetAllocation().fd_; } AddToClientObjectIds(object_info.object_id, fallback_allocated_fd, client); return PlasmaError::OK; @@ -203,8 +204,8 @@ void PlasmaStore::ReturnFromGet(const std::shared_ptr &get_request) absl::flat_hash_set fds_to_send; std::vector store_fds; std::vector mmap_sizes; - for (const auto &object_id : get_request->object_ids) { - const PlasmaObject &object = get_request->objects[object_id]; + for (const auto &object_id : get_request->object_ids_) { + const PlasmaObject &object = get_request->objects_[object_id]; MEMFD_TYPE fd = object.store_fd; if (object.data_size != -1 && fds_to_send.count(fd) == 0 && fd.first != INVALID_FD) { fds_to_send.insert(fd); @@ -213,10 +214,10 @@ void PlasmaStore::ReturnFromGet(const std::shared_ptr &get_request) } } // Send the get reply to the client. - Status s = SendGetReply(std::dynamic_pointer_cast(get_request->client), - &get_request->object_ids[0], - get_request->objects, - get_request->object_ids.size(), + Status s = SendGetReply(std::dynamic_pointer_cast(get_request->client_), + &get_request->object_ids_[0], + get_request->objects_, + get_request->object_ids_.size(), store_fds, mmap_sizes); // If we successfully sent the get reply message to the client, then also send @@ -224,14 +225,14 @@ void PlasmaStore::ReturnFromGet(const std::shared_ptr &get_request) if (s.ok()) { // Send all of the file descriptors for the present objects. 
for (MEMFD_TYPE store_fd : store_fds) { - Status send_fd_status = get_request->client->SendFd(store_fd); + Status send_fd_status = get_request->client_->SendFd(store_fd); if (!send_fd_status.ok()) { RAY_LOG(ERROR) << "Failed to send mmap results to client on fd " - << get_request->client; + << get_request->client_; } } } else { - RAY_LOG(ERROR) << "Failed to send Get reply to client on fd " << get_request->client; + RAY_LOG(ERROR) << "Failed to send Get reply to client on fd " << get_request->client_; } } @@ -311,8 +312,8 @@ void PlasmaStore::ConnectClient(const boost::system::error_code &error) { return ProcessClientMessage(std::move(client), message_type, message); }, /*connection_error_handler=*/ - [this](std::shared_ptr client, const boost::system::error_code &error) - -> void { return HandleClientConnectionError(std::move(client), error); }, + [this](std::shared_ptr client, const boost::system::error_code &err) + -> void { return HandleClientConnectionError(std::move(client), err); }, std::move(socket_)); // Start receiving messages. 
diff --git a/src/ray/object_manager/plasma/test/BUILD.bazel b/src/ray/object_manager/plasma/tests/BUILD.bazel similarity index 98% rename from src/ray/object_manager/plasma/test/BUILD.bazel rename to src/ray/object_manager/plasma/tests/BUILD.bazel index 423cfac1a128..3b53f4011d06 100644 --- a/src/ray/object_manager/plasma/test/BUILD.bazel +++ b/src/ray/object_manager/plasma/tests/BUILD.bazel @@ -5,6 +5,7 @@ ray_cc_test( srcs = ["fallback_allocator_test.cc"], tags = ["team:core"], deps = [ + "//src/ray/common:id", "//src/ray/object_manager/plasma:plasma_allocator", "@com_google_absl//absl/strings:str_format", "@com_google_googletest//:gtest_main", diff --git a/src/ray/object_manager/plasma/test/eviction_policy_test.cc b/src/ray/object_manager/plasma/tests/eviction_policy_test.cc similarity index 94% rename from src/ray/object_manager/plasma/test/eviction_policy_test.cc rename to src/ray/object_manager/plasma/tests/eviction_policy_test.cc index a6367a997285..dacb8d1f020f 100644 --- a/src/ray/object_manager/plasma/test/eviction_policy_test.cc +++ b/src/ray/object_manager/plasma/tests/eviction_policy_test.cc @@ -115,17 +115,17 @@ TEST(EvictionPolicyTest, Test) { ObjectID key4 = ObjectID::FromRandom(); LocalObject object1{Allocation()}; - object1.object_info.data_size = 10; - object1.object_info.metadata_size = 0; + object1.object_info_.data_size = 10; + object1.object_info_.metadata_size = 0; LocalObject object2{Allocation()}; - object2.object_info.data_size = 20; - object2.object_info.metadata_size = 0; + object2.object_info_.data_size = 20; + object2.object_info_.metadata_size = 0; LocalObject object3{Allocation()}; - object3.object_info.data_size = 30; - object3.object_info.metadata_size = 0; + object3.object_info_.data_size = 30; + object3.object_info_.metadata_size = 0; LocalObject object4{Allocation()}; - object4.object_info.data_size = 40; - object4.object_info.metadata_size = 0; + object4.object_info_.data_size = 40; + object4.object_info_.metadata_size = 0; 
auto init_object_store = [&](EvictionPolicy &policy) { EXPECT_CALL(store, GetObject(_)) diff --git a/src/ray/object_manager/plasma/test/fallback_allocator_test.cc b/src/ray/object_manager/plasma/tests/fallback_allocator_test.cc similarity index 93% rename from src/ray/object_manager/plasma/test/fallback_allocator_test.cc rename to src/ray/object_manager/plasma/tests/fallback_allocator_test.cc index 3ba76efbaeca..addfabfee759 100644 --- a/src/ray/object_manager/plasma/test/fallback_allocator_test.cc +++ b/src/ray/object_manager/plasma/tests/fallback_allocator_test.cc @@ -27,7 +27,8 @@ namespace plasma { namespace { const int64_t kMB = 1024 * 1024; std::string CreateTestDir() { - path directory = std::filesystem::temp_directory_path() / GenerateUUIDV4(); + path directory = + std::filesystem::temp_directory_path() / ray::UniqueID::FromRandom().Hex(); create_directories(directory); return directory.string(); } @@ -48,11 +49,11 @@ TEST(FallbackPlasmaAllocatorTest, FallbackPassThroughTest) { { auto allocation_1 = allocator.Allocate(object_size); EXPECT_TRUE(allocation_1.has_value()); - EXPECT_FALSE(allocation_1->fallback_allocated); + EXPECT_FALSE(allocation_1->fallback_allocated_); auto allocation_2 = allocator.Allocate(object_size); EXPECT_TRUE(allocation_2.has_value()); - EXPECT_FALSE(allocation_2->fallback_allocated); + EXPECT_FALSE(allocation_2->fallback_allocated_); EXPECT_EQ(2 * object_size, allocator.Allocated()); @@ -75,7 +76,7 @@ TEST(FallbackPlasmaAllocatorTest, FallbackPassThroughTest) { auto allocation = allocator.Allocate(kMB); expect_allocated += kMB; EXPECT_TRUE(allocation.has_value()); - EXPECT_FALSE(allocation->fallback_allocated); + EXPECT_FALSE(allocation->fallback_allocated_); EXPECT_EQ(expect_allocated, allocator.Allocated()); EXPECT_EQ(0, allocator.FallbackAllocated()); allocations.push_back(std::move(allocation.value())); @@ -97,7 +98,7 @@ TEST(FallbackPlasmaAllocatorTest, FallbackPassThroughTest) { expect_allocated += kMB; 
expect_fallback_allocated += kMB; EXPECT_TRUE(allocation.has_value()); - EXPECT_TRUE(allocation->fallback_allocated); + EXPECT_TRUE(allocation->fallback_allocated_); EXPECT_EQ(expect_allocated, allocator.Allocated()); EXPECT_EQ(expect_fallback_allocated, allocator.FallbackAllocated()); fallback_allocations.push_back(std::move(allocation.value())); diff --git a/src/ray/object_manager/plasma/test/mutable_object_test.cc b/src/ray/object_manager/plasma/tests/mutable_object_test.cc similarity index 99% rename from src/ray/object_manager/plasma/test/mutable_object_test.cc rename to src/ray/object_manager/plasma/tests/mutable_object_test.cc index 327595f9214f..7fda37e2c6a7 100644 --- a/src/ray/object_manager/plasma/test/mutable_object_test.cc +++ b/src/ray/object_manager/plasma/tests/mutable_object_test.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/src/ray/object_manager/plasma/test/obj_lifecycle_mgr_test.cc b/src/ray/object_manager/plasma/tests/obj_lifecycle_mgr_test.cc similarity index 97% rename from src/ray/object_manager/plasma/test/obj_lifecycle_mgr_test.cc rename to src/ray/object_manager/plasma/tests/obj_lifecycle_mgr_test.cc index 580f400a554d..e83dca206f60 100644 --- a/src/ray/object_manager/plasma/test/obj_lifecycle_mgr_test.cc +++ b/src/ray/object_manager/plasma/tests/obj_lifecycle_mgr_test.cc @@ -76,12 +76,12 @@ struct ObjectLifecycleManagerTest : public Test { std::move(eviction_policy), delete_object_cb, std::move(stats_collector))); - sealed_object_.state = ObjectState::PLASMA_SEALED; - not_sealed_object_.state = ObjectState::PLASMA_CREATED; - one_ref_object_.state = ObjectState::PLASMA_SEALED; - one_ref_object_.ref_count = 1; - two_ref_object_.state = ObjectState::PLASMA_SEALED; - two_ref_object_.ref_count = 2; + sealed_object_.state_ = ObjectState::PLASMA_SEALED; + not_sealed_object_.state_ = ObjectState::PLASMA_CREATED; + one_ref_object_.state_ = ObjectState::PLASMA_SEALED; + one_ref_object_.ref_count_ = 1; + 
two_ref_object_.state_ = ObjectState::PLASMA_SEALED; + two_ref_object_.ref_count_ = 2; } MockEvictionPolicy *eviction_policy_; diff --git a/src/ray/object_manager/plasma/test/object_store_test.cc b/src/ray/object_manager/plasma/tests/object_store_test.cc similarity index 79% rename from src/ray/object_manager/plasma/test/object_store_test.cc rename to src/ray/object_manager/plasma/tests/object_store_test.cc index bbe73175313e..20d4267d9bd3 100644 --- a/src/ray/object_manager/plasma/test/object_store_test.cc +++ b/src/ray/object_manager/plasma/tests/object_store_test.cc @@ -42,22 +42,22 @@ T Random(T max = std::numeric_limits::max()) { Allocation CreateAllocation(Allocation alloc, int64_t size, bool fallback_allocated = false) { - alloc.size = size; - alloc.offset = Random(); - alloc.mmap_size = Random(); - alloc.fallback_allocated = fallback_allocated; + alloc.size_ = size; + alloc.offset_ = Random(); + alloc.mmap_size_ = Random(); + alloc.fallback_allocated_ = fallback_allocated; return alloc; } const std::string Serialize(const Allocation &allocation) { return absl::StrFormat("%p/%d/%d/%d/%d/%d/%d", - allocation.address, - allocation.size, - allocation.fd.first, - allocation.fd.second, - allocation.offset, - allocation.device_num, - allocation.mmap_size); + allocation.address_, + allocation.size_, + allocation.fd_.first, + allocation.fd_.second, + allocation.offset_, + allocation.device_num_, + allocation.mmap_size_); } ObjectInfo CreateObjectInfo(ObjectID object_id, int64_t object_size) { @@ -65,7 +65,7 @@ ObjectInfo CreateObjectInfo(ObjectID object_id, int64_t object_size) { info.object_id = object_id; info.data_size = Random(object_size); info.metadata_size = object_size - info.data_size; - info.owner_raylet_id = NodeID::FromRandom(); + info.owner_node_id = NodeID::FromRandom(); info.owner_ip_address = "random_ip"; info.owner_port = Random(); info.owner_worker_id = WorkerID::FromRandom(); @@ -106,11 +106,11 @@ TEST(ObjectStoreTest, PassThroughTest) { })); auto 
entry = store.CreateObject(info, {}, /*fallback_allocate*/ false); EXPECT_NE(entry, nullptr); - EXPECT_EQ(entry->ref_count, 0); - EXPECT_EQ(entry->state, ObjectState::PLASMA_CREATED); - EXPECT_EQ(alloc_str, Serialize(entry->allocation)); - EXPECT_EQ(info, entry->object_info); - EXPECT_FALSE(entry->allocation.fallback_allocated); + EXPECT_EQ(entry->ref_count_, 0); + EXPECT_EQ(entry->state_, ObjectState::PLASMA_CREATED); + EXPECT_EQ(alloc_str, Serialize(entry->allocation_)); + EXPECT_EQ(info, entry->object_info_); + EXPECT_FALSE(entry->allocation_.fallback_allocated_); // verify get auto entry1 = store.GetObject(kId1); @@ -123,14 +123,14 @@ TEST(ObjectStoreTest, PassThroughTest) { // seal object auto entry3 = store.SealObject(kId1); EXPECT_EQ(entry3, entry); - EXPECT_EQ(entry3->state, ObjectState::PLASMA_SEALED); + EXPECT_EQ(entry3->state_, ObjectState::PLASMA_SEALED); // seal non existing EXPECT_EQ(nullptr, store.SealObject(kId2)); // delete sealed - EXPECT_CALL(allocator, Free(_)).Times(1).WillOnce(Invoke([&](auto &&allocation) { - EXPECT_EQ(alloc_str, Serialize(allocation)); + EXPECT_CALL(allocator, Free(_)).Times(1).WillOnce(Invoke([&](auto &&allocation_arg) { + EXPECT_EQ(alloc_str, Serialize(allocation_arg)); })); EXPECT_TRUE(store.DeleteObject(kId1)); @@ -168,15 +168,15 @@ TEST(ObjectStoreTest, PassThroughTest) { auto entry = store.CreateObject(info, {}, /*fallback_allocate*/ true); EXPECT_NE(entry, nullptr); - EXPECT_EQ(entry->ref_count, 0); - EXPECT_EQ(entry->state, ObjectState::PLASMA_CREATED); - EXPECT_EQ(alloc_str, Serialize(entry->allocation)); - EXPECT_EQ(info, entry->object_info); - EXPECT_TRUE(entry->allocation.fallback_allocated); + EXPECT_EQ(entry->ref_count_, 0); + EXPECT_EQ(entry->state_, ObjectState::PLASMA_CREATED); + EXPECT_EQ(alloc_str, Serialize(entry->allocation_)); + EXPECT_EQ(info, entry->object_info_); + EXPECT_TRUE(entry->allocation_.fallback_allocated_); // delete unsealed - EXPECT_CALL(allocator, 
Free(_)).Times(1).WillOnce(Invoke([&](auto &&allocation) { - EXPECT_EQ(alloc_str, Serialize(allocation)); + EXPECT_CALL(allocator, Free(_)).Times(1).WillOnce(Invoke([&](auto &&allocation_arg) { + EXPECT_EQ(alloc_str, Serialize(allocation_arg)); })); EXPECT_TRUE(store.DeleteObject(kId2)); diff --git a/src/ray/object_manager/plasma/test/stats_collector_test.cc b/src/ray/object_manager/plasma/tests/stats_collector_test.cc similarity index 88% rename from src/ray/object_manager/plasma/test/stats_collector_test.cc rename to src/ray/object_manager/plasma/tests/stats_collector_test.cc index 4831e95a6f80..9c25e7152ffe 100644 --- a/src/ray/object_manager/plasma/test/stats_collector_test.cc +++ b/src/ray/object_manager/plasma/tests/stats_collector_test.cc @@ -39,7 +39,7 @@ class DummyAllocator : public IAllocator { std::optional Allocate(size_t bytes) override { allocated_ += bytes; auto allocation = Allocation(); - allocation.size = bytes; + allocation.size_ = bytes; return std::move(allocation); } @@ -47,7 +47,7 @@ class DummyAllocator : public IAllocator { return absl::nullopt; } - void Free(Allocation allocation) override { allocated_ -= allocation.size; } + void Free(Allocation allocation) override { allocated_ -= allocation.size_; } int64_t GetFootprintLimit() const override { return std::numeric_limits::max(); @@ -99,39 +99,40 @@ struct ObjectStatsCollectorTest : public Test { for (const auto &obj_entry : object_store_->object_table_) { const auto &obj = obj_entry.second; - if (obj->ref_count > 0) { + if (obj->ref_count_ > 0) { num_objects_in_use++; - num_bytes_in_use += obj->object_info.GetObjectSize(); + num_bytes_in_use += obj->object_info_.GetObjectSize(); } - if (obj->state == ObjectState::PLASMA_CREATED) { + if (obj->state_ == ObjectState::PLASMA_CREATED) { num_objects_unsealed++; - num_bytes_unsealed += obj->object_info.GetObjectSize(); + num_bytes_unsealed += obj->object_info_.GetObjectSize(); } else { - if (obj->ref_count == 1 && - obj->source == 
plasma::flatbuf::ObjectSource::CreatedByWorker) { + if (obj->ref_count_ == 1 && + obj->source_ == plasma::flatbuf::ObjectSource::CreatedByWorker) { num_objects_spillable++; - num_bytes_spillable += obj->object_info.GetObjectSize(); + num_bytes_spillable += obj->object_info_.GetObjectSize(); } - if (obj->ref_count == 0) { + if (obj->ref_count_ == 0) { num_objects_evictable++; - num_bytes_evictable += obj->object_info.GetObjectSize(); + num_bytes_evictable += obj->object_info_.GetObjectSize(); } } - if (obj->source == plasma::flatbuf::ObjectSource::CreatedByWorker) { + if (obj->source_ == plasma::flatbuf::ObjectSource::CreatedByWorker) { num_objects_created_by_worker++; - num_bytes_created_by_worker += obj->object_info.GetObjectSize(); - } else if (obj->source == plasma::flatbuf::ObjectSource::RestoredFromStorage) { + num_bytes_created_by_worker += obj->object_info_.GetObjectSize(); + } else if (obj->source_ == plasma::flatbuf::ObjectSource::RestoredFromStorage) { num_objects_restored++; - num_bytes_restored += obj->object_info.GetObjectSize(); - } else if (obj->source == plasma::flatbuf::ObjectSource::ReceivedFromRemoteRaylet) { + num_bytes_restored += obj->object_info_.GetObjectSize(); + } else if (obj->source_ == + plasma::flatbuf::ObjectSource::ReceivedFromRemoteRaylet) { num_objects_received++; - num_bytes_received += obj->object_info.GetObjectSize(); - } else if (obj->source == plasma::flatbuf::ObjectSource::ErrorStoredByRaylet) { + num_bytes_received += obj->object_info_.GetObjectSize(); + } else if (obj->source_ == plasma::flatbuf::ObjectSource::ErrorStoredByRaylet) { num_objects_errored++; - num_bytes_errored += obj->object_info.GetObjectSize(); + num_bytes_errored += obj->object_info_.GetObjectSize(); } } diff --git a/src/ray/object_manager/pull_manager.cc b/src/ray/object_manager/pull_manager.cc index f91bff777782..3c9af345c527 100644 --- a/src/ray/object_manager/pull_manager.cc +++ b/src/ray/object_manager/pull_manager.cc @@ -20,7 +20,6 @@ #include 
#include -#include "ray/common/common_protocol.h" #include "ray/common/ray_config.h" #include "ray/stats/metric_defs.h" @@ -70,7 +69,7 @@ uint64_t PullManager::Pull(const std::vector &object_ref_b BundlePullRequest bundle_pull_request(ObjectRefsToIds(deduplicated), task_key); const uint64_t req_id = next_req_id_++; RAY_LOG(DEBUG) << "Start pull request " << req_id - << ". Bundle size: " << bundle_pull_request.objects.size(); + << ". Bundle size: " << bundle_pull_request.objects_.size(); for (const auto &ref : deduplicated) { const auto obj_id = ObjectRefToId(ref); @@ -127,7 +126,7 @@ bool PullManager::ActivateNextBundlePullRequest(BundlePullRequestQueue &bundles, // First calculate the bytes we need. int64_t bytes_to_pull = 0; - for (const auto &obj_id : next_request.objects) { + for (const auto &obj_id : next_request.objects_) { const bool needs_pull = active_object_pull_requests_.count(obj_id) == 0; if (needs_pull) { // This is the first bundle request in the queue to require this object. 
@@ -158,7 +157,7 @@ bool PullManager::ActivateNextBundlePullRequest(BundlePullRequestQueue &bundles, << " num bytes being pulled: " << num_bytes_being_pulled_ << " num bytes available: " << num_bytes_available_; num_bytes_being_pulled_ += bytes_to_pull; - for (const auto &obj_id : next_request.objects) { + for (const auto &obj_id : next_request.objects_) { const bool needs_pull = active_object_pull_requests_.count(obj_id) == 0; active_object_pull_requests_[obj_id].insert(next_request_id); if (needs_pull) { @@ -184,7 +183,7 @@ void PullManager::DeactivateBundlePullRequest( uint64_t request_id, std::unordered_set *objects_to_cancel) { const auto &request = map_find_or_die(bundles.requests, request_id); - for (const auto &obj_id : request.objects) { + for (const auto &obj_id : request.objects_) { absl::MutexLock lock(&active_objects_mu_); auto it = active_object_pull_requests_.find(obj_id); if (it == active_object_pull_requests_.end() || !it->second.erase(request_id)) { @@ -337,7 +336,7 @@ std::vector PullManager::CancelPull(uint64_t request_id) { // Erase this pull request. 
std::vector object_ids_to_cancel_subscription; - for (const auto &obj_id : bundle_it->second.objects) { + for (const auto &obj_id : bundle_it->second.objects_) { auto it = object_pull_requests_.find(obj_id); if (it != object_pull_requests_.end()) { RAY_LOG(DEBUG) << "Removing an object pull request of id: " << obj_id; @@ -680,12 +679,12 @@ std::string PullManager::BundleInfo(const BundlePullRequestQueue &bundles) const } const auto &bundle = it->second; std::stringstream result; - result << bundle.objects.size() << " objects"; + result << bundle.objects_.size() << " objects"; if (!bundle.IsPullable()) { result << " (inactive, waiting for object sizes or locations)"; } else { size_t num_bytes_needed = 0; - for (const auto &obj_id : bundle.objects) { + for (const auto &obj_id : bundle.objects_) { num_bytes_needed += map_find_or_die(object_pull_requests_, obj_id).object_size; } result << ", " << num_bytes_needed << " bytes"; @@ -713,7 +712,7 @@ int64_t PullManager::NextRequestBundleSize(const BundlePullRequestQueue &bundles // Calculate the bytes we need. int64_t bytes_needed_calculated = 0; - for (const auto &obj_id : next_request.objects) { + for (const auto &obj_id : next_request.objects_) { bool needs_pull = active_object_pull_requests_.count(obj_id) == 0; if (needs_pull) { // This is the first bundle request in the queue to require this object. diff --git a/src/ray/object_manager/pull_manager.h b/src/ray/object_manager/pull_manager.h index 7cd7598fcb27..c81d1ed13ee8 100644 --- a/src/ray/object_manager/pull_manager.h +++ b/src/ray/object_manager/pull_manager.h @@ -225,25 +225,25 @@ class PullManager { struct BundlePullRequest { BundlePullRequest(std::vector requested_objects, const TaskMetricsKey &task_key) - : objects(std::move(requested_objects)), task_key(task_key) {} + : objects_(std::move(requested_objects)), task_key_(task_key) {} // All the objects that this bundle is trying to pull. 
- const std::vector objects; + const std::vector objects_; // All the objects that are pullable. - absl::flat_hash_set pullable_objects; + absl::flat_hash_set pullable_objects_; // The name of the task, if a task arg request, otherwise the empty string. - const TaskMetricsKey task_key; + const TaskMetricsKey task_key_; void MarkObjectAsPullable(const ObjectID &object) { - pullable_objects.emplace(object); + pullable_objects_.emplace(object); } void MarkObjectAsUnpullable(const ObjectID &object) { - pullable_objects.erase(object); + pullable_objects_.erase(object); } // A bundle is pullable if we know the sizes of all objects // and none of them is pending creation due to object reconstruction. - bool IsPullable() const { return pullable_objects.size() == objects.size(); } + bool IsPullable() const { return pullable_objects_.size() == objects_.size(); } }; /// A helper structure for tracking all the bundle pull requests for a particular bundle @@ -286,7 +286,7 @@ class PullManager { requests.emplace(request_id, request); if (request.IsPullable()) { inactive_requests.emplace(request_id); - inactive_by_name.Increment(request.task_key); + inactive_by_name.Increment(request.task_key_); RAY_CHECK_EQ(inactive_requests.size(), inactive_by_name.Total()); } } @@ -294,7 +294,7 @@ class PullManager { void ActivateBundlePullRequest(uint64_t request_id) { RAY_CHECK_EQ(inactive_requests.erase(request_id), 1u); active_requests.emplace(request_id); - auto task_key = map_find_or_die(requests, request_id).task_key; + auto task_key = map_find_or_die(requests, request_id).task_key_; inactive_by_name.Decrement(task_key); RAY_CHECK_EQ(inactive_requests.size(), inactive_by_name.Total()); } @@ -302,7 +302,7 @@ class PullManager { void DeactivateBundlePullRequest(uint64_t request_id) { RAY_CHECK_EQ(active_requests.erase(request_id), 1u); inactive_requests.emplace(request_id); - auto task_key = map_find_or_die(requests, request_id).task_key; + auto task_key = map_find_or_die(requests, 
request_id).task_key_; inactive_by_name.Increment(task_key); RAY_CHECK_EQ(inactive_requests.size(), inactive_by_name.Total()); } @@ -311,7 +311,7 @@ class PullManager { RAY_CHECK(map_find_or_die(requests, request_id).IsPullable()); RAY_CHECK_EQ(active_requests.count(request_id), 0u); inactive_requests.emplace(request_id); - auto task_key = map_find_or_die(requests, request_id).task_key; + auto task_key = map_find_or_die(requests, request_id).task_key_; inactive_by_name.Increment(task_key); RAY_CHECK_EQ(inactive_requests.size(), inactive_by_name.Total()); } @@ -324,14 +324,14 @@ class PullManager { auto it = inactive_requests.find(request_id); if (it != inactive_requests.end()) { inactive_requests.erase(it); - auto task_key = map_find_or_die(requests, request_id).task_key; + auto task_key = map_find_or_die(requests, request_id).task_key_; inactive_by_name.Decrement(task_key); RAY_CHECK_EQ(inactive_requests.size(), inactive_by_name.Total()); } } void RemoveBundlePullRequest(uint64_t request_id) { - auto task_key = map_find_or_die(requests, request_id).task_key; + auto task_key = map_find_or_die(requests, request_id).task_key_; requests.erase(request_id); if (active_requests.find(request_id) != active_requests.end()) { active_requests.erase(request_id); diff --git a/src/ray/object_manager/push_manager.cc b/src/ray/object_manager/push_manager.cc index 1ce1e0258dc8..1487c6d5b563 100644 --- a/src/ray/object_manager/push_manager.cc +++ b/src/ray/object_manager/push_manager.cc @@ -41,7 +41,7 @@ void PushManager::StartPush(const NodeID &dest_id, } else { RAY_LOG(DEBUG) << "Duplicate push request " << push_id.first << ", " << push_id.second << ", resending all the chunks."; - RAY_CHECK_NE(it->second->num_chunks_to_send, 0); + RAY_CHECK_NE(it->second->num_chunks_to_send_, 0); chunks_remaining_ += it->second->ResendAllChunks(std::move(send_chunk_fn)); } ScheduleRemainingPushes(); @@ -73,12 +73,12 @@ void PushManager::ScheduleRemainingPushes() { auto &push_state = *iter; 
push_state.SendOneChunk(); chunks_in_flight_ += 1; - if (push_state.num_chunks_to_send == 0) { - auto push_state_map_iter = push_state_map_.find(push_state.node_id); + if (push_state.num_chunks_to_send_ == 0) { + auto push_state_map_iter = push_state_map_.find(push_state.node_id_); RAY_CHECK(push_state_map_iter != push_state_map_.end()); auto &dest_map = push_state_map_iter->second; - auto dest_map_iter = dest_map.find(push_state.object_id); + auto dest_map_iter = dest_map.find(push_state.object_id_); RAY_CHECK(dest_map_iter != dest_map.end()); iter = push_requests_with_chunks_to_send_.erase(dest_map_iter->second); diff --git a/src/ray/object_manager/push_manager.h b/src/ray/object_manager/push_manager.h index 79195d2327e3..4149d8f29c30 100644 --- a/src/ray/object_manager/push_manager.h +++ b/src/ray/object_manager/push_manager.h @@ -79,43 +79,43 @@ class PushManager { /// Tracks the state of an active object push to another node. struct PushState { - NodeID node_id; - ObjectID object_id; + NodeID node_id_; + ObjectID object_id_; /// total number of chunks of this object. - int64_t num_chunks; + int64_t num_chunks_; /// The function to send chunks with. - std::function chunk_send_fn; + std::function chunk_send_fn_; /// The index of the next chunk to send. - int64_t next_chunk_id = 0; + int64_t next_chunk_id_ = 0; /// The number of chunks remaining to send. - int64_t num_chunks_to_send; + int64_t num_chunks_to_send_; PushState(NodeID node_id, ObjectID object_id, int64_t num_chunks, std::function chunk_send_fn) - : node_id(node_id), - object_id(object_id), - num_chunks(num_chunks), - chunk_send_fn(std::move(chunk_send_fn)), - num_chunks_to_send(num_chunks) {} + : node_id_(node_id), + object_id_(object_id), + num_chunks_(num_chunks), + chunk_send_fn_(std::move(chunk_send_fn)), + num_chunks_to_send_(num_chunks) {} /// Resend all chunks and returns how many more chunks will be sent. 
int64_t ResendAllChunks(std::function send_fn) { - chunk_send_fn = std::move(send_fn); - int64_t additional_chunks_to_send = num_chunks - num_chunks_to_send; - num_chunks_to_send = num_chunks; + chunk_send_fn_ = std::move(send_fn); + int64_t additional_chunks_to_send = num_chunks_ - num_chunks_to_send_; + num_chunks_to_send_ = num_chunks_; return additional_chunks_to_send; } /// Send one chunk. Return true if a new chunk is sent, false if no more chunk to /// send. void SendOneChunk() { - num_chunks_to_send--; + num_chunks_to_send_--; // Send the next chunk for this push. - chunk_send_fn(next_chunk_id); - next_chunk_id = (next_chunk_id + 1) % num_chunks; + chunk_send_fn_(next_chunk_id_); + next_chunk_id_ = (next_chunk_id_ + 1) % num_chunks_; } }; diff --git a/src/ray/object_manager/test/BUILD.bazel b/src/ray/object_manager/tests/BUILD.bazel similarity index 86% rename from src/ray/object_manager/test/BUILD.bazel rename to src/ray/object_manager/tests/BUILD.bazel index 4a7192248f8b..437d4c81b4af 100644 --- a/src/ray/object_manager/test/BUILD.bazel +++ b/src/ray/object_manager/tests/BUILD.bazel @@ -36,7 +36,7 @@ ray_cc_test( ], tags = ["team:core"], deps = [ - "//:ray_mock", + "//:ray_fakes", "//src/ray/object_manager:ownership_object_directory", "@com_google_googletest//:gtest_main", ], @@ -103,3 +103,19 @@ ray_cc_test( "@com_google_googletest//:gtest_main", ], ) + +ray_cc_test( + name = "object_manager_test", + size = "medium", + srcs = [ + "object_manager_test.cc", + ], + tags = ["team:core"], + deps = [ + "//:ray_mock", + "//src/fakes/ray/object_manager/plasma:fake_plasma_client", + "//src/ray/object_manager", + "//src/ray/rpc/object_manager:fake_object_manager_client", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/src/ray/object_manager/test/create_request_queue_test.cc b/src/ray/object_manager/tests/create_request_queue_test.cc similarity index 100% rename from src/ray/object_manager/test/create_request_queue_test.cc rename to 
src/ray/object_manager/tests/create_request_queue_test.cc diff --git a/src/ray/object_manager/test/get_request_queue_test.cc b/src/ray/object_manager/tests/get_request_queue_test.cc similarity index 97% rename from src/ray/object_manager/test/get_request_queue_test.cc rename to src/ray/object_manager/tests/get_request_queue_test.cc index e3e8c40ba615..6a3ec655801e 100644 --- a/src/ray/object_manager/test/get_request_queue_test.cc +++ b/src/ray/object_manager/tests/get_request_queue_test.cc @@ -64,22 +64,22 @@ struct GetRequestQueueTest : public Test { Test::SetUp(); object_id1 = ObjectID::FromRandom(); object_id2 = ObjectID::FromRandom(); - object1.object_info.data_size = 10; - object1.object_info.metadata_size = 0; - object2.object_info.data_size = 10; - object2.object_info.metadata_size = 0; + object1.object_info_.data_size = 10; + object1.object_info_.metadata_size = 0; + object2.object_info_.data_size = 10; + object2.object_info_.metadata_size = 0; } void TearDown() override { io_context_.stop(); } protected: - void MarkObject(LocalObject &object, ObjectState state) { object.state = state; } + void MarkObject(LocalObject &object, ObjectState state) { object.state_ = state; } void MarkObjectFallbackAllocated(LocalObject &object, bool fallback_allocated, MEMFD_TYPE fd) { - object.allocation.fallback_allocated = fallback_allocated; - object.allocation.fd = fd; + object.allocation_.fallback_allocated_ = fallback_allocated; + object.allocation_.fd_ = fd; } bool IsGetRequestExist(GetRequestQueue &queue, const ObjectID &object_id) { diff --git a/src/ray/object_manager/test/object_buffer_pool_test.cc b/src/ray/object_manager/tests/object_buffer_pool_test.cc similarity index 100% rename from src/ray/object_manager/test/object_buffer_pool_test.cc rename to src/ray/object_manager/tests/object_buffer_pool_test.cc diff --git a/src/ray/object_manager/tests/object_manager_test.cc b/src/ray/object_manager/tests/object_manager_test.cc new file mode 100644 index 
000000000000..63049ff4023f --- /dev/null +++ b/src/ray/object_manager/tests/object_manager_test.cc @@ -0,0 +1,145 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/object_manager/object_manager.h" + +#include +#include +#include +#include + +#include "fakes/ray/object_manager/plasma/fake_plasma_client.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "mock/ray/gcs_client/gcs_client.h" +#include "mock/ray/object_manager/object_directory.h" +#include "ray/common/asio/instrumented_io_context.h" +#include "ray/common/id.h" +#include "ray/common/ray_config.h" +#include "ray/common/ray_object.h" +#include "ray/common/status.h" +#include "ray/object_manager/common.h" +#include "ray/rpc/object_manager/fake_object_manager_client.h" + +namespace ray { + +using ::testing::_; +using ::testing::Invoke; +using ::testing::Return; + +class ObjectManagerTest : public ::testing::Test { + protected: + ObjectManagerTest() + : io_work_(boost::asio::make_work_guard(io_context_.get_executor())), + rpc_work_(boost::asio::make_work_guard(rpc_context_.get_executor())) { + ObjectManagerConfig config_; + config_.object_manager_address = "127.0.0.1"; + config_.object_manager_port = 0; + config_.timer_freq_ms = RayConfig::instance().object_manager_timer_freq_ms(); + config_.pull_timeout_ms = RayConfig::instance().object_manager_pull_timeout_ms(); + config_.object_chunk_size = 
RayConfig::instance().object_manager_default_chunk_size(); + config_.max_bytes_in_flight = + RayConfig::instance().object_manager_max_bytes_in_flight(); + config_.store_socket_name = "test_store_socket"; + config_.push_timeout_ms = RayConfig::instance().object_manager_push_timeout_ms(); + config_.rpc_service_threads_number = 1; + config_.huge_pages = false; + + local_node_id_ = NodeID::FromRandom(); + mock_gcs_client_ = std::make_unique(); + mock_object_directory_ = std::make_unique(); + fake_plasma_client_ = std::make_shared(); + + object_manager_ = std::make_unique( + io_context_, + local_node_id_, + config_, + *mock_gcs_client_, + mock_object_directory_.get(), + // RestoreSpilledObjectCallback + [](const ObjectID &object_id, + int64_t object_size, + const std::string &object_url, + std::function callback) {}, + // get_spilled_object_url + [](const ObjectID &object_id) -> std::string { return ""; }, + // pin_object + [](const ObjectID &object_id) -> std::unique_ptr { return nullptr; }, + // fail_pull_request + [](const ObjectID &object_id, rpc::ErrorType error_type) {}, + fake_plasma_client_, + nullptr, + [](const std::string &address, + const int port, + ray::rpc::ClientCallManager &client_call_manager) { + return std::make_shared( + address, port, client_call_manager); + }, + rpc_context_); + } + + NodeID local_node_id_; + + instrumented_io_context io_context_{/*enable_lag_probe=*/false, + /*running_on_single_thread=*/true}; + instrumented_io_context rpc_context_{/*enable_lag_probe=*/false, + /*running_on_single_thread=*/true}; + boost::asio::executor_work_guard io_work_; + boost::asio::executor_work_guard rpc_work_; + + std::unique_ptr mock_gcs_client_; + std::unique_ptr mock_object_directory_; + std::unique_ptr object_manager_; + std::shared_ptr fake_plasma_client_; +}; + +uint32_t NumRemoteFreeObjectsRequests(const ObjectManager &object_manager) { + uint32_t num_free_objects_requests = 0; + for (const auto &[node_id, rpc_client] : + 
object_manager.remote_object_manager_clients_) { + auto fake_rpc_client = + std::dynamic_pointer_cast(rpc_client); + num_free_objects_requests += fake_rpc_client->num_free_objects_requests; + } + return num_free_objects_requests; +} + +TEST_F(ObjectManagerTest, TestFreeObjectsLocalOnlyFalse) { + auto object_id = ObjectID::FromRandom(); + + absl::flat_hash_map node_info_map_; + rpc::GcsNodeInfo self_node_info; + self_node_info.set_node_id(local_node_id_.Binary()); + node_info_map_[local_node_id_] = self_node_info; + NodeID remote_node_id_ = NodeID::FromRandom(); + rpc::GcsNodeInfo remote_node_info; + remote_node_info.set_node_id(remote_node_id_.Binary()); + node_info_map_[remote_node_id_] = remote_node_info; + + EXPECT_CALL(*mock_gcs_client_->mock_node_accessor, GetAll()) + .WillOnce(::testing::ReturnRef(node_info_map_)); + EXPECT_CALL(*mock_gcs_client_->mock_node_accessor, Get(remote_node_id_, _)) + .WillOnce(::testing::Return(&remote_node_info)); + + fake_plasma_client_->objects_in_plasma_[object_id] = + std::make_pair(std::vector(1), std::vector(1)); + object_manager_->FreeObjects({object_id}, false); + ASSERT_EQ(fake_plasma_client_->num_free_objects_requests, 1); + ASSERT_TRUE(!fake_plasma_client_->objects_in_plasma_.contains(object_id)); + ASSERT_EQ(NumRemoteFreeObjectsRequests(*object_manager_), 0); + ASSERT_EQ(rpc_context_.poll_one(), 1); + ASSERT_EQ(NumRemoteFreeObjectsRequests(*object_manager_), 1); +} + +} // namespace ray diff --git a/src/ray/object_manager/test/ownership_object_directory_test.cc b/src/ray/object_manager/tests/ownership_object_directory_test.cc similarity index 98% rename from src/ray/object_manager/test/ownership_object_directory_test.cc rename to src/ray/object_manager/tests/ownership_object_directory_test.cc index 0b082e6bd2ac..ce1a7d54ceda 100644 --- a/src/ray/object_manager/test/ownership_object_directory_test.cc +++ b/src/ray/object_manager/tests/ownership_object_directory_test.cc @@ -21,13 +21,13 @@ #include #include +#include 
"fakes/ray/pubsub/subscriber.h" #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "mock/ray/pubsub/subscriber.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/status.h" -#include "ray/gcs/gcs_client/accessor.h" -#include "ray/gcs/gcs_client/gcs_client.h" +#include "ray/gcs_client/accessor.h" +#include "ray/gcs_client/gcs_client.h" namespace ray { @@ -37,7 +37,7 @@ using ::testing::Return; class MockWorkerClient : public rpc::CoreWorkerClientInterface { public: void UpdateObjectLocationBatch( - const rpc::UpdateObjectLocationBatchRequest &request, + rpc::UpdateObjectLocationBatchRequest &&request, const rpc::ClientCallback &callback) override { const auto &worker_id = WorkerID::FromBinary(request.intended_worker_id()); const auto &object_location_updates = request.object_location_updates(); @@ -126,7 +126,7 @@ class OwnershipBasedObjectDirectoryTest : public ::testing::Test { /*fetch_cluster_id_if_nil=*/false), gcs_client_mock_( new MockGcsClient(options_, std::make_unique())), - subscriber_(std::make_shared()), + subscriber_(std::make_shared()), owner_client(std::make_shared()), client_pool([&](const rpc::Address &addr) { return owner_client; }) { RayConfig::instance().initialize(R"({"max_object_report_batch_size": 20})"); @@ -155,7 +155,7 @@ class OwnershipBasedObjectDirectoryTest : public ::testing::Test { ray::ObjectInfo info; info.object_id = id; info.data_size = 12; - info.owner_raylet_id = NodeID::FromRandom(); + info.owner_node_id = NodeID::FromRandom(); info.owner_ip_address = "124.2.3.4"; info.owner_port = 6739; info.owner_worker_id = worker_id; @@ -199,7 +199,7 @@ class OwnershipBasedObjectDirectoryTest : public ::testing::Test { instrumented_io_context io_service_; gcs::GcsClientOptions options_; std::shared_ptr gcs_client_mock_; - std::shared_ptr subscriber_; + std::shared_ptr subscriber_; std::shared_ptr owner_client; rpc::CoreWorkerClientPool client_pool; std::unique_ptr obod_; @@ -486,7 +486,6 @@ 
TEST_F(OwnershipBasedObjectDirectoryTest, TestNotifyOnUpdate) { UniqueID callback_id = UniqueID::FromRandom(); ObjectID obj_id = ObjectID::FromRandom(); int num_callbacks = 0; - EXPECT_CALL(*subscriber_, Subscribe(_, _, _, _, _, _, _)).WillOnce(Return(true)); ASSERT_TRUE( obod_ ->SubscribeObjectLocations(callback_id, diff --git a/src/ray/object_manager/test/pull_manager_test.cc b/src/ray/object_manager/tests/pull_manager_test.cc similarity index 99% rename from src/ray/object_manager/test/pull_manager_test.cc rename to src/ray/object_manager/tests/pull_manager_test.cc index 86a727cb8bdf..f245445e7712 100644 --- a/src/ray/object_manager/test/pull_manager_test.cc +++ b/src/ray/object_manager/tests/pull_manager_test.cc @@ -22,7 +22,6 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "ray/common/common_protocol.h" namespace ray { diff --git a/src/ray/object_manager/test/push_manager_test.cc b/src/ray/object_manager/tests/push_manager_test.cc similarity index 91% rename from src/ray/object_manager/test/push_manager_test.cc rename to src/ray/object_manager/tests/push_manager_test.cc index d3f5188a9e3b..ae87892bacaf 100644 --- a/src/ray/object_manager/test/push_manager_test.cc +++ b/src/ray/object_manager/tests/push_manager_test.cc @@ -51,22 +51,22 @@ TEST(TestPushManager, TestPushState) { NodeID::FromRandom(), ObjectID::FromRandom(), 2, [&](int64_t chunk_id) { sent_chunks.push_back(chunk_id); }}; - ASSERT_EQ(state.num_chunks, 2); - ASSERT_EQ(state.next_chunk_id, 0); - ASSERT_EQ(state.num_chunks_to_send, 2); + ASSERT_EQ(state.num_chunks_, 2); + ASSERT_EQ(state.next_chunk_id_, 0); + ASSERT_EQ(state.num_chunks_to_send_, 2); state.SendOneChunk(); - ASSERT_EQ(state.num_chunks, 2); - ASSERT_EQ(state.next_chunk_id, 1); - ASSERT_EQ(state.num_chunks_to_send, 1); + ASSERT_EQ(state.num_chunks_, 2); + ASSERT_EQ(state.next_chunk_id_, 1); + ASSERT_EQ(state.num_chunks_to_send_, 1); ASSERT_EQ(sent_chunks, (std::vector{0})); state.SendOneChunk(); - 
ASSERT_EQ(state.num_chunks, 2); - ASSERT_EQ(state.next_chunk_id, 0); - ASSERT_EQ(state.num_chunks_to_send, 0); + ASSERT_EQ(state.num_chunks_, 2); + ASSERT_EQ(state.next_chunk_id_, 0); + ASSERT_EQ(state.num_chunks_to_send_, 0); ASSERT_EQ(sent_chunks, (std::vector{0, 1})); - ASSERT_EQ(state.num_chunks_to_send, 0); + ASSERT_EQ(state.num_chunks_to_send_, 0); } // resend all chunks. @@ -77,28 +77,28 @@ TEST(TestPushManager, TestPushState) { sent_chunks.push_back(chunk_id); }}; state.SendOneChunk(); - ASSERT_EQ(state.num_chunks, 3); - ASSERT_EQ(state.next_chunk_id, 1); - ASSERT_EQ(state.num_chunks_to_send, 2); + ASSERT_EQ(state.num_chunks_, 3); + ASSERT_EQ(state.next_chunk_id_, 1); + ASSERT_EQ(state.num_chunks_to_send_, 2); ASSERT_EQ(sent_chunks, (std::vector{0})); // resend chunks when 1 chunk is in flight. ASSERT_EQ(1, state.ResendAllChunks([&](int64_t chunk_id) { sent_chunks.push_back(chunk_id); })); - ASSERT_EQ(state.num_chunks, 3); - ASSERT_EQ(state.next_chunk_id, 1); - ASSERT_EQ(state.num_chunks_to_send, 3); + ASSERT_EQ(state.num_chunks_, 3); + ASSERT_EQ(state.next_chunk_id_, 1); + ASSERT_EQ(state.num_chunks_to_send_, 3); for (auto i = 0; i < 3; i++) { state.SendOneChunk(); - ASSERT_EQ(state.num_chunks, 3); - ASSERT_EQ(state.next_chunk_id, (2 + i) % 3); - ASSERT_EQ(state.num_chunks_to_send, 3 - i - 1); + ASSERT_EQ(state.num_chunks_, 3); + ASSERT_EQ(state.next_chunk_id_, (2 + i) % 3); + ASSERT_EQ(state.num_chunks_to_send_, 3 - i - 1); } ASSERT_EQ(sent_chunks, (std::vector{0, 1, 2, 0})); - ASSERT_EQ(state.num_chunks_to_send, 0); + ASSERT_EQ(state.num_chunks_to_send_, 0); } } diff --git a/src/ray/object_manager/test/spilled_object_test.cc b/src/ray/object_manager/tests/spilled_object_test.cc similarity index 95% rename from src/ray/object_manager/test/spilled_object_test.cc rename to src/ray/object_manager/tests/spilled_object_test.cc index 15f54365ea29..643596a106e9 100644 --- a/src/ray/object_manager/test/spilled_object_test.cc +++ 
b/src/ray/object_manager/tests/spilled_object_test.cc @@ -138,9 +138,9 @@ TEST(SpilledObjectReaderTest, ParseObjectHeader) { auto assert_parse_success = [](uint64_t object_offset, std::string data, std::string metadata, - std::string raylet_id) { + std::string node_id) { rpc::Address owner_address; - owner_address.set_raylet_id(raylet_id); + owner_address.set_node_id(node_id); auto str = ContructObjectString(object_offset, data, metadata, owner_address); uint64_t actual_data_offset = 0; uint64_t actual_data_size = 0; @@ -162,7 +162,7 @@ TEST(SpilledObjectReaderTest, ParseObjectHeader) { actual_data_offset); ASSERT_EQ(data.size(), actual_data_size); ASSERT_EQ(metadata.size(), actual_metadata_size); - ASSERT_EQ(owner_address.raylet_id(), actual_owner_address.raylet_id()); + ASSERT_EQ(owner_address.node_id(), actual_owner_address.node_id()); ASSERT_EQ(data, str.substr(actual_data_offset, actual_data_size)); ASSERT_EQ(metadata, str.substr(actual_metadata_offset, actual_metadata_size)); }; @@ -171,13 +171,13 @@ TEST(SpilledObjectReaderTest, ParseObjectHeader) { std::vector data_list{"", "somedata", large_data}; std::string large_metadata(10000, 'm'); std::vector metadata_list{"", "somemetadata", large_metadata}; - std::vector raylet_ids{"", "yes", "laaaaaaaarrrrrggge"}; + std::vector node_ids{"", "yes", "laaaaaaaarrrrrggge"}; for (auto offset : offsets) { for (auto &data : data_list) { for (auto &metadata : metadata_list) { - for (auto &raylet_id : raylet_ids) { - assert_parse_success(offset, data, metadata, raylet_id); + for (auto &node_id : node_ids) { + assert_parse_success(offset, data, metadata, node_id); } } } @@ -249,7 +249,7 @@ TEST(ChunkObjectReaderTest, GetNumChunks) { auto assert_get_num_chunks = [](uint64_t data_size, uint64_t chunk_size, uint64_t expected_num_chunks) { rpc::Address owner_address; - owner_address.set_raylet_id("nonsense"); + owner_address.set_node_id("nonsense"); ChunkObjectReader reader(std::make_shared( SpilledObjectReader("path", 100 /* 
object_size */, @@ -334,12 +334,12 @@ TYPED_TEST(ObjectReaderTest, Getters) { std::string data("data"); std::string metadata("metadata"); rpc::Address owner_address; - owner_address.set_raylet_id("nonsense"); + owner_address.set_node_id("nonsense"); auto obj_reader = this->CreateObjectReader_(data, metadata, owner_address); ASSERT_EQ(data.size(), obj_reader->GetDataSize()); ASSERT_EQ(metadata.size(), obj_reader->GetMetadataSize()); ASSERT_EQ(data.size() + metadata.size(), obj_reader->GetObjectSize()); - ASSERT_EQ(owner_address.raylet_id(), obj_reader->GetOwnerAddress().raylet_id()); + ASSERT_EQ(owner_address.node_id(), obj_reader->GetOwnerAddress().node_id()); } TYPED_TEST(ObjectReaderTest, GetDataAndMetadata) { @@ -386,7 +386,7 @@ TYPED_TEST(ObjectReaderTest, GetChunk) { for (auto &metadata : list_metadata) { std::vector chunk_sizes{1, 2, 3, 5, 100}; rpc::Address owner_address; - owner_address.set_raylet_id("nonsense"); + owner_address.set_node_id("nonsense"); std::string expected_output = data + metadata; if (expected_output.size() != 0) { @@ -421,8 +421,8 @@ TEST(StringAllocationTest, TestNoCopyWhenStringMoved) { std::string s(1000, '\0'); auto allocation_address = s.c_str(); rpc::Address address; - address.set_raylet_id(std::move(s)); - EXPECT_EQ(allocation_address, address.raylet_id().c_str()); + address.set_node_id(std::move(s)); + EXPECT_EQ(allocation_address, address.node_id().c_str()); } TEST(StringAllocationTest, TestCopyWhenPassByPointer) { @@ -431,8 +431,8 @@ TEST(StringAllocationTest, TestCopyWhenPassByPointer) { char arr[1000]; auto allocation_address = &arr[0]; rpc::Address address; - address.set_raylet_id(allocation_address, 1000); - EXPECT_NE(allocation_address, address.raylet_id().c_str()); + address.set_node_id(allocation_address, 1000); + EXPECT_NE(allocation_address, address.node_id().c_str()); } } // namespace ray diff --git a/src/ray/observability/BUILD.bazel b/src/ray/observability/BUILD.bazel new file mode 100644 index 
000000000000..ac19b4f6eea2 --- /dev/null +++ b/src/ray/observability/BUILD.bazel @@ -0,0 +1,127 @@ +load("//bazel:ray.bzl", "ray_cc_library") + +ray_cc_library( + name = "open_telemetry_metric_recorder", + srcs = [ + "open_telemetry_metric_recorder.cc", + ], + hdrs = [ + "open_telemetry_metric_recorder.h", + ], + deps = [ + "//src/ray/util:logging", + "@com_google_absl//absl/container:flat_hash_map", + "@io_opentelemetry_cpp//api", + "@io_opentelemetry_cpp//exporters/otlp:otlp_grpc_metric_exporter", + "@io_opentelemetry_cpp//sdk/src/metrics", + ], +) + +ray_cc_library( + name = "metric_interface", + hdrs = ["metric_interface.h"], + deps = [ + "@io_opencensus_cpp//opencensus/stats", + ], +) + +ray_cc_library( + name = "fake_metric", + hdrs = [ + "fake_metric.h", + ], + deps = [ + ":metric_interface", + ], +) + +ray_cc_library( + name = "ray_event_interface", + hdrs = [ + "ray_event_interface.h", + ], + deps = [ + "//src/ray/protobuf/public:events_base_event_cc_proto", + ], +) + +ray_cc_library( + name = "ray_event", + hdrs = [ + "ray_event.h", + ], + deps = [ + ":ray_event_interface", + "//src/ray/common:grpc_util", + "//src/ray/common:id", + "//src/ray/protobuf:gcs_cc_proto", + "@com_google_absl//absl/time", + ], +) + +ray_cc_library( + name = "ray_driver_job_definition_event", + srcs = [ + "ray_driver_job_definition_event.cc", + ], + hdrs = [ + "ray_driver_job_definition_event.h", + ], + deps = [ + ":ray_event", + "//src/ray/protobuf/public:events_driver_job_definition_event_cc_proto", + ], +) + +ray_cc_library( + name = "ray_driver_job_execution_event", + srcs = [ + "ray_driver_job_execution_event.cc", + ], + hdrs = [ + "ray_driver_job_execution_event.h", + ], + deps = [ + ":ray_event", + "//src/ray/protobuf/public:events_driver_job_execution_event_cc_proto", + ], +) + +ray_cc_library( + name = "ray_event_recorder_interface", + hdrs = [ + "ray_event_recorder_interface.h", + ], + deps = [ + ":ray_event", + ], +) + +ray_cc_library( + name = "ray_event_recorder", + 
srcs = [ + "ray_event_recorder.cc", + ], + hdrs = [ + "ray_event_recorder.h", + ], + deps = [ + ":ray_event", + ":ray_event_recorder_interface", + "//src/ray/common:asio", + "//src/ray/protobuf:events_event_aggregator_service_cc_proto", + "//src/ray/protobuf:gcs_cc_proto", + "//src/ray/rpc:event_aggregator_client", + "//src/ray/util:logging", + "@com_google_absl//absl/time", + ], +) + +ray_cc_library( + name = "fake_ray_event_recorder", + hdrs = ["fake_ray_event_recorder.h"], + deps = [ + ":ray_event_interface", + ":ray_event_recorder_interface", + ], +) diff --git a/src/ray/observability/fake_metric.h b/src/ray/observability/fake_metric.h new file mode 100644 index 000000000000..8cafb45ded68 --- /dev/null +++ b/src/ray/observability/fake_metric.h @@ -0,0 +1,70 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "ray/observability/metric_interface.h" + +namespace ray { +namespace observability { + +class FakeMetric : public MetricInterface { + public: + FakeMetric() = default; + ~FakeMetric() = default; + + void Record(double value) override { Record(value, stats::TagsType{}); } + + void Record(double value, stats::TagsType tags) override { + absl::flat_hash_map tags_map; + for (const auto &tag : tags) { + tags_map[tag.first.name()] = tag.second; + } + tag_to_value_.emplace(std::move(tags_map), value); + } + + void Record(double value, + const std::unordered_map &tags) override { + stats::TagsType tags_pair_vec; + tags_pair_vec.reserve(tags.size()); + std::for_each(tags.begin(), tags.end(), [&tags_pair_vec](auto &tag) { + return tags_pair_vec.emplace_back(stats::TagKeyType::Register(tag.first), + std::move(tag.second)); + }); + Record(value, std::move(tags_pair_vec)); + } + + void Record(double value, + const std::unordered_map &tags) override { + stats::TagsType tags_pair_vec; + tags_pair_vec.reserve(tags.size()); + std::for_each(tags.begin(), tags.end(), [&tags_pair_vec](auto &tag) { + return tags_pair_vec.emplace_back(stats::TagKeyType::Register(tag.first), + std::move(tag.second)); + }); + Record(value, std::move(tags_pair_vec)); + } + + const absl::flat_hash_map, double> + &GetTagToValue() const { + return tag_to_value_; + } + + private: + absl::flat_hash_map, double> + tag_to_value_; +}; + +} // namespace observability +} // namespace ray diff --git a/src/ray/observability/fake_ray_event_recorder.h b/src/ray/observability/fake_ray_event_recorder.h new file mode 100644 index 000000000000..bbbecdf0d69a --- /dev/null +++ b/src/ray/observability/fake_ray_event_recorder.h @@ -0,0 +1,48 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "ray/observability/ray_event_interface.h" +#include "ray/observability/ray_event_recorder_interface.h" + +namespace ray { +namespace observability { + +class FakeRayEventRecorder : public RayEventRecorderInterface { + public: + void StartExportingEvents() override {} + void AddEvents(std::vector> &&data_list) override { + absl::MutexLock lock(&mutex_); + buffer_.insert(buffer_.end(), + std::make_move_iterator(data_list.begin()), + std::make_move_iterator(data_list.end())); + } + + const std::vector> FlushBuffer() { + absl::MutexLock lock(&mutex_); + auto buffer = std::move(buffer_); + buffer_.clear(); + return buffer; + } + + private: + std::vector> buffer_ ABSL_GUARDED_BY(mutex_); + absl::Mutex mutex_; +}; + +} // namespace observability +} // namespace ray diff --git a/src/ray/observability/metric_interface.h b/src/ray/observability/metric_interface.h new file mode 100644 index 000000000000..4ba235e0e0b7 --- /dev/null +++ b/src/ray/observability/metric_interface.h @@ -0,0 +1,49 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "opencensus/tags/tag_key.h" + +namespace ray { + +// TODO(can-anyscale): Use stats namespace for backward compatibility. We will remove +// these types soon when opencensus is removed, and then we can remove this namespace. +namespace stats { + +using TagKeyType = opencensus::tags::TagKey; +using TagsType = std::vector>; + +} // namespace stats + +namespace observability { + +class MetricInterface { + public: + virtual ~MetricInterface() = default; + + virtual void Record(double value) = 0; + virtual void Record(double value, stats::TagsType tags) = 0; + virtual void Record(double value, + const std::unordered_map &tags) = 0; + virtual void Record(double value, + const std::unordered_map &tags) = 0; +}; + +} // namespace observability +} // namespace ray diff --git a/src/ray/telemetry/open_telemetry_metric_recorder.cc b/src/ray/observability/open_telemetry_metric_recorder.cc similarity index 98% rename from src/ray/telemetry/open_telemetry_metric_recorder.cc rename to src/ray/observability/open_telemetry_metric_recorder.cc index 8db1f8e44984..8c851ed84f05 100644 --- a/src/ray/telemetry/open_telemetry_metric_recorder.cc +++ b/src/ray/observability/open_telemetry_metric_recorder.cc @@ -11,7 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/telemetry/open_telemetry_metric_recorder.h" +#include "ray/observability/open_telemetry_metric_recorder.h" #include #include @@ -33,7 +33,7 @@ // Anonymous namespace that contains the private callback functions for the // OpenTelemetry metrics. 
namespace { -using ray::telemetry::OpenTelemetryMetricRecorder; +using ray::observability::OpenTelemetryMetricRecorder; static void _DoubleGaugeCallback(opentelemetry::metrics::ObserverResult observer, void *state) { @@ -50,7 +50,7 @@ static void _DoubleGaugeCallback(opentelemetry::metrics::ObserverResult observer } // anonymous namespace namespace ray { -namespace telemetry { +namespace observability { OpenTelemetryMetricRecorder &OpenTelemetryMetricRecorder::GetInstance() { // Note: This creates a singleton instance of the OpenTelemetryMetricRecorder. The @@ -275,5 +275,5 @@ void OpenTelemetryMetricRecorder::SetSynchronousMetricValue( } } -} // namespace telemetry +} // namespace observability } // namespace ray diff --git a/src/ray/telemetry/open_telemetry_metric_recorder.h b/src/ray/observability/open_telemetry_metric_recorder.h similarity index 99% rename from src/ray/telemetry/open_telemetry_metric_recorder.h rename to src/ray/observability/open_telemetry_metric_recorder.h index 5401da24a994..f21181f1739a 100644 --- a/src/ray/telemetry/open_telemetry_metric_recorder.h +++ b/src/ray/observability/open_telemetry_metric_recorder.h @@ -31,7 +31,7 @@ #include "absl/container/flat_hash_map.h" namespace ray { -namespace telemetry { +namespace observability { // OpenTelemetryMetricRecorder is a singleton class that initializes the OpenTelemetry // grpc exporter and creates a Meter for recording metrics. 
It is responsible for @@ -159,5 +159,5 @@ class OpenTelemetryMetricRecorder { friend class OpenTelemetryMetricRecorderTest; friend class OpenTelemetryMetricRecorderTest_TestGaugeMetric_Test; }; -} // namespace telemetry +} // namespace observability } // namespace ray diff --git a/src/ray/observability/ray_driver_job_definition_event.cc b/src/ray/observability/ray_driver_job_definition_event.cc new file mode 100644 index 000000000000..11bfd03730a5 --- /dev/null +++ b/src/ray/observability/ray_driver_job_definition_event.cc @@ -0,0 +1,67 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/observability/ray_driver_job_definition_event.h" + +namespace ray { +namespace observability { + +RayDriverJobDefinitionEvent::RayDriverJobDefinitionEvent(const rpc::JobTableData &data, + const std::string &session_name) + : RayEvent( + rpc::events::RayEvent::GCS, + rpc::events::RayEvent::DRIVER_JOB_DEFINITION_EVENT, + rpc::events::RayEvent::INFO, + "", + session_name) { + data_.set_job_id(data.job_id()); + data_.set_driver_pid(data.driver_pid()); + data_.set_driver_node_id(data.driver_address().node_id()); + data_.set_entrypoint(data.entrypoint()); + data_.mutable_config()->mutable_metadata()->insert(data.config().metadata().begin(), + data.config().metadata().end()); + + auto runtime_env_info = data_.mutable_config()->mutable_runtime_env_info(); + runtime_env_info->set_serialized_runtime_env( + data.config().runtime_env_info().serialized_runtime_env()); + auto runtime_env_uris = runtime_env_info->mutable_uris(); + runtime_env_uris->set_working_dir_uri( + data.config().runtime_env_info().uris().working_dir_uri()); + runtime_env_uris->mutable_py_modules_uris()->CopyFrom( + data.config().runtime_env_info().uris().py_modules_uris()); + auto runtime_env_config = runtime_env_info->mutable_runtime_env_config(); + runtime_env_config->set_setup_timeout_seconds( + data.config().runtime_env_info().runtime_env_config().setup_timeout_seconds()); + runtime_env_config->set_eager_install( + data.config().runtime_env_info().runtime_env_config().eager_install()); + runtime_env_config->mutable_log_files()->CopyFrom( + data.config().runtime_env_info().runtime_env_config().log_files()); +} + +std::string RayDriverJobDefinitionEvent::GetEntityId() const { return data_.job_id(); } + +void RayDriverJobDefinitionEvent::MergeData( + RayEvent &&other) { + RAY_LOG(WARNING) << "Merge should not be called for driver job definition event."; + return; +} + +ray::rpc::events::RayEvent RayDriverJobDefinitionEvent::SerializeData() && { + ray::rpc::events::RayEvent event; + 
event.mutable_driver_job_definition_event()->Swap(&data_); + return event; +} + +} // namespace observability +} // namespace ray diff --git a/src/ray/observability/ray_driver_job_definition_event.h b/src/ray/observability/ray_driver_job_definition_event.h new file mode 100644 index 000000000000..6ff80ba48c54 --- /dev/null +++ b/src/ray/observability/ray_driver_job_definition_event.h @@ -0,0 +1,38 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/observability/ray_event.h" +#include "src/ray/protobuf/gcs.pb.h" +#include "src/ray/protobuf/public/events_driver_job_definition_event.pb.h" + +namespace ray { +namespace observability { + +template class RayEvent; + +class RayDriverJobDefinitionEvent + : public RayEvent { + public: + RayDriverJobDefinitionEvent(const rpc::JobTableData &data, + const std::string &session_name); + + std::string GetEntityId() const override; + + protected: + ray::rpc::events::RayEvent SerializeData() && override; + void MergeData(RayEvent &&other) override; +}; + +} // namespace observability +} // namespace ray diff --git a/src/ray/observability/ray_driver_job_execution_event.cc b/src/ray/observability/ray_driver_job_execution_event.cc new file mode 100644 index 000000000000..fba7b274499a --- /dev/null +++ b/src/ray/observability/ray_driver_job_execution_event.cc @@ -0,0 +1,56 @@ +// Copyright 2025 The Ray Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/observability/ray_driver_job_execution_event.h" + +namespace ray { +namespace observability { + +RayDriverJobExecutionEvent::RayDriverJobExecutionEvent( + const rpc::JobTableData &data, + rpc::events::DriverJobExecutionEvent::State state, + const std::string &session_name) + : RayEvent( + rpc::events::RayEvent::GCS, + rpc::events::RayEvent::DRIVER_JOB_EXECUTION_EVENT, + rpc::events::RayEvent::INFO, + "", + session_name) { + ray::rpc::events::DriverJobExecutionEvent::StateTimestamp state_timestamp; + state_timestamp.set_state(state); + state_timestamp.mutable_timestamp()->CopyFrom(AbslTimeNanosToProtoTimestamp( + absl::ToInt64Nanoseconds(absl::Now() - absl::UnixEpoch()))); + + data_.mutable_states()->Add(std::move(state_timestamp)); + data_.set_job_id(data.job_id()); +} + +std::string RayDriverJobExecutionEvent::GetEntityId() const { return data_.job_id(); } + +void RayDriverJobExecutionEvent::MergeData( + RayEvent &&other) { + auto &&other_event = static_cast(other); + for (auto &state : *other_event.data_.mutable_states()) { + data_.mutable_states()->Add(std::move(state)); + } +} + +ray::rpc::events::RayEvent RayDriverJobExecutionEvent::SerializeData() && { + ray::rpc::events::RayEvent event; + event.mutable_driver_job_execution_event()->Swap(&data_); + return event; +} + +} // namespace observability +} // namespace ray diff --git a/src/ray/observability/ray_driver_job_execution_event.h 
b/src/ray/observability/ray_driver_job_execution_event.h new file mode 100644 index 000000000000..fd3d34ffe078 --- /dev/null +++ b/src/ray/observability/ray_driver_job_execution_event.h @@ -0,0 +1,39 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/common/grpc_util.h" +#include "ray/observability/ray_event.h" +#include "src/ray/protobuf/gcs.pb.h" +#include "src/ray/protobuf/public/events_driver_job_execution_event.pb.h" + +namespace ray { +namespace observability { + +template class RayEvent; + +class RayDriverJobExecutionEvent : public RayEvent { + public: + RayDriverJobExecutionEvent(const rpc::JobTableData &data, + rpc::events::DriverJobExecutionEvent::State state, + const std::string &session_name); + + std::string GetEntityId() const override; + + protected: + ray::rpc::events::RayEvent SerializeData() && override; + void MergeData(RayEvent &&other) override; +}; + +} // namespace observability +} // namespace ray diff --git a/src/ray/observability/ray_event.h b/src/ray/observability/ray_event.h new file mode 100644 index 000000000000..32711740723b --- /dev/null +++ b/src/ray/observability/ray_event.h @@ -0,0 +1,81 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "absl/time/time.h" +#include "ray/common/grpc_util.h" +#include "ray/common/id.h" +#include "ray/observability/ray_event_interface.h" +#include "src/ray/protobuf/public/events_base_event.pb.h" + +namespace ray { +namespace observability { + +// RayEvent is a base class for all Ray events. It is used to serialize the event data +// to a RayEvent proto before sending it to the aggregator. +template +class RayEvent : public RayEventInterface { + public: + void Merge(RayEventInterface &&other) override { + RAY_CHECK_EQ(GetEntityId(), other.GetEntityId()); + RAY_CHECK_EQ(GetEventType(), other.GetEventType()); + MergeData(static_cast &&>(other)); + } + + ray::rpc::events::RayEvent Serialize() && override { + ray::rpc::events::RayEvent event = std::move(*this).SerializeData(); + event.set_event_id(UniqueID::FromRandom().Binary()); + event.set_source_type(source_type_); + event.set_event_type(event_type_); + event.set_severity(severity_); + event.set_message(message_); + event.set_session_name(session_name_); + event.mutable_timestamp()->CopyFrom(AbslTimeNanosToProtoTimestamp( + absl::ToInt64Nanoseconds(event_timestamp_ - absl::UnixEpoch()))); + + return event; + } + + ray::rpc::events::RayEvent::EventType GetEventType() const override { + return event_type_; + } + + protected: + RayEvent(ray::rpc::events::RayEvent::SourceType source_type, + ray::rpc::events::RayEvent::EventType event_type, + ray::rpc::events::RayEvent::Severity severity, + const std::string &message, + const std::string &session_name) + : 
source_type_(source_type), + event_type_(event_type), + severity_(severity), + message_(message), + session_name_(session_name) { + event_timestamp_ = absl::Now(); + } + + T data_; // The nested event message within the RayEvent proto. + absl::Time event_timestamp_; + ray::rpc::events::RayEvent::SourceType source_type_; + ray::rpc::events::RayEvent::EventType event_type_; + ray::rpc::events::RayEvent::Severity severity_; + std::string message_; + std::string session_name_; + virtual void MergeData(RayEvent &&other) = 0; + virtual ray::rpc::events::RayEvent SerializeData() && = 0; +}; + +} // namespace observability +} // namespace ray diff --git a/src/ray/observability/ray_event_interface.h b/src/ray/observability/ray_event_interface.h new file mode 100644 index 000000000000..fc2a358ef311 --- /dev/null +++ b/src/ray/observability/ray_event_interface.h @@ -0,0 +1,64 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "src/ray/protobuf/public/events_base_event.pb.h" + +namespace ray { +namespace observability { + +class RayEventInterface { + public: + virtual ~RayEventInterface() = default; + + // Entity ID is a concept in Ray Event framework that captures the unique identifier + // of the entity that the event is associated with. For example, the entity ID of + // a task is the pair of task ID and task attempt ID, for a driver job, it is the + // driver job ID. 
+ // + // Entity ID is used for two purposes: + // 1. To associate the execution event with the definition event. + // 2. To merge the individual execution events into a single execution event (a single + // data point in a time series). + virtual std::string GetEntityId() const = 0; + + // Merge with another data point to form a time series. Merge is meant as an + // optimization for the data size. + // + // For example, given three events: + // + // 1. event 1: {entity_id: "1", type: "task", state_transitions: [("started", 1000)]} + // 2. event 2: {entity_id: "1", type: "task", state_transitions: [("running", 1001)]} + // 3. event 3: {entity_id: "1", type: "task", state_transitions: [("completed", 1002)]} + // + // The merged event will be: + // + // {entity_id: "1", type: "task", + // state_transitions: [("started", 1000), ("running", 1001), + // ("completed", 1002)]} + // + // This function assumes that the two events have the same type and entity ID. + virtual void Merge(RayEventInterface &&other) = 0; + + // Serialize the event data to a RayEvent proto. + virtual ray::rpc::events::RayEvent Serialize() && = 0; + + virtual ray::rpc::events::RayEvent::EventType GetEventType() const = 0; +}; + +} // namespace observability +} // namespace ray diff --git a/src/ray/observability/ray_event_recorder.cc b/src/ray/observability/ray_event_recorder.cc new file mode 100644 index 000000000000..0d214a445a87 --- /dev/null +++ b/src/ray/observability/ray_event_recorder.cc @@ -0,0 +1,74 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/observability/ray_event_recorder.h" + +#include "src/ray/protobuf/gcs.pb.h" + +namespace ray { +namespace observability { + +RayEventRecorder::RayEventRecorder(rpc::EventAggregatorClient &event_aggregator_client, + instrumented_io_context &io_service) + : event_aggregator_client_(event_aggregator_client), + periodical_runner_(PeriodicalRunner::Create(io_service)) {} + +void RayEventRecorder::StartExportingEvents() { + absl::MutexLock lock(&mutex_); + RAY_CHECK(!exporting_started_) + << "RayEventRecorder::StartExportingEvents() should be called only once."; + exporting_started_ = true; + periodical_runner_->RunFnPeriodically( + [this]() { ExportEvents(); }, + RayConfig::instance().ray_events_report_interval_ms(), + "RayEventRecorder.ExportEvents"); +} + +void RayEventRecorder::ExportEvents() { + absl::MutexLock lock(&mutex_); + if (buffer_.empty()) { + return; + } + rpc::events::AddEventsRequest request; + rpc::events::RayEventsData ray_event_data; + // TODO(#56391): To further optimize the performance, we can merge multiple + // events with the same entity ID into a single event. + for (auto &event : buffer_) { + rpc::events::RayEvent ray_event = std::move(*event).Serialize(); + *ray_event_data.mutable_events()->Add() = std::move(ray_event); + } + *request.mutable_events_data() = std::move(ray_event_data); + buffer_.clear(); + + event_aggregator_client_.AddEvents( + request, [](Status status, rpc::events::AddEventsReply reply) { + if (!status.ok()) { + // TODO(#56391): Add a metric to track the number of failed events. Also + // add logic for error recovery. 
+ RAY_LOG(ERROR) << "Failed to record ray event: " << status.ToString(); + } + }); +} + +void RayEventRecorder::AddEvents( + std::vector> &&data_list) { + absl::MutexLock lock(&mutex_); + buffer_.reserve(buffer_.size() + data_list.size()); + for (auto &data : data_list) { + buffer_.emplace_back(std::move(data)); + } +} + +} // namespace observability +} // namespace ray diff --git a/src/ray/observability/ray_event_recorder.h b/src/ray/observability/ray_event_recorder.h new file mode 100644 index 000000000000..2650f99378c8 --- /dev/null +++ b/src/ray/observability/ray_event_recorder.h @@ -0,0 +1,69 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "absl/synchronization/mutex.h" +#include "absl/time/time.h" +#include "google/protobuf/timestamp.pb.h" +#include "ray/common/asio/periodical_runner.h" +#include "ray/common/ray_config.h" +#include "ray/observability/ray_event_interface.h" +#include "ray/observability/ray_event_recorder_interface.h" +#include "ray/rpc/event_aggregator_client.h" +#include "ray/util/logging.h" +#include "src/ray/protobuf/public/events_base_event.pb.h" + +namespace ray { +namespace observability { + +// RayEventRecorder is a class for recording different types of Ray +// events (e.g. task events, job events, etc.). Internal buffer is used to store events +// before sending to the event aggregator. 
Events are converted to RayEvent proto only +// when they are exported. PeriodicalRunner is used to send events to the event +// aggregator periodically. +// +// This class is thread safe. +class RayEventRecorder : public RayEventRecorderInterface { + public: + RayEventRecorder(rpc::EventAggregatorClient &event_aggregator_client, + instrumented_io_context &io_service); + virtual ~RayEventRecorder() = default; + + // Start exporting events to the event aggregator by periodically sending events to + // the event aggregator. This must be called only once; a second call fails a + // RAY_CHECK. + void StartExportingEvents(); + + // Add a vector of data to the internal buffer. Data in the buffer will be sent to + // the event aggregator periodically. + void AddEvents(std::vector> &&data_list); + + private: + rpc::EventAggregatorClient &event_aggregator_client_; + std::shared_ptr periodical_runner_; + // Lock for thread safety when modifying the buffer. + absl::Mutex mutex_; + // Buffer to store events before sending to the event aggregator. + // TODO(#56391): Add a max size for the buffer and overflow recovery logic. + std::vector> buffer_ ABSL_GUARDED_BY(mutex_); + // Flag to track if exporting has been started + bool exporting_started_ ABSL_GUARDED_BY(mutex_) = false; + // Export events to the event aggregator. This is called periodically by the + // PeriodicalRunner. + void ExportEvents(); +}; + +} // namespace observability +} // namespace ray diff --git a/src/ray/observability/ray_event_recorder_interface.h b/src/ray/observability/ray_event_recorder_interface.h new file mode 100644 index 000000000000..f6e80e38eae2 --- /dev/null +++ b/src/ray/observability/ray_event_recorder_interface.h @@ -0,0 +1,40 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "ray/observability/ray_event_interface.h" + +namespace ray { +namespace observability { + +class RayEventRecorderInterface { + public: + virtual ~RayEventRecorderInterface() = default; + + // Start exporting events to the event aggregator by periodically sending events to + // the event aggregator. This should be called only once. Repeated calls are + // implementation-defined; RayEventRecorder fails a RAY_CHECK on them. + virtual void StartExportingEvents() = 0; + + // Add a vector of data to the internal buffer. Data in the buffer will be sent to + // the event aggregator periodically. 
+ virtual void AddEvents(std::vector> &&data_list) = 0; +}; + +} // namespace observability +} // namespace ray diff --git a/src/ray/observability/tests/BUILD.bazel b/src/ray/observability/tests/BUILD.bazel new file mode 100644 index 000000000000..db97a32197fa --- /dev/null +++ b/src/ray/observability/tests/BUILD.bazel @@ -0,0 +1,36 @@ +load("//bazel:ray.bzl", "ray_cc_test") + +ray_cc_test( + name = "open_telemetry_metric_recorder_test", + size = "small", + srcs = ["open_telemetry_metric_recorder_test.cc"], + tags = ["team:core"], + deps = [ + "//src/ray/observability:open_telemetry_metric_recorder", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "ray_event_recorder_test", + size = "small", + srcs = ["ray_event_recorder_test.cc"], + tags = ["team:core"], + deps = [ + "//src/ray/observability:ray_driver_job_definition_event", + "//src/ray/observability:ray_driver_job_execution_event", + "//src/ray/observability:ray_event_recorder", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "ray_driver_job_execution_event_test", + size = "small", + srcs = ["ray_driver_job_execution_event_test.cc"], + tags = ["team:core"], + deps = [ + "//src/ray/observability:ray_driver_job_execution_event", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/src/ray/telemetry/tests/open_telemetry_metric_recorder_test.cc b/src/ray/observability/tests/open_telemetry_metric_recorder_test.cc similarity index 96% rename from src/ray/telemetry/tests/open_telemetry_metric_recorder_test.cc rename to src/ray/observability/tests/open_telemetry_metric_recorder_test.cc index 0c38f24a1532..c4cca1a6b896 100644 --- a/src/ray/telemetry/tests/open_telemetry_metric_recorder_test.cc +++ b/src/ray/observability/tests/open_telemetry_metric_recorder_test.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/telemetry/open_telemetry_metric_recorder.h" +#include "ray/observability/open_telemetry_metric_recorder.h" #include "gtest/gtest.h" namespace ray { -namespace telemetry { +namespace observability { class OpenTelemetryMetricRecorderTest : public ::testing::Test { public: @@ -84,5 +84,5 @@ TEST_F(OpenTelemetryMetricRecorderTest, TestHistogramMetric) { ASSERT_TRUE(recorder_.IsMetricRegistered("test_histogram")); } -} // namespace telemetry +} // namespace observability } // namespace ray diff --git a/src/ray/observability/tests/ray_driver_job_execution_event_test.cc b/src/ray/observability/tests/ray_driver_job_execution_event_test.cc new file mode 100644 index 000000000000..1e6a26a0ea53 --- /dev/null +++ b/src/ray/observability/tests/ray_driver_job_execution_event_test.cc @@ -0,0 +1,41 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/observability/ray_driver_job_execution_event.h" + +#include "gtest/gtest.h" + +namespace ray { +namespace observability { + +class RayDriverJobExecutionEventTest : public ::testing::Test {}; + +TEST_F(RayDriverJobExecutionEventTest, TestMerge) { + rpc::JobTableData data; + data.set_job_id("test_job_id_1"); + auto event1 = std::make_unique( + data, rpc::events::DriverJobExecutionEvent::CREATED, "test_session_name_1"); + auto event2 = std::make_unique( + data, rpc::events::DriverJobExecutionEvent::FINISHED, "test_session_name_1"); + event1->Merge(std::move(*event2)); + auto serialized_event = std::move(*event1).Serialize(); + ASSERT_EQ(serialized_event.driver_job_execution_event().states_size(), 2); + ASSERT_EQ(serialized_event.driver_job_execution_event().states(0).state(), + rpc::events::DriverJobExecutionEvent::CREATED); + ASSERT_EQ(serialized_event.driver_job_execution_event().states(1).state(), + rpc::events::DriverJobExecutionEvent::FINISHED); +} + +} // namespace observability +} // namespace ray diff --git a/src/ray/observability/tests/ray_event_recorder_test.cc b/src/ray/observability/tests/ray_event_recorder_test.cc new file mode 100644 index 000000000000..13a64f8aa4bf --- /dev/null +++ b/src/ray/observability/tests/ray_event_recorder_test.cc @@ -0,0 +1,120 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/observability/ray_event_recorder.h" + +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "ray/common/asio/instrumented_io_context.h" +#include "ray/common/ray_config.h" +#include "ray/observability/ray_driver_job_definition_event.h" +#include "ray/observability/ray_driver_job_execution_event.h" +#include "src/ray/protobuf/gcs.pb.h" +#include "src/ray/protobuf/public/events_base_event.pb.h" +#include "src/ray/protobuf/public/events_driver_job_execution_event.pb.h" + +namespace ray { +namespace observability { + +class FakeEventAggregatorClient : public rpc::EventAggregatorClient { + public: + FakeEventAggregatorClient() {} + + void AddEvents( + const rpc::events::AddEventsRequest &request, + const rpc::ClientCallback &callback) override { + absl::MutexLock lock(&mutex_); + for (const auto &event : request.events_data().events()) { + recorded_events_.push_back(event); + } + callback(Status::OK(), rpc::events::AddEventsReply{}); + } + + std::vector GetRecordedEvents() { + absl::MutexLock lock(&mutex_); + return recorded_events_; + } + + private: + std::vector recorded_events_ ABSL_GUARDED_BY(mutex_); + absl::Mutex mutex_; +}; + +class RayEventRecorderTest : public ::testing::Test { + public: + RayEventRecorderTest() { + fake_client_ = std::make_unique(); + recorder_ = std::make_unique(*fake_client_, io_service_); + recorder_->StartExportingEvents(); + } + + instrumented_io_context io_service_; + std::unique_ptr fake_client_; + std::unique_ptr recorder_; +}; + +TEST_F(RayEventRecorderTest, TestRecordEvents) { + rpc::JobTableData data1; + data1.set_job_id("test_job_id_1"); + data1.set_is_dead(false); + data1.set_driver_pid(12345); + data1.set_start_time(absl::ToUnixSeconds(absl::Now())); + data1.set_end_time(0); + data1.set_entrypoint("python test_script.py"); + data1.mutable_driver_address()->set_ip_address("127.0.0.1"); + + rpc::JobTableData data2; + data2.set_job_id("test_job_id_2"); + 
data2.set_is_dead(true); + data2.set_driver_pid(67890); + data2.set_start_time(absl::ToUnixSeconds(absl::Now()) - 3600); // 1 hour ago + data2.set_end_time(absl::ToUnixSeconds(absl::Now())); + data2.set_entrypoint("python another_script.py"); + data2.mutable_driver_address()->set_ip_address("192.168.1.100"); + + std::vector> events; + events.push_back( + std::make_unique(data1, "test_session_name_1")); + events.push_back(std::make_unique( + data2, rpc::events::DriverJobExecutionEvent::FINISHED, "test_session_name_2")); + recorder_->AddEvents(std::move(events)); + io_service_.run_one(); + + std::vector recorded_events = fake_client_->GetRecordedEvents(); + // Verify first event + ASSERT_EQ(recorded_events.size(), 2); + ASSERT_EQ(recorded_events[0].source_type(), rpc::events::RayEvent::GCS); + ASSERT_EQ(recorded_events[0].session_name(), "test_session_name_1"); + ASSERT_EQ(recorded_events[0].event_type(), + rpc::events::RayEvent::DRIVER_JOB_DEFINITION_EVENT); + ASSERT_EQ(recorded_events[0].severity(), rpc::events::RayEvent::INFO); + ASSERT_TRUE(recorded_events[0].has_driver_job_definition_event()); + ASSERT_EQ(recorded_events[0].driver_job_definition_event().job_id(), "test_job_id_1"); + + // Verify second event + ASSERT_EQ(recorded_events[1].source_type(), rpc::events::RayEvent::GCS); + ASSERT_EQ(recorded_events[1].session_name(), "test_session_name_2"); + ASSERT_EQ(recorded_events[1].event_type(), + rpc::events::RayEvent::DRIVER_JOB_EXECUTION_EVENT); + ASSERT_EQ(recorded_events[1].severity(), rpc::events::RayEvent::INFO); + ASSERT_TRUE(recorded_events[1].has_driver_job_execution_event()); + ASSERT_EQ(recorded_events[1].driver_job_execution_event().job_id(), "test_job_id_2"); +} + +} // namespace observability +} // namespace ray diff --git a/src/ray/protobuf/BUILD.bazel b/src/ray/protobuf/BUILD.bazel index d1bf2f6f2b66..804988e7714b 100644 --- a/src/ray/protobuf/BUILD.bazel +++ b/src/ray/protobuf/BUILD.bazel @@ -8,9 +8,12 @@ package(default_visibility = 
["//visibility:public"]) proto_library( name = "common_proto", srcs = ["common.proto"], - visibility = ["//java:__subpackages__"], + visibility = [ + ":__subpackages__", + "//java:__subpackages__", + ], deps = [ - ":runtime_env_common_proto", + "//src/ray/protobuf/public:runtime_environment_proto", ], ) @@ -54,7 +57,10 @@ cc_proto_library( proto_library( name = "runtime_env_common_proto", srcs = ["runtime_env_common.proto"], - visibility = ["//java:__subpackages__"], + visibility = [ + ":__subpackages__", + "//java:__subpackages__", + ], ) proto_library( @@ -101,7 +107,7 @@ proto_library( ":autoscaler_proto", ":common_proto", ":gcs_proto", - ":runtime_env_common_proto", + "//src/ray/protobuf/public:runtime_environment_proto", ], ) @@ -248,6 +254,7 @@ proto_library( deps = [ ":export_actor_event_proto", ":export_dataset_metadata_proto", + ":export_dataset_operator_event_proto", ":export_driver_job_event_proto", ":export_node_event_proto", ":export_submission_job_event_proto", @@ -341,6 +348,16 @@ cc_proto_library( deps = [":export_train_state_proto"], ) +proto_library( + name = "export_dataset_operator_event_proto", + srcs = ["export_dataset_operator_event.proto"], +) + +cc_proto_library( + name = "export_dataset_operator_event_cc_proto", + deps = [":export_dataset_operator_event_proto"], +) + proto_library( name = "export_dataset_metadata_proto", srcs = ["export_dataset_metadata.proto"], @@ -396,6 +413,7 @@ proto_library( deps = [ ":common_proto", ":runtime_env_common_proto", + "//src/ray/protobuf/public:runtime_environment_proto", ], ) @@ -419,7 +437,6 @@ proto_library( srcs = ["autoscaler.proto"], deps = [ ":common_proto", - ":runtime_env_common_proto", ], ) @@ -437,62 +454,6 @@ cc_grpc_library( ], ) -proto_library( - name = "events_actor_task_definition_event_proto", - srcs = ["events_actor_task_definition_event.proto"], - deps = [ - ":common_proto", - ":runtime_env_common_proto", - ], -) - -cc_proto_library( - name = 
"events_actor_task_definition_event_cc_proto", - deps = [":events_actor_task_definition_event_proto"], -) - -proto_library( - name = "events_actor_task_execution_event_proto", - srcs = ["events_actor_task_execution_event.proto"], - deps = [ - ":common_proto", - "@com_google_protobuf//:timestamp_proto", - ], -) - -cc_proto_library( - name = "events_actor_task_execution_event_cc_proto", - deps = [":events_actor_task_execution_event_proto"], -) - -proto_library( - name = "events_task_definition_event_proto", - srcs = ["events_task_definition_event.proto"], - deps = [ - ":common_proto", - ":runtime_env_common_proto", - ], -) - -cc_proto_library( - name = "events_task_definition_event_cc_proto", - deps = [":events_task_definition_event_proto"], -) - -proto_library( - name = "events_task_execution_event_proto", - srcs = ["events_task_execution_event.proto"], - deps = [ - ":common_proto", - "@com_google_protobuf//:timestamp_proto", - ], -) - -cc_proto_library( - name = "events_task_execution_event_cc_proto", - deps = [":events_task_execution_event_proto"], -) - proto_library( name = "events_task_profile_events_proto", srcs = ["events_task_profile_events.proto"], @@ -506,30 +467,12 @@ cc_proto_library( deps = [":events_task_profile_events_proto"], ) -proto_library( - name = "events_base_event_proto", - srcs = ["events_base_event.proto"], - deps = [ - ":events_actor_task_definition_event_proto", - ":events_actor_task_execution_event_proto", - ":events_task_definition_event_proto", - ":events_task_execution_event_proto", - ":events_task_profile_events_proto", - "@com_google_protobuf//:timestamp_proto", - ], -) - -cc_proto_library( - name = "events_base_event_cc_proto", - deps = [":events_base_event_proto"], -) - proto_library( name = "events_event_aggregator_service_proto", srcs = ["events_event_aggregator_service.proto"], deps = [ ":common_proto", - ":events_base_event_proto", + "//src/ray/protobuf/public:events_base_event_proto", ], ) @@ -566,6 +509,7 @@ 
python_grpc_compile( ":runtime_env_agent_proto", ":runtime_env_common_proto", ":usage_proto", + "//src/ray/protobuf/public:runtime_environment_proto", ], ) diff --git a/src/ray/protobuf/autoscaler.proto b/src/ray/protobuf/autoscaler.proto index 9ad2ef7b191c..e463b08c1bed 100644 --- a/src/ray/protobuf/autoscaler.proto +++ b/src/ray/protobuf/autoscaler.proto @@ -150,7 +150,11 @@ message NodeState { // The corresponding total resources on the node. map total_resources = 5; - // Dynamic labels associated with the node. + // DEPRECATED: This field is part of the deprecated dynamic labels feature and + // must not be used in new code. It is retained solely for backward compatibility + // in the autoscaler, where it is required to retrieve the placement group ID for + // enforcing antiaffinity constraints in strict-spread placement group scheduling. + // // Reserved dynamic label names: _PG map dynamic_labels = 6; @@ -214,7 +218,7 @@ message ClusterResourceState { // There could be multiple constraints issued by different // jobs. Autoscaler to make sure all constraints are satisfied. repeated ClusterResourceConstraint cluster_resource_constraints = 6; - // The cluster session name. + // The current Ray session name. string cluster_session_name = 7; } diff --git a/src/ray/protobuf/common.proto b/src/ray/protobuf/common.proto index d03d9286e7ff..410e6bfd5e05 100644 --- a/src/ray/protobuf/common.proto +++ b/src/ray/protobuf/common.proto @@ -16,7 +16,7 @@ syntax = "proto3"; package ray.rpc; -import "src/ray/protobuf/runtime_env_common.proto"; +import "src/ray/protobuf/public/runtime_environment.proto"; option java_package = "io.ray.runtime.generated"; @@ -124,7 +124,7 @@ message SchedulingStrategy { // Address of a worker or node manager. message Address { - bytes raylet_id = 1; + bytes node_id = 1; string ip_address = 2; int32 port = 3; // Optional unique id for the worker. 
@@ -456,6 +456,31 @@ message StreamingGeneratorReturnIdInfo { bool is_plasma_object = 2; } +message LeaseSpec { + bytes lease_id = 1; + bytes job_id = 2; + Address caller_address = 3; + TaskType type = 4; + bytes actor_id = 5; + bool is_detached_actor = 6; + bytes root_detached_actor_id = 7; + int64 max_actor_restarts = 8; + map required_resources = 9; + map required_placement_resources = 10; + SchedulingStrategy scheduling_strategy = 11; + map label_selector = 12; + int64 depth = 13; + RuntimeEnvInfo runtime_env_info = 14; + repeated ObjectReference dependencies = 15; + bytes parent_task_id = 16; + Language language = 17; + FunctionDescriptor function_descriptor = 18; + repeated string dynamic_worker_options = 19; + int32 max_retries = 20; + uint64 attempt_number = 21; + string task_name = 22; +} + /// The task specification encapsulates all immutable information about the /// task. message TaskSpec { @@ -655,8 +680,6 @@ message PlacementGroupSpec { bool creator_actor_dead = 8; // Whether the placement group is persistent. bool is_detached = 9; - // The maximum fraction of CPU cores that this placement group can use on each node. - double max_cpu_fraction_per_node = 10; // Binary ID of the target node where bundles should be placed // iff the target node has enough available resources and alive. // Otherwise, the bundles can be placed elsewhere. @@ -673,6 +696,9 @@ message ObjectReference { // Used to print debugging information if there is an error retrieving the // object. string call_site = 3; + // The tensor transport to use for this object. If not specified, then use the + // default object store. + optional TensorTransport tensor_transport = 4; } message ObjectReferenceCount { @@ -707,6 +733,8 @@ enum TensorTransport { NCCL = 1; // Use GLOO for tensor transport. GLOO = 2; + // Use NIXL for tensor transport. + NIXL = 3; } // Argument in the task. @@ -931,53 +959,50 @@ message ResourceAllocations { // Debug info returned from the core worker. 
message CoreWorkerStats { - reserved 1; // Number of pending normal and actor tasks. - int32 num_pending_tasks = 2; + int32 num_pending_tasks = 1; // Number of object refs in local scope. - int32 num_object_refs_in_scope = 3; + int32 num_object_refs_in_scope = 2; // IP address of the core worker. - string ip_address = 7; + string ip_address = 3; // Port of the core worker. - int64 port = 8; + int64 port = 4; // Actor ID. - bytes actor_id = 9; + bytes actor_id = 5; // A map from the resource name (e.g. "CPU") to its allocation. - map used_resources = 10; + map used_resources = 6; // A string displayed on Dashboard. - map webui_display = 11; + map webui_display = 7; // Number of objects that are IN_PLASMA_ERROR in the local memory store. - int32 num_in_plasma = 12; + int32 num_in_plasma = 8; // Number of objects stored in local memory. - int32 num_local_objects = 13; + int32 num_local_objects = 9; // Used local object store memory. - int64 used_object_store_memory = 14; + int64 used_object_store_memory = 10; // Length of the task queue. - int32 task_queue_length = 15; + int32 task_queue_length = 11; // Number of executed tasks. - int32 num_executed_tasks = 16; - // Actor constructor. - string actor_title = 17; + int32 num_executed_tasks = 12; // Local reference table. - repeated ObjectRefInfo object_refs = 18; + repeated ObjectRefInfo object_refs = 13; // Job ID. - bytes job_id = 19; + bytes job_id = 14; // Worker id of core worker. - bytes worker_id = 20; + bytes worker_id = 15; // Language - Language language = 21; + Language language = 16; // PID of the worker process. - uint32 pid = 22; + uint32 pid = 17; // The worker type. - WorkerType worker_type = 23; + WorkerType worker_type = 18; // Length of the number of objects without truncation. - int64 objects_total = 24; + int64 objects_total = 19; // Number of objects owned by the worker. - int64 num_owned_objects = 25; + int64 num_owned_objects = 20; // Number of actors owned by the worker. 
- int64 num_owned_actors = 26; + int64 num_owned_actors = 21; // Number of running tasks - int64 num_running_tasks = 27; + int64 num_running_tasks = 22; } // Resource usage reported by the node reporter. diff --git a/src/ray/protobuf/core_worker.proto b/src/ray/protobuf/core_worker.proto index 38383ce11172..401d1fc7a424 100644 --- a/src/ray/protobuf/core_worker.proto +++ b/src/ray/protobuf/core_worker.proto @@ -72,6 +72,8 @@ message ActorHandle { // The key-value labels for actor. map labels = 15; + + bool enable_tensor_transport = 16; } message PushTaskRequest { diff --git a/src/ray/protobuf/events_actor_task_execution_event.proto b/src/ray/protobuf/events_actor_task_execution_event.proto deleted file mode 100644 index 3e7ae892c769..000000000000 --- a/src/ray/protobuf/events_actor_task_execution_event.proto +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2025 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -syntax = "proto3"; - -import "src/ray/protobuf/common.proto"; -import "google/protobuf/timestamp.proto"; - -package ray.rpc.events; - -// Message containing the execution information of an actor task. -message ActorTaskExecutionEvent { - // task_id and task_attempt form the unique identifier of a task. - bytes task_id = 1; - int32 task_attempt = 2; - - // The actor task execution information - - // The map of task state to the time when the state was last updated. 
- - // Key is the integer value of TaskStatus enum (protobuf doesn't support Enum as key). - // Value is the timestamp when status changes to the target status indicated by the key. - map task_state = 3; - UserErrorInfo user_error_info = 4; - RayErrorInfo ray_error_info = 5; - - // The correlation ids of the task that can be used to correlate the task with - // other events. - bytes node_id = 6; - bytes worker_id = 7; - int32 worker_pid = 8; -} diff --git a/src/ray/protobuf/events_event_aggregator_service.proto b/src/ray/protobuf/events_event_aggregator_service.proto index 896a9d9be350..9c1557ca7563 100644 --- a/src/ray/protobuf/events_event_aggregator_service.proto +++ b/src/ray/protobuf/events_event_aggregator_service.proto @@ -15,7 +15,7 @@ syntax = "proto3"; import "src/ray/protobuf/common.proto"; -import "src/ray/protobuf/events_base_event.proto"; +import "src/ray/protobuf/public/events_base_event.proto"; package ray.rpc.events; diff --git a/src/ray/protobuf/export_dataset_metadata.proto b/src/ray/protobuf/export_dataset_metadata.proto index ce78d1458b8d..e9135441f7e1 100644 --- a/src/ray/protobuf/export_dataset_metadata.proto +++ b/src/ray/protobuf/export_dataset_metadata.proto @@ -30,6 +30,14 @@ message SubStage { // Represents a data processing operator in the DAG message Operator { + enum OperatorState { + UNKNOWN = 0; + RUNNING = 1; + FINISHED = 2; + FAILED = 3; + PENDING = 4; + } + // Name of the operator string name = 1; @@ -53,6 +61,15 @@ message Operator { // can be found in `_get_logical_args`, and is used to help understand how a // user's arguments lead to a dataset's state execution google.protobuf.Struct args = 6; + + // The timestamp when execution starts (in seconds since epoch) + double execution_start_time = 7; + + // The timestamp when execution ends (in seconds since epoch) + double execution_end_time = 8; + + // The state of the operator + OperatorState state = 9; } // Represents the complete structure of the operator DAG @@ -63,6 +80,21 
@@ message Topology { // Top-level message containing full metadata about a Ray Data execution message ExportDatasetMetadata { + enum DatasetState { + UNKNOWN = 0; + RUNNING = 1; + FINISHED = 2; + FAILED = 3; + PENDING = 4; + } + + message DashboardPanelMetadata { + // Unique identifier for the panel + string id = 1; + // Display name of the panel + string title = 2; + } + // The operator DAG structure Topology topology = 1; @@ -72,9 +104,22 @@ message ExportDatasetMetadata { // The Ray Job ID string job_id = 3; - // The timestamp when execution started (in seconds since epoch) + // The timestamp when dataset is registered (in seconds since epoch) double start_time = 4; // The data context attached to the dataset. google.protobuf.Struct data_context = 5; + + // The timestamp when execution starts (in seconds since epoch) + double execution_start_time = 6; + + // The timestamp when execution ends (in seconds since epoch) + double execution_end_time = 7; + + // The state of the dataset + DatasetState state = 8; + + // List of metric panels to show for operators + // When showing these panels, it is expected to filter the metrics by operator ID. + repeated DashboardPanelMetadata operator_panels = 9; } diff --git a/src/ray/protobuf/export_dataset_operator_event.proto b/src/ray/protobuf/export_dataset_operator_event.proto new file mode 100644 index 000000000000..0c82133346b2 --- /dev/null +++ b/src/ray/protobuf/export_dataset_operator_event.proto @@ -0,0 +1,46 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +option cc_enable_arenas = true; +package ray.rpc; + +// This message defines the event_data stored by the export API for +// EXPORT_DATASET_OPERATOR type events from Ray Data operators. +message ExportDatasetOperatorEventData { + enum DatasetOperatorEventType { + UNSPECIFIED = 0; + ISSUE_DETECTION_HANGING = 1; + ISSUE_DETECTION_HIGH_MEMORY = 2; + } + + // The dataset ID + string dataset_id = 1; + + // The operator ID + string operator_id = 2; + + // The operator name + string operator_name = 3; + + // The timestamp when event is emitted (in seconds since epoch) + double event_time = 4; + + // The type of the event + DatasetOperatorEventType event_type = 5; + + // The content of the event message + string message = 6; +} diff --git a/src/ray/protobuf/export_event.proto b/src/ray/protobuf/export_event.proto index 5c0c56fc4dc0..fdc9281915f0 100644 --- a/src/ray/protobuf/export_event.proto +++ b/src/ray/protobuf/export_event.proto @@ -23,6 +23,7 @@ import "src/ray/protobuf/export_driver_job_event.proto"; import "src/ray/protobuf/export_submission_job_event.proto"; import "src/ray/protobuf/export_train_state.proto"; +import "src/ray/protobuf/export_dataset_operator_event.proto"; import "src/ray/protobuf/export_dataset_metadata.proto"; // ExportEvent defines events stored by the export API. 
This @@ -37,6 +38,7 @@ message ExportEvent { EXPORT_TRAIN_RUN = 5; EXPORT_TRAIN_RUN_ATTEMPT = 6; EXPORT_DATASET_METADATA = 7; + EXPORT_DATASET_OPERATOR_EVENT = 8; } // event_id is the unique ID of this event @@ -56,5 +58,6 @@ message ExportEvent { ExportTrainRunEventData train_run_event_data = 9; ExportTrainRunAttemptEventData train_run_attempt_event_data = 10; ExportDatasetMetadata dataset_metadata = 11; + ExportDatasetOperatorEventData dataset_operator_event_data = 12; } } diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index 97a8e1ae1af8..9d350bca72e8 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -440,7 +440,7 @@ message WorkerTableData { // Fields to publish when worker fails. message WorkerDeltaData { - bytes raylet_id = 1; + bytes node_id = 1; bytes worker_id = 2; } @@ -642,8 +642,6 @@ message PlacementGroupTableData { // The placement group's stats / information such as when it is created or // what's the current scheduling state. PlacementGroupStats stats = 12; - // The maximum fraction of CPU cores that this placement group can use on each node. - double max_cpu_fraction_per_node = 13; // Binary ID of the target node where bundles should be placed // iff the target node has enough available resources and alive. // Otherwise, the bundles can be placed elsewhere. diff --git a/src/ray/protobuf/gcs_service.proto b/src/ray/protobuf/gcs_service.proto index d0c69395ed4c..3e25c48da3c3 100644 --- a/src/ray/protobuf/gcs_service.proto +++ b/src/ray/protobuf/gcs_service.proto @@ -680,12 +680,6 @@ message GcsSubscriberCommandBatchReply { GcsStatus status = 100; } -message GcsUnregisterSubscriberRequest { - bytes subscriber_id = 1; -} - -message GcsUnregisterSubscriberReply {} - /// This supports subscribing updates from GCS with long poll, and registering / /// de-registering subscribers. 
service InternalPubSubGcsService { @@ -699,10 +693,6 @@ service InternalPubSubGcsService { /// A batch of subscribe / unsubscribe requests sent by the subscriber. rpc GcsSubscriberCommandBatch(GcsSubscriberCommandBatchRequest) returns (GcsSubscriberCommandBatchReply); - /// Unregister a subscriber from GCS, removing all subscriptions as well as the - /// subscriber itself. - rpc GcsUnregisterSubscriber(GcsUnregisterSubscriberRequest) - returns (GcsUnregisterSubscriberReply); } message GetAllResourceUsageRequest {} diff --git a/src/ray/protobuf/node_manager.proto b/src/ray/protobuf/node_manager.proto index 4a6da212b2dd..b8e2f9f1c0a9 100644 --- a/src/ray/protobuf/node_manager.proto +++ b/src/ray/protobuf/node_manager.proto @@ -19,13 +19,13 @@ package ray.rpc; import "src/ray/protobuf/common.proto"; import "src/ray/protobuf/gcs.proto"; import "src/ray/protobuf/autoscaler.proto"; -import "src/ray/protobuf/runtime_env_common.proto"; +import "src/ray/protobuf/public/runtime_environment.proto"; message WorkerBacklogReport { - // TaskSpec indicating the scheduling class. + // LeaseSpec indicating the scheduling class. // Cannot send scheduling class directly // since it's local to each process. - TaskSpec resource_spec = 1; + LeaseSpec lease_spec = 1; // Size of the backlog for the above scheduling class. int64 backlog_size = 2; } @@ -41,8 +41,8 @@ message ReportWorkerBacklogReply {} // Request a worker from the raylet with the specified resources. message RequestWorkerLeaseRequest { - // TaskSpec containing the requested resources. - TaskSpec resource_spec = 1; + // LeaseSpec containing the requested resources. + LeaseSpec lease_spec = 1; // Worker's backlog size for this spec's shape. 
int64 backlog_size = 2; // If it's true, either grant the lease if the task is @@ -132,12 +132,23 @@ message CancelResourceReserveRequest { message CancelResourceReserveReply {} +message ResizeLocalResourceInstancesRequest { + // Map of resource names to their desired total quantities + // For example: {"CPU": 4, "memory": 1000000} + map resources = 1; +} + +message ResizeLocalResourceInstancesReply { + // Current total resources after the resize operation + map total_resources = 1; +} + // Release a worker back to its raylet. -message ReturnWorkerRequest { +message ReturnWorkerLeaseRequest { // Port of the leased worker that we are now returning. int32 worker_port = 1; - // Unique id of the leased worker we are now returning. - bytes worker_id = 2; + // The lease id of the lease we are now returning. + bytes lease_id = 2; // If true, there was some unrecoverable error and the raylet should // disconnect the worker. bool disconnect_worker = 3; @@ -147,7 +158,7 @@ message ReturnWorkerRequest { string disconnect_worker_error_detail = 5; } -message ReturnWorkerReply {} +message ReturnWorkerLeaseReply {} message ReleaseUnusedActorWorkersRequest { repeated bytes worker_ids_in_use = 1; @@ -163,8 +174,8 @@ message ShutdownRayletRequest { message ShutdownRayletReply {} message CancelWorkerLeaseRequest { - // The task to cancel. - bytes task_id = 1; + // The lease to cancel. + bytes lease_id = 1; } message CancelWorkerLeaseReply { @@ -297,7 +308,7 @@ message GetResourceLoadReply { ResourcesData resources = 1; } -message CancelTasksWithResourceShapesRequest { +message CancelLeasesWithResourceShapesRequest { message ResourceShape { // A map from resource name to the quantity of that resource. This map represents // the resource request shape of a task. 
@@ -307,7 +318,7 @@ message CancelTasksWithResourceShapesRequest { repeated ResourceShape resource_shapes = 1; } -message CancelTasksWithResourceShapesReply { +message CancelLeasesWithResourceShapesReply { // Empty } @@ -315,11 +326,11 @@ message NotifyGCSRestartRequest {} message NotifyGCSRestartReply {} -message GetTaskFailureCauseRequest { - bytes task_id = 1; +message GetWorkerFailureCauseRequest { + bytes lease_id = 1; } -message GetTaskFailureCauseReply { +message GetWorkerFailureCauseReply { optional RayErrorInfo failure_cause = 1; bool fail_task_immediately = 2; } @@ -403,8 +414,8 @@ service NodeManagerService { // request. // Failure: This doesn't explicitly retry, only logs on failure, but autoscaler will // keep calling this so it will be retried at a layer above. - rpc CancelTasksWithResourceShapes(CancelTasksWithResourceShapesRequest) - returns (CancelTasksWithResourceShapesReply); + rpc CancelLeasesWithResourceShapes(CancelLeasesWithResourceShapesRequest) + returns (CancelLeasesWithResourceShapesReply); // Request a worker from the raylet. // Failure: Does retry if request to remote raylet fails. Just logs warning if request // to local raylet fails. @@ -417,9 +428,9 @@ service NodeManagerService { // Failure: Doesn't need to be retried since it will keep getting periodically called, // and is not critical. rpc ReportWorkerBacklog(ReportWorkerBacklogRequest) returns (ReportWorkerBacklogReply); - // Release a worker back to its raylet. - // Failure: TODO: Failure behavior needs to be fixed. - rpc ReturnWorker(ReturnWorkerRequest) returns (ReturnWorkerReply); + // Return a worker lease back to its raylet. + // Failure: Retries, it's idempotent. + rpc ReturnWorkerLease(ReturnWorkerLeaseRequest) returns (ReturnWorkerLeaseReply); // This method is only used by GCS, and the purpose is to release leased workers // that may be leaked. When GCS restarts, it doesn't know which workers it has leased // in the previous lifecycle. 
In this case, GCS will send a list of worker ids that @@ -449,6 +460,16 @@ service NodeManagerService { // Failure: Has retry behavior, could be improved to just use retriable grpc client. rpc CancelResourceReserve(CancelResourceReserveRequest) returns (CancelResourceReserveReply); + // Adjust the total number of local resource instances on the raylet to match the + // specified values. + // Success: Returns the updated total resources for the node. If downsizing would make + // available resources negative, the raylet clamps the reduction so that available + // becomes zero. + // Failure: Returns INVALID_ARGUMENT if the request attempts to resize a unit instance + // resource (e.g., GPU), as these cannot be resized by this API. In the case of + // network errors, the caller should retry the request. + rpc ResizeLocalResourceInstances(ResizeLocalResourceInstancesRequest) + returns (ResizeLocalResourceInstancesReply); // Cancel a pending lease request. This only returns success if the // lease request was not yet granted. // Failure: TODO: This needs to handle network failure @@ -479,10 +500,11 @@ service NodeManagerService { // [State API] Get the all object information of the node. // Failure: State API user can retry. rpc GetObjectsInfo(GetObjectsInfoRequest) returns (GetObjectsInfoReply); - // Gets the task execution result. May contain a result if - // the task completed in error. + // Gets the worker failure cause. May contain a result if + // the worker executing the task failed. // Failure: Gives user error message on failure. - rpc GetTaskFailureCause(GetTaskFailureCauseRequest) returns (GetTaskFailureCauseReply); + rpc GetWorkerFailureCause(GetWorkerFailureCauseRequest) + returns (GetWorkerFailureCauseReply); // Failure: TODO: Handle network failure for cgraphs. 
rpc RegisterMutableObject(RegisterMutableObjectRequest) returns (RegisterMutableObjectReply); diff --git a/src/ray/protobuf/public/BUILD.bazel b/src/ray/protobuf/public/BUILD.bazel new file mode 100644 index 000000000000..25a484225caf --- /dev/null +++ b/src/ray/protobuf/public/BUILD.bazel @@ -0,0 +1,154 @@ +load("@rules_cc//cc:defs.bzl", "cc_proto_library") +load("@rules_proto//proto:defs.bzl", "proto_library") + +package(default_visibility = ["//visibility:public"]) + +proto_library( + name = "events_base_event_proto", + srcs = ["events_base_event.proto"], + deps = [ + ":events_actor_definition_event_proto", + ":events_actor_lifecycle_event_proto", + ":events_actor_task_definition_event_proto", + ":events_driver_job_definition_event_proto", + ":events_driver_job_execution_event_proto", + ":events_node_definition_event_proto", + ":events_node_lifecycle_event_proto", + ":events_task_definition_event_proto", + ":events_task_execution_event_proto", + "//src/ray/protobuf:events_task_profile_events_proto", + "@com_google_protobuf//:timestamp_proto", + ], +) + +cc_proto_library( + name = "events_base_event_cc_proto", + deps = [":events_base_event_proto"], +) + +proto_library( + name = "events_actor_task_definition_event_proto", + srcs = ["events_actor_task_definition_event.proto"], + deps = [ + ":runtime_environment_proto", + "//src/ray/protobuf:common_proto", + ], +) + +cc_proto_library( + name = "events_actor_task_definition_event_cc_proto", + deps = [":events_actor_task_definition_event_proto"], +) + +proto_library( + name = "events_task_definition_event_proto", + srcs = ["events_task_definition_event.proto"], + deps = [ + ":runtime_environment_proto", + "//src/ray/protobuf:common_proto", + ], +) + +cc_proto_library( + name = "events_task_definition_event_cc_proto", + deps = [":events_task_definition_event_proto"], +) + +proto_library( + name = "events_task_execution_event_proto", + srcs = ["events_task_execution_event.proto"], + deps = [ + 
"//src/ray/protobuf:common_proto", + "@com_google_protobuf//:timestamp_proto", + ], +) + +cc_proto_library( + name = "events_task_execution_event_cc_proto", + deps = [":events_task_execution_event_proto"], +) + +proto_library( + name = "events_driver_job_definition_event_proto", + srcs = ["events_driver_job_definition_event.proto"], + deps = [ + ":runtime_environment_proto", + "//src/ray/protobuf:common_proto", + "@com_google_protobuf//:timestamp_proto", + ], +) + +cc_proto_library( + name = "events_driver_job_definition_event_cc_proto", + deps = [":events_driver_job_definition_event_proto"], +) + +proto_library( + name = "events_driver_job_execution_event_proto", + srcs = ["events_driver_job_execution_event.proto"], + deps = [ + "@com_google_protobuf//:timestamp_proto", + ], +) + +cc_proto_library( + name = "events_driver_job_execution_event_cc_proto", + deps = [":events_driver_job_execution_event_proto"], +) + +proto_library( + name = "events_actor_definition_event_proto", + srcs = ["events_actor_definition_event.proto"], +) + +cc_proto_library( + name = "events_actor_definition_event_cc_proto", + deps = [":events_actor_definition_event_proto"], +) + +proto_library( + name = "events_actor_lifecycle_event_proto", + srcs = ["events_actor_lifecycle_event.proto"], + deps = [ + "//src/ray/protobuf:common_proto", + "@com_google_protobuf//:timestamp_proto", + ], +) + +proto_library( + name = "events_node_definition_event_proto", + srcs = ["events_node_definition_event.proto"], +) + +cc_proto_library( + name = "events_node_definition_event_cc_proto", + deps = [":events_node_definition_event_proto"], +) + +proto_library( + name = "events_node_lifecycle_event_proto", + srcs = ["events_node_lifecycle_event.proto"], + deps = [ + "@com_google_protobuf//:timestamp_proto", + ], +) + +cc_proto_library( + name = "events_actor_lifecycle_event_cc_proto", + deps = [":events_actor_lifecycle_event_proto"], +) + +cc_proto_library( + name = "events_node_lifecycle_event_cc_proto", + deps 
= [":events_node_lifecycle_event_proto"], +) + +proto_library( + name = "runtime_environment_proto", + srcs = ["runtime_environment.proto"], +) + +cc_proto_library( + name = "runtime_environment_cc_proto", + deps = [":runtime_environment_proto"], +) diff --git a/src/ray/protobuf/public/README b/src/ray/protobuf/public/README new file mode 100644 index 000000000000..9e8a687b9ce8 --- /dev/null +++ b/src/ray/protobuf/public/README @@ -0,0 +1,17 @@ +All proto files in this directory are part of public APIs. Therefore, please keep the +following guidelines in mind when modifying any of these files: + +Do NOT include private protos in these files. If you need to, either (i) obtain approval +from the core team to make the previously private proto public, or (ii) split the proto +into private and public parts, and move only the public part here. + +Do NOT delete existing fields in any proto messages. If renaming is necessary, add a new +field with the new name, and mark the old field as deprecated. + +For consumers of these proto files (end users): you can rely on field names continuing +to exist, ensuring that applications built on top of these protos do not break +unexpectedly. However, always design applications with the assumption that fields are +always optional, and handle missing or deprecated field contents gracefully. While a +field name may remain, its content could eventually be deprecated and moved to a new +field. This provides a path for us to deprecate emitting logic without breaking your +application. diff --git a/src/ray/protobuf/public/events_actor_definition_event.proto b/src/ray/protobuf/public/events_actor_definition_event.proto new file mode 100644 index 000000000000..63ce89045cca --- /dev/null +++ b/src/ray/protobuf/public/events_actor_definition_event.proto @@ -0,0 +1,43 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package ray.rpc.events; + +message ActorDefinitionEvent { + // The ID of the actor that was created. + bytes actor_id = 1; + // The ID of the job that created the actor. + bytes job_id = 2; + // Whether the actor is persistent. + bool is_detached = 3; + // Name of the actor. + string name = 4; + // The actor's namespace. Named `ray_namespace` to avoid conflicting with c++ keyword. + string ray_namespace = 5; + // Serialized runtime_env used to report in the dashboard snapshot. We need to populate + // it here instead of grabbing it from the task spec because the task spec is cleared + // for deleted actors: https://github.com/ray-project/ray/pull/11149. + string serialized_runtime_env = 6; + // The actor's class name. This is necessary because the task spec's lifetime + // is shorter than the ActorTableData. + string class_name = 7; + // Quantities of the different resources required by this actor. + map required_resources = 8; + // Placement group ID if the actor requires a placement group. + bytes placement_group_id = 9; + // The label selector for the actor. + map label_selector = 11; +} diff --git a/src/ray/protobuf/public/events_actor_lifecycle_event.proto b/src/ray/protobuf/public/events_actor_lifecycle_event.proto new file mode 100644 index 000000000000..debdf844bcf3 --- /dev/null +++ b/src/ray/protobuf/public/events_actor_lifecycle_event.proto @@ -0,0 +1,54 @@ +// Copyright 2025 The Ray Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package ray.rpc.events; + +import "src/ray/protobuf/common.proto"; +import "google/protobuf/timestamp.proto"; + +message ActorLifecycleEvent { + enum State{ + // Actor info is registered in GCS. But its dependencies are not ready. + DEPENDENCIES_UNREADY = 0; + // Actor local dependencies are ready. This actor is being created. + PENDING_CREATION = 1; + // Actor is alive. + ALIVE = 2; + // Actor is dead, now being restarted. + // After reconstruction finishes, the state will become alive again. + RESTARTING = 3; + // Actor is already dead and won't be restarted. + DEAD = 4; + } + + message StateTransition { + State state = 1; + google.protobuf.Timestamp timestamp = 2; + // The node id of the actor once it is created. + // Available when state is ALIVE; updated when the actor is restarted. + bytes node_id = 3; + // The worker id of the worker on which this actor is running. Available when state is ALIVE. + // The worker id can change when the actor is restarted. + bytes worker_id = 4; + // Contains metadata about why the actor is dead. Available when state is DEAD. + ActorDeathCause death_cause = 6; + } + + // The ID of the actor that was created. + bytes actor_id = 1; + // The state transitions of this actor. 
+ repeated StateTransition state_transitions = 2; +} diff --git a/src/ray/protobuf/events_actor_task_definition_event.proto b/src/ray/protobuf/public/events_actor_task_definition_event.proto similarity index 74% rename from src/ray/protobuf/events_actor_task_definition_event.proto rename to src/ray/protobuf/public/events_actor_task_definition_event.proto index 4aa26e561fd9..12cda5260379 100644 --- a/src/ray/protobuf/events_actor_task_definition_event.proto +++ b/src/ray/protobuf/public/events_actor_task_definition_event.proto @@ -14,7 +14,7 @@ syntax = "proto3"; -import "src/ray/protobuf/runtime_env_common.proto"; +import "src/ray/protobuf/public/runtime_environment.proto"; import "src/ray/protobuf/common.proto"; package ray.rpc.events; @@ -27,15 +27,17 @@ message ActorTaskDefinitionEvent { int32 task_attempt = 2; // The actor task definition information. - FunctionDescriptor actor_func = 3; - map required_resources = 5; - RuntimeEnvInfo runtime_env_info = 6; + Language language = 3; + FunctionDescriptor actor_func = 4; + string actor_task_name = 5; + map required_resources = 6; + RuntimeEnvInfo runtime_env_info = 7; // The correlation ids of the task that can be used to correlate the task with // other events. 
- bytes job_id = 7; - bytes actor_id = 8; - bytes parent_task_id = 9; - bytes placement_group_id = 10; - map ref_ids = 11; + bytes job_id = 8; + bytes actor_id = 9; + bytes parent_task_id = 10; + bytes placement_group_id = 11; + map ref_ids = 12; } diff --git a/src/ray/protobuf/events_base_event.proto b/src/ray/protobuf/public/events_base_event.proto similarity index 66% rename from src/ray/protobuf/events_base_event.proto rename to src/ray/protobuf/public/events_base_event.proto index aac704ef6e25..5adbf9757f62 100644 --- a/src/ray/protobuf/events_base_event.proto +++ b/src/ray/protobuf/public/events_base_event.proto @@ -17,11 +17,16 @@ syntax = "proto3"; package ray.rpc.events; import "google/protobuf/timestamp.proto"; -import "src/ray/protobuf/events_actor_task_definition_event.proto"; -import "src/ray/protobuf/events_actor_task_execution_event.proto"; -import "src/ray/protobuf/events_task_definition_event.proto"; -import "src/ray/protobuf/events_task_execution_event.proto"; import "src/ray/protobuf/events_task_profile_events.proto"; +import "src/ray/protobuf/public/events_actor_task_definition_event.proto"; +import "src/ray/protobuf/public/events_task_definition_event.proto"; +import "src/ray/protobuf/public/events_task_execution_event.proto"; +import "src/ray/protobuf/public/events_driver_job_definition_event.proto"; +import "src/ray/protobuf/public/events_driver_job_execution_event.proto"; +import "src/ray/protobuf/public/events_actor_definition_event.proto"; +import "src/ray/protobuf/public/events_actor_lifecycle_event.proto"; +import "src/ray/protobuf/public/events_node_definition_event.proto"; +import "src/ray/protobuf/public/events_node_lifecycle_event.proto"; // This is the base message for all ray events. 
message RayEvent { @@ -46,8 +51,13 @@ message RayEvent { TASK_DEFINITION_EVENT = 1; TASK_EXECUTION_EVENT = 2; ACTOR_TASK_DEFINITION_EVENT = 3; - ACTOR_TASK_EXECUTION_EVENT = 4; - TASK_PROFILE_EVENT = 5; + TASK_PROFILE_EVENT = 4; + DRIVER_JOB_DEFINITION_EVENT = 5; + DRIVER_JOB_EXECUTION_EVENT = 6; + NODE_DEFINITION_EVENT = 7; + NODE_LIFECYCLE_EVENT = 8; + ACTOR_DEFINITION_EVENT = 9; + ACTOR_LIFECYCLE_EVENT = 10; } // The severities of events that can be generated. @@ -78,12 +88,19 @@ message RayEvent { Severity severity = 5; // A string message associated with the event. string message = 6; + // The current Ray session name. + string session_name = 7; // Nested event messages containing the specific fields for each event type. // One of the following fields is expected to be set for each RayEvent message. - TaskDefinitionEvent task_definition_event = 7; - TaskExecutionEvent task_execution_event = 8; - ActorTaskDefinitionEvent actor_task_definition_event = 9; - ActorTaskExecutionEvent actor_task_execution_event = 10; + TaskDefinitionEvent task_definition_event = 8; + TaskExecutionEvent task_execution_event = 9; + ActorTaskDefinitionEvent actor_task_definition_event = 10; TaskProfileEvents task_profile_events = 11; + DriverJobDefinitionEvent driver_job_definition_event = 12; + DriverJobExecutionEvent driver_job_execution_event = 13; + NodeDefinitionEvent node_definition_event = 14; + NodeLifecycleEvent node_lifecycle_event = 15; + ActorDefinitionEvent actor_definition_event = 16; + ActorLifecycleEvent actor_lifecycle_event = 17; } diff --git a/src/ray/protobuf/public/events_driver_job_definition_event.proto b/src/ray/protobuf/public/events_driver_job_definition_event.proto new file mode 100644 index 000000000000..a9f17714c7a1 --- /dev/null +++ b/src/ray/protobuf/public/events_driver_job_definition_event.proto @@ -0,0 +1,38 @@ +// Copyright 2025 The Ray Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +syntax = "proto3"; + +import "google/protobuf/timestamp.proto"; +import "src/ray/protobuf/public/runtime_environment.proto"; + +package ray.rpc.events; + +// Message containing the definition information of a driver job. +// The message is expected to be emitted once per job creation. +// +// For runtime information associated with this event, see DriverJobExecutionEvent. +message DriverJobDefinitionEvent { + message Config { + RuntimeEnvInfo runtime_env_info = 1; + map metadata = 2; + } + + bytes job_id = 1; + int64 driver_pid = 3; + bytes driver_node_id = 4; + string entrypoint = 5; + Config config = 6; +} diff --git a/src/ray/protobuf/public/events_driver_job_execution_event.proto b/src/ray/protobuf/public/events_driver_job_execution_event.proto new file mode 100644 index 000000000000..4c9dd611140c --- /dev/null +++ b/src/ray/protobuf/public/events_driver_job_execution_event.proto @@ -0,0 +1,40 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +syntax = "proto3"; + +import "google/protobuf/timestamp.proto"; + +package ray.rpc.events; + +// Message containing the execution information of a driver job. It can be used to +// capture the full state transition history. +// +// For static information associated with this event, see DriverJobDefinitionEvent. +message DriverJobExecutionEvent { + enum State { + UNSPECIFIED = 0; + CREATED = 1; + FINISHED = 2; + } + + message StateTimestamp { + State state = 1; + google.protobuf.Timestamp timestamp = 2; + } + + bytes job_id = 1; + repeated StateTimestamp states = 2; +} diff --git a/src/ray/util/timestamp_utils.h b/src/ray/protobuf/public/events_node_definition_event.proto similarity index 61% rename from src/ray/util/timestamp_utils.h rename to src/ray/protobuf/public/events_node_definition_event.proto index 69d034cb9ceb..3d41a5ab27ed 100644 --- a/src/ray/util/timestamp_utils.h +++ b/src/ray/protobuf/public/events_node_definition_event.proto @@ -12,16 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once +syntax = "proto3"; -#include +package ray.rpc; -namespace ray { - -inline int64_t current_sys_time_s() { - std::chrono::seconds s_since_epoch = std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()); - return s_since_epoch.count(); +// Message containing the definition of a node, as observed via GCS. +// The message is expected to be emitted once per node creation. +// +// For runtime information associated with this event, see NodeLifecycleEvent. 
+message NodeDefinitionEvent { + bytes node_id = 1; + string node_ip_address = 2; + map labels = 3; } - -} // namespace ray diff --git a/src/ray/protobuf/public/events_node_lifecycle_event.proto b/src/ray/protobuf/public/events_node_lifecycle_event.proto new file mode 100644 index 000000000000..cd72ea6b5a97 --- /dev/null +++ b/src/ray/protobuf/public/events_node_lifecycle_event.proto @@ -0,0 +1,56 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +import "google/protobuf/timestamp.proto"; + +package ray.rpc; + +// Message containing the lifecycle information of a node, as observed via GCS. +// It can be used to capture the full state transition history. +// +// For static information associated with this event, see NodeDefinitionEvent. +message NodeLifecycleEvent { + enum State { + ALIVE = 0; + DEAD = 1; + } + + message DeathInfo { + enum Reason { + UNSPECIFIED = 0; + EXPECTED_TERMINATION = 1; + UNEXPECTED_TERMINATION = 2; + AUTOSCALER_DRAIN_PREEMPTED = 3; + AUTOSCALER_DRAIN_IDLE = 4; + } + Reason reason = 1; + string reason_message = 2; + } + + message StateTransition { + State state = 1; + google.protobuf.Timestamp timestamp = 2; + map resources = 3; // Resources (cpu, gpu, etc.) and their counts, + // available only in the ALIVE state. + DeathInfo death_info = 4; // Available only in the DEAD state + } + + bytes node_id = 1; + // This records the state transitions within each export interval. 
The consumer should + // concatenate these intervals over the node’s lifetime to reconstruct the complete + // state transition time series. + repeated StateTransition state_transitions = 2; +} diff --git a/src/ray/protobuf/events_task_definition_event.proto b/src/ray/protobuf/public/events_task_definition_event.proto similarity index 71% rename from src/ray/protobuf/events_task_definition_event.proto rename to src/ray/protobuf/public/events_task_definition_event.proto index c47b2b0503de..7ed83ae87938 100644 --- a/src/ray/protobuf/events_task_definition_event.proto +++ b/src/ray/protobuf/public/events_task_definition_event.proto @@ -14,7 +14,7 @@ syntax = "proto3"; -import "src/ray/protobuf/runtime_env_common.proto"; +import "src/ray/protobuf/public/runtime_environment.proto"; import "src/ray/protobuf/common.proto"; package ray.rpc.events; @@ -27,15 +27,18 @@ message TaskDefinitionEvent { int32 task_attempt = 2; // The task definition information. - FunctionDescriptor task_func = 3; - string task_name = 4; - map required_resources = 5; - RuntimeEnvInfo runtime_env_info = 6; + // Valid values are NORMAL_TASK, ACTOR_CREATION_TASK, DRIVER_TASK + TaskType task_type = 3; + Language language = 4; + FunctionDescriptor task_func = 5; + string task_name = 6; + map required_resources = 7; + RuntimeEnvInfo runtime_env_info = 8; // The correlation ids of the task that can be used to correlate the task with // other events. 
- bytes job_id = 7; - bytes parent_task_id = 8; - bytes placement_group_id = 9; - map ref_ids = 10; + bytes job_id = 9; + bytes parent_task_id = 10; + bytes placement_group_id = 11; + map ref_ids = 12; } diff --git a/src/ray/protobuf/events_task_execution_event.proto b/src/ray/protobuf/public/events_task_execution_event.proto similarity index 87% rename from src/ray/protobuf/events_task_execution_event.proto rename to src/ray/protobuf/public/events_task_execution_event.proto index 7418f9354064..51724605fb7b 100644 --- a/src/ray/protobuf/events_task_execution_event.proto +++ b/src/ray/protobuf/public/events_task_execution_event.proto @@ -28,15 +28,16 @@ message TaskExecutionEvent { // The task execution information + // The map of task state to the time when the state was last updated. // Key is the integer value of TaskStatus enum (protobuf doesn't support Enum as key). // Value is the timestamp when status changes to the target status indicated by the key. map task_state = 3; - UserErrorInfo user_error_info = 4; - RayErrorInfo ray_error_info = 5; + RayErrorInfo ray_error_info = 4; // The correlation ids of the task that can be used to correlate the task with // other events. - bytes node_id = 6; - bytes worker_id = 7; - int32 worker_pid = 8; + bytes node_id = 5; + bytes worker_id = 6; + int32 worker_pid = 7; + bytes job_id = 8; } diff --git a/src/ray/protobuf/public/runtime_environment.proto b/src/ray/protobuf/public/runtime_environment.proto new file mode 100644 index 000000000000..f707d888e5fd --- /dev/null +++ b/src/ray/protobuf/public/runtime_environment.proto @@ -0,0 +1,47 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package ray.rpc; + +option java_package = "io.ray.runtime.generated"; + +message RuntimeEnvUris { + /// working dir uri + string working_dir_uri = 1; + /// python modules uris + repeated string py_modules_uris = 2; +} + +/// The runtime env config, include some fields that do not +/// participate in the calculation of the runtime_env hash. +message RuntimeEnvConfig { + /// The timeout of runtime env creation. + int32 setup_timeout_seconds = 1; + /// Indicates whether to install runtime env eagerly before the workers are leased. + bool eager_install = 2; + /// A list of files to stream the runtime env setup logs to. + repeated string log_files = 3; +} + +/// The runtime env information which is transferred between ray core processes. +message RuntimeEnvInfo { + /// The serialized runtime env passed from the user. + string serialized_runtime_env = 1; + /// URIs used in this runtime env. These will be used for reference counting. + RuntimeEnvUris uris = 2; + /// The serialized runtime env config passed from the user. + RuntimeEnvConfig runtime_env_config = 3; +} diff --git a/src/ray/protobuf/pubsub.proto b/src/ray/protobuf/pubsub.proto index 89d3ab55a76e..eba420d2097e 100644 --- a/src/ray/protobuf/pubsub.proto +++ b/src/ray/protobuf/pubsub.proto @@ -24,7 +24,6 @@ import "src/ray/protobuf/logging.proto"; /// For example, for pubsub channels that are used by core workers, /// they have the prefix WORKER_. enum ChannelType { - reserved 9; /// A channel for object eviction. 
WORKER_OBJECT_EVICTION = 0; /// A channel for ref removed. @@ -44,7 +43,7 @@ enum ChannelType { /// A channel for logs from various Ray components. RAY_LOG_CHANNEL = 8; /// A channel for reporting node resource usage stats. - RAY_NODE_RESOURCE_USAGE_CHANNEL = 10; + RAY_NODE_RESOURCE_USAGE_CHANNEL = 9; } /// @@ -52,7 +51,6 @@ enum ChannelType { /// message PubMessage { - reserved 10, 14; /// Channel type for this publish message. ChannelType channel_type = 1; /// The key id (e.g., object id) in bytes. @@ -62,19 +60,17 @@ message PubMessage { WorkerObjectEvictionMessage worker_object_eviction_message = 3; WorkerRefRemovedMessage worker_ref_removed_message = 4; WorkerObjectLocationsPubMessage worker_object_locations_message = 5; + FailureMessage failure_message = 6; ActorTableData actor_message = 7; JobTableData job_message = 8; GcsNodeInfo node_info_message = 9; - WorkerDeltaData worker_delta_message = 11; - ErrorTableData error_info_message = 12; - LogBatch log_batch_message = 13; - NodeResourceUsage node_resource_usage_message = 15; - - // The message that indicates the given key id is not available anymore. - FailureMessage failure_message = 6; + WorkerDeltaData worker_delta_message = 10; + ErrorTableData error_info_message = 11; + LogBatch log_batch_message = 12; + NodeResourceUsage node_resource_usage_message = 13; } /// A monotonically increasing sequence_id generated by the publisher. - int64 sequence_id = 16; + int64 sequence_id = 14; } message WorkerObjectEvictionMessage { @@ -117,8 +113,7 @@ message WorkerObjectLocationsPubMessage { } /// Indicating the subscriber needs to handle failure callback. 
-message FailureMessage { -} +message FailureMessage {} /// /// Subscribe @@ -141,8 +136,7 @@ message Command { } } -message UnsubscribeMessage { -} +message UnsubscribeMessage {} /// Each of subscribe command needs to include request body because in Ray's pubsub /// module, it doesn't subscribe the same data structure (like for Redis, @@ -225,8 +219,7 @@ message PubsubCommandBatchRequest { repeated Command commands = 2; } -message PubsubCommandBatchReply { -} +message PubsubCommandBatchReply {} service SubscriberService { /// The long polling request sent to the publisher for pubsub operations. diff --git a/src/ray/protobuf/ray_syncer.proto b/src/ray/protobuf/ray_syncer.proto index fe239695f129..36da8decc794 100644 --- a/src/ray/protobuf/ray_syncer.proto +++ b/src/ray/protobuf/ray_syncer.proto @@ -45,6 +45,8 @@ message ResourceViewSyncMessage { int64 draining_deadline_timestamp_ms = 6; // Why the node is not idle. repeated string node_activity = 7; + // The key-value labels of this node. + map labels = 8; } message RaySyncMessage { diff --git a/src/ray/protobuf/reporter.proto b/src/ray/protobuf/reporter.proto index 552b5a95a9c4..3397c3da1c60 100644 --- a/src/ray/protobuf/reporter.proto +++ b/src/ray/protobuf/reporter.proto @@ -83,6 +83,10 @@ message ReportOCMetricsRequest { message ReportOCMetricsReply {} +message HealthCheckRequest {} + +message HealthCheckReply {} + // Service for communicating with the reporter agent module on a remote node. service ReporterService { // Report OpenCensus metrics to the local metrics agent. 
@@ -91,6 +95,8 @@ service ReporterService { rpc CpuProfiling(CpuProfilingRequest) returns (CpuProfilingReply); rpc GpuProfiling(GpuProfilingRequest) returns (GpuProfilingReply); rpc MemoryProfiling(MemoryProfilingRequest) returns (MemoryProfilingReply); + // Health check to validate whether the service is running + rpc HealthCheck(HealthCheckRequest) returns (HealthCheckReply); } message StreamLogRequest { diff --git a/src/ray/protobuf/runtime_env_agent.proto b/src/ray/protobuf/runtime_env_agent.proto index 707b818d3279..161f844a0d77 100644 --- a/src/ray/protobuf/runtime_env_agent.proto +++ b/src/ray/protobuf/runtime_env_agent.proto @@ -17,6 +17,7 @@ syntax = "proto3"; package ray.rpc; import "src/ray/protobuf/runtime_env_common.proto"; +import "src/ray/protobuf/public/runtime_environment.proto"; enum AgentRpcStatus { // OK. diff --git a/src/ray/protobuf/runtime_env_common.proto b/src/ray/protobuf/runtime_env_common.proto index b11021ef6ab8..c7f01fd493b4 100644 --- a/src/ray/protobuf/runtime_env_common.proto +++ b/src/ray/protobuf/runtime_env_common.proto @@ -18,34 +18,6 @@ package ray.rpc; option java_package = "io.ray.runtime.generated"; -message RuntimeEnvUris { - /// working dir uri - string working_dir_uri = 1; - /// python modules uris - repeated string py_modules_uris = 2; -} - -/// The runtime env config, include some fields that do not -/// participate in the calculation of the runtime_env hash. -message RuntimeEnvConfig { - /// The timeout of runtime env creation. - int32 setup_timeout_seconds = 1; - /// Indicates whether to install runtime env eagerly before the workers are leased. - bool eager_install = 2; - /// A list of files to stream the runtime env setup logs to. - repeated string log_files = 3; -} - -/// The runtime env information which is transferred between ray core processes. -message RuntimeEnvInfo { - /// The serialized runtime env passed from the user. - string serialized_runtime_env = 1; - /// URIs used in this runtime env. 
These will be used for reference counting. - RuntimeEnvUris uris = 2; - /// The serialized runtime env config passed from the user. - RuntimeEnvConfig runtime_env_config = 3; -} - message RuntimeEnvState { /// The serialized runtime env. string runtime_env = 1; diff --git a/src/ray/protobuf/serve.proto b/src/ray/protobuf/serve.proto index d26aa1e9ad81..420bb7258b23 100644 --- a/src/ray/protobuf/serve.proto +++ b/src/ray/protobuf/serve.proto @@ -22,6 +22,14 @@ option java_outer_classname = "ServeProtos"; option java_multiple_files = true; +// Configuration options for Serve's autoscaling policy +message AutoscalingPolicy { + // Name of the policy function or the import path of the policy if user passed a string. + // Will be the concatenation of the policy module and the policy name if user passed a + // callable. + string name = 1; +} + // Configuration options for Serve's replica autoscaler. message AutoscalingConfig { // Minimal number of replicas, must be a non-negative integer. @@ -61,9 +69,8 @@ message AutoscalingConfig { // The cloudpickled policy definition. bytes _serialized_policy_def = 11; - // The import path of the policy if user passed a string. Will be the concatenation - // of the policy module and the policy name if user passed a callable. - string _policy = 12; + // The autoscaling policy definition. + AutoscalingPolicy policy = 12; // Target number of in flight requests per replica. This is the primary configuration // knob for replica autoscaler. Lower the number, the more rapidly the replicas diff --git a/src/ray/pubsub/BUILD.bazel b/src/ray/pubsub/BUILD.bazel index ba280ed8eb04..302f799ce1c3 100644 --- a/src/ray/pubsub/BUILD.bazel +++ b/src/ray/pubsub/BUILD.bazel @@ -1,5 +1,16 @@ load("//bazel:ray.bzl", "ray_cc_library") +ray_cc_library( + name = "publisher_interface", + hdrs = ["publisher_interface.h"], + deps = [ + "//src/ray/protobuf:pubsub_cc_grpc", + # NOTE(edoakes): we only seem to need `SendReplyCallback` from server_call.h. 
+ # We should move that definition to its own target. + "//src/ray/rpc:server_call", + ], +) + ray_cc_library( name = "publisher", srcs = ["publisher.cc"], @@ -8,6 +19,7 @@ ray_cc_library( "//src/ray/protobuf:pubsub_cc_grpc", # NOTE(edoakes): we only seem to need `SendReplyCallback` from server_call.h. # We should move that definition to its own target. + "//src/ray/pubsub:publisher_interface", "//src/ray/rpc:server_call", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -15,6 +27,18 @@ ray_cc_library( ], ) +ray_cc_library( + name = "subscriber_interface", + hdrs = ["subscriber_interface.h"], + deps = [ + "//src/ray/protobuf:common_cc_proto", + "//src/ray/protobuf:pubsub_cc_grpc", + # NOTE(edoakes): we only seem to need `ClientCallback` from client_call.h. + # We should move that definition to its own target. + "//src/ray/rpc:client_call", + ], +) + ray_cc_library( name = "subscriber", srcs = ["subscriber.cc"], @@ -24,8 +48,45 @@ ray_cc_library( # NOTE(edoakes): we only seem to need `ClientCallback` from client_call.h. # We should move that definition to its own target. 
"//src/ray/rpc:client_call", + "//src/ray/pubsub:subscriber_interface", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/synchronization", ], ) + +ray_cc_library( + name = "gcs_publisher", + srcs = ["gcs_publisher.cc"], + hdrs = ["gcs_publisher.h"], + deps = [ + ":publisher_interface", + "//src/ray/protobuf:gcs_cc_proto", + ], +) + +ray_cc_library( + name = "gcs_subscriber", + srcs = ["gcs_subscriber.cc"], + hdrs = ["gcs_subscriber.h"], + deps = [ + ":subscriber_interface", + "//src/ray/common:gcs_callbacks", + "//src/ray/protobuf:gcs_cc_proto", + ], +) + +ray_cc_library( + name = "python_gcs_subscriber", + srcs = ["python_gcs_subscriber.cc"], + hdrs = ["python_gcs_subscriber.h"], + deps = [ + "//src/ray/common:status", + "//src/ray/gcs_client:rpc_client", + "//src/ray/protobuf:gcs_service_cc_proto", + "//src/ray/protobuf:pubsub_cc_proto", + "//src/ray/util:visibility", + "@com_github_grpc_grpc//:grpc++", + "@com_google_absl//absl/synchronization", + ], +) diff --git a/src/ray/pubsub/README.md b/src/ray/pubsub/README.md index 4e8c6b477287..fd791fd1cddb 100644 --- a/src/ray/pubsub/README.md +++ b/src/ray/pubsub/README.md @@ -1,7 +1,7 @@ # Pubsub module -The doc is written on June 9th 2021. The implementation can be changed in any -time, and the documentation could be out of date. +This doc has last been updated on Aug 19, 2025. This doc should be updated +as the implementation changes. ## Motivation @@ -31,6 +31,9 @@ situation. - Publisher: A process that publishes messages to subscribers. - Subscriber: A process that subscribes channels from publishers. - Channel: Equivalent to topic in Kafka. +- Key/Entity: A specific item you care about in the channel. E.g. in + the actor channel, you only care about a specific actor id so that's + the key you subscribe to. Not all channels have keys you can subscribe by. - Command: Equivalent to Redis pubsub's command. E.g., Subscribe / Unsubscribe. 
## Features @@ -45,52 +48,113 @@ situation. subscribers. - Subscriber failure detection. The subscriber failure is tracked by publishers. -- The module is general and can be used in arbitrary two core ray components. +- The module is general and can be used in two arbitrary Ray components. ## Limitation -- If messages are published before it is subscribed from the publisher, they - are lost. -- It doesn't handle the fault tolerance by design because raylet -> core_worker - (the most common use case) doesn't require it. The fault tolerance needs to - be implemented in the higher layer. +If messages are published before a subscription, they're lost. ## Implementation -The pubsub module doesn't have a broker like traditional pubsub systems because -there's no use case. In the pubsub module, all publishers are also brokers. The -performance, especially a throughput is not a requirement when developed, and -the module is not designed for high throughput. +In this pubsub implementation, publishers directly send messages to subscribers. +There are no intermediary brokers. The performance, especially throughput +wasn't a requirement when developed, and therefore the module isn't designed +for high throughput. ### Basic mechanism -Between the publisher and subscriber, there's only 1 long-polling connection. -The long polling connection is initiated from the subscriber when there are -subscribing entries from the publisher. Whenever publisher publishes messages, -they are batched to the reply of the long polling request in FIFO order. +#### PubsubCommandBatch +A command is an operation from a subscriber to publisher. Subscribe and +Unsubscribe are the only commands. Commands are served by `PubsubCommandBatch`, +which batches them in the FIFO order. We limit it to one in-flight `PubsubCommandBatchRequest` +at a time to prevent out of order subscribes / unsubscribes. Because of this, +we have to queue up commands and therefore have to batch commands when sending them.
+ +#### PubsubLongPolling +Between the publisher and subscriber, there's only 1 long-polling connection +(only one in-flight request), no matter how many separate channels / keys the +subscriber is subscribed to. The subscriber will always have an in-flight +`PubsubLongPollingRequest` as long as it's subscribed to something. Whenever a +publisher publishes messages to that subscriber, they're batched to the reply +of the long polling request in FIFO order. + +### Pubsub Code Flow +Breakdown of the pubsub flow from the subscriber and publisher +Note that this section ignores fault tolerance. + +#### Subscriber Actions + +1. **On a Subscribe call** + - Sends a `PubsubCommandBatchRequest` with its own `subscriber_id` and a `SubMessage` + Command containing `channel_type` and optionally `key_id` + - Sends a `PubsubLongPollingRequest` with its own `subscriber_id` + +2. **Subscribe done** + - Receives `PubsubCommandBatchReply` and runs a callback if provided on subscribe + - Sends new commands to publisher if they've been queued up, e.g. another subscribe to + something else or an unsubscribe to something + - Only allows one in-flight `PubsubCommandBatchRequest` to ensure command ordering + +3. **Message Processing** + - Receives reply to `PubsubLongPollingRequest` and processes published messages + - Sends another `PubsubLongPollingRequest` if subscription still exists + +4. **Unsubscribe** + - Sends `PubsubCommandBatchRequest` with `UnsubscribeMessage` when unsubscribing + +#### Publisher Actions + +1. **Subscribe Handling** + - Receives `PubsubCommandBatchRequest` and creates a `SubscriberState` for the + subscriber if it doesn't exist + - Registers subscription for the given channel + key by setting up a relation between + an `EntityState` and a `SubscriberState` + - Note that the publisher maintains a `SubscriptionIndex` for each channel, and each + `SubscriptionIndex` holds `EntityState` objects for each key. 
Each `EntityState` + holds `SubscriberState` pointers to send / queue up messages to send. There's a + special `EntityState` in every `SubscriptionIndex` for "subscribing to all" + +2. **Initial Long Polling Request** + - Receives `PubsubLongPollingRequest` and creates `SubscriberState` if it doesn't exist. + Note that the `SubscriberState` might not exist because the initial `PubsubLongPollingRequest` + could arrive before the associated `PubsubCommandBatchRequest`. + - Creates a `LongPollConnection` in the `SubscriberState` to store the reply + reply callback + - Attempts to publish by replying to the request if mailbox already contains messages + - If mailbox is empty, waits until next relevant publish to reply and send the publish + +3. **Subsequent Long Polling** + - Receives a subsequent `PubsubLongPollingRequest` from the subscriber and checks mailbox + - Publishes messages if mailbox isn't empty, or waits for relevant publish to reply + +4. **Unsubscribe** + - Receives unsubscribe command and unregisters `SubscriberState` from the appropriate + `EntityState` + - Erases the `EntityState` if it no longer contains any `SubscriberState` pointers + - Periodically cleans up "Dead" `SubscriberState`'s -### Commands - -A command is an operation from a subscriber to publisher. For example, -Subscribe or Unsubscribe could be a command. Commands are served by a separate -RPC, which also batches them in the FIFO order. Subscriber keeps sending -commands until they are not queued. There's no backpressure mechanism here. ### Fault detection -Fault detection needed to be implemented in the component-agonistic manner, so -it doesn't use Ray's GCS for that. - -Subscriber detects the publisher failures from the long polling request. A -single long polling request is initiated from the subscriber, and it sends them -again and again whenever replied as long as there are subscribing entreis. 
If -the publisher fails, the long polling request is also failed, so that the -subscriber can detect the failures of publishers. All metadata is cleaned up in -this case. - -Publishers always have received long polling request from a subscriber as long -as there are subscribing entries from them. If subscribers are failed, they are -not sending any more long polling requests. Publishers refreshes the long -polling request every 30 seconds to check if the subscriber is still alive. If -the subscriber doesn't initiate a long polling request for more than certain -threshold, subscriber is condiered to be failed and all metadata is cleaned up. +Both pubsub RPC's will be retried by the client on transient network failures using the +retryable grpc client used by other RPC's throughout. + +TODO(dayshah): Only the GCS client currently retries the requests, the core worker clients will in the future. + +Subscribing and unsubscribing are idempotent so the `PubsubCommandBatchRequest` can be resent. +Since we restrict it to one in-flight request, the commands will be ordered even with retries. + +The subscriber's `PubsubLongPollingRequest` can also be retried since it comes with a +max_processed_sequence_id. The retry will be sent with the same max_processed_sequence_id +and therefore the publisher will send back all the messages from max_processed_sequence_id +to max_sequence_id in that subscriber's mailbox. Messages will not be removed from a subscriber's +mailbox until the subscriber sends a request with max_processed_sequence_id > sequence id of message. +Sequence id increments on every publish on a publisher, regardless of channel or entity. + +Publishers keep receiving long polling requests from a subscriber as long +as there are subscribing entries from them. If subscribers are "dead", they are +not sending any more long polling requests. Publishers check if there's been active +long polling requests every 30 seconds to check if the subscriber is still alive.
If +there's no activity on a LongPollingRequest for subscriber_timeout_ms (300s by default), +we'll flush the request (we'll reply) and wait to see if the subscriber sends another one. +If there hasn't been an active long polling request for over subscriber_timeout_ms, the +subscriber is considered dead and all metadata is cleaned up. diff --git a/src/ray/pubsub/gcs_publisher.cc b/src/ray/pubsub/gcs_publisher.cc new file mode 100644 index 000000000000..5cd79bdbe5ca --- /dev/null +++ b/src/ray/pubsub/gcs_publisher.cc @@ -0,0 +1,67 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/pubsub/gcs_publisher.h" + +#include +#include + +namespace ray { +namespace pubsub { + +void GcsPublisher::PublishActor(const ActorID &id, rpc::ActorTableData message) { + rpc::PubMessage msg; + msg.set_channel_type(rpc::ChannelType::GCS_ACTOR_CHANNEL); + msg.set_key_id(id.Binary()); + *msg.mutable_actor_message() = std::move(message); + publisher_->Publish(std::move(msg)); +} + +void GcsPublisher::PublishJob(const JobID &id, rpc::JobTableData message) { + rpc::PubMessage msg; + msg.set_channel_type(rpc::ChannelType::GCS_JOB_CHANNEL); + msg.set_key_id(id.Binary()); + *msg.mutable_job_message() = std::move(message); + publisher_->Publish(std::move(msg)); +} + +void GcsPublisher::PublishNodeInfo(const NodeID &id, rpc::GcsNodeInfo message) { + rpc::PubMessage msg; + msg.set_channel_type(rpc::ChannelType::GCS_NODE_INFO_CHANNEL); + msg.set_key_id(id.Binary()); + *msg.mutable_node_info_message() = std::move(message); + publisher_->Publish(std::move(msg)); +} + +void GcsPublisher::PublishWorkerFailure(const WorkerID &id, + rpc::WorkerDeltaData message) { + rpc::PubMessage msg; + msg.set_channel_type(rpc::ChannelType::GCS_WORKER_DELTA_CHANNEL); + msg.set_key_id(id.Binary()); + *msg.mutable_worker_delta_message() = std::move(message); + publisher_->Publish(std::move(msg)); +} + +void GcsPublisher::PublishError(std::string id, rpc::ErrorTableData message) { + rpc::PubMessage msg; + msg.set_channel_type(rpc::ChannelType::RAY_ERROR_INFO_CHANNEL); + msg.set_key_id(std::move(id)); + *msg.mutable_error_info_message() = std::move(message); + publisher_->Publish(std::move(msg)); +} + +std::string GcsPublisher::DebugString() const { return publisher_->DebugString(); } + +} // namespace pubsub +} // namespace ray diff --git a/src/ray/pubsub/gcs_publisher.h b/src/ray/pubsub/gcs_publisher.h new file mode 100644 index 000000000000..3f1eb8a4b2d5 --- /dev/null +++ b/src/ray/pubsub/gcs_publisher.h @@ -0,0 +1,72 @@ +// Copyright 2017 The Ray Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "ray/pubsub/publisher_interface.h" +#include "src/ray/protobuf/gcs.pb.h" + +namespace ray { +namespace pubsub { + +/// \class GcsPublisher +/// +/// Supports publishing per-entity data and errors from GCS. Thread safe. +class GcsPublisher { + public: + /// Initializes GcsPublisher with GCS based publishers. + /// Publish*() member functions below would be incrementally converted to use the GCS + /// based publisher, if available. + explicit GcsPublisher(std::unique_ptr publisher) + : publisher_(std::move(publisher)) { + RAY_CHECK(publisher_); + } + + /// Returns the underlying pubsub::Publisher. Caller does not take ownership. + pubsub::PublisherInterface &GetPublisher() const { return *publisher_; } + + /// Each publishing method below publishes to a different "channel". + /// ID is the entity which the message is associated with, e.g. ActorID for Actor data. + /// Subscribers receive typed messages for the ID that they subscribe to. + /// + /// The full stream of NodeResource and Error channels are needed by its subscribers. + /// But for other channels, subscribers should only need the latest data. + /// + /// TODO: Verify GCS pubsub satisfies the streaming semantics. + /// TODO: Implement optimization for channels where only latest data per ID is useful. 
+ + void PublishActor(const ActorID &id, rpc::ActorTableData message); + + void PublishJob(const JobID &id, rpc::JobTableData message); + + void PublishNodeInfo(const NodeID &id, rpc::GcsNodeInfo message); + + /// Actually rpc::WorkerDeltaData is not a delta message. + void PublishWorkerFailure(const WorkerID &id, rpc::WorkerDeltaData message); + + void PublishError(std::string id, rpc::ErrorTableData message); + + /// Prints debugging info for the publisher. + std::string DebugString() const; + + private: + const std::unique_ptr publisher_; +}; + +} // namespace pubsub +} // namespace ray diff --git a/src/ray/pubsub/gcs_subscriber.cc b/src/ray/pubsub/gcs_subscriber.cc new file mode 100644 index 000000000000..f02e5e66e9ea --- /dev/null +++ b/src/ray/pubsub/gcs_subscriber.cc @@ -0,0 +1,144 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/pubsub/gcs_subscriber.h" + +#include +#include +#include + +namespace ray { +namespace pubsub { + +Status GcsSubscriber::SubscribeAllJobs( + const gcs::SubscribeCallback &subscribe, + const gcs::StatusCallback &done) { + auto subscribe_item_callback = [subscribe](rpc::PubMessage &&msg) { + RAY_CHECK(msg.channel_type() == rpc::ChannelType::GCS_JOB_CHANNEL); + const JobID id = JobID::FromBinary(msg.key_id()); + subscribe(id, std::move(*msg.mutable_job_message())); + }; + auto subscription_failure_callback = [](const std::string &, const Status &status) { + RAY_LOG(WARNING) << "Subscription to Job channel failed: " << status.ToString(); + }; + subscriber_->Subscribe( + std::make_unique(), + rpc::ChannelType::GCS_JOB_CHANNEL, + gcs_address_, + /*key_id=*/std::nullopt, + [done](const Status &status) { + if (done != nullptr) { + done(status); + } + }, + std::move(subscribe_item_callback), + std::move(subscription_failure_callback)); + return Status::OK(); +} + +Status GcsSubscriber::SubscribeActor( + const ActorID &id, + const gcs::SubscribeCallback &subscribe, + const gcs::StatusCallback &done) { + auto subscription_callback = [id, subscribe](rpc::PubMessage &&msg) { + RAY_CHECK(msg.channel_type() == rpc::ChannelType::GCS_ACTOR_CHANNEL); + RAY_CHECK(msg.key_id() == id.Binary()); + subscribe(id, std::move(*msg.mutable_actor_message())); + }; + auto subscription_failure_callback = [id](const std::string &failed_id, + const Status &status) { + RAY_CHECK(failed_id == id.Binary()); + RAY_LOG(WARNING) << "Subscription to Actor " << id.Hex() + << " failed: " << status.ToString(); + }; + subscriber_->Subscribe( + std::make_unique(), + rpc::ChannelType::GCS_ACTOR_CHANNEL, + gcs_address_, + /*key_id=*/id.Binary(), + [done](const Status &status) { + if (done != nullptr) { + done(status); + } + }, + std::move(subscription_callback), + std::move(subscription_failure_callback)); + return Status::OK(); +} + +Status GcsSubscriber::UnsubscribeActor(const ActorID &id) { 
+ subscriber_->Unsubscribe( + rpc::ChannelType::GCS_ACTOR_CHANNEL, gcs_address_, id.Binary()); + return Status::OK(); +} + +bool GcsSubscriber::IsActorUnsubscribed(const ActorID &id) { + return !subscriber_->IsSubscribed( + rpc::ChannelType::GCS_ACTOR_CHANNEL, gcs_address_, id.Binary()); +} + +void GcsSubscriber::SubscribeAllNodeInfo( + const gcs::ItemCallback &subscribe, + const gcs::StatusCallback &done) { + auto subscribe_item_callback = [subscribe](rpc::PubMessage &&msg) { + RAY_CHECK(msg.channel_type() == rpc::ChannelType::GCS_NODE_INFO_CHANNEL); + subscribe(std::move(*msg.mutable_node_info_message())); + }; + auto subscription_failure_callback = [](const std::string &, const Status &status) { + RAY_LOG(WARNING) << "Subscription to NodeInfo channel failed: " << status.ToString(); + }; + subscriber_->Subscribe( + std::make_unique(), + rpc::ChannelType::GCS_NODE_INFO_CHANNEL, + gcs_address_, + /*key_id=*/std::nullopt, + [done](const Status &status) { + if (done != nullptr) { + done(status); + } + }, + std::move(subscribe_item_callback), + std::move(subscription_failure_callback)); +} + +Status GcsSubscriber::SubscribeAllWorkerFailures( + const gcs::ItemCallback &subscribe, + const gcs::StatusCallback &done) { + auto subscribe_item_callback = [subscribe](rpc::PubMessage &&msg) { + RAY_CHECK(msg.channel_type() == rpc::ChannelType::GCS_WORKER_DELTA_CHANNEL); + subscribe(std::move(*msg.mutable_worker_delta_message())); + }; + auto subscription_failure_callback = [](const std::string &, const Status &status) { + RAY_LOG(WARNING) << "Subscription to WorkerDelta channel failed: " + << status.ToString(); + }; + // Ignore if the subscription already exists, because the resubscription is intentional. 
+ subscriber_->Subscribe( + std::make_unique(), + rpc::ChannelType::GCS_WORKER_DELTA_CHANNEL, + gcs_address_, + /*key_id=*/std::nullopt, + /*subscribe_done_callback=*/ + [done](const Status &status) { + if (done != nullptr) { + done(status); + } + }, + std::move(subscribe_item_callback), + std::move(subscription_failure_callback)); + return Status::OK(); +} + +} // namespace pubsub +} // namespace ray diff --git a/src/ray/pubsub/gcs_subscriber.h b/src/ray/pubsub/gcs_subscriber.h new file mode 100644 index 000000000000..31dd9a733bd0 --- /dev/null +++ b/src/ray/pubsub/gcs_subscriber.h @@ -0,0 +1,73 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "ray/common/gcs_callbacks.h" +#include "ray/pubsub/subscriber_interface.h" +#include "src/ray/protobuf/gcs.pb.h" + +namespace ray { +namespace pubsub { + +/// \class GcsSubscriber +/// +/// Supports subscribing to an entity or a channel from GCS. Thread safe. +class GcsSubscriber { + public: + /// Initializes GcsSubscriber with GCS based GcsSubscribers. + // TODO(mwtian): Support restarted GCS publisher, at the same or a different address. + GcsSubscriber(rpc::Address gcs_address, + std::unique_ptr subscriber) + : gcs_address_(std::move(gcs_address)), subscriber_(std::move(subscriber)) {} + + /// Subscribe*() member functions below would be incrementally converted to use the GCS + /// based subscriber, if available. 
+ /// The `subscribe` callbacks must not be empty. The `done` callbacks can optionally be + /// empty. + + /// Uses GCS pubsub when created with `subscriber`. + Status SubscribeActor( + const ActorID &id, + const gcs::SubscribeCallback &subscribe, + const gcs::StatusCallback &done); + Status UnsubscribeActor(const ActorID &id); + + bool IsActorUnsubscribed(const ActorID &id); + + Status SubscribeAllJobs( + const gcs::SubscribeCallback &subscribe, + const gcs::StatusCallback &done); + + void SubscribeAllNodeInfo(const gcs::ItemCallback &subscribe, + const gcs::StatusCallback &done); + + Status SubscribeAllWorkerFailures( + const gcs::ItemCallback &subscribe, + const gcs::StatusCallback &done); + + /// Prints debugging info for the subscriber. + std::string DebugString() const; + + private: + const rpc::Address gcs_address_; + const std::unique_ptr subscriber_; +}; + +} // namespace pubsub +} // namespace ray diff --git a/src/ray/pubsub/publisher.cc b/src/ray/pubsub/publisher.cc index ec9de99534a8..b9494c5b0034 100644 --- a/src/ray/pubsub/publisher.cc +++ b/src/ray/pubsub/publisher.cc @@ -25,9 +25,7 @@ namespace ray { namespace pubsub { -namespace pub_internal { - -bool EntityState::Publish(std::shared_ptr msg, size_t msg_size) { +bool EntityState::Publish(const std::shared_ptr &msg, size_t msg_size) { if (subscribers_.empty()) { return false; } @@ -48,7 +46,8 @@ bool EntityState::Publish(std::shared_ptr msg, size_t msg_size) // to implement inflight message tracking across subscribers with non-atomic // ref-counting or with a LRU-like data structure tracking the range of buffered // messages for each subscriber. - auto front_msg = pending_messages_.front().lock(); + auto &[front_msg_weak, front_msg_size] = pending_messages_.front(); + auto front_msg = front_msg_weak.lock(); if (front_msg == nullptr) { // The message has no other reference. // This means that it has been published to all subscribers. 
@@ -78,14 +77,12 @@ bool EntityState::Publish(std::shared_ptr msg, size_t msg_size) // The first message in the queue has been published to all subscribers, or // it has been dropped due to memory cap. Subtract it from memory // accounting. + total_size_ -= front_msg_size; pending_messages_.pop(); - total_size_ -= message_sizes_.front(); - message_sizes_.pop(); } - pending_messages_.push(msg); + pending_messages_.emplace(msg, msg_size); total_size_ += msg_size; - message_sizes_.push(msg_size); for (auto &[id, subscriber] : subscribers_) { subscriber->QueueMessage(msg); @@ -93,16 +90,15 @@ bool EntityState::Publish(std::shared_ptr msg, size_t msg_size) return true; } -bool EntityState::AddSubscriber(SubscriberState *subscriber) { - return subscribers_.emplace(subscriber->id(), subscriber).second; +void EntityState::AddSubscriber(SubscriberState *subscriber) { + subscribers_.emplace(subscriber->id(), subscriber); } -bool EntityState::RemoveSubscriber(const SubscriberID &id) { - return subscribers_.erase(id) > 0; +void EntityState::RemoveSubscriber(const UniqueID &subscriber_id) { + subscribers_.erase(subscriber_id); } -const absl::flat_hash_map &EntityState::Subscribers() - const { +const absl::flat_hash_map &EntityState::Subscribers() const { return subscribers_; } @@ -122,7 +118,7 @@ int64_t SubscriptionIndex::GetNumBufferedBytes() const { return num_bytes_buffered; } -bool SubscriptionIndex::Publish(std::shared_ptr pub_message, +bool SubscriptionIndex::Publish(const std::shared_ptr &pub_message, size_t msg_size) { const bool publish_to_all = subscribers_to_all_->Publish(pub_message, msg_size); bool publish_to_entity = false; @@ -133,27 +129,25 @@ bool SubscriptionIndex::Publish(std::shared_ptr pub_message, return publish_to_all || publish_to_entity; } -bool SubscriptionIndex::AddEntry(const std::string &key_id, SubscriberState *subscriber) { +void SubscriptionIndex::AddEntry(const std::string &key_id, SubscriberState *subscriber) { if (key_id.empty()) { - return 
subscribers_to_all_->AddSubscriber(subscriber); + subscribers_to_all_->AddSubscriber(subscriber); + return; } auto &subscribing_key_ids = subscribers_to_key_id_[subscriber->id()]; - const bool key_added = subscribing_key_ids.emplace(key_id).second; + subscribing_key_ids.emplace(key_id); auto sub_it = entities_.find(key_id); if (sub_it == entities_.end()) { sub_it = entities_.emplace(key_id, CreateEntityState(channel_type_)).first; } - const bool subscriber_added = sub_it->second->AddSubscriber(subscriber); - - RAY_CHECK(key_added == subscriber_added); - return key_added; + sub_it->second->AddSubscriber(subscriber); } -std::vector SubscriptionIndex::GetSubscriberIdsByKeyId( +std::vector SubscriptionIndex::GetSubscriberIdsByKeyId( const std::string &key_id) const { - std::vector subscribers; + std::vector subscribers; if (!subscribers_to_all_->Subscribers().empty()) { for (const auto &[sub_id, sub] : subscribers_to_all_->Subscribers()) { subscribers.push_back(sub_id); @@ -168,15 +162,13 @@ std::vector SubscriptionIndex::GetSubscriberIdsByKeyId( return subscribers; } -bool SubscriptionIndex::EraseSubscriber(const SubscriberID &subscriber_id) { +void SubscriptionIndex::EraseSubscriber(const UniqueID &subscriber_id) { // Erase subscriber of all keys. - if (subscribers_to_all_->RemoveSubscriber(subscriber_id)) { - return true; - } + subscribers_to_all_->RemoveSubscriber(subscriber_id); auto subscribing_key_it = subscribers_to_key_id_.find(subscriber_id); if (subscribing_key_it == subscribers_to_key_id_.end()) { - return false; + return; } // Erase subscriber of individual keys. 
@@ -194,53 +186,48 @@ bool SubscriptionIndex::EraseSubscriber(const SubscriberID &subscriber_id) { } } subscribers_to_key_id_.erase(subscribing_key_it); - return true; } -bool SubscriptionIndex::EraseEntry(const std::string &key_id, - const SubscriberID &subscriber_id) { +void SubscriptionIndex::EraseEntry(const std::string &key_id, + const UniqueID &subscriber_id) { // Erase the subscriber of all keys. if (key_id.empty()) { - return subscribers_to_all_->RemoveSubscriber(subscriber_id); + subscribers_to_all_->RemoveSubscriber(subscriber_id); } // Erase keys from the subscriber of individual keys. - auto subscribers_to_message_it = subscribers_to_key_id_.find(subscriber_id); - if (subscribers_to_message_it == subscribers_to_key_id_.end()) { - return false; + auto subscribers_to_key_id_it = subscribers_to_key_id_.find(subscriber_id); + if (subscribers_to_key_id_it == subscribers_to_key_id_.end()) { + return; } - auto &objects = subscribers_to_message_it->second; + auto &objects = subscribers_to_key_id_it->second; auto object_it = objects.find(key_id); if (object_it == objects.end()) { - auto it = entities_.find(key_id); - if (it != entities_.end()) { - RAY_CHECK(!it->second->Subscribers().contains(subscriber_id)); - } - return false; + return; } objects.erase(object_it); if (objects.empty()) { - subscribers_to_key_id_.erase(subscribers_to_message_it); + subscribers_to_key_id_.erase(subscribers_to_key_id_it); } // Erase subscribers from keys (reverse index). auto entity_it = entities_.find(key_id); - // If code reaches this line, that means the object id was in the index. - RAY_CHECK(entity_it != entities_.end()); + if (entity_it == entities_.end()) { + return; + } auto &entity = *entity_it->second; // If code reaches this line, that means the subscriber id was in the index. 
- RAY_CHECK(entity.RemoveSubscriber(subscriber_id)); + entity.RemoveSubscriber(subscriber_id); if (entity.Subscribers().empty()) { entities_.erase(entity_it); } - return true; } bool SubscriptionIndex::HasKeyId(const std::string &key_id) const { return entities_.contains(key_id); } -bool SubscriptionIndex::HasSubscriber(const SubscriberID &subscriber_id) const { +bool SubscriptionIndex::HasSubscriber(const UniqueID &subscriber_id) const { if (subscribers_to_all_->Subscribers().contains(subscriber_id)) { return true; } @@ -257,6 +244,7 @@ std::unique_ptr SubscriptionIndex::CreateEntityState( case rpc::ChannelType::RAY_ERROR_INFO_CHANNEL: case rpc::ChannelType::RAY_LOG_CHANNEL: case rpc::ChannelType::RAY_NODE_RESOURCE_USAGE_CHANNEL: + // Not critical if some messages are dropped. return std::make_unique( RayConfig::instance().max_grpc_message_size(), RayConfig::instance().publisher_entity_buffer_max_bytes()); @@ -268,6 +256,7 @@ std::unique_ptr SubscriptionIndex::CreateEntityState( case rpc::ChannelType::GCS_JOB_CHANNEL: case rpc::ChannelType::GCS_NODE_INFO_CHANNEL: case rpc::ChannelType::GCS_WORKER_DELTA_CHANNEL: + // Critical if messages are dropped. 
return std::make_unique(RayConfig::instance().max_grpc_message_size(), /*max_buffered_bytes=*/-1); @@ -277,12 +266,13 @@ std::unique_ptr SubscriptionIndex::CreateEntityState( } } -void SubscriberState::ConnectToSubscriber(const rpc::PubsubLongPollingRequest &request, - rpc::PubsubLongPollingReply *reply, - rpc::SendReplyCallback send_reply_callback) { +void SubscriberState::ConnectToSubscriber( + const rpc::PubsubLongPollingRequest &request, + std::string *publisher_id, + google::protobuf::RepeatedPtrField *pub_messages, + rpc::SendReplyCallback send_reply_callback) { int64_t max_processed_sequence_id = request.max_processed_sequence_id(); - if (request.publisher_id().empty() || - publisher_id_ != PublisherID::FromBinary(request.publisher_id())) { + if (request.publisher_id().empty() || publisher_id_binary_ != request.publisher_id()) { // in case the publisher_id mismatches, we should ignore the // max_processed_sequence_id. max_processed_sequence_id = 0; @@ -300,38 +290,33 @@ void SubscriberState::ConnectToSubscriber(const rpc::PubsubLongPollingRequest &r PublishIfPossible(/*force_noop=*/true); } RAY_CHECK(!long_polling_connection_); - RAY_CHECK(reply != nullptr); - RAY_CHECK(send_reply_callback != nullptr); - long_polling_connection_ = - std::make_unique(reply, std::move(send_reply_callback)); + long_polling_connection_ = std::make_unique( + publisher_id, pub_messages, std::move(send_reply_callback)); last_connection_update_time_ms_ = get_time_ms_(); - PublishIfPossible(); + PublishIfPossible(/*force_noop=*/false); } -void SubscriberState::QueueMessage(const std::shared_ptr &pub_message, - bool try_publish) { +void SubscriberState::QueueMessage(const std::shared_ptr &pub_message) { RAY_LOG(DEBUG) << "enqueue: " << pub_message->sequence_id(); mailbox_.push_back(pub_message); - if (try_publish) { - PublishIfPossible(); - } + PublishIfPossible(/*force_noop=*/false); } -bool SubscriberState::PublishIfPossible(bool force_noop) { +void 
SubscriberState::PublishIfPossible(bool force_noop) { if (!long_polling_connection_) { - return false; + return; } if (!force_noop && mailbox_.empty()) { - return false; + return; } // No message should have been added to the reply. - RAY_CHECK(long_polling_connection_->reply->pub_messages().empty()); - *long_polling_connection_->reply->mutable_publisher_id() = publisher_id_.Binary(); + RAY_CHECK(long_polling_connection_->pub_messages_->empty()); + *long_polling_connection_->publisher_id_ = publisher_id_binary_; int64_t num_total_bytes = 0; if (!force_noop) { for (auto it = mailbox_.begin(); it != mailbox_.end(); it++) { - if (long_polling_connection_->reply->pub_messages().size() >= publish_batch_size_) { + if (long_polling_connection_->pub_messages_->size() >= publish_batch_size_) { break; } @@ -350,20 +335,16 @@ bool SubscriberState::PublishIfPossible(bool force_noop) { // Avoid sending empty message to the subscriber. The message might have been // cleared because the subscribed entity's buffer was full. if (msg.inner_message_case() != rpc::PubMessage::INNER_MESSAGE_NOT_SET) { - *long_polling_connection_->reply->add_pub_messages() = msg; + *long_polling_connection_->pub_messages_->Add() = msg; } } } - - RAY_LOG(DEBUG) << "sending reply back" - << long_polling_connection_->reply->DebugString(); - long_polling_connection_->send_reply_callback(Status::OK(), nullptr, nullptr); + long_polling_connection_->send_reply_callback_(Status::OK(), nullptr, nullptr); // Clean up & update metadata. long_polling_connection_.reset(); // Clean up & update metadata. 
last_connection_update_time_ms_ = get_time_ms_(); - return true; } bool SubscriberState::CheckNoLeaks() const { @@ -379,56 +360,54 @@ bool SubscriberState::IsActive() const { return get_time_ms_() - last_connection_update_time_ms_ < connection_timeout_ms_; } -} // namespace pub_internal - -void Publisher::ConnectToSubscriber(const rpc::PubsubLongPollingRequest &request, - rpc::PubsubLongPollingReply *reply, - rpc::SendReplyCallback send_reply_callback) { - RAY_CHECK(reply != nullptr); +void Publisher::ConnectToSubscriber( + const rpc::PubsubLongPollingRequest &request, + std::string *publisher_id, + google::protobuf::RepeatedPtrField *pub_messages, + rpc::SendReplyCallback send_reply_callback) { RAY_CHECK(send_reply_callback != nullptr); - const auto subscriber_id = SubscriberID::FromBinary(request.subscriber_id()); + const auto subscriber_id = UniqueID::FromBinary(request.subscriber_id()); RAY_LOG(DEBUG) << "Long polling connection initiated by " << subscriber_id.Hex() << ", publisher_id " << publisher_id_.Hex(); absl::MutexLock lock(&mutex_); auto it = subscribers_.find(subscriber_id); if (it == subscribers_.end()) { it = subscribers_ - .emplace( - subscriber_id, - std::make_unique(subscriber_id, - get_time_ms_, - subscriber_timeout_ms_, - publish_batch_size_, - publisher_id_)) + .emplace(subscriber_id, + std::make_unique(subscriber_id, + get_time_ms_, + subscriber_timeout_ms_, + publish_batch_size_, + publisher_id_)) .first; } auto &subscriber = it->second; // May flush the current long poll with an empty message, if a poll request exists. 
- subscriber->ConnectToSubscriber(request, reply, std::move(send_reply_callback)); + subscriber->ConnectToSubscriber( + request, publisher_id, pub_messages, std::move(send_reply_callback)); } -bool Publisher::RegisterSubscription(const rpc::ChannelType channel_type, - const SubscriberID &subscriber_id, +void Publisher::RegisterSubscription(const rpc::ChannelType channel_type, + const UniqueID &subscriber_id, const std::optional &key_id) { absl::MutexLock lock(&mutex_); auto it = subscribers_.find(subscriber_id); if (it == subscribers_.end()) { it = subscribers_ - .emplace( - subscriber_id, - std::make_unique(subscriber_id, - get_time_ms_, - subscriber_timeout_ms_, - publish_batch_size_, - publisher_id_)) + .emplace(subscriber_id, + std::make_unique(subscriber_id, + get_time_ms_, + subscriber_timeout_ms_, + publish_batch_size_, + publisher_id_)) .first; } - pub_internal::SubscriberState *subscriber = it->second.get(); + SubscriberState *subscriber = it->second.get(); auto subscription_index_it = subscription_index_map_.find(channel_type); RAY_CHECK(subscription_index_it != subscription_index_map_.end()); - return subscription_index_it->second.AddEntry(key_id.value_or(""), subscriber); + subscription_index_it->second.AddEntry(key_id.value_or(""), subscriber); } void Publisher::Publish(rpc::PubMessage pub_message) { @@ -436,8 +415,6 @@ void Publisher::Publish(rpc::PubMessage pub_message) { const auto channel_type = pub_message.channel_type(); absl::MutexLock lock(&mutex_); auto &subscription_index = subscription_index_map_.at(channel_type); - // TODO(sang): Currently messages are lost if publish happens - // before there's any subscriber for the object. 
pub_message.set_sequence_id(++next_sequence_id_); const size_t msg_size = pub_message.ByteSizeLong(); @@ -457,56 +434,40 @@ void Publisher::PublishFailure(const rpc::ChannelType channel_type, Publish(pub_message); } -bool Publisher::UnregisterSubscription(const rpc::ChannelType channel_type, - const SubscriberID &subscriber_id, +void Publisher::UnregisterSubscription(const rpc::ChannelType channel_type, + const UniqueID &subscriber_id, const std::optional &key_id) { absl::MutexLock lock(&mutex_); auto subscription_index_it = subscription_index_map_.find(channel_type); - RAY_CHECK(subscription_index_it != subscription_index_map_.end()); - return subscription_index_it->second.EraseEntry(key_id.value_or(""), subscriber_id); -} - -bool Publisher::UnregisterSubscriber(const SubscriberID &subscriber_id) { - absl::MutexLock lock(&mutex_); - return UnregisterSubscriberInternal(subscriber_id); + if (subscription_index_it != subscription_index_map_.end()) { + subscription_index_it->second.EraseEntry(key_id.value_or(""), subscriber_id); + } } -void Publisher::UnregisterAll() { +void Publisher::UnregisterSubscriber(const UniqueID &subscriber_id) { absl::MutexLock lock(&mutex_); - // Save the subscriber IDs to be removed, because UnregisterSubscriberInternal() - // erases from subscribers_. 
- std::vector ids; - for (const auto &[id, subscriber] : subscribers_) { - ids.push_back(id); - } - for (const auto &id : ids) { - UnregisterSubscriberInternal(id); - } + UnregisterSubscriberInternal(subscriber_id); } -int Publisher::UnregisterSubscriberInternal(const SubscriberID &subscriber_id) { +void Publisher::UnregisterSubscriberInternal(const UniqueID &subscriber_id) { RAY_LOG(DEBUG) << "Unregistering subscriber " << subscriber_id.Hex(); - int erased = 0; for (auto &index : subscription_index_map_) { - if (index.second.EraseSubscriber(subscriber_id)) { - erased += 1; - } + index.second.EraseSubscriber(subscriber_id); } auto it = subscribers_.find(subscriber_id); if (it == subscribers_.end()) { - return erased; + return; } auto &subscriber = it->second; // Flush the long polling connection because otherwise the reply could be leaked. subscriber->PublishIfPossible(/*force_noop=*/true); subscribers_.erase(it); - return erased; } void Publisher::CheckDeadSubscribers() { absl::MutexLock lock(&mutex_); - std::vector dead_subscribers; + std::vector dead_subscribers; for (const auto &it : subscribers_) { const auto &subscriber = it.second; diff --git a/src/ray/pubsub/publisher.h b/src/ray/pubsub/publisher.h index b61027e11928..9657fab90309 100644 --- a/src/ray/pubsub/publisher.h +++ b/src/ray/pubsub/publisher.h @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -30,19 +29,14 @@ #include "absl/synchronization/mutex.h" #include "ray/common/asio/periodical_runner.h" #include "ray/common/id.h" +#include "ray/pubsub/publisher_interface.h" #include "ray/rpc/server_call.h" -#include "src/ray/protobuf/common.pb.h" #include "src/ray/protobuf/pubsub.pb.h" namespace ray { namespace pubsub { -using SubscriberID = UniqueID; -using PublisherID = UniqueID; - -namespace pub_internal { - class SubscriberState; /// State for an entity / topic in a pub/sub channel. @@ -55,37 +49,39 @@ class EntityState { /// Publishes the message to subscribers of the entity. 
/// Returns true if there are subscribers, returns false otherwise. - bool Publish(std::shared_ptr pub_message, size_t msg_size); + bool Publish(const std::shared_ptr &pub_message, size_t msg_size); /// Manages the set of subscribers of this entity. - bool AddSubscriber(SubscriberState *subscriber); - bool RemoveSubscriber(const SubscriberID &id); + void AddSubscriber(SubscriberState *subscriber); + void RemoveSubscriber(const UniqueID &subscriber_id); /// Gets the current set of subscribers, keyed by subscriber IDs. - const absl::flat_hash_map &Subscribers() const; + const absl::flat_hash_map &Subscribers() const; size_t GetNumBufferedBytes() const { return total_size_; } protected: // Subscribers of this entity. // The underlying SubscriberState is owned by Publisher. - absl::flat_hash_map subscribers_; + absl::flat_hash_map subscribers_; private: // Tracks inflight messages. The messages have shared ownership by // individual subscribers, and get deleted after no subscriber has - // the message in buffer. - std::queue> pending_messages_; - // Size of each inflight message. - std::queue message_sizes_; + // the message in buffer. Also stores the size of the message so that we can keep track + // of total_size_. + std::queue, size_t>> pending_messages_; + // Protobuf messages fail to serialize if 2GB or larger. Cap published // message batches to this size to ensure that we can publish each message // batch. Individual messages larger than this limit will also be dropped. // TODO(swang): Pubsub clients should also ensure that they don't try to // publish messages larger than this. const size_t max_message_size_bytes_; + // Set to -1 to disable buffering. const int64_t max_buffered_bytes_; + // Total size of inflight messages. 
size_t total_size_ = 0; }; @@ -95,28 +91,23 @@ class EntityState { class SubscriptionIndex { public: explicit SubscriptionIndex(rpc::ChannelType channel_type); - ~SubscriptionIndex() = default; - - SubscriptionIndex(SubscriptionIndex &&) noexcept = default; - SubscriptionIndex &operator=(SubscriptionIndex &&) noexcept = default; /// Publishes the message to relevant subscribers. /// Returns true if there are subscribers listening on the entity key of the message, /// returns false otherwise. - bool Publish(std::shared_ptr pub_message, size_t msg_size); + bool Publish(const std::shared_ptr &pub_message, size_t msg_size); /// Adds a new subscriber and the key it subscribes to. /// When `key_id` is empty, the subscriber subscribes to all keys. - /// NOTE: The method is idempotent. If it adds a duplicated entry, it will be no-op. - bool AddEntry(const std::string &key_id, SubscriberState *subscriber); + void AddEntry(const std::string &key_id, SubscriberState *subscriber); /// Erases the subscriber from this index. /// Returns whether the subscriber exists before the call. - bool EraseSubscriber(const SubscriberID &subscriber_id); + void EraseSubscriber(const UniqueID &subscriber_id); /// Erases the subscriber from the particular key. /// When `key_id` is empty, the subscriber subscribes to all keys. - bool EraseEntry(const std::string &key_id, const SubscriberID &subscriber_id); + void EraseEntry(const std::string &key_id, const UniqueID &subscriber_id); /// Test only. /// Returns true if the entity id exists in the index. @@ -126,11 +117,11 @@ class SubscriptionIndex { /// Test only. /// Returns true if the subscriber id exists in the index, including both per-entity /// and all-entity subscribers. - bool HasSubscriber(const SubscriberID &subscriber_id) const; + bool HasSubscriber(const UniqueID &subscriber_id) const; /// Returns a vector of subscriber ids that are subscribing to the given object ids. /// Test only. 
- std::vector GetSubscriberIdsByKeyId(const std::string &key_id) const; + std::vector GetSubscriberIdsByKeyId(const std::string &key_id) const; int64_t GetNumBufferedBytes() const; @@ -148,64 +139,63 @@ class SubscriptionIndex { absl::flat_hash_map> entities_; // Mapping from subscriber IDs -> subscribed key ids. // Reverse index of key_id_to_subscribers_. - absl::flat_hash_map> - subscribers_to_key_id_; + absl::flat_hash_map> subscribers_to_key_id_; }; struct LongPollConnection { - LongPollConnection(rpc::PubsubLongPollingReply *reply, + LongPollConnection(std::string *publisher_id, + google::protobuf::RepeatedPtrField *pub_messages, rpc::SendReplyCallback send_reply_callback) - : reply(reply), send_reply_callback(send_reply_callback) {} + : publisher_id_(publisher_id), + pub_messages_(pub_messages), + send_reply_callback_(std::move(send_reply_callback)) {} - rpc::PubsubLongPollingReply *reply; - rpc::SendReplyCallback send_reply_callback; + std::string *publisher_id_; + google::protobuf::RepeatedPtrField *pub_messages_; + rpc::SendReplyCallback send_reply_callback_; }; /// Keeps the state of each connected subscriber. class SubscriberState { public: - SubscriberState(SubscriberID subscriber_id, + SubscriberState(UniqueID subscriber_id, std::function get_time_ms, uint64_t connection_timeout_ms, int64_t publish_batch_size, - PublisherID publisher_id) + UniqueID publisher_id) : subscriber_id_(subscriber_id), get_time_ms_(std::move(get_time_ms)), connection_timeout_ms_(connection_timeout_ms), publish_batch_size_(publish_batch_size), last_connection_update_time_ms_(get_time_ms_()), - publisher_id_(publisher_id) {} + publisher_id_binary_(publisher_id.Binary()) {} ~SubscriberState() { // Force a push to close the long-polling. // Otherwise, there will be a connection leak. 
- PublishIfPossible(true); + PublishIfPossible(/*force_noop=*/true); } + SubscriberState(const SubscriberState &) = delete; + SubscriberState &operator=(const SubscriberState &) = delete; + /// Connect to the subscriber. Currently, it means we cache the long polling request to - /// memory. Once the bidirectional gRPC streaming is enabled, we should replace it. - /// - /// \param reply pubsub long polling reply. - /// \param send_reply_callback A callback to reply to the long polling subscriber. - void ConnectToSubscriber(const rpc::PubsubLongPollingRequest &request, - rpc::PubsubLongPollingReply *reply, - rpc::SendReplyCallback send_reply_callback); + /// memory. + void ConnectToSubscriber( + const rpc::PubsubLongPollingRequest &request, + std::string *publisher_id, + google::protobuf::RepeatedPtrField *pub_messages, + rpc::SendReplyCallback send_reply_callback); /// Queue the pubsub message to publish to the subscriber. - /// - /// \param pub_message A message to publish. - /// \param try_publish If true, try publishing the object id if there is a connection. - /// Currently only set to false in tests. - void QueueMessage(const std::shared_ptr &pub_message, - bool try_publish = true); + void QueueMessage(const std::shared_ptr &pub_message); /// Publish all queued messages if possible. /// /// \param force_noop If true, reply to the subscriber with an empty message, regardless /// of whethere there is any queued message. This is for cases where the current poll /// might have been cancelled, or the subscriber might be dead. - /// \return True if it publishes. False otherwise. - bool PublishIfPossible(bool force_noop = false); + void PublishIfPossible(bool force_noop); /// Testing only. Return true if there's no metadata remained in the private attribute. bool CheckNoLeaks() const; @@ -218,11 +208,11 @@ class SubscriberState { bool IsActive() const; /// Returns the ID of this subscriber. 
- const SubscriberID &id() const { return subscriber_id_; } + const UniqueID &id() const { return subscriber_id_; } private: /// Subscriber ID, for logging and debugging. - const SubscriberID subscriber_id_; + const UniqueID subscriber_id_; /// Inflight long polling reply callback, for replying to the subscriber. std::unique_ptr long_polling_connection_; /// Queued messages to publish. @@ -235,61 +225,7 @@ class SubscriberState { const int64_t publish_batch_size_; /// The last time long polling was connected in milliseconds. double last_connection_update_time_ms_; - PublisherID publisher_id_; -}; - -} // namespace pub_internal - -/// Publisher interface. Note that message ids are passed as a string to avoid templated -/// definition which doesn't go well with virtual methods. -class PublisherInterface { - public: - virtual ~PublisherInterface() = default; - - /// Handle a long poll request from `subscriber_id`. - /// - /// TODO(sang): Currently, we need to pass the callback for connection because we are - /// using long polling internally. This should be changed once the bidirectional grpc - /// streaming is supported. - virtual void ConnectToSubscriber(const rpc::PubsubLongPollingRequest &request, - rpc::PubsubLongPollingReply *reply, - rpc::SendReplyCallback send_reply_callback) = 0; - - /// Register the subscription. - /// - /// \param channel_type The type of the channel. - /// \param subscriber_id The node id of the subscriber. - /// \param key_id The key_id that the subscriber is subscribing to. std::nullopt if - /// subscribing to all. - /// \return True if registration is new. False otherwise. - virtual bool RegisterSubscription(const rpc::ChannelType channel_type, - const SubscriberID &subscriber_id, - const std::optional &key_id) = 0; - - /// Publish the given object id to subscribers. - /// - /// \param pub_message The message to publish. - /// Required to contain channel_type and key_id fields. 
- virtual void Publish(rpc::PubMessage pub_message) = 0; - - /// Publish to the subscriber that the given key id is not available anymore. - /// It will invoke the failure callback on the subscriber side. - /// - /// \param channel_type The type of the channel. - /// \param key_id The message id to publish. - virtual void PublishFailure(const rpc::ChannelType channel_type, - const std::string &key_id) = 0; - - /// Unregister subscription. It means the given object id won't be published to the - /// subscriber anymore. - /// - /// \param channel_type The type of the channel. - /// \param subscriber_id The node id of the subscriber. - /// \param key_id The key_id of the subscriber. std::nullopt if subscribing to all. - /// \return True if erased. False otherwise. - virtual bool UnregisterSubscription(const rpc::ChannelType channel_type, - const SubscriberID &subscriber_id, - const std::optional &key_id) = 0; + std::string publisher_id_binary_; }; /// Protocol detail @@ -323,7 +259,7 @@ class Publisher : public PublisherInterface { std::function get_time_ms, const uint64_t subscriber_timeout_ms, int64_t publish_batch_size, - PublisherID publisher_id = NodeID::FromRandom()) + UniqueID publisher_id = NodeID::FromRandom()) : periodical_runner_(&periodical_runner), get_time_ms_(std::move(get_time_ms)), subscriber_timeout_ms_(subscriber_timeout_ms), @@ -331,7 +267,7 @@ class Publisher : public PublisherInterface { publisher_id_(publisher_id) { // Insert index map for each channel. 
for (auto type : channels) { - subscription_index_map_.emplace(type, pub_internal::SubscriptionIndex(type)); + subscription_index_map_.emplace(type, SubscriptionIndex(type)); } periodical_runner_->RunFnPeriodically([this] { CheckDeadSubscribers(); }, @@ -339,14 +275,14 @@ class Publisher : public PublisherInterface { "Publisher.CheckDeadSubscribers"); } - ~Publisher() override = default; - - void ConnectToSubscriber(const rpc::PubsubLongPollingRequest &request, - rpc::PubsubLongPollingReply *reply, - rpc::SendReplyCallback send_reply_callback) override; + void ConnectToSubscriber( + const rpc::PubsubLongPollingRequest &request, + std::string *publisher_id, + google::protobuf::RepeatedPtrField *pub_messages, + rpc::SendReplyCallback send_reply_callback) override; - bool RegisterSubscription(const rpc::ChannelType channel_type, - const SubscriberID &subscriber_id, + void RegisterSubscription(const rpc::ChannelType channel_type, + const UniqueID &subscriber_id, const std::optional &key_id) override; void Publish(rpc::PubMessage pub_message) override; @@ -354,19 +290,13 @@ class Publisher : public PublisherInterface { void PublishFailure(const rpc::ChannelType channel_type, const std::string &key_id) override; - bool UnregisterSubscription(const rpc::ChannelType channel_type, - const SubscriberID &subscriber_id, + void UnregisterSubscription(const rpc::ChannelType channel_type, + const UniqueID &subscriber_id, const std::optional &key_id) override; - /// Remove the subscriber. Once the subscriber is removed, messages won't be published - /// to it anymore. - /// - /// \param subscriber_id The node id of the subscriber to unsubscribe. - /// \return True if erased. False otherwise. - bool UnregisterSubscriber(const SubscriberID &subscriber_id); + void UnregisterSubscriber(const UniqueID &subscriber_id) override; - /// Flushes all inflight pollings and unregisters all subscribers. 
- void UnregisterAll(); + std::string DebugString() const override; /// Check all subscribers, detect which subscribers are dead or its connection is timed /// out, and clean up their metadata. This uses the goal-oriented logic to clean up all @@ -387,8 +317,6 @@ class Publisher : public PublisherInterface { /// having a timer per subscriber. void CheckDeadSubscribers(); - std::string DebugString() const; - private: /// /// Testing fields @@ -418,7 +346,7 @@ class Publisher : public PublisherInterface { /// Private fields /// - int UnregisterSubscriberInternal(const SubscriberID &subscriber_id) + void UnregisterSubscriberInternal(const UniqueID &subscriber_id) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Periodic runner to invoke CheckDeadSubscribers. @@ -437,12 +365,12 @@ class Publisher : public PublisherInterface { mutable absl::Mutex mutex_; /// Mapping of node id -> subscribers. - absl::flat_hash_map> - subscribers_ ABSL_GUARDED_BY(mutex_); + absl::flat_hash_map> subscribers_ + ABSL_GUARDED_BY(mutex_); /// Index that stores the mapping of messages <-> subscribers. - absl::flat_hash_map - subscription_index_map_ ABSL_GUARDED_BY(mutex_); + absl::flat_hash_map subscription_index_map_ + ABSL_GUARDED_BY(mutex_); /// The maximum number of objects to publish for each publish calls. const int64_t publish_batch_size_; @@ -466,9 +394,7 @@ class Publisher : public PublisherInterface { /// of a channel. int64_t next_sequence_id_ ABSL_GUARDED_BY(mutex_) = 0; - /// A unique identifier identifies the publisher_id. - /// TODO(scv119) add docs about the semantics. - const PublisherID publisher_id_; + const UniqueID publisher_id_; }; } // namespace pubsub diff --git a/src/ray/pubsub/publisher_interface.h b/src/ray/pubsub/publisher_interface.h new file mode 100644 index 000000000000..35bde4fab94a --- /dev/null +++ b/src/ray/pubsub/publisher_interface.h @@ -0,0 +1,84 @@ +// Copyright 2017 The Ray Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include + +#include "ray/common/id.h" +#include "ray/rpc/server_call.h" +#include "src/ray/protobuf/pubsub.pb.h" + +namespace ray { +namespace pubsub { + +/// Publisher interface. Note that message ids are passed as a string to avoid templated +/// definition which doesn't go well with virtual methods. +class PublisherInterface { + public: + virtual ~PublisherInterface() = default; + + /// Handle a long poll request from `subscriber_id`. + virtual void ConnectToSubscriber( + const rpc::PubsubLongPollingRequest &request, + std::string *publisher_id, + google::protobuf::RepeatedPtrField *pub_messages, + rpc::SendReplyCallback send_reply_callback) = 0; + + /// Register the subscription. + /// + /// \param channel_type The type of the channel. + /// \param subscriber_id The ID of the subscriber. + /// \param key_id The key_id that the subscriber is subscribing to. std::nullopt if + /// subscribing to all. + virtual void RegisterSubscription(const rpc::ChannelType channel_type, + const UniqueID &subscriber_id, + const std::optional &key_id) = 0; + + /// Publish the given object id to subscribers. + /// + /// \param pub_message The message to publish. + /// Required to contain channel_type and key_id fields. + virtual void Publish(rpc::PubMessage pub_message) = 0; + + /// Publish to the subscriber that the given key id is not available anymore. 
+ /// It will invoke the failure callback on the subscriber side. + /// + /// \param channel_type The type of the channel. + /// \param key_id The message id to publish. + virtual void PublishFailure(const rpc::ChannelType channel_type, + const std::string &key_id) = 0; + + /// Unregister subscription. It means the given object id won't be published to the + /// subscriber anymore. + /// + /// \param channel_type The type of the channel. + /// \param subscriber_id The ID of the subscriber. + /// \param key_id The key_id of the subscriber. std::nullopt if subscribing to all. + virtual void UnregisterSubscription(const rpc::ChannelType channel_type, + const UniqueID &subscriber_id, + const std::optional &key_id) = 0; + + /// Unregister subscriber. No messages on any channels will be published to it anymore. + /// + /// \param subscriber_id The ID of the subscriber. + virtual void UnregisterSubscriber(const UniqueID &subscriber_id) = 0; + + virtual std::string DebugString() const = 0; +}; + +} // namespace pubsub +} // namespace ray diff --git a/src/ray/pubsub/python_gcs_subscriber.cc b/src/ray/pubsub/python_gcs_subscriber.cc new file mode 100644 index 000000000000..995fd35d457b --- /dev/null +++ b/src/ray/pubsub/python_gcs_subscriber.cc @@ -0,0 +1,199 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/pubsub/python_gcs_subscriber.h" + +#include + +#include +#include +#include +#include + +#include "ray/gcs_client/rpc_client.h" + +namespace ray { +namespace pubsub { + +std::vector PythonGetLogBatchLines(rpc::LogBatch log_batch) { + return std::vector( + std::make_move_iterator(log_batch.mutable_lines()->begin()), + std::make_move_iterator(log_batch.mutable_lines()->end())); +} + +PythonGcsSubscriber::PythonGcsSubscriber(const std::string &gcs_address, + int gcs_port, + rpc::ChannelType channel_type, + std::string subscriber_id, + std::string worker_id) + : channel_(rpc::GcsRpcClient::CreateGcsChannel(gcs_address, gcs_port)), + pubsub_stub_(rpc::InternalPubSubGcsService::NewStub(channel_)), + channel_type_(channel_type), + subscriber_id_(std::move(subscriber_id)), + worker_id_(std::move(worker_id)) {} + +Status PythonGcsSubscriber::Subscribe() { + absl::MutexLock lock(&mu_); + + if (closed_) { + return Status::OK(); + } + + grpc::ClientContext context; + + rpc::GcsSubscriberCommandBatchRequest request; + request.set_subscriber_id(subscriber_id_); + request.set_sender_id(worker_id_); + auto *command = request.add_commands(); + command->set_channel_type(channel_type_); + command->mutable_subscribe_message(); + + rpc::GcsSubscriberCommandBatchReply reply; + grpc::Status status = + pubsub_stub_->GcsSubscriberCommandBatch(&context, request, &reply); + + if (status.ok()) { + return Status::OK(); + } else { + return Status::RpcError(status.error_message(), status.error_code()); + } +} + +Status PythonGcsSubscriber::DoPoll(int64_t timeout_ms, rpc::PubMessage *message) { + absl::MutexLock lock(&mu_); + + while (queue_.empty()) { + if (closed_) { + return Status::OK(); + } + current_polling_context_ = std::make_shared(); + if (timeout_ms != -1) { + current_polling_context_->set_deadline(std::chrono::system_clock::now() + + std::chrono::milliseconds(timeout_ms)); + } + rpc::GcsSubscriberPollRequest request; + request.set_subscriber_id(subscriber_id_); + 
request.set_max_processed_sequence_id(max_processed_sequence_id_); + request.set_publisher_id(publisher_id_); + + rpc::GcsSubscriberPollReply reply; + auto context = current_polling_context_; + // Drop the lock while in RPC + mu_.Unlock(); + grpc::Status status = pubsub_stub_->GcsSubscriberPoll(context.get(), request, &reply); + mu_.Lock(); + + if (status.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED || + status.error_code() == grpc::StatusCode::UNAVAILABLE) { + return Status::OK(); + } + if (status.error_code() == grpc::StatusCode::CANCELLED) { + // This channel was shut down via Close() + return Status::OK(); + } + if (status.error_code() != grpc::StatusCode::OK) { + return Status::Invalid(status.error_message()); + } + + if (publisher_id_ != reply.publisher_id()) { + if (publisher_id_ != "") { + RAY_LOG(DEBUG) << "Replied publisher_id " << reply.publisher_id() + << " different from " << publisher_id_ + << ", this should only happen" + << " during GCS failover."; + } + publisher_id_ = reply.publisher_id(); + max_processed_sequence_id_ = 0; + } + last_batch_size_ = reply.pub_messages().size(); + for (auto &cur_pub_msg : *reply.mutable_pub_messages()) { + if (cur_pub_msg.sequence_id() <= max_processed_sequence_id_) { + RAY_LOG(WARNING) << "Ignoring out of order message " << cur_pub_msg.sequence_id(); + continue; + } + max_processed_sequence_id_ = cur_pub_msg.sequence_id(); + if (cur_pub_msg.channel_type() != channel_type_) { + RAY_LOG(WARNING) << "Ignoring message from unsubscribed channel " + << cur_pub_msg.channel_type(); + continue; + } + queue_.emplace_back(std::move(cur_pub_msg)); + } + } + + *message = std::move(queue_.front()); + queue_.pop_front(); + + return Status::OK(); +} + +Status PythonGcsSubscriber::PollError(std::string *key_id, + int64_t timeout_ms, + rpc::ErrorTableData *data) { + rpc::PubMessage message; + RAY_RETURN_NOT_OK(DoPoll(timeout_ms, &message)); + *key_id = std::move(*message.mutable_key_id()); + *data = 
std::move(*message.mutable_error_info_message()); + return Status::OK(); +} + +Status PythonGcsSubscriber::PollLogs(std::string *key_id, + int64_t timeout_ms, + rpc::LogBatch *data) { + rpc::PubMessage message; + RAY_RETURN_NOT_OK(DoPoll(timeout_ms, &message)); + *key_id = std::move(*message.mutable_key_id()); + *data = std::move(*message.mutable_log_batch_message()); + return Status::OK(); +} + +Status PythonGcsSubscriber::Close() { + std::shared_ptr current_polling_context; + { + absl::MutexLock lock(&mu_); + if (closed_) { + return Status::OK(); + } + closed_ = true; + current_polling_context = current_polling_context_; + } + if (current_polling_context) { + current_polling_context->TryCancel(); + } + + grpc::ClientContext context; + + rpc::GcsSubscriberCommandBatchRequest request; + request.set_subscriber_id(subscriber_id_); + auto *command = request.add_commands(); + command->set_channel_type(channel_type_); + command->mutable_unsubscribe_message(); + rpc::GcsSubscriberCommandBatchReply reply; + grpc::Status status = + pubsub_stub_->GcsSubscriberCommandBatch(&context, request, &reply); + + if (!status.ok()) { + RAY_LOG(WARNING) << "Error while unregistering the subscriber: " + << status.error_message() << " [code " << status.error_code() << "]"; + } + return Status::OK(); +} + +int64_t PythonGcsSubscriber::last_batch_size() { + absl::MutexLock lock(&mu_); + return last_batch_size_; +} + +} // namespace pubsub +} // namespace ray diff --git a/src/ray/pubsub/python_gcs_subscriber.h b/src/ray/pubsub/python_gcs_subscriber.h new file mode 100644 index 000000000000..5fe4eda29812 --- /dev/null +++ b/src/ray/pubsub/python_gcs_subscriber.h @@ -0,0 +1,90 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "absl/synchronization/mutex.h" +#include "ray/common/status.h" +#include "ray/util/visibility.h" +#include "src/ray/protobuf/gcs_service.grpc.pb.h" +#include "src/ray/protobuf/pubsub.pb.h" + +// Use forward declarations to avoid exposing heavyweight gRPC headers. +namespace grpc { + +class Channel; +class ClientContext; + +} // namespace grpc + +namespace ray { +namespace pubsub { + +// This client is only supposed to be used from Cython / Python +class RAY_EXPORT PythonGcsSubscriber { + public: + PythonGcsSubscriber(const std::string &gcs_address, + int gcs_port, + rpc::ChannelType channel_type, + std::string subscriber_id, + std::string worker_id); + + /// Register a subscription for the subscriber's channel type. + /// + /// Before the registration, published messages in the channel + /// will not be saved for the subscriber. + Status Subscribe(); + + /// Polls for new error message. + /// Both key_id and data are out parameters. + Status PollError(std::string *key_id, int64_t timeout_ms, rpc::ErrorTableData *data); + + /// Polls for new log messages. + Status PollLogs(std::string *key_id, int64_t timeout_ms, rpc::LogBatch *data); + + /// Closes the subscriber and its active subscription. 
+ Status Close(); + + int64_t last_batch_size(); + + private: + Status DoPoll(int64_t timeout_ms, rpc::PubMessage *message); + + mutable absl::Mutex mu_; + + std::shared_ptr channel_; + std::unique_ptr pubsub_stub_; + + const rpc::ChannelType channel_type_; + const std::string subscriber_id_; + std::string publisher_id_; + const std::string worker_id_; + int64_t max_processed_sequence_id_ ABSL_GUARDED_BY(mu_) = 0; + int64_t last_batch_size_ ABSL_GUARDED_BY(mu_) = 0; + std::deque queue_ ABSL_GUARDED_BY(mu_); + bool closed_ ABSL_GUARDED_BY(mu_) = false; + std::shared_ptr current_polling_context_ ABSL_GUARDED_BY(mu_); +}; + +/// Get the .lines() attribute of a LogBatch as a std::vector +/// (this is needed so it can be wrapped in Cython) +std::vector PythonGetLogBatchLines(rpc::LogBatch log_batch); + +} // namespace pubsub +} // namespace ray diff --git a/src/ray/pubsub/subscriber.cc b/src/ray/pubsub/subscriber.cc index ed167d78f097..792cf9fd403a 100644 --- a/src/ray/pubsub/subscriber.cc +++ b/src/ray/pubsub/subscriber.cc @@ -23,43 +23,40 @@ namespace ray { namespace pubsub { namespace { -const PublisherID kDefaultPublisherID{}; +const UniqueID kDefaultUniqueID{}; } /////////////////////////////////////////////////////////////////////////////// /// SubscriberChannel /////////////////////////////////////////////////////////////////////////////// -bool SubscriberChannel::Subscribe( +void SubscriberChannel::Subscribe( const rpc::Address &publisher_address, const std::optional &key_id, SubscriptionItemCallback subscription_callback, SubscriptionFailureCallback subscription_failure_callback) { cum_subscribe_requests_++; - const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); + const auto publisher_id = UniqueID::FromBinary(publisher_address.worker_id()); if (key_id) { - return subscription_map_[publisher_id] - .per_entity_subscription - .try_emplace(*key_id, - SubscriptionInfo(std::move(subscription_callback), - 
std::move(subscription_failure_callback))) - .second; + subscription_map_[publisher_id].per_entity_subscription.try_emplace( + *key_id, + SubscriptionInfo(std::move(subscription_callback), + std::move(subscription_failure_callback))); + return; } auto &all_entities_subscription = subscription_map_[publisher_id].all_entities_subscription; - if (all_entities_subscription != nullptr) { - return false; + if (all_entities_subscription == nullptr) { + all_entities_subscription = std::make_unique( + std::move(subscription_callback), std::move(subscription_failure_callback)); } - all_entities_subscription = std::make_unique( - std::move(subscription_callback), std::move(subscription_failure_callback)); - return true; } bool SubscriberChannel::Unsubscribe(const rpc::Address &publisher_address, const std::optional &key_id) { cum_unsubscribe_requests_++; - const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); + const auto publisher_id = UniqueID::FromBinary(publisher_address.worker_id()); // Find subscription info. 
auto subscription_it = subscription_map_.find(publisher_id); @@ -94,7 +91,7 @@ bool SubscriberChannel::Unsubscribe(const rpc::Address &publisher_address, bool SubscriberChannel::IsSubscribed(const rpc::Address &publisher_address, const std::string &key_id) const { - const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); + const auto publisher_id = UniqueID::FromBinary(publisher_address.worker_id()); auto subscription_it = subscription_map_.find(publisher_id); if (subscription_it == subscription_map_.end()) { return false; @@ -122,7 +119,7 @@ bool SubscriberChannel::CheckNoLeaks() const { void SubscriberChannel::HandlePublishedMessage(const rpc::Address &publisher_address, rpc::PubMessage &&pub_message) const { - const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); + const auto publisher_id = UniqueID::FromBinary(publisher_address.worker_id()); auto subscription_it = subscription_map_.find(publisher_id); // If there's no more subscription, do nothing. if (subscription_it == subscription_map_.end()) { @@ -154,7 +151,7 @@ void SubscriberChannel::HandlePublishedMessage(const rpc::Address &publisher_add void SubscriberChannel::HandlePublisherFailure(const rpc::Address &publisher_address, const Status &status) { - const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); + const auto publisher_id = UniqueID::FromBinary(publisher_address.worker_id()); const auto &subscription_it = subscription_map_.find(publisher_id); // If there's no more subscription, do nothing. 
if (subscription_it == subscription_map_.end()) { @@ -183,7 +180,7 @@ void SubscriberChannel::HandlePublisherFailure(const rpc::Address &publisher_add void SubscriberChannel::HandlePublisherFailure(const rpc::Address &publisher_address, const std::string &key_id) { - const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); + const auto publisher_id = UniqueID::FromBinary(publisher_address.worker_id()); const auto &subscription_it = subscription_map_.find(publisher_id); // If there's no more subscription, do nothing. if (subscription_it == subscription_map_.end()) { @@ -232,76 +229,26 @@ std::string SubscriberChannel::DebugString() const { /// Subscriber /////////////////////////////////////////////////////////////////////////////// -Subscriber::~Subscriber() { - // TODO(mwtian): flush Subscriber and ensure there is no leak during destruction. - // TODO(ryw): Remove this subscriber from the service by GcsUnregisterSubscriber. -} - -bool Subscriber::Subscribe(std::unique_ptr sub_message, - const rpc::ChannelType channel_type, - const rpc::Address &publisher_address, - const std::string &key_id, - SubscribeDoneCallback subscribe_done_callback, - SubscriptionItemCallback subscription_callback, - SubscriptionFailureCallback subscription_failure_callback) { - return SubscribeInternal(std::move(sub_message), - channel_type, - publisher_address, - key_id, - std::move(subscribe_done_callback), - std::move(subscription_callback), - std::move(subscription_failure_callback)); -} - -bool Subscriber::SubscribeChannel( - std::unique_ptr sub_message, - const rpc::ChannelType channel_type, - const rpc::Address &publisher_address, - SubscribeDoneCallback subscribe_done_callback, - SubscriptionItemCallback subscription_callback, - SubscriptionFailureCallback subscription_failure_callback) { - return SubscribeInternal(std::move(sub_message), - channel_type, - publisher_address, - std::nullopt, - std::move(subscribe_done_callback), - 
std::move(subscription_callback), - std::move(subscription_failure_callback)); -} - -bool Subscriber::Unsubscribe(const rpc::ChannelType channel_type, +bool Subscriber::Unsubscribe(rpc::ChannelType channel_type, const rpc::Address &publisher_address, - const std::string &key_id) { + const std::optional &key_id) { // Batch the unsubscribe command. auto command = std::make_unique(); command->cmd.set_channel_type(channel_type); - command->cmd.set_key_id(key_id); + if (key_id.has_value()) { + command->cmd.set_key_id(*key_id); + } command->cmd.mutable_unsubscribe_message(); absl::MutexLock lock(&mutex_); - const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); + const auto publisher_id = UniqueID::FromBinary(publisher_address.worker_id()); commands_[publisher_id].emplace(std::move(command)); SendCommandBatchIfPossible(publisher_address); return Channel(channel_type)->Unsubscribe(publisher_address, key_id); } -bool Subscriber::UnsubscribeChannel(const rpc::ChannelType channel_type, - const rpc::Address &publisher_address) { - // Batch the unsubscribe command. 
- auto command = std::make_unique(); - command->cmd.set_channel_type(channel_type); - command->cmd.mutable_unsubscribe_message(); - - absl::MutexLock lock(&mutex_); - const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); - commands_[publisher_id].emplace(std::move(command)); - SendCommandBatchIfPossible(publisher_address); - - return Channel(channel_type)->Unsubscribe(publisher_address, std::nullopt); -} - -bool Subscriber::IsSubscribed(const rpc::ChannelType channel_type, +bool Subscriber::IsSubscribed(rpc::ChannelType channel_type, const rpc::Address &publisher_address, const std::string &key_id) const { absl::MutexLock lock(&mutex_); @@ -312,31 +259,30 @@ bool Subscriber::IsSubscribed(const rpc::ChannelType channel_type, return channel->IsSubscribed(publisher_address, key_id); } -bool Subscriber::SubscribeInternal( - std::unique_ptr sub_message, - const rpc::ChannelType channel_type, - const rpc::Address &publisher_address, - const std::optional &key_id, - SubscribeDoneCallback subscribe_done_callback, - SubscriptionItemCallback subscription_callback, - SubscriptionFailureCallback subscription_failure_callback) { +void Subscriber::Subscribe(std::unique_ptr sub_message, + rpc::ChannelType channel_type, + const rpc::Address &publisher_address, + const std::optional &key_id, + SubscribeDoneCallback subscribe_done_callback, + SubscriptionItemCallback subscription_callback, + SubscriptionFailureCallback subscription_failure_callback) { // Batch a subscribe command. 
auto command = std::make_unique(); command->cmd.set_channel_type(channel_type); - if (key_id) { + if (key_id.has_value()) { command->cmd.set_key_id(*key_id); } if (sub_message != nullptr) { command->cmd.mutable_subscribe_message()->Swap(sub_message.get()); } command->done_cb = std::move(subscribe_done_callback); - const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); + const auto publisher_id = UniqueID::FromBinary(publisher_address.worker_id()); absl::MutexLock lock(&mutex_); commands_[publisher_id].emplace(std::move(command)); SendCommandBatchIfPossible(publisher_address); MakeLongPollingConnectionIfNotConnected(publisher_address); - return Channel(channel_type) + this->Channel(channel_type) ->Subscribe(publisher_address, key_id, std::move(subscription_callback), @@ -345,7 +291,7 @@ bool Subscriber::SubscribeInternal( void Subscriber::MakeLongPollingConnectionIfNotConnected( const rpc::Address &publisher_address) { - const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); + const auto publisher_id = UniqueID::FromBinary(publisher_address.worker_id()); auto publishers_connected_it = publishers_connected_.find(publisher_id); if (publishers_connected_it == publishers_connected_.end()) { publishers_connected_.emplace(publisher_id); @@ -354,7 +300,7 @@ void Subscriber::MakeLongPollingConnectionIfNotConnected( } void Subscriber::MakeLongPollingPubsubConnection(const rpc::Address &publisher_address) { - const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); + const auto publisher_id = UniqueID::FromBinary(publisher_address.worker_id()); RAY_LOG(DEBUG) << "Make a long polling request to " << publisher_id; auto subscriber_client = get_client_(publisher_address); rpc::PubsubLongPollingRequest long_polling_request; @@ -364,7 +310,8 @@ void Subscriber::MakeLongPollingPubsubConnection(const rpc::Address &publisher_a long_polling_request.set_max_processed_sequence_id(processed_state.second); 
subscriber_client->PubsubLongPolling( long_polling_request, - [this, publisher_address](Status status, rpc::PubsubLongPollingReply &&reply) { + [this, publisher_address](const Status &status, + rpc::PubsubLongPollingReply &&reply) { absl::MutexLock lock(&mutex_); HandleLongPollingResponse(publisher_address, status, std::move(reply)); }); @@ -373,9 +320,8 @@ void Subscriber::MakeLongPollingPubsubConnection(const rpc::Address &publisher_a void Subscriber::HandleLongPollingResponse(const rpc::Address &publisher_address, const Status &status, rpc::PubsubLongPollingReply &&reply) { - const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); + const auto publisher_id = UniqueID::FromBinary(publisher_address.worker_id()); RAY_LOG(DEBUG) << "Long polling request has been replied from " << publisher_id; - RAY_CHECK(publishers_connected_.count(publisher_id)); if (!status.ok()) { // If status is not okay, we treat that the publisher is dead. @@ -390,16 +336,15 @@ void Subscriber::HandleLongPollingResponse(const rpc::Address &publisher_address commands_.erase(publisher_id); } else { RAY_CHECK(!reply.publisher_id().empty()) << "publisher_id is empty."; - auto reply_publisher_id = PublisherID::FromBinary(reply.publisher_id()); + auto reply_publisher_id = UniqueID::FromBinary(reply.publisher_id()); if (reply_publisher_id != processed_sequences_[publisher_id].first) { - if (processed_sequences_[publisher_id].first != kDefaultPublisherID) { + if (processed_sequences_[publisher_id].first != kDefaultUniqueID) { RAY_LOG(INFO) << "Received publisher_id " << reply_publisher_id.Hex() << " is different from last seen publisher_id " << processed_sequences_[publisher_id].first << ", this can only happen when gcs failsover."; } - // reset publisher_id and processed_sequence - // if the publisher_id changes. + // reset publisher_id and processed_sequence if the publisher_id changes. 
processed_sequences_[publisher_id].first = reply_publisher_id; processed_sequences_[publisher_id].second = 0; } @@ -447,7 +392,7 @@ void Subscriber::HandleLongPollingResponse(const rpc::Address &publisher_address } void Subscriber::SendCommandBatchIfPossible(const rpc::Address &publisher_address) { - const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); + const auto publisher_id = UniqueID::FromBinary(publisher_address.worker_id()); auto command_batch_sent_it = command_batch_sent_.find(publisher_id); // If there's no in flight command batch request to the publisher, @@ -488,9 +433,9 @@ void Subscriber::SendCommandBatchIfPossible(const rpc::Address &publisher_addres Status status, const rpc::PubsubCommandBatchReply &reply) { { absl::MutexLock lock(&mutex_); - auto command_batch_sent_it = command_batch_sent_.find(publisher_id); - RAY_CHECK(command_batch_sent_it != command_batch_sent_.end()); - command_batch_sent_.erase(command_batch_sent_it); + auto command_batch_sent_iter = command_batch_sent_.find(publisher_id); + RAY_CHECK(command_batch_sent_iter != command_batch_sent_.end()); + command_batch_sent_.erase(command_batch_sent_iter); } for (const auto &done : done_cb) { if (done) { @@ -501,8 +446,8 @@ void Subscriber::SendCommandBatchIfPossible(const rpc::Address &publisher_addres // This means the publisher has failed. // The publisher dead detection & command clean up will be done // from the long polling request. 
- RAY_LOG(DEBUG) << "The command batch request to " << publisher_id - << " has failed"; + RAY_LOG(WARNING) << "The command batch request to " << publisher_id + << " has failed"; } { absl::MutexLock lock(&mutex_); diff --git a/src/ray/pubsub/subscriber.h b/src/ray/pubsub/subscriber.h index b76068c49775..44920840e6da 100644 --- a/src/ray/pubsub/subscriber.h +++ b/src/ray/pubsub/subscriber.h @@ -27,6 +27,7 @@ #include "absl/container/flat_hash_set.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/id.h" +#include "ray/pubsub/subscriber_interface.h" #include "ray/rpc/client_call.h" #include "src/ray/protobuf/common.pb.h" #include "src/ray/protobuf/pubsub.pb.h" @@ -35,17 +36,6 @@ namespace ray { namespace pubsub { -using SubscriberID = UniqueID; -using PublisherID = UniqueID; -using SubscribeDoneCallback = std::function; -using SubscriptionItemCallback = std::function; -using SubscriptionFailureCallback = - std::function; - -/////////////////////////////////////////////////////////////////////////////// -/// SubscriberChannel Abstraction -/////////////////////////////////////////////////////////////////////////////// - /// Subscription info stores metadata that is needed for subscriptions. struct SubscriptionInfo { SubscriptionInfo(SubscriptionItemCallback i_cb, SubscriptionFailureCallback f_cb) @@ -80,11 +70,11 @@ class SubscriberChannel { /// /// \param publisher_address Address of the publisher to subscribe the object. /// \param message id The message id to subscribe from the publisher. - /// \param subscription_callback A callback that is invoked whenever the given object - /// information is published. - /// \param subscription_failure_callback A callback that is - /// invoked whenever the publisher is dead (or failed). - bool Subscribe(const rpc::Address &publisher_address, + /// \param subscription_item_callback A callback that is invoked whenever the given + /// object information is published. 
+ /// \param subscription_failure_callback A callback that is invoked whenever the + /// publisher is dead (or failed). + void Subscribe(const rpc::Address &publisher_address, const std::optional &key_id, SubscriptionItemCallback subscription_item_callback, SubscriptionFailureCallback subscription_failure_callback); @@ -134,12 +124,12 @@ class SubscriberChannel { const std::string &key_id); /// Return true if the subscription exists for a given publisher id. - bool SubscriptionExists(const PublisherID &publisher_id) { + bool SubscriptionExists(const UniqueID &publisher_id) { return subscription_map_.contains(publisher_id); } /// Return the channel type of this subscribe channel. - const rpc::ChannelType GetChannelType() const { return channel_type_; } + rpc::ChannelType GetChannelType() const { return channel_type_; } /// Return the statistics of the specific channel. std::string DebugString() const; @@ -156,17 +146,17 @@ class SubscriberChannel { /// subscribed. std::optional GetSubscriptionItemCallback( const rpc::Address &publisher_address, const std::string &key_id) const { - const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); + const auto publisher_id = UniqueID::FromBinary(publisher_address.worker_id()); auto subscription_it = subscription_map_.find(publisher_id); if (subscription_it == subscription_map_.end()) { - return absl::nullopt; + return std::nullopt; } if (subscription_it->second.all_entities_subscription != nullptr) { return subscription_it->second.all_entities_subscription->item_cb; } auto callback_it = subscription_it->second.per_entity_subscription.find(key_id); if (callback_it == subscription_it->second.per_entity_subscription.end()) { - return absl::nullopt; + return std::nullopt; } return callback_it->second.item_cb; } @@ -175,17 +165,17 @@ class SubscriberChannel { /// subscribed. 
std::optional GetFailureCallback( const rpc::Address &publisher_address, const std::string &key_id) const { - const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); + const auto publisher_id = UniqueID::FromBinary(publisher_address.worker_id()); auto subscription_it = subscription_map_.find(publisher_id); if (subscription_it == subscription_map_.end()) { - return absl::nullopt; + return std::nullopt; } if (subscription_it->second.all_entities_subscription != nullptr) { return subscription_it->second.all_entities_subscription->failure_cb; } auto callback_it = subscription_it->second.per_entity_subscription.find(key_id); if (callback_it == subscription_it->second.per_entity_subscription.end()) { - return absl::nullopt; + return std::nullopt; } return callback_it->second.failure_cb; } @@ -193,7 +183,7 @@ class SubscriberChannel { const rpc::ChannelType channel_type_; /// Mapping of the publisher ID -> subscription info for the publisher. - absl::flat_hash_map subscription_map_; + absl::flat_hash_map subscription_map_; /// An event loop to execute RPC callbacks. This should be equivalent to the client /// pool's io service. @@ -208,113 +198,6 @@ class SubscriberChannel { mutable uint64_t cum_processed_messages_ = 0; }; -/////////////////////////////////////////////////////////////////////////////// -/// Subscriber Abstraction -/////////////////////////////////////////////////////////////////////////////// - -/// Interface for the pubsub client. -class SubscriberInterface { - public: - /// There are two modes of subscriptions. Each channel can only be subscribed in one - /// mode, i.e. - /// - Calling Subscribe() to subscribe to one or more entities in a channel - /// - Calling SubscribeChannel() once to subscribe to all entities in a channel - /// It is an error to call both Subscribe() and SubscribeChannel() on the same channel - /// type. This restriction can be relaxed later, if there is a use case. 
- - /// Subscribe to entity key_id in channel channel_type. - /// NOTE(sang): All the callbacks could be executed in a different thread from a caller. - /// For example, Subscriber executes callbacks on a passed io_service. - /// - /// \param sub_message The subscription message. - /// \param channel_type The channel to subscribe to. - /// \param publisher_address Address of the publisher to subscribe the object. - /// \param key_id The entity id to subscribe from the publisher. - /// \param subscription_callback A callback that is invoked whenever the given entity - /// information is received by the subscriber. - /// \param subscription_failure_callback A callback that is invoked whenever the - /// connection to publisher is broken (e.g. the publisher fails). - /// \return True if inserted, false if the key already exists and this becomes a no-op. - [[nodiscard]] virtual bool Subscribe( - std::unique_ptr sub_message, - rpc::ChannelType channel_type, - const rpc::Address &publisher_address, - const std::string &key_id, - SubscribeDoneCallback subscribe_done_callback, - SubscriptionItemCallback subscription_callback, - SubscriptionFailureCallback subscription_failure_callback) = 0; - - /// Subscribe to all entities in channel channel_type. - /// - /// \param sub_message The subscription message. - /// \param channel_type The channel to subscribe to. - /// \param publisher_address Address of the publisher to subscribe the object. - /// \param subscription_callback A callback that is invoked whenever an entity - /// information is received by the subscriber. - /// \param subscription_failure_callback A callback that is invoked whenever the - /// connection to publisher is broken (e.g. the publisher fails). - /// \return True if inserted, false if the channel is already subscribed and this - /// becomes a no-op. 
- [[nodiscard]] virtual bool SubscribeChannel( - std::unique_ptr sub_message, - rpc::ChannelType channel_type, - const rpc::Address &publisher_address, - SubscribeDoneCallback subscribe_done_callback, - SubscriptionItemCallback subscription_callback, - SubscriptionFailureCallback subscription_failure_callback) = 0; - - /// Unsubscribe the entity if the entity has been subscribed with Subscribe(). - /// NOTE: Calling this method inside subscription_failure_callback is not allowed. - /// - /// \param channel_type The channel to unsubscribe from. - /// \param publisher_address The publisher address that it will unsubscribe from. - /// \param key_id The entity id to unsubscribe. - /// \return Returns whether the entity key_id has been subscribed before. - virtual bool Unsubscribe(const rpc::ChannelType channel_type, - const rpc::Address &publisher_address, - const std::string &key_id) = 0; - - /// Unsubscribe from the channel_type. Must be paired with SubscribeChannel(). - /// NOTE: Calling this method inside subscription_failure_callback is not allowed. - /// - /// \param channel_type The channel to unsubscribe from. - /// \param publisher_address The publisher address that it will unsubscribe from. - /// \return Returns whether the entity key_id has been subscribed before. - virtual bool UnsubscribeChannel(const rpc::ChannelType channel_type, - const rpc::Address &publisher_address) = 0; - - /// Test only. - /// Checks if the entity key_id is being subscribed to specifically. - /// Does not consider if SubscribeChannel() has been called on the channel. - /// - /// \param publisher_address The publisher address to check. - /// \param key_id The entity id to check. - [[nodiscard]] virtual bool IsSubscribed(const rpc::ChannelType channel_type, - const rpc::Address &publisher_address, - const std::string &key_id) const = 0; - - /// Return the statistics string for the subscriber. 
- virtual std::string DebugString() const = 0; - - virtual ~SubscriberInterface() {} -}; - -/// The grpc client that the subscriber needs. -class SubscriberClientInterface { - public: - /// Send a long polling request to a core worker for pubsub operations. - virtual void PubsubLongPolling( - const rpc::PubsubLongPollingRequest &request, - const rpc::ClientCallback &callback) = 0; - - /// Send a pubsub command batch request to a core worker for pubsub operations. - virtual void PubsubCommandBatch( - const rpc::PubsubCommandBatchRequest &request, - const rpc::ClientCallback &callback) = 0; - - virtual ~SubscriberClientInterface() = default; -}; - /// The pubsub client implementation. The class is thread-safe. /// /// Protocol details: @@ -332,7 +215,7 @@ class SubscriberClientInterface { class Subscriber : public SubscriberInterface { public: Subscriber( - const SubscriberID subscriber_id, + const UniqueID subscriber_id, const std::vector &channels, const int64_t max_command_batch_size, std::function(const rpc::Address &)> @@ -340,39 +223,26 @@ class Subscriber : public SubscriberInterface { instrumented_io_context *callback_service) : subscriber_id_(subscriber_id), max_command_batch_size_(max_command_batch_size), - get_client_(get_client) { + get_client_(std::move(get_client)) { for (auto type : channels) { channels_.emplace(type, std::make_unique(type, callback_service)); } } - ~Subscriber(); - - bool Subscribe(std::unique_ptr sub_message, - const rpc::ChannelType channel_type, + void Subscribe(std::unique_ptr sub_message, + rpc::ChannelType channel_type, const rpc::Address &publisher_address, - const std::string &key_id, + const std::optional &key_id, SubscribeDoneCallback subscribe_done_callback, SubscriptionItemCallback subscription_callback, SubscriptionFailureCallback subscription_failure_callback) override; - bool SubscribeChannel( - std::unique_ptr sub_message, - rpc::ChannelType channel_type, - const rpc::Address &publisher_address, - SubscribeDoneCallback 
subscribe_done_callback, - SubscriptionItemCallback subscription_callback, - SubscriptionFailureCallback subscription_failure_callback) override; - - bool Unsubscribe(const rpc::ChannelType channel_type, + bool Unsubscribe(rpc::ChannelType channel_type, const rpc::Address &publisher_address, - const std::string &key_id) override; + const std::optional &key_id) override; - bool UnsubscribeChannel(const rpc::ChannelType channel_type, - const rpc::Address &publisher_address) override; - - bool IsSubscribed(const rpc::ChannelType channel_type, + bool IsSubscribed(rpc::ChannelType channel_type, const rpc::Address &publisher_address, const std::string &key_id) const override; @@ -394,7 +264,6 @@ class Subscriber : public SubscriberInterface { /// FRIEND_TEST(IntegrationTest, SubscribersToOneIDAndAllIDs); - FRIEND_TEST(IntegrationTest, GcsFailsOver); FRIEND_TEST(SubscriberTest, TestBasicSubscription); FRIEND_TEST(SubscriberTest, TestSingleLongPollingWithMultipleSubscriptions); FRIEND_TEST(SubscriberTest, TestMultiLongPollingWithTheSameSubscription); @@ -407,18 +276,6 @@ class Subscriber : public SubscriberInterface { // Testing only. Check if there are leaks. bool CheckNoLeaks() const ABSL_LOCKS_EXCLUDED(mutex_); - /// - /// Private fields - /// - - bool SubscribeInternal(std::unique_ptr sub_message, - const rpc::ChannelType channel_type, - const rpc::Address &publisher_address, - const std::optional &key_id, - SubscribeDoneCallback subscribe_done_callback, - SubscriptionItemCallback subscription_callback, - SubscriptionFailureCallback subscription_failure_callback); - /// Create a long polling connection to the publisher for receiving the published /// messages. /// NOTE(sang): Note that the subscriber needs to "ensure" that the long polling @@ -454,7 +311,7 @@ class Subscriber : public SubscriberInterface { ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); /// Return true if the given publisher id has subscription to any of channel. 
- bool SubscriptionExists(const PublisherID &publisher_id) + bool SubscriptionExists(const UniqueID &publisher_id) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_) { return std::any_of(channels_.begin(), channels_.end(), [publisher_id](const auto &p) { return p.second->SubscriptionExists(publisher_id); @@ -462,7 +319,7 @@ class Subscriber : public SubscriberInterface { } /// Self node's identifying information. - const SubscriberID subscriber_id_; + const UniqueID subscriber_id_; /// The command batch size for the subscriber. const int64_t max_command_batch_size_; @@ -483,14 +340,14 @@ class Subscriber : public SubscriberInterface { SubscribeDoneCallback done_cb; }; using CommandQueue = std::queue>; - absl::flat_hash_map commands_ ABSL_GUARDED_BY(mutex_); + absl::flat_hash_map commands_ ABSL_GUARDED_BY(mutex_); /// A set to cache the connected publisher ids. "Connected" means the long polling /// request is in flight. - absl::flat_hash_set publishers_connected_ ABSL_GUARDED_BY(mutex_); + absl::flat_hash_set publishers_connected_ ABSL_GUARDED_BY(mutex_); /// A set to keep track of in-flight command batch requests - absl::flat_hash_set command_batch_sent_ ABSL_GUARDED_BY(mutex_); + absl::flat_hash_set command_batch_sent_ ABSL_GUARDED_BY(mutex_); /// Mapping of channel type to channels. absl::flat_hash_map> channels_ @@ -498,7 +355,7 @@ class Subscriber : public SubscriberInterface { /// Keeps track of last processed by publisher. /// Note the publisher_id only change if gcs failover. - absl::flat_hash_map> processed_sequences_ + absl::flat_hash_map> processed_sequences_ ABSL_GUARDED_BY(mutex_); }; diff --git a/src/ray/pubsub/subscriber_interface.h b/src/ray/pubsub/subscriber_interface.h new file mode 100644 index 000000000000..86db9130410e --- /dev/null +++ b/src/ray/pubsub/subscriber_interface.h @@ -0,0 +1,113 @@ +// Copyright 2017 The Ray Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include + +#include "ray/common/id.h" +#include "ray/rpc/client_call.h" +#include "src/ray/protobuf/common.pb.h" +#include "src/ray/protobuf/pubsub.pb.h" + +namespace ray { + +namespace pubsub { + +using SubscribeDoneCallback = std::function; +using SubscriptionItemCallback = std::function; +using SubscriptionFailureCallback = + std::function; + +/// Interface for a subscriber to one or more pubsub channels from a publisher. +class SubscriberInterface { + public: + /// There are two modes of subscriptions. Each channel can only be subscribed in one + /// mode, i.e. + /// - Calling Subscribe() to subscribe to one or more entities in a channel. + /// - Calling Subscribe() once to subscribe to all entities in a channel. + /// NOTE: It is an error to call both Subscribe() to all entities and then only + /// subscribe to one entity on the same channel type. + + /// Subscribe to entity key_id in channel channel_type. + /// NOTE(sang): All the callbacks could be executed in a different thread from a caller. + /// For example, Subscriber executes callbacks on a passed io_service. + /// + /// \param sub_message The subscription message. + /// \param channel_type The channel to subscribe to. + /// \param publisher_address Address of the publisher to subscribe the object. + /// \param key_id The entity id to subscribe from the publisher. 
Subscribes to all + /// entities if nullopt. + /// \param subscription_callback A callback that is invoked whenever the given entity + /// information is received by the subscriber. + /// \param subscription_failure_callback A callback that is invoked whenever the + /// connection to publisher is broken (e.g. the publisher fails). + virtual void Subscribe(std::unique_ptr sub_message, + rpc::ChannelType channel_type, + const rpc::Address &publisher_address, + const std::optional &key_id, + SubscribeDoneCallback subscribe_done_callback, + SubscriptionItemCallback subscription_callback, + SubscriptionFailureCallback subscription_failure_callback) = 0; + + /// Unsubscribe the entity if the entity has been subscribed with Subscribe(). + /// NOTE: Calling this method inside subscription_failure_callback is not allowed. + /// + /// \param channel_type The channel to unsubscribe from. + /// \param publisher_address The publisher address that it will unsubscribe from. + /// \param key_id The entity id to unsubscribe. Unsubscribes from all entities if + /// nullopt. + /// \return Returns whether the entity key_id has been subscribed before. + virtual bool Unsubscribe(rpc::ChannelType channel_type, + const rpc::Address &publisher_address, + const std::optional &key_id) = 0; + + /// Test only. + /// Checks if the entity key_id is being subscribed to specifically. + /// Does not consider if the subscriber is subscribed to all entities in a channel. + /// + /// \param publisher_address The publisher address to check. + /// \param key_id The entity id to check. + virtual bool IsSubscribed(rpc::ChannelType channel_type, + const rpc::Address &publisher_address, + const std::string &key_id) const = 0; + + virtual std::string DebugString() const = 0; + + virtual ~SubscriberInterface() = default; +}; + +/// Interface for the client used by a subscriber. +class SubscriberClientInterface { + public: + /// Send a long polling request to a publisher. 
+ virtual void PubsubLongPolling( + const rpc::PubsubLongPollingRequest &request, + const rpc::ClientCallback &callback) = 0; + + /// Send a pubsub command batch to a publisher. + virtual void PubsubCommandBatch( + const rpc::PubsubCommandBatchRequest &request, + const rpc::ClientCallback &callback) = 0; + + virtual ~SubscriberClientInterface() = default; +}; + +} // namespace pubsub + +} // namespace ray diff --git a/src/ray/pubsub/test/BUILD.bazel b/src/ray/pubsub/tests/BUILD.bazel similarity index 95% rename from src/ray/pubsub/test/BUILD.bazel rename to src/ray/pubsub/tests/BUILD.bazel index 0f19f2b7f356..f1879f8008e6 100644 --- a/src/ray/pubsub/test/BUILD.bazel +++ b/src/ray/pubsub/tests/BUILD.bazel @@ -27,7 +27,7 @@ ray_cc_test( ray_cc_test( name = "pubsub_integration_test", size = "small", - srcs = ["integration_test.cc"], + srcs = ["pubsub_integration_test.cc"], tags = ["team:core"], deps = [ "//src/ray/protobuf:pubsub_cc_grpc", diff --git a/src/ray/pubsub/test/publisher_test.cc b/src/ray/pubsub/tests/publisher_test.cc similarity index 76% rename from src/ray/pubsub/test/publisher_test.cc rename to src/ray/pubsub/tests/publisher_test.cc index 7c09e0598012..17b728a38c4a 100644 --- a/src/ray/pubsub/test/publisher_test.cc +++ b/src/ray/pubsub/tests/publisher_test.cc @@ -19,7 +19,6 @@ #include #include -#include "gmock/gmock.h" #include "gtest/gtest.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/asio/periodical_runner.h" @@ -28,20 +27,16 @@ namespace ray { namespace pubsub { + namespace { const NodeID kDefaultPublisherId = NodeID::FromRandom(); } -using pub_internal::SubscriberState; -using pub_internal::SubscriptionIndex; - class PublisherTest : public ::testing::Test { public: PublisherTest() : periodical_runner_(PeriodicalRunner::Create(io_service_)) {} - ~PublisherTest() {} - - void SetUp() { + void SetUp() override { publisher_ = std::make_shared( /*channels=*/ std::vector{ @@ -60,26 +55,23 @@ class PublisherTest : public 
::testing::Test { request_.set_publisher_id(kDefaultPublisherId.Binary()); } - void TearDown() {} - void ResetSequenceId() { sequence_id_ = 0; } int64_t GetNextSequenceId() { return ++sequence_id_; } - const rpc::PubMessage GeneratePubMessage(const ObjectID &object_id, - int64_t sequence_id = 0) { + rpc::PubMessage GeneratePubMessage(const ObjectID &object_id, int64_t sequence_id = 0) { rpc::PubMessage pub_message; auto *object_eviction_msg = pub_message.mutable_worker_object_eviction_message(); object_eviction_msg->set_object_id(object_id.Binary()); pub_message.set_key_id(object_id.Binary()); pub_message.set_channel_type(rpc::ChannelType::WORKER_OBJECT_EVICTION); - RAY_LOG(INFO) << "message sequence_id is" << sequence_id; + RAY_LOG(INFO) << "message sequence_id is " << sequence_id; pub_message.set_sequence_id(sequence_id); return pub_message; } - const rpc::PubMessage GenerateErrorInfoMessage(const std::string &id, - const std::string &text) { + rpc::PubMessage GenerateErrorInfoMessage(const std::string &id, + const std::string &text) { rpc::PubMessage pub_message; auto *error_msg = pub_message.mutable_error_info_message(); error_msg->set_error_message(text); @@ -88,8 +80,8 @@ class PublisherTest : public ::testing::Test { return pub_message; } - bool HasSubscriber(const std::vector &subscribers, - const SubscriberID &subscriber) { + bool HasSubscriber(const std::vector &subscribers, + const UniqueID &subscriber) { return std::find(subscribers.begin(), subscribers.end(), subscriber) != subscribers.end(); } @@ -115,8 +107,11 @@ class PublisherTest : public ::testing::Test { rpc::SendReplyCallback callback = [pubsub_reply](Status status, std::function success, std::function failure) {}; - subscriber->ConnectToSubscriber(request, pubsub_reply.get(), callback); - subscriber->PublishIfPossible(); + subscriber->ConnectToSubscriber(request, + pubsub_reply->mutable_publisher_id(), + pubsub_reply->mutable_pub_messages(), + callback); + 
subscriber->PublishIfPossible(/*force_noop=*/false); return pubsub_reply; } @@ -128,7 +123,7 @@ class PublisherTest : public ::testing::Test { absl::flat_hash_map> subscribers_map_; const uint64_t subscriber_timeout_ms_ = 30000; double current_time_; - const SubscriberID subscriber_id_ = SubscriberID::FromRandom(); + const UniqueID subscriber_id_ = UniqueID::FromRandom(); rpc::PubsubLongPollingRequest request_; std::vector> subscribers_; int64_t sequence_id_ = 0; @@ -229,7 +224,7 @@ TEST_F(PublisherTest, TestSubscriptionIndexErase) { auto current = it++; auto subscriber_id = *current; oid_subscribers.erase(current); - ASSERT_EQ(subscription_index.EraseEntry(oid.Binary(), subscriber_id), 1); + subscription_index.EraseEntry(oid.Binary(), subscriber_id); i++; } const auto &subscribers_from_index = @@ -269,8 +264,8 @@ TEST_F(PublisherTest, TestSubscriptionIndexEraseMultiSubscribers) { subscription_index.AddEntry(oid.Binary(), subscriber_1); subscription_index.AddEntry(oid2.Binary(), subscriber_1); subscription_index.AddEntry(oid.Binary(), subscriber_2); - ASSERT_TRUE(subscription_index.EraseEntry(oid.Binary(), subscriber_id)); - ASSERT_FALSE(subscription_index.EraseEntry(oid.Binary(), subscriber_id)); + subscription_index.EraseEntry(oid.Binary(), subscriber_id); + subscription_index.EraseEntry(oid.Binary(), subscriber_id); } TEST_F(PublisherTest, TestSubscriptionIndexEraseSubscriber) { @@ -280,7 +275,7 @@ TEST_F(PublisherTest, TestSubscriptionIndexEraseSubscriber) { SubscriptionIndex subscription_index(rpc::ChannelType::RAY_ERROR_INFO_CHANNEL); auto oid = ObjectID::FromRandom(); auto &subscribers = subscribers_map_[oid]; - std::vector subscriber_ids; + std::vector subscriber_ids; // Add entries. 
for (int i = 0; i < 6; i++) { @@ -339,6 +334,7 @@ TEST_F(PublisherTest, TestSubscriptionIndexIdempotency) { TEST_F(PublisherTest, TestSubscriber) { absl::flat_hash_set object_ids_published; + reply = rpc::PubsubLongPollingReply(); send_reply_callback = [this, &object_ids_published](Status status, std::function success, std::function failure) { @@ -348,7 +344,7 @@ TEST_F(PublisherTest, TestSubscriber) { ObjectID::FromBinary(msg.worker_object_eviction_message().object_id()); object_ids_published.emplace(oid); } - reply = rpc::PubsubLongPollingReply(); + reply.Clear(); }; auto subscriber = std::make_shared( @@ -358,79 +354,93 @@ TEST_F(PublisherTest, TestSubscriber) { 10, kDefaultPublisherId); // If there's no connection, it will return false. - ASSERT_FALSE(subscriber->PublishIfPossible()); + subscriber->PublishIfPossible(/*force_noop=*/false); // Try connecting. - subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); // Reconnection should still succeed. - subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); // No result should have been returned. ASSERT_TRUE(object_ids_published.empty()); - // Since there's no objects pending to be published, it should return false. - ASSERT_FALSE(subscriber->PublishIfPossible()); + subscriber->PublishIfPossible(/*force_noop=*/false); + ASSERT_TRUE(object_ids_published.empty()); - absl::flat_hash_set published_objects; + absl::flat_hash_set expected_published_objects; // Make sure publishing one object works as expected. 
auto oid = ObjectID::FromRandom(); subscriber->QueueMessage( - std::make_shared(GeneratePubMessage(oid, GetNextSequenceId())), - /*try_publish=*/false); - published_objects.emplace(oid); - ASSERT_TRUE(subscriber->PublishIfPossible()); + std::make_shared(GeneratePubMessage(oid, GetNextSequenceId()))); + expected_published_objects.emplace(oid); + subscriber->PublishIfPossible(/*force_noop=*/false); ASSERT_TRUE(object_ids_published.contains(oid)); // No object is pending to be published, and there's no connection. - ASSERT_FALSE(subscriber->PublishIfPossible()); + subscriber->PublishIfPossible(/*force_noop=*/false); // Add 3 oids and see if it works properly. for (int i = 0; i < 3; i++) { oid = ObjectID::FromRandom(); subscriber->QueueMessage( - std::make_shared(GeneratePubMessage(oid, GetNextSequenceId())), - /*try_publish=*/false); - published_objects.emplace(oid); + std::make_shared(GeneratePubMessage(oid, GetNextSequenceId()))); + expected_published_objects.emplace(oid); } // Since there's no connection, objects won't be published. - ASSERT_FALSE(subscriber->PublishIfPossible()); - subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); - for (auto cur_oid : published_objects) { - ASSERT_TRUE(object_ids_published.contains(cur_oid)); - } + subscriber->PublishIfPossible(/*force_noop=*/false); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); + ASSERT_EQ(expected_published_objects, object_ids_published); // Queue is not cleaned up if max_processed_sequence_id hasn't // been set properly. request_.set_max_processed_sequence_id(1); - subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); ASSERT_FALSE(subscriber->CheckNoLeaks()); // If we set wrong publisher_id, the queue won't be cleaned up. 
request_.set_publisher_id(NodeID::FromRandom().Binary()); request_.set_max_processed_sequence_id(sequence_id_); - subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); ASSERT_FALSE(subscriber->CheckNoLeaks()); // By sending back max_processed_sequence_id, the subscriber's sending queue // is cleaned up. request_.set_max_processed_sequence_id(sequence_id_); request_.set_publisher_id(kDefaultPublisherId.Binary()); - subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); ASSERT_TRUE(subscriber->CheckNoLeaks()); } TEST_F(PublisherTest, TestSubscriberBatchSize) { absl::flat_hash_set object_ids_published; - int64_t max_processed_seuquence_id = 0; - send_reply_callback = - [this, &object_ids_published, &max_processed_seuquence_id]( - Status status, std::function success, std::function failure) { - for (int i = 0; i < reply.pub_messages_size(); i++) { - const auto &msg = reply.pub_messages(i); - const auto oid = - ObjectID::FromBinary(msg.worker_object_eviction_message().object_id()); - object_ids_published.emplace(oid); - max_processed_seuquence_id = - std::max(msg.sequence_id(), max_processed_seuquence_id); - } - reply = rpc::PubsubLongPollingReply(); - }; + int64_t max_processed_sequence_id = 0; + send_reply_callback = [this, &object_ids_published, &max_processed_sequence_id]( + Status status, + std::function success, + std::function failure) { + for (int i = 0; i < reply.pub_messages_size(); i++) { + const auto &msg = reply.pub_messages(i); + const auto oid = + ObjectID::FromBinary(msg.worker_object_eviction_message().object_id()); + object_ids_published.emplace(oid); + max_processed_sequence_id = std::max(msg.sequence_id(), max_processed_sequence_id); + } + reply = 
rpc::PubsubLongPollingReply(); + }; auto max_publish_size = 5; auto subscriber = std::make_shared( @@ -439,21 +449,19 @@ TEST_F(PublisherTest, TestSubscriberBatchSize) { subscriber_timeout_ms_, max_publish_size, kDefaultPublisherId); - subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); - absl::flat_hash_set published_objects; std::vector oids; for (int i = 0; i < 10; i++) { auto oid = ObjectID::FromRandom(); oids.push_back(oid); subscriber->QueueMessage( - std::make_shared(GeneratePubMessage(oid, GetNextSequenceId())), - /*try_publish=*/false); - published_objects.emplace(oid); + std::make_shared(GeneratePubMessage(oid, GetNextSequenceId()))); } - // Make sure only up to batch size is published. - ASSERT_TRUE(subscriber->PublishIfPossible()); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); for (int i = 0; i < max_publish_size; i++) { ASSERT_TRUE(object_ids_published.contains(oids[i])); @@ -463,9 +471,12 @@ TEST_F(PublisherTest, TestSubscriberBatchSize) { } // Remaining messages are published upon polling. 
- ASSERT_EQ(max_processed_seuquence_id, max_publish_size); - request_.set_max_processed_sequence_id(max_processed_seuquence_id); - subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); + ASSERT_EQ(max_processed_sequence_id, max_publish_size); + request_.set_max_processed_sequence_id(max_processed_sequence_id); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); for (int i = 0; i < 10; i++) { ASSERT_TRUE(object_ids_published.contains(oids[i])); } @@ -488,7 +499,10 @@ TEST_F(PublisherTest, TestSubscriberActiveTimeout) { 10, kDefaultPublisherId); - subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); // Connection is not timed out yet. ASSERT_TRUE(subscriber->IsActive()); @@ -510,7 +524,10 @@ TEST_F(PublisherTest, TestSubscriberActiveTimeout) { // New connection is established. reply = rpc::PubsubLongPollingReply(); - subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); ASSERT_TRUE(subscriber->IsActive()); ASSERT_TRUE(subscriber->ConnectionExists()); @@ -539,7 +556,10 @@ TEST_F(PublisherTest, TestSubscriberActiveTimeout) { // Notify that message 1 is safe to be GCed. request_.set_max_processed_sequence_id(1); reply = rpc::PubsubLongPollingReply(); - subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); ASSERT_TRUE(subscriber->CheckNoLeaks()); } @@ -561,8 +581,11 @@ TEST_F(PublisherTest, TestSubscriberDisconnected) { kDefaultPublisherId); // Suppose the new connection is removed. 
- subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); - subscriber->PublishIfPossible(/*force*/ true); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); + subscriber->PublishIfPossible(/*force_noop=*/true); ASSERT_EQ(reply_cnt, 1); ASSERT_TRUE(subscriber->IsActive()); ASSERT_FALSE(subscriber->ConnectionExists()); @@ -579,8 +602,11 @@ TEST_F(PublisherTest, TestSubscriberDisconnected) { ASSERT_FALSE(subscriber->ConnectionExists()); // New connection is coming in. - subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); - subscriber->PublishIfPossible(/*force*/ true); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); + subscriber->PublishIfPossible(/*force_noop=*/true); ASSERT_EQ(reply_cnt, 2); // Some time has passed, but it is not timed out yet. @@ -590,8 +616,11 @@ TEST_F(PublisherTest, TestSubscriberDisconnected) { // Another connection is made, so it shouldn't timeout until the next timeout is // reached. - subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); - subscriber->PublishIfPossible(/*force*/ true); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); + subscriber->PublishIfPossible(/*force_noop=*/true); ASSERT_EQ(reply_cnt, 3); current_time_ += subscriber_timeout_ms_ / 2; ASSERT_TRUE(subscriber->IsActive()); @@ -623,15 +652,21 @@ TEST_F(PublisherTest, TestSubscriberTimeoutComplicated) { kDefaultPublisherId); // Suppose the new connection is removed. 
- subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); - subscriber->PublishIfPossible(/*force*/ true); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); + subscriber->PublishIfPossible(/*force_noop=*/true); ASSERT_EQ(reply_cnt, 1); ASSERT_TRUE(subscriber->IsActive()); ASSERT_FALSE(subscriber->ConnectionExists()); // Some time has passed, and the connection is removed. current_time_ += subscriber_timeout_ms_ - 1; - subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); + subscriber->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); current_time_ += 2; // Timeout shouldn't happen because the connection has been refreshed. ASSERT_TRUE(subscriber->IsActive()); @@ -640,7 +675,7 @@ TEST_F(PublisherTest, TestSubscriberTimeoutComplicated) { // Right before the timeout, connection is removed. In this case, timeout shouldn't also // happen. current_time_ += subscriber_timeout_ms_ - 1; - subscriber->PublishIfPossible(/*force*/ true); + subscriber->PublishIfPossible(/*force_noop=*/true); current_time_ += 2; ASSERT_TRUE(subscriber->IsActive()); ASSERT_FALSE(subscriber->ConnectionExists()); @@ -670,7 +705,10 @@ TEST_F(PublisherTest, TestBasicSingleSubscriber) { const auto oid = ObjectID::FromRandom(); - publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); publisher_->RegisterSubscription( rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); publisher_->Publish(GeneratePubMessage(oid, 0)); @@ -698,7 +736,10 @@ TEST_F(PublisherTest, TestNoConnectionWhenRegistered) { publisher_->Publish(GeneratePubMessage(oid)); // Nothing has been published because there's no connection. 
ASSERT_EQ(batched_ids.size(), 0); - publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); // When the connection is coming, it should be published. ASSERT_EQ(batched_ids[0], oid); } @@ -729,7 +770,10 @@ TEST_F(PublisherTest, TestMultiObjectsFromSingleNode) { ASSERT_EQ(batched_ids.size(), 0); // Now connection is initiated, and all oids are published. - publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); for (int i = 0; i < num_oids; i++) { const auto oid_test = oids[i]; const auto published_oid = batched_ids[i]; @@ -770,7 +814,10 @@ TEST_F(PublisherTest, TestMultiObjectsFromMultiNodes) { // Check all of nodes are publishing objects properly. for (int i = 0; i < num_nodes; i++) { - publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); const auto oid_test = oids[i]; const auto published_oid = batched_ids[i]; ASSERT_EQ(oid_test, published_oid); @@ -780,6 +827,7 @@ TEST_F(PublisherTest, TestMultiObjectsFromMultiNodes) { TEST_F(PublisherTest, TestMultiSubscribers) { absl::flat_hash_set batched_ids; int reply_invoked = 0; + reply = rpc::PubsubLongPollingReply(); send_reply_callback = [this, &batched_ids, &reply_invoked]( Status status, std::function success, std::function failure) { @@ -789,7 +837,7 @@ TEST_F(PublisherTest, TestMultiSubscribers) { ObjectID::FromBinary(msg.worker_object_eviction_message().object_id()); batched_ids.emplace(oid); } - reply = rpc::PubsubLongPollingReply(); + reply.Clear(); reply_invoked += 1; }; @@ -809,7 +857,10 @@ TEST_F(PublisherTest, TestMultiSubscribers) { // Check all of nodes are publishing 
objects properly. for (int i = 0; i < num_nodes; i++) { - publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); } publisher_->Publish(GeneratePubMessage(oid)); ASSERT_EQ(batched_ids.size(), 1); @@ -847,7 +898,10 @@ TEST_F(PublisherTest, TestBatch) { // Now connection is initiated, and all oids are published. request_.set_max_processed_sequence_id(max_processed_sequence_id); - publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); for (int i = 0; i < num_oids; i++) { const auto oid_test = oids[i]; const auto published_oid = batched_ids[i]; @@ -865,7 +919,10 @@ TEST_F(PublisherTest, TestBatch) { publisher_->Publish(GeneratePubMessage(oid)); } request_.set_max_processed_sequence_id(max_processed_sequence_id); - publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); ASSERT_EQ(num_oids, oids.size()); ASSERT_EQ(num_oids, batched_ids.size()); for (int i = 0; i < num_oids; i++) { @@ -884,7 +941,10 @@ TEST_F(PublisherTest, TestNodeFailureWhenConnectionExisted) { }; const auto oid = ObjectID::FromRandom(); - publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); // This information should be cleaned up as the subscriber is dead. 
publisher_->RegisterSubscription( rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); @@ -900,8 +960,7 @@ TEST_F(PublisherTest, TestNodeFailureWhenConnectionExisted) { publisher_->CheckDeadSubscribers(); // Connection should be replied (removed) when the subscriber is unregistered. - int erased = publisher_->UnregisterSubscriber(subscriber_id_); - ASSERT_EQ(erased, 0); + publisher_->UnregisterSubscriber(subscriber_id_); ASSERT_TRUE(publisher_->CheckNoLeaks()); // New subscriber is registsered for some reason. Since there's no new long polling @@ -911,8 +970,7 @@ TEST_F(PublisherTest, TestNodeFailureWhenConnectionExisted) { rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); current_time_ += subscriber_timeout_ms_; publisher_->CheckDeadSubscribers(); - erased = publisher_->UnregisterSubscriber(subscriber_id_); - ASSERT_EQ(erased, 0); + publisher_->UnregisterSubscriber(subscriber_id_); ASSERT_TRUE(publisher_->CheckNoLeaks()); } @@ -935,7 +993,10 @@ TEST_F(PublisherTest, TestNodeFailureWhenConnectionDoesntExist) { ASSERT_EQ(long_polling_connection_replied, false); // Connect should be removed eventually to avoid having a memory leak. - publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); ASSERT_EQ(long_polling_connection_replied, true); // Nothing happens at first. 
publisher_->CheckDeadSubscribers(); @@ -970,30 +1031,28 @@ TEST_F(PublisherTest, TestUnregisterSubscription) { }; const auto oid = ObjectID::FromRandom(); - publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); publisher_->RegisterSubscription( rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); ASSERT_EQ(long_polling_connection_replied, false); // Connection should be replied (removed) when the subscriber is unregistered. - int erased = publisher_->UnregisterSubscription( + publisher_->UnregisterSubscription( rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); - ASSERT_EQ(erased, 1); ASSERT_EQ(long_polling_connection_replied, false); // Make sure when the entries don't exist, it doesn't delete anything. - ASSERT_EQ(publisher_->UnregisterSubscription(rpc::ChannelType::WORKER_OBJECT_EVICTION, - subscriber_id_, - ObjectID::FromRandom().Binary()), - 0); - ASSERT_EQ( - publisher_->UnregisterSubscription( - rpc::ChannelType::WORKER_OBJECT_EVICTION, NodeID::FromRandom(), oid.Binary()), - 0); - ASSERT_EQ(publisher_->UnregisterSubscription(rpc::ChannelType::WORKER_OBJECT_EVICTION, - NodeID::FromRandom(), - ObjectID::FromRandom().Binary()), - 0); + publisher_->UnregisterSubscription(rpc::ChannelType::WORKER_OBJECT_EVICTION, + subscriber_id_, + ObjectID::FromRandom().Binary()); + publisher_->UnregisterSubscription( + rpc::ChannelType::WORKER_OBJECT_EVICTION, NodeID::FromRandom(), oid.Binary()); + publisher_->UnregisterSubscription(rpc::ChannelType::WORKER_OBJECT_EVICTION, + NodeID::FromRandom(), + ObjectID::FromRandom().Binary()); ASSERT_EQ(long_polling_connection_replied, false); // Metadata won't be removed until we unregsiter the subscriber. publisher_->UnregisterSubscriber(subscriber_id_); @@ -1011,28 +1070,31 @@ TEST_F(PublisherTest, TestUnregisterSubscriber) { // Test basic. 
const auto oid = ObjectID::FromRandom(); - publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); publisher_->RegisterSubscription( rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); ASSERT_EQ(long_polling_connection_replied, false); - int erased = publisher_->UnregisterSubscriber(subscriber_id_); - ASSERT_TRUE(erased); + publisher_->UnregisterSubscriber(subscriber_id_); // Make sure the long polling request is replied to avoid memory leak. ASSERT_EQ(long_polling_connection_replied, true); // Test when registration wasn't done. long_polling_connection_replied = false; - publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); - erased = publisher_->UnregisterSubscriber(subscriber_id_); - ASSERT_FALSE(erased); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); + publisher_->UnregisterSubscriber(subscriber_id_); ASSERT_EQ(long_polling_connection_replied, true); // Test when connect wasn't done. long_polling_connection_replied = false; publisher_->RegisterSubscription( rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); - erased = publisher_->UnregisterSubscriber(subscriber_id_); - ASSERT_TRUE(erased); + publisher_->UnregisterSubscriber(subscriber_id_); ASSERT_EQ(long_polling_connection_replied, false); ASSERT_TRUE(publisher_->CheckNoLeaks()); } @@ -1040,25 +1102,93 @@ TEST_F(PublisherTest, TestUnregisterSubscriber) { // Test if registration / unregistration is idempotent. 
TEST_F(PublisherTest, TestRegistrationIdempotency) { const auto oid = ObjectID::FromRandom(); - ASSERT_TRUE(publisher_->RegisterSubscription( - rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary())); - ASSERT_FALSE(publisher_->RegisterSubscription( - rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary())); - ASSERT_FALSE(publisher_->RegisterSubscription( - rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary())); - ASSERT_FALSE(publisher_->RegisterSubscription( - rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary())); - ASSERT_FALSE(publisher_->CheckNoLeaks()); - ASSERT_TRUE(publisher_->UnregisterSubscription( - rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary())); - ASSERT_FALSE(publisher_->UnregisterSubscription( - rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary())); + + // Double register and assert publish + publisher_->RegisterSubscription( + rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); + publisher_->RegisterSubscription( + rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); + publisher_->ConnectToSubscriber( + request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + [](Status, std::function, std::function) {}); + publisher_->Publish(GeneratePubMessage(oid)); + ASSERT_EQ(reply.publisher_id(), kDefaultPublisherId.Binary()); + ASSERT_EQ(reply.pub_messages().size(), 1); + reply = rpc::PubsubLongPollingReply(); + + // Reconnect, unregister and assert no publish messages + request_.set_max_processed_sequence_id(1); + publisher_->ConnectToSubscriber( + request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + [](Status, std::function, std::function) {}); + publisher_->UnregisterSubscription( + rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); + publisher_->UnregisterSubscription( + rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, 
oid.Binary()); + auto pub_message = GeneratePubMessage(oid); + publisher_->Publish(pub_message); + ASSERT_TRUE(reply.pub_messages().empty()); ASSERT_TRUE(publisher_->CheckNoLeaks()); - ASSERT_TRUE(publisher_->RegisterSubscription( - rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary())); + + // Register and connect. Then unregister a couple times and make sure there's no + // publish. + publisher_->RegisterSubscription( + rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); + publisher_->ConnectToSubscriber( + request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + [](Status, std::function, std::function) {}); ASSERT_FALSE(publisher_->CheckNoLeaks()); - ASSERT_TRUE(publisher_->UnregisterSubscription( - rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary())); + publisher_->UnregisterSubscriber(subscriber_id_); + publisher_->UnregisterSubscriber(subscriber_id_); + publisher_->UnregisterSubscription( + rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); + ASSERT_TRUE(publisher_->CheckNoLeaks()); + publisher_->Publish(GeneratePubMessage(oid)); + ASSERT_TRUE(reply.pub_messages().empty()); +} + +TEST_F(PublisherTest, TestSubscriberLostAPublish) { + const auto oid = ObjectID::FromRandom(); + send_reply_callback = [](Status, std::function, std::function) {}; + + // Subscriber registers and connects and publisher publishes. 
+ publisher_->RegisterSubscription( + rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); + publisher_->Publish(GeneratePubMessage(oid)); + ASSERT_EQ(reply.pub_messages().size(), 1); + reply = rpc::PubsubLongPollingReply(); + + // The publisher publishes while there's no active request, then the Subscriber retries + // the LongPollingRequest with the same max_sequence_id since it lost the reply from the + // publisher. The subscriber should get both the 1st and 2nd messages. + publisher_->Publish(GeneratePubMessage(oid)); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); + ASSERT_EQ(reply.pub_messages().size(), 2); + auto max_processed = reply.pub_messages(1).sequence_id(); + reply = rpc::PubsubLongPollingReply(); + + // Subscriber got the reply this time, sends another request with a higher + // max_sequence_id, and then the publisher publishes. 
+ request_.set_max_processed_sequence_id(max_processed); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); + publisher_->Publish(GeneratePubMessage(oid)); + ASSERT_EQ(reply.pub_messages().size(), 1); } TEST_F(PublisherTest, TestPublishFailure) { @@ -1082,7 +1212,10 @@ TEST_F(PublisherTest, TestPublishFailure) { const auto oid = ObjectID::FromRandom(); - publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); + publisher_->ConnectToSubscriber(request_, + reply.mutable_publisher_id(), + reply.mutable_pub_messages(), + send_reply_callback); publisher_->RegisterSubscription( rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); publisher_->PublishFailure(rpc::ChannelType::WORKER_OBJECT_EVICTION, oid.Binary()); diff --git a/src/ray/pubsub/test/integration_test.cc b/src/ray/pubsub/tests/pubsub_integration_test.cc similarity index 94% rename from src/ray/pubsub/test/integration_test.cc rename to src/ray/pubsub/tests/pubsub_integration_test.cc index 6884b46b64a2..4f88012c91d1 100644 --- a/src/ray/pubsub/test/integration_test.cc +++ b/src/ray/pubsub/tests/pubsub_integration_test.cc @@ -47,7 +47,8 @@ class SubscriberServiceImpl final : public rpc::SubscriberService::CallbackServi rpc::PubsubLongPollingReply *reply) override { auto *reactor = context->DefaultReactor(); publisher_->ConnectToSubscriber(*request, - reply, + reply->mutable_publisher_id(), + reply->mutable_pub_messages(), [reactor](ray::Status status, std::function success_cb, std::function failure_cb) { @@ -131,6 +132,8 @@ class CallbackSubscriberClient final : public pubsub::SubscriberClientInterface }); } + std::string DebugString() const { return ""; } + private: std::unique_ptr stub_; }; @@ -236,10 +239,11 @@ TEST_F(IntegrationTest, SubscribersToOneIDAndAllIDs) { std::vector actors_2; auto subscriber_2 = CreateSubscriber(); - subscriber_2->SubscribeChannel( + subscriber_2->Subscribe( 
std::make_unique(), rpc::ChannelType::GCS_ACTOR_CHANNEL, address_proto_, + /*key_id=*/std::nullopt, /*subscribe_done_callback=*/ [&counter](Status status) { RAY_CHECK_OK(status); @@ -290,7 +294,9 @@ TEST_F(IntegrationTest, SubscribersToOneIDAndAllIDs) { subscriber_1->Unsubscribe( rpc::ChannelType::GCS_ACTOR_CHANNEL, address_proto_, subscribed_actor); - subscriber_2->UnsubscribeChannel(rpc::ChannelType::GCS_ACTOR_CHANNEL, address_proto_); + subscriber_2->Unsubscribe(rpc::ChannelType::GCS_ACTOR_CHANNEL, + address_proto_, + /*key_id=*/std::nullopt); // Waiting here is necessary to avoid invalid memory access during shutdown. // TODO(mwtian): cancel inflight polls during subscriber shutdown, and remove the @@ -298,11 +304,15 @@ TEST_F(IntegrationTest, SubscribersToOneIDAndAllIDs) { int wait_count = 0; while (!(subscriber_1->CheckNoLeaks() && subscriber_2->CheckNoLeaks())) { // Flush all the inflight long polling. - subscriber_service_->GetPublisher().UnregisterAll(); + subscriber_service_->GetPublisher().UnregisterSubscriber( + subscriber_1->subscriber_id_); + subscriber_service_->GetPublisher().UnregisterSubscriber( + subscriber_2->subscriber_id_); ASSERT_LT(wait_count, 60) << "Subscribers still have inflight operations after 60s"; ++wait_count; absl::SleepFor(absl::Seconds(1)); } } + } // namespace pubsub } // namespace ray diff --git a/src/ray/pubsub/test/subscriber_test.cc b/src/ray/pubsub/tests/subscriber_test.cc similarity index 95% rename from src/ray/pubsub/test/subscriber_test.cc rename to src/ray/pubsub/tests/subscriber_test.cc index 1453ec9409da..c2b212c13ce9 100644 --- a/src/ray/pubsub/test/subscriber_test.cc +++ b/src/ray/pubsub/tests/subscriber_test.cc @@ -128,7 +128,7 @@ class MockWorkerClient : public pubsub::SubscriberClientInterface { std::queue requests_; int64_t sequence_id_ = 0; int64_t max_processed_sequence_id_ = 0; - std::string publisher_id_ = pubsub::PublisherID::FromRandom().Binary(); + std::string publisher_id_ = 
UniqueID::FromRandom().Binary(); }; namespace pubsub { @@ -165,7 +165,7 @@ class SubscriberTest : public ::testing::Test { const std::string address = "abc", const int port = 1234) { rpc::Address addr; - addr.set_raylet_id(node_id); + addr.set_node_id(node_id); addr.set_ip_address(address); addr.set_port(port); addr.set_worker_id(worker_id); @@ -272,12 +272,13 @@ TEST_F(SubscriberTest, TestIgnoreOutofOrderMessage) { const auto owner_addr = GenerateOwnerAddress(); const auto object_id = ObjectID::FromRandom(); const auto object_id1 = ObjectID::FromRandom(); - subscriber_->SubscribeChannel(std::make_unique(), - channel, - owner_addr, - /*subscribe_done_callback=*/nullptr, - subscription_callback, - failure_callback); + subscriber_->Subscribe(std::make_unique(), + channel, + owner_addr, + /*key_id=*/std::nullopt, + /*subscribe_done_callback=*/nullptr, + subscription_callback, + failure_callback); ASSERT_TRUE(owner_client->ReplyCommandBatch()); std::vector objects_batched; @@ -318,12 +319,13 @@ TEST_F(SubscriberTest, TestPublisherFailsOver) { const auto owner_addr = GenerateOwnerAddress(); const auto object_id = ObjectID::FromRandom(); const auto object_id1 = ObjectID::FromRandom(); - subscriber_->SubscribeChannel(std::make_unique(), - channel, - owner_addr, - /*subscribe_done_callback=*/nullptr, - subscription_callback, - failure_callback); + subscriber_->Subscribe(std::make_unique(), + channel, + owner_addr, + /*key_id=*/std::nullopt, + /*subscribe_done_callback=*/nullptr, + subscription_callback, + failure_callback); ASSERT_TRUE(owner_client->ReplyCommandBatch()); std::vector objects_batched; @@ -456,9 +458,9 @@ TEST_F(SubscriberTest, TestCallbackNotInvokedForNonSubscribedObject) { ASSERT_EQ(object_subscribed_[object_id], 0); } -TEST_F(SubscriberTest, TestSubscribeChannelEntities) { +TEST_F(SubscriberTest, TestSubscribeAllEntities) { /// - /// Make sure SubscribeChannel() can receive all entities from a channel. 
+ /// Make sure Subscribe() can receive all entities from a channel. /// auto subscription_callback = [this](const rpc::PubMessage &msg) { @@ -467,12 +469,13 @@ TEST_F(SubscriberTest, TestSubscribeChannelEntities) { auto failure_callback = EMPTY_FAILURE_CALLBACK; const auto owner_addr = GenerateOwnerAddress(); - subscriber_->SubscribeChannel(std::make_unique(), - channel, - owner_addr, - /*subscribe_done_callback=*/nullptr, - subscription_callback, - failure_callback); + subscriber_->Subscribe(std::make_unique(), + channel, + owner_addr, + /*key_id=*/std::nullopt, + /*subscribe_done_callback=*/nullptr, + subscription_callback, + failure_callback); ASSERT_TRUE(owner_client->ReplyCommandBatch()); ASSERT_EQ(owner_client->GetNumberOfInFlightLongPollingRequests(), 1); @@ -501,7 +504,7 @@ TEST_F(SubscriberTest, TestSubscribeChannelEntities) { } // Unsubscribe from the channel. - ASSERT_TRUE(subscriber_->UnsubscribeChannel(channel, owner_addr)); + ASSERT_TRUE(subscriber_->Unsubscribe(channel, owner_addr, /*key_id=*/std::nullopt)); } TEST_F(SubscriberTest, TestIgnoreBatchAfterUnsubscription) { @@ -549,14 +552,15 @@ TEST_F(SubscriberTest, TestIgnoreBatchAfterUnsubscribeFromAll) { auto failure_callback = EMPTY_FAILURE_CALLBACK; const auto owner_addr = GenerateOwnerAddress(); - subscriber_->SubscribeChannel(std::make_unique(), - channel, - owner_addr, - /*subscribe_done_callback=*/nullptr, - subscription_callback, - failure_callback); + subscriber_->Subscribe(std::make_unique(), + channel, + owner_addr, + /*key_id=*/std::nullopt, + /*subscribe_done_callback=*/nullptr, + subscription_callback, + failure_callback); ASSERT_TRUE(owner_client->ReplyCommandBatch()); - ASSERT_TRUE(subscriber_->UnsubscribeChannel(channel, owner_addr)); + ASSERT_TRUE(subscriber_->Unsubscribe(channel, owner_addr, /*key_id=*/std::nullopt)); ASSERT_TRUE(owner_client->ReplyCommandBatch()); const auto object_id = ObjectID::FromRandom(); @@ -979,9 +983,6 @@ TEST_F(SubscriberTest, TestIsSubscribed) { 
ASSERT_FALSE(subscriber_->IsSubscribed(channel, owner_addr, object_id.Binary())); } -// TODO(sang): Need to add a network failure test once we support network failure -// properly. - } // namespace pubsub } // namespace ray diff --git a/src/ray/raylet/BUILD.bazel b/src/ray/raylet/BUILD.bazel index c06b8e9f3a4e..5e70ab42e5f4 100644 --- a/src/ray/raylet/BUILD.bazel +++ b/src/ray/raylet/BUILD.bazel @@ -9,7 +9,6 @@ ray_cc_library( "//src/ray/common:id", "//src/ray/common:ray_config", "//src/ray/protobuf:gcs_cc_proto", - "//src/ray/util", "//src/ray/util:event", "//src/ray/util:logging", "//src/ray/util:process", @@ -19,13 +18,12 @@ ray_cc_library( ) ray_cc_library( - name = "dependency_manager", - srcs = ["dependency_manager.cc"], - hdrs = ["dependency_manager.h"], + name = "lease_dependency_manager", + srcs = ["lease_dependency_manager.cc"], + hdrs = ["lease_dependency_manager.h"], visibility = [":__subpackages__"], deps = [ "//src/ray/common:id", - "//src/ray/common:task_common", "//src/ray/object_manager", "//src/ray/util:counter_map", "@com_google_absl//absl/container:flat_hash_map", @@ -35,19 +33,21 @@ ray_cc_library( # TODO(edoakes): looks like this belongs under scheduling/... 
ray_cc_library( - name = "local_task_manager", - srcs = ["local_task_manager.cc"], - hdrs = ["local_task_manager.h"], + name = "local_lease_manager", + srcs = ["local_lease_manager.cc"], + hdrs = ["local_lease_manager.h"], visibility = [":__subpackages__"], deps = [ - ":dependency_manager", + ":lease_dependency_manager", ":worker", ":worker_pool", + "//src/ray/common:lease", "//src/ray/common:ray_object", - "//src/ray/common:task_common", + "//src/ray/common/scheduling:cluster_resource_data", + "//src/ray/common/scheduling:placement_group_util", "//src/ray/object_manager:object_manager_common", "//src/ray/raylet/scheduling:cluster_resource_scheduler", - "//src/ray/raylet/scheduling:local_task_manager_interface", + "//src/ray/raylet/scheduling:local_lease_manager_interface", "//src/ray/raylet/scheduling:scheduler_internal", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -61,9 +61,8 @@ ray_cc_library( visibility = [":__subpackages__"], deps = [ "//src/ray/common:id", - "//src/ray/common:task_common", + "//src/ray/gcs_client", "//src/ray/raylet/scheduling:cluster_resource_scheduler", - "//src/ray/util", "//src/ray/util:container_util", "@com_google_absl//absl/container:flat_hash_map", ], @@ -87,7 +86,7 @@ ray_cc_library( visibility = [":__subpackages__"], deps = [ "//src/ray/common:id", - "//src/ray/common:task_common", + "//src/ray/common:lease", "//src/ray/flatbuffers:node_manager_generated", "//src/ray/ipc:client_connection", "//src/ray/raylet/scheduling:cluster_resource_scheduler", @@ -108,13 +107,15 @@ ray_cc_library( ":runtime_env_agent_client", ":worker", "//src/ray/common:constants", + "//src/ray/common:lease", + "//src/ray/common:protobuf_utils", "//src/ray/common:ray_config", "//src/ray/common:runtime_env", "//src/ray/common:status", - "//src/ray/common:task_common", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/gcs_client", "//src/ray/ipc:client_connection", "//src/ray/util:network_util", + 
"//src/ray/util:time", "@boost//:system", "@com_google_absl//absl/strings", ], @@ -133,6 +134,8 @@ ray_cc_library( "//src/ray/protobuf:gcs_cc_proto", "//src/ray/protobuf:runtime_env_agent_cc_proto", "//src/ray/util:logging", + "//src/ray/util:process", + "//src/ray/util:time", "@boost//:beast", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings:str_format", @@ -160,12 +163,13 @@ ray_cc_library( ":worker_pool", "//src/ray/common:id", "//src/ray/common:ray_object", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/gcs_client", "//src/ray/object_manager:object_directory", "//src/ray/object_manager:object_manager_common", "//src/ray/protobuf:node_manager_cc_proto", - "//src/ray/pubsub:subscriber", + "//src/ray/pubsub:subscriber_interface", "//src/ray/rpc:core_worker_client", + "//src/ray/util:time", ], ) @@ -212,9 +216,9 @@ ray_cc_library( visibility = [":__subpackages__"], deps = [ ":agent_manager", - ":dependency_manager", + ":lease_dependency_manager", + ":local_lease_manager", ":local_object_manager_interface", - ":local_task_manager", ":placement_group_resource_manager", ":runtime_env_agent_client", ":wait_manager", @@ -222,25 +226,25 @@ ray_cc_library( ":worker_killing_policy", ":worker_pool", "//src/ray/common:buffer", + "//src/ray/common:flatbuf_utils", + "//src/ray/common:lease", "//src/ray/common:memory_monitor", "//src/ray/core_worker:experimental_mutable_object_provider", "//src/ray/flatbuffers:node_manager_generated", - "//src/ray/gcs", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/gcs_client", "//src/ray/object_manager", "//src/ray/object_manager:ownership_object_directory", "//src/ray/object_manager/plasma:plasma_client", - "//src/ray/pubsub:publisher", "//src/ray/pubsub:subscriber", "//src/ray/raylet/scheduling:scheduler", "//src/ray/rpc:core_worker_client", - "//src/ray/rpc:node_manager_client", "//src/ray/rpc:node_manager_server", "//src/ray/stats:stats_lib", "//src/ray/util:cmd_line_utils", 
"//src/ray/util:container_util", "//src/ray/util:network_util", "//src/ray/util:throttler", + "//src/ray/util:time", "@boost//:system", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_map", @@ -262,6 +266,7 @@ ray_cc_library( "//src/ray/common:asio", "//src/ray/object_manager", "//src/ray/util:network_util", + "//src/ray/util:time", "@boost//:asio", ], ) @@ -275,19 +280,27 @@ ray_cc_binary( ":local_object_manager_interface", ":raylet_lib", "//src/ray/common:asio", + "//src/ray/common:lease", "//src/ray/common:ray_config", "//src/ray/common:status", - "//src/ray/common:task_common", - "//src/ray/common/cgroup:cgroup_manager", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/common/cgroup2:cgroup_manager", + "//src/ray/common/cgroup2:sysfs_cgroup_driver", + "//src/ray/core_worker:metrics", + "//src/ray/gcs_client", "//src/ray/object_manager:ownership_object_directory", - "//src/ray/raylet/scheduling:cluster_task_manager", + "//src/ray/raylet/scheduling:cluster_lease_manager", + "//src/ray/rpc:metrics_agent_client", + "//src/ray/rpc:raylet_client_lib", + "//src/ray/rpc:raylet_client_pool", + "//src/ray/rpc/object_manager:object_manager_client", "//src/ray/stats:stats_lib", "//src/ray/util:cmd_line_utils", "//src/ray/util:event", "//src/ray/util:process", + "//src/ray/util:raii", "//src/ray/util:stream_redirection", "//src/ray/util:stream_redirection_options", + "//src/ray/util:time", "@com_github_gflags_gflags//:gflags", "@nlohmann_json", ], diff --git a/src/ray/raylet/agent_manager.cc b/src/ray/raylet/agent_manager.cc index ebf55761812a..26e142d824a1 100644 --- a/src/ray/raylet/agent_manager.cc +++ b/src/ray/raylet/agent_manager.cc @@ -23,7 +23,6 @@ #include "ray/util/logging.h" #include "ray/util/process.h" #include "ray/util/thread_utils.h" -#include "ray/util/util.h" namespace ray { namespace raylet { diff --git a/src/ray/raylet/agent_manager.h b/src/ray/raylet/agent_manager.h index 30fc60f024a2..a220bc515471 100644 
--- a/src/ray/raylet/agent_manager.h +++ b/src/ray/raylet/agent_manager.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/src/ray/raylet/dependency_manager.cc b/src/ray/raylet/lease_dependency_manager.cc similarity index 62% rename from src/ray/raylet/dependency_manager.cc rename to src/ray/raylet/lease_dependency_manager.cc index 27d283762109..543866f0cfb4 100644 --- a/src/ray/raylet/dependency_manager.cc +++ b/src/ray/raylet/lease_dependency_manager.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/raylet/dependency_manager.h" +#include "ray/raylet/lease_dependency_manager.h" #include #include @@ -23,12 +23,12 @@ namespace ray { namespace raylet { -bool DependencyManager::CheckObjectLocal(const ObjectID &object_id) const { - return local_objects_.count(object_id) == 1; +bool LeaseDependencyManager::CheckObjectLocal(const ObjectID &object_id) const { + return local_objects_.contains(object_id); } -bool DependencyManager::GetOwnerAddress(const ObjectID &object_id, - rpc::Address *owner_address) const { +bool LeaseDependencyManager::GetOwnerAddress(const ObjectID &object_id, + rpc::Address *owner_address) const { auto obj = required_objects_.find(object_id); if (obj == required_objects_.end()) { return false; @@ -38,8 +38,8 @@ bool DependencyManager::GetOwnerAddress(const ObjectID &object_id, return !owner_address->worker_id().empty(); } -void DependencyManager::RemoveObjectIfNotNeeded( - absl::flat_hash_map::iterator +void LeaseDependencyManager::RemoveObjectIfNotNeeded( + absl::flat_hash_map::iterator required_object_it) { const auto &object_id = required_object_it->first; if (required_object_it->second.Empty()) { @@ -53,9 +53,9 @@ void DependencyManager::RemoveObjectIfNotNeeded( } } -absl::flat_hash_map::iterator -DependencyManager::GetOrInsertRequiredObject(const ObjectID &object_id, - const rpc::ObjectReference &ref) { 
+absl::flat_hash_map::iterator +LeaseDependencyManager::GetOrInsertRequiredObject(const ObjectID &object_id, + const rpc::ObjectReference &ref) { auto it = required_objects_.find(object_id); if (it == required_objects_.end()) { it = required_objects_.emplace(object_id, ref).first; @@ -63,14 +63,14 @@ DependencyManager::GetOrInsertRequiredObject(const ObjectID &object_id, return it; } -void DependencyManager::StartOrUpdateWaitRequest( +void LeaseDependencyManager::StartOrUpdateWaitRequest( const WorkerID &worker_id, const std::vector &required_objects) { RAY_LOG(DEBUG) << "Starting wait request for worker " << worker_id; auto &wait_request = wait_requests_[worker_id]; for (const auto &ref : required_objects) { const auto obj_id = ObjectRefToId(ref); - if (local_objects_.count(obj_id)) { + if (local_objects_.contains(obj_id)) { // Object is already local. No need to fetch it. continue; } @@ -95,7 +95,7 @@ void DependencyManager::StartOrUpdateWaitRequest( } } -void DependencyManager::CancelWaitRequest(const WorkerID &worker_id) { +void LeaseDependencyManager::CancelWaitRequest(const WorkerID &worker_id) { RAY_LOG(DEBUG) << "Canceling wait request for worker " << worker_id; auto req_iter = wait_requests_.find(worker_id); if (req_iter == wait_requests_.end()) { @@ -112,7 +112,7 @@ void DependencyManager::CancelWaitRequest(const WorkerID &worker_id) { wait_requests_.erase(req_iter); } -void DependencyManager::StartOrUpdateGetRequest( +void LeaseDependencyManager::StartOrUpdateGetRequest( const WorkerID &worker_id, const std::vector &required_objects) { RAY_LOG(DEBUG) << "Starting get request for worker " << worker_id; @@ -133,7 +133,10 @@ void DependencyManager::StartOrUpdateGetRequest( for (auto &obj_id : get_request.first) { auto it = required_objects_.find(obj_id); RAY_CHECK(it != required_objects_.end()); - refs.push_back(ObjectIdToRef(obj_id, it->second.owner_address)); + ray::rpc::ObjectReference ref; + ref.set_object_id(obj_id.Binary()); + 
ref.mutable_owner_address()->CopyFrom(it->second.owner_address); + refs.push_back(std::move(ref)); } // Pull the new dependencies before canceling the old request, in case some // of the old dependencies are still being fetched. @@ -150,7 +153,7 @@ void DependencyManager::StartOrUpdateGetRequest( } } -void DependencyManager::CancelGetRequest(const WorkerID &worker_id) { +void LeaseDependencyManager::CancelGetRequest(const WorkerID &worker_id) { RAY_LOG(DEBUG) << "Canceling get request for worker " << worker_id; auto req_iter = get_requests_.find(worker_id); if (req_iter == get_requests_.end()) { @@ -171,120 +174,121 @@ void DependencyManager::CancelGetRequest(const WorkerID &worker_id) { get_requests_.erase(req_iter); } -/// Request dependencies for a queued task. -bool DependencyManager::RequestTaskDependencies( - const TaskID &task_id, +/// Request dependencies for a queued lease. +bool LeaseDependencyManager::RequestLeaseDependencies( + const LeaseID &lease_id, const std::vector &required_objects, const TaskMetricsKey &task_key) { - RAY_LOG(DEBUG) << "Adding dependencies for task " << task_id + RAY_LOG(DEBUG) << "Adding dependencies for lease " << lease_id << ". Required objects length: " << required_objects.size(); const auto required_ids = ObjectRefsToIds(required_objects); absl::flat_hash_set deduped_ids(required_ids.begin(), required_ids.end()); - auto inserted = queued_task_requests_.emplace( - task_id, - std::make_unique( - std::move(deduped_ids), waiting_tasks_counter_, task_key)); - RAY_CHECK(inserted.second) << "Task depedencies can be requested only once per task. " - << task_id; - auto &task_entry = inserted.first->second; + auto inserted = queued_lease_requests_.emplace( + lease_id, + std::make_unique( + std::move(deduped_ids), waiting_leases_counter_, task_key)); + RAY_CHECK(inserted.second) << "Lease depedencies can be requested only once per lease. 
" + << lease_id; + auto &lease_entry = inserted.first->second; for (const auto &ref : required_objects) { const auto obj_id = ObjectRefToId(ref); - RAY_LOG(DEBUG) << "Task " << task_id << " blocked on object " << obj_id; + RAY_LOG(DEBUG).WithField(lease_id).WithField(obj_id) << "Lease blocked on object"; auto it = GetOrInsertRequiredObject(obj_id, ref); - it->second.dependent_tasks.insert(task_id); + it->second.dependent_leases.insert(lease_id); } - for (const auto &obj_id : task_entry->dependencies) { - if (local_objects_.count(obj_id)) { - task_entry->DecrementMissingDependencies(); + for (const auto &obj_id : lease_entry->dependencies_) { + if (local_objects_.contains(obj_id)) { + lease_entry->DecrementMissingDependencies(); } } if (!required_objects.empty()) { - task_entry->pull_request_id = + lease_entry->pull_request_id_ = object_manager_.Pull(required_objects, BundlePriority::TASK_ARGS, task_key); - RAY_LOG(DEBUG) << "Started pull for dependencies of task " << task_id - << " request: " << task_entry->pull_request_id; + RAY_LOG(DEBUG) << "Started pull for dependencies of lease " << lease_id + << " request: " << lease_entry->pull_request_id_; } - return task_entry->num_missing_dependencies == 0; + return lease_entry->num_missing_dependencies_ == 0; } -void DependencyManager::RemoveTaskDependencies(const TaskID &task_id) { - RAY_LOG(DEBUG) << "Removing dependencies for task " << task_id; - auto task_entry = queued_task_requests_.find(task_id); - RAY_CHECK(task_entry != queued_task_requests_.end()) +void LeaseDependencyManager::RemoveLeaseDependencies(const LeaseID &lease_id) { + RAY_LOG(DEBUG) << "Removing dependencies for lease " << lease_id; + auto lease_entry = queued_lease_requests_.find(lease_id); + RAY_CHECK(lease_entry != queued_lease_requests_.end()) << "Can't remove dependencies of tasks that are not queued."; - if (task_entry->second->pull_request_id > 0) { - RAY_LOG(DEBUG) << "Canceling pull for dependencies of task " << task_id - << " request: " << 
task_entry->second->pull_request_id; - object_manager_.CancelPull(task_entry->second->pull_request_id); + if (lease_entry->second->pull_request_id_ > 0) { + RAY_LOG(DEBUG) << "Canceling pull for dependencies of lease " << lease_id + << " request: " << lease_entry->second->pull_request_id_; + object_manager_.CancelPull(lease_entry->second->pull_request_id_); } - for (const auto &obj_id : task_entry->second->dependencies) { + for (const auto &obj_id : lease_entry->second->dependencies_) { auto it = required_objects_.find(obj_id); RAY_CHECK(it != required_objects_.end()); - it->second.dependent_tasks.erase(task_id); + it->second.dependent_leases.erase(lease_id); RemoveObjectIfNotNeeded(it); } - queued_task_requests_.erase(task_entry); + queued_lease_requests_.erase(lease_entry); } -std::vector DependencyManager::HandleObjectMissing( +std::vector LeaseDependencyManager::HandleObjectMissing( const ray::ObjectID &object_id) { RAY_CHECK(local_objects_.erase(object_id)) << "Evicted object was not local " << object_id; - // Find any tasks that are dependent on the missing object. - std::vector waiting_task_ids; + // Find any leases that are dependent on the missing object. + std::vector waiting_lease_ids; auto object_entry = required_objects_.find(object_id); if (object_entry != required_objects_.end()) { - for (auto &dependent_task_id : object_entry->second.dependent_tasks) { - auto it = queued_task_requests_.find(dependent_task_id); - RAY_CHECK(it != queued_task_requests_.end()); - auto &task_entry = it->second; - // If the dependent task had all of its arguments ready, it was ready to + for (auto &dependent_lease_id : object_entry->second.dependent_leases) { + auto it = queued_lease_requests_.find(dependent_lease_id); + RAY_CHECK(it != queued_lease_requests_.end()); + auto &lease_entry = it->second; + // If the dependent lease had all of its arguments ready, it was ready to // run but must be switched to waiting since one of its arguments is now // missing. 
- if (task_entry->num_missing_dependencies == 0) { - waiting_task_ids.push_back(dependent_task_id); + if (lease_entry->num_missing_dependencies_ == 0) { + waiting_lease_ids.push_back(dependent_lease_id); // During normal execution we should be able to include the check - // RAY_CHECK(pending_tasks_.count(dependent_task_id) == 1); + // RAY_CHECK(pending_leases_.count(dependent_lease_id) == 1); // However, this invariant will not hold during unit test execution. } - task_entry->IncrementMissingDependencies(); + lease_entry->IncrementMissingDependencies(); } } - // Process callbacks for all of the tasks dependent on the object that are + // Process callbacks for all of the leases dependent on the object that are // now ready to run. - return waiting_task_ids; + return waiting_lease_ids; } -std::vector DependencyManager::HandleObjectLocal(const ray::ObjectID &object_id) { +std::vector LeaseDependencyManager::HandleObjectLocal( + const ray::ObjectID &object_id) { // Add the object to the table of locally available objects. auto inserted = local_objects_.insert(object_id); RAY_CHECK(inserted.second) << "Local object was already local " << object_id; - // Find all tasks and workers that depend on the newly available object. - std::vector ready_task_ids; + // Find all leases and workers that depend on the newly available object. + std::vector ready_lease_ids; auto object_entry = required_objects_.find(object_id); if (object_entry != required_objects_.end()) { - // Loop through all tasks that depend on the newly available object. - for (const auto &dependent_task_id : object_entry->second.dependent_tasks) { - auto it = queued_task_requests_.find(dependent_task_id); - RAY_CHECK(it != queued_task_requests_.end()); - auto &task_entry = it->second; - task_entry->DecrementMissingDependencies(); - // If the dependent task now has all of its arguments ready, it's ready + // Loop through all leases that depend on the newly available object. 
+ for (const auto &dependent_lease_id : object_entry->second.dependent_leases) { + auto it = queued_lease_requests_.find(dependent_lease_id); + RAY_CHECK(it != queued_lease_requests_.end()); + auto &lease_entry = it->second; + lease_entry->DecrementMissingDependencies(); + // If the dependent lease now has all of its arguments ready, it's ready // to run. - if (task_entry->num_missing_dependencies == 0) { - ready_task_ids.push_back(dependent_task_id); + if (lease_entry->num_missing_dependencies_ == 0) { + ready_lease_ids.push_back(dependent_lease_id); } } @@ -310,29 +314,29 @@ std::vector DependencyManager::HandleObjectLocal(const ray::ObjectID &ob RemoveObjectIfNotNeeded(object_entry); } - return ready_task_ids; + return ready_lease_ids; } -bool DependencyManager::TaskDependenciesBlocked(const TaskID &task_id) const { - auto it = queued_task_requests_.find(task_id); - RAY_CHECK(it != queued_task_requests_.end()); - RAY_CHECK(it->second->pull_request_id != 0); +bool LeaseDependencyManager::LeaseDependenciesBlocked(const LeaseID &lease_id) const { + auto it = queued_lease_requests_.find(lease_id); + RAY_CHECK(it != queued_lease_requests_.end()); + RAY_CHECK(it->second->pull_request_id_ != 0); return !object_manager_.PullRequestActiveOrWaitingForMetadata( - it->second->pull_request_id); + it->second->pull_request_id_); } -std::string DependencyManager::DebugString() const { +std::string LeaseDependencyManager::DebugString() const { std::stringstream result; - result << "TaskDependencyManager:"; - result << "\n- task deps map size: " << queued_task_requests_.size(); + result << "LeaseDependencyManager:"; + result << "\n- lease deps map size: " << queued_lease_requests_.size(); result << "\n- get req map size: " << get_requests_.size(); result << "\n- wait req map size: " << wait_requests_.size(); result << "\n- local objects map size: " << local_objects_.size(); return result.str(); } -void DependencyManager::RecordMetrics() { - 
waiting_tasks_counter_.FlushOnChangeCallbacks(); +void LeaseDependencyManager::RecordMetrics() { + waiting_leases_counter_.FlushOnChangeCallbacks(); } } // namespace raylet diff --git a/src/ray/raylet/dependency_manager.h b/src/ray/raylet/lease_dependency_manager.h similarity index 57% rename from src/ray/raylet/dependency_manager.h rename to src/ray/raylet/lease_dependency_manager.h index 6788f399e266..358a3e3cad0c 100644 --- a/src/ray/raylet/dependency_manager.h +++ b/src/ray/raylet/lease_dependency_manager.h @@ -22,9 +22,7 @@ #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" -#include "ray/common/common_protocol.h" #include "ray/common/id.h" -#include "ray/common/task/task.h" #include "ray/object_manager/object_manager.h" #include "ray/util/counter_map.h" @@ -32,61 +30,66 @@ namespace ray { namespace raylet { -/// Used for unit-testing the ClusterTaskManager, which requests dependencies -/// for queued tasks. -class TaskDependencyManagerInterface { +using std::literals::operator""sv; + +/// Used for unit-testing the ClusterLeaseManager, which requests dependencies +/// for queued leases. 
+class LeaseDependencyManagerInterface { public: - virtual bool RequestTaskDependencies( - const TaskID &task_id, + virtual bool RequestLeaseDependencies( + const LeaseID &lease_id, const std::vector &required_objects, - const TaskMetricsKey &task_key) = 0; - virtual void RemoveTaskDependencies(const TaskID &task_id) = 0; - virtual bool TaskDependenciesBlocked(const TaskID &task_id) const = 0; + const TaskMetricsKey &lease_key) = 0; + virtual void RemoveLeaseDependencies(const LeaseID &lease_id) = 0; + virtual bool LeaseDependenciesBlocked(const LeaseID &lease_id) const = 0; virtual bool CheckObjectLocal(const ObjectID &object_id) const = 0; - virtual ~TaskDependencyManagerInterface(){}; + virtual ~LeaseDependencyManagerInterface() = default; }; -/// \class DependencyManager +/// \class LeaseDependencyManager /// /// Responsible for managing object dependencies for local workers calling /// `ray.get` or `ray.wait` and arguments of queued tasks. The caller can -/// request object dependencies for a task or worker. The task manager will +/// request object dependencies for a lease or worker. The lease manager will /// determine which object dependencies are remote and will request that these /// objects be made available locally, either via the object manager or by /// storing an error if the object is lost. -class DependencyManager : public TaskDependencyManagerInterface { +class LeaseDependencyManager : public LeaseDependencyManagerInterface { public: - /// Create a task dependency manager. - explicit DependencyManager(ObjectManagerInterface &object_manager) - : object_manager_(object_manager) { - waiting_tasks_counter_.SetOnChangeCallback( + /// Create a lease dependency manager. 
+ explicit LeaseDependencyManager( + ObjectManagerInterface &object_manager, + ray::observability::MetricInterface &task_by_state_counter) + : object_manager_(object_manager), task_by_state_counter_(task_by_state_counter) { + waiting_leases_counter_.SetOnChangeCallback( [this](std::pair key) mutable { - int64_t num_total = waiting_tasks_counter_.Get(key); + int64_t num_total = waiting_leases_counter_.Get(key); // Of the waiting tasks of this name, some fraction may be inactive (blocked on // object store memory availability). Get this breakdown by querying the pull // manager. int64_t num_inactive = std::min( num_total, object_manager_.PullManagerNumInactivePullsByTaskName(key)); // Offset the metric values recorded from the owner process. - ray::stats::STATS_tasks.Record( + task_by_state_counter_.Record( -num_total, - {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::PENDING_NODE_ASSIGNMENT)}, - {"Name", key.first}, - {"IsRetry", key.second ? "1" : "0"}, - {"Source", "dependency_manager"}}); - ray::stats::STATS_tasks.Record( + {{"State"sv, + rpc::TaskStatus_Name(rpc::TaskStatus::PENDING_NODE_ASSIGNMENT)}, + {"Name"sv, key.first}, + {"IsRetry"sv, key.second ? "1" : "0"}, + {"Source"sv, "dependency_manager"}}); + task_by_state_counter_.Record( num_total - num_inactive, - {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::PENDING_ARGS_FETCH)}, - {"Name", key.first}, - {"IsRetry", key.second ? "1" : "0"}, - {"Source", "dependency_manager"}}); - ray::stats::STATS_tasks.Record( + {{"State"sv, rpc::TaskStatus_Name(rpc::TaskStatus::PENDING_ARGS_FETCH)}, + {"Name"sv, key.first}, + {"IsRetry"sv, key.second ? "1" : "0"}, + {"Source"sv, "dependency_manager"}}); + task_by_state_counter_.Record( num_inactive, - {{"State", + {{"State"sv, rpc::TaskStatus_Name(rpc::TaskStatus::PENDING_OBJ_STORE_MEM_AVAIL)}, - {"Name", key.first}, - {"IsRetry", key.second ? "1" : "0"}, - {"Source", "dependency_manager"}}); + {"Name"sv, key.first}, + {"IsRetry"sv, key.second ? 
"1" : "0"}, + {"Source"sv, "dependency_manager"}}); }); } @@ -94,7 +97,7 @@ class DependencyManager : public TaskDependencyManagerInterface { /// /// \param object_id The object to check for. /// \return Whether the object is local. - bool CheckObjectLocal(const ObjectID &object_id) const; + bool CheckObjectLocal(const ObjectID &object_id) const override; /// Get the address of the owner of this object. An address will only be /// returned if the caller previously specified that this object is required @@ -141,54 +144,54 @@ class DependencyManager : public TaskDependencyManagerInterface { const std::vector &required_objects); /// Cancel a worker's `ray.get` request. We will no longer attempt to fetch - /// any objects that this worker requested previously, if no other task or + /// any objects that this worker requested previously, if no other lease or /// worker requires them. /// /// \param worker_id The ID of the worker whose `ray.get` request we should /// cancel. void CancelGetRequest(const WorkerID &worker_id); - /// Request dependencies for a queued task. This will attempt to make any - /// remote objects local until the caller cancels the task's dependencies. + /// Request dependencies for a queued lease. This will attempt to make any + /// remote objects local until the caller cancels the lease's dependencies. /// - /// This method can only be called once per task, until the task has been + /// This method can only be called once per lease, until the lease has been /// canceled. /// - /// \param task_id The task that requires the objects. - /// \param required_objects The objects required by the task. - bool RequestTaskDependencies(const TaskID &task_id, - const std::vector &required_objects, - const TaskMetricsKey &task_key); - - /// Cancel a task's dependencies. We will no longer attempt to fetch any - /// remote dependencies, if no other task or worker requires them. + /// \param lease_id The lease that requires the objects. 
+ /// \param required_objects The objects required by the lease. + bool RequestLeaseDependencies(const LeaseID &lease_id, + const std::vector &required_objects, + const TaskMetricsKey &task_key) override; + + /// Cancel a lease's dependencies. We will no longer attempt to fetch any + /// remote dependencies, if no other lease or worker requires them. /// - /// This method can only be called on a task whose dependencies were added. + /// This method can only be called on a lease whose dependencies were added. /// - /// \param task_id The task that requires the objects. - /// \param required_objects The objects required by the task. - void RemoveTaskDependencies(const TaskID &task_id); + /// \param lease_id The lease that requires the objects. + /// \param required_objects The objects required by the lease. + void RemoveLeaseDependencies(const LeaseID &lease_id) override; /// Handle an object becoming locally available. /// /// \param object_id The object ID of the object to mark as locally /// available. - /// \return A list of task IDs. This contains all added tasks that now have + /// \return A list of lease IDs. This contains all granted leases that now have /// all of their dependencies fulfilled. - std::vector HandleObjectLocal(const ray::ObjectID &object_id); + std::vector HandleObjectLocal(const ray::ObjectID &object_id); /// Handle an object that is no longer locally available. /// /// \param object_id The object ID of the object that was previously locally /// available. - /// \return A list of task IDs. This contains all added tasks that previously + /// \return A list of lease IDs. This contains all granted leases that previously /// had all of their dependencies fulfilled, but are now missing this object /// dependency. 
- std::vector HandleObjectMissing(const ray::ObjectID &object_id); + std::vector HandleObjectMissing(const ray::ObjectID &object_id); - /// Check whether a requested task's dependencies are not being fetched to + /// Check whether a requested lease's dependencies are not being fetched to /// the local node due to lack of memory. - bool TaskDependenciesBlocked(const TaskID &task_id) const; + bool LeaseDependenciesBlocked(const LeaseID &lease_id) const override; /// Returns debug string for class. /// @@ -200,13 +203,13 @@ class DependencyManager : public TaskDependencyManagerInterface { private: /// Metadata for an object that is needed by at least one executing worker - /// and/or one queued task. + /// and/or one queued lease. struct ObjectDependencies { explicit ObjectDependencies(const rpc::ObjectReference &ref) : owner_address(ref.owner_address()) {} - /// The tasks that depend on this object, either because the object is a task argument - /// or because the task called `ray.get` on the object. - std::unordered_set dependent_tasks; + /// The leases that depend on this object, either because the object is a lease + /// argument or because the lease called `ray.get` on the object. + std::unordered_set dependent_leases; /// The workers that depend on this object because they called `ray.get` on the /// object. std::unordered_set dependent_get_requests; @@ -220,57 +223,60 @@ class DependencyManager : public TaskDependencyManagerInterface { rpc::Address owner_address; bool Empty() const { - return dependent_tasks.empty() && dependent_get_requests.empty() && + return dependent_leases.empty() && dependent_get_requests.empty() && dependent_wait_requests.empty(); } }; /// A struct to represent the object dependencies of a task. 
- struct TaskDependencies { - TaskDependencies(const absl::flat_hash_set &deps, - CounterMap> &counter_map, - const TaskMetricsKey &task_key) - : dependencies(std::move(deps)), - num_missing_dependencies(dependencies.size()), - waiting_task_counter_map(counter_map), - task_key(task_key) { - if (num_missing_dependencies > 0) { - waiting_task_counter_map.Increment(task_key); + struct LeaseDependencies { + LeaseDependencies(absl::flat_hash_set deps, + CounterMap> &counter_map, + TaskMetricsKey task_key) + : dependencies_(std::move(deps)), + num_missing_dependencies_(dependencies_.size()), + waiting_task_counter_map_(counter_map), + task_key_(std::move(task_key)) { + if (num_missing_dependencies_ > 0) { + waiting_task_counter_map_.Increment(task_key_); } } - /// The objects that the task depends on. These are the arguments to the - /// task. These must all be simultaneously local before the task is ready + /// The objects that the lease depends on. These are the arguments to the + /// lease. These must all be simultaneously local before the lease is ready /// to execute. Objects are removed from this set once /// UnsubscribeGetDependencies is called. - absl::flat_hash_set dependencies; + absl::flat_hash_set dependencies_; /// The number of object arguments that are not available locally. This /// must be zero before the task is ready to execute. - size_t num_missing_dependencies; + size_t num_missing_dependencies_; /// Used to identify the pull request for the dependencies to the object /// manager. - uint64_t pull_request_id = 0; + uint64_t pull_request_id_ = 0; /// Reference to the counter map for metrics tracking. - CounterMap> &waiting_task_counter_map; + CounterMap> &waiting_task_counter_map_; /// The task name / is_retry tuple used for metrics tracking. 
- const TaskMetricsKey task_key; + const TaskMetricsKey task_key_; void IncrementMissingDependencies() { - if (num_missing_dependencies == 0) { - waiting_task_counter_map.Increment(task_key); + if (num_missing_dependencies_ == 0) { + waiting_task_counter_map_.Increment(task_key_); } - num_missing_dependencies++; + num_missing_dependencies_++; } void DecrementMissingDependencies() { - num_missing_dependencies--; - if (num_missing_dependencies == 0) { - waiting_task_counter_map.Decrement(task_key); + num_missing_dependencies_--; + if (num_missing_dependencies_ == 0) { + waiting_task_counter_map_.Decrement(task_key_); } } - ~TaskDependencies() { - if (num_missing_dependencies > 0) { - waiting_task_counter_map.Decrement(task_key); + LeaseDependencies(const LeaseDependencies &) = delete; + LeaseDependencies &operator=(const LeaseDependencies &) = delete; + + ~LeaseDependencies() { + if (num_missing_dependencies_ > 0) { + waiting_task_counter_map_.Decrement(task_key_); } } }; @@ -280,16 +286,16 @@ class DependencyManager : public TaskDependencyManagerInterface { void RemoveObjectIfNotNeeded( absl::flat_hash_map::iterator required_object_it); - /// Start tracking an object that is needed by a worker and/or queued task. + /// Start tracking an object that is needed by a worker and/or queued lease. absl::flat_hash_map::iterator GetOrInsertRequiredObject( const ObjectID &object_id, const rpc::ObjectReference &ref); /// The object manager, used to fetch required objects from remote nodes. ObjectManagerInterface &object_manager_; - /// A map from the ID of a queued task to metadata about whether the task's + /// A map from the ID of a queued lease to metadata about whether the lease's /// dependencies are all local or not. - absl::flat_hash_map> queued_task_requests_; + absl::flat_hash_map> queued_lease_requests_; /// A map from worker ID to the set of objects that the worker called /// `ray.get` on and a pull request ID for these objects. 
The pull request ID @@ -303,20 +309,28 @@ class DependencyManager : public TaskDependencyManagerInterface { /// or the worker cancels the `ray.wait` request. absl::flat_hash_map> wait_requests_; - /// Deduplicated pool of objects required by all queued tasks and workers. - /// Objects are removed from this set once there are no more tasks or workers + /// Deduplicated pool of objects required by all queued leases and workers. + /// Objects are removed from this set once there are no more leases or workers /// that require it. absl::flat_hash_map required_objects_; /// The set of locally available objects. This is used to determine which - /// tasks are ready to run and which `ray.wait` requests can be finished. - std::unordered_set local_objects_; - - /// Counts the number of active task dependency fetches by task name. The counter - /// total will be less than or equal to the size of queued_task_requests_. - CounterMap waiting_tasks_counter_; - - friend class DependencyManagerTest; + /// leases are ready to run and which `ray.wait` requests can be finished. + absl::flat_hash_set local_objects_; + + /// Counts the number of active lease dependency fetches by lease name. The counter + /// total will be less than or equal to the size of queued_lease_requests_. + CounterMap waiting_leases_counter_; + + // Metric to track the number of tasks by state. 
+ // Expected tags: + // - State: the task state, as described by rpc::TaskState proto in common.proto + // - Name: the name of the function called + // - IsRetry: whether the task is a retry + // - Source: component reporting, e.g., "core_worker", "executor", or "pull_manager" + ray::observability::MetricInterface &task_by_state_counter_; + + friend class LeaseDependencyManagerTest; }; } // namespace raylet diff --git a/src/ray/raylet/local_task_manager.cc b/src/ray/raylet/local_lease_manager.cc similarity index 54% rename from src/ray/raylet/local_task_manager.cc rename to src/ray/raylet/local_lease_manager.cc index 44be7658d3d9..402bfd6e1358 100644 --- a/src/ray/raylet/local_task_manager.cc +++ b/src/ray/raylet/local_lease_manager.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/raylet/local_task_manager.h" +#include "ray/raylet/local_lease_manager.h" #include @@ -25,233 +25,236 @@ #include #include "ray/common/scheduling/cluster_resource_data.h" +#include "ray/common/scheduling/placement_group_util.h" #include "ray/stats/metric_defs.h" #include "ray/util/logging.h" namespace ray { namespace raylet { -LocalTaskManager::LocalTaskManager( +LocalLeaseManager::LocalLeaseManager( const NodeID &self_node_id, ClusterResourceScheduler &cluster_resource_scheduler, - TaskDependencyManagerInterface &task_dependency_manager, + LeaseDependencyManagerInterface &lease_dependency_manager, internal::NodeInfoGetter get_node_info, WorkerPoolInterface &worker_pool, - absl::flat_hash_map> &leased_workers, + absl::flat_hash_map> &leased_workers, std::function &object_ids, std::vector> *results)> - get_task_arguments, - size_t max_pinned_task_arguments_bytes, + get_lease_arguments, + size_t max_pinned_lease_arguments_bytes, std::function get_time_ms, int64_t sched_cls_cap_interval_ms) : self_node_id_(self_node_id), self_scheduling_node_id_(self_node_id.Binary()), 
cluster_resource_scheduler_(cluster_resource_scheduler), - task_dependency_manager_(task_dependency_manager), + lease_dependency_manager_(lease_dependency_manager), get_node_info_(get_node_info), max_resource_shapes_per_load_report_( RayConfig::instance().max_resource_shapes_per_load_report()), worker_pool_(worker_pool), leased_workers_(leased_workers), - get_task_arguments_(get_task_arguments), - max_pinned_task_arguments_bytes_(max_pinned_task_arguments_bytes), + get_lease_arguments_(get_lease_arguments), + max_pinned_lease_arguments_bytes_(max_pinned_lease_arguments_bytes), get_time_ms_(get_time_ms), sched_cls_cap_enabled_(RayConfig::instance().worker_cap_enabled()), sched_cls_cap_interval_ms_(sched_cls_cap_interval_ms), sched_cls_cap_max_ms_(RayConfig::instance().worker_cap_max_backoff_delay_ms()) {} -void LocalTaskManager::QueueAndScheduleTask(std::shared_ptr work) { - // If the local node is draining, the cluster task manager will +void LocalLeaseManager::QueueAndScheduleLease(std::shared_ptr work) { + // If the local node is draining, the cluster lease manager will // guarantee that the local node is not selected for scheduling. RAY_CHECK(!cluster_resource_scheduler_.GetLocalResourceManager().IsLocalNodeDraining()); - // The local node must be feasible if the cluster task manager decides to run the task + // The local node must be feasible if the cluster lease manager decides to run the task // locally. 
RAY_CHECK(cluster_resource_scheduler_.GetClusterResourceManager().HasFeasibleResources( self_scheduling_node_id_, - ResourceMapToResourceRequest(work->task.GetTaskSpecification() + ResourceMapToResourceRequest(work->lease_.GetLeaseSpecification() .GetRequiredPlacementResources() .GetResourceMap(), /*requires_object_store_memory=*/false))) - << work->task.GetTaskSpecification().DebugString() << " " + << work->lease_.GetLeaseSpecification().DebugString() << " " << cluster_resource_scheduler_.GetClusterResourceManager() .GetNodeResources(self_scheduling_node_id_) .DebugString(); - WaitForTaskArgsRequests(std::move(work)); - ScheduleAndDispatchTasks(); + WaitForLeaseArgsRequests(std::move(work)); + ScheduleAndGrantLeases(); } -void LocalTaskManager::WaitForTaskArgsRequests(std::shared_ptr work) { - const auto &task = work->task; - const auto &task_id = task.GetTaskSpecification().TaskId(); - const auto &scheduling_key = task.GetTaskSpecification().GetSchedulingClass(); - auto object_ids = task.GetTaskSpecification().GetDependencies(); +void LocalLeaseManager::WaitForLeaseArgsRequests(std::shared_ptr work) { + const auto &lease = work->lease_; + const auto &lease_id = lease.GetLeaseSpecification().LeaseId(); + const auto &scheduling_key = lease.GetLeaseSpecification().GetSchedulingClass(); + auto object_ids = lease.GetLeaseSpecification().GetDependencies(); if (!object_ids.empty()) { - bool args_ready = task_dependency_manager_.RequestTaskDependencies( - task_id, - task.GetDependencies(), - {task.GetTaskSpecification().GetName(), task.GetTaskSpecification().IsRetry()}); + bool args_ready = lease_dependency_manager_.RequestLeaseDependencies( + lease_id, + lease.GetLeaseSpecification().GetDependencies(), + {lease.GetLeaseSpecification().GetTaskName(), + lease.GetLeaseSpecification().IsRetry()}); if (args_ready) { - RAY_LOG(DEBUG) << "Args already ready, task can be dispatched " << task_id; - tasks_to_dispatch_[scheduling_key].emplace_back(std::move(work)); + 
RAY_LOG(DEBUG) << "Args already ready, lease can be granted " << lease_id; + leases_to_grant_[scheduling_key].emplace_back(std::move(work)); } else { - RAY_LOG(DEBUG) << "Waiting for args for task: " - << task.GetTaskSpecification().TaskId(); - auto it = waiting_task_queue_.insert(waiting_task_queue_.end(), std::move(work)); - RAY_CHECK(waiting_tasks_index_.emplace(task_id, it).second); + RAY_LOG(DEBUG) << "Waiting for args for lease: " << lease_id; + auto it = waiting_lease_queue_.insert(waiting_lease_queue_.end(), std::move(work)); + RAY_CHECK(waiting_leases_index_.emplace(lease_id, it).second); } } else { - RAY_LOG(DEBUG) << "No args, task can be dispatched " - << task.GetTaskSpecification().TaskId(); - tasks_to_dispatch_[scheduling_key].emplace_back(std::move(work)); + RAY_LOG(DEBUG) << "No args, lease can be granted " << lease_id; + leases_to_grant_[scheduling_key].emplace_back(std::move(work)); } } -void LocalTaskManager::ScheduleAndDispatchTasks() { - DispatchScheduledTasksToWorkers(); +void LocalLeaseManager::ScheduleAndGrantLeases() { + GrantScheduledLeasesToWorkers(); // TODO(swang): Spill from waiting queue first? Otherwise, we may end up - // spilling a task whose args are already local. - // TODO(swang): Invoke ScheduleAndDispatchTasks() when we run out of memory + // spilling a lease whose args are already local. + // TODO(swang): Invoke ScheduleAndGrantLeases() when we run out of memory // in the PullManager or periodically, to make sure that we spill waiting - // tasks that are blocked. - SpillWaitingTasks(); + // leases that are blocked. + SpillWaitingLeases(); } -void LocalTaskManager::DispatchScheduledTasksToWorkers() { - // Check every task in task_to_dispatch queue to see - // whether it can be dispatched and ran. 
This avoids head-of-line - // blocking where a task which cannot be dispatched because +void LocalLeaseManager::GrantScheduledLeasesToWorkers() { + // Check every lease in leases_to_grant queue to see + // whether it can be granted and ran. This avoids head-of-line + // blocking where a lease which cannot be granted because // there are not enough available resources blocks other - // tasks from being dispatched. - for (auto shapes_it = tasks_to_dispatch_.begin(); - shapes_it != tasks_to_dispatch_.end();) { + // leases from being granted. + for (auto shapes_it = leases_to_grant_.begin(); shapes_it != leases_to_grant_.end();) { auto &scheduling_class = shapes_it->first; - auto &dispatch_queue = shapes_it->second; + auto &leases_to_grant_queue = shapes_it->second; auto sched_cls_iter = info_by_sched_cls_.find(scheduling_class); if (sched_cls_iter == info_by_sched_cls_.end()) { // Initialize the class info. - sched_cls_iter = info_by_sched_cls_ - .emplace(scheduling_class, - SchedulingClassInfo(MaxRunningTasksPerSchedulingClass( - scheduling_class))) - .first; + sched_cls_iter = + info_by_sched_cls_ + .emplace(scheduling_class, + SchedulingClassInfo( + MaxGrantedLeasesPerSchedulingClass(scheduling_class))) + .first; } auto &sched_cls_info = sched_cls_iter->second; // Fair scheduling is applied only when the total CPU requests exceed the node's - // capacity. This skips scheduling classes whose number of running tasks exceeds the - // average number of tasks per scheduling class. + // capacity. This skips scheduling classes whose number of granted leases exceeds the + // average number of granted leases per scheduling class. // The purpose of fair scheduling is to ensure that each scheduling class has an - // equal chance of being selected for dispatch. For instance, in a pipeline with both - // data producers and consumers, we aim for consumers to have the same chance to be - // dispatched as producers. 
This prevents memory peak caused by dispatching all - // producer tasks first. - // A scheduling class is skipped from dispatching if its number of running tasks - // exceeds the fair_share, which is the average number of running tasks among all + // equal chance of being selected for lease granting. For instance, in a pipeline with + // both data producers and consumers, we aim for consumers to have the same chance to + // be granted a lease as producers. This prevents memory peak caused by granting all + // producer leases first. + // A scheduling class is skipped from lease granting if its number of granted leases + // exceeds the fair_share, which is the average number of granted leases among all // scheduling classes. For example, consider a scenario where we have 3 CPUs and 2 - // scheduling classes, `f` and `g`, each with 4 tasks. - // Status 1: The queue init with [f, f, f, f, g, g, g, g], and 0 running tasks. - // Status 2: We dispatch 3 `f` tasks. Now the queue is [f, g, g, g, g], - // with 3 `f` tasks running. - // Status 3: Suppose 1 `f` task finishes. When choosing the next task to dispatch, - // the queue is [f, g, g, g, g], and there are 2 `f` tasks running. + // scheduling classes, `f` and `g`, each with 4 leases. + // Status 1: The queue init with [f, f, f, f, g, g, g, g], and 0 granted leases. + // Status 2: We grant 3 `f` leases. Now the queue is [f, g, g, g, g], + // with 3 `f` leases granted. + // Status 3: Suppose 1 `f` lease finishes. When choosing the next lease to grant, + // the queue is [f, g, g, g, g], and there are 2 `f` leases granted. // We calculate fair_share as follows: - // fair_share = number of running tasks / number of scheduling classes + // fair_share = number of granted leases / number of scheduling classes // = 2 / 2 = 1. - // Since the number of running `f` tasks (2) is greater than the - // fair_share (1), we skip `f` and choose to dispatch `g`. 
- // Note 1: Fair_share is calculated as (total number of running tasks with >0 CPU) - // / (number of scheduling classes in tasks_to_dispatch_). + // Since the number of granted `f` leases (2) is greater than the + // fair_share (1), we skip `f` and choose to grant `g`. + // Note 1: Fair_share is calculated as (total number of granted leases with >0 CPU) + // / (number of scheduling classes in leases_to_grant_). // Note 2: The decision to skip a scheduling class happens when loop through the - // scheduling classes (keys of tasks_to_dispatch_). This means we check for + // scheduling classes (keys of leases_to_grant_). This means we check for // fair dispatching when looping through the scheduling classes rather than - // for each individual task, reducing the number of checks required. - // This is why in Status 2 of the example, we dispatch 3 `f` tasks because - // we chose `f` for dispatch,and we continue dispatching all `f` - // tasks until resources are fully utilized. + // for each individual lease, reducing the number of checks required. + // This is why in Status 2 of the example, we grant 3 `f` leases because + // we chose `f` for grant, and we continue granting all `f` + // leases until resources are fully utilized. - // Currently, fair dispatching is implemented only for tasks that require CPU + // Currently, fair granting is implemented only for leases that require CPU // resources. CPU. For details, see https://github.com/ray-project/ray/pull/44733. - // Calculate the total CPU requests for all tasks in the tasks_to_dispatch queue. + // Calculate the total CPU requests for all leases in the leases_to_grant queue. double total_cpu_requests_ = 0.0; // Count the number of scheduling classes that require CPU and sum their total CPU // requests. 
size_t num_classes_with_cpu = 0; - for (const auto &[_, cur_dispatch_queue] : tasks_to_dispatch_) { + for (const auto &[_, cur_dispatch_queue] : leases_to_grant_) { // Only need to check the first because all tasks with the same scheduling class // have the same CPU resource requirements. RAY_CHECK(!cur_dispatch_queue.empty()); const auto &work = cur_dispatch_queue.front(); - const auto &task_spec = work->task.GetTaskSpecification(); + const auto &lease_spec = work->lease_.GetLeaseSpecification(); auto cpu_request_ = - task_spec.GetRequiredResources().Get(scheduling::ResourceID::CPU()).Double(); + lease_spec.GetRequiredResources().Get(scheduling::ResourceID::CPU()).Double(); if (cpu_request_ > 0) { num_classes_with_cpu++; total_cpu_requests_ += cur_dispatch_queue.size() * cpu_request_; } } const auto &sched_cls_desc = - TaskSpecification::GetSchedulingClassDescriptor(scheduling_class); + SchedulingClassToIds::GetSchedulingClassDescriptor(scheduling_class); double total_cpus = cluster_resource_scheduler_.GetLocalResourceManager().GetNumCpus(); // Compare total CPU requests with the node's total CPU capacity. If the requests - // exceed the capacity, check if fair dispatching is needed. + // exceed the capacity, check if fair granting is needed. if (sched_cls_desc.resource_set.Get(scheduling::ResourceID::CPU()).Double() > 0 && total_cpu_requests_ > total_cpus) { RAY_LOG(DEBUG) - << "Applying fairness policy. Total CPU requests in tasks_to_dispatch_ (" + << "Applying fairness policy. Total CPU requests in leases_to_grant_ (" << total_cpu_requests_ << ") exceed total CPUs available (" << total_cpus << ")."; - // Get the total number of running tasks requires CPU. - size_t total_cpu_running_tasks = 0; + // Get the total number of granted leases that require CPU. 
+ size_t total_cpu_granted_leases = 0; for (auto &entry : info_by_sched_cls_) { // Only consider CPU requests const auto &cur_sched_cls_desc = - TaskSpecification::GetSchedulingClassDescriptor(entry.first); + SchedulingClassToIds::GetSchedulingClassDescriptor(entry.first); if (cur_sched_cls_desc.resource_set.Get(scheduling::ResourceID::CPU()).Double() > 0) { - total_cpu_running_tasks += entry.second.running_tasks.size(); + total_cpu_granted_leases += entry.second.granted_leases.size(); } } // 1. We have confirmed that this is a scheduling class that requires CPU resources, // hence num_classes_with_cpu >= 1 (cannot be 0) as this scheduling class is in - // tasks_to_dispatch_. - // 2. We will compute fair_share as the ideal distribution of tasks among all - // scheduling classes in tasks_to_dispatch_. Then, we will check if the number of - // running tasks for this scheduling class exceeds its ideal fair_share. - // 3. Note: We should get the num_classes_with_cpu from tasks_to_dispatch_ - // instead of the info_by_sched_cls_ although total_cpu_running_tasks gets from - // the task running. First, info_by_sched_cls_ may not be initialized yet for - // some scheduling classes (as we initialize it in the loop). Second, we expect - // the number of running tasks for this scheduling class to not be much. However, - // if no tasks of this scheduling class are running, it will not be skipped. - - size_t fair_share = total_cpu_running_tasks / num_classes_with_cpu; - if (sched_cls_info.running_tasks.size() > fair_share) { - RAY_LOG(DEBUG) << "Skipping dispatch for scheduling class " << scheduling_class - << ". Running tasks (" << sched_cls_info.running_tasks.size() - << ") exceed fair share (" << fair_share << ")."; + // leases_to_grant_. + // 2. We will compute fair_share as the ideal distribution of leases among all + // scheduling classes in leases_to_grant_. Then, we will check if the number + // of granted leases for this scheduling class exceeds its ideal fair_share. 
+ // 3. Note: We should get the num_classes_with_cpu from leases_to_grant_ + // instead of the info_by_sched_cls_ although total_cpu_granted_leases is + // obtained from the granted leases. First, info_by_sched_cls_ may not be + // initialized yet for some scheduling classes (as we initialize it in the loop). + // Second, we expect the number of granted leases for this scheduling class to + // not be much. However, if no leases of this scheduling class are granted, it + // will not be skipped. + + size_t fair_share = total_cpu_granted_leases / num_classes_with_cpu; + if (sched_cls_info.granted_leases.size() > fair_share) { + RAY_LOG(DEBUG) << "Skipping lease granting for scheduling class " + << scheduling_class << ". Granted leases (" + << sched_cls_info.granted_leases.size() << ") exceed fair share (" + << fair_share << ")."; shapes_it++; continue; } } - /// We cap the maximum running tasks of a scheduling class to avoid - /// scheduling too many tasks of a single type/depth, when there are + /// We cap the maximum granted leases of a scheduling class to avoid + /// granting too many leases of a single type/depth, when there are /// deeper/other functions that should be run. We need to apply back /// pressure to limit the number of worker processes started in scenarios /// with nested tasks. 
bool is_infeasible = false; - for (auto work_it = dispatch_queue.begin(); work_it != dispatch_queue.end();) { + for (auto work_it = leases_to_grant_queue.begin(); + work_it != leases_to_grant_queue.end();) { auto &work = *work_it; - const auto &task = work->task; - const auto &spec = task.GetTaskSpecification(); - TaskID task_id = spec.TaskId(); + const auto &lease = work->lease_; + const auto &spec = lease.GetLeaseSpecification(); + LeaseID lease_id = spec.LeaseId(); if (work->GetState() == internal::WorkStatus::WAITING_FOR_WORKER) { work_it++; continue; @@ -259,14 +262,14 @@ void LocalTaskManager::DispatchScheduledTasksToWorkers() { // Check if the scheduling class is at capacity now. if (sched_cls_cap_enabled_ && - sched_cls_info.running_tasks.size() >= sched_cls_info.capacity && + sched_cls_info.granted_leases.size() >= sched_cls_info.capacity && work->GetState() == internal::WorkStatus::WAITING) { RAY_LOG(DEBUG) << "Hit cap! time=" << get_time_ms_() << " next update time=" << sched_cls_info.next_update_time; if (get_time_ms_() < sched_cls_info.next_update_time) { - // We're over capacity and it's not time to admit a new task yet. - // Calculate the next time we should admit a new task. - int64_t current_capacity = sched_cls_info.running_tasks.size(); + // We're over capacity and it's not time to grant a new lease yet. + // Calculate the next time we should grant a new lease. + int64_t current_capacity = sched_cls_info.granted_leases.size(); int64_t allowed_capacity = sched_cls_info.capacity; int64_t exp = current_capacity - allowed_capacity; int64_t wait_time = sched_cls_cap_interval_ms_ * (1L << exp); @@ -280,11 +283,11 @@ void LocalTaskManager::DispatchScheduledTasksToWorkers() { sched_cls_info.next_update_time = std::min(target_time, sched_cls_info.next_update_time); - // While we're over capacity and cannot run the task, - // try to spill to a node that can run it. 
+ // While we're over capacity and cannot grant the lease, + // try to spill to a node that can. bool did_spill = TrySpillback(work, is_infeasible); if (did_spill) { - work_it = dispatch_queue.erase(work_it); + work_it = leases_to_grant_queue.erase(work_it); continue; } @@ -293,32 +296,32 @@ void LocalTaskManager::DispatchScheduledTasksToWorkers() { } bool args_missing = false; - bool success = PinTaskArgsIfMemoryAvailable(spec, &args_missing); - // An argument was evicted since this task was added to the dispatch + bool success = PinLeaseArgsIfMemoryAvailable(spec, &args_missing); + // An argument was evicted since this lease was added to the grant // queue. Move it back to the waiting queue. The caller is responsible - // for notifying us when the task is unblocked again. + // for notifying us when the lease is unblocked again. if (!success) { if (args_missing) { - // Insert the task at the head of the waiting queue because we + // Insert the lease at the head of the waiting queue because we // prioritize spilling from the end of the queue. // TODO(scv119): where does pulling happen? - auto it = waiting_task_queue_.insert(waiting_task_queue_.begin(), - std::move(*work_it)); - RAY_CHECK(waiting_tasks_index_.emplace(task_id, it).second); - work_it = dispatch_queue.erase(work_it); + auto it = waiting_lease_queue_.insert(waiting_lease_queue_.begin(), + std::move(*work_it)); + RAY_CHECK(waiting_leases_index_.emplace(lease_id, it).second); + work_it = leases_to_grant_queue.erase(work_it); } else { - // The task's args cannot be pinned due to lack of memory. We should - // retry dispatching the task once another task finishes and releases + // The lease's args cannot be pinned due to lack of memory. We should + // retry granting the lease once another lease finishes and releases // its arguments. 
- RAY_LOG(DEBUG) << "Dispatching task " << task_id + RAY_LOG(DEBUG) << "Granting lease " << lease_id << " would put this node over the max memory allowed for " - "arguments of executing tasks (" - << max_pinned_task_arguments_bytes_ - << "). Waiting to dispatch task until other tasks complete"; - RAY_CHECK(!executing_task_args_.empty() && !pinned_task_arguments_.empty()) - << "Cannot dispatch task " << task_id - << " until another task finishes and releases its arguments, but no other " - "task is running"; + "arguments of granted leases (" + << max_pinned_lease_arguments_bytes_ + << "). Waiting to grant lease until other leases are returned"; + RAY_CHECK(!granted_lease_args_.empty() && !pinned_lease_arguments_.empty()) + << "Cannot grant lease " << lease_id + << " until another lease is returned and releases its arguments, but no " + "other lease is granted"; work->SetStateWaiting( internal::UnscheduledWorkCause::WAITING_FOR_AVAILABLE_PLASMA_MEMORY); work_it++; @@ -335,41 +338,41 @@ void LocalTaskManager::DispatchScheduledTasksToWorkers() { .AllocateLocalTaskResources(spec.GetRequiredResources().GetResourceMap(), allocated_instances); if (!schedulable) { - ReleaseTaskArgs(task_id); - // The local node currently does not have the resources to run the task, so we + ReleaseLeaseArgs(lease_id); + // The local node currently does not have the resources to grant the lease, so we // should try spilling to another node. bool did_spill = TrySpillback(work, is_infeasible); if (!did_spill) { - // There must not be any other available nodes in the cluster, so the task + // There must not be any other available nodes in the cluster, so the lease // should stay on this node. We can skip the rest of the shape because the // scheduler will make the same decision. 
work->SetStateWaiting( internal::UnscheduledWorkCause::WAITING_FOR_RESOURCES_AVAILABLE); break; } - work_it = dispatch_queue.erase(work_it); + work_it = leases_to_grant_queue.erase(work_it); } else { // Force us to recalculate the next update time the next time a task // comes through this queue. We should only do this when we're // confident we're ready to dispatch the task after all checks have // passed. sched_cls_info.next_update_time = std::numeric_limits::max(); - sched_cls_info.running_tasks.insert(spec.TaskId()); - // The local node has the available resources to run the task, so we should run - // it. - work->allocated_instances = allocated_instances; + sched_cls_info.granted_leases.insert(lease_id); + // The local node has the available resources to grant the lease, so we should + // grant it. + work->allocated_instances_ = allocated_instances; work->SetStateWaitingForWorker(); bool is_detached_actor = spec.IsDetachedActor(); auto &owner_address = spec.CallerAddress(); - /// TODO(scv119): if a worker is not started, the resources is leaked and + /// TODO(scv119): if a worker is not started, the resources are leaked and // task might be hanging. worker_pool_.PopWorker( spec, - [this, task_id, scheduling_class, work, is_detached_actor, owner_address]( + [this, lease_id, scheduling_class, work, is_detached_actor, owner_address]( const std::shared_ptr worker, PopWorkerStatus status, const std::string &runtime_env_setup_error_message) -> bool { - // TODO(hjiang): After getting the ready-to-use worker and task id, we're + // TODO(hjiang): After getting the ready-to-use worker and lease id, we're // able to get physical execution context. 
// // ownership chain: raylet has-a node manager, node manager has-a local task @@ -381,7 +384,7 @@ void LocalTaskManager::DispatchScheduledTasksToWorkers() { return PoppedWorkerHandler(worker, status, - task_id, + lease_id, scheduling_class, work, is_detached_actor, @@ -394,77 +397,79 @@ void LocalTaskManager::DispatchScheduledTasksToWorkers() { // In the beginning of the loop, we add scheduling_class // to the `info_by_sched_cls_` map. // In cases like dead owners, we may not add any tasks - // to `running_tasks` so we can remove the map entry + // to `granted_leases` so we can remove the map entry // for that scheduling_class to prevent memory leaks. - if (sched_cls_info.running_tasks.size() == 0) { + if (sched_cls_info.granted_leases.size() == 0) { info_by_sched_cls_.erase(scheduling_class); } if (is_infeasible) { - const auto &front_task = dispatch_queue.front()->task.GetTaskSpecification(); - RAY_LOG(ERROR) << "A task got scheduled to a node even though it was infeasible. " - "Please report an issue on GitHub.\nTask: " - << front_task.DebugString(); - auto dispatch_queue_iter = dispatch_queue.begin(); - while (dispatch_queue_iter != dispatch_queue.end()) { - CancelTaskToDispatch( - *dispatch_queue_iter, + const auto &front_lease = + leases_to_grant_queue.front()->lease_.GetLeaseSpecification(); + RAY_LOG(ERROR) << "A lease got granted to a node even though it was infeasible. 
" + "Please report an issue on GitHub.\nLease: " + << front_lease.DebugString(); + auto leases_to_grant_queue_iter = leases_to_grant_queue.begin(); + while (leases_to_grant_queue_iter != leases_to_grant_queue.end()) { + CancelLeaseToGrant( + *leases_to_grant_queue_iter, rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_UNSCHEDULABLE, - "Scheduling failed due to the task becoming infeasible."); - dispatch_queue_iter = dispatch_queue.erase(dispatch_queue_iter); + "Lease granting failed due to the lease becoming infeasible."); + leases_to_grant_queue_iter = + leases_to_grant_queue.erase(leases_to_grant_queue_iter); } - tasks_to_dispatch_.erase(shapes_it++); - } else if (dispatch_queue.empty()) { - tasks_to_dispatch_.erase(shapes_it++); + leases_to_grant_.erase(shapes_it++); + } else if (leases_to_grant_queue.empty()) { + leases_to_grant_.erase(shapes_it++); } else { shapes_it++; } } } -void LocalTaskManager::SpillWaitingTasks() { - // Try to spill waiting tasks to a remote node, prioritizing those at the end - // of the queue. Waiting tasks are spilled if there are enough remote +void LocalLeaseManager::SpillWaitingLeases() { + // Try to spill waiting leases to a remote node, prioritizing those at the end + // of the queue. Waiting leases are spilled if there are enough remote // resources AND (we have no resources available locally OR their - // dependencies are not being fetched). We should not spill tasks whose + // dependencies are not being fetched). We should not spill leases whose // dependencies are actively being fetched because some of their dependencies // may already be local or in-flight to this node. // // NOTE(swang): We do not iterate by scheduling class here, so if we break - // due to lack of remote resources, it is possible that a waiting task that + // due to lack of remote resources, it is possible that a waiting lease that // is earlier in the queue could have been scheduled to a remote node. 
// TODO(scv119): this looks very aggressive: we will try to spillback - // all the tasks in the waiting queue regardless of the wait time. - auto it = waiting_task_queue_.end(); - while (it != waiting_task_queue_.begin()) { + // all the leases in the waiting queue regardless of the wait time. + auto it = waiting_lease_queue_.end(); + while (it != waiting_lease_queue_.begin()) { it--; - const auto &task = (*it)->task; - const auto &spec = task.GetTaskSpecification(); - const auto &task_id = spec.TaskId(); + const auto &lease = (*it)->lease_; + const auto &lease_spec = lease.GetLeaseSpecification(); + const auto &lease_id = lease_spec.LeaseId(); - // Check whether this task's dependencies are blocked (not being actively - // pulled). If this is true, then we should force the task onto a remote + // Check whether this lease's dependencies are blocked (not being actively + // pulled). If this is true, then we should force the lease onto a remote // feasible node, even if we have enough resources available locally for // placement. - bool task_dependencies_blocked = - task_dependency_manager_.TaskDependenciesBlocked(task_id); - RAY_LOG(DEBUG) << "Attempting to spill back waiting task " << task_id + bool lease_dependencies_blocked = + lease_dependency_manager_.LeaseDependenciesBlocked(lease_id); + RAY_LOG(DEBUG) << "Attempting to spill back waiting lease " << lease_id << " to remote node. Dependencies blocked? " - << task_dependencies_blocked; + << lease_dependencies_blocked; bool is_infeasible; // TODO(swang): The policy currently does not account for the amount of // object store memory availability. Ideally, we should pick the node with // the most memory availability. 
scheduling::NodeID scheduling_node_id; - if (!spec.IsSpreadSchedulingStrategy()) { + if (!lease_spec.IsSpreadSchedulingStrategy()) { scheduling_node_id = cluster_resource_scheduler_.GetBestSchedulableNode( - spec, + lease_spec, /*preferred_node_id*/ self_node_id_.Binary(), - /*exclude_local_node*/ task_dependencies_blocked, + /*exclude_local_node*/ lease_dependencies_blocked, /*requires_object_store_memory*/ true, &is_infeasible); } else { // If scheduling strategy is spread, we prefer honoring spread decision - // and waiting for task dependencies to be pulled + // and waiting for lease dependencies to be pulled // locally than spilling back and causing uneven spread. scheduling_node_id = self_scheduling_node_id_; } @@ -472,21 +477,21 @@ void LocalTaskManager::SpillWaitingTasks() { if (!scheduling_node_id.IsNil() && scheduling_node_id != self_scheduling_node_id_) { NodeID node_id = NodeID::FromBinary(scheduling_node_id.Binary()); Spillback(node_id, *it); - if (!spec.GetDependencies().empty()) { - task_dependency_manager_.RemoveTaskDependencies(spec.TaskId()); + if (!lease_spec.GetDependencies().empty()) { + lease_dependency_manager_.RemoveLeaseDependencies(lease_id); } - num_waiting_task_spilled_++; - waiting_tasks_index_.erase(task_id); - it = waiting_task_queue_.erase(it); + num_waiting_lease_spilled_++; + waiting_leases_index_.erase(lease_id); + it = waiting_lease_queue_.erase(it); } else { if (scheduling_node_id.IsNil()) { - RAY_LOG(DEBUG) << "RayTask " << task_id + RAY_LOG(DEBUG) << "RayLease " << lease_id << " has blocked dependencies, but no other node has resources, " - "keeping the task local"; + "keeping the lease local"; } else { - RAY_LOG(DEBUG) << "Keeping waiting task " << task_id << " local"; + RAY_LOG(DEBUG) << "Keeping waiting lease " << lease_id << " local"; } - // We should keep the task local. Note that an earlier task in the queue + // We should keep the lease local. 
Note that an earlier lease in the queue // may have different resource requirements and could actually be // scheduled on a remote node. break; @@ -494,9 +499,9 @@ void LocalTaskManager::SpillWaitingTasks() { } } -bool LocalTaskManager::TrySpillback(const std::shared_ptr &work, - bool &is_infeasible) { - const auto &spec = work->task.GetTaskSpecification(); +bool LocalLeaseManager::TrySpillback(const std::shared_ptr &work, + bool &is_infeasible) { + const auto &spec = work->lease_.GetLeaseSpecification(); auto scheduling_node_id = cluster_resource_scheduler_.GetBestSchedulableNode( spec, // We should prefer to stay local if possible @@ -514,31 +519,31 @@ bool LocalTaskManager::TrySpillback(const std::shared_ptr &work, NodeID node_id = NodeID::FromBinary(scheduling_node_id.Binary()); Spillback(node_id, work); - num_unschedulable_task_spilled_++; + num_unschedulable_lease_spilled_++; if (!spec.GetDependencies().empty()) { - task_dependency_manager_.RemoveTaskDependencies(spec.TaskId()); + lease_dependency_manager_.RemoveLeaseDependencies(spec.LeaseId()); } return true; } -bool LocalTaskManager::PoppedWorkerHandler( +bool LocalLeaseManager::PoppedWorkerHandler( const std::shared_ptr worker, PopWorkerStatus status, - const TaskID &task_id, + const LeaseID &lease_id, SchedulingClass scheduling_class, const std::shared_ptr &work, bool is_detached_actor, const rpc::Address &owner_address, const std::string &runtime_env_setup_error_message) { - const auto &reply = work->reply; - const auto &callback = work->callback; + const auto &reply = work->reply_; + const auto &callback = work->callback_; const bool canceled = work->GetState() == internal::WorkStatus::CANCELLED; - const auto &task = work->task; - bool dispatched = false; + const auto &lease = work->lease_; + bool granted = false; if (!canceled) { const auto &required_resource = - task.GetTaskSpecification().GetRequiredResources().GetResourceMap(); + 
lease.GetLeaseSpecification().GetRequiredResources().GetResourceMap(); for (auto &entry : required_resource) { // This is to make sure PG resource is not deleted during popping worker // unless the lease request is cancelled. @@ -548,61 +553,63 @@ bool LocalTaskManager::PoppedWorkerHandler( } } - // Erases the work from task_to_dispatch_ queue, also removes the task dependencies. + // Erases the work from lease_to_grant_ queue, also removes the lease dependencies. // // IDEA(ryw): Make an RAII class to wrap the a shared_ptr and - // requests task dependency upon ctor, and remove task dependency upon dtor. - // I tried this, it works, but we expose the map via GetTaskToDispatch() used in + // requests lease dependency upon ctor, and remove lease dependency upon dtor. + // I tried this, it works, but we expose the map via GetLeasesToGrant() used in // scheduler_resource_reporter.cc. Maybe we can use `boost::any_range` to only expose // a view of the Work ptrs, but I got dependency issues // (can't include boost/range/any_range.hpp). 
- auto erase_from_dispatch_queue_fn = [this](const std::shared_ptr &work, - const SchedulingClass &scheduling_class) { - auto shapes_it = tasks_to_dispatch_.find(scheduling_class); - RAY_CHECK(shapes_it != tasks_to_dispatch_.end()); - auto &dispatch_queue = shapes_it->second; - bool erased = false; - for (auto work_it = dispatch_queue.begin(); work_it != dispatch_queue.end(); - work_it++) { - if (*work_it == work) { - dispatch_queue.erase(work_it); - erased = true; - break; - } - } - if (dispatch_queue.empty()) { - tasks_to_dispatch_.erase(shapes_it); - } - RAY_CHECK(erased); + auto erase_from_leases_to_grant_queue_fn = + [this](const std::shared_ptr &work_to_erase, + const SchedulingClass &_scheduling_class) { + auto shapes_it = leases_to_grant_.find(_scheduling_class); + RAY_CHECK(shapes_it != leases_to_grant_.end()); + auto &leases_to_grant_queue = shapes_it->second; + bool erased = false; + for (auto work_it = leases_to_grant_queue.begin(); + work_it != leases_to_grant_queue.end(); + work_it++) { + if (*work_it == work_to_erase) { + leases_to_grant_queue.erase(work_it); + erased = true; + break; + } + } + if (leases_to_grant_queue.empty()) { + leases_to_grant_.erase(shapes_it); + } + RAY_CHECK(erased); - const auto &task = work->task; - if (!task.GetDependencies().empty()) { - task_dependency_manager_.RemoveTaskDependencies( - task.GetTaskSpecification().TaskId()); - } - }; + const auto &_lease = work_to_erase->lease_; + if (!_lease.GetLeaseSpecification().GetDependencies().empty()) { + lease_dependency_manager_.RemoveLeaseDependencies( + _lease.GetLeaseSpecification().LeaseId()); + } + }; if (canceled) { // Task has been canceled. - RAY_LOG(DEBUG) << "Task " << task_id << " has been canceled when worker popped"; - RemoveFromRunningTasksIfExists(task); - // All the cleaning work has been done when canceled task. 
Just return + RAY_LOG(DEBUG) << "Lease " << lease_id << " has been canceled when worker popped"; + RemoveFromGrantedLeasesIfExists(lease); + // All the cleaning work has been done when canceled lease. Just return // false without doing anything. return false; } if (!worker) { - dispatched = false; + granted = false; // We've already acquired resources so we need to release them. cluster_resource_scheduler_.GetLocalResourceManager().ReleaseWorkerResources( - work->allocated_instances); - work->allocated_instances = nullptr; + work->allocated_instances_); + work->allocated_instances_ = nullptr; // Release pinned task args. - ReleaseTaskArgs(task_id); - RemoveFromRunningTasksIfExists(task); + ReleaseLeaseArgs(lease_id); + RemoveFromGrantedLeasesIfExists(lease); // Empty worker popped. - RAY_LOG(DEBUG).WithField(task_id) + RAY_LOG(DEBUG).WithField(lease_id) << "This node has available resources, but no worker processes " "to grant the lease: status " << status; @@ -611,17 +618,17 @@ bool LocalTaskManager::PoppedWorkerHandler( // directly and raise a `RuntimeEnvSetupError` exception to user // eventually. The task will be removed from dispatch queue in // `CancelTask`. - CancelTasks( - [task_id](const auto &work) { - return task_id == work->task.GetTaskSpecification().TaskId(); + CancelLeases( + [lease_id](const auto &w) { + return lease_id == w->lease_.GetLeaseSpecification().LeaseId(); }, rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_RUNTIME_ENV_SETUP_FAILED, /*scheduling_failure_message*/ runtime_env_setup_error_message); } else if (status == PopWorkerStatus::JobFinished) { // The task job finished. // Just remove the task from dispatch queue. 
- RAY_LOG(DEBUG) << "Call back to a job finished task, task id = " << task_id; - erase_from_dispatch_queue_fn(work, scheduling_class); + RAY_LOG(DEBUG) << "Call back to a job finished lease, lease id = " << lease_id; + erase_from_leases_to_grant_queue_fn(work, scheduling_class); } else { // In other cases, set the work status `WAITING` to make this task // could be re-dispatched. @@ -638,38 +645,38 @@ bool LocalTaskManager::PoppedWorkerHandler( work->SetStateWaiting(cause); } } else { - // A worker has successfully popped for a valid task. Dispatch the task to + // A worker has successfully popped for a valid lease. Grant the lease to // the worker. - RAY_LOG(DEBUG) << "Dispatching task " << task_id << " to worker " + RAY_LOG(DEBUG) << "Granting lease " << lease_id << " to worker " << worker->WorkerId(); - Dispatch(worker, leased_workers_, work->allocated_instances, task, reply, callback); - erase_from_dispatch_queue_fn(work, scheduling_class); - dispatched = true; + Grant(worker, leased_workers_, work->allocated_instances_, lease, reply, callback); + erase_from_leases_to_grant_queue_fn(work, scheduling_class); + granted = true; } - return dispatched; + return granted; } -void LocalTaskManager::Spillback(const NodeID &spillback_to, - const std::shared_ptr &work) { - auto send_reply_callback = work->callback; +void LocalLeaseManager::Spillback(const NodeID &spillback_to, + const std::shared_ptr &work) { + auto send_reply_callback = work->callback_; - if (work->grant_or_reject) { - work->reply->set_rejected(true); + if (work->grant_or_reject_) { + work->reply_->set_rejected(true); send_reply_callback(); return; } - num_task_spilled_++; - const auto &task = work->task; - const auto &task_spec = task.GetTaskSpecification(); - RAY_LOG(DEBUG) << "Spilling task " << task_spec.TaskId() << " to node " << spillback_to; + num_lease_spilled_++; + const auto &lease_spec = work->lease_.GetLeaseSpecification(); + RAY_LOG(DEBUG) << "Spilling lease " << lease_spec.LeaseId() << " to 
node " + << spillback_to; if (!cluster_resource_scheduler_.AllocateRemoteTaskResources( scheduling::NodeID(spillback_to.Binary()), - task_spec.GetRequiredResources().GetResourceMap())) { - RAY_LOG(DEBUG) << "Tried to allocate resources for request " << task_spec.TaskId() + lease_spec.GetRequiredResources().GetResourceMap())) { + RAY_LOG(DEBUG) << "Tried to allocate resources for request " << lease_spec.LeaseId() << " on a remote node that are no longer available"; } @@ -677,83 +684,83 @@ void LocalTaskManager::Spillback(const NodeID &spillback_to, RAY_CHECK(node_info_ptr) << "Spilling back to a node manager, but no GCS info found for node " << spillback_to; - auto reply = work->reply; + auto reply = work->reply_; reply->mutable_retry_at_raylet_address()->set_ip_address( node_info_ptr->node_manager_address()); reply->mutable_retry_at_raylet_address()->set_port(node_info_ptr->node_manager_port()); - reply->mutable_retry_at_raylet_address()->set_raylet_id(spillback_to.Binary()); + reply->mutable_retry_at_raylet_address()->set_node_id(spillback_to.Binary()); send_reply_callback(); } -void LocalTaskManager::TasksUnblocked(const std::vector &ready_ids) { +void LocalLeaseManager::LeasesUnblocked(const std::vector &ready_ids) { if (ready_ids.empty()) { return; } - for (const auto &task_id : ready_ids) { - auto it = waiting_tasks_index_.find(task_id); - if (it != waiting_tasks_index_.end()) { + for (const auto &lease_id : ready_ids) { + auto it = waiting_leases_index_.find(lease_id); + if (it != waiting_leases_index_.end()) { auto work = *it->second; - const auto &task = work->task; - const auto &scheduling_key = task.GetTaskSpecification().GetSchedulingClass(); - RAY_LOG(DEBUG) << "Args ready, task can be dispatched " - << task.GetTaskSpecification().TaskId(); - tasks_to_dispatch_[scheduling_key].push_back(work); - waiting_task_queue_.erase(it->second); - waiting_tasks_index_.erase(it); + const auto &lease = work->lease_; + const auto &scheduling_key = 
lease.GetLeaseSpecification().GetSchedulingClass(); + RAY_LOG(DEBUG) << "Args ready, lease can be granted " + << lease.GetLeaseSpecification().LeaseId(); + leases_to_grant_[scheduling_key].push_back(work); + waiting_lease_queue_.erase(it->second); + waiting_leases_index_.erase(it); } } - ScheduleAndDispatchTasks(); + ScheduleAndGrantLeases(); } -void LocalTaskManager::RemoveFromRunningTasksIfExists(const RayTask &task) { - auto sched_cls = task.GetTaskSpecification().GetSchedulingClass(); +void LocalLeaseManager::RemoveFromGrantedLeasesIfExists(const RayLease &lease) { + auto sched_cls = lease.GetLeaseSpecification().GetSchedulingClass(); auto it = info_by_sched_cls_.find(sched_cls); if (it != info_by_sched_cls_.end()) { - // TODO(hjiang): After remove the task id from `running_tasks`, corresponding cgroup + // TODO(hjiang): After remove the lease id from `granted_leases`, corresponding cgroup // will be updated. - it->second.running_tasks.erase(task.GetTaskSpecification().TaskId()); - if (it->second.running_tasks.size() == 0) { + it->second.granted_leases.erase(lease.GetLeaseSpecification().LeaseId()); + if (it->second.granted_leases.size() == 0) { info_by_sched_cls_.erase(it); } } } -void LocalTaskManager::TaskFinished(std::shared_ptr worker, - RayTask *task) { - RAY_CHECK(worker != nullptr && task != nullptr); - *task = worker->GetAssignedTask(); - RemoveFromRunningTasksIfExists(*task); +void LocalLeaseManager::CleanupLease(std::shared_ptr worker, + RayLease *lease) { + RAY_CHECK(worker != nullptr && lease != nullptr); + *lease = worker->GetGrantedLease(); + RemoveFromGrantedLeasesIfExists(*lease); - ReleaseTaskArgs(task->GetTaskSpecification().TaskId()); + ReleaseLeaseArgs(lease->GetLeaseSpecification().LeaseId()); if (worker->GetAllocatedInstances() != nullptr) { ReleaseWorkerResources(worker); } } -// TODO(scv119): task args related logic probaly belongs task dependency manager. 
-bool LocalTaskManager::PinTaskArgsIfMemoryAvailable(const TaskSpecification &spec, - bool *args_missing) { +// TODO(scv119): lease args related logic probaly belongs lease dependency manager. +bool LocalLeaseManager::PinLeaseArgsIfMemoryAvailable( + const LeaseSpecification &lease_spec, bool *args_missing) { std::vector> args; - const auto &deps = spec.GetDependencyIds(); + const auto &deps = lease_spec.GetDependencyIds(); if (!deps.empty()) { // This gets refs to the arguments stored in plasma. The refs should be // deleted once we no longer need to pin the arguments. - if (!get_task_arguments_(deps, &args)) { + if (!get_lease_arguments_(deps, &args)) { *args_missing = true; return false; } for (size_t i = 0; i < deps.size(); i++) { if (args[i] == nullptr) { - // This can happen if the task's arguments were all local at some - // point, but then at least one was evicted before the task could - // be dispatched to a worker. + // This can happen if the lease's arguments were all local at some + // point, but then at least one was evicted before the lease could + // be granted to a worker. RAY_LOG(DEBUG) - << "RayTask " << spec.TaskId() << " argument " << deps[i] - << " was evicted before the task could be dispatched. This can happen " - "when there are many objects needed on this node. The task will be " - "scheduled once all of its dependencies are local."; + << "RayLease " << lease_spec.LeaseId() << " argument " << deps[i] + << " was evicted before the lease could be granted. This can happen " + "when there are many objects needed on this node. 
The lease will be " + "granted once all of its dependencies are local."; *args_missing = true; return false; } @@ -761,76 +768,78 @@ bool LocalTaskManager::PinTaskArgsIfMemoryAvailable(const TaskSpecification &spe } *args_missing = false; - size_t task_arg_bytes = 0; + size_t lease_arg_bytes = 0; for (auto &arg : args) { - task_arg_bytes += arg->GetSize(); + lease_arg_bytes += arg->GetSize(); } - RAY_LOG(DEBUG) << "RayTask " << spec.TaskId() << " has args of size " << task_arg_bytes; - PinTaskArgs(spec, std::move(args)); - RAY_LOG(DEBUG) << "Size of pinned task args is now " << pinned_task_arguments_bytes_; - if (max_pinned_task_arguments_bytes_ == 0) { + RAY_LOG(DEBUG) << "RayLease " << lease_spec.LeaseId() << " has args of size " + << lease_arg_bytes; + PinLeaseArgs(lease_spec, std::move(args)); + RAY_LOG(DEBUG) << "Size of pinned task args is now " << pinned_lease_arguments_bytes_; + if (max_pinned_lease_arguments_bytes_ == 0) { // Max threshold for pinned args is not set. return true; } - if (task_arg_bytes > max_pinned_task_arguments_bytes_) { + if (lease_arg_bytes > max_pinned_lease_arguments_bytes_) { RAY_LOG(WARNING) - << "Dispatched task " << spec.TaskId() << " has arguments of size " - << task_arg_bytes - << ", but the max memory allowed for arguments of executing tasks is only " - << max_pinned_task_arguments_bytes_; - } else if (pinned_task_arguments_bytes_ > max_pinned_task_arguments_bytes_) { - ReleaseTaskArgs(spec.TaskId()); - RAY_LOG(DEBUG) << "Cannot dispatch task " << spec.TaskId() - << " with arguments of size " << task_arg_bytes - << " current pinned bytes is " << pinned_task_arguments_bytes_; + << "Granted lease " << lease_spec.LeaseId() << " has arguments of size " + << lease_arg_bytes + << ", but the max memory allowed for arguments of granted leases is only " + << max_pinned_lease_arguments_bytes_; + } else if (pinned_lease_arguments_bytes_ > max_pinned_lease_arguments_bytes_) { + ReleaseLeaseArgs(lease_spec.LeaseId()); + RAY_LOG(DEBUG) << 
"Cannot grant lease " << lease_spec.LeaseId() + << " with arguments of size " << lease_arg_bytes + << " current pinned bytes is " << pinned_lease_arguments_bytes_; return false; } return true; } -void LocalTaskManager::PinTaskArgs(const TaskSpecification &spec, - std::vector> args) { - const auto &deps = spec.GetDependencyIds(); +void LocalLeaseManager::PinLeaseArgs(const LeaseSpecification &lease_spec, + std::vector> args) { + const auto &deps = lease_spec.GetDependencyIds(); // TODO(swang): This should really be an assertion, but we can sometimes - // receive a duplicate task request if there is a failure and the original - // version of the task has not yet been canceled. - auto executed_task_inserted = executing_task_args_.emplace(spec.TaskId(), deps).second; - if (executed_task_inserted) { + // receive a duplicate lease request if there is a failure and the original + // version of the lease has not yet been canceled. + auto executed_lease_inserted = + granted_lease_args_.emplace(lease_spec.LeaseId(), deps).second; + if (executed_lease_inserted) { for (size_t i = 0; i < deps.size(); i++) { - auto [it, pinned_task_inserted] = - pinned_task_arguments_.emplace(deps[i], std::make_pair(std::move(args[i]), 0)); - if (pinned_task_inserted) { - // This is the first task that needed this argument. - pinned_task_arguments_bytes_ += it->second.first->GetSize(); + auto [it, pinned_lease_inserted] = + pinned_lease_arguments_.emplace(deps[i], std::make_pair(std::move(args[i]), 0)); + if (pinned_lease_inserted) { + // This is the first lease that needed this argument. 
+ pinned_lease_arguments_bytes_ += it->second.first->GetSize(); } it->second.second++; } } else { - RAY_LOG(DEBUG) << "Scheduler received duplicate task " << spec.TaskId() + RAY_LOG(DEBUG) << "Scheduler received duplicate lease " << lease_spec.LeaseId() << ", most likely because the first execution failed"; } } -void LocalTaskManager::ReleaseTaskArgs(const TaskID &task_id) { - auto it = executing_task_args_.find(task_id); +void LocalLeaseManager::ReleaseLeaseArgs(const LeaseID &lease_id) { + auto it = granted_lease_args_.find(lease_id); // TODO(swang): This should really be an assertion, but we can sometimes - // receive a duplicate task request if there is a failure and the original - // version of the task has not yet been canceled. - if (it != executing_task_args_.end()) { + // receive a duplicate lease request if there is a failure and the original + // version of the lease has not yet been canceled. + if (it != granted_lease_args_.end()) { for (auto &arg : it->second) { - auto arg_it = pinned_task_arguments_.find(arg); - RAY_CHECK(arg_it != pinned_task_arguments_.end()); + auto arg_it = pinned_lease_arguments_.find(arg); + RAY_CHECK(arg_it != pinned_lease_arguments_.end()); RAY_CHECK(arg_it->second.second > 0); arg_it->second.second--; if (arg_it->second.second == 0) { - // This is the last task that needed this argument. - pinned_task_arguments_bytes_ -= arg_it->second.first->GetSize(); - pinned_task_arguments_.erase(arg_it); + // This is the last lease that needed this argument. 
+ pinned_lease_arguments_bytes_ -= arg_it->second.first->GetSize(); + pinned_lease_arguments_.erase(arg_it); } } - executing_task_args_.erase(it); + granted_lease_args_.erase(it); } } @@ -838,8 +847,8 @@ namespace { void ReplyCancelled(const std::shared_ptr &work, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, const std::string &scheduling_failure_message) { - auto reply = work->reply; - auto callback = work->callback; + auto reply = work->reply_; + auto callback = work->callback_; reply->set_canceled(true); reply->set_failure_type(failure_type); reply->set_scheduling_failure_message(scheduling_failure_message); @@ -847,31 +856,31 @@ void ReplyCancelled(const std::shared_ptr &work, } } // namespace -bool LocalTaskManager::CancelTasks( +bool LocalLeaseManager::CancelLeases( std::function &)> predicate, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, const std::string &scheduling_failure_message) { bool tasks_cancelled = false; ray::erase_if>( - tasks_to_dispatch_, [&](const std::shared_ptr &work) { + leases_to_grant_, [&](const std::shared_ptr &work) { if (!predicate(work)) { return false; } - CancelTaskToDispatch(work, failure_type, scheduling_failure_message); + CancelLeaseToGrant(work, failure_type, scheduling_failure_message); tasks_cancelled = true; return true; }); ray::erase_if>( - waiting_task_queue_, [&](const std::shared_ptr &work) { + waiting_lease_queue_, [&](const std::shared_ptr &work) { if (predicate(work)) { ReplyCancelled(work, failure_type, scheduling_failure_message); - if (!work->task.GetTaskSpecification().GetDependencies().empty()) { - task_dependency_manager_.RemoveTaskDependencies( - work->task.GetTaskSpecification().TaskId()); + if (!work->lease_.GetLeaseSpecification().GetDependencies().empty()) { + lease_dependency_manager_.RemoveLeaseDependencies( + work->lease_.GetLeaseSpecification().LeaseId()); } - waiting_tasks_index_.erase(work->task.GetTaskSpecification().TaskId()); + 
waiting_leases_index_.erase(work->lease_.GetLeaseSpecification().LeaseId()); tasks_cancelled = true; return true; } else { @@ -882,39 +891,39 @@ bool LocalTaskManager::CancelTasks( return tasks_cancelled; } -void LocalTaskManager::CancelTaskToDispatch( +void LocalLeaseManager::CancelLeaseToGrant( const std::shared_ptr &work, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, const std::string &scheduling_failure_message) { - const TaskID task_id = work->task.GetTaskSpecification().TaskId(); - RAY_LOG(DEBUG) << "Canceling task " << task_id << " from dispatch queue."; + const LeaseID lease_id = work->lease_.GetLeaseSpecification().LeaseId(); + RAY_LOG(DEBUG) << "Canceling lease " << lease_id << " from leases_to_grant_queue."; ReplyCancelled(work, failure_type, scheduling_failure_message); if (work->GetState() == internal::WorkStatus::WAITING_FOR_WORKER) { // We've already acquired resources so we need to release them. cluster_resource_scheduler_.GetLocalResourceManager().ReleaseWorkerResources( - work->allocated_instances); - // Release pinned task args. - ReleaseTaskArgs(task_id); + work->allocated_instances_); + // Release pinned lease args. + ReleaseLeaseArgs(lease_id); } - if (!work->task.GetTaskSpecification().GetDependencies().empty()) { - task_dependency_manager_.RemoveTaskDependencies( - work->task.GetTaskSpecification().TaskId()); + if (!work->lease_.GetLeaseSpecification().GetDependencies().empty()) { + lease_dependency_manager_.RemoveLeaseDependencies( + work->lease_.GetLeaseSpecification().LeaseId()); } - RemoveFromRunningTasksIfExists(work->task); + RemoveFromGrantedLeasesIfExists(work->lease_); work->SetStateCancelled(); } -const RayTask *LocalTaskManager::AnyPendingTasksForResourceAcquisition( - int *num_pending_actor_creation, int *num_pending_tasks) const { - const RayTask *exemplar = nullptr; - // We are guaranteed that these tasks are blocked waiting for resources after a - // call to ScheduleAndDispatchTasks(). 
They may be waiting for workers as well, but +const RayLease *LocalLeaseManager::AnyPendingLeasesForResourceAcquisition( + int *num_pending_actor_creation, int *num_pending_leases) const { + const RayLease *exemplar = nullptr; + // We are guaranteed that these leases are blocked waiting for resources after a + // call to ScheduleAndGrantLeases(). They may be waiting for workers as well, but // this should be a transient condition only. - for (const auto &shapes_it : tasks_to_dispatch_) { + for (const auto &shapes_it : leases_to_grant_) { auto &work_queue = shapes_it.second; for (const auto &work_it : work_queue) { const auto &work = *work_it; - const auto &task = work_it->task; + const auto &lease = work_it->lease_; // If the work is not in the waiting state, it will be scheduled soon or won't be // scheduled. Consider as non-pending. @@ -933,52 +942,52 @@ const RayTask *LocalTaskManager::AnyPendingTasksForResourceAcquisition( continue; } - if (task.GetTaskSpecification().IsActorCreationTask()) { + if (lease.GetLeaseSpecification().IsActorCreationTask()) { *num_pending_actor_creation += 1; } else { - *num_pending_tasks += 1; + *num_pending_leases += 1; } if (exemplar == nullptr) { - exemplar = &task; + exemplar = &lease; } } } return exemplar; } -void LocalTaskManager::Dispatch( +void LocalLeaseManager::Grant( std::shared_ptr worker, - absl::flat_hash_map> &leased_workers, + absl::flat_hash_map> &leased_workers, const std::shared_ptr &allocated_instances, - const RayTask &task, + const RayLease &lease, rpc::RequestWorkerLeaseReply *reply, std::function send_reply_callback) { - const auto &task_spec = task.GetTaskSpecification(); + const auto &lease_spec = lease.GetLeaseSpecification(); - if (task_spec.IsActorCreationTask()) { + if (lease_spec.IsActorCreationTask()) { // The actor belongs to this worker now. 
worker->SetLifetimeAllocatedInstances(allocated_instances); } else { worker->SetAllocatedInstances(allocated_instances); } - worker->SetAssignedTask(task); + worker->GrantLease(lease); // Pass the contact info of the worker to use. reply->set_worker_pid(worker->GetProcess().GetId()); reply->mutable_worker_address()->set_ip_address(worker->IpAddress()); reply->mutable_worker_address()->set_port(worker->Port()); reply->mutable_worker_address()->set_worker_id(worker->WorkerId().Binary()); - reply->mutable_worker_address()->set_raylet_id(self_node_id_.Binary()); + reply->mutable_worker_address()->set_node_id(self_node_id_.Binary()); - RAY_CHECK(leased_workers.find(worker->WorkerId()) == leased_workers.end()); - leased_workers[worker->WorkerId()] = worker; + RAY_CHECK(!leased_workers.contains(lease_spec.LeaseId())); + leased_workers[lease_spec.LeaseId()] = worker; cluster_resource_scheduler_.GetLocalResourceManager().SetBusyFootprint( WorkFootprint::NODE_WORKERS); // Update our internal view of the cluster state. 
std::shared_ptr allocated_resources; - if (task_spec.IsActorCreationTask()) { + if (lease_spec.IsActorCreationTask()) { allocated_resources = worker->GetLifetimeAllocatedInstances(); } else { allocated_resources = worker->GetAllocatedInstances(); @@ -1005,7 +1014,7 @@ void LocalTaskManager::Dispatch( send_reply_callback(); } -void LocalTaskManager::ClearWorkerBacklog(const WorkerID &worker_id) { +void LocalLeaseManager::ClearWorkerBacklog(const WorkerID &worker_id) { for (auto it = backlog_tracker_.begin(); it != backlog_tracker_.end();) { it->second.erase(worker_id); if (it->second.empty()) { @@ -1016,9 +1025,9 @@ void LocalTaskManager::ClearWorkerBacklog(const WorkerID &worker_id) { } } -void LocalTaskManager::SetWorkerBacklog(SchedulingClass scheduling_class, - const WorkerID &worker_id, - int64_t backlog_size) { +void LocalLeaseManager::SetWorkerBacklog(SchedulingClass scheduling_class, + const WorkerID &worker_id, + int64_t backlog_size) { if (backlog_size == 0) { backlog_tracker_[scheduling_class].erase(worker_id); if (backlog_tracker_[scheduling_class].empty()) { @@ -1029,7 +1038,7 @@ void LocalTaskManager::SetWorkerBacklog(SchedulingClass scheduling_class, } } -void LocalTaskManager::ReleaseWorkerResources(std::shared_ptr worker) { +void LocalLeaseManager::ReleaseWorkerResources(std::shared_ptr worker) { RAY_CHECK(worker != nullptr); auto allocated_instances = worker->GetAllocatedInstances() ? 
worker->GetAllocatedInstances() @@ -1061,7 +1070,7 @@ void LocalTaskManager::ReleaseWorkerResources(std::shared_ptr w worker->ClearLifetimeAllocatedInstances(); } -bool LocalTaskManager::ReleaseCpuResourcesFromBlockedWorker( +bool LocalLeaseManager::ReleaseCpuResourcesFromBlockedWorker( std::shared_ptr worker) { if (!worker || worker->IsBlocked()) { return false; @@ -1090,7 +1099,7 @@ bool LocalTaskManager::ReleaseCpuResourcesFromBlockedWorker( } } -bool LocalTaskManager::ReturnCpuResourcesToUnblockedWorker( +bool LocalLeaseManager::ReturnCpuResourcesToUnblockedWorker( std::shared_ptr worker) { if (!worker || !worker->IsBlocked()) { return false; @@ -1122,18 +1131,17 @@ bool LocalTaskManager::ReturnCpuResourcesToUnblockedWorker( } } -ResourceSet LocalTaskManager::CalcNormalTaskResources() const { +ResourceSet LocalLeaseManager::CalcNormalTaskResources() const { ResourceSet total_normal_task_resources; for (auto &entry : leased_workers_) { std::shared_ptr worker = entry.second; - auto &task_spec = worker->GetAssignedTask().GetTaskSpecification(); - if (!task_spec.PlacementGroupBundleId().first.IsNil()) { + auto &lease_spec = worker->GetGrantedLease().GetLeaseSpecification(); + if (!lease_spec.PlacementGroupBundleId().first.IsNil()) { continue; } - auto task_id = worker->GetAssignedTaskId(); - auto actor_id = task_id.ActorId(); - if (!actor_id.IsNil() && task_id == TaskID::ForActorCreationTask(actor_id)) { + auto actor_id = worker->GetActorId(); + if (!actor_id.IsNil() && lease_spec.IsActorCreationTask()) { // This task ID corresponds to an actor creation task. 
continue; } @@ -1154,9 +1162,9 @@ ResourceSet LocalTaskManager::CalcNormalTaskResources() const { return total_normal_task_resources; } -uint64_t LocalTaskManager::MaxRunningTasksPerSchedulingClass( +uint64_t LocalLeaseManager::MaxGrantedLeasesPerSchedulingClass( SchedulingClass sched_cls_id) const { - auto sched_cls = TaskSpecification::GetSchedulingClassDescriptor(sched_cls_id); + auto sched_cls = SchedulingClassToIds::GetSchedulingClassDescriptor(sched_cls_id); double cpu_req = sched_cls.resource_set.Get(ResourceID::CPU()).Double(); uint64_t total_cpus = cluster_resource_scheduler_.GetLocalResourceManager().GetNumCpus(); @@ -1167,23 +1175,24 @@ uint64_t LocalTaskManager::MaxRunningTasksPerSchedulingClass( return static_cast(std::round(total_cpus / cpu_req)); } -void LocalTaskManager::RecordMetrics() const { - ray::stats::STATS_scheduler_tasks.Record(executing_task_args_.size(), "Executing"); - ray::stats::STATS_scheduler_tasks.Record(waiting_tasks_index_.size(), "Waiting"); +void LocalLeaseManager::RecordMetrics() const { + ray::stats::STATS_scheduler_tasks.Record(granted_lease_args_.size(), "Executing"); + ray::stats::STATS_scheduler_tasks.Record(waiting_leases_index_.size(), "Waiting"); } -void LocalTaskManager::DebugStr(std::stringstream &buffer) const { - buffer << "Waiting tasks size: " << waiting_tasks_index_.size() << "\n"; - buffer << "Number of executing tasks: " << executing_task_args_.size() << "\n"; - buffer << "Number of pinned task arguments: " << pinned_task_arguments_.size() << "\n"; - buffer << "Number of total spilled tasks: " << num_task_spilled_ << "\n"; - buffer << "Number of spilled waiting tasks: " << num_waiting_task_spilled_ << "\n"; - buffer << "Number of spilled unschedulable tasks: " << num_unschedulable_task_spilled_ +void LocalLeaseManager::DebugStr(std::stringstream &buffer) const { + buffer << "Waiting leases size: " << waiting_leases_index_.size() << "\n"; + buffer << "Number of granted lease arguments: " << 
granted_lease_args_.size() << "\n"; + buffer << "Number of pinned lease arguments: " << pinned_lease_arguments_.size() + << "\n"; + buffer << "Number of total spilled leases: " << num_lease_spilled_ << "\n"; + buffer << "Number of spilled waiting leases: " << num_waiting_lease_spilled_ << "\n"; + buffer << "Number of spilled unschedulable leases: " << num_unschedulable_lease_spilled_ << "\n"; buffer << "Resource usage {\n"; - // Calculates how much resources are occupied by tasks or actors. - // Only iterate upto this number to avoid excessive CPU usage. + // Calculates how much resources are occupied by leases. + // Only iterate up to this number to avoid excessive CPU usage. auto max_iteration = RayConfig::instance().worker_max_resource_analysis_iteration(); uint32_t iteration = 0; for (const auto &worker : worker_pool_.GetAllRegisteredWorkers( @@ -1193,24 +1202,26 @@ void LocalTaskManager::DebugStr(std::stringstream &buffer) const { } if (worker->IsDead() // worker is dead || worker->IsBlocked() // worker is blocked by blocking Ray API - || (worker->GetAssignedTaskId().IsNil() && - worker->GetActorId().IsNil())) { // Tasks or actors not assigned + || (worker->GetGrantedLeaseId().IsNil() && + worker->GetActorId().IsNil())) { // Lease not assigned + // TODO(#55923) probably don't need the above check for ActorId since LeaseId is not + // reset for actors either // Then this shouldn't have allocated resources. 
continue; } - const auto &task_or_actor_name = worker->GetAssignedTask() - .GetTaskSpecification() + const auto &task_or_actor_name = worker->GetGrantedLease() + .GetLeaseSpecification() .FunctionDescriptor() ->CallString(); buffer << " - (language=" << rpc::Language_descriptor()->FindValueByNumber(worker->GetLanguage())->name() << " " - << "actor_or_task=" << task_or_actor_name << " " + << "actor_or_task=" << task_or_actor_name << " " << "pid=" << worker->GetProcess().GetId() << " " << "worker_id=" << worker->WorkerId() << "): " - << worker->GetAssignedTask() - .GetTaskSpecification() + << worker->GetGrantedLease() + .GetLeaseSpecification() .GetRequiredResources() .DebugString() << "\n"; @@ -1218,7 +1229,8 @@ void LocalTaskManager::DebugStr(std::stringstream &buffer) const { buffer << "}\n"; buffer << "Backlog Size per scheduling descriptor :{workerId: num backlogs}:\n"; for (const auto &[sched_cls, worker_to_backlog_size] : backlog_tracker_) { - const auto &descriptor = TaskSpecification::GetSchedulingClassDescriptor(sched_cls); + const auto &descriptor = + SchedulingClassToIds::GetSchedulingClassDescriptor(sched_cls); buffer << "\t" << descriptor.ResourceSetStr() << ": {\n"; for (const auto &[worker_id, backlog_size] : worker_to_backlog_size) { buffer << "\t\t" << worker_id << ": " << backlog_size << "\n"; @@ -1226,13 +1238,14 @@ void LocalTaskManager::DebugStr(std::stringstream &buffer) const { buffer << "\t}\n"; } buffer << "\n"; - buffer << "Running tasks by scheduling class:\n"; + buffer << "Granted leases by scheduling class:\n"; for (const auto &pair : info_by_sched_cls_) { const auto &sched_cls = pair.first; const auto &info = pair.second; - const auto &descriptor = TaskSpecification::GetSchedulingClassDescriptor(sched_cls); - buffer << " - " << descriptor.DebugString() << ": " << info.running_tasks.size() + const auto &descriptor = + SchedulingClassToIds::GetSchedulingClassDescriptor(sched_cls); + buffer << " - " << descriptor.DebugString() << ": " << 
info.granted_leases.size() << "/" << info.capacity << "\n"; } } diff --git a/src/ray/raylet/local_task_manager.h b/src/ray/raylet/local_lease_manager.h similarity index 50% rename from src/ray/raylet/local_task_manager.h rename to src/ray/raylet/local_lease_manager.h index ebba83089d23..e3ce995650d0 100644 --- a/src/ray/raylet/local_task_manager.h +++ b/src/ray/raylet/local_lease_manager.h @@ -23,120 +23,122 @@ #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" +#include "ray/common/lease/lease.h" #include "ray/common/ray_object.h" -#include "ray/common/task/task.h" -#include "ray/raylet/dependency_manager.h" +#include "ray/raylet/lease_dependency_manager.h" #include "ray/raylet/scheduling/cluster_resource_scheduler.h" #include "ray/raylet/scheduling/internal.h" -#include "ray/raylet/scheduling/local_task_manager_interface.h" +#include "ray/raylet/scheduling/local_lease_manager_interface.h" #include "ray/raylet/worker.h" #include "ray/raylet/worker_pool.h" namespace ray { namespace raylet { -/// Manages the lifetime of a task on the local node. It receives request from -/// cluster_task_manager (the distributed scheduler) and does the following +/// Manages the lifetime of a lease on the local node. It receives request from +/// cluster_lease_manager (the distributed scheduler) and does the following /// steps: -/// 1. Pulling task dependencies, add the task into waiting queue. -/// 2. Once task's dependencies are all pulled locally, the task be added into -/// dispatch queue. -/// 3. For all tasks in dispatch queue, we schedule them by first acquiring -/// local resources (including pinning the objects in memory and deduct -/// cpu/gpu and other resources from local reosource manager)) . -/// If a task failed to acquire resources in step 3, we will try to -/// spill it to an different remote node. -/// 4. If all resources are acquired, we start a worker and returns the worker +/// 1. 
Pulling lease dependencies, add the lease into waiting queue. +/// 2. Once lease's dependencies are all pulled locally, the lease is added into +/// the grant queue. +/// 3. For all leases in the grant queue, we schedule them by first acquiring +/// local resources (including pinning the objects in memory and deducting +/// cpu/gpu and other resources from the local resource manager). +/// If a lease failed to acquire resources in step 3, we will try to +/// spill it to a different remote node. +/// 4. If all resources are acquired, we start a worker and return the worker /// address to the client once worker starts up. /// 5. When a worker finishes executing its task(s), the requester will return -/// it and we should release the resources in our view of the node's state. -/// 6. If a task has been waiting for arguments for too long, it will also be +/// the lease and we should release the resources in our view of the node's state. +/// 6. If a lease has been waiting for arguments for too long, it will also be /// spilled back to a different node. /// /// TODO(scv119): ideally, the local scheduler shouldn't be responsible for spilling, /// as it should return the request to the distributed scheduler if -/// resource accusition failed, or a task has arguments pending resolution for too long +/// resource acquisition failed, or a lease has arguments pending resolution for too long /// time. -class LocalTaskManager : public ILocalTaskManager { +class LocalLeaseManager : public LocalLeaseManagerInterface { public: + /// Create a local lease manager. /// \param self_node_id: ID of local node. /// \param cluster_resource_scheduler: The resource scheduler which contains /// the state of the cluster. - /// \param task_dependency_manager_ Used to fetch task's dependencies. + /// \param lease_dependency_manager: Used to fetch lease's dependencies. /// \param get_node_info: Function that returns the node info for a node. /// \param worker_pool: A reference to the worker pool. 
/// \param leased_workers: A reference to the leased workers map. - /// \param get_task_arguments: A callback for getting a tasks' arguments by + /// \param get_lease_arguments: A callback for getting a lease's arguments by /// their ids. - /// \param max_pinned_task_arguments_bytes: The cap on pinned arguments. + /// \param max_pinned_lease_arguments_bytes: The cap on pinned arguments. /// \param get_time_ms: A callback which returns the current time in milliseconds. /// \param sched_cls_cap_interval_ms: The time before we increase the cap - /// on the number of tasks that can run per + /// on the number of leases that can run per /// scheduling class. If set to 0, there is no /// cap. If it's a large number, the cap is hard. - LocalTaskManager( + LocalLeaseManager( const NodeID &self_node_id, ClusterResourceScheduler &cluster_resource_scheduler, - TaskDependencyManagerInterface &task_dependency_manager, + LeaseDependencyManagerInterface &lease_dependency_manager, internal::NodeInfoGetter get_node_info, WorkerPoolInterface &worker_pool, - absl::flat_hash_map> &leased_workers, + absl::flat_hash_map> &leased_workers, std::function &object_ids, std::vector> *results)> - get_task_arguments, - size_t max_pinned_task_arguments_bytes, + get_lease_arguments, + size_t max_pinned_lease_arguments_bytes, std::function get_time_ms = []() { return static_cast(absl::GetCurrentTimeNanos() / 1e6); }, int64_t sched_cls_cap_interval_ms = RayConfig::instance().worker_cap_initial_backoff_delay_ms()); - /// Queue task and schedule. - void QueueAndScheduleTask(std::shared_ptr work) override; + /// Queue lease and schedule. + void QueueAndScheduleLease(std::shared_ptr work) override; - // Schedule and dispatch tasks. - void ScheduleAndDispatchTasks() override; + // Schedule and grant leases. + void ScheduleAndGrantLeases() override; - /// Move tasks from waiting to ready for dispatch. Called when a task's + /// Move leases from waiting to ready to be granted. 
Called when a lease's /// dependencies are resolved. /// - /// \param ready_ids: The tasks which are now ready to be dispatched. - void TasksUnblocked(const std::vector &ready_ids) override; + /// \param ready_ids: The leases which are now ready to be granted. + void LeasesUnblocked(const std::vector &ready_ids) override; - /// Return the finished task and release the worker resources. + /// Cleanup the lease and release the worker resources. /// This method will be removed and can be replaced by `ReleaseWorkerResources` directly /// once we remove the legacy scheduler. /// - /// \param worker: The worker which was running the task. - /// \param task: Output parameter. - void TaskFinished(std::shared_ptr worker, RayTask *task) override; + /// \param worker: The worker which was granted the lease. + /// \param lease: Output parameter. + void CleanupLease(std::shared_ptr worker, RayLease *lease) override; - /// Attempt to cancel all queued tasks that match the predicate. + /// Attempt to cancel all queued leases that match the predicate. /// - /// \param predicate: A function that returns true if a task needs to be cancelled. + /// \param predicate: A function that returns true if a lease needs to be cancelled. /// \param failure_type: The reason for cancellation. /// \param scheduling_failure_message: The reason message for cancellation. - /// \return True if any task was successfully cancelled. - bool CancelTasks(std::function &)> predicate, - rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, - const std::string &scheduling_failure_message) override; + /// \return True if any lease was successfully cancelled. + bool CancelLeases( + std::function &)> predicate, + rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, + const std::string &scheduling_failure_message) override; - /// Return with an exemplar if any tasks are pending resource acquisition. + /// Return with an exemplar if any leases are pending resource acquisition. 
/// - /// \param[in,out] num_pending_actor_creation: Number of pending actor creation tasks. - /// \param[in,out] num_pending_tasks: Number of pending tasks. - /// \return An example task that is deadlocking if any tasks are pending resource + /// \param[in,out] num_pending_actor_creation: Number of pending actor creation leases. + /// \param[in,out] num_pending_leases: Number of pending leases. + /// \return An example lease that is deadlocking if any leases are pending resource /// acquisition. - const RayTask *AnyPendingTasksForResourceAcquisition( - int *num_pending_actor_creation, int *num_pending_tasks) const override; + const RayLease *AnyPendingLeasesForResourceAcquisition( + int *num_pending_actor_creation, int *num_pending_leases) const override; - /// Call once a task finishes (i.e. a worker is returned). + /// Call once a lease finishes (i.e. a worker is returned). /// - /// \param worker: The worker which was running the task. + /// \param worker: The worker which was granted the lease. void ReleaseWorkerResources(std::shared_ptr worker) override; - /// When a task is blocked in ray.get or ray.wait, the worker who is executing the task - /// should give up the CPU resources allocated for the running task for the time being - /// and the worker itself should also be marked as blocked. + /// When a lease is blocked in ray.get or ray.wait, the worker who is executing the + /// lease should give up the CPU resources allocated for the granted lease for the time + /// being and the worker itself should also be marked as blocked. /// /// \param worker: The worker who will give up the CPU resources. 
/// \return true if the cpu resources of the specified worker are released successfully, @@ -144,7 +146,7 @@ class LocalTaskManager : public ILocalTaskManager { bool ReleaseCpuResourcesFromBlockedWorker( std::shared_ptr worker) override; - /// When a task is no longer blocked in a ray.get or ray.wait, the CPU resources that + /// When a lease is no longer blocked in a ray.get or ray.wait, the CPU resources that /// the worker gave up should be returned to it. /// /// \param worker The blocked worker. @@ -165,8 +167,8 @@ class LocalTaskManager : public ILocalTaskManager { void ClearWorkerBacklog(const WorkerID &worker_id) override; const absl::flat_hash_map>> - &GetTaskToDispatch() const override { - return tasks_to_dispatch_; + &GetLeasesToGrant() const override { + return leases_to_grant_; } const absl::flat_hash_map> @@ -178,194 +180,193 @@ class LocalTaskManager : public ILocalTaskManager { void DebugStr(std::stringstream &buffer) const override; - size_t GetNumTaskSpilled() const override { return num_task_spilled_; } - size_t GetNumWaitingTaskSpilled() const override { return num_waiting_task_spilled_; } - size_t GetNumUnschedulableTaskSpilled() const override { - return num_unschedulable_task_spilled_; + size_t GetNumLeaseSpilled() const override { return num_lease_spilled_; } + size_t GetNumWaitingLeaseSpilled() const override { return num_waiting_lease_spilled_; } + size_t GetNumUnschedulableLeaseSpilled() const override { + return num_unschedulable_lease_spilled_; } private: struct SchedulingClassInfo; - void RemoveFromRunningTasksIfExists(const RayTask &task); + void RemoveFromGrantedLeasesIfExists(const RayLease &lease); /// Handle the popped worker from worker pool. 
bool PoppedWorkerHandler(const std::shared_ptr worker, PopWorkerStatus status, - const TaskID &task_id, + const LeaseID &lease_id, SchedulingClass scheduling_class, const std::shared_ptr &work, bool is_detached_actor, const rpc::Address &owner_address, const std::string &runtime_env_setup_error_message); - /// Cancels a task in tasks_to_dispatch_. Does not remove it from tasks_to_dispatch_. - void CancelTaskToDispatch( + /// Cancels a lease in leases_to_grant_. Does not remove it from leases_to_grant_. + void CancelLeaseToGrant( const std::shared_ptr &work, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type = rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_INTENDED, const std::string &scheduling_failure_message = ""); - /// Attempts to dispatch all tasks which are ready to run. A task - /// will be dispatched if it is on `tasks_to_dispatch_` and there are still + /// Attempts to grant all leases which are ready to run. A lease + /// will be granted if it is on `leases_to_grant_` and there are still /// available resources on the node. /// - /// If there are not enough resources locally, up to one task per resource - /// shape (the task at the head of the queue) will get spilled back to a + /// If there are not enough resources locally, up to one lease per resource + /// shape (the lease at the head of the queue) will get spilled back to a /// different node. - void DispatchScheduledTasksToWorkers(); + void GrantScheduledLeasesToWorkers(); /// Helper method when the current node does not have the available resources to run a - /// task. + /// lease. /// - /// \returns true if the task was spilled. The task may not be spilled if the + /// \returns true if the lease was spilled. The lease may not be spilled if the /// spillback policy specifies the local node (which may happen if no other nodes have /// the requested resources available). 
bool TrySpillback(const std::shared_ptr &work, bool &is_infeasible); - // Try to spill waiting tasks to a remote node, starting from the end of the + // Try to spill waiting leases to a remote node, starting from the end of the // queue. - void SpillWaitingTasks(); + void SpillWaitingLeases(); - /// Calculate the maximum number of running tasks for a given scheduling + /// Calculate the maximum number of granted leases for a given scheduling /// class. https://github.com/ray-project/ray/issues/16973 /// /// \param sched_cls_id The scheduling class in question. /// \returns The maximum number instances of that scheduling class that - /// should be running (or blocked) at once. - uint64_t MaxRunningTasksPerSchedulingClass(SchedulingClass sched_cls_id) const; + /// should be granted (or blocked) at once. + uint64_t MaxGrantedLeasesPerSchedulingClass(SchedulingClass sched_cls_id) const; /// Recompute the debug stats. - /// It is needed because updating the debug state is expensive for cluster_task_manager. + /// It is needed because updating the debug state is expensive for + /// cluster_lease_manager. /// TODO(sang): Update the internal states value dynamically instead of iterating the /// data structure. void RecomputeDebugStats() const; - void Dispatch( + void Grant( std::shared_ptr worker, - absl::flat_hash_map> &leased_workers_, + absl::flat_hash_map> &leased_workers_, const std::shared_ptr &allocated_instances, - const RayTask &task, + const RayLease &lease, rpc::RequestWorkerLeaseReply *reply, std::function send_reply_callback); void Spillback(const NodeID &spillback_to, const std::shared_ptr &work); - // Helper function to pin a task's args immediately before dispatch. This + // Helper function to pin a lease's args immediately before being granted. This // returns false if there are missing args (due to eviction) or if there is - // not enough memory available to dispatch the task, due to other executing - // tasks' arguments. 
- bool PinTaskArgsIfMemoryAvailable(const TaskSpecification &spec, bool *args_missing); + // not enough memory available to grant the lease, due to other granted + // leases' arguments. + bool PinLeaseArgsIfMemoryAvailable(const LeaseSpecification &lease_spec, + bool *args_missing); - // Helper functions to pin and release an executing task's args. - void PinTaskArgs(const TaskSpecification &spec, - std::vector> args); - void ReleaseTaskArgs(const TaskID &task_id); + // Helper functions to pin and release a granted lease's args. + void PinLeaseArgs(const LeaseSpecification &lease_spec, + std::vector> args); + void ReleaseLeaseArgs(const LeaseID &lease_id); private: - /// Determine whether a task should be immediately dispatched, + /// Determine whether a lease should be immediately granted, /// or placed on a wait queue. - void WaitForTaskArgsRequests(std::shared_ptr work); + void WaitForLeaseArgsRequests(std::shared_ptr work); const NodeID &self_node_id_; const scheduling::NodeID self_scheduling_node_id_; /// Responsible for resource tracking/view of the cluster. ClusterResourceScheduler &cluster_resource_scheduler_; - /// Class to make task dependencies to be local. - TaskDependencyManagerInterface &task_dependency_manager_; + /// Class to make lease dependencies to be local. + LeaseDependencyManagerInterface &lease_dependency_manager_; /// Function to get the node information of a given node id. internal::NodeInfoGetter get_node_info_; const int max_resource_shapes_per_load_report_; - /// Tracking information about the currently running tasks in a scheduling - /// class. This information is used to place a cap on the number of running - /// running tasks per scheduling class. + /// Tracking information about the currently granted leases in a scheduling + /// class. This information is used to place a cap on the number of + /// granted leases per scheduling class. 
struct SchedulingClassInfo { explicit SchedulingClassInfo(int64_t cap) : capacity(cap), next_update_time(std::numeric_limits::max()) {} - /// Track the running task ids in this scheduling class. + /// Track the granted lease ids in this scheduling class. /// - /// TODO(hjiang): Store cgroup manager along with task id as the value for map. - absl::flat_hash_set running_tasks; - /// The total number of tasks that can run from this scheduling class. + /// TODO(hjiang): Store cgroup manager along with lease id as the value for map. + absl::flat_hash_set granted_leases; + /// The total number of leases that can run from this scheduling class. uint64_t capacity; - /// The next time that a new task of this scheduling class may be dispatched. + /// The next time that a new lease of this scheduling class may be dispatched. int64_t next_update_time; }; - /// Mapping from scheduling class to information about the running tasks of + /// Mapping from scheduling class to information about the granted leases of /// the scheduling class. See `struct SchedulingClassInfo` above for more /// details about what information is tracked. absl::flat_hash_map info_by_sched_cls_; /// Queue of lease requests that should be scheduled onto workers. - /// Tasks move from scheduled | waiting -> dispatch. - /// Tasks can also move from dispatch -> waiting if one of their arguments is + /// Leases move from scheduled | waiting -> granting. + /// Leases can also move from granting -> waiting if one of their arguments is /// evicted. - /// All tasks in this map that have dependencies should be registered with - /// the dependency manager, in case a dependency gets evicted while the task + /// All leases in this map that have dependencies should be registered with + /// the dependency manager, in case a dependency gets evicted while the lease /// is still queued. /// Note that if a queue exists, it should be guaranteed to be non-empty. 
absl::flat_hash_map>> - tasks_to_dispatch_; + leases_to_grant_; - /// Tasks waiting for arguments to be transferred locally. - /// Tasks move from waiting -> dispatch. - /// Tasks can also move from dispatch -> waiting if one of their arguments is + /// Leases waiting for arguments to be transferred locally. + /// Leases move from waiting -> granting. + /// Leases can also move from granting -> waiting if one of their arguments is /// evicted. - /// All tasks in this map that have dependencies should be registered with - /// the dependency manager, so that they can be moved to dispatch once their + /// All leases in this map that have dependencies should be registered with + /// the dependency manager, so that they can be moved to granting once their /// dependencies are local. - /// - /// We keep these in a queue so that tasks can be spilled back from the end - /// of the queue. This is to try to prioritize spilling tasks whose + + /// We keep these in a queue so that leases can be spilled back from the end + /// of the queue. This is to try to prioritize spilling leases whose /// dependencies may not be fetched locally yet. - /// - /// Note that because tasks can also move from dispatch -> waiting, the order + + /// Note that because leases can also move from granting -> waiting, the order /// in this queue may not match the order in which we initially received the - /// tasks. This also means that the PullManager may request dependencies for - /// these tasks in a different order than the waiting task queue. + /// leases. This also means that the PullManager may request dependencies for + /// these leases in a different order than the waiting lease queue. /// Note that if a queue exists, it should be guaranteed to be non-empty. - std::list> waiting_task_queue_; + std::list> waiting_lease_queue_; /// An index for the above queue.
- absl::flat_hash_map>::iterator> - waiting_tasks_index_; + absl::flat_hash_map>::iterator> + waiting_leases_index_; /// Track the backlog of all workers belonging to this raylet. absl::flat_hash_map> backlog_tracker_; - /// TODO(Shanly): Remove `worker_pool_` and `leased_workers_` and make them as - /// parameters of methods if necessary once we remove the legacy scheduler. WorkerPoolInterface &worker_pool_; - absl::flat_hash_map> &leased_workers_; + absl::flat_hash_map> &leased_workers_; - /// Callback to get references to task arguments. These will be pinned while - /// the task is running. + /// Callback to get references to lease arguments. These will be pinned while + /// the lease is granted. std::function &object_ids, std::vector> *results)> - get_task_arguments_; + get_lease_arguments_; - /// Arguments needed by currently granted lease requests. These should be - /// pinned before the lease is granted to ensure that the arguments are not - /// evicted before the task(s) start running. - absl::flat_hash_map> executing_task_args_; + /// Arguments needed by currently granted leases. These should be pinned before + /// the lease is granted to ensure that the arguments are not evicted. + absl::flat_hash_map> granted_lease_args_; - /// All arguments of running tasks, which are also pinned in the object - /// store. The value is a pair: (the pointer to the object store that should - /// be deleted once the object is no longer needed, number of tasks that - /// depend on the object). + /// All arguments of granted leases, which are also pinned in the object store. + /// The value is a pair: (the pointer to the object store that should be deleted + /// once the object is no longer needed, number of leases that depend on the + /// object). absl::flat_hash_map, size_t>> - pinned_task_arguments_; + pinned_lease_arguments_; - /// The total number of arguments pinned for running tasks. + /// The total number of arguments pinned for granted leases. 
/// Used for debug purposes. - size_t pinned_task_arguments_bytes_ = 0; + size_t pinned_lease_arguments_bytes_ = 0; - /// The maximum amount of bytes that can be used by executing task arguments. - size_t max_pinned_task_arguments_bytes_; + /// The maximum amount of bytes that can be used by granted lease arguments. + size_t max_pinned_lease_arguments_bytes_; /// Returns the current time in milliseconds. std::function get_time_ms_; @@ -378,16 +379,17 @@ class LocalTaskManager : public ILocalTaskManager { const int64_t sched_cls_cap_max_ms_; - size_t num_task_spilled_ = 0; - size_t num_waiting_task_spilled_ = 0; - size_t num_unschedulable_task_spilled_ = 0; + size_t num_lease_spilled_ = 0; + size_t num_waiting_lease_spilled_ = 0; + size_t num_unschedulable_lease_spilled_ = 0; friend class SchedulerResourceReporter; - friend class ClusterTaskManagerTest; + friend class ClusterLeaseManagerTest; friend class SchedulerStats; - friend class LocalTaskManagerTest; - FRIEND_TEST(ClusterTaskManagerTest, FeasibleToNonFeasible); - FRIEND_TEST(LocalTaskManagerTest, TestTaskDispatchingOrder); + friend class LocalLeaseManagerTest; + FRIEND_TEST(ClusterLeaseManagerTest, FeasibleToNonFeasible); + FRIEND_TEST(LocalLeaseManagerTest, TestLeaseGrantingOrder); + friend size_t GetPendingLeaseWorkerCount(const LocalLeaseManager &local_lease_manager); }; } // namespace raylet } // namespace ray diff --git a/src/ray/raylet/local_object_manager.cc b/src/ray/raylet/local_object_manager.cc index 51aff1d5c824..a7100a828c96 100644 --- a/src/ray/raylet/local_object_manager.cc +++ b/src/ray/raylet/local_object_manager.cc @@ -22,7 +22,6 @@ #include "ray/common/asio/instrumented_io_context.h" #include "ray/stats/metric_defs.h" -#include "ray/util/util.h" namespace ray { @@ -51,7 +50,7 @@ void LocalObjectManager::PinObjectsAndWaitForFree( pinned_objects_.emplace(object_id, std::move(object)); } else { auto original_worker_id = - WorkerID::FromBinary(inserted.first->second.owner_address.worker_id()); 
+ WorkerID::FromBinary(inserted.first->second.owner_address_.worker_id()); auto new_worker_id = WorkerID::FromBinary(owner_address.worker_id()); if (original_worker_id != new_worker_id) { // TODO(swang): Handle this case. We should use the new owner address @@ -72,7 +71,7 @@ void LocalObjectManager::PinObjectsAndWaitForFree( wait_request->set_generator_id(generator_id.Binary()); } rpc::Address subscriber_address; - subscriber_address.set_raylet_id(self_node_id_.Binary()); + subscriber_address.set_node_id(self_node_id_.Binary()); subscriber_address.set_ip_address(self_node_address_); subscriber_address.set_port(self_node_port_); wait_request->mutable_subscriber_address()->CopyFrom(subscriber_address); @@ -82,43 +81,43 @@ void LocalObjectManager::PinObjectsAndWaitForFree( auto subscription_callback = [this, owner_address](const rpc::PubMessage &msg) { RAY_CHECK(msg.has_worker_object_eviction_message()); const auto &object_eviction_msg = msg.worker_object_eviction_message(); - const auto object_id = ObjectID::FromBinary(object_eviction_msg.object_id()); - ReleaseFreedObject(object_id); + const auto obj_id = ObjectID::FromBinary(object_eviction_msg.object_id()); + ReleaseFreedObject(obj_id); core_worker_subscriber_->Unsubscribe( - rpc::ChannelType::WORKER_OBJECT_EVICTION, owner_address, object_id.Binary()); + rpc::ChannelType::WORKER_OBJECT_EVICTION, owner_address, obj_id.Binary()); }; // Callback that is invoked when the owner of the object id is dead. 
auto owner_dead_callback = [this, owner_address](const std::string &object_id_binary, const Status &) { - const auto object_id = ObjectID::FromBinary(object_id_binary); - ReleaseFreedObject(object_id); + const auto obj_id = ObjectID::FromBinary(object_id_binary); + ReleaseFreedObject(obj_id); }; auto sub_message = std::make_unique(); sub_message->mutable_worker_object_eviction_message()->Swap(wait_request.get()); - RAY_CHECK(core_worker_subscriber_->Subscribe(std::move(sub_message), - rpc::ChannelType::WORKER_OBJECT_EVICTION, - owner_address, - object_id.Binary(), - /*subscribe_done_callback=*/nullptr, - subscription_callback, - owner_dead_callback)); + core_worker_subscriber_->Subscribe(std::move(sub_message), + rpc::ChannelType::WORKER_OBJECT_EVICTION, + owner_address, + object_id.Binary(), + /*subscribe_done_callback=*/nullptr, + subscription_callback, + owner_dead_callback); } } void LocalObjectManager::ReleaseFreedObject(const ObjectID &object_id) { // Only free the object if it is not already freed. auto it = local_objects_.find(object_id); - if (it == local_objects_.end() || it->second.is_freed) { + if (it == local_objects_.end() || it->second.is_freed_) { return; } // Mark the object as freed. NOTE(swang): We have to mark this instead of // deleting the entry immediately in case the object is currently being // spilled. In that case, we should process the free event once the object // spill is complete. - it->second.is_freed = true; + it->second.is_freed_ = true; RAY_LOG(DEBUG) << "Unpinning object " << object_id; // The object should be in one of these states: pinned, spilling, or spilled. @@ -326,13 +325,13 @@ void LocalObjectManager::SpillObjectsInternal( RAY_CHECK(it != objects_pending_spill_.end()); auto freed_it = local_objects_.find(object_id); // If the object hasn't already been freed, spill it. 
- if (freed_it == local_objects_.end() || freed_it->second.is_freed) { + if (freed_it == local_objects_.end() || freed_it->second.is_freed_) { num_bytes_pending_spill_ -= it->second->GetSize(); objects_pending_spill_.erase(it); } else { auto ref = request.add_object_refs_to_spill(); ref->set_object_id(object_id.Binary()); - ref->mutable_owner_address()->CopyFrom(freed_it->second.owner_address); + ref->mutable_owner_address()->CopyFrom(freed_it->second.owner_address_); RAY_LOG(DEBUG) << "Sending spill request for object " << object_id; requested_objects_to_spill.push_back(object_id); } @@ -423,19 +422,19 @@ void LocalObjectManager::OnObjectSpilled(const std::vector &object_ids // Asynchronously Update the spilled URL. auto freed_it = local_objects_.find(object_id); - if (freed_it == local_objects_.end() || freed_it->second.is_freed) { + if (freed_it == local_objects_.end() || freed_it->second.is_freed_) { RAY_LOG(DEBUG) << "Spilled object already freed, skipping send of spilled URL to " "object directory for object " << object_id; continue; } - const auto &worker_addr = freed_it->second.owner_address; + const auto &worker_addr = freed_it->second.owner_address_; object_directory_->ReportObjectSpilled( object_id, self_node_id_, worker_addr, object_url, - freed_it->second.generator_id.value_or(ObjectID::Nil()), + freed_it->second.generator_id_.value_or(ObjectID::Nil()), is_external_storage_type_fs_); } } @@ -555,7 +554,7 @@ void LocalObjectManager::ProcessSpilledObjectsDeleteQueue(uint32_t max_batch_siz // Update current spilled objects metrics RAY_CHECK(local_objects_.contains(object_id)) << "local objects should contain the spilled object: " << object_id; - spilled_bytes_current_ -= local_objects_.at(object_id).object_size; + spilled_bytes_current_ -= local_objects_.at(object_id).object_size_; } else { // If the object was not spilled, it gets pinned again. Unpin here to // prevent a memory leak. 
diff --git a/src/ray/raylet/local_object_manager.h b/src/ray/raylet/local_object_manager.h index e9cef263b973..25597aeda64b 100644 --- a/src/ray/raylet/local_object_manager.h +++ b/src/ray/raylet/local_object_manager.h @@ -23,13 +23,14 @@ #include "ray/common/id.h" #include "ray/common/ray_object.h" -#include "ray/gcs/gcs_client/accessor.h" +#include "ray/gcs_client/accessor.h" #include "ray/object_manager/common.h" #include "ray/object_manager/object_directory.h" -#include "ray/pubsub/subscriber.h" +#include "ray/pubsub/subscriber_interface.h" #include "ray/raylet/local_object_manager_interface.h" #include "ray/raylet/worker_pool.h" #include "ray/rpc/worker/core_worker_client_pool.h" +#include "ray/util/time.h" namespace ray { @@ -184,14 +185,14 @@ class LocalObjectManager : public LocalObjectManagerInterface { LocalObjectInfo(const rpc::Address &owner_address, const ObjectID &generator_id, size_t object_size) - : owner_address(owner_address), - generator_id(generator_id.IsNil() ? std::nullopt - : std::optional(generator_id)), - object_size(object_size) {} - rpc::Address owner_address; - bool is_freed = false; - std::optional generator_id; - size_t object_size; + : owner_address_(owner_address), + generator_id_(generator_id.IsNil() ? 
std::nullopt + : std::optional(generator_id)), + object_size_(object_size) {} + rpc::Address owner_address_; + bool is_freed_ = false; + std::optional generator_id_; + size_t object_size_; }; FRIEND_TEST(LocalObjectManagerTest, TestTryToSpillObjectsZero); diff --git a/src/ray/raylet/main.cc b/src/ray/raylet/main.cc index 676b7d0bc2ef..c556a1219e81 100644 --- a/src/ray/raylet/main.cc +++ b/src/ray/raylet/main.cc @@ -24,24 +24,32 @@ #include "gflags/gflags.h" #include "nlohmann/json.hpp" #include "ray/common/asio/instrumented_io_context.h" -#include "ray/common/cgroup/cgroup_manager.h" +#include "ray/common/cgroup2/cgroup_manager.h" +#include "ray/common/cgroup2/sysfs_cgroup_driver.h" +#include "ray/common/constants.h" #include "ray/common/id.h" +#include "ray/common/lease/lease.h" #include "ray/common/ray_config.h" #include "ray/common/status.h" -#include "ray/common/task/task_common.h" -#include "ray/gcs/gcs_client/gcs_client.h" +#include "ray/common/status_or.h" +#include "ray/core_worker/metrics.h" +#include "ray/gcs_client/gcs_client.h" #include "ray/object_manager/ownership_object_directory.h" #include "ray/raylet/local_object_manager.h" #include "ray/raylet/local_object_manager_interface.h" #include "ray/raylet/raylet.h" +#include "ray/rpc/object_manager/object_manager_client.h" +#include "ray/rpc/raylet/raylet_client.h" #include "ray/stats/stats.h" #include "ray/util/cmd_line_utils.h" #include "ray/util/event.h" #include "ray/util/process.h" +#include "ray/util/raii.h" #include "ray/util/stream_redirection.h" #include "ray/util/stream_redirection_options.h" #include "ray/util/subreaper.h" -#include "scheduling/cluster_task_manager.h" +#include "ray/util/time.h" +#include "scheduling/cluster_lease_manager.h" using json = nlohmann::json; @@ -90,14 +98,8 @@ DEFINE_int32(ray_debugger_external, 0, "Make Ray debugger externally accessible. 
// store options DEFINE_int64(object_store_memory, -1, "The initial memory of the object store."); DEFINE_string(node_name, "", "The user-provided identifier or name for this node."); -DEFINE_string(session_name, "", "Session name (ClusterID) of the cluster."); +DEFINE_string(session_name, "", "The current Ray session name."); DEFINE_string(cluster_id, "", "ID of the cluster, separate from observability."); -// TODO(hjiang): At the moment only enablement flag is added, I will add other flags for -// CPU and memory resource reservation in the followup PR. -DEFINE_bool(enable_resource_isolation, - false, - "Enable resource isolation through cgroupv2 by reserving resources for ray " - "system processes."); #ifdef __linux__ DEFINE_string(plasma_directory, @@ -113,6 +115,30 @@ DEFINE_bool(huge_pages, false, "Enable huge pages."); DEFINE_string(labels, "", "Define the key-value format of node labels, which is a serialized JSON."); +DEFINE_bool( + enable_resource_isolation, + false, + "Enables resource isolation through cgroupv2. The raylet will create and " + "manage a cgroup hierarchy that separates system processes and worker processes " + "into separate cgroups."); +DEFINE_string( + cgroup_path, + "", + "Path of the cgroup that the raylet will take ownership of to create its cgroup " + "hierarchy. The raylet process must have read, write, and execute permission for " + "this path. If enable_resource_isolation is true, then this cannot be empty."); +DEFINE_int64( + system_reserved_cpu_weight, + -1, + "The amount of cores reserved for ray system processes. It will be applied " + "as a cpu.weight constraint to the system cgroup. 10000 - " + "system_reserved_cpu_weight will be applied as a constraint to the " + "application cgroup. If enable resource isolation is true, then this cannot be -1."); +DEFINE_int64(system_reserved_memory_bytes, + -1, + "The amount of memory in bytes reserved for ray system processes.
It will " + "be applied as a memory.min constraint to the system cgroup. If enable " + "resource isolation is true, then this cannot be -1"); absl::flat_hash_map parse_node_labels( const std::string &labels_json_str) { @@ -220,31 +246,75 @@ int main(int argc, char *argv[]) { const std::string session_name = FLAGS_session_name; const bool is_head_node = FLAGS_head; const std::string labels_json_str = FLAGS_labels; + const bool enable_resource_isolation = FLAGS_enable_resource_isolation; + const std::string cgroup_path = FLAGS_cgroup_path; + const int64_t system_reserved_cpu_weight = FLAGS_system_reserved_cpu_weight; + const int64_t system_reserved_memory_bytes = FLAGS_system_reserved_memory_bytes; RAY_CHECK_NE(FLAGS_cluster_id, "") << "Expected cluster ID."; ray::ClusterID cluster_id = ray::ClusterID::FromHex(FLAGS_cluster_id); RAY_LOG(INFO) << "Setting cluster ID to: " << cluster_id; gflags::ShutDownCommandLineFlags(); - // Get cgroup setup instance and perform necessary resource setup. - ray::GetCgroupSetup(FLAGS_enable_resource_isolation); + // TODO(#54703): Link OSS documentation once it's available in the error messages. + if (enable_resource_isolation) { + RAY_CHECK(!cgroup_path.empty()) + << "Failed to start up raylet. If enable_resource_isolation is set to true, " + "cgroup_path cannot be empty."; + RAY_CHECK_NE(system_reserved_cpu_weight, -1) + << "Failed to start up raylet. If enable_resource_isolation is set to true, " + "system_reserved_cpu_weight must be set to a value between [1,10000]"; + RAY_CHECK_NE(system_reserved_memory_bytes, -1) + << "Failed to start up raylet.
If enable_resource_isolation is set to true, " + "system_reserved_memory_bytes must be set to a value > 0"; + + std::unique_ptr cgroup_driver; + ray::StatusOr> cgroup_manager = + ray::CgroupManager::Create(std::move(cgroup_path), + node_id, + system_reserved_cpu_weight, + system_reserved_memory_bytes, + std::move(cgroup_driver)); + + // TODO(#54703) - Link to OSS documentation once available. + RAY_CHECK(cgroup_manager.ok()) + << "Failed to start raylet. Could not create CgroupManager because of " + << cgroup_manager.ToString(); + +#ifndef __linux__ + RAY_LOG(WARNING) + << "Resource isolation with cgroups is only supported in linux. Please set " + "enable_resource_isolation to false. This is likely a misconfiguration."; +#endif + } // Configuration for the node manager. ray::raylet::NodeManagerConfig node_manager_config; - node_manager_config.enable_resource_isolation = FLAGS_enable_resource_isolation; absl::flat_hash_map static_resource_conf; SetThreadName("raylet"); // IO Service for node manager. - instrumented_io_context main_service{/*enable_lag_probe=*/false, - /*running_on_single_thread=*/true}; + instrumented_io_context main_service{ + /*emit_metrics=*/RayConfig::instance().emit_main_service_metrics(), + /*running_on_single_thread=*/true, + "raylet_main_io_context"}; // Ensure that the IO service keeps running. Without this, the service will exit as soon // as there is no more work to be processed. boost::asio::executor_work_guard main_service_work(main_service.get_executor()); + instrumented_io_context object_manager_rpc_service{/*emit_metrics=*/false, + /*running_on_single_thread=*/false, + "object_manager_rpc_io_context"}; + boost::asio::executor_work_guard + object_manager_rpc_work(object_manager_rpc_service.get_executor()); + + /// The thread pool used for running `object_manager_rpc_service`. + /// Data copy operations during request are done in this thread pool.
+ std::vector object_manager_rpc_threads; + // Initialize gcs client std::unique_ptr gcs_client; ray::gcs::GcsClientOptions client_options(FLAGS_gcs_address, @@ -256,6 +326,7 @@ int main(int argc, char *argv[]) { RAY_CHECK_OK(gcs_client->Connect(main_service)); std::unique_ptr raylet; + ray::stats::Gauge task_by_state_counter = ray::core::GetTaskMetric(); std::unique_ptr plasma_client; std::unique_ptr node_manager; std::unique_ptr client_call_manager; @@ -267,11 +338,11 @@ int main(int argc, char *argv[]) { std::unique_ptr local_object_manager; /// These classes make up the new scheduler. ClusterResourceScheduler is /// responsible for maintaining a view of the cluster state w.r.t resource - /// usage. ClusterTaskManager is responsible for queuing, spilling back, and - /// dispatching tasks. + /// usage. ClusterLeaseManager is responsible for queuing, spilling back, and + /// granting leases. std::unique_ptr cluster_resource_scheduler; - std::unique_ptr local_task_manager; - std::unique_ptr cluster_task_manager; + std::unique_ptr local_lease_manager; + std::unique_ptr cluster_lease_manager; /// The raylet client to initiate the pubsub to core workers (owners). /// It is used to subscribe objects to evict. std::unique_ptr core_worker_subscriber; @@ -280,11 +351,13 @@ int main(int argc, char *argv[]) { std::unique_ptr object_directory; /// Manages client requests for object transfers and availability. std::unique_ptr object_manager; - /// A manager to resolve objects needed by queued tasks and workers that + /// A manager to resolve objects needed by queued leases and workers that /// called `ray.get` or `ray.wait`. - std::unique_ptr dependency_manager; + std::unique_ptr lease_dependency_manager; + /// The client to export metrics to the metrics agent. + std::unique_ptr metrics_agent_client; /// Map of workers leased out to clients. - absl::flat_hash_map> + absl::flat_hash_map> leased_workers; // Enable subreaper. 
This is called in `AsyncGetInternalConfig` below, but MSVC does @@ -315,15 +388,23 @@ int main(int argc, char *argv[]) { auto shutted_down = std::make_shared>(false); - auto shutdown_raylet_after_unregistration = - [&main_service, &raylet_socket_name, &raylet, &gcs_client]() { - // We should stop the service and remove the local socket file. - raylet->Stop(); - gcs_client->Disconnect(); - ray::stats::Shutdown(); - main_service.stop(); - remove(raylet_socket_name.c_str()); - }; + auto shutdown_raylet_after_unregistration = [&main_service, + &raylet_socket_name, + &raylet, + &gcs_client, + &object_manager_rpc_threads]() { + // We should stop the service and remove the local socket file. + raylet->Stop(); + gcs_client->Disconnect(); + ray::stats::Shutdown(); + main_service.stop(); + for (size_t i = 0; i < object_manager_rpc_threads.size(); i++) { + if (object_manager_rpc_threads[i].joinable()) { + object_manager_rpc_threads[i].join(); + } + } + remove(raylet_socket_name.c_str()); + }; // Shut down raylet gracefully, in a synchronous fashion. // This is an internal method and should only be run on the main_service. @@ -492,7 +573,12 @@ int main(int argc, char *argv[]) { << ", object_chunk_size = " << object_manager_config.object_chunk_size; RAY_LOG(INFO).WithField(raylet_node_id) << "Setting node ID"; - node_manager_config.AddDefaultLabels(raylet_node_id.Hex()); + std::vector default_keys = {kLabelKeyNodeID}; + for (const auto &key : default_keys) { + RAY_CHECK(!node_manager_config.labels.contains(key)) + << "The label key name " << key << " should never be set by the user."; + } + node_manager_config.labels[kLabelKeyNodeID] = raylet_node_id.Hex(); worker_pool = std::make_unique( main_service, @@ -505,12 +591,12 @@ int main(int argc, char *argv[]) { return node_manager_config.num_workers_soft_limit; } // If no limit is provided, use the available number of CPUs, - // assuming that each incoming task will likely require 1 CPU. 
+ // assuming that each incoming lease will likely require 1 CPU. // We floor the available CPUs to the nearest integer to avoid // starting too many workers when there is less than 1 CPU left. // Otherwise, we could end up repeatedly starting the worker, then // killing it because it idles for too long. The downside is that - // we will be slower to schedule tasks that could use a fraction + // we will be slower to schedule leases that could use a fraction // of a CPU. return static_cast( cluster_resource_scheduler->GetLocalResourceManager() @@ -525,10 +611,9 @@ int main(int argc, char *argv[]) { node_manager_config.worker_commands, node_manager_config.native_library_path, /*starting_worker_timeout_callback=*/ - [&] { cluster_task_manager->ScheduleAndDispatchTasks(); }, + [&] { cluster_lease_manager->ScheduleAndGrantLeases(); }, node_manager_config.ray_debugger_external, - /*get_time=*/[]() { return absl::Now(); }, - node_manager_config.enable_resource_isolation); + /*get_time=*/[]() { return absl::Now(); }); client_call_manager = std::make_unique( main_service, /*record_stats=*/true); @@ -547,7 +632,7 @@ int main(int argc, char *argv[]) { raylet_client_pool = std::make_unique([&](const ray::rpc::Address &addr) { - return std::make_shared( + return std::make_shared( addr, *client_call_manager, ray::rpc::RayletClientPool::GetDefaultUnavailableTimeoutCallback( @@ -573,30 +658,14 @@ int main(int argc, char *argv[]) { *gcs_client, core_worker_subscriber.get(), worker_rpc_pool.get(), - [&](const ObjectID &obj_id, const ray::rpc::ErrorType &error_type) { + [&](const ray::ObjectID &obj_id, const ray::rpc::ErrorType &error_type) { ray::rpc::ObjectReference ref; ref.set_object_id(obj_id.Binary()); - node_manager->MarkObjectsAsFailed(error_type, {ref}, JobID::Nil()); + node_manager->MarkObjectsAsFailed(error_type, {ref}, ray::JobID::Nil()); }); - object_manager = std::make_unique( - main_service, - raylet_node_id, + auto object_store_runner = std::make_unique( 
object_manager_config, - *gcs_client, - object_directory.get(), - /*restore_spilled_object=*/ - [&](const ObjectID &object_id, - int64_t object_size, - const std::string &object_url, - std::function callback) { - local_object_manager->AsyncRestoreSpilledObject( - object_id, object_size, object_url, std::move(callback)); - }, - /*get_spilled_object_url=*/ - [&](const ObjectID &object_id) { - return local_object_manager->GetLocalSpilledObjectURL(object_id); - }, /*spill_objects_callback=*/ [&]() { // This callback is called from the plasma store thread. @@ -616,13 +685,52 @@ int main(int argc, char *argv[]) { }, /*add_object_callback=*/ [&](const ray::ObjectInfo &object_info) { - node_manager->HandleObjectLocal(object_info); + main_service.post( + [&object_manager, &node_manager, object_info]() { + object_manager->HandleObjectAdded(object_info); + node_manager->HandleObjectLocal(object_info); + }, + "ObjectManager.ObjectAdded"); }, /*delete_object_callback=*/ - [&](const ObjectID &object_id) { node_manager->HandleObjectMissing(object_id); }, + [&](const ray::ObjectID &object_id) { + main_service.post( + [&object_manager, &node_manager, object_id]() { + object_manager->HandleObjectDeleted(object_id); + node_manager->HandleObjectMissing(object_id); + }, + "ObjectManager.ObjectDeleted"); + }); + + object_manager_rpc_threads.resize(object_manager_config.rpc_service_threads_number); + for (int i = 0; i < object_manager_config.rpc_service_threads_number; i++) { + object_manager_rpc_threads[i] = std::thread([&object_manager_rpc_service, i] { + SetThreadName(absl::StrFormat("rpc.obj.mgr.%d", i)); + object_manager_rpc_service.run(); + }); + } + + object_manager = std::make_unique( + main_service, + raylet_node_id, + object_manager_config, + *gcs_client, + object_directory.get(), + /*restore_spilled_object=*/ + [&](const ray::ObjectID &object_id, + int64_t object_size, + const std::string &object_url, + std::function callback) { + 
local_object_manager->AsyncRestoreSpilledObject( + object_id, object_size, object_url, std::move(callback)); + }, + /*get_spilled_object_url=*/ + [&](const ray::ObjectID &object_id) { + return local_object_manager->GetLocalSpilledObjectURL(object_id); + }, /*pin_object=*/ - [&](const ObjectID &object_id) { - std::vector object_ids = {object_id}; + [&](const ray::ObjectID &object_id) { + std::vector object_ids = {object_id}; std::vector> results; std::unique_ptr result; if (node_manager->GetObjectsFromPlasma(object_ids, &results) && @@ -632,11 +740,20 @@ int main(int argc, char *argv[]) { return result; }, /*fail_pull_request=*/ - [&](const ObjectID &object_id, ray::rpc::ErrorType error_type) { + [&](const ray::ObjectID &object_id, ray::rpc::ErrorType error_type) { ray::rpc::ObjectReference ref; ref.set_object_id(object_id.Binary()); - node_manager->MarkObjectsAsFailed(error_type, {ref}, JobID::Nil()); - }); + node_manager->MarkObjectsAsFailed(error_type, {ref}, ray::JobID::Nil()); + }, + std::make_shared(), + std::move(object_store_runner), + [&](const std::string &address, + const int port, + ray::rpc::ClientCallManager &call_manager) { + return std::make_shared( + address, port, call_manager); + }, + object_manager_rpc_service); local_object_manager = std::make_unique( raylet_node_id, @@ -652,27 +769,27 @@ int main(int argc, char *argv[]) { RayConfig::instance().is_external_storage_type_fs(), /*max_fused_object_count*/ RayConfig::instance().max_fused_object_count(), /*on_objects_freed*/ - [&](const std::vector &object_ids) { + [&](const std::vector &object_ids) { object_manager->FreeObjects(object_ids, /*local_only=*/false); }, /*is_plasma_object_spillable*/ - [&](const ObjectID &object_id) { + [&](const ray::ObjectID &object_id) { return object_manager->IsPlasmaObjectSpillable(object_id); }, /*core_worker_subscriber_=*/core_worker_subscriber.get(), object_directory.get()); - dependency_manager = - std::make_unique(*object_manager); + lease_dependency_manager = 
std::make_unique( + *object_manager, task_by_state_counter); cluster_resource_scheduler = std::make_unique( main_service, ray::scheduling::NodeID(raylet_node_id.Binary()), node_manager_config.resource_config.GetResourceMap(), /*is_node_available_fn*/ - [&](ray::scheduling::NodeID node_id) { - return gcs_client->Nodes().Get(NodeID::FromBinary(node_id.Binary())) != nullptr; + [&](ray::scheduling::NodeID id) { + return gcs_client->Nodes().Get(ray::NodeID::FromBinary(id.Binary())) != nullptr; }, /*get_used_object_store_memory*/ [&]() { @@ -695,34 +812,34 @@ int main(int argc, char *argv[]) { /*labels*/ node_manager_config.labels); - auto get_node_info_func = [&](const NodeID &node_id) { - return gcs_client->Nodes().Get(node_id); + auto get_node_info_func = [&](const ray::NodeID &id) { + return gcs_client->Nodes().Get(id); }; - auto announce_infeasible_task = [](const ray::RayTask &task) { - /// Publish the infeasible task error to GCS so that drivers can subscribe to it + auto announce_infeasible_lease = [](const ray::RayLease &lease) { + /// Publish the infeasible lease error to GCS so that drivers can subscribe to it /// and print. bool suppress_warning = false; - if (!task.GetTaskSpecification().PlacementGroupBundleId().first.IsNil()) { - // If the task is part of a placement group, do nothing. If necessary, the + if (!lease.GetLeaseSpecification().PlacementGroupBundleId().first.IsNil()) { + // If the lease is part of a placement group, do nothing. If necessary, the // infeasible warning should come from the placement group scheduling, not the - // task scheduling. + // lease scheduling. suppress_warning = true; } - // Push a warning to the task's driver that this task is currently infeasible. + // Push a warning to the lease's driver that this lease is currently infeasible. 
if (!suppress_warning) { std::ostringstream error_message; error_message - << "The actor or task with ID " << task.GetTaskSpecification().TaskId() + << "The lease with ID " << lease.GetLeaseSpecification().LeaseId() << " cannot be scheduled right now. It requires " - << task.GetTaskSpecification().GetRequiredPlacementResources().DebugString() + << lease.GetLeaseSpecification().GetRequiredPlacementResources().DebugString() << " for placement, however the cluster currently cannot provide the " "requested " "resources. The required resources may be added as autoscaling takes " "place " "or placement groups are scheduled. Otherwise, consider reducing the " - "resource requirements of the task."; + "resource requirements of the lease."; std::string error_message_str = error_message.str(); RAY_LOG(WARNING) << error_message_str; } @@ -745,32 +862,32 @@ int main(int argc, char *argv[]) { max_task_args_memory = 0; } - local_task_manager = std::make_unique( + local_lease_manager = std::make_unique( raylet_node_id, *cluster_resource_scheduler, - *dependency_manager, + *lease_dependency_manager, get_node_info_func, *worker_pool, leased_workers, - [&](const std::vector &object_ids, + [&](const std::vector &object_ids, std::vector> *results) { return node_manager->GetObjectsFromPlasma(object_ids, results); }, max_task_args_memory); - cluster_task_manager = - std::make_unique(raylet_node_id, - *cluster_resource_scheduler, - get_node_info_func, - announce_infeasible_task, - *local_task_manager); + cluster_lease_manager = + std::make_unique(raylet_node_id, + *cluster_resource_scheduler, + get_node_info_func, + announce_infeasible_lease, + *local_lease_manager); - auto raylet_client_factory = [&](const NodeID &node_id) { - const ray::rpc::GcsNodeInfo *node_info = gcs_client->Nodes().Get(node_id); - RAY_CHECK(node_info) << "No GCS info for node " << node_id; + auto raylet_client_factory = [&](const ray::NodeID &id) { + const ray::rpc::GcsNodeInfo *node_info = 
gcs_client->Nodes().Get(id); + RAY_CHECK(node_info) << "No GCS info for node " << id; auto addr = ray::rpc::RayletClientPool::GenerateRayletAddress( - node_id, node_info->node_manager_address(), node_info->node_manager_port()); - return raylet_client_pool->GetOrConnectByAddress(std::move(addr)); + id, node_info->node_manager_address(), node_info->node_manager_port()); + return raylet_client_pool->GetOrConnectByAddress(addr); }; plasma_client = std::make_unique(); @@ -785,12 +902,12 @@ int main(int argc, char *argv[]) { *raylet_client_pool, *core_worker_subscriber, *cluster_resource_scheduler, - *local_task_manager, - *cluster_task_manager, + *local_lease_manager, + *cluster_lease_manager, *object_directory, *object_manager, *local_object_manager, - *dependency_manager, + *lease_dependency_manager, *worker_pool, leased_workers, *plasma_client, @@ -822,7 +939,13 @@ int main(int argc, char *argv[]) { {ray::stats::VersionKey, kRayVersion}, {ray::stats::NodeAddressKey, node_ip_address}, {ray::stats::SessionNameKey, session_name}}; - ray::stats::Init(global_tags, metrics_agent_port, WorkerID::Nil()); + ray::stats::Init(global_tags, metrics_agent_port, ray::WorkerID::Nil()); + metrics_agent_client = std::make_unique( + "127.0.0.1", metrics_agent_port, main_service, *client_call_manager); + metrics_agent_client->WaitForServerReady( + [metrics_agent_port](const ray::Status &server_status) { + ray::stats::InitOpenTelemetryExporter(metrics_agent_port, server_status); + }); // Initialize event framework. This should be done after the node manager is // initialized. 
@@ -850,7 +973,7 @@ int main(int argc, char *argv[]) { drain_request->reason() == ray::rpc::autoscaler::DrainNodeReason::DRAIN_NODE_REASON_PREEMPTION && drain_request->deadline_timestamp_ms() != 0 && - drain_request->deadline_timestamp_ms() < current_sys_time_ms()) { + drain_request->deadline_timestamp_ms() < ray::current_sys_time_ms()) { node_death_info.set_reason(ray::rpc::NodeDeathInfo::AUTOSCALER_DRAIN_PREEMPTED); node_death_info.set_reason_message(drain_request->reason_message()); } else { diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 0d726f782f79..b48290b51adf 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -31,83 +31,66 @@ #include "ray/common/asio/asio_util.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/buffer.h" -#include "ray/common/common_protocol.h" #include "ray/common/constants.h" +#include "ray/common/flatbuf_utils.h" +#include "ray/common/grpc_util.h" +#include "ray/common/lease/lease.h" #include "ray/common/memory_monitor.h" +#include "ray/common/protobuf_utils.h" #include "ray/common/scheduling/scheduling_ids.h" #include "ray/common/status.h" -#include "ray/common/task/task_common.h" -#include "ray/common/task/task_spec.h" #include "ray/flatbuffers/node_manager_generated.h" -#include "ray/gcs/pb_util.h" #include "ray/ipc/client_connection.h" -#include "ray/object_manager/ownership_object_directory.h" #include "ray/raylet/local_object_manager_interface.h" -#include "ray/raylet/scheduling/cluster_task_manager.h" #include "ray/raylet/worker_killing_policy.h" #include "ray/raylet/worker_pool.h" -#include "ray/rpc/node_manager/node_manager_client.h" #include "ray/stats/metric_defs.h" #include "ray/util/cmd_line_utils.h" #include "ray/util/event.h" #include "ray/util/network_util.h" -#include "ray/util/util.h" +#include "ray/util/string_utils.h" +#include "ray/util/time.h" -namespace { - -#define RAY_CHECK_ENUM(x, y) \ - static_assert(static_cast(x) == 
static_cast(y), "protocol mismatch") +namespace ray::raylet { -struct ActorStats { - int live_actors = 0; - int dead_actors = 0; - int restarting_actors = 0; -}; +namespace { -inline ray::rpc::ObjectReference FlatbufferToSingleObjectReference( - const flatbuffers::String &object_id, const ray::protocol::Address &address) { - ray::rpc::ObjectReference ref; +rpc::ObjectReference FlatbufferToSingleObjectReference( + const flatbuffers::String &object_id, const protocol::Address &address) { + rpc::ObjectReference ref; ref.set_object_id(object_id.str()); - ref.mutable_owner_address()->set_raylet_id(address.raylet_id()->str()); + ref.mutable_owner_address()->set_node_id(address.node_id()->str()); ref.mutable_owner_address()->set_ip_address(address.ip_address()->str()); ref.mutable_owner_address()->set_port(address.port()); ref.mutable_owner_address()->set_worker_id(address.worker_id()->str()); return ref; } -std::vector FlatbufferToObjectReference( +std::vector FlatbufferToObjectReferences( const flatbuffers::Vector> &object_ids, - const flatbuffers::Vector> - &owner_addresses) { + const flatbuffers::Vector> &owner_addresses) { RAY_CHECK(object_ids.size() == owner_addresses.size()); - std::vector refs; + std::vector refs; + refs.reserve(object_ids.size()); for (int64_t i = 0; i < object_ids.size(); i++) { - ray::rpc::ObjectReference ref; - ref.set_object_id(object_ids.Get(i)->str()); - const auto &addr = owner_addresses.Get(i); - ref.mutable_owner_address()->set_raylet_id(addr->raylet_id()->str()); - ref.mutable_owner_address()->set_ip_address(addr->ip_address()->str()); - ref.mutable_owner_address()->set_port(addr->port()); - ref.mutable_owner_address()->set_worker_id(addr->worker_id()->str()); - refs.emplace_back(std::move(ref)); + refs.push_back( + FlatbufferToSingleObjectReference(*object_ids.Get(i), *owner_addresses.Get(i))); } return refs; } -} // namespace - -namespace ray::raylet { - -void NodeManagerConfig::AddDefaultLabels(const std::string &self_node_id) { - 
std::vector default_keys = {kLabelKeyNodeID}; - - for (const auto &key : default_keys) { - RAY_CHECK(!labels.contains(key)) - << "The label key name " << key << " should never be set by the user."; +std::vector FlatbufferToObjectIds( + const flatbuffers::Vector> &vector) { + std::vector ids; + ids.reserve(vector.size()); + for (int64_t i = 0; i < vector.size(); i++) { + ids.push_back(ObjectID::FromBinary(vector.Get(i)->str())); } - labels[kLabelKeyNodeID] = self_node_id; + return ids; } +} // namespace + NodeManager::NodeManager( instrumented_io_context &io_service, const NodeID &self_node_id, @@ -119,14 +102,14 @@ NodeManager::NodeManager( rpc::RayletClientPool &raylet_client_pool, pubsub::SubscriberInterface &core_worker_subscriber, ClusterResourceScheduler &cluster_resource_scheduler, - ILocalTaskManager &local_task_manager, - ClusterTaskManagerInterface &cluster_task_manager, + LocalLeaseManagerInterface &local_lease_manager, + ClusterLeaseManagerInterface &cluster_lease_manager, IObjectDirectory &object_directory, ObjectManagerInterface &object_manager, LocalObjectManagerInterface &local_object_manager, - DependencyManager &dependency_manager, + LeaseDependencyManager &lease_dependency_manager, WorkerPoolInterface &worker_pool, - absl::flat_hash_map> &leased_workers, + absl::flat_hash_map> &leased_workers, plasma::PlasmaClientInterface &store_client, std::unique_ptr mutable_object_provider, @@ -135,7 +118,7 @@ NodeManager::NodeManager( self_node_name_(std::move(self_node_name)), io_service_(io_service), gcs_client_(gcs_client), - shutdown_raylet_gracefully_(shutdown_raylet_gracefully), + shutdown_raylet_gracefully_(std::move(shutdown_raylet_gracefully)), worker_pool_(worker_pool), client_call_manager_(client_call_manager), worker_rpc_pool_(worker_rpc_pool), @@ -148,15 +131,16 @@ NodeManager::NodeManager( periodical_runner_(PeriodicalRunner::Create(io_service)), report_resources_period_ms_(config.report_resources_period_ms), initial_config_(config), - 
dependency_manager_(dependency_manager), + lease_dependency_manager_(lease_dependency_manager), wait_manager_(/*is_object_local*/ [this](const ObjectID &object_id) { - return dependency_manager_.CheckObjectLocal(object_id); + return lease_dependency_manager_.CheckObjectLocal(object_id); }, /*delay_executor*/ [this](std::function fn, int64_t delay_ms) { - RAY_UNUSED(execute_after( - io_service_, fn, std::chrono::milliseconds(delay_ms))); + RAY_UNUSED(execute_after(io_service_, + std::move(fn), + std::chrono::milliseconds(delay_ms))); }), node_manager_server_("NodeManager", config.node_manager_port, @@ -169,8 +153,8 @@ NodeManager::NodeManager( global_gc_throttler_(RayConfig::instance().global_gc_min_interval_s() * 1e9), local_gc_interval_ns_(RayConfig::instance().local_gc_interval_s() * 1e9), cluster_resource_scheduler_(cluster_resource_scheduler), - local_task_manager_(local_task_manager), - cluster_task_manager_(cluster_task_manager), + local_lease_manager_(local_lease_manager), + cluster_lease_manager_(cluster_lease_manager), record_metrics_period_ms_(config.record_metrics_period_ms), next_resource_seq_no_(0), ray_syncer_(io_service_, self_node_id_.Binary()), @@ -188,9 +172,9 @@ NodeManager::NodeManager( std::make_unique(cluster_resource_scheduler_); periodical_runner_->RunFnPeriodically( - [this]() { cluster_task_manager_.ScheduleAndDispatchTasks(); }, + [this]() { cluster_lease_manager_.ScheduleAndGrantLeases(); }, RayConfig::instance().worker_cap_initial_backoff_delay_ms(), - "NodeManager.ScheduleAndDispatchTasks"); + "NodeManager.ScheduleAndGrantLeases"); periodical_runner_->RunFnPeriodically( [this]() { CheckForUnexpectedWorkerDisconnects(); }, @@ -225,7 +209,7 @@ NodeManager::NodeManager( worker_pool_.SetRuntimeEnvAgentClient(std::move(runtime_env_agent_client)); worker_pool_.Start(); - periodical_runner_->RunFnPeriodically([this]() { GCTaskFailureReason(); }, + periodical_runner_->RunFnPeriodically([this]() { GCWorkerFailureReason(); }, 
RayConfig::instance().task_failure_entry_ttl_ms(), "NodeManager.GCTaskFailureReason"); } @@ -251,7 +235,7 @@ void NodeManager::RegisterGcs() { /* reporter */ &cluster_resource_scheduler_.GetLocalResourceManager(), /* receiver */ this, /* pull_from_reporter_interval_ms */ - RayConfig::instance().raylet_report_resources_period_milliseconds()); + report_resources_period_ms_); // Register a commands channel. // It's only used for GC right now. @@ -281,7 +265,7 @@ void NodeManager::RegisterGcs() { // Subscribe to all unexpected failure notifications from the local and // remote raylets. Note that this does not include workers that failed due to - // node failure. These workers can be identified by comparing the raylet_id + // node failure. These workers can be identified by comparing the node_id // in their rpc::Address to the ID of a failed raylet. const auto &worker_failure_handler = [this](const rpc::WorkerDeltaData &worker_failure_data) { @@ -343,7 +327,7 @@ void NodeManager::RegisterGcs() { [this] { std::stringstream debug_msg; debug_msg << DebugString() << "\n\n"; - RAY_LOG(INFO) << AppendToEachLine(debug_msg.str(), "[state-dump] "); + RAY_LOG(INFO) << PrependToEachLine(debug_msg.str(), "[state-dump] "); ReportWorkerOOMKillStats(); }, event_stats_print_interval_ms, @@ -407,10 +391,10 @@ void NodeManager::HandleJobStarted(const JobID &job_id, const JobTableData &job_ << " is dead: " << job_data.is_dead() << " driver address: " << job_data.driver_address().ip_address(); worker_pool_.HandleJobStarted(job_id, job_data.config()); - // Tasks of this job may already arrived but failed to pop a worker because the job - // config is not local yet. So we trigger dispatching again here to try to - // reschedule these tasks. - cluster_task_manager_.ScheduleAndDispatchTasks(); + // Leases of this job may already arrived but failed to pop a worker because the job + // config is not local yet. So we trigger granting again here to try to + // reschedule these leases. 
+ cluster_lease_manager_.ScheduleAndGrantLeases(); } void NodeManager::HandleJobFinished(const JobID &job_id, const JobTableData &job_data) { @@ -513,9 +497,10 @@ void NodeManager::HandleReleaseUnusedBundles(rpc::ReleaseUnusedBundlesRequest re // Cancel lease requests that are waiting for workers // to free the acquired pg bundle resources // so that pg bundle can be returned. - local_task_manager_.CancelTasks( + local_lease_manager_.CancelLeases( [&](const std::shared_ptr &work) { - const auto bundle_id = work->task.GetTaskSpecification().PlacementGroupBundleId(); + const auto bundle_id = + work->lease_.GetLeaseSpecification().PlacementGroupBundleId(); return !bundle_id.first.IsNil() && (0 == in_use_bundles.count(bundle_id)) && (work->GetState() == internal::WorkStatus::WAITING_FOR_WORKER); }, @@ -541,7 +526,7 @@ void NodeManager::HandleReleaseUnusedBundles(rpc::ReleaseUnusedBundlesRequest re for (const auto &worker : workers_associated_with_unused_bundles) { RAY_LOG(DEBUG) .WithField(worker->GetBundleId().first) - .WithField(worker->GetAssignedTaskId()) + .WithField(worker->GetGrantedLeaseId()) .WithField(worker->GetActorId()) .WithField(worker->WorkerId()) << "Destroying worker since its bundle was unused, bundle index: " @@ -593,21 +578,23 @@ void NodeManager::HandleGetObjectsInfo(rpc::GetObjectsInfoRequest request, /*on_all_replied*/ [total, reply]() { reply->set_total(*total); }); } -void NodeManager::HandleGetTaskFailureCause(rpc::GetTaskFailureCauseRequest request, - rpc::GetTaskFailureCauseReply *reply, - rpc::SendReplyCallback send_reply_callback) { - const TaskID task_id = TaskID::FromBinary(request.task_id()); - RAY_LOG(DEBUG) << "Received a HandleGetTaskFailureCause request for task " << task_id; - - auto it = task_failure_reasons_.find(task_id); - if (it != task_failure_reasons_.end()) { - RAY_LOG(DEBUG) << "task " << task_id << " has failure reason " - << ray::gcs::RayErrorInfoToString(it->second.ray_error_info) - << ", fail immediately: " << 
!it->second.should_retry; - reply->mutable_failure_cause()->CopyFrom(it->second.ray_error_info); - reply->set_fail_task_immediately(!it->second.should_retry); +void NodeManager::HandleGetWorkerFailureCause( + rpc::GetWorkerFailureCauseRequest request, + rpc::GetWorkerFailureCauseReply *reply, + rpc::SendReplyCallback send_reply_callback) { + const LeaseID lease_id = LeaseID::FromBinary(request.lease_id()); + RAY_LOG(DEBUG) << "Received a HandleGetWorkerFailureCause request for lease " + << lease_id; + + auto it = worker_failure_reasons_.find(lease_id); + if (it != worker_failure_reasons_.end()) { + RAY_LOG(DEBUG) << "lease " << lease_id << " has failure reason " + << ray::gcs::RayErrorInfoToString(it->second.ray_error_info_) + << ", fail immediately: " << !it->second.should_retry_; + reply->mutable_failure_cause()->CopyFrom(it->second.ray_error_info_); + reply->set_fail_task_immediately(!it->second.should_retry_); } else { - RAY_LOG(INFO) << "didn't find failure cause for task " << task_id; + RAY_LOG(INFO) << "didn't find failure cause for lease " << lease_id; } send_reply_callback(Status::OK(), nullptr, nullptr); @@ -643,7 +630,7 @@ void NodeManager::QueryAllWorkerStates( const std::function &on_all_replied) { auto all_workers = worker_pool_.GetAllRegisteredWorkers(/* filter_dead_worker */ true, /*filter_io_workers*/ true); - for (auto driver : + for (auto &driver : worker_pool_.GetAllRegisteredDrivers(/* filter_dead_driver */ true)) { all_workers.push_back(driver); } @@ -707,31 +694,31 @@ void NodeManager::QueryAllWorkerStates( // This warns users that there could be the resource deadlock. It works this way; // - If there's no available workers for scheduling -// - But if there are still pending tasks waiting for resource acquisition +// - But if there are still pending leases waiting for resource acquisition // It means the cluster might not have enough resources to be in progress. 
// Note that this can print the false negative messages // e.g., there are many actors taking up resources for a long time. void NodeManager::WarnResourceDeadlock() { int pending_actor_creations = 0; - int pending_tasks = 0; + int pending_leases = 0; // Check if any progress is being made on this raylet. if (worker_pool_.IsWorkerAvailableForScheduling()) { - // Progress is being made in a task, don't warn. + // Progress is being made in a lease, don't warn. resource_deadlock_warned_ = 0; return; } - auto exemplar = cluster_task_manager_.AnyPendingTasksForResourceAcquisition( - &pending_actor_creations, &pending_tasks); - // Check if any tasks are blocked on resource acquisition. + auto exemplar = cluster_lease_manager_.AnyPendingLeasesForResourceAcquisition( + &pending_actor_creations, &pending_leases); + // Check if any leases are blocked on resource acquisition. if (exemplar == nullptr) { - // No pending tasks, no need to warn. + // No pending leases, no need to warn. resource_deadlock_warned_ = 0; return; } - // Push an warning to the driver that a task is blocked trying to acquire resources. + // Push an warning to the driver that a lease is blocked trying to acquire resources. // To avoid spurious triggers, only take action starting with the second time. // case resource_deadlock_warned_: 0 => first time, don't do anything yet // case resource_deadlock_warned_: 1 => second time, print a warning @@ -747,26 +734,26 @@ void NodeManager::WarnResourceDeadlock() { } RAY_LOG(WARNING) - << "The actor or task with ID " << exemplar->GetTaskSpecification().TaskId() + << "The lease with ID " << exemplar->GetLeaseSpecification().LeaseId() << " cannot be scheduled right now. You can ignore this message if this " << "Ray cluster is expected to auto-scale or if you specified a " - << "runtime_env for this actor or task, which may take time to install. " + << "runtime_env for this actor or lease, which may take time to install. 
" << "Otherwise, this is likely due to all cluster resources being claimed " << "by actors. To resolve the issue, consider creating fewer actors or " << "increasing the resources available to this Ray cluster.\n" - << "Required resources for this actor or task: " - << exemplar->GetTaskSpecification().GetRequiredPlacementResources().DebugString() + << "Required resources for this lease: " + << exemplar->GetLeaseSpecification().GetRequiredPlacementResources().DebugString() << "\n" << "Available resources on this node: " << cluster_resource_scheduler_.GetClusterResourceManager() .GetNodeResourceViewString(scheduling::NodeID(self_node_id_.Binary())) - << " In total there are " << pending_tasks << " pending tasks and " + << " In total there are " << pending_leases << " pending leases and " << pending_actor_creations << " pending actors on this node."; - RAY_LOG_EVERY_MS(WARNING, 10 * 1000) << cluster_task_manager_.DebugStr(); + RAY_LOG_EVERY_MS(WARNING, 10 * 1000) << cluster_lease_manager_.DebugStr(); } - // Try scheduling tasks. Without this, if there's no more tasks coming in, deadlocked - // tasks are never be scheduled. - cluster_task_manager_.ScheduleAndDispatchTasks(); + // Try scheduling leases. Without this, if there's no more leases coming in, deadlocked + // leases are never be scheduled. + cluster_lease_manager_.ScheduleAndGrantLeases(); } void NodeManager::NodeAdded(const GcsNodeInfo &node_info) { @@ -781,21 +768,6 @@ void NodeManager::NodeAdded(const GcsNodeInfo &node_info) { remote_node_manager_addresses_[node_id] = std::make_pair(node_info.node_manager_address(), node_info.node_manager_port()); - // Set node labels when node added. - absl::flat_hash_map labels(node_info.labels().begin(), - node_info.labels().end()); - cluster_resource_scheduler_.GetClusterResourceManager().SetNodeLabels( - scheduling::NodeID(node_id.Binary()), labels); - - // TODO: Always use the message from ray syncer. 
// NOLINT - ResourceRequest resources; - for (auto &resource_entry : node_info.resources_total()) { - resources.Set(scheduling::ResourceID(resource_entry.first), - FixedPoint(resource_entry.second)); - } - if (ResourceCreateUpdated(node_id, resources)) { - cluster_task_manager_.ScheduleAndDispatchTasks(); - } // Update the resource view if a new message has been sent. if (auto sync_msg = ray_syncer_.GetSyncMessage(node_id.Binary(), syncer::MessageType::RESOURCE_VIEW)) { @@ -831,12 +803,12 @@ void NodeManager::NodeRemoved(const NodeID &node_id) { failed_nodes_cache_.insert(node_id); - cluster_task_manager_.CancelAllTasksOwnedBy(node_id); + cluster_lease_manager_.CancelAllLeasesOwnedBy(node_id); // Clean up workers that were owned by processes that were on the failed // node. for (const auto &[_, worker] : leased_workers_) { - const auto owner_node_id = NodeID::FromBinary(worker->GetOwnerAddress().raylet_id()); + const auto owner_node_id = NodeID::FromBinary(worker->GetOwnerAddress().node_id()); RAY_CHECK(!owner_node_id.IsNil()); if (worker->IsDetachedActor() || owner_node_id != node_id) { continue; @@ -875,7 +847,7 @@ void NodeManager::HandleUnexpectedWorkerFailure(const WorkerID &worker_id) { RAY_LOG(DEBUG).WithField(worker_id) << "Worker failed"; failed_workers_cache_.insert(worker_id); - cluster_task_manager_.CancelAllTasksOwnedBy(worker_id); + cluster_lease_manager_.CancelAllLeasesOwnedBy(worker_id); for (const auto &[_, worker] : leased_workers_) { const auto owner_worker_id = @@ -1058,7 +1030,7 @@ void NodeManager::ProcessClientMessage(const std::shared_ptr & } break; case protocol::MessageType::FreeObjectsInObjectStoreRequest: { auto message = flatbuffers::GetRoot(message_data); - std::vector object_ids = from_flatbuf(*message->object_ids()); + auto object_ids = FlatbufferToObjectIds(*message->object_ids()); // Clean up objects from the object store. 
object_manager_.FreeObjects(object_ids, message->local_only()); } break; @@ -1085,12 +1057,12 @@ Status NodeManager::ProcessRegisterClientRequestMessageImpl( client->Register(); Language language = static_cast(message->language()); - const JobID job_id = from_flatbuf(*message->job_id()); + const JobID job_id = JobID::FromBinary(message->job_id()->str()); const int runtime_env_hash = static_cast(message->runtime_env_hash()); - WorkerID worker_id = from_flatbuf(*message->worker_id()); + WorkerID worker_id = WorkerID::FromBinary(message->worker_id()->str()); pid_t pid = message->worker_pid(); StartupToken worker_startup_token = message->startup_token(); - std::string worker_ip_address = string_from_flatbuf(*message->ip_address()); + std::string worker_ip_address = message->ip_address()->str(); // TODO(suquark): Use `WorkerType` in `common.proto` without type converting. rpc::WorkerType worker_type = static_cast(message->worker_type()); if (worker_type == rpc::WorkerType::DRIVER) { @@ -1118,21 +1090,21 @@ Status NodeManager::ProcessRegisterClientRequestMessageImpl( ray::protocol::CreateRegisterClientReply(fbb, status.ok(), fbb.CreateString(status.ToString()), - to_flatbuf(fbb, self_node_id_), + flatbuf::to_flatbuf(fbb, self_node_id_), assigned_port); fbb.Finish(reply); client->WriteMessageAsync( static_cast(protocol::MessageType::RegisterClientReply), fbb.GetSize(), fbb.GetBufferPointer(), - [this, client](const ray::Status &status) { - if (!status.ok()) { + [this, client](const ray::Status &write_msg_status) { + if (!write_msg_status.ok()) { DisconnectClient(client, /*graceful=*/false, rpc::WorkerExitType::SYSTEM_ERROR, "Worker is failed because the raylet couldn't reply the " "registration request: " + - status.ToString()); + write_msg_status.ToString()); } }); }; @@ -1155,10 +1127,10 @@ Status NodeManager::RegisterForNewWorker( Status status = worker_pool_.RegisterWorker(worker, pid, worker_startup_token, send_reply_callback); if (!status.ok()) { - // If the worker 
failed to register to Raylet, trigger task dispatching here to + // If the worker failed to register to Raylet, trigger lease granting here to // allow new worker processes to be started (if capped by // maximum_startup_concurrency). - cluster_task_manager_.ScheduleAndDispatchTasks(); + cluster_lease_manager_.ScheduleAndGrantLeases(); } return status; } @@ -1170,11 +1142,6 @@ Status NodeManager::RegisterForNewDriver( const ray::protocol::RegisterClientRequest *message, std::function send_reply_callback) { worker->SetProcess(Process::FromPid(pid)); - // Compute a dummy driver task id from a given driver. - // The task id set in the worker here should be consistent with the task - // id set in the core worker. - const TaskID driver_task_id = TaskID::ForDriverTask(job_id); - worker->AssignTaskId(driver_task_id); rpc::JobConfig job_config; job_config.ParseFromString(message->serialized_job_config()->str()); @@ -1212,18 +1179,17 @@ void NodeManager::ProcessAnnounceWorkerPortMessageImpl( RAY_CHECK(job_config.has_value()); rpc::Address driver_address; - // Assume raylet ID is the same as the node ID. - driver_address.set_raylet_id(self_node_id_.Binary()); + // Assume node ID is the same as the node ID. 
+ driver_address.set_node_id(self_node_id_.Binary()); driver_address.set_ip_address(worker->IpAddress()); driver_address.set_port(port); driver_address.set_worker_id(worker->WorkerId().Binary()); - auto job_data_ptr = - gcs::CreateJobTableData(job_id, - /*is_dead=*/false, - driver_address, - worker->GetProcess().GetId(), - string_from_flatbuf(*message->entrypoint()), - *job_config); + auto job_data_ptr = gcs::CreateJobTableData(job_id, + /*is_dead=*/false, + driver_address, + worker->GetProcess().GetId(), + message->entrypoint()->str(), + *job_config); gcs_client_.Jobs().AsyncAdd(job_data_ptr, [this, client](Status status) { SendPortAnnouncementResponse(client, std::move(status)); @@ -1246,19 +1212,20 @@ void NodeManager::SendPortAnnouncementResponse( static_cast(protocol::MessageType::AnnounceWorkerPortReply), fbb.GetSize(), fbb.GetBufferPointer(), - [this, client](const ray::Status &status) { - if (!status.ok()) { - DisconnectClient( - client, - /*graceful=*/false, - rpc::WorkerExitType::SYSTEM_ERROR, - "Failed to send AnnounceWorkerPortReply to client: " + status.ToString()); + [this, client](const ray::Status &write_msg_status) { + if (!write_msg_status.ok()) { + DisconnectClient(client, + /*graceful=*/false, + rpc::WorkerExitType::SYSTEM_ERROR, + "Failed to send AnnounceWorkerPortReply to client: " + + write_msg_status.ToString()); } }); } void NodeManager::HandleWorkerAvailable(const std::shared_ptr &worker) { RAY_CHECK(worker); + RAY_CHECK_NE(worker->GetWorkerType(), rpc::WorkerType::DRIVER); if (worker->GetWorkerType() == rpc::WorkerType::SPILL_WORKER) { // Return the worker to the idle pool. @@ -1274,9 +1241,9 @@ void NodeManager::HandleWorkerAvailable(const std::shared_ptr & bool worker_idle = true; - // If the worker was assigned a task, mark it as finished. 
- if (!worker->GetAssignedTaskId().IsNil()) { - worker_idle = FinishAssignedTask(worker); + // If the worker was granted a lease, clean up any lease resources and state + if (!worker->GetGrantedLeaseId().IsNil()) { + worker_idle = CleanupLease(worker); } if (worker_idle) { @@ -1284,7 +1251,7 @@ void NodeManager::HandleWorkerAvailable(const std::shared_ptr & worker_pool_.PushWorker(worker); } - cluster_task_manager_.ScheduleAndDispatchTasks(); + cluster_lease_manager_.ScheduleAndGrantLeases(); } void SendDisconnectClientReply(const WorkerID &worker_id, @@ -1336,11 +1303,13 @@ void NodeManager::DisconnectClient(const std::shared_ptr &clie RAY_CHECK(worker != nullptr); RAY_CHECK(!(is_worker && is_driver)); // Clean up any open ray.get or ray.wait calls that the worker made. - dependency_manager_.CancelGetRequest(worker->WorkerId()); - dependency_manager_.CancelWaitRequest(worker->WorkerId()); + lease_dependency_manager_.CancelGetRequest(worker->WorkerId()); + lease_dependency_manager_.CancelWaitRequest(worker->WorkerId()); // Erase any lease metadata. - ReleaseWorker(worker->WorkerId()); + if (leased_workers_.contains(worker->GetGrantedLeaseId())) { + ReleaseWorker(worker->GetGrantedLeaseId()); + } if (creation_task_exception != nullptr) { RAY_LOG(INFO).WithField(worker->WorkerId()) @@ -1361,15 +1330,15 @@ void NodeManager::DisconnectClient(const std::shared_ptr &clie if (is_worker) { const ActorID &actor_id = worker->GetActorId(); - const TaskID &task_id = worker->GetAssignedTaskId(); - // If the worker was running a task or actor, clean up the task and push an + const LeaseID &lease_id = worker->GetGrantedLeaseId(); + // If the worker was granted a lease, clean up the lease and push an // error to the driver, unless the worker is already dead. - if ((!task_id.IsNil() || !actor_id.IsNil()) && !worker->IsDead()) { + if ((!lease_id.IsNil() || !actor_id.IsNil()) && !worker->IsDead()) { // If the worker was an actor, it'll be cleaned by GCS. 
if (actor_id.IsNil()) { // Return the resources that were being used by this worker. - RayTask task; - local_task_manager_.TaskFinished(worker, &task); + RayLease lease; + local_lease_manager_.CleanupLease(worker, &lease); } if (disconnect_type == rpc::WorkerExitType::SYSTEM_ERROR) { @@ -1382,8 +1351,7 @@ void NodeManager::DisconnectClient(const std::shared_ptr &clie "unexpected system " "error. To troubleshoot the problem, check the logs for the " "dead worker." - << " RayTask ID: " << task_id - << " Worker ID: " << worker->WorkerId() + << " Lease ID: " << lease_id << " Worker ID: " << worker->WorkerId() << " Node ID: " << self_node_id_ << " Worker IP address: " << worker->IpAddress() << " Worker port: " << worker->Port() @@ -1397,9 +1365,9 @@ void NodeManager::DisconnectClient(const std::shared_ptr &clie .WithField("node_id", self_node_id_.Hex()) .WithField("job_id", worker->GetAssignedJobId().Hex()) << error_message_str; - auto error_data_ptr = gcs::CreateErrorTableData( + auto error_data = gcs::CreateErrorTableData( type, error_message_str, absl::FromUnixMillis(current_time_ms()), job_id); - gcs_client_.Errors().AsyncReportJobError(error_data_ptr, nullptr); + gcs_client_.Errors().AsyncReportJobError(std::move(error_data)); } } @@ -1407,10 +1375,10 @@ void NodeManager::DisconnectClient(const std::shared_ptr &clie worker_pool_.DisconnectWorker(worker, disconnect_type); // Return the resources that were being used by this worker. - local_task_manager_.ReleaseWorkerResources(worker); + local_lease_manager_.ReleaseWorkerResources(worker); - // Since some resources may have been released, we can try to dispatch more tasks. - cluster_task_manager_.ScheduleAndDispatchTasks(); + // Since some resources may have been released, we can try to grant more leases. + cluster_lease_manager_.ScheduleAndGrantLeases(); } else if (is_driver) { // The client is a driver. 
const auto job_id = worker->GetAssignedJobId(); @@ -1431,8 +1399,8 @@ void NodeManager::DisconnectClient(const std::shared_ptr &clie } } - local_task_manager_.ClearWorkerBacklog(worker->WorkerId()); - cluster_task_manager_.CancelAllTasksOwnedBy(worker->WorkerId()); + local_lease_manager_.ClearWorkerBacklog(worker->WorkerId()); + cluster_lease_manager_.CancelAllLeasesOwnedBy(worker->WorkerId()); if (graceful) { // Graceful disconnects are initiated by a request from the worker and @@ -1471,7 +1439,7 @@ void NodeManager::HandleAsyncGetObjectsRequest( const std::shared_ptr &client, const uint8_t *message_data) { auto request = flatbuffers::GetRoot(message_data); const auto refs = - FlatbufferToObjectReference(*request->object_ids(), *request->owner_addresses()); + FlatbufferToObjectReferences(*request->object_ids(), *request->owner_addresses()); // Asynchronously pull all requested objects to the local node. AsyncGetOrWait(client, @@ -1483,13 +1451,13 @@ void NodeManager::ProcessWaitRequestMessage( const std::shared_ptr &client, const uint8_t *message_data) { // Read the data. auto message = flatbuffers::GetRoot(message_data); - std::vector object_ids = from_flatbuf(*message->object_ids()); + auto object_ids = FlatbufferToObjectIds(*message->object_ids()); const auto refs = - FlatbufferToObjectReference(*message->object_ids(), *message->owner_addresses()); + FlatbufferToObjectReferences(*message->object_ids(), *message->owner_addresses()); bool all_objects_local = true; for (auto const &object_id : object_ids) { - if (!dependency_manager_.CheckObjectLocal(object_id)) { + if (!lease_dependency_manager_.CheckObjectLocal(object_id)) { all_objects_local = false; } } @@ -1506,9 +1474,10 @@ void NodeManager::ProcessWaitRequestMessage( // If we don't need to wait for any, return immediately after making the pull // requests through AsyncGetOrWait above. 
flatbuffers::FlatBufferBuilder fbb; - auto wait_reply = protocol::CreateWaitReply(fbb, - to_flatbuf(fbb, std::vector{}), - to_flatbuf(fbb, std::vector{})); + auto wait_reply = + protocol::CreateWaitReply(fbb, + flatbuf::to_flatbuf(fbb, std::vector{}), + flatbuf::to_flatbuf(fbb, std::vector{})); fbb.Finish(wait_reply); const auto status = client->WriteMessage(static_cast(protocol::MessageType::WaitReply), @@ -1523,17 +1492,17 @@ void NodeManager::ProcessWaitRequestMessage( } return; } - uint64_t num_required_objects = static_cast(message->num_required_objects()); + wait_manager_.Wait( object_ids, message->timeout(), - num_required_objects, - [this, client, all_objects_local](std::vector ready, - std::vector remaining) { + message->num_required_objects(), + [this, client, all_objects_local](const std::vector &ready, + const std::vector &remaining) { // Write the data. flatbuffers::FlatBufferBuilder fbb; flatbuffers::Offset wait_reply = protocol::CreateWaitReply( - fbb, to_flatbuf(fbb, ready), to_flatbuf(fbb, remaining)); + fbb, flatbuf::to_flatbuf(fbb, ready), flatbuf::to_flatbuf(fbb, remaining)); fbb.Finish(wait_reply); auto status = @@ -1560,42 +1529,42 @@ void NodeManager::ProcessWaitForActorCallArgsRequestMessage( const std::shared_ptr &client, const uint8_t *message_data) { auto message = flatbuffers::GetRoot(message_data); - std::vector object_ids = from_flatbuf(*message->object_ids()); + auto object_ids = FlatbufferToObjectIds(*message->object_ids()); int64_t tag = message->tag(); // Pull any missing objects to the local node. const auto refs = - FlatbufferToObjectReference(*message->object_ids(), *message->owner_addresses()); + FlatbufferToObjectReferences(*message->object_ids(), *message->owner_addresses()); AsyncGetOrWait(client, refs, /*is_get_request=*/false); // De-duplicate the object IDs. 
absl::flat_hash_set object_id_set(object_ids.begin(), object_ids.end()); object_ids.assign(object_id_set.begin(), object_id_set.end()); - wait_manager_.Wait( - object_ids, - -1, - object_ids.size(), - [this, client, tag](std::vector ready, std::vector remaining) { - RAY_CHECK(remaining.empty()); - std::shared_ptr worker = - worker_pool_.GetRegisteredWorker(client); - if (!worker) { - RAY_LOG(ERROR) << "Lost worker for wait request " << client; - } else { - worker->ActorCallArgWaitComplete(tag); - } - }); + wait_manager_.Wait(object_ids, + -1, + object_ids.size(), + [this, client, tag](const std::vector &ready, + const std::vector &remaining) { + RAY_CHECK(remaining.empty()); + std::shared_ptr worker = + worker_pool_.GetRegisteredWorker(client); + if (!worker) { + RAY_LOG(ERROR) << "Lost worker for wait request " << client; + } else { + worker->ActorCallArgWaitComplete(tag); + } + }); } void NodeManager::ProcessPushErrorRequestMessage(const uint8_t *message_data) { auto message = flatbuffers::GetRoot(message_data); - auto const &type = string_from_flatbuf(*message->type()); - auto const &error_message = string_from_flatbuf(*message->error_message()); + auto const &type = message->type()->str(); + auto const &error_message = message->error_message()->str(); // TODO(hjiang): Figure out what's the unit for `PushErrorRequest`. 
double timestamp = message->timestamp(); - JobID job_id = from_flatbuf(*message->job_id()); - auto error_data_ptr = gcs::CreateErrorTableData( + JobID job_id = JobID::FromBinary(message->job_id()->str()); + auto error_data = gcs::CreateErrorTableData( type, error_message, absl::FromUnixMillis(timestamp), job_id); - gcs_client_.Errors().AsyncReportJobError(error_data_ptr, nullptr); + gcs_client_.Errors().AsyncReportJobError(std::move(error_data)); } void NodeManager::HandleGetResourceLoad(rpc::GetResourceLoadRequest request, @@ -1604,22 +1573,21 @@ void NodeManager::HandleGetResourceLoad(rpc::GetResourceLoadRequest request, auto resources_data = reply->mutable_resources(); resources_data->set_node_id(self_node_id_.Binary()); resources_data->set_node_manager_address(initial_config_.node_manager_address); - cluster_task_manager_.FillResourceUsage(*resources_data); + cluster_lease_manager_.FillResourceUsage(*resources_data); send_reply_callback(Status::OK(), nullptr, nullptr); } -void NodeManager::HandleCancelTasksWithResourceShapes( - rpc::CancelTasksWithResourceShapesRequest request, - rpc::CancelTasksWithResourceShapesReply *reply, +void NodeManager::HandleCancelLeasesWithResourceShapes( + rpc::CancelLeasesWithResourceShapesRequest request, + rpc::CancelLeasesWithResourceShapesReply *reply, rpc::SendReplyCallback send_reply_callback) { const auto &resource_shapes = request.resource_shapes(); std::vector target_resource_shapes; for (const auto &resource_shape : resource_shapes) { - target_resource_shapes.emplace_back( - ResourceSet(MapFromProtobuf(resource_shape.resource_shape()))); + target_resource_shapes.emplace_back(MapFromProtobuf(resource_shape.resource_shape())); } - cluster_task_manager_.CancelTasksWithResourceShapes(target_resource_shapes); + cluster_lease_manager_.CancelLeasesWithResourceShapes(target_resource_shapes); send_reply_callback(Status::OK(), nullptr, nullptr); } @@ -1627,14 +1595,15 @@ void 
NodeManager::HandleReportWorkerBacklog(rpc::ReportWorkerBacklogRequest requ rpc::ReportWorkerBacklogReply *reply, rpc::SendReplyCallback send_reply_callback) { HandleReportWorkerBacklog( - request, reply, send_reply_callback, worker_pool_, local_task_manager_); + request, reply, send_reply_callback, worker_pool_, local_lease_manager_); } -void NodeManager::HandleReportWorkerBacklog(rpc::ReportWorkerBacklogRequest request, - rpc::ReportWorkerBacklogReply *reply, - rpc::SendReplyCallback send_reply_callback, - WorkerPoolInterface &worker_pool, - ILocalTaskManager &local_task_manager) { +void NodeManager::HandleReportWorkerBacklog( + rpc::ReportWorkerBacklogRequest request, + rpc::ReportWorkerBacklogReply *reply, + rpc::SendReplyCallback send_reply_callback, + WorkerPoolInterface &worker_pool, + LocalLeaseManagerInterface &local_lease_manager) { const WorkerID worker_id = WorkerID::FromBinary(request.worker_id()); if (worker_pool.GetRegisteredWorker(worker_id) == nullptr && worker_pool.GetRegisteredDriver(worker_id) == nullptr) { @@ -1643,13 +1612,13 @@ void NodeManager::HandleReportWorkerBacklog(rpc::ReportWorkerBacklogRequest requ return; } - local_task_manager.ClearWorkerBacklog(worker_id); + local_lease_manager.ClearWorkerBacklog(worker_id); std::unordered_set seen; for (const auto &backlog_report : request.backlog_reports()) { - const TaskSpecification resource_spec(backlog_report.resource_spec()); - const SchedulingClass scheduling_class = resource_spec.GetSchedulingClass(); + const LeaseSpecification lease_spec(backlog_report.lease_spec()); + const SchedulingClass scheduling_class = lease_spec.GetSchedulingClass(); RAY_CHECK(seen.find(scheduling_class) == seen.end()); - local_task_manager.SetWorkerBacklog( + local_lease_manager.SetWorkerBacklog( scheduling_class, worker_id, backlog_report.backlog_size()); } send_reply_callback(Status::OK(), nullptr, nullptr); @@ -1658,13 +1627,27 @@ void NodeManager::HandleReportWorkerBacklog(rpc::ReportWorkerBacklogRequest 
requ void NodeManager::HandleRequestWorkerLease(rpc::RequestWorkerLeaseRequest request, rpc::RequestWorkerLeaseReply *reply, rpc::SendReplyCallback send_reply_callback) { - RayTask task{std::move(*request.mutable_resource_spec())}; - + auto lease_id = LeaseID::FromBinary(request.lease_spec().lease_id()); + // If the lease is already granted, this is a retry and forward the address of the + // already leased worker to use. + if (leased_workers_.contains(lease_id)) { + const auto &worker = leased_workers_[lease_id]; + RAY_LOG(DEBUG) << "Lease " << lease_id + << " is already granted with worker: " << worker->WorkerId(); + reply->set_worker_pid(worker->GetProcess().GetId()); + reply->mutable_worker_address()->set_ip_address(worker->IpAddress()); + reply->mutable_worker_address()->set_port(worker->Port()); + reply->mutable_worker_address()->set_worker_id(worker->WorkerId().Binary()); + reply->mutable_worker_address()->set_node_id(self_node_id_.Binary()); + send_reply_callback(Status::OK(), nullptr, nullptr); + return; + } + RayLease lease{std::move(*request.mutable_lease_spec())}; const auto caller_worker = - WorkerID::FromBinary(task.GetTaskSpecification().CallerAddress().worker_id()); + WorkerID::FromBinary(lease.GetLeaseSpecification().CallerAddress().worker_id()); const auto caller_node = - NodeID::FromBinary(task.GetTaskSpecification().CallerAddress().raylet_id()); - if (!task.GetTaskSpecification().IsDetachedActor() && + NodeID::FromBinary(lease.GetLeaseSpecification().CallerAddress().node_id()); + if (!lease.GetLeaseSpecification().IsDetachedActor() && (failed_workers_cache_.contains(caller_worker) || failed_nodes_cache_.contains(caller_node))) { RAY_LOG(INFO).WithField(caller_worker).WithField(caller_node) @@ -1677,16 +1660,16 @@ void NodeManager::HandleRequestWorkerLease(rpc::RequestWorkerLeaseRequest reques return; }; - const bool is_actor_creation_task = task.GetTaskSpecification().IsActorCreationTask(); + const bool is_actor_creation_task = 
lease.GetLeaseSpecification().IsActorCreationTask(); ActorID actor_id = ActorID::Nil(); metrics_num_task_scheduled_ += 1; if (is_actor_creation_task) { - actor_id = task.GetTaskSpecification().ActorCreationId(); + actor_id = lease.GetLeaseSpecification().ActorId(); } - const auto &task_spec = task.GetTaskSpecification(); - worker_pool_.PrestartWorkers(task_spec, request.backlog_size()); + const auto &lease_spec = lease.GetLeaseSpecification(); + worker_pool_.PrestartWorkers(lease_spec, request.backlog_size()); auto send_reply_callback_wrapper = [this, is_actor_creation_task, actor_id, reply, send_reply_callback]( @@ -1699,11 +1682,11 @@ void NodeManager::HandleRequestWorkerLease(rpc::RequestWorkerLeaseRequest reques // with normal task resource usages so GCS can fast update // its resource view of this raylet. if (RayConfig::instance().gcs_actor_scheduling_enabled()) { - auto normal_task_resources = local_task_manager_.CalcNormalTaskResources(); + auto normal_task_resources = local_lease_manager_.CalcNormalTaskResources(); RAY_LOG(DEBUG).WithField(actor_id) << "Reject leasing as the raylet has no enough resources. 
" "normal_task_resources = " - << normal_task_resources.DebugString() << ", local_resoruce_view = " + << normal_task_resources.DebugString() << ", local_resource_view = " << cluster_resource_scheduler_.GetClusterResourceManager() .GetNodeResourceViewString( scheduling::NodeID(self_node_id_.Binary())); @@ -1718,11 +1701,11 @@ void NodeManager::HandleRequestWorkerLease(rpc::RequestWorkerLeaseRequest reques send_reply_callback(status, success, failure); }; - cluster_task_manager_.QueueAndScheduleTask(std::move(task), - request.grant_or_reject(), - request.is_selected_based_on_locality(), - reply, - std::move(send_reply_callback_wrapper)); + cluster_lease_manager_.QueueAndScheduleLease(std::move(lease), + request.grant_or_reject(), + request.is_selected_based_on_locality(), + reply, + std::move(send_reply_callback_wrapper)); } void NodeManager::HandlePrestartWorkers(rpc::PrestartWorkersRequest request, @@ -1788,7 +1771,7 @@ void NodeManager::HandleCommitBundleResources( placement_group_resource_manager_->CommitBundles(bundle_specs); send_reply_callback(Status::OK(), nullptr, nullptr); - cluster_task_manager_.ScheduleAndDispatchTasks(); + cluster_lease_manager_.ScheduleAndGrantLeases(); } void NodeManager::HandleCancelResourceReserve( @@ -1800,12 +1783,13 @@ void NodeManager::HandleCancelResourceReserve( << bundle_spec.DebugString(); // The PG bundle resource must be committed before a lease request asking for it - // can be added to local_task_manager and the only reason why we cancel + // can be added to local_lease_manager and the only reason why we cancel // a committed bundle is when the placement group is removed. // In the case of placement group removal, we should cancel all the lease requests. 
- local_task_manager_.CancelTasks( + local_lease_manager_.CancelLeases( [&](const std::shared_ptr &work) { - const auto bundle_id = work->task.GetTaskSpecification().PlacementGroupBundleId(); + const auto bundle_id = + work->lease_.GetLeaseSpecification().PlacementGroupBundleId(); return bundle_id.first == bundle_spec.PlacementGroupId(); }, rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_PLACEMENT_GROUP_REMOVED, @@ -1830,7 +1814,7 @@ void NodeManager::HandleCancelResourceReserve( << "Destroying worker since its placement group was removed. Placement group id: " << worker->GetBundleId().first << ", bundle index: " << bundle_spec.BundleId().second - << ", task id: " << worker->GetAssignedTaskId() + << ", lease id: " << worker->GetGrantedLeaseId() << ", actor id: " << worker->GetActorId() << ", worker id: " << worker->WorkerId(); const auto &message = stream.str(); @@ -1839,47 +1823,146 @@ void NodeManager::HandleCancelResourceReserve( } RAY_CHECK_OK(placement_group_resource_manager_->ReturnBundle(bundle_spec)); - cluster_task_manager_.ScheduleAndDispatchTasks(); + cluster_lease_manager_.ScheduleAndGrantLeases(); send_reply_callback(Status::OK(), nullptr, nullptr); } -void NodeManager::HandleReturnWorker(rpc::ReturnWorkerRequest request, - rpc::ReturnWorkerReply *reply, - rpc::SendReplyCallback send_reply_callback) { - // Read the resource spec submitted by the client. 
- auto worker_id = WorkerID::FromBinary(request.worker_id()); - std::shared_ptr worker = leased_workers_[worker_id]; +void NodeManager::HandleResizeLocalResourceInstances( + rpc::ResizeLocalResourceInstancesRequest request, + rpc::ResizeLocalResourceInstancesReply *reply, + rpc::SendReplyCallback send_reply_callback) { + const auto &target_resource_map = request.resources(); + + // Check if any resource is a unit instance resource + // Unit instance resources (e.g., GPU) cannot be resized with this API + for (const auto &[resource_name, target_value] : target_resource_map) { + if (ResourceID(resource_name).IsUnitInstanceResource()) { + std::string error_msg = absl::StrFormat( + "Cannot resize unit instance resource '%s'. Unit instance resources " + "(e.g., GPU) cannot be resized dynamically.", + resource_name); + send_reply_callback(Status::InvalidArgument(error_msg), nullptr, nullptr); + return; + } + } - Status status; - ReleaseWorker(worker_id); + // Get current local resources and convert to resource maps + const auto &current_resources = + cluster_resource_scheduler_.GetLocalResourceManager().GetLocalResources(); + const auto &current_total_map = + current_resources.GetTotalResourceInstances().ToNodeResourceSet().GetResourceMap(); + const auto &current_available_map = current_resources.GetAvailableResourceInstances() + .ToNodeResourceSet() + .GetResourceMap(); + + // Calculate delta resource map (target - current) and clamp to avoid + // making available resources negative + absl::flat_hash_map delta_resource_map; + for (const auto &[resource_name, target_value] : target_resource_map) { + double current_total = 0.0; + double current_available = 0.0; + + if (auto total_it = current_total_map.find(resource_name); + total_it != current_total_map.end()) { + current_total = total_it->second; + } - if (worker) { - if (request.disconnect_worker()) { - // The worker should be destroyed.
- DisconnectClient( - worker->Connection(), - /*graceful=*/false, - rpc::WorkerExitType::SYSTEM_ERROR, - absl::StrCat("The leased worker has unrecoverable failure. Worker is requested " - "to be destroyed when it is returned. ", - request.disconnect_worker_error_detail())); - } else { - if (worker->IsBlocked()) { - // Handle the edge case where the worker was returned before we got the - // unblock RPC by unblocking it immediately (unblock is idempotent). - HandleDirectCallTaskUnblocked(worker); - } - local_task_manager_.ReleaseWorkerResources(worker); - // If the worker is exiting, don't add it to our pool. The worker will cleanup - // and terminate itself. - if (!request.worker_exiting()) { - HandleWorkerAvailable(worker); - } + if (auto available_it = current_available_map.find(resource_name); + available_it != current_available_map.end()) { + current_available = available_it->second; + } + + double delta_value = target_value - current_total; + + // Clamp so current_available never goes below 0. + // For example, if delta_value is -4 but the current_available is 2, + // then clamp delta_value to -2. 
+ if (delta_value < -current_available) { + delta_value = -current_available; + } + + if (delta_value != 0.0) { + delta_resource_map[resource_name] = delta_value; } + } + + // Convert the delta resource map to NodeResourceInstanceSet and apply + if (!delta_resource_map.empty()) { + NodeResourceSet delta_resources(delta_resource_map); + NodeResourceInstanceSet delta_instances(delta_resources); + + // Apply deltas for each resource + for (const auto &resource_id : delta_resources.ExplicitResourceIds()) { + const auto &instances = delta_instances.Get(resource_id); + cluster_resource_scheduler_.GetLocalResourceManager().AddLocalResourceInstances( + resource_id, instances); + } + } + + // Get updated resource state and populate reply + const auto &updated_resources = + cluster_resource_scheduler_.GetLocalResourceManager().GetLocalResources(); + const auto &updated_total_map = + updated_resources.GetTotalResourceInstances().ToNodeResourceSet().GetResourceMap(); + const auto &updated_available_map = updated_resources.GetAvailableResourceInstances() + .ToNodeResourceSet() + .GetResourceMap(); + + if (!delta_resource_map.empty()) { + // Log the updated resources + RAY_LOG(INFO) << "Successfully resized local resources. Current total resources: " + << debug_string(updated_total_map); + RAY_LOG(INFO) << "Available resources: " << debug_string(updated_available_map); + // Trigger scheduling to account for the new resources + cluster_lease_manager_.ScheduleAndGrantLeases(); + } + + // Populate the reply with the current resource state + auto *total_resources = reply->mutable_total_resources(); + total_resources->insert(updated_total_map.begin(), updated_total_map.end()); + + send_reply_callback(Status::OK(), nullptr, nullptr); +} + +void NodeManager::HandleReturnWorkerLease(rpc::ReturnWorkerLeaseRequest request, + rpc::ReturnWorkerLeaseReply *reply, + rpc::SendReplyCallback send_reply_callback) { + // Read the resource spec submitted by the client. 
+ auto lease_id = LeaseID::FromBinary(request.lease_id()); + + // Check if this message is a retry + if (!leased_workers_.contains(lease_id)) { + send_reply_callback(Status::OK(), nullptr, nullptr); + return; + } + + std::shared_ptr worker = leased_workers_[lease_id]; + ReleaseWorker(lease_id); + + if (request.disconnect_worker()) { + // The worker should be destroyed. + DisconnectClient( + worker->Connection(), + /*graceful=*/false, + rpc::WorkerExitType::SYSTEM_ERROR, + absl::StrCat("The leased worker has unrecoverable failure. Worker is requested " + "to be destroyed when it is returned. ", + request.disconnect_worker_error_detail())); } else { - status = Status::Invalid("Returned worker does not exist any more"); + if (worker->IsBlocked()) { + // Handle the edge case where the worker was returned before we got the + // unblock RPC by unblocking it immediately (unblock is idempotent). + HandleDirectCallTaskUnblocked(worker); + } + local_lease_manager_.ReleaseWorkerResources(worker); + // If the worker is exiting, don't add it to our pool. The worker will cleanup + // and terminate itself. 
+ if (!request.worker_exiting()) { + HandleWorkerAvailable(worker); + } } - send_reply_callback(status, nullptr, nullptr); + + send_reply_callback(Status::OK(), nullptr, nullptr); } void NodeManager::HandleIsLocalWorkerDead(rpc::IsLocalWorkerDeadRequest request, @@ -1953,16 +2036,17 @@ void NodeManager::HandleReleaseUnusedActorWorkers( rpc::ReleaseUnusedActorWorkersRequest request, rpc::ReleaseUnusedActorWorkersReply *reply, rpc::SendReplyCallback send_reply_callback) { - std::unordered_set in_use_worker_ids; - for (int index = 0; index < request.worker_ids_in_use_size(); ++index) { - auto worker_id = WorkerID::FromBinary(request.worker_ids_in_use(index)); - in_use_worker_ids.emplace(worker_id); + absl::flat_hash_set in_use_worker_ids; + in_use_worker_ids.reserve(request.worker_ids_in_use_size()); + for (const auto &worker_id_in_use_binary : request.worker_ids_in_use()) { + in_use_worker_ids.emplace(WorkerID::FromBinary(worker_id_in_use_binary)); } std::vector> unused_actor_workers; for (auto &iter : leased_workers_) { // We only kill *actor* workers. - if (!iter.second->GetActorId().IsNil() && !in_use_worker_ids.count(iter.first)) { + if (!iter.second->GetActorId().IsNil() && + !in_use_worker_ids.contains(iter.second->WorkerId())) { unused_actor_workers.push_back(iter.second); } } @@ -1981,11 +2065,11 @@ void NodeManager::HandleReleaseUnusedActorWorkers( void NodeManager::HandleCancelWorkerLease(rpc::CancelWorkerLeaseRequest request, rpc::CancelWorkerLeaseReply *reply, rpc::SendReplyCallback send_reply_callback) { - const TaskID task_id = TaskID::FromBinary(request.task_id()); - bool canceled = cluster_task_manager_.CancelTask(task_id); - // The task cancellation failed if we did not have the task queued, since - // this means that we may not have received the task request yet. 
It is - // successful if we did have the task queued, since we have now replied to + const LeaseID lease_id = LeaseID::FromBinary(request.lease_id()); + bool canceled = cluster_lease_manager_.CancelLease(lease_id); + // The lease cancellation failed if we did not have the lease queued, since + // this means that we may not have received the lease request yet. It is + // successful if we did have the lease queued, since we have now replied to // the client that requested the lease. reply->set_success(canceled); send_reply_callback(Status::OK(), nullptr, nullptr); @@ -2027,36 +2111,36 @@ void NodeManager::MarkObjectsAsFailed( << " object may hang forever."; std::string error_message = stream.str(); RAY_LOG(ERROR) << error_message; - auto error_data_ptr = gcs::CreateErrorTableData( + auto error_data = gcs::CreateErrorTableData( "task", error_message, absl::FromUnixMillis(current_time_ms()), job_id); - gcs_client_.Errors().AsyncReportJobError(error_data_ptr, nullptr); + gcs_client_.Errors().AsyncReportJobError(std::move(error_data)); } } } void NodeManager::HandleDirectCallTaskBlocked( const std::shared_ptr &worker) { - if (!worker || worker->IsBlocked() || worker->GetAssignedTaskId().IsNil()) { + if (!worker || worker->IsBlocked() || worker->GetGrantedLeaseId().IsNil()) { return; // The worker may have died or is no longer processing the task. } - local_task_manager_.ReleaseCpuResourcesFromBlockedWorker(worker); - cluster_task_manager_.ScheduleAndDispatchTasks(); + local_lease_manager_.ReleaseCpuResourcesFromBlockedWorker(worker); + cluster_lease_manager_.ScheduleAndGrantLeases(); } void NodeManager::HandleDirectCallTaskUnblocked( const std::shared_ptr &worker) { - if (!worker || worker->GetAssignedTaskId().IsNil()) { + if (!worker || worker->GetGrantedLeaseId().IsNil()) { return; // The worker may have died or is no longer processing the task. } // First, always release task dependencies. 
This ensures we don't leak resources even // if we don't need to unblock the worker below. - dependency_manager_.CancelGetRequest(worker->WorkerId()); + lease_dependency_manager_.CancelGetRequest(worker->WorkerId()); if (worker->IsBlocked()) { - local_task_manager_.ReturnCpuResourcesToUnblockedWorker(worker); - cluster_task_manager_.ScheduleAndDispatchTasks(); + local_lease_manager_.ReturnCpuResourcesToUnblockedWorker(worker); + cluster_lease_manager_.ScheduleAndGrantLeases(); } } @@ -2072,9 +2156,9 @@ void NodeManager::AsyncGetOrWait(const std::shared_ptr &client // Start an async request to get or wait for the objects. // The objects will be fetched locally unless the get or wait request is canceled. if (is_get_request) { - dependency_manager_.StartOrUpdateGetRequest(worker->WorkerId(), object_refs); + lease_dependency_manager_.StartOrUpdateGetRequest(worker->WorkerId(), object_refs); } else { - dependency_manager_.StartOrUpdateWaitRequest(worker->WorkerId(), object_refs); + lease_dependency_manager_.StartOrUpdateWaitRequest(worker->WorkerId(), object_refs); } } @@ -2085,52 +2169,48 @@ void NodeManager::CancelGetRequest(const std::shared_ptr &clie } RAY_CHECK(worker); - dependency_manager_.CancelGetRequest(worker->WorkerId()); + lease_dependency_manager_.CancelGetRequest(worker->WorkerId()); } -bool NodeManager::FinishAssignedTask(const std::shared_ptr &worker) { - TaskID task_id = worker->GetAssignedTaskId(); - RAY_LOG(DEBUG).WithField(task_id) << "Finished task "; +bool NodeManager::CleanupLease(const std::shared_ptr &worker) { + LeaseID lease_id = worker->GetGrantedLeaseId(); + RAY_LOG(DEBUG).WithField(lease_id) << "Cleaning up lease "; - RayTask task; - local_task_manager_.TaskFinished(worker, &task); + RayLease lease; + local_lease_manager_.CleanupLease(worker, &lease); - const auto &spec = task.GetTaskSpecification(); // - if ((spec.IsActorCreationTask())) { - // If this was an actor or actor creation task, handle the actor's new - // state. 
- FinishAssignedActorCreationTask(worker, task); + const auto &lease_spec = lease.GetLeaseSpecification(); + if ((lease_spec.IsActorCreationTask())) { + // If this was an actor or actor creation task, convert the worker to an actor. + ConvertWorkerToActor(worker, lease); } else { - // If this was a non-actor task, then cancel any ray.wait calls that were - // made during the task execution. - dependency_manager_.CancelWaitRequest(worker->WorkerId()); + // If this was a non-actor lease, cancel any ray.wait calls that were + // made during the lease execution. + lease_dependency_manager_.CancelWaitRequest(worker->WorkerId()); } - // Notify the task dependency manager that this task has finished execution. - dependency_manager_.CancelGetRequest(worker->WorkerId()); + // Notify the lease dependency manager that this lease has returned. + lease_dependency_manager_.CancelGetRequest(worker->WorkerId()); - if (!spec.IsActorCreationTask()) { - // Unset the worker's assigned task. We keep the assigned task ID for - // actor creation calls because this ID is used later if the actor - // requires objects from plasma. - worker->AssignTaskId(TaskID::Nil()); + if (!lease_spec.IsActorCreationTask()) { + worker->GrantLeaseId(LeaseID::Nil()); worker->SetOwnerAddress(rpc::Address()); } // Actors will be assigned tasks via the core worker and therefore are not idle. 
- return !spec.IsActorCreationTask(); + return !lease_spec.IsActorCreationTask(); } -void NodeManager::FinishAssignedActorCreationTask( - const std::shared_ptr &worker, const RayTask &task) { - RAY_LOG(DEBUG) << "Finishing assigned actor creation task"; - const TaskSpecification task_spec = task.GetTaskSpecification(); - ActorID actor_id = task_spec.ActorCreationId(); +void NodeManager::ConvertWorkerToActor(const std::shared_ptr &worker, + const RayLease &lease) { + RAY_LOG(DEBUG) << "Converting worker to actor"; + const LeaseSpecification lease_spec = lease.GetLeaseSpecification(); + ActorID actor_id = lease_spec.ActorId(); // This was an actor creation task. Convert the worker to an actor. worker->AssignActorId(actor_id); - if (task_spec.IsDetachedActor()) { - auto job_id = task.GetTaskSpecification().JobId(); + if (lease_spec.IsDetachedActor()) { + auto job_id = lease_spec.JobId(); auto job_config = worker_pool_.GetJobConfig(job_id); RAY_CHECK(job_config); } @@ -2156,10 +2236,10 @@ void NodeManager::SpillIfOverPrimaryObjectsThreshold() { void NodeManager::HandleObjectLocal(const ObjectInfo &object_info) { const ObjectID &object_id = object_info.object_id; // Notify the task dependency manager that this object is local. - const auto ready_task_ids = dependency_manager_.HandleObjectLocal(object_id); + const auto ready_lease_ids = lease_dependency_manager_.HandleObjectLocal(object_id); RAY_LOG(DEBUG).WithField(object_id).WithField(self_node_id_) - << "Object local on node, " << ready_task_ids.size() << " tasks ready"; - local_task_manager_.TasksUnblocked(ready_task_ids); + << "Object local on node, " << ready_lease_ids.size() << " tasks ready"; + local_lease_manager_.LeasesUnblocked(ready_lease_ids); // Notify the wait manager that this object is local. 
wait_manager_.HandleObjectLocal(object_id); @@ -2175,7 +2255,7 @@ void NodeManager::HandleObjectLocal(const ObjectInfo &object_info) { rpc::PlasmaObjectReadyRequest request; request.set_object_id(object_id.Binary()); - for (auto worker : waiting_workers) { + for (const auto &worker : waiting_workers) { worker->rpc_client()->PlasmaObjectReady( request, [](Status status, const rpc::PlasmaObjectReadyReply &reply) { if (!status.ok()) { @@ -2190,27 +2270,17 @@ void NodeManager::HandleObjectLocal(const ObjectInfo &object_info) { SpillIfOverPrimaryObjectsThreshold(); } -bool NodeManager::IsActorCreationTask(const TaskID &task_id) { - auto actor_id = task_id.ActorId(); - if (!actor_id.IsNil() && task_id == TaskID::ForActorCreationTask(actor_id)) { - // This task ID corresponds to an actor creation task. - return true; - } - - return false; -} - void NodeManager::HandleObjectMissing(const ObjectID &object_id) { - // Notify the task dependency manager that this object is no longer local. - const auto waiting_task_ids = dependency_manager_.HandleObjectMissing(object_id); + // Notify the lease dependency manager that this object is no longer local. 
+ const auto waiting_lease_ids = lease_dependency_manager_.HandleObjectMissing(object_id); std::stringstream result; result << "Object missing " << object_id << ", " - << " on " << self_node_id_ << ", " << waiting_task_ids.size() - << " tasks waiting"; - if (waiting_task_ids.size() > 0) { - result << ", tasks: "; - for (const auto &task_id : waiting_task_ids) { - result << task_id << " "; + << " on " << self_node_id_ << ", " << waiting_lease_ids.size() + << " leases waiting"; + if (waiting_lease_ids.size() > 0) { + result << ", leases: "; + for (const auto &lease_id : waiting_lease_ids) { + result << lease_id << " "; } } RAY_LOG(DEBUG) << result.str(); @@ -2227,9 +2297,9 @@ void NodeManager::ProcessSubscribePlasmaReady( << "No worker exists for CoreWorker with client: " << client->DebugString(); auto message = flatbuffers::GetRoot(message_data); - auto id = from_flatbuf(*message->object_id()); + auto id = ObjectID::FromBinary(message->object_id()->str()); - if (dependency_manager_.CheckObjectLocal(id)) { + if (lease_dependency_manager_.CheckObjectLocal(id)) { // Object is already local, so we directly fire the callback to tell the core worker // that the plasma object is ready. rpc::PlasmaObjectReadyRequest request; @@ -2256,7 +2326,8 @@ void NodeManager::ProcessSubscribePlasmaReady( // is local at this time but when the core worker was notified, the object is // is evicted. The core worker should be able to handle evicted object in this // case. - dependency_manager_.StartOrUpdateWaitRequest(associated_worker->WorkerId(), refs); + lease_dependency_manager_.StartOrUpdateWaitRequest(associated_worker->WorkerId(), + refs); // Add this worker to the listeners for the object ID. 
{ @@ -2283,14 +2354,14 @@ std::string NodeManager::DebugString() const { result << "\nNode ID: " << self_node_id_; result << "\nNode name: " << self_node_name_; result << "\nInitialConfigResources: " << initial_config_.resource_config.DebugString(); - result << "\nClusterTaskManager:\n"; - result << cluster_task_manager_.DebugStr(); + result << "\nClusterLeaseManager:\n"; + result << cluster_lease_manager_.DebugStr(); result << "\nClusterResources:"; result << "\n" << local_object_manager_.DebugString(); result << "\n" << object_manager_.DebugString(); result << "\n" << gcs_client_.DebugString(); result << "\n" << worker_pool_.DebugString(); - result << "\n" << dependency_manager_.DebugString(); + result << "\n" << lease_dependency_manager_.DebugString(); result << "\n" << wait_manager_.DebugString(); result << "\n" << core_worker_subscriber_.DebugString(); { @@ -2573,8 +2644,8 @@ void NodeManager::HandleFormatGlobalMemoryInfo( auto store_reply = [replies, reply, num_nodes, send_reply_callback, include_memory_info]( - rpc::GetNodeStatsReply &&local_reply) { - replies->push_back(std::move(local_reply)); + rpc::GetNodeStatsReply &&get_node_status_local_reply) { + replies->push_back(std::move(get_node_status_local_reply)); if (replies->size() >= num_nodes) { if (include_memory_info) { reply->set_memory_summary(FormatMemoryInfo(*replies)); @@ -2662,7 +2733,7 @@ void NodeManager::RecordMetrics() { return; } - cluster_task_manager_.RecordMetrics(); + cluster_lease_manager_.RecordMetrics(); object_manager_.RecordMetrics(); local_object_manager_.RecordMetrics(); @@ -2670,7 +2741,7 @@ void NodeManager::RecordMetrics() { uint64_t duration_ms = current_time - last_metrics_recorded_at_ms_; last_metrics_recorded_at_ms_ = current_time; object_directory_.RecordMetrics(duration_ms); - dependency_manager_.RecordMetrics(); + lease_dependency_manager_.RecordMetrics(); } void NodeManager::ConsumeSyncMessage( @@ -2679,8 +2750,19 @@ void NodeManager::ConsumeSyncMessage( 
syncer::ResourceViewSyncMessage resource_view_sync_message; resource_view_sync_message.ParseFromString(message->sync_message()); NodeID node_id = NodeID::FromBinary(message->node_id()); - if (UpdateResourceUsage(node_id, resource_view_sync_message)) { - cluster_task_manager_.ScheduleAndDispatchTasks(); + // Set node labels when node added. + auto node_labels = MapFromProtobuf(resource_view_sync_message.labels()); + cluster_resource_scheduler_.GetClusterResourceManager().SetNodeLabels( + scheduling::NodeID(node_id.Binary()), std::move(node_labels)); + ResourceRequest resources; + for (auto &resource_entry : resource_view_sync_message.resources_total()) { + resources.Set(scheduling::ResourceID(resource_entry.first), + FixedPoint(resource_entry.second)); + } + const bool capacity_updated = ResourceCreateUpdated(node_id, resources); + const bool usage_update = UpdateResourceUsage(node_id, resource_view_sync_message); + if (capacity_updated || usage_update) { + cluster_lease_manager_.ScheduleAndGrantLeases(); } } else if (message->message_type() == syncer::MessageType::COMMANDS) { syncer::CommandsSyncMessage commands_sync_message; @@ -2724,7 +2806,7 @@ MemoryUsageRefreshCallback NodeManager::CreateMemoryUsageRefreshCallback() { if (!high_memory_eviction_target_->GetProcess().IsAlive()) { RAY_LOG(INFO) .WithField(high_memory_eviction_target_->WorkerId()) - .WithField(high_memory_eviction_target_->GetAssignedTaskId()) + .WithField(high_memory_eviction_target_->GetGrantedLeaseId()) << "Worker evicted and process killed to reclaim memory. 
" << "worker pid: " << high_memory_eviction_target_->GetProcess().GetId(); high_memory_eviction_target_ = nullptr; @@ -2733,7 +2815,7 @@ MemoryUsageRefreshCallback NodeManager::CreateMemoryUsageRefreshCallback() { if (is_usage_above_threshold) { if (high_memory_eviction_target_ != nullptr) { RAY_LOG_EVERY_MS(INFO, 1000) - .WithField(high_memory_eviction_target_->GetAssignedTaskId()) + .WithField(high_memory_eviction_target_->GetGrantedLeaseId()) .WithField(high_memory_eviction_target_->WorkerId()) << "Memory usage above threshold. " << "Still waiting for worker eviction to free up memory. " @@ -2766,7 +2848,7 @@ MemoryUsageRefreshCallback NodeManager::CreateMemoryUsageRefreshCallback() { RAY_LOG(INFO) << "Killing worker with task " - << worker_to_kill->GetAssignedTask().GetTaskSpecification().DebugString() + << worker_to_kill->GetGrantedLease().GetLeaseSpecification().DebugString() << "\n\n" << oom_kill_details << "\n\n" << oom_kill_suggestions; @@ -2781,13 +2863,13 @@ MemoryUsageRefreshCallback NodeManager::CreateMemoryUsageRefreshCallback() { // Rerpot the event to the dashboard. RAY_EVENT_EVERY_MS(ERROR, "Out of Memory", 10 * 1000) << worker_exit_message; - // Mark the task as failure and raise an exception from a caller. - rpc::RayErrorInfo task_failure_reason; - task_failure_reason.set_error_message(worker_exit_message); - task_failure_reason.set_error_type(rpc::ErrorType::OUT_OF_MEMORY); - SetTaskFailureReason(worker_to_kill->GetAssignedTaskId(), - std::move(task_failure_reason), - should_retry); + // Mark the worker as failure and raise an exception from a caller. + rpc::RayErrorInfo worker_failure_reason; + worker_failure_reason.set_error_message(worker_exit_message); + worker_failure_reason.set_error_type(rpc::ErrorType::OUT_OF_MEMORY); + SetWorkerFailureReason(worker_to_kill->GetGrantedLeaseId(), + std::move(worker_failure_reason), + should_retry); /// since we print the process memory in the message. 
Destroy should be called /// as soon as possible to free up memory. @@ -2801,17 +2883,17 @@ MemoryUsageRefreshCallback NodeManager::CreateMemoryUsageRefreshCallback() { ray::stats::STATS_memory_manager_worker_eviction_total.Record( 1, {{"Type", "MemoryManager.DriverEviction.Total"}, {"Name", ""}}); } else if (worker_to_kill->GetActorId().IsNil()) { - const auto &ray_task = worker_to_kill->GetAssignedTask(); + const auto &ray_lease = worker_to_kill->GetGrantedLease(); ray::stats::STATS_memory_manager_worker_eviction_total.Record( 1, {{"Type", "MemoryManager.TaskEviction.Total"}, - {"Name", ray_task.GetTaskSpecification().GetName()}}); + {"Name", ray_lease.GetLeaseSpecification().GetTaskName()}}); } else { - const auto &ray_task = worker_to_kill->GetAssignedTask(); + const auto &ray_lease = worker_to_kill->GetGrantedLease(); ray::stats::STATS_memory_manager_worker_eviction_total.Record( 1, {{"Type", "MemoryManager.ActorEviction.Total"}, - {"Name", ray_task.GetTaskSpecification().GetName()}}); + {"Name", ray_lease.GetLeaseSpecification().GetTaskName()}}); } } } @@ -2847,8 +2929,8 @@ const std::string NodeManager::CreateOomKillMessageDetails( oom_kill_details_ss << "Memory on the node (IP: " << worker->IpAddress() << ", ID: " << node_id - << ") where the task (" << worker->GetTaskOrActorIdAsDebugString() - << ", name=" << worker->GetAssignedTask().GetTaskSpecification().GetName() + << ") where the lease (" << worker->GetLeaseIdAsDebugString() + << ", name=" << worker->GetGrantedLease().GetLeaseSpecification().GetTaskName() << ", pid=" << worker->GetProcess().GetId() << ", memory used=" << process_used_bytes_gb << "GB) was running was " << used_bytes_gb << "GB / " << total_bytes_gb << "GB (" << usage_fraction @@ -2867,9 +2949,9 @@ const std::string NodeManager::CreateOomKillMessageDetails( const std::string NodeManager::CreateOomKillMessageSuggestions( const std::shared_ptr &worker, bool should_retry) const { std::stringstream not_retriable_recommendation_ss; - if 
(worker && !worker->GetAssignedTask().GetTaskSpecification().IsRetriable()) { + if (worker && !worker->GetGrantedLease().GetLeaseSpecification().IsRetriable()) { not_retriable_recommendation_ss << "Set "; - if (worker->GetAssignedTask().GetTaskSpecification().IsNormalTask()) { + if (worker->GetGrantedLease().GetLeaseSpecification().IsNormalTask()) { not_retriable_recommendation_ss << "max_retries"; } else { not_retriable_recommendation_ss << "max_restarts and max_task_retries"; @@ -2897,29 +2979,29 @@ const std::string NodeManager::CreateOomKillMessageSuggestions( return oom_kill_suggestions_ss.str(); } -void NodeManager::SetTaskFailureReason(const TaskID &task_id, - const rpc::RayErrorInfo &failure_reason, - bool should_retry) { - RAY_LOG(DEBUG).WithField(task_id) << "set failure reason for task "; +void NodeManager::SetWorkerFailureReason(const LeaseID &lease_id, + const rpc::RayErrorInfo &failure_reason, + bool should_retry) { + RAY_LOG(DEBUG).WithField(lease_id) << "set failure reason for lease "; ray::TaskFailureEntry entry(failure_reason, should_retry); - auto result = task_failure_reasons_.emplace(task_id, std::move(entry)); + auto result = worker_failure_reasons_.emplace(lease_id, std::move(entry)); if (!result.second) { - RAY_LOG(WARNING).WithField(task_id) + RAY_LOG(WARNING).WithField(lease_id) << "Trying to insert failure reason more than once for the same " - "task, the previous failure will be removed."; + "worker, the previous failure will be removed."; } } -void NodeManager::GCTaskFailureReason() { - for (const auto &entry : task_failure_reasons_) { +void NodeManager::GCWorkerFailureReason() { + for (const auto &entry : worker_failure_reasons_) { auto duration = static_cast( std::chrono::duration_cast( - std::chrono::steady_clock::now() - entry.second.creation_time) + std::chrono::steady_clock::now() - entry.second.creation_time_) .count()); if (duration > RayConfig::instance().task_failure_entry_ttl_ms()) { RAY_LOG(INFO).WithField(entry.first) - << 
"Removing task failure reason since it expired"; - task_failure_reasons_.erase(entry.first); + << "Removing worker failure reason since it expired"; + worker_failure_reasons_.erase(entry.first); } } } diff --git a/src/ray/raylet/node_manager.h b/src/ray/raylet/node_manager.h index df53758583c3..38665194bd8b 100644 --- a/src/ray/raylet/node_manager.h +++ b/src/ray/raylet/node_manager.h @@ -24,11 +24,11 @@ #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/bundle_spec.h" #include "ray/common/id.h" +#include "ray/common/lease/lease.h" #include "ray/common/memory_monitor.h" #include "ray/common/ray_object.h" #include "ray/common/ray_syncer/ray_syncer.h" #include "ray/common/scheduling/resource_set.h" -#include "ray/common/task/task.h" #include "ray/common/task/task_util.h" #include "ray/core_worker/experimental_mutable_object_provider.h" #include "ray/flatbuffers/node_manager_generated.h" @@ -38,19 +38,18 @@ #include "ray/object_manager/plasma/client.h" #include "ray/pubsub/subscriber.h" #include "ray/raylet/agent_manager.h" -#include "ray/raylet/dependency_manager.h" +#include "ray/raylet/lease_dependency_manager.h" +#include "ray/raylet/local_lease_manager.h" #include "ray/raylet/local_object_manager_interface.h" -#include "ray/raylet/local_task_manager.h" #include "ray/raylet/placement_group_resource_manager.h" #include "ray/raylet/runtime_env_agent_client.h" +#include "ray/raylet/scheduling/cluster_lease_manager_interface.h" #include "ray/raylet/scheduling/cluster_resource_scheduler.h" -#include "ray/raylet/scheduling/cluster_task_manager_interface.h" #include "ray/raylet/wait_manager.h" #include "ray/raylet/worker_killing_policy.h" #include "ray/raylet/worker_pool.h" -#include "ray/raylet_client/raylet_client.h" #include "ray/rpc/node_manager/node_manager_server.h" -#include "ray/rpc/node_manager/raylet_client_pool.h" +#include "ray/rpc/raylet/raylet_client_pool.h" #include "ray/rpc/worker/core_worker_client_pool.h" #include 
"ray/util/throttler.h" @@ -116,11 +115,6 @@ struct NodeManagerConfig { int max_io_workers; // The key-value labels of this node. absl::flat_hash_map labels; - // If true, core worker enables resource isolation by adding itself into appropriate - // cgroup. - bool enable_resource_isolation = false; - - void AddDefaultLabels(const std::string &self_node_id); }; class NodeManager : public rpc::NodeManagerServiceHandler, @@ -143,14 +137,14 @@ class NodeManager : public rpc::NodeManagerServiceHandler, rpc::RayletClientPool &raylet_client_pool, pubsub::SubscriberInterface &core_worker_subscriber, ClusterResourceScheduler &cluster_resource_scheduler, - ILocalTaskManager &local_task_manager, - ClusterTaskManagerInterface &cluster_task_manager, + LocalLeaseManagerInterface &local_lease_manager, + ClusterLeaseManagerInterface &cluster_lease_manager, IObjectDirectory &object_directory, ObjectManagerInterface &object_manager, LocalObjectManagerInterface &local_object_manager, - DependencyManager &dependency_manager, + LeaseDependencyManager &lease_dependency_manager, WorkerPoolInterface &worker_pool, - absl::flat_hash_map> &leased_workers, + absl::flat_hash_map> &leased_workers, plasma::PlasmaClientInterface &store_client, std::unique_ptr mutable_object_provider, @@ -279,6 +273,20 @@ class NodeManager : public rpc::NodeManagerServiceHandler, rpc::PinObjectIDsReply *reply, rpc::SendReplyCallback send_reply_callback) override; + /// Handle a `ResizeLocalResourceInstances` request. 
+ void HandleResizeLocalResourceInstances( + rpc::ResizeLocalResourceInstancesRequest request, + rpc::ResizeLocalResourceInstancesReply *reply, + rpc::SendReplyCallback send_reply_callback) override; + + void HandleReturnWorkerLease(rpc::ReturnWorkerLeaseRequest request, + rpc::ReturnWorkerLeaseReply *reply, + rpc::SendReplyCallback send_reply_callback) override; + + void HandleCancelWorkerLease(rpc::CancelWorkerLeaseRequest request, + rpc::CancelWorkerLeaseReply *reply, + rpc::SendReplyCallback send_reply_callback) override; + private: FRIEND_TEST(NodeManagerStaticTest, TestHandleReportWorkerBacklog); @@ -286,8 +294,9 @@ class NodeManager : public rpc::NodeManagerServiceHandler, // Warning: this does NOT release the worker's resources, or put the leased worker // back to the worker pool, or destroy the worker. The caller must handle the worker's // resources well. - void ReleaseWorker(const WorkerID &worker_id) { - leased_workers_.erase(worker_id); + void ReleaseWorker(const LeaseID &lease_id) { + RAY_CHECK(leased_workers_.contains(lease_id)); + leased_workers_.erase(lease_id); SetIdleIfLeaseEmpty(); } @@ -353,18 +362,18 @@ class NodeManager : public rpc::NodeManagerServiceHandler, const NodeID &id, const syncer::ResourceViewSyncMessage &resource_view_sync_message); - /// Handle a worker finishing its assigned task. + /// Cleanup any lease resources and state for a worker that was granted a lease. /// - /// \param worker The worker that finished the task. + /// \param worker The worker that was granted the lease. /// \return Whether the worker should be returned to the idle pool. This is /// only false for actor creation calls, which should never be returned to idle. - bool FinishAssignedTask(const std::shared_ptr &worker); + bool CleanupLease(const std::shared_ptr &worker); - /// Handle a worker finishing an assigned actor creation task. - /// \param worker The worker that finished the task. - /// \param task The actor task or actor creation task. 
- void FinishAssignedActorCreationTask(const std::shared_ptr &worker, - const RayTask &task); + /// Convert a worker to an actor since it's finished an actor creation task. + /// \param worker The worker that was granted the actor creation lease. + /// \param lease The lease of the actor creation task. + void ConvertWorkerToActor(const std::shared_ptr &worker, + const RayLease &lease); /// Start a get or wait request for the requested objects. /// @@ -408,12 +417,6 @@ class NodeManager : public rpc::NodeManagerServiceHandler, const std::string &disconnect_detail, bool force = false); - /// When a job finished, loop over all of the queued tasks for that job and - /// treat them as failed. - /// - /// \param job_id The job that exited. - void CleanUpTasksForFinishedJob(const JobID &job_id); - /// Handles the event that a job is started. /// /// \param job_id ID of the started job. @@ -519,10 +522,10 @@ class NodeManager : public rpc::NodeManagerServiceHandler, rpc::GetResourceLoadReply *reply, rpc::SendReplyCallback send_reply_callback) override; - /// Handle a `CancelTasksWithResourceShapes` request. - void HandleCancelTasksWithResourceShapes( - rpc::CancelTasksWithResourceShapesRequest request, - rpc::CancelTasksWithResourceShapesReply *reply, + /// Handle a `CancelLeasesWithResourceShapes` request. + void HandleCancelLeasesWithResourceShapes( + rpc::CancelLeasesWithResourceShapesRequest request, + rpc::CancelLeasesWithResourceShapesReply *reply, rpc::SendReplyCallback send_reply_callback) override; /// Handle a `PrepareBundleResources` request. @@ -555,12 +558,7 @@ class NodeManager : public rpc::NodeManagerServiceHandler, rpc::ReportWorkerBacklogReply *reply, rpc::SendReplyCallback send_reply_callback, WorkerPoolInterface &worker_pool, - ILocalTaskManager &local_task_manager); - - /// Handle a `ReturnWorker` request. 
- void HandleReturnWorker(rpc::ReturnWorkerRequest request, - rpc::ReturnWorkerReply *reply, - rpc::SendReplyCallback send_reply_callback) override; + LocalLeaseManagerInterface &local_lease_manager); /// Handle a `ReleaseUnusedActorWorkers` request. // On GCS restart, there's a pruning effort. GCS sends raylet a list of actor workers it @@ -585,11 +583,6 @@ class NodeManager : public rpc::NodeManagerServiceHandler, rpc::IsLocalWorkerDeadReply *reply, rpc::SendReplyCallback send_reply_callback) override; - /// Handle a `CancelWorkerLease` request. - void HandleCancelWorkerLease(rpc::CancelWorkerLeaseRequest request, - rpc::CancelWorkerLeaseReply *reply, - rpc::SendReplyCallback send_reply_callback) override; - /// Handle a `NodeStats` request. void HandleGetNodeStats(rpc::GetNodeStatsRequest request, rpc::GetNodeStatsReply *reply, @@ -615,10 +608,10 @@ class NodeManager : public rpc::NodeManagerServiceHandler, rpc::GetSystemConfigReply *reply, rpc::SendReplyCallback send_reply_callback) override; - /// Handle a `GetTaskFailureCause` request. - void HandleGetTaskFailureCause(rpc::GetTaskFailureCauseRequest request, - rpc::GetTaskFailureCauseReply *reply, - rpc::SendReplyCallback send_reply_callback) override; + /// Handle a `GetWorkerFailureCause` request. + void HandleGetWorkerFailureCause(rpc::GetWorkerFailureCauseRequest request, + rpc::GetWorkerFailureCauseReply *reply, + rpc::SendReplyCallback send_reply_callback) override; void HandleRegisterMutableObject(rpc::RegisterMutableObjectRequest request, rpc::RegisterMutableObjectReply *reply, @@ -708,12 +701,12 @@ class NodeManager : public rpc::NodeManagerServiceHandler, /// Stores the failure reason for the task. The entry will be cleaned up by a periodic /// function post TTL. 
- void SetTaskFailureReason(const TaskID &task_id, - const rpc::RayErrorInfo &failure_reason, - bool should_retry); + void SetWorkerFailureReason(const LeaseID &lease_id, + const rpc::RayErrorInfo &failure_reason, + bool should_retry); - /// Checks the expiry time of the task failures and garbage collect them. - void GCTaskFailureReason(); + /// Checks the expiry time of the worker failures and garbage collect them. + void GCWorkerFailureReason(); /// Creates a AgentManager that creates and manages a dashboard agent. std::unique_ptr CreateDashboardAgentManager( @@ -770,7 +763,7 @@ class NodeManager : public rpc::NodeManagerServiceHandler, /// A manager to resolve objects needed by queued tasks and workers that /// called `ray.get` or `ray.wait`. - DependencyManager &dependency_manager_; + LeaseDependencyManager &lease_dependency_manager_; /// A manager for wait requests. WaitManager wait_manager_; @@ -795,11 +788,11 @@ class NodeManager : public rpc::NodeManagerServiceHandler, absl::flat_hash_map> remote_node_manager_addresses_; - /// Map of workers leased out to clients. - absl::flat_hash_map> &leased_workers_; + /// Map of leased workers to their lease ids. + absl::flat_hash_map> &leased_workers_; - /// Optional extra information about why the task failed. - absl::flat_hash_map task_failure_reasons_; + /// Optional extra information about why the worker failed. + absl::flat_hash_map worker_failure_reasons_; /// Whether to trigger global GC in the next resource usage report. This will broadcast /// a global GC message to all raylets except for this one. @@ -829,11 +822,11 @@ class NodeManager : public rpc::NodeManagerServiceHandler, /// These classes make up the new scheduler. ClusterResourceScheduler is /// responsible for maintaining a view of the cluster state w.r.t resource - /// usage. ClusterTaskManager is responsible for queuing, spilling back, and + /// usage. ClusterLeaseManager is responsible for queuing, spilling back, and /// dispatching tasks. 
ClusterResourceScheduler &cluster_resource_scheduler_; - ILocalTaskManager &local_task_manager_; - ClusterTaskManagerInterface &cluster_task_manager_; + LocalLeaseManagerInterface &local_lease_manager_; + ClusterLeaseManagerInterface &cluster_lease_manager_; absl::flat_hash_map> pinned_objects_; diff --git a/src/ray/raylet/placement_group_resource_manager.h b/src/ray/raylet/placement_group_resource_manager.h index 4439bf17c392..76dc72e5a244 100644 --- a/src/ray/raylet/placement_group_resource_manager.h +++ b/src/ray/raylet/placement_group_resource_manager.h @@ -24,7 +24,6 @@ #include "ray/common/placement_group.h" #include "ray/common/scheduling/resource_set.h" #include "ray/raylet/scheduling/cluster_resource_scheduler.h" -#include "ray/util/util.h" namespace ray { diff --git a/src/ray/raylet/raylet.cc b/src/ray/raylet/raylet.cc index 22978ad459a5..27a1c8c1a04b 100644 --- a/src/ray/raylet/raylet.cc +++ b/src/ray/raylet/raylet.cc @@ -29,7 +29,7 @@ #include "ray/object_manager/object_manager.h" #include "ray/object_manager/ownership_object_directory.h" #include "ray/util/network_util.h" -#include "ray/util/util.h" +#include "ray/util/time.h" namespace { @@ -158,8 +158,8 @@ void Raylet::HandleAccept(const boost::system::error_code &error) { if (!error) { ConnectionErrorHandler error_handler = [this]( std::shared_ptr client, - const boost::system::error_code &error) { - node_manager_.HandleClientConnectionError(client, error); + const boost::system::error_code &err) { + node_manager_.HandleClientConnectionError(client, err); }; MessageHandler message_handler = [this](std::shared_ptr client, diff --git a/src/ray/raylet/runtime_env_agent_client.cc b/src/ray/raylet/runtime_env_agent_client.cc index 2c9b04740a31..4934f3da36cb 100644 --- a/src/ray/raylet/runtime_env_agent_client.cc +++ b/src/ray/raylet/runtime_env_agent_client.cc @@ -29,6 +29,8 @@ #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/status.h" #include "ray/util/logging.h" +#include 
"ray/util/process.h" +#include "ray/util/time.h" #include "src/ray/protobuf/runtime_env_agent.pb.h" namespace beast = boost::beast; // from @@ -230,9 +232,10 @@ class SessionPool { void enqueue(std::shared_ptr session) { if (running_sessions_.size() < max_concurrency_) { running_sessions_.insert(session); - session->run(/*finished_callback=*/[this](std::shared_ptr session) { - this->remove_session_from_running(session); - }); + session->run( + /*finished_callback=*/[this](std::shared_ptr session_to_remove) { + this->remove_session_from_running(session_to_remove); + }); } else { pending_sessions_.emplace(std::move(session)); } diff --git a/src/ray/raylet/runtime_env_agent_client.h b/src/ray/raylet/runtime_env_agent_client.h index f86b0fd3ddbf..feec543aed55 100644 --- a/src/ray/raylet/runtime_env_agent_client.h +++ b/src/ray/raylet/runtime_env_agent_client.h @@ -25,7 +25,7 @@ #include "ray/common/id.h" #include "ray/common/ray_config.h" #include "src/ray/protobuf/gcs.pb.h" -#include "src/ray/protobuf/runtime_env_common.pb.h" +#include "src/ray/protobuf/public/runtime_environment.pb.h" namespace ray { namespace raylet { diff --git a/src/ray/raylet/scheduling/BUILD.bazel b/src/ray/raylet/scheduling/BUILD.bazel index 8d7c28b6dd77..6d00fd1c3205 100644 --- a/src/ray/raylet/scheduling/BUILD.bazel +++ b/src/ray/raylet/scheduling/BUILD.bazel @@ -1,4 +1,4 @@ -load("//bazel:ray.bzl", "ray_cc_library", "ray_cc_test") +load("//bazel:ray.bzl", "ray_cc_library") ray_cc_library( name = "scheduler", @@ -16,9 +16,9 @@ ray_cc_library( deps = [ ":affinity_with_bundle_scheduling_policy", ":bundle_scheduling_policy", + ":cluster_lease_manager", ":cluster_resource_manager", ":cluster_resource_scheduler", - ":cluster_task_manager", ":composite_scheduling_policy", ":hybrid_scheduling_policy", ":local_resource_manager", @@ -33,8 +33,8 @@ ray_cc_library( name = "scheduler_internal", hdrs = ["internal.h"], deps = [ - "//src/ray/common:ray_object", - "//src/ray/common:task_common", + 
"//src/ray/common:lease", + "//src/ray/common/scheduling:cluster_resource_data", "//src/ray/protobuf:node_manager_cc_proto", ], ) @@ -45,9 +45,11 @@ ray_cc_library( hdrs = ["cluster_resource_manager.h"], deps = [ ":local_resource_manager", + "//src/ray/common:bundle_location_index", "//src/ray/common:grpc_util", + "//src/ray/common:lease", "//src/ray/common:ray_config", - "//src/ray/common:task_common", + "//src/ray/common/scheduling:cluster_resource_data", "//src/ray/protobuf:gcs_cc_proto", "//src/ray/util:container_util", "//src/ray/util:logging", @@ -72,24 +74,23 @@ ray_cc_library( ) ray_cc_library( - name = "cluster_task_manager", + name = "cluster_lease_manager", srcs = [ - "cluster_task_manager.cc", + "cluster_lease_manager.cc", "scheduler_stats.cc", ], hdrs = [ - "cluster_task_manager.h", + "cluster_lease_manager.h", "scheduler_stats.h", ], deps = [ + ":cluster_lease_manager_interface", ":cluster_resource_scheduler", - ":cluster_task_manager_interface", - ":local_task_manager_interface", + ":local_lease_manager_interface", ":scheduler_internal", ":scheduler_resource_reporter", + "//src/ray/common:lease", "//src/ray/common:ray_config", - "//src/ray/common:ray_object", - "//src/ray/common:task_common", "//src/ray/stats:stats_lib", "//src/ray/util:logging", "@com_google_absl//absl/container:flat_hash_map", @@ -97,8 +98,8 @@ ray_cc_library( ) ray_cc_library( - name = "cluster_task_manager_interface", - hdrs = ["cluster_task_manager_interface.h"], + name = "cluster_lease_manager_interface", + hdrs = ["cluster_lease_manager_interface.h"], deps = [ "//src/ray/protobuf:node_manager_cc_proto", "//src/ray/rpc:server_call", @@ -106,11 +107,10 @@ ray_cc_library( ) ray_cc_library( - name = "local_task_manager_interface", - hdrs = ["local_task_manager_interface.h"], + name = "local_lease_manager_interface", + hdrs = ["local_lease_manager_interface.h"], deps = [ ":scheduler_internal", - "//src/ray/common:task_common", "@com_google_absl//absl/container:flat_hash_map", ], ) 
@@ -120,13 +120,12 @@ ray_cc_library( srcs = ["local_resource_manager.cc"], hdrs = ["local_resource_manager.h"], deps = [ - "//src/ray/common:grpc_util", - "//src/ray/common:ray_config", "//src/ray/common:ray_syncer", - "//src/ray/common:task_common", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/common/scheduling:cluster_resource_data", + "//src/ray/common/scheduling:placement_group_util", "//src/ray/protobuf:gcs_cc_proto", "//src/ray/protobuf:node_manager_cc_proto", + "//src/ray/stats:stats_metric", "//src/ray/util:logging", "@com_google_absl//absl/container:flat_hash_map", "@com_google_googletest//:gtest_prod", @@ -138,10 +137,9 @@ ray_cc_library( srcs = ["scheduler_resource_reporter.cc"], hdrs = ["scheduler_resource_reporter.h"], deps = [ - ":local_task_manager_interface", + ":local_lease_manager_interface", ":scheduler_internal", "//src/ray/common:ray_config", - "//src/ray/common:task_common", "@com_google_absl//absl/container:flat_hash_map", ], ) @@ -160,8 +158,7 @@ ray_cc_library( hdrs = ["policy/scheduling_context.h"], deps = [ "//src/ray/common:id", - "//src/ray/common:task_common", - "@com_google_absl//absl/container:flat_hash_map", + "//src/ray/common:placement_group", ], ) @@ -171,7 +168,7 @@ ray_cc_library( hdrs = ["policy/affinity_with_bundle_scheduling_policy.h"], deps = [ ":scheduling_policy", - "//src/ray/common:task_common", + "//src/ray/common:bundle_location_index", ], ) @@ -184,7 +181,6 @@ ray_cc_library( ":scheduling_context", ":scheduling_policy", ":scorer", - "//src/ray/common:task_common", ], ) @@ -258,7 +254,9 @@ ray_cc_library( name = "scorer", srcs = ["policy/scorer.cc"], hdrs = ["policy/scorer.h"], - deps = ["//src/ray/common:task_common"], + deps = [ + "//src/ray/common/scheduling:cluster_resource_data", + ], ) ray_cc_library( @@ -266,113 +264,6 @@ ray_cc_library( hdrs = ["policy/scheduling_policy.h"], deps = [ ":scheduling_options", - "//src/ray/common:task_common", - ], -) - -ray_cc_test( - name = 
"cluster_resource_scheduler_test", - size = "small", - srcs = [ - "cluster_resource_scheduler_test.cc", - ], - tags = ["team:core"], - deps = [ - ":cluster_resource_scheduler", - "//:ray_mock", - "//src/ray/common:ray_config", - "//src/ray/common:task_common", - "//src/ray/common:test_util", - "@com_google_googletest//:gtest_main", - ], -) - -ray_cc_test( - name = "cluster_resource_scheduler_2_test", - size = "small", - srcs = [ - "cluster_resource_scheduler_2_test.cc", - ], - tags = ["team:core"], - deps = [ - ":cluster_resource_scheduler", - ":scheduling_context", - ":scheduling_options", - "@com_google_googletest//:gtest_main", - ], -) - -ray_cc_test( - name = "local_resource_manager_test", - size = "small", - srcs = [ - "local_resource_manager_test.cc", - ], - tags = ["team:core"], - deps = [ - ":local_resource_manager", - "@com_google_googletest//:gtest_main", - ], -) - -ray_cc_test( - name = "scheduling_policy_test", - size = "small", - srcs = [ - "policy/scheduling_policy_test.cc", - ], - tags = ["team:core"], - deps = [ - ":composite_scheduling_policy", - "@com_google_googletest//:gtest_main", - ], -) - -ray_cc_test( - name = "hybrid_scheduling_policy_test", - size = "small", - srcs = [ - "policy/hybrid_scheduling_policy_test.cc", - ], - tags = ["team:core"], - deps = [ - ":composite_scheduling_policy", - ":hybrid_scheduling_policy", - "@com_google_absl//absl/random:mock_distributions", - "@com_google_absl//absl/random:mocking_bit_gen", - "@com_google_googletest//:gtest_main", - ], -) - -ray_cc_test( - name = "cluster_task_manager_test", - size = "small", - srcs = [ - "cluster_task_manager_test.cc", - ], - tags = ["team:core"], - deps = [ - ":cluster_resource_scheduler", - ":cluster_task_manager", - "//:ray_mock", - "//src/ray/common:id", - "//src/ray/common:task_common", - "//src/ray/common:test_util", - "//src/ray/raylet:local_task_manager", - "//src/ray/raylet/test:util", - "@com_google_googletest//:gtest_main", - ], -) - -ray_cc_test( - name = 
"cluster_resource_manager_test", - size = "small", - srcs = [ - "cluster_resource_manager_test.cc", - ], - tags = ["team:core"], - deps = [ - ":cluster_resource_manager", - "@com_google_googletest//:gtest_main", + "//src/ray/common/scheduling:cluster_resource_data", ], ) diff --git a/src/ray/raylet/scheduling/cluster_task_manager.cc b/src/ray/raylet/scheduling/cluster_lease_manager.cc similarity index 61% rename from src/ray/raylet/scheduling/cluster_task_manager.cc rename to src/ray/raylet/scheduling/cluster_lease_manager.cc index d3b11fb8022b..61893fc29de8 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager.cc +++ b/src/ray/raylet/scheduling/cluster_lease_manager.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/raylet/scheduling/cluster_task_manager.h" +#include "ray/raylet/scheduling/cluster_lease_manager.h" #include @@ -21,41 +21,40 @@ #include #include -#include "ray/stats/metric_defs.h" #include "ray/util/logging.h" #include "ray/util/string_utils.h" namespace ray { namespace raylet { -ClusterTaskManager::ClusterTaskManager( +ClusterLeaseManager::ClusterLeaseManager( const NodeID &self_node_id, ClusterResourceScheduler &cluster_resource_scheduler, internal::NodeInfoGetter get_node_info, - std::function announce_infeasible_task, - ILocalTaskManager &local_task_manager, + std::function announce_infeasible_lease, + LocalLeaseManagerInterface &local_lease_manager, std::function get_time_ms) : self_node_id_(self_node_id), cluster_resource_scheduler_(cluster_resource_scheduler), get_node_info_(std::move(get_node_info)), - announce_infeasible_task_(std::move(announce_infeasible_task)), - local_task_manager_(local_task_manager), + announce_infeasible_lease_(std::move(announce_infeasible_lease)), + local_lease_manager_(local_lease_manager), scheduler_resource_reporter_( - tasks_to_schedule_, infeasible_tasks_, local_task_manager_), - internal_stats_(*this, 
local_task_manager_), + leases_to_schedule_, infeasible_leases_, local_lease_manager_), + internal_stats_(*this, local_lease_manager_), get_time_ms_(std::move(get_time_ms)) {} -void ClusterTaskManager::QueueAndScheduleTask( - RayTask task, +void ClusterLeaseManager::QueueAndScheduleLease( + RayLease lease, bool grant_or_reject, bool is_selected_based_on_locality, rpc::RequestWorkerLeaseReply *reply, rpc::SendReplyCallback send_reply_callback) { - RAY_LOG(DEBUG) << "Queuing and scheduling task " - << task.GetTaskSpecification().TaskId(); - const auto scheduling_class = task.GetTaskSpecification().GetSchedulingClass(); + RAY_LOG(DEBUG) << "Queuing and scheduling lease " + << lease.GetLeaseSpecification().LeaseId(); + const auto scheduling_class = lease.GetLeaseSpecification().GetSchedulingClass(); auto work = std::make_shared( - std::move(task), + std::move(lease), grant_or_reject, is_selected_based_on_locality, reply, @@ -64,21 +63,21 @@ void ClusterTaskManager::QueueAndScheduleTask( }); // If the scheduling class is infeasible, just add the work to the infeasible queue // directly. 
- auto infeasible_tasks_iter = infeasible_tasks_.find(scheduling_class); - if (infeasible_tasks_iter != infeasible_tasks_.end()) { - infeasible_tasks_iter->second.emplace_back(std::move(work)); + auto infeasible_leases_iter = infeasible_leases_.find(scheduling_class); + if (infeasible_leases_iter != infeasible_leases_.end()) { + infeasible_leases_iter->second.emplace_back(std::move(work)); } else { - tasks_to_schedule_[scheduling_class].emplace_back(std::move(work)); + leases_to_schedule_[scheduling_class].emplace_back(std::move(work)); } - ScheduleAndDispatchTasks(); + ScheduleAndGrantLeases(); } namespace { void ReplyCancelled(const internal::Work &work, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, const std::string &scheduling_failure_message) { - auto reply = work.reply; - auto callback = work.callback; + auto reply = work.reply_; + auto callback = work.callback_; reply->set_canceled(true); reply->set_failure_type(failure_type); reply->set_scheduling_failure_message(scheduling_failure_message); @@ -86,20 +85,20 @@ void ReplyCancelled(const internal::Work &work, } } // namespace -bool ClusterTaskManager::CancelTasks( +bool ClusterLeaseManager::CancelLeases( std::function &)> predicate, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, const std::string &scheduling_failure_message) { - bool tasks_cancelled = false; + bool leases_cancelled = false; ray::erase_if>( - tasks_to_schedule_, [&](const std::shared_ptr &work) { + leases_to_schedule_, [&](const std::shared_ptr &work) { if (predicate(work)) { - RAY_LOG(DEBUG) << "Canceling task " - << work->task.GetTaskSpecification().TaskId() + RAY_LOG(DEBUG) << "Canceling lease " + << work->lease_.GetLeaseSpecification().LeaseId() << " from schedule queue."; ReplyCancelled(*work, failure_type, scheduling_failure_message); - tasks_cancelled = true; + leases_cancelled = true; return true; } else { return false; @@ -107,28 +106,28 @@ bool ClusterTaskManager::CancelTasks( }); 
ray::erase_if>( - infeasible_tasks_, [&](const std::shared_ptr &work) { + infeasible_leases_, [&](const std::shared_ptr &work) { if (predicate(work)) { - RAY_LOG(DEBUG) << "Canceling task " - << work->task.GetTaskSpecification().TaskId() + RAY_LOG(DEBUG) << "Canceling lease " + << work->lease_.GetLeaseSpecification().LeaseId() << " from infeasible queue."; ReplyCancelled(*work, failure_type, scheduling_failure_message); - tasks_cancelled = true; + leases_cancelled = true; return true; } else { return false; } }); - if (local_task_manager_.CancelTasks( + if (local_lease_manager_.CancelLeases( predicate, failure_type, scheduling_failure_message)) { - tasks_cancelled = true; + leases_cancelled = true; } - return tasks_cancelled; + return leases_cancelled; } -bool ClusterTaskManager::CancelTasksWithResourceShapes( +bool ClusterLeaseManager::CancelLeasesWithResourceShapes( const std::vector target_resource_shapes) { auto predicate = [target_resource_shapes, this](const std::shared_ptr &work) { @@ -140,7 +139,7 @@ bool ClusterTaskManager::CancelTasksWithResourceShapes( RAY_LOG(WARNING) << "Cancelling infeasible tasks with resource shapes " << resource_shapes_str; - bool task_cancelled = CancelTasks( + bool lease_cancelled = CancelLeases( predicate, rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_UNSCHEDULABLE, absl::StrCat( @@ -149,19 +148,19 @@ bool ClusterTaskManager::CancelTasksWithResourceShapes( " failed to schedule because there are not enough resources for the tasks " "or actors on the whole cluster.")); - RAY_LOG(INFO) << "Infeasible tasks cancellation complete with result=" << task_cancelled - << ",resource shapes=" << resource_shapes_str; + RAY_LOG(INFO) << "Infeasible tasks cancellation complete with result=" + << lease_cancelled << ",resource shapes=" << resource_shapes_str; - return task_cancelled; + return lease_cancelled; } -bool ClusterTaskManager::IsWorkWithResourceShape( +bool ClusterLeaseManager::IsWorkWithResourceShape( const std::shared_ptr &work, 
const std::vector &target_resource_shapes) { SchedulingClass scheduling_class = - work->task.GetTaskSpecification().GetSchedulingClass(); + work->lease_.GetLeaseSpecification().GetSchedulingClass(); ResourceSet resource_set = - TaskSpecification::GetSchedulingClassDescriptor(scheduling_class).resource_set; + SchedulingClassToIds::GetSchedulingClassDescriptor(scheduling_class).resource_set; for (const auto &target_resource_shape : target_resource_shapes) { if (resource_set == target_resource_shape) { return true; @@ -170,56 +169,56 @@ bool ClusterTaskManager::IsWorkWithResourceShape( return false; } -bool ClusterTaskManager::CancelAllTasksOwnedBy( +bool ClusterLeaseManager::CancelAllLeasesOwnedBy( const NodeID &node_id, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, const std::string &scheduling_failure_message) { // Only tasks and regular actors are canceled because their lifetime is // the same as the owner. auto predicate = [node_id](const std::shared_ptr &work) { - return !work->task.GetTaskSpecification().IsDetachedActor() && - work->task.GetTaskSpecification().CallerNodeId() == node_id; + return !work->lease_.GetLeaseSpecification().IsDetachedActor() && + work->lease_.GetLeaseSpecification().CallerNodeId() == node_id; }; - return CancelTasks(predicate, failure_type, scheduling_failure_message); + return CancelLeases(predicate, failure_type, scheduling_failure_message); } -bool ClusterTaskManager::CancelAllTasksOwnedBy( +bool ClusterLeaseManager::CancelAllLeasesOwnedBy( const WorkerID &worker_id, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, const std::string &scheduling_failure_message) { // Only tasks and regular actors are canceled because their lifetime is // the same as the owner. 
auto predicate = [worker_id](const std::shared_ptr &work) { - return !work->task.GetTaskSpecification().IsDetachedActor() && - work->task.GetTaskSpecification().CallerWorkerId() == worker_id; + return !work->lease_.GetLeaseSpecification().IsDetachedActor() && + work->lease_.GetLeaseSpecification().CallerWorkerId() == worker_id; }; - return CancelTasks(predicate, failure_type, scheduling_failure_message); + return CancelLeases(predicate, failure_type, scheduling_failure_message); } -void ClusterTaskManager::ScheduleAndDispatchTasks() { +void ClusterLeaseManager::ScheduleAndGrantLeases() { // Always try to schedule infeasible tasks in case they are now feasible. - TryScheduleInfeasibleTask(); + TryScheduleInfeasibleLease(); std::deque> works_to_cancel; - for (auto shapes_it = tasks_to_schedule_.begin(); - shapes_it != tasks_to_schedule_.end();) { + for (auto shapes_it = leases_to_schedule_.begin(); + shapes_it != leases_to_schedule_.end();) { auto &work_queue = shapes_it->second; bool is_infeasible = false; for (auto work_it = work_queue.begin(); work_it != work_queue.end();) { - // Check every task in task_to_schedule queue to see + // Check every lease in lease_to_schedule queue to see // whether it can be scheduled. This avoids head-of-line - // blocking where a task which cannot be scheduled because + // blocking where a lease which cannot be scheduled because // there are not enough available resources blocks other - // tasks from being scheduled. + // leases from being scheduled. const std::shared_ptr &work = *work_it; - RayTask task = work->task; - RAY_LOG(DEBUG) << "Scheduling pending task " - << task.GetTaskSpecification().TaskId(); + RayLease lease = work->lease_; + RAY_LOG(DEBUG) << "Scheduling pending lease " + << lease.GetLeaseSpecification().LeaseId(); auto scheduling_node_id = cluster_resource_scheduler_.GetBestSchedulableNode( - task.GetTaskSpecification(), + lease.GetLeaseSpecification(), /*preferred_node_id*/ work->PrioritizeLocalNode() ? 
self_node_id_.Binary() - : task.GetPreferredNodeID(), + : lease.GetPreferredNodeID(), /*exclude_local_node*/ false, /*requires_object_store_memory*/ false, &is_infeasible); @@ -227,14 +226,14 @@ void ClusterTaskManager::ScheduleAndDispatchTasks() { // There is no node that has available resources to run the request. // Move on to the next shape. if (scheduling_node_id.IsNil()) { - RAY_LOG(DEBUG) << "No node found to schedule a task " - << task.GetTaskSpecification().TaskId() << " is infeasible?" + RAY_LOG(DEBUG) << "No node found to schedule a lease " + << lease.GetLeaseSpecification().LeaseId() << " is infeasible?" << is_infeasible; - if (task.GetTaskSpecification().IsNodeAffinitySchedulingStrategy() && - !task.GetTaskSpecification().GetNodeAffinitySchedulingStrategySoft()) { + if (lease.GetLeaseSpecification().IsNodeAffinitySchedulingStrategy() && + !lease.GetLeaseSpecification().GetNodeAffinitySchedulingStrategySoft()) { // This can only happen if the target node doesn't exist or is infeasible. - // The task will never be schedulable in either case so we should fail it. + // The lease will never be schedulable in either case so we should fail it. if (cluster_resource_scheduler_.IsLocalNodeWithRaylet()) { ReplyCancelled( *work, @@ -246,9 +245,9 @@ void ClusterTaskManager::ScheduleAndDispatchTasks() { work_it = work_queue.erase(work_it); } else { // If scheduling is done by gcs, we can not `ReplyCancelled` now because it - // would synchronously call `ClusterTaskManager::CancelTask`, where - // `task_to_schedule_`'s iterator will be invalidated. So record this work and - // it will be handled below (out of the loop). + // would synchronously call `ClusterLeaseManager::CancelLease`, where + // `lease_to_schedule_`'s iterator will be invalidated. So record this work + // and it will be handled below (out of the loop). 
works_to_cancel.push_back(*work_it); work_it++; } @@ -269,15 +268,15 @@ void ClusterTaskManager::ScheduleAndDispatchTasks() { // Only announce the first item as infeasible. auto &cur_work_queue = shapes_it->second; const auto &work = cur_work_queue[0]; - const RayTask task = work->task; - if (announce_infeasible_task_) { - announce_infeasible_task_(task); + const RayLease lease = work->lease_; + if (announce_infeasible_lease_) { + announce_infeasible_lease_(lease); } - infeasible_tasks_[shapes_it->first] = std::move(shapes_it->second); - tasks_to_schedule_.erase(shapes_it++); + infeasible_leases_[shapes_it->first] = std::move(shapes_it->second); + leases_to_schedule_.erase(shapes_it++); } else if (work_queue.empty()) { - tasks_to_schedule_.erase(shapes_it++); + leases_to_schedule_.erase(shapes_it++); } else { shapes_it++; } @@ -285,7 +284,7 @@ void ClusterTaskManager::ScheduleAndDispatchTasks() { for (const auto &work : works_to_cancel) { // All works in `works_to_cancel` are scheduled by gcs. So `ReplyCancelled` - // will synchronously call `ClusterTaskManager::CancelTask`, where works are + // will synchronously call `ClusterLeaseManager::CancelLease`, where works are // erased from the pending queue. 
ReplyCancelled(*work, rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_UNSCHEDULABLE, @@ -294,26 +293,27 @@ void ClusterTaskManager::ScheduleAndDispatchTasks() { } works_to_cancel.clear(); - local_task_manager_.ScheduleAndDispatchTasks(); + local_lease_manager_.ScheduleAndGrantLeases(); } -void ClusterTaskManager::TryScheduleInfeasibleTask() { - for (auto shapes_it = infeasible_tasks_.begin(); - shapes_it != infeasible_tasks_.end();) { +void ClusterLeaseManager::TryScheduleInfeasibleLease() { + for (auto shapes_it = infeasible_leases_.begin(); + shapes_it != infeasible_leases_.end();) { auto &work_queue = shapes_it->second; RAY_CHECK(!work_queue.empty()) << "Empty work queue shouldn't have been added as a infeasible shape."; // We only need to check the first item because every task has the same shape. // If the first entry is infeasible, that means everything else is the same. const auto work = work_queue[0]; - RayTask task = work->task; - RAY_LOG(DEBUG) << "Check if the infeasible task is schedulable in any node. task_id:" - << task.GetTaskSpecification().TaskId(); + RayLease lease = work->lease_; + RAY_LOG(DEBUG) + << "Check if the infeasible lease is schedulable in any node. lease_id:" + << lease.GetLeaseSpecification().LeaseId(); bool is_infeasible; cluster_resource_scheduler_.GetBestSchedulableNode( - task.GetTaskSpecification(), + lease.GetLeaseSpecification(), /*preferred_node_id*/ work->PrioritizeLocalNode() ? self_node_id_.Binary() - : task.GetPreferredNodeID(), + : lease.GetPreferredNodeID(), /*exclude_local_node*/ false, /*requires_object_store_memory*/ false, &is_infeasible); @@ -321,31 +321,31 @@ void ClusterTaskManager::TryScheduleInfeasibleTask() { // There is no node that has available resources to run the request. // Move on to the next shape. 
if (is_infeasible) { - RAY_LOG(DEBUG) << "No feasible node found for task " - << task.GetTaskSpecification().TaskId(); + RAY_LOG(DEBUG) << "No feasible node found for lease " + << lease.GetLeaseSpecification().LeaseId(); shapes_it++; } else { - RAY_LOG(DEBUG) << "Infeasible task of task id " - << task.GetTaskSpecification().TaskId() - << " is now feasible. Move the entry back to tasks_to_schedule_"; - tasks_to_schedule_[shapes_it->first] = std::move(shapes_it->second); - infeasible_tasks_.erase(shapes_it++); + RAY_LOG(DEBUG) << "Infeasible lease of lease id " + << lease.GetLeaseSpecification().LeaseId() + << " is now feasible. Move the entry back to leases_to_schedule_"; + leases_to_schedule_[shapes_it->first] = std::move(shapes_it->second); + infeasible_leases_.erase(shapes_it++); } } } -bool ClusterTaskManager::CancelTask( - const TaskID &task_id, +bool ClusterLeaseManager::CancelLease( + const LeaseID &lease_id, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, const std::string &scheduling_failure_message) { - auto predicate = [task_id](const std::shared_ptr &work) { - return work->task.GetTaskSpecification().TaskId() == task_id; + auto predicate = [lease_id](const std::shared_ptr &work) { + return work->lease_.GetLeaseSpecification().LeaseId() == lease_id; }; - return CancelTasks(predicate, failure_type, scheduling_failure_message); + return CancelLeases(predicate, failure_type, scheduling_failure_message); } -void ClusterTaskManager::FillResourceUsage(rpc::ResourcesData &data) { +void ClusterLeaseManager::FillResourceUsage(rpc::ResourcesData &data) { // This populates load information. scheduler_resource_reporter_.FillResourceUsage(data); // This populates usage information. 
@@ -363,17 +363,17 @@ void ClusterTaskManager::FillResourceUsage(rpc::ResourcesData &data) { resource_view_sync_message.draining_deadline_timestamp_ms()); } -const RayTask *ClusterTaskManager::AnyPendingTasksForResourceAcquisition( - int *num_pending_actor_creation, int *num_pending_tasks) const { - const RayTask *exemplar = nullptr; - // We are guaranteed that these tasks are blocked waiting for resources after a - // call to ScheduleAndDispatchTasks(). They may be waiting for workers as well, but +const RayLease *ClusterLeaseManager::AnyPendingLeasesForResourceAcquisition( + int *num_pending_actor_creation, int *num_pending_leases) const { + const RayLease *exemplar = nullptr; + // We are guaranteed that these leases are blocked waiting for resources after a + // call to ScheduleAndGrantLeases(). They may be waiting for workers as well, but // this should be a transient condition only. - for (const auto &shapes_it : tasks_to_schedule_) { + for (const auto &shapes_it : leases_to_schedule_) { auto &work_queue = shapes_it.second; for (const auto &work_it : work_queue) { const auto &work = *work_it; - const auto &task = work_it->task; + const auto &lease = work_it->lease_; // If the work is not in the waiting state, it will be scheduled soon or won't be // scheduled. Consider as non-pending. @@ -392,58 +392,59 @@ const RayTask *ClusterTaskManager::AnyPendingTasksForResourceAcquisition( continue; } - if (task.GetTaskSpecification().IsActorCreationTask()) { + if (lease.GetLeaseSpecification().IsActorCreationTask()) { *num_pending_actor_creation += 1; } else { - *num_pending_tasks += 1; + *num_pending_leases += 1; } if (exemplar == nullptr) { - exemplar = &task; + exemplar = &lease; } } } - auto local_task_exemplar = local_task_manager_.AnyPendingTasksForResourceAcquisition( - num_pending_actor_creation, num_pending_tasks); - // Prefer returning the cluster task manager exemplar if it exists. - return exemplar == nullptr ? 
local_task_exemplar : exemplar; + auto local_lease_exemplar = local_lease_manager_.AnyPendingLeasesForResourceAcquisition( + num_pending_actor_creation, num_pending_leases); + // Prefer returning the cluster lease manager exemplar if it exists. + return exemplar == nullptr ? local_lease_exemplar : exemplar; } -void ClusterTaskManager::RecordMetrics() const { +void ClusterLeaseManager::RecordMetrics() const { internal_stats_.RecordMetrics(); cluster_resource_scheduler_.GetLocalResourceManager().RecordMetrics(); } -std::string ClusterTaskManager::DebugStr() const { +std::string ClusterLeaseManager::DebugStr() const { return internal_stats_.ComputeAndReportDebugStr(); } -void ClusterTaskManager::ScheduleOnNode(const NodeID &spillback_to, - const std::shared_ptr &work) { +void ClusterLeaseManager::ScheduleOnNode(const NodeID &spillback_to, + const std::shared_ptr &work) { if (spillback_to == self_node_id_) { - local_task_manager_.QueueAndScheduleTask(work); + local_lease_manager_.QueueAndScheduleLease(work); return; } - auto send_reply_callback = work->callback; + auto send_reply_callback = work->callback_; - if (work->grant_or_reject) { - work->reply->set_rejected(true); + if (work->grant_or_reject_) { + work->reply_->set_rejected(true); send_reply_callback(); return; } - internal_stats_.TaskSpilled(); + internal_stats_.LeaseSpilled(); - const auto &task = work->task; - const auto &task_spec = task.GetTaskSpecification(); - RAY_LOG(DEBUG) << "Spilling task " << task_spec.TaskId() << " to node " << spillback_to; + const auto &lease = work->lease_; + const auto &lease_spec = lease.GetLeaseSpecification(); + RAY_LOG(DEBUG) << "Spilling lease " << lease_spec.LeaseId() << " to node " + << spillback_to; if (!cluster_resource_scheduler_.AllocateRemoteTaskResources( scheduling::NodeID(spillback_to.Binary()), - task_spec.GetRequiredResources().GetResourceMap())) { - RAY_LOG(DEBUG) << "Tried to allocate resources for request " << task_spec.TaskId() + 
lease_spec.GetRequiredResources().GetResourceMap())) { + RAY_LOG(DEBUG) << "Tried to allocate resources for request " << lease_spec.LeaseId() << " on a remote node that are no longer available"; } @@ -451,36 +452,36 @@ void ClusterTaskManager::ScheduleOnNode(const NodeID &spillback_to, RAY_CHECK(node_info_ptr) << "Spilling back to a node manager, but no GCS info found for node " << spillback_to; - auto reply = work->reply; + auto reply = work->reply_; reply->mutable_retry_at_raylet_address()->set_ip_address( node_info_ptr->node_manager_address()); reply->mutable_retry_at_raylet_address()->set_port(node_info_ptr->node_manager_port()); - reply->mutable_retry_at_raylet_address()->set_raylet_id(spillback_to.Binary()); + reply->mutable_retry_at_raylet_address()->set_node_id(spillback_to.Binary()); send_reply_callback(); } -ClusterResourceScheduler &ClusterTaskManager::GetClusterResourceScheduler() const { +ClusterResourceScheduler &ClusterLeaseManager::GetClusterResourceScheduler() const { return cluster_resource_scheduler_; } -size_t ClusterTaskManager::GetInfeasibleQueueSize() const { +size_t ClusterLeaseManager::GetInfeasibleQueueSize() const { size_t count = 0; - for (const auto &cls_entry : infeasible_tasks_) { + for (const auto &cls_entry : infeasible_leases_) { count += cls_entry.second.size(); } return count; } -size_t ClusterTaskManager::GetPendingQueueSize() const { +size_t ClusterLeaseManager::GetPendingQueueSize() const { size_t count = 0; - for (const auto &cls_entry : tasks_to_schedule_) { + for (const auto &cls_entry : leases_to_schedule_) { count += cls_entry.second.size(); } return count; } -void ClusterTaskManager::FillPendingActorInfo(rpc::ResourcesData &data) const { +void ClusterLeaseManager::FillPendingActorInfo(rpc::ResourcesData &data) const { scheduler_resource_reporter_.FillPendingActorCountByShape(data); } diff --git a/src/ray/raylet/scheduling/cluster_task_manager.h b/src/ray/raylet/scheduling/cluster_lease_manager.h similarity index 62% 
rename from src/ray/raylet/scheduling/cluster_task_manager.h rename to src/ray/raylet/scheduling/cluster_lease_manager.h index 5137d1fb527a..2e0efaa58a0e 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager.h +++ b/src/ray/raylet/scheduling/cluster_lease_manager.h @@ -19,109 +19,107 @@ #include #include "absl/container/flat_hash_map.h" -#include "ray/common/ray_object.h" -#include "ray/common/task/task.h" -#include "ray/common/task/task_common.h" +#include "ray/common/lease/lease.h" +#include "ray/raylet/scheduling/cluster_lease_manager_interface.h" #include "ray/raylet/scheduling/cluster_resource_scheduler.h" -#include "ray/raylet/scheduling/cluster_task_manager_interface.h" -#include "ray/raylet/scheduling/internal.h" -#include "ray/raylet/scheduling/local_task_manager_interface.h" +#include "ray/raylet/scheduling/local_lease_manager_interface.h" #include "ray/raylet/scheduling/scheduler_resource_reporter.h" #include "ray/raylet/scheduling/scheduler_stats.h" namespace ray { namespace raylet { -/// Schedules a task onto one node of the cluster. The logic is as follows: -/// 1. Queue tasks for scheduling. +/// Schedules a lease onto one node of the cluster. The logic is as follows: +/// 1. Queue leases for scheduling. /// 2. Pick a node on the cluster which has the available resources to run a -/// task. +/// lease. /// * Step 2 should occur any time the state of the cluster is -/// changed, or a new task is queued. -/// 3. For tasks that's infeasable, put them into infeasible queue and reports -/// it to gcs, where the auto scaler will be notified and start new node +/// changed, or a new lease is queued. +/// 3. For leases that are infeasible, put them into infeasible queue and report +/// it to gcs, where the auto scaler will be notified and start a new node /// to accommodate the requirement. 
-class ClusterTaskManager : public ClusterTaskManagerInterface { +class ClusterLeaseManager : public ClusterLeaseManagerInterface { public: /// \param self_node_id: ID of local node. /// \param cluster_resource_scheduler: The resource scheduler which contains /// the state of the cluster. /// \param get_node_info: Function that returns the node info for a node. - /// \param announce_infeasible_task: Callback that informs the user if a task + /// \param announce_infeasible_lease: Callback that informs the user if a lease /// is infeasible. - /// \param local_task_manager: Manages local tasks. + /// \param local_lease_manager: Manages local leases. /// \param get_time_ms: A callback which returns the current time in milliseconds. - ClusterTaskManager( + ClusterLeaseManager( const NodeID &self_node_id, ClusterResourceScheduler &cluster_resource_scheduler, internal::NodeInfoGetter get_node_info, - std::function announce_infeasible_task, - ILocalTaskManager &local_task_manager, + std::function announce_infeasible_lease, + LocalLeaseManagerInterface &local_lease_manager, std::function get_time_ms = []() { return static_cast(absl::GetCurrentTimeNanos() / 1e6); }); - /// Queue task and schedule. This happens when processing the worker lease request. + /// Queue lease and schedule. This happens when processing the worker lease request. /// - /// \param task: The incoming task to be queued and scheduled. + /// \param lease: The incoming lease to be queued and scheduled. /// \param grant_or_reject: True if we we should either grant or reject the request /// but no spillback. /// \param is_selected_based_on_locality : should schedule on local node if possible. /// \param reply: The reply of the lease request. /// \param send_reply_callback: The function used during dispatching. 
- void QueueAndScheduleTask(RayTask task, - bool grant_or_reject, - bool is_selected_based_on_locality, - rpc::RequestWorkerLeaseReply *reply, - rpc::SendReplyCallback send_reply_callback) override; + void QueueAndScheduleLease(RayLease lease, + bool grant_or_reject, + bool is_selected_based_on_locality, + rpc::RequestWorkerLeaseReply *reply, + rpc::SendReplyCallback send_reply_callback) override; - /// Attempt to cancel an already queued task. + /// Attempt to cancel an already queued lease. /// - /// \param task_id: The id of the task to remove. + /// \param lease_id: The lease_id of the lease to remove. /// \param failure_type: The failure type. /// \param scheduling_failure_message: The failure message. /// - /// \return True if task was successfully removed. This function will return - /// false if the task is already running. - bool CancelTask(const TaskID &task_id, - rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type = - rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_INTENDED, - const std::string &scheduling_failure_message = "") override; - - bool CancelAllTasksOwnedBy( + /// \return True if lease was successfully cancelled. This function will return + /// false if the lease is already granted. 
+ bool CancelLease(const LeaseID &lease_id, + rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type = + rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_INTENDED, + const std::string &scheduling_failure_message = "") override; + + bool CancelAllLeasesOwnedBy( const WorkerID &worker_id, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type = rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_INTENDED, const std::string &scheduling_failure_message = "") override; - bool CancelAllTasksOwnedBy( + bool CancelAllLeasesOwnedBy( const NodeID &node_id, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type = rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_INTENDED, const std::string &scheduling_failure_message = "") override; - /// Cancel all tasks that requires certain resource shape. - /// This function is intended to be used to cancel the infeasible tasks. To make it a + /// Cancel all leases that requires certain resource shape. + /// This function is intended to be used to cancel the infeasible leases. To make it a /// more general function, please modify the signature by adding parameters including /// the failure type and the failure message. /// /// \param target_resource_shapes: The resource shapes to cancel. /// - /// \return True if any task was successfully cancelled. This function will return - /// false if the task is already running. This shouldn't happen in noremal cases - /// because the infeasible tasks shouldn't be able to run due to resource constraints. - bool CancelTasksWithResourceShapes( + /// \return True if any lease was successfully cancelled. This function will return + /// false if the lease is already granted. This shouldn't happen in normal cases + /// because the infeasible leases shouldn't be granted due to resource constraints. + bool CancelLeasesWithResourceShapes( const std::vector target_resource_shapes) override; - /// Attempt to cancel all queued tasks that match the predicate. 
+ /// Attempt to cancel all queued leases that match the predicate. /// - /// \param predicate: A function that returns true if a task needs to be cancelled. + /// \param predicate: A function that returns true if a lease needs to be cancelled. /// \param failure_type: The reason for cancellation. /// \param scheduling_failure_message: The reason message for cancellation. - /// \return True if any task was successfully cancelled. - bool CancelTasks(std::function &)> predicate, - rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, - const std::string &scheduling_failure_message) override; + /// \return True if any lease was successfully cancelled. + bool CancelLeases( + std::function &)> predicate, + rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, + const std::string &scheduling_failure_message) override; /// Populate the relevant parts of the heartbeat table. This is intended for /// sending resource usage of raylet to gcs. In particular, this should fill in @@ -131,29 +129,29 @@ class ClusterTaskManager : public ClusterTaskManagerInterface { /// the only fields used. void FillResourceUsage(rpc::ResourcesData &data) override; - /// Return with an exemplar if any tasks are pending resource acquisition. + /// Return with an exemplar if any leases are pending resource acquisition. /// - /// \param[in,out] num_pending_actor_creation: Number of pending actor creation tasks. - /// \param[in,out] num_pending_tasks: Number of pending tasks. - /// \return An example task that is deadlocking if any tasks are pending resource + /// \param[in,out] num_pending_actor_creation: Number of pending actor creation leases. + /// \param[in,out] num_pending_leases: Number of pending leases. + /// \return An example lease that is deadlocking if any leases are pending resource /// acquisition. 
- const RayTask *AnyPendingTasksForResourceAcquisition( - int *num_pending_actor_creation, int *num_pending_tasks) const override; + const RayLease *AnyPendingLeasesForResourceAcquisition( + int *num_pending_actor_creation, int *num_pending_leases) const override; - // Schedule and dispatch tasks. - void ScheduleAndDispatchTasks() override; + // Schedule and grant leases. + void ScheduleAndGrantLeases() override; /// Record the internal metrics. void RecordMetrics() const override; - /// The helper to dump the debug state of the cluster task manater. + /// The helper to dump the debug state of the cluster lease manager. std::string DebugStr() const override; ClusterResourceScheduler &GetClusterResourceScheduler() const; - /// Get the count of tasks in `infeasible_tasks_`. + /// Get the count of leases in `infeasible_leases_`. size_t GetInfeasibleQueueSize() const; - /// Get the count of tasks in `tasks_to_schedule_`. + /// Get the count of leases in `leases_to_schedule_`. size_t GetPendingQueueSize() const; /// Populate the info of pending and infeasible actors. This function @@ -164,14 +162,16 @@ class ClusterTaskManager : public ClusterTaskManagerInterface { void FillPendingActorInfo(rpc::ResourcesData &data) const; private: - void TryScheduleInfeasibleTask(); + void TryScheduleInfeasibleLease(); - // Schedule the task onto a node (which could be either remote or local). + // Schedule the lease onto a node (which could be to a worker that's in a local or remote + // node). void ScheduleOnNode(const NodeID &node_to_schedule, const std::shared_ptr &work); /// Recompute the debug stats. - /// It is needed because updating the debug state is expensive for cluster_task_manager. + /// It is needed because updating the debug state is expensive for + /// cluster_lease_manager. /// TODO(sang): Update the internal states value dynamically instead of iterating the /// data structure.
void RecomputeDebugStats() const; @@ -194,20 +194,20 @@ class ClusterTaskManager : public ClusterTaskManagerInterface { /// Function to get the node information of a given node id. internal::NodeInfoGetter get_node_info_; - /// Function to announce infeasible task to GCS. - std::function announce_infeasible_task_; + /// Function to announce infeasible lease to GCS. + std::function announce_infeasible_lease_; - ILocalTaskManager &local_task_manager_; + LocalLeaseManagerInterface &local_lease_manager_; /// Queue of lease requests that are waiting for resources to become available. - /// Tasks move from scheduled -> dispatch | waiting. + /// Leases move from scheduled -> dispatch | waiting. absl::flat_hash_map>> - tasks_to_schedule_; + leases_to_schedule_; /// Queue of lease requests that are infeasible. - /// Tasks go between scheduling <-> infeasible. + /// Leases go between scheduling <-> infeasible. absl::flat_hash_map>> - infeasible_tasks_; + infeasible_leases_; const SchedulerResourceReporter scheduler_resource_reporter_; mutable SchedulerStats internal_stats_; @@ -216,8 +216,8 @@ class ClusterTaskManager : public ClusterTaskManagerInterface { std::function get_time_ms_; friend class SchedulerStats; - friend class ClusterTaskManagerTest; - FRIEND_TEST(ClusterTaskManagerTest, FeasibleToNonFeasible); + friend class ClusterLeaseManagerTest; + FRIEND_TEST(ClusterLeaseManagerTest, FeasibleToNonFeasible); }; } // namespace raylet } // namespace ray diff --git a/src/ray/raylet/scheduling/cluster_task_manager_interface.h b/src/ray/raylet/scheduling/cluster_lease_manager_interface.h similarity index 58% rename from src/ray/raylet/scheduling/cluster_task_manager_interface.h rename to src/ray/raylet/scheduling/cluster_lease_manager_interface.h index 7950706eb04e..e8b885c7a8ad 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager_interface.h +++ b/src/ray/raylet/scheduling/cluster_lease_manager_interface.h @@ -17,17 +17,18 @@ #include #include +#include 
"ray/raylet/scheduling/internal.h" #include "ray/rpc/server_call.h" #include "src/ray/protobuf/node_manager.pb.h" namespace ray { namespace raylet { -class ClusterTaskManagerInterface { +class ClusterLeaseManagerInterface { public: - virtual ~ClusterTaskManagerInterface() = default; + virtual ~ClusterLeaseManagerInterface() = default; - // Schedule and dispatch tasks. - virtual void ScheduleAndDispatchTasks() = 0; + // Schedule and dispatch leases. + virtual void ScheduleAndGrantLeases() = 0; /// Populate the relevant parts of the heartbeat table. This is intended for /// sending raylet <-> gcs heartbeats. In particular, this should fill in @@ -37,81 +38,81 @@ class ClusterTaskManagerInterface { /// fields used. virtual void FillResourceUsage(rpc::ResourcesData &data) = 0; - /// Attempt to cancel an already queued task. + /// Attempt to cancel an already queued lease. /// - /// \param task_id: The id of the task to remove. + /// \param lease_id: The id of the lease to remove. /// \param failure_type: The failure type. /// \param scheduling_failure_message: The failure message. /// - /// \return True if task was successfully removed. This function will return - /// false if the task is already running. - virtual bool CancelTask( - const TaskID &task_id, + /// \return True if lease was successfully cancelled. This function will return + /// false if the lease is already granted. + virtual bool CancelLease( + const LeaseID &lease_id, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type = rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_INTENDED, const std::string &scheduling_failure_message = "") = 0; - /// Cancel all tasks owned by a specific worker. - virtual bool CancelAllTasksOwnedBy( + /// Cancel all leases owned by a specific worker. 
+ virtual bool CancelAllLeasesOwnedBy( const WorkerID &worker_id, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type = rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_INTENDED, const std::string &scheduling_failure_message = "") = 0; - /// Cancel all tasks owned by a worker on the specific node. - virtual bool CancelAllTasksOwnedBy( + /// Cancel all leases owned by a worker on the specific node. + virtual bool CancelAllLeasesOwnedBy( const NodeID &node_id, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type = rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_INTENDED, const std::string &scheduling_failure_message = "") = 0; - /// Attempt to cancel all queued tasks that match the resource shapes. - /// This function is intended to be used to cancel the infeasible tasks. To make it a + /// Attempt to cancel all queued leases that match the resource shapes. + /// This function is intended to be used to cancel the infeasible leases. To make it a /// more general function, please modify the signature by adding parameters including /// the failure type and the failure message. /// /// \param target_resource_shapes: The resource shapes to cancel. /// - /// \return True if any task was successfully removed. This function will return false - /// if the task is already running. This shouldn't happen in noremal cases because the - /// infeasible tasks shouldn't be able to run due to resource constraints. - virtual bool CancelTasksWithResourceShapes( + /// \return True if any lease was successfully cancelled. This function will return false + /// if the lease is already granted. This shouldn't happen in normal cases because the + /// infeasible leases shouldn't be granted due to resource constraints. + virtual bool CancelLeasesWithResourceShapes( const std::vector target_resource_shapes) = 0; - /// Attempt to cancel all queued tasks that match the predicate. + /// Attempt to cancel all queued leases that match the predicate.
/// - /// \param predicate: A function that returns true if a task needs to be cancelled. + /// \param predicate: A function that returns true if a lease needs to be cancelled. /// \param failure_type: The reason for cancellation. /// \param scheduling_failure_message: The reason message for cancellation. - /// \return True if any task was successfully cancelled. - virtual bool CancelTasks( + /// \return True if any lease was successfully cancelled. + virtual bool CancelLeases( std::function &)> predicate, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, const std::string &scheduling_failure_message) = 0; - /// Queue task and schedule. This happens when processing the worker lease request. + /// Queue lease and schedule. This happens when processing the worker lease request. /// - /// \param task: The incoming task to be queued and scheduled. + /// \param lease: The incoming lease to be queued and scheduled. /// \param grant_or_reject: True if we we should either grant or reject the request /// but no spillback. /// \param reply: The reply of the lease request. /// \param send_reply_callback: The function used during dispatching. - virtual void QueueAndScheduleTask(RayTask task, - bool grant_or_reject, - bool is_selected_based_on_locality, - rpc::RequestWorkerLeaseReply *reply, - rpc::SendReplyCallback send_reply_callback) = 0; + virtual void QueueAndScheduleLease(RayLease lease, + bool grant_or_reject, + bool is_selected_based_on_locality, + rpc::RequestWorkerLeaseReply *reply, + rpc::SendReplyCallback send_reply_callback) = 0; - /// Return with an exemplar if any tasks are pending resource acquisition. + /// Return with an exemplar if any leases are pending resource acquisition. /// /// \param[in] num_pending_actor_creation Number of pending actor creation tasks. - /// \param[in] num_pending_tasks Number of pending tasks. 
- /// \return An example task that is deadlocking if any tasks are pending resource + /// \param[in] num_pending_leases Number of pending leases. + /// \return An example lease that is deadlocking if any leases are pending resource /// acquisition. - virtual const RayTask *AnyPendingTasksForResourceAcquisition( - int *num_pending_actor_creation, int *num_pending_tasks) const = 0; + virtual const RayLease *AnyPendingLeasesForResourceAcquisition( + int *num_pending_actor_creation, int *num_pending_leases) const = 0; - /// The helper to dump the debug state of the cluster task manater. + /// The helper to dump the debug state of the cluster lease manager. virtual std::string DebugStr() const = 0; /// Record the internal metrics. diff --git a/src/ray/raylet/scheduling/cluster_resource_manager.cc b/src/ray/raylet/scheduling/cluster_resource_manager.cc index 225beb0cfbe6..7ed06e6b96f5 100644 --- a/src/ray/raylet/scheduling/cluster_resource_manager.cc +++ b/src/ray/raylet/scheduling/cluster_resource_manager.cc @@ -77,16 +77,19 @@ bool ClusterResourceManager::UpdateNode( return false; } - auto resources_total = MapFromProtobuf(resource_view_sync_message.resources_total()); - auto resources_available = + const auto resources_total = + MapFromProtobuf(resource_view_sync_message.resources_total()); + const auto resources_available = MapFromProtobuf(resource_view_sync_message.resources_available()); + auto node_labels = MapFromProtobuf(resource_view_sync_message.labels()); NodeResources node_resources = ResourceMapToNodeResources(resources_total, resources_available); NodeResources local_view; RAY_CHECK(GetNodeResources(node_id, &local_view)); - local_view.total = node_resources.total; - local_view.available = node_resources.available; + local_view.total = std::move(node_resources.total); + local_view.available = std::move(node_resources.available); + local_view.labels = std::move(node_labels); local_view.object_pulls_queued = resource_view_sync_message.object_pulls_queued();
// Update the idle duration for the node in terms of resources usage. @@ -290,13 +293,13 @@ BundleLocationIndex &ClusterResourceManager::GetBundleLocationIndex() { void ClusterResourceManager::SetNodeLabels( const scheduling::NodeID &node_id, - const absl::flat_hash_map &labels) { + absl::flat_hash_map labels) { auto it = nodes_.find(node_id); if (it == nodes_.end()) { NodeResources node_resources; it = nodes_.emplace(node_id, node_resources).first; } - it->second.GetMutableLocalView()->labels = labels; + it->second.GetMutableLocalView()->labels = std::move(labels); } } // namespace ray diff --git a/src/ray/raylet/scheduling/cluster_resource_manager.h b/src/ray/raylet/scheduling/cluster_resource_manager.h index a83c6a608624..58bde9688105 100644 --- a/src/ray/raylet/scheduling/cluster_resource_manager.h +++ b/src/ray/raylet/scheduling/cluster_resource_manager.h @@ -33,7 +33,7 @@ namespace ray { namespace raylet { -class ClusterTaskManagerTest; +class ClusterLeaseManagerTest; class SchedulingPolicyTest; } // namespace raylet namespace raylet_scheduling_policy { @@ -138,7 +138,7 @@ class ClusterResourceManager { BundleLocationIndex &GetBundleLocationIndex(); void SetNodeLabels(const scheduling::NodeID &node_id, - const absl::flat_hash_map &labels); + absl::flat_hash_map labels); private: friend class ClusterResourceScheduler; @@ -180,7 +180,7 @@ class ClusterResourceManager { friend class ClusterResourceSchedulerTest; friend struct ClusterResourceManagerTest; - friend class raylet::ClusterTaskManagerTest; + friend class raylet::ClusterLeaseManagerTest; FRIEND_TEST(ClusterResourceSchedulerTest, SchedulingDeleteClusterNodeTest); FRIEND_TEST(ClusterResourceSchedulerTest, SchedulingModifyClusterNodeTest); FRIEND_TEST(ClusterResourceSchedulerTest, SchedulingUpdateAvailableResourcesTest); @@ -199,7 +199,7 @@ class ClusterResourceManager { FRIEND_TEST(ClusterResourceSchedulerTest, AvailableResourceInstancesOpsTest); FRIEND_TEST(ClusterResourceSchedulerTest, 
DirtyLocalViewTest); FRIEND_TEST(ClusterResourceSchedulerTest, DynamicResourceTest); - FRIEND_TEST(ClusterTaskManagerTestWithGPUsAtHead, RleaseAndReturnWorkerCpuResources); + FRIEND_TEST(ClusterLeaseManagerTestWithGPUsAtHead, RleaseAndReturnWorkerCpuResources); FRIEND_TEST(ClusterResourceSchedulerTest, TestForceSpillback); FRIEND_TEST(ClusterResourceSchedulerTest, AffinityWithBundleScheduleTest); FRIEND_TEST(ClusterResourceSchedulerTest, LabelSelectorIsSchedulableOnNodeTest); diff --git a/src/ray/raylet/scheduling/cluster_resource_scheduler.cc b/src/ray/raylet/scheduling/cluster_resource_scheduler.cc index dff976ffcf62..4f1448320547 100644 --- a/src/ray/raylet/scheduling/cluster_resource_scheduler.cc +++ b/src/ray/raylet/scheduling/cluster_resource_scheduler.cc @@ -284,7 +284,7 @@ bool ClusterResourceScheduler::IsSchedulableOnNode( } scheduling::NodeID ClusterResourceScheduler::GetBestSchedulableNode( - const TaskSpecification &task_spec, + const LeaseSpecification &lease_spec, const std::string &preferred_node_id, bool exclude_local_node, bool requires_object_store_memory, @@ -293,8 +293,8 @@ scheduling::NodeID ClusterResourceScheduler::GetBestSchedulableNode( // going through the full hybrid policy since we don't want spillback. if (preferred_node_id == local_node_id_.Binary() && !exclude_local_node && IsSchedulableOnNode(local_node_id_, - task_spec.GetRequiredPlacementResources().GetResourceMap(), - task_spec.GetLabelSelector(), + lease_spec.GetRequiredPlacementResources().GetResourceMap(), + lease_spec.GetLabelSelector(), requires_object_store_memory)) { *is_infeasible = false; return local_node_id_; @@ -303,11 +303,11 @@ scheduling::NodeID ClusterResourceScheduler::GetBestSchedulableNode( // This argument is used to set violation, which is an unsupported feature now. 
int64_t _unused; scheduling::NodeID best_node = - GetBestSchedulableNode(task_spec.GetRequiredPlacementResources().GetResourceMap(), - task_spec.GetLabelSelector(), - task_spec.GetMessage().scheduling_strategy(), + GetBestSchedulableNode(lease_spec.GetRequiredPlacementResources().GetResourceMap(), + lease_spec.GetLabelSelector(), + lease_spec.GetMessage().scheduling_strategy(), requires_object_store_memory, - task_spec.IsActorCreationTask(), + lease_spec.IsActorCreationTask(), exclude_local_node, preferred_node_id, &_unused, @@ -316,16 +316,16 @@ scheduling::NodeID ClusterResourceScheduler::GetBestSchedulableNode( // There is no other available nodes. if (!best_node.IsNil() && !IsSchedulableOnNode(best_node, - task_spec.GetRequiredPlacementResources().GetResourceMap(), - task_spec.GetLabelSelector(), + lease_spec.GetRequiredPlacementResources().GetResourceMap(), + lease_spec.GetLabelSelector(), requires_object_store_memory)) { // Prefer waiting on the local node if possible // since the local node is chosen for a reason (e.g. spread). if ((preferred_node_id == local_node_id_.Binary()) && NodeAvailable(local_node_id_)) { auto resource_request = ResourceMapToResourceRequest( - task_spec.GetRequiredPlacementResources().GetResourceMap(), + lease_spec.GetRequiredPlacementResources().GetResourceMap(), requires_object_store_memory); - const auto &selector = task_spec.GetLabelSelector(); + const auto &selector = lease_spec.GetLabelSelector(); resource_request.SetLabelSelector(selector); if (cluster_resource_manager_->HasFeasibleResources(local_node_id_, resource_request)) { @@ -334,7 +334,7 @@ scheduling::NodeID ClusterResourceScheduler::GetBestSchedulableNode( } } // If the task is being scheduled by gcs, return nil to make it stay in the - // `cluster_task_manager`'s queue. + // `cluster_lease_manager`'s queue. 
if (!is_local_node_with_raylet_) { return scheduling::NodeID::Nil(); } diff --git a/src/ray/raylet/scheduling/cluster_resource_scheduler.h b/src/ray/raylet/scheduling/cluster_resource_scheduler.h index 39a7f0111e1b..2df66334975f 100644 --- a/src/ray/raylet/scheduling/cluster_resource_scheduler.h +++ b/src/ray/raylet/scheduling/cluster_resource_scheduler.h @@ -84,7 +84,7 @@ class ClusterResourceScheduler { /// Find a node in the cluster on which we can schedule a given resource request. /// In hybrid mode, see `scheduling_policy.h` for a description of the policy. /// - /// \param task_spec: Task/Actor to be scheduled. + /// \param lease_spec: Lease to be scheduled. /// \param preferred_node_id: the node where the task is preferred to be placed. An /// empty `preferred_node_id` (string) means no preferred node. /// \param exclude_local_node: true if we want to avoid local node. This will cancel @@ -96,7 +96,7 @@ class ClusterResourceScheduler { /// /// \return empty string, if no node can schedule the current request; otherwise, /// return the string name of a node that can schedule the resource request. 
- scheduling::NodeID GetBestSchedulableNode(const TaskSpecification &task_spec, + scheduling::NodeID GetBestSchedulableNode(const LeaseSpecification &lease_spec, const std::string &preferred_node_id, bool exclude_local_node, bool requires_object_store_memory, @@ -244,7 +244,7 @@ class ClusterResourceScheduler { FRIEND_TEST(ClusterResourceSchedulerTest, AvailableResourceInstancesOpsTest); FRIEND_TEST(ClusterResourceSchedulerTest, DirtyLocalViewTest); FRIEND_TEST(ClusterResourceSchedulerTest, DynamicResourceTest); - FRIEND_TEST(ClusterTaskManagerTestWithGPUsAtHead, RleaseAndReturnWorkerCpuResources); + FRIEND_TEST(ClusterLeaseManagerTestWithGPUsAtHead, RleaseAndReturnWorkerCpuResources); FRIEND_TEST(ClusterResourceSchedulerTest, TestForceSpillback); FRIEND_TEST(ClusterResourceSchedulerTest, AffinityWithBundleScheduleTest); FRIEND_TEST(ClusterResourceSchedulerTest, LabelSelectorIsSchedulableOnNodeTest); diff --git a/src/ray/raylet/scheduling/internal.h b/src/ray/raylet/scheduling/internal.h index 66630199bafc..19f2f587be56 100644 --- a/src/ray/raylet/scheduling/internal.h +++ b/src/ray/raylet/scheduling/internal.h @@ -17,10 +17,8 @@ #include #include -#include "ray/common/ray_object.h" +#include "ray/common/lease/lease.h" #include "ray/common/scheduling/cluster_resource_data.h" -#include "ray/common/task/task.h" -#include "ray/common/task/task_common.h" #include "src/ray/protobuf/node_manager.pb.h" namespace ray::raylet::internal { @@ -51,28 +49,28 @@ enum class UnscheduledWorkCause { }; /// Work represents all the information needed to make a scheduling decision. -/// This includes the task, the information we need to communicate to +/// This includes the lease, the information we need to communicate to /// dispatch/spillback and the callback to trigger it. 
class Work { public: - RayTask task; - bool grant_or_reject; - bool is_selected_based_on_locality; - rpc::RequestWorkerLeaseReply *reply; - std::function callback; - std::shared_ptr allocated_instances; - Work(RayTask task, + RayLease lease_; + bool grant_or_reject_; + bool is_selected_based_on_locality_; + rpc::RequestWorkerLeaseReply *reply_; + std::function callback_; + std::shared_ptr allocated_instances_; + Work(RayLease lease, bool grant_or_reject, bool is_selected_based_on_locality, rpc::RequestWorkerLeaseReply *reply, std::function callback, WorkStatus status = WorkStatus::WAITING) - : task(std::move(task)), - grant_or_reject(grant_or_reject), - is_selected_based_on_locality(is_selected_based_on_locality), - reply(reply), - callback(std::move(callback)), - allocated_instances(nullptr), + : lease_(std::move(lease)), + grant_or_reject_(grant_or_reject), + is_selected_based_on_locality_(is_selected_based_on_locality), + reply_(reply), + callback_(std::move(callback)), + allocated_instances_(nullptr), status_(status){}; Work(const Work &Work) = delete; Work &operator=(const Work &work) = delete; @@ -95,7 +93,7 @@ class Work { UnscheduledWorkCause GetUnscheduledCause() const { return unscheduled_work_cause_; } bool PrioritizeLocalNode() const { - return grant_or_reject || is_selected_based_on_locality; + return grant_or_reject_ || is_selected_based_on_locality_; } private: diff --git a/src/ray/raylet/scheduling/local_task_manager_interface.h b/src/ray/raylet/scheduling/local_lease_manager_interface.h similarity index 57% rename from src/ray/raylet/scheduling/local_task_manager_interface.h rename to src/ray/raylet/scheduling/local_lease_manager_interface.h index 3eae10859b0d..8017efb1be13 100644 --- a/src/ray/raylet/scheduling/local_task_manager_interface.h +++ b/src/ray/raylet/scheduling/local_lease_manager_interface.h @@ -19,42 +19,43 @@ #include #include "absl/container/flat_hash_map.h" -#include "ray/common/task/task.h" #include 
"ray/raylet/scheduling/internal.h" namespace ray { +class RayLease; + namespace raylet { // Forward declaration class WorkerInterface; -/// Manages the lifetime of a task on the local node. It receives request from -/// cluster_task_manager and tries to execute the task locally. -/// Read raylet/local_task_manager.h for more information. -class ILocalTaskManager { +/// Manages the lifetime of a lease on the local node. It receives request from +/// cluster_lease_manager and tries to execute the lease locally. +/// Read raylet/local_lease_manager.h for more information. +class LocalLeaseManagerInterface { public: - virtual ~ILocalTaskManager() = default; + virtual ~LocalLeaseManagerInterface() = default; - /// Queue task and schedule. - virtual void QueueAndScheduleTask(std::shared_ptr work) = 0; + /// Queue lease and schedule. + virtual void QueueAndScheduleLease(std::shared_ptr work) = 0; - // Schedule and dispatch tasks. - virtual void ScheduleAndDispatchTasks() = 0; + // Schedule and grant leases. + virtual void ScheduleAndGrantLeases() = 0; - /// Attempt to cancel all queued tasks that match the predicate. + /// Attempt to cancel all queued leases that match the predicate. /// - /// \param predicate: A function that returns true if a task needs to be cancelled. + /// \param predicate: A function that returns true if a lease needs to be cancelled. /// \param failure_type: The reason for cancellation. /// \param scheduling_failure_message: The reason message for cancellation. - /// \return True if any task was successfully cancelled. - virtual bool CancelTasks( + /// \return True if any lease was successfully cancelled. 
+ virtual bool CancelLeases( std::function &)> predicate, rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, const std::string &scheduling_failure_message) = 0; virtual const absl::flat_hash_map>> - &GetTaskToDispatch() const = 0; + &GetLeasesToGrant() const = 0; virtual const absl::flat_hash_map> @@ -66,12 +67,12 @@ class ILocalTaskManager { virtual void ClearWorkerBacklog(const WorkerID &worker_id) = 0; - virtual const RayTask *AnyPendingTasksForResourceAcquisition( - int *num_pending_actor_creation, int *num_pending_tasks) const = 0; + virtual const RayLease *AnyPendingLeasesForResourceAcquisition( + int *num_pending_actor_creation, int *num_pending_leases) const = 0; - virtual void TasksUnblocked(const std::vector &ready_ids) = 0; + virtual void LeasesUnblocked(const std::vector &ready_ids) = 0; - virtual void TaskFinished(std::shared_ptr worker, RayTask *task) = 0; + virtual void CleanupLease(std::shared_ptr worker, RayLease *lease) = 0; virtual void ReleaseWorkerResources(std::shared_ptr worker) = 0; @@ -87,39 +88,38 @@ class ILocalTaskManager { virtual void DebugStr(std::stringstream &buffer) const = 0; - virtual size_t GetNumTaskSpilled() const = 0; - virtual size_t GetNumWaitingTaskSpilled() const = 0; - virtual size_t GetNumUnschedulableTaskSpilled() const = 0; + virtual size_t GetNumLeaseSpilled() const = 0; + virtual size_t GetNumWaitingLeaseSpilled() const = 0; + virtual size_t GetNumUnschedulableLeaseSpilled() const = 0; }; -/// A noop local task manager. It is a no-op class. We need this because there's no -/// "LocalTaskManager" when the `ClusterTaskManager` is used within GCS. In the long term, -/// we should make `ClusterTaskManager` not aware of `LocalTaskManager`. -class NoopLocalTaskManager : public ILocalTaskManager { +/// A noop local lease manager. It is a no-op class. We need this because there's no +/// "LocalLeaseManager" when the `ClusterLeaseManager` is used within GCS. 
In the long +/// term, we should make `ClusterLeaseManager` not aware of `LocalLeaseManager`. +class NoopLocalLeaseManager : public LocalLeaseManagerInterface { public: - NoopLocalTaskManager() = default; + NoopLocalLeaseManager() = default; - /// Queue task and schedule. - void QueueAndScheduleTask(std::shared_ptr work) override { + void QueueAndScheduleLease(std::shared_ptr work) override { RAY_CHECK(false) - << "This function should never be called by gcs' local task manager."; + << "This function should never be called by gcs' local lease manager."; } - // Schedule and dispatch tasks. - void ScheduleAndDispatchTasks() override {} + void ScheduleAndGrantLeases() override {} - bool CancelTasks(std::function &)> predicate, - rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, - const std::string &scheduling_failure_message) override { + bool CancelLeases( + std::function &)> predicate, + rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, + const std::string &scheduling_failure_message) override { return false; } const absl::flat_hash_map>> - &GetTaskToDispatch() const override { + &GetLeasesToGrant() const override { static const absl::flat_hash_map>> - tasks_to_dispatch; - return tasks_to_dispatch; + leases_to_grant; + return leases_to_grant; } const absl::flat_hash_map> @@ -136,14 +136,14 @@ class NoopLocalTaskManager : public ILocalTaskManager { void ClearWorkerBacklog(const WorkerID &worker_id) override {} - const RayTask *AnyPendingTasksForResourceAcquisition( - int *num_pending_actor_creation, int *num_pending_tasks) const override { + const RayLease *AnyPendingLeasesForResourceAcquisition( + int *num_pending_actor_creation, int *num_pending_leases) const override { return nullptr; } - void TasksUnblocked(const std::vector &ready_ids) override {} + void LeasesUnblocked(const std::vector &ready_ids) override {} - void TaskFinished(std::shared_ptr worker, RayTask *task) override {} + void CleanupLease(std::shared_ptr worker, 
RayLease *lease) override {} void ReleaseWorkerResources(std::shared_ptr worker) override {} @@ -163,9 +163,9 @@ class NoopLocalTaskManager : public ILocalTaskManager { void DebugStr(std::stringstream &buffer) const override {} - size_t GetNumTaskSpilled() const override { return 0; } - size_t GetNumWaitingTaskSpilled() const override { return 0; } - size_t GetNumUnschedulableTaskSpilled() const override { return 0; } + size_t GetNumLeaseSpilled() const override { return 0; } + size_t GetNumWaitingLeaseSpilled() const override { return 0; } + size_t GetNumUnschedulableLeaseSpilled() const override { return 0; } }; } // namespace raylet diff --git a/src/ray/raylet/scheduling/local_resource_manager.cc b/src/ray/raylet/scheduling/local_resource_manager.cc index 68374d60e23e..f0e5064a511b 100644 --- a/src/ray/raylet/scheduling/local_resource_manager.cc +++ b/src/ray/raylet/scheduling/local_resource_manager.cc @@ -22,8 +22,10 @@ #include #include -#include "ray/common/grpc_util.h" -#include "ray/common/ray_config.h" +#include "ray/common/scheduling/placement_group_util.h" +#include "ray/common/scheduling/resource_set.h" +#include "ray/stats/metric_defs.h" +#include "ray/util/logging.h" namespace ray { @@ -305,6 +307,10 @@ void LocalResourceManager::PopulateResourceViewSyncMessage( syncer::ResourceViewSyncMessage &resource_view_sync_message) const { NodeResources resources = ToNodeResources(); + // Populate node labels. 
+ resource_view_sync_message.mutable_labels()->insert(resources.labels.begin(), + resources.labels.end()); + auto total = resources.total.GetResourceMap(); resource_view_sync_message.mutable_resources_total()->insert(total.begin(), total.end()); diff --git a/src/ray/raylet/scheduling/local_resource_manager.h b/src/ray/raylet/scheduling/local_resource_manager.h index 7b78327efc49..3d3329c2fc0e 100644 --- a/src/ray/raylet/scheduling/local_resource_manager.h +++ b/src/ray/raylet/scheduling/local_resource_manager.h @@ -21,14 +21,9 @@ #include #include "absl/container/flat_hash_map.h" -#include "ray/common/bundle_spec.h" #include "ray/common/ray_syncer/ray_syncer.h" #include "ray/common/scheduling/cluster_resource_data.h" #include "ray/common/scheduling/fixed_point.h" -#include "ray/common/scheduling/resource_set.h" -#include "ray/gcs/gcs_client/accessor.h" -#include "ray/gcs/gcs_client/gcs_client.h" -#include "ray/util/logging.h" #include "src/ray/protobuf/gcs.pb.h" #include "src/ray/protobuf/node_manager.pb.h" diff --git a/src/ray/raylet/scheduling/policy/affinity_with_bundle_scheduling_policy.cc b/src/ray/raylet/scheduling/policy/affinity_with_bundle_scheduling_policy.cc index bfe46d314abc..7d27b2892552 100644 --- a/src/ray/raylet/scheduling/policy/affinity_with_bundle_scheduling_policy.cc +++ b/src/ray/raylet/scheduling/policy/affinity_with_bundle_scheduling_policy.cc @@ -43,11 +43,11 @@ bool AffinityWithBundleSchedulingPolicy::IsNodeFeasibleAndAvailable( scheduling::NodeID AffinityWithBundleSchedulingPolicy::Schedule( const ResourceRequest &resource_request, SchedulingOptions options) { - RAY_CHECK(options.scheduling_type == SchedulingType::AFFINITY_WITH_BUNDLE); + RAY_CHECK(options.scheduling_type_ == SchedulingType::AFFINITY_WITH_BUNDLE); auto bundle_scheduling_context = dynamic_cast( - options.scheduling_context.get()); + options.scheduling_context_.get()); const BundleID &bundle_id = bundle_scheduling_context->GetAffinityBundleID(); if (bundle_id.second != -1) 
{ const auto &node_id_opt = bundle_location_index_.GetBundleLocation(bundle_id); @@ -63,7 +63,7 @@ scheduling::NodeID AffinityWithBundleSchedulingPolicy::Schedule( const auto &bundle_locations_opt = bundle_location_index_.GetBundleLocations(pg_id); if (bundle_locations_opt) { // Find a target with gpu nodes avoided (if required). - if (options.avoid_gpu_nodes) { + if (options.avoid_gpu_nodes_) { for (const auto &iter : *(bundle_locations_opt.value())) { auto target_node_id = scheduling::NodeID(iter.second.first.Binary()); if (IsNodeFeasibleAndAvailable( diff --git a/src/ray/raylet/scheduling/policy/bundle_scheduling_policy.cc b/src/ray/raylet/scheduling/policy/bundle_scheduling_policy.cc index a758dba70b7a..d01871377f3d 100644 --- a/src/ray/raylet/scheduling/policy/bundle_scheduling_policy.cc +++ b/src/ray/raylet/scheduling/policy/bundle_scheduling_policy.cc @@ -14,81 +14,6 @@ #include "ray/raylet/scheduling/policy/bundle_scheduling_policy.h" -namespace { - -/// Return true if scheduling this bundle (with resource_request) will exceed the -/// max cpu fraction for placement groups. This is per node. -/// -/// \param node_resources The resource of the current node. -/// \param bundle_resource_request The requested resources for the current bundle. -/// \param max_cpu_fraction_per_node Highest CPU fraction the bundles can take up. -/// \param available_cpus_before_curernt_pg_request Available CPUs on this node before -/// scheduling the current pg request. It is used to calculate how many CPUs are -/// allocated by the current bundles so far. It will help us figuring out -/// the total CPU allocation from the current bundles for this node. 
-bool AllocationWillExceedMaxCpuFraction( - const ray::NodeResources &node_resources, - const ray::ResourceRequest &bundle_resource_request, - double max_cpu_fraction_per_node, - double available_cpus_before_curernt_pg_request) { - if (max_cpu_fraction_per_node == 1.0) { - // Allocation will never exceed the threshold if the fraction == 1.0. - return false; - } - - auto cpu_id = ray::ResourceID::CPU(); - auto total_cpus = node_resources.total.Get(cpu_id).Double(); - - // Calculate max_reservable_cpus - auto max_reservable_cpus = - max_cpu_fraction_per_node * node_resources.total.Get(cpu_id).Double(); - - // If the max reservable cpu < 1, we allow at least 1 CPU. - if (max_reservable_cpus < 1) { - max_reservable_cpus = 1; - } - - // We guarantee at least 1 CPU is excluded from the placement group - // when max_cpu_fraction_per_node is specified. - if (max_reservable_cpus > total_cpus - 1) { - max_reservable_cpus = total_cpus - 1; - } - - /* - To calculate if allocating a new bundle will exceed the pg max_fraction, - we need a sum of - - - CPUs used by placement groups before. - - CPUs that will be allocated by the current pg request. - */ - - // Get the sum of all cpu allocated by placement group on this node. - FixedPoint cpus_used_by_pg_before(0); - for (const auto &resource_id : node_resources.total.ExplicitResourceIds()) { - if (ray::GetOriginalResourceNameFromWildcardResource(resource_id.Binary()) == "CPU") { - cpus_used_by_pg_before += node_resources.total.Get(resource_id); - } - } - - // Get the CPUs allocated by current pg request so far. - // Note that when we schedule the current pg, we allocate resources - // temporarily meaning `node_resources.available` will contain - // available CPUs after allocating CPUs for the current pg request. 
- auto cpus_allocated_by_current_pg_request = - (available_cpus_before_curernt_pg_request - - node_resources.available.Get(cpu_id).Double()); - - auto cpus_to_allocate_by_current_pg_request = - (cpus_allocated_by_current_pg_request + - bundle_resource_request.Get(cpu_id).Double()); - - auto cpus_used_by_pg_after = - cpus_used_by_pg_before.Double() + cpus_to_allocate_by_current_pg_request; - return cpus_used_by_pg_after > max_reservable_cpus; -} - -} // namespace - namespace ray { namespace raylet_scheduling_policy { @@ -117,19 +42,6 @@ BundleSchedulingPolicy::SelectCandidateNodes(const SchedulingContext *context) c return result; } -/// Return the map of node id -> available cpus before the current bundle scheduling. -/// It is used to calculate how many CPUs have been allocated for the current bundles. -const absl::flat_hash_map -BundleSchedulingPolicy::GetAvailableCpusBeforeBundleScheduling() const { - absl::flat_hash_map result; - for (const auto &entry : cluster_resource_manager_.GetResourceView()) { - result.emplace( - entry.first, - entry.second.GetLocalView().available.Get(ray::ResourceID::CPU()).Double()); - } - return result; -} - std::pair, std::vector> BundleSchedulingPolicy::SortRequiredResources( const std::vector &resource_request_list) { @@ -203,9 +115,7 @@ BundleSchedulingPolicy::SortRequiredResources( std::pair BundleSchedulingPolicy::GetBestNode( const ResourceRequest &required_resources, const absl::flat_hash_map &candidate_nodes, - const SchedulingOptions &options, - const absl::flat_hash_map - &available_cpus_before_bundle_scheduling) const { + const SchedulingOptions &options) const { double best_node_score = -1; auto best_node_id = scheduling::NodeID::Nil(); const Node *best_node = nullptr; @@ -213,14 +123,6 @@ std::pair BundleSchedulingPolicy::GetBestNode( // Score the nodes. 
for (const auto &[node_id, node] : candidate_nodes) { const auto &node_resources = node->GetLocalView(); - if (AllocationWillExceedMaxCpuFraction( - node_resources, - required_resources, - options.max_cpu_fraction_per_node, - available_cpus_before_bundle_scheduling.at(node_id))) { - continue; - } - double node_score = node_scorer_->Score(required_resources, node_resources); if (best_node_id.IsNil() || best_node_score < node_score) { best_node_id = node_id; @@ -240,15 +142,12 @@ SchedulingResult BundlePackSchedulingPolicy::Schedule( SchedulingOptions options) { RAY_CHECK(!resource_request_list.empty()); - auto candidate_nodes = SelectCandidateNodes(options.scheduling_context.get()); + auto candidate_nodes = SelectCandidateNodes(options.scheduling_context_.get()); if (candidate_nodes.empty()) { RAY_LOG(DEBUG) << "The candidate nodes is empty, return directly."; return SchedulingResult::Infeasible(); } - const auto available_cpus_before_bundle_scheduling = - GetAvailableCpusBeforeBundleScheduling(); - // First schedule scarce resources (such as GPU) and large capacity resources to improve // the scheduling success rate. auto sorted_result = SortRequiredResources(resource_request_list); @@ -266,10 +165,7 @@ SchedulingResult BundlePackSchedulingPolicy::Schedule( while (!required_resources_list_copy.empty()) { const auto &required_resources_index = required_resources_list_copy.front().first; const auto &required_resources = required_resources_list_copy.front().second; - auto best_node = GetBestNode(*required_resources, - candidate_nodes, - options, - available_cpus_before_bundle_scheduling); + auto best_node = GetBestNode(*required_resources, candidate_nodes, options); if (best_node.first.IsNil()) { // There is no node to meet the scheduling requirements. break; @@ -285,14 +181,8 @@ SchedulingResult BundlePackSchedulingPolicy::Schedule( // We try to schedule more resources on one node. 
for (auto iter = required_resources_list_copy.begin(); iter != required_resources_list_copy.end();) { - if (node_resources.IsAvailable(*iter->second) // If the node has enough resources. - && !AllocationWillExceedMaxCpuFraction( // and allocating resources won't - // exceed max cpu fraction. - node_resources, - *iter->second, - options.max_cpu_fraction_per_node, - available_cpus_before_bundle_scheduling.at(best_node.first))) { - // Then allocate it. + // If the node has sufficient resources, allocate it. + if (node_resources.IsAvailable(*iter->second)) { RAY_CHECK(cluster_resource_manager_.SubtractNodeAvailableResources( best_node.first, *iter->second)); result_nodes[iter->first] = best_node.first; @@ -329,15 +219,12 @@ SchedulingResult BundleSpreadSchedulingPolicy::Schedule( SchedulingOptions options) { RAY_CHECK(!resource_request_list.empty()); - auto candidate_nodes = SelectCandidateNodes(options.scheduling_context.get()); + auto candidate_nodes = SelectCandidateNodes(options.scheduling_context_.get()); if (candidate_nodes.empty()) { RAY_LOG(DEBUG) << "The candidate nodes is empty, return directly."; return SchedulingResult::Infeasible(); } - const auto available_cpus_before_bundle_scheduling = - GetAvailableCpusBeforeBundleScheduling(); - // First schedule scarce resources (such as GPU) and large capacity resources to improve // the scheduling success rate. auto sorted_result = SortRequiredResources(resource_request_list); @@ -348,10 +235,7 @@ SchedulingResult BundleSpreadSchedulingPolicy::Schedule( absl::flat_hash_map selected_nodes; for (const auto &resource_request : sorted_resource_request_list) { // Score and sort nodes. - auto best_node = GetBestNode(*resource_request, - candidate_nodes, - options, - available_cpus_before_bundle_scheduling); + auto best_node = GetBestNode(*resource_request, candidate_nodes, options); // There are nodes to meet the scheduling requirements. 
if (!best_node.first.IsNil()) { @@ -362,10 +246,7 @@ SchedulingResult BundleSpreadSchedulingPolicy::Schedule( selected_nodes.emplace(best_node); } else { // Scheduling from selected nodes. - best_node = GetBestNode(*resource_request, - selected_nodes, - options, - available_cpus_before_bundle_scheduling); + best_node = GetBestNode(*resource_request, selected_nodes, options); if (!best_node.first.IsNil()) { result_nodes.emplace_back(best_node.first); RAY_CHECK(cluster_resource_manager_.SubtractNodeAvailableResources( @@ -399,15 +280,12 @@ SchedulingResult BundleStrictPackSchedulingPolicy::Schedule( SchedulingOptions options) { RAY_CHECK(!resource_request_list.empty()); - auto candidate_nodes = SelectCandidateNodes(options.scheduling_context.get()); + auto candidate_nodes = SelectCandidateNodes(options.scheduling_context_.get()); if (candidate_nodes.empty()) { RAY_LOG(DEBUG) << "The candidate nodes is empty, return directly."; return SchedulingResult::Infeasible(); } - const auto available_cpus_before_bundle_scheduling = - GetAvailableCpusBeforeBundleScheduling(); - // Aggregate required resources. ResourceRequest aggregated_resource_request; for (const auto &resource_request : resource_request_list) { @@ -418,23 +296,13 @@ SchedulingResult BundleStrictPackSchedulingPolicy::Schedule( } } - const auto &right_node_it = std::find_if( - candidate_nodes.begin(), - candidate_nodes.end(), - [&aggregated_resource_request, &options, &available_cpus_before_bundle_scheduling]( - const auto &entry) { - const auto &node_resources = entry.second->GetLocalView(); - auto allocatable = - (node_resources.IsFeasible( - aggregated_resource_request) // If the resource is available - && !AllocationWillExceedMaxCpuFraction( // and allocating resources won't - // exceed max cpu fraction. 
- node_resources, - aggregated_resource_request, - options.max_cpu_fraction_per_node, - available_cpus_before_bundle_scheduling.at(entry.first))); - return allocatable; - }); + const auto &right_node_it = + std::find_if(candidate_nodes.begin(), + candidate_nodes.end(), + [&aggregated_resource_request](const auto &entry) { + const auto &node_resources = entry.second->GetLocalView(); + return node_resources.IsFeasible(aggregated_resource_request); + }); if (right_node_it == candidate_nodes.end()) { RAY_LOG(DEBUG) << "The required resource is bigger than the maximum resource in the " @@ -444,23 +312,19 @@ SchedulingResult BundleStrictPackSchedulingPolicy::Schedule( std::pair best_node(scheduling::NodeID::Nil(), nullptr); - if (!options.bundle_strict_pack_soft_target_node_id.IsNil()) { - if (candidate_nodes.contains(options.bundle_strict_pack_soft_target_node_id)) { + if (!options.bundle_strict_pack_soft_target_node_id_.IsNil()) { + if (candidate_nodes.contains(options.bundle_strict_pack_soft_target_node_id_)) { best_node = GetBestNode( aggregated_resource_request, absl::flat_hash_map{ - {options.bundle_strict_pack_soft_target_node_id, - candidate_nodes[options.bundle_strict_pack_soft_target_node_id]}}, - options, - available_cpus_before_bundle_scheduling); + {options.bundle_strict_pack_soft_target_node_id_, + candidate_nodes[options.bundle_strict_pack_soft_target_node_id_]}}, + options); } } if (best_node.first.IsNil()) { - best_node = GetBestNode(aggregated_resource_request, - candidate_nodes, - options, - available_cpus_before_bundle_scheduling); + best_node = GetBestNode(aggregated_resource_request, candidate_nodes, options); } // Select the node with the highest score. @@ -485,15 +349,12 @@ SchedulingResult BundleStrictSpreadSchedulingPolicy::Schedule( RAY_CHECK(!resource_request_list.empty()); // Filter candidate nodes. 
- auto candidate_nodes = SelectCandidateNodes(options.scheduling_context.get()); + auto candidate_nodes = SelectCandidateNodes(options.scheduling_context_.get()); if (candidate_nodes.empty()) { RAY_LOG(DEBUG) << "The candidate nodes is empty, return directly."; return SchedulingResult::Infeasible(); } - const auto available_cpus_before_bundle_scheduling = - GetAvailableCpusBeforeBundleScheduling(); - if (resource_request_list.size() > candidate_nodes.size()) { RAY_LOG(DEBUG) << "The number of required resources " << resource_request_list.size() << " is greater than the number of candidate nodes " @@ -510,10 +371,7 @@ SchedulingResult BundleStrictSpreadSchedulingPolicy::Schedule( std::vector result_nodes; for (const auto &resource_request : sorted_resource_request_list) { // Score and sort nodes. - auto best_node = GetBestNode(*resource_request, - candidate_nodes, - options, - available_cpus_before_bundle_scheduling); + auto best_node = GetBestNode(*resource_request, candidate_nodes, options); // There are nodes to meet the scheduling requirements. 
if (!best_node.first.IsNil()) { diff --git a/src/ray/raylet/scheduling/policy/bundle_scheduling_policy.h b/src/ray/raylet/scheduling/policy/bundle_scheduling_policy.h index 255a11957d70..4159b1a5c468 100644 --- a/src/ray/raylet/scheduling/policy/bundle_scheduling_policy.h +++ b/src/ray/raylet/scheduling/policy/bundle_scheduling_policy.h @@ -16,8 +16,6 @@ #include -#include "ray/common/bundle_spec.h" -#include "ray/common/scheduling/fixed_point.h" #include "ray/raylet/scheduling/cluster_resource_manager.h" #include "ray/raylet/scheduling/policy/scheduling_context.h" #include "ray/raylet/scheduling/policy/scheduling_policy.h" @@ -61,14 +59,7 @@ class BundleSchedulingPolicy : public IBundleSchedulingPolicy { std::pair GetBestNode( const ResourceRequest &required_resources, const absl::flat_hash_map &candidate_nodes, - const SchedulingOptions &options, - const absl::flat_hash_map - &available_cpus_before_bundle_scheduling) const; - - /// Return the map of node id -> available cpus before the current bundle scheduling. - /// It is used to calculate how many CPUs have been allocated for the current bundles. - const absl::flat_hash_map - GetAvailableCpusBeforeBundleScheduling() const; + const SchedulingOptions &options) const; protected: /// The cluster resource manager. 
diff --git a/src/ray/raylet/scheduling/policy/composite_scheduling_policy.cc b/src/ray/raylet/scheduling/policy/composite_scheduling_policy.cc index 1cb8a5677445..5afb5763cc5d 100644 --- a/src/ray/raylet/scheduling/policy/composite_scheduling_policy.cc +++ b/src/ray/raylet/scheduling/policy/composite_scheduling_policy.cc @@ -22,7 +22,7 @@ namespace raylet_scheduling_policy { scheduling::NodeID CompositeSchedulingPolicy::Schedule( const ResourceRequest &resource_request, SchedulingOptions options) { - switch (options.scheduling_type) { + switch (options.scheduling_type_) { case SchedulingType::SPREAD: return spread_policy_.Schedule(resource_request, options); case SchedulingType::RANDOM: @@ -38,7 +38,7 @@ scheduling::NodeID CompositeSchedulingPolicy::Schedule( default: RAY_LOG(FATAL) << "Unsupported scheduling type: " << static_cast::type>( - options.scheduling_type); + options.scheduling_type_); } UNREACHABLE; } @@ -46,7 +46,7 @@ scheduling::NodeID CompositeSchedulingPolicy::Schedule( SchedulingResult CompositeBundleSchedulingPolicy::Schedule( const std::vector &resource_request_list, SchedulingOptions options) { - switch (options.scheduling_type) { + switch (options.scheduling_type_) { case SchedulingType::BUNDLE_PACK: return bundle_pack_policy_.Schedule(resource_request_list, options); case SchedulingType::BUNDLE_SPREAD: @@ -58,7 +58,7 @@ SchedulingResult CompositeBundleSchedulingPolicy::Schedule( default: RAY_LOG(FATAL) << "Unsupported scheduling type: " << static_cast::type>( - options.scheduling_type); + options.scheduling_type_); } UNREACHABLE; } diff --git a/src/ray/raylet/scheduling/policy/composite_scheduling_policy.h b/src/ray/raylet/scheduling/policy/composite_scheduling_policy.h index d5cf66ae8be3..185a29521619 100644 --- a/src/ray/raylet/scheduling/policy/composite_scheduling_policy.h +++ b/src/ray/raylet/scheduling/policy/composite_scheduling_policy.h @@ -29,7 +29,7 @@ namespace ray { namespace raylet_scheduling_policy { /// A composite scheduling 
policy that routes the request to the underlining -/// scheduling_policy according to the scheduling_type. +/// scheduling_policy according to the scheduling_type_. class CompositeSchedulingPolicy : public ISchedulingPolicy { public: CompositeSchedulingPolicy(scheduling::NodeID local_node_id, @@ -64,7 +64,7 @@ class CompositeSchedulingPolicy : public ISchedulingPolicy { }; /// A composite scheduling policy that routes the request to the underlining -/// bundle_scheduling_policy according to the scheduling_type. +/// bundle_scheduling_policy according to the scheduling_type_. class CompositeBundleSchedulingPolicy : public IBundleSchedulingPolicy { public: explicit CompositeBundleSchedulingPolicy( diff --git a/src/ray/raylet/scheduling/policy/hybrid_scheduling_policy.cc b/src/ray/raylet/scheduling/policy/hybrid_scheduling_policy.cc index 6bf60d2a2d8a..1f82f2d6f153 100644 --- a/src/ray/raylet/scheduling/policy/hybrid_scheduling_policy.cc +++ b/src/ray/raylet/scheduling/policy/hybrid_scheduling_policy.cc @@ -182,28 +182,28 @@ scheduling::NodeID HybridSchedulingPolicy::ScheduleImpl( scheduling::NodeID HybridSchedulingPolicy::Schedule( const ResourceRequest &resource_request, SchedulingOptions options) { - RAY_CHECK(options.scheduling_type == SchedulingType::HYBRID) + RAY_CHECK(options.scheduling_type_ == SchedulingType::HYBRID) << "HybridPolicy policy requires type = HYBRID"; - if (!options.avoid_gpu_nodes || resource_request.Has(ResourceID::GPU())) { + if (!options.avoid_gpu_nodes_ || resource_request.Has(ResourceID::GPU())) { return ScheduleImpl(resource_request, - options.spread_threshold, - options.avoid_local_node, - options.require_node_available, + options.spread_threshold_, + options.avoid_local_node_, + options.require_node_available_, NodeFilter::kAny, - options.preferred_node_id, - options.schedule_top_k_absolute, - options.scheduler_top_k_fraction); + options.preferred_node_id_, + options.schedule_top_k_absolute_, + options.scheduler_top_k_fraction_); } // 
Try schedule on non-GPU nodes. auto best_node_id = ScheduleImpl(resource_request, - options.spread_threshold, - options.avoid_local_node, + options.spread_threshold_, + options.avoid_local_node_, /*require_node_available*/ true, NodeFilter::kNonGpu, - options.preferred_node_id, - options.schedule_top_k_absolute, - options.scheduler_top_k_fraction); + options.preferred_node_id_, + options.schedule_top_k_absolute_, + options.scheduler_top_k_fraction_); if (!best_node_id.IsNil()) { return best_node_id; } @@ -211,13 +211,13 @@ scheduling::NodeID HybridSchedulingPolicy::Schedule( // If we cannot find any available node from non-gpu nodes, fallback to the original // scheduling return ScheduleImpl(resource_request, - options.spread_threshold, - options.avoid_local_node, - options.require_node_available, + options.spread_threshold_, + options.avoid_local_node_, + options.require_node_available_, NodeFilter::kAny, - options.preferred_node_id, - options.schedule_top_k_absolute, - options.scheduler_top_k_fraction); + options.preferred_node_id_, + options.schedule_top_k_absolute_, + options.scheduler_top_k_fraction_); } } // namespace raylet_scheduling_policy diff --git a/src/ray/raylet/scheduling/policy/node_affinity_scheduling_policy.cc b/src/ray/raylet/scheduling/policy/node_affinity_scheduling_policy.cc index 737aa33a80f8..13e4dea53ed5 100644 --- a/src/ray/raylet/scheduling/policy/node_affinity_scheduling_policy.cc +++ b/src/ray/raylet/scheduling/policy/node_affinity_scheduling_policy.cc @@ -19,24 +19,24 @@ namespace raylet_scheduling_policy { scheduling::NodeID NodeAffinitySchedulingPolicy::Schedule( const ResourceRequest &resource_request, SchedulingOptions options) { - RAY_CHECK(options.scheduling_type == SchedulingType::NODE_AFFINITY); + RAY_CHECK(options.scheduling_type_ == SchedulingType::NODE_AFFINITY); - scheduling::NodeID target_node_id = scheduling::NodeID(options.node_affinity_node_id); + scheduling::NodeID target_node_id = 
scheduling::NodeID(options.node_affinity_node_id_); if (nodes_.contains(target_node_id) && is_node_alive_(target_node_id) && nodes_.at(target_node_id).GetLocalView().IsFeasible(resource_request)) { - if (!options.node_affinity_spill_on_unavailable && - !options.node_affinity_fail_on_unavailable) { + if (!options.node_affinity_spill_on_unavailable_ && + !options.node_affinity_fail_on_unavailable_) { return target_node_id; } else if (nodes_.at(target_node_id).GetLocalView().IsAvailable(resource_request)) { return target_node_id; } } - if (!options.node_affinity_soft) { + if (!options.node_affinity_soft_) { return scheduling::NodeID::Nil(); } - options.scheduling_type = SchedulingType::HYBRID; + options.scheduling_type_ = SchedulingType::HYBRID; return hybrid_policy_.Schedule(resource_request, options); } diff --git a/src/ray/raylet/scheduling/policy/node_label_scheduling_policy.cc b/src/ray/raylet/scheduling/policy/node_label_scheduling_policy.cc index c5393b464198..2bbd935a96dd 100644 --- a/src/ray/raylet/scheduling/policy/node_label_scheduling_policy.cc +++ b/src/ray/raylet/scheduling/policy/node_label_scheduling_policy.cc @@ -21,9 +21,9 @@ namespace raylet_scheduling_policy { scheduling::NodeID NodeLabelSchedulingPolicy::Schedule( const ResourceRequest &resource_request, SchedulingOptions options) { - RAY_CHECK(options.scheduling_type == SchedulingType::NODE_LABEL); + RAY_CHECK(options.scheduling_type_ == SchedulingType::NODE_LABEL); auto context = - dynamic_cast(options.scheduling_context.get()); + dynamic_cast(options.scheduling_context_.get()); const auto &scheduling_strategy = context->GetSchedulingStrategy(); RAY_CHECK(scheduling_strategy.has_node_label_scheduling_strategy()); const auto &node_label_scheduling_strategy = diff --git a/src/ray/raylet/scheduling/policy/random_scheduling_policy.cc b/src/ray/raylet/scheduling/policy/random_scheduling_policy.cc index 423aad73ca9c..f48a0c9c5bf1 100644 --- 
a/src/ray/raylet/scheduling/policy/random_scheduling_policy.cc +++ b/src/ray/raylet/scheduling/policy/random_scheduling_policy.cc @@ -22,15 +22,15 @@ namespace raylet_scheduling_policy { scheduling::NodeID RandomSchedulingPolicy::Schedule( const ResourceRequest &resource_request, SchedulingOptions options) { - RAY_CHECK(options.scheduling_type == SchedulingType::RANDOM) + RAY_CHECK(options.scheduling_type_ == SchedulingType::RANDOM) << "HybridPolicy policy requires type = RANDOM"; scheduling::NodeID best_node = scheduling::NodeID::Nil(); if (nodes_.empty()) { return best_node; } - RAY_CHECK(options.spread_threshold == 0 && !options.avoid_local_node && - options.require_node_available && !options.avoid_gpu_nodes) + RAY_CHECK(options.spread_threshold_ == 0 && !options.avoid_local_node_ && + options.require_node_available_ && !options.avoid_gpu_nodes_) << "Random policy requires spread_threshold = 0, " << "avoid_local_node = false, " << "require_node_available = true, " diff --git a/src/ray/raylet/scheduling/policy/scheduling_context.h b/src/ray/raylet/scheduling/policy/scheduling_context.h index 7dc71956d018..5d98a387cded 100644 --- a/src/ray/raylet/scheduling/policy/scheduling_context.h +++ b/src/ray/raylet/scheduling/policy/scheduling_context.h @@ -14,9 +14,6 @@ #pragma once -#include "absl/container/flat_hash_map.h" -#include "ray/common/bundle_location_index.h" -#include "ray/common/bundle_spec.h" #include "ray/common/id.h" #include "ray/common/placement_group.h" diff --git a/src/ray/raylet/scheduling/policy/scheduling_options.h b/src/ray/raylet/scheduling/policy/scheduling_options.h index 6a44cec601e4..b8f8804e3be2 100644 --- a/src/ray/raylet/scheduling/policy/scheduling_options.h +++ b/src/ray/raylet/scheduling/policy/scheduling_options.h @@ -68,7 +68,6 @@ struct SchedulingOptions { avoid_local_node, require_node_available, RayConfig::instance().scheduler_avoid_gpu_nodes(), - /*max_cpu_fraction_per_node*/ 1.0, /*scheduling_context*/ nullptr, preferred_node_id); 
} @@ -87,11 +86,11 @@ struct SchedulingOptions { } SchedulingOptions scheduling_options = Hybrid(avoid_local_node, require_node_available); - scheduling_options.scheduling_type = SchedulingType::NODE_AFFINITY; - scheduling_options.node_affinity_node_id = node_id; - scheduling_options.node_affinity_soft = soft; - scheduling_options.node_affinity_spill_on_unavailable = spill_on_unavailable; - scheduling_options.node_affinity_fail_on_unavailable = fail_on_unavailable; + scheduling_options.scheduling_type_ = SchedulingType::NODE_AFFINITY; + scheduling_options.node_affinity_node_id_ = node_id; + scheduling_options.node_affinity_soft_ = soft; + scheduling_options.node_affinity_spill_on_unavailable_ = spill_on_unavailable; + scheduling_options.node_affinity_fail_on_unavailable_ = fail_on_unavailable; return scheduling_options; } @@ -105,7 +104,6 @@ struct SchedulingOptions { /*avoid_local_node*/ false, /*require_node_available*/ true, /*avoid_gpu_nodes*/ RayConfig::instance().scheduler_avoid_gpu_nodes(), - /*max_cpu_fraction_per_node*/ 0, std::move(scheduling_context)); } @@ -119,7 +117,6 @@ struct SchedulingOptions { /*avoid_local_node*/ false, /*require_node_available*/ true, /*avoid_gpu_nodes*/ RayConfig::instance().scheduler_avoid_gpu_nodes(), - /*max_cpu_fraction_per_node*/ 0, std::move(scheduling_context)); } /* @@ -127,79 +124,67 @@ struct SchedulingOptions { */ // construct option for soft pack scheduling policy. - static SchedulingOptions BundlePack(double max_cpu_fraction_per_node = 1.0) { + static SchedulingOptions BundlePack() { return SchedulingOptions(SchedulingType::BUNDLE_PACK, /*spread_threshold*/ 0, /*avoid_local_node*/ false, /*require_node_available*/ true, - /*avoid_gpu_nodes*/ false, - /*max_cpu_fraction_per_node*/ max_cpu_fraction_per_node); + /*avoid_gpu_nodes*/ false); } // construct option for strict spread scheduling policy. 
- static SchedulingOptions BundleSpread(double max_cpu_fraction_per_node = 1.0) { + static SchedulingOptions BundleSpread() { return SchedulingOptions(SchedulingType::BUNDLE_SPREAD, /*spread_threshold*/ 0, /*avoid_local_node*/ false, /*require_node_available*/ true, - /*avoid_gpu_nodes*/ false, - /*max_cpu_fraction_per_node*/ max_cpu_fraction_per_node); + /*avoid_gpu_nodes*/ false); } // construct option for strict pack scheduling policy. static SchedulingOptions BundleStrictPack( - double max_cpu_fraction_per_node = 1.0, scheduling::NodeID soft_target_node_id = scheduling::NodeID::Nil()) { SchedulingOptions scheduling_options = SchedulingOptions(SchedulingType::BUNDLE_STRICT_PACK, /*spread_threshold*/ 0, /*avoid_local_node*/ false, /*require_node_available*/ true, - /*avoid_gpu_nodes*/ false, - /*max_cpu_fraction_per_node*/ max_cpu_fraction_per_node); - scheduling_options.bundle_strict_pack_soft_target_node_id = soft_target_node_id; + /*avoid_gpu_nodes*/ false); + scheduling_options.bundle_strict_pack_soft_target_node_id_ = soft_target_node_id; return scheduling_options; } // construct option for strict spread scheduling policy. static SchedulingOptions BundleStrictSpread( - double max_cpu_fraction_per_node = 1.0, std::unique_ptr scheduling_context = nullptr) { return SchedulingOptions(SchedulingType::BUNDLE_STRICT_SPREAD, /*spread_threshold*/ 0, /*avoid_local_node*/ false, /*require_node_available*/ true, /*avoid_gpu_nodes*/ false, - /*max_cpu_fraction_per_node*/ max_cpu_fraction_per_node, /*scheduling_context*/ std::move(scheduling_context)); } - SchedulingType scheduling_type; - float spread_threshold; - bool avoid_local_node; - bool require_node_available; - bool avoid_gpu_nodes; - // Maximum reservable CPU fraction per node. It is applied across multiple - // bundles, individually. 
E.g., when you have 2 bundles {CPU: 4} from 2 different - // scheduilng request, and there's one node with {CPU: 8}, only 1 bundle from 1 request - // can be scheduled on this node. This is only used for bundle scheduling policies - // (bundle pack, spread). - double max_cpu_fraction_per_node; + SchedulingType scheduling_type_; + float spread_threshold_; + bool avoid_local_node_; + bool require_node_available_; + bool avoid_gpu_nodes_; // ID of the target node where bundles should be placed // iff the target node has enough available resources. // Otherwise, the bundles can be placed elsewhere. // This is only used by PG STRICT_PACK scheduling. - scheduling::NodeID bundle_strict_pack_soft_target_node_id = scheduling::NodeID::Nil(); - std::shared_ptr scheduling_context; - std::string node_affinity_node_id; - bool node_affinity_soft = false; - bool node_affinity_spill_on_unavailable = false; - bool node_affinity_fail_on_unavailable = false; + scheduling::NodeID bundle_strict_pack_soft_target_node_id_ = scheduling::NodeID::Nil(); + std::shared_ptr scheduling_context_; + std::string node_affinity_node_id_; + bool node_affinity_soft_ = false; + bool node_affinity_spill_on_unavailable_ = false; + bool node_affinity_fail_on_unavailable_ = false; // The node where the task is preferred to be placed. By default, this node id // is empty, which means no preferred node. 
- std::string preferred_node_id; - int32_t schedule_top_k_absolute; - float scheduler_top_k_fraction; + std::string preferred_node_id_; + int32_t schedule_top_k_absolute_; + float scheduler_top_k_fraction_; private: SchedulingOptions( @@ -208,21 +193,19 @@ struct SchedulingOptions { bool avoid_local_node, bool require_node_available, bool avoid_gpu_nodes, - double max_cpu_fraction_per_node = 1.0, std::shared_ptr scheduling_context = nullptr, const std::string &preferred_node_id = std::string(), int32_t schedule_top_k_absolute = RayConfig::instance().scheduler_top_k_absolute(), float scheduler_top_k_fraction = RayConfig::instance().scheduler_top_k_fraction()) - : scheduling_type(type), - spread_threshold(spread_threshold), - avoid_local_node(avoid_local_node), - require_node_available(require_node_available), - avoid_gpu_nodes(avoid_gpu_nodes), - max_cpu_fraction_per_node(max_cpu_fraction_per_node), - scheduling_context(std::move(scheduling_context)), - preferred_node_id(preferred_node_id), - schedule_top_k_absolute(schedule_top_k_absolute), - scheduler_top_k_fraction(scheduler_top_k_fraction) {} + : scheduling_type_(type), + spread_threshold_(spread_threshold), + avoid_local_node_(avoid_local_node), + require_node_available_(require_node_available), + avoid_gpu_nodes_(avoid_gpu_nodes), + scheduling_context_(std::move(scheduling_context)), + preferred_node_id_(preferred_node_id), + schedule_top_k_absolute_(schedule_top_k_absolute), + scheduler_top_k_fraction_(scheduler_top_k_fraction) {} friend class ::ray::raylet::SchedulingPolicyTest; friend class HybridSchedulingPolicyTest; diff --git a/src/ray/raylet/scheduling/policy/scorer.cc b/src/ray/raylet/scheduling/policy/scorer.cc index b8c67f3d920d..c53812b0abc2 100644 --- a/src/ray/raylet/scheduling/policy/scorer.cc +++ b/src/ray/raylet/scheduling/policy/scorer.cc @@ -14,13 +14,17 @@ #include "ray/raylet/scheduling/policy/scorer.h" -#include - namespace ray { namespace raylet_scheduling_policy { double 
LeastResourceScorer::Score(const ResourceRequest &required_resources, const NodeResources &node_resources) { + // Check if the node has required labels before scoring on the resources. + const auto &label_selector = required_resources.GetLabelSelector(); + if (!node_resources.HasRequiredLabels(label_selector)) { + return -1.; + } + // In GCS-based actor scheduling, the `NodeResources` are only acquired or released by // actor scheduling, instead of being updated by resource reports from raylets. So we // have to subtract normal task resources (if exist) from the current available diff --git a/src/ray/raylet/scheduling/policy/scorer.h b/src/ray/raylet/scheduling/policy/scorer.h index cfc22a040958..e2bd1cfb2c72 100644 --- a/src/ray/raylet/scheduling/policy/scorer.h +++ b/src/ray/raylet/scheduling/policy/scorer.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once -#include #include "ray/common/scheduling/cluster_resource_data.h" diff --git a/src/ray/raylet/scheduling/policy/spread_scheduling_policy.cc b/src/ray/raylet/scheduling/policy/spread_scheduling_policy.cc index 076d1845dcb6..1d53494ddbcc 100644 --- a/src/ray/raylet/scheduling/policy/spread_scheduling_policy.cc +++ b/src/ray/raylet/scheduling/policy/spread_scheduling_policy.cc @@ -24,8 +24,8 @@ namespace raylet_scheduling_policy { scheduling::NodeID SpreadSchedulingPolicy::Schedule( const ResourceRequest &resource_request, SchedulingOptions options) { - RAY_CHECK(options.spread_threshold == 0 && - options.scheduling_type == SchedulingType::SPREAD) + RAY_CHECK(options.spread_threshold_ == 0 && + options.scheduling_type_ == SchedulingType::SPREAD) << "SpreadPolicy policy requires spread_threshold = 0 and type = SPREAD"; std::vector round; round.reserve(nodes_.size()); @@ -37,13 +37,13 @@ scheduling::NodeID SpreadSchedulingPolicy::Schedule( // Spread among available nodes first. // If there is no available nodes, we spread among feasible nodes. 
for (bool available_nodes_only : - (options.require_node_available ? std::vector{true} - : std::vector{true, false})) { + (options.require_node_available_ ? std::vector{true} + : std::vector{true, false})) { size_t round_index = spread_scheduling_next_index_; for (size_t i = 0; i < round.size(); ++i, ++round_index) { const auto &node_id = round[round_index % round.size()]; const auto &node = map_find_or_die(nodes_, node_id); - if (node_id == local_node_id_ && options.avoid_local_node) { + if (node_id == local_node_id_ && options.avoid_local_node_) { continue; } if (!is_node_alive_(node_id) || !node.GetLocalView().IsFeasible(resource_request)) { diff --git a/src/ray/raylet/scheduling/policy/tests/BUILD.bazel b/src/ray/raylet/scheduling/policy/tests/BUILD.bazel new file mode 100644 index 000000000000..a9ee6d460cd0 --- /dev/null +++ b/src/ray/raylet/scheduling/policy/tests/BUILD.bazel @@ -0,0 +1,30 @@ +load("//bazel:ray.bzl", "ray_cc_test") + +ray_cc_test( + name = "scheduling_policy_test", + size = "small", + srcs = [ + "scheduling_policy_test.cc", + ], + tags = ["team:core"], + deps = [ + "//src/ray/raylet/scheduling:composite_scheduling_policy", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "hybrid_scheduling_policy_test", + size = "small", + srcs = [ + "hybrid_scheduling_policy_test.cc", + ], + tags = ["team:core"], + deps = [ + "//src/ray/raylet/scheduling:composite_scheduling_policy", + "//src/ray/raylet/scheduling:hybrid_scheduling_policy", + "@com_google_absl//absl/random:mock_distributions", + "@com_google_absl//absl/random:mocking_bit_gen", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/src/ray/raylet/scheduling/policy/hybrid_scheduling_policy_test.cc b/src/ray/raylet/scheduling/policy/tests/hybrid_scheduling_policy_test.cc similarity index 98% rename from src/ray/raylet/scheduling/policy/hybrid_scheduling_policy_test.cc rename to src/ray/raylet/scheduling/policy/tests/hybrid_scheduling_policy_test.cc index 
786fc52aac61..f0a0042ae3ac 100644 --- a/src/ray/raylet/scheduling/policy/hybrid_scheduling_policy_test.cc +++ b/src/ray/raylet/scheduling/policy/tests/hybrid_scheduling_policy_test.cc @@ -62,7 +62,6 @@ class HybridSchedulingPolicyTest : public ::testing::Test { avoid_local_node, require_node_available, avoid_gpu_nodes, - /*max_cpu_fraction_per_node*/ 1.0, /*scheduling_context*/ nullptr, /*preferred_node*/ "", schedule_top_k_absolute, diff --git a/src/ray/raylet/scheduling/policy/scheduling_policy_test.cc b/src/ray/raylet/scheduling/policy/tests/scheduling_policy_test.cc similarity index 80% rename from src/ray/raylet/scheduling/policy/scheduling_policy_test.cc rename to src/ray/raylet/scheduling/policy/tests/scheduling_policy_test.cc index 4cce097edd8f..5d20b6e29c4c 100644 --- a/src/ray/raylet/scheduling/policy/scheduling_policy_test.cc +++ b/src/ray/raylet/scheduling/policy/tests/scheduling_policy_test.cc @@ -59,7 +59,6 @@ class SchedulingPolicyTest : public ::testing::Test { avoid_local_node, require_node_available, avoid_gpu_nodes, - /*max_cpu_fraction_per_node*/ 1.0, /*scheduling_context*/ nullptr, /*preferred node*/ "", schedule_top_k_absolute, @@ -524,8 +523,7 @@ TEST_F(SchedulingPolicyTest, StrictPackBundleSchedulingTest) { req_list.push_back(&req); // No target node. - auto strict_pack_op = SchedulingOptions::BundleStrictPack( - /*max_cpu_fraction_per_node*/ 1.0, scheduling::NodeID::Nil()); + auto strict_pack_op = SchedulingOptions::BundleStrictPack(scheduling::NodeID::Nil()); auto to_schedule = raylet_scheduling_policy::BundleStrictPackSchedulingPolicy( *cluster_resource_manager, [](auto) { return true; }) .Schedule(req_list, strict_pack_op); @@ -533,8 +531,7 @@ TEST_F(SchedulingPolicyTest, StrictPackBundleSchedulingTest) { ASSERT_EQ(to_schedule.selected_nodes[0], local_node); // Target node has enough available resources. 
- strict_pack_op = SchedulingOptions::BundleStrictPack(/*max_cpu_fraction_per_node*/ 1.0, - remote_node_2); + strict_pack_op = SchedulingOptions::BundleStrictPack(remote_node_2); to_schedule = raylet_scheduling_policy::BundleStrictPackSchedulingPolicy( *cluster_resource_manager, [](auto) { return true; }) .Schedule(req_list, strict_pack_op); @@ -542,8 +539,7 @@ TEST_F(SchedulingPolicyTest, StrictPackBundleSchedulingTest) { ASSERT_EQ(to_schedule.selected_nodes[0], remote_node_2); // Target node doesn't have enough available resources. - strict_pack_op = - SchedulingOptions::BundleStrictPack(/*max_cpu_fraction_per_node*/ 1.0, remote_node); + strict_pack_op = SchedulingOptions::BundleStrictPack(remote_node); to_schedule = raylet_scheduling_policy::BundleStrictPackSchedulingPolicy( *cluster_resource_manager, [](auto) { return true; }) .Schedule(req_list, strict_pack_op); @@ -551,8 +547,7 @@ TEST_F(SchedulingPolicyTest, StrictPackBundleSchedulingTest) { ASSERT_EQ(to_schedule.selected_nodes[0], local_node); // Target node doesn't exist. - strict_pack_op = SchedulingOptions::BundleStrictPack(/*max_cpu_fraction_per_node*/ 1.0, - scheduling::NodeID(888)); + strict_pack_op = SchedulingOptions::BundleStrictPack(scheduling::NodeID(888)); to_schedule = raylet_scheduling_policy::BundleStrictPackSchedulingPolicy( *cluster_resource_manager, [](auto) { return true; }) .Schedule(req_list, strict_pack_op); @@ -560,125 +555,6 @@ TEST_F(SchedulingPolicyTest, StrictPackBundleSchedulingTest) { ASSERT_EQ(to_schedule.selected_nodes[0], local_node); } -TEST_F(SchedulingPolicyTest, BundleSchedulingMaxFractionTest) { - /* - * Test the bundle scheduling policy respects the max fraction request. 
- */ - - ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 2}, {"GPU", 1}}, false); - std::vector req_list; - req_list.push_back(&req); - req_list.push_back(&req); - auto pack_op = SchedulingOptions::BundlePack(/*max_cpu_fraction_per_node*/ 0.5); - auto strict_pack_op = - SchedulingOptions::BundleStrictPack(/*max_cpu_fraction_per_node*/ 0.5); - auto spread_op = SchedulingOptions::BundleSpread(/*max_cpu_fraction_per_node*/ 0.5); - auto strict_spread_op = - SchedulingOptions::BundleStrictSpread(/*max_cpu_fraction_per_node*/ 0.5); - - nodes.emplace(local_node, CreateNodeResources(7, 7, 0, 0, 2, 2)); - - auto cluster_resource_manager = MockClusterResourceManager(nodes); - // req is unscheduleable because the max cpu fraction reaches 0.5. - auto unscheduable = raylet_scheduling_policy::BundlePackSchedulingPolicy( - *cluster_resource_manager, [](auto) { return true; }) - .Schedule(req_list, pack_op); - ASSERT_TRUE(unscheduable.status.IsFailed()); - - unscheduable = raylet_scheduling_policy::BundleSpreadSchedulingPolicy( - *cluster_resource_manager, [](auto) { return true; }) - .Schedule(req_list, spread_op); - ASSERT_TRUE(unscheduable.status.IsFailed()); - - unscheduable = raylet_scheduling_policy::BundleStrictPackSchedulingPolicy( - *cluster_resource_manager, [](auto) { return true; }) - .Schedule(req_list, strict_pack_op); - ASSERT_TRUE(unscheduable.status.IsInfeasible()); - - unscheduable = raylet_scheduling_policy::BundleStrictSpreadSchedulingPolicy( - *cluster_resource_manager, [](auto) { return true; }) - .Schedule(req_list, strict_spread_op); - ASSERT_TRUE(unscheduable.status.IsInfeasible()); -} - -TEST_F(SchedulingPolicyTest, BundleSchedulingMaxFractionOneCpuReservationGuaranteeTest) { - /* - * Test that when the max cpu fraction is provided, it reserves at least 1 CPU. 
- */ - - ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}}, false); - std::vector req_list; - req_list.push_back(&req); - - // NOTE: We can only reserve up to 0.4 CPU, but it will round up to 1, - // which means the placement group is schedulable. - auto pack_op = SchedulingOptions::BundlePack(/*max_cpu_fraction_per_node*/ 0.1); - nodes.emplace(local_node, CreateNodeResources(4, 4, 0, 0, 0, 0)); - - auto cluster_resource_manager = MockClusterResourceManager(nodes); - // req is unscheduleable because the max cpu fraction reaches 0.5. - auto to_schedule = raylet_scheduling_policy::BundlePackSchedulingPolicy( - *cluster_resource_manager, [](auto) { return true; }) - .Schedule(req_list, pack_op); - ASSERT_TRUE(to_schedule.status.IsSuccess()); -} - -TEST_F(SchedulingPolicyTest, - BundleSchedulingMinFractionExcludeOneCpuReservationGuaranteeTest) { - /* - * Test that when the max cpu fraction is high, it excludes at least 1 CPU. - */ - - ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 3}}, false); - std::vector req_list; - req_list.push_back(&req); - - // NOTE: We can reserve up to 3.96 CPU, but it will round down to 3 (exclude 1 CPU), - // which means a regular task with 1 CPU can be scheduled. - auto pack_op = SchedulingOptions::BundlePack(/*max_cpu_fraction_per_node*/ 0.99); - nodes.emplace(local_node, CreateNodeResources(4, 4, 0, 0, 0, 0)); - - auto cluster_resource_manager = MockClusterResourceManager(nodes); - // req is unscheduleable because the max cpu fraction reaches 0.5. 
- auto to_schedule = raylet_scheduling_policy::BundlePackSchedulingPolicy( - *cluster_resource_manager, [](auto) { return true; }) - .Schedule(req_list, pack_op); - ASSERT_TRUE(to_schedule.status.IsSuccess()); - - req = ResourceMapToResourceRequest({{"CPU", 1}}, false); - - auto to_schedule_task = - raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, *cluster_resource_manager, [](auto) { return true; }) - .Schedule(req, HybridOptions(0.50, false, false)); - ASSERT_TRUE(!to_schedule_task.IsNil()); -} - -TEST_F(SchedulingPolicyTest, BundleSchedulingMaxFractionWorkingWhenNormalResourceUsed) { - /* - * Test that it can schedule placement group correctly when there are non-pg - * resources occupying resources. - */ - - ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}}, false); - std::vector req_list; - req_list.push_back(&req); - - // 2 CPUs / 4 CPUs is used by a regular task/actor. - // It means that when the fraction is 0.5, we still should - // be able to schedule a pg because 50% of CPUs still can be - // used for the placement group. - auto pack_op = SchedulingOptions::BundlePack(/*max_cpu_fraction_per_node*/ 0.5); - nodes.emplace(local_node, CreateNodeResources(2, 4, 0, 0, 0, 0)); - - auto cluster_resource_manager = MockClusterResourceManager(nodes); - // req is unscheduleable because the max cpu fraction reaches 0.5. 
- auto to_schedule = raylet_scheduling_policy::BundlePackSchedulingPolicy( - *cluster_resource_manager, [](auto) { return true; }) - .Schedule(req_list, pack_op); - ASSERT_TRUE(to_schedule.status.IsSuccess()); -} - int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/src/ray/raylet/scheduling/scheduler_resource_reporter.cc b/src/ray/raylet/scheduling/scheduler_resource_reporter.cc index 0523851724a0..a728bd69e214 100644 --- a/src/ray/raylet/scheduling/scheduler_resource_reporter.cc +++ b/src/ray/raylet/scheduling/scheduler_resource_reporter.cc @@ -22,23 +22,25 @@ #include #include +#include "ray/common/ray_config.h" + namespace ray { namespace raylet { SchedulerResourceReporter::SchedulerResourceReporter( const absl::flat_hash_map>> - &tasks_to_schedule, + &leases_to_schedule, const absl::flat_hash_map>> - &infeasible_tasks, - const ILocalTaskManager &local_task_manager) + &infeasible_leases, + const LocalLeaseManagerInterface &local_lease_manager) : max_resource_shapes_per_load_report_( RayConfig::instance().max_resource_shapes_per_load_report()), - tasks_to_schedule_(tasks_to_schedule), - tasks_to_dispatch_(local_task_manager.GetTaskToDispatch()), - infeasible_tasks_(infeasible_tasks), - backlog_tracker_(local_task_manager.GetBackLogTracker()) {} + leases_to_schedule_(leases_to_schedule), + leases_to_grant_(local_lease_manager.GetLeasesToGrant()), + infeasible_leases_(infeasible_leases), + backlog_tracker_(local_lease_manager.GetBackLogTracker()) {} int64_t SchedulerResourceReporter::TotalBacklogSize( SchedulingClass scheduling_class) const { @@ -78,7 +80,7 @@ void SchedulerResourceReporter::FillResourceUsage(rpc::ResourcesData &data) cons } const auto &scheduling_class_descriptor = - TaskSpecification::GetSchedulingClassDescriptor(scheduling_class); + SchedulingClassToIds::GetSchedulingClassDescriptor(scheduling_class); if ((scheduling_class_descriptor.scheduling_strategy.scheduling_strategy_case() == 
rpc::SchedulingStrategy::SchedulingStrategyCase:: kNodeAffinitySchedulingStrategy) && @@ -133,22 +135,23 @@ void SchedulerResourceReporter::FillResourceUsage(rpc::ResourcesData &data) cons }; fill_resource_usage_helper( - tasks_to_schedule_ | boost::adaptors::transformed(transform_func), false); - auto tasks_to_dispatch_range = - tasks_to_dispatch_ | boost::adaptors::transformed([](const auto &pair) { + leases_to_schedule_ | boost::adaptors::transformed(transform_func), false); + auto leases_to_grant_range = + leases_to_grant_ | boost::adaptors::transformed([](const auto &pair) { auto cnt = pair.second.size(); - // We should only report dispatching tasks that do not have resources allocated. - for (const auto &task : pair.second) { - if (task->allocated_instances) { + // We should only report leases to be granted that do not have resources + // allocated. + for (const auto &lease : pair.second) { + if (lease->allocated_instances_) { cnt--; } } return std::make_pair(pair.first, cnt); }); - fill_resource_usage_helper(tasks_to_dispatch_range, false); + fill_resource_usage_helper(leases_to_grant_range, false); fill_resource_usage_helper( - infeasible_tasks_ | boost::adaptors::transformed(transform_func), true); + infeasible_leases_ | boost::adaptors::transformed(transform_func), true); auto backlog_tracker_range = backlog_tracker_ | boost::adaptors::transformed([](const auto &pair) { return std::make_pair(pair.first, 0); @@ -169,10 +172,10 @@ void SchedulerResourceReporter::FillResourceUsage(rpc::ResourcesData &data) cons void SchedulerResourceReporter::FillPendingActorCountByShape( rpc::ResourcesData &data) const { absl::flat_hash_map> pending_count_by_shape; - for (const auto &[scheduling_class, queue] : infeasible_tasks_) { + for (const auto &[scheduling_class, queue] : infeasible_leases_) { pending_count_by_shape[scheduling_class].first = queue.size(); } - for (const auto &[scheduling_class, queue] : tasks_to_schedule_) { + for (const auto &[scheduling_class, 
queue] : leases_to_schedule_) { pending_count_by_shape[scheduling_class].second = queue.size(); } @@ -183,7 +186,7 @@ void SchedulerResourceReporter::FillPendingActorCountByShape( for (const auto &shape_entry : pending_count_by_shape) { auto by_shape_entry = resource_load_by_shape->Add(); for (const auto &resource_entry : - TaskSpecification::GetSchedulingClassDescriptor(shape_entry.first) + SchedulingClassToIds::GetSchedulingClassDescriptor(shape_entry.first) .resource_set.GetResourceMap()) { (*by_shape_entry->mutable_shape())[resource_entry.first] = resource_entry.second; } diff --git a/src/ray/raylet/scheduling/scheduler_resource_reporter.h b/src/ray/raylet/scheduling/scheduler_resource_reporter.h index 29c3f9818910..5bc12c5d4139 100644 --- a/src/ray/raylet/scheduling/scheduler_resource_reporter.h +++ b/src/ray/raylet/scheduling/scheduler_resource_reporter.h @@ -17,10 +17,8 @@ #include #include "absl/container/flat_hash_map.h" -#include "ray/common/ray_config.h" -#include "ray/common/task/task_spec.h" #include "ray/raylet/scheduling/internal.h" -#include "ray/raylet/scheduling/local_task_manager_interface.h" +#include "ray/raylet/scheduling/local_lease_manager_interface.h" namespace ray { namespace raylet { @@ -31,11 +29,11 @@ class SchedulerResourceReporter { SchedulerResourceReporter( const absl::flat_hash_map>> - &tasks_to_schedule, + &leases_to_schedule, const absl::flat_hash_map>> - &infeasible_tasks, - const ILocalTaskManager &local_task_manager); + &infeasible_leases, + const LocalLeaseManagerInterface &local_lease_manager); /// Populate the relevant parts of the heartbeat table. This is intended for /// sending resource usage of raylet to gcs. 
In particular, this should fill in @@ -56,13 +54,13 @@ class SchedulerResourceReporter { const int64_t max_resource_shapes_per_load_report_; const absl::flat_hash_map>> - &tasks_to_schedule_; + &leases_to_schedule_; const absl::flat_hash_map>> - &tasks_to_dispatch_; + &leases_to_grant_; const absl::flat_hash_map>> - &infeasible_tasks_; + &infeasible_leases_; const absl::flat_hash_map> &backlog_tracker_; diff --git a/src/ray/raylet/scheduling/scheduler_stats.cc b/src/ray/raylet/scheduling/scheduler_stats.cc index c80a44fa8b2b..0534c80dafd6 100644 --- a/src/ray/raylet/scheduling/scheduler_stats.cc +++ b/src/ray/raylet/scheduling/scheduler_stats.cc @@ -18,16 +18,16 @@ #include #include -#include "ray/raylet/scheduling/cluster_task_manager.h" +#include "ray/raylet/scheduling/cluster_lease_manager.h" #include "ray/stats/metric_defs.h" namespace ray { namespace raylet { -SchedulerStats::SchedulerStats(const ClusterTaskManager &cluster_task_manager, - const ILocalTaskManager &local_task_manager) - : cluster_task_manager_(cluster_task_manager), - local_task_manager_(local_task_manager) {} +SchedulerStats::SchedulerStats(const ClusterLeaseManager &cluster_lease_manager, + const LocalLeaseManagerInterface &local_lease_manager) + : cluster_lease_manager_(cluster_lease_manager), + local_lease_manager_(local_lease_manager) {} void SchedulerStats::ComputeStats() { auto accumulator = @@ -41,11 +41,11 @@ void SchedulerStats::ComputeStats() { size_t num_worker_not_started_by_job_config_not_exist = 0; size_t num_worker_not_started_by_registration_timeout = 0; size_t num_tasks_waiting_for_workers = 0; - size_t num_cancelled_tasks = 0; + size_t num_cancelled_leases = 0; - size_t num_infeasible_tasks = - std::accumulate(cluster_task_manager_.infeasible_tasks_.begin(), - cluster_task_manager_.infeasible_tasks_.end(), + size_t num_infeasible_leases = + std::accumulate(cluster_lease_manager_.infeasible_leases_.begin(), + cluster_lease_manager_.infeasible_leases_.end(), static_cast(0), 
accumulator); @@ -58,7 +58,7 @@ void SchedulerStats::ComputeStats() { &num_worker_not_started_by_job_config_not_exist, &num_worker_not_started_by_registration_timeout, &num_tasks_waiting_for_workers, - &num_cancelled_tasks]( + &num_cancelled_leases]( size_t state, const std::pair< int, @@ -70,7 +70,7 @@ void SchedulerStats::ComputeStats() { if (work->GetState() == internal::WorkStatus::WAITING_FOR_WORKER) { num_tasks_waiting_for_workers += 1; } else if (work->GetState() == internal::WorkStatus::CANCELLED) { - num_cancelled_tasks += 1; + num_cancelled_leases += 1; } else if (work->GetUnscheduledCause() == internal::UnscheduledWorkCause::WAITING_FOR_RESOURCE_ACQUISITION) { num_waiting_for_resource += 1; @@ -90,14 +90,14 @@ void SchedulerStats::ComputeStats() { } return state + pair.second.size(); }; - size_t num_tasks_to_schedule = - std::accumulate(cluster_task_manager_.tasks_to_schedule_.begin(), - cluster_task_manager_.tasks_to_schedule_.end(), + size_t num_leases_to_schedule = + std::accumulate(cluster_lease_manager_.leases_to_schedule_.begin(), + cluster_lease_manager_.leases_to_schedule_.end(), static_cast(0), per_work_accumulator); - size_t num_tasks_to_dispatch = - std::accumulate(local_task_manager_.GetTaskToDispatch().begin(), - local_task_manager_.GetTaskToDispatch().end(), + size_t num_leases_to_grant = + std::accumulate(local_lease_manager_.GetLeasesToGrant().begin(), + local_lease_manager_.GetLeasesToGrant().end(), static_cast(0), per_work_accumulator); @@ -110,21 +110,21 @@ void SchedulerStats::ComputeStats() { num_worker_not_started_by_registration_timeout_ = num_worker_not_started_by_registration_timeout; num_tasks_waiting_for_workers_ = num_tasks_waiting_for_workers; - num_cancelled_tasks_ = num_cancelled_tasks; - num_infeasible_tasks_ = num_infeasible_tasks; - num_tasks_to_schedule_ = num_tasks_to_schedule; - num_tasks_to_dispatch_ = num_tasks_to_dispatch; + num_cancelled_leases_ = num_cancelled_leases; + num_infeasible_leases_ = 
num_infeasible_leases; + num_leases_to_schedule_ = num_leases_to_schedule; + num_leases_to_grant_ = num_leases_to_grant; } -void SchedulerStats::RecordMetrics() const { +void SchedulerStats::RecordMetrics() { /// This method intentionally doesn't call ComputeStats() because /// that function is expensive. ComputeStats is called by ComputeAndReportDebugStr /// method and they are always periodically called by node manager. - stats::NumSpilledTasks.Record(metric_tasks_spilled_ + - local_task_manager_.GetNumTaskSpilled()); - local_task_manager_.RecordMetrics(); - stats::NumInfeasibleSchedulingClasses.Record( - cluster_task_manager_.infeasible_tasks_.size()); + ray_metric_num_spilled_tasks_.Record(metric_leases_spilled_ + + local_lease_manager_.GetNumLeaseSpilled()); + local_lease_manager_.RecordMetrics(); + ray_metric_num_infeasible_scheduling_classes_.Record( + cluster_lease_manager_.infeasible_leases_.size()); /// Worker startup failure ray::stats::STATS_scheduler_failed_worker_startup_total.Record( num_worker_not_started_by_job_config_not_exist_, "JobConfigMissing"); @@ -134,16 +134,16 @@ void SchedulerStats::RecordMetrics() const { num_worker_not_started_by_process_rate_limit_, "RateLimited"); /// Queued tasks. 
- ray::stats::STATS_scheduler_tasks.Record(num_cancelled_tasks_, "Cancelled"); - ray::stats::STATS_scheduler_tasks.Record(num_tasks_to_dispatch_, "Dispatched"); - ray::stats::STATS_scheduler_tasks.Record(num_tasks_to_schedule_, "Received"); - ray::stats::STATS_scheduler_tasks.Record(local_task_manager_.GetNumWaitingTaskSpilled(), - "SpilledWaiting"); + ray::stats::STATS_scheduler_tasks.Record(num_cancelled_leases_, "Cancelled"); + ray::stats::STATS_scheduler_tasks.Record(num_leases_to_grant_, "Dispatched"); + ray::stats::STATS_scheduler_tasks.Record(num_leases_to_schedule_, "Received"); ray::stats::STATS_scheduler_tasks.Record( - local_task_manager_.GetNumUnschedulableTaskSpilled(), "SpilledUnschedulable"); + local_lease_manager_.GetNumWaitingLeaseSpilled(), "SpilledWaiting"); + ray::stats::STATS_scheduler_tasks.Record( + local_lease_manager_.GetNumUnschedulableLeaseSpilled(), "SpilledUnschedulable"); /// Pending task count. - ray::stats::STATS_scheduler_unscheduleable_tasks.Record(num_infeasible_tasks_, + ray::stats::STATS_scheduler_unscheduleable_tasks.Record(num_infeasible_leases_, "Infeasible"); ray::stats::STATS_scheduler_unscheduleable_tasks.Record(num_waiting_for_resource_, "WaitingForResources"); @@ -157,17 +157,17 @@ void SchedulerStats::RecordMetrics() const { std::string SchedulerStats::ComputeAndReportDebugStr() { ComputeStats(); - if (num_tasks_to_schedule_ + num_tasks_to_dispatch_ + num_infeasible_tasks_ > 1000) { + if (num_leases_to_schedule_ + num_leases_to_grant_ + num_infeasible_leases_ > 1000) { RAY_LOG(WARNING) << "More than 1000 tasks are queued for scheduling on this node. 
" "This can slow down the raylet."; } std::stringstream buffer; - buffer << "========== Node: " << cluster_task_manager_.self_node_id_ + buffer << "========== Node: " << cluster_lease_manager_.self_node_id_ << " =================\n"; - buffer << "Infeasible queue length: " << num_infeasible_tasks_ << "\n"; - buffer << "Schedule queue length: " << num_tasks_to_schedule_ << "\n"; - buffer << "Dispatch queue length: " << num_tasks_to_dispatch_ << "\n"; + buffer << "Infeasible queue length: " << num_infeasible_leases_ << "\n"; + buffer << "Schedule queue length: " << num_leases_to_schedule_ << "\n"; + buffer << "Grant queue length: " << num_leases_to_grant_ << "\n"; buffer << "num_waiting_for_resource: " << num_waiting_for_resource_ << "\n"; buffer << "num_waiting_for_plasma_memory: " << num_waiting_for_plasma_memory_ << "\n"; buffer << "num_waiting_for_remote_node_resources: " @@ -177,16 +177,16 @@ std::string SchedulerStats::ComputeAndReportDebugStr() { buffer << "num_worker_not_started_by_registration_timeout: " << num_worker_not_started_by_registration_timeout_ << "\n"; buffer << "num_tasks_waiting_for_workers: " << num_tasks_waiting_for_workers_ << "\n"; - buffer << "num_cancelled_tasks: " << num_cancelled_tasks_ << "\n"; + buffer << "num_cancelled_leases: " << num_cancelled_leases_ << "\n"; buffer << "cluster_resource_scheduler state: " - << cluster_task_manager_.cluster_resource_scheduler_.DebugString() << "\n"; - local_task_manager_.DebugStr(buffer); + << cluster_lease_manager_.cluster_resource_scheduler_.DebugString() << "\n"; + local_lease_manager_.DebugStr(buffer); buffer << "==================================================\n"; return buffer.str(); } -void SchedulerStats::TaskSpilled() { metric_tasks_spilled_++; } +void SchedulerStats::LeaseSpilled() { metric_leases_spilled_++; } } // namespace raylet } // namespace ray diff --git a/src/ray/raylet/scheduling/scheduler_stats.h b/src/ray/raylet/scheduling/scheduler_stats.h index c71f1fb8cab4..21b2ef86738a 
100644 --- a/src/ray/raylet/scheduling/scheduler_stats.h +++ b/src/ray/raylet/scheduling/scheduler_stats.h @@ -16,42 +16,39 @@ #include -#include "absl/container/flat_hash_map.h" -#include "ray/common/ray_config.h" -#include "ray/common/task/task_spec.h" -#include "ray/raylet/scheduling/internal.h" -#include "ray/raylet/scheduling/local_task_manager_interface.h" +#include "ray/raylet/scheduling/local_lease_manager_interface.h" +#include "ray/stats/metric.h" namespace ray { namespace raylet { -class ClusterTaskManager; +class ClusterLeaseManager; // Helper class that collects and reports scheduler's metrics into counters or human // readable string. class SchedulerStats { public: - explicit SchedulerStats(const ClusterTaskManager &cluster_task_manager, - const ILocalTaskManager &local_task_manager); + explicit SchedulerStats(const ClusterLeaseManager &cluster_lease_manager, + const LocalLeaseManagerInterface &local_lease_manager); // Report metrics doesn't recompute the stats. - void RecordMetrics() const; + void RecordMetrics(); // Recompute the stats and report the result as string. std::string ComputeAndReportDebugStr(); - // increase the task spilled counter. - void TaskSpilled(); + // increase the lease spilled counter. + void LeaseSpilled(); private: // recompute the metrics. void ComputeStats(); - const ClusterTaskManager &cluster_task_manager_; - const ILocalTaskManager &local_task_manager_; + const ClusterLeaseManager &cluster_lease_manager_; + const LocalLeaseManagerInterface &local_lease_manager_; /// Number of tasks that are spilled to other /// nodes because it cannot be scheduled locally. - int64_t metric_tasks_spilled_ = 0; + int64_t metric_leases_spilled_ = 0; /// Number of tasks that are waiting for /// resources to be available locally. int64_t num_waiting_for_resource_ = 0; @@ -70,14 +67,27 @@ class SchedulerStats { int64_t num_worker_not_started_by_process_rate_limit_ = 0; /// Number of tasks that are waiting for worker processes to start. 
int64_t num_tasks_waiting_for_workers_ = 0; - /// Number of cancelled tasks. - int64_t num_cancelled_tasks_ = 0; - /// Number of infeasible tasks. - int64_t num_infeasible_tasks_ = 0; - /// Number of tasks to schedule. - int64_t num_tasks_to_schedule_ = 0; - /// Number of tasks to dispatch. - int64_t num_tasks_to_dispatch_ = 0; + /// Number of cancelled leases. + int64_t num_cancelled_leases_ = 0; + /// Number of infeasible leases. + int64_t num_infeasible_leases_ = 0; + /// Number of leases to schedule. + int64_t num_leases_to_schedule_ = 0; + /// Number of leases to grant. + int64_t num_leases_to_grant_ = 0; + + /// Ray metrics + ray::stats::Gauge ray_metric_num_spilled_tasks_{ + /*name=*/"internal_num_spilled_tasks", + /*description=*/ + "The cumulative number of lease requeusts that this raylet has spilled to other " + "raylets.", + /*unit=*/"tasks"}; + + ray::stats::Gauge ray_metric_num_infeasible_scheduling_classes_{ + /*name=*/"internal_num_infeasible_scheduling_classes", + /*description=*/"The number of unique scheduling classes that are infeasible.", + /*unit=*/"tasks"}; }; } // namespace raylet diff --git a/src/ray/raylet/scheduling/scheduling_policy.h b/src/ray/raylet/scheduling/scheduling_policy.h index fe689355ac76..5fd17b7bf137 100644 --- a/src/ray/raylet/scheduling/scheduling_policy.h +++ b/src/ray/raylet/scheduling/scheduling_policy.h @@ -18,7 +18,6 @@ #include "ray/common/ray_config.h" #include "ray/common/scheduling/cluster_resource_data.h" -#include "ray/gcs/gcs_client/gcs_client.h" namespace ray { namespace raylet_scheduling_policy { diff --git a/src/ray/raylet/scheduling/tests/BUILD.bazel b/src/ray/raylet/scheduling/tests/BUILD.bazel new file mode 100644 index 000000000000..25483ef3b383 --- /dev/null +++ b/src/ray/raylet/scheduling/tests/BUILD.bazel @@ -0,0 +1,82 @@ +load("//bazel:ray.bzl", "ray_cc_test") + +ray_cc_test( + name = "cluster_resource_scheduler_test", + size = "small", + srcs = [ + "cluster_resource_scheduler_test.cc", + ], + tags 
= ["team:core"], + deps = [ + "//:ray_mock", + "//src/ray/common:lease", + "//src/ray/common:ray_config", + "//src/ray/common:task_common", + "//src/ray/common:test_utils", + "//src/ray/gcs_client", + "//src/ray/raylet/scheduling:cluster_resource_scheduler", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "cluster_resource_scheduler_2_test", + size = "small", + srcs = [ + "cluster_resource_scheduler_2_test.cc", + ], + tags = ["team:core"], + deps = [ + "//src/ray/raylet/scheduling:cluster_resource_scheduler", + "//src/ray/raylet/scheduling:scheduling_context", + "//src/ray/raylet/scheduling:scheduling_options", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "local_resource_manager_test", + size = "small", + srcs = [ + "local_resource_manager_test.cc", + ], + tags = ["team:core"], + deps = [ + "//src/ray/raylet/scheduling:local_resource_manager", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "cluster_lease_manager_test", + size = "small", + srcs = [ + "cluster_lease_manager_test.cc", + ], + tags = ["team:core"], + deps = [ + "//:ray_mock", + "//src/ray/common:id", + "//src/ray/common:lease", + "//src/ray/common:task_common", + "//src/ray/common:test_utils", + "//src/ray/raylet:local_lease_manager", + "//src/ray/raylet/scheduling:cluster_lease_manager", + "//src/ray/raylet/scheduling:cluster_resource_scheduler", + "//src/ray/raylet/tests:util", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "cluster_resource_manager_test", + size = "small", + srcs = [ + "cluster_resource_manager_test.cc", + ], + tags = ["team:core"], + deps = [ + "//src/ray/raylet/scheduling:cluster_resource_manager", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/src/ray/raylet/scheduling/cluster_task_manager_test.cc b/src/ray/raylet/scheduling/tests/cluster_lease_manager_test.cc similarity index 62% rename from src/ray/raylet/scheduling/cluster_task_manager_test.cc rename to 
src/ray/raylet/scheduling/tests/cluster_lease_manager_test.cc index 27bc2bfa3c14..aabdab0c559e 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager_test.cc +++ b/src/ray/raylet/scheduling/tests/cluster_lease_manager_test.cc @@ -14,7 +14,7 @@ // limitations under the License. // clang-format off -#include "ray/raylet/scheduling/cluster_task_manager.h" +#include "ray/raylet/scheduling/cluster_lease_manager.h" #include #include @@ -29,13 +29,13 @@ #include "ray/common/id.h" #include "ray/common/scheduling/resource_set.h" #include "ray/common/scheduling/scheduling_ids.h" -#include "ray/common/task/task.h" +#include "ray/common/lease/lease.h" #include "ray/common/task/task_util.h" -#include "ray/common/test_util.h" -#include "ray/raylet/local_task_manager.h" +#include "ray/common/test_utils.h" +#include "ray/raylet/local_lease_manager.h" #include "ray/raylet/scheduling/cluster_resource_scheduler.h" -#include "ray/raylet/test/util.h" -#include "mock/ray/gcs/gcs_client/gcs_client.h" +#include "ray/raylet/tests/util.h" +#include "mock/ray/gcs_client/gcs_client.h" // clang-format on namespace ray { @@ -48,10 +48,10 @@ class MockWorkerPool : public WorkerPoolInterface { public: MockWorkerPool() : num_pops(0) {} - void PopWorker(const TaskSpecification &task_spec, + void PopWorker(const LeaseSpecification &lease_spec, const PopWorkerCallback &callback) override { num_pops++; - const int runtime_env_hash = task_spec.GetRuntimeEnvHash(); + const int runtime_env_hash = lease_spec.GetRuntimeEnvHash(); callbacks[runtime_env_hash].push_back(callback); } @@ -87,7 +87,7 @@ class MockWorkerPool : public WorkerPoolInterface { RAY_CHECK(status != PopWorkerStatus::OK); for (const auto &pair : callbacks) { for (const auto &callback : pair.second) { - // No task should be dispatched. + // No lease should be dispatched. 
ASSERT_FALSE( callback(nullptr, status, @@ -205,7 +205,7 @@ class MockWorkerPool : public WorkerPoolInterface { RAY_CHECK(false) << "Not used."; } - void PrestartWorkers(const TaskSpecification &task_spec, + void PrestartWorkers(const LeaseSpecification &lease_spec, int64_t backlog_size) override { RAY_CHECK(false) << "Not used."; } @@ -271,17 +271,18 @@ std::shared_ptr CreateSingleNodeScheduler( return scheduler; } -RayTask CreateTask( +RayLease CreateLease( const std::unordered_map &required_resources, int num_args = 0, std::vector args = {}, const std::shared_ptr runtime_env_info = nullptr, - rpc::SchedulingStrategy scheduling_strategy = rpc::SchedulingStrategy()) { + rpc::SchedulingStrategy scheduling_strategy = rpc::SchedulingStrategy(), + const LeaseID &lease_id = LeaseID::FromRandom()) { TaskSpecBuilder spec_builder; TaskID id = RandomTaskId(); JobID job_id = RandomJobId(); rpc::Address address; - address.set_raylet_id(NodeID::FromRandom().Binary()); + address.set_node_id(NodeID::FromRandom().Binary()); address.set_worker_id(WorkerID::FromRandom().Binary()); spec_builder.SetCommonTaskSpec(id, "dummy_task", @@ -317,19 +318,21 @@ RayTask CreateTask( } spec_builder.SetNormalTaskSpec(0, false, "", scheduling_strategy, ActorID::Nil()); - - return RayTask(std::move(spec_builder).ConsumeAndBuild()); + TaskSpecification spec = std::move(spec_builder).ConsumeAndBuild(); + LeaseSpecification lease_spec(spec.GetMessage()); + lease_spec.GetMutableMessage().set_lease_id(lease_id.Binary()); + return RayLease(std::move(lease_spec)); } -class MockTaskDependencyManager : public TaskDependencyManagerInterface { +class MockLeaseDependencyManager : public LeaseDependencyManagerInterface { public: - explicit MockTaskDependencyManager(std::unordered_set &missing_objects) + explicit MockLeaseDependencyManager(std::unordered_set &missing_objects) : missing_objects_(missing_objects) {} - bool RequestTaskDependencies(const TaskID &task_id, - const std::vector &required_objects, - 
const TaskMetricsKey &task_key) { - RAY_CHECK(subscribed_tasks.insert(task_id).second); + bool RequestLeaseDependencies(const LeaseID &lease_id, + const std::vector &required_objects, + const TaskMetricsKey &task_key) { + RAY_CHECK(subscribed_leases.insert(lease_id).second); for (auto &obj_ref : required_objects) { if (missing_objects_.find(ObjectRefToId(obj_ref)) != missing_objects_.end()) { return false; @@ -338,19 +341,19 @@ class MockTaskDependencyManager : public TaskDependencyManagerInterface { return true; } - void RemoveTaskDependencies(const TaskID &task_id) { - RAY_CHECK(subscribed_tasks.erase(task_id)); + void RemoveLeaseDependencies(const LeaseID &lease_id) { + RAY_CHECK(subscribed_leases.erase(lease_id)); } - bool TaskDependenciesBlocked(const TaskID &task_id) const { - return blocked_tasks.count(task_id); + bool LeaseDependenciesBlocked(const LeaseID &lease_id) const { + return blocked_leases.count(lease_id); } bool CheckObjectLocal(const ObjectID &object_id) const { return true; } std::unordered_set &missing_objects_; - std::unordered_set subscribed_tasks; - std::unordered_set blocked_tasks; + std::unordered_set subscribed_leases; + std::unordered_set blocked_leases; }; class FeatureFlagEnvironment : public ::testing::Environment { @@ -369,19 +372,19 @@ class FeatureFlagEnvironment : public ::testing::Environment { testing::Environment *const env = ::testing::AddGlobalTestEnvironment(new FeatureFlagEnvironment); -class ClusterTaskManagerTest : public ::testing::Test { +class ClusterLeaseManagerTest : public ::testing::Test { public: - explicit ClusterTaskManagerTest(double num_cpus_at_head = 8.0, - double num_gpus_at_head = 0.0) + explicit ClusterLeaseManagerTest(double num_cpus_at_head = 8.0, + double num_gpus_at_head = 0.0) : gcs_client_(std::make_unique()), id_(NodeID::FromRandom()), scheduler_(CreateSingleNodeScheduler( id_.Binary(), num_cpus_at_head, num_gpus_at_head, *gcs_client_)), - dependency_manager_(missing_objects_), - 
local_task_manager_(std::make_unique( + lease_dependency_manager_(missing_objects_), + local_lease_manager_(std::make_unique( id_, *scheduler_, - dependency_manager_, + lease_dependency_manager_, /* get_node_info= */ [this](const NodeID &node_id) -> const rpc::GcsNodeInfo * { node_info_calls_++; @@ -392,7 +395,7 @@ class ClusterTaskManagerTest : public ::testing::Test { }, pool_, leased_workers_, - /* get_task_arguments= */ + /* get_lease_args= */ [this](const std::vector &object_ids, std::vector> *results) { for (auto &obj_id : object_ids) { @@ -404,9 +407,9 @@ class ClusterTaskManagerTest : public ::testing::Test { } return true; }, - /*max_pinned_task_arguments_bytes=*/1000, + /*max_pinned_lease_args_bytes=*/1000, /*get_time=*/[this]() { return current_time_ms_; })), - task_manager_( + lease_manager_( id_, *scheduler_, /* get_node_info= */ @@ -417,9 +420,9 @@ class ClusterTaskManagerTest : public ::testing::Test { } return nullptr; }, - /* announce_infeasible_task= */ - [this](const RayTask &task) { announce_infeasible_task_calls_++; }, - *local_task_manager_, + /* announce_infeasible_lease= */ + [this](const RayLease &lease) { announce_infeasible_lease_calls_++; }, + *local_lease_manager_, /*get_time=*/[this]() { return current_time_ms_; }) { RayConfig::instance().initialize("{\"scheduler_top_k_absolute\": 1}"); } @@ -455,31 +458,31 @@ class ClusterTaskManagerTest : public ::testing::Test { } void AssertNoLeaks() { - ASSERT_TRUE(task_manager_.tasks_to_schedule_.empty()); - ASSERT_TRUE(local_task_manager_->tasks_to_dispatch_.empty()); - ASSERT_TRUE(local_task_manager_->waiting_tasks_index_.empty()); - ASSERT_TRUE(local_task_manager_->waiting_task_queue_.empty()); - ASSERT_TRUE(task_manager_.infeasible_tasks_.empty()); - ASSERT_TRUE(local_task_manager_->executing_task_args_.empty()); - ASSERT_TRUE(local_task_manager_->pinned_task_arguments_.empty()); - ASSERT_TRUE(local_task_manager_->info_by_sched_cls_.empty()); - 
ASSERT_EQ(local_task_manager_->pinned_task_arguments_bytes_, 0); - ASSERT_TRUE(dependency_manager_.subscribed_tasks.empty()); - } - - void AssertPinnedTaskArgumentsPresent(const RayTask &task) { - const auto &expected_deps = task.GetTaskSpecification().GetDependencyIds(); - ASSERT_EQ( - local_task_manager_->executing_task_args_[task.GetTaskSpecification().TaskId()], - expected_deps); + ASSERT_TRUE(lease_manager_.leases_to_schedule_.empty()); + ASSERT_TRUE(local_lease_manager_->leases_to_grant_.empty()); + ASSERT_TRUE(local_lease_manager_->waiting_leases_index_.empty()); + ASSERT_TRUE(local_lease_manager_->waiting_lease_queue_.empty()); + ASSERT_TRUE(lease_manager_.infeasible_leases_.empty()); + ASSERT_TRUE(local_lease_manager_->granted_lease_args_.empty()); + ASSERT_TRUE(local_lease_manager_->pinned_lease_arguments_.empty()); + ASSERT_TRUE(local_lease_manager_->info_by_sched_cls_.empty()); + ASSERT_EQ(local_lease_manager_->pinned_lease_arguments_bytes_, 0); + ASSERT_TRUE(lease_dependency_manager_.subscribed_leases.empty()); + } + + void AssertPinnedLeaseArgumentsPresent(const RayLease &lease) { + const auto &expected_deps = lease.GetLeaseSpecification().GetDependencyIds(); + ASSERT_EQ(local_lease_manager_ + ->granted_lease_args_[lease.GetLeaseSpecification().LeaseId()], + expected_deps); for (auto &arg : expected_deps) { - ASSERT_TRUE(local_task_manager_->pinned_task_arguments_.count(arg)); + ASSERT_TRUE(local_lease_manager_->pinned_lease_arguments_.count(arg)); } } - int NumTasksToDispatchWithStatus(internal::WorkStatus status) { + int NumLeasesToDispatchWithStatus(internal::WorkStatus status) { int count = 0; - for (const auto &pair : local_task_manager_->tasks_to_dispatch_) { + for (const auto &pair : local_lease_manager_->leases_to_grant_) { for (const auto &work : pair.second) { if (work->GetState() == status) { count++; @@ -489,10 +492,10 @@ class ClusterTaskManagerTest : public ::testing::Test { return count; } - int NumRunningTasks() { + int 
NumRunningLeases() { int count = 0; - for (const auto &pair : local_task_manager_->info_by_sched_cls_) { - count += (pair.second.running_tasks.size()); + for (const auto &pair : local_lease_manager_->info_by_sched_cls_) { + count += (pair.second.granted_leases.size()); } return count; @@ -502,42 +505,42 @@ class ClusterTaskManagerTest : public ::testing::Test { NodeID id_; std::shared_ptr scheduler_; MockWorkerPool pool_; - absl::flat_hash_map> leased_workers_; + absl::flat_hash_map> leased_workers_; std::unordered_set missing_objects_; int default_arg_size_ = 10; int node_info_calls_ = 0; - int announce_infeasible_task_calls_ = 0; + int announce_infeasible_lease_calls_ = 0; absl::flat_hash_map node_info_; int64_t current_time_ms_ = 0; - MockTaskDependencyManager dependency_manager_; - std::unique_ptr local_task_manager_; - ClusterTaskManager task_manager_; + MockLeaseDependencyManager lease_dependency_manager_; + std::unique_ptr local_lease_manager_; + ClusterLeaseManager lease_manager_; }; -// Same as ClusterTaskManagerTest, but the head node starts with 4.0 num gpus. -class ClusterTaskManagerTestWithGPUsAtHead : public ClusterTaskManagerTest { +// Same as ClusterLeaseManagerTest, but the head node starts with 4.0 num gpus. +class ClusterLeaseManagerTestWithGPUsAtHead : public ClusterLeaseManagerTest { public: - ClusterTaskManagerTestWithGPUsAtHead() - : ClusterTaskManagerTest(/*num_cpus_at_head=*/8.0, /*num_gpus_at_head=*/4.0) {} + ClusterLeaseManagerTestWithGPUsAtHead() + : ClusterLeaseManagerTest(/*num_cpus_at_head=*/8.0, /*num_gpus_at_head=*/4.0) {} }; -// Same as ClusterTaskManagerTest, but the head node starts with 0.0 num cpus. -class ClusterTaskManagerTestWithoutCPUsAtHead : public ClusterTaskManagerTest { +// Same as ClusterLeaseManagerTest, but the head node starts with 0.0 num cpus. 
+class ClusterLeaseManagerTestWithoutCPUsAtHead : public ClusterLeaseManagerTest { public: - ClusterTaskManagerTestWithoutCPUsAtHead() - : ClusterTaskManagerTest(/*num_cpus_at_head=*/0.0) {} + ClusterLeaseManagerTestWithoutCPUsAtHead() + : ClusterLeaseManagerTest(/*num_cpus_at_head=*/0.0) {} }; -TEST_F(ClusterTaskManagerTest, BasicTest) { +TEST_F(ClusterLeaseManagerTest, BasicTest) { /* Test basic scheduler functionality: 1. Queue and attempt to schedule/dispatch atest with no workers available 2. A worker becomes available, dispatch again. */ - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 4}}); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 4}}); rpc::RequestWorkerLeaseReply reply; bool callback_occurred = false; bool *callback_occurred_ptr = &callback_occurred; @@ -546,7 +549,7 @@ TEST_F(ClusterTaskManagerTest, BasicTest) { *callback_occurred_ptr = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_FALSE(callback_occurred); ASSERT_EQ(leased_workers_.size(), 0); @@ -562,21 +565,21 @@ TEST_F(ClusterTaskManagerTest, BasicTest) { ASSERT_EQ(pool_.workers.size(), 0); ASSERT_EQ(node_info_calls_, 0); - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task.GetTaskSpecification().TaskId()); + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); + ASSERT_EQ(finished_lease.GetLeaseSpecification().LeaseId(), + lease.GetLeaseSpecification().LeaseId()); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, IdempotencyTest) { +TEST_F(ClusterLeaseManagerTest, IdempotencyTest) { /* - A few task manager methods are meant to be idempotent. - * `TaskFinished` + A few lease manager methods are meant to be idempotent. 
+ * `CleanupLease` * `ReleaseCpuResourcesFromBlockedWorker` * `ReturnCpuResourcesToUnblockedWorker` */ - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 4}}); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 4}}); rpc::RequestWorkerLeaseReply reply; bool callback_occurred = false; bool *callback_occurred_ptr = &callback_occurred; @@ -585,7 +588,7 @@ TEST_F(ClusterTaskManagerTest, IdempotencyTest) { *callback_occurred_ptr = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_FALSE(callback_occurred); ASSERT_EQ(leased_workers_.size(), 0); @@ -603,30 +606,30 @@ TEST_F(ClusterTaskManagerTest, IdempotencyTest) { ASSERT_EQ(scheduler_->GetLocalResourceManager().GetLocalAvailableCpus(), 4.0); - local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker); - local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker); + local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker); + local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker); ASSERT_EQ(scheduler_->GetLocalResourceManager().GetLocalAvailableCpus(), 8.0); - local_task_manager_->ReturnCpuResourcesToUnblockedWorker(worker); - local_task_manager_->ReturnCpuResourcesToUnblockedWorker(worker); + local_lease_manager_->ReturnCpuResourcesToUnblockedWorker(worker); + local_lease_manager_->ReturnCpuResourcesToUnblockedWorker(worker); ASSERT_EQ(scheduler_->GetLocalResourceManager().GetLocalAvailableCpus(), 4.0); - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task.GetTaskSpecification().TaskId()); + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); + 
local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); + ASSERT_EQ(finished_lease.GetLeaseSpecification().LeaseId(), + lease.GetLeaseSpecification().LeaseId()); ASSERT_EQ(scheduler_->GetLocalResourceManager().GetLocalAvailableCpus(), 8.0); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, DispatchQueueNonBlockingTest) { +TEST_F(ClusterLeaseManagerTest, GrantQueueNonBlockingTest) { /* - Test that if no worker is available for the first task in a dispatch - queue (because the runtime env in the task spec doesn't match any - available worker), other tasks in the dispatch queue can still be scheduled. + Test that if no worker is available for the first lease in the leases-to-grant + queue (because the runtime env in the lease spec doesn't match any + available worker), other leases in the grant queue can still be scheduled. https://github.com/ray-project/ray/issues/16226 */ @@ -639,8 +642,8 @@ TEST_F(ClusterTaskManagerTest, DispatchQueueNonBlockingTest) { runtime_env_info_A.reset(new rpc::RuntimeEnvInfo()); runtime_env_info_A->set_serialized_runtime_env(serialized_runtime_env_A); - RayTask task_A = - CreateTask(required_resources, /*num_args=*/0, /*args=*/{}, runtime_env_info_A); + RayLease lease_A = + CreateLease(required_resources, /*num_args=*/0, /*args=*/{}, runtime_env_info_A); rpc::RequestWorkerLeaseReply reply_A; bool callback_occurred = false; bool *callback_occurred_ptr = &callback_occurred; @@ -654,18 +657,20 @@ TEST_F(ClusterTaskManagerTest, DispatchQueueNonBlockingTest) { runtime_env_info_B.reset(new rpc::RuntimeEnvInfo()); runtime_env_info_B->set_serialized_runtime_env(serialized_runtime_env_B); - RayTask task_B_1 = - CreateTask(required_resources, /*num_args=*/0, /*args=*/{}, runtime_env_info_B); - RayTask task_B_2 = - CreateTask(required_resources, /*num_args=*/0, /*args=*/{}, runtime_env_info_B); + RayLease lease_B_1 = + CreateLease(required_resources, /*num_args=*/0, /*args=*/{}, runtime_env_info_B); + RayLease lease_B_2 =
+ CreateLease(required_resources, /*num_args=*/0, /*args=*/{}, runtime_env_info_B); rpc::RequestWorkerLeaseReply reply_B_1; rpc::RequestWorkerLeaseReply reply_B_2; auto empty_callback = [](Status, std::function, std::function) {}; // Ensure task_A is not at the front of the queue. - task_manager_.QueueAndScheduleTask(task_B_1, false, false, &reply_B_1, empty_callback); - task_manager_.QueueAndScheduleTask(task_A, false, false, &reply_A, callback); - task_manager_.QueueAndScheduleTask(task_B_2, false, false, &reply_B_2, empty_callback); + lease_manager_.QueueAndScheduleLease( + lease_B_1, false, false, &reply_B_1, empty_callback); + lease_manager_.QueueAndScheduleLease(lease_A, false, false, &reply_A, callback); + lease_manager_.QueueAndScheduleLease( + lease_B_2, false, false, &reply_B_2, empty_callback); pool_.TriggerCallbacks(); // Push a worker that can only run task A. @@ -679,16 +684,16 @@ TEST_F(ClusterTaskManagerTest, DispatchQueueNonBlockingTest) { ASSERT_EQ(pool_.workers.size(), 0); ASSERT_EQ(node_info_calls_, 0); - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task_A.GetTaskSpecification().TaskId()); + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); + ASSERT_EQ(finished_lease.GetLeaseSpecification().LeaseId(), + lease_A.GetLeaseSpecification().LeaseId()); // task_B_1 and task_B_2 remain in the dispatch queue, so don't call AssertNoLeaks(). // AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, BlockedWorkerDiesTest) { +TEST_F(ClusterLeaseManagerTest, BlockedWorkerDiesTest) { /* Tests the edge case in which a worker crashes while it's blocked. In this case, its CPU resources should not be double freed. 
@@ -700,9 +705,23 @@ TEST_F(ClusterTaskManagerTest, BlockedWorkerDiesTest) { scheduler_->GetLocalResourceManager().AddLocalResourceInstances( scheduling::ResourceID("CPU_group_0_aaa"), std::vector{FixedPoint(1)}); - RayTask task1 = CreateTask({{ray::kCPU_ResourceLabel, 4}}); + WorkerID worker_id1 = WorkerID::FromRandom(); + WorkerID worker_id2 = WorkerID::FromRandom(); + LeaseID lease_id1 = LeaseID::FromWorker(worker_id1, 1); + LeaseID lease_id2 = LeaseID::FromWorker(worker_id2, 1); + RayLease lease1 = CreateLease({{ray::kCPU_ResourceLabel, 4}}, + 0, + {}, + nullptr, + rpc::SchedulingStrategy(), + lease_id1); rpc::RequestWorkerLeaseReply reply1; - RayTask task2 = CreateTask({{"CPU_group_aaa", 1}, {"CPU_group_0_aaa", 1}}); + RayLease lease2 = CreateLease({{"CPU_group_aaa", 1}, {"CPU_group_0_aaa", 1}}, + 0, + {}, + nullptr, + rpc::SchedulingStrategy(), + lease_id2); rpc::RequestWorkerLeaseReply reply2; bool callback_occurred = false; @@ -712,25 +731,23 @@ TEST_F(ClusterTaskManagerTest, BlockedWorkerDiesTest) { *callback_occurred_ptr = true; }; - task_manager_.QueueAndScheduleTask(task1, false, false, &reply1, callback); + lease_manager_.QueueAndScheduleLease(lease1, false, false, &reply1, callback); pool_.TriggerCallbacks(); ASSERT_FALSE(callback_occurred); ASSERT_EQ(leased_workers_.size(), 0); ASSERT_EQ(pool_.workers.size(), 0); - std::shared_ptr worker1 = - std::make_shared(WorkerID::FromRandom(), 1234); - std::shared_ptr worker2 = - std::make_shared(WorkerID::FromRandom(), 5678); + std::shared_ptr worker1 = std::make_shared(worker_id1, 1234); + std::shared_ptr worker2 = std::make_shared(worker_id2, 5678); pool_.PushWorker(std::static_pointer_cast(worker1)); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); - task_manager_.QueueAndScheduleTask(task2, false, false, &reply2, callback); + lease_manager_.QueueAndScheduleLease(lease2, false, false, &reply2, callback); 
pool_.PushWorker(std::static_pointer_cast(worker2)); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); ASSERT_TRUE(callback_occurred); @@ -739,30 +756,28 @@ TEST_F(ClusterTaskManagerTest, BlockedWorkerDiesTest) { ASSERT_EQ(node_info_calls_, 0); // Block the worker. Which releases only the CPU resource. - local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker1); - local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker2); + local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker1); + local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker2); - RayTask finished_task1; - RayTask finished_task2; + RayLease finished_lease1; + RayLease finished_lease2; // If a resource was double-freed, we will crash in this call. - local_task_manager_->TaskFinished(leased_workers_[worker1->WorkerId()], - &finished_task1); - local_task_manager_->TaskFinished(leased_workers_[worker2->WorkerId()], - &finished_task2); - ASSERT_EQ(finished_task1.GetTaskSpecification().TaskId(), - task1.GetTaskSpecification().TaskId()); - ASSERT_EQ(finished_task2.GetTaskSpecification().TaskId(), - task2.GetTaskSpecification().TaskId()); + local_lease_manager_->CleanupLease(leased_workers_[lease_id1], &finished_lease1); + local_lease_manager_->CleanupLease(leased_workers_[lease_id2], &finished_lease2); + ASSERT_EQ(finished_lease1.GetLeaseSpecification().LeaseId(), + lease1.GetLeaseSpecification().LeaseId()); + ASSERT_EQ(finished_lease2.GetLeaseSpecification().LeaseId(), + lease2.GetLeaseSpecification().LeaseId()); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, BlockedWorkerDies2Test) { +TEST_F(ClusterLeaseManagerTest, BlockedWorkerDies2Test) { /* Same edge case as the previous test, but this time the block and finish requests happen in the opposite order. 
*/ - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 4}}); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 4}}); rpc::RequestWorkerLeaseReply reply; bool callback_occurred = false; bool *callback_occurred_ptr = &callback_occurred; @@ -771,7 +786,7 @@ TEST_F(ClusterTaskManagerTest, BlockedWorkerDies2Test) { *callback_occurred_ptr = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_FALSE(callback_occurred); @@ -782,7 +797,7 @@ TEST_F(ClusterTaskManagerTest, BlockedWorkerDies2Test) { std::make_shared(WorkerID::FromRandom(), 1234); pool_.PushWorker(std::static_pointer_cast(worker)); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); ASSERT_TRUE(callback_occurred); @@ -790,23 +805,23 @@ TEST_F(ClusterTaskManagerTest, BlockedWorkerDies2Test) { ASSERT_EQ(pool_.workers.size(), 0); ASSERT_EQ(node_info_calls_, 0); - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task.GetTaskSpecification().TaskId()); + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); + ASSERT_EQ(finished_lease.GetLeaseSpecification().LeaseId(), + lease.GetLeaseSpecification().LeaseId()); // Block the worker. Which releases only the CPU resource. 
- local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker); + local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, NoFeasibleNodeTest) { +TEST_F(ClusterLeaseManagerTest, NoFeasibleNodeTest) { std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); pool_.PushWorker(std::dynamic_pointer_cast(worker)); - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 999}}); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 999}}); rpc::RequestWorkerLeaseReply reply; bool callback_called = false; @@ -816,7 +831,7 @@ TEST_F(ClusterTaskManagerTest, NoFeasibleNodeTest) { *callback_called_ptr = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_FALSE(callback_called); @@ -826,13 +841,13 @@ TEST_F(ClusterTaskManagerTest, NoFeasibleNodeTest) { ASSERT_EQ(node_info_calls_, 0); } -TEST_F(ClusterTaskManagerTest, DrainingWhileResolving) { +TEST_F(ClusterLeaseManagerTest, DrainingWhileResolving) { /* - Test the race condition in which a task is assigned to a node, but cannot + Test the race condition in which a lease is assigned to a node, but cannot run because its dependencies are unresolved. Once its dependencies are resolved, the node is being drained. 
*/ - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 1}}); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 1}}); rpc::RequestWorkerLeaseReply reply; bool callback_occurred = false; bool *callback_occurred_ptr = &callback_occurred; @@ -840,7 +855,7 @@ TEST_F(ClusterTaskManagerTest, DrainingWhileResolving) { Status, std::function, std::function) { *callback_occurred_ptr = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); std::shared_ptr worker2 = @@ -855,12 +870,12 @@ TEST_F(ClusterTaskManagerTest, DrainingWhileResolving) { auto remote_node_id = NodeID::FromRandom(); AddNode(remote_node_id, 5); - RayTask resolving_args_task = CreateTask({{ray::kCPU_ResourceLabel, 1}}, 1); - auto missing_arg = resolving_args_task.GetTaskSpecification().GetDependencyIds()[0]; + RayLease resolving_args_lease = CreateLease({{ray::kCPU_ResourceLabel, 1}}, 1); + auto missing_arg = resolving_args_lease.GetLeaseSpecification().GetDependencyIds()[0]; missing_objects_.insert(missing_arg); rpc::RequestWorkerLeaseReply spillback_reply; - task_manager_.QueueAndScheduleTask( - resolving_args_task, false, false, &spillback_reply, callback); + lease_manager_.QueueAndScheduleLease( + resolving_args_lease, false, false, &spillback_reply, callback); pool_.TriggerCallbacks(); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); @@ -872,15 +887,15 @@ TEST_F(ClusterTaskManagerTest, DrainingWhileResolving) { // Arg is resolved. 
missing_objects_.erase(missing_arg); - std::vector unblocked = {resolving_args_task.GetTaskSpecification().TaskId()}; - local_task_manager_->TasksUnblocked(unblocked); - ASSERT_EQ(spillback_reply.retry_at_raylet_address().raylet_id(), - remote_node_id.Binary()); + std::vector unblocked = { + resolving_args_lease.GetLeaseSpecification().LeaseId()}; + local_lease_manager_->LeasesUnblocked(unblocked); + ASSERT_EQ(spillback_reply.retry_at_raylet_address().node_id(), remote_node_id.Binary()); } -TEST_F(ClusterTaskManagerTest, ResourceTakenWhileResolving) { +TEST_F(ClusterLeaseManagerTest, ResourceTakenWhileResolving) { /* - Test the race condition in which a task is assigned to a node, but cannot + Test the race condition in which a lease is assigned to a node, but cannot run because its dependencies are unresolved. Once its dependencies are resolved, the node no longer has available resources. */ @@ -900,14 +915,14 @@ TEST_F(ClusterTaskManagerTest, ResourceTakenWhileResolving) { }; /* Blocked on dependencies */ - auto task = CreateTask({{ray::kCPU_ResourceLabel, 5}}, 2); - auto missing_arg = task.GetTaskSpecification().GetDependencyIds()[0]; + auto lease = CreateLease({{ray::kCPU_ResourceLabel, 5}}, 2); + auto missing_arg = lease.GetLeaseSpecification().GetDependencyIds()[0]; missing_objects_.insert(missing_arg); - std::unordered_set expected_subscribed_tasks = { - task.GetTaskSpecification().TaskId()}; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + std::unordered_set expected_subscribed_leases = { + lease.GetLeaseSpecification().LeaseId()}; + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); - ASSERT_EQ(dependency_manager_.subscribed_tasks, expected_subscribed_tasks); + ASSERT_EQ(lease_dependency_manager_.subscribed_leases, expected_subscribed_leases); ASSERT_EQ(num_callbacks, 0); ASSERT_EQ(leased_workers_.size(), 0); @@ -916,52 +931,52 @@ TEST_F(ClusterTaskManagerTest, 
ResourceTakenWhileResolving) { // https://github.com/ray-project/ray/issues/13725. ASSERT_EQ(pool_.num_pops, 0); - /* This task can run */ - auto task2 = CreateTask({{ray::kCPU_ResourceLabel, 5}}, 1); - task_manager_.QueueAndScheduleTask(task2, false, false, &reply, callback); + /* This lease can run */ + auto lease2 = CreateLease({{ray::kCPU_ResourceLabel, 5}}, 1); + lease_manager_.QueueAndScheduleLease(lease2, false, false, &reply, callback); pool_.TriggerCallbacks(); - ASSERT_EQ(dependency_manager_.subscribed_tasks, expected_subscribed_tasks); + ASSERT_EQ(lease_dependency_manager_.subscribed_leases, expected_subscribed_leases); - AssertPinnedTaskArgumentsPresent(task2); + AssertPinnedLeaseArgumentsPresent(lease2); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); ASSERT_EQ(pool_.num_pops, 1); - /* First task is unblocked now, but resources are no longer available */ + /* First lease is unblocked now, but resources are no longer available */ missing_objects_.erase(missing_arg); - auto id = task.GetTaskSpecification().TaskId(); - std::vector unblocked = {id}; - local_task_manager_->TasksUnblocked(unblocked); - ASSERT_EQ(dependency_manager_.subscribed_tasks, expected_subscribed_tasks); + auto id = lease.GetLeaseSpecification().LeaseId(); + std::vector unblocked = {id}; + local_lease_manager_->LeasesUnblocked(unblocked); + ASSERT_EQ(lease_dependency_manager_.subscribed_leases, expected_subscribed_leases); - AssertPinnedTaskArgumentsPresent(task2); + AssertPinnedLeaseArgumentsPresent(lease2); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); ASSERT_EQ(pool_.num_pops, 1); - /* Second task finishes, making space for the original task */ - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); + /* Second lease finishes, making space for the original lease */ + RayLease finished_lease; + 
local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); leased_workers_.clear(); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); - ASSERT_TRUE(dependency_manager_.subscribed_tasks.empty()); + ASSERT_TRUE(lease_dependency_manager_.subscribed_leases.empty()); - // Task2 is now done so task can run. - AssertPinnedTaskArgumentsPresent(task); + // Lease2 is now done so lease can run. + AssertPinnedLeaseArgumentsPresent(lease); ASSERT_EQ(num_callbacks, 2); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 0); ASSERT_EQ(pool_.num_pops, 2); - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, TestIsSelectedBasedOnLocality) { +TEST_F(ClusterLeaseManagerTest, TestIsSelectedBasedOnLocality) { std::shared_ptr worker1 = std::make_shared(WorkerID::FromRandom(), 1234); std::shared_ptr worker2 = @@ -977,46 +992,45 @@ TEST_F(ClusterTaskManagerTest, TestIsSelectedBasedOnLocality) { auto remote_node_id = NodeID::FromRandom(); AddNode(remote_node_id, 8); - auto task1 = CreateTask({{ray::kCPU_ResourceLabel, 5}}); + auto lease1 = CreateLease({{ray::kCPU_ResourceLabel, 5}}); rpc::RequestWorkerLeaseReply local_reply; - task_manager_.QueueAndScheduleTask( - task1, false, /*is_selected_based_on_locality=*/false, &local_reply, callback); + lease_manager_.QueueAndScheduleLease( + lease1, false, /*is_selected_based_on_locality=*/false, &local_reply, callback); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 1); - // The first task was dispatched. + // The first lease was dispatched. 
ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); - auto task2 = CreateTask({{ray::kCPU_ResourceLabel, 1}}); + auto lease2 = CreateLease({{ray::kCPU_ResourceLabel, 1}}); rpc::RequestWorkerLeaseReply spillback_reply; - task_manager_.QueueAndScheduleTask( - task2, false, /*is_selected_based_on_locality=*/false, &spillback_reply, callback); + lease_manager_.QueueAndScheduleLease( + lease2, false, /*is_selected_based_on_locality=*/false, &spillback_reply, callback); pool_.TriggerCallbacks(); - // The second task was spilled. + // The second lease was spilled. ASSERT_EQ(num_callbacks, 2); - ASSERT_EQ(spillback_reply.retry_at_raylet_address().raylet_id(), - remote_node_id.Binary()); + ASSERT_EQ(spillback_reply.retry_at_raylet_address().node_id(), remote_node_id.Binary()); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); - auto task3 = CreateTask({{ray::kCPU_ResourceLabel, 1}}); - task_manager_.QueueAndScheduleTask( - task3, false, /*is_selected_based_on_locality=*/true, &local_reply, callback); + auto lease3 = CreateLease({{ray::kCPU_ResourceLabel, 1}}); + lease_manager_.QueueAndScheduleLease( + lease3, false, /*is_selected_based_on_locality=*/true, &local_reply, callback); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 3); - // The third task was dispatched. + // The third lease was dispatched. 
ASSERT_EQ(leased_workers_.size(), 2); ASSERT_EQ(pool_.workers.size(), 0); while (!leased_workers_.empty()) { - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); leased_workers_.erase(leased_workers_.begin()); } AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, TestGrantOrReject) { +TEST_F(ClusterLeaseManagerTest, TestGrantOrReject) { std::shared_ptr worker1 = std::make_shared(WorkerID::FromRandom(), 1234); std::shared_ptr worker2 = @@ -1032,50 +1046,49 @@ TEST_F(ClusterTaskManagerTest, TestGrantOrReject) { auto remote_node_id = NodeID::FromRandom(); AddNode(remote_node_id, 8); - auto task1 = CreateTask({{ray::kCPU_ResourceLabel, 5}}); + auto lease1 = CreateLease({{ray::kCPU_ResourceLabel, 5}}); rpc::RequestWorkerLeaseReply local_reply; - task_manager_.QueueAndScheduleTask( - task1, /*grant_or_reject=*/false, false, &local_reply, callback); + lease_manager_.QueueAndScheduleLease( + lease1, /*grant_or_reject=*/false, false, &local_reply, callback); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 1); - // The first task was dispatched. + // The first lease was dispatched. ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); - auto task2 = CreateTask({{ray::kCPU_ResourceLabel, 1}}); + auto lease2 = CreateLease({{ray::kCPU_ResourceLabel, 1}}); rpc::RequestWorkerLeaseReply spillback_reply; - task_manager_.QueueAndScheduleTask( - task2, /*grant_or_reject=*/false, false, &spillback_reply, callback); + lease_manager_.QueueAndScheduleLease( + lease2, /*grant_or_reject=*/false, false, &spillback_reply, callback); pool_.TriggerCallbacks(); - // The second task was spilled. + // The second lease was spilled. 
ASSERT_EQ(num_callbacks, 2); - ASSERT_EQ(spillback_reply.retry_at_raylet_address().raylet_id(), - remote_node_id.Binary()); + ASSERT_EQ(spillback_reply.retry_at_raylet_address().node_id(), remote_node_id.Binary()); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); - auto task3 = CreateTask({{ray::kCPU_ResourceLabel, 1}}); - task_manager_.QueueAndScheduleTask( - task3, /*grant_or_reject=*/true, false, &local_reply, callback); + auto lease3 = CreateLease({{ray::kCPU_ResourceLabel, 1}}); + lease_manager_.QueueAndScheduleLease( + lease3, /*grant_or_reject=*/true, false, &local_reply, callback); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 3); - // The third task was dispatched. + // The third lease was dispatched. ASSERT_EQ(leased_workers_.size(), 2); ASSERT_EQ(pool_.workers.size(), 0); while (!leased_workers_.empty()) { - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); leased_workers_.erase(leased_workers_.begin()); } AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, TestSpillAfterAssigned) { +TEST_F(ClusterLeaseManagerTest, TestSpillAfterAssigned) { /* - Test the race condition in which a task is assigned to the local node, but - it cannot be run because a different task gets assigned the resources - first. The un-runnable task should eventually get spilled back to another + Test the race condition in which a lease is assigned to the local node, but + it cannot be run because a different lease gets assigned the resources + first. The un-runnable lease should eventually get spilled back to another node. */ std::shared_ptr worker = @@ -1089,60 +1102,59 @@ TEST_F(ClusterTaskManagerTest, TestSpillAfterAssigned) { }; /* Blocked on starting a worker. 
*/ - auto task = CreateTask({{ray::kCPU_ResourceLabel, 5}}); + auto lease = CreateLease({{ray::kCPU_ResourceLabel, 5}}); rpc::RequestWorkerLeaseReply local_reply; - task_manager_.QueueAndScheduleTask(task, false, false, &local_reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &local_reply, callback); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 0); ASSERT_EQ(leased_workers_.size(), 0); // Resources are no longer available for the second. - auto task2 = CreateTask({{ray::kCPU_ResourceLabel, 5}}); + auto lease2 = CreateLease({{ray::kCPU_ResourceLabel, 5}}); rpc::RequestWorkerLeaseReply reject_reply; - task_manager_.QueueAndScheduleTask( - task2, /*grant_or_reject=*/true, false, &reject_reply, callback); + lease_manager_.QueueAndScheduleLease( + lease2, /*grant_or_reject=*/true, false, &reject_reply, callback); pool_.TriggerCallbacks(); - // The second task was rejected. + // The second lease was rejected. ASSERT_EQ(num_callbacks, 1); ASSERT_TRUE(reject_reply.rejected()); ASSERT_EQ(leased_workers_.size(), 0); // Resources are no longer available for the third. - auto task3 = CreateTask({{ray::kCPU_ResourceLabel, 5}}); + auto lease3 = CreateLease({{ray::kCPU_ResourceLabel, 5}}); rpc::RequestWorkerLeaseReply spillback_reply; - task_manager_.QueueAndScheduleTask(task3, false, false, &spillback_reply, callback); + lease_manager_.QueueAndScheduleLease(lease3, false, false, &spillback_reply, callback); pool_.TriggerCallbacks(); - // The third task was spilled. + // The third lease was spilled. ASSERT_EQ(num_callbacks, 2); - ASSERT_EQ(spillback_reply.retry_at_raylet_address().raylet_id(), - remote_node_id.Binary()); + ASSERT_EQ(spillback_reply.retry_at_raylet_address().node_id(), remote_node_id.Binary()); ASSERT_EQ(leased_workers_.size(), 0); - // Two workers start. First task was dispatched now. + // Two workers start. First lease was dispatched now. 
pool_.PushWorker(std::static_pointer_cast(worker)); pool_.PushWorker(std::static_pointer_cast(worker)); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); - // Check that all tasks got removed from the queue. + // Check that all leases got removed from the queue. ASSERT_EQ(num_callbacks, 3); - // The first task was dispatched. + // The first lease was dispatched. ASSERT_EQ(leased_workers_.size(), 1); // Leave one alive worker. ASSERT_EQ(pool_.workers.size(), 1); - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task.GetTaskSpecification().TaskId()); + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); + ASSERT_EQ(finished_lease.GetLeaseSpecification().LeaseId(), + lease.GetLeaseSpecification().LeaseId()); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, TestIdleNode) { - RayTask task = CreateTask({{}}); +TEST_F(ClusterLeaseManagerTest, TestIdleNode) { + RayLease lease = CreateLease({{}}); rpc::RequestWorkerLeaseReply reply; bool callback_occurred = false; bool *callback_occurred_ptr = &callback_occurred; @@ -1151,7 +1163,7 @@ TEST_F(ClusterTaskManagerTest, TestIdleNode) { *callback_occurred_ptr = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_TRUE(scheduler_->GetLocalResourceManager().IsLocalNodeIdle()); ASSERT_FALSE(callback_occurred); @@ -1168,7 +1180,7 @@ TEST_F(ClusterTaskManagerTest, TestIdleNode) { ASSERT_EQ(node_info_calls_, 0); } -TEST_F(ClusterTaskManagerTest, NotOKPopWorkerAfterDrainingTest) { +TEST_F(ClusterLeaseManagerTest, NotOKPopWorkerAfterDrainingTest) { /* Test cases where the node is being drained after PopWorker is called and PopWorker fails. 
@@ -1184,8 +1196,8 @@ TEST_F(ClusterTaskManagerTest, NotOKPopWorkerAfterDrainingTest) { task_allocation); } - RayTask task1 = CreateTask({{ray::kCPU_ResourceLabel, 1}}); - RayTask task2 = CreateTask({{ray::kCPU_ResourceLabel, 1}}); + RayLease lease1 = CreateLease({{ray::kCPU_ResourceLabel, 1}}); + RayLease lease2 = CreateLease({{ray::kCPU_ResourceLabel, 1}}); rpc::RequestWorkerLeaseReply reply1; rpc::RequestWorkerLeaseReply reply2; bool callback_called = false; @@ -1194,8 +1206,8 @@ TEST_F(ClusterTaskManagerTest, NotOKPopWorkerAfterDrainingTest) { Status, std::function, std::function) { *callback_called_ptr = true; }; - task_manager_.QueueAndScheduleTask(task1, false, false, &reply1, callback); - task_manager_.QueueAndScheduleTask(task2, false, false, &reply2, callback); + lease_manager_.QueueAndScheduleLease(lease1, false, false, &reply1, callback); + lease_manager_.QueueAndScheduleLease(lease2, false, false, &reply2, callback); auto remote_node_id = NodeID::FromRandom(); AddNode(remote_node_id, 5); @@ -1205,20 +1217,20 @@ TEST_F(ClusterTaskManagerTest, NotOKPopWorkerAfterDrainingTest) { drain_request.set_deadline_timestamp_ms(std::numeric_limits::max()); scheduler_->GetLocalResourceManager().SetLocalNodeDraining(drain_request); - pool_.callbacks[task1.GetTaskSpecification().GetRuntimeEnvHash()].front()( + pool_.callbacks[lease1.GetLeaseSpecification().GetRuntimeEnvHash()].front()( nullptr, PopWorkerStatus::WorkerPendingRegistration, ""); - pool_.callbacks[task1.GetTaskSpecification().GetRuntimeEnvHash()].back()( + pool_.callbacks[lease1.GetLeaseSpecification().GetRuntimeEnvHash()].back()( nullptr, PopWorkerStatus::RuntimeEnvCreationFailed, "runtime env setup error"); pool_.callbacks.clear(); - task_manager_.ScheduleAndDispatchTasks(); - // task1 is spilled and task2 is cancelled. - ASSERT_EQ(reply1.retry_at_raylet_address().raylet_id(), remote_node_id.Binary()); + lease_manager_.ScheduleAndGrantLeases(); + // lease1 is spilled and lease2 is cancelled. 
+ ASSERT_EQ(reply1.retry_at_raylet_address().node_id(), remote_node_id.Binary()); ASSERT_TRUE(reply2.canceled()); ASSERT_EQ(reply2.scheduling_failure_message(), "runtime env setup error"); } -TEST_F(ClusterTaskManagerTest, NotOKPopWorkerTest) { - RayTask task1 = CreateTask({{ray::kCPU_ResourceLabel, 1}}); +TEST_F(ClusterLeaseManagerTest, NotOKPopWorkerTest) { + RayLease lease1 = CreateLease({{ray::kCPU_ResourceLabel, 1}}); rpc::RequestWorkerLeaseReply reply; bool callback_called = false; bool *callback_called_ptr = &callback_called; @@ -1226,61 +1238,61 @@ TEST_F(ClusterTaskManagerTest, NotOKPopWorkerTest) { Status, std::function, std::function) { *callback_called_ptr = true; }; - task_manager_.QueueAndScheduleTask(task1, false, false, &reply, callback); - ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING_FOR_WORKER), 1); - ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING), 0); - ASSERT_EQ(NumRunningTasks(), 1); + lease_manager_.QueueAndScheduleLease(lease1, false, false, &reply, callback); + ASSERT_EQ(NumLeasesToDispatchWithStatus(internal::WorkStatus::WAITING_FOR_WORKER), 1); + ASSERT_EQ(NumLeasesToDispatchWithStatus(internal::WorkStatus::WAITING), 0); + ASSERT_EQ(NumRunningLeases(), 1); pool_.TriggerCallbacksWithNotOKStatus(PopWorkerStatus::WorkerPendingRegistration); ASSERT_FALSE(callback_called); - ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING_FOR_WORKER), 0); - ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING), 1); - ASSERT_EQ(NumRunningTasks(), 0); - ASSERT_TRUE(task_manager_.CancelTask(task1.GetTaskSpecification().TaskId())); + ASSERT_EQ(NumLeasesToDispatchWithStatus(internal::WorkStatus::WAITING_FOR_WORKER), 0); + ASSERT_EQ(NumLeasesToDispatchWithStatus(internal::WorkStatus::WAITING), 1); + ASSERT_EQ(NumRunningLeases(), 0); + ASSERT_TRUE(lease_manager_.CancelLease(lease1.GetLeaseSpecification().LeaseId())); callback_called = false; reply.Clear(); - RayTask task2 = 
CreateTask({{ray::kCPU_ResourceLabel, 1}}); - task_manager_.QueueAndScheduleTask(task2, false, false, &reply, callback); - ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING_FOR_WORKER), 1); - ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING), 0); - ASSERT_EQ(NumRunningTasks(), 1); - // The task should be cancelled. + RayLease lease2 = CreateLease({{ray::kCPU_ResourceLabel, 1}}); + lease_manager_.QueueAndScheduleLease(lease2, false, false, &reply, callback); + ASSERT_EQ(NumLeasesToDispatchWithStatus(internal::WorkStatus::WAITING_FOR_WORKER), 1); + ASSERT_EQ(NumLeasesToDispatchWithStatus(internal::WorkStatus::WAITING), 0); + ASSERT_EQ(NumRunningLeases(), 1); + // The lease should be cancelled. const auto runtime_env_error_msg = "Runtime env error message"; pool_.TriggerCallbacksWithNotOKStatus(PopWorkerStatus::RuntimeEnvCreationFailed, runtime_env_error_msg); ASSERT_TRUE(callback_called); - ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING_FOR_WORKER), 0); - ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING), 0); - ASSERT_EQ(NumRunningTasks(), 0); + ASSERT_EQ(NumLeasesToDispatchWithStatus(internal::WorkStatus::WAITING_FOR_WORKER), 0); + ASSERT_EQ(NumLeasesToDispatchWithStatus(internal::WorkStatus::WAITING), 0); + ASSERT_EQ(NumRunningLeases(), 0); ASSERT_TRUE(reply.canceled()); ASSERT_EQ(reply.scheduling_failure_message(), runtime_env_error_msg); - // Test that local task manager handles PopWorkerStatus::JobFinished correctly. + // Test that local lease manager handles PopWorkerStatus::JobFinished correctly. 
callback_called = false; reply.Clear(); - RayTask task3 = CreateTask({{ray::kCPU_ResourceLabel, 1}}); - task_manager_.QueueAndScheduleTask(task3, false, false, &reply, callback); - ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING_FOR_WORKER), 1); - ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING), 0); - ASSERT_EQ(NumRunningTasks(), 1); + RayLease lease3 = CreateLease({{ray::kCPU_ResourceLabel, 1}}); + lease_manager_.QueueAndScheduleLease(lease3, false, false, &reply, callback); + ASSERT_EQ(NumLeasesToDispatchWithStatus(internal::WorkStatus::WAITING_FOR_WORKER), 1); + ASSERT_EQ(NumLeasesToDispatchWithStatus(internal::WorkStatus::WAITING), 0); + ASSERT_EQ(NumRunningLeases(), 1); pool_.TriggerCallbacksWithNotOKStatus(PopWorkerStatus::JobFinished); - // The task should be removed from the dispatch queue. + // The lease should be removed from the leases_to_grant queue. ASSERT_FALSE(callback_called); - ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING_FOR_WORKER), 0); - ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING), 0); - ASSERT_EQ(NumRunningTasks(), 0); + ASSERT_EQ(NumLeasesToDispatchWithStatus(internal::WorkStatus::WAITING_FOR_WORKER), 0); + ASSERT_EQ(NumLeasesToDispatchWithStatus(internal::WorkStatus::WAITING), 0); + ASSERT_EQ(NumRunningLeases(), 0); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, TaskUnschedulableTest) { - TaskSpecification task_spec = - CreateTask({{ray::kCPU_ResourceLabel, 1}}).GetTaskSpecification(); - task_spec.GetMutableMessage() +TEST_F(ClusterLeaseManagerTest, TaskUnschedulableTest) { + LeaseSpecification lease_spec = + CreateLease({{ray::kCPU_ResourceLabel, 1}}).GetLeaseSpecification(); + lease_spec.GetMutableMessage() .mutable_scheduling_strategy() ->mutable_node_affinity_scheduling_strategy() ->set_node_id(NodeID::FromRandom().Binary()); - task_spec.GetMutableMessage() + lease_spec.GetMutableMessage() .mutable_scheduling_strategy() 
->mutable_node_affinity_scheduling_strategy() ->set_soft(false); @@ -1293,7 +1305,8 @@ TEST_F(ClusterTaskManagerTest, TaskUnschedulableTest) { *callback_called_ptr = true; }; - task_manager_.QueueAndScheduleTask(RayTask(task_spec), false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease( + RayLease(lease_spec), false, false, &reply, callback); ASSERT_TRUE(callback_called); ASSERT_TRUE(reply.canceled()); ASSERT_EQ(reply.failure_type(), @@ -1302,10 +1315,10 @@ TEST_F(ClusterTaskManagerTest, TaskUnschedulableTest) { AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, TaskCancellationTest) { +TEST_F(ClusterLeaseManagerTest, TaskCancellationTest) { std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); - RayTask task1 = CreateTask({{ray::kCPU_ResourceLabel, 1}}); + RayLease lease1 = CreateLease({{ray::kCPU_ResourceLabel, 1}}); rpc::RequestWorkerLeaseReply reply; bool callback_called = false; @@ -1315,52 +1328,52 @@ TEST_F(ClusterTaskManagerTest, TaskCancellationTest) { *callback_called_ptr = true; }; - // Task1 not queued so we can't cancel it. - ASSERT_FALSE(task_manager_.CancelTask(task1.GetTaskSpecification().TaskId())); + // Lease1 not queued so we can't cancel it. + ASSERT_FALSE(lease_manager_.CancelLease(lease1.GetLeaseSpecification().LeaseId())); - task_manager_.QueueAndScheduleTask(task1, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease1, false, false, &reply, callback); pool_.TriggerCallbacks(); - // Task1 is now in dispatch queue. + // Lease1 is now in dispatch queue. callback_called = false; reply.Clear(); - ASSERT_TRUE(task_manager_.CancelTask(task1.GetTaskSpecification().TaskId())); + ASSERT_TRUE(lease_manager_.CancelLease(lease1.GetLeaseSpecification().LeaseId())); pool_.PushWorker(std::static_pointer_cast(worker)); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); - // Task1 will not execute. + // Lease1 will not be granted. 
ASSERT_TRUE(callback_called); ASSERT_TRUE(reply.canceled()); ASSERT_EQ(leased_workers_.size(), 0); - RayTask task2 = CreateTask({{ray::kCPU_ResourceLabel, 1}}); - task_manager_.QueueAndScheduleTask(task2, false, false, &reply, callback); + RayLease lease2 = CreateLease({{ray::kCPU_ResourceLabel, 1}}); + lease_manager_.QueueAndScheduleLease(lease2, false, false, &reply, callback); pool_.TriggerCallbacks(); - // Task2 is now running so we can't cancel it. + // Lease2 is now granted so we can't cancel it. callback_called = false; reply.Clear(); - ASSERT_FALSE(task_manager_.CancelTask(task2.GetTaskSpecification().TaskId())); + ASSERT_FALSE(lease_manager_.CancelLease(lease2.GetLeaseSpecification().LeaseId())); ASSERT_FALSE(reply.canceled()); ASSERT_FALSE(callback_called); ASSERT_EQ(pool_.workers.size(), 0); ASSERT_EQ(leased_workers_.size(), 1); - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task2.GetTaskSpecification().TaskId()); + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); + ASSERT_EQ(finished_lease.GetLeaseSpecification().LeaseId(), + lease2.GetLeaseSpecification().LeaseId()); - RayTask task3 = CreateTask({{ray::kCPU_ResourceLabel, 2}}); + RayLease lease3 = CreateLease({{ray::kCPU_ResourceLabel, 2}}); rpc::RequestWorkerLeaseReply reply3; - RayTask task4 = CreateTask({{ray::kCPU_ResourceLabel, 200}}); + RayLease lease4 = CreateLease({{ray::kCPU_ResourceLabel, 200}}); rpc::RequestWorkerLeaseReply reply4; - // Task 3 should be popping worker - task_manager_.QueueAndScheduleTask(task3, false, false, &reply3, callback); - // Task 4 is infeasible - task_manager_.QueueAndScheduleTask(task4, false, false, &reply4, callback); + // Lease 3 should be popping worker + lease_manager_.QueueAndScheduleLease(lease3, false, false, &reply3, callback); + // Lease 4 is infeasible + 
lease_manager_.QueueAndScheduleLease(lease4, false, false, &reply4, callback); pool_.TriggerCallbacks(); - ASSERT_TRUE(task_manager_.CancelTasks( + ASSERT_TRUE(lease_manager_.CancelLeases( [](const std::shared_ptr &work) { return true; }, rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_INTENDED, "")); @@ -1370,13 +1383,13 @@ TEST_F(ClusterTaskManagerTest, TaskCancellationTest) { AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, TaskCancelInfeasibleTask) { - /* Make sure cancelTask works for infeasible tasks */ +TEST_F(ClusterLeaseManagerTest, TaskCancelInfeasibleTask) { + /* Make sure cancelLease works for infeasible leases */ std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); pool_.PushWorker(std::static_pointer_cast(worker)); - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 12}}); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 12}}); rpc::RequestWorkerLeaseReply reply; bool callback_called = false; @@ -1386,24 +1399,24 @@ TEST_F(ClusterTaskManagerTest, TaskCancelInfeasibleTask) { *callback_called_ptr = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); - // RayTask is now queued so cancellation works. - ASSERT_TRUE(task_manager_.CancelTask(task.GetTaskSpecification().TaskId())); - task_manager_.ScheduleAndDispatchTasks(); + // RayLease is now queued so cancellation works. + ASSERT_TRUE(lease_manager_.CancelLease(lease.GetLeaseSpecification().LeaseId())); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); - // Task will not execute. + // Lease will not be granted. 
ASSERT_TRUE(callback_called); ASSERT_TRUE(reply.canceled()); ASSERT_EQ(leased_workers_.size(), 0); ASSERT_EQ(pool_.workers.size(), 1); - // Although the feasible node is added, task shouldn't be executed because it is + // Although the feasible node is added, lease shouldn't be granted because it is // cancelled. auto remote_node_id = NodeID::FromRandom(); AddNode(remote_node_id, 12); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); ASSERT_TRUE(callback_called); ASSERT_TRUE(reply.canceled()); @@ -1412,13 +1425,13 @@ TEST_F(ClusterTaskManagerTest, TaskCancelInfeasibleTask) { AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, TaskCancelWithResourceShape) { - // task1 doesn't match the resource shape so shouldn't be cancelled - // task2 matches the resource shape and should be cancelled +TEST_F(ClusterLeaseManagerTest, TaskCancelWithResourceShape) { + // lease1 doesn't match the resource shape so shouldn't be cancelled + // lease2 matches the resource shape and should be cancelled std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); - RayTask task1 = CreateTask({{ray::kCPU_ResourceLabel, 1}}); - RayTask task2 = CreateTask({{ray::kCPU_ResourceLabel, 10}}); + RayLease lease1 = CreateLease({{ray::kCPU_ResourceLabel, 1}}); + RayLease lease2 = CreateLease({{ray::kCPU_ResourceLabel, 10}}); absl::flat_hash_map resource_shape_1 = { {ray::kCPU_ResourceLabel, 10}}; absl::flat_hash_map resource_shape_2 = { @@ -1441,42 +1454,42 @@ TEST_F(ClusterTaskManagerTest, TaskCancelWithResourceShape) { *callback_called_ptr_2 = true; }; - task_manager_.QueueAndScheduleTask(task1, false, false, &reply1, callback1); + lease_manager_.QueueAndScheduleLease(lease1, false, false, &reply1, callback1); pool_.TriggerCallbacks(); - task_manager_.QueueAndScheduleTask(task2, false, false, &reply2, callback2); + lease_manager_.QueueAndScheduleLease(lease2, false, false, &reply2, callback2); pool_.TriggerCallbacks(); 
callback_called_1 = false; callback_called_2 = false; reply1.Clear(); reply2.Clear(); - ASSERT_TRUE(task_manager_.CancelTasksWithResourceShapes(target_resource_shapes)); + ASSERT_TRUE(lease_manager_.CancelLeasesWithResourceShapes(target_resource_shapes)); ASSERT_FALSE(reply1.canceled()); ASSERT_FALSE(callback_called_1); ASSERT_TRUE(reply2.canceled()); ASSERT_TRUE(callback_called_2); pool_.PushWorker(std::static_pointer_cast(worker)); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); ASSERT_EQ(pool_.workers.size(), 0); ASSERT_EQ(leased_workers_.size(), 1); - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task1.GetTaskSpecification().TaskId()); + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); + ASSERT_EQ(finished_lease.GetLeaseSpecification().LeaseId(), + lease1.GetLeaseSpecification().LeaseId()); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, HeartbeatTest) { +TEST_F(ClusterLeaseManagerTest, HeartbeatTest) { std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); pool_.PushWorker(std::static_pointer_cast(worker)); { - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 1}}); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 1}}); rpc::RequestWorkerLeaseReply reply; bool callback_called = false; @@ -1486,14 +1499,14 @@ TEST_F(ClusterTaskManagerTest, HeartbeatTest) { *callback_called_ptr = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_TRUE(callback_called); // Now {CPU: 7, GPU: 4, MEM:128} } { - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 1}}); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 1}}); 
rpc::RequestWorkerLeaseReply reply; bool callback_called = false; @@ -1503,15 +1516,15 @@ TEST_F(ClusterTaskManagerTest, HeartbeatTest) { *callback_called_ptr = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_FALSE(callback_called); // No worker available. - // Now {CPU: 7, GPU: 4, MEM:128} with 1 queued task. + // Now {CPU: 7, GPU: 4, MEM:128} with 1 queued lease. } { - RayTask task = - CreateTask({{ray::kCPU_ResourceLabel, 9}, {ray::kGPU_ResourceLabel, 5}}); + RayLease lease = + CreateLease({{ray::kCPU_ResourceLabel, 9}, {ray::kGPU_ResourceLabel, 5}}); rpc::RequestWorkerLeaseReply reply; bool callback_called = false; @@ -1521,15 +1534,15 @@ TEST_F(ClusterTaskManagerTest, HeartbeatTest) { *callback_called_ptr = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_FALSE(callback_called); // Infeasible. - // Now there is also an infeasible task {CPU: 9}. + // Now there is also an infeasible lease {CPU: 9}. } { - RayTask task = - CreateTask({{ray::kCPU_ResourceLabel, 10}, {ray::kGPU_ResourceLabel, 1}}); + RayLease lease = + CreateLease({{ray::kCPU_ResourceLabel, 10}, {ray::kGPU_ResourceLabel, 1}}); rpc::RequestWorkerLeaseReply reply; bool callback_called = false; @@ -1539,15 +1552,15 @@ TEST_F(ClusterTaskManagerTest, HeartbeatTest) { *callback_called_ptr = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_FALSE(callback_called); // Infeasible. - // Now there is also an infeasible task {CPU: 10}. + // Now there is also an infeasible lease {CPU: 10}. 
} { rpc::ResourcesData data; - task_manager_.FillResourceUsage(data); + lease_manager_.FillResourceUsage(data); auto load_by_shape = data.mutable_resource_load_by_shape()->mutable_resource_demands(); @@ -1589,7 +1602,7 @@ TEST_F(ClusterTaskManagerTest, HeartbeatTest) { } } -TEST_F(ClusterTaskManagerTest, ResourceReportForNodeAffinitySchedulingStrategyTasks) { +TEST_F(ClusterLeaseManagerTest, ResourceReportForNodeAffinitySchedulingStrategyTasks) { rpc::RequestWorkerLeaseReply reply; bool callback_occurred = false; bool *callback_occurred_ptr = &callback_occurred; @@ -1598,39 +1611,39 @@ TEST_F(ClusterTaskManagerTest, ResourceReportForNodeAffinitySchedulingStrategyTa *callback_occurred_ptr = true; }; - // Feasible strict task won't be reported. + // Feasible strict lease won't be reported. rpc::SchedulingStrategy scheduling_strategy; scheduling_strategy.mutable_node_affinity_scheduling_strategy()->set_node_id( id_.Binary()); scheduling_strategy.mutable_node_affinity_scheduling_strategy()->set_soft(false); - RayTask task1 = - CreateTask({{ray::kCPU_ResourceLabel, 1}}, 0, {}, nullptr, scheduling_strategy); - task_manager_.QueueAndScheduleTask(task1, false, false, &reply, callback); + RayLease lease1 = + CreateLease({{ray::kCPU_ResourceLabel, 1}}, 0, {}, nullptr, scheduling_strategy); + lease_manager_.QueueAndScheduleLease(lease1, false, false, &reply, callback); - // Feasible soft task won't be reported. + // Feasible soft lease won't be reported. 
scheduling_strategy.mutable_node_affinity_scheduling_strategy()->set_node_id( id_.Binary()); scheduling_strategy.mutable_node_affinity_scheduling_strategy()->set_soft(true); - RayTask task2 = - CreateTask({{ray::kCPU_ResourceLabel, 2}}, 0, {}, nullptr, scheduling_strategy); - task_manager_.QueueAndScheduleTask(task2, false, false, &reply, callback); + RayLease task2 = + CreateLease({{ray::kCPU_ResourceLabel, 2}}, 0, {}, nullptr, scheduling_strategy); + lease_manager_.QueueAndScheduleLease(task2, false, false, &reply, callback); - // Infeasible soft task will be reported. + // Infeasible soft lease will be reported. scheduling_strategy.mutable_node_affinity_scheduling_strategy()->set_node_id( id_.Binary()); scheduling_strategy.mutable_node_affinity_scheduling_strategy()->set_soft(true); - RayTask task3 = - CreateTask({{ray::kGPU_ResourceLabel, 1}}, 0, {}, nullptr, scheduling_strategy); - task_manager_.QueueAndScheduleTask(task3, false, false, &reply, callback); + RayLease task3 = + CreateLease({{ray::kGPU_ResourceLabel, 1}}, 0, {}, nullptr, scheduling_strategy); + lease_manager_.QueueAndScheduleLease(task3, false, false, &reply, callback); ASSERT_FALSE(callback_occurred); - // Infeasible strict task won't be reported (will fail immediately). + // Infeasible strict lease won't be reported (will fail immediately). 
scheduling_strategy.mutable_node_affinity_scheduling_strategy()->set_node_id( id_.Binary()); scheduling_strategy.mutable_node_affinity_scheduling_strategy()->set_soft(false); - RayTask task4 = - CreateTask({{ray::kGPU_ResourceLabel, 2}}, 0, {}, nullptr, scheduling_strategy); - task_manager_.QueueAndScheduleTask(task4, false, false, &reply, callback); + RayLease task4 = + CreateLease({{ray::kGPU_ResourceLabel, 2}}, 0, {}, nullptr, scheduling_strategy); + lease_manager_.QueueAndScheduleLease(task4, false, false, &reply, callback); ASSERT_TRUE(callback_occurred); ASSERT_TRUE(reply.canceled()); ASSERT_EQ(reply.failure_type(), @@ -1640,7 +1653,7 @@ TEST_F(ClusterTaskManagerTest, ResourceReportForNodeAffinitySchedulingStrategyTa ASSERT_EQ(pool_.workers.size(), 0); rpc::ResourcesData data; - task_manager_.FillResourceUsage(data); + lease_manager_.FillResourceUsage(data); auto resource_load_by_shape = data.resource_load_by_shape(); ASSERT_EQ(resource_load_by_shape.resource_demands().size(), 1); auto demand = resource_load_by_shape.resource_demands()[0]; @@ -1649,7 +1662,7 @@ TEST_F(ClusterTaskManagerTest, ResourceReportForNodeAffinitySchedulingStrategyTa ASSERT_EQ(demand.shape().at("GPU"), 1); } -TEST_F(ClusterTaskManagerTest, BacklogReportTest) { +TEST_F(ClusterLeaseManagerTest, BacklogReportTest) { /* Test basic scheduler functionality: 1. 
Queue and attempt to schedule/dispatch a test with no workers available @@ -1663,18 +1676,18 @@ TEST_F(ClusterTaskManagerTest, BacklogReportTest) { *callback_occurred_ptr = true; }; - std::vector to_cancel; + std::vector to_cancel; std::vector worker_ids; for (int i = 0; i < 10; i++) { - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 8}}); - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 8}}); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); worker_ids.push_back(WorkerID::FromRandom()); - local_task_manager_->SetWorkerBacklog( - task.GetTaskSpecification().GetSchedulingClass(), worker_ids.back(), 10 - i); + local_lease_manager_->SetWorkerBacklog( + lease.GetLeaseSpecification().GetSchedulingClass(), worker_ids.back(), 10 - i); pool_.TriggerCallbacks(); - // Don't add the fist task to `to_cancel`. + // Don't add the first lease to `to_cancel`. if (i != 0) { - to_cancel.push_back(task.GetTaskSpecification().TaskId()); + to_cancel.push_back(lease.GetLeaseSpecification().LeaseId()); } } @@ -1683,9 +1696,9 @@ TEST_F(ClusterTaskManagerTest, BacklogReportTest) { ASSERT_EQ(pool_.workers.size(), 0); ASSERT_EQ(node_info_calls_, 0); - { // 1 task has resources allocated, while remaining 9 is stuck. + { // 1 lease has resources allocated, while remaining 9 are stuck. rpc::ResourcesData data; - task_manager_.FillResourceUsage(data); + lease_manager_.FillResourceUsage(data); auto resource_load_by_shape = data.resource_load_by_shape(); auto shape1 = resource_load_by_shape.resource_demands()[0]; @@ -1694,17 +1707,17 @@ TEST_F(ClusterTaskManagerTest, BacklogReportTest) { ASSERT_EQ(shape1.num_ready_requests_queued(), 9); } - // Push a worker so the first task can run. + // Push a worker so the first lease can be granted. 
std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); pool_.PushWorker(worker); - task_manager_.ScheduleAndDispatchTasks(); - local_task_manager_->ClearWorkerBacklog(worker_ids[0]); + lease_manager_.ScheduleAndGrantLeases(); + local_lease_manager_->ClearWorkerBacklog(worker_ids[0]); pool_.TriggerCallbacks(); { rpc::ResourcesData data; - task_manager_.FillResourceUsage(data); + lease_manager_.FillResourceUsage(data); auto resource_load_by_shape = data.resource_load_by_shape(); auto shape1 = resource_load_by_shape.resource_demands()[0]; @@ -1715,32 +1728,33 @@ TEST_F(ClusterTaskManagerTest, BacklogReportTest) { } // Cancel the rest. - for (auto &task_id : to_cancel) { - ASSERT_TRUE(task_manager_.CancelTask(task_id)); + for (auto &lease_id : to_cancel) { + ASSERT_TRUE(lease_manager_.CancelLease(lease_id)); } for (size_t i = 1; i < worker_ids.size(); ++i) { - local_task_manager_->ClearWorkerBacklog(worker_ids[i]); + local_lease_manager_->ClearWorkerBacklog(worker_ids[i]); } { rpc::ResourcesData data; - task_manager_.FillResourceUsage(data); + lease_manager_.FillResourceUsage(data); auto resource_load_by_shape = data.resource_load_by_shape(); ASSERT_EQ(resource_load_by_shape.resource_demands().size(), 0); while (!leased_workers_.empty()) { - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, + &finished_lease); leased_workers_.erase(leased_workers_.begin()); } AssertNoLeaks(); } } -TEST_F(ClusterTaskManagerTest, OwnerDeadTest) { - // Test the case when the task owner (worker or node) dies, the task is cancelled. - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 4}}); +TEST_F(ClusterLeaseManagerTest, OwnerDeadTest) { + // Test the case when the lease owner (worker or node) dies, the lease is cancelled. 
+ RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 4}}); rpc::RequestWorkerLeaseReply reply; bool callback_occurred = false; bool *callback_occurred_ptr = &callback_occurred; @@ -1749,156 +1763,156 @@ TEST_F(ClusterTaskManagerTest, OwnerDeadTest) { *callback_occurred_ptr = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_FALSE(callback_occurred); - task_manager_.CancelAllTasksOwnedBy(task.GetTaskSpecification().CallerWorkerId()); + lease_manager_.CancelAllLeasesOwnedBy(lease.GetLeaseSpecification().CallerWorkerId()); AssertNoLeaks(); callback_occurred = false; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_FALSE(callback_occurred); - task_manager_.CancelAllTasksOwnedBy(task.GetTaskSpecification().CallerNodeId()); + lease_manager_.CancelAllLeasesOwnedBy(lease.GetLeaseSpecification().CallerNodeId()); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, TestInfeasibleTaskWarning) { +TEST_F(ClusterLeaseManagerTest, TestInfeasibleLeaseWarning) { /* - Test if infeasible tasks warnings are printed. + Test if infeasible leases warnings are printed. */ - // Create an infeasible task. - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 12}}); + // Create an infeasible lease. 
+ RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 12}}); rpc::RequestWorkerLeaseReply reply; std::shared_ptr callback_occurred = std::make_shared(false); auto callback = [callback_occurred]( Status, std::function, std::function) { *callback_occurred = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); - ASSERT_EQ(announce_infeasible_task_calls_, 1); + ASSERT_EQ(announce_infeasible_lease_calls_, 1); - // Infeasible warning shouldn't be reprinted when the previous task is still infeasible + // Infeasible warning shouldn't be reprinted when the previous lease is still infeasible // after adding a new node. AddNode(NodeID::FromRandom(), 8); std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); pool_.PushWorker(std::static_pointer_cast(worker)); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); - // Task shouldn't be scheduled yet. - ASSERT_EQ(announce_infeasible_task_calls_, 1); + // Lease shouldn't be scheduled yet. + ASSERT_EQ(announce_infeasible_lease_calls_, 1); ASSERT_FALSE(*callback_occurred); ASSERT_EQ(leased_workers_.size(), 0); ASSERT_EQ(pool_.workers.size(), 1); - // Now we have a node that is feasible to schedule the task. Make sure the infeasible - // task is spillbacked properly. + // Now we have a node that is feasible to schedule the lease. Make sure the infeasible + // lease is spillbacked properly. auto remote_node_id = NodeID::FromRandom(); AddNode(remote_node_id, 12); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); // Make sure nothing happens locally. 
- ASSERT_EQ(announce_infeasible_task_calls_, 1); + ASSERT_EQ(announce_infeasible_lease_calls_, 1); ASSERT_TRUE(*callback_occurred); ASSERT_EQ(leased_workers_.size(), 0); ASSERT_EQ(pool_.workers.size(), 1); // Make sure the spillback callback is called. - ASSERT_EQ(reply.retry_at_raylet_address().raylet_id(), remote_node_id.Binary()); + ASSERT_EQ(reply.retry_at_raylet_address().node_id(), remote_node_id.Binary()); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, TestMultipleInfeasibleTasksWarnOnce) { +TEST_F(ClusterLeaseManagerTest, TestMultipleInfeasibleLeasesWarnOnce) { /* Test infeasible warning is printed only once when the same shape is queued again. */ - // Make sure the first infeasible task announces warning. - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 12}}); + // Make sure the first infeasible lease announces warning. + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 12}}); rpc::RequestWorkerLeaseReply reply; std::shared_ptr callback_occurred = std::make_shared(false); auto callback = [callback_occurred]( Status, std::function, std::function) { *callback_occurred = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); - ASSERT_EQ(announce_infeasible_task_calls_, 1); + ASSERT_EQ(announce_infeasible_lease_calls_, 1); - // Make sure the same shape infeasible task won't be announced. - RayTask task2 = CreateTask({{ray::kCPU_ResourceLabel, 12}}); + // Make sure the same shape infeasible lease won't be announced. 
+ RayLease lease2 = CreateLease({{ray::kCPU_ResourceLabel, 12}}); rpc::RequestWorkerLeaseReply reply2; std::shared_ptr callback_occurred2 = std::make_shared(false); auto callback2 = [callback_occurred2]( Status, std::function, std::function) { *callback_occurred2 = true; }; - task_manager_.QueueAndScheduleTask(task2, false, false, &reply2, callback2); + lease_manager_.QueueAndScheduleLease(lease2, false, false, &reply2, callback2); pool_.TriggerCallbacks(); - ASSERT_EQ(announce_infeasible_task_calls_, 1); + ASSERT_EQ(announce_infeasible_lease_calls_, 1); } -TEST_F(ClusterTaskManagerTest, TestAnyPendingTasksForResourceAcquisition) { +TEST_F(ClusterLeaseManagerTest, TestAnyPendingLeasesForResourceAcquisition) { /* - Check if the manager can correctly identify pending tasks. + Check if the manager can correctly identify pending leases. */ std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); pool_.PushWorker(std::static_pointer_cast(worker)); - // task1: running - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 6}}); + // lease1: running. + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 6}}); rpc::RequestWorkerLeaseReply reply; std::shared_ptr callback_occurred = std::make_shared(false); auto callback = [callback_occurred]( Status, std::function, std::function) { *callback_occurred = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_TRUE(*callback_occurred); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 0); - // task1: running. Progress is made, and there's no deadlock. - int pending_actor_creations = 0; - int pending_tasks = 0; - ASSERT_EQ(task_manager_.AnyPendingTasksForResourceAcquisition(&pending_actor_creations, - &pending_tasks), + // lease1: running. Progress is made, and there's no deadlock. 
+ int pending_lease_creations = 0; + int pending_leases = 0; + ASSERT_EQ(lease_manager_.AnyPendingLeasesForResourceAcquisition( + &pending_lease_creations, &pending_leases), nullptr); - ASSERT_EQ(pending_actor_creations, 0); - ASSERT_EQ(pending_tasks, 0); + ASSERT_EQ(pending_lease_creations, 0); + ASSERT_EQ(pending_leases, 0); - // task1: running, task2: queued. - RayTask task2 = CreateTask({{ray::kCPU_ResourceLabel, 6}}); + // lease1: running, lease2: queued. + RayLease lease2 = CreateLease({{ray::kCPU_ResourceLabel, 6}}); rpc::RequestWorkerLeaseReply reply2; std::shared_ptr callback_occurred2 = std::make_shared(false); auto callback2 = [callback_occurred2]( Status, std::function, std::function) { *callback_occurred2 = true; }; - task_manager_.QueueAndScheduleTask(task2, false, false, &reply2, callback2); + lease_manager_.QueueAndScheduleLease(lease2, false, false, &reply2, callback2); pool_.TriggerCallbacks(); ASSERT_FALSE(*callback_occurred2); - auto pending_task = task_manager_.AnyPendingTasksForResourceAcquisition( - &pending_actor_creations, &pending_tasks); - ASSERT_EQ(pending_task->GetTaskSpecification().TaskId(), - task2.GetTaskSpecification().TaskId()); - ASSERT_EQ(pending_actor_creations, 0); - ASSERT_EQ(pending_tasks, 1); + auto pending_lease = lease_manager_.AnyPendingLeasesForResourceAcquisition( + &pending_lease_creations, &pending_leases); + ASSERT_EQ(pending_lease->GetLeaseSpecification().LeaseId(), + lease2.GetLeaseSpecification().LeaseId()); + ASSERT_EQ(pending_lease_creations, 0); + ASSERT_EQ(pending_leases, 1); } -TEST_F(ClusterTaskManagerTest, ArgumentEvicted) { +TEST_F(ClusterLeaseManagerTest, ArgumentEvicted) { /* - Test the task's dependencies becoming local, then one of the arguments is - evicted. The task should go from waiting -> dispatch -> waiting. + Test the lease's dependencies becoming local, then one of the arguments is + evicted. The lease should go from waiting -> dispatch -> waiting. 
*/ std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); @@ -1913,52 +1927,52 @@ TEST_F(ClusterTaskManagerTest, ArgumentEvicted) { }; /* Blocked on dependencies */ - auto task = CreateTask({{ray::kCPU_ResourceLabel, 5}}, 2); - auto missing_arg = task.GetTaskSpecification().GetDependencyIds()[0]; + auto lease = CreateLease({{ray::kCPU_ResourceLabel, 5}}, 2); + auto missing_arg = lease.GetLeaseSpecification().GetDependencyIds()[0]; missing_objects_.insert(missing_arg); - std::unordered_set expected_subscribed_tasks = { - task.GetTaskSpecification().TaskId()}; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + std::unordered_set expected_subscribed_leases = { + lease.GetLeaseSpecification().LeaseId()}; + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); - ASSERT_EQ(dependency_manager_.subscribed_tasks, expected_subscribed_tasks); + ASSERT_EQ(lease_dependency_manager_.subscribed_leases, expected_subscribed_leases); ASSERT_EQ(num_callbacks, 0); ASSERT_EQ(leased_workers_.size(), 0); - /* RayTask is unblocked now */ + /* RayLease is unblocked now */ missing_objects_.erase(missing_arg); pool_.workers.clear(); - auto id = task.GetTaskSpecification().TaskId(); - local_task_manager_->TasksUnblocked({id}); - ASSERT_EQ(dependency_manager_.subscribed_tasks, expected_subscribed_tasks); + auto id = lease.GetLeaseSpecification().LeaseId(); + local_lease_manager_->LeasesUnblocked({id}); + ASSERT_EQ(lease_dependency_manager_.subscribed_leases, expected_subscribed_leases); ASSERT_EQ(num_callbacks, 0); ASSERT_EQ(leased_workers_.size(), 0); /* Worker available and arguments available */ pool_.PushWorker(std::static_pointer_cast(worker)); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); - RayTask finished_task; - 
local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task.GetTaskSpecification().TaskId()); + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); + ASSERT_EQ(finished_lease.GetLeaseSpecification().LeaseId(), + lease.GetLeaseSpecification().LeaseId()); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, FeasibleToNonFeasible) { - // Test the case, when resources changes in local node, the feasible task should - // able to transfer to infeasible task +TEST_F(ClusterLeaseManagerTest, FeasibleToNonFeasible) { + // Test the case, when resources changes in local node, the feasible lease should + // able to transfer to infeasible lease std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); pool_.PushWorker(std::static_pointer_cast(worker)); - RayTask task1 = CreateTask({{ray::kCPU_ResourceLabel, 4}}); + RayLease lease1 = CreateLease({{ray::kCPU_ResourceLabel, 4}}); rpc::RequestWorkerLeaseReply reply1; bool callback_occurred1 = false; - task_manager_.QueueAndScheduleTask( - task1, + lease_manager_.QueueAndScheduleLease( + lease1, false, false, &reply1, @@ -1969,20 +1983,20 @@ TEST_F(ClusterTaskManagerTest, FeasibleToNonFeasible) { ASSERT_EQ(leased_workers_.size(), 1); ASSERT_TRUE(callback_occurred1); ASSERT_EQ(pool_.workers.size(), 0); - ASSERT_EQ(task_manager_.tasks_to_schedule_.size(), 0); - ASSERT_EQ(local_task_manager_->tasks_to_dispatch_.size(), 0); - ASSERT_EQ(task_manager_.infeasible_tasks_.size(), 0); + ASSERT_EQ(lease_manager_.leases_to_schedule_.size(), 0); + ASSERT_EQ(local_lease_manager_->leases_to_grant_.size(), 0); + ASSERT_EQ(lease_manager_.infeasible_leases_.size(), 0); - // Delete cpu resource of local node, then task 2 should be turned into + // Delete cpu resource of local node, then lease 2 should be turned into // infeasible. 
scheduler_->GetLocalResourceManager().DeleteLocalResource( scheduling::ResourceID(ray::kCPU_ResourceLabel)); - RayTask task2 = CreateTask({{ray::kCPU_ResourceLabel, 4}}); + RayLease lease2 = CreateLease({{ray::kCPU_ResourceLabel, 4}}); rpc::RequestWorkerLeaseReply reply2; bool callback_occurred2 = false; - task_manager_.QueueAndScheduleTask( - task2, + lease_manager_.QueueAndScheduleLease( + lease2, false, false, &reply2, @@ -1993,17 +2007,18 @@ TEST_F(ClusterTaskManagerTest, FeasibleToNonFeasible) { ASSERT_EQ(leased_workers_.size(), 1); ASSERT_FALSE(callback_occurred2); ASSERT_EQ(pool_.workers.size(), 0); - ASSERT_EQ(task_manager_.tasks_to_schedule_.size(), 0); - ASSERT_EQ(local_task_manager_->tasks_to_dispatch_.size(), 0); - ASSERT_EQ(task_manager_.infeasible_tasks_.size(), 1); - - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task1.GetTaskSpecification().TaskId()); + ASSERT_EQ(lease_manager_.leases_to_schedule_.size(), 0); + ASSERT_EQ(local_lease_manager_->leases_to_grant_.size(), 0); + ASSERT_EQ(local_lease_manager_->waiting_lease_queue_.size(), 0); + ASSERT_EQ(lease_manager_.infeasible_leases_.size(), 1); + + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); + ASSERT_EQ(finished_lease.GetLeaseSpecification().LeaseId(), + lease1.GetLeaseSpecification().LeaseId()); } -TEST_F(ClusterTaskManagerTest, NegativePlacementGroupCpuResources) { +TEST_F(ClusterLeaseManagerTest, NegativePlacementGroupCpuResources) { // Add PG CPU resources. 
scheduler_->GetLocalResourceManager().AddLocalResourceInstances( scheduling::ResourceID("CPU_group_aaa"), std::vector{FixedPoint(2)}); @@ -2022,7 +2037,7 @@ TEST_F(ClusterTaskManagerTest, NegativePlacementGroupCpuResources) { {{"CPU_group_aaa", 1.}, {"CPU_group_0_aaa", 1.}}, allocated_instances)); worker1->SetAllocatedInstances(allocated_instances); // worker1 calls ray.get() and release the CPU resource - ASSERT_TRUE(local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker1)); + ASSERT_TRUE(local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker1)); // the released CPU resource is acquired by worker2 auto worker2 = std::make_shared(WorkerID::FromRandom(), 5678); @@ -2032,7 +2047,7 @@ TEST_F(ClusterTaskManagerTest, NegativePlacementGroupCpuResources) { worker2->SetAllocatedInstances(allocated_instances); // ray.get() returns and worker1 acquires the CPU resource again - ASSERT_TRUE(local_task_manager_->ReturnCpuResourcesToUnblockedWorker(worker1)); + ASSERT_TRUE(local_lease_manager_->ReturnCpuResourcesToUnblockedWorker(worker1)); ASSERT_EQ(node_resources.available.Get(scheduling::ResourceID("CPU_group_aaa")), 0); ASSERT_EQ(node_resources.available.Get(scheduling::ResourceID("CPU_group_0_aaa")), -1); ASSERT_EQ(node_resources.available.Get(scheduling::ResourceID("CPU_group_1_aaa")), 1); @@ -2047,7 +2062,7 @@ TEST_F(ClusterTaskManagerTest, NegativePlacementGroupCpuResources) { ASSERT_EQ(node_resources.available.Get(scheduling::ResourceID("CPU_group_1_aaa")), 0); } -TEST_F(ClusterTaskManagerTestWithGPUsAtHead, ReleaseAndReturnWorkerCpuResources) { +TEST_F(ClusterLeaseManagerTestWithGPUsAtHead, ReleaseAndReturnWorkerCpuResources) { // Add PG CPU and GPU resources. 
scheduler_->GetLocalResourceManager().AddLocalResourceInstances( scheduling::ResourceID("CPU_group_aaa"), std::vector{FixedPoint(1)}); @@ -2068,8 +2083,8 @@ TEST_F(ClusterTaskManagerTestWithGPUsAtHead, ReleaseAndReturnWorkerCpuResources) auto worker2 = std::make_shared(WorkerID::FromRandom(), 5678); // Check failed as the worker has no allocated resource instances. - ASSERT_FALSE(local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker1)); - ASSERT_FALSE(local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker2)); + ASSERT_FALSE(local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker1)); + ASSERT_FALSE(local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker2)); auto node_resource_instances = scheduler_->GetLocalResourceManager().GetLocalResources(); @@ -2077,18 +2092,18 @@ TEST_F(ClusterTaskManagerTestWithGPUsAtHead, ReleaseAndReturnWorkerCpuResources) node_resource_instances.GetAvailableResourceInstances(); auto allocated_instances = std::make_shared(); - absl::flat_hash_map task_spec = {{"CPU", 1.}, {"GPU", 1.}}; + absl::flat_hash_map lease_spec = {{"CPU", 1.}, {"GPU", 1.}}; ASSERT_TRUE(scheduler_->GetLocalResourceManager().AllocateLocalTaskResources( - task_spec, allocated_instances)); + lease_spec, allocated_instances)); worker1->SetAllocatedInstances(allocated_instances); allocated_instances = std::make_shared(); - task_spec = {{"CPU_group_aaa", 1.}, - {"CPU_group_0_aaa", 1.}, - {"GPU_group_aaa", 1.}, - {"GPU_group_0_aaa", 1.}}; + lease_spec = {{"CPU_group_aaa", 1.}, + {"CPU_group_0_aaa", 1.}, + {"GPU_group_aaa", 1.}, + {"GPU_group_0_aaa", 1.}}; ASSERT_TRUE(scheduler_->GetLocalResourceManager().AllocateLocalTaskResources( - task_spec, allocated_instances)); + lease_spec, allocated_instances)); worker2->SetAllocatedInstances(allocated_instances); // Check that the resources are allocated successfully. 
@@ -2100,8 +2115,8 @@ TEST_F(ClusterTaskManagerTestWithGPUsAtHead, ReleaseAndReturnWorkerCpuResources) ASSERT_EQ(node_resources.available.Get(scheduling::ResourceID("GPU_group_0_aaa")), 0); // Check that the cpu resources are released successfully. - ASSERT_TRUE(local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker1)); - ASSERT_TRUE(local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker2)); + ASSERT_TRUE(local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker1)); + ASSERT_TRUE(local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker2)); // Check that only cpu resources are released. ASSERT_EQ(node_resources.available.Get(ResourceID::CPU()), 8); @@ -2115,8 +2130,8 @@ TEST_F(ClusterTaskManagerTestWithGPUsAtHead, ReleaseAndReturnWorkerCpuResources) worker1->MarkBlocked(); worker2->MarkBlocked(); // Check failed as the worker is blocked. - ASSERT_FALSE(local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker1)); - ASSERT_FALSE(local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker2)); + ASSERT_FALSE(local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker1)); + ASSERT_FALSE(local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker2)); // Check nothing will be changed. ASSERT_EQ(node_resources.available.Get(ResourceID::CPU()), 8); ASSERT_EQ(node_resources.available.Get(ResourceID::GPU()), 3); @@ -2126,8 +2141,8 @@ TEST_F(ClusterTaskManagerTestWithGPUsAtHead, ReleaseAndReturnWorkerCpuResources) ASSERT_EQ(node_resources.available.Get(scheduling::ResourceID("GPU_group_0_aaa")), 0); // Check that the cpu resources are returned back to worker successfully. 
- ASSERT_TRUE(local_task_manager_->ReturnCpuResourcesToUnblockedWorker(worker1)); - ASSERT_TRUE(local_task_manager_->ReturnCpuResourcesToUnblockedWorker(worker2)); + ASSERT_TRUE(local_lease_manager_->ReturnCpuResourcesToUnblockedWorker(worker1)); + ASSERT_TRUE(local_lease_manager_->ReturnCpuResourcesToUnblockedWorker(worker2)); // Check that only cpu resources are returned back to the worker. ASSERT_EQ(node_resources.available.Get(ResourceID::CPU()), 7); @@ -2140,8 +2155,8 @@ TEST_F(ClusterTaskManagerTestWithGPUsAtHead, ReleaseAndReturnWorkerCpuResources) // Mark worker as unblocked. worker1->MarkUnblocked(); worker2->MarkUnblocked(); - ASSERT_FALSE(local_task_manager_->ReturnCpuResourcesToUnblockedWorker(worker1)); - ASSERT_FALSE(local_task_manager_->ReturnCpuResourcesToUnblockedWorker(worker2)); + ASSERT_FALSE(local_lease_manager_->ReturnCpuResourcesToUnblockedWorker(worker1)); + ASSERT_FALSE(local_lease_manager_->ReturnCpuResourcesToUnblockedWorker(worker2)); // Check nothing will be changed. ASSERT_EQ(node_resources.available.Get(ResourceID::CPU()), 7); ASSERT_EQ(node_resources.available.Get(ResourceID::GPU()), 3); @@ -2151,104 +2166,104 @@ TEST_F(ClusterTaskManagerTestWithGPUsAtHead, ReleaseAndReturnWorkerCpuResources) ASSERT_EQ(node_resources.available.Get(scheduling::ResourceID("GPU_group_0_aaa")), 0); } -TEST_F(ClusterTaskManagerTest, TestSpillWaitingTasks) { +TEST_F(ClusterLeaseManagerTest, TestSpillWaitingLeases) { // Cases to check: - // - resources available locally, task dependencies being fetched -> do not spill. - // - resources available locally, task dependencies blocked -> spill. + // - resources available locally, lease dependencies being fetched -> do not spill. + // - resources available locally, lease dependencies blocked -> spill. // - resources not available locally -> spill. 
- std::vector tasks; + std::vector leases; std::vector> replies; int num_callbacks = 0; auto callback = [&](Status, std::function, std::function) { num_callbacks++; }; for (int i = 0; i < 5; i++) { - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 8}}, /*num_args=*/1); - tasks.push_back(task); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 8}}, /*num_args=*/1); + leases.push_back(lease); replies.push_back(std::make_unique()); - // All tasks except the last one added are waiting for dependencies. + // All leases except the last one added are waiting for dependencies. if (i < 4) { - auto missing_arg = task.GetTaskSpecification().GetDependencyIds()[0]; + auto missing_arg = lease.GetLeaseSpecification().GetDependencyIds()[0]; missing_objects_.insert(missing_arg); } if (i == 0) { - const_cast(task.GetTaskSpecification()) + const_cast(lease.GetLeaseSpecification()) .GetMutableMessage() .mutable_scheduling_strategy() ->mutable_spread_scheduling_strategy(); } - task_manager_.QueueAndScheduleTask(task, false, false, replies[i].get(), callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, replies[i].get(), callback); pool_.TriggerCallbacks(); } ASSERT_EQ(num_callbacks, 0); - // Local resources could only dispatch one task. - ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING_FOR_WORKER), 1); + // Local resources could only dispatch one lease. + ASSERT_EQ(NumLeasesToDispatchWithStatus(internal::WorkStatus::WAITING_FOR_WORKER), 1); auto remote_node_id = NodeID::FromRandom(); AddNode(remote_node_id, 16); - // We are fetching dependencies for all waiting tasks but we have no enough - // resources available locally to schedule tasks except the first. + // We are fetching dependencies for all waiting leases but we have no enough + // resources available locally to schedule leases except the first. // We should only spill up to the remote node's resource availability. 
- task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); ASSERT_EQ(num_callbacks, 2); // Spill from the back of the waiting queue. - ASSERT_EQ(replies[0]->retry_at_raylet_address().raylet_id(), ""); - ASSERT_EQ(replies[1]->retry_at_raylet_address().raylet_id(), ""); - ASSERT_EQ(replies[2]->retry_at_raylet_address().raylet_id(), remote_node_id.Binary()); - ASSERT_EQ(replies[3]->retry_at_raylet_address().raylet_id(), remote_node_id.Binary()); - ASSERT_FALSE(task_manager_.CancelTask(tasks[2].GetTaskSpecification().TaskId())); - ASSERT_FALSE(task_manager_.CancelTask(tasks[3].GetTaskSpecification().TaskId())); - // Do not spill back tasks ready to dispatch. - ASSERT_EQ(replies[4]->retry_at_raylet_address().raylet_id(), ""); + ASSERT_EQ(replies[0]->retry_at_raylet_address().node_id(), ""); + ASSERT_EQ(replies[1]->retry_at_raylet_address().node_id(), ""); + ASSERT_EQ(replies[2]->retry_at_raylet_address().node_id(), remote_node_id.Binary()); + ASSERT_EQ(replies[3]->retry_at_raylet_address().node_id(), remote_node_id.Binary()); + ASSERT_FALSE(lease_manager_.CancelLease(leases[2].GetLeaseSpecification().LeaseId())); + ASSERT_FALSE(lease_manager_.CancelLease(leases[3].GetLeaseSpecification().LeaseId())); + // Do not spill back leases ready to dispatch. + ASSERT_EQ(replies[4]->retry_at_raylet_address().node_id(), ""); AddNode(remote_node_id, 8); - // Dispatch the ready task. + // Dispatch the ready lease. std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); pool_.PushWorker(std::dynamic_pointer_cast(worker)); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 4); - // One waiting task spilled. 
- ASSERT_EQ(replies[0]->retry_at_raylet_address().raylet_id(), ""); - ASSERT_EQ(replies[1]->retry_at_raylet_address().raylet_id(), remote_node_id.Binary()); - ASSERT_FALSE(task_manager_.CancelTask(tasks[1].GetTaskSpecification().TaskId())); - // One task dispatched. + // One waiting lease spilled. + ASSERT_EQ(replies[0]->retry_at_raylet_address().node_id(), ""); + ASSERT_EQ(replies[1]->retry_at_raylet_address().node_id(), remote_node_id.Binary()); + ASSERT_FALSE(lease_manager_.CancelLease(leases[1].GetLeaseSpecification().LeaseId())); + // One lease dispatched. ASSERT_EQ(replies[4]->worker_address().port(), 1234); // Spillback is idempotent. - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 4); - // One waiting task spilled. - ASSERT_EQ(replies[0]->retry_at_raylet_address().raylet_id(), ""); - ASSERT_EQ(replies[1]->retry_at_raylet_address().raylet_id(), remote_node_id.Binary()); - ASSERT_FALSE(task_manager_.CancelTask(tasks[1].GetTaskSpecification().TaskId())); - // One task dispatched. + // One waiting lease spilled. + ASSERT_EQ(replies[0]->retry_at_raylet_address().node_id(), ""); + ASSERT_EQ(replies[1]->retry_at_raylet_address().node_id(), remote_node_id.Binary()); + ASSERT_FALSE(lease_manager_.CancelLease(leases[1].GetLeaseSpecification().LeaseId())); + // One lease dispatched. ASSERT_EQ(replies[4]->worker_address().port(), 1234); - // Spread task won't be spilled due to waiting for dependencies. + // Spread lease won't be spilled due to waiting for dependencies. 
AddNode(remote_node_id, 8); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); ASSERT_EQ(num_callbacks, 4); - ASSERT_EQ(replies[0]->retry_at_raylet_address().raylet_id(), ""); + ASSERT_EQ(replies[0]->retry_at_raylet_address().node_id(), ""); - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); leased_workers_.clear(); - ASSERT_TRUE(task_manager_.CancelTask(tasks[0].GetTaskSpecification().TaskId())); + ASSERT_TRUE(lease_manager_.CancelLease(leases[0].GetLeaseSpecification().LeaseId())); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, PinnedArgsMemoryTest) { +TEST_F(ClusterLeaseManagerTest, PinnedArgsMemoryTest) { /* - Total memory required by executing tasks' args stays under the specified + Total memory required by granted lease args stays under the specified threshold. */ - std::shared_ptr worker = - std::make_shared(WorkerID::FromRandom(), 1234); - std::shared_ptr worker2 = - std::make_shared(WorkerID::FromRandom(), 12345); + auto worker_id1 = WorkerID::FromRandom(); + auto worker_id2 = WorkerID::FromRandom(); + std::shared_ptr worker = std::make_shared(worker_id1, 1234); + std::shared_ptr worker2 = std::make_shared(worker_id2, 12345); pool_.PushWorker(std::static_pointer_cast(worker2)); pool_.PushWorker(std::static_pointer_cast(worker)); @@ -2260,44 +2275,56 @@ TEST_F(ClusterTaskManagerTest, PinnedArgsMemoryTest) { (*num_callbacks_ptr) = *num_callbacks_ptr + 1; }; - // This task can run. + // This lease can run. 
+ auto lease_id1 = LeaseID::FromWorker(worker_id1, 1); default_arg_size_ = 600; - auto task = CreateTask({{ray::kCPU_ResourceLabel, 1}}, 1); - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + auto lease1 = CreateLease({{ray::kCPU_ResourceLabel, 1}}, + 1, + {}, + nullptr, + rpc::SchedulingStrategy(), + lease_id1); + lease_manager_.QueueAndScheduleLease(lease1, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); - AssertPinnedTaskArgumentsPresent(task); - - // This task cannot run because it would put us over the memory threshold. - auto task2 = CreateTask({{ray::kCPU_ResourceLabel, 1}}, 1); - task_manager_.QueueAndScheduleTask(task2, false, false, &reply, callback); + AssertPinnedLeaseArgumentsPresent(lease1); + + // This lease cannot run because it would put us over the memory threshold. + auto lease_id2 = LeaseID::FromWorker(worker_id2, 1); + auto lease2 = CreateLease({{ray::kCPU_ResourceLabel, 1}}, + 1, + {}, + nullptr, + rpc::SchedulingStrategy(), + lease_id2); + lease_manager_.QueueAndScheduleLease(lease2, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); - /* First task finishes, freeing memory for the second task */ - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); + /* First lease finishes, freeing memory for the second lease */ + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); leased_workers_.clear(); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); - AssertPinnedTaskArgumentsPresent(task2); + AssertPinnedLeaseArgumentsPresent(lease2); ASSERT_EQ(num_callbacks, 2); ASSERT_EQ(leased_workers_.size(), 1); 
ASSERT_EQ(pool_.workers.size(), 0); - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); leased_workers_.clear(); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, PinnedArgsSameMemoryTest) { +TEST_F(ClusterLeaseManagerTest, PinnedArgsSameMemoryTest) { /* - * Two tasks that depend on the same object can run concurrently. + * Two leases that depend on the same object can run concurrently. */ std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); @@ -2314,33 +2341,34 @@ TEST_F(ClusterTaskManagerTest, PinnedArgsSameMemoryTest) { (*num_callbacks_ptr) = *num_callbacks_ptr + 1; }; - // This task can run. + // This lease can run. default_arg_size_ = 600; - auto task = CreateTask({{ray::kCPU_ResourceLabel, 1}}, 1); - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + auto lease = CreateLease({{ray::kCPU_ResourceLabel, 1}}, 1); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); - AssertPinnedTaskArgumentsPresent(task); + AssertPinnedLeaseArgumentsPresent(lease); - // This task can run because it depends on the same object as the first task. - auto task2 = CreateTask( - {{ray::kCPU_ResourceLabel, 1}}, 1, task.GetTaskSpecification().GetDependencyIds()); - task_manager_.QueueAndScheduleTask(task2, false, false, &reply, callback); + // This lease can run because it depends on the same object as the first lease. 
+ auto lease2 = CreateLease({{ray::kCPU_ResourceLabel, 1}}, + 1, + lease.GetLeaseSpecification().GetDependencyIds()); + lease_manager_.QueueAndScheduleLease(lease2, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 2); ASSERT_EQ(leased_workers_.size(), 2); ASSERT_EQ(pool_.workers.size(), 0); - RayTask finished_task; + RayLease finished_lease; for (auto &cur_worker : leased_workers_) { - local_task_manager_->TaskFinished(cur_worker.second, &finished_task); + local_lease_manager_->CleanupLease(cur_worker.second, &finished_lease); } AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, LargeArgsNoStarvationTest) { +TEST_F(ClusterLeaseManagerTest, LargeArgsNoStarvationTest) { std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); pool_.PushWorker(std::static_pointer_cast(worker)); @@ -2354,29 +2382,29 @@ TEST_F(ClusterTaskManagerTest, LargeArgsNoStarvationTest) { }; default_arg_size_ = 2000; - auto task = CreateTask({{ray::kCPU_ResourceLabel, 1}}, 1); + auto lease = CreateLease({{ray::kCPU_ResourceLabel, 1}}, 1); pool_.PushWorker(std::static_pointer_cast(worker)); - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); - AssertPinnedTaskArgumentsPresent(task); + AssertPinnedLeaseArgumentsPresent(lease); - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, PopWorkerExactlyOnce) { - // Create and queue one task. +TEST_F(ClusterLeaseManagerTest, PopWorkerExactlyOnce) { + // Create and queue one lease. 
std::string serialized_runtime_env = "mock_env"; std::shared_ptr runtime_env_info = nullptr; runtime_env_info.reset(new rpc::RuntimeEnvInfo()); runtime_env_info->set_serialized_runtime_env(serialized_runtime_env); - RayTask task = CreateTask( + RayLease lease = CreateLease( {{ray::kCPU_ResourceLabel, 4}}, /*num_args=*/0, /*args=*/{}, runtime_env_info); - auto runtime_env_hash = task.GetTaskSpecification().GetRuntimeEnvHash(); + auto runtime_env_hash = lease.GetLeaseSpecification().GetRuntimeEnvHash(); rpc::RequestWorkerLeaseReply reply; bool callback_occurred = false; bool *callback_occurred_ptr = &callback_occurred; @@ -2385,7 +2413,7 @@ TEST_F(ClusterTaskManagerTest, PopWorkerExactlyOnce) { *callback_occurred_ptr = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); // Make sure callback doesn't occurred. ASSERT_FALSE(callback_occurred); @@ -2393,8 +2421,8 @@ TEST_F(ClusterTaskManagerTest, PopWorkerExactlyOnce) { ASSERT_EQ(pool_.workers.size(), 0); // Popworker was called once. ASSERT_EQ(pool_.CallbackSize(runtime_env_hash), 1); - // Try to schedule and dispatch tasks. - task_manager_.ScheduleAndDispatchTasks(); + // Try to schedule and dispatch leases. + lease_manager_.ScheduleAndGrantLeases(); // Popworker has been called once, don't call it repeatedly. ASSERT_EQ(pool_.CallbackSize(runtime_env_hash), 1); // Push a worker and try to call back. @@ -2406,31 +2434,34 @@ TEST_F(ClusterTaskManagerTest, PopWorkerExactlyOnce) { ASSERT_TRUE(callback_occurred); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 0); - // Try to schedule and dispatch tasks. - task_manager_.ScheduleAndDispatchTasks(); + // Try to schedule and dispatch leases. + lease_manager_.ScheduleAndGrantLeases(); // Worker has been popped. Don't call `PopWorker` repeatedly. 
ASSERT_EQ(pool_.CallbackSize(runtime_env_hash), 0); - RayTask finished_task; - local_task_manager_->TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task.GetTaskSpecification().TaskId()); + RayLease finished_lease; + local_lease_manager_->CleanupLease(leased_workers_.begin()->second, &finished_lease); + ASSERT_EQ(finished_lease.GetLeaseSpecification().LeaseId(), + lease.GetLeaseSpecification().LeaseId()); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, CapRunningOnDispatchQueue) { +TEST_F(ClusterLeaseManagerTest, CapRunningOnDispatchQueue) { scheduler_->GetLocalResourceManager().AddLocalResourceInstances( scheduling::ResourceID(ray::kGPU_ResourceLabel), {1, 1, 1}); - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 4}, {ray::kGPU_ResourceLabel, 1}}, - /*num_args=*/0, - /*args=*/{}); - RayTask task2 = CreateTask({{ray::kCPU_ResourceLabel, 4}, {ray::kGPU_ResourceLabel, 1}}, - /*num_args=*/0, - /*args=*/{}); - RayTask task3 = CreateTask({{ray::kCPU_ResourceLabel, 4}, {ray::kGPU_ResourceLabel, 1}}, - /*num_args=*/0, - /*args=*/{}); - auto runtime_env_hash = task.GetTaskSpecification().GetRuntimeEnvHash(); + RayLease lease = + CreateLease({{ray::kCPU_ResourceLabel, 4}, {ray::kGPU_ResourceLabel, 1}}, + /*num_args=*/0, + /*args=*/{}); + RayLease lease2 = + CreateLease({{ray::kCPU_ResourceLabel, 4}, {ray::kGPU_ResourceLabel, 1}}, + /*num_args=*/0, + /*args=*/{}); + RayLease lease3 = + CreateLease({{ray::kCPU_ResourceLabel, 4}, {ray::kGPU_ResourceLabel, 1}}, + /*num_args=*/0, + /*args=*/{}); + auto runtime_env_hash = lease.GetLeaseSpecification().GetRuntimeEnvHash(); std::vector> workers; for (int i = 0; i < 3; i++) { std::shared_ptr worker = @@ -2444,42 +2475,42 @@ TEST_F(ClusterTaskManagerTest, CapRunningOnDispatchQueue) { auto callback = [&num_callbacks](Status, std::function, std::function) { num_callbacks++; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); - 
task_manager_.QueueAndScheduleTask(task2, false, false, &reply, callback); - task_manager_.QueueAndScheduleTask(task3, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease2, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease3, false, false, &reply, callback); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 2); - local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(workers[0]); - local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(workers[1]); + local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(workers[0]); + local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(workers[1]); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); // Even though there are free resources, we've hit our cap of (8/4=)2 workers - // of the given scheduling class so we shouldn't dispatch the remaining task. + // of the given scheduling class so we shouldn't dispatch the remaining lease. 
ASSERT_EQ(num_callbacks, 2); - RayTask buf; - local_task_manager_->TaskFinished(workers[1], &buf); + RayLease buf; + local_lease_manager_->CleanupLease(workers[1], &buf); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 3); - local_task_manager_->TaskFinished(workers[0], &buf); - local_task_manager_->TaskFinished(workers[2], &buf); + local_lease_manager_->CleanupLease(workers[0], &buf); + local_lease_manager_->CleanupLease(workers[2], &buf); AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, ZeroCPUTasks) { +TEST_F(ClusterLeaseManagerTest, ZeroCPULeases) { scheduler_->GetLocalResourceManager().AddLocalResourceInstances( scheduling::ResourceID(ray::kGPU_ResourceLabel), {1, 1, 1}); - RayTask task = CreateTask({{"GPU", 1}}, /*num_args=*/0, /*args=*/{}); - RayTask task2 = CreateTask({{"GPU", 1}}, /*num_args=*/0, /*args=*/{}); - RayTask task3 = CreateTask({{"GPU", 1}}, /*num_args=*/0, /*args=*/{}); - auto runtime_env_hash = task.GetTaskSpecification().GetRuntimeEnvHash(); + RayLease lease = CreateLease({{"GPU", 1}}, /*num_args=*/0, /*args=*/{}); + RayLease lease2 = CreateLease({{"GPU", 1}}, /*num_args=*/0, /*args=*/{}); + RayLease lease3 = CreateLease({{"GPU", 1}}, /*num_args=*/0, /*args=*/{}); + auto runtime_env_hash = lease.GetLeaseSpecification().GetRuntimeEnvHash(); std::vector> workers; for (int i = 0; i < 3; i++) { std::shared_ptr worker = @@ -2493,28 +2524,28 @@ TEST_F(ClusterTaskManagerTest, ZeroCPUTasks) { auto callback = [&num_callbacks](Status, std::function, std::function) { num_callbacks++; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); - task_manager_.QueueAndScheduleTask(task2, false, false, &reply, callback); - task_manager_.QueueAndScheduleTask(task3, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease2, false, false, &reply, 
callback); + lease_manager_.QueueAndScheduleLease(lease3, false, false, &reply, callback); pool_.TriggerCallbacks(); - // We shouldn't cap anything for zero cpu tasks (and shouldn't crash before + // We shouldn't cap anything for zero cpu leases (and shouldn't crash before // this point). ASSERT_EQ(num_callbacks, 3); for (auto &worker : workers) { - RayTask buf; - local_task_manager_->TaskFinished(worker, &buf); + RayLease buf; + local_lease_manager_->CleanupLease(worker, &buf); } AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTestWithoutCPUsAtHead, ZeroCPUNode) { - RayTask task = CreateTask({}, /*num_args=*/0, /*args=*/{}); - RayTask task2 = CreateTask({}, /*num_args=*/0, /*args=*/{}); - RayTask task3 = CreateTask({}, /*num_args=*/0, /*args=*/{}); - auto runtime_env_hash = task.GetTaskSpecification().GetRuntimeEnvHash(); +TEST_F(ClusterLeaseManagerTestWithoutCPUsAtHead, ZeroCPUNode) { + RayLease lease = CreateLease({}, /*num_args=*/0, /*args=*/{}); + RayLease lease2 = CreateLease({}, /*num_args=*/0, /*args=*/{}); + RayLease lease3 = CreateLease({}, /*num_args=*/0, /*args=*/{}); + auto runtime_env_hash = lease.GetLeaseSpecification().GetRuntimeEnvHash(); std::vector> workers; for (int i = 0; i < 3; i++) { std::shared_ptr worker = @@ -2528,60 +2559,60 @@ TEST_F(ClusterTaskManagerTestWithoutCPUsAtHead, ZeroCPUNode) { auto callback = [&num_callbacks](Status, std::function, std::function) { num_callbacks++; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); - task_manager_.QueueAndScheduleTask(task2, false, false, &reply, callback); - task_manager_.QueueAndScheduleTask(task3, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease2, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease3, false, false, &reply, callback); pool_.TriggerCallbacks(); - // We shouldn't cap anything for zero cpu tasks (and shouldn't crash 
before + // We shouldn't cap anything for zero cpu leases (and shouldn't crash before // this point). ASSERT_EQ(num_callbacks, 3); for (auto &worker : workers) { - RayTask buf; - local_task_manager_->TaskFinished(worker, &buf); + RayLease buf; + local_lease_manager_->CleanupLease(worker, &buf); } AssertNoLeaks(); } -/// Test that we are able to spillback tasks +/// Test that we are able to spillback leases /// while hitting the scheduling class cap. -TEST_F(ClusterTaskManagerTest, SchedulingClassCapSpillback) { +TEST_F(ClusterLeaseManagerTest, SchedulingClassCapSpillback) { std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); pool_.PushWorker(std::dynamic_pointer_cast(worker)); - std::vector tasks; + std::vector leases; std::vector> replies; int num_callbacks = 0; auto callback = [&](Status, std::function, std::function) { num_callbacks++; }; - // The first task will be dispatched right away, - // and the second task will hit the scheduling class cap. + // The first lease will be dispatched right away, + // and the second lease will hit the scheduling class cap. for (int i = 0; i < 2; ++i) { - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 8}}); - tasks.push_back(task); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 8}}); + leases.push_back(lease); replies.push_back(std::make_unique()); - task_manager_.QueueAndScheduleTask(task, false, false, replies[i].get(), callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, replies[i].get(), callback); pool_.TriggerCallbacks(); } ASSERT_EQ(replies[0]->worker_address().port(), 1234); ASSERT_EQ(num_callbacks, 1); - ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING), 1); + ASSERT_EQ(NumLeasesToDispatchWithStatus(internal::WorkStatus::WAITING), 1); // A new node is added so we should be able to spillback to it. 
auto remote_node_id = NodeID::FromRandom(); AddNode(remote_node_id, 8); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); ASSERT_EQ(num_callbacks, 2); - ASSERT_EQ(replies[1]->retry_at_raylet_address().raylet_id(), remote_node_id.Binary()); + ASSERT_EQ(replies[1]->retry_at_raylet_address().node_id(), remote_node_id.Binary()); } /// Test that we exponentially increase the amount of time it takes to increase /// the dispatch cap for a scheduling class. -TEST_F(ClusterTaskManagerTest, SchedulingClassCapIncrease) { +TEST_F(ClusterLeaseManagerTest, SchedulingClassCapIncrease) { auto get_unblocked_worker = [](std::vector> &workers) -> std::shared_ptr { for (auto &worker : workers) { @@ -2593,12 +2624,12 @@ TEST_F(ClusterTaskManagerTest, SchedulingClassCapIncrease) { }; int64_t UNIT = RayConfig::instance().worker_cap_initial_backoff_delay_ms(); - std::vector tasks; + std::vector leases; for (int i = 0; i < 3; i++) { - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 8}}, - /*num_args=*/0, - /*args=*/{}); - tasks.emplace_back(task); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 8}}, + /*num_args=*/0, + /*args=*/{}); + leases.emplace_back(lease); } rpc::RequestWorkerLeaseReply reply; @@ -2606,11 +2637,11 @@ TEST_F(ClusterTaskManagerTest, SchedulingClassCapIncrease) { auto callback = [&num_callbacks](Status, std::function, std::function) { num_callbacks++; }; - for (const auto &task : tasks) { - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + for (const auto &lease : leases) { + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); } - auto runtime_env_hash = tasks[0].GetTaskSpecification().GetRuntimeEnvHash(); + auto runtime_env_hash = leases[0].GetLeaseSpecification().GetRuntimeEnvHash(); std::vector> workers; for (int i = 0; i < 3; i++) { std::shared_ptr worker = @@ -2619,40 +2650,40 @@ TEST_F(ClusterTaskManagerTest, SchedulingClassCapIncrease) { 
pool_.TriggerCallbacks(); workers.push_back(worker); } - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); ASSERT_EQ(num_callbacks, 1); current_time_ms_ += UNIT; ASSERT_FALSE(workers.back()->IsBlocked()); - ASSERT_TRUE(local_task_manager_->ReleaseCpuResourcesFromBlockedWorker( + ASSERT_TRUE(local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker( get_unblocked_worker(workers))); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); ASSERT_EQ(num_callbacks, 2); // Since we're increasing exponentially, increasing by a unit show no longer be enough. current_time_ms_ += UNIT; - ASSERT_TRUE(local_task_manager_->ReleaseCpuResourcesFromBlockedWorker( + ASSERT_TRUE(local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker( get_unblocked_worker(workers))); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); ASSERT_EQ(num_callbacks, 2); // Now it should run current_time_ms_ += UNIT; - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); ASSERT_EQ(num_callbacks, 3); - // Let just one task finish. + // Let just one lease finish. for (auto it = workers.begin(); it != workers.end(); it++) { if (!(*it)->IsBlocked()) { - RayTask buf; - local_task_manager_->TaskFinished(*it, &buf); + RayLease buf; + local_lease_manager_->CleanupLease(*it, &buf); workers.erase(it); break; } @@ -2660,11 +2691,11 @@ TEST_F(ClusterTaskManagerTest, SchedulingClassCapIncrease) { current_time_ms_ += UNIT; - // Now schedule another task of the same scheduling class. 
- RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 8}}, - /*num_args=*/0, - /*args=*/{}); - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + // Now schedule another lease of the same scheduling class. + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 8}}, + /*num_args=*/0, + /*args=*/{}); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); std::shared_ptr new_worker = std::make_shared(WorkerID::FromRandom(), 1234, runtime_env_hash); @@ -2673,31 +2704,31 @@ TEST_F(ClusterTaskManagerTest, SchedulingClassCapIncrease) { workers.push_back(new_worker); // It can't run for another 2 units (doesn't increase to 4, because one of - // the tasks finished). + // the leases finished). ASSERT_EQ(num_callbacks, 3); current_time_ms_ += 2 * UNIT; - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 4); for (auto &worker : workers) { - RayTask buf; - local_task_manager_->TaskFinished(worker, &buf); + RayLease buf; + local_lease_manager_->CleanupLease(worker, &buf); } AssertNoLeaks(); } -/// Ensure we reset the cap after we've finished executing through the queue. -TEST_F(ClusterTaskManagerTest, SchedulingClassCapResetTest) { +/// Ensure we reset the cap after we've granted all leases in the queue. 
+TEST_F(ClusterLeaseManagerTest, SchedulingClassCapResetTest) { int64_t UNIT = RayConfig::instance().worker_cap_initial_backoff_delay_ms(); - std::vector tasks; + std::vector leases; for (int i = 0; i < 2; i++) { - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 8}}, - /*num_args=*/0, - /*args=*/{}); - tasks.emplace_back(task); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 8}}, + /*num_args=*/0, + /*args=*/{}); + leases.emplace_back(lease); } rpc::RequestWorkerLeaseReply reply; @@ -2705,97 +2736,97 @@ TEST_F(ClusterTaskManagerTest, SchedulingClassCapResetTest) { auto callback = [&num_callbacks](Status, std::function, std::function) { num_callbacks++; }; - for (const auto &task : tasks) { - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + for (const auto &lease : leases) { + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); } - auto runtime_env_hash = tasks[0].GetTaskSpecification().GetRuntimeEnvHash(); + auto runtime_env_hash = leases[0].GetLeaseSpecification().GetRuntimeEnvHash(); std::shared_ptr worker1 = std::make_shared(WorkerID::FromRandom(), 1234, runtime_env_hash); pool_.PushWorker(std::static_pointer_cast(worker1)); pool_.TriggerCallbacks(); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); - ASSERT_TRUE(local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker1)); + ASSERT_TRUE(local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker1)); current_time_ms_ += UNIT; std::shared_ptr worker2 = std::make_shared(WorkerID::FromRandom(), 1234, runtime_env_hash); pool_.PushWorker(std::static_pointer_cast(worker2)); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 2); - RayTask buf; - local_task_manager_->TaskFinished(worker1, &buf); - local_task_manager_->TaskFinished(worker2, &buf); + RayLease buf; + local_lease_manager_->CleanupLease(worker1, &buf); + 
local_lease_manager_->CleanupLease(worker2, &buf); AssertNoLeaks(); for (int i = 0; i < 2; i++) { - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 8}}, - /*num_args=*/0, - /*args=*/{}); - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 8}}, + /*num_args=*/0, + /*args=*/{}); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); } std::shared_ptr worker3 = std::make_shared(WorkerID::FromRandom(), 1234, runtime_env_hash); pool_.PushWorker(std::static_pointer_cast(worker3)); pool_.TriggerCallbacks(); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); ASSERT_EQ(num_callbacks, 3); - ASSERT_TRUE(local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker3)); + ASSERT_TRUE(local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker3)); current_time_ms_ += UNIT; std::shared_ptr worker4 = std::make_shared(WorkerID::FromRandom(), 1234, runtime_env_hash); pool_.PushWorker(std::static_pointer_cast(worker4)); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 4); { // Ensure a class of a different scheduling class can still be scheduled. 
- RayTask task5 = CreateTask({}, - /*num_args=*/0, - /*args=*/{}); - task_manager_.QueueAndScheduleTask(task5, false, false, &reply, callback); + RayLease lease5 = CreateLease({}, + /*num_args=*/0, + /*args=*/{}); + lease_manager_.QueueAndScheduleLease(lease5, false, false, &reply, callback); std::shared_ptr worker5 = std::make_shared(WorkerID::FromRandom(), 1234, runtime_env_hash); pool_.PushWorker(std::static_pointer_cast(worker5)); - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 5); - local_task_manager_->TaskFinished(worker5, &buf); + local_lease_manager_->CleanupLease(worker5, &buf); } - local_task_manager_->TaskFinished(worker3, &buf); - local_task_manager_->TaskFinished(worker4, &buf); + local_lease_manager_->CleanupLease(worker3, &buf); + local_lease_manager_->CleanupLease(worker4, &buf); AssertNoLeaks(); } /// Test that scheduling classes which have reached their running cap start -/// their timer after the new task is submitted, not before. -TEST_F(ClusterTaskManagerTest, DispatchTimerAfterRequestTest) { +/// their timer after the new lease is submitted, not before. 
+TEST_F(ClusterLeaseManagerTest, DispatchTimerAfterRequestTest) { int64_t UNIT = RayConfig::instance().worker_cap_initial_backoff_delay_ms(); - RayTask first_task = CreateTask({{ray::kCPU_ResourceLabel, 8}}, - /*num_args=*/0, - /*args=*/{}); + RayLease first_lease = CreateLease({{ray::kCPU_ResourceLabel, 8}}, + /*num_args=*/0, + /*args=*/{}); rpc::RequestWorkerLeaseReply reply; int num_callbacks = 0; auto callback = [&num_callbacks](Status, std::function, std::function) { num_callbacks++; }; - task_manager_.QueueAndScheduleTask(first_task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(first_lease, false, false, &reply, callback); - auto runtime_env_hash = first_task.GetTaskSpecification().GetRuntimeEnvHash(); + auto runtime_env_hash = first_lease.GetLeaseSpecification().GetRuntimeEnvHash(); std::vector> workers; for (int i = 0; i < 3; i++) { std::shared_ptr worker = @@ -2804,68 +2835,68 @@ TEST_F(ClusterTaskManagerTest, DispatchTimerAfterRequestTest) { pool_.TriggerCallbacks(); workers.push_back(worker); } - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); ASSERT_EQ(num_callbacks, 1); - RayTask second_task = CreateTask({{ray::kCPU_ResourceLabel, 8}}, - /*num_args=*/0, - /*args=*/{}); - task_manager_.QueueAndScheduleTask(second_task, false, false, &reply, callback); + RayLease second_lease = CreateLease({{ray::kCPU_ResourceLabel, 8}}, + /*num_args=*/0, + /*args=*/{}); + lease_manager_.QueueAndScheduleLease(second_lease, false, false, &reply, callback); pool_.TriggerCallbacks(); /// Can't schedule yet due to the cap. 
ASSERT_EQ(num_callbacks, 1); for (auto &worker : workers) { if (worker->GetAllocatedInstances() && !worker->IsBlocked()) { - local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker); + local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker); } } current_time_ms_ += UNIT; - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 2); for (auto &worker : workers) { if (worker->GetAllocatedInstances() && !worker->IsBlocked()) { - local_task_manager_->ReleaseCpuResourcesFromBlockedWorker(worker); + local_lease_manager_->ReleaseCpuResourcesFromBlockedWorker(worker); } } /// A lot of time passes, definitely more than the timeout. current_time_ms_ += 100000 * UNIT; - RayTask third_task = CreateTask({{ray::kCPU_ResourceLabel, 8}}, - /*num_args=*/0, - /*args=*/{}); - task_manager_.QueueAndScheduleTask(third_task, false, false, &reply, callback); + RayLease third_lease = CreateLease({{ray::kCPU_ResourceLabel, 8}}, + /*num_args=*/0, + /*args=*/{}); + lease_manager_.QueueAndScheduleLease(third_lease, false, false, &reply, callback); pool_.TriggerCallbacks(); - /// We still can't schedule the third task since the timer doesn't start - /// until after the task is queued. + /// We still can't schedule the third lease since the timer doesn't start + /// until after the lease is queued. ASSERT_EQ(num_callbacks, 2); current_time_ms_ += 2 * UNIT; - task_manager_.ScheduleAndDispatchTasks(); + lease_manager_.ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); ASSERT_EQ(num_callbacks, 3); for (auto &worker : workers) { - RayTask buf; - local_task_manager_->TaskFinished(worker, &buf); + RayLease buf; + local_lease_manager_->CleanupLease(worker, &buf); } AssertNoLeaks(); } -TEST_F(ClusterTaskManagerTest, PopWorkerBeforeDraining) { +TEST_F(ClusterLeaseManagerTest, PopWorkerBeforeDraining) { /* Test that if PopWorker happens before draining, the lease request can still succeed. 
*/ - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 1}}); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 1}}); rpc::RequestWorkerLeaseReply reply; bool callback_occurred = false; bool *callback_occurred_ptr = &callback_occurred; @@ -2873,7 +2904,7 @@ TEST_F(ClusterTaskManagerTest, PopWorkerBeforeDraining) { Status, std::function, std::function) { *callback_occurred_ptr = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); // Drain the local node. rpc::DrainRayletRequest drain_request; @@ -2888,11 +2919,11 @@ TEST_F(ClusterTaskManagerTest, PopWorkerBeforeDraining) { ASSERT_EQ(leased_workers_.size(), 1); } -TEST_F(ClusterTaskManagerTest, UnscheduleableWhileDraining) { +TEST_F(ClusterLeaseManagerTest, UnscheduleableWhileDraining) { /* - Test that new tasks are not scheduled onto draining nodes. + Test that new leases are not scheduled onto draining nodes. */ - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, 1}}); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, 1}}); rpc::RequestWorkerLeaseReply reply; bool callback_occurred = false; bool *callback_occurred_ptr = &callback_occurred; @@ -2900,7 +2931,7 @@ TEST_F(ClusterTaskManagerTest, UnscheduleableWhileDraining) { Status, std::function, std::function) { *callback_occurred_ptr = true; }; - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); std::shared_ptr worker2 = @@ -2920,21 +2951,20 @@ TEST_F(ClusterTaskManagerTest, UnscheduleableWhileDraining) { drain_request.set_deadline_timestamp_ms(std::numeric_limits::max()); scheduler_->GetLocalResourceManager().SetLocalNodeDraining(drain_request); - RayTask spillback_task = CreateTask({{ray::kCPU_ResourceLabel, 1}}); + RayLease spillback_lease = 
CreateLease({{ray::kCPU_ResourceLabel, 1}}); rpc::RequestWorkerLeaseReply spillback_reply; - task_manager_.QueueAndScheduleTask( - spillback_task, false, false, &spillback_reply, callback); + lease_manager_.QueueAndScheduleLease( + spillback_lease, false, false, &spillback_reply, callback); pool_.TriggerCallbacks(); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); - ASSERT_EQ(spillback_reply.retry_at_raylet_address().raylet_id(), - remote_node_id.Binary()); + ASSERT_EQ(spillback_reply.retry_at_raylet_address().node_id(), remote_node_id.Binary()); } // Regression test for https://github.com/ray-project/ray/issues/16935: -// When a task requires 1 CPU and is infeasible because head node has 0 CPU, -// make sure the task's resource demand is reported. -TEST_F(ClusterTaskManagerTestWithoutCPUsAtHead, OneCpuInfeasibleTask) { +// When a lease requires 1 CPU and is infeasible because head node has 0 CPU, +// make sure the lease's resource demand is reported. +TEST_F(ClusterLeaseManagerTestWithoutCPUsAtHead, OneCpuInfeasibleLease) { rpc::RequestWorkerLeaseReply reply; bool callback_occurred = false; bool *callback_occurred_ptr = &callback_occurred; @@ -2945,7 +2975,7 @@ TEST_F(ClusterTaskManagerTestWithoutCPUsAtHead, OneCpuInfeasibleTask) { }; constexpr int num_cases = 5; - // Create 5 tasks with different CPU requests. + // Create 5 leases with different CPU requests. const std::array cpu_request = {1, 2, 1, 3, 1}; // Each type of CPU request corresponds to a types of resource demand. 
const std::array demand_types = {1, 2, 2, 3, 3}; @@ -2953,18 +2983,18 @@ TEST_F(ClusterTaskManagerTestWithoutCPUsAtHead, OneCpuInfeasibleTask) { const std::array num_infeasible_1cpu = {1, 1, 2, 2, 3}; for (int i = 0; i < num_cases; ++i) { - RayTask task = CreateTask({{ray::kCPU_ResourceLabel, cpu_request[i]}}); - task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); + RayLease lease = CreateLease({{ray::kCPU_ResourceLabel, cpu_request[i]}}); + lease_manager_.QueueAndScheduleLease(lease, false, false, &reply, callback); pool_.TriggerCallbacks(); - // The task cannot run because there is only 1 node (head) with 0 CPU. + // The lease cannot run because there is only 1 node (head) with 0 CPU. ASSERT_FALSE(callback_occurred); ASSERT_EQ(leased_workers_.size(), 0); ASSERT_EQ(pool_.workers.size(), 0); ASSERT_EQ(node_info_calls_, 0); rpc::ResourcesData data; - task_manager_.FillResourceUsage(data); + lease_manager_.FillResourceUsage(data); const auto &resource_load_by_shape = data.resource_load_by_shape(); ASSERT_EQ(resource_load_by_shape.resource_demands().size(), demand_types[i]); diff --git a/src/ray/raylet/scheduling/cluster_resource_manager_test.cc b/src/ray/raylet/scheduling/tests/cluster_resource_manager_test.cc similarity index 88% rename from src/ray/raylet/scheduling/cluster_resource_manager_test.cc rename to src/ray/raylet/scheduling/tests/cluster_resource_manager_test.cc index 0324c84bb31a..f7d4506dd4e5 100644 --- a/src/ray/raylet/scheduling/cluster_resource_manager_test.cc +++ b/src/ray/raylet/scheduling/tests/cluster_resource_manager_test.cc @@ -60,6 +60,31 @@ struct ClusterResourceManagerTest : public ::testing::Test { std::unique_ptr manager; }; +TEST_F(ClusterResourceManagerTest, UpdateNode) { + // Prepare a sync message with updated totals/available, labels and flags. 
+ syncer::ResourceViewSyncMessage payload; + payload.mutable_resources_total()->insert({"CPU", 10.0}); + payload.mutable_resources_available()->insert({"CPU", 5.0}); + payload.mutable_labels()->insert({"zone", "us-east-1a"}); + payload.set_object_pulls_queued(true); + payload.set_idle_duration_ms(42); + payload.set_is_draining(true); + payload.set_draining_deadline_timestamp_ms(123456); + + // Update existing node and validate the local view reflects the payload. + ASSERT_TRUE(manager->UpdateNode(node0, payload)); + + const auto &node_resources = manager->GetNodeResources(node0); + ASSERT_EQ(node_resources.total.Get(scheduling::ResourceID("CPU")), 10); + ASSERT_EQ(node_resources.available.Get(scheduling::ResourceID("CPU")), 5); + ASSERT_EQ(node_resources.labels.at("zone"), "us-east-1a"); + ASSERT_TRUE(node_resources.object_pulls_queued); + ASSERT_EQ(node_resources.idle_resource_duration_ms, 42); + ASSERT_TRUE(node_resources.is_draining); + ASSERT_EQ(node_resources.draining_deadline_timestamp_ms, 123456); + ASSERT_TRUE(node_resources.last_resource_update_time.has_value()); +} + TEST_F(ClusterResourceManagerTest, DebugStringTest) { // Test max_num_nodes_to_include parameter is working. 
ASSERT_EQ(std::vector(absl::StrSplit(manager->DebugString(), "node id:")) diff --git a/src/ray/raylet/scheduling/cluster_resource_scheduler_2_test.cc b/src/ray/raylet/scheduling/tests/cluster_resource_scheduler_2_test.cc similarity index 99% rename from src/ray/raylet/scheduling/cluster_resource_scheduler_2_test.cc rename to src/ray/raylet/scheduling/tests/cluster_resource_scheduler_2_test.cc index f2a19f15474b..06db0f82085a 100644 --- a/src/ray/raylet/scheduling/cluster_resource_scheduler_2_test.cc +++ b/src/ray/raylet/scheduling/tests/cluster_resource_scheduler_2_test.cc @@ -229,7 +229,6 @@ TEST_F(GcsResourceSchedulerTest, TestNodeFilter) { auto result1 = cluster_resource_scheduler_->Schedule( resource_request_list, SchedulingOptions::BundleStrictSpread( - /*max_cpu_fraction_per_node*/ 1.0, std::make_unique(bundle_locations))); ASSERT_TRUE(result1.status.IsInfeasible()); ASSERT_EQ(result1.selected_nodes.size(), 0); @@ -238,7 +237,6 @@ TEST_F(GcsResourceSchedulerTest, TestNodeFilter) { auto result2 = cluster_resource_scheduler_->Schedule( resource_request_list, SchedulingOptions::BundleStrictSpread( - /*max_cpu_fraction_per_node*/ 1.0, std::make_unique(nullptr))); ASSERT_TRUE(result2.status.IsSuccess()); ASSERT_EQ(result2.selected_nodes.size(), 1); diff --git a/src/ray/raylet/scheduling/cluster_resource_scheduler_test.cc b/src/ray/raylet/scheduling/tests/cluster_resource_scheduler_test.cc similarity index 99% rename from src/ray/raylet/scheduling/cluster_resource_scheduler_test.cc rename to src/ray/raylet/scheduling/tests/cluster_resource_scheduler_test.cc index cafc79dfbeab..9614fa22a737 100644 --- a/src/ray/raylet/scheduling/cluster_resource_scheduler_test.cc +++ b/src/ray/raylet/scheduling/tests/cluster_resource_scheduler_test.cc @@ -25,10 +25,10 @@ #include "gtest/gtest.h" #include "ray/common/ray_config.h" #include "ray/common/task/task_util.h" -#include "ray/common/test_util.h" +#include "ray/common/test_utils.h" #include 
"ray/common/scheduling/resource_set.h" #include "ray/common/scheduling/scheduling_ids.h" -#include "mock/ray/gcs/gcs_client/gcs_client.h" +#include "mock/ray/gcs_client/gcs_client.h" // clang-format on using namespace std; // NOLINT @@ -508,8 +508,10 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingWithPreferredNodeTest) { // Remote node is feasible but has no available resource. resource_scheduler.GetClusterResourceManager().AddOrUpdateNode( remote_node_id, remote_resource_total, {{"CPU", 0}}); + LeaseSpecification lease_spec1( + std::move(spec_builder_1).ConsumeAndBuild().GetMessage()); auto node_id_3 = resource_scheduler.GetBestSchedulableNode( - std::move(spec_builder_1).ConsumeAndBuild(), + lease_spec1, /*preferred_node_id=*/local_node_id.Binary(), false, false, @@ -550,8 +552,10 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingWithPreferredNodeTest) { "", nullptr); spec_builder_2.SetNormalTaskSpec(0, false, "", scheduling_strategy, ActorID::Nil()); + LeaseSpecification lease_spec2( + std::move(spec_builder_2).ConsumeAndBuild().GetMessage()); auto node_id_4 = resource_scheduler.GetBestSchedulableNode( - std::move(spec_builder_2).ConsumeAndBuild(), + lease_spec2, /*preferred_node_id=*/local_node_id.Binary(), false, false, @@ -1848,8 +1852,8 @@ TEST_F(ClusterResourceSchedulerTest, LabelSelectorIsSchedulableOnNodeTest) { label_selector_spec.SetNormalTaskSpec( 0, false, "", scheduling_strategy, ActorID::Nil()); auto built_label_selector = std::move(label_selector_spec).ConsumeAndBuild(); - resource_scheduler.GetBestSchedulableNode( - built_label_selector, "", false, false, &is_infeasible); + LeaseSpecification lease_spec(built_label_selector.GetMessage()); + resource_scheduler.GetBestSchedulableNode(lease_spec, "", false, false, &is_infeasible); ASSERT_TRUE(is_infeasible); // Set node labels - node should now be schedulable @@ -1858,7 +1862,7 @@ TEST_F(ClusterResourceSchedulerTest, LabelSelectorIsSchedulableOnNodeTest) { }; 
resource_scheduler.GetClusterResourceManager().SetNodeLabels(node_1, test_labels); auto best_node_2 = resource_scheduler.GetBestSchedulableNode( - built_label_selector, "", false, false, &is_infeasible); + lease_spec, "", false, false, &is_infeasible); ASSERT_EQ(best_node_2, node_1); ASSERT_FALSE(is_infeasible); } diff --git a/src/ray/raylet/scheduling/local_resource_manager_test.cc b/src/ray/raylet/scheduling/tests/local_resource_manager_test.cc similarity index 94% rename from src/ray/raylet/scheduling/local_resource_manager_test.cc rename to src/ray/raylet/scheduling/tests/local_resource_manager_test.cc index 30b30e573f13..17d9d260c10b 100644 --- a/src/ray/raylet/scheduling/local_resource_manager_test.cc +++ b/src/ray/raylet/scheduling/tests/local_resource_manager_test.cc @@ -371,4 +371,25 @@ TEST_F(LocalResourceManagerTest, CreateSyncMessageNegativeResourceAvailability) ASSERT_EQ(resource_view_sync_messge.resources_available().at("CPU"), 0); } +TEST_F(LocalResourceManagerTest, PopulateResourceViewSyncMessage) { + // Prepare node resources with labels. + NodeResources resources = CreateNodeResources({{ResourceID::CPU(), 2.0}}); + resources.labels = {{"label1", "value1"}, {"label2", "value2"}}; + + manager = std::make_unique( + local_node_id, resources, nullptr, nullptr, nullptr, nullptr); + + // Populate the sync message and verify labels are copied over. + syncer::ResourceViewSyncMessage msg; + manager->PopulateResourceViewSyncMessage(msg); + + // Verify total resources are populated. + ASSERT_EQ(msg.resources_total_size(), 1); + ASSERT_EQ(msg.resources_total().at("CPU"), 2.0); + // Verify labels are populated. 
+ ASSERT_EQ(msg.labels_size(), 2); + ASSERT_EQ(msg.labels().at("label1"), "value1"); + ASSERT_EQ(msg.labels().at("label2"), "value2"); +} + } // namespace ray diff --git a/src/ray/raylet/test/dependency_manager_test.cc b/src/ray/raylet/test/dependency_manager_test.cc deleted file mode 100644 index 9ad14a15df91..000000000000 --- a/src/ray/raylet/test/dependency_manager_test.cc +++ /dev/null @@ -1,399 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "ray/raylet/dependency_manager.h" - -#include -#include -#include -#include -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "mock/ray/object_manager/object_manager.h" -#include "ray/common/task/task_util.h" -#include "ray/common/test_util.h" - -namespace ray { - -namespace raylet { - -using ::testing::_; -using ::testing::InSequence; -using ::testing::Return; - -class CustomMockObjectManager : public MockObjectManager { - public: - uint64_t Pull(const std::vector &object_refs, - BundlePriority prio, - const TaskMetricsKey &task_key) override { - if (prio == BundlePriority::GET_REQUEST) { - active_get_requests.insert(req_id); - } else if (prio == BundlePriority::WAIT_REQUEST) { - active_wait_requests.insert(req_id); - } else { - active_task_requests.insert(req_id); - } - return req_id++; - } - - void CancelPull(uint64_t request_id) override { - ASSERT_TRUE(active_get_requests.erase(request_id) || - active_wait_requests.erase(request_id) || - active_task_requests.erase(request_id)); - } - - bool PullRequestActiveOrWaitingForMetadata(uint64_t request_id) const override { - return active_get_requests.count(request_id) || - active_wait_requests.count(request_id) || - active_task_requests.count(request_id); - } - - uint64_t req_id = 1; - std::unordered_set active_get_requests; - std::unordered_set active_wait_requests; - std::unordered_set active_task_requests; -}; - -class DependencyManagerTest : public ::testing::Test { - public: - DependencyManagerTest() - : object_manager_mock_(), dependency_manager_(object_manager_mock_) {} - - int64_t NumWaiting(const std::string &task_name) { - return dependency_manager_.waiting_tasks_counter_.Get({task_name, false}); - } - - int64_t NumWaitingTotal() { return dependency_manager_.waiting_tasks_counter_.Total(); } - - void AssertNoLeaks() { - ASSERT_TRUE(dependency_manager_.required_objects_.empty()); - ASSERT_TRUE(dependency_manager_.queued_task_requests_.empty()); - 
ASSERT_TRUE(dependency_manager_.get_requests_.empty()); - ASSERT_TRUE(dependency_manager_.wait_requests_.empty()); - ASSERT_EQ(dependency_manager_.waiting_tasks_counter_.Total(), 0); - // All pull requests are canceled. - ASSERT_TRUE(object_manager_mock_.active_task_requests.empty()); - ASSERT_TRUE(object_manager_mock_.active_get_requests.empty()); - ASSERT_TRUE(object_manager_mock_.active_wait_requests.empty()); - } - - CustomMockObjectManager object_manager_mock_; - DependencyManager dependency_manager_; -}; - -/// Test requesting the dependencies for a task. The dependency manager should -/// return the task ID as ready once all of its arguments are local. -TEST_F(DependencyManagerTest, TestSimpleTask) { - // Create a task with 3 arguments. - int num_arguments = 3; - std::vector arguments; - for (int i = 0; i < num_arguments; i++) { - arguments.push_back(ObjectID::FromRandom()); - } - TaskID task_id = RandomTaskId(); - bool ready = dependency_manager_.RequestTaskDependencies( - task_id, ObjectIdsToRefs(arguments), {"foo", false}); - ASSERT_FALSE(ready); - ASSERT_EQ(NumWaiting("bar"), 0); - ASSERT_EQ(NumWaiting("foo"), 1); - ASSERT_EQ(NumWaitingTotal(), 1); - - // For each argument, tell the task dependency manager that the argument is - // local. All arguments should be canceled as they become available locally. - auto ready_task_ids = dependency_manager_.HandleObjectLocal(arguments[0]); - ASSERT_TRUE(ready_task_ids.empty()); - ready_task_ids = dependency_manager_.HandleObjectLocal(arguments[1]); - ASSERT_TRUE(ready_task_ids.empty()); - // The task is ready to run. - ready_task_ids = dependency_manager_.HandleObjectLocal(arguments[2]); - ASSERT_EQ(ready_task_ids.size(), 1); - ASSERT_EQ(ready_task_ids.front(), task_id); - ASSERT_EQ(NumWaiting("bar"), 0); - ASSERT_EQ(NumWaiting("foo"), 0); - ASSERT_EQ(NumWaitingTotal(), 0); - - // Remove the task. 
- dependency_manager_.RemoveTaskDependencies(task_id); - AssertNoLeaks(); -} - -/// Test multiple tasks that depend on the same object. The dependency manager -/// should return all task IDs as ready once the object is local. -TEST_F(DependencyManagerTest, TestMultipleTasks) { - // Create 3 tasks that are dependent on the same object. - ObjectID argument_id = ObjectID::FromRandom(); - std::vector dependent_tasks; - int num_dependent_tasks = 3; - for (int i = 0; i < num_dependent_tasks; i++) { - TaskID task_id = RandomTaskId(); - dependent_tasks.push_back(task_id); - bool ready = dependency_manager_.RequestTaskDependencies( - task_id, ObjectIdsToRefs({argument_id}), {"foo", false}); - ASSERT_FALSE(ready); - // The object should be requested from the object manager once for each task. - ASSERT_EQ(object_manager_mock_.active_task_requests.size(), i + 1); - } - ASSERT_EQ(NumWaiting("bar"), 0); - ASSERT_EQ(NumWaiting("foo"), 3); - ASSERT_EQ(NumWaitingTotal(), 3); - - // Tell the task dependency manager that the object is local. - auto ready_task_ids = dependency_manager_.HandleObjectLocal(argument_id); - // Check that all tasks are now ready to run. - std::unordered_set added_tasks(dependent_tasks.begin(), dependent_tasks.end()); - for (auto &id : ready_task_ids) { - ASSERT_TRUE(added_tasks.erase(id)); - } - ASSERT_TRUE(added_tasks.empty()); - - for (auto &id : dependent_tasks) { - dependency_manager_.RemoveTaskDependencies(id); - } - AssertNoLeaks(); -} - -/// Test task with multiple dependencies. The dependency manager should return -/// the task ID as ready once all dependencies are local. If a dependency is -/// later evicted, the dependency manager should return the task ID as waiting. -TEST_F(DependencyManagerTest, TestTaskArgEviction) { - // Add a task with 3 arguments. 
- int num_arguments = 3; - std::vector arguments; - for (int i = 0; i < num_arguments; i++) { - arguments.push_back(ObjectID::FromRandom()); - } - TaskID task_id = RandomTaskId(); - bool ready = dependency_manager_.RequestTaskDependencies( - task_id, ObjectIdsToRefs(arguments), {"", false}); - ASSERT_FALSE(ready); - - // Tell the task dependency manager that each of the arguments is now - // available. - for (size_t i = 0; i < arguments.size(); i++) { - std::vector ready_tasks; - ready_tasks = dependency_manager_.HandleObjectLocal(arguments[i]); - if (i == arguments.size() - 1) { - ASSERT_EQ(ready_tasks.size(), 1); - ASSERT_EQ(ready_tasks.front(), task_id); - } else { - ASSERT_TRUE(ready_tasks.empty()); - } - } - - // Simulate each of the arguments getting evicted. Each object should now be - // considered remote. - for (size_t i = 0; i < arguments.size(); i++) { - std::vector waiting_tasks; - waiting_tasks = dependency_manager_.HandleObjectMissing(arguments[i]); - if (i == 0) { - // The first eviction should cause the task to go back to the waiting - // state. - ASSERT_EQ(waiting_tasks.size(), 1); - ASSERT_EQ(waiting_tasks.front(), task_id); - } else { - // The subsequent evictions shouldn't cause any more tasks to go back to - // the waiting state. - ASSERT_TRUE(waiting_tasks.empty()); - } - } - - // Tell the task dependency manager that each of the arguments is available - // again. - for (size_t i = 0; i < arguments.size(); i++) { - std::vector ready_tasks; - ready_tasks = dependency_manager_.HandleObjectLocal(arguments[i]); - if (i == arguments.size() - 1) { - ASSERT_EQ(ready_tasks.size(), 1); - ASSERT_EQ(ready_tasks.front(), task_id); - } else { - ASSERT_TRUE(ready_tasks.empty()); - } - } - - dependency_manager_.RemoveTaskDependencies(task_id); - AssertNoLeaks(); -} - -/// Test `ray.get`. Worker calls ray.get on {oid1}, then {oid1, oid2}, then -/// {oid1, oid2, oid3}. 
-TEST_F(DependencyManagerTest, TestGet) { - WorkerID worker_id = WorkerID::FromRandom(); - int num_arguments = 3; - std::vector arguments; - for (int i = 0; i < num_arguments; i++) { - // Add the new argument to the list of dependencies to subscribe to. - ObjectID argument_id = ObjectID::FromRandom(); - arguments.push_back(argument_id); - // Subscribe to the task's dependencies. All arguments except the last are - // duplicates of previous subscription calls. Each argument should only be - // requested from the node manager once. - auto prev_pull_reqs = object_manager_mock_.active_get_requests; - dependency_manager_.StartOrUpdateGetRequest(worker_id, ObjectIdsToRefs(arguments)); - // Previous pull request for this get should be canceled upon each new - // bundle. - ASSERT_EQ(object_manager_mock_.active_get_requests.size(), 1); - ASSERT_NE(object_manager_mock_.active_get_requests, prev_pull_reqs); - } - - // Nothing happens if the same bundle is requested. - auto prev_pull_reqs = object_manager_mock_.active_get_requests; - dependency_manager_.StartOrUpdateGetRequest(worker_id, ObjectIdsToRefs(arguments)); - ASSERT_EQ(object_manager_mock_.active_get_requests, prev_pull_reqs); - - // Cancel the pull request once the worker cancels the `ray.get`. - dependency_manager_.CancelGetRequest(worker_id); - AssertNoLeaks(); -} - -/// Test that when one of the objects becomes local after a `ray.wait` call, -/// all requests to remote nodes associated with the object are canceled. -TEST_F(DependencyManagerTest, TestWait) { - // Generate a random worker and objects to wait on. - WorkerID worker_id = WorkerID::FromRandom(); - int num_objects = 3; - std::vector oids; - for (int i = 0; i < num_objects; i++) { - oids.push_back(ObjectID::FromRandom()); - } - dependency_manager_.StartOrUpdateWaitRequest(worker_id, ObjectIdsToRefs(oids)); - ASSERT_EQ(object_manager_mock_.active_wait_requests.size(), num_objects); - - for (int i = 0; i < num_objects; i++) { - // Object is local. 
- auto ready_task_ids = dependency_manager_.HandleObjectLocal(oids[i]); - - // Local object gets evicted. The `ray.wait` call should not be - // reactivated. - auto waiting_task_ids = dependency_manager_.HandleObjectMissing(oids[i]); - ASSERT_TRUE(waiting_task_ids.empty()); - ASSERT_EQ(object_manager_mock_.active_wait_requests.size(), num_objects - i - 1); - } - AssertNoLeaks(); -} - -/// Test that when no objects are locally available, a `ray.wait` call makes -/// the correct requests to remote nodes and correctly cancels the requests -/// when the `ray.wait` call is canceled. -TEST_F(DependencyManagerTest, TestWaitThenCancel) { - // Generate a random worker and objects to wait on. - WorkerID worker_id = WorkerID::FromRandom(); - int num_objects = 3; - std::vector oids; - for (int i = 0; i < num_objects; i++) { - oids.push_back(ObjectID::FromRandom()); - } - // Simulate a worker calling `ray.wait` on some objects. - dependency_manager_.StartOrUpdateWaitRequest(worker_id, ObjectIdsToRefs(oids)); - ASSERT_EQ(object_manager_mock_.active_wait_requests.size(), num_objects); - // Check that it's okay to call `ray.wait` on the same objects again. No new - // calls should be made to try and make the objects local. - dependency_manager_.StartOrUpdateWaitRequest(worker_id, ObjectIdsToRefs(oids)); - ASSERT_EQ(object_manager_mock_.active_wait_requests.size(), num_objects); - // Cancel the worker's `ray.wait`. - dependency_manager_.CancelWaitRequest(worker_id); - AssertNoLeaks(); -} - -/// Test that when one of the objects is already local at the time of the -/// `ray.wait` call, the `ray.wait` call does not trigger any requests to -/// remote nodes for that object. -TEST_F(DependencyManagerTest, TestWaitObjectLocal) { - // Generate a random worker and objects to wait on. 
- WorkerID worker_id = WorkerID::FromRandom(); - int num_objects = 3; - std::vector oids; - for (int i = 0; i < num_objects; i++) { - oids.push_back(ObjectID::FromRandom()); - } - // Simulate one of the objects becoming local. The later `ray.wait` call - // should have no effect because the object is already local. - const ObjectID local_object_id = std::move(oids.back()); - auto ready_task_ids = dependency_manager_.HandleObjectLocal(local_object_id); - ASSERT_TRUE(ready_task_ids.empty()); - dependency_manager_.StartOrUpdateWaitRequest(worker_id, ObjectIdsToRefs(oids)); - ASSERT_EQ(object_manager_mock_.active_wait_requests.size(), num_objects - 1); - // Simulate the local object getting evicted. The `ray.wait` call should not - // be reactivated. - auto waiting_task_ids = dependency_manager_.HandleObjectMissing(local_object_id); - ASSERT_TRUE(waiting_task_ids.empty()); - ASSERT_EQ(object_manager_mock_.active_wait_requests.size(), num_objects - 1); - // Cancel the worker's `ray.wait`. - dependency_manager_.CancelWaitRequest(worker_id); - AssertNoLeaks(); -} - -/// Test requesting the dependencies for a task. The dependency manager should -/// return the task ID as ready once all of its unique arguments are local. -TEST_F(DependencyManagerTest, TestDuplicateTaskArgs) { - // Create a task with 3 arguments. 
- int num_arguments = 3; - auto obj_id = ObjectID::FromRandom(); - std::vector arguments; - for (int i = 0; i < num_arguments; i++) { - arguments.push_back(obj_id); - } - TaskID task_id = RandomTaskId(); - bool ready = dependency_manager_.RequestTaskDependencies( - task_id, ObjectIdsToRefs(arguments), {"", false}); - ASSERT_FALSE(ready); - ASSERT_EQ(object_manager_mock_.active_task_requests.size(), 1); - - auto ready_task_ids = dependency_manager_.HandleObjectLocal(obj_id); - ASSERT_EQ(ready_task_ids.size(), 1); - ASSERT_EQ(ready_task_ids.front(), task_id); - dependency_manager_.RemoveTaskDependencies(task_id); - - TaskID task_id2 = RandomTaskId(); - ready = dependency_manager_.RequestTaskDependencies( - task_id2, ObjectIdsToRefs(arguments), {"", false}); - ASSERT_TRUE(ready); - ASSERT_EQ(object_manager_mock_.active_task_requests.size(), 1); - dependency_manager_.RemoveTaskDependencies(task_id2); - - AssertNoLeaks(); -} - -/// Test that RemoveTaskDependencies is called before objects -/// becoming local (e.g. the task is cancelled). 
-TEST_F(DependencyManagerTest, TestRemoveTaskDependenciesBeforeLocal) { - int num_arguments = 3; - std::vector arguments; - for (int i = 0; i < num_arguments; i++) { - arguments.push_back(ObjectID::FromRandom()); - } - TaskID task_id = RandomTaskId(); - bool ready = dependency_manager_.RequestTaskDependencies( - task_id, ObjectIdsToRefs(arguments), {"foo", false}); - ASSERT_FALSE(ready); - ASSERT_EQ(NumWaiting("bar"), 0); - ASSERT_EQ(NumWaiting("foo"), 1); - ASSERT_EQ(NumWaitingTotal(), 1); - - // The task is cancelled - dependency_manager_.RemoveTaskDependencies(task_id); - ASSERT_EQ(NumWaiting("foo"), 0); - ASSERT_EQ(NumWaitingTotal(), 0); - AssertNoLeaks(); -} - -} // namespace raylet - -} // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/raylet/test/BUILD.bazel b/src/ray/raylet/tests/BUILD.bazel similarity index 81% rename from src/ray/raylet/test/BUILD.bazel rename to src/ray/raylet/tests/BUILD.bazel index abd8e45e5e4f..615dae910b46 100644 --- a/src/ray/raylet/test/BUILD.bazel +++ b/src/ray/raylet/tests/BUILD.bazel @@ -33,6 +33,7 @@ ray_cc_test( "//:ray_mock", "//src/ray/raylet:worker_pool", "//src/ray/util:path_utils", + "//src/ray/util:raii", "@com_google_googletest//:gtest_main", ], ) @@ -49,7 +50,7 @@ ray_cc_test( "//:ray_mock", "//src/ray/common:asio", "//src/ray/common:id", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/gcs_client", "//src/ray/object_manager:ownership_object_directory", "//src/ray/protobuf:core_worker_cc_grpc", "//src/ray/pubsub:subscriber", @@ -69,8 +70,7 @@ ray_cc_test( deps = [ "//:ray_mock", "//src/ray/common:id", - "//src/ray/common:task_common", - "//src/ray/gcs/test:gcs_test_util_lib", + "//src/ray/common/scheduling:placement_group_util", "//src/ray/raylet:placement_group_resource_manager", "@com_google_googletest//:gtest_main", ], @@ -95,31 +95,33 @@ ray_cc_test( ) ray_cc_test( - name = "dependency_manager_test", + name = 
"lease_dependency_manager_test", size = "small", - srcs = ["dependency_manager_test.cc"], + srcs = ["lease_dependency_manager_test.cc"], tags = ["team:core"], deps = [ "//:ray_mock", - "//src/ray/common:task_common", - "//src/ray/common:test_util", - "//src/ray/raylet:dependency_manager", + "//src/ray/common:test_utils", + "//src/ray/observability:fake_metric", + "//src/ray/raylet:lease_dependency_manager", "@com_google_googletest//:gtest_main", ], ) ray_cc_test( - name = "local_task_manager_test", + name = "local_lease_manager_test", size = "small", - srcs = ["local_task_manager_test.cc"], + srcs = ["local_lease_manager_test.cc"], tags = ["team:core"], deps = [ ":util", "//:ray_mock", "//src/ray/common:id", + "//src/ray/common:lease", "//src/ray/common:task_common", - "//src/ray/common:test_util", - "//src/ray/raylet:local_task_manager", + "//src/ray/common:test_utils", + "//src/ray/observability:fake_metric", + "//src/ray/raylet:local_lease_manager", "//src/ray/raylet/scheduling:cluster_resource_scheduler", "@com_google_googletest//:gtest_main", ], @@ -134,7 +136,7 @@ ray_cc_test( tags = ["team:core"], deps = [ ":util", - "//src/ray/common:task_common", + "//src/ray/common:lease", "//src/ray/raylet:worker_killing_policy", "@com_google_googletest//:gtest_main", ], @@ -149,7 +151,7 @@ ray_cc_test( tags = ["team:core"], deps = [ ":util", - "//src/ray/common:task_common", + "//src/ray/common:lease", "//src/ray/raylet:worker_killing_policy", "@com_google_googletest//:gtest_main", ], @@ -164,7 +166,7 @@ ray_cc_test( tags = ["team:core"], deps = [ ":util", - "//src/ray/common:task_common", + "//src/ray/common:lease", "//src/ray/raylet:worker_killing_policy", "@com_google_googletest//:gtest_main", ], @@ -179,11 +181,15 @@ ray_cc_test( ":util", "//:ray_fakes", "//:ray_mock", + "//src/fakes/ray/object_manager/plasma:fake_plasma_client", + "//src/fakes/ray/rpc/raylet:fake_raylet_client", + "//src/ray/common:lease", "//src/ray/common:ray_object", - 
"//src/ray/object_manager/plasma:plasma_client", + "//src/ray/common:task_common", + "//src/ray/observability:fake_metric", "//src/ray/raylet:local_object_manager_interface", "//src/ray/raylet:node_manager", - "//src/ray/raylet/scheduling:cluster_task_manager", + "//src/ray/raylet/scheduling:cluster_lease_manager", "//src/ray/util:macros", "@com_google_googletest//:gtest_main", ], diff --git a/src/ray/raylet/tests/lease_dependency_manager_test.cc b/src/ray/raylet/tests/lease_dependency_manager_test.cc new file mode 100644 index 000000000000..73b41cd0d688 --- /dev/null +++ b/src/ray/raylet/tests/lease_dependency_manager_test.cc @@ -0,0 +1,418 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/raylet/lease_dependency_manager.h" + +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "mock/ray/object_manager/object_manager.h" +#include "ray/common/test_utils.h" +#include "ray/observability/fake_metric.h" + +namespace ray { + +namespace raylet { + +using ::testing::_; +using ::testing::InSequence; +using ::testing::Return; + +class CustomMockObjectManager : public MockObjectManager { + public: + uint64_t Pull(const std::vector &object_refs, + BundlePriority prio, + const TaskMetricsKey &task_key) override { + if (prio == BundlePriority::GET_REQUEST) { + active_get_requests.insert(req_id); + } else if (prio == BundlePriority::WAIT_REQUEST) { + active_wait_requests.insert(req_id); + } else { + active_lease_requests.insert(req_id); + } + return req_id++; + } + + void CancelPull(uint64_t request_id) override { + ASSERT_TRUE(active_get_requests.erase(request_id) || + active_wait_requests.erase(request_id) || + active_lease_requests.erase(request_id)); + } + + bool PullRequestActiveOrWaitingForMetadata(uint64_t request_id) const override { + return active_get_requests.count(request_id) || + active_wait_requests.count(request_id) || + active_lease_requests.count(request_id); + } + + uint64_t req_id = 1; + std::unordered_set active_get_requests; + std::unordered_set active_wait_requests; + std::unordered_set active_lease_requests; +}; + +class LeaseDependencyManagerTest : public ::testing::Test { + public: + LeaseDependencyManagerTest() + : object_manager_mock_(), + fake_task_by_state_counter_(), + lease_dependency_manager_(object_manager_mock_, fake_task_by_state_counter_) {} + + int64_t NumWaiting(const std::string &lease_name) { + return lease_dependency_manager_.waiting_leases_counter_.Get({lease_name, false}); + } + + int64_t NumWaitingTotal() { + return lease_dependency_manager_.waiting_leases_counter_.Total(); + } + + void AssertNoLeaks() { + 
ASSERT_TRUE(lease_dependency_manager_.required_objects_.empty()); + ASSERT_TRUE(lease_dependency_manager_.queued_lease_requests_.empty()); + ASSERT_TRUE(lease_dependency_manager_.get_requests_.empty()); + ASSERT_TRUE(lease_dependency_manager_.wait_requests_.empty()); + ASSERT_EQ(lease_dependency_manager_.waiting_leases_counter_.Total(), 0); + // All pull requests are canceled. + ASSERT_TRUE(object_manager_mock_.active_lease_requests.empty()); + ASSERT_TRUE(object_manager_mock_.active_get_requests.empty()); + ASSERT_TRUE(object_manager_mock_.active_wait_requests.empty()); + } + + CustomMockObjectManager object_manager_mock_; + ray::observability::FakeMetric fake_task_by_state_counter_; + LeaseDependencyManager lease_dependency_manager_; +}; + +TEST_F(LeaseDependencyManagerTest, TestRecordMetrics) { + auto obj_id = ObjectID::FromRandom(); + lease_dependency_manager_.RequestLeaseDependencies( + LeaseID::FromRandom(), ObjectIdsToRefs({obj_id}), {"foo", false}); + lease_dependency_manager_.HandleObjectLocal(obj_id); + lease_dependency_manager_.RecordMetrics(); + auto tag_to_value = fake_task_by_state_counter_.GetTagToValue(); + // 3 states: PENDING_NODE_ASSIGNMENT, PENDING_ARGS_FETCH, PENDING_OBJ_STORE_MEM_AVAIL + ASSERT_EQ(tag_to_value.size(), 3); + ASSERT_EQ(tag_to_value.begin()->first.at("Name"), "foo"); +} + +/// Test requesting the dependencies for a lease. The dependency manager should +/// return the lease ID as ready once all of its arguments are local. +TEST_F(LeaseDependencyManagerTest, TestSimpleLease) { + // Create a lease with 3 arguments. 
+ int num_arguments = 3; + std::vector arguments; + for (int i = 0; i < num_arguments; i++) { + arguments.push_back(ObjectID::FromRandom()); + } + LeaseID lease_id = LeaseID::FromRandom(); + bool ready = lease_dependency_manager_.RequestLeaseDependencies( + lease_id, ObjectIdsToRefs(arguments), {"foo", false}); + ASSERT_FALSE(ready); + ASSERT_EQ(NumWaiting("bar"), 0); + ASSERT_EQ(NumWaiting("foo"), 1); + ASSERT_EQ(NumWaitingTotal(), 1); + + // For each argument, tell the lease dependency manager that the argument is + // local. All arguments should be canceled as they become available locally. + auto ready_lease_ids = lease_dependency_manager_.HandleObjectLocal(arguments[0]); + ASSERT_TRUE(ready_lease_ids.empty()); + ready_lease_ids = lease_dependency_manager_.HandleObjectLocal(arguments[1]); + ASSERT_TRUE(ready_lease_ids.empty()); + // The lease is ready to run. + ready_lease_ids = lease_dependency_manager_.HandleObjectLocal(arguments[2]); + ASSERT_EQ(ready_lease_ids.size(), 1); + ASSERT_EQ(ready_lease_ids.front(), lease_id); + ASSERT_EQ(NumWaiting("bar"), 0); + ASSERT_EQ(NumWaiting("foo"), 0); + ASSERT_EQ(NumWaitingTotal(), 0); + + // Remove the lease. + lease_dependency_manager_.RemoveLeaseDependencies(lease_id); + AssertNoLeaks(); +} + +/// Test multiple leases that depend on the same object. The dependency manager +/// should return all lease IDs as ready once the object is local. +TEST_F(LeaseDependencyManagerTest, TestMultipleLeases) { + // Create 3 leases that are dependent on the same object. 
+ ObjectID argument_id = ObjectID::FromRandom(); + std::vector dependent_leases; + int num_dependent_leases = 3; + for (int i = 0; i < num_dependent_leases; i++) { + LeaseID lease_id = LeaseID::FromRandom(); + dependent_leases.push_back(lease_id); + bool ready = lease_dependency_manager_.RequestLeaseDependencies( + lease_id, ObjectIdsToRefs({argument_id}), {"foo", false}); + ASSERT_FALSE(ready); + // The object should be requested from the object manager once for each lease. + ASSERT_EQ(object_manager_mock_.active_lease_requests.size(), i + 1); + } + ASSERT_EQ(NumWaiting("bar"), 0); + ASSERT_EQ(NumWaiting("foo"), 3); + ASSERT_EQ(NumWaitingTotal(), 3); + + // Tell the lease dependency manager that the object is local. + auto ready_lease_ids = lease_dependency_manager_.HandleObjectLocal(argument_id); + // Check that all leases are now ready to run. + std::unordered_set added_leases(dependent_leases.begin(), + dependent_leases.end()); + for (auto &id : ready_lease_ids) { + ASSERT_TRUE(added_leases.erase(id)); + } + ASSERT_TRUE(added_leases.empty()); + + for (auto &id : dependent_leases) { + lease_dependency_manager_.RemoveLeaseDependencies(id); + } + AssertNoLeaks(); +} + +/// Test lease with multiple dependencies. The dependency manager should return +/// the lease ID as ready once all dependencies are local. If a dependency is +/// later evicted, the dependency manager should return the lease ID as waiting. +TEST_F(LeaseDependencyManagerTest, TestLeaseArgEviction) { + // Add a lease with 3 arguments. + int num_arguments = 3; + std::vector arguments; + for (int i = 0; i < num_arguments; i++) { + arguments.push_back(ObjectID::FromRandom()); + } + LeaseID lease_id = LeaseID::FromRandom(); + bool ready = lease_dependency_manager_.RequestLeaseDependencies( + lease_id, ObjectIdsToRefs(arguments), {"", false}); + ASSERT_FALSE(ready); + + // Tell the lease dependency manager that each of the arguments is now + // available. 
+ for (size_t i = 0; i < arguments.size(); i++) { + std::vector ready_leases; + ready_leases = lease_dependency_manager_.HandleObjectLocal(arguments[i]); + if (i == arguments.size() - 1) { + ASSERT_EQ(ready_leases.size(), 1); + ASSERT_EQ(ready_leases.front(), lease_id); + } else { + ASSERT_TRUE(ready_leases.empty()); + } + } + + // Simulate each of the arguments getting evicted. Each object should now be + // considered remote. + for (size_t i = 0; i < arguments.size(); i++) { + std::vector waiting_leases; + waiting_leases = lease_dependency_manager_.HandleObjectMissing(arguments[i]); + if (i == 0) { + // The first eviction should cause the lease to go back to the waiting + // state. + ASSERT_EQ(waiting_leases.size(), 1); + ASSERT_EQ(waiting_leases.front(), lease_id); + } else { + // The subsequent evictions shouldn't cause any more leases to go back to + // the waiting state. + ASSERT_TRUE(waiting_leases.empty()); + } + } + + // Tell the lease dependency manager that each of the arguments is available + // again. + for (size_t i = 0; i < arguments.size(); i++) { + std::vector ready_leases; + ready_leases = lease_dependency_manager_.HandleObjectLocal(arguments[i]); + if (i == arguments.size() - 1) { + ASSERT_EQ(ready_leases.size(), 1); + ASSERT_EQ(ready_leases.front(), lease_id); + } else { + ASSERT_TRUE(ready_leases.empty()); + } + } + + lease_dependency_manager_.RemoveLeaseDependencies(lease_id); + AssertNoLeaks(); +} + +/// Test `ray.get`. Worker calls ray.get on {oid1}, then {oid1, oid2}, then +/// {oid1, oid2, oid3}. +TEST_F(LeaseDependencyManagerTest, TestGet) { + WorkerID worker_id = WorkerID::FromRandom(); + int num_arguments = 3; + std::vector arguments; + for (int i = 0; i < num_arguments; i++) { + // Add the new argument to the list of dependencies to subscribe to. + ObjectID argument_id = ObjectID::FromRandom(); + arguments.push_back(argument_id); + // Subscribe to the lease's dependencies. 
All arguments except the last are + // duplicates of previous subscription calls. Each argument should only be + // requested from the node manager once. + auto prev_pull_reqs = object_manager_mock_.active_get_requests; + lease_dependency_manager_.StartOrUpdateGetRequest(worker_id, + ObjectIdsToRefs(arguments)); + // Previous pull request for this get should be canceled upon each new + // bundle. + ASSERT_EQ(object_manager_mock_.active_get_requests.size(), 1); + ASSERT_NE(object_manager_mock_.active_get_requests, prev_pull_reqs); + } + + // Nothing happens if the same bundle is requested. + auto prev_pull_reqs = object_manager_mock_.active_get_requests; + lease_dependency_manager_.StartOrUpdateGetRequest(worker_id, + ObjectIdsToRefs(arguments)); + ASSERT_EQ(object_manager_mock_.active_get_requests, prev_pull_reqs); + + // Cancel the pull request once the worker cancels the `ray.get`. + lease_dependency_manager_.CancelGetRequest(worker_id); + AssertNoLeaks(); +} + +/// Test that when one of the objects becomes local after a `ray.wait` call, +/// all requests to remote nodes associated with the object are canceled. +TEST_F(LeaseDependencyManagerTest, TestWait) { + // Generate a random worker and objects to wait on. + WorkerID worker_id = WorkerID::FromRandom(); + int num_objects = 3; + std::vector oids; + for (int i = 0; i < num_objects; i++) { + oids.push_back(ObjectID::FromRandom()); + } + lease_dependency_manager_.StartOrUpdateWaitRequest(worker_id, ObjectIdsToRefs(oids)); + ASSERT_EQ(object_manager_mock_.active_wait_requests.size(), num_objects); + + for (int i = 0; i < num_objects; i++) { + // Object is local. + auto ready_lease_ids = lease_dependency_manager_.HandleObjectLocal(oids[i]); + + // Local object gets evicted. The `ray.wait` call should not be + // reactivated. 
+ auto waiting_lease_ids = lease_dependency_manager_.HandleObjectMissing(oids[i]); + ASSERT_TRUE(waiting_lease_ids.empty()); + ASSERT_EQ(object_manager_mock_.active_wait_requests.size(), num_objects - i - 1); + } + AssertNoLeaks(); +} + +/// Test that when no objects are locally available, a `ray.wait` call makes +/// the correct requests to remote nodes and correctly cancels the requests +/// when the `ray.wait` call is canceled. +TEST_F(LeaseDependencyManagerTest, TestWaitThenCancel) { + // Generate a random worker and objects to wait on. + WorkerID worker_id = WorkerID::FromRandom(); + int num_objects = 3; + std::vector oids; + for (int i = 0; i < num_objects; i++) { + oids.push_back(ObjectID::FromRandom()); + } + // Simulate a worker calling `ray.wait` on some objects. + lease_dependency_manager_.StartOrUpdateWaitRequest(worker_id, ObjectIdsToRefs(oids)); + ASSERT_EQ(object_manager_mock_.active_wait_requests.size(), num_objects); + // Check that it's okay to call `ray.wait` on the same objects again. No new + // calls should be made to try and make the objects local. + lease_dependency_manager_.StartOrUpdateWaitRequest(worker_id, ObjectIdsToRefs(oids)); + ASSERT_EQ(object_manager_mock_.active_wait_requests.size(), num_objects); + // Cancel the worker's `ray.wait`. + lease_dependency_manager_.CancelWaitRequest(worker_id); + AssertNoLeaks(); +} + +/// Test that when one of the objects is already local at the time of the +/// `ray.wait` call, the `ray.wait` call does not trigger any requests to +/// remote nodes for that object. +TEST_F(LeaseDependencyManagerTest, TestWaitObjectLocal) { + // Generate a random worker and objects to wait on. + WorkerID worker_id = WorkerID::FromRandom(); + int num_objects = 3; + std::vector oids; + for (int i = 0; i < num_objects; i++) { + oids.push_back(ObjectID::FromRandom()); + } + // Simulate one of the objects becoming local. The later `ray.wait` call + // should have no effect because the object is already local. 
+ const ObjectID local_object_id = std::move(oids.back()); + auto ready_lease_ids = lease_dependency_manager_.HandleObjectLocal(local_object_id); + ASSERT_TRUE(ready_lease_ids.empty()); + lease_dependency_manager_.StartOrUpdateWaitRequest(worker_id, ObjectIdsToRefs(oids)); + ASSERT_EQ(object_manager_mock_.active_wait_requests.size(), num_objects - 1); + // Simulate the local object getting evicted. The `ray.wait` call should not + // be reactivated. + auto waiting_lease_ids = lease_dependency_manager_.HandleObjectMissing(local_object_id); + ASSERT_TRUE(waiting_lease_ids.empty()); + ASSERT_EQ(object_manager_mock_.active_wait_requests.size(), num_objects - 1); + // Cancel the worker's `ray.wait`. + lease_dependency_manager_.CancelWaitRequest(worker_id); + AssertNoLeaks(); +} + +/// Test requesting the dependencies for a lease. The dependency manager should +/// return the lease ID as ready once all of its unique arguments are local. +TEST_F(LeaseDependencyManagerTest, TestDuplicateLeaseArgs) { + // Create a lease with 3 arguments. 
+ int num_arguments = 3; + auto obj_id = ObjectID::FromRandom(); + std::vector arguments; + for (int i = 0; i < num_arguments; i++) { + arguments.push_back(obj_id); + } + LeaseID lease_id = LeaseID::FromRandom(); + bool ready = lease_dependency_manager_.RequestLeaseDependencies( + lease_id, ObjectIdsToRefs(arguments), {"", false}); + ASSERT_FALSE(ready); + ASSERT_EQ(object_manager_mock_.active_lease_requests.size(), 1); + + auto ready_lease_ids = lease_dependency_manager_.HandleObjectLocal(obj_id); + ASSERT_EQ(ready_lease_ids.size(), 1); + ASSERT_EQ(ready_lease_ids.front(), lease_id); + lease_dependency_manager_.RemoveLeaseDependencies(lease_id); + + LeaseID lease_id2 = LeaseID::FromRandom(); + ready = lease_dependency_manager_.RequestLeaseDependencies( + lease_id2, ObjectIdsToRefs(arguments), {"", false}); + ASSERT_TRUE(ready); + ASSERT_EQ(object_manager_mock_.active_lease_requests.size(), 1); + lease_dependency_manager_.RemoveLeaseDependencies(lease_id2); + + AssertNoLeaks(); +} + +/// Test that RemoveLeaseDependencies is called before objects +/// becoming local (e.g. the lease is cancelled). 
+TEST_F(LeaseDependencyManagerTest, TestRemoveLeaseDependenciesBeforeLocal) { + int num_arguments = 3; + std::vector arguments; + for (int i = 0; i < num_arguments; i++) { + arguments.push_back(ObjectID::FromRandom()); + } + LeaseID lease_id = LeaseID::FromRandom(); + bool ready = lease_dependency_manager_.RequestLeaseDependencies( + lease_id, ObjectIdsToRefs(arguments), {"foo", false}); + ASSERT_FALSE(ready); + ASSERT_EQ(NumWaiting("bar"), 0); + ASSERT_EQ(NumWaiting("foo"), 1); + ASSERT_EQ(NumWaitingTotal(), 1); + + // The lease is cancelled + lease_dependency_manager_.RemoveLeaseDependencies(lease_id); + ASSERT_EQ(NumWaiting("foo"), 0); + ASSERT_EQ(NumWaitingTotal(), 0); + AssertNoLeaks(); +} + +} // namespace raylet + +} // namespace ray + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/ray/raylet/test/local_task_manager_test.cc b/src/ray/raylet/tests/local_lease_manager_test.cc similarity index 73% rename from src/ray/raylet/test/local_task_manager_test.cc rename to src/ray/raylet/tests/local_lease_manager_test.cc index fe505a303340..c2755dfe0659 100644 --- a/src/ray/raylet/test/local_task_manager_test.cc +++ b/src/ray/raylet/tests/local_lease_manager_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/raylet/local_task_manager.h" +#include "ray/raylet/local_lease_manager.h" #include #include @@ -25,14 +25,15 @@ #include #include -#include "mock/ray/gcs/gcs_client/gcs_client.h" +#include "mock/ray/gcs_client/gcs_client.h" #include "mock/ray/object_manager/object_manager.h" #include "ray/common/id.h" -#include "ray/common/task/task.h" +#include "ray/common/lease/lease.h" #include "ray/common/task/task_util.h" -#include "ray/common/test_util.h" +#include "ray/common/test_utils.h" +#include "ray/observability/fake_metric.h" #include "ray/raylet/scheduling/cluster_resource_scheduler.h" -#include "ray/raylet/test/util.h" +#include "ray/raylet/tests/util.h" namespace ray::raylet { @@ -42,10 +43,10 @@ class MockWorkerPool : public WorkerPoolInterface { public: MockWorkerPool() : num_pops(0) {} - void PopWorker(const TaskSpecification &task_spec, + void PopWorker(const LeaseSpecification &lease_spec, const PopWorkerCallback &callback) override { num_pops++; - const int runtime_env_hash = task_spec.GetRuntimeEnvHash(); + const int runtime_env_hash = lease_spec.GetRuntimeEnvHash(); callbacks[runtime_env_hash].push_back(callback); } @@ -81,7 +82,7 @@ class MockWorkerPool : public WorkerPoolInterface { RAY_CHECK(status != PopWorkerStatus::OK); for (const auto &pair : callbacks) { for (const auto &callback : pair.second) { - // No task should be dispatched. + // No lease should be granted. 
ASSERT_FALSE( callback(nullptr, status, @@ -95,23 +96,23 @@ class MockWorkerPool : public WorkerPoolInterface { for (auto it = workers.begin(); it != workers.end();) { std::shared_ptr worker = *it; auto runtime_env_hash = worker->GetRuntimeEnvHash(); - bool dispatched = false; + bool granted = false; auto cb_it = callbacks.find(runtime_env_hash); if (cb_it != callbacks.end()) { auto &list = cb_it->second; RAY_CHECK(!list.empty()); for (auto list_it = list.begin(); list_it != list.end();) { auto &callback = *list_it; - dispatched = callback(worker, PopWorkerStatus::OK, ""); + granted = callback(worker, PopWorkerStatus::OK, ""); list_it = list.erase(list_it); - if (dispatched) { + if (granted) { break; } } if (list.empty()) { callbacks.erase(cb_it); } - if (dispatched) { + if (granted) { it = workers.erase(it); continue; } @@ -208,7 +209,7 @@ class MockWorkerPool : public WorkerPoolInterface { RAY_CHECK(false) << "Not used."; } - void PrestartWorkers(const TaskSpecification &task_spec, + void PrestartWorkers(const LeaseSpecification &lease_spec, int64_t backlog_size) override { RAY_CHECK(false) << "Not used."; } @@ -265,9 +266,9 @@ std::shared_ptr CreateSingleNodeScheduler( return scheduler; } -RayTask CreateTask(const std::unordered_map &required_resources, - const std::string &task_name = "default", - const std::vector> &args = {}) { +RayLease CreateLease(const std::unordered_map &required_resources, + const std::string &task_name = "default", + const std::vector> &args = {}) { TaskSpecBuilder spec_builder; TaskID id = RandomTaskId(); JobID job_id = RandomJobId(); @@ -301,23 +302,27 @@ RayTask CreateTask(const std::unordered_map &required_resou spec_builder.AddArg(*arg); } - return RayTask(std::move(spec_builder).ConsumeAndBuild()); + TaskSpecification spec = std::move(spec_builder).ConsumeAndBuild(); + LeaseSpecification lease_spec(spec.GetMessage()); + lease_spec.GetMutableMessage().set_lease_id(LeaseID::FromRandom().Binary()); + return 
RayLease(std::move(lease_spec)); } } // namespace -class LocalTaskManagerTest : public ::testing::Test { +class LocalLeaseManagerTest : public ::testing::Test { public: - explicit LocalTaskManagerTest(double num_cpus = 3.0) + explicit LocalLeaseManagerTest(double num_cpus = 3.0) : gcs_client_(std::make_unique()), id_(NodeID::FromRandom()), scheduler_(CreateSingleNodeScheduler(id_.Binary(), num_cpus, *gcs_client_)), object_manager_(), - dependency_manager_(object_manager_), - local_task_manager_(std::make_shared( + fake_task_by_state_counter_(), + lease_dependency_manager_(object_manager_, fake_task_by_state_counter_), + local_lease_manager_(std::make_shared( id_, *scheduler_, - dependency_manager_, + lease_dependency_manager_, /* get_node_info= */ [this](const NodeID &node_id) -> const rpc::GcsNodeInfo * { if (node_info_.count(node_id) != 0) { @@ -327,7 +332,7 @@ class LocalTaskManagerTest : public ::testing::Test { }, pool_, leased_workers_, - /* get_task_arguments= */ + /* get_lease_arguments= */ [this](const std::vector &object_ids, std::vector> *results) { for (auto &obj_id : object_ids) { @@ -339,7 +344,7 @@ class LocalTaskManagerTest : public ::testing::Test { } return true; }, - /*max_pinned_task_arguments_bytes=*/1000, + /*max_pinned_lease_arguments_bytes=*/1000, /*get_time=*/[this]() { return current_time_ms_; })) {} void SetUp() override { @@ -361,7 +366,7 @@ class LocalTaskManagerTest : public ::testing::Test { NodeID id_; std::shared_ptr scheduler_; MockWorkerPool pool_; - absl::flat_hash_map> leased_workers_; + absl::flat_hash_map> leased_workers_; std::unordered_set missing_objects_; int default_arg_size_ = 10; @@ -370,11 +375,12 @@ class LocalTaskManagerTest : public ::testing::Test { absl::flat_hash_map node_info_; MockObjectManager object_manager_; - DependencyManager dependency_manager_; - std::shared_ptr local_task_manager_; + ray::observability::FakeMetric fake_task_by_state_counter_; + LeaseDependencyManager lease_dependency_manager_; + 
std::shared_ptr local_lease_manager_; }; -TEST_F(LocalTaskManagerTest, TestTaskDispatchingOrder) { +TEST_F(LocalLeaseManagerTest, TestLeaseGrantingOrder) { // Initial setup: 3 CPUs available. std::shared_ptr worker1 = std::make_shared(WorkerID::FromRandom(), 0); @@ -386,42 +392,42 @@ TEST_F(LocalTaskManagerTest, TestTaskDispatchingOrder) { pool_.PushWorker(std::static_pointer_cast(worker2)); pool_.PushWorker(std::static_pointer_cast(worker3)); - // First batch of tasks: 2 'f' tasks - auto task_f1 = CreateTask({{ray::kCPU_ResourceLabel, 1}}, "f"); - auto task_f2 = CreateTask({{ray::kCPU_ResourceLabel, 1}}, "f"); + // First batch of leases: [f, f] + auto lease_f1 = CreateLease({{ray::kCPU_ResourceLabel, 1}}, "f"); + auto lease_f2 = CreateLease({{ray::kCPU_ResourceLabel, 1}}, "f"); rpc::RequestWorkerLeaseReply reply; - local_task_manager_->WaitForTaskArgsRequests(std::make_shared( - task_f1, false, false, &reply, [] {}, internal::WorkStatus::WAITING)); - local_task_manager_->ScheduleAndDispatchTasks(); + local_lease_manager_->WaitForLeaseArgsRequests(std::make_shared( + lease_f1, false, false, &reply, [] {}, internal::WorkStatus::WAITING)); + local_lease_manager_->ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); - local_task_manager_->WaitForTaskArgsRequests(std::make_shared( - task_f2, false, false, &reply, [] {}, internal::WorkStatus::WAITING)); - local_task_manager_->ScheduleAndDispatchTasks(); + local_lease_manager_->WaitForLeaseArgsRequests(std::make_shared( + lease_f2, false, false, &reply, [] {}, internal::WorkStatus::WAITING)); + local_lease_manager_->ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); - // Second batch of tasks: [f, f, f, g] - auto task_f3 = CreateTask({{ray::kCPU_ResourceLabel, 1}}, "f"); - auto task_f4 = CreateTask({{ray::kCPU_ResourceLabel, 1}}, "f"); - auto task_f5 = CreateTask({{ray::kCPU_ResourceLabel, 1}}, "f"); - auto task_g1 = CreateTask({{ray::kCPU_ResourceLabel, 1}}, "g"); - 
local_task_manager_->WaitForTaskArgsRequests(std::make_shared( - task_f3, false, false, &reply, [] {}, internal::WorkStatus::WAITING)); - local_task_manager_->WaitForTaskArgsRequests(std::make_shared( - task_f4, false, false, &reply, [] {}, internal::WorkStatus::WAITING)); - local_task_manager_->WaitForTaskArgsRequests(std::make_shared( - task_f5, false, false, &reply, [] {}, internal::WorkStatus::WAITING)); - local_task_manager_->WaitForTaskArgsRequests(std::make_shared( - task_g1, false, false, &reply, [] {}, internal::WorkStatus::WAITING)); - local_task_manager_->ScheduleAndDispatchTasks(); + // Second batch of leases: [f, f, f, g] + auto lease_f3 = CreateLease({{ray::kCPU_ResourceLabel, 1}}, "f"); + auto lease_f4 = CreateLease({{ray::kCPU_ResourceLabel, 1}}, "f"); + auto lease_f5 = CreateLease({{ray::kCPU_ResourceLabel, 1}}, "f"); + auto lease_g1 = CreateLease({{ray::kCPU_ResourceLabel, 1}}, "g"); + local_lease_manager_->WaitForLeaseArgsRequests(std::make_shared( + lease_f3, false, false, &reply, [] {}, internal::WorkStatus::WAITING)); + local_lease_manager_->WaitForLeaseArgsRequests(std::make_shared( + lease_f4, false, false, &reply, [] {}, internal::WorkStatus::WAITING)); + local_lease_manager_->WaitForLeaseArgsRequests(std::make_shared( + lease_f5, false, false, &reply, [] {}, internal::WorkStatus::WAITING)); + local_lease_manager_->WaitForLeaseArgsRequests(std::make_shared( + lease_g1, false, false, &reply, [] {}, internal::WorkStatus::WAITING)); + local_lease_manager_->ScheduleAndGrantLeases(); pool_.TriggerCallbacks(); - auto tasks_to_dispatch_ = local_task_manager_->GetTaskToDispatch(); - // Only task f in queue now as g is dispatched. 
- ASSERT_EQ(tasks_to_dispatch_.size(), 1); + auto leases_to_grant_ = local_lease_manager_->GetLeasesToGrant(); + // Out of the leases in the second batch, only lease g is granted due to fair scheduling + ASSERT_EQ(leases_to_grant_.size(), 1); } -TEST_F(LocalTaskManagerTest, TestNoLeakOnImpossibleInfeasibleTask) { - // Note that ideally it shouldn't be possible for an infeasible task to - // be in the local task manager when ScheduleAndDispatchTasks happens. +TEST_F(LocalLeaseManagerTest, TestNoLeakOnImpossibleInfeasibleLease) { + // Note that ideally it shouldn't be possible for an infeasible lease to + // be in the local lease manager when ScheduleAndGrantLeases happens. // See https://github.com/ray-project/ray/pull/52295 for reasons why added this. std::shared_ptr worker1 = @@ -430,43 +436,43 @@ TEST_F(LocalTaskManagerTest, TestNoLeakOnImpossibleInfeasibleTask) { std::make_shared(WorkerID::FromRandom(), 0); pool_.PushWorker(std::static_pointer_cast(worker1)); - // Create 2 tasks that requires 3 CPU's each and are waiting on an arg. + // Create 2 leases that requires 3 CPU's each and are waiting on an arg. auto arg_id = ObjectID::FromRandom(); std::vector> args; args.push_back( std::make_unique(arg_id, rpc::Address{}, "call_site")); - auto task1 = CreateTask({{kCPU_ResourceLabel, 3}}, "f", args); - auto task2 = CreateTask({{kCPU_ResourceLabel, 3}}, "f2", args); + auto lease1 = CreateLease({{kCPU_ResourceLabel, 3}}, "f", args); + auto lease2 = CreateLease({{kCPU_ResourceLabel, 3}}, "f2", args); EXPECT_CALL(object_manager_, Pull(_, _, _)) .WillOnce(::testing::Return(1)) .WillOnce(::testing::Return(2)); - // Submit the tasks to the local task manager. + // Submit the leases to the local lease manager. 
int num_callbacks_called = 0; auto callback = [&num_callbacks_called]() { ++num_callbacks_called; }; rpc::RequestWorkerLeaseReply reply1; - local_task_manager_->QueueAndScheduleTask(std::make_shared( - task1, false, false, &reply1, callback, internal::WorkStatus::WAITING)); + local_lease_manager_->QueueAndScheduleLease(std::make_shared( + lease1, false, false, &reply1, callback, internal::WorkStatus::WAITING)); rpc::RequestWorkerLeaseReply reply2; - local_task_manager_->QueueAndScheduleTask(std::make_shared( - task2, false, false, &reply2, callback, internal::WorkStatus::WAITING)); + local_lease_manager_->QueueAndScheduleLease(std::make_shared( + lease2, false, false, &reply2, callback, internal::WorkStatus::WAITING)); // Node no longer has cpu. scheduler_->GetLocalResourceManager().DeleteLocalResource( scheduling::ResourceID::CPU()); // Simulate arg becoming local. - local_task_manager_->TasksUnblocked( - {task1.GetTaskSpecification().TaskId(), task2.GetTaskSpecification().TaskId()}); + local_lease_manager_->LeasesUnblocked({lease1.GetLeaseSpecification().LeaseId(), + lease2.GetLeaseSpecification().LeaseId()}); - // Assert that the the correct rpc replies were sent back and the dispatch map is empty. + // Assert that the the correct rpc replies were sent back and the grant map is empty. 
ASSERT_EQ(reply1.failure_type(), rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_UNSCHEDULABLE); ASSERT_EQ(reply2.failure_type(), rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_UNSCHEDULABLE); ASSERT_EQ(num_callbacks_called, 2); - ASSERT_EQ(local_task_manager_->GetTaskToDispatch().size(), 0); + ASSERT_EQ(local_lease_manager_->GetLeasesToGrant().size(), 0); } int main(int argc, char **argv) { diff --git a/src/ray/raylet/test/local_object_manager_test.cc b/src/ray/raylet/tests/local_object_manager_test.cc similarity index 96% rename from src/ray/raylet/test/local_object_manager_test.cc rename to src/ray/raylet/tests/local_object_manager_test.cc index d98fb7dc834d..294b25cd40e4 100644 --- a/src/ray/raylet/test/local_object_manager_test.cc +++ b/src/ray/raylet/tests/local_object_manager_test.cc @@ -25,13 +25,13 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "mock/ray/gcs/gcs_client/gcs_client.h" +#include "mock/ray/gcs_client/gcs_client.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/id.h" -#include "ray/gcs/gcs_client/accessor.h" +#include "ray/gcs_client/accessor.h" #include "ray/object_manager/ownership_object_directory.h" #include "ray/pubsub/subscriber.h" -#include "ray/raylet/test/util.h" +#include "ray/raylet/tests/util.h" #include "ray/raylet/worker_pool.h" #include "ray/rpc/grpc_client.h" #include "ray/rpc/worker/core_worker_client.h" @@ -47,18 +47,17 @@ using ::testing::_; class MockSubscriber : public pubsub::SubscriberInterface { public: - bool Subscribe( + void Subscribe( const std::unique_ptr sub_message, - const rpc::ChannelType channel_type, + rpc::ChannelType channel_type, const rpc::Address &owner_address, - const std::string &key_id_binary, + const std::optional &key_id_binary, pubsub::SubscribeDoneCallback subscribe_done_callback, pubsub::SubscriptionItemCallback subscription_callback, pubsub::SubscriptionFailureCallback subscription_failure_callback) override { auto worker_id = 
WorkerID::FromBinary(owner_address.worker_id()); callbacks[worker_id].push_back( - std::make_pair(ObjectID::FromBinary(key_id_binary), subscription_callback)); - return true; + std::make_pair(ObjectID::FromBinary(*key_id_binary), subscription_callback)); } bool PublishObjectEviction(WorkerID worker_id = WorkerID::Nil()) { @@ -87,25 +86,13 @@ class MockSubscriber : public pubsub::SubscriberInterface { return true; } - MOCK_METHOD6(SubscribeChannel, - bool(std::unique_ptr sub_message, - const rpc::ChannelType channel_type, - const rpc::Address &owner_address, - pubsub::SubscribeDoneCallback subscribe_done_callback, - pubsub::SubscriptionItemCallback subscription_callback, - pubsub::SubscriptionFailureCallback subscription_failure_callback)); - MOCK_METHOD3(Unsubscribe, - bool(const rpc::ChannelType channel_type, + bool(rpc::ChannelType channel_type, const rpc::Address &publisher_address, - const std::string &key_id_binary)); - - MOCK_METHOD2(UnsubscribeChannel, - bool(const rpc::ChannelType channel_type, - const rpc::Address &publisher_address)); + const std::optional &key_id_binary)); MOCK_CONST_METHOD3(IsSubscribed, - bool(const rpc::ChannelType channel_type, + bool(rpc::ChannelType channel_type, const rpc::Address &publisher_address, const std::string &key_id_binary)); @@ -120,7 +107,7 @@ class MockSubscriber : public pubsub::SubscriberInterface { class MockWorkerClient : public rpc::CoreWorkerClientInterface { public: void UpdateObjectLocationBatch( - const rpc::UpdateObjectLocationBatchRequest &request, + rpc::UpdateObjectLocationBatchRequest &&request, const rpc::ClientCallback &callback) override { for (const auto &object_location_update : request.object_location_updates()) { ASSERT_TRUE(object_location_update.has_spilled_location_update()); @@ -242,11 +229,12 @@ class MockIOWorker : public MockWorker { MockIOWorker(WorkerID worker_id, int port, std::shared_ptr io_worker) - : MockWorker(worker_id, port), io_worker(io_worker) {} + : MockWorker(worker_id, port), 
io_worker_(io_worker) {} - rpc::CoreWorkerClientInterface *rpc_client() { return io_worker.get(); } + rpc::CoreWorkerClientInterface *rpc_client() { return io_worker_.get(); } - std::shared_ptr io_worker; + private: + std::shared_ptr io_worker_; }; class MockIOWorkerPool : public IOWorkerPoolInterface { @@ -459,7 +447,9 @@ TEST_F(LocalObjectManagerTest, TestPin) { for (size_t i = 0; i < free_objects_batch_size; i++) { ASSERT_TRUE(freed.empty()); - EXPECT_CALL(*subscriber_, Unsubscribe(_, _, object_ids[i].Binary())); + EXPECT_CALL( + *subscriber_, + Unsubscribe(_, _, std::make_optional(object_ids[i].Binary()))); ASSERT_TRUE(subscriber_->PublishObjectEviction()); } std::unordered_set expected(object_ids.begin(), object_ids.end()); @@ -945,7 +935,9 @@ TEST_F(LocalObjectManagerTest, TestDeleteNoSpilledObjects) { for (size_t i = 0; i < free_objects_batch_size; i++) { ASSERT_TRUE(freed.empty()); - EXPECT_CALL(*subscriber_, Unsubscribe(_, _, object_ids[i].Binary())); + EXPECT_CALL( + *subscriber_, + Unsubscribe(_, _, std::make_optional(object_ids[i].Binary()))); ASSERT_TRUE(subscriber_->PublishObjectEviction()); } @@ -995,7 +987,9 @@ TEST_F(LocalObjectManagerTest, TestDeleteSpilledObjects) { // All objects are out of scope now. for (size_t i = 0; i < free_objects_batch_size; i++) { - EXPECT_CALL(*subscriber_, Unsubscribe(_, _, object_ids[i].Binary())); + EXPECT_CALL( + *subscriber_, + Unsubscribe(_, _, std::make_optional(object_ids[i].Binary()))); ASSERT_TRUE(subscriber_->PublishObjectEviction()); } @@ -1054,7 +1048,9 @@ TEST_F(LocalObjectManagerTest, TestDeleteURLRefCount) { // Everything is evicted except the last object. In this case, ref count is still > 0. 
for (size_t i = 0; i < free_objects_batch_size - 1; i++) { - EXPECT_CALL(*subscriber_, Unsubscribe(_, _, object_ids[i].Binary())); + EXPECT_CALL( + *subscriber_, + Unsubscribe(_, _, std::make_optional(object_ids[i].Binary()))); ASSERT_TRUE(subscriber_->PublishObjectEviction()); } manager.ProcessSpilledObjectsDeleteQueue(/* max_batch_size */ 30); @@ -1068,7 +1064,10 @@ TEST_F(LocalObjectManagerTest, TestDeleteURLRefCount) { // The last reference is deleted. EXPECT_CALL(*subscriber_, - Unsubscribe(_, _, object_ids[free_objects_batch_size - 1].Binary())); + Unsubscribe(_, + _, + std::make_optional( + object_ids[free_objects_batch_size - 1].Binary()))); ASSERT_TRUE(subscriber_->PublishObjectEviction()); manager.ProcessSpilledObjectsDeleteQueue(/* max_batch_size */ 30); deleted_urls_size = worker_pool.io_worker_client->ReplyDeleteSpilledObjects(); @@ -1136,7 +1135,9 @@ TEST_F(LocalObjectManagerTest, TestDeleteSpillingObjectsBlocking) { // Every object has gone out of scope. for (size_t i = 0; i < spilled_urls_size; i++) { - EXPECT_CALL(*subscriber_, Unsubscribe(_, _, object_ids[i].Binary())); + EXPECT_CALL( + *subscriber_, + Unsubscribe(_, _, std::make_optional(object_ids[i].Binary()))); ASSERT_TRUE(subscriber_->PublishObjectEviction()); } // Now, deletion queue would process only the first spill set. Everything else won't be @@ -1204,7 +1205,9 @@ TEST_F(LocalObjectManagerTest, TestDeleteMaxObjects) { // Every reference has gone out of scope. 
for (size_t i = 0; i < free_objects_batch_size; i++) { - EXPECT_CALL(*subscriber_, Unsubscribe(_, _, object_ids[i].Binary())); + EXPECT_CALL( + *subscriber_, + Unsubscribe(_, _, std::make_optional(object_ids[i].Binary()))); ASSERT_TRUE(subscriber_->PublishObjectEviction()); } @@ -1256,7 +1259,8 @@ TEST_F(LocalObjectManagerTest, TestDeleteURLRefCountRaceCondition) { ASSERT_EQ(GetCurrentSpilledCount(), object_ids_to_spill.size()); ASSERT_EQ(GetCurrentSpilledBytes(), object_size * object_ids_to_spill.size()); - EXPECT_CALL(*subscriber_, Unsubscribe(_, _, object_ids[0].Binary())); + EXPECT_CALL(*subscriber_, + Unsubscribe(_, _, std::make_optional(object_ids[0].Binary()))); ASSERT_TRUE(subscriber_->PublishObjectEviction()); // Delete operation is called. In this case, the file with the url should not be // deleted. @@ -1270,7 +1274,9 @@ TEST_F(LocalObjectManagerTest, TestDeleteURLRefCountRaceCondition) { // Everything else is now deleted. for (size_t i = 1; i < free_objects_batch_size; i++) { - EXPECT_CALL(*subscriber_, Unsubscribe(_, _, object_ids[i].Binary())); + EXPECT_CALL( + *subscriber_, + Unsubscribe(_, _, std::make_optional(object_ids[i].Binary()))); ASSERT_TRUE(subscriber_->PublishObjectEviction()); } manager.ProcessSpilledObjectsDeleteQueue(/* max_batch_size */ 30); @@ -1337,7 +1343,9 @@ TEST_F(LocalObjectManagerTest, TestDuplicatePin) { auto owner_id1 = WorkerID::FromBinary(owner_address.worker_id()); for (size_t i = 0; i < free_objects_batch_size; i++) { ASSERT_TRUE(freed.empty()); - EXPECT_CALL(*subscriber_, Unsubscribe(_, _, object_ids[i].Binary())); + EXPECT_CALL( + *subscriber_, + Unsubscribe(_, _, std::make_optional(object_ids[i].Binary()))); ASSERT_TRUE(subscriber_->PublishObjectEviction(owner_id1)); } std::unordered_set expected(object_ids.begin(), object_ids.end()); @@ -1378,7 +1386,9 @@ TEST_F(LocalObjectManagerTest, TestDuplicatePinAndSpill) { auto owner_id1 = WorkerID::FromBinary(owner_address.worker_id()); for (size_t i = 0; i < 
free_objects_batch_size; i++) { ASSERT_TRUE(freed.empty()); - EXPECT_CALL(*subscriber_, Unsubscribe(_, _, object_ids[i].Binary())); + EXPECT_CALL( + *subscriber_, + Unsubscribe(_, _, std::make_optional(object_ids[i].Binary()))); ASSERT_TRUE(subscriber_->PublishObjectEviction(owner_id1)); } std::unordered_set expected(object_ids.begin(), object_ids.end()); @@ -1604,7 +1614,9 @@ TEST_F(LocalObjectManagerTest, TestPinBytes) { // Delete all (spilled) objects. for (size_t i = 0; i < free_objects_batch_size; i++) { - EXPECT_CALL(*subscriber_, Unsubscribe(_, _, object_ids[i].Binary())); + EXPECT_CALL( + *subscriber_, + Unsubscribe(_, _, std::make_optional(object_ids[i].Binary()))); ASSERT_TRUE(subscriber_->PublishObjectEviction()); } manager.ProcessSpilledObjectsDeleteQueue(/* max_batch_size */ 30); @@ -1666,7 +1678,9 @@ TEST_F(LocalObjectManagerTest, TestConcurrentSpillAndDelete1) { // Delete all objects while they're being spilled. for (size_t i = 0; i < free_objects_batch_size; i++) { - EXPECT_CALL(*subscriber_, Unsubscribe(_, _, object_ids[i].Binary())); + EXPECT_CALL( + *subscriber_, + Unsubscribe(_, _, std::make_optional(object_ids[i].Binary()))); ASSERT_TRUE(subscriber_->PublishObjectEviction()); } @@ -1737,7 +1751,9 @@ TEST_F(LocalObjectManagerTest, TestConcurrentSpillAndDelete2) { // Delete all objects while allocating an IO worker. 
for (size_t i = 0; i < free_objects_batch_size; i++) { - EXPECT_CALL(*subscriber_, Unsubscribe(_, _, object_ids[i].Binary())); + EXPECT_CALL( + *subscriber_, + Unsubscribe(_, _, std::make_optional(object_ids[i].Binary()))); ASSERT_TRUE(subscriber_->PublishObjectEviction()); } diff --git a/src/ray/raylet/test/node_manager_test.cc b/src/ray/raylet/tests/node_manager_test.cc similarity index 55% rename from src/ray/raylet/test/node_manager_test.cc rename to src/ray/raylet/tests/node_manager_test.cc index 9a42fcac04d7..bd3780619e26 100644 --- a/src/ray/raylet/test/node_manager_test.cc +++ b/src/ray/raylet/tests/node_manager_test.cc @@ -21,22 +21,23 @@ #include #include +#include "fakes/ray/object_manager/plasma/fake_plasma_client.h" +#include "fakes/ray/pubsub/subscriber.h" #include "fakes/ray/rpc/raylet/raylet_client.h" #include "gmock/gmock.h" #include "mock/ray/core_worker/experimental_mutable_object_provider.h" -#include "mock/ray/gcs/gcs_client/gcs_client.h" +#include "mock/ray/gcs_client/gcs_client.h" #include "mock/ray/object_manager/object_directory.h" #include "mock/ray/object_manager/object_manager.h" -#include "mock/ray/object_manager/plasma/client.h" -#include "mock/ray/pubsub/subscriber.h" -#include "mock/ray/raylet/local_task_manager.h" +#include "mock/ray/raylet/local_lease_manager.h" #include "mock/ray/raylet/worker_pool.h" #include "mock/ray/rpc/worker/core_worker_client.h" #include "ray/common/buffer.h" -#include "ray/object_manager/plasma/client.h" +#include "ray/common/scheduling/cluster_resource_data.h" +#include "ray/observability/fake_metric.h" #include "ray/raylet/local_object_manager_interface.h" -#include "ray/raylet/scheduling/cluster_task_manager.h" -#include "ray/raylet/test/util.h" +#include "ray/raylet/scheduling/cluster_lease_manager.h" +#include "ray/raylet/tests/util.h" namespace ray::raylet { using ::testing::_; @@ -94,95 +95,7 @@ class FakeLocalObjectManager : public LocalObjectManagerInterface { std::shared_ptr> 
objects_pending_deletion_; }; -class FakePlasmaClient : public plasma::PlasmaClientInterface { - public: - Status Connect(const std::string &store_socket_name, - const std::string &manager_socket_name = "", - int num_retries = -1) override { - return Status::OK(); - }; - - Status CreateAndSpillIfNeeded(const ObjectID &object_id, - const ray::rpc::Address &owner_address, - bool is_mutable, - int64_t data_size, - const uint8_t *metadata, - int64_t metadata_size, - std::shared_ptr *data, - plasma::flatbuf::ObjectSource source, - int device_num = 0) override { - return TryCreateImmediately( - object_id, owner_address, data_size, metadata, metadata_size, data, source); - } - - Status TryCreateImmediately(const ObjectID &object_id, - const ray::rpc::Address &owner_address, - int64_t data_size, - const uint8_t *metadata, - int64_t metadata_size, - std::shared_ptr *data, - plasma::flatbuf::ObjectSource source, - int device_num = 0) override { - objects_ids_in_plasma_.emplace(object_id); - objects_in_plasma_.emplace( - object_id, std::make_pair(std::vector{}, std::vector{})); - return Status::OK(); - } - - Status Get(const std::vector &object_ids, - int64_t timeout_ms, - std::vector *object_buffers) override { - for (const auto &id : object_ids) { - auto &buffers = objects_in_plasma_[id]; - plasma::ObjectBuffer shm_buffer{std::make_shared( - buffers.first.data(), buffers.first.size()), - std::make_shared( - buffers.second.data(), buffers.second.size())}; - object_buffers->emplace_back(shm_buffer); - } - return Status::OK(); - } - - Status GetExperimentalMutableObject( - const ObjectID &object_id, - std::unique_ptr *mutable_object) override { - return Status::OK(); - } - - Status Release(const ObjectID &object_id) override { - objects_ids_in_plasma_.erase(object_id); - return Status::OK(); - } - - Status Contains(const ObjectID &object_id, bool *has_object) override { - *has_object = objects_ids_in_plasma_.find(object_id) != objects_ids_in_plasma_.end(); - return 
Status::OK(); - } - - Status Abort(const ObjectID &object_id) override { return Status::OK(); } - - Status Seal(const ObjectID &object_id) override { return Status::OK(); } - - Status Delete(const std::vector &object_ids) override { - for (const auto &id : object_ids) { - objects_ids_in_plasma_.erase(id); - } - return Status::OK(); - } - - Status Disconnect() override { return Status::OK(); }; - - std::string DebugString() { return ""; } - - int64_t store_capacity() { return 1; } - - private: - absl::flat_hash_set objects_ids_in_plasma_; - absl::flat_hash_map, std::vector>> - objects_in_plasma_; -}; - -TaskSpecification BuildTaskSpec( +LeaseSpecification BuildLeaseSpec( const std::unordered_map &resources) { TaskSpecBuilder builder; rpc::Address empty_address; @@ -209,11 +122,11 @@ TaskSpecification BuildTaskSpec( 0, TaskID::Nil(), ""); - return std::move(builder).ConsumeAndBuild(); + return LeaseSpecification(std::move(builder).ConsumeAndBuild().GetMessage()); } -TaskSpecBuilder DetachedActorCreationTaskBuilder(const rpc::Address &owner_address, - const ActorID &actor_id) { +LeaseSpecification DetachedActorCreationLeaseSpec(const rpc::Address &owner_address, + const ActorID &actor_id) { rpc::JobConfig config; const FunctionDescriptor function_descriptor = FunctionDescriptorBuilder::BuildPython("x", "", "", ""); @@ -253,7 +166,7 @@ TaskSpecBuilder DetachedActorCreationTaskBuilder(const rpc::Address &owner_addre /*extension_data=*/"", /*allow_out_of_order_execution=*/false, /*root_detached_actor_id=*/actor_id); - return task_spec_builder; + return LeaseSpecification(std::move(task_spec_builder).ConsumeAndBuild().GetMessage()); } } // namespace @@ -262,7 +175,7 @@ TEST(NodeManagerStaticTest, TestHandleReportWorkerBacklog) { { // Worker backlog report from a disconnected worker should be ignored. 
MockWorkerPool worker_pool; - MockLocalTaskManager local_task_manager; + MockLocalLeaseManager local_lease_manager; WorkerID worker_id = WorkerID::FromRandom(); EXPECT_CALL(worker_pool, GetRegisteredWorker(worker_id)) @@ -271,8 +184,8 @@ TEST(NodeManagerStaticTest, TestHandleReportWorkerBacklog) { EXPECT_CALL(worker_pool, GetRegisteredDriver(worker_id)) .Times(1) .WillOnce(Return(nullptr)); - EXPECT_CALL(local_task_manager, ClearWorkerBacklog(_)).Times(0); - EXPECT_CALL(local_task_manager, SetWorkerBacklog(_, _, _)).Times(0); + EXPECT_CALL(local_lease_manager, ClearWorkerBacklog(_)).Times(0); + EXPECT_CALL(local_lease_manager, SetWorkerBacklog(_, _, _)).Times(0); rpc::ReportWorkerBacklogRequest request; request.set_worker_id(worker_id.Binary()); @@ -283,13 +196,13 @@ TEST(NodeManagerStaticTest, TestHandleReportWorkerBacklog) { [](Status status, std::function success, std::function failure) { }, worker_pool, - local_task_manager); + local_lease_manager); } { // Worker backlog report from a connected driver should be recorded. 
MockWorkerPool worker_pool; - MockLocalTaskManager local_task_manager; + MockLocalLeaseManager local_lease_manager; WorkerID worker_id = WorkerID::FromRandom(); std::shared_ptr driver = std::make_shared(worker_id, 10); @@ -297,13 +210,13 @@ TEST(NodeManagerStaticTest, TestHandleReportWorkerBacklog) { rpc::ReportWorkerBacklogRequest request; request.set_worker_id(worker_id.Binary()); auto backlog_report_1 = request.add_backlog_reports(); - auto task_spec_1 = BuildTaskSpec({{"CPU", 1}}); - backlog_report_1->mutable_resource_spec()->CopyFrom(task_spec_1.GetMessage()); + auto lease_spec_1 = BuildLeaseSpec({{"CPU", 1}}); + backlog_report_1->mutable_lease_spec()->CopyFrom(lease_spec_1.GetMessage()); backlog_report_1->set_backlog_size(1); auto backlog_report_2 = request.add_backlog_reports(); - auto task_spec_2 = BuildTaskSpec({{"GPU", 2}}); - backlog_report_2->mutable_resource_spec()->CopyFrom(task_spec_2.GetMessage()); + auto lease_spec_2 = BuildLeaseSpec({{"GPU", 2}}); + backlog_report_2->mutable_lease_spec()->CopyFrom(lease_spec_2.GetMessage()); backlog_report_2->set_backlog_size(3); rpc::ReportWorkerBacklogReply reply; @@ -313,12 +226,12 @@ TEST(NodeManagerStaticTest, TestHandleReportWorkerBacklog) { EXPECT_CALL(worker_pool, GetRegisteredDriver(worker_id)) .Times(1) .WillOnce(Return(driver)); - EXPECT_CALL(local_task_manager, ClearWorkerBacklog(worker_id)).Times(1); - EXPECT_CALL(local_task_manager, - SetWorkerBacklog(task_spec_1.GetSchedulingClass(), worker_id, 1)) + EXPECT_CALL(local_lease_manager, ClearWorkerBacklog(worker_id)).Times(1); + EXPECT_CALL(local_lease_manager, + SetWorkerBacklog(lease_spec_1.GetSchedulingClass(), worker_id, 1)) .Times(1); - EXPECT_CALL(local_task_manager, - SetWorkerBacklog(task_spec_2.GetSchedulingClass(), worker_id, 3)) + EXPECT_CALL(local_lease_manager, + SetWorkerBacklog(lease_spec_2.GetSchedulingClass(), worker_id, 3)) .Times(1); NodeManager::HandleReportWorkerBacklog( @@ -327,13 +240,13 @@ TEST(NodeManagerStaticTest, 
TestHandleReportWorkerBacklog) { [](Status status, std::function success, std::function failure) { }, worker_pool, - local_task_manager); + local_lease_manager); } { // Worker backlog report from a connected worker should be recorded. MockWorkerPool worker_pool; - MockLocalTaskManager local_task_manager; + MockLocalLeaseManager local_lease_manager; WorkerID worker_id = WorkerID::FromRandom(); std::shared_ptr worker = std::make_shared(worker_id, 10); @@ -341,13 +254,13 @@ TEST(NodeManagerStaticTest, TestHandleReportWorkerBacklog) { rpc::ReportWorkerBacklogRequest request; request.set_worker_id(worker_id.Binary()); auto backlog_report_1 = request.add_backlog_reports(); - auto task_spec_1 = BuildTaskSpec({{"CPU", 1}}); - backlog_report_1->mutable_resource_spec()->CopyFrom(task_spec_1.GetMessage()); + auto lease_spec_1 = BuildLeaseSpec({{"CPU", 1}}); + backlog_report_1->mutable_lease_spec()->CopyFrom(lease_spec_1.GetMessage()); backlog_report_1->set_backlog_size(1); auto backlog_report_2 = request.add_backlog_reports(); - auto task_spec_2 = BuildTaskSpec({{"GPU", 2}}); - backlog_report_2->mutable_resource_spec()->CopyFrom(task_spec_2.GetMessage()); + auto lease_spec_2 = BuildLeaseSpec({{"GPU", 2}}); + backlog_report_2->mutable_lease_spec()->CopyFrom(lease_spec_2.GetMessage()); backlog_report_2->set_backlog_size(3); rpc::ReportWorkerBacklogReply reply; @@ -356,12 +269,12 @@ TEST(NodeManagerStaticTest, TestHandleReportWorkerBacklog) { .WillOnce(Return(worker)); EXPECT_CALL(worker_pool, GetRegisteredDriver(worker_id)).Times(0); - EXPECT_CALL(local_task_manager, ClearWorkerBacklog(worker_id)).Times(1); - EXPECT_CALL(local_task_manager, - SetWorkerBacklog(task_spec_1.GetSchedulingClass(), worker_id, 1)) + EXPECT_CALL(local_lease_manager, ClearWorkerBacklog(worker_id)).Times(1); + EXPECT_CALL(local_lease_manager, + SetWorkerBacklog(lease_spec_1.GetSchedulingClass(), worker_id, 1)) .Times(1); - EXPECT_CALL(local_task_manager, - 
SetWorkerBacklog(task_spec_2.GetSchedulingClass(), worker_id, 3)) + EXPECT_CALL(local_lease_manager, + SetWorkerBacklog(lease_spec_2.GetSchedulingClass(), worker_id, 3)) .Times(1); NodeManager::HandleReportWorkerBacklog( @@ -370,7 +283,7 @@ TEST(NodeManagerStaticTest, TestHandleReportWorkerBacklog) { [](Status status, std::function success, std::function failure) { }, worker_pool, - local_task_manager); + local_lease_manager); } } @@ -391,9 +304,10 @@ class NodeManagerTest : public ::testing::Test { node_manager_config.maximum_startup_concurrency = 1; node_manager_config.store_socket_name = "test_store_socket"; - core_worker_subscriber_ = std::make_unique(); + core_worker_subscriber_ = std::make_unique(); mock_object_directory_ = std::make_unique(); mock_object_manager_ = std::make_unique(); + fake_task_by_state_counter_ = ray::observability::FakeMetric(); EXPECT_CALL(*mock_object_manager_, GetMemoryCapacity()).WillRepeatedly(Return(0)); @@ -409,7 +323,6 @@ class NodeManagerTest : public ::testing::Test { EXPECT_CALL(*mock_gcs_client_, DebugString()).WillRepeatedly(Return("")); EXPECT_CALL(*mock_object_manager_, DebugString()).WillRepeatedly(Return("")); EXPECT_CALL(*mock_object_directory_, DebugString()).WillRepeatedly(Return("")); - EXPECT_CALL(*core_worker_subscriber_, DebugString()).WillRepeatedly(Return("")); raylet_node_id_ = NodeID::FromRandom(); @@ -418,7 +331,8 @@ class NodeManagerTest : public ::testing::Test { local_object_manager_ = std::make_unique(objects_pending_deletion_); - dependency_manager_ = std::make_unique(*mock_object_manager_); + lease_dependency_manager_ = std::make_unique( + *mock_object_manager_, fake_task_by_state_counter_); cluster_resource_scheduler_ = std::make_unique( io_service_, @@ -458,10 +372,10 @@ class NodeManagerTest : public ::testing::Test { static_cast(mock_object_manager_->GetMemoryCapacity()) * RayConfig::instance().max_task_args_memory_fraction()); - local_task_manager_ = std::make_unique( + local_lease_manager_ = 
std::make_unique( raylet_node_id_, *cluster_resource_scheduler_, - *dependency_manager_, + *lease_dependency_manager_, get_node_info_func, mock_worker_pool_, leased_workers_, @@ -471,12 +385,12 @@ class NodeManagerTest : public ::testing::Test { }, max_task_args_memory); - cluster_task_manager_ = std::make_unique( + cluster_lease_manager_ = std::make_unique( raylet_node_id_, *cluster_resource_scheduler_, get_node_info_func, - [](const ray::RayTask &task) {}, - *local_task_manager_); + [](const ray::RayLease &lease) {}, + *local_lease_manager_); node_manager_ = std::make_unique(io_service_, raylet_node_id_, @@ -488,12 +402,12 @@ class NodeManagerTest : public ::testing::Test { raylet_client_pool_, *core_worker_subscriber_, *cluster_resource_scheduler_, - *local_task_manager_, - *cluster_task_manager_, + *local_lease_manager_, + *cluster_lease_manager_, *mock_object_directory_, *mock_object_manager_, *local_object_manager_, - *dependency_manager_, + *lease_dependency_manager_, mock_worker_pool_, leased_workers_, *mock_store_client_, @@ -508,24 +422,25 @@ class NodeManagerTest : public ::testing::Test { rpc::RayletClientPool raylet_client_pool_; NodeID raylet_node_id_; - std::unique_ptr core_worker_subscriber_; + std::unique_ptr core_worker_subscriber_; std::unique_ptr cluster_resource_scheduler_; - std::unique_ptr local_task_manager_; - std::unique_ptr cluster_task_manager_; + std::unique_ptr local_lease_manager_; + std::unique_ptr cluster_lease_manager_; std::shared_ptr local_object_manager_; - std::unique_ptr dependency_manager_; + std::unique_ptr lease_dependency_manager_; std::unique_ptr mock_gcs_client_ = std::make_unique(); std::unique_ptr mock_object_directory_; std::unique_ptr mock_object_manager_; core::experimental::MockMutableObjectProvider *mock_mutable_object_provider_; std::shared_ptr mock_store_client_ = - std::make_shared(); + std::make_shared(); std::unique_ptr node_manager_; MockWorkerPool mock_worker_pool_; - absl::flat_hash_map> leased_workers_; + 
absl::flat_hash_map> leased_workers_; std::shared_ptr> objects_pending_deletion_; + ray::observability::FakeMetric fake_task_by_state_counter_; }; TEST_F(NodeManagerTest, TestRegisterGcsAndCheckSelfAlive) { @@ -574,7 +489,7 @@ TEST_F(NodeManagerTest, TestDetachedWorkerIsKilledByFailedWorker) { PopWorkerCallback pop_worker_callback; EXPECT_CALL(mock_worker_pool_, PopWorker(_, _)) .WillOnce( - [&](const TaskSpecification &task_spec, const PopWorkerCallback &callback) { + [&](const LeaseSpecification &lease_spec, const PopWorkerCallback &callback) { pop_worker_callback = callback; }); @@ -600,15 +515,14 @@ TEST_F(NodeManagerTest, TestDetachedWorkerIsKilledByFailedWorker) { owner_address.set_worker_id(owner_worker_id.Binary()); const auto actor_id = ActorID::Of(JobID::FromInt(1), TaskID::FromRandom(JobID::FromInt(1)), 0); - const auto task_spec_builder = - DetachedActorCreationTaskBuilder(owner_address, actor_id); + const auto lease_spec = DetachedActorCreationLeaseSpec(owner_address, actor_id); // Invoke RequestWorkerLease to request a leased worker for the task in the // NodeManager. std::promise promise; rpc::RequestWorkerLeaseReply reply; rpc::RequestWorkerLeaseRequest request; - request.mutable_resource_spec()->CopyFrom(task_spec_builder.GetMessage()); + request.mutable_lease_spec()->CopyFrom(lease_spec.GetMessage()); node_manager_->HandleRequestWorkerLease( request, &reply, @@ -653,7 +567,7 @@ TEST_F(NodeManagerTest, TestDetachedWorkerIsKilledByFailedNode) { PopWorkerCallback pop_worker_callback; EXPECT_CALL(mock_worker_pool_, PopWorker(_, _)) .WillOnce( - [&](const TaskSpecification &task_spec, const PopWorkerCallback &callback) { + [&](const LeaseSpecification &lease_spec, const PopWorkerCallback &callback) { pop_worker_callback = callback; }); @@ -675,18 +589,17 @@ TEST_F(NodeManagerTest, TestDetachedWorkerIsKilledByFailedNode) { // Preparing a detached actor creation task spec for the later RequestWorkerLease rpc. 
const auto owner_node_id = NodeID::FromRandom(); rpc::Address owner_address; - owner_address.set_raylet_id(owner_node_id.Binary()); + owner_address.set_node_id(owner_node_id.Binary()); const auto actor_id = ActorID::Of(JobID::FromInt(1), TaskID::FromRandom(JobID::FromInt(1)), 0); - const auto task_spec_builder = - DetachedActorCreationTaskBuilder(owner_address, actor_id); + const auto lease_spec = DetachedActorCreationLeaseSpec(owner_address, actor_id); // Invoke RequestWorkerLease to request a leased worker for the task in the // NodeManager. std::promise promise; rpc::RequestWorkerLeaseReply reply; rpc::RequestWorkerLeaseRequest request; - request.mutable_resource_spec()->CopyFrom(task_spec_builder.GetMessage()); + request.mutable_lease_spec()->CopyFrom(lease_spec.GetMessage()); node_manager_->HandleRequestWorkerLease( request, &reply, @@ -747,6 +660,413 @@ TEST_F(NodeManagerTest, TestPinningAnObjectPendingDeletionFails) { EXPECT_FALSE(failed_pin_reply.successes(0)); } +TEST_F(NodeManagerTest, TestConsumeSyncMessage) { + // Create and wrap a mock resource view sync message. + syncer::ResourceViewSyncMessage payload; + payload.mutable_resources_total()->insert({"CPU", 10.0}); + payload.mutable_resources_available()->insert({"CPU", 10.0}); + payload.mutable_labels()->insert({"label1", "value1"}); + + std::string serialized; + ASSERT_TRUE(payload.SerializeToString(&serialized)); + + auto node_id = NodeID::FromRandom(); + syncer::RaySyncMessage msg; + msg.set_node_id(node_id.Binary()); + msg.set_message_type(syncer::MessageType::RESOURCE_VIEW); + msg.set_sync_message(serialized); + + node_manager_->ConsumeSyncMessage(std::make_shared(msg)); + + // Verify node resources and labels were updated. 
+ const auto &node_resources = + cluster_resource_scheduler_->GetClusterResourceManager().GetNodeResources( + scheduling::NodeID(node_id.Binary())); + EXPECT_EQ(node_resources.labels.at("label1"), "value1"); + EXPECT_EQ(node_resources.total.Get(scheduling::ResourceID("CPU")).Double(), 10.0); + EXPECT_EQ(node_resources.available.Get(scheduling::ResourceID("CPU")).Double(), 10.0); +} + +TEST_F(NodeManagerTest, TestResizeLocalResourceInstancesSuccessful) { + // Test 1: Up scaling (increasing resource capacity) + rpc::ResizeLocalResourceInstancesRequest request; + rpc::ResizeLocalResourceInstancesReply reply; + + (*request.mutable_resources())["CPU"] = 8.0; + (*request.mutable_resources())["memory"] = 16000000.0; + + bool callback_called = false; + + node_manager_->HandleResizeLocalResourceInstances( + request, + &reply, + [&callback_called]( + Status s, std::function success, std::function failure) { + callback_called = true; + EXPECT_TRUE(s.ok()); + }); + EXPECT_TRUE(callback_called); + + // Check that reply contains the updated resources + EXPECT_EQ(reply.total_resources().at("CPU"), 8.0); + EXPECT_EQ(reply.total_resources().at("memory"), 16000000.0); + + // Test 2: Down scaling (decreasing resources) + (*request.mutable_resources())["CPU"] = 4.0; + (*request.mutable_resources())["memory"] = 8000000.0; + + reply.Clear(); + callback_called = false; + node_manager_->HandleResizeLocalResourceInstances( + request, + &reply, + [&callback_called]( + Status s, std::function success, std::function failure) { + callback_called = true; + EXPECT_TRUE(s.ok()); + }); + EXPECT_TRUE(callback_called); + + // Check that reply contains the updated (reduced) resources + EXPECT_EQ(reply.total_resources().at("CPU"), 4.0); + EXPECT_EQ(reply.total_resources().at("memory"), 8000000.0); + + // Test 3: No changes (same values) + reply.Clear(); + callback_called = false; + node_manager_->HandleResizeLocalResourceInstances( + request, + &reply, + [&callback_called]( + Status s, std::function 
success, std::function failure) { + callback_called = true; + EXPECT_TRUE(s.ok()); + }); + EXPECT_TRUE(callback_called); + + // Should still succeed and return current state + EXPECT_EQ(reply.total_resources().at("CPU"), 4.0); + EXPECT_EQ(reply.total_resources().at("memory"), 8000000.0); + + // Test 4: Now update only CPU, leaving memory unchanged + request.mutable_resources()->clear(); + (*request.mutable_resources())["CPU"] = 8.0; // Double the CPU + + reply.Clear(); + callback_called = false; + node_manager_->HandleResizeLocalResourceInstances( + request, + &reply, + [&callback_called]( + Status s, std::function success, std::function failure) { + callback_called = true; + EXPECT_TRUE(s.ok()); + }); + EXPECT_TRUE(callback_called); + + // Check that CPU was updated, and memory was unchanged + EXPECT_EQ(reply.total_resources().at("CPU"), 8.0); + EXPECT_EQ(reply.total_resources().at("memory"), 8000000.0); +} + +TEST_F(NodeManagerTest, TestResizeLocalResourceInstancesInvalidArgument) { + // Test trying to resize unit instance resources (GPU, etc.) 
+ rpc::ResizeLocalResourceInstancesRequest request; + rpc::ResizeLocalResourceInstancesReply reply; + + (*request.mutable_resources())["GPU"] = 4.0; // GPU is a unit instance resource + + bool callback_called = false; + + node_manager_->HandleResizeLocalResourceInstances( + request, + &reply, + [&callback_called]( + Status s, std::function success, std::function failure) { + callback_called = true; + EXPECT_FALSE(s.ok()); + EXPECT_TRUE(s.IsInvalidArgument()); + // Check the error message contains expected details + std::string error_msg = s.message(); + EXPECT_TRUE(error_msg.find("Cannot resize unit instance resource 'GPU'") != + std::string::npos); + EXPECT_TRUE(error_msg.find("Unit instance resources") != std::string::npos); + EXPECT_TRUE(error_msg.find("cannot be resized dynamically") != std::string::npos); + }); + + // The callback should have been called with an InvalidArgument status + EXPECT_TRUE(callback_called); +} + +TEST_F(NodeManagerTest, TestResizeLocalResourceInstancesClamps) { + // Test 1: Best effort downsizing + rpc::ResizeLocalResourceInstancesRequest request; + rpc::ResizeLocalResourceInstancesReply reply; + + // Initialize resources to a known state + (*request.mutable_resources())["CPU"] = 8.0; + (*request.mutable_resources())["memory"] = 16000000.0; + + bool callback_called = false; + node_manager_->HandleResizeLocalResourceInstances( + request, + &reply, + [&callback_called]( + Status s, std::function success, std::function failure) { + callback_called = true; + EXPECT_TRUE(s.ok()); + }); + EXPECT_TRUE(callback_called); + + // Simulate resource usage by allocating task resources through the local resource + // manager: Use 6 out of 8 CPUs and 2 are free. 
+ const absl::flat_hash_map task_resources = {{"CPU", 6.0}}; + std::shared_ptr task_allocation = + std::make_shared(); + bool allocation_success = + cluster_resource_scheduler_->GetLocalResourceManager().AllocateLocalTaskResources( + task_resources, task_allocation); + EXPECT_TRUE(allocation_success); + + // Now request to downsize CPU to 4. Should clamp to 6. + callback_called = false; + (*request.mutable_resources())["CPU"] = 4.0; + reply.Clear(); + node_manager_->HandleResizeLocalResourceInstances( + request, + &reply, + [&callback_called]( + Status s, std::function success, std::function failure) { + callback_called = true; + EXPECT_TRUE(s.ok()); + }); + EXPECT_TRUE(callback_called); + // Total CPU should be clamped to 6 because there are only 2 CPUs available. + // It should resize from 8 to 6 instead of resizing to 4. + EXPECT_EQ(reply.total_resources().at("CPU"), 6.0); + + // Test 2: Extreme request (e.g., 0). Should clamp to current usage. + callback_called = false; + (*request.mutable_resources())["CPU"] = 0.0; + reply.Clear(); + node_manager_->HandleResizeLocalResourceInstances( + request, + &reply, + [&callback_called]( + Status s, std::function success, std::function failure) { + callback_called = true; + EXPECT_TRUE(s.ok()); + }); + EXPECT_TRUE(callback_called); + // With 6 used, total should remain 6 + EXPECT_EQ(reply.total_resources().at("CPU"), 6.0); +} + +class NodeManagerReturnWorkerLeaseIdempotentTest + : public NodeManagerTest, + public testing::WithParamInterface> {}; + +TEST_P(NodeManagerReturnWorkerLeaseIdempotentTest, TestDifferentRequestArgs) { + const auto ¶ms = GetParam(); + bool disconnect_worker = std::get<0>(params); + bool worker_exiting = std::get<1>(params); + + LeaseID lease_id = LeaseID::FromRandom(); + leased_workers_[lease_id] = std::make_shared(WorkerID::FromRandom(), 10); + rpc::ReturnWorkerLeaseRequest request; + rpc::ReturnWorkerLeaseReply reply1; + rpc::ReturnWorkerLeaseReply reply2; + 
request.set_lease_id(lease_id.Binary()); + request.set_disconnect_worker(disconnect_worker); + request.set_disconnect_worker_error_detail("test"); + request.set_worker_exiting(worker_exiting); + + if (disconnect_worker) { + EXPECT_CALL( + mock_worker_pool_, + GetRegisteredWorker(testing::A &>())) + .Times(1) + .WillOnce(Return(nullptr)); + EXPECT_CALL( + mock_worker_pool_, + GetRegisteredDriver(testing::A &>())) + .Times(1) + .WillOnce(Return(nullptr)); + } + node_manager_->HandleReturnWorkerLease( + request, + &reply1, + [](Status s, std::function success, std::function failure) { + ASSERT_TRUE(s.ok()); + }); + ASSERT_EQ(leased_workers_.size(), 0); + node_manager_->HandleReturnWorkerLease( + request, + &reply2, + [](Status s, std::function success, std::function failure) { + ASSERT_TRUE(s.ok()); + }); + ASSERT_EQ(leased_workers_.size(), 0); +} + +INSTANTIATE_TEST_SUITE_P(NodeManagerReturnWorkerLeaseIdempotentVariations, + NodeManagerReturnWorkerLeaseIdempotentTest, + testing::Combine(testing::Bool(), testing::Bool())); + +TEST_F(NodeManagerTest, TestHandleRequestWorkerLeaseIdempotent) { + auto lease_spec = BuildLeaseSpec({}); + rpc::RequestWorkerLeaseRequest request; + rpc::RequestWorkerLeaseReply reply1; + rpc::RequestWorkerLeaseReply reply2; + LeaseID lease_id = LeaseID::FromRandom(); + lease_spec.GetMutableMessage().set_lease_id(lease_id.Binary()); + request.mutable_lease_spec()->CopyFrom(lease_spec.GetMessage()); + request.set_backlog_size(1); + request.set_grant_or_reject(true); + request.set_is_selected_based_on_locality(true); + auto worker = std::make_shared(WorkerID::FromRandom(), 10); + PopWorkerCallback pop_worker_callback; + EXPECT_CALL(mock_worker_pool_, PopWorker(_, _)) + .Times(1) + .WillOnce([&](const LeaseSpecification &ls, const PopWorkerCallback &callback) { + pop_worker_callback = callback; + }); + node_manager_->HandleRequestWorkerLease( + request, + &reply1, + [](Status s, std::function success, std::function failure) { + ASSERT_TRUE(s.ok()); 
+ }); + pop_worker_callback(worker, PopWorkerStatus::OK, ""); + ASSERT_EQ(leased_workers_.size(), 1); + ASSERT_EQ(leased_workers_[lease_id]->GetGrantedLeaseId(), lease_id); + request.mutable_lease_spec()->CopyFrom(lease_spec.GetMessage()); + node_manager_->HandleRequestWorkerLease( + request, + &reply2, + [](Status s, std::function success, std::function failure) { + ASSERT_TRUE(s.ok()); + }); + ASSERT_EQ(leased_workers_.size(), 1); + ASSERT_EQ(leased_workers_[lease_id]->GetGrantedLeaseId(), lease_id); + ASSERT_EQ(leased_workers_[lease_id]->WorkerId(), + WorkerID::FromBinary(reply1.worker_address().worker_id())); + ASSERT_EQ(reply1.worker_address(), reply2.worker_address()); +} + +TEST_F(NodeManagerTest, TestHandleRequestWorkerLeaseInfeasibleIdempotent) { + auto lease_spec = BuildLeaseSpec({{"CPU", 1}}); + lease_spec.GetMutableMessage() + .mutable_scheduling_strategy() + ->mutable_node_affinity_scheduling_strategy() + ->set_soft(false); // Hard constraint + + rpc::RequestWorkerLeaseRequest request; + rpc::RequestWorkerLeaseReply reply1; + rpc::RequestWorkerLeaseReply reply2; + LeaseID lease_id = LeaseID::FromRandom(); + lease_spec.GetMutableMessage().set_lease_id(lease_id.Binary()); + request.mutable_lease_spec()->CopyFrom(lease_spec.GetMessage()); + request.set_backlog_size(1); + request.set_grant_or_reject(true); + request.set_is_selected_based_on_locality(true); + node_manager_->HandleRequestWorkerLease( + request, + &reply1, + [](Status s, std::function success, std::function failure) { + ASSERT_TRUE(s.ok()); + }); + ASSERT_EQ(leased_workers_.size(), 0); + ASSERT_EQ(reply1.canceled(), true); + ASSERT_EQ(reply1.failure_type(), + rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_UNSCHEDULABLE); + request.mutable_lease_spec()->CopyFrom(lease_spec.GetMessage()); + node_manager_->HandleRequestWorkerLease( + request, + &reply2, + [](Status s, std::function success, std::function failure) { + ASSERT_TRUE(s.ok()); + }); + ASSERT_EQ(leased_workers_.size(), 0); + 
ASSERT_EQ(reply1.canceled(), reply2.canceled()); + ASSERT_EQ(reply1.failure_type(), reply2.failure_type()); + ASSERT_EQ(reply1.scheduling_failure_message(), reply2.scheduling_failure_message()); +} + +size_t GetPendingLeaseWorkerCount(const LocalLeaseManager &local_lease_manager) { + return local_lease_manager.waiting_lease_queue_.size() + + local_lease_manager.leases_to_grant_.size(); +} + +TEST_F(NodeManagerTest, RetryHandleCancelWorkerLeaseWhenHasLeaseRequest) { + auto lease_spec = BuildLeaseSpec({}); + rpc::RequestWorkerLeaseRequest request_worker_lease_request; + rpc::RequestWorkerLeaseReply request_worker_lease_reply; + LeaseID lease_id = LeaseID::FromRandom(); + lease_spec.GetMutableMessage().set_lease_id(lease_id.Binary()); + request_worker_lease_request.mutable_lease_spec()->CopyFrom(lease_spec.GetMessage()); + request_worker_lease_request.set_backlog_size(1); + request_worker_lease_request.set_grant_or_reject(true); + request_worker_lease_request.set_is_selected_based_on_locality(true); + node_manager_->HandleRequestWorkerLease( + request_worker_lease_request, + &request_worker_lease_reply, + [](Status s, std::function success, std::function failure) { + ASSERT_TRUE(s.ok()); + }); + ASSERT_EQ(GetPendingLeaseWorkerCount(*local_lease_manager_), 1); + rpc::CancelWorkerLeaseRequest cancel_worker_lease_request; + cancel_worker_lease_request.set_lease_id(lease_id.Binary()); + rpc::CancelWorkerLeaseReply cancel_worker_lease_reply1; + rpc::CancelWorkerLeaseReply cancel_worker_lease_reply2; + node_manager_->HandleCancelWorkerLease( + cancel_worker_lease_request, + &cancel_worker_lease_reply1, + [](Status s, std::function success, std::function failure) { + ASSERT_TRUE(s.ok()); + }); + ASSERT_EQ(GetPendingLeaseWorkerCount(*local_lease_manager_), 0); + node_manager_->HandleCancelWorkerLease( + cancel_worker_lease_request, + &cancel_worker_lease_reply2, + [](Status s, std::function success, std::function failure) { + ASSERT_TRUE(s.ok()); + }); + 
ASSERT_EQ(GetPendingLeaseWorkerCount(*local_lease_manager_), 0); + ASSERT_EQ(cancel_worker_lease_reply1.success(), true); + // Due to the message reordering case where the cancel worker lease request + // arrives at the raylet before the worker lease request has been received, we + // cannot return true on the retry since from the raylet perspective both situations are + // equivalent. Even if this returns false, the first request to HandleCancelWorkerLease + // will trigger the callback for HandleRequestWorkerLease and remove the pending lease + // request which prevents the CancelWorkerLease loop. + ASSERT_EQ(cancel_worker_lease_reply2.success(), false); +} + +TEST_F(NodeManagerTest, TestHandleCancelWorkerLeaseNoLeaseIdempotent) { + LeaseID lease_id = LeaseID::FromRandom(); + rpc::CancelWorkerLeaseRequest request; + request.set_lease_id(lease_id.Binary()); + rpc::CancelWorkerLeaseReply reply1; + rpc::CancelWorkerLeaseReply reply2; + node_manager_->HandleCancelWorkerLease( + request, + &reply1, + [](Status s, std::function success, std::function failure) { + ASSERT_TRUE(s.ok()); + }); + ASSERT_EQ(GetPendingLeaseWorkerCount(*local_lease_manager_), 0); + node_manager_->HandleCancelWorkerLease( + request, + &reply2, + [](Status s, std::function success, std::function failure) { + ASSERT_TRUE(s.ok()); + }); + ASSERT_EQ(GetPendingLeaseWorkerCount(*local_lease_manager_), 0); + ASSERT_EQ(reply1.success(), false); + ASSERT_EQ(reply2.success(), false); +} + } // namespace ray::raylet int main(int argc, char **argv) { diff --git a/src/ray/raylet/test/placement_group_resource_manager_test.cc b/src/ray/raylet/tests/placement_group_resource_manager_test.cc similarity index 90% rename from src/ray/raylet/test/placement_group_resource_manager_test.cc rename to src/ray/raylet/tests/placement_group_resource_manager_test.cc index 0e1530f74c63..494e81941ee4 100644 --- a/src/ray/raylet/test/placement_group_resource_manager_test.cc +++ 
b/src/ray/raylet/tests/placement_group_resource_manager_test.cc @@ -12,24 +12,61 @@ // See the License for the specific language governing permissions and // limitations under the License. -// clang-format off #include "ray/raylet/placement_group_resource_manager.h" #include -#include -#include #include +#include +#include #include "gtest/gtest.h" +#include "mock/ray/gcs_client/gcs_client.h" #include "ray/common/bundle_spec.h" #include "ray/common/id.h" +#include "ray/common/scheduling/placement_group_util.h" #include "ray/common/scheduling/resource_set.h" -#include "ray/gcs/test/gcs_test_util.h" -#include "mock/ray/gcs/gcs_client/gcs_client.h" -// clang-format on namespace ray { +namespace { + +BundleSpecification GenBundleCreation( + const PlacementGroupID &placement_group_id, + const int bundle_index, + const absl::flat_hash_map &unit_resource) { + rpc::Bundle bundle; + auto mutable_bundle_id = bundle.mutable_bundle_id(); + mutable_bundle_id->set_bundle_index(bundle_index); + mutable_bundle_id->set_placement_group_id(placement_group_id.Binary()); + auto mutable_unit_resources = bundle.mutable_unit_resources(); + for (auto &resource : unit_resource) { + mutable_unit_resources->insert({resource.first, resource.second}); + } + return BundleSpecification(bundle); +} + +std::vector> GenBundleSpecifications( + const PlacementGroupID &placement_group_id, + const absl::flat_hash_map &unit_resource, + int bundles_size = 1) { + std::vector> bundle_specs; + for (int i = 0; i < bundles_size; i++) { + rpc::Bundle bundle; + auto mutable_bundle_id = bundle.mutable_bundle_id(); + // The bundle index is start from 1. 
+ mutable_bundle_id->set_bundle_index(i + 1); + mutable_bundle_id->set_placement_group_id(placement_group_id.Binary()); + auto mutable_unit_resources = bundle.mutable_unit_resources(); + for (auto &resource : unit_resource) { + mutable_unit_resources->insert({resource.first, resource.second}); + } + bundle_specs.emplace_back(std::make_shared(bundle)); + } + return bundle_specs; +} + +} // namespace + class NewPlacementGroupResourceManagerTest : public ::testing::Test { public: instrumented_io_context io_context; @@ -150,7 +187,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewPrepareBundleResource) { auto group_id = PlacementGroupID::Of(JobID::FromInt(1)); absl::flat_hash_map unit_resource; unit_resource.insert({"CPU", 1.0}); - auto bundle_specs = Mocker::GenBundleSpecifications(group_id, unit_resource, 1); + auto bundle_specs = GenBundleSpecifications(group_id, unit_resource, 1); /// 2. init local available resource. InitLocalAvailableResource(unit_resource); /// 3. prepare bundle resource. @@ -165,7 +202,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, auto group_id = PlacementGroupID::Of(JobID::FromInt(1)); absl::flat_hash_map unit_resource; unit_resource.insert({"CPU", 2.0}); - auto bundle_specs = Mocker::GenBundleSpecifications(group_id, unit_resource, 1); + auto bundle_specs = GenBundleSpecifications(group_id, unit_resource, 1); /// 2. init local available resource. 
absl::flat_hash_map init_unit_resource; init_unit_resource.insert({"CPU", 1.0}); @@ -179,9 +216,9 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewPrepareBundleDuringDraining) absl::flat_hash_map unit_resource; unit_resource.insert({"CPU", 1.0}); auto group1_id = PlacementGroupID::Of(JobID::FromInt(1)); - auto bundle1_specs = Mocker::GenBundleSpecifications(group1_id, unit_resource, 1); + auto bundle1_specs = GenBundleSpecifications(group1_id, unit_resource, 1); auto group2_id = PlacementGroupID::Of(JobID::FromInt(2)); - auto bundle2_specs = Mocker::GenBundleSpecifications(group2_id, unit_resource, 1); + auto bundle2_specs = GenBundleSpecifications(group2_id, unit_resource, 1); /// 2. init local available resource. absl::flat_hash_map init_unit_resource; init_unit_resource.insert({"CPU", 2.0}); @@ -218,7 +255,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewCommitBundleResource) { auto group_id = PlacementGroupID::Of(JobID::FromInt(1)); absl::flat_hash_map unit_resource; unit_resource.insert({"CPU", 1.0}); - auto bundle_specs = Mocker::GenBundleSpecifications(group_id, unit_resource, 1); + auto bundle_specs = GenBundleSpecifications(group_id, unit_resource, 1); /// 2. init local available resource. InitLocalAvailableResource(unit_resource); /// 3. prepare and commit bundle resource. @@ -247,7 +284,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewReturnBundleResource) { auto group_id = PlacementGroupID::Of(JobID::FromInt(1)); absl::flat_hash_map unit_resource; unit_resource.insert({"CPU", 1.0}); - auto bundle_spec = Mocker::GenBundleCreation(group_id, 1, unit_resource); + auto bundle_spec = GenBundleCreation(group_id, 1, unit_resource); /// 2. init local available resource. InitLocalAvailableResource(unit_resource); /// 3. prepare and commit bundle resource. 
@@ -268,8 +305,8 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewMultipleBundlesCommitAndRetu auto group_id = PlacementGroupID::Of(JobID::FromInt(1)); absl::flat_hash_map unit_resource; unit_resource.insert({"CPU", 1.0}); - auto first_bundle_spec = Mocker::GenBundleCreation(group_id, 1, unit_resource); - auto second_bundle_spec = Mocker::GenBundleCreation(group_id, 2, unit_resource); + auto first_bundle_spec = GenBundleCreation(group_id, 1, unit_resource); + auto second_bundle_spec = GenBundleCreation(group_id, 2, unit_resource); /// 2. init local available resource. absl::flat_hash_map init_unit_resource; init_unit_resource.insert({"CPU", 2.0}); @@ -335,7 +372,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewIdempotencyWithMultiPrepare) auto group_id = PlacementGroupID::Of(JobID::FromInt(1)); absl::flat_hash_map unit_resource; unit_resource.insert({"CPU", 1.0}); - auto bundle_specs = Mocker::GenBundleSpecifications(group_id, unit_resource, 1); + auto bundle_specs = GenBundleSpecifications(group_id, unit_resource, 1); /// 2. init local available resource. absl::flat_hash_map available_resource = { std::make_pair("CPU", 3.0)}; @@ -357,7 +394,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewIdempotencyWithRandomOrder) auto group_id = PlacementGroupID::Of(JobID::FromInt(1)); absl::flat_hash_map unit_resource; unit_resource.insert({"CPU", 1.0}); - auto bundle_spec = Mocker::GenBundleCreation(group_id, 1, unit_resource); + auto bundle_spec = GenBundleCreation(group_id, 1, unit_resource); /// 2. init local available resource. 
absl::flat_hash_map available_resource = { std::make_pair("CPU", 3.0)}; @@ -413,7 +450,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestPreparedResourceBatched) { auto group_id = PlacementGroupID::Of(JobID::FromInt(1)); absl::flat_hash_map unit_resource; unit_resource.insert({"CPU", 1.0}); - auto bundle_specs = Mocker::GenBundleSpecifications(group_id, unit_resource, 4); + auto bundle_specs = GenBundleSpecifications(group_id, unit_resource, 4); // 2. init local available resource with 3 CPUs. absl::flat_hash_map available_resource = { std::make_pair("CPU", 3.0)}; @@ -472,7 +509,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestCommiteResourceBatched) { auto group_id = PlacementGroupID::Of(JobID::FromInt(1)); absl::flat_hash_map unit_resource; unit_resource.insert({"GPU", 2.0}); - auto bundle_specs = Mocker::GenBundleSpecifications(group_id, unit_resource, 4); + auto bundle_specs = GenBundleSpecifications(group_id, unit_resource, 4); // 2. init local available resource with 4 CPUs. absl::flat_hash_map available_resource = { std::make_pair("GPU", 10.0)}; @@ -520,7 +557,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewReturnBundleFailure) { auto group_id = PlacementGroupID::Of(JobID::FromInt(1)); absl::flat_hash_map unit_resource; unit_resource.insert({"CPU", 1.0}); - auto bundle_spec = Mocker::GenBundleCreation(group_id, 1, unit_resource); + auto bundle_spec = GenBundleCreation(group_id, 1, unit_resource); /// init local available resource. InitLocalAvailableResource(unit_resource); /// prepare and commit bundle resource. 
diff --git a/src/ray/raylet/test/runtime_env_agent_client_test.cc b/src/ray/raylet/tests/runtime_env_agent_client_test.cc similarity index 100% rename from src/ray/raylet/test/runtime_env_agent_client_test.cc rename to src/ray/raylet/tests/runtime_env_agent_client_test.cc diff --git a/src/ray/raylet/test/util.h b/src/ray/raylet/tests/util.h similarity index 84% rename from src/ray/raylet/test/util.h rename to src/ray/raylet/tests/util.h index aee501c99870..467ff373bbc9 100644 --- a/src/ray/raylet/test/util.h +++ b/src/ray/raylet/tests/util.h @@ -42,20 +42,22 @@ class MockWorker : public WorkerInterface { void SetOwnerAddress(const rpc::Address &address) override { address_ = address; } - void AssignTaskId(const TaskID &task_id) override { task_id_ = task_id; } - - void SetAssignedTask(const RayTask &assigned_task) override { - task_ = assigned_task; - task_assign_time_ = absl::Now(); - root_detached_actor_id_ = assigned_task.GetTaskSpecification().RootDetachedActorId(); - const auto &task_spec = assigned_task.GetTaskSpecification(); - SetJobId(task_spec.JobId()); - SetBundleId(task_spec.PlacementGroupBundleId()); - SetOwnerAddress(task_spec.CallerAddress()); - AssignTaskId(task_spec.TaskId()); + void GrantLease(const RayLease &granted_lease) override { + lease_ = granted_lease; + lease_grant_time_ = absl::Now(); + root_detached_actor_id_ = granted_lease.GetLeaseSpecification().RootDetachedActorId(); + const auto &lease_spec = granted_lease.GetLeaseSpecification(); + SetJobId(lease_spec.JobId()); + SetBundleId(lease_spec.PlacementGroupBundleId()); + SetOwnerAddress(lease_spec.CallerAddress()); + GrantLeaseId(lease_spec.LeaseId()); }; - absl::Time GetAssignedTaskTime() const override { return task_assign_time_; }; + void GrantLeaseId(const LeaseID &lease_id) override { lease_id_ = lease_id; } + + const RayLease &GetGrantedLease() const override { return lease_; } + + absl::Time GetGrantedLeaseTime() const override { return lease_grant_time_; }; std::optional 
GetIsGpu() const override { return is_gpu_; } @@ -116,7 +118,7 @@ class MockWorker : public WorkerInterface { return -1; } void SetAssignedPort(int port) override { RAY_CHECK(false) << "Method unused"; } - const TaskID &GetAssignedTaskId() const override { return task_id_; } + const LeaseID &GetGrantedLeaseId() const override { return lease_id_; } const JobID &GetAssignedJobId() const override { return job_id_; } int GetRuntimeEnvHash() const override { return runtime_env_hash_; } void AssignActorId(const ActorID &actor_id) override { @@ -126,19 +128,16 @@ class MockWorker : public WorkerInterface { RAY_CHECK(false) << "Method unused"; return ActorID::Nil(); } - const std::string GetTaskOrActorIdAsDebugString() const override { + const std::string GetLeaseIdAsDebugString() const override { RAY_CHECK(false) << "Method unused"; return ""; } bool IsDetachedActor() const override { - return task_.GetTaskSpecification().IsDetachedActor(); + return lease_.GetLeaseSpecification().IsDetachedActor(); } - const std::shared_ptr Connection() const override { - RAY_CHECK(false) << "Method unused"; - return nullptr; - } + const std::shared_ptr Connection() const override { return nullptr; } const rpc::Address &GetOwnerAddress() const override { return address_; } void ActorCallArgWaitComplete(int64_t tag) override { @@ -158,7 +157,7 @@ class MockWorker : public WorkerInterface { void SetBundleId(const BundleID &bundle_id) override { bundle_id_ = bundle_id; } - RayTask &GetAssignedTask() override { return task_; } + RayLease &GetGrantedLease() override { return lease_; } bool IsRegistered() override { RAY_CHECK(false) << "Method unused"; @@ -197,10 +196,10 @@ class MockWorker : public WorkerInterface { std::optional is_actor_worker_; BundleID bundle_id_; bool blocked_ = false; - RayTask task_; - absl::Time task_assign_time_; + RayLease lease_; + absl::Time lease_grant_time_; int runtime_env_hash_; - TaskID task_id_; + LeaseID lease_id_; JobID job_id_; ActorID 
root_detached_actor_id_; Process proc_; diff --git a/src/ray/raylet/test/wait_manager_test.cc b/src/ray/raylet/tests/wait_manager_test.cc similarity index 100% rename from src/ray/raylet/test/wait_manager_test.cc rename to src/ray/raylet/tests/wait_manager_test.cc diff --git a/src/ray/raylet/test/worker_killing_policy_group_by_owner_test.cc b/src/ray/raylet/tests/worker_killing_policy_group_by_owner_test.cc similarity index 93% rename from src/ray/raylet/test/worker_killing_policy_group_by_owner_test.cc rename to src/ray/raylet/tests/worker_killing_policy_group_by_owner_test.cc index 7328ae41e2ef..93b9b5f8f718 100644 --- a/src/ray/raylet/test/worker_killing_policy_group_by_owner_test.cc +++ b/src/ray/raylet/tests/worker_killing_policy_group_by_owner_test.cc @@ -20,8 +20,8 @@ #include #include "gtest/gtest.h" -#include "ray/common/task/task_spec.h" -#include "ray/raylet/test/util.h" +#include "ray/common/lease/lease_spec.h" +#include "ray/raylet/tests/util.h" #include "ray/raylet/worker_killing_policy.h" namespace ray { @@ -41,31 +41,31 @@ class WorkerKillingGroupByOwnerTest : public ::testing::Test { std::shared_ptr CreateActorCreationWorker(TaskID owner_id, int32_t max_restarts) { - rpc::TaskSpec message; - message.set_task_id(TaskID::FromRandom(job_id_).Binary()); + rpc::LeaseSpec message; + message.set_lease_id(LeaseID::FromRandom().Binary()); message.set_parent_task_id(owner_id.Binary()); - message.mutable_actor_creation_task_spec()->set_max_actor_restarts(max_restarts); message.set_type(ray::rpc::TaskType::ACTOR_CREATION_TASK); - TaskSpecification task_spec(message); - RayTask task(task_spec); + message.set_max_actor_restarts(max_restarts); + LeaseSpecification lease_spec(message); + RayLease lease(lease_spec); auto worker = std::make_shared(ray::WorkerID::FromRandom(), port_); - worker->SetAssignedTask(task); - worker->AssignTaskId(task.GetTaskSpecification().TaskId()); + worker->GrantLease(lease); + 
worker->GrantLeaseId(lease.GetLeaseSpecification().LeaseId()); return worker; } std::shared_ptr CreateTaskWorker(TaskID owner_id, int32_t max_retries) { - rpc::TaskSpec message; - message.set_task_id(TaskID::FromRandom(job_id_).Binary()); + rpc::LeaseSpec message; + message.set_lease_id(LeaseID::FromRandom().Binary()); message.set_parent_task_id(owner_id.Binary()); - message.set_max_retries(max_retries); message.set_type(ray::rpc::TaskType::NORMAL_TASK); - TaskSpecification task_spec(message); - RayTask task(task_spec); + message.set_max_retries(max_retries); + LeaseSpecification lease_spec(message); + RayLease lease(lease_spec); auto worker = std::make_shared(ray::WorkerID::FromRandom(), port_); - worker->SetAssignedTask(task); - worker->AssignTaskId(task.GetTaskSpecification().TaskId()); + worker->GrantLease(lease); + worker->GrantLeaseId(lease.GetLeaseSpecification().LeaseId()); return worker; } }; diff --git a/src/ray/raylet/test/worker_killing_policy_retriable_fifo_test.cc b/src/ray/raylet/tests/worker_killing_policy_retriable_fifo_test.cc similarity index 86% rename from src/ray/raylet/test/worker_killing_policy_retriable_fifo_test.cc rename to src/ray/raylet/tests/worker_killing_policy_retriable_fifo_test.cc index 0c512233fc7b..9026e26b836a 100644 --- a/src/ray/raylet/test/worker_killing_policy_retriable_fifo_test.cc +++ b/src/ray/raylet/tests/worker_killing_policy_retriable_fifo_test.cc @@ -18,8 +18,8 @@ #include #include "gtest/gtest.h" -#include "ray/common/task/task_spec.h" -#include "ray/raylet/test/util.h" +#include "ray/common/lease/lease_spec.h" +#include "ray/raylet/tests/util.h" #include "ray/raylet/worker_killing_policy.h" namespace ray { @@ -32,24 +32,24 @@ class WorkerKillerTest : public ::testing::Test { RetriableFIFOWorkerKillingPolicy worker_killing_policy_; std::shared_ptr CreateActorCreationWorker(int32_t max_restarts) { - rpc::TaskSpec message; - message.mutable_actor_creation_task_spec()->set_max_actor_restarts(max_restarts); + 
rpc::LeaseSpec message; + message.set_max_actor_restarts(max_restarts); message.set_type(ray::rpc::TaskType::ACTOR_CREATION_TASK); - TaskSpecification task_spec(message); - RayTask task(task_spec); + LeaseSpecification lease_spec(message); + RayLease lease(lease_spec); auto worker = std::make_shared(ray::WorkerID::FromRandom(), port_); - worker->SetAssignedTask(task); + worker->GrantLease(lease); return worker; } std::shared_ptr CreateTaskWorker(int32_t max_retries) { - rpc::TaskSpec message; + rpc::LeaseSpec message; message.set_max_retries(max_retries); message.set_type(ray::rpc::TaskType::NORMAL_TASK); - TaskSpecification task_spec(message); - RayTask task(task_spec); + LeaseSpecification lease_spec(message); + RayLease lease(lease_spec); auto worker = std::make_shared(ray::WorkerID::FromRandom(), port_); - worker->SetAssignedTask(task); + worker->GrantLease(lease); return worker; } }; @@ -68,9 +68,9 @@ TEST_F(WorkerKillerTest, auto first_submitted = WorkerKillerTest::CreateActorCreationWorker(0 /* max_restarts */); auto second_submitted = - WorkerKillerTest::CreateActorCreationWorker(5 /* max_restarts */); + WorkerKillerTest::CreateActorCreationWorker(1 /* max_restarts */); auto third_submitted = WorkerKillerTest::CreateTaskWorker(0 /* max_restarts */); - auto fourth_submitted = WorkerKillerTest::CreateTaskWorker(11 /* max_restarts */); + auto fourth_submitted = WorkerKillerTest::CreateTaskWorker(1 /* max_restarts */); workers.push_back(first_submitted); workers.push_back(second_submitted); diff --git a/src/ray/raylet/test/worker_killing_policy_test.cc b/src/ray/raylet/tests/worker_killing_policy_test.cc similarity index 69% rename from src/ray/raylet/test/worker_killing_policy_test.cc rename to src/ray/raylet/tests/worker_killing_policy_test.cc index c9c0ef5ed572..dca60ad6f58c 100644 --- a/src/ray/raylet/test/worker_killing_policy_test.cc +++ b/src/ray/raylet/tests/worker_killing_policy_test.cc @@ -18,8 +18,8 @@ #include #include "gtest/gtest.h" -#include 
"ray/common/task/task_spec.h" -#include "ray/raylet/test/util.h" +#include "ray/common/lease/lease_spec.h" +#include "ray/raylet/tests/util.h" namespace ray { @@ -31,36 +31,25 @@ class WorkerKillerTest : public ::testing::Test { int32_t port_ = 2389; RetriableLIFOWorkerKillingPolicy worker_killing_policy_; - std::shared_ptr CreateActorWorker(int32_t max_restarts) { - rpc::TaskSpec message; - message.mutable_actor_creation_task_spec()->set_max_actor_restarts(max_restarts); - message.set_type(ray::rpc::TaskType::ACTOR_TASK); - TaskSpecification task_spec(message); - RayTask task(task_spec); - auto worker = std::make_shared(ray::WorkerID::FromRandom(), port_); - worker->SetAssignedTask(task); - return worker; - } - std::shared_ptr CreateActorCreationWorker(int32_t max_restarts) { - rpc::TaskSpec message; - message.mutable_actor_creation_task_spec()->set_max_actor_restarts(max_restarts); + rpc::LeaseSpec message; + message.set_max_actor_restarts(max_restarts); message.set_type(ray::rpc::TaskType::ACTOR_CREATION_TASK); - TaskSpecification task_spec(message); - RayTask task(task_spec); + LeaseSpecification lease_spec(message); + RayLease lease(lease_spec); auto worker = std::make_shared(ray::WorkerID::FromRandom(), port_); - worker->SetAssignedTask(task); + worker->GrantLease(lease); return worker; } std::shared_ptr CreateTaskWorker(int32_t max_retries) { - rpc::TaskSpec message; + rpc::LeaseSpec message; message.set_max_retries(max_retries); message.set_type(ray::rpc::TaskType::NORMAL_TASK); - TaskSpecification task_spec(message); - RayTask task(task_spec); + LeaseSpecification lease_spec(message); + RayLease lease(lease_spec); auto worker = std::make_shared(ray::WorkerID::FromRandom(), port_); - worker->SetAssignedTask(task); + worker->GrantLease(lease); return worker; } }; @@ -76,14 +65,15 @@ TEST_F(WorkerKillerTest, TestEmptyWorkerPoolSelectsNullWorker) { TEST_F(WorkerKillerTest, TestPreferRetriableOverNonRetriableAndOrderByTimestampDescending) { std::vector> 
workers; - auto first_submitted = WorkerKillerTest::CreateActorWorker(7 /* max_restarts */); + auto first_submitted = + WorkerKillerTest::CreateActorCreationWorker(false /* max_restarts */); auto second_submitted = - WorkerKillerTest::CreateActorCreationWorker(5 /* max_restarts */); - auto third_submitted = WorkerKillerTest::CreateTaskWorker(0 /* max_restarts */); - auto fourth_submitted = WorkerKillerTest::CreateTaskWorker(11 /* max_restarts */); + WorkerKillerTest::CreateActorCreationWorker(true /* max_restarts */); + auto third_submitted = WorkerKillerTest::CreateTaskWorker(false /* max_retries */); + auto fourth_submitted = WorkerKillerTest::CreateTaskWorker(true /* max_retries */); auto fifth_submitted = - WorkerKillerTest::CreateActorCreationWorker(0 /* max_restarts */); - auto sixth_submitted = WorkerKillerTest::CreateActorWorker(0 /* max_restarts */); + WorkerKillerTest::CreateActorCreationWorker(false /* max_restarts */); + auto sixth_submitted = WorkerKillerTest::CreateTaskWorker(true /* max_retries */); workers.push_back(first_submitted); workers.push_back(second_submitted); @@ -93,9 +83,9 @@ TEST_F(WorkerKillerTest, workers.push_back(sixth_submitted); std::vector> expected_order; + expected_order.push_back(sixth_submitted); expected_order.push_back(fourth_submitted); expected_order.push_back(second_submitted); - expected_order.push_back(sixth_submitted); expected_order.push_back(fifth_submitted); expected_order.push_back(third_submitted); expected_order.push_back(first_submitted); diff --git a/src/ray/raylet/test/worker_pool_test.cc b/src/ray/raylet/tests/worker_pool_test.cc similarity index 85% rename from src/ray/raylet/test/worker_pool_test.cc rename to src/ray/raylet/tests/worker_pool_test.cc index af7df9dcd114..b5d24485a7bf 100644 --- a/src/ray/raylet/test/worker_pool_test.cc +++ b/src/ray/raylet/tests/worker_pool_test.cc @@ -26,14 +26,16 @@ #include #include "absl/time/time.h" -#include "mock/ray/gcs/gcs_client/gcs_client.h" +#include 
"mock/ray/gcs_client/gcs_client.h" #include "nlohmann/json.hpp" #include "ray/common/asio/asio_util.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/constants.h" +#include "ray/common/lease/lease_spec.h" #include "ray/raylet/runtime_env_agent_client.h" #include "ray/util/path_utils.h" #include "ray/util/process.h" +#include "ray/util/raii.h" #include "src/ray/protobuf/runtime_env_agent.pb.h" using json = nlohmann::json; @@ -151,8 +153,7 @@ class WorkerPoolMock : public WorkerPool { "", []() {}, 0, - [this]() { return absl::FromUnixMillis(current_time_ms_); }, - /*enable_resource_isolation=*/false), + [this]() { return absl::FromUnixMillis(current_time_ms_); }), last_worker_process_(), instrumented_io_service_(io_service), client_call_manager_(instrumented_io_service_, false), @@ -363,14 +364,14 @@ class WorkerPoolMock : public WorkerPool { // \param[in] push_workers If true, tries to push the workers from the started // processes. std::shared_ptr PopWorkerSync( - const TaskSpecification &task_spec, + const LeaseSpecification &lease_spec, bool push_workers = true, PopWorkerStatus *worker_status = nullptr, int timeout_worker_number = 0, std::string *runtime_env_error_msg = nullptr) { std::shared_ptr popped_worker = nullptr; std::promise promise; - this->PopWorker(task_spec, + this->PopWorker(lease_spec, [&popped_worker, worker_status, &promise, runtime_env_error_msg]( const std::shared_ptr worker, PopWorkerStatus status, @@ -386,7 +387,7 @@ class WorkerPoolMock : public WorkerPool { return true; }); if (push_workers) { - PushWorkers(timeout_worker_number, task_spec.JobId()); + PushWorkers(timeout_worker_number, lease_spec.JobId()); } promise.get_future().get(); return popped_worker; @@ -456,7 +457,6 @@ class WorkerPoolTest : public ::testing::Test { const rpc::JobConfig &job_config = rpc::JobConfig()) { auto driver = worker_pool_->CreateWorker(Process::CreateNewDummy(), Language::PYTHON, job_id); - 
driver->AssignTaskId(TaskID::ForDriverTask(job_id)); RAY_CHECK_OK(worker_pool_->RegisterDriver(driver, job_config, [](Status, int) {})); return driver; } @@ -528,29 +528,23 @@ static inline rpc::RuntimeEnvInfo ExampleRuntimeEnvInfoFromString( return runtime_env_info; } -static inline TaskSpecification ExampleTaskSpec( - const ActorID actor_id = ActorID::Nil(), +static inline LeaseSpecification ExampleLeaseSpec( + const ActorID actor_creation_id = ActorID::Nil(), const Language &language = Language::PYTHON, const JobID &job_id = JOB_ID, - const ActorID actor_creation_id = ActorID::Nil(), const std::vector &dynamic_worker_options = {}, - const TaskID &task_id = TaskID::FromRandom(JobID::Nil()), + const LeaseID &lease_id = LeaseID::Nil(), const rpc::RuntimeEnvInfo runtime_env_info = rpc::RuntimeEnvInfo(), std::unordered_map resources = {{"CPU", 1}}) { - rpc::TaskSpec message; + rpc::LeaseSpec message; message.set_job_id(job_id.Binary()); message.set_language(language); - // Make sure no reduplicative task id. 
- RAY_CHECK(!task_id.IsNil()); - message.set_task_id(task_id.Binary()); - if (!actor_id.IsNil()) { - message.set_type(TaskType::ACTOR_TASK); - message.mutable_actor_task_spec()->set_actor_id(actor_id.Binary()); - } else if (!actor_creation_id.IsNil()) { + message.set_lease_id(lease_id.Binary()); + if (!actor_creation_id.IsNil()) { message.set_type(TaskType::ACTOR_CREATION_TASK); - message.mutable_actor_creation_task_spec()->set_actor_id(actor_creation_id.Binary()); + message.set_actor_id(actor_creation_id.Binary()); for (const auto &option : dynamic_worker_options) { - message.mutable_actor_creation_task_spec()->add_dynamic_worker_options(option); + message.add_dynamic_worker_options(option); } } else { message.set_type(TaskType::NORMAL_TASK); @@ -558,7 +552,7 @@ static inline TaskSpecification ExampleTaskSpec( message.mutable_required_resources()->insert(resources.begin(), resources.end()); message.mutable_runtime_env_info()->CopyFrom(runtime_env_info); - return TaskSpecification(std::move(message)); + return LeaseSpecification(std::move(message)); } TEST_F(WorkerPoolDriverRegisteredTest, CompareWorkerProcessObjects) { @@ -649,42 +643,43 @@ TEST_F(WorkerPoolDriverRegisteredTest, InitialWorkerProcessCount) { } TEST_F(WorkerPoolDriverRegisteredTest, TestPrestartingWorkers) { - const auto task_spec = ExampleTaskSpec(); + auto lease_spec = ExampleLeaseSpec(); + lease_spec.GetMutableMessage().set_lease_id(LeaseID::FromRandom().Binary()); // Prestarts 2 workers. - worker_pool_->PrestartWorkers(task_spec, 2); + worker_pool_->PrestartWorkers(lease_spec, 2); ASSERT_EQ(worker_pool_->NumWorkersStarting(), 2); // Prestarts 1 more worker. - worker_pool_->PrestartWorkers(task_spec, 3); + worker_pool_->PrestartWorkers(lease_spec, 3); ASSERT_EQ(worker_pool_->NumWorkersStarting(), 3); // No more needed. - worker_pool_->PrestartWorkers(task_spec, 1); + worker_pool_->PrestartWorkers(lease_spec, 1); ASSERT_EQ(worker_pool_->NumWorkersStarting(), 3); // Capped by soft limit. 
- worker_pool_->PrestartWorkers(task_spec, 20); + worker_pool_->PrestartWorkers(lease_spec, 20); ASSERT_EQ(worker_pool_->NumWorkersStarting(), POOL_SIZE_SOFT_LIMIT); } TEST_F(WorkerPoolDriverRegisteredTest, TestPrestartingWorkersWithRuntimeEnv) { - auto task_spec = ExampleTaskSpec(); - task_spec.GetMutableMessage().mutable_runtime_env_info()->set_serialized_runtime_env( + auto lease_spec = ExampleLeaseSpec(); + lease_spec.GetMutableMessage().mutable_runtime_env_info()->set_serialized_runtime_env( "{\"env_vars\": {\"FOO\": \"bar\"}}"); // Prestarts 2 workers. - worker_pool_->PrestartWorkers(task_spec, 2); + worker_pool_->PrestartWorkers(lease_spec, 2); ASSERT_EQ(worker_pool_->NumWorkersStarting(), 2); // Prestarts 1 more worker. - worker_pool_->PrestartWorkers(task_spec, 3); + worker_pool_->PrestartWorkers(lease_spec, 3); ASSERT_EQ(worker_pool_->NumWorkersStarting(), 3); // No more needed. - worker_pool_->PrestartWorkers(task_spec, 1); + worker_pool_->PrestartWorkers(lease_spec, 1); ASSERT_EQ(worker_pool_->NumWorkersStarting(), 3); // Capped by soft limit. - worker_pool_->PrestartWorkers(task_spec, 20); + worker_pool_->PrestartWorkers(lease_spec, 20); ASSERT_EQ(worker_pool_->NumWorkersStarting(), POOL_SIZE_SOFT_LIMIT); } TEST_F(WorkerPoolDriverRegisteredTest, HandleWorkerPushPop) { std::shared_ptr popped_worker; - const auto task_spec = ExampleTaskSpec(); + const auto lease_spec = ExampleLeaseSpec(); // Create some workers. std::unordered_set> workers; workers.insert(worker_pool_->CreateWorker(Process::CreateNewDummy())); @@ -694,15 +689,15 @@ TEST_F(WorkerPoolDriverRegisteredTest, HandleWorkerPushPop) { worker_pool_->PushWorker(worker); } // Pop two workers and make sure they're one of the workers we created. 
- popped_worker = worker_pool_->PopWorkerSync(task_spec); + popped_worker = worker_pool_->PopWorkerSync(lease_spec); ASSERT_NE(popped_worker, nullptr); ASSERT_GT(workers.count(popped_worker), 0); - popped_worker = worker_pool_->PopWorkerSync(task_spec); + popped_worker = worker_pool_->PopWorkerSync(lease_spec); ASSERT_NE(popped_worker, nullptr); ASSERT_GT(workers.count(popped_worker), 0); // Pop a worker from the empty pool and make sure it isn't one of the workers we // created. - popped_worker = worker_pool_->PopWorkerSync(task_spec); + popped_worker = worker_pool_->PopWorkerSync(lease_spec); ASSERT_NE(popped_worker, nullptr); ASSERT_EQ(workers.count(popped_worker), 0); } @@ -712,26 +707,26 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerSyncsOfMultipleLanguages) { auto py_worker = worker_pool_->CreateWorker(Process::CreateNewDummy(), Language::PYTHON); worker_pool_->PushWorker(py_worker); - // Check that the Python worker will not be popped if the given task is a Java task - const auto java_task_spec = ExampleTaskSpec(ActorID::Nil(), Language::JAVA); - ASSERT_NE(worker_pool_->PopWorkerSync(java_task_spec), py_worker); - // Check that the Python worker can be popped if the given task is a Python task - const auto py_task_spec = ExampleTaskSpec(ActorID::Nil(), Language::PYTHON); - ASSERT_EQ(worker_pool_->PopWorkerSync(py_task_spec), py_worker); + // Check that the Python worker will not be popped if the given lease is a Java lease + const auto java_lease_spec = ExampleLeaseSpec(ActorID::Nil(), Language::JAVA); + ASSERT_NE(worker_pool_->PopWorkerSync(java_lease_spec), py_worker); + // Check that the Python worker can be popped if the given lease is a Python lease + const auto py_lease_spec = ExampleLeaseSpec(ActorID::Nil(), Language::PYTHON); + ASSERT_EQ(worker_pool_->PopWorkerSync(py_lease_spec), py_worker); // Create a Java Worker, and add it to the pool auto java_worker = worker_pool_->CreateWorker(Process::CreateNewDummy(), Language::JAVA); 
worker_pool_->PushWorker(java_worker); - // Check that the Java worker will be popped now for Java task - ASSERT_EQ(worker_pool_->PopWorkerSync(java_task_spec), java_worker); + // Check that the Java worker will be popped now for Java lease + ASSERT_EQ(worker_pool_->PopWorkerSync(java_lease_spec), java_worker); } TEST_F(WorkerPoolDriverRegisteredTest, StartWorkerWithNodeIdArg) { - auto task_id = TaskID::FromRandom(JOB_ID); - TaskSpecification task_spec = ExampleTaskSpec( - ActorID::Nil(), Language::PYTHON, JOB_ID, ActorID::Nil(), {}, task_id); - ASSERT_NE(worker_pool_->PopWorkerSync(task_spec), nullptr); + auto lease_id = LeaseID::FromRandom(); + LeaseSpecification lease_spec = + ExampleLeaseSpec(ActorID::Nil(), Language::PYTHON, JOB_ID, {}, lease_id); + ASSERT_NE(worker_pool_->PopWorkerSync(lease_spec), nullptr); const auto real_command = worker_pool_->GetWorkerCommand(worker_pool_->LastStartedWorkerProcess()); @@ -755,10 +750,12 @@ TEST_F(WorkerPoolDriverRegisteredTest, StartWorkerWithDynamicOptionsCommand) { actor_jvm_options.end(), {"-Dmy-actor.hello=foo", "-Dmy-actor.world=bar", "-Xmx2g", "-Xms1g"}); JobID job_id = JobID::FromInt(12345); - auto task_id = TaskID::ForDriverTask(job_id); - auto actor_id = ActorID::Of(job_id, task_id, 1); - TaskSpecification task_spec = ExampleTaskSpec( - ActorID::Nil(), Language::JAVA, job_id, actor_id, actor_jvm_options, task_id); + auto actor_creation_id = ActorID::Of(job_id, TaskID::ForDriverTask(job_id), 1); + LeaseSpecification lease_spec = ExampleLeaseSpec(actor_creation_id, + Language::JAVA, + job_id, + actor_jvm_options, + LeaseID::FromRandom()); rpc::JobConfig job_config = rpc::JobConfig(); job_config.add_code_search_path("/test/code_search_path"); @@ -768,7 +765,7 @@ TEST_F(WorkerPoolDriverRegisteredTest, StartWorkerWithDynamicOptionsCommand) { job_config.add_jvm_options("-Dmy-job.foo=bar"); worker_pool_->HandleJobStarted(job_id, job_config); - ASSERT_NE(worker_pool_->PopWorkerSync(task_spec), nullptr); + 
ASSERT_NE(worker_pool_->PopWorkerSync(lease_spec), nullptr); const auto real_command = worker_pool_->GetWorkerCommand(worker_pool_->LastStartedWorkerProcess()); @@ -839,7 +836,7 @@ TEST_F(WorkerPoolDriverRegisteredTest, TestWorkerStartupKeepAliveDuration) { ASSERT_EQ(worker_pool_->GetProcessSize(), POOL_SIZE_SOFT_LIMIT + 2); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 0); - // The worker registered. There's no pending tasks so it becomes idle. + // The worker registered. There's no pending leases so it becomes idle. worker_pool_->PushWorkers(0, JOB_ID); ASSERT_EQ(worker_pool_->NumWorkersStarting(), 0); ASSERT_EQ(worker_pool_->GetProcessSize(), POOL_SIZE_SOFT_LIMIT + 2); @@ -880,9 +877,8 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerMultiTenancy) { // Make the first worker an actor worker. if (i == 0) { auto actor_creation_id = ActorID::Of(job_id, TaskID::ForDriverTask(job_id), 1); - auto task_spec = ExampleTaskSpec( - /*actor_id=*/ActorID::Nil(), Language::PYTHON, job_id, actor_creation_id); - runtime_env_hash = task_spec.GetRuntimeEnvHash(); + auto lease_spec = ExampleLeaseSpec(actor_creation_id, Language::PYTHON, job_id); + runtime_env_hash = lease_spec.GetRuntimeEnvHash(); } auto worker = worker_pool_->CreateWorker(Process::CreateNewDummy(), Language::PYTHON, @@ -899,19 +895,18 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerMultiTenancy) { // Pop workers for actor. for (auto job_id : job_ids) { auto actor_creation_id = ActorID::Of(job_id, TaskID::ForDriverTask(job_id), 1); - // Pop workers for actor creation tasks. - auto task_spec = ExampleTaskSpec( - /*actor_id=*/ActorID::Nil(), Language::PYTHON, job_id, actor_creation_id); - auto worker = worker_pool_->PopWorkerSync(task_spec); + // Pop workers for actor creation leases. 
+ auto lease_spec = ExampleLeaseSpec(actor_creation_id, Language::PYTHON, job_id); + auto worker = worker_pool_->PopWorkerSync(lease_spec); ASSERT_TRUE(worker); ASSERT_EQ(worker->GetAssignedJobId(), job_id); workers.push_back(worker); } - // Pop workers for normal tasks. + // Pop workers for normal leases. for (auto job_id : job_ids) { - auto task_spec = ExampleTaskSpec(ActorID::Nil(), Language::PYTHON, job_id); - auto worker = worker_pool_->PopWorkerSync(task_spec); + auto lease_spec = ExampleLeaseSpec(ActorID::Nil(), Language::PYTHON, job_id); + auto worker = worker_pool_->PopWorkerSync(lease_spec); ASSERT_TRUE(worker); ASSERT_EQ(worker->GetAssignedJobId(), job_id); workers.push_back(worker); @@ -931,8 +926,8 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerMultiTenancy) { } } -// Tests the worker assignment logic for task specs that have a root detached actor ID. -// These tasks: +// Tests the worker assignment logic for lease specs that have a root detached actor ID. +// These leases: // - Must be matched to workers that have a matching job ID (or no job ID). // - Must be matched to workers that have a matching detached actor ID (or no detached // actor ID). @@ -942,10 +937,11 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerForRequestWithRootDetachedActor) // NOTE: in all test cases the request has job_1_detached_actor_1 as its root detached // actor. 
- auto detached_actor_id_1_job_1 = ActorID::Of(job_1_id, TaskID::FromRandom(job_1_id), 0); - auto task_spec_job_1_detached_actor_1 = - ExampleTaskSpec(ActorID::Nil(), Language::PYTHON, job_1_id); - task_spec_job_1_detached_actor_1.GetMutableMessage().set_root_detached_actor_id( + auto detached_actor_id_1_job_1 = + ActorID::Of(job_1_id, TaskID::ForDriverTask(job_1_id), 0); + auto lease_spec_job_1_detached_actor_1 = + ExampleLeaseSpec(ActorID::Nil(), Language::PYTHON, job_1_id); + lease_spec_job_1_detached_actor_1.GetMutableMessage().set_root_detached_actor_id( detached_actor_id_1_job_1.Binary()); // Case 1 (match): @@ -954,7 +950,7 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerForRequestWithRootDetachedActor) Process::CreateNewDummy(), Language::PYTHON, JobID::Nil()); worker_pool_->PushWorker(worker_no_job_no_detached_actor); - ASSERT_EQ(worker_pool_->PopWorkerSync(task_spec_job_1_detached_actor_1), + ASSERT_EQ(worker_pool_->PopWorkerSync(lease_spec_job_1_detached_actor_1), worker_no_job_no_detached_actor); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 0); @@ -964,7 +960,7 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerForRequestWithRootDetachedActor) worker_pool_->CreateWorker(Process::CreateNewDummy(), Language::PYTHON, job_1_id); worker_pool_->PushWorker(worker_job_1_no_detached_actor); - ASSERT_EQ(worker_pool_->PopWorkerSync(task_spec_job_1_detached_actor_1), + ASSERT_EQ(worker_pool_->PopWorkerSync(lease_spec_job_1_detached_actor_1), worker_job_1_no_detached_actor); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 0); @@ -972,12 +968,12 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerForRequestWithRootDetachedActor) // worker has matching root detached actor ID and job ID auto worker_job_1_detached_actor_1 = worker_pool_->CreateWorker(Process::CreateNewDummy(), Language::PYTHON, job_1_id); - RayTask job_1_detached_actor_1_task(task_spec_job_1_detached_actor_1); - worker_job_1_detached_actor_1->SetAssignedTask(job_1_detached_actor_1_task); - 
worker_job_1_detached_actor_1->AssignTaskId(TaskID::Nil()); + RayLease job_1_detached_actor_1_lease(lease_spec_job_1_detached_actor_1); + worker_job_1_detached_actor_1->GrantLease(job_1_detached_actor_1_lease); + worker_job_1_detached_actor_1->GrantLeaseId(LeaseID::Nil()); worker_pool_->PushWorker(worker_job_1_detached_actor_1); - ASSERT_EQ(worker_pool_->PopWorkerSync(task_spec_job_1_detached_actor_1), + ASSERT_EQ(worker_pool_->PopWorkerSync(lease_spec_job_1_detached_actor_1), worker_job_1_detached_actor_1); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 0); @@ -987,7 +983,7 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerForRequestWithRootDetachedActor) worker_pool_->CreateWorker(Process::CreateNewDummy(), Language::PYTHON, job_2_id); worker_pool_->PushWorker(worker_job_2_no_detached_actor); - ASSERT_NE(worker_pool_->PopWorkerSync(task_spec_job_1_detached_actor_1), + ASSERT_NE(worker_pool_->PopWorkerSync(lease_spec_job_1_detached_actor_1), worker_job_2_no_detached_actor); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 1); worker_job_2_no_detached_actor->MarkDead(); @@ -998,17 +994,18 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerForRequestWithRootDetachedActor) // worker has mismatched detached actor ID and mismatched job ID auto worker_job_2_detached_actor_3 = worker_pool_->CreateWorker(Process::CreateNewDummy(), Language::PYTHON, job_2_id); - auto detached_actor_3_id_job_2 = ActorID::Of(job_2_id, TaskID::FromRandom(job_2_id), 0); - auto task_spec_job_2_detached_actor_3 = - ExampleTaskSpec(ActorID::Nil(), Language::PYTHON, job_2_id); - task_spec_job_2_detached_actor_3.GetMutableMessage().set_root_detached_actor_id( + auto detached_actor_3_id_job_2 = + ActorID::Of(job_2_id, TaskID::ForDriverTask(job_2_id), 0); + auto lease_spec_job_2_detached_actor_3 = + ExampleLeaseSpec(ActorID::Nil(), Language::PYTHON, job_2_id); + lease_spec_job_2_detached_actor_3.GetMutableMessage().set_root_detached_actor_id( detached_actor_3_id_job_2.Binary()); - RayTask 
job_2_detached_actor_3_task(task_spec_job_2_detached_actor_3); - worker_job_2_detached_actor_3->SetAssignedTask(job_2_detached_actor_3_task); - worker_job_2_detached_actor_3->AssignTaskId(TaskID::Nil()); + RayLease job_2_detached_actor_3_lease(lease_spec_job_2_detached_actor_3); + worker_job_2_detached_actor_3->GrantLease(job_2_detached_actor_3_lease); + worker_job_2_detached_actor_3->GrantLeaseId(LeaseID::Nil()); worker_pool_->PushWorker(worker_job_2_detached_actor_3); - ASSERT_NE(worker_pool_->PopWorkerSync(task_spec_job_1_detached_actor_1), + ASSERT_NE(worker_pool_->PopWorkerSync(lease_spec_job_1_detached_actor_1), worker_job_2_detached_actor_3); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 1); worker_job_2_detached_actor_3->MarkDead(); @@ -1019,17 +1016,18 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerForRequestWithRootDetachedActor) // worker has mismatched detached actor ID and matching job ID auto worker_job_1_detached_actor_2 = worker_pool_->CreateWorker(Process::CreateNewDummy(), Language::PYTHON, job_1_id); - auto detached_actor_id_2_job_1 = ActorID::Of(job_1_id, TaskID::FromRandom(job_1_id), 1); - auto task_spec_job_1_detached_actor_2 = - ExampleTaskSpec(ActorID::Nil(), Language::PYTHON, job_1_id); - task_spec_job_1_detached_actor_2.GetMutableMessage().set_root_detached_actor_id( + auto detached_actor_id_2_job_1 = + ActorID::Of(job_1_id, TaskID::ForDriverTask(job_1_id), 1); + auto lease_spec_job_1_detached_actor_2 = + ExampleLeaseSpec(ActorID::Nil(), Language::PYTHON, job_1_id); + lease_spec_job_1_detached_actor_2.GetMutableMessage().set_root_detached_actor_id( detached_actor_id_2_job_1.Binary()); - RayTask job_1_detached_actor_2_task(task_spec_job_1_detached_actor_2); - worker_job_1_detached_actor_2->SetAssignedTask(job_1_detached_actor_2_task); - worker_job_1_detached_actor_2->AssignTaskId(TaskID::Nil()); + RayLease job_1_detached_actor_2_lease(lease_spec_job_1_detached_actor_2); + 
worker_job_1_detached_actor_2->GrantLease(job_1_detached_actor_2_lease); + worker_job_1_detached_actor_2->GrantLeaseId(LeaseID::Nil()); worker_pool_->PushWorker(worker_job_1_detached_actor_2); - ASSERT_NE(worker_pool_->PopWorkerSync(task_spec_job_1_detached_actor_1), + ASSERT_NE(worker_pool_->PopWorkerSync(lease_spec_job_1_detached_actor_1), worker_job_1_detached_actor_2); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 1); worker_job_1_detached_actor_2->MarkDead(); @@ -1044,16 +1042,16 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerForRequestWithRootDetachedActor) // Test the worker pool logic regardless for completeness. auto worker_job_2_detached_actor_1 = worker_pool_->CreateWorker(Process::CreateNewDummy(), Language::PYTHON, job_2_id); - auto task_spec_job_2_detached_actor_1 = - ExampleTaskSpec(ActorID::Nil(), Language::PYTHON, job_2_id); - task_spec_job_2_detached_actor_1.GetMutableMessage().set_root_detached_actor_id( + auto lease_spec_job_2_detached_actor_1 = + ExampleLeaseSpec(ActorID::Nil(), Language::PYTHON, job_2_id); + lease_spec_job_2_detached_actor_1.GetMutableMessage().set_root_detached_actor_id( detached_actor_id_1_job_1.Binary()); - RayTask job_2_detached_actor_1_task(task_spec_job_2_detached_actor_1); - worker_job_2_detached_actor_1->SetAssignedTask(job_2_detached_actor_1_task); - worker_job_2_detached_actor_1->AssignTaskId(TaskID::Nil()); + RayLease job_2_detached_actor_1_lease(lease_spec_job_2_detached_actor_1); + worker_job_2_detached_actor_1->GrantLease(job_2_detached_actor_1_lease); + worker_job_2_detached_actor_1->GrantLeaseId(LeaseID::Nil()); worker_pool_->PushWorker(worker_job_2_detached_actor_1); - ASSERT_NE(worker_pool_->PopWorkerSync(task_spec_job_1_detached_actor_1), + ASSERT_NE(worker_pool_->PopWorkerSync(lease_spec_job_1_detached_actor_1), worker_job_2_detached_actor_1); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 1); worker_job_2_detached_actor_1->MarkDead(); @@ -1062,7 +1060,7 @@ TEST_F(WorkerPoolDriverRegisteredTest, 
PopWorkerForRequestWithRootDetachedActor) } // Tests the worker assignment logic for workers that have a root detached actor ID -// but tasks that *don't* have one. +// but leases that *don't* have one. // // Workers with a root detached actor ID can be used so long as their job ID matches // or hasn't been assigned yet. @@ -1073,63 +1071,65 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerWithRootDetachedActorID) { // NOTE: in all test cases the only worker in the pool is worker_job_1_detached_actor_1. auto worker_job_1_detached_actor_1 = worker_pool_->CreateWorker(Process::CreateNewDummy(), Language::PYTHON, job_1_id); - auto task_spec_job_1_detached_actor_1 = - ExampleTaskSpec(ActorID::Nil(), Language::PYTHON, job_1_id); - auto detached_actor_id_1_job_1 = ActorID::Of(job_1_id, TaskID::FromRandom(job_1_id), 0); - task_spec_job_1_detached_actor_1.GetMutableMessage().set_root_detached_actor_id( + auto lease_spec_job_1_detached_actor_1 = + ExampleLeaseSpec(ActorID::Nil(), Language::PYTHON, job_1_id); + auto detached_actor_id_1_job_1 = + ActorID::Of(job_1_id, TaskID::ForDriverTask(job_1_id), 0); + lease_spec_job_1_detached_actor_1.GetMutableMessage().set_root_detached_actor_id( detached_actor_id_1_job_1.Binary()); - RayTask job_1_detached_actor_1_task(task_spec_job_1_detached_actor_1); - worker_job_1_detached_actor_1->SetAssignedTask(job_1_detached_actor_1_task); - worker_job_1_detached_actor_1->AssignTaskId(TaskID::Nil()); + RayLease job_1_detached_actor_1_lease(lease_spec_job_1_detached_actor_1); + worker_job_1_detached_actor_1->GrantLease(job_1_detached_actor_1_lease); + worker_job_1_detached_actor_1->GrantLeaseId(LeaseID::Nil()); // Case 1 (match): // request has no root detached actor ID and matching job ID - auto task_spec_job_1_no_detached_actor = - ExampleTaskSpec(ActorID::Nil(), Language::PYTHON, job_1_id); + auto lease_spec_job_1_no_detached_actor = + ExampleLeaseSpec(ActorID::Nil(), Language::PYTHON, job_1_id); 
worker_pool_->PushWorker(worker_job_1_detached_actor_1); - ASSERT_EQ(worker_pool_->PopWorkerSync(task_spec_job_1_no_detached_actor), + ASSERT_EQ(worker_pool_->PopWorkerSync(lease_spec_job_1_no_detached_actor), worker_job_1_detached_actor_1); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 0); // Case 2 (match): // request has matching root detached actor ID and matching job ID worker_pool_->PushWorker(worker_job_1_detached_actor_1); - ASSERT_EQ(worker_pool_->PopWorkerSync(task_spec_job_1_detached_actor_1), + ASSERT_EQ(worker_pool_->PopWorkerSync(lease_spec_job_1_detached_actor_1), worker_job_1_detached_actor_1); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 0); // Case 3 (mismatch): // request has no root detached actor ID and mismatched job ID - auto task_spec_job_2_no_detached_actor = - ExampleTaskSpec(ActorID::Nil(), Language::PYTHON, job_2_id); + auto lease_spec_job_2_no_detached_actor = + ExampleLeaseSpec(ActorID::Nil(), Language::PYTHON, job_2_id); worker_pool_->PushWorker(worker_job_1_detached_actor_1); - ASSERT_NE(worker_pool_->PopWorkerSync(task_spec_job_2_no_detached_actor), + ASSERT_NE(worker_pool_->PopWorkerSync(lease_spec_job_2_no_detached_actor), worker_job_1_detached_actor_1); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 1); // Case 4 (mismatch): // request has mismatched root detached actor ID and mismatched job ID - auto task_spec_job_2_detached_actor_2 = - ExampleTaskSpec(ActorID::Nil(), Language::PYTHON, job_2_id); - auto job_2_detached_actor_2_id = ActorID::Of(job_2_id, TaskID::FromRandom(job_2_id), 0); - task_spec_job_2_detached_actor_2.GetMutableMessage().set_root_detached_actor_id( + auto lease_spec_job_2_detached_actor_2 = + ExampleLeaseSpec(ActorID::Nil(), Language::PYTHON, job_2_id); + auto job_2_detached_actor_2_id = + ActorID::Of(job_2_id, TaskID::ForDriverTask(job_2_id), 0); + lease_spec_job_2_detached_actor_2.GetMutableMessage().set_root_detached_actor_id( job_2_detached_actor_2_id.Binary()); - 
ASSERT_NE(worker_pool_->PopWorkerSync(task_spec_job_2_detached_actor_2), + ASSERT_NE(worker_pool_->PopWorkerSync(lease_spec_job_2_detached_actor_2), worker_job_1_detached_actor_1); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 1); } TEST_F(WorkerPoolDriverRegisteredTest, MaximumStartupConcurrency) { - auto task_spec = ExampleTaskSpec(); + auto lease_spec = ExampleLeaseSpec(); std::vector started_processes; // Try to pop some workers. Some worker processes will be started. for (int i = 0; i < MAXIMUM_STARTUP_CONCURRENCY; i++) { worker_pool_->PopWorker( - task_spec, + lease_spec, [](const std::shared_ptr worker, PopWorkerStatus status, const std::string &runtime_env_setup_error_message) -> bool { return true; }); @@ -1143,7 +1143,7 @@ TEST_F(WorkerPoolDriverRegisteredTest, MaximumStartupConcurrency) { // Can't start a new worker process at this point. worker_pool_->PopWorker( - task_spec, + lease_spec, [](const std::shared_ptr worker, PopWorkerStatus status, const std::string &runtime_env_setup_error_message) -> bool { return true; }); @@ -1171,7 +1171,7 @@ TEST_F(WorkerPoolDriverRegisteredTest, MaximumStartupConcurrency) { // Can't start a new worker process at this point. ASSERT_EQ(MAXIMUM_STARTUP_CONCURRENCY, worker_pool_->NumWorkersStarting()); worker_pool_->PopWorker( - task_spec, + lease_spec, [](const std::shared_ptr worker, PopWorkerStatus status, const std::string &runtime_env_setup_error_message) -> bool { return true; }); @@ -1191,7 +1191,7 @@ TEST_F(WorkerPoolDriverRegisteredTest, MaximumStartupConcurrency) { // Can't start a new worker process at this point. 
worker_pool_->PopWorker( - task_spec, + lease_spec, [](const std::shared_ptr worker, PopWorkerStatus status, const std::string &runtime_env_setup_error_message) -> bool { return true; }); @@ -1248,7 +1248,7 @@ TEST_F(WorkerPoolDriverRegisteredTest, HandleIOWorkersPushPop) { spill_workers.insert(CreateSpillWorker(Process())); spill_workers.insert(CreateSpillWorker(Process())); // Add the workers to the pool. - // 2 pending tasks / 2 new idle workers. + // 2 pending leases / 2 new idle workers. for (const auto &worker : spill_workers) { auto status = PopWorkerStatus::OK; auto [proc, token] = worker_pool_->StartWorkerProcess( @@ -1276,7 +1276,7 @@ TEST_F(WorkerPoolDriverRegisteredTest, HandleIOWorkersPushPop) { worker_pool_->OnWorkerStarted(worker); } // Now push back to used workers - // 0 pending task, 3 idle workers. + // 0 pending lease, 3 idle workers. for (const auto &worker : spill_workers) { worker_pool_->PushSpillWorker(worker); } @@ -1497,20 +1497,18 @@ TEST_F(WorkerPoolDriverRegisteredTest, TestWorkerCapping) { /// std::vector> popped_workers; for (int i = 0; i < num_workers; i++) { - // Pop workers for actor creation tasks. - auto task_spec = - ExampleTaskSpec(/*actor_id=*/ActorID::Nil(), Language::PYTHON, job_id); - auto worker = worker_pool_->PopWorkerSync(task_spec, false); - // Simulate running the task and finish. This is to set task_assign_time_. - RayTask task(task_spec); - worker->SetAssignedTask(task); - worker->AssignTaskId(TaskID::Nil()); - + // Pop workers for actor creation leases. + auto lease_spec = ExampleLeaseSpec( + /*actor_id=*/ActorID::Nil(), Language::PYTHON, job_id, {}, LeaseID::FromRandom()); + auto worker = worker_pool_->PopWorkerSync(lease_spec, false); + // Simulate granting the lease and finish. This is to set lease_grant_time_. 
+ RayLease lease(lease_spec); + worker->GrantLease(lease); popped_workers.push_back(worker); ASSERT_TRUE(worker); ASSERT_EQ(worker->GetAssignedJobId(), job_id); } - // After scheduling an actor and task, there's no more idle worker. + // After granting a lease to each worker, there should be no idle workers. ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 0); /// @@ -1518,6 +1516,7 @@ TEST_F(WorkerPoolDriverRegisteredTest, TestWorkerCapping) { /// // Return all workers. for (const auto &worker : popped_workers) { + worker->GrantLeaseId(LeaseID::Nil()); worker_pool_->PushWorker(worker); } ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), num_workers); @@ -1531,8 +1530,8 @@ TEST_F(WorkerPoolDriverRegisteredTest, TestWorkerCapping) { ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), POOL_SIZE_SOFT_LIMIT); // The first core worker exits, so one of idle workers should've been killed. - // Since the idle workers are killed in FIFO, we can assume the first entry in the idle - // workers will be killed. + // Since the idle workers are killed in FIFO if they've been granted a lease, we can + // assume the first entry in the idle workers will be killed. auto mock_rpc_client_it = mock_worker_rpc_clients_.find(popped_workers[0]->WorkerId()); ASSERT_EQ(mock_rpc_client_it->second->exit_count, 1) << " expected pid " << popped_workers[0]->GetProcess().GetId(); @@ -1719,10 +1718,11 @@ TEST_F(WorkerPoolDriverRegisteredTest, TestJobFinishedForPopWorker) { // Finish the job. worker_pool_->HandleJobFinished(job_id); - auto task_spec = ExampleTaskSpec(/*actor_id=*/ActorID::Nil(), Language::PYTHON, job_id); + auto lease_spec = + ExampleLeaseSpec(/*actor_creation_id=*/ActorID::Nil(), Language::PYTHON, job_id); PopWorkerStatus pop_worker_status; // This PopWorker should fail since the job finished. 
- worker = worker_pool_->PopWorkerSync(task_spec, false, &pop_worker_status); + worker = worker_pool_->PopWorkerSync(lease_spec, false, &pop_worker_status); ASSERT_EQ(pop_worker_status, PopWorkerStatus::JobFinished); ASSERT_FALSE(worker); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 1); @@ -1735,12 +1735,13 @@ TEST_F(WorkerPoolDriverRegisteredTest, TestJobFinishedForPopWorker) { job_id = JOB_ID_2; rpc::JobConfig job_config; RegisterDriver(Language::PYTHON, job_id, job_config); - task_spec = ExampleTaskSpec(/*actor_id=*/ActorID::Nil(), Language::PYTHON, job_id); + lease_spec = + ExampleLeaseSpec(/*actor_creation_id=*/ActorID::Nil(), Language::PYTHON, job_id); pop_worker_status = PopWorkerStatus::OK; // This will start a new worker. std::promise promise; worker_pool_->PopWorker( - task_spec, + lease_spec, [&](const std::shared_ptr worker, PopWorkerStatus status, const std::string &runtime_env_setup_error_message) -> bool { @@ -1795,9 +1796,10 @@ TEST_F(WorkerPoolDriverRegisteredTest, TestJobFinishedForceKillIdleWorker) { worker_pool_->PushWorker(worker); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 1); - /// Execute some task with the worker. - auto task_spec = ExampleTaskSpec(/*actor_id=*/ActorID::Nil(), Language::PYTHON, job_id); - worker = worker_pool_->PopWorkerSync(task_spec, false); + /// Grant some lease with the worker. + auto lease_spec = + ExampleLeaseSpec(/*actor_creation_id=*/ActorID::Nil(), Language::PYTHON, job_id); + worker = worker_pool_->PopWorkerSync(lease_spec, false); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 0); /// Return the worker. 
@@ -1888,41 +1890,39 @@ TEST_F(WorkerPoolDriverRegisteredTest, TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerWithRuntimeEnv) { ASSERT_EQ(worker_pool_->GetProcessSize(), 0); auto actor_creation_id = ActorID::Of(JOB_ID, TaskID::ForDriverTask(JOB_ID), 1); - const auto actor_creation_task_spec = ExampleTaskSpec(ActorID::Nil(), - Language::PYTHON, - JOB_ID, - actor_creation_id, - {"XXX=YYY"}, - TaskID::FromRandom(JobID::Nil()), - ExampleRuntimeEnvInfo({"XXX"})); - const auto normal_task_spec = ExampleTaskSpec(ActorID::Nil(), - Language::PYTHON, - JOB_ID, - ActorID::Nil(), - {"XXX=YYY"}, - TaskID::FromRandom(JobID::Nil()), - ExampleRuntimeEnvInfo({"XXX"})); - const auto normal_task_spec_without_runtime_env = - ExampleTaskSpec(ActorID::Nil(), Language::PYTHON, JOB_ID, ActorID::Nil(), {}); - // Pop worker for actor creation task again. - auto popped_worker = worker_pool_->PopWorkerSync(actor_creation_task_spec); + const auto actor_creation_lease_spec = ExampleLeaseSpec(actor_creation_id, + Language::PYTHON, + JOB_ID, + {"XXX=YYY"}, + LeaseID::FromRandom(), + ExampleRuntimeEnvInfo({"XXX"})); + const auto normal_lease_spec = ExampleLeaseSpec(actor_creation_id, + Language::PYTHON, + JOB_ID, + {"XXX=YYY"}, + LeaseID::FromRandom(), + ExampleRuntimeEnvInfo({"XXX"})); + const auto normal_lease_spec_without_runtime_env = + ExampleLeaseSpec(ActorID::Nil(), Language::PYTHON, JOB_ID, {}); + // Pop worker for actor creation lease again. + auto popped_worker = worker_pool_->PopWorkerSync(actor_creation_lease_spec); // Got a worker with correct runtime env hash. ASSERT_NE(popped_worker, nullptr); ASSERT_EQ(popped_worker->GetRuntimeEnvHash(), - actor_creation_task_spec.GetRuntimeEnvHash()); + actor_creation_lease_spec.GetRuntimeEnvHash()); ASSERT_EQ(worker_pool_->GetProcessSize(), 1); - // Pop worker for normal task. - popped_worker = worker_pool_->PopWorkerSync(normal_task_spec); + // Pop worker for normal lease. 
+ popped_worker = worker_pool_->PopWorkerSync(normal_lease_spec); // Got a worker with correct runtime env hash. ASSERT_NE(popped_worker, nullptr); - ASSERT_EQ(popped_worker->GetRuntimeEnvHash(), normal_task_spec.GetRuntimeEnvHash()); + ASSERT_EQ(popped_worker->GetRuntimeEnvHash(), normal_lease_spec.GetRuntimeEnvHash()); ASSERT_EQ(worker_pool_->GetProcessSize(), 2); - // Pop worker for normal task without runtime env. - popped_worker = worker_pool_->PopWorkerSync(normal_task_spec_without_runtime_env); + // Pop worker for normal lease without runtime env. + popped_worker = worker_pool_->PopWorkerSync(normal_lease_spec_without_runtime_env); // Got a worker with correct runtime env hash. ASSERT_NE(popped_worker, nullptr); ASSERT_EQ(popped_worker->GetRuntimeEnvHash(), - normal_task_spec_without_runtime_env.GetRuntimeEnvHash()); + normal_lease_spec_without_runtime_env.GetRuntimeEnvHash()); ASSERT_EQ(worker_pool_->GetProcessSize(), 3); } @@ -1971,25 +1971,22 @@ TEST_F(WorkerPoolDriverRegisteredTest, RuntimeEnvUriReferenceWorkerLevel) { ASSERT_EQ(GetReferenceCount(runtime_env_info.serialized_runtime_env()), 1); // Start actor with runtime env. auto actor_creation_id = ActorID::Of(job_id, TaskID::ForDriverTask(job_id), 1); - const auto actor_creation_task_spec = - ExampleTaskSpec(ActorID::Nil(), - Language::PYTHON, - job_id, - actor_creation_id, - {"XXX=YYY"}, - TaskID::FromRandom(JobID::Nil()), - runtime_env_info); - auto popped_actor_worker = worker_pool_->PopWorkerSync(actor_creation_task_spec); + const auto actor_creation_lease_spec = ExampleLeaseSpec(actor_creation_id, + Language::PYTHON, + job_id, + {"XXX=YYY"}, + LeaseID::FromRandom(), + runtime_env_info); + auto popped_actor_worker = worker_pool_->PopWorkerSync(actor_creation_lease_spec); ASSERT_EQ(GetReferenceCount(runtime_env_info.serialized_runtime_env()), 2); - // Start task with runtime env. 
- const auto normal_task_spec = ExampleTaskSpec(ActorID::Nil(), - Language::PYTHON, - job_id, - ActorID::Nil(), - {"XXX=YYY"}, - TaskID::FromRandom(JobID::Nil()), - runtime_env_info); - auto popped_normal_worker = worker_pool_->PopWorkerSync(actor_creation_task_spec); + // Start lease with runtime env. + const auto normal_lease_spec = ExampleLeaseSpec(ActorID::Nil(), + Language::PYTHON, + job_id, + {"XXX=YYY"}, + LeaseID::FromRandom(), + runtime_env_info); + auto popped_normal_worker = worker_pool_->PopWorkerSync(actor_creation_lease_spec); ASSERT_EQ(GetReferenceCount(runtime_env_info.serialized_runtime_env()), 3); // Disconnect actor worker. worker_pool_->DisconnectWorker(popped_actor_worker, @@ -2018,18 +2015,16 @@ TEST_F(WorkerPoolDriverRegisteredTest, RuntimeEnvUriReferenceWorkerLevel) { ASSERT_EQ(GetReferenceCount(runtime_env_info.serialized_runtime_env()), 0); // Start actor with runtime env. auto actor_creation_id = ActorID::Of(job_id, TaskID::ForDriverTask(job_id), 2); - const auto actor_creation_task_spec = - ExampleTaskSpec(ActorID::Nil(), - Language::PYTHON, - job_id, - actor_creation_id, - {"XXX=YYY"}, - TaskID::FromRandom(JobID::Nil()), - runtime_env_info); - auto popped_actor_worker = worker_pool_->PopWorkerSync(actor_creation_task_spec); + const auto actor_creation_lease_spec = ExampleLeaseSpec(actor_creation_id, + Language::PYTHON, + job_id, + {"XXX=YYY"}, + LeaseID::FromRandom(), + runtime_env_info); + auto popped_actor_worker = worker_pool_->PopWorkerSync(actor_creation_lease_spec); ASSERT_EQ(GetReferenceCount(runtime_env_info.serialized_runtime_env()), 1); - // Start task with runtime env. - auto popped_normal_worker = worker_pool_->PopWorkerSync(actor_creation_task_spec); + // Start lease with runtime env. + auto popped_normal_worker = worker_pool_->PopWorkerSync(actor_creation_lease_spec); ASSERT_EQ(GetReferenceCount(runtime_env_info.serialized_runtime_env()), 2); // Disconnect actor worker. 
worker_pool_->DisconnectWorker(popped_actor_worker, @@ -2049,36 +2044,33 @@ TEST_F(WorkerPoolDriverRegisteredTest, CacheWorkersByRuntimeEnvHash) { /// /// Check that a worker can be popped only if there is a /// worker available whose runtime env matches the runtime env - /// in the task spec. + /// in the lease spec. /// ASSERT_EQ(worker_pool_->GetProcessSize(), 0); auto actor_creation_id = ActorID::Of(JOB_ID, TaskID::ForDriverTask(JOB_ID), 1); - const auto actor_creation_task_spec_1 = - ExampleTaskSpec(ActorID::Nil(), - Language::PYTHON, - JOB_ID, - actor_creation_id, - /*dynamic_worker_options=*/{}, - TaskID::FromRandom(JobID::Nil()), - ExampleRuntimeEnvInfoFromString("mock_runtime_env_1")); - const auto task_spec_1 = - ExampleTaskSpec(ActorID::Nil(), - Language::PYTHON, - JOB_ID, - ActorID::Nil(), - /*dynamic_worker_options=*/{}, - TaskID::FromRandom(JobID::Nil()), - ExampleRuntimeEnvInfoFromString("mock_runtime_env_1")); - const auto task_spec_2 = - ExampleTaskSpec(ActorID::Nil(), - Language::PYTHON, - JOB_ID, - ActorID::Nil(), - /*dynamic_worker_options=*/{}, - TaskID::FromRandom(JobID::Nil()), - ExampleRuntimeEnvInfoFromString("mock_runtime_env_2")); - - const int runtime_env_hash_1 = actor_creation_task_spec_1.GetRuntimeEnvHash(); + const auto actor_creation_lease_spec_1 = + ExampleLeaseSpec(actor_creation_id, + Language::PYTHON, + JOB_ID, + /*dynamic_worker_options=*/{}, + LeaseID::FromRandom(), + ExampleRuntimeEnvInfoFromString("mock_runtime_env_1")); + const auto lease_spec_1 = + ExampleLeaseSpec(ActorID::Nil(), + Language::PYTHON, + JOB_ID, + /*dynamic_worker_options=*/{}, + LeaseID::FromRandom(), + ExampleRuntimeEnvInfoFromString("mock_runtime_env_1")); + const auto lease_spec_2 = + ExampleLeaseSpec(ActorID::Nil(), + Language::PYTHON, + JOB_ID, + /*dynamic_worker_options=*/{}, + LeaseID::FromRandom(), + ExampleRuntimeEnvInfoFromString("mock_runtime_env_2")); + + const int runtime_env_hash_1 = actor_creation_lease_spec_1.GetRuntimeEnvHash(); // Push 
worker with runtime env 1. auto worker = worker_pool_->CreateWorker(Process::CreateNewDummy(), @@ -2088,14 +2080,14 @@ TEST_F(WorkerPoolDriverRegisteredTest, CacheWorkersByRuntimeEnvHash) { runtime_env_hash_1); worker_pool_->PushWorker(worker); - // Try to pop worker for task with runtime env 2. - auto popped_worker = worker_pool_->PopWorkerSync(task_spec_2); + // Try to pop worker for lease with runtime env 2. + auto popped_worker = worker_pool_->PopWorkerSync(lease_spec_2); // Check that popped worker isn't the one we pushed. ASSERT_NE(popped_worker, nullptr); ASSERT_NE(popped_worker, worker); - // Try to pop the worker for task with runtime env 1. - popped_worker = worker_pool_->PopWorkerSync(task_spec_1); + // Try to pop the worker for lease with runtime env 1. + popped_worker = worker_pool_->PopWorkerSync(lease_spec_1); ASSERT_EQ(popped_worker, worker); // Push another worker with runtime env 1. @@ -2107,7 +2099,7 @@ TEST_F(WorkerPoolDriverRegisteredTest, CacheWorkersByRuntimeEnvHash) { worker_pool_->PushWorker(worker); // Try to pop the worker for an actor with runtime env 1. - popped_worker = worker_pool_->PopWorkerSync(actor_creation_task_spec_1); + popped_worker = worker_pool_->PopWorkerSync(actor_creation_lease_spec_1); // Check that we got the pushed worker. ASSERT_EQ(popped_worker, worker); worker_pool_->ClearProcesses(); @@ -2115,10 +2107,10 @@ TEST_F(WorkerPoolDriverRegisteredTest, CacheWorkersByRuntimeEnvHash) { TEST_F(WorkerPoolDriverRegisteredTest, WorkerNoLeaks) { std::shared_ptr popped_worker; - const auto task_spec = ExampleTaskSpec(); + const auto lease_spec = ExampleLeaseSpec(); // Pop a worker and don't dispatch. - worker_pool_->PopWorker(task_spec, + worker_pool_->PopWorker(lease_spec, [](const std::shared_ptr worker, PopWorkerStatus status, const std::string &runtime_env_setup_error_message) -> bool { @@ -2130,11 +2122,11 @@ TEST_F(WorkerPoolDriverRegisteredTest, WorkerNoLeaks) { // No idle workers because no workers pushed. 
ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 0); // push workers. - worker_pool_->PushWorkers(0, task_spec.JobId()); + worker_pool_->PushWorkers(0, lease_spec.JobId()); // The worker has been pushed but not dispatched. ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 1); // Pop a worker and don't dispatch. - worker_pool_->PopWorker(task_spec, + worker_pool_->PopWorker(lease_spec, [](const std::shared_ptr worker, PopWorkerStatus status, const std::string &runtime_env_setup_error_message) -> bool { @@ -2145,7 +2137,7 @@ TEST_F(WorkerPoolDriverRegisteredTest, WorkerNoLeaks) { ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 1); ASSERT_EQ(worker_pool_->GetProcessSize(), 1); // Pop a worker and dispatch. - worker_pool_->PopWorker(task_spec, + worker_pool_->PopWorker(lease_spec, [](const std::shared_ptr worker, PopWorkerStatus status, const std::string &runtime_env_setup_error_message) -> bool { @@ -2163,56 +2155,54 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerStatus) { PopWorkerStatus status; /* Test PopWorkerStatus JobConfigMissing */ - // Create a task by unregistered job id. + // Create a lease by unregistered job id. auto job_id = JobID::FromInt(123); - auto task_spec = ExampleTaskSpec(ActorID::Nil(), Language::PYTHON, job_id); - popped_worker = worker_pool_->PopWorkerSync(task_spec, true, &status); + auto lease_spec = ExampleLeaseSpec(ActorID::Nil(), Language::PYTHON, job_id); + popped_worker = worker_pool_->PopWorkerSync(lease_spec, true, &status); // PopWorker failed and the status is `JobConfigMissing`. ASSERT_EQ(popped_worker, nullptr); ASSERT_EQ(status, PopWorkerStatus::JobConfigMissing); // Register driver fot the job. RegisterDriver(Language::PYTHON, job_id); - popped_worker = worker_pool_->PopWorkerSync(task_spec, true, &status); + popped_worker = worker_pool_->PopWorkerSync(lease_spec, true, &status); // PopWorker success. 
ASSERT_NE(popped_worker, nullptr); ASSERT_EQ(status, PopWorkerStatus::OK); /* Test PopWorkerStatus RuntimeEnvCreationFailed */ - // Create a task with bad runtime env. - const auto task_spec_with_bad_runtime_env = - ExampleTaskSpec(ActorID::Nil(), - Language::PYTHON, - job_id, - ActorID::Nil(), - {"XXX=YYY"}, - TaskID::FromRandom(JobID::Nil()), - ExampleRuntimeEnvInfoFromString(std::string(kBadRuntimeEnv))); + // Create a lease with bad runtime env. + const auto lease_spec_with_bad_runtime_env = + ExampleLeaseSpec(ActorID::Nil(), + Language::PYTHON, + job_id, + {"XXX=YYY"}, + LeaseID::FromRandom(), + ExampleRuntimeEnvInfoFromString(std::string(kBadRuntimeEnv))); std::string error_msg; popped_worker = worker_pool_->PopWorkerSync( - task_spec_with_bad_runtime_env, true, &status, 0, &error_msg); + lease_spec_with_bad_runtime_env, true, &status, 0, &error_msg); // PopWorker failed and the status is `RuntimeEnvCreationFailed`. ASSERT_EQ(popped_worker, nullptr); ASSERT_EQ(status, PopWorkerStatus::RuntimeEnvCreationFailed); ASSERT_EQ(error_msg, kBadRuntimeEnvErrorMsg); - // Create a task with available runtime env. - const auto task_spec_with_runtime_env = - ExampleTaskSpec(ActorID::Nil(), - Language::PYTHON, - job_id, - ActorID::Nil(), - {"XXX=YYY"}, - TaskID::FromRandom(JobID::Nil()), - ExampleRuntimeEnvInfo({"XXX"})); - popped_worker = worker_pool_->PopWorkerSync(task_spec_with_runtime_env, true, &status); + // Create a lease with available runtime env. + const auto lease_spec_with_runtime_env = + ExampleLeaseSpec(ActorID::Nil(), + Language::PYTHON, + job_id, + {"XXX=YYY"}, + LeaseID::FromRandom(), + ExampleRuntimeEnvInfo({"XXX"})); + popped_worker = worker_pool_->PopWorkerSync(lease_spec_with_runtime_env, true, &status); // PopWorker success. ASSERT_NE(popped_worker, nullptr); ASSERT_EQ(status, PopWorkerStatus::OK); /* Test PopWorkerStatus WorkerPendingRegistration */ - // Create a task without push worker. 
- popped_worker = worker_pool_->PopWorkerSync(task_spec, false, &status); + // Create a lease without push worker. + popped_worker = worker_pool_->PopWorkerSync(lease_spec, false, &status); ASSERT_EQ(popped_worker, nullptr); // PopWorker failed while the timer was triggered and the status is // `WorkerPendingRegistration`. @@ -2223,9 +2213,9 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerStatus) { TEST_F(WorkerPoolDriverRegisteredTest, WorkerPendingRegistrationErasesRequest) { std::shared_ptr popped_worker; PopWorkerStatus status; - auto task_spec = ExampleTaskSpec(); - // Create a task without push worker. It should time out (WorkerPendingRegistration). - popped_worker = worker_pool_->PopWorkerSync(task_spec, false, &status); + auto lease_spec = ExampleLeaseSpec(); + // Create a lease without push worker. It should time out (WorkerPendingRegistration). + popped_worker = worker_pool_->PopWorkerSync(lease_spec, false, &status); ASSERT_EQ(popped_worker, nullptr); ASSERT_EQ(status, PopWorkerStatus::WorkerPendingRegistration); // The request should be erased. @@ -2345,14 +2335,14 @@ TEST_F(WorkerPoolDriverRegisteredTest, TestIOWorkerFailureAndSpawn) { } TEST_F(WorkerPoolDriverRegisteredTest, WorkerReuseForPrestartedWorker) { - const auto task_spec = ExampleTaskSpec(); - worker_pool_->PrestartWorkersInternal(task_spec, /*num_needed=*/1); - worker_pool_->PushWorkers(0, task_spec.JobId()); + const auto lease_spec = ExampleLeaseSpec(); + worker_pool_->PrestartWorkersInternal(lease_spec, /*num_needed=*/1); + worker_pool_->PushWorkers(0, lease_spec.JobId()); // One worker process has been prestarted. ASSERT_EQ(worker_pool_->GetProcessSize(), 1); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 1); // Pop a worker and don't dispatch. - auto popped_worker = worker_pool_->PopWorkerSync(task_spec); + auto popped_worker = worker_pool_->PopWorkerSync(lease_spec); ASSERT_NE(popped_worker, nullptr); // no new worker started since we can reuse the cached worker. 
ASSERT_EQ(worker_pool_->GetProcessSize(), 1); @@ -2361,17 +2351,17 @@ TEST_F(WorkerPoolDriverRegisteredTest, WorkerReuseForPrestartedWorker) { } TEST_F(WorkerPoolDriverRegisteredTest, WorkerReuseForSameJobId) { - const auto task_spec = ExampleTaskSpec(); + const auto lease_spec = ExampleLeaseSpec(); // start one worker - auto popped_worker = worker_pool_->PopWorkerSync(task_spec); + auto popped_worker = worker_pool_->PopWorkerSync(lease_spec); ASSERT_NE(popped_worker, nullptr); ASSERT_EQ(worker_pool_->GetProcessSize(), 1); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 0); worker_pool_->PushWorker(popped_worker); // start a new worker withe same job_id resuse the same worker. - auto popped_worker1 = worker_pool_->PopWorkerSync(task_spec); + auto popped_worker1 = worker_pool_->PopWorkerSync(lease_spec); ASSERT_NE(popped_worker1, nullptr); ASSERT_EQ(popped_worker1, popped_worker); ASSERT_EQ(worker_pool_->GetProcessSize(), 1); @@ -2379,11 +2369,11 @@ TEST_F(WorkerPoolDriverRegisteredTest, WorkerReuseForSameJobId) { } TEST_F(WorkerPoolDriverRegisteredTest, WorkerReuseFailureForDifferentJobId) { - const auto task_spec = ExampleTaskSpec(); - const auto task_spec1 = ExampleTaskSpec(ActorID::Nil(), Language::PYTHON, JOB_ID_2); + const auto lease_spec = ExampleLeaseSpec(); + const auto lease_spec1 = ExampleLeaseSpec(ActorID::Nil(), Language::PYTHON, JOB_ID_2); // start one worker - auto popped_worker = worker_pool_->PopWorkerSync(task_spec); + auto popped_worker = worker_pool_->PopWorkerSync(lease_spec); ASSERT_NE(popped_worker, nullptr); ASSERT_EQ(worker_pool_->GetProcessSize(), 1); ASSERT_EQ(worker_pool_->GetIdleWorkerSize(), 0); @@ -2392,7 +2382,7 @@ TEST_F(WorkerPoolDriverRegisteredTest, WorkerReuseFailureForDifferentJobId) { RegisterDriver(Language::PYTHON, JOB_ID_2); // start a new worker with different job_id requires a new worker. 
- auto popped_worker1 = worker_pool_->PopWorkerSync(task_spec1); + auto popped_worker1 = worker_pool_->PopWorkerSync(lease_spec1); ASSERT_NE(popped_worker1, nullptr); ASSERT_NE(popped_worker1, popped_worker); ASSERT_EQ(worker_pool_->GetProcessSize(), 2); @@ -2402,7 +2392,6 @@ TEST_F(WorkerPoolDriverRegisteredTest, WorkerReuseFailureForDifferentJobId) { TEST_F(WorkerPoolTest, RegisterFirstPythonDriverWaitForWorkerStart) { auto driver = worker_pool_->CreateWorker(Process::CreateNewDummy(), Language::PYTHON, JOB_ID); - driver->AssignTaskId(TaskID::ForDriverTask(JOB_ID)); bool callback_called = false; auto callback = [callback_called_ptr = &callback_called](Status, int) mutable { *callback_called_ptr = true; @@ -2414,7 +2403,6 @@ TEST_F(WorkerPoolTest, RegisterFirstPythonDriverWaitForWorkerStart) { TEST_F(WorkerPoolTest, RegisterSecondPythonDriverCallbackImmediately) { auto driver = worker_pool_->CreateWorker(Process::CreateNewDummy(), Language::PYTHON, JOB_ID); - driver->AssignTaskId(TaskID::ForDriverTask(JOB_ID)); RAY_CHECK_OK( worker_pool_->RegisterDriver(driver, rpc::JobConfig(), [](Status, int) {})); @@ -2424,7 +2412,6 @@ TEST_F(WorkerPoolTest, RegisterSecondPythonDriverCallbackImmediately) { }; auto second_driver = worker_pool_->CreateWorker(Process::CreateNewDummy(), Language::PYTHON, JOB_ID); - second_driver->AssignTaskId(TaskID::ForDriverTask(JOB_ID)); RAY_CHECK_OK(worker_pool_->RegisterDriver(second_driver, rpc::JobConfig(), callback)); ASSERT_TRUE(callback_called); } @@ -2433,7 +2420,6 @@ TEST_F(WorkerPoolTest, RegisterFirstJavaDriverCallbackImmediately) { auto driver = worker_pool_->CreateWorker(Process::CreateNewDummy(), Language::JAVA, JOB_ID); - driver->AssignTaskId(TaskID::ForDriverTask(JOB_ID)); bool callback_called = false; auto callback = [callback_called_ptr = &callback_called](Status, int) mutable { *callback_called_ptr = true; diff --git a/src/ray/raylet/wait_manager.cc b/src/ray/raylet/wait_manager.cc index 8745848f2f59..ae26e0cf9e56 100644 --- 
a/src/ray/raylet/wait_manager.cc +++ b/src/ray/raylet/wait_manager.cc @@ -40,19 +40,19 @@ void WaitManager::Wait(const std::vector &object_ids, auto &wait_request = wait_requests_.at(wait_id); for (const auto &object_id : object_ids) { if (is_object_local_(object_id)) { - wait_request.ready.emplace(object_id); + wait_request.ready_.emplace(object_id); } } - for (const auto &object_id : wait_request.object_ids) { + for (const auto &object_id : wait_request.object_ids_) { object_to_wait_requests_[object_id].emplace(wait_id); } - if (wait_request.ready.size() >= wait_request.num_required_objects || - wait_request.timeout_ms == 0) { + if (wait_request.ready_.size() >= wait_request.num_required_objects_ || + wait_request.timeout_ms_ == 0) { // Requirements already satisfied. WaitComplete(wait_id); - } else if (wait_request.timeout_ms != -1) { + } else if (wait_request.timeout_ms_ != -1) { // If a timeout was provided, then set a timer. If there are no // enough locally available objects by the time the timer expires, // then we will return from the Wait. @@ -65,14 +65,14 @@ void WaitManager::Wait(const std::vector &object_ids, } WaitComplete(wait_id); }, - wait_request.timeout_ms); + wait_request.timeout_ms_); } } void WaitManager::WaitComplete(uint64_t wait_id) { auto &wait_request = map_find_or_die(wait_requests_, wait_id); - for (const auto &object_id : wait_request.object_ids) { + for (const auto &object_id : wait_request.object_ids_) { auto &requests = object_to_wait_requests_.at(object_id); requests.erase(wait_id); if (requests.empty()) { @@ -83,15 +83,15 @@ void WaitManager::WaitComplete(uint64_t wait_id) { // Order objects according to input order. 
std::vector ready; std::vector remaining; - for (const auto &object_id : wait_request.object_ids) { - if (ready.size() < wait_request.num_required_objects && - wait_request.ready.count(object_id) > 0) { + for (const auto &object_id : wait_request.object_ids_) { + if (ready.size() < wait_request.num_required_objects_ && + wait_request.ready_.count(object_id) > 0) { ready.push_back(object_id); } else { remaining.push_back(object_id); } } - wait_request.callback(ready, remaining); + wait_request.callback_(ready, remaining); wait_requests_.erase(wait_id); RAY_LOG(DEBUG) << "Wait request " << wait_id << " finished: ready " << ready.size() << " remaining " << remaining.size(); @@ -105,8 +105,8 @@ void WaitManager::HandleObjectLocal(const ray::ObjectID &object_id) { std::vector complete_waits; for (const auto &wait_id : object_to_wait_requests_.at(object_id)) { auto &wait_request = map_find_or_die(wait_requests_, wait_id); - wait_request.ready.emplace(object_id); - if (wait_request.ready.size() >= wait_request.num_required_objects) { + wait_request.ready_.emplace(object_id); + if (wait_request.ready_.size() >= wait_request.num_required_objects_) { complete_waits.emplace_back(wait_id); } } diff --git a/src/ray/raylet/wait_manager.h b/src/ray/raylet/wait_manager.h index 5b9f3cad0d45..de66735165c3 100644 --- a/src/ray/raylet/wait_manager.h +++ b/src/ray/raylet/wait_manager.h @@ -66,20 +66,20 @@ class WaitManager { const WaitCallback &callback, const std::vector &object_ids, uint64_t num_required_objects) - : timeout_ms(timeout_ms), - callback(callback), - object_ids(object_ids), - num_required_objects(num_required_objects) {} + : timeout_ms_(timeout_ms), + callback_(callback), + object_ids_(object_ids), + num_required_objects_(num_required_objects) {} /// The period of time to wait before invoking the callback. - const int64_t timeout_ms; + const int64_t timeout_ms_; /// The callback invoked when Wait is complete. 
- WaitCallback callback; + WaitCallback callback_; /// Ordered input object_ids. - const std::vector object_ids; + const std::vector object_ids_; /// The number of required objects. - const uint64_t num_required_objects; + const uint64_t num_required_objects_; /// The objects that have been locally available. - std::unordered_set ready; + std::unordered_set ready_; }; /// Completion handler for Wait. diff --git a/src/ray/raylet/worker.cc b/src/ray/raylet/worker.cc index fbdf0400a152..b4f908e114e6 100644 --- a/src/ray/raylet/worker.cc +++ b/src/ray/raylet/worker.cc @@ -31,7 +31,7 @@ namespace raylet { Worker::Worker(const JobID &job_id, int runtime_env_hash, const WorkerID &worker_id, - const Language &language, + const rpc::Language &language, rpc::WorkerType worker_type, const std::string &ip_address, std::shared_ptr connection, @@ -120,7 +120,7 @@ void Worker::SetStartupToken(StartupToken startup_token) { startup_token_ = startup_token; } -Language Worker::GetLanguage() const { return language_; } +rpc::Language Worker::GetLanguage() const { return language_; } const std::string Worker::IpAddress() const { return ip_address_; } @@ -173,14 +173,15 @@ void Worker::Connect(std::shared_ptr rpc_client) } } -void Worker::AssignTaskId(const TaskID &task_id) { - assigned_task_id_ = task_id; - if (!task_id.IsNil()) { - task_assign_time_ = absl::Now(); +void Worker::GrantLeaseId(const LeaseID &lease_id) { + lease_id_ = lease_id; + if (!lease_id.IsNil()) { + RAY_CHECK(worker_type_ != rpc::WorkerType::DRIVER); + lease_grant_time_ = absl::Now(); } -} +}; -const TaskID &Worker::GetAssignedTaskId() const { return assigned_task_id_; } +const LeaseID &Worker::GetGrantedLeaseId() const { return lease_id_; } const JobID &Worker::GetAssignedJobId() const { return assigned_job_id_; } @@ -199,18 +200,19 @@ void Worker::AssignActorId(const ActorID &actor_id) { const ActorID &Worker::GetActorId() const { return actor_id_; } -const std::string Worker::GetTaskOrActorIdAsDebugString() 
const { +const RayLease &Worker::GetGrantedLease() const { return granted_lease_; } + +const std::string Worker::GetLeaseIdAsDebugString() const { std::stringstream id_ss; if (GetActorId().IsNil()) { - id_ss << "task ID: " << GetAssignedTaskId(); - } else { id_ss << "actor ID: " << GetActorId(); } + id_ss << "lease ID: " << GetGrantedLeaseId(); return id_ss.str(); } bool Worker::IsDetachedActor() const { - return assigned_task_.GetTaskSpecification().IsDetachedActor(); + return granted_lease_.GetLeaseSpecification().IsDetachedActor(); } const std::shared_ptr Worker::Connection() const { return connection_; } diff --git a/src/ray/raylet/worker.h b/src/ray/raylet/worker.h index 2e0ef7f13f64..d7466dac569e 100644 --- a/src/ray/raylet/worker.h +++ b/src/ray/raylet/worker.h @@ -23,10 +23,9 @@ #include "absl/time/time.h" #include "gtest/gtest_prod.h" #include "ray/common/id.h" +#include "ray/common/lease/lease.h" #include "ray/common/scheduling/resource_set.h" #include "ray/common/scheduling/scheduling_ids.h" -#include "ray/common/task/task.h" -#include "ray/common/task/task_common.h" #include "ray/ipc/client_connection.h" #include "ray/raylet/scheduling/cluster_resource_scheduler.h" #include "ray/rpc/worker/core_worker_client.h" @@ -57,7 +56,7 @@ class WorkerInterface { /// Return the worker process's startup token virtual StartupToken GetStartupToken() const = 0; virtual void SetProcess(Process proc) = 0; - virtual Language GetLanguage() const = 0; + virtual rpc::Language GetLanguage() const = 0; virtual const std::string IpAddress() const = 0; virtual void AsyncNotifyGCSRestart() = 0; /// Connect this worker's gRPC client. 
@@ -67,15 +66,16 @@ class WorkerInterface { virtual int Port() const = 0; virtual int AssignedPort() const = 0; virtual void SetAssignedPort(int port) = 0; - virtual void AssignTaskId(const TaskID &task_id) = 0; - virtual const TaskID &GetAssignedTaskId() const = 0; + virtual void GrantLeaseId(const LeaseID &lease_id) = 0; + virtual const LeaseID &GetGrantedLeaseId() const = 0; virtual const JobID &GetAssignedJobId() const = 0; + virtual const RayLease &GetGrantedLease() const = 0; virtual std::optional GetIsGpu() const = 0; virtual std::optional GetIsActorWorker() const = 0; virtual int GetRuntimeEnvHash() const = 0; virtual void AssignActorId(const ActorID &actor_id) = 0; virtual const ActorID &GetActorId() const = 0; - virtual const std::string GetTaskOrActorIdAsDebugString() const = 0; + virtual const std::string GetLeaseIdAsDebugString() const = 0; virtual bool IsDetachedActor() const = 0; virtual const std::shared_ptr Connection() const = 0; virtual void SetOwnerAddress(const rpc::Address &address) = 0; @@ -100,9 +100,9 @@ class WorkerInterface { virtual void ClearLifetimeAllocatedInstances() = 0; - virtual RayTask &GetAssignedTask() = 0; + virtual RayLease &GetGrantedLease() = 0; - virtual void SetAssignedTask(const RayTask &assigned_task) = 0; + virtual void GrantLease(const RayLease &granted_lease) = 0; virtual bool IsRegistered() = 0; @@ -112,7 +112,7 @@ class WorkerInterface { virtual bool IsAvailableForScheduling() const = 0; /// Time when the last task was assigned to this worker. 
- virtual absl::Time GetAssignedTaskTime() const = 0; + virtual absl::Time GetGrantedLeaseTime() const = 0; virtual void SetJobId(const JobID &job_id) = 0; @@ -144,7 +144,7 @@ class Worker : public std::enable_shared_from_this, public WorkerInterfa Worker(const JobID &job_id, int runtime_env_hash, const WorkerID &worker_id, - const Language &language, + const rpc::Language &language, rpc::WorkerType worker_type, const std::string &ip_address, std::shared_ptr connection, @@ -169,7 +169,7 @@ class Worker : public std::enable_shared_from_this, public WorkerInterfa /// Return the worker process's startup token StartupToken GetStartupToken() const; void SetProcess(Process proc); - Language GetLanguage() const; + rpc::Language GetLanguage() const; const std::string IpAddress() const; void AsyncNotifyGCSRestart(); /// Connect this worker's gRPC client. @@ -179,17 +179,17 @@ class Worker : public std::enable_shared_from_this, public WorkerInterfa int Port() const; int AssignedPort() const; void SetAssignedPort(int port); - void AssignTaskId(const TaskID &task_id); - const TaskID &GetAssignedTaskId() const; + void GrantLeaseId(const LeaseID &lease_id); + const LeaseID &GetGrantedLeaseId() const; const JobID &GetAssignedJobId() const; + const RayLease &GetGrantedLease() const; std::optional GetIsGpu() const; std::optional GetIsActorWorker() const; int GetRuntimeEnvHash() const; void AssignActorId(const ActorID &actor_id); const ActorID &GetActorId() const; - // Creates the debug string for the ID of the task or actor depending on which is - // running. - const std::string GetTaskOrActorIdAsDebugString() const; + // Creates the debug string for the ID of the lease and the actor ID if it exists. 
+ const std::string GetLeaseIdAsDebugString() const; bool IsDetachedActor() const; const std::shared_ptr Connection() const; void SetOwnerAddress(const rpc::Address &address); @@ -225,28 +225,27 @@ class Worker : public std::enable_shared_from_this, public WorkerInterfa void ClearLifetimeAllocatedInstances() { lifetime_allocated_instances_ = nullptr; }; - RayTask &GetAssignedTask() { return assigned_task_; }; - - void SetAssignedTask(const RayTask &assigned_task) { - const auto &task_spec = assigned_task.GetTaskSpecification(); - SetJobId(task_spec.JobId()); - SetBundleId(task_spec.PlacementGroupBundleId()); - SetOwnerAddress(task_spec.CallerAddress()); - AssignTaskId(task_spec.TaskId()); - SetIsGpu(task_spec.GetRequiredResources().Get(scheduling::ResourceID::GPU()) > 0); - RAY_CHECK(!task_spec.IsActorTask()); - SetIsActorWorker(task_spec.IsActorCreationTask()); - assigned_task_ = assigned_task; - root_detached_actor_id_ = assigned_task.GetTaskSpecification().RootDetachedActorId(); + RayLease &GetGrantedLease() { return granted_lease_; }; + + void GrantLease(const RayLease &granted_lease) { + const auto &lease_spec = granted_lease.GetLeaseSpecification(); + SetJobId(lease_spec.JobId()); + SetBundleId(lease_spec.PlacementGroupBundleId()); + SetOwnerAddress(lease_spec.CallerAddress()); + GrantLeaseId(lease_spec.LeaseId()); + SetIsGpu(lease_spec.GetRequiredResources().Get(scheduling::ResourceID::GPU()) > 0); + SetIsActorWorker(lease_spec.IsActorCreationTask()); + granted_lease_ = granted_lease; + root_detached_actor_id_ = granted_lease.GetLeaseSpecification().RootDetachedActorId(); } - absl::Time GetAssignedTaskTime() const { return task_assign_time_; }; + absl::Time GetGrantedLeaseTime() const { return lease_grant_time_; }; bool IsRegistered() { return rpc_client_ != nullptr; } bool IsAvailableForScheduling() const { return !IsDead() // Not dead - && !GetAssignedTaskId().IsNil() // No assigned task + && !GetGrantedLeaseId().IsNil() // Has assigned lease && 
!IsBlocked() // Not blocked && GetActorId().IsNil(); // No assigned actor } @@ -255,7 +254,6 @@ class Worker : public std::enable_shared_from_this, public WorkerInterfa RAY_CHECK(IsRegistered()); return rpc_client_.get(); } - void SetJobId(const JobID &job_id); void SetIsGpu(bool is_gpu); void SetIsActorWorker(bool is_actor_worker); @@ -271,7 +269,7 @@ class Worker : public std::enable_shared_from_this, public WorkerInterfa /// The worker's process's startup_token StartupToken startup_token_; /// The language type of this worker. - Language language_; + rpc::Language language_; /// The type of the worker. rpc::WorkerType worker_type_; /// IP address of this worker. @@ -285,9 +283,10 @@ class Worker : public std::enable_shared_from_this, public WorkerInterfa int port_; /// Connection state of a worker. std::shared_ptr connection_; - /// The worker's currently assigned task. - TaskID assigned_task_id_; - /// Job ID for the worker's current assigned task. + /// The lease id of the worker's currently assigned lease. + /// It is always Nil for the driver. + LeaseID lease_id_; + /// Job ID for the worker's current assigned lease. JobID assigned_job_id_; /// The hash of the worker's assigned runtime env. We use this in the worker /// pool to cache and reuse workers with the same runtime env, because @@ -295,7 +294,7 @@ class Worker : public std::enable_shared_from_this, public WorkerInterfa const int runtime_env_hash_; /// The worker's actor ID. If this is nil, then the worker is not an actor. ActorID actor_id_; - /// Root detached actor ID for the worker's last assigned task. + /// Root detached actor ID for the worker's last assigned lease. ActorID root_detached_actor_id_; /// The worker's placement group bundle. It is used to detect if the worker is /// associated with a placement group bundle. @@ -314,19 +313,19 @@ class Worker : public std::enable_shared_from_this, public WorkerInterfa /// currently holds the lease on this worker, if any. 
rpc::Address owner_address_; /// The capacity of each resource instance allocated to this worker in order - /// to satisfy the resource requests of the task is currently running. + /// to satisfy the resource requests of the granted lease. std::shared_ptr allocated_instances_; /// The capacity of each resource instance allocated to this worker /// when running as an actor. std::shared_ptr lifetime_allocated_instances_; - /// RayTask being assigned to this worker. - RayTask assigned_task_; - /// Time when the last task was assigned to this worker. - absl::Time task_assign_time_; - /// Whether this worker ever holded a GPU resource. Once it holds a GPU or non-GPU task + /// RayLease being assigned to this worker. + RayLease granted_lease_; + /// Time when the last lease was granted to this worker. + absl::Time lease_grant_time_; + /// Whether this worker ever holded a GPU resource. Once it holds a GPU or non-GPU lease /// it can't switch to the other type. std::optional is_gpu_ = std::nullopt; - /// Whether this worker can hold an actor. Once it holds an actor or a normal task, it + /// Whether this worker can hold an actor. Once it holds an actor or a normal lease, it /// can't switch to the other type. std::optional is_actor_worker_ = std::nullopt; /// If true, a RPC need to be sent to notify the worker about GCS restarting. diff --git a/src/ray/raylet/worker_killing_policy.cc b/src/ray/raylet/worker_killing_policy.cc index ce37f23b298b..e804864f54ac 100644 --- a/src/ray/raylet/worker_killing_policy.cc +++ b/src/ray/raylet/worker_killing_policy.cc @@ -50,13 +50,14 @@ RetriableLIFOWorkerKillingPolicy::SelectWorkerToKill( sorted.end(), [](std::shared_ptr const &left, std::shared_ptr const &right) -> bool { - // First sort by retriable tasks and then by task time in descending order. + // First sort by retriable tasks and then by assigned time in descending + // order. int left_retriable = - left->GetAssignedTask().GetTaskSpecification().IsRetriable() ? 
0 : 1; + left->GetGrantedLease().GetLeaseSpecification().IsRetriable() ? 0 : 1; int right_retriable = - right->GetAssignedTask().GetTaskSpecification().IsRetriable() ? 0 : 1; + right->GetGrantedLease().GetLeaseSpecification().IsRetriable() ? 0 : 1; if (left_retriable == right_retriable) { - return left->GetAssignedTaskTime() > right->GetAssignedTaskTime(); + return left->GetGrantedLeaseTime() > right->GetGrantedLeaseTime(); } return left_retriable < right_retriable; }); @@ -84,11 +85,11 @@ std::string WorkerKillingPolicy::WorkersDebugString( RAY_LOG_EVERY_MS(INFO, 60000) << "Can't find memory usage for PID, reporting zero. PID: " << pid; } - result << "Worker " << index << ": task assigned time " - << absl::FormatTime(worker->GetAssignedTaskTime(), absl::UTCTimeZone()) + result << "Worker " << index << ": lease granted time " + << absl::FormatTime(worker->GetGrantedLeaseTime(), absl::UTCTimeZone()) << " worker id " << worker->WorkerId() << " memory used " << used_memory - << " task spec " - << worker->GetAssignedTask().GetTaskSpecification().DebugString() << "\n"; + << " lease spec " + << worker->GetGrantedLease().GetLeaseSpecification().DebugString() << "\n"; index += 1; if (index > num_workers) { diff --git a/src/ray/raylet/worker_killing_policy_group_by_owner.cc b/src/ray/raylet/worker_killing_policy_group_by_owner.cc index dfa2588856a8..97d7010d55c7 100644 --- a/src/ray/raylet/worker_killing_policy_group_by_owner.cc +++ b/src/ray/raylet/worker_killing_policy_group_by_owner.cc @@ -50,9 +50,9 @@ GroupByOwnerIdWorkerKillingPolicy::SelectWorkerToKill( TaskID non_retriable_owner_id = TaskID::Nil(); std::unordered_map group_map; for (auto worker : workers) { - bool retriable = worker->GetAssignedTask().GetTaskSpecification().IsRetriable(); + bool retriable = worker->GetGrantedLease().GetLeaseSpecification().IsRetriable(); TaskID owner_id = - retriable ? worker->GetAssignedTask().GetTaskSpecification().ParentTaskId() + retriable ? 
worker->GetGrantedLease().GetLeaseSpecification().ParentTaskId() : non_retriable_owner_id; auto it = group_map.find(owner_id); @@ -81,7 +81,7 @@ GroupByOwnerIdWorkerKillingPolicy::SelectWorkerToKill( if (left_retriable == right_retriable) { if (left.GetAllWorkers().size() == right.GetAllWorkers().size()) { - return left.GetAssignedTaskTime() > right.GetAssignedTaskTime(); + return left.GetGrantedLeaseTime() > right.GetGrantedLeaseTime(); } return left.GetAllWorkers().size() > right.GetAllWorkers().size(); } @@ -93,9 +93,9 @@ GroupByOwnerIdWorkerKillingPolicy::SelectWorkerToKill( selected_group.GetAllWorkers().size() > 1 && selected_group.IsRetriable(); auto worker_to_kill = selected_group.SelectWorkerToKill(); - RAY_LOG(INFO) << "Sorted list of tasks based on the policy:\n" + RAY_LOG(INFO) << "Sorted list of leases based on the policy:\n" << PolicyDebugString(sorted, system_memory) - << "\nTask should be retried? " << should_retry; + << "\nLease should be retried? " << should_retry; return std::make_pair(worker_to_kill, should_retry); } @@ -105,9 +105,9 @@ std::string GroupByOwnerIdWorkerKillingPolicy::PolicyDebugString( std::stringstream result; int32_t group_index = 0; for (auto &group : groups) { - result << "Tasks (retriable: " << group.IsRetriable() - << ") (parent task id: " << group.OwnerId() << ") (Earliest assigned time: " - << absl::FormatTime(group.GetAssignedTaskTime(), absl::UTCTimeZone()) + result << "Leases (retriable: " << group.IsRetriable() + << ") (parent task id: " << group.OwnerId() << ") (Earliest granted time: " + << absl::FormatTime(group.GetGrantedLeaseTime(), absl::UTCTimeZone()) << "):\n"; int64_t worker_index = 0; @@ -121,11 +121,11 @@ std::string GroupByOwnerIdWorkerKillingPolicy::PolicyDebugString( RAY_LOG_EVERY_MS(INFO, 60000) << "Can't find memory usage for PID, reporting zero. 
PID: " << pid; } - result << "Task assigned time " - << absl::FormatTime(worker->GetAssignedTaskTime(), absl::UTCTimeZone()) + result << "Lease granted time " + << absl::FormatTime(worker->GetGrantedLeaseTime(), absl::UTCTimeZone()) << " worker id " << worker->WorkerId() << " memory used " << used_memory - << " task spec " - << worker->GetAssignedTask().GetTaskSpecification().DebugString() << "\n"; + << " lease spec " + << worker->GetGrantedLease().GetLeaseSpecification().DebugString() << "\n"; worker_index += 1; if (worker_index > 10) { @@ -146,13 +146,15 @@ const TaskID &Group::OwnerId() const { return owner_id_; } const bool Group::IsRetriable() const { return retriable_; } -const absl::Time Group::GetAssignedTaskTime() const { return earliest_task_time_; } +const absl::Time Group::GetGrantedLeaseTime() const { + return earliest_granted_lease_time_; +} void Group::AddToGroup(std::shared_ptr worker) { - if (worker->GetAssignedTaskTime() < earliest_task_time_) { - earliest_task_time_ = worker->GetAssignedTaskTime(); + if (worker->GetGrantedLeaseTime() < earliest_granted_lease_time_) { + earliest_granted_lease_time_ = worker->GetGrantedLeaseTime(); } - bool retriable = worker->GetAssignedTask().GetTaskSpecification().IsRetriable(); + bool retriable = worker->GetGrantedLease().GetLeaseSpecification().IsRetriable(); RAY_CHECK_EQ(retriable_, retriable); workers_.push_back(worker); } @@ -165,7 +167,7 @@ const std::shared_ptr Group::SelectWorkerToKill() const { sorted.end(), [](std::shared_ptr const &left, std::shared_ptr const &right) -> bool { - return left->GetAssignedTaskTime() > right->GetAssignedTaskTime(); + return left->GetGrantedLeaseTime() > right->GetGrantedLeaseTime(); }); return sorted.front(); diff --git a/src/ray/raylet/worker_killing_policy_group_by_owner.h b/src/ray/raylet/worker_killing_policy_group_by_owner.h index c5f3e95b5282..791126aab92d 100644 --- a/src/ray/raylet/worker_killing_policy_group_by_owner.h +++ 
b/src/ray/raylet/worker_killing_policy_group_by_owner.h @@ -32,11 +32,11 @@ namespace ray { namespace raylet { -/// Key groups on its owner id. For non-retriable task the owner id is itself, -/// Since non-retriable task forms its own group. +/// Key groups on its owner id. For non-retriable lease the owner id is Nil, +/// Since non-retriable lease forms its own group. struct GroupKey { - explicit GroupKey(const TaskID &owner_id) : owner_id(owner_id) {} - const TaskID &owner_id; + explicit GroupKey(const TaskID &owner_id) : owner_id_(owner_id) {} + const TaskID &owner_id_; }; struct Group { @@ -44,46 +44,44 @@ struct Group { Group(const TaskID &owner_id, bool retriable) : owner_id_(owner_id), retriable_(retriable) {} - /// The parent task id of the tasks belonging to this group + /// The parent task id of the leases belonging to this group const TaskID &OwnerId() const; - /// Whether tasks in this group are retriable. + /// Whether leases in this group are retriable. const bool IsRetriable() const; - /// Gets the task time of the earliest task of this group, to be + /// Gets the assigned lease time of the earliest lease of this group, to be /// used for group priority. - const absl::Time GetAssignedTaskTime() const; + const absl::Time GetGrantedLeaseTime() const; /// Returns the worker to be killed in this group, in LIFO order. const std::shared_ptr SelectWorkerToKill() const; - /// Tasks belonging to this group. + /// Leases belonging to this group. const std::vector> GetAllWorkers() const; - /// Adds worker that the task belongs to to the group. + /// Adds worker that the lease belongs to to the group. void AddToGroup(std::shared_ptr worker); private: - /// Tasks belonging to this group. + /// Leases belonging to this group. std::vector> workers_; - /// The earliest creation time of the tasks. - absl::Time earliest_task_time_ = absl::Now(); + /// The earliest creation time of the leases. 
+ absl::Time earliest_granted_lease_time_ = absl::Now(); - /// The owner id shared by tasks of this group. - /// TODO(clarng): make this const and implement move / swap. + /// The owner id shared by leases of this group. TaskID owner_id_; - /// Whether the tasks are retriable. - /// TODO(clarng): make this const and implement move / swap. + /// Whether the leases are retriable. bool retriable_; }; -/// Groups task by its owner id. Non-retriable task (whether it be task or actor) forms -/// its own group. Prioritizes killing groups that are retriable first, else it picks the -/// largest group, else it picks the newest group. The "age" of a group is based on the -/// time of its earliest submitted task. When a group is selected for killing it selects -/// the last submitted task. +/// Groups leases by its owner id. Non-retriable leases (whether it be task or actor) +/// forms its own group. Prioritizes killing groups that are retriable first, else it +/// picks the largest group, else it picks the newest group. The "age" of a group is based +/// on the time of its earliest granted leases. When a group is selected for killing it +/// selects the last submitted task. /// /// When selecting a worker / task to be killed, it will set the task to-be-killed to be /// non-retriable if it is the last member of the group, and is retriable otherwise. diff --git a/src/ray/raylet/worker_killing_policy_retriable_fifo.cc b/src/ray/raylet/worker_killing_policy_retriable_fifo.cc index 571517f4558e..1169caf35370 100644 --- a/src/ray/raylet/worker_killing_policy_retriable_fifo.cc +++ b/src/ray/raylet/worker_killing_policy_retriable_fifo.cc @@ -52,13 +52,13 @@ RetriableFIFOWorkerKillingPolicy::SelectWorkerToKill( sorted.end(), [](std::shared_ptr const &left, std::shared_ptr const &right) -> bool { - // First sort by retriable tasks and then by task time in ascending order. + // First sort by retriable leases and then by lease time in ascending order. 
int left_retriable = - left->GetAssignedTask().GetTaskSpecification().IsRetriable() ? 0 : 1; + left->GetGrantedLease().GetLeaseSpecification().IsRetriable() ? 0 : 1; int right_retriable = - right->GetAssignedTask().GetTaskSpecification().IsRetriable() ? 0 : 1; + right->GetGrantedLease().GetLeaseSpecification().IsRetriable() ? 0 : 1; if (left_retriable == right_retriable) { - return left->GetAssignedTaskTime() < right->GetAssignedTaskTime(); + return left->GetGrantedLeaseTime() < right->GetGrantedLeaseTime(); } return left_retriable < right_retriable; }); diff --git a/src/ray/raylet/worker_pool.cc b/src/ray/raylet/worker_pool.cc index 7058b96847b3..32c6d7530e45 100644 --- a/src/ray/raylet/worker_pool.cc +++ b/src/ray/raylet/worker_pool.cc @@ -28,15 +28,15 @@ #include "absl/strings/str_split.h" #include "ray/common/constants.h" +#include "ray/common/lease/lease_spec.h" +#include "ray/common/protobuf_utils.h" #include "ray/common/ray_config.h" #include "ray/common/runtime_env_common.h" #include "ray/common/status.h" -#include "ray/common/task/task_spec.h" -#include "ray/gcs/pb_util.h" #include "ray/stats/metric_defs.h" #include "ray/util/logging.h" #include "ray/util/network_util.h" -#include "ray/util/util.h" +#include "ray/util/time.h" DEFINE_stats(worker_register_time_ms, "end to end latency of register a worker process.", @@ -101,8 +101,7 @@ WorkerPool::WorkerPool(instrumented_io_context &io_service, std::string native_library_path, std::function starting_worker_timeout_callback, int ray_debugger_external, - std::function get_time, - bool enable_resource_isolation) + std::function get_time) : worker_startup_token_counter_(0), io_service_(&io_service), node_id_(node_id), @@ -117,23 +116,22 @@ WorkerPool::WorkerPool(instrumented_io_context &io_service, gcs_client_(gcs_client), native_library_path_(std::move(native_library_path)), starting_worker_timeout_callback_(std::move(starting_worker_timeout_callback)), - ray_debugger_external(ray_debugger_external), + 
ray_debugger_external_(ray_debugger_external), first_job_registered_python_worker_count_(0), first_job_driver_wait_num_python_workers_( std::min(num_prestarted_python_workers, maximum_startup_concurrency_)), num_prestart_python_workers(num_prestarted_python_workers), periodical_runner_(PeriodicalRunner::Create(io_service)), - get_time_(std::move(get_time)), - enable_resource_isolation_(enable_resource_isolation) { + get_time_(std::move(get_time)) { RAY_CHECK_GT(maximum_startup_concurrency_, 0); // We need to record so that the metric exists. This way, we report that 0 // processes have started before a task runs on the node (as opposed to the // metric not existing at all). - stats::NumWorkersStarted.Record(0); - stats::NumWorkersStartedFromCache.Record(0); - stats::NumCachedWorkersSkippedJobMismatch.Record(0); - stats::NumCachedWorkersSkippedDynamicOptionsMismatch.Record(0); - stats::NumCachedWorkersSkippedRuntimeEnvironmentMismatch.Record(0); + ray_metric_num_workers_started_.Record(0); + ray_metric_num_workers_started_from_cache_.Record(0); + ray_metric_num_cached_workers_skipped_job_mismatch_.Record(0); + ray_metric_num_cached_workers_skipped_dynamic_options_mismatch_.Record(0); + ray_metric_num_cached_workers_skipped_runtime_environment_mismatch_.Record(0); // We used to ignore SIGCHLD here. The code is moved to raylet main.cc to support the // subreaper feature. 
for (const auto &entry : worker_commands) { @@ -186,12 +184,12 @@ void WorkerPool::Start() { } if (RayConfig::instance().enable_worker_prestart()) { - rpc::TaskSpec rpc_task_spec; - rpc_task_spec.set_language(Language::PYTHON); - rpc_task_spec.mutable_runtime_env_info()->set_serialized_runtime_env("{}"); + rpc::LeaseSpec rpc_lease_spec; + rpc_lease_spec.set_language(Language::PYTHON); + rpc_lease_spec.mutable_runtime_env_info()->set_serialized_runtime_env("{}"); - TaskSpecification task_spec{std::move(rpc_task_spec)}; - PrestartWorkersInternal(task_spec, num_prestart_python_workers); + LeaseSpecification lease_spec{std::move(rpc_lease_spec)}; + PrestartWorkersInternal(lease_spec, num_prestart_python_workers); } } @@ -389,7 +387,7 @@ WorkerPool::BuildProcessCommandArgs(const Language &language, worker_command_args.push_back("--language=" + Language_Name(language)); } - if (ray_debugger_external) { + if (ray_debugger_external_) { worker_command_args.push_back("--ray-debugger-external"); } @@ -443,12 +441,6 @@ WorkerPool::BuildProcessCommandArgs(const Language &language, serialized_preload_python_modules); } - // Pass resource isolation flag to python worker. - if (language == Language::PYTHON && worker_type == rpc::WorkerType::WORKER) { - worker_command_args.emplace_back(absl::StrFormat( - "--enable-resource-isolation=%s", enable_resource_isolation_ ? "true" : "false")); - } - // We use setproctitle to change python worker process title, // causing the process's /proc/PID/environ being empty. // Add `SPT_NOENV` env to prevent setproctitle breaking /proc/PID/environ. @@ -461,6 +453,7 @@ WorkerPool::BuildProcessCommandArgs(const Language &language, // Support forking in gRPC. 
env.insert({"GRPC_ENABLE_FORK_SUPPORT", "True"}); env.insert({"GRPC_POLL_STRATEGY", "poll"}); + env.insert({"RAY_start_python_gc_manager_thread", "0"}); } return {std::move(worker_command_args), std::move(env)}; @@ -481,7 +474,7 @@ std::tuple WorkerPool::StartWorkerProcess( auto it = all_jobs_.find(job_id); if (it == all_jobs_.end()) { RAY_LOG(DEBUG) << "Job config of job " << job_id << " are not local yet."; - // Will reschedule ready tasks in `NodeManager::HandleJobStarted`. + // Will reschedule ready leases in `NodeManager::HandleJobStarted`. *status = PopWorkerStatus::JobConfigMissing; process_failed_job_config_missing_++; return {Process(), (StartupToken)-1}; @@ -529,7 +522,7 @@ std::tuple WorkerPool::StartWorkerProcess( auto start = std::chrono::high_resolution_clock::now(); // Start a process and measure the startup time. Process proc = StartProcess(worker_command_args, env); - stats::NumWorkersStarted.Record(1); + ray_metric_num_workers_started_.Record(1); RAY_LOG(INFO) << "Started worker process with pid " << proc.GetId() << ", the token is " << worker_startup_token_counter_; if (!IsIOWorkerType(worker_type)) { @@ -627,14 +620,14 @@ void WorkerPool::MonitorPopWorkerRequestForRegistration( // Capture timer in lambda to copy it once, so that it can avoid destructing timer. timer->async_wait([timer, pop_worker_request = std::move(pop_worker_request), this]( const boost::system::error_code e) mutable { - auto &state = GetStateForLanguage(pop_worker_request->language); + auto &state = GetStateForLanguage(pop_worker_request->language_); auto &requests = state.pending_registration_requests; auto it = std::find(requests.begin(), requests.end(), pop_worker_request); if (it != requests.end()) { - // Pop and fail the task... + // Pop and fail the lease... 
requests.erase(it); PopWorkerStatus status = PopWorkerStatus::WorkerPendingRegistration; - PopWorkerCallbackAsync(pop_worker_request->callback, nullptr, status); + PopWorkerCallbackAsync(pop_worker_request->callback_, nullptr, status); } }); } @@ -876,7 +869,7 @@ Status WorkerPool::RegisterDriver(const std::shared_ptr &driver const rpc::JobConfig &job_config, std::function send_reply_callback) { int port; - RAY_CHECK(!driver->GetAssignedTaskId().IsNil()); + RAY_CHECK(driver->GetGrantedLeaseId().IsNil()); Status status = GetNextFreePort(&port); if (!status.ok()) { send_reply_callback(status, /*port=*/0); @@ -894,12 +887,12 @@ Status WorkerPool::RegisterDriver(const std::shared_ptr &driver if (!first_job_registered_ && RayConfig::instance().prestart_worker_first_driver() && !RayConfig::instance().enable_worker_prestart()) { RAY_LOG(DEBUG) << "PrestartDefaultCpuWorkers " << num_prestart_python_workers; - rpc::TaskSpec rpc_task_spec; - rpc_task_spec.set_language(Language::PYTHON); - rpc_task_spec.mutable_runtime_env_info()->set_serialized_runtime_env("{}"); + rpc::LeaseSpec rpc_lease_spec; + rpc_lease_spec.set_language(Language::PYTHON); + rpc_lease_spec.mutable_runtime_env_info()->set_serialized_runtime_env("{}"); - TaskSpecification task_spec{std::move(rpc_task_spec)}; - PrestartWorkersInternal(task_spec, num_prestart_python_workers); + LeaseSpecification lease_spec{std::move(rpc_lease_spec)}; + PrestartWorkersInternal(lease_spec, num_prestart_python_workers); } // Invoke the `send_reply_callback` later to only finish driver @@ -1049,11 +1042,12 @@ void WorkerPool::PopDeleteWorker( } void WorkerPool::PushWorker(const std::shared_ptr &worker) { - // Since the worker is now idle, unset its assigned task ID. - RAY_CHECK(worker->GetAssignedTaskId().IsNil()) - << "Idle workers cannot have an assigned task ID"; - - // Find a task that this worker can fit. If there's none, put it in the idle pool. + // Since the worker is now idle, verify that it has no assigned lease ID. 
+ RAY_CHECK(worker->GetGrantedLeaseId().IsNil()) + << "Idle workers cannot have an assigned lease ID"; + RAY_CHECK(worker->GetWorkerType() != rpc::WorkerType::DRIVER) + << "Idle workers cannot be drivers"; + // Find a lease that this worker can fit. If there's none, put it in the idle pool. // First find in pending_registration_requests, then in pending_start_requests. std::shared_ptr pop_worker_request = nullptr; auto &state = GetStateForLanguage(worker->GetLanguage()); @@ -1061,9 +1055,8 @@ void WorkerPool::PushWorker(const std::shared_ptr &worker) { auto it = std::find_if( state.pending_registration_requests.begin(), state.pending_registration_requests.end(), - [this, &worker](const std::shared_ptr &pop_worker_request) { - return WorkerFitsForTask(*worker, *pop_worker_request) == - WorkerUnfitForTaskReason::NONE; + [this, &worker](const std::shared_ptr &request) { + return WorkerFitForLease(*worker, *request) == WorkerUnfitForLeaseReason::NONE; }); if (it != state.pending_registration_requests.end()) { pop_worker_request = *it; @@ -1074,9 +1067,8 @@ void WorkerPool::PushWorker(const std::shared_ptr &worker) { auto it = std::find_if( state.pending_start_requests.begin(), state.pending_start_requests.end(), - [this, &worker](const std::shared_ptr &pop_worker_request) { - return WorkerFitsForTask(*worker, *pop_worker_request) == - WorkerUnfitForTaskReason::NONE; + [this, &worker](const std::shared_ptr &request) { + return WorkerFitForLease(*worker, *request) == WorkerUnfitForLeaseReason::NONE; }); if (it != state.pending_start_requests.end()) { pop_worker_request = *it; @@ -1085,9 +1077,9 @@ void WorkerPool::PushWorker(const std::shared_ptr &worker) { } if (pop_worker_request) { - bool used = pop_worker_request->callback(worker, PopWorkerStatus::OK, ""); + bool used = pop_worker_request->callback_(worker, PopWorkerStatus::OK, ""); if (!used) { - // Retry PushWorker. Maybe it can be used by other tasks. + // Retry PushWorker. Maybe it can be used by other leases. 
// Can we have tail call optimization for this? :) return PushWorker(worker); } @@ -1099,7 +1091,7 @@ void WorkerPool::PushWorker(const std::shared_ptr &worker) { absl::Time keep_alive_until = now + absl::Milliseconds(RayConfig::instance().idle_worker_killing_time_threshold_ms()); - if (worker->GetAssignedTaskTime() == absl::Time()) { + if (worker->GetGrantedLeaseTime() == absl::Time()) { // Newly registered worker. Respect worker_startup_keep_alive_duration if any. auto it = state.worker_processes.find(worker->GetStartupToken()); if (it != state.worker_processes.end()) { @@ -1109,9 +1101,9 @@ void WorkerPool::PushWorker(const std::shared_ptr &worker) { } } - // If the worker never held any tasks, then we should consider it first when + // If the worker never held any leases, then we should consider it first when // choosing which idle workers to kill because it is not warmed up and is slower - // than those workers who served tasks before. + // than those workers who held leases before. // See https://github.com/ray-project/ray/pull/36766 // // Also, we set keep_alive_until w.r.t. worker_startup_keep_alive_duration. @@ -1160,7 +1152,7 @@ void WorkerPool::TryKillingIdleWorkers() { } // Compute the soft limit for the number of idle workers to keep around. - // This assumes the common case where each task requires 1 CPU. + // This assumes the common case where each lease requires 1 CPU. 
const auto num_desired_idle_workers = get_num_cpus_available_(); RAY_LOG(DEBUG) << "Idle workers: " << idle_of_all_languages_.size() << ", idle workers that are eligible to kill: " @@ -1207,9 +1199,9 @@ void WorkerPool::KillIdleWorker(const IdleWorkerEntry &entry) { } rpc_client->Exit( request, [this, entry](const ray::Status &status, const rpc::ExitReply &r) { - const auto &idle_worker = entry.worker; + const auto &worker = entry.worker; - RAY_CHECK(pending_exit_idle_workers_.erase(idle_worker->WorkerId())); + RAY_CHECK(pending_exit_idle_workers_.erase(worker->WorkerId())); if (!status.ok()) { RAY_LOG(ERROR) << "Failed to send exit request: " << status.ToString(); } @@ -1217,19 +1209,19 @@ void WorkerPool::KillIdleWorker(const IdleWorkerEntry &entry) { // In case of failed to send request, we remove it from pool as well // TODO(iycheng): We should handle the grpc failure in better way. if (!status.ok() || r.success()) { - RAY_LOG(DEBUG) << "Removed worker " << idle_worker->WorkerId(); - auto &worker_state = GetStateForLanguage(idle_worker->GetLanguage()); + RAY_LOG(DEBUG) << "Removed worker " << worker->WorkerId(); + auto &worker_state = GetStateForLanguage(worker->GetLanguage()); // If we could kill the worker properly, we remove them from the idle // pool. - RemoveWorker(worker_state.idle, idle_worker); + RemoveWorker(worker_state.idle, worker); // We always mark the worker as dead. // If the worker is not idle at this moment, we'd want to mark it as dead // so it won't be reused later. - if (!idle_worker->IsDead()) { - idle_worker->MarkDead(); + if (!worker->IsDead()) { + worker->MarkDead(); } } else { - RAY_LOG(DEBUG) << "Failed to remove worker " << idle_worker->WorkerId(); + RAY_LOG(DEBUG) << "Failed to remove worker " << worker->WorkerId(); // We re-insert the idle worker to the back of the queue if it fails to // kill the worker (e.g., when the worker owns the object). 
Without this, // if the first N workers own objects, it can't kill idle workers that are @@ -1239,111 +1231,111 @@ void WorkerPool::KillIdleWorker(const IdleWorkerEntry &entry) { }); } -WorkerUnfitForTaskReason WorkerPool::WorkerFitsForTask( +WorkerUnfitForLeaseReason WorkerPool::WorkerFitForLease( const WorkerInterface &worker, const PopWorkerRequest &pop_worker_request) const { if (worker.IsDead()) { - return WorkerUnfitForTaskReason::OTHERS; + return WorkerUnfitForLeaseReason::OTHERS; } // These workers are exiting. So skip them. if (pending_exit_idle_workers_.contains(worker.WorkerId())) { - return WorkerUnfitForTaskReason::OTHERS; + return WorkerUnfitForLeaseReason::OTHERS; } - if (worker.GetLanguage() != pop_worker_request.language) { - return WorkerUnfitForTaskReason::OTHERS; + if (worker.GetLanguage() != pop_worker_request.language_) { + return WorkerUnfitForLeaseReason::OTHERS; } - if (worker.GetWorkerType() != pop_worker_request.worker_type) { - return WorkerUnfitForTaskReason::OTHERS; + if (worker.GetWorkerType() != pop_worker_request.worker_type_) { + return WorkerUnfitForLeaseReason::OTHERS; } // For scheduling requests with a root detached actor ID, ensure that either the // worker has _no_ detached actor ID or it matches the request. // NOTE(edoakes): the job ID for a worker with no detached actor ID must still match, - // which is checked below. The pop_worker_request for a task rooted in a detached + // which is checked below. The pop_worker_request for a lease rooted in a detached // actor will have the job ID of the job that created the detached actor. 
- if (!pop_worker_request.root_detached_actor_id.IsNil() && + if (!pop_worker_request.root_detached_actor_id_.IsNil() && !worker.GetRootDetachedActorId().IsNil() && - pop_worker_request.root_detached_actor_id != worker.GetRootDetachedActorId()) { - return WorkerUnfitForTaskReason::ROOT_MISMATCH; + pop_worker_request.root_detached_actor_id_ != worker.GetRootDetachedActorId()) { + return WorkerUnfitForLeaseReason::ROOT_MISMATCH; } // Only consider workers that haven't been assigned to a job yet or have been assigned // to the requested job. const auto worker_job_id = worker.GetAssignedJobId(); - if (!worker_job_id.IsNil() && pop_worker_request.job_id != worker_job_id) { - return WorkerUnfitForTaskReason::ROOT_MISMATCH; + if (!worker_job_id.IsNil() && pop_worker_request.job_id_ != worker_job_id) { + return WorkerUnfitForLeaseReason::ROOT_MISMATCH; } // If the request asks for a is_gpu, and the worker is assigned a different is_gpu, // then skip it. - if (!OptionalsMatchOrEitherEmpty(pop_worker_request.is_gpu, worker.GetIsGpu())) { - return WorkerUnfitForTaskReason::OTHERS; + if (!OptionalsMatchOrEitherEmpty(pop_worker_request.is_gpu_, worker.GetIsGpu())) { + return WorkerUnfitForLeaseReason::OTHERS; } // If the request asks for a is_actor_worker, and the worker is assigned a different // is_actor_worker, then skip it. - if (!OptionalsMatchOrEitherEmpty(pop_worker_request.is_actor_worker, + if (!OptionalsMatchOrEitherEmpty(pop_worker_request.is_actor_worker_, worker.GetIsActorWorker())) { - return WorkerUnfitForTaskReason::OTHERS; + return WorkerUnfitForLeaseReason::OTHERS; } // Skip workers with a mismatched runtime_env. - // Even if the task doesn't have a runtime_env specified, we cannot schedule it to a - // worker with a runtime_env because the task is expected to run in the base + // Even if the lease doesn't have a runtime_env specified, we cannot schedule it to a + // worker with a runtime_env because the lease is expected to run in the base // environment. 
- if (worker.GetRuntimeEnvHash() != pop_worker_request.runtime_env_hash) { - return WorkerUnfitForTaskReason::RUNTIME_ENV_MISMATCH; + if (worker.GetRuntimeEnvHash() != pop_worker_request.runtime_env_hash_) { + return WorkerUnfitForLeaseReason::RUNTIME_ENV_MISMATCH; } // Skip if the dynamic_options doesn't match. if (LookupWorkerDynamicOptions(worker.GetStartupToken()) != - pop_worker_request.dynamic_options) { - return WorkerUnfitForTaskReason::DYNAMIC_OPTIONS_MISMATCH; + pop_worker_request.dynamic_options_) { + return WorkerUnfitForLeaseReason::DYNAMIC_OPTIONS_MISMATCH; } - return WorkerUnfitForTaskReason::NONE; + return WorkerUnfitForLeaseReason::NONE; } void WorkerPool::StartNewWorker( const std::shared_ptr &pop_worker_request) { auto start_worker_process_fn = [this]( - std::shared_ptr pop_worker_request, + std::shared_ptr request, const std::string &serialized_runtime_env_context) { - auto &state = GetStateForLanguage(pop_worker_request->language); + auto &state = GetStateForLanguage(request->language_); const std::string &serialized_runtime_env = - pop_worker_request->runtime_env_info.serialized_runtime_env(); + request->runtime_env_info_.serialized_runtime_env(); PopWorkerStatus status = PopWorkerStatus::OK; auto [proc, startup_token] = - StartWorkerProcess(pop_worker_request->language, - pop_worker_request->worker_type, - pop_worker_request->job_id, + StartWorkerProcess(request->language_, + request->worker_type_, + request->job_id_, &status, - pop_worker_request->dynamic_options, - pop_worker_request->runtime_env_hash, + request->dynamic_options_, + request->runtime_env_hash_, serialized_runtime_env_context, - pop_worker_request->runtime_env_info, - pop_worker_request->worker_startup_keep_alive_duration); + request->runtime_env_info_, + request->worker_startup_keep_alive_duration_); if (status == PopWorkerStatus::OK) { RAY_CHECK(proc.IsValid()); WarnAboutSize(); - state.pending_registration_requests.emplace_back(pop_worker_request); - 
MonitorPopWorkerRequestForRegistration(pop_worker_request); + state.pending_registration_requests.emplace_back(request); + MonitorPopWorkerRequestForRegistration(request); } else if (status == PopWorkerStatus::TooManyStartingWorkerProcesses) { // TODO(jjyao) As an optimization, we don't need to delete the runtime env // but reuse it the next time we retry the request. DeleteRuntimeEnvIfPossible(serialized_runtime_env); - state.pending_start_requests.emplace_back(std::move(pop_worker_request)); + state.pending_start_requests.emplace_back(std::move(request)); } else { DeleteRuntimeEnvIfPossible(serialized_runtime_env); - PopWorkerCallbackAsync(std::move(pop_worker_request->callback), nullptr, status); + PopWorkerCallbackAsync(std::move(request->callback_), nullptr, status); } }; const std::string &serialized_runtime_env = - pop_worker_request->runtime_env_info.serialized_runtime_env(); + pop_worker_request->runtime_env_info_.serialized_runtime_env(); if (!IsRuntimeEnvEmpty(serialized_runtime_env)) { // create runtime env. 
GetOrCreateRuntimeEnv( serialized_runtime_env, - pop_worker_request->runtime_env_info.runtime_env_config(), - pop_worker_request->job_id, + pop_worker_request->runtime_env_info_.runtime_env_config(), + pop_worker_request->job_id_, [this, start_worker_process_fn, pop_worker_request]( bool successful, const std::string &serialized_runtime_env_context, @@ -1352,7 +1344,7 @@ void WorkerPool::StartNewWorker( start_worker_process_fn(pop_worker_request, serialized_runtime_env_context); } else { process_failed_runtime_env_setup_failed_++; - pop_worker_request->callback( + pop_worker_request->callback_( nullptr, PopWorkerStatus::RuntimeEnvCreationFailed, /*runtime_env_setup_error_message*/ setup_error_message); @@ -1363,32 +1355,27 @@ void WorkerPool::StartNewWorker( } } -void WorkerPool::PopWorker(const TaskSpecification &task_spec, +void WorkerPool::PopWorker(const LeaseSpecification &lease_spec, const PopWorkerCallback &callback) { - RAY_LOG(DEBUG) << "Pop worker for task " << task_spec.TaskId() << " task name " - << task_spec.FunctionDescriptor()->ToString(); - // Code path of actor task. 
- RAY_CHECK(!task_spec.IsActorTask()) << "Direct call shouldn't reach here."; - auto pop_worker_request = std::make_shared( - task_spec.GetLanguage(), + lease_spec.GetLanguage(), rpc::WorkerType::WORKER, - task_spec.JobId(), - task_spec.RootDetachedActorId(), - /*is_gpu=*/task_spec.GetRequiredResources().Get(scheduling::ResourceID::GPU()) > 0, - /*is_actor_worker=*/task_spec.IsActorCreationTask(), - task_spec.RuntimeEnvInfo(), - task_spec.GetRuntimeEnvHash(), - task_spec.DynamicWorkerOptionsOrEmpty(), + lease_spec.JobId(), + lease_spec.RootDetachedActorId(), + /*is_gpu=*/lease_spec.GetRequiredResources().Get(scheduling::ResourceID::GPU()) > 0, + /*is_actor_worker=*/lease_spec.IsActorCreationTask(), + lease_spec.RuntimeEnvInfo(), + lease_spec.GetRuntimeEnvHash(), + lease_spec.DynamicWorkerOptionsOrEmpty(), /*worker_startup_keep_alive_duration=*/std::nullopt, - [this, task_spec, callback]( + [this, lease_spec, callback]( const std::shared_ptr &worker, PopWorkerStatus status, const std::string &runtime_env_setup_error_message) -> bool { - // We got a worker suitable for the task. Now let's check if the task is still + // We got a worker suitable for the lease. Now let's check if the lease is still // executable. - if (worker && finished_jobs_.contains(task_spec.JobId()) && - task_spec.RootDetachedActorId().IsNil()) { + if (worker && finished_jobs_.contains(lease_spec.JobId()) && + lease_spec.RootDetachedActorId().IsNil()) { // When a job finishes, node manager will kill leased workers one time // and worker pool will kill idle workers periodically. 
// The current worker is already removed from the idle workers @@ -1409,29 +1396,29 @@ void WorkerPool::PopWorker(const TaskSpecification &task_spec, std::shared_ptr WorkerPool::FindAndPopIdleWorker( const PopWorkerRequest &pop_worker_request) { - absl::flat_hash_map skip_reason_count; + absl::flat_hash_map skip_reason_count; - auto worker_fits_for_task_fn = [this, &pop_worker_request, &skip_reason_count]( + auto worker_fit_for_lease_fn = [this, &pop_worker_request, &skip_reason_count]( const IdleWorkerEntry &entry) -> bool { - WorkerUnfitForTaskReason reason = - WorkerFitsForTask(*entry.worker, pop_worker_request); - if (reason == WorkerUnfitForTaskReason::NONE) { + WorkerUnfitForLeaseReason reason = + WorkerFitForLease(*entry.worker, pop_worker_request); + if (reason == WorkerUnfitForLeaseReason::NONE) { return true; } skip_reason_count[reason]++; - if (reason == WorkerUnfitForTaskReason::DYNAMIC_OPTIONS_MISMATCH) { - stats::NumCachedWorkersSkippedDynamicOptionsMismatch.Record(1); - } else if (reason == WorkerUnfitForTaskReason::RUNTIME_ENV_MISMATCH) { - stats::NumCachedWorkersSkippedRuntimeEnvironmentMismatch.Record(1); - } else if (reason == WorkerUnfitForTaskReason::ROOT_MISMATCH) { - stats::NumCachedWorkersSkippedJobMismatch.Record(1); + if (reason == WorkerUnfitForLeaseReason::DYNAMIC_OPTIONS_MISMATCH) { + ray_metric_num_cached_workers_skipped_dynamic_options_mismatch_.Record(1); + } else if (reason == WorkerUnfitForLeaseReason::RUNTIME_ENV_MISMATCH) { + ray_metric_num_cached_workers_skipped_runtime_environment_mismatch_.Record(1); + } else if (reason == WorkerUnfitForLeaseReason::ROOT_MISMATCH) { + ray_metric_num_cached_workers_skipped_job_mismatch_.Record(1); } return false; }; - auto &state = GetStateForLanguage(pop_worker_request.language); + auto &state = GetStateForLanguage(pop_worker_request.language_); auto worker_it = std::find_if(idle_of_all_languages_.rbegin(), idle_of_all_languages_.rend(), - worker_fits_for_task_fn); + worker_fit_for_lease_fn); 
if (worker_it == idle_of_all_languages_.rend()) { RAY_LOG(DEBUG) << "No cached worker, cached workers skipped due to " << debug_string(skip_reason_count); @@ -1446,15 +1433,15 @@ std::shared_ptr WorkerPool::FindAndPopIdleWorker( idle_of_all_languages_.erase(lit); // Assigned workers should always match the request's job_id - // *except* if the task originates from a detached actor. + // *except* if the lease originates from a detached actor. RAY_CHECK(worker->GetAssignedJobId().IsNil() || - worker->GetAssignedJobId() == pop_worker_request.job_id || - !pop_worker_request.root_detached_actor_id.IsNil()); + worker->GetAssignedJobId() == pop_worker_request.job_id_ || + !pop_worker_request.root_detached_actor_id_.IsNil()); return worker; } void WorkerPool::PopWorker(std::shared_ptr pop_worker_request) { - // If there's an idle worker that fits the task, use it. + // If there's an idle worker that fits the lease, use it. // Else, start a new worker. auto worker = FindAndPopIdleWorker(*pop_worker_request); if (worker == nullptr) { @@ -1462,26 +1449,26 @@ void WorkerPool::PopWorker(std::shared_ptr pop_worker_request) return; } RAY_CHECK(worker->GetAssignedJobId().IsNil() || - worker->GetAssignedJobId() == pop_worker_request->job_id); - stats::NumWorkersStartedFromCache.Record(1); - PopWorkerCallbackAsync(pop_worker_request->callback, worker, PopWorkerStatus::OK); + worker->GetAssignedJobId() == pop_worker_request->job_id_); + ray_metric_num_workers_started_from_cache_.Record(1); + PopWorkerCallbackAsync(pop_worker_request->callback_, worker, PopWorkerStatus::OK); } -void WorkerPool::PrestartWorkers(const TaskSpecification &task_spec, +void WorkerPool::PrestartWorkers(const LeaseSpecification &lease_spec, int64_t backlog_size) { int64_t num_available_cpus = get_num_cpus_available_(); - // Code path of task that needs a dedicated worker. + // Code path of lease that needs a dedicated worker. 
RAY_LOG(DEBUG) << "PrestartWorkers, num_available_cpus " << num_available_cpus - << " backlog_size " << backlog_size << " task spec " - << task_spec.DebugString() << " has runtime env " - << task_spec.HasRuntimeEnv(); - if ((task_spec.IsActorCreationTask() && !task_spec.DynamicWorkerOptions().empty()) || - task_spec.GetLanguage() != ray::Language::PYTHON) { + << " backlog_size " << backlog_size << " lease spec " + << lease_spec.DebugString() << " has runtime env " + << lease_spec.HasRuntimeEnv(); + if ((lease_spec.IsActorCreationTask() && lease_spec.DynamicWorkerOptionsSize() > 0) || + lease_spec.GetLanguage() != ray::Language::PYTHON) { return; // Not handled. } - auto &state = GetStateForLanguage(task_spec.GetLanguage()); - // The number of available workers that can be used for this task spec. + auto &state = GetStateForLanguage(lease_spec.GetLanguage()); + // The number of available workers that can be used for this lease spec. int num_usable_workers = state.idle.size(); for (auto &entry : state.worker_processes) { num_usable_workers += entry.second.is_pending_registration ? 1 : 0; @@ -1492,48 +1479,48 @@ void WorkerPool::PrestartWorkers(const TaskSpecification &task_spec, if (num_usable_workers < desired_usable_workers) { // Account for workers that are idle or already starting. 
int64_t num_needed = desired_usable_workers - num_usable_workers; - RAY_LOG(DEBUG) << "Prestarting " << num_needed << " workers given task backlog size " + RAY_LOG(DEBUG) << "Prestarting " << num_needed << " workers given lease backlog size " << backlog_size << " and available CPUs " << num_available_cpus << " num idle workers " << state.idle.size() << " num registered workers " << state.registered_workers.size(); - PrestartWorkersInternal(task_spec, num_needed); + PrestartWorkersInternal(lease_spec, num_needed); } } -void WorkerPool::PrestartWorkersInternal(const TaskSpecification &task_spec, +void WorkerPool::PrestartWorkersInternal(const LeaseSpecification &lease_spec, int64_t num_needed) { RAY_LOG(DEBUG) << "PrestartWorkers " << num_needed; for (int ii = 0; ii < num_needed; ++ii) { // Prestart worker with no runtime env. - if (IsRuntimeEnvEmpty(task_spec.SerializedRuntimeEnv())) { + if (IsRuntimeEnvEmpty(lease_spec.SerializedRuntimeEnv())) { PopWorkerStatus status; StartWorkerProcess( - task_spec.GetLanguage(), rpc::WorkerType::WORKER, task_spec.JobId(), &status); + lease_spec.GetLanguage(), rpc::WorkerType::WORKER, lease_spec.JobId(), &status); continue; } // Prestart worker with runtime env. 
GetOrCreateRuntimeEnv( - task_spec.SerializedRuntimeEnv(), - task_spec.RuntimeEnvConfig(), - task_spec.JobId(), - [this, task_spec = task_spec](bool successful, - const std::string &serialized_runtime_env_context, - const std::string &setup_error_message) { + lease_spec.SerializedRuntimeEnv(), + lease_spec.RuntimeEnvConfig(), + lease_spec.JobId(), + [this, lease_spec = lease_spec](bool successful, + const std::string &serialized_runtime_env_context, + const std::string &setup_error_message) { if (!successful) { RAY_LOG(ERROR) << "Fails to create or get runtime env " << setup_error_message; return; } PopWorkerStatus status; - StartWorkerProcess(task_spec.GetLanguage(), + StartWorkerProcess(lease_spec.GetLanguage(), rpc::WorkerType::WORKER, - task_spec.JobId(), + lease_spec.JobId(), &status, /*dynamic_options=*/{}, - task_spec.GetRuntimeEnvHash(), + lease_spec.GetRuntimeEnvHash(), serialized_runtime_env_context, - task_spec.RuntimeEnvInfo()); + lease_spec.RuntimeEnvInfo()); }); } } @@ -1692,9 +1679,9 @@ void WorkerPool::WarnAboutSize() { std::string warning_message_str = warning_message.str(); RAY_LOG(WARNING) << warning_message_str; - auto error_data_ptr = gcs::CreateErrorTableData( + auto error_data = gcs::CreateErrorTableData( "worker_pool_large", warning_message_str, get_time_()); - gcs_client_.Errors().AsyncReportJobError(error_data_ptr, nullptr); + gcs_client_.Errors().AsyncReportJobError(std::move(error_data)); } } } diff --git a/src/ray/raylet/worker_pool.h b/src/ray/raylet/worker_pool.h index 36ba3601168a..f048ac2f6c29 100644 --- a/src/ray/raylet/worker_pool.h +++ b/src/ray/raylet/worker_pool.h @@ -34,13 +34,13 @@ #include "absl/time/time.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/asio/periodical_runner.h" +#include "ray/common/lease/lease.h" #include "ray/common/runtime_env_manager.h" -#include "ray/common/task/task.h" -#include "ray/common/task/task_common.h" -#include "ray/gcs/gcs_client/gcs_client.h" +#include 
"ray/gcs_client/gcs_client.h" #include "ray/ipc/client_connection.h" #include "ray/raylet/runtime_env_agent_client.h" #include "ray/raylet/worker.h" +#include "ray/stats/metric.h" namespace ray { @@ -66,7 +66,7 @@ enum PopWorkerStatus { // Any fails of runtime env creation. // A nullptr worker will be returned with callback. RuntimeEnvCreationFailed = 4, - // The task's job has finished. + // The lease's job has finished. // A nullptr worker will be returned with callback. JobFinished = 5, }; @@ -85,18 +85,18 @@ using PopWorkerCallback = const std::string &runtime_env_setup_error_message)>; struct PopWorkerRequest { - const rpc::Language language; - const rpc::WorkerType worker_type; - const JobID job_id; // can be Nil - const ActorID root_detached_actor_id; // can be Nil - const std::optional is_gpu; - const std::optional is_actor_worker; - const rpc::RuntimeEnvInfo runtime_env_info; - const int runtime_env_hash; - const std::vector dynamic_options; - std::optional worker_startup_keep_alive_duration; - - PopWorkerCallback callback; + const rpc::Language language_; + const rpc::WorkerType worker_type_; + const JobID job_id_; // can be Nil + const ActorID root_detached_actor_id_; // can be Nil + const std::optional is_gpu_; + const std::optional is_actor_worker_; + const rpc::RuntimeEnvInfo runtime_env_info_; + const int runtime_env_hash_; + const std::vector dynamic_options_; + std::optional worker_startup_keep_alive_duration_; + + PopWorkerCallback callback_; PopWorkerRequest(rpc::Language lang, rpc::WorkerType worker_type, @@ -109,18 +109,17 @@ struct PopWorkerRequest { std::vector options, std::optional worker_startup_keep_alive_duration, PopWorkerCallback callback) - : language(lang), - worker_type(worker_type), - job_id(job), - root_detached_actor_id(root_actor_id), - is_gpu(gpu), - is_actor_worker(actor_worker), - runtime_env_info(std::move(runtime_env_info)), - // this-> is needed to disambiguate the member variable from the ctor arg. 
- runtime_env_hash(runtime_env_hash), - dynamic_options(std::move(options)), - worker_startup_keep_alive_duration(worker_startup_keep_alive_duration), - callback(std::move(callback)) {} + : language_(lang), + worker_type_(worker_type), + job_id_(job), + root_detached_actor_id_(root_actor_id), + is_gpu_(gpu), + is_actor_worker_(actor_worker), + runtime_env_info_(std::move(runtime_env_info)), + runtime_env_hash_(runtime_env_hash), + dynamic_options_(std::move(options)), + worker_startup_keep_alive_duration_(worker_startup_keep_alive_duration), + callback_(std::move(callback)) {} }; /// \class IOWorkerPoolInterface @@ -154,7 +153,7 @@ class WorkerPoolInterface : public IOWorkerPoolInterface { /// Pop an idle worker from the pool. The caller is responsible for pushing /// the worker back onto the pool once the worker has completed its work. /// - /// \param task_spec The returned worker must be able to execute this task. + /// \param lease_spec The returned worker must be able to execute this lease. /// \param callback The callback function that executed when gets the result of /// worker popping. /// The callback will be executed with an empty worker in following cases: @@ -168,7 +167,7 @@ class WorkerPoolInterface : public IOWorkerPoolInterface { /// Case 1: An suitable worker was found in idle worker pool. /// Case 2: An suitable worker registered to raylet. /// The corresponding PopWorkerStatus will be passed to the callback. - virtual void PopWorker(const TaskSpecification &task_spec, + virtual void PopWorker(const LeaseSpecification &lease_spec, const PopWorkerCallback &callback) = 0; /// Add an idle worker to the pool. 
/// @@ -238,7 +237,7 @@ class WorkerPoolInterface : public IOWorkerPoolInterface { virtual void DisconnectDriver(const std::shared_ptr &driver) = 0; - virtual void PrestartWorkers(const TaskSpecification &task_spec, + virtual void PrestartWorkers(const LeaseSpecification &lease_spec, int64_t backlog_size) = 0; virtual void StartNewWorker( @@ -250,14 +249,14 @@ class WorkerPoolInterface : public IOWorkerPoolInterface { class WorkerInterface; class Worker; -enum class WorkerUnfitForTaskReason { +enum class WorkerUnfitForLeaseReason { NONE = 0, // OK ROOT_MISMATCH = 1, // job ID or root detached actor ID mismatch RUNTIME_ENV_MISMATCH = 2, // runtime env hash mismatch DYNAMIC_OPTIONS_MISMATCH = 3, // dynamic options mismatch OTHERS = 4, // reasons we don't do stats for (e.g. language) }; -static constexpr std::string_view kWorkerUnfitForTaskReasonDebugName[] = { +static constexpr std::string_view kWorkerUnfitForLeaseReasonDebugName[] = { "NONE", "ROOT_MISMATCH", "RUNTIME_ENV_MISMATCH", @@ -266,8 +265,8 @@ static constexpr std::string_view kWorkerUnfitForTaskReasonDebugName[] = { }; inline std::ostream &operator<<(std::ostream &os, - const WorkerUnfitForTaskReason &reason) { - os << kWorkerUnfitForTaskReasonDebugName[static_cast(reason)]; + const WorkerUnfitForLeaseReason &reason) { + os << kWorkerUnfitForLeaseReasonDebugName[static_cast(reason)]; return os; } @@ -305,7 +304,6 @@ class WorkerPool : public WorkerPoolInterface { /// \param ray_debugger_external Ray debugger in workers will be started in a way /// that they are accessible from outside the node. /// \param get_time A callback to get the current time in milliseconds. - /// \param enable_resource_isolation If true, core worker enables resource isolation by /// adding itself into appropriate cgroup. 
WorkerPool(instrumented_io_context &io_service, const NodeID &node_id, @@ -321,8 +319,7 @@ class WorkerPool : public WorkerPoolInterface { std::string native_library_path, std::function starting_worker_timeout_callback, int ray_debugger_external, - std::function get_time, - bool enable_resource_isolation); + std::function get_time); /// Destructor responsible for freeing a set of workers owned by this class. ~WorkerPool() override; @@ -477,19 +474,20 @@ class WorkerPool : public WorkerPoolInterface { void PushWorker(const std::shared_ptr &worker) override; /// See interface. - void PopWorker(const TaskSpecification &task_spec, + void PopWorker(const LeaseSpecification &lease_spec, const PopWorkerCallback &callback) override; - /// Try to prestart a number of workers suitable the given task spec. Prestarting + /// Try to prestart a number of workers suitable for the given lease spec. Prestarting /// is needed since core workers request one lease at a time, if starting is slow, /// then it means it takes a long time to scale up. /// - /// \param task_spec The returned worker must be able to execute this task. - /// \param backlog_size The number of tasks in the client backlog of this shape. + /// \param lease_spec The returned worker must be able to execute this lease. + /// \param backlog_size The number of leases in the client backlog of this shape. /// We aim to prestart 1 worker per CPU, up to the backlog size. - void PrestartWorkers(const TaskSpecification &task_spec, int64_t backlog_size) override; + void PrestartWorkers(const LeaseSpecification &lease_spec, + int64_t backlog_size) override; - void PrestartWorkersInternal(const TaskSpecification &task_spec, int64_t num_needed); + void PrestartWorkersInternal(const LeaseSpecification &lease_spec, int64_t num_needed); /// Return the current size of the worker pool for the requested language. Counts only /// idle workers. 
@@ -534,7 +532,7 @@ class WorkerPool : public WorkerPoolInterface { /// Internal implementation of PopWorker. void PopWorker(std::shared_ptr pop_worker_request); - // Find an idle worker that can serve the task. If found, pop it out and return it. + // Find an idle worker that can serve the lease. If found, pop it out and return it. // Otherwise, return nullptr. std::shared_ptr FindAndPopIdleWorker( const PopWorkerRequest &pop_worker_request); @@ -570,8 +568,8 @@ class WorkerPool : public WorkerPoolInterface { /// \param serialized_runtime_env_context The context of runtime env. /// \param runtime_env_info The raw runtime env info. /// \param worker_startup_keep_alive_duration If set, the worker will be kept alive for - /// this duration even if it's idle. This is only applicable before a task is assigned - /// to the worker. + /// this duration even if it's idle. This is only applicable before a lease is + /// assigned to the worker. /// \return The process that we started and a token. If the token is less than 0, /// we didn't start a process. std::tuple StartWorkerProcess( @@ -639,7 +637,7 @@ class WorkerPool : public WorkerPoolInterface { rpc::RuntimeEnvInfo runtime_env_info; /// The dynamic_options. std::vector dynamic_options; - /// The duration to keep the newly created worker alive before it's assigned a task. + /// The duration to keep the newly created worker alive before it's assigned a lease. std::optional worker_startup_keep_alive_duration; }; @@ -843,9 +841,9 @@ class WorkerPool : public WorkerPoolInterface { /// /// \param[in] worker The worker. /// \param[in] pop_worker_request The pop worker request. - /// \return WorkerUnfitForTaskReason::NONE if the worker can be used, else a + /// \return WorkerUnfitForLeaseReason::NONE if the worker can be used, else a /// status indicating why it cannot. 
- WorkerUnfitForTaskReason WorkerFitsForTask( + WorkerUnfitForLeaseReason WorkerFitForLease( const WorkerInterface &worker, const PopWorkerRequest &pop_worker_request) const; /// For Process class for managing subprocesses (e.g. reaping zombies). @@ -871,7 +869,7 @@ class WorkerPool : public WorkerPoolInterface { /// The callback that will be triggered once it times out to start a worker. std::function starting_worker_timeout_callback_; /// If 1, expose Ray debuggers started by the workers externally (to this node). - int ray_debugger_external; + int ray_debugger_external_; /// If the first job has already been registered. bool first_job_registered_ = false; @@ -912,9 +910,33 @@ class WorkerPool : public WorkerPoolInterface { int64_t process_failed_pending_registration_ = 0; int64_t process_failed_runtime_env_setup_failed_ = 0; - // If true, core worker enables resource isolation by adding itself into appropriate - // cgroup after it is created. - bool enable_resource_isolation_ = false; + /// Ray metrics + ray::stats::Sum ray_metric_num_workers_started_{ + /*name=*/"internal_num_processes_started", + /*description=*/"The total number of worker processes the worker pool has created.", + /*unit=*/"processes"}; + + ray::stats::Sum ray_metric_num_cached_workers_skipped_job_mismatch_{ + /*name=*/"internal_num_processes_skipped_job_mismatch", + /*description=*/"The total number of cached workers skipped due to job mismatch.", + /*unit=*/"workers"}; + + ray::stats::Sum ray_metric_num_cached_workers_skipped_runtime_environment_mismatch_{ + /*name=*/"internal_num_processes_skipped_runtime_environment_mismatch", + /*description=*/ + "The total number of cached workers skipped due to runtime environment mismatch.", + /*unit=*/"workers"}; + + ray::stats::Sum ray_metric_num_cached_workers_skipped_dynamic_options_mismatch_{ + /*name=*/"internal_num_processes_skipped_dynamic_options_mismatch", + /*description=*/ + "The total number of cached workers skipped due to dynamic 
options mismatch.", + /*unit=*/"workers"}; + + ray::stats::Sum ray_metric_num_workers_started_from_cache_{ + /*name=*/"internal_num_processes_started_from_cache", + /*description=*/"The total number of workers started from a cached worker process.", + /*unit=*/"workers"}; friend class WorkerPoolTest; friend class WorkerPoolDriverRegisteredTest; diff --git a/src/ray/raylet_client/BUILD.bazel b/src/ray/raylet_client/BUILD.bazel deleted file mode 100644 index 9dc98c643617..000000000000 --- a/src/ray/raylet_client/BUILD.bazel +++ /dev/null @@ -1,20 +0,0 @@ -load("//bazel:ray.bzl", "ray_cc_library") - -exports_files([ - "raylet_client.h", -]) - -ray_cc_library( - name = "raylet_client_lib", - srcs = ["raylet_client.cc"], - hdrs = ["raylet_client.h"], - deps = [ - "//src/ray/common:id", - "//src/ray/common:status", - "//src/ray/common:task_common", - "//src/ray/flatbuffers:node_manager_generated", - "//src/ray/protobuf:common_cc_proto", - "//src/ray/rpc:node_manager_client", - "//src/ray/util:logging", - ], -) diff --git a/src/ray/rpc/BUILD.bazel b/src/ray/rpc/BUILD.bazel index 776a24e80abd..2a83b60815e8 100644 --- a/src/ray/rpc/BUILD.bazel +++ b/src/ray/rpc/BUILD.bazel @@ -51,6 +51,7 @@ ray_cc_library( ray_cc_library( name = "metrics_agent_client", + srcs = ["metrics_agent_client.cc"], hdrs = ["metrics_agent_client.h"], visibility = ["//visibility:public"], deps = [ @@ -122,34 +123,6 @@ ray_cc_library( ], ) -ray_cc_library( - name = "node_manager_client", - srcs = ["node_manager/raylet_client_pool.cc"], - hdrs = [ - "node_manager/node_manager_client.h", - "node_manager/node_manager_server.h", - "node_manager/raylet_client_pool.h", - ] + [ - # TODO(eoakes): these are needed due to a circular dependency: - # raylet_client_pool.cc -> raylet_client.h -> node_manager_client.h - "//src/ray/raylet_client:raylet_client.h", - ], - visibility = ["//visibility:public"], - deps = [ - ":client_call", - ":grpc_client", - "//src/ray/common:id", - 
"//src/ray/gcs/gcs_client:gcs_client_lib", - "//src/ray/protobuf:node_manager_cc_grpc", - "//src/ray/util:network_util", - ] + [ - # TODO(eoakes): these three come from raylet_client.h, remove after breaking the circular dependency. - "//src/ray/ipc:client_connection", - "//src/ray/common:ray_object", - "//src/ray/common:task_common", - ], -) - ray_cc_library( name = "node_manager_server", hdrs = [ @@ -165,70 +138,43 @@ ray_cc_library( ) ray_cc_library( - name = "gcs_client", + name = "raylet_client_interface", hdrs = [ - "gcs/gcs_rpc_client.h", + "raylet/raylet_client_interface.h", ], visibility = ["//visibility:public"], deps = [ - ":client_call", - ":retryable_grpc_client", - "//src/ray/common:ray_config", - "//src/ray/protobuf:autoscaler_cc_grpc", - "//src/ray/protobuf:gcs_service_cc_grpc", - "//src/ray/util:network_util", - "@com_google_absl//absl/container:btree", + "//src/ray/protobuf:autoscaler_cc_proto", + "//src/ray/protobuf:common_cc_proto", + "//src/ray/protobuf:node_manager_cc_proto", ], ) ray_cc_library( - name = "gcs_server", + name = "raylet_client_pool", + srcs = ["raylet/raylet_client_pool.cc"], hdrs = [ - "gcs/gcs_rpc_server.h", + "raylet/raylet_client_pool.h", ], visibility = ["//visibility:public"], deps = [ - ":grpc_server", - ":server_call", - "//src/ray/common:asio", - "//src/ray/common:id", - "//src/ray/common:ray_config", - "//src/ray/protobuf:autoscaler_cc_grpc", - "//src/ray/protobuf:events_event_aggregator_service_cc_grpc", - "//src/ray/protobuf:gcs_service_cc_grpc", - "@boost//:asio", - "@com_github_grpc_grpc//:grpc++", + ":raylet_client_interface", + "//src/ray/gcs_client", ], ) ray_cc_library( - name = "object_manager_client", - hdrs = [ - "object_manager/object_manager_client.h", - ], + name = "raylet_client_lib", + srcs = ["raylet/raylet_client.cc"], + hdrs = ["raylet/raylet_client.h"], visibility = ["//visibility:public"], deps = [ - "//src/ray/object_manager:object_manager_grpc_client_manager", - 
"//src/ray/protobuf:object_manager_cc_grpc", + ":raylet_client_interface", + ":retryable_grpc_client", + "//src/ray/common:bundle_spec", + "//src/ray/common:ray_config", + "//src/ray/protobuf:node_manager_cc_grpc", "//src/ray/util:logging", - "@com_github_grpc_grpc//:grpc++", - ], -) - -ray_cc_library( - name = "object_manager_server", - hdrs = [ - "object_manager/object_manager_server.h", - ], - visibility = ["//visibility:public"], - deps = [ - ":grpc_server", - ":server_call", - "//src/ray/common:asio", - "//src/ray/object_manager:object_manager_grpc_client_manager", - "//src/ray/protobuf:object_manager_cc_grpc", - "@boost//:asio", - "@com_github_grpc_grpc//:grpc++", ], ) @@ -241,31 +187,18 @@ ray_cc_library( hdrs = [ "worker/core_worker_client.h", "worker/core_worker_client_pool.h", - "worker/core_worker_server.h", ], deps = [ + ":raylet_client_interface", + ":raylet_client_pool", "//src/ray/common:id", "//src/ray/common:status", - "//src/ray/gcs/gcs_client:gcs_client_lib", + "//src/ray/gcs_client", "//src/ray/protobuf:core_worker_cc_grpc", "//src/ray/pubsub:subscriber", - "//src/ray/raylet_client:raylet_client_lib", "//src/ray/util:logging", "//src/ray/util:network_util", "@com_github_grpc_grpc//:grpc++", "@com_google_absl//absl/synchronization", ], ) - -ray_cc_library( - name = "core_worker_server", - hdrs = [ - "worker/core_worker_server.h", - ], - deps = [ - ":grpc_server", - ":server_call", - "//src/ray/common:asio", - "//src/ray/protobuf:core_worker_cc_grpc", - ], -) diff --git a/src/ray/rpc/client_call.h b/src/ray/rpc/client_call.h index e9197e6466d3..dd5c794764c9 100644 --- a/src/ray/rpc/client_call.h +++ b/src/ray/rpc/client_call.h @@ -206,6 +206,10 @@ class ClientCallManager { /// /// \param[in] main_service The main event loop, to which the callback functions will be /// posted. 
+ /// \param record_stats Whether to record stats for calls made with this client + /// \param cluster_id UUID of the destination cluster + /// \param num_threads The number of threads used for polling for completion events + /// \param call_timeout_ms Sets the default call timeout for requests on this client /// explicit ClientCallManager(instrumented_io_context &main_service, bool record_stats, diff --git a/src/ray/rpc/event_aggregator_client.h b/src/ray/rpc/event_aggregator_client.h index b8b0f2fe3dd1..83746f02faa7 100644 --- a/src/ray/rpc/event_aggregator_client.h +++ b/src/ray/rpc/event_aggregator_client.h @@ -20,7 +20,6 @@ #include #include -#include "ray/common/status.h" #include "ray/rpc/grpc_client.h" #include "ray/util/logging.h" #include "src/ray/protobuf/events_event_aggregator_service.grpc.pb.h" diff --git a/src/ray/rpc/gcs/gcs_rpc_server.h b/src/ray/rpc/gcs/gcs_rpc_server.h deleted file mode 100644 index e91b8161a8af..000000000000 --- a/src/ray/rpc/gcs/gcs_rpc_server.h +++ /dev/null @@ -1,781 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include - -#include "ray/common/asio/instrumented_io_context.h" -#include "ray/common/id.h" -#include "ray/rpc/grpc_server.h" -#include "ray/rpc/server_call.h" -#include "src/ray/protobuf/autoscaler.grpc.pb.h" -#include "src/ray/protobuf/events_event_aggregator_service.pb.h" -#include "src/ray/protobuf/gcs_service.grpc.pb.h" - -namespace ray { -namespace rpc { -// Most of our RPC templates, if not all, expect messages in the ray::rpc protobuf -// namespace. Since the following two messages are defined under the rpc::events -// namespace, we treat them as if they were part of ray::rpc for compatibility. -using ray::rpc::events::AddEventsReply; -using ray::rpc::events::AddEventsRequest; -namespace autoscaler { - -#define AUTOSCALER_STATE_SERVICE_RPC_HANDLER(HANDLER) \ - RPC_SERVICE_HANDLER(AutoscalerStateService, \ - HANDLER, \ - RayConfig::instance().gcs_max_active_rpcs_per_handler()) - -class AutoscalerStateServiceHandler { - public: - virtual ~AutoscalerStateServiceHandler() = default; - - virtual void HandleGetClusterResourceState(GetClusterResourceStateRequest request, - GetClusterResourceStateReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleReportAutoscalingState(ReportAutoscalingStateRequest request, - ReportAutoscalingStateReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleRequestClusterResourceConstraint( - RequestClusterResourceConstraintRequest request, - RequestClusterResourceConstraintReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetClusterStatus(GetClusterStatusRequest request, - GetClusterStatusReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleDrainNode(DrainNodeRequest request, - DrainNodeReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleReportClusterConfig(ReportClusterConfigRequest request, - ReportClusterConfigReply *reply, - SendReplyCallback 
send_reply_callback) = 0; -}; - -/// The `GrpcService` for `AutoscalerStateService`. -class AutoscalerStateGrpcService : public GrpcService { - public: - /// Constructor. - /// - /// \param[in] handler The service handler that actually handle the requests. - explicit AutoscalerStateGrpcService(instrumented_io_context &io_service, - AutoscalerStateServiceHandler &handler) - : GrpcService(io_service), service_handler_(handler){}; - - protected: - grpc::Service &GetGrpcService() override { return service_; } - void InitServerCallFactories( - const std::unique_ptr &cq, - std::vector> *server_call_factories, - const ClusterID &cluster_id) override { - AUTOSCALER_STATE_SERVICE_RPC_HANDLER(GetClusterResourceState); - AUTOSCALER_STATE_SERVICE_RPC_HANDLER(ReportAutoscalingState); - AUTOSCALER_STATE_SERVICE_RPC_HANDLER(ReportClusterConfig); - AUTOSCALER_STATE_SERVICE_RPC_HANDLER(RequestClusterResourceConstraint); - AUTOSCALER_STATE_SERVICE_RPC_HANDLER(GetClusterStatus); - AUTOSCALER_STATE_SERVICE_RPC_HANDLER(DrainNode); - } - - private: - /// The grpc async service object. - AutoscalerStateService::AsyncService service_; - /// The service handler that actually handle the requests. 
- AutoscalerStateServiceHandler &service_handler_; -}; - -using AutoscalerStateHandler = AutoscalerStateServiceHandler; - -} // namespace autoscaler -} // namespace rpc -} // namespace ray - -namespace ray { -namespace rpc { - -#define JOB_INFO_SERVICE_RPC_HANDLER(HANDLER) \ - RPC_SERVICE_HANDLER(JobInfoGcsService, \ - HANDLER, \ - RayConfig::instance().gcs_max_active_rpcs_per_handler()) - -#define ACTOR_INFO_SERVICE_RPC_HANDLER(HANDLER, MAX_ACTIVE_RPCS) \ - RPC_SERVICE_HANDLER(ActorInfoGcsService, HANDLER, MAX_ACTIVE_RPCS) - -#define MONITOR_SERVICE_RPC_HANDLER(HANDLER) \ - RPC_SERVICE_HANDLER(MonitorGcsService, \ - HANDLER, \ - RayConfig::instance().gcs_max_active_rpcs_per_handler()) - -#define NODE_INFO_SERVICE_RPC_HANDLER(HANDLER) \ - RPC_SERVICE_HANDLER(NodeInfoGcsService, \ - HANDLER, \ - RayConfig::instance().gcs_max_active_rpcs_per_handler()) - -#define TASK_INFO_SERVICE_RPC_HANDLER(HANDLER) \ - RPC_SERVICE_HANDLER(TaskInfoGcsService, \ - HANDLER, \ - RayConfig::instance().gcs_max_active_rpcs_per_handler()) - -#define RAY_EVENT_EXPORT_SERVICE_RPC_HANDLER(HANDLER) \ - RPC_SERVICE_HANDLER(RayEventExportGcsService, \ - HANDLER, \ - RayConfig::instance().gcs_max_active_rpcs_per_handler()) - -#define NODE_RESOURCE_INFO_SERVICE_RPC_HANDLER(HANDLER) \ - RPC_SERVICE_HANDLER(NodeResourceInfoGcsService, \ - HANDLER, \ - RayConfig::instance().gcs_max_active_rpcs_per_handler()) - -#define OBJECT_INFO_SERVICE_RPC_HANDLER(HANDLER) \ - RPC_SERVICE_HANDLER(ObjectInfoGcsService, \ - HANDLER, \ - RayConfig::instance().gcs_max_active_rpcs_per_handler()) - -#define WORKER_INFO_SERVICE_RPC_HANDLER(HANDLER) \ - RPC_SERVICE_HANDLER(WorkerInfoGcsService, \ - HANDLER, \ - RayConfig::instance().gcs_max_active_rpcs_per_handler()) - -#define PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(HANDLER) \ - RPC_SERVICE_HANDLER(PlacementGroupInfoGcsService, \ - HANDLER, \ - RayConfig::instance().gcs_max_active_rpcs_per_handler()) - -#define INTERNAL_KV_SERVICE_RPC_HANDLER(HANDLER) \ - 
RPC_SERVICE_HANDLER(InternalKVGcsService, HANDLER, -1) - -#define RUNTIME_ENV_SERVICE_RPC_HANDLER(HANDLER) \ - RPC_SERVICE_HANDLER(RuntimeEnvGcsService, HANDLER, -1) - -// Unlimited max active RPCs, because of long poll. -#define INTERNAL_PUBSUB_SERVICE_RPC_HANDLER(HANDLER) \ - RPC_SERVICE_HANDLER(InternalPubSubGcsService, HANDLER, -1) - -#define GCS_RPC_SEND_REPLY(send_reply_callback, reply, status) \ - reply->mutable_status()->set_code(static_cast(status.code())); \ - reply->mutable_status()->set_message(status.message()); \ - send_reply_callback(ray::Status::OK(), nullptr, nullptr) - -class JobInfoGcsServiceHandler { - public: - using JobFinishListenerCallback = std::function; - - virtual ~JobInfoGcsServiceHandler() = default; - - virtual void HandleAddJob(AddJobRequest request, - AddJobReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleMarkJobFinished(MarkJobFinishedRequest request, - MarkJobFinishedReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetAllJobInfo(GetAllJobInfoRequest request, - GetAllJobInfoReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void AddJobFinishedListener(JobFinishListenerCallback listener) = 0; - - virtual void HandleReportJobError(ReportJobErrorRequest request, - ReportJobErrorReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetNextJobID(GetNextJobIDRequest request, - GetNextJobIDReply *reply, - SendReplyCallback send_reply_callback) = 0; -}; - -/// The `GrpcService` for `JobInfoGcsService`. -class JobInfoGrpcService : public GrpcService { - public: - /// Constructor. - /// - /// \param[in] handler The service handler that actually handle the requests. 
- explicit JobInfoGrpcService(instrumented_io_context &io_service, - JobInfoGcsServiceHandler &handler) - : GrpcService(io_service), service_handler_(handler){}; - - protected: - grpc::Service &GetGrpcService() override { return service_; } - - void InitServerCallFactories( - const std::unique_ptr &cq, - std::vector> *server_call_factories, - const ClusterID &cluster_id) override { - JOB_INFO_SERVICE_RPC_HANDLER(AddJob); - JOB_INFO_SERVICE_RPC_HANDLER(MarkJobFinished); - JOB_INFO_SERVICE_RPC_HANDLER(GetAllJobInfo); - JOB_INFO_SERVICE_RPC_HANDLER(ReportJobError); - JOB_INFO_SERVICE_RPC_HANDLER(GetNextJobID); - } - - private: - /// The grpc async service object. - JobInfoGcsService::AsyncService service_; - /// The service handler that actually handle the requests. - JobInfoGcsServiceHandler &service_handler_; -}; - -class ActorInfoGcsServiceHandler { - public: - virtual ~ActorInfoGcsServiceHandler() = default; - - virtual void HandleRegisterActor(RegisterActorRequest request, - RegisterActorReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleRestartActorForLineageReconstruction( - RestartActorForLineageReconstructionRequest request, - RestartActorForLineageReconstructionReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleCreateActor(CreateActorRequest request, - CreateActorReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetActorInfo(GetActorInfoRequest request, - GetActorInfoReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetNamedActorInfo(GetNamedActorInfoRequest request, - GetNamedActorInfoReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleListNamedActors(rpc::ListNamedActorsRequest request, - rpc::ListNamedActorsReply *reply, - rpc::SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetAllActorInfo(GetAllActorInfoRequest request, - GetAllActorInfoReply *reply, - SendReplyCallback 
send_reply_callback) = 0; - - virtual void HandleKillActorViaGcs(KillActorViaGcsRequest request, - KillActorViaGcsReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleReportActorOutOfScope(ReportActorOutOfScopeRequest request, - ReportActorOutOfScopeReply *reply, - SendReplyCallback send_reply_callback) = 0; -}; - -/// The `GrpcService` for `ActorInfoGcsService`. -class ActorInfoGrpcService : public GrpcService { - public: - /// Constructor. - /// - /// \param[in] handler The service handler that actually handle the requests. - explicit ActorInfoGrpcService(instrumented_io_context &io_service, - ActorInfoGcsServiceHandler &handler) - : GrpcService(io_service), service_handler_(handler){}; - - protected: - grpc::Service &GetGrpcService() override { return service_; } - - void InitServerCallFactories( - const std::unique_ptr &cq, - std::vector> *server_call_factories, - const ClusterID &cluster_id) override { - /// Register/Create Actor RPC takes long time, we shouldn't limit them to avoid - /// distributed deadlock. - ACTOR_INFO_SERVICE_RPC_HANDLER(RegisterActor, -1); - ACTOR_INFO_SERVICE_RPC_HANDLER(RestartActorForLineageReconstruction, -1); - ACTOR_INFO_SERVICE_RPC_HANDLER(CreateActor, -1); - - /// Others need back pressure. 
- ACTOR_INFO_SERVICE_RPC_HANDLER( - GetActorInfo, RayConfig::instance().gcs_max_active_rpcs_per_handler()); - ACTOR_INFO_SERVICE_RPC_HANDLER( - GetNamedActorInfo, RayConfig::instance().gcs_max_active_rpcs_per_handler()); - ACTOR_INFO_SERVICE_RPC_HANDLER( - ListNamedActors, RayConfig::instance().gcs_max_active_rpcs_per_handler()); - ACTOR_INFO_SERVICE_RPC_HANDLER( - GetAllActorInfo, RayConfig::instance().gcs_max_active_rpcs_per_handler()); - ACTOR_INFO_SERVICE_RPC_HANDLER( - KillActorViaGcs, RayConfig::instance().gcs_max_active_rpcs_per_handler()); - ACTOR_INFO_SERVICE_RPC_HANDLER( - ReportActorOutOfScope, RayConfig::instance().gcs_max_active_rpcs_per_handler()); - } - - private: - /// The grpc async service object. - ActorInfoGcsService::AsyncService service_; - /// The service handler that actually handle the requests. - ActorInfoGcsServiceHandler &service_handler_; -}; - -class NodeInfoGcsServiceHandler { - public: - virtual ~NodeInfoGcsServiceHandler() = default; - - virtual void HandleGetClusterId(rpc::GetClusterIdRequest request, - rpc::GetClusterIdReply *reply, - rpc::SendReplyCallback send_reply_callback) = 0; - - virtual void HandleRegisterNode(RegisterNodeRequest request, - RegisterNodeReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleUnregisterNode(UnregisterNodeRequest request, - UnregisterNodeReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleCheckAlive(CheckAliveRequest request, - CheckAliveReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleDrainNode(DrainNodeRequest request, - DrainNodeReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetAllNodeInfo(GetAllNodeInfoRequest request, - GetAllNodeInfoReply *reply, - SendReplyCallback send_reply_callback) = 0; -}; - -/// The `GrpcService` for `NodeInfoGcsService`. -class NodeInfoGrpcService : public GrpcService { - public: - /// Constructor. 
- /// - /// \param[in] handler The service handler that actually handle the requests. - explicit NodeInfoGrpcService(instrumented_io_context &io_service, - NodeInfoGcsServiceHandler &handler) - : GrpcService(io_service), service_handler_(handler){}; - - protected: - grpc::Service &GetGrpcService() override { return service_; } - - void InitServerCallFactories( - const std::unique_ptr &cq, - std::vector> *server_call_factories, - const ClusterID &cluster_id) override { - // We only allow one cluster ID in the lifetime of a client. - // So, if a client connects, it should not have a pre-existing different ID. - RPC_SERVICE_HANDLER_CUSTOM_AUTH( - NodeInfoGcsService, - GetClusterId, - RayConfig::instance().gcs_max_active_rpcs_per_handler(), - AuthType::EMPTY_AUTH); - NODE_INFO_SERVICE_RPC_HANDLER(RegisterNode); - NODE_INFO_SERVICE_RPC_HANDLER(UnregisterNode); - NODE_INFO_SERVICE_RPC_HANDLER(DrainNode); - NODE_INFO_SERVICE_RPC_HANDLER(GetAllNodeInfo); - NODE_INFO_SERVICE_RPC_HANDLER(CheckAlive); - } - - private: - /// The grpc async service object. - NodeInfoGcsService::AsyncService service_; - /// The service handler that actually handle the requests. 
- NodeInfoGcsServiceHandler &service_handler_; -}; - -class NodeResourceInfoGcsServiceHandler { - public: - virtual ~NodeResourceInfoGcsServiceHandler() = default; - - virtual void HandleGetAllAvailableResources( - rpc::GetAllAvailableResourcesRequest request, - rpc::GetAllAvailableResourcesReply *reply, - rpc::SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetAllTotalResources(rpc::GetAllTotalResourcesRequest request, - rpc::GetAllTotalResourcesReply *reply, - rpc::SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetDrainingNodes(rpc::GetDrainingNodesRequest request, - rpc::GetDrainingNodesReply *reply, - rpc::SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetAllResourceUsage(GetAllResourceUsageRequest request, - GetAllResourceUsageReply *reply, - SendReplyCallback send_reply_callback) = 0; -}; - -/// The `GrpcService` for `NodeResourceInfoGcsService`. -class NodeResourceInfoGrpcService : public GrpcService { - public: - /// Constructor. - /// - /// \param[in] handler The service handler that actually handle the requests. - explicit NodeResourceInfoGrpcService(instrumented_io_context &io_service, - NodeResourceInfoGcsServiceHandler &handler) - : GrpcService(io_service), service_handler_(handler){}; - - protected: - grpc::Service &GetGrpcService() override { return service_; } - - void InitServerCallFactories( - const std::unique_ptr &cq, - std::vector> *server_call_factories, - const ClusterID &cluster_id) override { - NODE_RESOURCE_INFO_SERVICE_RPC_HANDLER(GetAllAvailableResources); - NODE_RESOURCE_INFO_SERVICE_RPC_HANDLER(GetAllTotalResources); - NODE_RESOURCE_INFO_SERVICE_RPC_HANDLER(GetDrainingNodes); - NODE_RESOURCE_INFO_SERVICE_RPC_HANDLER(GetAllResourceUsage); - } - - private: - /// The grpc async service object. - NodeResourceInfoGcsService::AsyncService service_; - /// The service handler that actually handle the requests. 
- NodeResourceInfoGcsServiceHandler &service_handler_; -}; - -class WorkerInfoGcsServiceHandler { - public: - virtual ~WorkerInfoGcsServiceHandler() = default; - - virtual void HandleReportWorkerFailure(ReportWorkerFailureRequest request, - ReportWorkerFailureReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetWorkerInfo(GetWorkerInfoRequest request, - GetWorkerInfoReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetAllWorkerInfo(GetAllWorkerInfoRequest request, - GetAllWorkerInfoReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleAddWorkerInfo(AddWorkerInfoRequest request, - AddWorkerInfoReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleUpdateWorkerDebuggerPort(UpdateWorkerDebuggerPortRequest request, - UpdateWorkerDebuggerPortReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleUpdateWorkerNumPausedThreads( - UpdateWorkerNumPausedThreadsRequest request, - UpdateWorkerNumPausedThreadsReply *reply, - SendReplyCallback send_reply_callback) = 0; -}; - -/// The `GrpcService` for `WorkerInfoGcsService`. -class WorkerInfoGrpcService : public GrpcService { - public: - /// Constructor. - /// - /// \param[in] handler The service handler that actually handle the requests. 
- explicit WorkerInfoGrpcService(instrumented_io_context &io_service, - WorkerInfoGcsServiceHandler &handler) - : GrpcService(io_service), service_handler_(handler){}; - - protected: - grpc::Service &GetGrpcService() override { return service_; } - - void InitServerCallFactories( - const std::unique_ptr &cq, - std::vector> *server_call_factories, - const ClusterID &cluster_id) override { - WORKER_INFO_SERVICE_RPC_HANDLER(ReportWorkerFailure); - WORKER_INFO_SERVICE_RPC_HANDLER(GetWorkerInfo); - WORKER_INFO_SERVICE_RPC_HANDLER(GetAllWorkerInfo); - WORKER_INFO_SERVICE_RPC_HANDLER(AddWorkerInfo); - WORKER_INFO_SERVICE_RPC_HANDLER(UpdateWorkerDebuggerPort); - WORKER_INFO_SERVICE_RPC_HANDLER(UpdateWorkerNumPausedThreads); - } - - private: - /// The grpc async service object. - WorkerInfoGcsService::AsyncService service_; - /// The service handler that actually handle the requests. - WorkerInfoGcsServiceHandler &service_handler_; -}; - -class PlacementGroupInfoGcsServiceHandler { - public: - virtual ~PlacementGroupInfoGcsServiceHandler() = default; - - virtual void HandleCreatePlacementGroup(CreatePlacementGroupRequest request, - CreatePlacementGroupReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleRemovePlacementGroup(RemovePlacementGroupRequest request, - RemovePlacementGroupReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetPlacementGroup(GetPlacementGroupRequest request, - GetPlacementGroupReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetAllPlacementGroup(GetAllPlacementGroupRequest request, - GetAllPlacementGroupReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleWaitPlacementGroupUntilReady( - WaitPlacementGroupUntilReadyRequest request, - WaitPlacementGroupUntilReadyReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetNamedPlacementGroup(GetNamedPlacementGroupRequest request, - 
GetNamedPlacementGroupReply *reply, - SendReplyCallback send_reply_callback) = 0; -}; - -/// The `GrpcService` for `PlacementGroupInfoGcsService`. -class PlacementGroupInfoGrpcService : public GrpcService { - public: - /// Constructor. - /// - /// \param[in] handler The service handler that actually handle the requests. - explicit PlacementGroupInfoGrpcService(instrumented_io_context &io_service, - PlacementGroupInfoGcsServiceHandler &handler) - : GrpcService(io_service), service_handler_(handler) {} - - protected: - grpc::Service &GetGrpcService() override { return service_; } - - void InitServerCallFactories( - const std::unique_ptr &cq, - std::vector> *server_call_factories, - const ClusterID &cluster_id) override { - PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(CreatePlacementGroup); - PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(RemovePlacementGroup); - PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(GetPlacementGroup); - PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(GetNamedPlacementGroup); - PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(GetAllPlacementGroup); - PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(WaitPlacementGroupUntilReady); - } - - private: - /// The grpc async service object. - PlacementGroupInfoGcsService::AsyncService service_; - /// The service handler that actually handle the requests. 
- PlacementGroupInfoGcsServiceHandler &service_handler_; -}; - -class InternalKVGcsServiceHandler { - public: - virtual ~InternalKVGcsServiceHandler() = default; - virtual void HandleInternalKVKeys(InternalKVKeysRequest request, - InternalKVKeysReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleInternalKVGet(InternalKVGetRequest request, - InternalKVGetReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleInternalKVMultiGet(InternalKVMultiGetRequest request, - InternalKVMultiGetReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleInternalKVPut(InternalKVPutRequest request, - InternalKVPutReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleInternalKVDel(InternalKVDelRequest request, - InternalKVDelReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleInternalKVExists(InternalKVExistsRequest request, - InternalKVExistsReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetInternalConfig(GetInternalConfigRequest request, - GetInternalConfigReply *reply, - SendReplyCallback send_reply_callback) = 0; -}; - -class InternalKVGrpcService : public GrpcService { - public: - explicit InternalKVGrpcService(instrumented_io_context &io_service, - InternalKVGcsServiceHandler &handler) - : GrpcService(io_service), service_handler_(handler) {} - - protected: - grpc::Service &GetGrpcService() override { return service_; } - void InitServerCallFactories( - const std::unique_ptr &cq, - std::vector> *server_call_factories, - const ClusterID &cluster_id) override { - INTERNAL_KV_SERVICE_RPC_HANDLER(InternalKVGet); - INTERNAL_KV_SERVICE_RPC_HANDLER(InternalKVMultiGet); - INTERNAL_KV_SERVICE_RPC_HANDLER(InternalKVPut); - INTERNAL_KV_SERVICE_RPC_HANDLER(InternalKVDel); - INTERNAL_KV_SERVICE_RPC_HANDLER(InternalKVExists); - INTERNAL_KV_SERVICE_RPC_HANDLER(InternalKVKeys); - 
INTERNAL_KV_SERVICE_RPC_HANDLER(GetInternalConfig); - } - - private: - InternalKVGcsService::AsyncService service_; - InternalKVGcsServiceHandler &service_handler_; -}; - -class RuntimeEnvGcsServiceHandler { - public: - virtual ~RuntimeEnvGcsServiceHandler() = default; - virtual void HandlePinRuntimeEnvURI(PinRuntimeEnvURIRequest request, - PinRuntimeEnvURIReply *reply, - SendReplyCallback send_reply_callback) = 0; -}; - -class RuntimeEnvGrpcService : public GrpcService { - public: - explicit RuntimeEnvGrpcService(instrumented_io_context &io_service, - RuntimeEnvGcsServiceHandler &handler) - : GrpcService(io_service), service_handler_(handler) {} - - protected: - grpc::Service &GetGrpcService() override { return service_; } - void InitServerCallFactories( - const std::unique_ptr &cq, - std::vector> *server_call_factories, - const ClusterID &cluster_id) override { - RUNTIME_ENV_SERVICE_RPC_HANDLER(PinRuntimeEnvURI); - } - - private: - RuntimeEnvGcsService::AsyncService service_; - RuntimeEnvGcsServiceHandler &service_handler_; -}; - -class TaskInfoGcsServiceHandler { - public: - virtual ~TaskInfoGcsServiceHandler() = default; - - virtual void HandleAddTaskEventData(AddTaskEventDataRequest request, - AddTaskEventDataReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetTaskEvents(GetTaskEventsRequest request, - GetTaskEventsReply *reply, - SendReplyCallback send_reply_callback) = 0; -}; - -/// The `GrpcService` for `TaskInfoGcsService`. -class TaskInfoGrpcService : public GrpcService { - public: - /// Constructor. - /// - /// \param[in] io_service IO service to run the handler. - /// \param[in] handler The service handler that actually handle the requests. 
- explicit TaskInfoGrpcService(instrumented_io_context &io_service, - TaskInfoGcsServiceHandler &handler) - : GrpcService(io_service), service_handler_(handler){}; - - protected: - grpc::Service &GetGrpcService() override { return service_; } - - void InitServerCallFactories( - const std::unique_ptr &cq, - std::vector> *server_call_factories, - const ClusterID &cluster_id) override { - TASK_INFO_SERVICE_RPC_HANDLER(AddTaskEventData); - TASK_INFO_SERVICE_RPC_HANDLER(GetTaskEvents); - } - - private: - /// The grpc async service object. - TaskInfoGcsService::AsyncService service_; - /// The service handler that actually handle the requests. - TaskInfoGcsServiceHandler &service_handler_; -}; - -class RayEventExportGcsServiceHandler { - public: - virtual ~RayEventExportGcsServiceHandler() = default; - virtual void HandleAddEvents(AddEventsRequest request, - AddEventsReply *reply, - SendReplyCallback send_reply_callback) = 0; -}; - -/// The `GrpcService` for `RayEventExportGcsService`. -class RayEventExportGrpcService : public GrpcService { - public: - explicit RayEventExportGrpcService(instrumented_io_context &io_service, - RayEventExportGcsServiceHandler &handler) - : GrpcService(io_service), service_handler_(handler) {} - - protected: - grpc::Service &GetGrpcService() override { return service_; } - void InitServerCallFactories( - const std::unique_ptr &cq, - std::vector> *server_call_factories, - const ClusterID &cluster_id) override { - RAY_EVENT_EXPORT_SERVICE_RPC_HANDLER(AddEvents); - } - - private: - /// The grpc async service object. - RayEventExportGcsService::AsyncService service_; - /// The service handler that actually handle the requests. 
- RayEventExportGcsServiceHandler &service_handler_; -}; - -class InternalPubSubGcsServiceHandler { - public: - virtual ~InternalPubSubGcsServiceHandler() = default; - - virtual void HandleGcsPublish(GcsPublishRequest request, - GcsPublishReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGcsSubscriberPoll(GcsSubscriberPollRequest request, - GcsSubscriberPollReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGcsSubscriberCommandBatch(GcsSubscriberCommandBatchRequest request, - GcsSubscriberCommandBatchReply *reply, - SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGcsUnregisterSubscriber(GcsUnregisterSubscriberRequest request, - GcsUnregisterSubscriberReply *reply, - SendReplyCallback send_reply_callback) = 0; -}; - -class InternalPubSubGrpcService : public GrpcService { - public: - InternalPubSubGrpcService(instrumented_io_context &io_service, - InternalPubSubGcsServiceHandler &handler) - : GrpcService(io_service), service_handler_(handler) {} - - protected: - grpc::Service &GetGrpcService() override { return service_; } - void InitServerCallFactories( - const std::unique_ptr &cq, - std::vector> *server_call_factories, - const ClusterID &cluster_id) override { - INTERNAL_PUBSUB_SERVICE_RPC_HANDLER(GcsPublish); - INTERNAL_PUBSUB_SERVICE_RPC_HANDLER(GcsSubscriberPoll); - INTERNAL_PUBSUB_SERVICE_RPC_HANDLER(GcsSubscriberCommandBatch); - INTERNAL_PUBSUB_SERVICE_RPC_HANDLER(GcsUnregisterSubscriber); - } - - private: - InternalPubSubGcsService::AsyncService service_; - InternalPubSubGcsServiceHandler &service_handler_; -}; - -using JobInfoHandler = JobInfoGcsServiceHandler; -using ActorInfoHandler = ActorInfoGcsServiceHandler; -using NodeInfoHandler = NodeInfoGcsServiceHandler; -using NodeResourceInfoHandler = NodeResourceInfoGcsServiceHandler; -using WorkerInfoHandler = WorkerInfoGcsServiceHandler; -using PlacementGroupInfoHandler = PlacementGroupInfoGcsServiceHandler; -using 
InternalKVHandler = InternalKVGcsServiceHandler; -using InternalPubSubHandler = InternalPubSubGcsServiceHandler; -using RuntimeEnvHandler = RuntimeEnvGcsServiceHandler; -using TaskInfoHandler = TaskInfoGcsServiceHandler; -using RayEventExportHandler = RayEventExportGcsServiceHandler; - -} // namespace rpc -} // namespace ray diff --git a/src/ray/rpc/grpc_server.h b/src/ray/rpc/grpc_server.h index 686c4a68b2a4..db63eed08020 100644 --- a/src/ray/rpc/grpc_server.h +++ b/src/ray/rpc/grpc_server.h @@ -68,12 +68,6 @@ namespace rpc { SERVICE, HANDLER, MAX_ACTIVE_RPCS, AUTH_TYPE) \ _RPC_SERVICE_HANDLER(SERVICE, HANDLER, MAX_ACTIVE_RPCS, AUTH_TYPE, false) -// Define a void RPC client method. -#define DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(METHOD) \ - virtual void Handle##METHOD(::ray::rpc::METHOD##Request request, \ - ::ray::rpc::METHOD##Reply *reply, \ - ::ray::rpc::SendReplyCallback send_reply_callback) = 0; - class GrpcService; /// Class that represents an gRPC server. @@ -96,13 +90,11 @@ class GrpcServer { GrpcServer(std::string name, const uint32_t port, bool listen_to_localhost_only, - const ClusterID &cluster_id = ClusterID::Nil(), int num_threads = 1, int64_t keepalive_time_ms = 7200000 /*2 hours, grpc default*/) : name_(std::move(name)), port_(port), listen_to_localhost_only_(listen_to_localhost_only), - cluster_id_(ClusterID::Nil()), is_shutdown_(true), num_threads_(num_threads), keepalive_time_ms_(keepalive_time_ms) { diff --git a/src/ray/rpc/metrics_agent_client.cc b/src/ray/rpc/metrics_agent_client.cc new file mode 100644 index 000000000000..29dc579af14c --- /dev/null +++ b/src/ray/rpc/metrics_agent_client.cc @@ -0,0 +1,69 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/rpc/metrics_agent_client.h" + +#include +#include + +#include "ray/util/logging.h" + +namespace ray { +namespace rpc { + +void MetricsAgentClientImpl::WaitForServerReady( + std::function init_exporter_fn) { + WaitForServerReadyWithRetry( + init_exporter_fn, 0, kMetricAgentInitMaxRetries, kMetricAgentInitRetryDelayMs); +} + +void MetricsAgentClientImpl::WaitForServerReadyWithRetry( + std::function init_exporter_fn, + int retry_count, + int max_retry, + int retry_interval_ms) { + if (exporter_initialized_) { + return; + } + + RAY_LOG(INFO) << "Initializing exporter ..."; + HealthCheck(rpc::HealthCheckRequest(), + [this, init_exporter_fn](auto &status, auto &&reply) { + if (status.ok() && !exporter_initialized_) { + init_exporter_fn(status); + exporter_initialized_ = true; + RAY_LOG(INFO) << "Exporter initialized."; + } + }); + if (retry_count >= max_retry) { + init_exporter_fn(Status::RpcError("The metrics agent server is not ready.", 14)); + return; + } + retry_count++; + retry_timer_->expires_after(std::chrono::milliseconds(retry_interval_ms)); + retry_timer_->async_wait( + [this, init_exporter_fn, retry_count, max_retry, retry_interval_ms]( + const boost::system::error_code &error) { + if (!error) { + WaitForServerReadyWithRetry( + init_exporter_fn, retry_count, max_retry, retry_interval_ms); + } else { + RAY_LOG(ERROR) << "Failed to initialize exporter. 
Data will not be exported to " + "the metrics agent."; + } + }); +} + +} // namespace rpc +} // namespace ray diff --git a/src/ray/rpc/metrics_agent_client.h b/src/ray/rpc/metrics_agent_client.h index 9af5cf290c80..c5a6085d23a1 100644 --- a/src/ray/rpc/metrics_agent_client.h +++ b/src/ray/rpc/metrics_agent_client.h @@ -30,6 +30,11 @@ namespace ray { namespace rpc { +/// The maximum number of retries to wait for the server to be ready. +/// This setting allows for 30 seconds of retries. +constexpr int kMetricAgentInitMaxRetries = 30; +constexpr int kMetricAgentInitRetryDelayMs = 1000; + /// Client used for communicating with a remote node manager server. class MetricsAgentClient { public: @@ -40,6 +45,20 @@ class MetricsAgentClient { /// \param[in] request The request message. /// \param[in] callback The callback function that handles reply. VOID_RPC_CLIENT_VIRTUAL_METHOD_DECL(ReporterService, ReportOCMetrics) + + /// Send a health check request to the metrics agent. + /// + /// \param[in] request The request message. + /// \param[in] callback The callback function that handles reply. + VOID_RPC_CLIENT_VIRTUAL_METHOD_DECL(ReporterService, HealthCheck) + + /// Initialize an exporter (e.g. metrics, events exporter). + /// + /// This function ensures that the server is ready to receive metrics before + /// initializing the exporter. If the server is not ready, it will retry for + /// a number of times. + virtual void WaitForServerReady( + std::function init_exporter_fn) = 0; }; class MetricsAgentClientImpl : public MetricsAgentClient { @@ -48,15 +67,17 @@ class MetricsAgentClientImpl : public MetricsAgentClient { /// /// \param[in] address Address of the metrics agent server. /// \param[in] port Port of the metrics agent server. + /// \param[in] io_service The `instrumented_io_context` used for managing requests. /// \param[in] client_call_manager The `ClientCallManager` used for managing requests. 
MetricsAgentClientImpl(const std::string &address, const int port, - instrumented_io_context &io_service) - : client_call_manager_(io_service, /*record_stats=*/true) { + instrumented_io_context &io_service, + rpc::ClientCallManager &client_call_manager) { RAY_LOG(DEBUG) << "Initiate the metrics client of address:" << BuildAddress(address, port); - grpc_client_ = std::make_unique>( - address, port, client_call_manager_); + grpc_client_ = + std::make_unique>(address, port, client_call_manager); + retry_timer_ = std::make_unique(io_service); }; VOID_RPC_CLIENT_METHOD(ReporterService, @@ -65,11 +86,33 @@ class MetricsAgentClientImpl : public MetricsAgentClient { /*method_timeout_ms*/ -1, override) + VOID_RPC_CLIENT_METHOD(ReporterService, + HealthCheck, + grpc_client_, + /*method_timeout_ms*/ -1, + override) + + /// Wait for the server to be ready. Invokes the callback with the final readiness + /// status of the server. + void WaitForServerReady(std::function init_exporter_fn) override; + private: - /// Call Manager for gRPC client. - rpc::ClientCallManager client_call_manager_; /// The RPC client. std::unique_ptr> grpc_client_; + /// Timer for retrying to initialize the OpenTelemetry exporter. + std::unique_ptr retry_timer_; + /// Whether the exporter is initialized. + bool exporter_initialized_ = false; + /// Wait for the server to be ready with a retry count. Invokes the callback + /// with the status of the server. This is a helper function for WaitForServerReady. 
+ void WaitForServerReadyWithRetry(std::function init_exporter_fn, + int retry_count, + int max_retry, + int retry_interval_ms); + + friend class MetricsAgentClientTest; + FRIEND_TEST(MetricsAgentClientTest, WaitForServerReadyWithRetrySuccess); + FRIEND_TEST(MetricsAgentClientTest, WaitForServerReadyWithRetryFailure); }; } // namespace rpc diff --git a/src/ray/rpc/node_manager/node_manager_client.h b/src/ray/rpc/node_manager/node_manager_client.h deleted file mode 100644 index 7f119f2db8af..000000000000 --- a/src/ray/rpc/node_manager/node_manager_client.h +++ /dev/null @@ -1,205 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include -#include -#include -#include -#include - -#include "ray/common/status.h" -#include "ray/rpc/grpc_client.h" -#include "ray/rpc/retryable_grpc_client.h" -#include "ray/util/logging.h" -#include "src/ray/protobuf/node_manager.grpc.pb.h" -#include "src/ray/protobuf/node_manager.pb.h" - -namespace ray { - -namespace raylet { -class RayletClient; -} - -namespace rpc { - -/// TODO(dayshah): https://github.com/ray-project/ray/issues/54816 Kill this completely. -/// This class is only used by the RayletClient which is just a wrapper around this. This -/// exists for the legacy reason that all the function definitions in RayletClient have to -/// change if you move the things in here into RayletClient. 
-class NodeManagerClient { - public: - friend class raylet::RayletClient; - - private: - /// Constructor. - /// - /// \param[in] address Address of the node manager server. - /// \param[in] port Port of the node manager server. - /// \param[in] client_call_manager The `ClientCallManager` used for managing requests. - /// \param[in] raylet_unavailable_timeout_callback The callback function that is used - /// by the retryable grpc to remove unresponsive raylet connections from the pool once - /// its been unavailable for more than server_unavailable_timeout_seconds. - NodeManagerClient(const rpc::Address &address, - ClientCallManager &client_call_manager, - std::function raylet_unavailable_timeout_callback) - : grpc_client_(std::make_shared>( - address.ip_address(), address.port(), client_call_manager)), - retryable_grpc_client_(RetryableGrpcClient::Create( - grpc_client_->Channel(), - client_call_manager.GetMainService(), - /*max_pending_requests_bytes=*/ - std::numeric_limits::max(), - /*check_channel_status_interval_milliseconds=*/ - ::RayConfig::instance() - .grpc_client_check_connection_status_interval_milliseconds(), - /*server_unavailable_timeout_seconds=*/ - ::RayConfig::instance().raylet_rpc_server_reconnect_timeout_s(), - /*server_unavailable_timeout_callback=*/ - std::move(raylet_unavailable_timeout_callback), - /*server_name=*/"Raylet " + address.ip_address())) {} - - std::shared_ptr Channel() const { return grpc_client_->Channel(); } - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - GetResourceLoad, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - CancelTasksWithResourceShapes, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - NotifyGCSRestart, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - RequestWorkerLease, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - 
PrestartWorkers, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - ReportWorkerBacklog, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - ReturnWorker, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - ReleaseUnusedActorWorkers, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - ShutdownRaylet, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - DrainRaylet, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - IsLocalWorkerDead, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - CancelWorkerLease, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - PrepareBundleResources, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - CommitBundleResources, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - CancelResourceReserve, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - PinObjectIDs, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - GlobalGC, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - ReleaseUnusedBundles, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - GetSystemConfig, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - /// Get all the object information from the node. 
- VOID_RPC_CLIENT_METHOD(NodeManagerService, - GetObjectsInfo, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - GetTaskFailureCause, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - RegisterMutableObject, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - PushMutableObject, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - VOID_RPC_CLIENT_METHOD(NodeManagerService, - GetNodeStats, - grpc_client_, - /*method_timeout_ms*/ -1, ) - - std::shared_ptr> grpc_client_; - - std::shared_ptr retryable_grpc_client_; -}; - -} // namespace rpc -} // namespace ray diff --git a/src/ray/rpc/node_manager/node_manager_server.h b/src/ray/rpc/node_manager/node_manager_server.h index f7e1cc37f171..262c72cb284b 100644 --- a/src/ray/rpc/node_manager/node_manager_server.h +++ b/src/ray/rpc/node_manager/node_manager_server.h @@ -31,31 +31,32 @@ namespace rpc { RPC_SERVICE_HANDLER_CUSTOM_AUTH(NodeManagerService, METHOD, -1, AuthType::NO_AUTH) /// NOTE: See src/ray/core_worker/core_worker.h on how to add a new grpc handler. 
-#define RAY_NODE_MANAGER_RPC_HANDLERS \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(GetResourceLoad) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(CancelTasksWithResourceShapes) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(NotifyGCSRestart) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(RequestWorkerLease) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(PrestartWorkers) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(ReportWorkerBacklog) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(ReturnWorker) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(ReleaseUnusedActorWorkers) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(CancelWorkerLease) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(PinObjectIDs) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(GetNodeStats) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(GlobalGC) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(FormatGlobalMemoryInfo) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(PrepareBundleResources) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(CommitBundleResources) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(CancelResourceReserve) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(ReleaseUnusedBundles) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(GetSystemConfig) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(IsLocalWorkerDead) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(ShutdownRaylet) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(DrainRaylet) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(GetObjectsInfo) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(GetTaskFailureCause) \ - RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(RegisterMutableObject) \ +#define RAY_NODE_MANAGER_RPC_HANDLERS \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(GetResourceLoad) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(CancelLeasesWithResourceShapes) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(NotifyGCSRestart) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(RequestWorkerLease) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(PrestartWorkers) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(ReportWorkerBacklog) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(ReturnWorkerLease) \ + 
RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(ReleaseUnusedActorWorkers) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(CancelWorkerLease) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(PinObjectIDs) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(GetNodeStats) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(GlobalGC) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(FormatGlobalMemoryInfo) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(PrepareBundleResources) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(CommitBundleResources) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(CancelResourceReserve) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(ResizeLocalResourceInstances) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(ReleaseUnusedBundles) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(GetSystemConfig) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(IsLocalWorkerDead) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(ShutdownRaylet) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(DrainRaylet) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(GetObjectsInfo) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(GetWorkerFailureCause) \ + RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(RegisterMutableObject) \ RAY_NODE_MANAGER_RPC_SERVICE_HANDLER(PushMutableObject) /// Interface of the `NodeManagerService`, see `src/ray/protobuf/node_manager.proto`. @@ -64,7 +65,7 @@ class NodeManagerServiceHandler { /// Handlers. For all of the following handlers, the implementations can /// handle the request asynchronously. When handling is done, the /// `send_reply_callback` should be called. See - /// src/ray/rpc/node_manager/node_manager_client.h and + /// src/ray/rpc/raylet/raylet_client.cc and /// src/ray/protobuf/node_manager.proto for a description of the /// functionality of each handler. 
/// @@ -76,9 +77,9 @@ class NodeManagerServiceHandler { rpc::GetResourceLoadReply *reply, rpc::SendReplyCallback send_reply_callback) = 0; - virtual void HandleCancelTasksWithResourceShapes( - rpc::CancelTasksWithResourceShapesRequest request, - rpc::CancelTasksWithResourceShapesReply *reply, + virtual void HandleCancelLeasesWithResourceShapes( + rpc::CancelLeasesWithResourceShapesRequest request, + rpc::CancelLeasesWithResourceShapesReply *reply, rpc::SendReplyCallback send_reply_callback) = 0; virtual void HandleNotifyGCSRestart(rpc::NotifyGCSRestartRequest request, @@ -97,9 +98,9 @@ class NodeManagerServiceHandler { ReportWorkerBacklogReply *reply, SendReplyCallback send_reply_callback) = 0; - virtual void HandleReturnWorker(ReturnWorkerRequest request, - ReturnWorkerReply *reply, - SendReplyCallback send_reply_callback) = 0; + virtual void HandleReturnWorkerLease(ReturnWorkerLeaseRequest request, + ReturnWorkerLeaseReply *reply, + SendReplyCallback send_reply_callback) = 0; virtual void HandleReleaseUnusedActorWorkers(ReleaseUnusedActorWorkersRequest request, ReleaseUnusedActorWorkersReply *reply, @@ -136,6 +137,11 @@ class NodeManagerServiceHandler { rpc::CancelResourceReserveReply *reply, rpc::SendReplyCallback send_reply_callback) = 0; + virtual void HandleResizeLocalResourceInstances( + rpc::ResizeLocalResourceInstancesRequest request, + rpc::ResizeLocalResourceInstancesReply *reply, + rpc::SendReplyCallback send_reply_callback) = 0; + virtual void HandlePinObjectIDs(PinObjectIDsRequest request, PinObjectIDsReply *reply, SendReplyCallback send_reply_callback) = 0; @@ -164,9 +170,9 @@ class NodeManagerServiceHandler { GetObjectsInfoReply *reply, SendReplyCallback send_reply_callback) = 0; - virtual void HandleGetTaskFailureCause(GetTaskFailureCauseRequest request, - GetTaskFailureCauseReply *reply, - SendReplyCallback send_reply_callback) = 0; + virtual void HandleGetWorkerFailureCause(GetWorkerFailureCauseRequest request, + GetWorkerFailureCauseReply 
*reply, + SendReplyCallback send_reply_callback) = 0; virtual void HandleRegisterMutableObject(RegisterMutableObjectRequest request, RegisterMutableObjectReply *reply, diff --git a/src/ray/rpc/object_manager/BUILD.bazel b/src/ray/rpc/object_manager/BUILD.bazel new file mode 100644 index 000000000000..d6bf8135ac43 --- /dev/null +++ b/src/ray/rpc/object_manager/BUILD.bazel @@ -0,0 +1,51 @@ +load("//bazel:ray.bzl", "ray_cc_library") + +ray_cc_library( + name = "object_manager_client", + hdrs = [ + "object_manager_client.h", + ], + visibility = ["//visibility:public"], + deps = [ + ":object_manager_client_interface", + "//src/ray/object_manager:object_manager_grpc_client_manager", + "//src/ray/protobuf:object_manager_cc_grpc", + "//src/ray/util:logging", + "@com_github_grpc_grpc//:grpc++", + ], +) + +ray_cc_library( + name = "object_manager_client_interface", + hdrs = ["object_manager_client_interface.h"], + deps = [ + "//src/ray/protobuf:object_manager_cc_proto", + ], +) + +ray_cc_library( + name = "object_manager_server", + hdrs = [ + "object_manager_server.h", + ], + visibility = ["//visibility:public"], + deps = [ + "//src/ray/common:asio", + "//src/ray/object_manager:object_manager_grpc_client_manager", + "//src/ray/protobuf:object_manager_cc_grpc", + "//src/ray/rpc:grpc_server", + "//src/ray/rpc:server_call", + "@boost//:asio", + "@com_github_grpc_grpc//:grpc++", + ], +) + +ray_cc_library( + name = "fake_object_manager_client", + hdrs = ["fake_object_manager_client.h"], + deps = [ + ":object_manager_client_interface", + "//src/ray/common:status", + "//src/ray/protobuf:object_manager_cc_proto", + ], +) diff --git a/src/ray/rpc/object_manager/fake_object_manager_client.h b/src/ray/rpc/object_manager/fake_object_manager_client.h new file mode 100644 index 000000000000..97b0af8fc848 --- /dev/null +++ b/src/ray/rpc/object_manager/fake_object_manager_client.h @@ -0,0 +1,110 @@ +// Copyright 2025 The Ray Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "ray/common/status.h" +#include "ray/rpc/object_manager/object_manager_client_interface.h" +#include "src/ray/protobuf/object_manager.pb.h" + +namespace ray { +namespace rpc { + +template +using ClientCallback = std::function; + +class FakeObjectManagerClient : public ObjectManagerClientInterface { + public: + FakeObjectManagerClient(const std::string &address, + const int port, + ClientCallManager &client_call_manager) + : address_(address), port_(port) {} + + void Push(const PushRequest &request, + const ClientCallback &callback) override { + num_push_requests++; + push_callbacks.push_back(callback); + } + + void Pull(const PullRequest &request, + const ClientCallback &callback) override { + num_pull_requests++; + pull_callbacks.push_back(callback); + } + + void FreeObjects(const FreeObjectsRequest &request, + const ClientCallback &callback) override { + num_free_objects_requests++; + free_objects_callbacks.push_back(callback); + } + + bool ReplyPush(const Status &status = Status::OK()) { + if (push_callbacks.empty()) { + return false; + } + PushReply reply; + auto callback = push_callbacks.front(); + push_callbacks.pop_front(); + callback(status, std::move(reply)); + return true; + } + + bool ReplyPull(const Status &status = Status::OK()) { + if (pull_callbacks.empty()) { + return false; + } + PullReply reply; + auto 
callback = pull_callbacks.front(); + pull_callbacks.pop_front(); + callback(status, std::move(reply)); + return true; + } + + bool ReplyFreeObjects(const Status &status = Status::OK()) { + if (free_objects_callbacks.empty()) { + return false; + } + FreeObjectsReply reply; + auto callback = free_objects_callbacks.front(); + free_objects_callbacks.pop_front(); + callback(status, std::move(reply)); + return true; + } + + const std::string &GetAddress() const { return address_; } + + int GetPort() const { return port_; } + + uint32_t num_push_requests = 0; + uint32_t num_pull_requests = 0; + uint32_t num_free_objects_requests = 0; + + std::list> push_callbacks; + std::list> pull_callbacks; + std::list> free_objects_callbacks; + + std::string address_; + int port_; +}; + +} // namespace rpc +} // namespace ray diff --git a/src/ray/rpc/object_manager/object_manager_client.h b/src/ray/rpc/object_manager/object_manager_client.h index 1adadb9d72bd..121363961346 100644 --- a/src/ray/rpc/object_manager/object_manager_client.h +++ b/src/ray/rpc/object_manager/object_manager_client.h @@ -25,6 +25,7 @@ #include "ray/common/status.h" #include "ray/object_manager/grpc_client_manager.h" #include "ray/rpc/grpc_client.h" +#include "ray/rpc/object_manager/object_manager_client_interface.h" #include "ray/util/logging.h" #include "src/ray/protobuf/object_manager.grpc.pb.h" #include "src/ray/protobuf/object_manager.pb.h" @@ -32,8 +33,8 @@ namespace ray { namespace rpc { -/// Client used for communicating with a remote node manager server. -class ObjectManagerClient { +/// Client used for communicating with a remote object manager server. +class ObjectManagerClient : public ObjectManagerClientInterface { public: /// Constructor. 
/// @@ -54,7 +55,8 @@ class ObjectManagerClient { VOID_RPC_CLIENT_METHOD(ObjectManagerService, Push, grpc_client_manager_->GetGrpcClient(), - /*method_timeout_ms*/ -1, ) + /*method_timeout_ms*/ -1, + override) /// Pull object from remote object manager /// @@ -63,7 +65,8 @@ class ObjectManagerClient { VOID_RPC_CLIENT_METHOD(ObjectManagerService, Pull, grpc_client_manager_->GetGrpcClient(), - /*method_timeout_ms*/ -1, ) + /*method_timeout_ms*/ -1, + override) /// Tell remote object manager to free objects /// @@ -72,7 +75,8 @@ class ObjectManagerClient { VOID_RPC_CLIENT_METHOD(ObjectManagerService, FreeObjects, grpc_client_manager_->GetGrpcClient(), - /*method_timeout_ms*/ -1, ) + /*method_timeout_ms*/ -1, + override) private: std::unique_ptr> grpc_client_manager_; diff --git a/src/ray/rpc/object_manager/object_manager_client_interface.h b/src/ray/rpc/object_manager/object_manager_client_interface.h new file mode 100644 index 000000000000..ad74fcf5ffdd --- /dev/null +++ b/src/ray/rpc/object_manager/object_manager_client_interface.h @@ -0,0 +1,52 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "src/ray/protobuf/object_manager.pb.h" + +namespace ray { +namespace rpc { + +template +using ClientCallback = std::function; +/// Abstract client interface for object manager clients. 
+class ObjectManagerClientInterface { + public: + virtual ~ObjectManagerClientInterface() = default; + + /// Push object to remote object manager + /// + /// \param request The request message. + /// \param callback The callback function that handles reply from server + virtual void Push(const PushRequest &request, + const ClientCallback &callback) = 0; + + /// Pull object from remote object manager + /// + /// \param request The request message + /// \param callback The callback function that handles reply from server + virtual void Pull(const PullRequest &request, + const ClientCallback &callback) = 0; + + /// Tell remote object manager to free objects + /// + /// \param request The request message + /// \param callback The callback function that handles reply + virtual void FreeObjects(const FreeObjectsRequest &request, + const ClientCallback &callback) = 0; +}; + +} // namespace rpc +} // namespace ray diff --git a/src/ray/raylet_client/raylet_client.cc b/src/ray/rpc/raylet/raylet_client.cc similarity index 57% rename from src/ray/raylet_client/raylet_client.cc rename to src/ray/rpc/raylet/raylet_client.cc index bfb8b0665d0a..622271f4a30a 100644 --- a/src/ray/raylet_client/raylet_client.cc +++ b/src/ray/rpc/raylet/raylet_client.cc @@ -12,55 +12,70 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/raylet_client/raylet_client.h" +#include "ray/rpc/raylet/raylet_client.h" +#include #include #include #include #include #include -#include "absl/synchronization/notification.h" -#include "ray/common/common_protocol.h" +#include "ray/common/bundle_spec.h" #include "ray/common/ray_config.h" -#include "ray/common/task/task_spec.h" #include "ray/util/logging.h" +#include "src/ray/protobuf/node_manager.grpc.pb.h" -namespace ray::raylet { +namespace ray { +namespace rpc { RayletClient::RayletClient(const rpc::Address &address, rpc::ClientCallManager &client_call_manager, std::function raylet_unavailable_timeout_callback) - : grpc_client_(std::shared_ptr( - new rpc::NodeManagerClient(address, - client_call_manager, - std::move(raylet_unavailable_timeout_callback)))) {} + : grpc_client_(std::make_shared>( + address.ip_address(), address.port(), client_call_manager)), + retryable_grpc_client_(rpc::RetryableGrpcClient::Create( + grpc_client_->Channel(), + client_call_manager.GetMainService(), + /*max_pending_requests_bytes=*/std::numeric_limits::max(), + /*check_channel_status_interval_milliseconds=*/ + ::RayConfig::instance() + .grpc_client_check_connection_status_interval_milliseconds(), + /*server_unavailable_timeout_seconds=*/ + ::RayConfig::instance().raylet_rpc_server_reconnect_timeout_s(), + /*server_unavailable_timeout_callback=*/ + std::move(raylet_unavailable_timeout_callback), + /*server_name=*/std::string("Raylet ") + address.ip_address())) {} void RayletClient::RequestWorkerLease( - const rpc::TaskSpec &task_spec, + const rpc::LeaseSpec &lease_spec, bool grant_or_reject, const rpc::ClientCallback &callback, const int64_t backlog_size, const bool is_selected_based_on_locality) { - google::protobuf::Arena arena; - auto request = - google::protobuf::Arena::CreateMessage(&arena); - // The unsafe allocating here is actually safe because the life-cycle of - // task_spec is longer than request. 
- // Request will be sent before the end of this call, and after that, it won't be - // used any more. - request->unsafe_arena_set_allocated_resource_spec( - const_cast(&task_spec)); - request->set_grant_or_reject(grant_or_reject); - request->set_backlog_size(backlog_size); - request->set_is_selected_based_on_locality(is_selected_based_on_locality); - grpc_client_->RequestWorkerLease(*request, callback); + rpc::RequestWorkerLeaseRequest request; + request.mutable_lease_spec()->CopyFrom(lease_spec); + request.set_grant_or_reject(grant_or_reject); + request.set_backlog_size(backlog_size); + request.set_is_selected_based_on_locality(is_selected_based_on_locality); + INVOKE_RETRYABLE_RPC_CALL(retryable_grpc_client_, + NodeManagerService, + RequestWorkerLease, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::PrestartWorkers( const rpc::PrestartWorkersRequest &request, const rpc::ClientCallback &callback) { - grpc_client_->PrestartWorkers(request, callback); + INVOKE_RPC_CALL(NodeManagerService, + PrestartWorkers, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } std::shared_ptr RayletClient::GetChannel() const { @@ -73,42 +88,56 @@ void RayletClient::ReportWorkerBacklog( rpc::ReportWorkerBacklogRequest request; request.set_worker_id(worker_id.Binary()); request.mutable_backlog_reports()->Add(backlog_reports.begin(), backlog_reports.end()); - grpc_client_->ReportWorkerBacklog( + INVOKE_RPC_CALL( + NodeManagerService, + ReportWorkerBacklog, request, [](const Status &status, rpc::ReportWorkerBacklogReply &&reply /*unused*/) { RAY_LOG_IF_ERROR(INFO, status) - << "Error reporting task backlog information: " << status; - }); + << "Error reporting lease backlog information: " << status; + }, + grpc_client_, + /*method_timeout_ms*/ -1); } -Status RayletClient::ReturnWorker(int worker_port, - const WorkerID &worker_id, - bool disconnect_worker, - const std::string &disconnect_worker_error_detail, - bool worker_exiting) 
{ - rpc::ReturnWorkerRequest request; +void RayletClient::ReturnWorkerLease(int worker_port, + const LeaseID &lease_id, + bool disconnect_worker, + const std::string &disconnect_worker_error_detail, + bool worker_exiting) { + rpc::ReturnWorkerLeaseRequest request; request.set_worker_port(worker_port); - request.set_worker_id(worker_id.Binary()); + request.set_lease_id(lease_id.Binary()); request.set_disconnect_worker(disconnect_worker); request.set_disconnect_worker_error_detail(disconnect_worker_error_detail); request.set_worker_exiting(worker_exiting); - grpc_client_->ReturnWorker( - request, [](const Status &status, rpc::ReturnWorkerReply &&reply /*unused*/) { + INVOKE_RETRYABLE_RPC_CALL( + retryable_grpc_client_, + NodeManagerService, + ReturnWorkerLease, + request, + [](const Status &status, rpc::ReturnWorkerLeaseReply &&reply /*unused*/) { RAY_LOG_IF_ERROR(INFO, status) << "Error returning worker: " << status; - }); - return Status::OK(); + }, + grpc_client_, + /*method_timeout_ms*/ -1); } -void RayletClient::GetTaskFailureCause( - const TaskID &task_id, - const ray::rpc::ClientCallback &callback) { - rpc::GetTaskFailureCauseRequest request; - request.set_task_id(task_id.Binary()); - grpc_client_->GetTaskFailureCause( - request, [callback](const Status &status, rpc::GetTaskFailureCauseReply &&reply) { +void RayletClient::GetWorkerFailureCause( + const LeaseID &lease_id, + const ray::rpc::ClientCallback &callback) { + rpc::GetWorkerFailureCauseRequest request; + request.set_lease_id(lease_id.Binary()); + INVOKE_RPC_CALL( + NodeManagerService, + GetWorkerFailureCause, + request, + [callback](const Status &status, rpc::GetWorkerFailureCauseReply &&reply) { RAY_LOG_IF_ERROR(INFO, status) << "Error getting task result: " << status; callback(status, std::move(reply)); - }); + }, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::RegisterMutableObjectReader( @@ -120,7 +149,12 @@ void RayletClient::RegisterMutableObjectReader( 
request.set_writer_object_id(writer_object_id.Binary()); request.set_num_readers(num_readers); request.set_reader_object_id(reader_object_id.Binary()); - grpc_client_->RegisterMutableObject(request, callback); + INVOKE_RPC_CALL(NodeManagerService, + RegisterMutableObject, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::PushMutableObject( @@ -157,15 +191,20 @@ void RayletClient::PushMutableObject( request.set_metadata(static_cast(metadata), metadata_size); // TODO(jackhumphries): Add failure recovery, retries, and timeout. - grpc_client_->PushMutableObject( - request, [callback](const Status &status, rpc::PushMutableObjectReply &&reply) { + INVOKE_RPC_CALL( + NodeManagerService, + PushMutableObject, + request, + [callback](const Status &status, rpc::PushMutableObjectReply &&reply) { RAY_LOG_IF_ERROR(ERROR, status) << "Error pushing mutable object: " << status; if (reply.done()) { // The callback is only executed once the receiver node receives all chunks // for the mutable object write. 
callback(status, std::move(reply)); } - }); + }, + grpc_client_, + /*method_timeout_ms*/ -1); } } @@ -176,7 +215,9 @@ void RayletClient::ReleaseUnusedActorWorkers( for (auto &worker_id : workers_in_use) { request.add_worker_ids_in_use(worker_id.Binary()); } - grpc_client_->ReleaseUnusedActorWorkers( + INVOKE_RPC_CALL( + NodeManagerService, + ReleaseUnusedActorWorkers, request, [callback](const Status &status, rpc::ReleaseUnusedActorWorkersReply &&reply) { if (!status.ok()) { @@ -185,15 +226,23 @@ void RayletClient::ReleaseUnusedActorWorkers( << status; } callback(status, std::move(reply)); - }); + }, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::CancelWorkerLease( - const TaskID &task_id, + const LeaseID &lease_id, const rpc::ClientCallback &callback) { rpc::CancelWorkerLeaseRequest request; - request.set_task_id(task_id.Binary()); - grpc_client_->CancelWorkerLease(request, callback); + request.set_lease_id(lease_id.Binary()); + INVOKE_RETRYABLE_RPC_CALL(retryable_grpc_client_, + NodeManagerService, + CancelWorkerLease, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::PrepareBundleResources( @@ -207,7 +256,12 @@ void RayletClient::PrepareBundleResources( message_bundle->CopyFrom(bundle_spec->GetMessage()); } RAY_CHECK(nodes.size() == 1); - grpc_client_->PrepareBundleResources(request, callback); + INVOKE_RPC_CALL(NodeManagerService, + PrepareBundleResources, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::CommitBundleResources( @@ -221,7 +275,12 @@ void RayletClient::CommitBundleResources( message_bundle->CopyFrom(bundle_spec->GetMessage()); } RAY_CHECK(nodes.size() == 1); - grpc_client_->CommitBundleResources(request, callback); + INVOKE_RPC_CALL(NodeManagerService, + CommitBundleResources, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::CancelResourceReserve( @@ -229,7 +288,12 @@ void RayletClient::CancelResourceReserve( 
const ray::rpc::ClientCallback &callback) { rpc::CancelResourceReserveRequest request; request.mutable_bundle_spec()->CopyFrom(bundle_spec.GetMessage()); - grpc_client_->CancelResourceReserve(request, callback); + INVOKE_RPC_CALL(NodeManagerService, + CancelResourceReserve, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::ReleaseUnusedBundles( @@ -239,15 +303,20 @@ void RayletClient::ReleaseUnusedBundles( for (auto &bundle : bundles_in_use) { request.add_bundles_in_use()->CopyFrom(bundle); } - grpc_client_->ReleaseUnusedBundles( - request, [callback](const Status &status, rpc::ReleaseUnusedBundlesReply &&reply) { + INVOKE_RPC_CALL( + NodeManagerService, + ReleaseUnusedBundles, + request, + [callback](const Status &status, rpc::ReleaseUnusedBundlesReply &&reply) { if (!status.ok()) { RAY_LOG(WARNING) << "Error releasing bundles from raylet, the raylet may have died:" << status; } callback(status, std::move(reply)); - }); + }, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::PinObjectIDs( @@ -269,7 +338,12 @@ void RayletClient::PinObjectIDs( pins_in_flight_--; callback(status, std::move(reply)); }; - grpc_client_->PinObjectIDs(request, rpc_callback); + INVOKE_RPC_CALL(NodeManagerService, + PinObjectIDs, + request, + rpc_callback, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::ShutdownRaylet( @@ -278,7 +352,12 @@ void RayletClient::ShutdownRaylet( const rpc::ClientCallback &callback) { rpc::ShutdownRayletRequest request; request.set_graceful(graceful); - grpc_client_->ShutdownRaylet(request, callback); + INVOKE_RPC_CALL(NodeManagerService, + ShutdownRaylet, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::DrainRaylet( @@ -290,7 +369,12 @@ void RayletClient::DrainRaylet( request.set_reason(reason); request.set_reason_message(reason_message); request.set_deadline_timestamp_ms(deadline_timestamp_ms); - grpc_client_->DrainRaylet(request, callback); + 
INVOKE_RPC_CALL(NodeManagerService, + DrainRaylet, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::IsLocalWorkerDead( @@ -298,51 +382,87 @@ void RayletClient::IsLocalWorkerDead( const rpc::ClientCallback &callback) { rpc::IsLocalWorkerDeadRequest request; request.set_worker_id(worker_id.Binary()); - grpc_client_->IsLocalWorkerDead(request, callback); + INVOKE_RPC_CALL(NodeManagerService, + IsLocalWorkerDead, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::GlobalGC(const rpc::ClientCallback &callback) { rpc::GlobalGCRequest request; - grpc_client_->GlobalGC(request, callback); + INVOKE_RPC_CALL(NodeManagerService, + GlobalGC, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::GetResourceLoad( const rpc::ClientCallback &callback) { rpc::GetResourceLoadRequest request; - grpc_client_->GetResourceLoad(request, callback); + INVOKE_RPC_CALL(NodeManagerService, + GetResourceLoad, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } -void RayletClient::CancelTasksWithResourceShapes( +void RayletClient::CancelLeasesWithResourceShapes( const std::vector> &resource_shapes, - const rpc::ClientCallback &callback) { - rpc::CancelTasksWithResourceShapesRequest request; + const rpc::ClientCallback &callback) { + rpc::CancelLeasesWithResourceShapesRequest request; for (const auto &resource_shape : resource_shapes) { - rpc::CancelTasksWithResourceShapesRequest::ResourceShape *resource_shape_proto = + rpc::CancelLeasesWithResourceShapesRequest::ResourceShape *resource_shape_proto = request.add_resource_shapes(); resource_shape_proto->mutable_resource_shape()->insert(resource_shape.begin(), resource_shape.end()); } - grpc_client_->CancelTasksWithResourceShapes(request, callback); + INVOKE_RPC_CALL(NodeManagerService, + CancelLeasesWithResourceShapes, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } void 
RayletClient::NotifyGCSRestart( const rpc::ClientCallback &callback) { rpc::NotifyGCSRestartRequest request; - grpc_client_->NotifyGCSRestart(request, callback); + INVOKE_RPC_CALL(NodeManagerService, + NotifyGCSRestart, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::GetSystemConfig( const rpc::ClientCallback &callback) { rpc::GetSystemConfigRequest request; - grpc_client_->GetSystemConfig(request, callback); + INVOKE_RPC_CALL(NodeManagerService, + GetSystemConfig, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::GetNodeStats( const rpc::GetNodeStatsRequest &request, const rpc::ClientCallback &callback) { - grpc_client_->GetNodeStats(request, callback); + INVOKE_RPC_CALL(NodeManagerService, + GetNodeStats, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } -} // namespace ray::raylet +} // namespace rpc +} // namespace ray diff --git a/src/ray/rpc/raylet/raylet_client.h b/src/ray/rpc/raylet/raylet_client.h new file mode 100644 index 000000000000..ad8ea08cd37a --- /dev/null +++ b/src/ray/rpc/raylet/raylet_client.h @@ -0,0 +1,182 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "ray/rpc/grpc_client.h" +#include "ray/rpc/raylet/raylet_client_interface.h" +#include "ray/rpc/retryable_grpc_client.h" +#include "src/ray/protobuf/node_manager.grpc.pb.h" +#include "src/ray/protobuf/node_manager.pb.h" + +// Maps from resource name to its allocation. +using ResourceMappingType = + std::unordered_map>>; + +namespace ray { +namespace rpc { + +/// Raylet client is responsible for communication with raylet. It implements +/// [RayletClientInterface] and works on worker registration, lease management, etc. +class RayletClient : public RayletClientInterface { + public: + /// Connect to the raylet. + /// + /// \param address The IP address of the worker. + /// \param port The port that the worker should listen on for gRPC requests. If + /// 0, the worker should choose a random port. + /// \param client_call_manager The client call manager to use for the grpc connection. + explicit RayletClient(const rpc::Address &address, + rpc::ClientCallManager &client_call_manager, + std::function raylet_unavailable_timeout_callback); + + std::shared_ptr GetChannel() const override; + + void RequestWorkerLease( + const rpc::LeaseSpec &lease_spec, + bool grant_or_reject, + const ray::rpc::ClientCallback &callback, + const int64_t backlog_size, + const bool is_selected_based_on_locality) override; + + void ReturnWorkerLease(int worker_port, + const LeaseID &lease_id, + bool disconnect_worker, + const std::string &disconnect_worker_error_detail, + bool worker_exiting) override; + + void PrestartWorkers( + const ray::rpc::PrestartWorkersRequest &request, + const ray::rpc::ClientCallback &callback) override; + + void GetWorkerFailureCause( + const LeaseID &lease_id, + const ray::rpc::ClientCallback &callback) + override; + + void RegisterMutableObjectReader( + const ObjectID &writer_object_id, + int64_t num_readers, + const ObjectID &reader_object_id, + const 
ray::rpc::ClientCallback &callback) + override; + + void PushMutableObject(const ObjectID &writer_object_id, + uint64_t data_size, + uint64_t metadata_size, + void *data, + void *metadata, + const ray::rpc::ClientCallback + &callback) override; + + void ReportWorkerBacklog( + const WorkerID &worker_id, + const std::vector &backlog_reports) override; + + void ReleaseUnusedActorWorkers( + const std::vector &workers_in_use, + const rpc::ClientCallback &callback) override; + + void CancelWorkerLease( + const LeaseID &lease_id, + const rpc::ClientCallback &callback) override; + + void PrepareBundleResources( + const std::vector> &bundle_specs, + const ray::rpc::ClientCallback &callback) + override; + + void CommitBundleResources( + const std::vector> &bundle_specs, + const ray::rpc::ClientCallback &callback) + override; + + void CancelResourceReserve( + const BundleSpecification &bundle_spec, + const ray::rpc::ClientCallback &callback) + override; + + void ReleaseUnusedBundles( + const std::vector &bundles_in_use, + const rpc::ClientCallback &callback) override; + + void PinObjectIDs( + const rpc::Address &caller_address, + const std::vector &object_ids, + const ObjectID &generator_id, + const ray::rpc::ClientCallback &callback) override; + + void ShutdownRaylet( + const NodeID &node_id, + bool graceful, + const rpc::ClientCallback &callback) override; + + void DrainRaylet(const rpc::autoscaler::DrainNodeReason &reason, + const std::string &reason_message, + int64_t deadline_timestamp_ms, + const rpc::ClientCallback &callback) override; + + void CancelLeasesWithResourceShapes( + const std::vector> &resource_shapes, + const rpc::ClientCallback &callback) + override; + + void IsLocalWorkerDead( + const WorkerID &worker_id, + const rpc::ClientCallback &callback) override; + + void GetSystemConfig( + const rpc::ClientCallback &callback) override; + + void GlobalGC(const rpc::ClientCallback &callback) override; + + void GetResourceLoad( + const rpc::ClientCallback &callback) 
override; + + void NotifyGCSRestart( + const rpc::ClientCallback &callback) override; + + const ResourceMappingType &GetResourceIDs() const { return resource_ids_; } + + int64_t GetPinsInFlight() const override { return pins_in_flight_.load(); } + + void GetNodeStats(const rpc::GetNodeStatsRequest &request, + const rpc::ClientCallback &callback) override; + + private: + /// gRPC client to the NodeManagerService. + std::shared_ptr> grpc_client_; + + /// Retryable gRPC client to monitor channel health and trigger timeout callbacks. + std::shared_ptr retryable_grpc_client_; + + /// A map from resource name to the resource IDs that are currently reserved + /// for this worker. Each pair consists of the resource ID and the fraction + /// of that resource allocated for this worker. + ResourceMappingType resource_ids_; + + /// The number of object ID pin RPCs currently in flight. + std::atomic pins_in_flight_ = 0; +}; + +} // namespace rpc +} // namespace ray diff --git a/src/ray/raylet_client/raylet_client.h b/src/ray/rpc/raylet/raylet_client_interface.h similarity index 52% rename from src/ray/raylet_client/raylet_client.h rename to src/ray/rpc/raylet/raylet_client_interface.h index cba4feeee6b7..f0d72452f547 100644 --- a/src/ray/raylet_client/raylet_client.h +++ b/src/ray/rpc/raylet/raylet_client_interface.h @@ -15,38 +15,37 @@ #pragma once #include -#include #include #include #include -#include "ray/common/asio/instrumented_io_context.h" -#include "ray/common/buffer.h" -#include "ray/common/bundle_spec.h" -#include "ray/common/status.h" -#include "ray/common/status_or.h" -#include "ray/common/task/task_spec.h" -#include "ray/ipc/client_connection.h" -#include "ray/rpc/node_manager/node_manager_client.h" -#include "ray/util/process.h" +#include "src/ray/protobuf/autoscaler.pb.h" #include "src/ray/protobuf/common.pb.h" -#include "src/ray/protobuf/gcs.pb.h" - -using ray::ActorID; -using ray::JobID; -using ray::NodeID; -using ray::ObjectID; -using ray::TaskID; -using 
ray::WorkerID; - -using ray::Language; +#include "src/ray/protobuf/node_manager.pb.h" // Maps from resource name to its allocation. using ResourceMappingType = std::unordered_map>>; +namespace grpc { +class Channel; +} + namespace ray { +// Forward declarations. +class Status; +class WorkerID; +class ObjectID; +class LeaseID; +class NodeID; +class BundleSpecification; + +namespace rpc { +template +using ClientCallback = std::function; +} + class RayletClientInterface { public: /// Request to a raylet to pin a plasma object. The callback will be sent via gRPC. @@ -54,32 +53,32 @@ class RayletClientInterface { const rpc::Address &caller_address, const std::vector &object_ids, const ObjectID &generator_id, - const ray::rpc::ClientCallback &callback) = 0; + const rpc::ClientCallback &callback) = 0; /// Requests a worker from the raylet. The callback will be sent via gRPC. - /// \param resource_spec Resources that should be allocated for the worker. + /// \param lease_spec Lease that is requested by the owner. /// \param grant_or_reject: True if we we should either grant or reject the request /// but no spillback. /// \param callback: The callback to call when the request finishes. /// \param backlog_size The queue length for the given shape on the CoreWorker. + /// \param lease_id Unique lease ID for this worker lease request. virtual void RequestWorkerLease( - const rpc::TaskSpec &task_spec, + const rpc::LeaseSpec &lease_spec, bool grant_or_reject, - const ray::rpc::ClientCallback &callback, + const rpc::ClientCallback &callback, const int64_t backlog_size = -1, const bool is_selected_based_on_locality = false) = 0; /// Returns a worker to the raylet. /// \param worker_port The local port of the worker on the raylet node. - /// \param worker_id The unique worker id of the worker on the raylet node. + /// \param lease_id The unique lease id of the worker on the raylet node. /// \param disconnect_worker Whether the raylet should disconnect the worker. 
/// \param worker_exiting Whether the worker is exiting and cannot be reused. - /// \return ray::Status - virtual ray::Status ReturnWorker(int worker_port, - const WorkerID &worker_id, - bool disconnect_worker, - const std::string &disconnect_worker_error_detail, - bool worker_exiting) = 0; + virtual void ReturnWorkerLease(int worker_port, + const LeaseID &lease_id, + bool disconnect_worker, + const std::string &disconnect_worker_error_detail, + bool worker_exiting) = 0; /// Request the raylet to prestart workers. In `request` we can set the worker's owner, /// runtime env info and number of workers. @@ -97,7 +96,7 @@ class RayletClientInterface { const rpc::ClientCallback &callback) = 0; virtual void CancelWorkerLease( - const TaskID &task_id, + const LeaseID &lease_id, const rpc::ClientCallback &callback) = 0; /// Report the backlog size of a given worker and a given scheduling class to the @@ -108,9 +107,9 @@ class RayletClientInterface { const WorkerID &worker_id, const std::vector &backlog_reports) = 0; - virtual void GetTaskFailureCause( - const TaskID &task_id, - const ray::rpc::ClientCallback &callback) = 0; + virtual void GetWorkerFailureCause( + const LeaseID &lease_id, + const ray::rpc::ClientCallback &callback) = 0; /// Request a raylet to prepare resources of given bundles for atomic placement group /// creation. This is used for the first phase of atomic placement group creation. 
The @@ -202,9 +201,9 @@ class RayletClientInterface { int64_t deadline_timestamp_ms, const rpc::ClientCallback &callback) = 0; - virtual void CancelTasksWithResourceShapes( + virtual void CancelLeasesWithResourceShapes( const std::vector> &resource_shapes, - const rpc::ClientCallback &callback) = 0; + const rpc::ClientCallback &callback) = 0; virtual void IsLocalWorkerDead( const WorkerID &worker_id, @@ -221,148 +220,4 @@ class RayletClientInterface { virtual ~RayletClientInterface() = default; }; -namespace raylet { - -/// Raylet client is responsible for communication with raylet. It implements -/// [RayletClientInterface] and works on worker registration, lease management, etc. -class RayletClient : public RayletClientInterface { - public: - /// Connect to the raylet. - /// - /// \param address The IP address of the worker. - /// \param port The port that the worker should listen on for gRPC requests. If - /// 0, the worker should choose a random port. - /// \param client_call_manager The client call manager to use for the grpc connection. 
- explicit RayletClient(const rpc::Address &address, - rpc::ClientCallManager &client_call_manager, - std::function raylet_unavailable_timeout_callback); - - std::shared_ptr GetChannel() const override; - - void RequestWorkerLease( - const rpc::TaskSpec &resource_spec, - bool grant_or_reject, - const ray::rpc::ClientCallback &callback, - const int64_t backlog_size, - const bool is_selected_based_on_locality) override; - - ray::Status ReturnWorker(int worker_port, - const WorkerID &worker_id, - bool disconnect_worker, - const std::string &disconnect_worker_error_detail, - bool worker_exiting) override; - - void PrestartWorkers( - const ray::rpc::PrestartWorkersRequest &request, - const ray::rpc::ClientCallback &callback) override; - - void GetTaskFailureCause( - const TaskID &task_id, - const ray::rpc::ClientCallback &callback) - override; - - void RegisterMutableObjectReader( - const ObjectID &writer_object_id, - int64_t num_readers, - const ObjectID &reader_object_id, - const ray::rpc::ClientCallback &callback) - override; - - void PushMutableObject(const ObjectID &writer_object_id, - uint64_t data_size, - uint64_t metadata_size, - void *data, - void *metadata, - const ray::rpc::ClientCallback - &callback) override; - - void ReportWorkerBacklog( - const WorkerID &worker_id, - const std::vector &backlog_reports) override; - - void ReleaseUnusedActorWorkers( - const std::vector &workers_in_use, - const rpc::ClientCallback &callback) override; - - void CancelWorkerLease( - const TaskID &task_id, - const rpc::ClientCallback &callback) override; - - void PrepareBundleResources( - const std::vector> &bundle_specs, - const ray::rpc::ClientCallback &callback) - override; - - void CommitBundleResources( - const std::vector> &bundle_specs, - const ray::rpc::ClientCallback &callback) - override; - - void CancelResourceReserve( - const BundleSpecification &bundle_spec, - const ray::rpc::ClientCallback &callback) - override; - - void ReleaseUnusedBundles( - const std::vector 
&bundles_in_use, - const rpc::ClientCallback &callback) override; - - void PinObjectIDs( - const rpc::Address &caller_address, - const std::vector &object_ids, - const ObjectID &generator_id, - const ray::rpc::ClientCallback &callback) override; - - void ShutdownRaylet( - const NodeID &node_id, - bool graceful, - const rpc::ClientCallback &callback) override; - - void DrainRaylet(const rpc::autoscaler::DrainNodeReason &reason, - const std::string &reason_message, - int64_t deadline_timestamp_ms, - const rpc::ClientCallback &callback) override; - - void CancelTasksWithResourceShapes( - const std::vector> &resource_shapes, - const rpc::ClientCallback &callback) - override; - - void IsLocalWorkerDead( - const WorkerID &worker_id, - const rpc::ClientCallback &callback) override; - - void GetSystemConfig( - const rpc::ClientCallback &callback) override; - - void GlobalGC(const rpc::ClientCallback &callback) override; - - void GetResourceLoad( - const rpc::ClientCallback &callback) override; - - void NotifyGCSRestart( - const rpc::ClientCallback &callback) override; - - const ResourceMappingType &GetResourceIDs() const { return resource_ids_; } - - int64_t GetPinsInFlight() const override { return pins_in_flight_.load(); } - - void GetNodeStats(const rpc::GetNodeStatsRequest &request, - const rpc::ClientCallback &callback) override; - - private: - /// gRPC client to the NodeManagerService. - std::shared_ptr grpc_client_; - - /// A map from resource name to the resource IDs that are currently reserved - /// for this worker. Each pair consists of the resource ID and the fraction - /// of that resource allocated for this worker. - ResourceMappingType resource_ids_; - - /// The number of object ID pin RPCs currently in flight. 
- std::atomic pins_in_flight_ = 0; -}; - -} // namespace raylet - } // namespace ray diff --git a/src/ray/rpc/node_manager/raylet_client_pool.cc b/src/ray/rpc/raylet/raylet_client_pool.cc similarity index 78% rename from src/ray/rpc/node_manager/raylet_client_pool.cc rename to src/ray/rpc/raylet/raylet_client_pool.cc index bd37718b5e55..5283f73c88ff 100644 --- a/src/ray/rpc/node_manager/raylet_client_pool.cc +++ b/src/ray/rpc/raylet/raylet_client_pool.cc @@ -12,14 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/rpc/node_manager/raylet_client_pool.h" +#include "ray/rpc/raylet/raylet_client_pool.h" #include #include #include -#include "ray/util/util.h" - namespace ray { namespace rpc { @@ -28,12 +26,12 @@ std::function RayletClientPool::GetDefaultUnavailableTimeoutCallback( rpc::RayletClientPool *raylet_client_pool, const rpc::Address &addr) { return [addr, gcs_client, raylet_client_pool]() { - const NodeID raylet_id = NodeID::FromBinary(addr.raylet_id()); + const NodeID node_id = NodeID::FromBinary(addr.node_id()); - auto gcs_check_node_alive = [raylet_id, addr, raylet_client_pool, gcs_client]() { + auto gcs_check_node_alive = [node_id, addr, raylet_client_pool, gcs_client]() { gcs_client->Nodes().AsyncGetAll( - [addr, raylet_id, raylet_client_pool](const Status &status, - std::vector &&nodes) { + [addr, node_id, raylet_client_pool](const Status &status, + std::vector &&nodes) { if (!status.ok()) { // Will try again when unavailable timeout callback is retried. RAY_LOG(INFO) << "Failed to get node info from GCS"; @@ -47,18 +45,18 @@ std::function RayletClientPool::GetDefaultUnavailableTimeoutCallback( // maximum_gcs_dead_node_cached_count. // In this case, it must be 2 since there's no way for a component to // know about a remote node id until the gcs has registered it. 
- RAY_LOG(INFO).WithField(raylet_id) + RAY_LOG(INFO).WithField(node_id) << "Disconnecting raylet client because its node is dead"; - raylet_client_pool->Disconnect(raylet_id); + raylet_client_pool->Disconnect(node_id); return; } }, -1, - {raylet_id}); + {node_id}); }; if (gcs_client->Nodes().IsSubscribedToNodeChange()) { - auto *node_info = gcs_client->Nodes().Get(raylet_id, /*filter_dead_nodes=*/false); + auto *node_info = gcs_client->Nodes().Get(node_id, /*filter_dead_nodes=*/false); if (node_info == nullptr) { // Node could be dead or info may have not made it to the subscriber cache yet. // Check with the GCS to confirm if the node is dead. @@ -66,9 +64,9 @@ std::function RayletClientPool::GetDefaultUnavailableTimeoutCallback( return; } if (node_info->state() == rpc::GcsNodeInfo::DEAD) { - RAY_LOG(INFO).WithField(raylet_id) + RAY_LOG(INFO).WithField(node_id) << "Disconnecting raylet client because its node is dead."; - raylet_client_pool->Disconnect(raylet_id); + raylet_client_pool->Disconnect(node_id); return; } // Node is alive so raylet client is alive. 
@@ -81,18 +79,18 @@ std::function RayletClientPool::GetDefaultUnavailableTimeoutCallback( std::shared_ptr RayletClientPool::GetOrConnectByAddress( const rpc::Address &address) { - RAY_CHECK(address.raylet_id() != ""); + RAY_CHECK(address.node_id() != ""); absl::MutexLock lock(&mu_); - auto raylet_id = NodeID::FromBinary(address.raylet_id()); - auto it = client_map_.find(raylet_id); + auto node_id = NodeID::FromBinary(address.node_id()); + auto it = client_map_.find(node_id); if (it != client_map_.end()) { RAY_CHECK(it->second != nullptr); return it->second; } auto connection = client_factory_(address); - client_map_[raylet_id] = connection; + client_map_[node_id] = connection; - RAY_LOG(DEBUG) << "Connected to raylet " << raylet_id << " at " + RAY_LOG(DEBUG) << "Connected to raylet " << node_id << " at " << BuildAddress(address.ip_address(), address.port()); RAY_CHECK(connection != nullptr); return connection; @@ -116,13 +114,13 @@ void RayletClientPool::Disconnect(ray::NodeID id) { client_map_.erase(it); } -rpc::Address RayletClientPool::GenerateRayletAddress(const NodeID &raylet_id, +rpc::Address RayletClientPool::GenerateRayletAddress(const NodeID &node_id, const std::string &ip_address, int port) { rpc::Address address; address.set_ip_address(ip_address); address.set_port(port); - address.set_raylet_id(raylet_id.Binary()); + address.set_node_id(node_id.Binary()); return address; } diff --git a/src/ray/rpc/node_manager/raylet_client_pool.h b/src/ray/rpc/raylet/raylet_client_pool.h similarity index 95% rename from src/ray/rpc/node_manager/raylet_client_pool.h rename to src/ray/rpc/raylet/raylet_client_pool.h index a1d686fd4e25..1ab520b0a519 100644 --- a/src/ray/rpc/node_manager/raylet_client_pool.h +++ b/src/ray/rpc/raylet/raylet_client_pool.h @@ -23,9 +23,8 @@ #include "absl/strings/str_cat.h" #include "absl/synchronization/mutex.h" #include "ray/common/id.h" -#include "ray/gcs/gcs_client/gcs_client.h" -#include "ray/raylet_client/raylet_client.h" -#include 
"ray/rpc/node_manager/node_manager_client.h" +#include "ray/gcs_client/gcs_client.h" +#include "ray/rpc/raylet/raylet_client_interface.h" namespace ray { namespace rpc { diff --git a/src/ray/rpc/node_manager/test/BUILD.bazel b/src/ray/rpc/raylet/tests/BUILD.bazel similarity index 68% rename from src/ray/rpc/node_manager/test/BUILD.bazel rename to src/ray/rpc/raylet/tests/BUILD.bazel index e8c9d9dd6b2d..ac59bcc2f969 100644 --- a/src/ray/rpc/node_manager/test/BUILD.bazel +++ b/src/ray/rpc/raylet/tests/BUILD.bazel @@ -6,9 +6,9 @@ ray_cc_test( srcs = ["raylet_client_pool_test.cc"], tags = ["team:core"], deps = [ - "//:ray_fakes", - "//src/ray/gcs/gcs_client:gcs_client_lib", - "//src/ray/rpc:node_manager_client", + "//src/fakes/ray/rpc/raylet:fake_raylet_client", + "//src/ray/gcs_client", + "//src/ray/rpc:raylet_client_pool", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", ], diff --git a/src/ray/rpc/node_manager/test/raylet_client_pool_test.cc b/src/ray/rpc/raylet/tests/raylet_client_pool_test.cc similarity index 97% rename from src/ray/rpc/node_manager/test/raylet_client_pool_test.cc rename to src/ray/rpc/raylet/tests/raylet_client_pool_test.cc index 15d81cccf7a2..5d711c92e5b3 100644 --- a/src/ray/rpc/node_manager/test/raylet_client_pool_test.cc +++ b/src/ray/rpc/raylet/tests/raylet_client_pool_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/rpc/node_manager/raylet_client_pool.h" +#include "ray/rpc/raylet/raylet_client_pool.h" #include @@ -44,7 +44,7 @@ namespace { rpc::Address CreateRandomAddress(const std::string &addr) { rpc::Address address; address.set_ip_address(addr); - address.set_raylet_id(NodeID::FromRandom().Binary()); + address.set_node_id(NodeID::FromRandom().Binary()); address.set_worker_id(WorkerID::FromRandom().Binary()); return address; } @@ -123,8 +123,8 @@ TEST_P(DefaultUnavailableTimeoutCallbackTest, NodeDeath) { auto raylet_client_1_address = CreateRandomAddress("1"); auto raylet_client_2_address = CreateRandomAddress("2"); - auto raylet_client_1_node_id = NodeID::FromBinary(raylet_client_1_address.raylet_id()); - auto raylet_client_2_node_id = NodeID::FromBinary(raylet_client_2_address.raylet_id()); + auto raylet_client_1_node_id = NodeID::FromBinary(raylet_client_1_address.node_id()); + auto raylet_client_2_node_id = NodeID::FromBinary(raylet_client_2_address.node_id()); auto raylet_client_1 = dynamic_cast( raylet_client_pool_->GetOrConnectByAddress(raylet_client_1_address).get()); diff --git a/src/ray/rpc/retryable_grpc_client.h b/src/ray/rpc/retryable_grpc_client.h index 6bb6df4477f5..270f101f83c4 100644 --- a/src/ray/rpc/retryable_grpc_client.h +++ b/src/ray/rpc/retryable_grpc_client.h @@ -31,18 +31,35 @@ namespace ray::rpc { +// This macro wraps the logic to call a specific RPC method of a service with the +// retryable grpc client, to make it easier to implement a new RPC client. +#define INVOKE_RETRYABLE_RPC_CALL(retryable_rpc_client, \ + SERVICE, \ + METHOD, \ + request, \ + callback, \ + rpc_client, \ + method_timeout_ms) \ + (retryable_rpc_client->CallMethod( \ + &SERVICE::Stub::PrepareAsync##METHOD, \ + rpc_client, \ + #SERVICE ".grpc_client." #METHOD, \ + std::move(request), \ + callback, \ + method_timeout_ms)) + // Define a void retryable RPC client method. 
-#define VOID_RETRYABLE_RPC_CLIENT_METHOD( \ - retryable_rpc_client, SERVICE, METHOD, rpc_client, method_timeout_ms, SPECS) \ - void METHOD(const METHOD##Request &request, \ - const ClientCallback &callback) SPECS { \ - retryable_rpc_client->CallMethod( \ - &SERVICE::Stub::PrepareAsync##METHOD, \ - rpc_client, \ - #SERVICE ".grpc_client." #METHOD, \ - request, \ - callback, \ - method_timeout_ms); \ +#define VOID_RETRYABLE_RPC_CLIENT_METHOD( \ + retryable_rpc_client, SERVICE, METHOD, rpc_client, method_timeout_ms, SPECS) \ + void METHOD(METHOD##Request &&request, const ClientCallback &callback) \ + SPECS { \ + INVOKE_RETRYABLE_RPC_CALL(retryable_rpc_client, \ + SERVICE, \ + METHOD, \ + request, \ + callback, \ + rpc_client, \ + method_timeout_ms); \ } /** diff --git a/src/ray/rpc/rpc_chaos.cc b/src/ray/rpc/rpc_chaos.cc index b56738945637..d8c1b2bd47bb 100644 --- a/src/ray/rpc/rpc_chaos.cc +++ b/src/ray/rpc/rpc_chaos.cc @@ -25,16 +25,22 @@ namespace ray { namespace rpc { namespace testing { -namespace { // RpcFailureManager is a simple chaos testing framework. Before starting ray, users // should set up os environment to use this feature for testing purposes. -// To use this, simply do + +// You can use this to set probabilities for specific rpc's. // export RAY_testing_rpc_failure="method1=3:25:50,method2=5:25:25" // Key is the RPC call name and value is a three part colon separated structure. It // contains the max number of failures to inject + probability of req failure + // probability of reply failure. +// You can also use a wildcard to set probabilities for all rpc's and -1 as num_failures +// to have unlimited failures. +// export RAY_testing_rpc_failure="*=-1:25:50" +// This will set the probabilities for all rpc's to 25% for request failures and 50% for +// reply failures. 
+ class RpcFailureManager { public: RpcFailureManager() { Init(); } @@ -42,7 +48,10 @@ class RpcFailureManager { void Init() { absl::MutexLock lock(&mu_); + // Clear old state failable_methods_.clear(); + wildcard_set_ = false; + has_failures_ = false; if (!RayConfig::instance().testing_rpc_failure().empty()) { for (const auto &item : @@ -52,33 +61,67 @@ class RpcFailureManager { std::vector colon_split = absl::StrSplit(equal_split[1], ':'); RAY_CHECK_EQ(colon_split.size(), 3UL); auto [iter, _] = failable_methods_.emplace(equal_split[0], - Failable{std::stoul(colon_split[0]), + Failable{std::stol(colon_split[0]), std::stoul(colon_split[1]), std::stoul(colon_split[2])}); const auto &failable = iter->second; RAY_CHECK_LE(failable.req_failure_prob + failable.resp_failure_prob, 100UL); + if (equal_split[0] == "*") { + wildcard_set_ = true; + // The wildcard overrides all other method configurations. + break; + } } std::random_device rd; auto seed = rd(); RAY_LOG(INFO) << "Setting RpcFailureManager seed to " << seed; gen_.seed(seed); + has_failures_ = true; } } RpcFailure GetRpcFailure(const std::string &name) { + if (!has_failures_) { + return RpcFailure::None; + } + absl::MutexLock lock(&mu_); + // Wildcard overrides any other method configurations. 
+ if (wildcard_set_) { + return GetFailureTypeFromFailable(failable_methods_["*"]); + } + auto iter = failable_methods_.find(name); if (iter == failable_methods_.end()) { return RpcFailure::None; } + return GetFailureTypeFromFailable(iter->second); + } - auto &failable = iter->second; + private: + absl::Mutex mu_; + std::mt19937 gen_; + std::atomic_bool has_failures_ = false; + + // If we're testing all rpc failures, we'll use these probabilities instead of + // failable_methods_ + bool wildcard_set_ = false; + + // call name -> (num_remaining_failures, req_failure_prob, resp_failure_prob) + struct Failable { + int64_t num_remaining_failures; + size_t req_failure_prob; + size_t resp_failure_prob; + }; + absl::flat_hash_map failable_methods_ ABSL_GUARDED_BY(&mu_); + + RpcFailure GetFailureTypeFromFailable(Failable &failable) { if (failable.num_remaining_failures == 0) { + // If < 0, unlimited failures. return RpcFailure::None; } - std::uniform_int_distribution dist(1ul, 100ul); const size_t random_number = dist(gen_); if (random_number <= failable.req_failure_prob) { @@ -91,34 +134,22 @@ class RpcFailureManager { } return RpcFailure::None; } - - private: - absl::Mutex mu_; - std::mt19937 gen_; - struct Failable { - size_t num_remaining_failures; - size_t req_failure_prob; - size_t resp_failure_prob; - }; - // call name -> (num_remaining_failures, req_failure_prob, resp_failure_prob) - absl::flat_hash_map failable_methods_ ABSL_GUARDED_BY(&mu_); }; -auto &rpc_failure_manager = []() -> RpcFailureManager & { +namespace { + +RpcFailureManager &GetRpcFailureManager() { static auto *manager = new RpcFailureManager(); return *manager; -}(); +} } // namespace RpcFailure GetRpcFailure(const std::string &name) { - if (RayConfig::instance().testing_rpc_failure().empty()) { - return RpcFailure::None; - } - return rpc_failure_manager.GetRpcFailure(name); + return GetRpcFailureManager().GetRpcFailure(name); } -void Init() { rpc_failure_manager.Init(); } +void Init() {
GetRpcFailureManager().Init(); } } // namespace testing } // namespace rpc diff --git a/src/ray/rpc/rpc_chaos.h b/src/ray/rpc/rpc_chaos.h index f839fad39e6c..68a41aa9a4a0 100644 --- a/src/ray/rpc/rpc_chaos.h +++ b/src/ray/rpc/rpc_chaos.h @@ -20,7 +20,7 @@ namespace ray { namespace rpc { namespace testing { -enum class RpcFailure { +enum class RpcFailure : uint8_t { None, // Failure before server receives the request Request, diff --git a/src/ray/rpc/server_call.h b/src/ray/rpc/server_call.h index a2e7cd2ceea5..698767d7e25b 100644 --- a/src/ray/rpc/server_call.h +++ b/src/ray/rpc/server_call.h @@ -109,9 +109,6 @@ class ServerCall { /// Get the state of this `ServerCall`. virtual ServerCallState GetState() const = 0; - /// Set state of this `ServerCall`. - virtual void SetState(const ServerCallState &new_state) = 0; - /// Handle the requst. This is the callback function to be called by /// `GrpcServer` when the request is received. virtual void HandleRequest() = 0; @@ -201,12 +198,8 @@ class ServerCallImpl : public ServerCall { } } - ~ServerCallImpl() override = default; - ServerCallState GetState() const override { return state_; } - void SetState(const ServerCallState &new_state) override { state_ = new_state; } - void HandleRequest() override { stats_handle_ = io_service_.stats().RecordStart(call_name_); bool auth_success = true; @@ -262,15 +255,12 @@ class ServerCallImpl : public ServerCall { } } state_ = ServerCallState::PROCESSING; - // NOTE(hchen): This `factory` local variable is needed. Because `SendReply` runs in - // a different thread, and will cause `this` to be deleted. - const auto &factory = factory_; - if (factory.GetMaxActiveRPCs() == -1) { + if (factory_.GetMaxActiveRPCs() == -1) { // Create a new `ServerCall` to accept the next incoming request. // We create this before handling the request only when no back pressure limit is // set. So that the it can be populated by the completion queue in the background if // a new request comes in. 
- factory.CreateCall(); + factory_.CreateCall(); } if (!auth_success) { boost::asio::post(GetServerCallExecutor(), [this]() { diff --git a/src/ray/rpc/test/rpc_chaos_test.cc b/src/ray/rpc/test/rpc_chaos_test.cc deleted file mode 100644 index 021a139dd990..000000000000 --- a/src/ray/rpc/test/rpc_chaos_test.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2024 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ray/rpc/rpc_chaos.h" - -#include "gtest/gtest.h" -#include "ray/common/ray_config.h" - -TEST(RpcChaosTest, Basic) { - RayConfig::instance().testing_rpc_failure() = "method1=0:25:25,method2=1:25:25"; - ray::rpc::testing::Init(); - ASSERT_EQ(ray::rpc::testing::GetRpcFailure("unknown"), - ray::rpc::testing::RpcFailure::None); - ASSERT_EQ(ray::rpc::testing::GetRpcFailure("method1"), - ray::rpc::testing::RpcFailure::None); - // At most one failure. 
- ASSERT_FALSE(ray::rpc::testing::GetRpcFailure("method2") != - ray::rpc::testing::RpcFailure::None && - ray::rpc::testing::GetRpcFailure("method2") != - ray::rpc::testing::RpcFailure::None); -} - -TEST(RpcChaosTest, EdgeCaseProbability) { - RayConfig::instance().testing_rpc_failure() = - "method1=1000:100:0,method2=1000:0:100,method3=1000:0:0"; - ray::rpc::testing::Init(); - for (int i = 0; i < 1000; i++) { - ASSERT_EQ(ray::rpc::testing::GetRpcFailure("method1"), - ray::rpc::testing::RpcFailure::Request); - ASSERT_EQ(ray::rpc::testing::GetRpcFailure("method2"), - ray::rpc::testing::RpcFailure::Response); - ASSERT_EQ(ray::rpc::testing::GetRpcFailure("method3"), - ray::rpc::testing::RpcFailure::None); - } -} diff --git a/src/ray/rpc/test/BUILD.bazel b/src/ray/rpc/tests/BUILD.bazel similarity index 77% rename from src/ray/rpc/test/BUILD.bazel rename to src/ray/rpc/tests/BUILD.bazel index 8bd523f17c97..d2803b88c651 100644 --- a/src/ray/rpc/test/BUILD.bazel +++ b/src/ray/rpc/tests/BUILD.bazel @@ -41,3 +41,16 @@ ray_cc_test( "@com_google_googletest//:gtest_main", ], ) + +ray_cc_test( + name = "metrics_agent_client_test", + size = "small", + srcs = [ + "metrics_agent_client_test.cc", + ], + tags = ["team:core"], + deps = [ + "//src/ray/rpc:metrics_agent_client", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/src/ray/rpc/test/core_worker_client_pool_test.cc b/src/ray/rpc/tests/core_worker_client_pool_test.cc similarity index 98% rename from src/ray/rpc/test/core_worker_client_pool_test.cc rename to src/ray/rpc/tests/core_worker_client_pool_test.cc index 7f06a7c5b192..f57eb3b29430 100644 --- a/src/ray/rpc/test/core_worker_client_pool_test.cc +++ b/src/ray/rpc/tests/core_worker_client_pool_test.cc @@ -49,7 +49,7 @@ namespace { rpc::Address CreateRandomAddress(const std::string &addr) { rpc::Address address; address.set_ip_address(addr); - address.set_raylet_id(NodeID::FromRandom().Binary()); + address.set_node_id(NodeID::FromRandom().Binary()); 
address.set_worker_id(WorkerID::FromRandom().Binary()); return address; } @@ -181,8 +181,8 @@ TEST_P(DefaultUnavailableTimeoutCallbackTest, NodeDeath) { client_pool_->GetOrConnect(worker_2_address).get()); AssertID(worker_id2, *client_pool_, true); - auto worker_1_node_id = NodeID::FromBinary(worker_1_address.raylet_id()); - auto worker_2_node_id = NodeID::FromBinary(worker_2_address.raylet_id()); + auto worker_1_node_id = NodeID::FromBinary(worker_1_address.node_id()); + auto worker_2_node_id = NodeID::FromBinary(worker_2_address.node_id()); rpc::GcsNodeInfo node_info_alive; node_info_alive.set_state(rpc::GcsNodeInfo::ALIVE); diff --git a/src/ray/rpc/test/grpc_bench/BUILD.bazel b/src/ray/rpc/tests/grpc_bench/BUILD.bazel similarity index 100% rename from src/ray/rpc/test/grpc_bench/BUILD.bazel rename to src/ray/rpc/tests/grpc_bench/BUILD.bazel diff --git a/src/ray/rpc/test/grpc_bench/Dockerfile b/src/ray/rpc/tests/grpc_bench/Dockerfile similarity index 100% rename from src/ray/rpc/test/grpc_bench/Dockerfile rename to src/ray/rpc/tests/grpc_bench/Dockerfile diff --git a/src/ray/rpc/test/grpc_bench/README b/src/ray/rpc/tests/grpc_bench/README similarity index 100% rename from src/ray/rpc/test/grpc_bench/README rename to src/ray/rpc/tests/grpc_bench/README diff --git a/src/ray/rpc/test/grpc_bench/grpc_bench.cc b/src/ray/rpc/tests/grpc_bench/grpc_bench.cc similarity index 100% rename from src/ray/rpc/test/grpc_bench/grpc_bench.cc rename to src/ray/rpc/tests/grpc_bench/grpc_bench.cc diff --git a/src/ray/rpc/test/grpc_bench/helloworld.proto b/src/ray/rpc/tests/grpc_bench/helloworld.proto similarity index 100% rename from src/ray/rpc/test/grpc_bench/helloworld.proto rename to src/ray/rpc/tests/grpc_bench/helloworld.proto diff --git a/src/ray/rpc/test/grpc_server_client_test.cc b/src/ray/rpc/tests/grpc_server_client_test.cc similarity index 100% rename from src/ray/rpc/test/grpc_server_client_test.cc rename to src/ray/rpc/tests/grpc_server_client_test.cc diff --git 
a/src/ray/rpc/tests/metrics_agent_client_test.cc b/src/ray/rpc/tests/metrics_agent_client_test.cc new file mode 100644 index 000000000000..114252116794 --- /dev/null +++ b/src/ray/rpc/tests/metrics_agent_client_test.cc @@ -0,0 +1,94 @@ +// Copyright 2024 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/rpc/metrics_agent_client.h" + +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace ray { +namespace rpc { + +constexpr int kCountToReturnOk = 3; +constexpr int kRetryIntervalMs = 100; + +class TestableMetricsAgentClientImpl : public MetricsAgentClientImpl { + public: + TestableMetricsAgentClientImpl(const std::string &address, + const int port, + instrumented_io_context &io_service, + rpc::ClientCallManager &client_call_manager, + int count_to_return_ok) + : MetricsAgentClientImpl(address, port, io_service, client_call_manager), + count_to_return_ok_(count_to_return_ok) {} + + // HealthCheck is a macro+template method that is supposed to invoke the callback + // upon the completion of an RPC call. We override it to invoke the callback directly + // without the RPC call. Ideally we would create a GrpcClientMock that overrides + // the RPC call. However, currently the RPC call is a template method, which cannot + // be overridden.
+ void HealthCheck(const HealthCheckRequest &request, + const ClientCallback &callback) override { + health_check_count_++; + if (health_check_count_ <= count_to_return_ok_) { + callback(Status::RpcError("Failed to connect to the metrics agent server.", 14), + HealthCheckReply()); + } else { + callback(Status::OK(), HealthCheckReply()); + } + } + + private: + int count_to_return_ok_; + int health_check_count_ = 1; +}; + +class MetricsAgentClientTest : public ::testing::Test { + protected: + void SetUp() override { + client_call_manager_ = std::make_unique(io_service_, true); + client_ = std::make_unique( + "127.0.0.1", 8000, io_service_, *client_call_manager_, kCountToReturnOk); + } + + instrumented_io_context io_service_; + std::unique_ptr client_; + std::unique_ptr client_call_manager_; +}; + +TEST_F(MetricsAgentClientTest, WaitForServerReadyWithRetrySuccess) { + client_->WaitForServerReadyWithRetry( + [](const Status &server_status) { ASSERT_TRUE(server_status.ok()); }, + 0, + kCountToReturnOk, + kRetryIntervalMs); + io_service_.run_for(std::chrono::milliseconds(kCountToReturnOk * kRetryIntervalMs)); + ASSERT_TRUE(client_->exporter_initialized_); +} + +TEST_F(MetricsAgentClientTest, WaitForServerReadyWithRetryFailure) { + client_->WaitForServerReadyWithRetry( + [](const Status &server_status) { ASSERT_FALSE(server_status.ok()); }, + 0, + kCountToReturnOk - 2, + kRetryIntervalMs); + io_service_.run_for(std::chrono::milliseconds(kCountToReturnOk * kRetryIntervalMs)); + ASSERT_FALSE(client_->exporter_initialized_); +} + +} // namespace rpc +} // namespace ray diff --git a/src/ray/rpc/tests/rpc_chaos_test.cc b/src/ray/rpc/tests/rpc_chaos_test.cc new file mode 100644 index 000000000000..f3e6d6b7b8e5 --- /dev/null +++ b/src/ray/rpc/tests/rpc_chaos_test.cc @@ -0,0 +1,63 @@ +// Copyright 2024 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/rpc/rpc_chaos.h" + +#include "gtest/gtest.h" +#include "ray/common/ray_config.h" + +namespace ray::rpc::testing { + +TEST(RpcChaosTest, MethodRpcFailure) { + RayConfig::instance().testing_rpc_failure() = "method1=0:25:25,method2=1:100:0"; + Init(); + ASSERT_EQ(GetRpcFailure("unknown"), RpcFailure::None); + ASSERT_EQ(GetRpcFailure("method1"), RpcFailure::None); + // At most one failure. + ASSERT_TRUE(GetRpcFailure("method2") == RpcFailure::Request); + ASSERT_TRUE(GetRpcFailure("method2") == RpcFailure::None); +} + +TEST(RpcChaosTest, MethodRpcFailureEdgeCase) { + RayConfig::instance().testing_rpc_failure() = + "method1=1000:100:0,method2=1000:0:100,method3=1000:0:0"; + Init(); + for (int i = 0; i < 1000; i++) { + ASSERT_EQ(GetRpcFailure("method1"), RpcFailure::Request); + ASSERT_EQ(GetRpcFailure("method2"), RpcFailure::Response); + ASSERT_EQ(GetRpcFailure("method3"), RpcFailure::None); + } +} + +TEST(RpcChaosTest, WildcardRpcFailure) { + RayConfig::instance().testing_rpc_failure() = "*=-1:100:0"; + Init(); + for (int i = 0; i < 100; i++) { + ASSERT_EQ(GetRpcFailure("method"), RpcFailure::Request); + } + + RayConfig::instance().testing_rpc_failure() = "*=-1:0:100"; + Init(); + for (int i = 0; i < 100; i++) { + ASSERT_EQ(GetRpcFailure("method"), RpcFailure::Response); + } + + RayConfig::instance().testing_rpc_failure() = "*=-1:0:0"; + Init(); + for (int i = 0; i < 100; i++) { + ASSERT_EQ(GetRpcFailure("method"), RpcFailure::None); + } +} + +} // namespace ray::rpc::testing diff --git a/src/ray/rpc/worker/core_worker_client.cc 
b/src/ray/rpc/worker/core_worker_client.cc index a182c020de7d..6863511dfe05 100644 --- a/src/ray/rpc/worker/core_worker_client.cc +++ b/src/ray/rpc/worker/core_worker_client.cc @@ -110,7 +110,7 @@ void CoreWorkerClient::SendRequests() { [this, this_ptr, seq_no, task_size, callback = std::move(pair.second)]( Status status, rpc::PushTaskReply &&reply) { { - absl::MutexLock lock(&mutex_); + absl::MutexLock lk(&mutex_); if (seq_no > max_finished_seq_no_) { max_finished_seq_no_ = seq_no; } diff --git a/src/ray/rpc/worker/core_worker_client.h b/src/ray/rpc/worker/core_worker_client.h index 51a06cdb7166..0f099dd3e86e 100644 --- a/src/ray/rpc/worker/core_worker_client.h +++ b/src/ray/rpc/worker/core_worker_client.h @@ -37,11 +37,11 @@ namespace std { template <> struct hash { size_t operator()(const ray::rpc::Address &addr) const { - size_t hash = std::hash()(addr.port()); - hash ^= std::hash()(addr.ip_address()); - hash ^= std::hash()(addr.worker_id()); - hash ^= std::hash()(addr.raylet_id()); - return hash; + size_t hash_value = std::hash()(addr.port()); + hash_value ^= std::hash()(addr.ip_address()); + hash_value ^= std::hash()(addr.worker_id()); + hash_value ^= std::hash()(addr.node_id()); + return hash_value; } }; } // namespace std @@ -109,7 +109,7 @@ class CoreWorkerClientInterface : public pubsub::SubscriberClientInterface { const ClientCallback &callback) {} /// Ask the owner of an object about the object's current status. - virtual void GetObjectStatus(const GetObjectStatusRequest &request, + virtual void GetObjectStatus(GetObjectStatusRequest &&request, const ClientCallback &callback) {} /// Ask the actor's owner to reply when the actor has no references. 
@@ -128,7 +128,7 @@ class CoreWorkerClientInterface : public pubsub::SubscriberClientInterface { const ClientCallback &callback) {} virtual void UpdateObjectLocationBatch( - const UpdateObjectLocationBatchRequest &request, + UpdateObjectLocationBatchRequest &&request, const ClientCallback &callback) {} virtual void GetObjectLocationsOwner( @@ -136,7 +136,7 @@ class CoreWorkerClientInterface : public pubsub::SubscriberClientInterface { const ClientCallback &callback) {} virtual void ReportGeneratorItemReturns( - const ReportGeneratorItemReturnsRequest &request, + ReportGeneratorItemReturnsRequest &&request, const ClientCallback &callback) {} /// Tell this actor to exit immediately. @@ -192,6 +192,8 @@ class CoreWorkerClientInterface : public pubsub::SubscriberClientInterface { virtual void FreeActorObject(const FreeActorObjectRequest &request, const ClientCallback &callback) {} + virtual std::string DebugString() const { return ""; } + virtual ~CoreWorkerClientInterface() = default; }; @@ -369,6 +371,8 @@ class CoreWorkerClient : public std::enable_shared_from_this, CoreWorkerService, NumPendingTasks, *request, callback, grpc_client_, timeout_ms); } + std::string DebugString() const override { return ""; } + /// Send as many pending tasks as possible. This method is thread-safe. 
/// /// The client will guarantee no more than kMaxBytesInFlight bytes of RPCs are being diff --git a/src/ray/rpc/worker/core_worker_client_pool.cc b/src/ray/rpc/worker/core_worker_client_pool.cc index 99df4075a900..33a1c673ccf5 100644 --- a/src/ray/rpc/worker/core_worker_client_pool.cc +++ b/src/ray/rpc/worker/core_worker_client_pool.cc @@ -30,7 +30,7 @@ std::function CoreWorkerClientPool::GetDefaultUnavailableTimeoutCallback rpc::RayletClientPool *raylet_client_pool, const rpc::Address &addr) { return [addr, gcs_client, worker_client_pool, raylet_client_pool]() { - const NodeID node_id = NodeID::FromBinary(addr.raylet_id()); + const NodeID node_id = NodeID::FromBinary(addr.node_id()); const WorkerID worker_id = WorkerID::FromBinary(addr.worker_id()); auto check_worker_alive = [raylet_client_pool, @@ -39,8 +39,7 @@ std::function CoreWorkerClientPool::GetDefaultUnavailableTimeoutCallback node_id](const rpc::GcsNodeInfo &node_info) { auto raylet_addr = RayletClientPool::GenerateRayletAddress( node_id, node_info.node_manager_address(), node_info.node_manager_port()); - auto raylet_client = - raylet_client_pool->GetOrConnectByAddress(std::move(raylet_addr)); + auto raylet_client = raylet_client_pool->GetOrConnectByAddress(raylet_addr); raylet_client->IsLocalWorkerDead( worker_id, [worker_client_pool, worker_id, node_id](const Status &status, @@ -122,7 +121,7 @@ std::shared_ptr CoreWorkerClientPool::GetOrConnect( RemoveIdleClients(); CoreWorkerClientEntry entry; - auto node_id = NodeID::FromBinary(addr_proto.raylet_id()); + auto node_id = NodeID::FromBinary(addr_proto.node_id()); auto worker_id = WorkerID::FromBinary(addr_proto.worker_id()); auto it = worker_client_map_.find(worker_id); if (it != worker_client_map_.end()) { @@ -138,15 +137,15 @@ std::shared_ptr CoreWorkerClientPool::GetOrConnect( RAY_LOG(DEBUG) << "Connected to worker " << worker_id << " with address " << BuildAddress(addr_proto.ip_address(), addr_proto.port()); - return entry.core_worker_client; + 
return entry.core_worker_client_; } void CoreWorkerClientPool::RemoveIdleClients() { while (!client_list_.empty()) { - auto worker_id = client_list_.back().worker_id; - auto node_id = client_list_.back().node_id; + auto worker_id = client_list_.back().worker_id_; + auto node_id = client_list_.back().node_id_; // The last client in the list is the least recent accessed client. - if (client_list_.back().core_worker_client->IsIdleAfterRPCs()) { + if (client_list_.back().core_worker_client_->IsIdleAfterRPCs()) { worker_client_map_.erase(worker_id); EraseFromNodeClientMap(node_id, worker_id); client_list_.pop_back(); @@ -169,7 +168,7 @@ void CoreWorkerClientPool::Disconnect(ray::WorkerID id) { if (it == worker_client_map_.end()) { return; } - EraseFromNodeClientMap(it->second->node_id, /*worker_id=*/id); + EraseFromNodeClientMap(it->second->node_id_, /*worker_id=*/id); client_list_.erase(it->second); worker_client_map_.erase(it); } diff --git a/src/ray/rpc/worker/core_worker_client_pool.h b/src/ray/rpc/worker/core_worker_client_pool.h index f26ae8d81938..4e33fd8ca2c4 100644 --- a/src/ray/rpc/worker/core_worker_client_pool.h +++ b/src/ray/rpc/worker/core_worker_client_pool.h @@ -23,9 +23,9 @@ #include "absl/container/flat_hash_map.h" #include "absl/synchronization/mutex.h" #include "ray/common/id.h" -#include "ray/gcs/gcs_client/gcs_client.h" -#include "ray/raylet_client/raylet_client.h" -#include "ray/rpc/node_manager/raylet_client_pool.h" +#include "ray/gcs_client/gcs_client.h" +#include "ray/rpc/raylet/raylet_client_interface.h" +#include "ray/rpc/raylet/raylet_client_pool.h" #include "ray/rpc/worker/core_worker_client.h" namespace ray { @@ -90,13 +90,13 @@ class CoreWorkerClientPool { CoreWorkerClientEntry(WorkerID worker_id, NodeID node_id, std::shared_ptr core_worker_client) - : worker_id(std::move(worker_id)), - node_id(std::move(node_id)), - core_worker_client(std::move(core_worker_client)) {} + : worker_id_(std::move(worker_id)), + node_id_(std::move(node_id)), 
+ core_worker_client_(std::move(core_worker_client)) {} - WorkerID worker_id; - NodeID node_id; - std::shared_ptr core_worker_client; + WorkerID worker_id_; + NodeID node_id_; + std::shared_ptr core_worker_client_; }; /// A list of open connections from the most recent accessed to the least recent diff --git a/src/ray/rpc/worker/core_worker_server.h b/src/ray/rpc/worker/core_worker_server.h deleted file mode 100644 index 39a6a918414e..000000000000 --- a/src/ray/rpc/worker/core_worker_server.h +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "ray/common/asio/instrumented_io_context.h" -#include "ray/rpc/grpc_server.h" -#include "ray/rpc/server_call.h" -#include "src/ray/protobuf/core_worker.grpc.pb.h" -#include "src/ray/protobuf/core_worker.pb.h" - -namespace ray { - -class CoreWorker; - -namespace rpc { -/// TODO(vitsai): Remove this when auth is implemented for node manager -#define RAY_CORE_WORKER_RPC_SERVICE_HANDLER(METHOD) \ - RPC_SERVICE_HANDLER_CUSTOM_AUTH_SERVER_METRICS_DISABLED( \ - CoreWorkerService, METHOD, -1, AuthType::NO_AUTH) - -/// NOTE: See src/ray/core_worker/core_worker.h on how to add a new grpc handler. -/// Disable gRPC server metrics since it incurs too high cardinality. 
-#define RAY_CORE_WORKER_RPC_HANDLERS \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(PushTask) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(ActorCallArgWaitComplete) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(RayletNotifyGCSRestart) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(GetObjectStatus) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(WaitForActorRefDeleted) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(PubsubLongPolling) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(PubsubCommandBatch) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(UpdateObjectLocationBatch) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(GetObjectLocationsOwner) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(ReportGeneratorItemReturns) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(KillActor) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(CancelTask) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(RemoteCancelTask) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(RegisterMutableObjectReader) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(GetCoreWorkerStats) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(LocalGC) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(DeleteObjects) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(SpillObjects) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(RestoreSpilledObjects) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(DeleteSpilledObjects) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(PlasmaObjectReady) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(Exit) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(AssignObjectOwner) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(NumPendingTasks) \ - RAY_CORE_WORKER_RPC_SERVICE_HANDLER(FreeActorObject) - -#define RAY_CORE_WORKER_DECLARE_RPC_HANDLERS \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(PushTask) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(ActorCallArgWaitComplete) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(RayletNotifyGCSRestart) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(GetObjectStatus) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(WaitForActorRefDeleted) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(PubsubLongPolling) \ - 
DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(PubsubCommandBatch) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(UpdateObjectLocationBatch) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(GetObjectLocationsOwner) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(ReportGeneratorItemReturns) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(KillActor) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(CancelTask) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(RemoteCancelTask) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(RegisterMutableObjectReader) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(GetCoreWorkerStats) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(LocalGC) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(DeleteObjects) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(SpillObjects) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(RestoreSpilledObjects) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(DeleteSpilledObjects) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(PlasmaObjectReady) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(Exit) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(AssignObjectOwner) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(NumPendingTasks) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(FreeActorObject) - -/// Interface of the `CoreWorkerServiceHandler`, see `src/ray/protobuf/core_worker.proto`. -class CoreWorkerServiceHandler : public DelayedServiceHandler { - public: - /// Blocks until the service is ready to serve RPCs. - virtual void WaitUntilInitialized() = 0; - - /// Handlers. For all of the following handlers, the implementations can - /// handle the request asynchronously. When handling is done, the - /// `send_reply_callback` should be called. See - /// src/ray/rpc/node_manager/node_manager_client.h and - /// src/ray/protobuf/node_manager.proto for a description of the - /// functionality of each handler. - /// - /// \param[in] request The request message. - /// \param[out] reply The reply message. - /// \param[in] send_reply_callback The callback to be called when the request is done. 
- RAY_CORE_WORKER_DECLARE_RPC_HANDLERS -}; - -/// The `GrpcServer` for `CoreWorkerService`. -class CoreWorkerGrpcService : public GrpcService { - public: - /// Constructor. - /// - /// \param[in] main_service See super class. - /// \param[in] handler The service handler that actually handle the requests. - CoreWorkerGrpcService(instrumented_io_context &main_service, - CoreWorkerServiceHandler &service_handler) - : GrpcService(main_service), service_handler_(service_handler) {} - - protected: - grpc::Service &GetGrpcService() override { return service_; } - - void InitServerCallFactories( - const std::unique_ptr &cq, - std::vector> *server_call_factories, - const ClusterID &cluster_id) override { - RAY_CORE_WORKER_RPC_HANDLERS - } - - private: - /// The grpc async service object. - CoreWorkerService::AsyncService service_; - - /// The service handler that actually handles the requests. - CoreWorkerServiceHandler &service_handler_; -}; - -} // namespace rpc -} // namespace ray diff --git a/src/ray/stats/BUILD.bazel b/src/ray/stats/BUILD.bazel index 4a088305a22b..2be6459bb0fd 100644 --- a/src/ray/stats/BUILD.bazel +++ b/src/ray/stats/BUILD.bazel @@ -1,4 +1,4 @@ -load("//bazel:ray.bzl", "ray_cc_library", "ray_cc_test") +load("//bazel:ray.bzl", "ray_cc_library") ray_cc_library( name = "stats_metric", @@ -10,12 +10,12 @@ ray_cc_library( hdrs = [ "metric.h", "metric_defs.h", - "tag_defs.h", ], deps = [ + ":tag_defs", "//src/ray/common:ray_config", - "//src/ray/telemetry:open_telemetry_metric_recorder", - "//src/ray/util", + "//src/ray/observability:metric_interface", + "//src/ray/observability:open_telemetry_metric_recorder", "//src/ray/util:logging", "//src/ray/util:size_literals", "@com_github_jupp0r_prometheus_cpp//pull", @@ -38,7 +38,6 @@ ray_cc_library( "metric.h", "metric_exporter.h", "stats.h", - "tag_defs.h", ], linkopts = select({ "@platforms//os:windows": [ @@ -49,6 +48,8 @@ ray_cc_library( }), deps = [ ":stats_metric", + ":tag_defs", + 
"//src/ray/observability:metric_interface", "//src/ray/rpc:metrics_agent_client", "//src/ray/util:network_util", "//src/ray/util:size_literals", @@ -56,33 +57,11 @@ ray_cc_library( ], ) -ray_cc_test( - name = "stats_test", - size = "small", - srcs = ["stats_test.cc"], - tags = [ - "no_tsan", - "stats", - "team:core", - ], - deps = [ - ":stats_lib", - "@com_google_googletest//:gtest_main", - ], -) - -ray_cc_test( - name = "metric_exporter_grpc_test", - size = "small", - srcs = [ - "metric_exporter_grpc_test.cc", - ], - tags = [ - "stats", - "team:core", - ], +ray_cc_library( + name = "tag_defs", + srcs = ["tag_defs.cc"], + hdrs = ["tag_defs.h"], deps = [ - ":stats_lib", - "@com_google_googletest//:gtest_main", + "//src/ray/observability:metric_interface", ], ) diff --git a/src/ray/stats/metric.cc b/src/ray/stats/metric.cc index e5e620fbb5ae..89ba9e21303e 100644 --- a/src/ray/stats/metric.cc +++ b/src/ray/stats/metric.cc @@ -113,32 +113,19 @@ void Metric::Record(double value, TagsType tags) { return; } - if (::RayConfig::instance().experimental_enable_open_telemetry_on_core()) { - // Register the metric if it hasn't been registered yet; otherwise, this is a no-op. - // We defer metric registration until the first time it's recorded, rather than during - // construction, to avoid issues with static initialization order. Specifically, our - // internal Metric objects (see metric_defs.h) are declared as static, and - // constructing another static object within their constructor can lead to crashes at - // program exit due to unpredictable destruction order. - // - // Once these internal Metric objects are migrated to use DEFINE_stats, we can - // safely move the registration logic to the constructor. See - // https://github.com/ray-project/ray/issues/54538 for the backlog of Ray metric infra - // improvements. - // - // This function is thread-safe. 
- RegisterOpenTelemetryMetric(); + if (::RayConfig::instance().enable_open_telemetry()) { // Collect tags from both the metric-specific tags and the global tags. absl::flat_hash_map open_telemetry_tags; - std::unordered_set tag_keys_set; + // Add default values for missing tag keys. for (const auto &tag_key : tag_keys_) { - tag_keys_set.insert(tag_key.name()); + open_telemetry_tags[tag_key.name()] = ""; } // Insert metric-specific tags that match the expected keys. for (const auto &tag : tags) { const std::string &key = tag.first.name(); - if (tag_keys_set.count(key)) { - open_telemetry_tags[key] = tag.second; + auto it = open_telemetry_tags.find(key); + if (it != open_telemetry_tags.end()) { + it->second = tag.second; } } // Add global tags, overwriting any existing tag keys. @@ -175,7 +162,7 @@ void Metric::Record(double value, TagsType tags) { } void Metric::Record(double value, - std::unordered_map tags) { + const std::unordered_map &tags) { TagsType tags_pair_vec; tags_pair_vec.reserve(tags.size()); std::for_each(tags.begin(), tags.end(), [&tags_pair_vec](auto &tag) { @@ -185,7 +172,8 @@ void Metric::Record(double value, Record(value, std::move(tags_pair_vec)); } -void Metric::Record(double value, std::unordered_map tags) { +void Metric::Record(double value, + const std::unordered_map &tags) { TagsType tags_pair_vec; tags_pair_vec.reserve(tags.size()); std::for_each(tags.begin(), tags.end(), [&tags_pair_vec](auto &tag) { diff --git a/src/ray/stats/metric.h b/src/ray/stats/metric.h index daacf41e1c10..4ae0c4ec3f4a 100644 --- a/src/ray/stats/metric.h +++ b/src/ray/stats/metric.h @@ -29,17 +29,16 @@ #include "opencensus/stats/stats_exporter.h" #include "opencensus/tags/tag_key.h" #include "ray/common/ray_config.h" -#include "ray/telemetry/open_telemetry_metric_recorder.h" +#include "ray/observability/metric_interface.h" +#include "ray/observability/open_telemetry_metric_recorder.h" +#include "ray/stats/tag_defs.h" #include "ray/util/logging.h" namespace ray { 
namespace stats { -/// Include tag_defs.h to define tag items -#include "ray/stats/tag_defs.h" - -using OpenTelemetryMetricRecorder = ray::telemetry::OpenTelemetryMetricRecorder; +using OpenTelemetryMetricRecorder = ray::observability::OpenTelemetryMetricRecorder; /// StatsConfig per process. /// Note that this is not thread-safe. Don't modify its internal values @@ -107,7 +106,7 @@ class StatsConfig final { }; /// A thin wrapper that wraps the `opencensus::tag::measure` for using it simply. -class Metric { +class Metric : public observability::MetricInterface { public: Metric(const std::string &name, std::string description, @@ -124,20 +123,22 @@ class Metric { const std::string &GetName() const { return name_; } /// Record the value for this metric. - void Record(double value) { Record(value, TagsType{}); } + void Record(double value) override { Record(value, TagsType{}); } /// Record the value for this metric. /// /// \param value The value that we record. /// \param tags The tag values that we want to record for this metric record. - void Record(double value, TagsType tags); + void Record(double value, TagsType tags) override; /// Record the value for this metric. /// /// \param value The value that we record. /// \param tags The map tag values that we want to record for this metric record. 
- void Record(double value, std::unordered_map tags); - void Record(double value, std::unordered_map tags); + void Record(double value, + const std::unordered_map &tags) override; + void Record(double value, + const std::unordered_map &tags) override; protected: virtual void RegisterView() = 0; @@ -163,7 +164,11 @@ class Gauge : public Metric { const std::string &description, const std::string &unit, const std::vector &tag_keys = {}) - : Metric(name, description, unit, tag_keys) {} + : Metric(name, description, unit, tag_keys) { + if (::RayConfig::instance().enable_open_telemetry()) { + RegisterOpenTelemetryMetric(); + } + } private: void RegisterView() override; @@ -178,7 +183,11 @@ class Histogram : public Metric { const std::string &unit, const std::vector &boundaries, const std::vector &tag_keys = {}) - : Metric(name, description, unit, tag_keys), boundaries_(boundaries) {} + : Metric(name, description, unit, tag_keys), boundaries_(boundaries) { + if (::RayConfig::instance().enable_open_telemetry()) { + RegisterOpenTelemetryMetric(); + } + } private: void RegisterView() override; @@ -195,7 +204,11 @@ class Count : public Metric { const std::string &description, const std::string &unit, const std::vector &tag_keys = {}) - : Metric(name, description, unit, tag_keys) {} + : Metric(name, description, unit, tag_keys) { + if (::RayConfig::instance().enable_open_telemetry()) { + RegisterOpenTelemetryMetric(); + } + } private: void RegisterView() override; @@ -209,7 +222,11 @@ class Sum : public Metric { const std::string &description, const std::string &unit, const std::vector &tag_keys = {}) - : Metric(name, description, unit, tag_keys) {} + : Metric(name, description, unit, tag_keys) { + if (::RayConfig::instance().enable_open_telemetry()) { + RegisterOpenTelemetryMetric(); + } + } private: void RegisterView() override; @@ -265,7 +282,7 @@ void RegisterView(const std::string &name, const std::string &description, const std::vector &tag_keys, const std::vector 
&buckets) { - if (!::RayConfig::instance().experimental_enable_open_telemetry_on_core()) { + if (!::RayConfig::instance().enable_open_telemetry()) { // OpenTelemetry is not enabled, register the view as an OpenCensus view. using I = StatsTypeMap; auto view_descriptor = opencensus::stats::ViewDescriptor() @@ -317,16 +334,6 @@ inline std::vector convert_tags( return ret; } -inline std::unordered_set build_tag_key_set( - const std::vector &tag_keys) { - std::unordered_set tag_keys_set; - tag_keys_set.reserve(tag_keys.size()); - for (const auto &tag_key : tag_keys) { - tag_keys_set.insert(tag_key); - } - return tag_keys_set; -} - /* This is a helper class to define a metrics. With this class we'll be able to define a multi-view-single-measure metric for @@ -349,9 +356,7 @@ class Stats { const std::string, const std::vector, const std::vector &buckets)> register_func) - : name_(measure), - tag_keys_(convert_tags(tag_keys)), - tag_keys_set_(build_tag_key_set(tag_keys)) { + : name_(measure), tag_keys_(convert_tags(tag_keys)) { auto stats_init = [register_func, measure, description, buckets, this]() { measure_ = std::make_unique(Measure::Register(measure, description, "")); register_func(measure, description, tag_keys_, buckets); @@ -381,10 +386,14 @@ class Stats { absl::flat_hash_map open_telemetry_tags; // Insert metric-specific tags that match the expected keys. + for (const auto &tag_key : tag_keys_) { + open_telemetry_tags[tag_key.name()] = ""; + } for (const auto &tag : open_census_tags) { const std::string &key = tag.first.name(); - if (tag_keys_set_.count(key) != 0) { - open_telemetry_tags[key] = tag.second; + auto it = open_telemetry_tags.find(key); + if (it != open_telemetry_tags.end()) { + it->second = tag.second; } } // Add global tags, overwriting any existing tag keys. 
@@ -462,7 +471,6 @@ class Stats { const std::string name_; // TODO: Depricate `tag_keys_` once we have fully migrated away from opencensus const std::vector tag_keys_; - const std::unordered_set tag_keys_set_; std::unique_ptr> measure_; }; diff --git a/src/ray/stats/metric_defs.cc b/src/ray/stats/metric_defs.cc index 0edf7668e4a7..b42d661eec64 100644 --- a/src/ray/stats/metric_defs.cc +++ b/src/ray/stats/metric_defs.cc @@ -37,24 +37,6 @@ namespace ray::stats { /// =========== PUBLIC METRICS; keep in sync with ray-metrics.rst ================= /// =============================================================================== -/// Tracks tasks by state, including pending, running, and finished tasks. -/// This metric may be recorded from multiple components processing the task in Ray, -/// including the submitting core worker, executor core worker, and pull manager. -/// -/// To avoid metric collection conflicts between components reporting on the same task, -/// we use the "Source" required label. -DEFINE_stats( - tasks, - "Current number of tasks currently in a particular state.", - // State: the task state, as described by rpc::TaskState proto in common.proto. - // Name: the name of the function called (Keep in sync with the - // TASK_OR_ACTOR_NAME_TAG_KEY in python/ray/_private/telemetry/metric_cardinality.py) - // Source: component reporting, e.g., "core_worker", "executor", or "pull_manager". - // IsRetry: whether this task is a retry. - ("State", "Name", "Source", "IsRetry", "JobId"), - (), - ray::stats::GAUGE); - /// Tracks actors by state, including pending, running, and idle actors. 
/// /// To avoid metric collection conflicts between components reporting on the same task, @@ -159,16 +141,19 @@ DEFINE_stats(io_context_event_loop_lag_ms, ray::stats::GAUGE); /// Event stats -DEFINE_stats(operation_count, "operation count", ("Method"), (), ray::stats::GAUGE); -DEFINE_stats( - operation_run_time_ms, "operation execution time", ("Method"), (), ray::stats::GAUGE); +DEFINE_stats(operation_count, "operation count", ("Name"), (), ray::stats::COUNT); +DEFINE_stats(operation_run_time_ms, + "operation execution time", + ("Name"), + ({1, 10, 100, 1000, 10000}), + ray::stats::HISTOGRAM); +DEFINE_stats(operation_queue_time_ms, + "operation queuing time", + ("Name"), + ({1, 10, 100, 1000, 10000}), + ray::stats::HISTOGRAM); DEFINE_stats( - operation_queue_time_ms, "operation queuing time", ("Method"), (), ray::stats::GAUGE); -DEFINE_stats(operation_active_count, - "activate operation number", - ("Method"), - (), - ray::stats::GAUGE); + operation_active_count, "active operation number", ("Name"), (), ray::stats::GAUGE); /// GRPC server DEFINE_stats(grpc_server_req_process_time_ms, diff --git a/src/ray/stats/metric_defs.h b/src/ray/stats/metric_defs.h index ebd44994d3a0..8a78603f3968 100644 --- a/src/ray/stats/metric_defs.h +++ b/src/ray/stats/metric_defs.h @@ -42,9 +42,6 @@ namespace stats { /// ray_[component]_[metrics_name]_total (e.g., ray_pull_manager_total) /// -/// Tasks stats, broken down by state. -DECLARE_stats(tasks); - /// Actor stats, broken down by state. DECLARE_stats(actors); @@ -135,161 +132,6 @@ DECLARE_stats(memory_manager_worker_eviction_total); /// Core Worker Task Manager DECLARE_stats(total_lineage_bytes); -/// The below items are legacy implementation of metrics. -/// TODO(sang): Use DEFINE_stats instead. 
- -/// -/// Common -/// -/// RPC -static Histogram GcsLatency("gcs_latency", - "The latency of a GCS (by default Redis) operation.", - "us", - {100, 200, 300, 400, 500, 600, 700, 800, 900, 1000}, - {kCustomKey}); - -/// -/// Raylet Metrics -/// - -/// Raylet Resource Manager -static Gauge TestMetrics("local_available_resource", - "The available resources on this node.", - "", - {kResourceNameKey}); - -static Gauge LocalTotalResource("local_total_resource", - "The total resources on this node.", - "", - {kResourceNameKey}); - -/// Object Manager. -static Gauge ObjectStoreAvailableMemory( - "object_store_available_memory", - "Amount of memory currently available in the object store.", - "bytes"); - -static Gauge ObjectStoreUsedMemory( - "object_store_used_memory", - "Amount of memory currently occupied in the object store.", - "bytes"); - -static Gauge ObjectStoreFallbackMemory( - "object_store_fallback_memory", - "Amount of memory in fallback allocations in the filesystem.", - "bytes"); - -static Gauge ObjectStoreLocalObjects("object_store_num_local_objects", - "Number of objects currently in the object store.", - "objects"); - -static Gauge ObjectManagerPullRequests("object_manager_num_pull_requests", - "Number of active pull requests for objects.", - "requests"); - -/// Object Directory. -static Gauge ObjectDirectoryLocationSubscriptions( - "object_directory_subscriptions", - "Number of object location subscriptions. If this is high, the raylet is attempting " - "to pull a lot of objects.", - "subscriptions"); - -static Gauge ObjectDirectoryLocationUpdates( - "object_directory_updates", - "Number of object location updates per second., If this is high, the raylet is " - "attempting to pull a lot of objects and/or the locations for objects are frequently " - "changing (e.g. due to many object copies or evictions).", - "updates"); - -static Gauge ObjectDirectoryLocationLookups( - "object_directory_lookups", - "Number of object location lookups per second. 
If this is high, the raylet is " - "waiting on a lot of objects.", - "lookups"); - -static Gauge ObjectDirectoryAddedLocations( - "object_directory_added_locations", - "Number of object locations added per second., If this is high, a lot of objects " - "have been added on this node.", - "additions"); - -static Gauge ObjectDirectoryRemovedLocations( - "object_directory_removed_locations", - "Number of object locations removed per second. If this is high, a lot of objects " - "have been removed from this node.", - "removals"); - -static Sum NumWorkersStarted( - "internal_num_processes_started", - "The total number of worker processes the worker pool has created.", - "processes"); - -static Sum NumCachedWorkersSkippedJobMismatch( - "internal_num_processes_skipped_job_mismatch", - "The total number of cached workers skipped due to job mismatch.", - "workers"); - -static Sum NumCachedWorkersSkippedRuntimeEnvironmentMismatch( - "internal_num_processes_skipped_runtime_environment_mismatch", - "The total number of cached workers skipped due to runtime environment mismatch.", - "workers"); - -static Sum NumCachedWorkersSkippedDynamicOptionsMismatch( - "internal_num_processes_skipped_dynamic_options_mismatch", - "The total number of cached workers skipped due to dynamic options mismatch.", - "workers"); - -static Sum NumWorkersStartedFromCache( - "internal_num_processes_started_from_cache", - "The total number of workers started from a cached worker process.", - "workers"); - -static Gauge NumSpilledTasks("internal_num_spilled_tasks", - "The cumulative number of lease requeusts that this raylet " - "has spilled to other raylets.", - "tasks"); - -static Gauge NumInfeasibleSchedulingClasses( - "internal_num_infeasible_scheduling_classes", - "The number of unique scheduling classes that are infeasible.", - "tasks"); - -/// -/// GCS Server Metrics -/// - -/// Workers -static Count UnintentionalWorkerFailures( - "unintentional_worker_failures_total", - "Number of worker failures 
that are not intentional. For example, worker failures " - "due to system related errors.", - ""); - -/// Nodes -static Count NodeFailureTotal( - "node_failure_total", - "Number of node failures that have happened in the cluster.", - ""); - -/// Resources -static Histogram OutboundHeartbeatSizeKB("outbound_heartbeat_size_kb", - "Outbound heartbeat payload size", - "kb", - {10, 50, 100, 1000, 10000, 100000}); - -static Histogram GcsUpdateResourceUsageTime( - "gcs_update_resource_usage_time", - "The average RTT of a UpdateResourceUsage RPC.", - "ms", - {1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000}, - {kCustomKey}); - -/// Testing -static Gauge LiveActors("live_actors", "Number of live actors.", "actors"); -static Gauge RestartingActors("restarting_actors", - "Number of restarting actors.", - "actors"); - } // namespace stats } // namespace ray diff --git a/src/ray/stats/metric_exporter.cc b/src/ray/stats/metric_exporter.cc index 5a3762ab45d0..76bc63abe4d8 100644 --- a/src/ray/stats/metric_exporter.cc +++ b/src/ray/stats/metric_exporter.cc @@ -30,26 +30,31 @@ OpenCensusProtoExporter::OpenCensusProtoExporter(const int port, const WorkerID &worker_id, size_t report_batch_size, size_t max_grpc_payload_size) - : OpenCensusProtoExporter( - std::make_shared(address, port, io_service), - worker_id, - report_batch_size, - max_grpc_payload_size) {} + : client_call_manager_( + std::make_unique(io_service, /*record_stats=*/true)), + worker_id_(worker_id), + report_batch_size_(report_batch_size), + // To make sure we're not overflowing Agent's set gRPC max message size, we will be + // tracking target payload binary size and make sure it stays w/in 95% of the + // threshold + proto_payload_size_threshold_bytes_((size_t)(max_grpc_payload_size * .95f)) { + absl::MutexLock l(&mu_); + client_ = std::make_shared( + address, port, io_service, *client_call_manager_); +} OpenCensusProtoExporter::OpenCensusProtoExporter( std::shared_ptr agent_client, const WorkerID &worker_id, size_t 
report_batch_size, size_t max_grpc_payload_size) + : worker_id_(worker_id), report_batch_size_(report_batch_size), - // To make sure we're not overflowing Agent's set gRPC max message size, we will be - // tracking target payload binary size and make sure it stays w/in 95% of the - // threshold proto_payload_size_threshold_bytes_((size_t)(max_grpc_payload_size * .95f)) { absl::MutexLock l(&mu_); client_ = std::move(agent_client); -}; +} /// Hack. We want to add GlobalTags to all our metrics, but gRPC OpenCencus plugin is not /// configurable at all so we don't have chance to add our own tags. We use this hack to diff --git a/src/ray/stats/metric_exporter.h b/src/ray/stats/metric_exporter.h index a2f00914a620..c1ea7197823c 100644 --- a/src/ray/stats/metric_exporter.h +++ b/src/ray/stats/metric_exporter.h @@ -24,7 +24,6 @@ #include "ray/rpc/metrics_agent_client.h" #include "ray/stats/metric.h" #include "ray/util/logging.h" -#include "ray/util/util.h" namespace ray { namespace stats { @@ -106,6 +105,7 @@ class OpenCensusProtoExporter final : public opencensus::stats::StatsExporter::H /// Lock to protect the client mutable absl::Mutex mu_; /// Client to call a metrics agent gRPC server. + std::unique_ptr client_call_manager_; std::shared_ptr client_ ABSL_GUARDED_BY(&mu_); /// The worker ID of the current component. 
WorkerID worker_id_; diff --git a/src/ray/stats/stats.h b/src/ray/stats/stats.h index 3bdadbe51f23..9347b70bfad2 100644 --- a/src/ray/stats/stats.h +++ b/src/ray/stats/stats.h @@ -28,9 +28,9 @@ #include "ray/common/asio/io_service_pool.h" #include "ray/common/id.h" #include "ray/common/ray_config.h" +#include "ray/observability/open_telemetry_metric_recorder.h" #include "ray/stats/metric.h" #include "ray/stats/metric_exporter.h" -#include "ray/telemetry/open_telemetry_metric_recorder.h" #include "ray/util/logging.h" #include "ray/util/network_util.h" @@ -40,12 +40,18 @@ namespace stats { #include -using OpenTelemetryMetricRecorder = ray::telemetry::OpenTelemetryMetricRecorder; +using OpenTelemetryMetricRecorder = ray::observability::OpenTelemetryMetricRecorder; // TODO(sang) Put all states and logic into a singleton class Stats. static std::shared_ptr metrics_io_service_pool; static absl::Mutex stats_mutex; +// Returns true if OpenCensus should be enabled. +static inline bool should_enable_open_census() { + return !RayConfig::instance().enable_open_telemetry() || + !RayConfig::instance().enable_grpc_metrics_collection_for().empty(); +} + /// Initialize stats for a process. /// NOTE: /// - stats::Init should be called only once per PROCESS. Redundant calls will be just @@ -84,14 +90,7 @@ static inline void Init( absl::Milliseconds(std::max(RayConfig::instance().metrics_report_interval_ms() / 2, static_cast(500)))); // Register the metric recorder. 
- if (RayConfig::instance().experimental_enable_open_telemetry_on_core()) { - OpenTelemetryMetricRecorder::GetInstance().RegisterGrpcExporter( - BuildAddress("127.0.0.1", metrics_agent_port), - std::chrono::milliseconds( - absl::ToInt64Milliseconds(StatsConfig::instance().GetReportInterval())), - std::chrono::milliseconds( - absl::ToInt64Milliseconds(StatsConfig::instance().GetHarvestInterval()))); - } else { + if (should_enable_open_census()) { metrics_io_service_pool = std::make_shared(1); metrics_io_service_pool->Run(); instrumented_io_context *metrics_io_service = metrics_io_service_pool->Get(); @@ -115,6 +114,27 @@ static inline void Init( StatsConfig::instance().SetIsInitialized(true); } +static inline void InitOpenTelemetryExporter(const int metrics_agent_port, + const Status &metrics_agent_server_status) { + if (!RayConfig::instance().enable_open_telemetry()) { + return; + } + if (!metrics_agent_server_status.ok()) { + RAY_LOG(ERROR) << "Failed to initialize OpenTelemetry exporter. Data will not be " + "exported to the " + << "metrics agent. Server status: " << metrics_agent_server_status; + return; + } + OpenTelemetryMetricRecorder::GetInstance().RegisterGrpcExporter( + /*endpoint=*/std::string("127.0.0.1:") + std::to_string(metrics_agent_port), + /*interval=*/ + std::chrono::milliseconds( + absl::ToInt64Milliseconds(StatsConfig::instance().GetReportInterval())), + /*timeout=*/ + std::chrono::milliseconds( + absl::ToInt64Milliseconds(StatsConfig::instance().GetHarvestInterval()))); +} + /// Shutdown the initialized stats library. /// This cleans up various threads and metadata for stats library. static inline void Shutdown() { @@ -123,9 +143,10 @@ static inline void Shutdown() { // Return if stats had never been initialized. 
return; } - if (RayConfig::instance().experimental_enable_open_telemetry_on_core()) { + if (RayConfig::instance().enable_open_telemetry()) { OpenTelemetryMetricRecorder::GetInstance().Shutdown(); - } else { + } + if (should_enable_open_census()) { metrics_io_service_pool->Stop(); opencensus::stats::DeltaProducer::Get()->Shutdown(); opencensus::stats::StatsExporter::Shutdown(); diff --git a/src/ray/stats/tag_defs.cc b/src/ray/stats/tag_defs.cc index 527a06007c87..600d5f1ef95b 100644 --- a/src/ray/stats/tag_defs.cc +++ b/src/ray/stats/tag_defs.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ray/stats/metric.h" +#include "ray/stats/tag_defs.h" namespace ray { namespace stats { diff --git a/src/ray/stats/tag_defs.h b/src/ray/stats/tag_defs.h index 38c2df149dfc..47d197f71161 100644 --- a/src/ray/stats/tag_defs.h +++ b/src/ray/stats/tag_defs.h @@ -14,11 +14,13 @@ #pragma once +#include "ray/observability/metric_interface.h" + /// The definitions of tag keys that you can use every where. /// You can follow these examples to define and register your tag keys. 
-using TagKeyType = opencensus::tags::TagKey; -using TagsType = std::vector>; +namespace ray { +namespace stats { extern const TagKeyType ComponentKey; @@ -66,3 +68,6 @@ constexpr char kObjectUnsealed[] = "UNSEALED"; // GCS task manager tags constexpr char kGcsTaskStatusEventDropped[] = "STATUS_EVENT"; constexpr char kGcsProfileEventDropped[] = "PROFILE_EVENT"; + +} // namespace stats +} // namespace ray diff --git a/src/ray/stats/tests/BUILD.bazel b/src/ray/stats/tests/BUILD.bazel index e1d3f7d66f0a..ac78d6de4d00 100644 --- a/src/ray/stats/tests/BUILD.bazel +++ b/src/ray/stats/tests/BUILD.bazel @@ -5,7 +5,7 @@ ray_cc_test( size = "small", srcs = ["metric_with_open_telemetry_test.cc"], env = { - "RAY_experimental_enable_open_telemetry_on_core": "1", + "RAY_enable_open_telemetry": "1", }, tags = ["team:core"], deps = [ @@ -13,3 +13,34 @@ ray_cc_test( "@com_google_googletest//:gtest_main", ], ) + +ray_cc_test( + name = "stats_test", + size = "small", + srcs = ["stats_test.cc"], + tags = [ + "no_tsan", + "stats", + "team:core", + ], + deps = [ + "//src/ray/stats:stats_lib", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "metric_exporter_grpc_test", + size = "small", + srcs = [ + "metric_exporter_grpc_test.cc", + ], + tags = [ + "stats", + "team:core", + ], + deps = [ + "//src/ray/stats:stats_lib", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/src/ray/stats/metric_exporter_grpc_test.cc b/src/ray/stats/tests/metric_exporter_grpc_test.cc similarity index 97% rename from src/ray/stats/metric_exporter_grpc_test.cc rename to src/ray/stats/tests/metric_exporter_grpc_test.cc index a823a863ebd3..a7553c88f792 100644 --- a/src/ray/stats/metric_exporter_grpc_test.cc +++ b/src/ray/stats/tests/metric_exporter_grpc_test.cc @@ -57,6 +57,12 @@ class MockMetricsAgentClient : public rpc::MetricsAgentClient { callback(Status::OK(), {}); } + void HealthCheck(const rpc::HealthCheckRequest &request, + const rpc::ClientCallback &callback) override 
{} + + void WaitForServerReady(std::function init_exporter_fn) override { + } + const std::vector &CollectedReportOCMetricsRequests() const { return reportOCMetricsRequests_; diff --git a/src/ray/stats/tests/metric_with_open_telemetry_test.cc b/src/ray/stats/tests/metric_with_open_telemetry_test.cc index d6aeb3cbb471..0ff403e939c9 100644 --- a/src/ray/stats/tests/metric_with_open_telemetry_test.cc +++ b/src/ray/stats/tests/metric_with_open_telemetry_test.cc @@ -14,14 +14,14 @@ #include "gtest/gtest.h" #include "ray/common/ray_config.h" +#include "ray/observability/open_telemetry_metric_recorder.h" #include "ray/stats/metric.h" -#include "ray/telemetry/open_telemetry_metric_recorder.h" namespace ray { -namespace telemetry { +namespace observability { using namespace std::literals; -using OpenTelemetryMetricRecorder = ray::telemetry::OpenTelemetryMetricRecorder; +using OpenTelemetryMetricRecorder = ray::observability::OpenTelemetryMetricRecorder; using StatsConfig = ray::stats::StatsConfig; using TagsMap = absl::flat_hash_map; @@ -176,14 +176,15 @@ INSTANTIATE_TEST_SUITE_P( GaugeMetricTest, ::testing::Values( // Gauge metric without global tags - GaugeMetricCase{/*metric_name=*/"metric_gauge_test", - /*record_value=*/42.0, - /*record_tags=*/ - {{stats::TagKeyType::Register("Tag1"), "Value1"}, - {stats::TagKeyType::Register("Tag2"), "Value1"}}, - /*global_tags=*/{}, // no global tags - /*expected_tags=*/{{"Tag1", "Value1"}, {"Tag2", "Value1"}}, - /*expected_value=*/42.0}, + GaugeMetricCase{ + /*metric_name=*/"metric_gauge_test", + /*record_value=*/42.0, + /*record_tags=*/ + {{stats::TagKeyType::Register("Tag1"), "Value1"}, + {stats::TagKeyType::Register("Tag2"), "Value1"}}, + /*global_tags=*/{}, // no global tags + /*expected_tags=*/{{"Tag1", "Value1"}, {"Tag2", "Value1"}, {"Tag3", ""}}, + /*expected_value=*/42.0}, // Gauge metric with a single global tag that is metric-specific GaugeMetricCase{/*metric_name=*/"metric_gauge_test", /*record_value=*/52.0, @@ -195,19 
+196,20 @@ INSTANTIATE_TEST_SUITE_P( {{"Tag1", "Value2"}, {"Tag2", "Value2"}, {"Tag3", "Global"}}, /*expected_value=*/52.0}, // Gauge metric with a non-metric-specific global tag - GaugeMetricCase{/*metric_name=*/"metric_gauge_test", - /*record_value=*/62.0, - /*record_tags=*/ - {{stats::TagKeyType::Register("Tag1"), "Value3"}, - {stats::TagKeyType::Register("Tag2"), "Value3"}}, - /*global_tags=*/ - { - {stats::TagKeyType::Register("Tag4"), - "Global"} // Tag4 not registered in metric definition - }, - /*expected_tags=*/ - {{"Tag1", "Value3"}, {"Tag2", "Value3"}, {"Tag4", "Global"}}, - /*expected_value=*/62.0}, + GaugeMetricCase{ + /*metric_name=*/"metric_gauge_test", + /*record_value=*/62.0, + /*record_tags=*/ + {{stats::TagKeyType::Register("Tag1"), "Value3"}, + {stats::TagKeyType::Register("Tag2"), "Value3"}}, + /*global_tags=*/ + { + {stats::TagKeyType::Register("Tag4"), + "Global"} // Tag4 not registered in metric definition + }, + /*expected_tags=*/ + {{"Tag1", "Value3"}, {"Tag2", "Value3"}, {"Tag3", ""}, {"Tag4", "Global"}}, + /*expected_value=*/62.0}, // Gauge metric where global tags overwrite record tags GaugeMetricCase{/*metric_name=*/"metric_gauge_test", /*record_value=*/72.0, @@ -230,8 +232,9 @@ INSTANTIATE_TEST_SUITE_P( /*global_tags=*/{}, // no global tags /*expected_tags=*/ {{"Tag1", "Value5"}, // unsupported tag dropped - {"Tag2", "Value5"}}, + {"Tag2", "Value5"}, + {"Tag3", ""}}, /*expected_value=*/82.0})); -} // namespace telemetry +} // namespace observability } // namespace ray diff --git a/src/ray/stats/stats_test.cc b/src/ray/stats/tests/stats_test.cc similarity index 95% rename from src/ray/stats/stats_test.cc rename to src/ray/stats/tests/stats_test.cc index a333ba335313..47d2c22c7663 100644 --- a/src/ray/stats/stats_test.cc +++ b/src/ray/stats/tests/stats_test.cc @@ -22,7 +22,8 @@ #include "absl/memory/memory.h" #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "ray/stats/metric_defs.h" +#include "ray/stats/metric.h" +#include 
"ray/stats/tag_defs.h" DEFINE_stats(test_hist, "TestStats", @@ -93,12 +94,18 @@ class StatsTest : public ::testing::Test { virtual void TearDown() override { Shutdown(); } void Shutdown() { ray::stats::Shutdown(); } + + protected: + ray::stats::Gauge ray_metric_test_metrics_{"local_available_resource", + "The available resources on this node.", + "", + {stats::kResourceNameKey}}; }; TEST_F(StatsTest, F) { for (size_t i = 0; i < 20; ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(50)); - stats::TestMetrics().Record(2345); + ray_metric_test_metrics_.Record(2345); } } diff --git a/src/ray/telemetry/BUILD.bazel b/src/ray/telemetry/BUILD.bazel deleted file mode 100644 index a55197186043..000000000000 --- a/src/ray/telemetry/BUILD.bazel +++ /dev/null @@ -1,18 +0,0 @@ -load("//bazel:ray.bzl", "ray_cc_library") - -ray_cc_library( - name = "open_telemetry_metric_recorder", - srcs = [ - "open_telemetry_metric_recorder.cc", - ], - hdrs = [ - "open_telemetry_metric_recorder.h", - ], - deps = [ - "//src/ray/util:logging", - "@com_google_absl//absl/container:flat_hash_map", - "@io_opentelemetry_cpp//api", - "@io_opentelemetry_cpp//exporters/otlp:otlp_grpc_metric_exporter", - "@io_opentelemetry_cpp//sdk/src/metrics", - ], -) diff --git a/src/ray/telemetry/tests/BUILD.bazel b/src/ray/telemetry/tests/BUILD.bazel deleted file mode 100644 index 78af015a1fd0..000000000000 --- a/src/ray/telemetry/tests/BUILD.bazel +++ /dev/null @@ -1,12 +0,0 @@ -load("//bazel:ray.bzl", "ray_cc_test") - -ray_cc_test( - name = "open_telemetry_metric_recorder_test", - size = "small", - srcs = ["open_telemetry_metric_recorder_test.cc"], - tags = ["team:core"], - deps = [ - "//src/ray/telemetry:open_telemetry_metric_recorder", - "@com_google_googletest//:gtest_main", - ], -) diff --git a/src/ray/util/BUILD.bazel b/src/ray/util/BUILD.bazel index 938a46b0b8d1..98e0c7a74860 100644 --- a/src/ray/util/BUILD.bazel +++ b/src/ray/util/BUILD.bazel @@ -112,6 +112,8 @@ ray_cc_library( deps = [ ":logging", 
":mutex_protected", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", ], ) @@ -123,7 +125,7 @@ ray_cc_library( ":logging", ":random", ":string_utils", - ":timestamp_utils", + ":time", "//src/ray/protobuf:event_cc_proto", "//src/ray/protobuf:export_event_cc_proto", "@boost//:asio", @@ -145,8 +147,9 @@ ray_cc_library( ) ray_cc_library( - name = "timestamp_utils", - hdrs = ["timestamp_utils.h"], + name = "time", + srcs = ["time.cc"], + hdrs = ["time.h"], ) ray_cc_library( @@ -196,14 +199,6 @@ ray_cc_library( ], ) -ray_cc_library( - name = "sample", - hdrs = ["sample.h"], - deps = [ - "@com_google_absl//absl/time", - ], -) - ray_cc_library( name = "cmd_line_utils", srcs = ["cmd_line_utils.cc"], @@ -219,26 +214,19 @@ ray_cc_library( srcs = ["network_util.cc"], hdrs = ["network_util.h"], deps = [ + ":filesystem", + ":string_utils", "@boost//:asio", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", ], ) -# TODO(hjiang): Split URL related functions into a separate util target. 
ray_cc_library( - name = "util", - srcs = ["util.cc"], - hdrs = ["util.h"], - deps = [ - ":filesystem", - ":logging", - ":macros", - ":process", - ":string_utils", - "//src/ray/thirdparty:sha256", - "@boost//:asio", - "@com_google_absl//absl/container:flat_hash_map", - ], + name = "raii", + hdrs = ["raii.h"], + deps = [], ) ray_cc_library( @@ -261,8 +249,8 @@ ray_cc_library( name = "shared_lru", hdrs = ["shared_lru.h"], deps = [ + ":logging", ":map_utils", - ":util", "@com_google_absl//absl/container:flat_hash_map", ], ) @@ -288,16 +276,19 @@ ray_cc_library( hdrs = ["pipe_logger.h"], deps = [ ":compat", + ":logging", ":spdlog_fd_sink", ":spdlog_newliner_sink", ":stream_redirection_options", ":thread_utils", - ":util", "//src/ray/common:ray_config", + "//src/ray/common:status", "@boost//:iostreams", "@com_github_spdlog//:spdlog", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/synchronization", ], ) @@ -309,9 +300,8 @@ ray_cc_library( ":pipe_logger", ":scoped_dup2_wrapper", ":stream_redirection_options", - ":util", "//src/ray/util/internal:stream_redirection_handle", - "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/container:flat_hash_map", ], ) @@ -320,7 +310,7 @@ ray_cc_library( hdrs = ["spdlog_fd_sink.h"], deps = [ ":compat", - ":util", + "//src/ray/common:status", "@com_github_spdlog//:spdlog", ], ) @@ -330,7 +320,6 @@ ray_cc_library( hdrs = ["spdlog_newliner_sink.h"], deps = [ ":compat", - ":util", "@com_github_spdlog//:spdlog", ], ) @@ -340,7 +329,7 @@ ray_cc_library( srcs = ["temporary_directory.cc"], hdrs = ["temporary_directory.h"], deps = [ - ":util", + "//src/ray/common:id", "@com_google_absl//absl/strings:str_format", ], ) diff --git a/src/ray/util/event.cc b/src/ray/util/event.cc index e704c8684738..bcb4422c6502 100644 --- a/src/ray/util/event.cc +++ b/src/ray/util/event.cc @@ -25,7 +25,7 @@ #include 
"ray/util/random.h" #include "ray/util/string_utils.h" -#include "ray/util/timestamp_utils.h" +#include "ray/util/time.h" using json = nlohmann::json; diff --git a/src/ray/util/internal/tests/BUILD.bazel b/src/ray/util/internal/tests/BUILD.bazel index bff2005719a1..a3de8e2dddf9 100644 --- a/src/ray/util/internal/tests/BUILD.bazel +++ b/src/ray/util/internal/tests/BUILD.bazel @@ -12,9 +12,11 @@ ray_cc_test( "no_tsan", ], deps = [ - "//src/ray/common/test:testing", - "//src/ray/util", + "//src/ray/common:id", + "//src/ray/common/tests:testing", + "//src/ray/util:filesystem", "//src/ray/util/internal:stream_redirection_handle", + "@com_google_absl//absl/strings:str_format", "@com_google_googletest//:gtest_main", ], ) diff --git a/src/ray/util/internal/tests/stream_redirection_handle_test.cc b/src/ray/util/internal/tests/stream_redirection_handle_test.cc index a4ac9e87aed4..d47ce0d5296e 100644 --- a/src/ray/util/internal/tests/stream_redirection_handle_test.cc +++ b/src/ray/util/internal/tests/stream_redirection_handle_test.cc @@ -22,14 +22,17 @@ #include #include -#include "ray/common/test/testing.h" +#include "absl/strings/str_format.h" +#include "ray/common/id.h" +#include "ray/common/tests/testing.h" #include "ray/util/filesystem.h" -#include "ray/util/util.h" namespace ray::internal { namespace { +inline std::string RandomID() { return UniqueID::FromRandom().Hex(); } + // Output logging files to cleanup at process termination. 
std::vector log_files; void CleanupOutputLogFiles() { @@ -54,7 +57,7 @@ TEST(LoggingUtilTest, WriteContentWithNewliner) { constexpr std::string_view kLogLine1 = "hello\n"; constexpr std::string_view kLogLine2 = "world\n"; - const std::string test_file_path = absl::StrFormat("%s.err", GenerateUUIDV4()); + const std::string test_file_path = absl::StrFormat("%s.err", RandomID()); const std::string log_file_path1 = test_file_path; const std::string log_file_path2 = absl::StrFormat("%s.1", test_file_path); log_files.emplace_back(log_file_path1); @@ -100,7 +103,7 @@ TEST(LoggingUtilTest, WriteContentWithFlush) { constexpr std::string_view kLogLine1 = "hello"; constexpr std::string_view kLogLine2 = "world"; - const std::string test_file_path = absl::StrFormat("%s.err", GenerateUUIDV4()); + const std::string test_file_path = absl::StrFormat("%s.err", RandomID()); const std::string log_file_path1 = test_file_path; const std::string log_file_path2 = absl::StrFormat("%s.1", test_file_path); log_files.emplace_back(log_file_path1); diff --git a/src/ray/util/logging.h b/src/ray/util/logging.h index 4a010ae9b8e1..405d772b57ef 100644 --- a/src/ray/util/logging.h +++ b/src/ray/util/logging.h @@ -104,6 +104,7 @@ inline constexpr std::string_view kLogKeyActorID = "actor_id"; inline constexpr std::string_view kLogKeyTaskID = "task_id"; inline constexpr std::string_view kLogKeyObjectID = "object_id"; inline constexpr std::string_view kLogKeyPlacementGroupID = "placement_group_id"; +inline constexpr std::string_view kLogKeyLeaseID = "lease_id"; // Define your specialization DefaultLogKey::key to get .WithField(t) // See src/ray/common/id.h @@ -139,11 +140,14 @@ enum class RayLogLevel { #define RAY_IGNORE_EXPR(expr) ((void)(expr)) -#define RAY_CHECK_WITH_DISPLAY(condition, display) \ - RAY_PREDICT_TRUE((condition)) \ - ? 
RAY_IGNORE_EXPR(0) \ - : ::ray::Voidify() & (::ray::RayLog(__FILE__, __LINE__, ray::RayLogLevel::FATAL) \ - << " Check failed: " display " ") +#define RAY_CHECK_WITH_DISPLAY(condition, display) \ + RAY_PREDICT_TRUE((condition)) \ + ? RAY_IGNORE_EXPR(0) \ + : ::ray::Voidify() & (::ray::RayLog(__FILE__, __LINE__, ray::RayLogLevel::FATAL) \ + << " An unexpected system state has occurred. You have likely " \ + "discovered a bug in Ray. Please report this issue at " \ + "https://github.com/ray-project/ray/issues and we'll work " \ + "with you to fix it. Check failed: " display " ") #define RAY_CHECK(condition) RAY_CHECK_WITH_DISPLAY(condition, #condition) diff --git a/src/ray/util/macros.h b/src/ray/util/macros.h index 0a81b92bc230..5e111c43fcb3 100644 --- a/src/ray/util/macros.h +++ b/src/ray/util/macros.h @@ -14,13 +14,6 @@ #pragma once -// From Google gutil -#ifndef RAY_DISALLOW_COPY_AND_ASSIGN -#define RAY_DISALLOW_COPY_AND_ASSIGN(TypeName) \ - TypeName(const TypeName &) = delete; \ - void operator=(const TypeName &) = delete -#endif - #define RAY_UNUSED(x) (void)x // diff --git a/src/ray/util/network_util.cc b/src/ray/util/network_util.cc index 1a3c8a9c65af..2364a42265c4 100644 --- a/src/ray/util/network_util.cc +++ b/src/ray/util/network_util.cc @@ -16,11 +16,22 @@ #include #include +#include +#ifndef _WIN32 +#include +#endif +#include +#include #include #include +#include +#include "absl/strings/match.h" #include "absl/strings/str_format.h" +#include "ray/util/filesystem.h" +#include "ray/util/string_utils.h" +using boost::asio::io_context; using boost::asio::ip::tcp; namespace ray { @@ -62,7 +73,7 @@ std::optional> ParseAddress(const std::string &addres } bool CheckPortFree(int port) { - boost::asio::io_context io_service; + io_context io_service; tcp::socket socket(io_service); socket.open(tcp::v4()); boost::system::error_code ec; @@ -71,4 +82,125 @@ bool CheckPortFree(int port) { return !ec.failed(); } +std::string EndpointToUrl( + const 
boost::asio::generic::basic_endpoint &ep, + bool include_scheme) { + std::string result, scheme; + switch (ep.protocol().family()) { + case AF_INET: { + scheme = "tcp://"; + tcp::endpoint e(tcp::v4(), 0); + RAY_CHECK_EQ(e.size(), ep.size()); + const sockaddr *src = ep.data(); + sockaddr *dst = e.data(); + *reinterpret_cast(dst) = *reinterpret_cast(src); + std::ostringstream ss; + ss << e; + result = ss.str(); + break; + } + case AF_INET6: { + scheme = "tcp://"; + tcp::endpoint e(tcp::v6(), 0); + RAY_CHECK_EQ(e.size(), ep.size()); + const sockaddr *src = ep.data(); + sockaddr *dst = e.data(); + *reinterpret_cast(dst) = *reinterpret_cast(src); + std::ostringstream ss; + ss << e; + result = ss.str(); + break; + } +#if defined(BOOST_ASIO_HAS_LOCAL_SOCKETS) && !defined(_WIN32) + case AF_UNIX: + scheme = "unix://"; + result.append(reinterpret_cast(ep.data())->sun_path, + ep.size() - offsetof(sockaddr_un, sun_path)); + break; +#endif + default: + RAY_LOG(FATAL) << "unsupported protocol family: " << ep.protocol().family(); + break; + } + if (include_scheme) { + result.insert(0, scheme); + } + return result; +} + +boost::asio::generic::basic_endpoint +ParseUrlEndpoint(const std::string &endpoint, int default_port) { + // Syntax reference: https://en.wikipedia.org/wiki/URL#Syntax + // Note that we're a bit more flexible, to allow parsing "127.0.0.1" as a URL. 
+ boost::asio::generic::stream_protocol::endpoint result; + std::string address = endpoint, scheme; + if (absl::StartsWith(address, "unix://")) { + scheme = "unix://"; + address.erase(0, scheme.size()); + } else if (!address.empty() && ray::IsDirSep(address[0])) { + scheme = "unix://"; + } else if (absl::StartsWith(address, "tcp://")) { + scheme = "tcp://"; + address.erase(0, scheme.size()); + } else { + scheme = "tcp://"; + } + if (scheme == "unix://") { +#if defined(BOOST_ASIO_HAS_LOCAL_SOCKETS) && !defined(_WIN32) + size_t maxlen = sizeof(sockaddr_un().sun_path) / sizeof(*sockaddr_un().sun_path) - 1; + RAY_CHECK(address.size() <= maxlen) + << "AF_UNIX path length cannot exceed " << maxlen << " bytes: " << address; + result = boost::asio::local::stream_protocol::endpoint(address); +#else + RAY_LOG(FATAL) << "UNIX-domain socket endpoints are not supported: " << endpoint; +#endif + } else if (scheme == "tcp://") { + std::string::const_iterator i = address.begin(); + std::string host = ScanToken(i, "[%*[^][/]]"); + host = host.empty() ? ScanToken(i, "%*[^/:]") : host.substr(1, host.size() - 2); + std::string port_str = ScanToken(i, ":%*d"); + int port = port_str.empty() ? default_port : std::stoi(port_str.substr(1)); + result = tcp::endpoint(boost::asio::ip::make_address(host), port); + } else { + RAY_LOG(FATAL) << "Unable to parse socket endpoint: " << endpoint; + } + return result; +} + +std::shared_ptr> ParseURL(std::string url) { + auto result = std::make_shared>(); + std::string delimiter = "?"; + size_t pos = 0; + pos = url.find(delimiter); + if (pos == std::string::npos) { + return result; + } + + const std::string base_url = url.substr(0, pos); + result->emplace("url", base_url); + url.erase(0, pos + delimiter.length()); + const std::string query_delimeter = "&"; + + auto parse_key_value_with_equal_delimter = + [](std::string_view key_value) -> std::pair { + // Parse the query key value pair. 
+ const std::string key_value_delimter = "="; + size_t key_value_pos = key_value.find(key_value_delimter); + std::string_view key = key_value.substr(0, key_value_pos); + return std::make_pair(key, key_value.substr(key.size() + 1)); + }; + + while ((pos = url.find(query_delimeter)) != std::string::npos) { + std::string_view token = std::string_view{url}.substr(0, pos); + auto key_value_pair = parse_key_value_with_equal_delimter(token); + result->emplace(std::string(key_value_pair.first), + std::string(key_value_pair.second)); + url.erase(0, pos + delimiter.length()); + } + std::string_view token = std::string_view{url}.substr(0, pos); + auto key_value_pair = parse_key_value_with_equal_delimter(token); + result->emplace(std::string(key_value_pair.first), std::string(key_value_pair.second)); + return result; +} + } // namespace ray diff --git a/src/ray/util/network_util.h b/src/ray/util/network_util.h index 5ca3b48bde45..c10a062b6089 100644 --- a/src/ray/util/network_util.h +++ b/src/ray/util/network_util.h @@ -15,9 +15,21 @@ #pragma once #include +#include #include #include +#include "absl/container/flat_hash_map.h" + +// Boost forward-declarations (to avoid forcing slow header inclusions) +namespace boost::asio::generic { + +template +class basic_endpoint; +class stream_protocol; + +} // namespace boost::asio::generic + namespace ray { /// Build a network address string from host and port. @@ -43,4 +55,28 @@ std::optional> ParseAddress(const std::string &addres /// \return true if the port is available, false otherwise. bool CheckPortFree(int port); +/// Converts the given endpoint (such as TCP or UNIX domain socket address) to a string. +/// \param include_scheme Whether to include the scheme prefix (such as tcp://). +/// This is recommended to avoid later ambiguity when parsing. +std::string EndpointToUrl( + const boost::asio::generic::basic_endpoint &ep, + bool include_scheme = true); + +/// Parses the endpoint socket address of a URL. 
+/// If a scheme:// prefix is absent, the address family is guessed automatically. +/// For TCP/IP, the endpoint comprises the IP address and port number in the URL. +/// For UNIX domain sockets, the endpoint comprises the socket path. +boost::asio::generic::basic_endpoint +ParseUrlEndpoint(const std::string &endpoint, int default_port = 0); + +/// Parse the url and return a pair of base_url and query string map. +/// EX) http://abc?num_objects=9&offset=8388878 +/// will be returned as +/// { +/// url: http://abc, +/// num_objects: 9, +/// offset: 8388878 +/// } +std::shared_ptr> ParseURL(std::string url); + } // namespace ray diff --git a/src/ray/util/pipe_logger.cc b/src/ray/util/pipe_logger.cc index d16a6589d666..a440dffada3f 100644 --- a/src/ray/util/pipe_logger.cc +++ b/src/ray/util/pipe_logger.cc @@ -30,7 +30,9 @@ #include #include "absl/container/inlined_vector.h" +#include "absl/strings/str_format.h" #include "absl/strings/str_split.h" +#include "absl/synchronization/mutex.h" #include "ray/common/ray_config.h" #include "ray/util/spdlog_fd_sink.h" #include "ray/util/spdlog_newliner_sink.h" diff --git a/src/ray/util/pipe_logger.h b/src/ray/util/pipe_logger.h index 538b925fbdaf..9715403528a1 100644 --- a/src/ray/util/pipe_logger.h +++ b/src/ray/util/pipe_logger.h @@ -25,9 +25,9 @@ #include #include +#include "ray/common/status.h" #include "ray/util/compat.h" #include "ray/util/stream_redirection_options.h" -#include "ray/util/util.h" #include "spdlog/logger.h" namespace ray { diff --git a/src/ray/util/process.cc b/src/ray/util/process.cc index 3412b2d5f902..b1bb78ee77cd 100644 --- a/src/ray/util/process.cc +++ b/src/ray/util/process.cc @@ -767,6 +767,11 @@ std::optional> GetAllProcsWithPpid(pid_t parent_pid) { #endif } +void QuickExit() { + ray::RayLog::ShutDownRayLog(); + _Exit(1); +} + } // namespace ray namespace std { diff --git a/src/ray/util/process.h b/src/ray/util/process.h index b0773811e50a..e222a6f7bfcb 100644 --- a/src/ray/util/process.h +++ 
b/src/ray/util/process.h @@ -31,6 +31,7 @@ #include #include "ray/util/compat.h" +#include "ray/util/logging.h" #ifndef PID_MAX_LIMIT // This is defined by Linux to be the maximum allowable number of processes @@ -156,6 +157,9 @@ std::optional KillProc(pid_t pid); // Currently only supported on Linux. Returns nullopt on other platforms. std::optional> GetAllProcsWithPpid(pid_t parent_pid); +/// Terminate the process without cleaning up the resources. +void QuickExit(); + } // namespace ray // We only define operators required by the standard library (==, hash): diff --git a/src/ray/util/raii.h b/src/ray/util/raii.h new file mode 100644 index 000000000000..c9baf921e5cd --- /dev/null +++ b/src/ray/util/raii.h @@ -0,0 +1,43 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +class InitShutdownRAII { + public: + /// Type of the Shutdown function. + using ShutdownFunc = void (*)(); + + /// Create an instance of InitShutdownRAII which will call shutdown + /// function when it is out of scope. + /// + /// \param init_func The init function. + /// \param shutdown_func The shutdown function. + /// \param args The arguments for the init function. + template + InitShutdownRAII(InitFunc init_func, ShutdownFunc shutdown_func, Args &&...args) + : shutdown_(shutdown_func) { + init_func(args...); + } + + /// Destructor of InitShutdownRAII which will call the shutdown function. 
+ ~InitShutdownRAII() { + if (shutdown_ != nullptr) { + shutdown_(); + } + } + + private: + ShutdownFunc shutdown_; +}; diff --git a/src/ray/util/sample.h b/src/ray/util/sample.h deleted file mode 100644 index 886a4a1104e2..000000000000 --- a/src/ray/util/sample.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include - -#include "absl/time/clock.h" - -// Randomly samples num_elements from the elements between first and last using reservoir -// sampling. 
-template ::value_type> -void random_sample(Iterator begin, - Iterator end, - size_t num_elements, - std::vector *out) { - out->resize(0); - if (num_elements == 0) { - return; - } - - std::default_random_engine gen(absl::GetCurrentTimeNanos()); - size_t current_index = 0; - for (auto it = begin; it != end; it++) { - if (current_index < num_elements) { - out->push_back(*it); - } else { - size_t random_index = std::uniform_int_distribution(0, current_index)(gen); - if (random_index < num_elements) { - out->at(random_index) = *it; - } - } - current_index++; - } - return; -} diff --git a/src/ray/util/spdlog_fd_sink.h b/src/ray/util/spdlog_fd_sink.h index c307b85e2b4d..9dd0249512e7 100644 --- a/src/ray/util/spdlog_fd_sink.h +++ b/src/ray/util/spdlog_fd_sink.h @@ -16,8 +16,8 @@ #include +#include "ray/common/status.h" #include "ray/util/compat.h" -#include "ray/util/util.h" namespace ray { diff --git a/src/ray/util/spdlog_newliner_sink.h b/src/ray/util/spdlog_newliner_sink.h index 11aa5234bc26..9a8570743d10 100644 --- a/src/ray/util/spdlog_newliner_sink.h +++ b/src/ray/util/spdlog_newliner_sink.h @@ -24,7 +24,6 @@ #include "absl/strings/str_split.h" #include "ray/util/compat.h" -#include "ray/util/util.h" namespace ray { diff --git a/src/ray/util/stream_redirection.cc b/src/ray/util/stream_redirection.cc index b806196079b4..04d3b5c54623 100644 --- a/src/ray/util/stream_redirection.cc +++ b/src/ray/util/stream_redirection.cc @@ -23,18 +23,14 @@ #include #include -#include "absl/container/inlined_vector.h" +#include "absl/container/flat_hash_map.h" #include "ray/util/compat.h" #include "ray/util/internal/stream_redirection_handle.h" -#include "ray/util/util.h" namespace ray { namespace { -// TODO(hjiang): Revisit later, should be able to save some heap allocation with -// absl::InlinedVector. -// // Maps from original stream file fd (i.e. stdout/stderr) to its stream redirector. 
absl::flat_hash_map redirection_file_handles; diff --git a/src/ray/util/string_utils.cc b/src/ray/util/string_utils.cc index ffd03b7065fc..b042cb058075 100644 --- a/src/ray/util/string_utils.cc +++ b/src/ray/util/string_utils.cc @@ -40,4 +40,17 @@ std::string ScanToken(std::string::const_iterator &c_str, std::string format) { } return result; } + +std::string PrependToEachLine(const std::string &str, const std::string &prefix) { + std::stringstream ss; + ss << prefix; + for (char c : str) { + ss << c; + if (c == '\n') { + ss << prefix; + } + } + return ss.str(); +} + } // namespace ray diff --git a/src/ray/util/string_utils.h b/src/ray/util/string_utils.h index db1b9279a71a..557176bdb6da 100644 --- a/src/ray/util/string_utils.h +++ b/src/ray/util/string_utils.h @@ -98,4 +98,7 @@ StatusOr StringToInt(const std::string &input) noexcept { return StatusOr(value); } +// Prepend the prefix to each line of str. +std::string PrependToEachLine(const std::string &str, const std::string &prefix); + } // namespace ray diff --git a/src/ray/util/subreaper.h b/src/ray/util/subreaper.h index ab3f060f00a9..94047a751373 100644 --- a/src/ray/util/subreaper.h +++ b/src/ray/util/subreaper.h @@ -105,10 +105,13 @@ class KnownChildrenTracker { std::vector ListUnknownChildren( std::function()> list_pids_fn); + KnownChildrenTracker(const KnownChildrenTracker &) = delete; + KnownChildrenTracker &operator=(const KnownChildrenTracker &) = delete; + + ~KnownChildrenTracker() = default; + private: KnownChildrenTracker() = default; - ~KnownChildrenTracker() = default; - RAY_DISALLOW_COPY_AND_ASSIGN(KnownChildrenTracker); bool enabled_ = false; absl::Mutex m_; diff --git a/src/ray/util/temporary_directory.cc b/src/ray/util/temporary_directory.cc index a6803e8ea365..1b3cc40780e0 100644 --- a/src/ray/util/temporary_directory.cc +++ b/src/ray/util/temporary_directory.cc @@ -17,7 +17,7 @@ #include #include -#include "ray/util/util.h" +#include "ray/common/id.h" namespace ray { @@ -25,7 +25,7 @@ 
ScopedTemporaryDirectory::ScopedTemporaryDirectory(const std::string &dir) { temporary_directory_ = dir.empty() ? std::filesystem::temp_directory_path() : std::filesystem::path{dir}; // Manually generate a directory name by appending UUID. - temporary_directory_ = temporary_directory_ / GenerateUUIDV4(); + temporary_directory_ = temporary_directory_ / UniqueID::FromRandom().Hex(); RAY_CHECK(std::filesystem::create_directory(temporary_directory_)); } ScopedTemporaryDirectory::~ScopedTemporaryDirectory() { diff --git a/src/ray/util/tests/BUILD.bazel b/src/ray/util/tests/BUILD.bazel index 443b97190955..88854484df15 100644 --- a/src/ray/util/tests/BUILD.bazel +++ b/src/ray/util/tests/BUILD.bazel @@ -5,7 +5,6 @@ ray_cc_test( srcs = ["array_test.cc"], tags = ["team:core"], deps = [ - "//src/ray/util", "//src/ray/util:array", "@com_google_googletest//:gtest_main", ], @@ -16,7 +15,6 @@ ray_cc_test( srcs = ["function_traits_test.cc"], tags = ["team:core"], deps = [ - "//src/ray/util", "//src/ray/util:function_traits", "@com_google_googletest//:gtest_main", ], @@ -40,7 +38,6 @@ ray_cc_test( linkstatic = True, tags = ["team:core"], deps = [ - "//src/ray/util", "//src/ray/util:container_util", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -54,7 +51,6 @@ ray_cc_test( srcs = ["counter_test.cc"], tags = ["team:core"], deps = [ - "//src/ray/util", "//src/ray/util:counter_map", "@com_google_googletest//:gtest_main", ], @@ -72,7 +68,6 @@ ray_cc_test( deps = [ "//src/ray/common:ray_config", "//src/ray/protobuf:gcs_cc_proto", - "//src/ray/util", "//src/ray/util:event", "//src/ray/util:path_utils", "@boost//:range", @@ -86,7 +81,6 @@ ray_cc_test( srcs = ["exponential_backoff_test.cc"], tags = ["team:core"], deps = [ - "//src/ray/util", "//src/ray/util:exponential_backoff", "@com_google_googletest//:gtest_main", ], @@ -120,9 +114,10 @@ ray_cc_test( ], deps = [ "//src/ray/common:status", - "//src/ray/util", "//src/ray/util:env", + 
"//src/ray/util:filesystem", "//src/ray/util:path_utils", + "//src/ray/util:time", "@boost//:asio", "@com_google_absl//absl/strings:str_format", "@com_google_googletest//:gtest_main", @@ -130,24 +125,13 @@ ray_cc_test( ], ) -ray_cc_test( - name = "sample_test", - size = "small", - srcs = ["sample_test.cc"], - tags = ["team:core"], - deps = [ - "//src/ray/util:sample", - "@com_google_googletest//:gtest_main", - ], -) - ray_cc_test( name = "sequencer_test", size = "small", srcs = ["sequencer_test.cc"], tags = ["team:core"], deps = [ - "//src/ray/util", + "//src/ray/util:logging", "//src/ray/util:sequencer", "@com_google_googletest//:gtest_main", ], @@ -159,9 +143,9 @@ ray_cc_test( srcs = ["signal_test.cc"], tags = ["team:core"], deps = [ - "//src/ray/util", "//src/ray/util:logging", "//src/ray/util:path_utils", + "//src/ray/util:raii", "@com_google_googletest//:gtest_main", ], ) @@ -172,7 +156,6 @@ ray_cc_test( srcs = ["throttler_test.cc"], tags = ["team:core"], deps = [ - "//src/ray/util", "//src/ray/util:throttler", "@com_google_absl//absl/time", "@com_google_googletest//:gtest_main", @@ -180,13 +163,12 @@ ray_cc_test( ) ray_cc_test( - name = "util_test", + name = "process_test", size = "small", - srcs = ["util_test.cc"], + srcs = ["process_test.cc"], tags = ["team:core"], deps = [ - "//src/ray/util", - "@boost//:asio", + "//src/ray/util:process", "@boost//:process", "@com_google_googletest//:gtest_main", ], @@ -254,11 +236,13 @@ ray_cc_test( srcs = ["pipe_logger_test.cc"], tags = ["team:core"], deps = [ - "//src/ray/common/test:testing", - "//src/ray/util", + "//src/ray/common:id", + "//src/ray/common/tests:testing", + "//src/ray/util:filesystem", "//src/ray/util:pipe_logger", "//src/ray/util:scoped_env_setter", "//src/ray/util:temporary_directory", + "@com_google_absl//absl/strings:str_format", "@com_google_googletest//:gtest_main", ], ) @@ -276,9 +260,11 @@ ray_cc_test( "no_tsan", ], deps = [ - "//src/ray/common/test:testing", - "//src/ray/util", + 
"//src/ray/common:id", + "//src/ray/common/tests:testing", + "//src/ray/util:filesystem", "//src/ray/util:stream_redirection", + "@com_google_absl//absl/strings:str_format", "@com_google_googletest//:gtest_main", ], ) @@ -312,11 +298,13 @@ ray_cc_test( srcs = ["spdlog_newliner_sink_test.cc"], tags = ["team:core"], deps = [ - "//src/ray/common/test:testing", + "//src/ray/common:id", + "//src/ray/common/tests:testing", "//src/ray/util:filesystem", "//src/ray/util:spdlog_fd_sink", "//src/ray/util:spdlog_newliner_sink", "//src/ray/util:temporary_directory", + "@com_google_absl//absl/strings:str_format", "@com_google_googletest//:gtest_main", ], ) @@ -349,10 +337,11 @@ ray_cc_test( srcs = ["process_cleanup_test.cc"], tags = ["team:core"], deps = [ - "//src/ray/common/test:testing", - "//src/ray/util", + "//src/ray/common:id", + "//src/ray/common/tests:testing", "//src/ray/util:filesystem", "//src/ray/util:process_cleaner", + "@com_google_absl//absl/strings:str_format", "@com_google_googletest//:gtest_main", ], ) @@ -363,7 +352,7 @@ ray_cc_test( srcs = ["scoped_dup2_wrapper_test.cc"], tags = ["team:core"], deps = [ - "//src/ray/common/test:testing", + "//src/ray/common/tests:testing", "//src/ray/util:compat", "//src/ray/util:filesystem", "//src/ray/util:scoped_dup2_wrapper", diff --git a/src/ray/util/tests/event_test.cc b/src/ray/util/tests/event_test.cc index 60821bcbc699..6f0b00320a7c 100644 --- a/src/ray/util/tests/event_test.cc +++ b/src/ray/util/tests/event_test.cc @@ -594,8 +594,12 @@ TEST_F(EventTest, TestRayCheckAbort) { "FATAL", "RAY_FATAL_CHECK_FAILED", "NULL"); - EXPECT_THAT(ele_1.message(), - testing::HasSubstr("Check failed: 1 < 0 incorrect test case")); + EXPECT_THAT( + ele_1.message(), + testing::HasSubstr( + "An unexpected system state has occurred. You have likely discovered a bug in " + "Ray. Please report this issue at https://github.com/ray-project/ray/issues " + "and we'll work with you to fix it. 
Check failed: 1 < 0 incorrect test case")); EXPECT_THAT(ele_1.message(), testing::HasSubstr("*** StackTrace Information ***")); EXPECT_THAT(ele_1.message(), testing::HasSubstr("ray::RayLog::~RayLog()")); } diff --git a/src/ray/util/tests/logging_test.cc b/src/ray/util/tests/logging_test.cc index bd320bd98f7f..ab82497afccf 100644 --- a/src/ray/util/tests/logging_test.cc +++ b/src/ray/util/tests/logging_test.cc @@ -30,7 +30,7 @@ #include "ray/util/env.h" #include "ray/util/filesystem.h" #include "ray/util/path_utils.h" -#include "ray/util/util.h" +#include "ray/util/time.h" using namespace testing; // NOLINT using json = nlohmann::json; diff --git a/src/ray/util/tests/network_util_test.cc b/src/ray/util/tests/network_util_test.cc index f1033a574fb4..f8d18fee804f 100644 --- a/src/ray/util/tests/network_util_test.cc +++ b/src/ray/util/tests/network_util_test.cc @@ -14,6 +14,11 @@ #include "ray/util/network_util.h" +#include +#include +#include +#include + #include "gtest/gtest.h" namespace ray { @@ -74,4 +79,38 @@ TEST(NetworkUtilTest, TestParseAddress) { ASSERT_FALSE(result.has_value()); } +TEST(NetworkUtilTest, UrlIpTcpParseTest) { + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("tcp://[::1]:1/", 0), false), "[::1]:1"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("tcp://[::1]/", 0), false), "[::1]:0"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("tcp://[::1]:1", 0), false), "[::1]:1"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("tcp://[::1]", 0), false), "[::1]:0"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("tcp://127.0.0.1:1/", 0), false), + "127.0.0.1:1"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("tcp://127.0.0.1/", 0), false), "127.0.0.1:0"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("tcp://127.0.0.1:1", 0), false), + "127.0.0.1:1"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("tcp://127.0.0.1", 0), false), "127.0.0.1:0"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("[::1]:1/", 0), false), "[::1]:1"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("[::1]/", 0), false), 
"[::1]:0"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("[::1]:1", 0), false), "[::1]:1"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("[::1]", 0), false), "[::1]:0"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("127.0.0.1:1/", 0), false), "127.0.0.1:1"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("127.0.0.1/", 0), false), "127.0.0.1:0"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("127.0.0.1:1", 0), false), "127.0.0.1:1"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("127.0.0.1", 0), false), "127.0.0.1:0"); +#ifndef _WIN32 + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("unix:///tmp/sock"), false), "/tmp/sock"); + ASSERT_EQ(EndpointToUrl(ParseUrlEndpoint("/tmp/sock"), false), "/tmp/sock"); +#endif +} + +TEST(NetworkUtilTest, ParseURLTest) { + const std::string url = "http://abc?num_objects=9&offset=8388878&size=8388878"; + auto parsed_url = *ParseURL(url); + ASSERT_EQ(parsed_url["url"], "http://abc"); + ASSERT_EQ(parsed_url["num_objects"], "9"); + ASSERT_EQ(parsed_url["offset"], "8388878"); + ASSERT_EQ(parsed_url["size"], "8388878"); +} + } // namespace ray diff --git a/src/ray/util/tests/pipe_logger_test.cc b/src/ray/util/tests/pipe_logger_test.cc index afcf07a1433b..40b80894f035 100644 --- a/src/ray/util/tests/pipe_logger_test.cc +++ b/src/ray/util/tests/pipe_logger_test.cc @@ -23,16 +23,19 @@ #include #include -#include "ray/common/test/testing.h" +#include "absl/strings/str_format.h" +#include "ray/common/id.h" +#include "ray/common/tests/testing.h" #include "ray/util/filesystem.h" #include "ray/util/scoped_env_setter.h" #include "ray/util/temporary_directory.h" -#include "ray/util/util.h" namespace ray { namespace { +inline std::string RandomID() { return UniqueID::FromRandom().Hex(); } + constexpr std::string_view kLogLine1 = "hello\n"; constexpr std::string_view kLogLine2 = "world\n"; @@ -43,7 +46,7 @@ TEST_P(PipeLoggerTest, RedirectionTest) { ScopedEnvSetter scoped_env_setter{"RAY_pipe_logger_read_buf_size", pipe_buffer_size.data()}; ScopedTemporaryDirectory 
scoped_directory; - const auto test_file_path = scoped_directory.GetDirectory() / GenerateUUIDV4(); + const auto test_file_path = scoped_directory.GetDirectory() / RandomID(); // Take the default option, which doesn't have rotation enabled. StreamRedirectionOption stream_redirection_opt{}; @@ -65,7 +68,7 @@ TEST_P(PipeLoggerTest, RedirectionWithTee) { ScopedEnvSetter scoped_env_setter{"RAY_pipe_logger_read_buf_size", pipe_buffer_size.data()}; ScopedTemporaryDirectory scoped_directory; - const auto test_file_path = scoped_directory.GetDirectory() / GenerateUUIDV4(); + const auto test_file_path = scoped_directory.GetDirectory() / RandomID(); StreamRedirectionOption stream_redirection_opt{}; stream_redirection_opt.file_path = test_file_path.string(); @@ -94,7 +97,7 @@ TEST_P(PipeLoggerTest, RotatedRedirectionWithTee) { ScopedEnvSetter scoped_env_setter{"RAY_pipe_logger_read_buf_size", pipe_buffer_size.data()}; ScopedTemporaryDirectory scoped_directory; - const auto uuid = GenerateUUIDV4(); + const auto uuid = RandomID(); const auto test_file_path = scoped_directory.GetDirectory() / uuid; const auto log_file_path1 = test_file_path; const auto log_file_path2 = @@ -139,7 +142,7 @@ TEST_P(PipeLoggerTest, CompatibilityTest) { { constexpr std::string_view kContent = "hello"; ScopedTemporaryDirectory scoped_directory; - const auto test_file_path = scoped_directory.GetDirectory() / GenerateUUIDV4(); + const auto test_file_path = scoped_directory.GetDirectory() / RandomID(); StreamRedirectionOption logging_option{}; logging_option.file_path = test_file_path.string(); @@ -165,7 +168,7 @@ TEST_P(PipeLoggerTest, CompatibilityTest) { { constexpr std::string_view kContent = "hello\n"; ScopedTemporaryDirectory scoped_directory; - const auto test_file_path = scoped_directory.GetDirectory() / GenerateUUIDV4(); + const auto test_file_path = scoped_directory.GetDirectory() / RandomID(); StreamRedirectionOption logging_option{}; logging_option.file_path = test_file_path.string(); @@ 
-190,7 +193,7 @@ TEST_P(PipeLoggerTest, CompatibilityTest) { { constexpr std::string_view kContent = "hello\nworld"; ScopedTemporaryDirectory scoped_directory; - const auto test_file_path = scoped_directory.GetDirectory() / GenerateUUIDV4(); + const auto test_file_path = scoped_directory.GetDirectory() / RandomID(); StreamRedirectionOption logging_option{}; logging_option.file_path = test_file_path.string(); @@ -216,7 +219,7 @@ TEST_P(PipeLoggerTest, CompatibilityTest) { { constexpr std::string_view kContent = "hello\nworld\n"; ScopedTemporaryDirectory scoped_directory; - const auto test_file_path = scoped_directory.GetDirectory() / GenerateUUIDV4(); + const auto test_file_path = scoped_directory.GetDirectory() / RandomID(); StreamRedirectionOption logging_option{}; logging_option.file_path = test_file_path.string(); @@ -241,7 +244,7 @@ TEST_P(PipeLoggerTest, CompatibilityTest) { { constexpr std::string_view kContent = "helloworld\n\n\n"; ScopedTemporaryDirectory scoped_directory; - const auto test_file_path = scoped_directory.GetDirectory() / GenerateUUIDV4(); + const auto test_file_path = scoped_directory.GetDirectory() / RandomID(); StreamRedirectionOption logging_option{}; logging_option.file_path = test_file_path.string(); @@ -266,7 +269,7 @@ TEST_P(PipeLoggerTest, CompatibilityTest) { { constexpr std::string_view kContent = "hello\n\n\nworld"; ScopedTemporaryDirectory scoped_directory; - const auto test_file_path = scoped_directory.GetDirectory() / GenerateUUIDV4(); + const auto test_file_path = scoped_directory.GetDirectory() / RandomID(); StreamRedirectionOption logging_option{}; logging_option.file_path = test_file_path.string(); @@ -292,7 +295,7 @@ TEST_P(PipeLoggerTest, CompatibilityTest) { { constexpr std::string_view kContent = "hello\n\nworld\n\n"; ScopedTemporaryDirectory scoped_directory; - const auto test_file_path = scoped_directory.GetDirectory() / GenerateUUIDV4(); + const auto test_file_path = scoped_directory.GetDirectory() / RandomID(); 
StreamRedirectionOption logging_option{}; logging_option.file_path = test_file_path.string(); diff --git a/src/ray/util/tests/process_cleanup_test.cc b/src/ray/util/tests/process_cleanup_test.cc index a34ddc49cf75..bf3c1fdb2370 100644 --- a/src/ray/util/tests/process_cleanup_test.cc +++ b/src/ray/util/tests/process_cleanup_test.cc @@ -23,10 +23,11 @@ #include #include -#include "ray/common/test/testing.h" +#include "absl/strings/str_format.h" +#include "ray/common/id.h" +#include "ray/common/tests/testing.h" #include "ray/util/filesystem.h" #include "ray/util/process_cleaner.h" -#include "ray/util/util.h" namespace ray { @@ -34,7 +35,7 @@ namespace { TEST(ProcessCleanerTest, BasicTest) { const std::string kTestFname = - absl::StrFormat("/tmp/process_cleanup_%s", GenerateUUIDV4()); + absl::StrFormat("/tmp/process_cleanup_%s", UniqueID::FromRandom().Hex()); auto test_func = [fname = kTestFname]() { std::fstream f{fname, std::ios::app | std::ios::out}; f << "helloworld"; diff --git a/src/ray/util/tests/util_test.cc b/src/ray/util/tests/process_test.cc similarity index 52% rename from src/ray/util/tests/util_test.cc rename to src/ray/util/tests/process_test.cc index b32e0cc3dfa9..f58e25679748 100644 --- a/src/ray/util/tests/util_test.cc +++ b/src/ray/util/tests/process_test.cc @@ -12,62 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ray/util/util.h" +#include "ray/util/process.h" + +#include +#include -#include #include #include #include -#include #include #include -#include "gmock/gmock.h" -#include "gtest/gtest.h" #include "ray/util/logging.h" -#include "ray/util/process.h" - -using namespace std::chrono_literals; // NOLINT namespace ray { -template -static std::string to_str(const T &obj, bool include_scheme) { - return EndpointToUrl(obj, include_scheme); -} - -TEST(UtilTest, UrlIpTcpParseTest) { - ASSERT_EQ(to_str(ParseUrlEndpoint("tcp://[::1]:1/", 0), false), "[::1]:1"); - ASSERT_EQ(to_str(ParseUrlEndpoint("tcp://[::1]/", 0), false), "[::1]:0"); - ASSERT_EQ(to_str(ParseUrlEndpoint("tcp://[::1]:1", 0), false), "[::1]:1"); - ASSERT_EQ(to_str(ParseUrlEndpoint("tcp://[::1]", 0), false), "[::1]:0"); - ASSERT_EQ(to_str(ParseUrlEndpoint("tcp://127.0.0.1:1/", 0), false), "127.0.0.1:1"); - ASSERT_EQ(to_str(ParseUrlEndpoint("tcp://127.0.0.1/", 0), false), "127.0.0.1:0"); - ASSERT_EQ(to_str(ParseUrlEndpoint("tcp://127.0.0.1:1", 0), false), "127.0.0.1:1"); - ASSERT_EQ(to_str(ParseUrlEndpoint("tcp://127.0.0.1", 0), false), "127.0.0.1:0"); - ASSERT_EQ(to_str(ParseUrlEndpoint("[::1]:1/", 0), false), "[::1]:1"); - ASSERT_EQ(to_str(ParseUrlEndpoint("[::1]/", 0), false), "[::1]:0"); - ASSERT_EQ(to_str(ParseUrlEndpoint("[::1]:1", 0), false), "[::1]:1"); - ASSERT_EQ(to_str(ParseUrlEndpoint("[::1]", 0), false), "[::1]:0"); - ASSERT_EQ(to_str(ParseUrlEndpoint("127.0.0.1:1/", 0), false), "127.0.0.1:1"); - ASSERT_EQ(to_str(ParseUrlEndpoint("127.0.0.1/", 0), false), "127.0.0.1:0"); - ASSERT_EQ(to_str(ParseUrlEndpoint("127.0.0.1:1", 0), false), "127.0.0.1:1"); - ASSERT_EQ(to_str(ParseUrlEndpoint("127.0.0.1", 0), false), "127.0.0.1:0"); -#ifndef _WIN32 - ASSERT_EQ(to_str(ParseUrlEndpoint("unix:///tmp/sock"), false), "/tmp/sock"); - ASSERT_EQ(to_str(ParseUrlEndpoint("/tmp/sock"), false), "/tmp/sock"); -#endif -} - -TEST(UtilTest, ParseURLTest) { - const std::string url = 
"http://abc?num_objects=9&offset=8388878&size=8388878"; - auto parsed_url = *ParseURL(url); - ASSERT_EQ(parsed_url["url"], "http://abc"); - ASSERT_EQ(parsed_url["num_objects"], "9"); - ASSERT_EQ(parsed_url["offset"], "8388878"); - ASSERT_EQ(parsed_url["size"], "8388878"); -} - TEST(UtilTest, IsProcessAlive) { namespace bp = boost::process; bp::child c("bash"); @@ -75,7 +34,7 @@ TEST(UtilTest, IsProcessAlive) { c.join(); for (int i = 0; i < 5; ++i) { if (IsProcessAlive(pid)) { - std::this_thread::sleep_for(1s); + std::this_thread::sleep_for(std::chrono::seconds(1)); } else { break; } diff --git a/src/ray/util/tests/sample_test.cc b/src/ray/util/tests/sample_test.cc deleted file mode 100644 index 451cdf8f18b2..000000000000 --- a/src/ray/util/tests/sample_test.cc +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "ray/util/sample.h" - -#include - -#include - -namespace ray { - -class RandomSampleTest : public ::testing::Test { - protected: - std::vector *sample; - std::vector *test_vector; - virtual void SetUp() { - sample = new std::vector(); - test_vector = new std::vector(); - for (int i = 0; i < 10; i++) { - test_vector->push_back(i); - } - } - - virtual void TearDown() { - delete sample; - delete test_vector; - } -}; - -TEST_F(RandomSampleTest, TestEmpty) { - random_sample(test_vector->begin(), test_vector->end(), 0, sample); - ASSERT_EQ(sample->size(), 0); -} - -TEST_F(RandomSampleTest, TestSmallerThanSampleSize) { - random_sample( - test_vector->begin(), test_vector->end(), test_vector->size() + 1, sample); - ASSERT_EQ(sample->size(), test_vector->size()); -} - -TEST_F(RandomSampleTest, TestEqualToSampleSize) { - random_sample(test_vector->begin(), test_vector->end(), test_vector->size(), sample); - ASSERT_EQ(sample->size(), test_vector->size()); -} - -TEST_F(RandomSampleTest, TestLargerThanSampleSize) { - random_sample( - test_vector->begin(), test_vector->end(), test_vector->size() - 1, sample); - ASSERT_EQ(sample->size(), test_vector->size() - 1); -} - -TEST_F(RandomSampleTest, TestEqualOccurrenceChance) { - int trials = 1000000; - std::vector occurrences(test_vector->size(), 0); - for (int i = 0; i < trials; i++) { - random_sample( - test_vector->begin(), test_vector->end(), test_vector->size() / 2, sample); - for (int idx : *sample) { - occurrences[idx]++; - } - } - for (int count : occurrences) { - ASSERT_NEAR(trials / 2, count, 0.05 * trials / 2); - } -} - -} // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/util/tests/scoped_dup2_wrapper_test.cc b/src/ray/util/tests/scoped_dup2_wrapper_test.cc index 822635850558..7db58f66e217 100644 --- a/src/ray/util/tests/scoped_dup2_wrapper_test.cc +++ b/src/ray/util/tests/scoped_dup2_wrapper_test.cc @@ -21,7 +21,7 
@@ #include #include -#include "ray/common/test/testing.h" +#include "ray/common/tests/testing.h" #include "ray/util/compat.h" #include "ray/util/filesystem.h" #include "ray/util/temporary_directory.h" diff --git a/src/ray/util/tests/signal_test.cc b/src/ray/util/tests/signal_test.cc index 23324fa07991..9e37b41fbab2 100644 --- a/src/ray/util/tests/signal_test.cc +++ b/src/ray/util/tests/signal_test.cc @@ -14,14 +14,16 @@ #include +#include #include #include #include +#include #include "gtest/gtest.h" #include "ray/util/logging.h" #include "ray/util/path_utils.h" -#include "ray/util/util.h" +#include "ray/util/raii.h" // This test just print some call stack information. namespace ray { diff --git a/src/ray/util/tests/spdlog_newliner_sink_test.cc b/src/ray/util/tests/spdlog_newliner_sink_test.cc index c5439e2b4bb4..565c90d57cfd 100644 --- a/src/ray/util/tests/spdlog_newliner_sink_test.cc +++ b/src/ray/util/tests/spdlog_newliner_sink_test.cc @@ -21,7 +21,9 @@ #include #include -#include "ray/common/test/testing.h" +#include "absl/strings/str_format.h" +#include "ray/common/id.h" +#include "ray/common/tests/testing.h" #include "ray/util/compat.h" #include "ray/util/filesystem.h" #include "ray/util/spdlog_fd_sink.h" @@ -32,6 +34,8 @@ namespace ray { namespace { +inline std::string RandomID() { return ray::UniqueID::FromRandom().Hex(); } + std::shared_ptr CreateLogger() { auto fd_formatter = std::make_unique( "%v", spdlog::pattern_time_type::local, std::string("")); @@ -182,7 +186,7 @@ TEST(NewlinerSinkWithFileinkTest, AppendAndFlushTest) { // Case-1: string with newliner at the end. { - const auto filepath = (dir.GetDirectory() / GenerateUUIDV4()).string(); + const auto filepath = (dir.GetDirectory() / RandomID()).string(); auto logger = CreateLogger(filepath); constexpr std::string_view kContent = "hello\n"; @@ -199,7 +203,7 @@ TEST(NewlinerSinkWithFileinkTest, AppendAndFlushTest) { // Case-2: string with no newliner at the end. 
{ - const auto filepath = (dir.GetDirectory() / GenerateUUIDV4()).string(); + const auto filepath = (dir.GetDirectory() / RandomID()).string(); auto logger = CreateLogger(filepath); constexpr std::string_view kContent = "hello"; @@ -218,7 +222,7 @@ TEST(NewlinerSinkWithFileinkTest, AppendAndFlushTest) { // Case-3: newliner in the middle, with trailing newliner. { - const auto filepath = (dir.GetDirectory() / GenerateUUIDV4()).string(); + const auto filepath = (dir.GetDirectory() / RandomID()).string(); auto logger = CreateLogger(filepath); constexpr std::string_view kContent = "hello\nworld\n"; @@ -235,7 +239,7 @@ TEST(NewlinerSinkWithFileinkTest, AppendAndFlushTest) { // // Case-4: newliner in the middle, without trailing newliner. { - const auto filepath = (dir.GetDirectory() / GenerateUUIDV4()).string(); + const auto filepath = (dir.GetDirectory() / RandomID()).string(); auto logger = CreateLogger(filepath); constexpr std::string_view kContent = "hello\nworld"; @@ -254,7 +258,7 @@ TEST(NewlinerSinkWithFileinkTest, AppendAndFlushTest) { // // Case-5: multiple writes. 
{ - const auto filepath = (dir.GetDirectory() / GenerateUUIDV4()).string(); + const auto filepath = (dir.GetDirectory() / RandomID()).string(); auto logger = CreateLogger(filepath); constexpr std::string_view kContent1 = "hello\nworld"; constexpr std::string_view kContent2 = "hello\nworld\n"; diff --git a/src/ray/util/tests/stream_redirection_exit_test.cc b/src/ray/util/tests/stream_redirection_exit_test.cc index cd1ef17b2462..f753d2a1a71e 100644 --- a/src/ray/util/tests/stream_redirection_exit_test.cc +++ b/src/ray/util/tests/stream_redirection_exit_test.cc @@ -21,10 +21,11 @@ #include #include -#include "ray/common/test/testing.h" +#include "absl/strings/str_format.h" +#include "ray/common/id.h" +#include "ray/common/tests/testing.h" #include "ray/util/filesystem.h" #include "ray/util/stream_redirection.h" -#include "ray/util/util.h" namespace ray { @@ -34,7 +35,8 @@ constexpr std::string_view kLogLine2 = "world"; } // namespace TEST(LoggingUtilTest, RedirectStderr) { - const std::string test_file_path = absl::StrFormat("%s.err", GenerateUUIDV4()); + const std::string test_file_path = + absl::StrFormat("%s.err", UniqueID::FromRandom().Hex()); // Works via `dup`, so have to execute before we redirect via `dup2` and close stderr. testing::internal::CaptureStderr(); diff --git a/src/mock/ray/gcs/gcs_server/gcs_server.h b/src/ray/util/time.cc similarity index 56% rename from src/mock/ray/gcs/gcs_server/gcs_server.h rename to src/ray/util/time.cc index 8c80774b4078..c305ca04e42a 100644 --- a/src/mock/ray/gcs/gcs_server/gcs_server.h +++ b/src/ray/util/time.cc @@ -1,4 +1,4 @@ -// Copyright The Ray Authors. +// Copyright 2025 The Ray Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,22 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-namespace ray { -namespace gcs { - -class MockGcsServerConfig : public GcsServerConfig { - public: -}; - -} // namespace gcs -} // namespace ray +#include "ray/util/time.h" namespace ray { -namespace gcs { -class MockGcsServer : public GcsServer { - public: -}; +std::optional ToTimeoutPoint(int64_t timeout_ms) { + std::optional timeout_point; + if (timeout_ms == -1) { + return timeout_point; + } + auto now = std::chrono::steady_clock::now(); + auto timeout_duration = std::chrono::milliseconds(timeout_ms); + timeout_point.emplace(now + timeout_duration); + return timeout_point; +} -} // namespace gcs } // namespace ray diff --git a/src/ray/util/time.h b/src/ray/util/time.h new file mode 100644 index 000000000000..473d2cb69356 --- /dev/null +++ b/src/ray/util/time.h @@ -0,0 +1,51 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace ray { + +/// Return the number of milliseconds since the steady clock epoch. NOTE: The +/// returned timestamp may be used for accurately measuring intervals but has +/// no relation to wall clock time. It must not be used for synchronization +/// across multiple nodes. 
+inline int64_t current_time_ms() { + std::chrono::milliseconds ms_since_epoch = + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()); + return ms_since_epoch.count(); +} + +inline int64_t current_sys_time_ms() { + std::chrono::milliseconds ms_since_epoch = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()); + return ms_since_epoch.count(); +} + +inline int64_t current_sys_time_s() { + std::chrono::seconds s_since_epoch = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()); + return s_since_epoch.count(); +} + +/// Converts a timeout in milliseconds to a timeout point. +/// \param timeout_ms The timeout in milliseconds. +/// \return The timeout point, or std::nullopt if timeout_ms is -1. +std::optional ToTimeoutPoint(int64_t timeout_ms); + +} // namespace ray diff --git a/src/ray/util/util.cc b/src/ray/util/util.cc deleted file mode 100644 index c01f16b74952..000000000000 --- a/src/ray/util/util.cc +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright 2020 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "ray/util/util.h" - -#include -#include -#ifndef _WIN32 -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#ifndef _WIN32 -#include -#endif -#include - -#include "absl/strings/match.h" -#include "ray/util/filesystem.h" -#include "ray/util/logging.h" -#include "ray/util/process.h" -#include "ray/util/string_utils.h" - -std::string EndpointToUrl( - const boost::asio::generic::basic_endpoint &ep, - bool include_scheme) { - std::string result, scheme; - switch (ep.protocol().family()) { - case AF_INET: { - scheme = "tcp://"; - boost::asio::ip::tcp::endpoint e(boost::asio::ip::tcp::v4(), 0); - RAY_CHECK_EQ(e.size(), ep.size()); - const sockaddr *src = ep.data(); - sockaddr *dst = e.data(); - *reinterpret_cast(dst) = *reinterpret_cast(src); - std::ostringstream ss; - ss << e; - result = ss.str(); - break; - } - case AF_INET6: { - scheme = "tcp://"; - boost::asio::ip::tcp::endpoint e(boost::asio::ip::tcp::v6(), 0); - RAY_CHECK_EQ(e.size(), ep.size()); - const sockaddr *src = ep.data(); - sockaddr *dst = e.data(); - *reinterpret_cast(dst) = *reinterpret_cast(src); - std::ostringstream ss; - ss << e; - result = ss.str(); - break; - } -#if defined(BOOST_ASIO_HAS_LOCAL_SOCKETS) && !defined(_WIN32) - case AF_UNIX: - scheme = "unix://"; - result.append(reinterpret_cast(ep.data())->sun_path, - ep.size() - offsetof(sockaddr_un, sun_path)); - break; -#endif - default: - RAY_LOG(FATAL) << "unsupported protocol family: " << ep.protocol().family(); - break; - } - if (include_scheme) { - result.insert(0, scheme); - } - return result; -} - -boost::asio::generic::basic_endpoint -ParseUrlEndpoint(const std::string &endpoint, int default_port) { - // Syntax reference: https://en.wikipedia.org/wiki/URL#Syntax - // Note that we're a bit more flexible, to allow parsing "127.0.0.1" as a URL. 
- boost::asio::generic::stream_protocol::endpoint result; - std::string address = endpoint, scheme; - if (absl::StartsWith(address, "unix://")) { - scheme = "unix://"; - address.erase(0, scheme.size()); - } else if (!address.empty() && ray::IsDirSep(address[0])) { - scheme = "unix://"; - } else if (absl::StartsWith(address, "tcp://")) { - scheme = "tcp://"; - address.erase(0, scheme.size()); - } else { - scheme = "tcp://"; - } - if (scheme == "unix://") { -#if defined(BOOST_ASIO_HAS_LOCAL_SOCKETS) && !defined(_WIN32) - size_t maxlen = sizeof(sockaddr_un().sun_path) / sizeof(*sockaddr_un().sun_path) - 1; - RAY_CHECK(address.size() <= maxlen) - << "AF_UNIX path length cannot exceed " << maxlen << " bytes: " << address; - result = boost::asio::local::stream_protocol::endpoint(address); -#else - RAY_LOG(FATAL) << "UNIX-domain socket endpoints are not supported: " << endpoint; -#endif - } else if (scheme == "tcp://") { - std::string::const_iterator i = address.begin(); - std::string host = ::ray::ScanToken(i, "[%*[^][/]]"); - host = - host.empty() ? ::ray::ScanToken(i, "%*[^/:]") : host.substr(1, host.size() - 2); - std::string port_str = ::ray::ScanToken(i, ":%*d"); - int port = port_str.empty() ? default_port : std::stoi(port_str.substr(1)); - result = boost::asio::ip::tcp::endpoint(boost::asio::ip::make_address(host), port); - } else { - RAY_LOG(FATAL) << "Unable to parse socket endpoint: " << endpoint; - } - return result; -} - -std::shared_ptr> ParseURL(std::string url) { - auto result = std::make_shared>(); - std::string delimiter = "?"; - size_t pos = 0; - pos = url.find(delimiter); - if (pos == std::string::npos) { - return result; - } - - const std::string base_url = url.substr(0, pos); - result->emplace("url", base_url); - url.erase(0, pos + delimiter.length()); - const std::string query_delimeter = "&"; - - auto parse_key_value_with_equal_delimter = - [](std::string_view key_value) -> std::pair { - // Parse the query key value pair. 
- const std::string key_value_delimter = "="; - size_t key_value_pos = key_value.find(key_value_delimter); - std::string_view key = key_value.substr(0, key_value_pos); - return std::make_pair(key, key_value.substr(key.size() + 1)); - }; - - while ((pos = url.find(query_delimeter)) != std::string::npos) { - std::string_view token = std::string_view{url}.substr(0, pos); - auto key_value_pair = parse_key_value_with_equal_delimter(token); - result->emplace(std::string(key_value_pair.first), - std::string(key_value_pair.second)); - url.erase(0, pos + delimiter.length()); - } - std::string_view token = std::string_view{url}.substr(0, pos); - auto key_value_pair = parse_key_value_with_equal_delimter(token); - result->emplace(std::string(key_value_pair.first), std::string(key_value_pair.second)); - return result; -} - -std::string GenerateUUIDV4() { - thread_local std::random_device rd; - thread_local std::mt19937 gen(rd()); - std::uniform_int_distribution<> dis(0, 15); - std::uniform_int_distribution<> dis2(8, 11); - - std::stringstream ss; - int i; - ss << std::hex; - for (i = 0; i < 8; i++) { - ss << dis(gen); - } - ss << "-"; - for (i = 0; i < 4; i++) { - ss << dis(gen); - } - ss << "-4"; - for (i = 0; i < 3; i++) { - ss << dis(gen); - } - ss << "-"; - ss << dis2(gen); - for (i = 0; i < 3; i++) { - ss << dis(gen); - } - ss << "-"; - for (i = 0; i < 12; i++) { - ss << dis(gen); - }; - return ss.str(); -} - -namespace ray { - -bool IsRayletFailed(const std::string &raylet_pid) { - auto should_shutdown = false; - if (!raylet_pid.empty()) { - auto pid = static_cast(std::stoi(raylet_pid)); - if (!IsProcessAlive(pid)) { - should_shutdown = true; - } - } else if (!IsParentProcessAlive()) { - should_shutdown = true; - } - return should_shutdown; -} - -void QuickExit() { - ray::RayLog::ShutDownRayLog(); - _Exit(1); -} - -std::optional ToTimeoutPoint(int64_t timeout_ms) { - std::optional timeout_point; - if (timeout_ms == -1) { - return timeout_point; - } - auto now = 
std::chrono::steady_clock::now(); - auto timeout_duration = std::chrono::milliseconds(timeout_ms); - timeout_point.emplace(now + timeout_duration); - return timeout_point; -} - -} // namespace ray diff --git a/src/ray/util/util.h b/src/ray/util/util.h deleted file mode 100644 index 8f0baecb5b1b..000000000000 --- a/src/ray/util/util.h +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#ifdef __APPLE__ -#include -#endif - -#ifdef __linux__ -#include -#endif - -#ifdef _WIN32 -#ifndef _WINDOWS_ -#ifndef WIN32_LEAN_AND_MEAN // Sorry for the inconvenience. Please include any related - // headers you need manually. - // (https://stackoverflow.com/a/8294669) -#define WIN32_LEAN_AND_MEAN // Prevent inclusion of WinSock2.h -#endif -#include // Force inclusion of WinGDI here to resolve name conflict -#endif -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "absl/container/flat_hash_map.h" -#include "ray/util/logging.h" -#include "ray/util/macros.h" - -#ifdef _WIN32 -#include // to ensure getpid() on Windows -#endif - -// Boost forward-declarations (to avoid forcing slow header inclusions) -namespace boost::asio::generic { - -template -class basic_endpoint; -class stream_protocol; - -} // namespace boost::asio::generic - -// Append append_str to the beginning of each line of str. 
-inline std::string AppendToEachLine(const std::string &str, - const std::string &append_str) { - std::stringstream ss; - ss << append_str; - for (char c : str) { - ss << c; - if (c == '\n') { - ss << append_str; - } - } - return ss.str(); -} - -inline int64_t current_sys_time_s() { - std::chrono::seconds s_since_epoch = std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()); - return s_since_epoch.count(); -} - -/// Return the number of milliseconds since the steady clock epoch. NOTE: The -/// returned timestamp may be used for accurately measuring intervals but has -/// no relation to wall clock time. It must not be used for synchronization -/// across multiple nodes. -/// -/// TODO(rkn): This function appears in multiple places. It should be -/// deduplicated. -/// -/// \return The number of milliseconds since the steady clock epoch. -inline int64_t current_time_ms() { - std::chrono::milliseconds ms_since_epoch = - std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()); - return ms_since_epoch.count(); -} - -inline int64_t current_sys_time_ms() { - std::chrono::milliseconds ms_since_epoch = - std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()); - return ms_since_epoch.count(); -} - -inline int64_t current_sys_time_us() { - std::chrono::microseconds mu_since_epoch = - std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()); - return mu_since_epoch.count(); -} - -std::string GenerateUUIDV4(); - -/// Converts the given endpoint (such as TCP or UNIX domain socket address) to a string. -/// \param include_scheme Whether to include the scheme prefix (such as tcp://). -/// This is recommended to avoid later ambiguity when parsing. -std::string EndpointToUrl( - const boost::asio::generic::basic_endpoint &ep, - bool include_scheme = true); - -/// Parses the endpoint socket address of a URL. 
-/// If a scheme:// prefix is absent, the address family is guessed automatically. -/// For TCP/IP, the endpoint comprises the IP address and port number in the URL. -/// For UNIX domain sockets, the endpoint comprises the socket path. -boost::asio::generic::basic_endpoint -ParseUrlEndpoint(const std::string &endpoint, int default_port = 0); - -/// Parse the url and return a pair of base_url and query string map. -/// EX) http://abc?num_objects=9&offset=8388878 -/// will be returned as -/// { -/// url: http://abc, -/// num_objects: 9, -/// offset: 8388878 -/// } -std::shared_ptr> ParseURL(std::string url); - -class InitShutdownRAII { - public: - /// Type of the Shutdown function. - using ShutdownFunc = void (*)(); - - /// Create an instance of InitShutdownRAII which will call shutdown - /// function when it is out of scope. - /// - /// \param init_func The init function. - /// \param shutdown_func The shutdown function. - /// \param args The arguments for the init function. - template - InitShutdownRAII(InitFunc init_func, ShutdownFunc shutdown_func, Args &&...args) - : shutdown_(shutdown_func) { - init_func(args...); - } - - /// Destructor of InitShutdownRAII which will call the shutdown function. - ~InitShutdownRAII() { - if (shutdown_ != nullptr) { - shutdown_(); - } - } - - private: - ShutdownFunc shutdown_; -}; - -struct EnumClassHash { - template - std::size_t operator()(T t) const { - return static_cast(t); - } -}; - -namespace ray { - -/// Return true if the raylet is failed. This util function is only meant to be used by -/// core worker modules. -bool IsRayletFailed(const std::string &raylet_pid); - -/// Teriminate the process without cleaning up the resources. -void QuickExit(); - -/// Converts a timeout in milliseconds to a timeout point. -/// \param[in] timeout_ms The timeout in milliseconds. -/// \return The timeout point, or std::nullopt if timeout_ms is -1. 
-std::optional ToTimeoutPoint(int64_t timeout_ms); - -} // namespace ray diff --git a/thirdparty/patches/abseil-cpp-shadow.patch b/thirdparty/patches/abseil-cpp-shadow.patch new file mode 100644 index 000000000000..b014e8a6dd23 --- /dev/null +++ b/thirdparty/patches/abseil-cpp-shadow.patch @@ -0,0 +1,12 @@ +diff --git absl/container/internal/btree.h absl/container/internal/btree.h +--- absl/container/internal/btree.h ++++ absl/container/internal/btree.h +@@ -223,7 +223,7 @@ struct key_compare_adapter { + + public: + using Base::Base; +- checked_compare(Compare comp) : Base(std::move(comp)) {} // NOLINT ++ checked_compare(Compare _comp) : Base(std::move(_comp)) {} // NOLINT + + // Allow converting to Compare for use in key_comp()/value_comp(). + explicit operator Compare() const { return comp(); } diff --git a/thirdparty/patches/grpc-zlib-fdopen.patch b/thirdparty/patches/grpc-zlib-fdopen.patch index c48a35bc4ec5..83dfba2b95ff 100644 --- a/thirdparty/patches/grpc-zlib-fdopen.patch +++ b/thirdparty/patches/grpc-zlib-fdopen.patch @@ -6,8 +6,8 @@ diff -u bazel/grpc_deps.bzl "https://github.com/madler/zlib/archive/04f42ceca40f73e2978b50e93806c2a18c1281fc.tar.gz", ], + patches = [ -+ "@com_github_ray_project_ray//thirdparty/patches:zlib-fdopen.patch", ++ "@io_ray//thirdparty/patches:zlib-fdopen.patch", + ] ) - if "com_google_protobuf" not in native.existing_rules(): \ No newline at end of file + if "com_google_protobuf" not in native.existing_rules(): diff --git a/thirdparty/patches/msgpack-shadow.patch b/thirdparty/patches/msgpack-shadow.patch new file mode 100644 index 000000000000..581a1e7fde3a --- /dev/null +++ b/thirdparty/patches/msgpack-shadow.patch @@ -0,0 +1,12 @@ +diff --git include/msgpack/v1/adaptor/fixint.hpp include/msgpack/v1/adaptor/fixint.hpp +--- include/msgpack/v1/adaptor/fixint.hpp ++++ include/msgpack/v1/adaptor/fixint.hpp +@@ -24,7 +24,7 @@ template + struct fix_int { + typedef T value_type; + fix_int() : value(0) { } +- fix_int(T value) : 
value(value) { } ++ fix_int(T _value) : value(_value) { } + + operator T() const { return value; } + diff --git a/thirdparty/patches/prometheus-zlib-fdopen.patch b/thirdparty/patches/prometheus-zlib-fdopen.patch index e8ef276d1d14..6d0a112f0891 100644 --- a/thirdparty/patches/prometheus-zlib-fdopen.patch +++ b/thirdparty/patches/prometheus-zlib-fdopen.patch @@ -6,6 +6,6 @@ diff -u bazel/repositories.bzl /tmp/repositories.bzl ], build_file = "@com_github_jupp0r_prometheus_cpp//bazel:zlib.BUILD", + patches = [ -+ "@com_github_ray_project_ray//thirdparty/patches:zlib-fdopen.patch", ++ "@io_ray//thirdparty/patches:zlib-fdopen.patch", + ] - ) \ No newline at end of file + )