From 3c38f26f951ae79e0d8f9f8e5bd75a25c4981452 Mon Sep 17 00:00:00 2001 From: ActivePeter <1020401660@qq.com> Date: Thu, 2 Jul 2026 15:32:09 +0800 Subject: [PATCH 1/2] test largescale mq --- .github/workflows/all_test.yml | 75 ++- .github/workflows/largescale-mq.yml | 232 ++++++++ .../fluxon_kv/src/metric_reporter/mod.rs | 5 +- .../fluxon_observability/src/metrics_actor.rs | 16 +- fluxon_test_stack/benchmark_node_kv.py | 73 ++- fluxon_test_stack/ci_2_virt_node.py | 247 +++++++- .../distributed_benchmark_coordinator.py | 1 + .../distributed_benchmark_node.py | 549 +++++++++++++++--- fluxon_test_stack/mpmc_readiness.py | 62 ++ fluxon_test_stack/test_runner.py | 145 ++++- .../test_runner_runtime_backend.py | 2 +- .../tests/test_benchmark_node_kv_contract.py | 123 ++++ .../tests/test_ci_2_virt_node_contract.py | 244 +++++++- .../tests/test_mpmc_readiness_contract.py | 177 ++++++ .../tests/test_runner_contract.py | 19 + .../test_test_runner_testbed_contract.py | 132 ++++- ...st_top_attention_largescale_mq_contract.py | 476 +++++++++++++++ .../top_attention_test_index/README.md | 10 +- .../_largescale_mq.py | 511 +++++++++++++++- .../test_doc_site_builder_image_workflow.py | 37 ++ 20 files changed, 2984 insertions(+), 152 deletions(-) create mode 100644 .github/workflows/largescale-mq.yml create mode 100644 fluxon_test_stack/mpmc_readiness.py create mode 100644 fluxon_test_stack/tests/test_benchmark_node_kv_contract.py create mode 100644 fluxon_test_stack/tests/test_mpmc_readiness_contract.py create mode 100644 fluxon_test_stack/tests/test_top_attention_largescale_mq_contract.py diff --git a/.github/workflows/all_test.yml b/.github/workflows/all_test.yml index 3f36d2d..1404293 100644 --- a/.github/workflows/all_test.yml +++ b/.github/workflows/all_test.yml @@ -54,6 +54,7 @@ jobs: run: | python3 - <<'PY' import copy + import shlex from pathlib import Path import yaml @@ -64,7 +65,7 @@ jobs: top_attention_scene_prefix = "ci_top_attention_" - def 
top_attention_command(scene_id, *, case_config, timeout_seconds=21600): + def top_attention_command(scene_id, *, case_config, extra_args=None, timeout_seconds=21600): if not scene_id.startswith(top_attention_scene_prefix): raise ValueError(f"not a top-attention CI scene id: {scene_id}") suffix = scene_id[len(top_attention_scene_prefix):] @@ -74,6 +75,8 @@ jobs: ) if case_config: command += " --case-config __RUN_DIR__/configs/ci_scene_config.yaml" + for arg in extra_args or []: + command += " " + shlex.quote(str(arg)) return { "id": f"top_attention_{suffix}", "command": command, @@ -151,16 +154,70 @@ jobs: "case_config": True, "scene_config": {}, }, + "ci_top_attention_largescale_mq": { + "subject": "mq", + "runtime_contract": "rust_self_managed", + "scale": "n1_kvowner_dram_3gib", + "case_config": False, + "command_variants": [ + { + "id_suffix": f"p{producer_count}_c{consumer_count}", + "extra_args": [ + "--single-host-logical-targets", + "--testbed-bundle-source", + "__TEST_BED_BUNDLE_ROOT__", + "--workdir", + f"__WORKDIR_ROOT__/largescale_mq_ci_single_host/p{producer_count}_c{consumer_count}", + "--owner-count", + "4", + "--owner-dram-gib", + "1", + "--producer-count", + str(producer_count), + "--consumer-count", + str(consumer_count), + "--duration-seconds", + "30", + "--value-size", + "256", + "--op-timeout-seconds", + "5", + "--cluster-ready-timeout-seconds", + "1800", + "--consumer-sim-min-ms", + "1", + "--consumer-sim-max-ms", + "1", + ], + } + for producer_count, consumer_count in ((8, 8), (32, 32), (160, 8)) + ], + "scene_config": {}, + }, } for scene_id, scene_def in top_attention_ci_scenes.items(): - commands = [ - top_attention_command( - scene_id, - case_config=scene_def["case_config"], - timeout_seconds=scene_def.get("timeout_seconds", 21600), - ) - ] + command_variants = scene_def.get("command_variants") + if command_variants is None: + commands = [ + top_attention_command( + scene_id, + case_config=scene_def["case_config"], + 
extra_args=scene_def.get("extra_args"), + timeout_seconds=scene_def.get("timeout_seconds", 21600), + ) + ] + else: + commands = [] + for variant in command_variants: + command = top_attention_command( + scene_id, + case_config=scene_def["case_config"], + extra_args=variant["extra_args"], + timeout_seconds=scene_def.get("timeout_seconds", 21600), + ) + command["id"] = f"{command['id']}_{variant['id_suffix']}" + commands.append(command) existing_scene = suite["scenes"].get(scene_id) if existing_scene is None: suite["scenes"][scene_id] = { @@ -189,6 +246,7 @@ jobs: # - ci_top_attention_mq_mpsc keeps MPSC API channel coverage inside the same CI testbed contract. # - ci_top_attention_mq_mpmc keeps MPMC API channel coverage inside the same CI testbed contract. # - ci_top_attention_mq_mpmc_bench keeps heavier MPMC benchmark-style coverage in this workflow. + # - ci_top_attention_largescale_mq runs a bounded same-host MQ TEST_STACK workload in GitHub Actions. suite["scenes"] = { key: value for key, value in suite["scenes"].items() @@ -221,6 +279,7 @@ jobs: # - ci_top_attention_mq_mpsc stays on n1_kvowner_dram_20gib. # - ci_top_attention_mq_mpmc stays on n1_kvowner_dram_20gib. # - ci_top_attention_mq_mpmc_bench stays on n1_kvowner_dram_20gib. + # - ci_top_attention_largescale_mq stays on n1_kvowner_dram_3gib because the nested workload owns its bounded scale. 
out_path.write_text( yaml.safe_dump(suite, sort_keys=False, allow_unicode=False), diff --git a/.github/workflows/largescale-mq.yml b/.github/workflows/largescale-mq.yml new file mode 100644 index 0000000..278e0dc --- /dev/null +++ b/.github/workflows/largescale-mq.yml @@ -0,0 +1,232 @@ +name: largescale_mq + +on: + workflow_dispatch: + inputs: + run_mode: + description: "run executes the TEST_STACK workload; generate_only only validates suite generation" + required: true + type: choice + options: + - run + - generate_only + default: run + config_path: + description: "Base TEST_STACK suite YAML" + required: true + type: string + workdir: + description: "test_runner workdir for the large-scale MQ run" + required: true + type: string + default: .dever/largescale_mq + testbed_bundle_path: + description: "Existing TEST_STACK testbed bundle directory on the self-hosted runner" + required: true + type: string + start_config_relpath: + description: "Start-testbed config path inside the testbed bundle" + required: true + type: string + default: start_test_bed.runner.yaml + profiles: + description: "Comma-separated TEST_STACK profile ids" + required: true + type: string + default: fluxon_tcp_thread + owner_count: + description: "Owner count" + required: true + type: string + default: "4" + owner_dram_gib: + description: "Owner DRAM GiB per owner" + required: true + type: string + default: "1" + producer_count: + description: "Producer count" + required: true + type: string + default: "160" + consumer_count: + description: "Consumer count" + required: true + type: string + default: "8" + duration_seconds: + description: "Benchmark duration seconds" + required: true + type: string + default: "60" + value_size: + description: "Message value size bytes" + required: true + type: string + default: "256" + op_timeout_seconds: + description: "Per-operation timeout seconds" + required: true + type: string + default: "30" + cluster_ready_timeout_seconds: + description: "Cluster 
readiness timeout seconds" + required: true + type: string + default: "1800" + consumer_sim_min_ms: + description: "Minimum simulated consumer handle time in milliseconds" + required: true + type: string + default: "700" + consumer_sim_max_ms: + description: "Maximum simulated consumer handle time in milliseconds" + required: true + type: string + default: "1500" + +permissions: + contents: read + +concurrency: + group: largescale-mq-${{ github.ref }} + cancel-in-progress: false + +jobs: + largescale-mq: + runs-on: + - self-hosted + - Linux + - X64 + steps: + - name: Check out repository + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install Python dependencies + run: python3 -m pip install PyYAML + + - name: Sync rather_no_git_submodule workspace inputs + run: python3 fluxon_rs/scripts/rather_no_git_submodule.py + + - name: Generate large-scale MQ suite + run: | + python3 - <<'PY' + from __future__ import annotations + + import shlex + import subprocess + import sys + from pathlib import Path + + runner_temp = Path(${{ toJSON(runner.temp) }}) + suite_out = runner_temp / "largescale_mq_suite.yaml" + profiles = [item.strip() for item in ${{ toJSON(inputs.profiles) }}.split(",") if item.strip()] + if not profiles: + raise SystemExit("profiles input must contain at least one profile id") + + cmd = [ + sys.executable, + "-u", + "fluxon_test_stack/top_attention_test_index/_largescale_mq.py", + "--generate-only", + "--config", + ${{ toJSON(inputs.config_path) }}, + "--workdir", + ${{ toJSON(inputs.workdir) }}, + "--suite-out", + str(suite_out), + "--owner-count", + ${{ toJSON(inputs.owner_count) }}, + "--owner-dram-gib", + ${{ toJSON(inputs.owner_dram_gib) }}, + "--producer-count", + ${{ toJSON(inputs.producer_count) }}, + "--consumer-count", + ${{ toJSON(inputs.consumer_count) }}, + "--duration-seconds", + ${{ toJSON(inputs.duration_seconds) }}, + "--value-size", 
+ ${{ toJSON(inputs.value_size) }}, + "--op-timeout-seconds", + ${{ toJSON(inputs.op_timeout_seconds) }}, + "--cluster-ready-timeout-seconds", + ${{ toJSON(inputs.cluster_ready_timeout_seconds) }}, + "--consumer-sim-min-ms", + ${{ toJSON(inputs.consumer_sim_min_ms) }}, + "--consumer-sim-max-ms", + ${{ toJSON(inputs.consumer_sim_max_ms) }}, + ] + for profile in profiles: + cmd.extend(["--profile", profile]) + print("RUN: " + shlex.join(cmd), flush=True) + subprocess.check_call(cmd) + PY + + - name: Run large-scale MQ benchmark + if: ${{ inputs.run_mode == 'run' }} + run: | + python3 - <<'PY' + from __future__ import annotations + + import shlex + import subprocess + import sys + + profiles = [item.strip() for item in ${{ toJSON(inputs.profiles) }}.split(",") if item.strip()] + if not profiles: + raise SystemExit("profiles input must contain at least one profile id") + + cmd = [ + sys.executable, + "-u", + "fluxon_test_stack/top_attention_test_index/_largescale_mq.py", + "--config", + ${{ toJSON(inputs.config_path) }}, + "--workdir", + ${{ toJSON(inputs.workdir) }}, + "--testbed-bundle-source", + ${{ toJSON(inputs.testbed_bundle_path) }}, + "--start-config-relpath", + ${{ toJSON(inputs.start_config_relpath) }}, + "--owner-count", + ${{ toJSON(inputs.owner_count) }}, + "--owner-dram-gib", + ${{ toJSON(inputs.owner_dram_gib) }}, + "--producer-count", + ${{ toJSON(inputs.producer_count) }}, + "--consumer-count", + ${{ toJSON(inputs.consumer_count) }}, + "--duration-seconds", + ${{ toJSON(inputs.duration_seconds) }}, + "--value-size", + ${{ toJSON(inputs.value_size) }}, + "--op-timeout-seconds", + ${{ toJSON(inputs.op_timeout_seconds) }}, + "--cluster-ready-timeout-seconds", + ${{ toJSON(inputs.cluster_ready_timeout_seconds) }}, + "--consumer-sim-min-ms", + ${{ toJSON(inputs.consumer_sim_min_ms) }}, + "--consumer-sim-max-ms", + ${{ toJSON(inputs.consumer_sim_max_ms) }}, + ] + for profile in profiles: + cmd.extend(["--profile", profile]) + print("RUN: " + shlex.join(cmd), 
flush=True) + subprocess.check_call(cmd) + PY + + - name: Upload large-scale MQ generated suite + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: largescale-mq-suite-${{ github.sha }} + if-no-files-found: warn + compression-level: 1 + path: ${{ runner.temp }}/largescale_mq_suite.yaml diff --git a/fluxon_rs/fluxon_kv/src/metric_reporter/mod.rs b/fluxon_rs/fluxon_kv/src/metric_reporter/mod.rs index 88fea03..81d7ea9 100644 --- a/fluxon_rs/fluxon_kv/src/metric_reporter/mod.rs +++ b/fluxon_rs/fluxon_kv/src/metric_reporter/mod.rs @@ -583,7 +583,10 @@ impl MetricReporter { } pub fn metrics_handle(&self) -> ObserveMetricsHandle { - self.metrics_actor_handle().clone() + self.metrics_actor_handle + .get() + .cloned() + .unwrap_or_else(ObserveMetricsHandle::noop) } pub fn metrics(&self) -> Arc { diff --git a/fluxon_rs/fluxon_observability/src/metrics_actor.rs b/fluxon_rs/fluxon_observability/src/metrics_actor.rs index 1fcbb6d..e94a050 100644 --- a/fluxon_rs/fluxon_observability/src/metrics_actor.rs +++ b/fluxon_rs/fluxon_observability/src/metrics_actor.rs @@ -11,18 +11,22 @@ enum MetricsActorMsg { #[derive(Clone)] pub struct MetricsHandle { - tx: mpsc::Sender, + tx: Option>, } impl MetricsHandle { + pub fn noop() -> Self { + Self { tx: None } + } + pub fn try_submit_timeseries(&self, series: Vec) { if series.is_empty() { return; } - if let Err(e) = self - .tx - .try_send(MetricsActorMsg::SubmitTimeSeries { series }) - { + let Some(tx) = &self.tx else { + return; + }; + if let Err(e) = tx.try_send(MetricsActorMsg::SubmitTimeSeries { series }) { warn!("metrics actor dropped SubmitTimeSeries: {}", e); } } @@ -39,7 +43,7 @@ impl MetricsActorOwned { prom: PromRemoteWriteHandle, ) -> (MetricsHandle, MetricsActorOwned) { let (tx, rx) = mpsc::channel(max_pending_msgs); - let handle = MetricsHandle { tx }; + let handle = MetricsHandle { tx: Some(tx) }; let owned = MetricsActorOwned { rx, prom }; (handle, owned) } diff --git 
a/fluxon_test_stack/benchmark_node_kv.py b/fluxon_test_stack/benchmark_node_kv.py index c9c559b..bfc023d 100644 --- a/fluxon_test_stack/benchmark_node_kv.py +++ b/fluxon_test_stack/benchmark_node_kv.py @@ -25,7 +25,7 @@ ) from fluxon_py import FluxonKvClientConfig as KVCacheConfig from fluxon_py import new_store -from fluxon_py.kvclient.kvclient_interface import KvClient, PutOptionalArgs +from fluxon_py.kvclient.kvclient_interface import KvClient, KvLeaseApi, PutOptionalArgs TEST_MODE_MPMC = "MPMC" TEST_MODE_KVSTORE = "KVSTORE" @@ -1865,12 +1865,81 @@ def _sanitize_benchmark_client_kvcache_config(kvcache_config: dict[str, Any]) -> return sanitized -class FluxonBlockingStore: +class FluxonBlockingStore(KvLeaseApi): def __init__(self, store: KvClient) -> None: self.backend_kind = BACKEND_KIND_FLUXON self._store = store self._phase_profiler = _FluxonPhaseProfiler() + @property + def _client(self) -> Any: + return self._store._client # type: ignore[attr-defined] + + def put( + self, + key: str, + value: dict[str, Union[int, float, bool, str, bytes, bytearray, memoryview]], + opts: Optional[PutOptionalArgs] = None, + ) -> Any: + return self._store.put(key, value, opts=opts) + + def get(self, key: str) -> Any: + return self._store.get(key) + + def get_size(self, key: str) -> Any: + return self._store.get_size(key) + + def is_exist(self, key: str) -> Any: + return self._store.is_exist(key) + + def remove(self, key: str) -> Any: + return self._store.remove(key) + + def sync_kv_to_file( + self, + key: str, + target_instance_key: str, + filepath: str, + file_offset: int, + bytes_field_key: str, + timeout_ms: int = 60_000, + ) -> Any: + return self._store.sync_kv_to_file( + key, + target_instance_key, + filepath, + file_offset, + bytes_field_key, + timeout_ms=timeout_ms, + ) + + def instance_key(self) -> Any: + return self._store.instance_key() + + def config(self) -> Any: + return self._store.config() + + def get_cluster_name(self) -> str: + return 
self._store.get_cluster_name() + + def get_etcd_config(self) -> List[str]: + return self._store.get_etcd_config() + + def third_party_logs_dir(self) -> Any: + return self._store.third_party_logs_dir() + + def ensure_zero_contribution_for_channel(self) -> None: + self._store.ensure_zero_contribution_for_channel() + + def count_prefix(self, prefix: str) -> Any: + return self._store.count_prefix(prefix) + + def allocate_lease(self, ttl_seconds: int) -> Any: + return self._store.allocate_lease(ttl_seconds) + + def keepalive_lease(self, lease_id: int) -> Any: + return self._store.keepalive_lease(lease_id) + def put_blocking( self, key: str, diff --git a/fluxon_test_stack/ci_2_virt_node.py b/fluxon_test_stack/ci_2_virt_node.py index f055426..3d2c3f1 100644 --- a/fluxon_test_stack/ci_2_virt_node.py +++ b/fluxon_test_stack/ci_2_virt_node.py @@ -34,13 +34,17 @@ DEFAULT_TESTBED_BOOTSTRAP_MODE = "bare_then_apply" DEFAULT_TESTBED_UI_PORT = 18080 DEFAULT_TESTBED_CONTROLLER_PORT = 19080 +DEFAULT_TESTBED_OPS_CLUSTER_NAME = "fluxon_testbed" DEFAULT_TESTBED_HOSTWORKDIR = Path("/mnt/nvme0/store_team_dev/fluxon_deploy") LOCAL_PRIMARY_NODE_SUFFIX = "a" LOCAL_SECONDARY_NODE_SUFFIX = "b" TEST_STACK_START_TEST_BED_CONFIG_ENV = "FLUXON_TEST_STACK_START_TEST_BED_CONFIG" +RELEASE_MANIFEST_SHA256_ENV_KEY = "FLUXON_RELEASE_MANIFEST_SHA256" PLACEHOLDER_WHEEL_NAME = "fluxon-0.0.0-ci-placeholder-cp38-abi3-manylinux_2_28_x86_64.whl" SAME_HOST_LOCAL_MULTI_NODE_ETCD_CLIENT_PORT_OFFSET = 100 SAME_HOST_LOCAL_MULTI_NODE_GREPTIME_PORT_OFFSET = 110 +SAME_HOST_LOCAL_MULTI_NODE_TEST_STACK_COORDINATOR_PORT_OFFSET = 1000 +SAME_HOST_LOCAL_MULTI_NODE_TEST_STACK_TOPOLOGY_PORT_SPAN = 100 def _parse_args() -> argparse.Namespace: @@ -147,6 +151,11 @@ def _parse_args() -> argparse.Namespace: default=DEFAULT_TESTBED_CONTROLLER_PORT, help="Fluxon Ops controller HTTP port for the generated testbed configs.", ) + parser.add_argument( + "--testbed-ops-cluster-name", + default=DEFAULT_TESTBED_OPS_CLUSTER_NAME, + 
help="Ops cluster namespace/path segment for the generated local testbed.", + ) parser.add_argument( "--print-generated", action="store_true", @@ -221,6 +230,13 @@ def _cidr32_list_for_ips(*, ips: list[str]) -> list[str]: return out +def _normalize_testbed_ops_cluster_name(raw: str) -> str: + name = _require_nonempty_str(str(raw), "testbed_ops_cluster_name") + if re.search(r"[\s/]", name): + raise ValueError(f"testbed_ops_cluster_name must not contain whitespace or '/': {name!r}") + return name + + def _detect_local_hostname() -> str: try: return subprocess.check_output(["bash", "-lc", "hostname -s"], text=True).strip() @@ -477,9 +493,70 @@ def _rewrite_suite_for_local_dual_nodes( } suite["profiles"] = {PUBLIC_PROFILE_ID: generated_profile} + _rewrite_test_stack_coordinator_ports_for_local_controller( + suite, + controller_port=int(controller_port), + ) return suite +def _local_test_stack_coordinator_port_base( + *, + controller_port: int, + topology_key: Any, +) -> int: + topology_offset = 0 + if isinstance(topology_key, int): + topology_offset = int(topology_key) * SAME_HOST_LOCAL_MULTI_NODE_TEST_STACK_TOPOLOGY_PORT_SPAN + elif isinstance(topology_key, str) and topology_key.isdigit(): + topology_offset = int(topology_key) * SAME_HOST_LOCAL_MULTI_NODE_TEST_STACK_TOPOLOGY_PORT_SPAN + elif topology_key != "DEFAULT": + raise ValueError(f"unsupported test_stack port_alloc topology key: {topology_key!r}") + + port = ( + int(controller_port) + + SAME_HOST_LOCAL_MULTI_NODE_TEST_STACK_COORDINATOR_PORT_OFFSET + + topology_offset + ) + if port <= 0 or port > 65535: + raise ValueError(f"computed local TEST_STACK coordinator_port_base out of range: {port}") + return port + + +def _rewrite_test_stack_coordinator_ports_for_local_controller( + suite: dict[str, Any], + *, + controller_port: int, +) -> None: + profiles = suite.get("profiles") + if not isinstance(profiles, dict): + raise ValueError("suite.profiles must be a mapping") + for profile_id, profile in profiles.items(): + 
if not isinstance(profile, dict): + continue + runtime = profile.get("runtime") + if not isinstance(runtime, dict): + continue + test_stack = runtime.get("test_stack") + if not isinstance(test_stack, dict): + continue + port_alloc = test_stack.get("port_alloc") + if not isinstance(port_alloc, dict): + raise ValueError(f"profile[{profile_id!r}].runtime.test_stack.port_alloc must be a mapping") + by_topology = port_alloc.get("by_topology") + if not isinstance(by_topology, dict): + raise ValueError(f"profile[{profile_id!r}].runtime.test_stack.port_alloc.by_topology must be a mapping") + for topology_key, entry in by_topology.items(): + if not isinstance(entry, dict): + continue + if "coordinator_port_base" not in entry: + continue + entry["coordinator_port_base"] = _local_test_stack_coordinator_port_base( + controller_port=int(controller_port), + topology_key=topology_key, + ) + + def _require_mapping_rewritten_template( payload: dict[str, Any], *, @@ -506,12 +583,15 @@ def _rewrite_deployconf_for_local_dual_nodes( secondary_hostworkdir: Path, wheel_name: str, controller_port: int, + testbed_ops_cluster_name: str = DEFAULT_TESTBED_OPS_CLUSTER_NAME, ) -> dict[str, Any]: + ops_cluster_name = _normalize_testbed_ops_cluster_name(testbed_ops_cluster_name) cfg = _require_mapping_rewritten_template( deployconf_cfg, primary_node_name=primary_node_name, secondary_node_name=secondary_node_name, ) + cfg["namespace"] = ops_cluster_name cfg["name_prefix"] = "fluxon-ci-2-virt-node-local2" cfg["gen_k8s_daemonset_mirror_outdir"] = str((primary_hostworkdir / "gen_k8s_daemonset").resolve()) cfg["cluster_nodes"] = [ @@ -546,6 +626,7 @@ def _rewrite_deployconf_for_local_dual_nodes( raise ValueError("deployconf.global_envs must be a mapping") global_envs["FLUXON_RELEASE_WHEEL"] = wheel_name global_envs["FLUXON_RELEASE_WHEEL_PY"] = wheel_name + global_envs["FLUXON_CLUSTER_NAME"] = ops_cluster_name global_envs["FLUXON_CLUSTER_NODE_IDS"] = f"{primary_node_name} {secondary_node_name}" 
global_envs["MASTER__PORT"] = str(int(controller_port)) global_envs["FLUXON_OPS_UI_BASE_URL"] = f"http://${{OPS_CONTROLLER__NODE_ID__IP}}:{int(controller_port)}" @@ -592,10 +673,12 @@ def _rewrite_start_test_bed_for_local_dual_nodes( controller_port: int, ui_port: int, ui_workdir: Path, + testbed_ops_cluster_name: str = DEFAULT_TESTBED_OPS_CLUSTER_NAME, ) -> dict[str, Any]: + ops_cluster_name = _normalize_testbed_ops_cluster_name(testbed_ops_cluster_name) cfg = copy.deepcopy(start_cfg) cfg["deployconf_path"] = str(generated_deployconf_path) - cfg["controller_url"] = f"http://{controller_access_ip}:{controller_port}/r/ops/fluxon_testbed" + cfg["controller_url"] = f"http://{controller_access_ip}:{controller_port}/r/ops/{ops_cluster_name}" cfg["controller_basic_auth"] = {"username": "ops_admin", "password": "ops_password"} ui_cfg = cfg.get("test_runner_ui") if not isinstance(ui_cfg, dict): @@ -641,6 +724,123 @@ def _write_yaml(path: Path, payload: dict[str, Any]) -> None: path.write_text(yaml.safe_dump(payload, sort_keys=False, allow_unicode=False), encoding="utf-8") +def _write_ci_testbed_bundle( + *, + bundle_root: Path, + deployconf: dict[str, Any], + start_cfg: dict[str, Any], + apply_check_start_cfg: dict[str, Any], + artifacts_source_root: Path, +) -> dict[str, Path]: + bundle_root = bundle_root.resolve() + bundle_root.mkdir(parents=True, exist_ok=True) + deployconf_path = bundle_root / "deployconf_testbed.local.yaml" + start_cfg_path = bundle_root / "start_test_bed.runner.yaml" + apply_check_start_cfg_path = bundle_root / "start_test_bed.apply_check.runner.yaml" + ssh_config_path = bundle_root / "ssh_config" + bootstrap_workdir = bundle_root / "bootstrap_workdir" + mirror_outdir = bundle_root / "gen_k8s_daemonset" + artifacts_path = bundle_root / "artifacts" + + bundled_deployconf = copy.deepcopy(deployconf) + bundled_deployconf["gen_k8s_daemonset_mirror_outdir"] = str(mirror_outdir) + bundled_start_cfg = copy.deepcopy(start_cfg) + 
bundled_start_cfg["deployconf_path"] = "./deployconf_testbed.local.yaml" + bundled_apply_check_start_cfg = copy.deepcopy(apply_check_start_cfg) + bundled_apply_check_start_cfg["deployconf_path"] = "./deployconf_testbed.local.yaml" + _write_yaml(deployconf_path, bundled_deployconf) + _write_yaml(start_cfg_path, bundled_start_cfg) + _write_yaml(apply_check_start_cfg_path, bundled_apply_check_start_cfg) + ssh_config_path.write_text("# same-host local CI testbed\n", encoding="utf-8") + bootstrap_workdir.mkdir(parents=True, exist_ok=True) + mirror_outdir.mkdir(parents=True, exist_ok=True) + artifacts_path.mkdir(parents=True, exist_ok=True) + if artifacts_source_root.exists(): + for child in artifacts_source_root.iterdir(): + link_path = artifacts_path / child.name + if link_path.exists() or link_path.is_symlink(): + link_path.unlink() + os.symlink(child.resolve(), link_path, target_is_directory=child.is_dir()) + + manifest_path = bundle_root / "manifest.json" + manifest_path.write_text( + json.dumps( + { + "deployconf_path": "deployconf_testbed.local.yaml", + "start_config_path": "start_test_bed.runner.yaml", + "ssh_config_path": "ssh_config", + "workdir": "bootstrap_workdir", + "bootstrap_mode": "apply_only", + "controller_request_mode": "direct", + }, + ensure_ascii=False, + indent=2, + sort_keys=True, + ) + + "\n", + encoding="utf-8", + ) + return { + "bundle_root": bundle_root, + "deployconf_path": deployconf_path, + "start_config_path": start_cfg_path, + "apply_check_start_config_path": apply_check_start_cfg_path, + "ssh_config_path": ssh_config_path, + "manifest_path": manifest_path, + "artifacts_path": artifacts_path, + } + + +def _refresh_ci_testbed_bundle_deployconf_from_start_workdir( + *, + metadata: dict[str, Any], + start_workdir: Path, +) -> None: + source_path = (start_workdir / "deployconf.with_release_manifest_sha256.yaml").resolve() + if not source_path.is_file(): + raise RuntimeError(f"missing normalized start_test_bed deployconf: {source_path}") + + 
bundle_root = Path(metadata["testbed_bundle_path"]).resolve() + bundle_deployconf_path = Path(metadata["testbed_bundle_deployconf_path"]).resolve() + if bundle_deployconf_path.parent != bundle_root: + raise RuntimeError( + "testbed bundle deployconf must live directly under testbed_bundle: " + f"bundle={bundle_root} deployconf={bundle_deployconf_path}" + ) + + deployconf = _load_yaml_mapping(source_path, ctx="normalized start_test_bed deployconf") + global_envs = deployconf.get("global_envs") + if isinstance(global_envs, dict): + global_envs.pop(RELEASE_MANIFEST_SHA256_ENV_KEY, None) + mirror_outdir = (bundle_root / "gen_k8s_daemonset").resolve() + mirror_outdir.mkdir(parents=True, exist_ok=True) + deployconf["gen_k8s_daemonset_mirror_outdir"] = str(mirror_outdir) + _write_yaml(bundle_deployconf_path, deployconf) + print( + "synced runner testbed bundle deployconf from normalized start_test_bed output: " + f"{source_path} -> {bundle_deployconf_path}", + flush=True, + ) + + +def _prior_normalized_start_workdir_for_skip_start_testbed(*, metadata: dict[str, Any]) -> Path: + candidates = [ + Path(metadata["bootstrap_apply_workdir"]).resolve(), + Path(metadata["bootstrap_bare_workdir"]).resolve(), + ] + for workdir in candidates: + if (workdir / "deployconf.with_release_manifest_sha256.yaml").is_file(): + return workdir + checked = ", ".join( + str(workdir / "deployconf.with_release_manifest_sha256.yaml") + for workdir in candidates + ) + raise RuntimeError( + "--skip-start-testbed requires a previous normalized start_test_bed deployconf before running " + f"test_runner; checked: {checked}. Rerun without --skip-start-testbed or keep the prior start_test_bed workdir." 
+ ) + + def _prepare_pack_release_runtime_dirs(*, project_data_root: Path) -> None: root = project_data_root.resolve() for relpath in ( @@ -742,6 +942,7 @@ def _build_generated_configs( wheel_name: str, ) -> dict[str, Any]: scene_ids = _selected_scene_ids(args, suite_cfg) + testbed_ops_cluster_name = _normalize_testbed_ops_cluster_name(args.testbed_ops_cluster_name) generated_suite = _rewrite_suite_for_local_dual_nodes( suite_cfg=suite_cfg, scene_ids=scene_ids, @@ -760,6 +961,7 @@ def _build_generated_configs( secondary_hostworkdir=secondary_hostworkdir, wheel_name=wheel_name, controller_port=int(args.controller_port), + testbed_ops_cluster_name=testbed_ops_cluster_name, ) generated_start_cfg = _rewrite_start_test_bed_for_local_dual_nodes( start_cfg=start_test_bed_template, @@ -769,6 +971,7 @@ def _build_generated_configs( controller_port=int(args.controller_port), ui_port=int(args.ui_port), ui_workdir=workdir / "test_runner_ui_runtime", + testbed_ops_cluster_name=testbed_ops_cluster_name, ) generated_apply_check_cfg = _rewrite_start_test_bed_for_apply_check( start_cfg=generated_start_cfg, @@ -785,10 +988,21 @@ def _build_generated_configs( runner_workdir = args.runner_workdir.resolve() if args.runner_workdir else (workdir / "runner_run").resolve() bootstrap_root = (workdir / "start_test_bed").resolve() + testbed_bundle = _write_ci_testbed_bundle( + bundle_root=runner_workdir / "testbed_bundle", + deployconf=generated_deployconf, + start_cfg=generated_start_cfg, + apply_check_start_cfg=generated_apply_check_cfg, + artifacts_source_root=_resolve_repo_root_cli_path(args.release_dir) / "test_rsc", + ) return { "suite_path": suite_path, "deployconf_path": deployconf_path, "start_test_bed_path": start_cfg_path, + "testbed_bundle_path": testbed_bundle["bundle_root"], + "testbed_bundle_deployconf_path": testbed_bundle["deployconf_path"], + "testbed_bundle_start_config_path": testbed_bundle["start_config_path"], + "testbed_bundle_apply_check_start_config_path": 
testbed_bundle["apply_check_start_config_path"], "start_test_bed_apply_check_path": start_apply_check_cfg_path, "bootstrap_root": bootstrap_root, "bootstrap_bare_workdir": bootstrap_root / "bare", @@ -937,13 +1151,17 @@ def main() -> int: sys.executable, str((REPO_ROOT / "fluxon_test_stack" / "start_test_bed.py").resolve()), "-c", - str(metadata["start_test_bed_path"]), + str(metadata["testbed_bundle_start_config_path"]), "-w", str(metadata["bootstrap_bare_workdir"]), "--bootstrap-mode", "bare_only" if not args.skip_apply_check else args.bootstrap_mode, ] _run(start_bare_cmd) + _refresh_ci_testbed_bundle_deployconf_from_start_workdir( + metadata=metadata, + start_workdir=Path(metadata["bootstrap_bare_workdir"]), + ) if not args.skip_apply_check: metadata["bootstrap_apply_workdir"].mkdir(parents=True, exist_ok=True) @@ -951,26 +1169,39 @@ def main() -> int: sys.executable, str((REPO_ROOT / "fluxon_test_stack" / "start_test_bed.py").resolve()), "-c", - str(metadata["start_test_bed_apply_check_path"]), + str(metadata["testbed_bundle_apply_check_start_config_path"]), "-w", str(metadata["bootstrap_apply_workdir"]), "--bootstrap-mode", "apply_only", ] _run(start_apply_cmd) + _refresh_ci_testbed_bundle_deployconf_from_start_workdir( + metadata=metadata, + start_workdir=Path(metadata["bootstrap_apply_workdir"]), + ) elif args.bootstrap_mode in ("apply_only", "bare_then_apply"): metadata["bootstrap_apply_workdir"].mkdir(parents=True, exist_ok=True) start_apply_cmd = [ sys.executable, str((REPO_ROOT / "fluxon_test_stack" / "start_test_bed.py").resolve()), "-c", - str(metadata["start_test_bed_path"]), + str(metadata["testbed_bundle_start_config_path"]), "-w", str(metadata["bootstrap_apply_workdir"]), "--bootstrap-mode", args.bootstrap_mode, ] _run(start_apply_cmd) + _refresh_ci_testbed_bundle_deployconf_from_start_workdir( + metadata=metadata, + start_workdir=Path(metadata["bootstrap_apply_workdir"]), + ) + elif not args.skip_runner: + 
_refresh_ci_testbed_bundle_deployconf_from_start_workdir( + metadata=metadata, + start_workdir=_prior_normalized_start_workdir_for_skip_start_testbed(metadata=metadata), + ) if not args.skip_runner: runner_workdir = Path(metadata["runner_workdir"]) @@ -983,7 +1214,13 @@ def main() -> int: "-w", str(runner_workdir), ] - _run(runner_cmd, env=_runner_env(release_dir=release_dir, start_cfg_path=Path(metadata["start_test_bed_path"]))) + _run( + runner_cmd, + env=_runner_env( + release_dir=release_dir, + start_cfg_path=Path(metadata["testbed_bundle_start_config_path"]), + ), + ) return 0 diff --git a/fluxon_test_stack/distributed_benchmark_coordinator.py b/fluxon_test_stack/distributed_benchmark_coordinator.py index 0859560..37f4d15 100644 --- a/fluxon_test_stack/distributed_benchmark_coordinator.py +++ b/fluxon_test_stack/distributed_benchmark_coordinator.py @@ -2233,6 +2233,7 @@ def handle_register(self, message: Dict, client_socket: socket.socket) -> bool: "node_id": response_node_id, "node_role": assigned_role, "test_mode": active_test_mode, + "expected_nodes": int(self.expected_nodes), "threads_per_process": THREADS_PER_PROCESS, "max_benchmark_seconds": MAX_BENCHMARK_SECONDS, "cluster_ready_timeout_seconds": CLUSTER_READY_TIMEOUT_SECONDS, diff --git a/fluxon_test_stack/distributed_benchmark_node.py b/fluxon_test_stack/distributed_benchmark_node.py index 0065f5d..e2fcd12 100644 --- a/fluxon_test_stack/distributed_benchmark_node.py +++ b/fluxon_test_stack/distributed_benchmark_node.py @@ -52,6 +52,7 @@ mq_get_once, MQClosedError, ) + from .mpmc_readiness import evaluate_mpmc_topology_ready from .benchmark_node_kv import ( KV_OPERATION_GET, KV_OPERATION_PUT, @@ -92,6 +93,7 @@ mq_get_once, MQClosedError, ) + from mpmc_readiness import evaluate_mpmc_topology_ready from benchmark_node_kv import ( KV_OPERATION_GET, KV_OPERATION_PUT, @@ -217,6 +219,11 @@ # No fallback/default: MPMC cluster readiness timeout must be explicitly provided # by the coordinator via 
test_config["cluster_ready_timeout_seconds"]. RPC_CLOSE_TIMEOUT_SECONDS = 2.0 +MPMC_READY_INIT_STAGGER_MIN_EXPECTED_NODES = 16 +MPMC_READY_INIT_STAGGER_SECONDS_PER_EXTRA_NODE = 0.5 +MPMC_READY_INIT_STAGGER_MAX_SECONDS = 60.0 +READY_RUNTIME_INIT_RETRY_BASE_SECONDS = 1.0 +READY_RUNTIME_INIT_RETRY_MAX_SECONDS = 10.0 TCP_THREAD_PROM_METRIC_SEND_ENQUEUED = "send_enqueued" TCP_THREAD_PROM_METRIC_SOCKET_SUBMITTED = "socket_submitted" @@ -1010,6 +1017,7 @@ class PreparedWorkerRuntime: producer: Any = None consumer: Any = None + kv_store: Any = None local_mq_state: Optional[MQState] = None @@ -1243,6 +1251,134 @@ def __init__(self): logger.info(f"🔧 初始化基准测试节点: {self.node_id}") + @staticmethod + def _stable_fraction_from_text(text: str) -> float: + digest = hashlib.sha256(text.encode("utf-8")).digest() + value = int.from_bytes(digest[:8], "big") + return value / float((1 << 64) - 1) + + def _runtime_init_stagger_seconds(self) -> float: + """Spread large MPMC node initialization so etcd is not hit all at once.""" + if not isinstance(self.test_config, dict): + return 0.0 + if str(self.test_config.get("test_mode", "")).strip() != TestMode.MPMC.value: + return 0.0 + expected_nodes_raw = self.test_config.get("expected_nodes") + try: + expected_nodes = int(expected_nodes_raw) + except (TypeError, ValueError): + return 0.0 + if expected_nodes <= MPMC_READY_INIT_STAGGER_MIN_EXPECTED_NODES: + return 0.0 + + max_stagger_s = min( + MPMC_READY_INIT_STAGGER_MAX_SECONDS, + ( + expected_nodes - MPMC_READY_INIT_STAGGER_MIN_EXPECTED_NODES + ) + * MPMC_READY_INIT_STAGGER_SECONDS_PER_EXTRA_NODE, + ) + key = self.instance_key or self.node_id + return self._stable_fraction_from_text(str(key)) * max_stagger_s + + def _sleep_for_runtime_init_stagger( + self, + *, + max_sleep_seconds: Optional[float] = None, + ) -> None: + stagger_s = self._runtime_init_stagger_seconds() + if max_sleep_seconds is not None: + stagger_s = min(stagger_s, max(0.0, float(max_sleep_seconds))) + if stagger_s <= 0.0: + 
return + logger.info( + "⏳ MPMC runtime init stagger: sleep_seconds=%.2f expected_nodes=%s instance_key=%s", + stagger_s, + self.test_config.get("expected_nodes") if isinstance(self.test_config, dict) else None, + self.instance_key, + ) + time.sleep(stagger_s) + + @staticmethod + def _is_retryable_runtime_init_error(error_msg: str) -> bool: + """Classify transient Fluxon runtime init failures seen during fan-out startup.""" + lowered = str(error_msg).lower() + retryable_markers = ( + "backendinitfailederror", + "failed to connect to etcd", + "connect etcd failed", + "etcd connection failed", + "failed to acquire etcd lock", + "deadline has elapsed", + "status probe timed out", + "timed out", + "connection refused", + "p2p timeout", + "payload lease keepalive", + ) + return any(marker in lowered for marker in retryable_markers) + + def _runtime_init_retry_deadline_seconds(self) -> float: + if not isinstance(self.test_config, dict): + return 0.0 + raw_timeout = self.test_config.get("cluster_ready_timeout_seconds") + try: + timeout_s = float(raw_timeout) + except (TypeError, ValueError): + return 0.0 + return max(0.0, timeout_s) + + def _runtime_init_retry_sleep_seconds(self, *, attempt: int) -> float: + base_s = min( + READY_RUNTIME_INIT_RETRY_MAX_SECONDS, + READY_RUNTIME_INIT_RETRY_BASE_SECONDS * (2 ** max(0, attempt - 1)), + ) + key = f"{self.instance_key or self.node_id}:{attempt}" + jitter_s = self._stable_fraction_from_text(key) * 0.5 + return base_s + jitter_s + + def _init_kv_store_with_ready_retry( + self, + kvcache_config: Dict[str, Any], + ) -> Tuple[Optional[Any], Optional[str]]: + """Initialize the KV client with bounded retry for transient fan-out failures.""" + deadline_s = self._runtime_init_retry_deadline_seconds() + deadline_ts = time.monotonic() + deadline_s if deadline_s > 0.0 else time.monotonic() + attempt = 0 + last_err: Optional[str] = None + + while True: + attempt += 1 + store, err = init_kv_store(kvcache_config) + if err is None: + if 
attempt > 1: + logger.info("✅ KVCache 存储实例重试创建成功: attempts=%s", attempt) + return store, None + + last_err = str(err) + if not self._is_retryable_runtime_init_error(last_err): + return None, last_err + + remaining_s = deadline_ts - time.monotonic() + if remaining_s <= 0.0: + return ( + None, + f"{last_err} (after {attempt} attempts within {deadline_s:.1f}s)", + ) + + sleep_s = min( + remaining_s, + self._runtime_init_retry_sleep_seconds(attempt=attempt), + ) + logger.warning( + "⚠️ KVCache 存储实例创建遇到瞬时错误,将重试: attempt=%s sleep_seconds=%.2f remaining_seconds=%.1f err=%s", + attempt, + sleep_s, + remaining_s, + last_err, + ) + time.sleep(sleep_s) + def _mark_progress(self, *, thread_id: int, op_idx: int, finish_ts: float, latency_us: float) -> None: with self._progress_lock: self._last_op_finish_ts = finish_ts @@ -2215,38 +2351,118 @@ def _close_kv_store(self, *, reason: str) -> None: close_res.unwrap() self._kv_store_closed = True + def _close_worker_owned_kv_store( + self, + kv_store: Any, + *, + reason: str, + thread_id: int, + ) -> None: + """Close a worker-owned KvClient created after START for MPMC producer.""" + if kv_store is None: + return + logger.info( + "🔒 Closing worker-owned kv_store: reason=%s thread_id=%s", + reason, + thread_id, + ) + ok, close_res = _call_with_timeout( + fn=kv_store.close, + timeout_s=RPC_CLOSE_TIMEOUT_SECONDS, + ctx=f"worker-owned kv_store.close reason={reason} thread_id={thread_id}", + ) + if not ok: + logger.warning( + "⚠️ worker-owned kv_store close timed out or raised: " + "reason=%s thread_id=%s err=%s", + reason, + thread_id, + close_res, + ) + return + if not close_res.is_ok(): + logger.warning( + "⚠️ worker-owned kv_store 关闭失败: reason=%s thread_id=%s err=%s", + reason, + thread_id, + close_res.unwrap_error(), + ) + return + close_res.unwrap() + def _prepare_mpmc_worker_runtime(self, *, thread_id: int) -> PreparedWorkerRuntime: """Prepare one worker-owned MPMC runtime before the benchmark window starts.""" - if self.kv_store is 
None: - raise RuntimeError("MPMC 模式下 KV store 未初始化") - node_role = self.test_config.get("node_role", "") - producer = None - consumer = None - if node_role == "producer": - producer, _, err = init_mq_channel( - role="producer", - kv_store=self.kv_store, - chan_config=self.chan_config, - unique_id=self.mq_unique_id, - weight=self.mq_state.weight if self.mq_state else 1.0, + test_mode = self.test_config.get("test_mode", "") + kv_store = self.kv_store + worker_owned_kv_store = None + if kv_store is None: + if not (test_mode == TestMode.MPMC.value and node_role == "producer"): + raise RuntimeError("MPMC 模式下 KV store 未初始化") + logger.info( + "🔧 线程 %s 正在创建 MPMC producer worker-owned KVCache 存储实例", + thread_id, ) - if err is not None: - raise RuntimeError(f"线程 {thread_id} 初始化 MPMC producer 失败: {err}") - if self.mq_state is not None and self.mq_state.producer_id is None: - self.mq_state.producer_id = self.instance_key or self.node_id - elif node_role == "consumer": - _, consumer, err = init_mq_channel( - role="consumer", - kv_store=self.kv_store, - chan_config=self.chan_config, - unique_id=self.mq_unique_id, - weight=self.mq_state.weight if self.mq_state else 1.0, + max_benchmark_seconds = float( + self.test_config.get("max_benchmark_seconds", 0.0) + ) + max_start_stagger_s = ( + max_benchmark_seconds / 2.0 + if max_benchmark_seconds > 0.0 + else None + ) + self._sleep_for_runtime_init_stagger( + max_sleep_seconds=max_start_stagger_s + ) + store, err = self._init_kv_store_with_ready_retry( + self.test_config["kvcache_config"] ) if err is not None: - raise RuntimeError(f"线程 {thread_id} 初始化 MPMC consumer 失败: {err}") - else: - raise RuntimeError(f"不支持的 MPMC 角色: {node_role}") + raise RuntimeError( + f"线程 {thread_id} 创建 MPMC producer worker-owned KV store 失败: {err}" + ) + kv_store = store + worker_owned_kv_store = store + self._attach_fluxon_phase_summary_callback(worker_owned_kv_store) + logger.info( + "✅ 线程 %s 已创建 MPMC producer worker-owned KVCache 存储实例", + thread_id, + ) + + 
producer = None + consumer = None + try: + if node_role == "producer": + producer, _, err = init_mq_channel( + role="producer", + kv_store=kv_store, + chan_config=self.chan_config, + unique_id=self.mq_unique_id, + weight=self.mq_state.weight if self.mq_state else 1.0, + ) + if err is not None: + raise RuntimeError(f"线程 {thread_id} 初始化 MPMC producer 失败: {err}") + if self.mq_state is not None and self.mq_state.producer_id is None: + self.mq_state.producer_id = self.instance_key or self.node_id + elif node_role == "consumer": + _, consumer, err = init_mq_channel( + role="consumer", + kv_store=kv_store, + chan_config=self.chan_config, + unique_id=self.mq_unique_id, + weight=self.mq_state.weight if self.mq_state else 1.0, + ) + if err is not None: + raise RuntimeError(f"线程 {thread_id} 初始化 MPMC consumer 失败: {err}") + else: + raise RuntimeError(f"不支持的 MPMC 角色: {node_role}") + except Exception: + self._close_worker_owned_kv_store( + worker_owned_kv_store, + reason="mpmc_endpoint_prepare_failed", + thread_id=thread_id, + ) + raise local_mq_state: Optional[MQState] = None if node_role == "producer": @@ -2261,9 +2477,60 @@ def _prepare_mpmc_worker_runtime(self, *, thread_id: int) -> PreparedWorkerRunti return PreparedWorkerRuntime( producer=producer, consumer=consumer, + kv_store=worker_owned_kv_store, local_mq_state=local_mq_state, ) + def _prepare_mpmc_worker_runtime_with_retry( + self, + *, + thread_id: int, + deadline_ts: float, + ctx: str, + ) -> PreparedWorkerRuntime: + attempt = 0 + last_err = "" + while True: + if self._benchmark_stop.is_set(): + raise RuntimeError( + f"MPMC endpoint prepare stopped before completion: ctx={ctx} thread_id={thread_id}" + ) + attempt += 1 + try: + runtime = self._prepare_mpmc_worker_runtime(thread_id=thread_id) + if attempt > 1: + logger.info( + "✅ MPMC endpoint prepare retry succeeded: ctx=%s thread_id=%s attempts=%s", + ctx, + thread_id, + attempt, + ) + return runtime + except Exception as exc: + last_err = str(exc) + if not 
self._is_retryable_runtime_init_error(last_err): + raise + remaining_s = deadline_ts - time.monotonic() + if remaining_s <= 0.0: + raise RuntimeError( + "MPMC endpoint prepare retry deadline exceeded: " + f"ctx={ctx} thread_id={thread_id} attempts={attempt} err={last_err}" + ) from exc + sleep_s = min( + remaining_s, + self._runtime_init_retry_sleep_seconds(attempt=attempt), + ) + logger.warning( + "⚠️ MPMC endpoint prepare 遇到瞬时错误,将重试: ctx=%s thread_id=%s attempt=%s sleep_seconds=%.2f remaining_seconds=%.1f err=%s", + ctx, + thread_id, + attempt, + sleep_s, + remaining_s, + last_err, + ) + time.sleep(sleep_s) + def _wait_mpmc_cluster_ready( self, *, @@ -2280,19 +2547,21 @@ def _wait_mpmc_cluster_ready( role = self.test_config.get("node_role", "") deadline_ts = time.time() + float(timeout_s) + last_wait_log_ts = 0.0 + last_wait_reason = "" while True: snapshot = get_cluster_info_snapshot(endpoint) ready_channels = snapshot.ready_channels total_mpsc_channels = snapshot.total_mpsc_channels active_consumers = snapshot.active_consumers - topology_ready = True - if total_mpsc_channels is not None and total_mpsc_channels < expected_workers: - topology_ready = False - if ready_channels is not None and ready_channels < expected_workers: - topology_ready = False - if role == "producer" and active_consumers is not None and active_consumers < 1: - topology_ready = False - if topology_ready: + readiness = evaluate_mpmc_topology_ready( + role=role, + expected_workers=expected_workers, + total_mpsc_channels=total_mpsc_channels, + ready_channels=ready_channels, + active_consumers=active_consumers, + ) + if readiness.ready: logger.info( "✅ MPMC topology ready: role=%s expected_workers=%s mpmc_id=%s total_mpsc_channels=%s ready_channels=%s active_consumers=%s", role, @@ -2303,13 +2572,28 @@ def _wait_mpmc_cluster_ready( active_consumers, ) return - if time.time() >= deadline_ts: + now = time.time() + if now >= deadline_ts: raise RuntimeError( "MPMC topology did not become ready 
before timeout: " f"role={role} expected_workers={expected_workers} " f"mpmc_id={snapshot.mpmc_id} total_mpsc_channels={total_mpsc_channels} " - f"ready_channels={ready_channels} active_consumers={active_consumers}" + f"ready_channels={ready_channels} active_consumers={active_consumers} " + f"reason={readiness.reason}" ) + if readiness.reason != last_wait_reason or now - last_wait_log_ts >= 10.0: + logger.info( + "⏳ MPMC topology waiting: role=%s expected_workers=%s mpmc_id=%s total_mpsc_channels=%s ready_channels=%s active_consumers=%s reason=%s", + role, + expected_workers, + snapshot.mpmc_id, + total_mpsc_channels, + ready_channels, + active_consumers, + readiness.reason, + ) + last_wait_log_ts = now + last_wait_reason = readiness.reason time.sleep(1.0) def _prepare_mpmc_round_before_ready(self, *, workers: int) -> None: @@ -2317,14 +2601,22 @@ def _prepare_mpmc_round_before_ready(self, *, workers: int) -> None: if self._prepared_mpmc_round is not None: raise RuntimeError("MPMC round is already prepared before READY") - round_state = PreparedMPMCRound() role = self.test_config.get("node_role") mode = self.test_config.get("test_mode") cluster_ready_timeout_s = float(self.test_config["cluster_ready_timeout_seconds"]) + if role not in {"producer", "consumer"}: + raise RuntimeError(f"不支持的 MPMC 角色: {role}") + + round_state = PreparedMPMCRound() + prepare_retry_deadline_ts = time.monotonic() + cluster_ready_timeout_s def worker_target(thread_id: int) -> None: try: - runtime = self._prepare_mpmc_worker_runtime(thread_id=thread_id) + runtime = self._prepare_mpmc_worker_runtime_with_retry( + thread_id=thread_id, + deadline_ts=prepare_retry_deadline_ts, + ctx="before_ready", + ) with round_state.prepared_lock: round_state.prepared_runtimes[thread_id] = runtime logger.info("✅ 线程 %s 已完成 MPMC endpoint prepare", thread_id) @@ -2360,9 +2652,19 @@ def worker_target(thread_id: int) -> None: round_state.pending_threads[thread_id] = thread if role == "producer": _debug_print( - 
f"worker {thread_id} wrapper start, role={role}, mode={mode}, prewarm_before_ready=true" + f"worker {thread_id} wrapper deferred until START, role={role}, mode={mode}" ) - thread.start() + else: + thread.start() + + if role == "producer": + self._prepared_mpmc_round = round_state + logger.info( + "✅ MPMC producer round registered before READY without starting runtime init: workers=%s role=%s", + workers, + role, + ) + return prepare_deadline_ts = time.time() + cluster_ready_timeout_s while True: @@ -2486,7 +2788,7 @@ def _collect_finished_mpmc_workers( finished_worker_ids = [ thread_id for thread_id, thread in pending_threads.items() - if not thread.is_alive() + if thread.ident is not None and not thread.is_alive() ] for thread_id in finished_worker_ids: pending_threads.pop(thread_id, None) @@ -2739,30 +3041,41 @@ def initialize_from_test_config(self) -> bool: logger.info(f"🚀 开始初始化节点,角色: {self.test_config['node_role']}") try: - # 1) Initialize KVCache store - kvcache_config = self.test_config["kvcache_config"] - logger.debug(f"🔧 KVCache配置: {kvcache_config}") - logger.info("🔧 正在创建KVCache存储实例...") - # KV store initialization is needed only once. A previous merge caused duplicate calls, - # leading to repeated cluster member registration. 
- store, err = init_kv_store(kvcache_config) - if err is not None: - logger.error(f"❌ KVCache存储实例创建失败: {err}") - return False - self.kv_store = store - self._attach_fluxon_phase_summary_callback(self.kv_store) - logger.info("✅ KVCache存储实例创建成功") + test_mode = self.test_config.get("test_mode", "KVSTORE") + node_role = self.test_config["node_role"] + defer_shared_kv_store = ( + test_mode == TestMode.MPMC.value and node_role == "producer" + ) + if defer_shared_kv_store: + self.kv_store = None + logger.info( + "⏭️ MPMC producer skips shared KVCache init before READY; " + "worker-owned runtime will be initialized after START" + ) + else: + # 1) Initialize KVCache store + kvcache_config = self.test_config["kvcache_config"] + logger.debug(f"🔧 KVCache配置: {kvcache_config}") + logger.info("🔧 正在创建KVCache存储实例...") + self._sleep_for_runtime_init_stagger() + # KV store initialization is needed only once. A previous merge caused duplicate calls, + # leading to repeated cluster member registration. + store, err = self._init_kv_store_with_ready_retry(kvcache_config) + if err is not None: + logger.error(f"❌ KVCache存储实例创建失败: {err}") + return False + self.kv_store = store + self._attach_fluxon_phase_summary_callback(self.kv_store) + logger.info("✅ KVCache存储实例创建成功") # 2) Initialize MPMC components based on test mode - test_mode = self.test_config.get("test_mode", "KVSTORE") if test_mode == TestMode.MPMC.value: logger.info("🔧 MPMC模式,初始化 MPMC 相关配置(每线程独立实例)...") - node_role = (self.mq_state.role or self.test_config["node_role"]) if self.mq_state else self.test_config["node_role"] + node_role = (self.mq_state.role or node_role) if self.mq_state else node_role # Do not create Producer/Consumer instances here; each worker thread initializes them in _run_worker_thread. 
else: logger.info("🔧 KVSTORE/RPC模式,只使用KVCache存储") - node_role = self.test_config["node_role"] if node_role not in [KV_NODE_ROLE_SEED, KV_NODE_ROLE_WORKER]: logger.error( f"❌ KVSTORE/RPC模式下不支持的角色: {node_role},只支持 {KV_NODE_ROLE_SEED} 和 {KV_NODE_ROLE_WORKER}" @@ -3415,16 +3728,31 @@ def wait_for_start(self) -> bool: return False if self.test_config.get("test_mode") == TestMode.MPMC.value: prepared_round = self._prepared_mpmc_round - if prepared_round is None: - logger.error("❌ MPMC START 收到覆盖配置,但 READY 之前没有 prepared round") - return False - prepared_workers = len(prepared_round.pending_threads) - if prepared_workers != int(self.test_config["threads_per_process"]): - logger.error( - "❌ START overrides changed MPMC threads_per_process after READY: prepared=%s start_override=%s", - prepared_workers, - self.test_config["threads_per_process"], - ) + node_role = str(self.test_config.get("node_role", "")).strip() + if node_role == "consumer": + if prepared_round is None: + logger.error("❌ MPMC consumer START 收到覆盖配置,但 READY 之前没有 prepared round") + return False + prepared_workers = len(prepared_round.pending_threads) + if prepared_workers != int(self.test_config["threads_per_process"]): + logger.error( + "❌ START overrides changed MPMC threads_per_process after READY: prepared=%s start_override=%s", + prepared_workers, + self.test_config["threads_per_process"], + ) + return False + elif node_role == "producer": + if prepared_round is not None: + prepared_workers = len(prepared_round.pending_threads) + if prepared_workers != int(self.test_config["threads_per_process"]): + logger.error( + "❌ START overrides changed MPMC producer threads_per_process after READY: prepared=%s start_override=%s", + prepared_workers, + self.test_config["threads_per_process"], + ) + return False + else: + logger.error("❌ MPMC START 收到不支持的 node_role: %s", node_role) return False logger.info( @@ -3441,7 +3769,11 @@ def wait_for_start(self) -> bool: self.has_more_tests = bool(resp.get("has_more_tests", 
False)) if self.test_config and self.test_config.get("test_mode") == TestMode.MPMC.value: - logger.info("✅ 收到开始信号,MPMC round 已完成 prewarm,立即进入 benchmark") + node_role = self.test_config.get("node_role") + if node_role == "producer": + logger.info("✅ 收到开始信号,MPMC producer 使用非阻塞 prewarm round 进入 benchmark") + else: + logger.info("✅ 收到开始信号,MPMC round 已完成 prewarm,立即进入 benchmark") else: start_idle_seconds = float(self.test_config.get("start_idle_seconds", 10.0)) logger.info( @@ -3849,6 +4181,11 @@ def _run_worker_thread( finally: self._close_thread_owned_mq_endpoint(producer, role="producer", thread_id=thread_id) self._close_thread_owned_mq_endpoint(consumer, role="consumer", thread_id=thread_id) + self._close_worker_owned_kv_store( + prepared_runtime.kv_store, + reason="mpmc_worker_exit", + thread_id=thread_id, + ) _debug_print( f"thread {thread_id} exit run loop, total_ops={len(results)}, " f"last_op_idx={op_idx}" @@ -4051,6 +4388,71 @@ def _run_mpmc_workers(self, *, workers: int, deadline_ts: float) -> None: stop_requested = False stop_grace_deadline_ts: Optional[float] = None role = self.test_config.get("node_role") + + deferred_thread_ids = [ + thread_id + for thread_id, thread in sorted(pending_threads.items()) + if thread.ident is None + ] + for thread_id in deferred_thread_ids: + thread = pending_threads[thread_id] + try: + logger.info( + "▶️ START 后启动 deferred MPMC worker: role=%s thread_id=%s", + role, + thread_id, + ) + thread.start() + except Exception as exc: + logger.error( + "❌ START 后启动 deferred MPMC worker 失败: role=%s thread_id=%s err=%s", + role, + thread_id, + exc, + ) + with round_state.prepared_lock: + round_state.prepare_errors[thread_id] = str(exc) + + if role == "producer" and deferred_thread_ids: + prepare_deadline_ts = time.monotonic() + float( + self.test_config["cluster_ready_timeout_seconds"] + ) + while True: + with round_state.prepared_lock: + prepared_count = len(round_state.prepared_runtimes) + prepare_error_snapshot = 
dict(round_state.prepare_errors) + if prepare_error_snapshot: + self._set_forced_benchmark_result( + reason="mpmc_worker_prepare_failed", + total_workers=workers, + completed_workers=0, + timed_out_worker_ids=sorted(prepare_error_snapshot), + ) + self._benchmark_stop.set() + round_state.start_event.set() + return + if prepared_count == workers: + logger.info( + "✅ MPMC producer workers prepared after START: prepared=%s/%s", + prepared_count, + workers, + ) + break + if time.monotonic() >= prepare_deadline_ts: + missing_worker_ids = sorted( + set(range(workers)) - set(round_state.prepared_runtimes) + ) + self._set_forced_benchmark_result( + reason="mpmc_worker_prepare_timeout", + total_workers=workers, + completed_workers=prepared_count, + timed_out_worker_ids=missing_worker_ids, + ) + self._benchmark_stop.set() + round_state.start_event.set() + return + time.sleep(0.2) + self.start_time = time.time() self.end_time = self.start_time + int(self.test_config["max_benchmark_seconds"]) deadline_ts = self.end_time @@ -4132,11 +4534,18 @@ def run_benchmark(self) -> Dict[str, Any]: This prevents the main flow from hanging due to blocked worker threads. 
""" - if not self.test_config or not self.kv_store: - logger.error("❌ 无法运行基准测试:配置或存储实例未初始化") + if not self.test_config: + logger.error("❌ 无法运行基准测试:配置未初始化") return {} test_mode = str(self.test_config.get("test_mode", TestMode.KVSTORE.value)) + node_role = str(self.test_config.get("node_role", "")) + if self.kv_store is None and not ( + test_mode == TestMode.MPMC.value and node_role == "producer" + ): + logger.error("❌ 无法运行基准测试:存储实例未初始化") + return {} + threads_per_process = int(self.test_config["threads_per_process"]) logger.info("🚀 开始基准测试") diff --git a/fluxon_test_stack/mpmc_readiness.py b/fluxon_test_stack/mpmc_readiness.py new file mode 100644 index 0000000..983b817 --- /dev/null +++ b/fluxon_test_stack/mpmc_readiness.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional + + +@dataclass(frozen=True) +class MPMCTopologyReadiness: + """Result of a pre-READY MPMC topology check.""" + + ready: bool + reason: str + + +def evaluate_mpmc_topology_ready( + *, + role: str, + expected_workers: int, + total_mpsc_channels: Optional[int], + ready_channels: Optional[int], + active_consumers: Optional[int], +) -> MPMCTopologyReadiness: + """Evaluate whether an MPMC node can report READY to the coordinator.""" + if expected_workers <= 0: + raise ValueError(f"expected_workers must be > 0, got {expected_workers}") + + if total_mpsc_channels is not None and total_mpsc_channels < expected_workers: + return MPMCTopologyReadiness( + ready=False, + reason=( + "total_mpsc_channels below expected_workers: " + f"{total_mpsc_channels} < {expected_workers}" + ), + ) + + normalized_role = (role or "").strip().lower() + if normalized_role == "producer": + if ready_channels is not None and ready_channels < expected_workers: + return MPMCTopologyReadiness( + ready=False, + reason=( + "ready_channels below expected_workers: " + f"{ready_channels} < {expected_workers}" + ), + ) + if active_consumers is not None and 
active_consumers < 1: + return MPMCTopologyReadiness( + ready=False, + reason=f"active_consumers below 1: {active_consumers}", + ) + elif normalized_role == "consumer": + # A consumer may be the channel creator. Requiring ready_channels before that + # consumer reports READY can deadlock a one-worker process at the coordinator + # barrier, while producers still wait for consumers before START is released. + pass + else: + return MPMCTopologyReadiness( + ready=False, + reason=f"unsupported MPMC role: {role!r}", + ) + + return MPMCTopologyReadiness(ready=True, reason="ready") diff --git a/fluxon_test_stack/test_runner.py b/fluxon_test_stack/test_runner.py index ac085ed..58657a4 100644 --- a/fluxon_test_stack/test_runner.py +++ b/fluxon_test_stack/test_runner.py @@ -201,6 +201,7 @@ } CI_RUNTIME_INSTANCE_IDS = CI_CASE_RUNTIME_INSTANCE_IDS CONTROLLER_STATUS_TRANSIENT_HTTP_CODES = (502, 503, 504) +CONTROLLER_REQUEST_MODE_DIRECT = "direct" CONTROLLER_REQUEST_MODE_SSH_EXEC_PER_REQUEST = "ssh_exec_per_request" # Controller requests during TEST_STACK teardown can fan out to many remote nodes and are prone to # short SSH/control-plane stalls. Keep each attempt bounded, but allow a wider retry window so @@ -260,7 +261,6 @@ class _HttpGetJsonTransientError(RuntimeError): TEST_STACK_REDIS_INSTANCE_ID_PREFIX = "redis_node_" TEST_STACK_ALLUXIO_INSTANCE_ID_PREFIX = "alluxio_node_" TEST_STACK_MOONCAKE_MASTER_INSTANCE_ID = "mooncake_master" -TEST_STACK_BENCHMARK_FIXED_THREADS_PER_PROCESS = 4 # Owner-mode Fluxon KV configs in CI / TEST_STACK share the same canonical label. 
FLUXON_KV_OWNER_SUB_CLUSTER = "owner" @@ -1648,6 +1648,11 @@ def _test_bed_manifest_transport_ctx_opt() -> Optional[Dict[str, Any]]: if manifest_info is None: return None manifest_path, manifest = manifest_info + mode_raw = manifest.get("controller_request_mode") + if mode_raw is not None: + mode = _require_str(mode_raw, f"test bed manifest {manifest_path}.controller_request_mode") + if mode == CONTROLLER_REQUEST_MODE_DIRECT: + return None bastion = _require_dict(manifest.get("bastion"), f"test bed manifest {manifest_path}.bastion") bastion_user_raw = manifest.get("bastion_user") bastion_private_key_raw = manifest.get("bastion_private_key") @@ -2187,6 +2192,7 @@ def _build_runtime_token_mapping( stack_identity: Dict[str, Any], extra_tokens: Optional[Dict[str, str]] = None, ) -> Dict[str, str]: + test_bed_bundle_root = _test_bed_bundle_root_opt() mapping = { "__WORKDIR_ROOT__": workdir_root, "__RUN_DIR__": run_dir, @@ -2207,6 +2213,8 @@ def _build_runtime_token_mapping( "stack_identity.share_mem_path", ), } + if test_bed_bundle_root is not None: + mapping["__TEST_BED_BUNDLE_ROOT__"] = str(test_bed_bundle_root) if extra_tokens is not None: for token_name, token_value in extra_tokens.items(): mapping[f"__{token_name}__"] = token_value @@ -3869,12 +3877,40 @@ def _acquire_ui_service_lock(*, workdir_root: Path) -> Any: ) -def _acquire_ci_lock() -> Any: +def _ci_lock_name_for_ops_cluster_name(ops_cluster_name: str) -> str: + return "bench_ci__" + hashlib.sha256(ops_cluster_name.encode("utf-8")).hexdigest()[:16] + ".lock" + + +def _acquire_ci_lock(*, resolved_case: Optional[Dict[str, Any]] = None) -> Any: + owner_lines = [f"pid={os.getpid()}", f"repo_root={_runner_repo_root()}"] + busy_message = "another CI run is active (lock busy)" lock_path = _bench_lock_dir() / "bench_ci.lock" + if resolved_case is not None: + runtime = _require_dict(resolved_case.get("runtime"), "resolved_case.runtime") + stack_identity = _require_dict( + runtime.get("stack_identity"), + 
"resolved_case.runtime.stack_identity", + ) + ops_cluster_name = _require_str( + stack_identity.get("ops_cluster_name"), + "resolved_case.runtime.stack_identity.ops_cluster_name", + ) + controller_url = _require_str( + stack_identity.get("controller_url"), + "resolved_case.runtime.stack_identity.controller_url", + ) + lock_path = _bench_lock_dir() / _ci_lock_name_for_ops_cluster_name(ops_cluster_name) + owner_lines.extend( + [ + f"ops_cluster_name={ops_cluster_name}", + f"controller_url={controller_url}", + ] + ) + busy_message = f"another CI run is active for ops_cluster_name={ops_cluster_name} (lock busy)" return _acquire_named_lock( lock_path=lock_path, - owner_lines=[f"pid={os.getpid()}", f"repo_root={_runner_repo_root()}"], - busy_message="another CI run is active (lock busy)", + owner_lines=owner_lines, + busy_message=busy_message, ) @@ -5718,11 +5754,6 @@ def _parse_scale(item: Dict[str, Any], ctx: str) -> Dict[str, Any]: f"{ctx}.benchmark.threads_per_process", min_v=1, ) - if int(threads_per_process) != TEST_STACK_BENCHMARK_FIXED_THREADS_PER_PROCESS: - raise ValueError( - f"{ctx}.benchmark.threads_per_process must be fixed to " - f"{TEST_STACK_BENCHMARK_FIXED_THREADS_PER_PROCESS}" - ) owner_group_processes = bench.get("owner_group_processes") if owner_group_processes is not None: _ = _require_int(owner_group_processes, f"{ctx}.benchmark.owner_group_processes", min_v=1) @@ -7899,6 +7930,7 @@ def _test_stack_runtime_role_for_scene_role(*, scene_mode: str, role: str) -> st def _test_stack_scene_uses_per_target_process_fanout(*, scene_mode: str) -> bool: return scene_mode in ( + TEST_STACK_MODE_MPMC, TEST_STACK_MODE_KVSTORE, TEST_STACK_MODE_KVSTORE_WITH_LOCAL_CACHE, TEST_STACK_MODE_RPC, @@ -8886,11 +8918,6 @@ def _compile_test_stack_case(resolved_case: Dict[str, Any], *, run_index: int) - "scale.benchmark.threads_per_process", min_v=1, ) - if int(threads_per_process) != TEST_STACK_BENCHMARK_FIXED_THREADS_PER_PROCESS: - raise ValueError( - 
"scale.benchmark.threads_per_process must be fixed to " - f"{TEST_STACK_BENCHMARK_FIXED_THREADS_PER_PROCESS}" - ) value_size = _require_int(bench.get("value_size"), "scale.benchmark.value_size", min_v=0) warmup = bench.get("metric_warmup_seconds") if not isinstance(warmup, (int, float)): @@ -10242,11 +10269,11 @@ def _load_test_stack_cluster_nodes_and_dispatch(resolved_case: Dict[str, Any]) - target_ip_map = deploy.get("target_ip_map") if target_ip_map is not None: target_ip_map_d = _require_dict(target_ip_map, "resolved_case.deploy.target_ip_map") - cluster_nodes_by_ip: Dict[str, List[Dict[str, Any]]] = {} + cluster_nodes_by_ip: Dict[str, List[Tuple[str, Dict[str, Any]]]] = {} for hostname, raw_node_cfg in cluster_nodes.items(): node_cfg = _require_dict(raw_node_cfg, f"cluster_nodes[{hostname}]") node_ip = _require_str(node_cfg.get("ip"), f"cluster_nodes[{hostname}].ip") - cluster_nodes_by_ip.setdefault(node_ip, []).append(node_cfg) + cluster_nodes_by_ip.setdefault(node_ip, []).append((hostname, node_cfg)) cluster_nodes = dict(cluster_nodes) for raw_target, raw_ip in target_ip_map_d.items(): target = _require_str(raw_target, "resolved_case.deploy.target_ip_map key") @@ -10256,12 +10283,36 @@ def _load_test_stack_cluster_nodes_and_dispatch(resolved_case: Dict[str, Any]) - matches = cluster_nodes_by_ip.get(node_ip, []) if not matches: continue - if len(matches) > 1: + same_ip_targets = [ + _require_str(candidate, "resolved_case.deploy.target_ip_map key") + for candidate, candidate_ip in target_ip_map_d.items() + if _require_str( + candidate_ip, + f"resolved_case.deploy.target_ip_map[{candidate!r}]", + ) + == node_ip + ] + preferred = _controller_target_for_target( + target, + target_ip_map={ + candidate: node_ip + for candidate in same_ip_targets + }, + ) + selected: Optional[Dict[str, Any]] = None + for hostname, node_cfg in matches: + if hostname == preferred: + selected = node_cfg + break + if selected is None and len(matches) == 1: + selected = matches[0][1] + 
if selected is None: + available = sorted(hostname for hostname, _node_cfg in matches) raise ValueError( - "resolved_case.deploy.target_ip_map alias expansion is ambiguous: " - f"target={target!r} ip={node_ip!r} matches={len(matches)}" + "resolved_case.deploy.target_ip_map alias expansion has no canonical active node: " + f"target={target!r} ip={node_ip!r} preferred={preferred!r} active_matches={available}" ) - cluster_nodes[target] = copy.deepcopy(matches[0]) + cluster_nodes[target] = copy.deepcopy(selected) return cluster_nodes, start_test_bed_mod.manual_dispatch_release @@ -10974,6 +11025,14 @@ def _cluster_node_ssh_host(node_cfg: Dict[str, Any], *, target_name: str) -> str return _require_str(ssh_host, f"cluster_nodes[{target_name}].ssh_host") +def _cluster_node_execution_mode(node_cfg: Dict[str, Any], *, target_name: str) -> str: + raw = node_cfg.get("execution_mode", "ssh") + if raw is None: + return "ssh" + mode = _require_str(raw, f"cluster_nodes[{target_name}].execution_mode").strip() + return mode or "ssh" + + def _sync_run_dir_archive_to_remote_target( *, archive_path: Path, @@ -13916,10 +13975,17 @@ def _write_ci_prepare_env_script(*, run_dir: Path, exports: Dict[str, str]) -> P def _run_ci_prepare_steps(*, resolved_case: Dict[str, Any], run_dir: Path, src_root: Path) -> Dict[str, str]: prepare_steps = _resolved_ci_prepare_steps(resolved_case) + exports: Dict[str, str] = {} + test_bed_start_cfg = _load_test_bed_bootstrap_config_path() + if _load_test_bed_manifest_opt() is not None: + exports[TEST_STACK_START_TEST_BED_CONFIG_ENV] = str(test_bed_start_cfg) + local_release_root = _local_release_root_override_opt() + if local_release_root is not None: + exports[_LOCAL_RELEASE_CACHE_ROOT_OVERRIDE_ENV] = str(local_release_root) + if not prepare_steps: - return {} + return exports - exports: Dict[str, str] = {} for index, step in enumerate(prepare_steps): kind = _require_str(step.get("kind"), f"resolved_case.scene.ci.prepare[{index}].kind") if kind == 
CI_PREPARE_KIND_SETUP_DEV_ENV: @@ -14305,6 +14371,20 @@ def _instance_remote_target_access_opt( def _run_remote_bash_capture( *, target_name: str, node_cfg: Dict[str, Any], remote_cmd: str ) -> str: + if _cluster_node_execution_mode(node_cfg, target_name=target_name) == "local": + completed = subprocess.run( + remote_cmd, + shell=True, + executable="/bin/bash", + capture_output=True, + text=True, + ) + if completed.returncode != 0: + raise RuntimeError( + f"local bash capture {target_name} failed rc={completed.returncode}\n" + f"cmd={remote_cmd}\nstdout={completed.stdout}\nstderr={completed.stderr}" + ) + return completed.stdout transport_ctx = _test_bed_manifest_transport_ctx_opt() if transport_ctx is not None and target_name == str(transport_ctx["bastion_name"]): completed = _run_remote_bash_via_bastion_transport( @@ -14366,6 +14446,27 @@ def _run_remote_bash_capture( def _run_remote_bash( *, target_name: str, node_cfg: Dict[str, Any], remote_cmd: str ) -> None: + if _cluster_node_execution_mode(node_cfg, target_name=target_name) == "local": + print("RUN:", remote_cmd, flush=True) + completed = subprocess.run( + remote_cmd, + shell=True, + executable="/bin/bash", + capture_output=True, + text=True, + ) + if completed.stdout: + sys.stdout.write(completed.stdout) + sys.stdout.flush() + if completed.stderr: + sys.stderr.write(_clean_ssh_stderr_text(completed.stderr)) + sys.stderr.flush() + if completed.returncode != 0: + raise RuntimeError( + f"local bash {target_name} failed rc={completed.returncode}\n" + f"cmd={remote_cmd}" + ) + return transport_ctx = _test_bed_manifest_transport_ctx_opt() if transport_ctx is not None and target_name == str(transport_ctx["bastion_name"]): _run_remote_bash_via_bastion_transport( diff --git a/fluxon_test_stack/test_runner_runtime_backend.py b/fluxon_test_stack/test_runner_runtime_backend.py index 30d1191..f764703 100644 --- a/fluxon_test_stack/test_runner_runtime_backend.py +++ b/fluxon_test_stack/test_runner_runtime_backend.py @@ 
-19,7 +19,7 @@ def _prepare_ci_case( ) -> Any: _ = ctx._require_dict(resolved_case.get("deploy"), "resolved_case.deploy") ci_checkout_root = ctx._runner_repo_root() - runtime_tracking.ci_lock_fp = ctx._acquire_ci_lock() + runtime_tracking.ci_lock_fp = ctx._acquire_ci_lock(resolved_case=resolved_case) ctx._ensure_deployer_online(resolved_case) out_cluster_name = ctx._ci_cluster_name(resolved_case) diff --git a/fluxon_test_stack/tests/test_benchmark_node_kv_contract.py b/fluxon_test_stack/tests/test_benchmark_node_kv_contract.py new file mode 100644 index 0000000..274acb4 --- /dev/null +++ b/fluxon_test_stack/tests/test_benchmark_node_kv_contract.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import sys +import unittest +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] +TEST_STACK_DIR = REPO_ROOT / "fluxon_test_stack" +if str(TEST_STACK_DIR) not in sys.path: + sys.path.insert(0, str(TEST_STACK_DIR)) +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from fluxon_py.api_error import OkNone, Result +from fluxon_py.kvclient.kvclient_interface import KvLeaseApi +from benchmark_node_kv import FluxonBlockingStore + + +class _FakeFluxonStore: + def __init__(self) -> None: + self._client = object() + self.zero_contribution_checked = False + self.calls: list[tuple[str, tuple[object, ...], dict[str, object]]] = [] + + def _record(self, name: str, *args: object, **kwargs: object) -> str: + self.calls.append((name, args, kwargs)) + return f"{name}-result" + + def put(self, *args: object, **kwargs: object) -> str: + return self._record("put", *args, **kwargs) + + def get(self, *args: object, **kwargs: object) -> str: + return self._record("get", *args, **kwargs) + + def get_size(self, *args: object, **kwargs: object) -> str: + return self._record("get_size", *args, **kwargs) + + def is_exist(self, *args: object, **kwargs: object) -> str: + return self._record("is_exist", *args, **kwargs) + 
+ def remove(self, *args: object, **kwargs: object) -> str: + return self._record("remove", *args, **kwargs) + + def sync_kv_to_file(self, *args: object, **kwargs: object) -> str: + return self._record("sync_kv_to_file", *args, **kwargs) + + def instance_key(self) -> Result[str, object]: + return Result.new_ok("bench-instance") + + def config(self) -> str: + return "bench-config" + + def get_cluster_name(self) -> str: + return "fluxon_benchmark" + + def get_etcd_config(self) -> list[str]: + return ["127.0.0.1:2379"] + + def third_party_logs_dir(self) -> Result[str, object]: + return Result.new_ok("/tmp/fluxon-logs") + + def ensure_zero_contribution_for_channel(self) -> None: + self.zero_contribution_checked = True + + def count_prefix(self, prefix: str) -> Result[int, object]: + self.calls.append(("count_prefix", (prefix,), {})) + return Result.new_ok(3) + + def allocate_lease(self, ttl_seconds: int) -> Result[int, object]: + self.calls.append(("allocate_lease", (ttl_seconds,), {})) + return Result.new_ok(42) + + def keepalive_lease(self, lease_id: int) -> Result[OkNone, object]: + self.calls.append(("keepalive_lease", (lease_id,), {})) + return Result.new_ok(OkNone()) + + def close(self) -> Result[OkNone, object]: + return Result.new_ok(OkNone()) + + +class TestBenchmarkNodeKvContract(unittest.TestCase): + def test_fluxon_blocking_store_exposes_channel_backend_contract(self) -> None: + raw_store = _FakeFluxonStore() + store = FluxonBlockingStore(raw_store) # type: ignore[arg-type] + + self.assertIsInstance(store, KvLeaseApi) + self.assertIs(store._client, raw_store._client) + self.assertEqual(store.get_etcd_config(), ["127.0.0.1:2379"]) + self.assertEqual(store.get_cluster_name(), "fluxon_benchmark") + self.assertEqual(store.config(), "bench-config") + + store.ensure_zero_contribution_for_channel() + self.assertTrue(raw_store.zero_contribution_checked) + + self.assertEqual(store.count_prefix("/mpmc/1/").unwrap(), 3) + 
self.assertEqual(store.allocate_lease(90).unwrap(), 42) + self.assertIsInstance(store.keepalive_lease(42).unwrap(), OkNone) + + self.assertEqual(store.get("k"), "get-result") + self.assertEqual(store.remove("k"), "remove-result") + self.assertEqual( + store.sync_kv_to_file("k", "node-a", "/tmp/out", 7, "payload", timeout_ms=10000), + "sync_kv_to_file-result", + ) + self.assertEqual( + raw_store.calls[-3:], + [ + ("get", ("k",), {}), + ("remove", ("k",), {}), + ( + "sync_kv_to_file", + ("k", "node-a", "/tmp/out", 7, "payload"), + {"timeout_ms": 10000}, + ), + ], + ) + + +if __name__ == "__main__": + raise SystemExit(unittest.main()) diff --git a/fluxon_test_stack/tests/test_ci_2_virt_node_contract.py b/fluxon_test_stack/tests/test_ci_2_virt_node_contract.py index 1974927..9fa160b 100644 --- a/fluxon_test_stack/tests/test_ci_2_virt_node_contract.py +++ b/fluxon_test_stack/tests/test_ci_2_virt_node_contract.py @@ -3,6 +3,7 @@ from __future__ import annotations import importlib.util +import json import sys import tempfile import unittest @@ -94,6 +95,11 @@ def test_generated_suite_is_public_dual_local_nodes_ci_only(self) -> None: ]["greptime"]["endpoint"]["host_port"], 19190, ) + test_stack_ports = generated["profiles"]["fluxon_tcp_thread"]["runtime"]["test_stack"]["port_alloc"][ + "by_topology" + ] + self.assertEqual(test_stack_ports[1]["coordinator_port_base"], 20180) + self.assertEqual(test_stack_ports[2]["coordinator_port_base"], 20280) self.assertEqual( generated["artifact_sets"]["fluxon_tcp_thread"]["release_source"]["key_prefix"], "profiles/fluxon_tcp_thread", @@ -352,6 +358,8 @@ def test_generated_deployconf_rewrites_to_dual_local_nodes(self) -> None: generated["service"]["owner"]["entrypoint"], ) self.assertEqual(generated["service"]["ops_controller"]["port"], 19180) + self.assertEqual(generated["namespace"], "fluxon_testbed") + self.assertEqual(generated["global_envs"]["FLUXON_CLUSTER_NAME"], "fluxon_testbed") self.assertIn( 'http_listen_addr: 
"0.0.0.0:${OPS_CONTROLLER__PORT}"', generated["service"]["ops_controller"]["entrypoint"], @@ -384,6 +392,38 @@ def test_generated_start_test_bed_config_points_to_local_authorities(self) -> No self.assertIsNone(generated["test_runner_ui"]["gitops_config_path"]) self.assertEqual(generated["bootstrap_phases"][0]["node"], "local-node-a") + def test_generated_local_testbed_supports_explicit_ops_cluster_name(self) -> None: + deployconf_cfg = _ENTRY._load_yaml_mapping(_ENTRY.DEFAULT_DEPLOYCONF_TEMPLATE, ctx="deployconf") + deployconf = _ENTRY._rewrite_deployconf_for_local_dual_nodes( + deployconf_cfg=deployconf_cfg, + primary_node_name="local-node-a", + secondary_node_name="local-node-b", + host_ip="10.1.1.119", + primary_hostworkdir=Path("/tmp/fluxon_testbed/a"), + secondary_hostworkdir=Path("/tmp/fluxon_testbed/b"), + wheel_name="fluxon-0.2.1-cp38-abi3-manylinux_2_28_x86_64.whl", + controller_port=19180, + testbed_ops_cluster_name="fluxon_testbed_mq_large_local", + ) + start_cfg = _ENTRY._load_yaml_mapping(_ENTRY.DEFAULT_START_TEST_BED_TEMPLATE, ctx="start_test_bed") + start = _ENTRY._rewrite_start_test_bed_for_local_dual_nodes( + start_cfg=start_cfg, + generated_deployconf_path=Path("/tmp/deployconf.yaml"), + primary_node_name="local-node-a", + controller_access_ip="10.1.1.119", + controller_port=19180, + ui_port=18080, + ui_workdir=Path("/tmp/ui"), + testbed_ops_cluster_name="fluxon_testbed_mq_large_local", + ) + + self.assertEqual(deployconf["namespace"], "fluxon_testbed_mq_large_local") + self.assertEqual(deployconf["global_envs"]["FLUXON_CLUSTER_NAME"], "fluxon_testbed_mq_large_local") + self.assertEqual( + start["controller_url"], + "http://10.1.1.119:19180/r/ops/fluxon_testbed_mq_large_local", + ) + def test_generated_apply_check_config_excludes_control_plane_reapply(self) -> None: start_cfg = _ENTRY._load_yaml_mapping(_ENTRY.DEFAULT_START_TEST_BED_TEMPLATE, ctx="start_test_bed") local_cfg = _ENTRY._rewrite_start_test_bed_for_local_dual_nodes( @@ -405,6 +445,132 
@@ def test_generated_apply_check_config_excludes_control_plane_reapply(self) -> No ["fluxon_fs_master", "fluxon_fs_agent"], ) + def test_write_ci_testbed_bundle_is_run_local_and_relocatable(self) -> None: + with tempfile.TemporaryDirectory() as td: + root = Path(td) + bundle_root = root / "runner_run" / "testbed_bundle" + artifacts_source = root / "release" / "test_rsc" + artifacts_source.mkdir(parents=True) + (artifacts_source / "prepare.yaml").write_text("schema_version: 1\n", encoding="utf-8") + deployconf = { + "gen_k8s_daemonset_mirror_outdir": "/tmp/old-mirror", + "cluster_nodes": [ + { + "hostname": "runner-a", + "ip": "10.1.1.119", + "hostworkdir": "/tmp/runner/a", + "execution_mode": "local", + } + ], + "global_envs": {"FLUXON_CLUSTER_NAME": "fluxon_testbed"}, + } + start_cfg = {"schema_version": 6, "deployconf_path": "/tmp/old-deployconf.yaml"} + apply_cfg = {"schema_version": 6, "deployconf_path": "/tmp/old-deployconf.yaml"} + + paths = _ENTRY._write_ci_testbed_bundle( + bundle_root=bundle_root, + deployconf=deployconf, + start_cfg=start_cfg, + apply_check_start_cfg=apply_cfg, + artifacts_source_root=artifacts_source, + ) + + self.assertEqual(paths["bundle_root"], bundle_root.resolve()) + manifest = json.loads((bundle_root / "manifest.json").read_text(encoding="utf-8")) + self.assertEqual( + manifest, + { + "bootstrap_mode": "apply_only", + "controller_request_mode": "direct", + "deployconf_path": "deployconf_testbed.local.yaml", + "ssh_config_path": "ssh_config", + "start_config_path": "start_test_bed.runner.yaml", + "workdir": "bootstrap_workdir", + }, + ) + self.assertTrue((bundle_root / "bootstrap_workdir").is_dir()) + self.assertTrue((bundle_root / "gen_k8s_daemonset").is_dir()) + bundled_deployconf = _ENTRY._load_yaml_mapping( + bundle_root / "deployconf_testbed.local.yaml", + ctx="bundle deployconf", + ) + self.assertEqual( + bundled_deployconf["gen_k8s_daemonset_mirror_outdir"], + str((bundle_root / "gen_k8s_daemonset").resolve()), + ) + 
bundled_start = _ENTRY._load_yaml_mapping( + bundle_root / "start_test_bed.runner.yaml", + ctx="bundle start", + ) + self.assertEqual(bundled_start["deployconf_path"], "./deployconf_testbed.local.yaml") + self.assertEqual((bundle_root / "artifacts" / "prepare.yaml").resolve(), (artifacts_source / "prepare.yaml").resolve()) + + def test_refresh_testbed_bundle_deployconf_uses_normalized_start_output(self) -> None: + with tempfile.TemporaryDirectory() as td: + root = Path(td) + bundle_root = root / "runner_run" / "testbed_bundle" + start_workdir = root / "start_test_bed" / "bare" + deployconf = { + "gen_k8s_daemonset_mirror_outdir": "/tmp/old-mirror", + "cluster_nodes": [], + "service": { + "etcd": { + "port": 33579, + "entrypoint": "etcd --listen-client-urls http://0.0.0.0:33579", + }, + "greptime": { + "port": 35030, + }, + }, + } + start_cfg = {"schema_version": 6, "deployconf_path": "/tmp/old-deployconf.yaml"} + paths = _ENTRY._write_ci_testbed_bundle( + bundle_root=bundle_root, + deployconf=deployconf, + start_cfg=start_cfg, + apply_check_start_cfg=start_cfg, + artifacts_source_root=root / "missing_artifacts", + ) + + start_workdir.mkdir(parents=True) + _ENTRY._write_yaml( + start_workdir / "deployconf.with_release_manifest_sha256.yaml", + { + "gen_k8s_daemonset_mirror_outdir": "/tmp/start-workdir-mirror", + "cluster_nodes": [], + "service": { + "etcd": { + "port": 19180, + "entrypoint": "etcd --listen-client-urls http://0.0.0.0:19180", + }, + "greptime": { + "port": 19190, + }, + }, + "global_envs": { + "FLUXON_RELEASE_MANIFEST_SHA256": "must-not-leak-into-runner-bundle", + }, + }, + ) + + _ENTRY._refresh_ci_testbed_bundle_deployconf_from_start_workdir( + metadata={ + "testbed_bundle_path": paths["bundle_root"], + "testbed_bundle_deployconf_path": paths["deployconf_path"], + }, + start_workdir=start_workdir, + ) + + refreshed = _ENTRY._load_yaml_mapping(paths["deployconf_path"], ctx="refreshed deployconf") + self.assertEqual(refreshed["service"]["etcd"]["port"], 
19180) + self.assertIn("19180", refreshed["service"]["etcd"]["entrypoint"]) + self.assertEqual(refreshed["service"]["greptime"]["port"], 19190) + self.assertNotIn("FLUXON_RELEASE_MANIFEST_SHA256", refreshed["global_envs"]) + self.assertEqual( + refreshed["gen_k8s_daemonset_mirror_outdir"], + str((bundle_root / "gen_k8s_daemonset").resolve()), + ) + def test_write_yaml_emits_ascii_yaml(self) -> None: with tempfile.TemporaryDirectory() as td: path = Path(td) / "sample.yaml" @@ -540,10 +706,21 @@ def test_main_passes_generated_start_test_bed_config_to_runner_env(self) -> None root = Path(td) workdir = root / "ci_2_virt_node_workdir" hostworkdir = root / "hostworkdir" - release_dir = REPO_ROOT / "fluxon_release" + release_dir = root / "release" release_dir.mkdir(parents=True, exist_ok=True) wheel_path = release_dir / "fluxon-0.2.1-cp38-abi3-manylinux_2_28_x86_64.whl" wheel_path.write_text("", encoding="utf-8") + _ENTRY._write_yaml( + workdir / "start_test_bed" / "apply" / "deployconf.with_release_manifest_sha256.yaml", + { + "gen_k8s_daemonset_mirror_outdir": "/tmp/mock-start-mirror", + "cluster_nodes": [], + "service": { + "etcd": {"port": 19180}, + "greptime": {"port": 19190}, + }, + }, + ) calls: list[tuple[list[str], dict[str, str] | None]] = [] def fake_run(argv: list[str], *, env=None) -> None: @@ -555,6 +732,8 @@ def fake_run(argv: list[str], *, env=None) -> None: str(workdir), "--testbed-hostworkdir", str(hostworkdir), + "--release-dir", + str(release_dir), "--scene-id", self._KVTEST_SCENE_ID, "--skip-builder-image", @@ -571,7 +750,6 @@ def fake_run(argv: list[str], *, env=None) -> None: rc = _ENTRY.main() finally: sys.argv = original_argv - wheel_path.unlink(missing_ok=True) self.assertEqual(rc, 0) self.assertTrue(calls) @@ -580,12 +758,18 @@ def fake_run(argv: list[str], *, env=None) -> None: self.assertEqual(runner_argv[1], str((REPO_ROOT / "fluxon_test_stack" / "test_runner.py").resolve())) self.assertEqual( 
runner_env[_ENTRY.TEST_STACK_START_TEST_BED_CONFIG_ENV], - str((workdir / "generated" / "start_test_bed.local.yaml").resolve()), + str((workdir / "runner_run" / "testbed_bundle" / "start_test_bed.runner.yaml").resolve()), ) self.assertEqual( runner_env["FLUXON_TEST_STACK_LOCAL_RELEASE_ROOT"], - str((REPO_ROOT / "fluxon_release").resolve()), + str(release_dir.resolve()), + ) + refreshed = _ENTRY._load_yaml_mapping( + workdir / "runner_run" / "testbed_bundle" / "deployconf_testbed.local.yaml", + ctx="refreshed runner bundle deployconf", ) + self.assertEqual(refreshed["service"]["etcd"]["port"], 19180) + self.assertEqual(refreshed["service"]["greptime"]["port"], 19190) def test_main_supports_explicit_suite_path(self) -> None: with tempfile.TemporaryDirectory() as td: @@ -608,7 +792,7 @@ def test_main_supports_explicit_suite_path(self) -> None: suite_cfg["profiles"]["fluxon_tcp"]["runtime"]["ci"]["scene_configs"][self._LOG_MGMT_SCENE_ID]["enabled"] = True suite_cfg["profiles"]["fluxon_tcp"]["runtime"]["ci"]["scene_configs"][self._MQ_SCENE_ID] = {} _ENTRY._write_yaml(suite_path, suite_cfg) - release_dir = REPO_ROOT / "fluxon_release" + release_dir = root / "release" release_dir.mkdir(parents=True, exist_ok=True) wheel_path = release_dir / "fluxon-0.2.1-cp38-abi3-manylinux_2_28_x86_64.whl" wheel_path.write_text("", encoding="utf-8") @@ -621,6 +805,8 @@ def test_main_supports_explicit_suite_path(self) -> None: str(workdir), "--testbed-hostworkdir", str(hostworkdir), + "--release-dir", + str(release_dir), "--skip-builder-image", "--skip-pack", "--skip-dispatch", @@ -636,7 +822,6 @@ def test_main_supports_explicit_suite_path(self) -> None: rc = _ENTRY.main() finally: sys.argv = original_argv - wheel_path.unlink(missing_ok=True) self.assertEqual(rc, 0) generated_suite = _ENTRY._load_yaml_mapping( @@ -675,7 +860,7 @@ def test_main_same_host_generated_configs_use_non_loopback_host_ip(self) -> None root = Path(td) workdir = root / "ci_2_virt_node_workdir" hostworkdir = root / 
"hostworkdir" - release_dir = REPO_ROOT / "fluxon_release" + release_dir = root / "release" release_dir.mkdir(parents=True, exist_ok=True) wheel_path = release_dir / "fluxon-0.2.1-cp38-abi3-manylinux_2_28_x86_64.whl" wheel_path.write_text("", encoding="utf-8") @@ -686,6 +871,8 @@ def test_main_same_host_generated_configs_use_non_loopback_host_ip(self) -> None str(workdir), "--testbed-hostworkdir", str(hostworkdir), + "--release-dir", + str(release_dir), "--scene-id", self._DOC_SCENE_ID, "--skip-builder-image", @@ -702,7 +889,6 @@ def test_main_same_host_generated_configs_use_non_loopback_host_ip(self) -> None rc = _ENTRY.main() finally: sys.argv = original_argv - wheel_path.unlink(missing_ok=True) self.assertEqual(rc, 0) generated_deployconf = _ENTRY._load_yaml_mapping( @@ -728,7 +914,7 @@ def test_main_syncs_rather_no_git_submodule_before_pack(self) -> None: root = Path(td) workdir = root / "ci_2_virt_node_workdir" hostworkdir = root / "hostworkdir" - release_dir = REPO_ROOT / "fluxon_release" + release_dir = root / "release" release_dir.mkdir(parents=True, exist_ok=True) wheel_path = release_dir / "fluxon-0.2.1-cp38-abi3-manylinux_2_28_x86_64.whl" wheel_path.write_text("", encoding="utf-8") @@ -736,6 +922,20 @@ def test_main_syncs_rather_no_git_submodule_before_pack(self) -> None: def fake_run(argv: list[str], *, env=None) -> None: calls.append((list(argv), None if env is None else dict(env))) + if argv[1] != str((REPO_ROOT / "fluxon_test_stack" / "start_test_bed.py").resolve()): + return + start_workdir = Path(argv[argv.index("-w") + 1]) + _ENTRY._write_yaml( + start_workdir / "deployconf.with_release_manifest_sha256.yaml", + { + "gen_k8s_daemonset_mirror_outdir": "/tmp/mock-start-mirror", + "cluster_nodes": [], + "service": { + "etcd": {"port": 19180}, + "greptime": {"port": 19190}, + }, + }, + ) argv = [ "ci_2_virt_node.py", @@ -743,6 +943,8 @@ def fake_run(argv: list[str], *, env=None) -> None: str(workdir), "--testbed-hostworkdir", str(hostworkdir), + 
"--release-dir", + str(release_dir), "--scene-id", self._KVTEST_SCENE_ID, "--skip-builder-image", @@ -761,7 +963,6 @@ def fake_run(argv: list[str], *, env=None) -> None: rc = _ENTRY.main() finally: sys.argv = original_argv - wheel_path.unlink(missing_ok=True) self.assertEqual(rc, 0) self.assertGreaterEqual(len(calls), 1) @@ -833,7 +1034,7 @@ def test_main_uses_apply_check_config_for_explicit_apply_validation(self) -> Non root = Path(td) workdir = root / "ci_2_virt_node_workdir" hostworkdir = root / "hostworkdir" - release_dir = REPO_ROOT / "fluxon_release" + release_dir = root / "release" release_dir.mkdir(parents=True, exist_ok=True) wheel_path = release_dir / "fluxon-0.2.1-cp38-abi3-manylinux_2_28_x86_64.whl" wheel_path.write_text("", encoding="utf-8") @@ -841,6 +1042,20 @@ def test_main_uses_apply_check_config_for_explicit_apply_validation(self) -> Non def fake_run(argv: list[str], *, env=None) -> None: calls.append((list(argv), None if env is None else dict(env))) + if argv[1] != str((REPO_ROOT / "fluxon_test_stack" / "start_test_bed.py").resolve()): + return + start_workdir = Path(argv[argv.index("-w") + 1]) + _ENTRY._write_yaml( + start_workdir / "deployconf.with_release_manifest_sha256.yaml", + { + "gen_k8s_daemonset_mirror_outdir": "/tmp/mock-start-mirror", + "cluster_nodes": [], + "service": { + "etcd": {"port": 19180}, + "greptime": {"port": 19190}, + }, + }, + ) argv = [ "ci_2_virt_node.py", @@ -848,6 +1063,8 @@ def fake_run(argv: list[str], *, env=None) -> None: str(workdir), "--testbed-hostworkdir", str(hostworkdir), + "--release-dir", + str(release_dir), "--scene-id", self._KVTEST_SCENE_ID, "--skip-builder-image", @@ -864,7 +1081,6 @@ def fake_run(argv: list[str], *, env=None) -> None: rc = _ENTRY.main() finally: sys.argv = original_argv - wheel_path.unlink(missing_ok=True) self.assertEqual(rc, 0) start_bed_calls = [ @@ -873,11 +1089,11 @@ def fake_run(argv: list[str], *, env=None) -> None: self.assertEqual(len(start_bed_calls), 2) self.assertEqual( 
start_bed_calls[0][start_bed_calls[0].index("-c") + 1], - str((workdir / "generated" / "start_test_bed.local.yaml").resolve()), + str((workdir / "runner_run" / "testbed_bundle" / "start_test_bed.runner.yaml").resolve()), ) self.assertEqual( start_bed_calls[1][start_bed_calls[1].index("-c") + 1], - str((workdir / "generated" / "start_test_bed.apply_check.local.yaml").resolve()), + str((workdir / "runner_run" / "testbed_bundle" / "start_test_bed.apply_check.runner.yaml").resolve()), ) diff --git a/fluxon_test_stack/tests/test_mpmc_readiness_contract.py b/fluxon_test_stack/tests/test_mpmc_readiness_contract.py new file mode 100644 index 0000000..dca27d9 --- /dev/null +++ b/fluxon_test_stack/tests/test_mpmc_readiness_contract.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import threading +import time +import unittest + +from fluxon_test_stack.mpmc_readiness import evaluate_mpmc_topology_ready + +try: + import fluxon_test_stack.distributed_benchmark_node as node_mod + from fluxon_test_stack.distributed_benchmark_node import ( + BenchmarkNode, + PreparedWorkerRuntime, + TestMode, + ) +except ImportError as exc: + node_mod = None + NODE_RUNTIME_IMPORT_ERROR = exc + BenchmarkNode = object # type: ignore[assignment] + PreparedWorkerRuntime = object # type: ignore[assignment] + TestMode = None # type: ignore[assignment] +else: + NODE_RUNTIME_IMPORT_ERROR = None + + +class TestMPMCReadinessContract(unittest.TestCase): + def test_consumer_does_not_wait_for_ready_channels_before_reporting_ready(self) -> None: + readiness = evaluate_mpmc_topology_ready( + role="consumer", + expected_workers=1, + total_mpsc_channels=1, + ready_channels=0, + active_consumers=1, + ) + + self.assertTrue(readiness.ready) + + def test_producer_still_waits_for_ready_channels_and_active_consumers(self) -> None: + no_ready_channel = evaluate_mpmc_topology_ready( + role="producer", + expected_workers=1, + total_mpsc_channels=1, + ready_channels=0, + 
active_consumers=1, + ) + no_consumer = evaluate_mpmc_topology_ready( + role="producer", + expected_workers=1, + total_mpsc_channels=1, + ready_channels=1, + active_consumers=0, + ) + + self.assertFalse(no_ready_channel.ready) + self.assertIn("ready_channels", no_ready_channel.reason) + self.assertFalse(no_consumer.ready) + self.assertIn("active_consumers", no_consumer.reason) + + @unittest.skipIf(node_mod is None, f"distributed benchmark node import failed: {NODE_RUNTIME_IMPORT_ERROR}") + def test_producer_prewarm_before_ready_is_nonblocking(self) -> None: + class ProducerNode(BenchmarkNode): + def __init__(self) -> None: + super().__init__() + self.prepare_started = threading.Event() + self.allow_prepare = threading.Event() + + def _prepare_mpmc_worker_runtime_with_retry(self, **kwargs) -> PreparedWorkerRuntime: + self.prepare_started.set() + self.allow_prepare.wait(timeout=2.0) + return PreparedWorkerRuntime(producer=object()) + + def _run_worker_thread(self, *args, **kwargs): + return [] + + node = ProducerNode() + node.test_config = { + "node_role": "producer", + "test_mode": TestMode.MPMC.value, + "cluster_ready_timeout_seconds": 5, + "threads_per_process": 1, + "max_benchmark_seconds": 5, + } + + started_at = time.monotonic() + node._prepare_mpmc_round_before_ready(workers=1) + elapsed_s = time.monotonic() - started_at + + self.assertLess(elapsed_s, 0.5) + self.assertIsNotNone(node._prepared_mpmc_round) + self.assertFalse(node.prepare_started.is_set()) + self.assertEqual(node._prepared_mpmc_round.prepared_runtimes, {}) + + node.allow_prepare.set() + node._run_mpmc_workers(workers=1, deadline_ts=0.0) + self.assertTrue(node.prepare_started.is_set()) + + @unittest.skipIf(node_mod is None, f"distributed benchmark node import failed: {NODE_RUNTIME_IMPORT_ERROR}") + def test_consumer_prewarm_before_ready_still_waits_for_endpoint(self) -> None: + class ConsumerNode(BenchmarkNode): + def __init__(self) -> None: + super().__init__() + self.waited_cluster_ready = 
False + + def _prepare_mpmc_worker_runtime_with_retry(self, **kwargs) -> PreparedWorkerRuntime: + return PreparedWorkerRuntime(consumer=object()) + + def _wait_mpmc_cluster_ready(self, **kwargs) -> None: + self.waited_cluster_ready = True + + def _run_worker_thread(self, *args, **kwargs): + return [] + + node = ConsumerNode() + node.test_config = { + "node_role": "consumer", + "test_mode": TestMode.MPMC.value, + "cluster_ready_timeout_seconds": 5, + "threads_per_process": 1, + } + + node._prepare_mpmc_round_before_ready(workers=1) + + self.assertIsNotNone(node._prepared_mpmc_round) + self.assertEqual(len(node._prepared_mpmc_round.prepared_runtimes), 1) + self.assertTrue(node.waited_cluster_ready) + node._prepared_mpmc_round.start_event.set() + for thread in node._prepared_mpmc_round.pending_threads.values(): + thread.join(timeout=2.0) + + @unittest.skipIf(node_mod is None, f"distributed benchmark node import failed: {NODE_RUNTIME_IMPORT_ERROR}") + def test_runtime_init_stagger_only_applies_to_large_mpmc_runs(self) -> None: + node = BenchmarkNode() + node.instance_key = "bench_mq__producer_7_proc_3" + node.test_config = { + "test_mode": TestMode.MPMC.value, + "expected_nodes": 16, + } + self.assertEqual(node._runtime_init_stagger_seconds(), 0.0) + + node.test_config["expected_nodes"] = 64 + stagger_s = node._runtime_init_stagger_seconds() + self.assertGreaterEqual(stagger_s, 0.0) + self.assertLessEqual(stagger_s, 24.0) + + @unittest.skipIf(node_mod is None, f"distributed benchmark node import failed: {NODE_RUNTIME_IMPORT_ERROR}") + def test_kv_store_init_retries_transient_errors(self) -> None: + node = BenchmarkNode() + node.test_config = { + "test_mode": TestMode.MPMC.value, + "cluster_ready_timeout_seconds": 5, + } + node._runtime_init_retry_sleep_seconds = lambda attempt: 0.0 # type: ignore[method-assign] + calls = [] + sentinel_store = object() + original_init_kv_store = node_mod.init_kv_store + + def fake_init_kv_store(config): + calls.append(config) + if 
len(calls) == 1: + return None, "Failed to connect to etcd: status probe timed out after 10s" + return sentinel_store, None + + node_mod.init_kv_store = fake_init_kv_store + try: + store, err = node._init_kv_store_with_ready_retry({"backend_kind": "FLUXON"}) + finally: + node_mod.init_kv_store = original_init_kv_store + + self.assertIs(store, sentinel_store) + self.assertIsNone(err) + self.assertEqual(len(calls), 2) + + +if __name__ == "__main__": + raise SystemExit(unittest.main()) diff --git a/fluxon_test_stack/tests/test_runner_contract.py b/fluxon_test_stack/tests/test_runner_contract.py index 806ef66..20ed318 100644 --- a/fluxon_test_stack/tests/test_runner_contract.py +++ b/fluxon_test_stack/tests/test_runner_contract.py @@ -55,6 +55,10 @@ def _build_checks(selected_test_id: Optional[str]) -> List[Tuple[str, Callable[[ "suite_requires_benchmark_bundle_only_for_bench_cases", test_suite_requires_benchmark_bundle_only_for_bench_cases, ), + ( + "ci_lock_name_is_scoped_by_ops_cluster_name", + test_ci_lock_name_is_scoped_by_ops_cluster_name, + ), ( "ci_top_attention_doc_page_build_uses_online_docker_image", test_ci_top_attention_doc_page_build_uses_online_docker_image, @@ -231,6 +235,21 @@ def test_suite_requires_benchmark_bundle_only_for_bench_cases() -> None: print("PASS: test_suite_requires_benchmark_bundle_only_for_bench_cases") +def test_ci_lock_name_is_scoped_by_ops_cluster_name() -> None: + first = _TEST_RUNNER._ci_lock_name_for_ops_cluster_name("fluxon_testbed") + second = _TEST_RUNNER._ci_lock_name_for_ops_cluster_name("fluxon_testbed_mq_large_30680") + if first == second: + print("FAIL: test_ci_lock_name_is_scoped_by_ops_cluster_name - names should differ") + return + if first == "bench_ci.lock" or second == "bench_ci.lock": + print("FAIL: test_ci_lock_name_is_scoped_by_ops_cluster_name - lock name must not be global") + return + if first != _TEST_RUNNER._ci_lock_name_for_ops_cluster_name("fluxon_testbed"): + print("FAIL: 
test_ci_lock_name_is_scoped_by_ops_cluster_name - name should be deterministic") + return + print("PASS: test_ci_lock_name_is_scoped_by_ops_cluster_name") + + def test_ci_top_attention_doc_page_build_uses_online_docker_image() -> None: suite_cfg = _suite_cfg_with_declared_ci_commands( { diff --git a/fluxon_test_stack/tests/test_test_runner_testbed_contract.py b/fluxon_test_stack/tests/test_test_runner_testbed_contract.py index 0e1cee6..58bbf95 100644 --- a/fluxon_test_stack/tests/test_test_runner_testbed_contract.py +++ b/fluxon_test_stack/tests/test_test_runner_testbed_contract.py @@ -1183,6 +1183,31 @@ def test_ci_runner_script_sources_prepare_env_when_present(self) -> None: self.assertIn('prepare_env_path="', script_text) self.assertIn('. "$prepare_env_path"', script_text) + def test_ci_prepare_exports_testbed_bundle_and_release_authority(self) -> None: + with tempfile.TemporaryDirectory() as td: + root = Path(td) + bundle_root = root / "testbed_bundle" + bundle_root.mkdir() + start_cfg = bundle_root / "start_test_bed.runner.yaml" + start_cfg.write_text("schema_version: 6\n", encoding="utf-8") + (bundle_root / "manifest.json").write_text("{}", encoding="utf-8") + release_root = root / "fluxon_release" + release_root.mkdir() + env = { + **os.environ, + _RUNNER.TEST_STACK_START_TEST_BED_CONFIG_ENV: str(start_cfg), + "FLUXON_TEST_STACK_LOCAL_RELEASE_ROOT": str(release_root), + } + with mock.patch.dict(os.environ, env, clear=True): + exports = _RUNNER._run_ci_prepare_steps( + resolved_case={"scene": {"ci": {}}}, + run_dir=root / "run", + src_root=root / "src", + ) + + self.assertEqual(exports[_RUNNER.TEST_STACK_START_TEST_BED_CONFIG_ENV], str(start_cfg)) + self.assertEqual(exports["FLUXON_TEST_STACK_LOCAL_RELEASE_ROOT"], str(release_root)) + def test_parse_ci_prepare_steps_accepts_online_docker_image(self) -> None: steps = _RUNNER._parse_ci_prepare_steps( [ @@ -1311,8 +1336,8 @@ def test_bootstrap_runner_uses_repo_start_test_bed_entry(self) -> None: 
manifest_path.write_text( json.dumps( { - "start_config_path": str(start_cfg), - "workdir": str(workdir), + "start_config_path": "start_test_bed.runner.yaml", + "workdir": "bootstrap_workdir", "bootstrap_mode": "apply_only", } ), @@ -1343,6 +1368,109 @@ def test_bootstrap_runner_uses_repo_start_test_bed_entry(self) -> None: ], ) + def test_runtime_token_mapping_includes_testbed_bundle_root_when_manifest_exists(self) -> None: + with tempfile.TemporaryDirectory() as td: + bundle_root = Path(td) + start_cfg = bundle_root / "start_test_bed.runner.yaml" + start_cfg.write_text("schema_version: 6\n", encoding="utf-8") + (bundle_root / "manifest.json").write_text("{}", encoding="utf-8") + env = {**os.environ, _RUNNER.TEST_STACK_START_TEST_BED_CONFIG_ENV: str(start_cfg)} + with mock.patch.dict(os.environ, env, clear=True): + mapping = _RUNNER._build_runtime_token_mapping( + workdir_root="/tmp/workdir", + run_dir="/tmp/run", + release_root="/tmp/release", + test_rsc_root="/tmp/test_rsc", + case_id="case", + profile_id="profile", + stack_identity={ + "cluster_name": "cluster", + "controller_url": "http://127.0.0.1:19080/r/ops/fluxon_testbed", + "share_mem_path": "/tmp/share", + }, + ) + + self.assertEqual(mapping["__TEST_BED_BUNDLE_ROOT__"], str(bundle_root.resolve())) + + def test_direct_local_manifest_remote_bash_uses_local_process_without_bastion(self) -> None: + with tempfile.TemporaryDirectory() as td: + bundle_root = Path(td) + start_cfg = bundle_root / "start_test_bed.runner.yaml" + start_cfg.write_text("schema_version: 6\n", encoding="utf-8") + (bundle_root / "manifest.json").write_text( + json.dumps({"controller_request_mode": "direct"}), + encoding="utf-8", + ) + env = {**os.environ, _RUNNER.TEST_STACK_START_TEST_BED_CONFIG_ENV: str(start_cfg)} + node_cfg = { + "execution_mode": "local", + "ip": "127.0.0.1", + "hostworkdir": td, + } + with mock.patch.dict(os.environ, env, clear=True): + captured = _RUNNER._run_remote_bash_capture( + target_name="logic-a", + 
node_cfg=node_cfg, + remote_cmd="printf '%s\\n' local-ok", + ) + + self.assertEqual(captured, "local-ok\n") + + def test_load_test_stack_cluster_nodes_expands_same_host_logical_target_aliases(self) -> None: + with tempfile.TemporaryDirectory() as td: + bundle_root = Path(td) + start_cfg = bundle_root / "start_test_bed.runner.yaml" + deployconf_path = bundle_root / "deployconf.yaml" + start_cfg.write_text( + "\n".join( + [ + "schema_version: 6", + "deployconf_path: ./deployconf.yaml", + "", + ] + ), + encoding="utf-8", + ) + deployconf_path.write_text( + "\n".join( + [ + "cluster_nodes:", + " - hostname: logic-a", + " ip: 10.1.1.119", + " hostworkdir: /tmp/logic/a", + " execution_mode: local", + " ssh_user: runner", + " ssh_port: 22", + " ssh_password: null", + " - hostname: logic-b", + " ip: 10.1.1.119", + " hostworkdir: /tmp/logic/b", + " execution_mode: local", + " ssh_user: runner", + " ssh_port: 22", + " ssh_password: null", + "", + ] + ), + encoding="utf-8", + ) + resolved_case = { + "deploy": { + "target_ip_map": { + "node-1": "10.1.1.119", + "logic-a": "10.1.1.119", + "logic-b": "10.1.1.119", + } + } + } + env = {**os.environ, _RUNNER.TEST_STACK_START_TEST_BED_CONFIG_ENV: str(start_cfg)} + with mock.patch.dict(os.environ, env, clear=True): + cluster_nodes, _dispatch = _RUNNER._load_test_stack_cluster_nodes_and_dispatch(resolved_case) + + self.assertIn("node-1", cluster_nodes) + self.assertEqual(cluster_nodes["node-1"]["hostname"], "logic-a") + self.assertEqual(cluster_nodes["node-1"]["hostworkdir"], "/tmp/logic/a") + def test_load_source_stack_contract_accepts_same_host_dual_local_hostworkdirs(self) -> None: with tempfile.TemporaryDirectory() as td: bundle_root = Path(td) diff --git a/fluxon_test_stack/tests/test_top_attention_largescale_mq_contract.py b/fluxon_test_stack/tests/test_top_attention_largescale_mq_contract.py new file mode 100644 index 0000000..be5df53 --- /dev/null +++ b/fluxon_test_stack/tests/test_top_attention_largescale_mq_contract.py @@ 
-0,0 +1,476 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import importlib.util +import json +import sys +import tempfile +import unittest +from pathlib import Path +from unittest import mock + +import yaml + + +REPO_ROOT = Path(__file__).resolve().parents[2] +INDEX_DIR = REPO_ROOT / "fluxon_test_stack" / "top_attention_test_index" +MODULE_PATH = INDEX_DIR / "_largescale_mq.py" +RUNNER_PATH = REPO_ROOT / "fluxon_test_stack" / "test_runner.py" + + +def _load_module(): + sys.path.insert(0, str(INDEX_DIR)) + try: + spec = importlib.util.spec_from_file_location("fluxon_test_stack_top_attention_largescale_mq", MODULE_PATH) + assert spec is not None and spec.loader is not None + mod = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = mod + spec.loader.exec_module(mod) + return mod + finally: + if sys.path and sys.path[0] == str(INDEX_DIR): + sys.path.pop(0) + + +def _load_runner_module(): + runner_dir = RUNNER_PATH.parent + sys.path.insert(0, str(runner_dir)) + try: + spec = importlib.util.spec_from_file_location("fluxon_test_stack_runner_for_largescale_mq", RUNNER_PATH) + assert spec is not None and spec.loader is not None + mod = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = mod + spec.loader.exec_module(mod) + return mod + finally: + if sys.path and sys.path[0] == str(runner_dir): + sys.path.pop(0) + + +class TestTopAttentionLargescaleMqContract(unittest.TestCase): + def test_runner_mpmc_uses_process_fanout_for_single_host_logical_targets(self) -> None: + runner = _load_runner_module() + + self.assertTrue( + runner._test_stack_scene_uses_per_target_process_fanout( + scene_mode=runner.TEST_STACK_MODE_MPMC, + ) + ) + + def test_generate_only_writes_minimal_ci_smoke_suite_without_running_runner(self) -> None: + entry = _load_module() + with tempfile.TemporaryDirectory() as td: + suite_out = Path(td) / "largescale_mq_suite.yaml" + + with mock.patch.object(entry, "call", side_effect=AssertionError("test_runner should 
not run")): + with mock.patch.object( + sys, + "argv", + [ + str(MODULE_PATH), + "--generate-only", + "--suite-out", + str(suite_out), + "--owner-count", + "1", + "--owner-dram-gib", + "1", + "--producer-count", + "2", + "--consumer-count", + "2", + "--duration-seconds", + "1", + "--value-size", + "256", + "--threads-per-process", + "1", + "--op-timeout-seconds", + "5", + "--cluster-ready-timeout-seconds", + "1800", + "--consumer-sim-min-ms", + "1", + "--consumer-sim-max-ms", + "1", + ], + ): + rc = entry.main() + + self.assertEqual(rc, 0) + suite = yaml.safe_load(suite_out.read_text(encoding="utf-8")) + scale_id = "largescale_mq_n1owner_1gib_p2_c2" + self.assertEqual(set(suite["scenes"].keys()), {"bench_mq"}) + self.assertEqual(suite["scenes"]["bench_mq"]["test_stack"]["mode"], "MPMC") + self.assertEqual( + suite["scenes"]["bench_mq"]["test_stack"]["role_weights"], + {"producer": 1, "consumer": 1}, + ) + self.assertEqual(suite["scenes"]["bench_mq"]["select"]["scales"], [scale_id]) + self.assertEqual(suite["run"]["selectors"]["case_ids"], [f"bench_mq__{scale_id}__fluxon_tcp_thread"]) + self.assertEqual(suite["scales"][scale_id]["topology"], 4) + self.assertEqual(suite["scales"][scale_id]["owner"]["owner_count"], 1) + self.assertEqual(suite["scales"][scale_id]["owner"]["owner_dram_bytes"], 1073741824) + self.assertEqual(suite["scales"][scale_id]["benchmark"]["threads_per_process"], 1) + self.assertEqual( + suite["scales"][scale_id]["targets"]["hosts"], + ["node-1", "node-2", "node-3", "node-4"], + ) + self.assertEqual(suite["scales"][scale_id]["owner"]["targets"], ["node-1"]) + port_entry = suite["profiles"]["fluxon_tcp_thread"]["runtime"]["test_stack"]["port_alloc"]["by_topology"][4] + self.assertGreaterEqual(port_entry["kv_p2p_port_stride"], 512) + + runner = _load_runner_module() + parsed = runner._parse_suite_config(suite) + cases = runner._expand_cases(parsed) + self.assertEqual([case.case_id for case in cases], [f"bench_mq__{scale_id}__fluxon_tcp_thread"]) + + 
def test_single_host_logical_targets_support_ci_owner_producer_consumer_matrix(self) -> None: + entry = _load_module() + cases = ( + (8, 8, 4, {"producer": 1, "consumer": 1}), + (32, 32, 16, {"producer": 7, "consumer": 7}), + (160, 8, 42, {"producer": 39, "consumer": 1}), + ) + for producer_count, consumer_count, topology, role_weights in cases: + with self.subTest(producer_count=producer_count, consumer_count=consumer_count): + with tempfile.TemporaryDirectory() as td: + suite_out = Path(td) / f"largescale_mq_p{producer_count}_c{consumer_count}.yaml" + + with mock.patch.object(entry, "call", side_effect=AssertionError("test_runner should not run")): + with mock.patch.object( + sys, + "argv", + [ + str(MODULE_PATH), + "--generate-only", + "--single-host-logical-targets", + "--suite-out", + str(suite_out), + "--owner-count", + "4", + "--owner-dram-gib", + "1", + "--producer-count", + str(producer_count), + "--consumer-count", + str(consumer_count), + "--duration-seconds", + "30", + "--value-size", + "256", + "--op-timeout-seconds", + "5", + "--cluster-ready-timeout-seconds", + "1800", + "--consumer-sim-min-ms", + "1", + "--consumer-sim-max-ms", + "1", + ], + ): + rc = entry.main() + + self.assertEqual(rc, 0) + suite = yaml.safe_load(suite_out.read_text(encoding="utf-8")) + scale_id = f"largescale_mq_n4owner_1gib_p{producer_count}_c{consumer_count}" + scale = suite["scales"][scale_id] + self.assertEqual(scale["topology"], topology) + self.assertEqual(scale["owner"]["owner_count"], 4) + self.assertEqual(scale["owner"]["targets"], ["node-1", "node-2", "node-3", "node-4"]) + self.assertEqual(len(scale["targets"]["hosts"]), topology) + self.assertEqual(scale["benchmark"]["processes_per_target"], 4) + self.assertEqual(scale["benchmark"]["threads_per_process"], 4) + self.assertEqual(scale["benchmark"]["owner_group_processes"], 1) + self.assertEqual(scale["benchmark"]["value_size"], 256) + target_map = 
suite["profiles"]["fluxon_tcp_thread"]["runtime"]["test_stack"]["deploy"]["target_ip_map"] + self.assertIn(f"node-{topology}", target_map) + self.assertEqual(target_map["node-1"], target_map[f"node-{topology}"]) + self.assertEqual( + suite["scenes"]["bench_mq"]["test_stack"]["role_weights"], + role_weights, + ) + port_entry = suite["profiles"]["fluxon_tcp_thread"]["runtime"]["test_stack"]["port_alloc"]["by_topology"][topology] + self.assertGreaterEqual(port_entry["kv_p2p_port_stride"], 512) + + runner = _load_runner_module() + parsed = runner._parse_suite_config(suite) + expanded = runner._expand_cases(parsed) + self.assertEqual([case.case_id for case in expanded], [f"bench_mq__{scale_id}__fluxon_tcp_thread"]) + + def test_script_defaults_keep_owner_and_payload_small(self) -> None: + entry = _load_module() + with tempfile.TemporaryDirectory() as td: + config_path = Path(td) / "benchmark_full_matrix_many_targets.yaml" + suite_out = Path(td) / "largescale_mq_default.yaml" + cfg = yaml.safe_load(entry.DEFAULT_CONFIG.read_text(encoding="utf-8")) + target_map = cfg["profiles"]["fluxon_tcp"]["runtime"]["test_stack"]["deploy"]["target_ip_map"] + for idx in range(1, 169): + target_map[f"node-{idx}"] = f"10.88.{idx // 250}.{idx % 250 + 1}" + config_path.write_text(yaml.safe_dump(cfg, sort_keys=False, allow_unicode=False), encoding="utf-8") + + with mock.patch.object(entry, "call", side_effect=AssertionError("test_runner should not run")): + with mock.patch.object( + sys, + "argv", + [ + str(MODULE_PATH), + "--generate-only", + "--config", + str(config_path), + "--suite-out", + str(suite_out), + ], + ): + rc = entry.main() + + self.assertEqual(rc, 0) + suite = yaml.safe_load(suite_out.read_text(encoding="utf-8")) + scale_id = "largescale_mq_n4owner_1gib_p160_c8" + scale = suite["scales"][scale_id] + self.assertEqual(scale["topology"], 168) + self.assertEqual(scale["owner"]["owner_count"], 4) + self.assertEqual(scale["owner"]["owner_dram_bytes"], 1073741824) + 
self.assertEqual(scale["benchmark"]["processes_per_target"], 1) + self.assertEqual(scale["benchmark"]["threads_per_process"], 4) + self.assertNotIn("owner_group_processes", scale["benchmark"]) + self.assertEqual(scale["benchmark"]["value_size"], 256) + self.assertEqual(scale["owner"]["targets"], ["node-1", "node-2", "node-3", "node-4"]) + + def test_real_run_copies_bundle_uses_active_testbed_ip_and_invokes_runner(self) -> None: + entry = _load_module() + with tempfile.TemporaryDirectory() as td: + root = Path(td) + bundle = root / "source_bundle" + bundle.mkdir() + (bundle / "bootstrap_workdir").mkdir() + (bundle / "ssh_config").write_text("# local\n", encoding="utf-8") + (bundle / "gen_k8s_daemonset").mkdir() + source_deployconf = bundle / "deployconf_testbed.local.yaml" + source_start = bundle / "start_test_bed.runner.yaml" + source_ssh_config = bundle / "ssh_config" + source_bootstrap_workdir = bundle / "bootstrap_workdir" + (bundle / "deployconf_testbed.local.yaml").write_text( + "\n".join( + [ + f"gen_k8s_daemonset_mirror_outdir: {bundle / 'gen_k8s_daemonset'}", + "global_envs:", + " FLUXON_CLUSTER_NAME: fluxon_testbed", + " FLUXON_SHARED_MEM: ${HOSTWORKDIR}/shm", + "cluster_nodes:", + " - hostname: runner-a", + " ip: 10.9.0.7", + " hostworkdir: /tmp/runner/a", + " execution_mode: local", + "", + ] + ), + encoding="utf-8", + ) + (bundle / "start_test_bed.runner.yaml").write_text( + "\n".join( + [ + "schema_version: 6", + f"deployconf_path: {source_deployconf}", + "controller_url: http://10.9.0.7:19080/r/ops/fluxon_testbed", + "controller_basic_auth:", + " username: ops_admin", + " password: ops_password", + "", + ] + ), + encoding="utf-8", + ) + (bundle / "manifest.json").write_text( + json.dumps( + { + "deployconf_path": str(source_deployconf), + "start_config_path": str(source_start), + "ssh_config_path": str(source_ssh_config), + "workdir": str(source_bootstrap_workdir), + "bootstrap_mode": "apply_only", + "controller_request_mode": "direct", + } + ), + 
encoding="utf-8", + ) + workdir = root / "run" + suite_out = root / "suite.yaml" + calls: list[tuple[list[str], dict[str, str] | None]] = [] + + def fake_call(cmd, *, env=None): + calls.append((list(cmd), None if env is None else dict(env))) + return 0 + + with mock.patch.object(entry, "call", side_effect=fake_call): + with mock.patch.dict("os.environ", {"FLUXON_TEST_STACK_LOCAL_RELEASE_ROOT": "/tmp/release"}, clear=True): + with mock.patch.object( + sys, + "argv", + [ + str(MODULE_PATH), + "--single-host-logical-targets", + "--testbed-bundle-source", + str(bundle), + "--workdir", + str(workdir), + "--suite-out", + str(suite_out), + "--owner-count", + "4", + "--owner-dram-gib", + "1", + "--producer-count", + "8", + "--consumer-count", + "8", + "--duration-seconds", + "30", + "--value-size", + "256", + "--op-timeout-seconds", + "5", + "--cluster-ready-timeout-seconds", + "1800", + "--consumer-sim-min-ms", + "1", + "--consumer-sim-max-ms", + "1", + ], + ): + rc = entry.main() + + self.assertEqual(rc, 0) + self.assertEqual(len(calls), 1) + run_local_start = workdir / "testbed_bundle" / "start_test_bed.runner.yaml" + self.assertEqual( + calls[0][1]["FLUXON_TEST_STACK_START_TEST_BED_CONFIG"], + str(run_local_start.resolve()), + ) + self.assertEqual(calls[0][1]["FLUXON_TEST_STACK_LOCAL_RELEASE_ROOT"], "/tmp/release") + self.assertEqual(calls[0][0][0:3], [sys.executable, "-u", str(RUNNER_PATH)]) + self.assertIn("--action", calls[0][0]) + self.assertIn("run", calls[0][0]) + run_local_deployconf = workdir / "testbed_bundle" / "deployconf_testbed.local.yaml" + run_local_mirror = workdir / "testbed_bundle" / "gen_k8s_daemonset" + run_local_start_payload = yaml.safe_load(run_local_start.read_text(encoding="utf-8")) + self.assertEqual(run_local_start_payload["deployconf_path"], "./deployconf_testbed.local.yaml") + run_local_deployconf_payload = yaml.safe_load(run_local_deployconf.read_text(encoding="utf-8")) + self.assertEqual( + 
run_local_deployconf_payload["gen_k8s_daemonset_mirror_outdir"], + str(run_local_mirror.resolve()), + ) + run_local_manifest = json.loads((workdir / "testbed_bundle" / "manifest.json").read_text(encoding="utf-8")) + self.assertEqual(run_local_manifest["deployconf_path"], "deployconf_testbed.local.yaml") + self.assertEqual(run_local_manifest["start_config_path"], "start_test_bed.runner.yaml") + self.assertEqual(run_local_manifest["ssh_config_path"], "ssh_config") + self.assertEqual(run_local_manifest["workdir"], "bootstrap_workdir") + suite = yaml.safe_load(suite_out.read_text(encoding="utf-8")) + scale = suite["scales"]["largescale_mq_n4owner_1gib_p8_c8"] + self.assertEqual(scale["topology"], 4) + self.assertEqual(scale["benchmark"]["processes_per_target"], 4) + self.assertEqual(scale["benchmark"]["owner_group_processes"], 1) + target_map = suite["profiles"]["fluxon_tcp_thread"]["runtime"]["test_stack"]["deploy"]["target_ip_map"] + self.assertEqual(target_map["node-1"], "10.9.0.7") + self.assertEqual(target_map["node-4"], "10.9.0.7") + port_entry = suite["profiles"]["fluxon_tcp_thread"]["runtime"]["test_stack"]["port_alloc"][ + "by_topology" + ][4] + self.assertEqual(port_entry["coordinator_port_base"], 20480) + + def test_real_run_relocates_generated_bundle_mirror_after_bundle_move(self) -> None: + entry = _load_module() + with tempfile.TemporaryDirectory() as td: + root = Path(td) + previous_bundle = root / "previous_runner" / "testbed_bundle" + source_bundle = root / "current_runner" / "testbed_bundle" + previous_bundle.mkdir(parents=True) + source_bundle.mkdir(parents=True) + (source_bundle / "bootstrap_workdir").mkdir() + (source_bundle / "ssh_config").write_text("# local\n", encoding="utf-8") + (source_bundle / "gen_k8s_daemonset").mkdir() + source_deployconf = source_bundle / "deployconf_testbed.local.yaml" + source_start = source_bundle / "start_test_bed.runner.yaml" + source_deployconf.write_text( + "\n".join( + [ + f"gen_k8s_daemonset_mirror_outdir: 
{previous_bundle / 'gen_k8s_daemonset'}", + "global_envs:", + " FLUXON_CLUSTER_NAME: fluxon_testbed", + " FLUXON_SHARED_MEM: ${HOSTWORKDIR}/shm", + "cluster_nodes:", + " - hostname: runner-a", + " ip: 10.9.0.8", + " hostworkdir: /tmp/runner/a", + " execution_mode: local", + "", + ] + ), + encoding="utf-8", + ) + source_start.write_text( + "\n".join( + [ + "schema_version: 6", + "deployconf_path: ./deployconf_testbed.local.yaml", + "controller_url: http://10.9.0.8:19080/r/ops/fluxon_testbed", + "controller_basic_auth:", + " username: ops_admin", + " password: ops_password", + "", + ] + ), + encoding="utf-8", + ) + (source_bundle / "manifest.json").write_text( + json.dumps( + { + "deployconf_path": "deployconf_testbed.local.yaml", + "start_config_path": "start_test_bed.runner.yaml", + "ssh_config_path": "ssh_config", + "workdir": "bootstrap_workdir", + } + ), + encoding="utf-8", + ) + workdir = root / "run" + suite_out = root / "suite.yaml" + + with mock.patch.object(entry, "call", return_value=0): + with mock.patch.object( + sys, + "argv", + [ + str(MODULE_PATH), + "--single-host-logical-targets", + "--testbed-bundle-source", + str(source_bundle), + "--workdir", + str(workdir), + "--suite-out", + str(suite_out), + "--owner-count", + "4", + "--owner-dram-gib", + "1", + "--producer-count", + "8", + "--consumer-count", + "8", + ], + ): + rc = entry.main() + + self.assertEqual(rc, 0) + run_local_mirror = workdir / "testbed_bundle" / "gen_k8s_daemonset" + run_local_deployconf = workdir / "testbed_bundle" / "deployconf_testbed.local.yaml" + run_local_deployconf_payload = yaml.safe_load(run_local_deployconf.read_text(encoding="utf-8")) + self.assertEqual( + run_local_deployconf_payload["gen_k8s_daemonset_mirror_outdir"], + str(run_local_mirror.resolve()), + ) + + +if __name__ == "__main__": + raise SystemExit(unittest.main()) diff --git a/fluxon_test_stack/top_attention_test_index/README.md b/fluxon_test_stack/top_attention_test_index/README.md index 706517f..1eda482 
100644 --- a/fluxon_test_stack/top_attention_test_index/README.md +++ b/fluxon_test_stack/top_attention_test_index/README.md @@ -38,7 +38,7 @@ Entries: - `_kv_py_core.py`: Python KV backend/core smoke coverage - `_relay_mq.py`: MQ relay docker coverage - `_mq_core.py`: non-Ctrl-C MQ correctness coverage -- `_largescale_mq.py`: TEST_STACK large-scale MQ benchmark wrapper (defaults to 30 owners at 5GiB, 300 producers, 8 consumers) +- `_largescale_mq.py`: TEST_STACK large-scale MQ benchmark wrapper (defaults to 4 owners at 1GiB, 160 producers, 8 consumers, 256-byte values) - `_mq_mpsc.py`: MPSC API channel coverage - `_mq_mpmc.py`: MPMC API channel coverage - `_mq_mpmc_bench.py`: heavier MPMC bench scripts @@ -68,9 +68,11 @@ Operational note: - `_largescale_mq.py` generates a temporary `bench_mq` suite and then forwards to `fluxon_test_stack/test_runner.py`. The selected TEST_STACK profile must - provide at least 308 common non-bastion deploy targets in `target_ip_map` for - the default 300-producer/8-consumer topology; pass `--config` for the large - cluster suite before running it. + provide at least 168 common non-bastion deploy targets in `target_ip_map` for + the default 160-producer/8-consumer topology. For same-host CI, pass + `--single-host-logical-targets` and a testbed bundle; the wrapper copies the + bundle into `<workdir>/testbed_bundle` and rewrites bundle-local metadata + before invoking the runner. - All `_cargo_*.py` wrappers are direct-process entrypoints. They do not forward `pytest` selectors or `cargo test` passthrough flags unless the wrapper explicitly defines that surface. 
diff --git a/fluxon_test_stack/top_attention_test_index/_largescale_mq.py b/fluxon_test_stack/top_attention_test_index/_largescale_mq.py index 7211bd9..cf1cfaf 100755 --- a/fluxon_test_stack/top_attention_test_index/_largescale_mq.py +++ b/fluxon_test_stack/top_attention_test_index/_largescale_mq.py @@ -3,11 +3,14 @@ import argparse import copy +import json import os import re +import shutil import sys from pathlib import Path from typing import Any +from urllib.parse import urlparse import yaml @@ -18,15 +21,21 @@ SCENE_ID = "bench_mq" -DEFAULT_PROFILE_ID = "fluxon_tcp" +BASE_FLUXON_PROFILE_ID = "fluxon_tcp" +CI_PUBLIC_PROFILE_ID = "fluxon_tcp_thread" +DEFAULT_PROFILE_ID = CI_PUBLIC_PROFILE_ID +LOCAL_RELEASE_ROOT_ENV = "FLUXON_TEST_STACK_LOCAL_RELEASE_ROOT" +RELEASE_MANIFEST_FILENAME = "fluxon_release.sha256" DEFAULT_CONFIG = REPO_ROOT / "fluxon_test_stack" / "benchmark_full_matrix.yaml" -DEFAULT_WORKDIR = REPO_ROOT / ".tmp" / "test_largescale_mq_p300_c8" +DEFAULT_WORKDIR = REPO_ROOT / ".tmp" / "test_largescale_mq_p160_c8" RUNNER = REPO_ROOT / "fluxon_test_stack" / "test_runner.py" +LOCAL_TEST_STACK_COORDINATOR_PORT_OFFSET = 1000 +LOCAL_TEST_STACK_TOPOLOGY_PORT_SPAN = 100 DEFAULT_BENCHMARK = { "processes_per_target": 1, "threads_per_process": 4, - "value_size": 16384, + "value_size": 256, "metric_warmup_seconds": 0, "op_timeout_seconds": 30, "cluster_ready_timeout_seconds": 1800, @@ -44,12 +53,242 @@ def _repo_path(raw: str) -> Path: return (REPO_ROOT / path).resolve() +def _resolve_user_path(raw: str) -> Path: + path = Path(raw).expanduser() + if path.is_absolute(): + return path.resolve() + return (Path.cwd() / path).resolve() + + def _require_dict(raw: Any, ctx: str) -> dict[str, Any]: if not isinstance(raw, dict): raise SystemExit(f"{ctx} must be a mapping") return raw +def _is_within_root(path: Path, root: Path) -> bool: + resolved_path = path.resolve() + resolved_root = root.resolve() + return resolved_path == resolved_root or resolved_root in 
resolved_path.parents + + +def _clean_bundle_relpath(raw: str, *, field_name: str) -> Path: + relpath = Path(raw) + if relpath.is_absolute() or ".." in relpath.parts: + raise SystemExit(f"{field_name} must be a clean relative path: {relpath}") + return relpath + + +def _load_yaml_mapping(path: Path, *, ctx: str) -> dict[str, Any]: + payload = yaml.safe_load(path.read_text(encoding="utf-8")) + return _require_dict(payload, ctx) + + +def _write_yaml_mapping(path: Path, payload: dict[str, Any]) -> None: + path.write_text(yaml.safe_dump(payload, sort_keys=False, allow_unicode=False), encoding="utf-8") + + +def _parse_sha256_manifest_names(path: Path) -> list[str]: + out: list[str] = [] + for index, raw_line in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1): + line = raw_line.strip() + if not line: + continue + parts = line.split(maxsplit=1) + if len(parts) != 2: + raise SystemExit(f"invalid sha256 manifest line {index}: {path}: {raw_line!r}") + out.append(parts[1].strip()) + return out + + +def _find_release_wheel_name_from_manifest(root: Path) -> str | None: + manifest_path = (root / RELEASE_MANIFEST_FILENAME).resolve() + if not manifest_path.is_file(): + return None + wheel_names = [ + Path(name).name + for name in _parse_sha256_manifest_names(manifest_path) + if Path(name).name.startswith("fluxon-") and Path(name).name.endswith(".whl") + ] + if not wheel_names: + return None + unique_names = sorted(set(wheel_names)) + if len(unique_names) != 1: + raise SystemExit(f"release manifest must contain one Fluxon wheel: {manifest_path} wheels={unique_names}") + return unique_names[0] + + +def _ci_public_release_wheel_name(fallback: str) -> str: + roots: list[Path] = [] + env_root = os.environ.get(LOCAL_RELEASE_ROOT_ENV, "").strip() + if env_root: + roots.append(Path(env_root).expanduser().resolve()) + roots.append((REPO_ROOT / "fluxon_release").resolve()) + for root in roots: + wheel_name = _find_release_wheel_name_from_manifest(root) + if wheel_name is 
not None: + return wheel_name + return fallback + + +def _ensure_ci_public_profile(cfg: dict[str, Any], profile_ids: list[str]) -> None: + if CI_PUBLIC_PROFILE_ID not in profile_ids: + return + profiles = _require_dict(cfg.get("profiles"), "config.profiles") + artifact_sets = _require_dict(cfg.get("artifact_sets"), "config.artifact_sets") + if CI_PUBLIC_PROFILE_ID in profiles and CI_PUBLIC_PROFILE_ID in artifact_sets: + return + + base_profile = copy.deepcopy( + _require_dict(profiles.get(BASE_FLUXON_PROFILE_ID), f"config.profiles[{BASE_FLUXON_PROFILE_ID!r}]") + ) + base_artifact_set = copy.deepcopy( + _require_dict( + artifact_sets.get(BASE_FLUXON_PROFILE_ID), + f"config.artifact_sets[{BASE_FLUXON_PROFILE_ID!r}]", + ) + ) + + release_source = _require_dict( + base_artifact_set.get("release_source"), + f"config.artifact_sets[{BASE_FLUXON_PROFILE_ID!r}].release_source", + ) + test_rsc_source = _require_dict( + base_artifact_set.get("test_rsc_source"), + f"config.artifact_sets[{BASE_FLUXON_PROFILE_ID!r}].test_rsc_source", + ) + release_artifacts = _require_dict( + base_artifact_set.get("release_artifacts"), + f"config.artifact_sets[{BASE_FLUXON_PROFILE_ID!r}].release_artifacts", + ) + release_source["key_prefix"] = f"profiles/{CI_PUBLIC_PROFILE_ID}" + test_rsc_source["key_prefix"] = f"test_rsc/{CI_PUBLIC_PROFILE_ID}" + release_artifacts["wheel"] = _ci_public_release_wheel_name( + str(release_artifacts.get("wheel", "")).strip() or "fluxon-0.2.1-py3-none-any.whl" + ) + base_profile["artifact_set"] = CI_PUBLIC_PROFILE_ID + artifact_sets[CI_PUBLIC_PROFILE_ID] = base_artifact_set + profiles[CI_PUBLIC_PROFILE_ID] = base_profile + + +def _bundle_relpath(path: Path, *, base: Path, dot_prefix: bool = False) -> str: + rel = os.path.relpath(str(path.resolve()), str(base.resolve())).replace(os.sep, "/") + if dot_prefix and rel != "." 
and not rel.startswith("."): + return f"./{rel}" + return rel + + +def _relocated_bundle_path( + raw: Any, + *, + src_root: Path, + dst_root: Path, + base: Path, + field_name: str, + generated_bundle_child_name: str | None = None, +) -> Path: + if not isinstance(raw, str) or not raw.strip(): + raise SystemExit(f"{field_name} must be a non-empty path string") + raw_path = Path(raw).expanduser() + if raw_path.is_absolute(): + resolved = raw_path.resolve() + if _is_within_root(resolved, dst_root): + return resolved + if _is_within_root(resolved, src_root): + return (dst_root / resolved.relative_to(src_root.resolve())).resolve() + if generated_bundle_child_name is not None and resolved.name == generated_bundle_child_name: + return (dst_root / generated_bundle_child_name).resolve() + raise SystemExit( + f"{field_name} must point inside the testbed bundle: path={resolved} src={src_root} dst={dst_root}" + ) + return (base / raw_path).resolve() + + +def _normalize_run_local_testbed_bundle( + *, + src_root: Path, + dst_root: Path, + start_config_relpath: str, +) -> Path: + relpath = _clean_bundle_relpath(start_config_relpath, field_name="--start-config-relpath") + start_cfg = (dst_root / relpath).resolve() + if not _is_within_root(start_cfg, dst_root): + raise SystemExit(f"--start-config-relpath escapes testbed bundle: {relpath}") + if not start_cfg.is_file(): + raise SystemExit(f"start config is missing inside testbed bundle: {start_cfg}") + + start_payload = _load_yaml_mapping(start_cfg, ctx=f"start config {start_cfg}") + deployconf_path = _relocated_bundle_path( + start_payload.get("deployconf_path"), + src_root=src_root, + dst_root=dst_root, + base=start_cfg.parent, + field_name="start_test_bed.deployconf_path", + ) + if not _is_within_root(deployconf_path, dst_root): + raise SystemExit(f"start_test_bed.deployconf_path escapes testbed bundle: {deployconf_path}") + if not deployconf_path.is_file(): + raise SystemExit(f"start config deployconf_path is missing: 
{deployconf_path}") + start_payload["deployconf_path"] = _bundle_relpath(deployconf_path, base=start_cfg.parent, dot_prefix=True) + _write_yaml_mapping(start_cfg, start_payload) + + deployconf_payload = _load_yaml_mapping(deployconf_path, ctx=f"deployconf {deployconf_path}") + mirror_outdir = _relocated_bundle_path( + deployconf_payload.get("gen_k8s_daemonset_mirror_outdir"), + src_root=src_root, + dst_root=dst_root, + base=deployconf_path.parent, + field_name="deployconf.gen_k8s_daemonset_mirror_outdir", + generated_bundle_child_name="gen_k8s_daemonset", + ) + if not _is_within_root(mirror_outdir, dst_root): + raise SystemExit(f"deployconf.gen_k8s_daemonset_mirror_outdir escapes testbed bundle: {mirror_outdir}") + mirror_outdir.mkdir(parents=True, exist_ok=True) + deployconf_payload["gen_k8s_daemonset_mirror_outdir"] = str(mirror_outdir.resolve()) + _write_yaml_mapping(deployconf_path, deployconf_payload) + + manifest_path = start_cfg.with_name("manifest.json") + if manifest_path.exists(): + try: + manifest = _require_dict( + json.loads(manifest_path.read_text(encoding="utf-8")), + f"testbed bundle manifest {manifest_path}", + ) + except Exception as exc: + raise SystemExit(f"failed to load testbed bundle manifest {manifest_path}: {exc}") from exc + + manifest_targets = { + "deployconf_path": deployconf_path, + "start_config_path": start_cfg, + } + for field_name in ("ssh_config_path", "workdir"): + if field_name not in manifest: + continue + target_path = _relocated_bundle_path( + manifest.get(field_name), + src_root=src_root, + dst_root=dst_root, + base=manifest_path.parent, + field_name=f"manifest.{field_name}", + ) + if not _is_within_root(target_path, dst_root): + raise SystemExit(f"manifest.{field_name} escapes testbed bundle: {target_path}") + if field_name == "workdir": + target_path.mkdir(parents=True, exist_ok=True) + elif not target_path.exists(): + raise SystemExit(f"manifest.{field_name} is missing inside testbed bundle: {target_path}") + 
manifest_targets[field_name] = target_path + + for field_name, target_path in manifest_targets.items(): + manifest[field_name] = _bundle_relpath(target_path, base=manifest_path.parent) + manifest_path.write_text( + json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True) + "\n", + encoding="utf-8", + ) + + return start_cfg + + def _split_ids(raw_values: list[str] | None, *, default: str) -> list[str]: if not raw_values: return [default] @@ -95,6 +334,54 @@ def _profile_target_map(cfg: dict[str, Any], profile_id: str) -> dict[str, Any]: ) +def _local_test_stack_coordinator_port_base(*, controller_port: int, topology_key: Any) -> int: + topology_offset = 0 + if isinstance(topology_key, int): + topology_offset = int(topology_key) * LOCAL_TEST_STACK_TOPOLOGY_PORT_SPAN + elif isinstance(topology_key, str) and topology_key.isdigit(): + topology_offset = int(topology_key) * LOCAL_TEST_STACK_TOPOLOGY_PORT_SPAN + elif topology_key != "DEFAULT": + raise SystemExit(f"unsupported test_stack port_alloc topology key: {topology_key!r}") + + port = int(controller_port) + LOCAL_TEST_STACK_COORDINATOR_PORT_OFFSET + topology_offset + if port <= 0 or port > 65535: + raise SystemExit(f"computed local TEST_STACK coordinator_port_base out of range: {port}") + return port + + +def _rewrite_test_stack_coordinator_ports_for_local_controller( + suite: dict[str, Any], + *, + controller_port: int, +) -> None: + profiles = _require_dict(suite.get("profiles"), "suite.profiles") + for profile_id, profile in profiles.items(): + if not isinstance(profile, dict): + continue + runtime = _require_dict(profile.get("runtime"), f"suite.profiles[{profile_id!r}].runtime") + test_stack = _require_dict( + runtime.get("test_stack"), + f"suite.profiles[{profile_id!r}].runtime.test_stack", + ) + port_alloc = _require_dict( + test_stack.get("port_alloc"), + f"suite.profiles[{profile_id!r}].runtime.test_stack.port_alloc", + ) + by_topology = _require_dict( + port_alloc.get("by_topology"), + 
f"suite.profiles[{profile_id!r}].runtime.test_stack.port_alloc.by_topology", + ) + for topology_key, entry in by_topology.items(): + if not isinstance(entry, dict): + continue + if "coordinator_port_base" not in entry: + continue + entry["coordinator_port_base"] = _local_test_stack_coordinator_port_base( + controller_port=int(controller_port), + topology_key=topology_key, + ) + + def _ordered_usable_targets(target_ip_map: dict[str, Any], *, ctx: str) -> list[str]: out: list[str] = [] for raw_target in target_ip_map: @@ -133,6 +420,41 @@ def _common_targets(cfg: dict[str, Any], profile_ids: list[str], required_count: return ordered_common[:required_count] +def _apply_single_host_logical_targets( + cfg: dict[str, Any], + *, + profile_ids: list[str], + required_count: int, + anchor_ip_override: str | None, +) -> None: + if required_count <= 0: + raise SystemExit("--single-host-logical-targets requires a positive target count") + override_ip = None + if anchor_ip_override is not None: + override_ip = str(anchor_ip_override).strip() + if not override_ip: + raise SystemExit("single-host anchor IP override must be non-empty") + for profile_id in profile_ids: + target_map = _profile_target_map(cfg, profile_id) + ordered = _ordered_usable_targets( + target_map, + ctx=f"config.profiles[{profile_id!r}].runtime.test_stack.deploy.target_ip_map", + ) + if not ordered: + raise SystemExit( + f"profile {profile_id!r} has no non-bastion target to use as the single-host anchor" + ) + anchor_target = ordered[0] + anchor_ip = target_map.get(anchor_target) + if not isinstance(anchor_ip, str) or not anchor_ip.strip(): + raise SystemExit( + f"profile {profile_id!r} anchor target {anchor_target!r} has no usable IP" + ) + resolved_anchor_ip = override_ip or anchor_ip.strip() + for idx in range(1, int(required_count) + 1): + target_map[f"node-{idx}"] = resolved_anchor_ip + + def _base_benchmark(cfg: dict[str, Any]) -> dict[str, Any]: scenes = _require_dict(cfg.get("scenes"), "config.scenes") 
scene = _require_dict(scenes.get(SCENE_ID), f"config.scenes[{SCENE_ID!r}]") @@ -225,7 +547,13 @@ def _pruned_artifact_sets(cfg: dict[str, Any], profile_ids: list[str]) -> dict[s return out -def _build_suite(cfg: dict[str, Any], args: argparse.Namespace, profile_ids: list[str]) -> dict[str, Any]: +def _build_suite( + cfg: dict[str, Any], + args: argparse.Namespace, + profile_ids: list[str], + *, + single_host_anchor_ip: str | None = None, +) -> dict[str, Any]: producer_count = int(args.producer_count) consumer_count = int(args.consumer_count) owner_count = int(args.owner_count) @@ -235,6 +563,7 @@ def _build_suite(cfg: dict[str, Any], args: argparse.Namespace, profile_ids: lis ("owner-count", owner_count), ("owner-dram-gib", int(args.owner_dram_gib)), ("duration-seconds", int(args.duration_seconds)), + ("threads-per-process", int(args.threads_per_process)), ("op-timeout-seconds", int(args.op_timeout_seconds)), ("cluster-ready-timeout-seconds", int(args.cluster_ready_timeout_seconds)), ): @@ -249,12 +578,37 @@ def _build_suite(cfg: dict[str, Any], args: argparse.Namespace, profile_ids: lis if int(args.consumer_sim_min_ms) > int(args.consumer_sim_max_ms): raise SystemExit("--consumer-sim-min-ms must be <= --consumer-sim-max-ms") - topology = producer_count + consumer_count + single_host = bool(args.single_host_logical_targets) + processes_per_target = owner_count if single_host else 1 + if single_host: + if producer_count % processes_per_target != 0: + raise SystemExit( + "--single-host-logical-targets requires producer-count to be divisible by owner-count " + f"so process fanout can preserve the requested count: producer={producer_count} owner={owner_count}" + ) + if consumer_count % processes_per_target != 0: + raise SystemExit( + "--single-host-logical-targets requires consumer-count to be divisible by owner-count " + f"so process fanout can preserve the requested count: consumer={consumer_count} owner={owner_count}" + ) + producer_targets = producer_count // 
processes_per_target + consumer_targets = consumer_count // processes_per_target + else: + producer_targets = producer_count + consumer_targets = consumer_count + topology = producer_targets + consumer_targets if owner_count > topology: raise SystemExit( f"owner-count={owner_count} cannot exceed benchmark topology={topology} " "when owner targets are co-located with benchmark targets" ) + if single_host: + _apply_single_host_logical_targets( + cfg, + profile_ids=profile_ids, + required_count=topology, + anchor_ip_override=single_host_anchor_ip, + ) target_hosts = _common_targets(cfg, profile_ids, topology) owner_targets = target_hosts[:owner_count] @@ -266,8 +620,8 @@ def _build_suite(cfg: dict[str, Any], args: argparse.Namespace, profile_ids: lis benchmark = _base_benchmark(cfg) benchmark.update( { - "processes_per_target": 1, - "threads_per_process": 4, + "processes_per_target": processes_per_target, + "threads_per_process": int(args.threads_per_process), "value_size": int(args.value_size), "metric_warmup_seconds": int(args.metric_warmup_seconds), "op_timeout_seconds": int(args.op_timeout_seconds), @@ -279,12 +633,14 @@ def _build_suite(cfg: dict[str, Any], args: argparse.Namespace, profile_ids: lis ], } ) + if single_host: + benchmark["owner_group_processes"] = 1 _ensure_largescale_port_alloc( cfg, profile_ids=profile_ids, topology=topology, - required_p2p_ports_per_slot=topology + 1 + owner_count, + required_p2p_ports_per_slot=(topology * processes_per_target) + 1 + owner_count, ) scenes = _require_dict(cfg.get("scenes"), "config.scenes") @@ -292,8 +648,8 @@ def _build_suite(cfg: dict[str, Any], args: argparse.Namespace, profile_ids: lis scene["test_stack"] = copy.deepcopy(_require_dict(scene.get("test_stack"), f"config.scenes[{SCENE_ID!r}].test_stack")) scene["test_stack"]["mode"] = "MPMC" scene["test_stack"]["role_weights"] = _role_weights_for_exact_mpmc_counts( - producer_count, - consumer_count, + producer_targets, + consumer_targets, ) scene["select"] = 
{"scales": [scale_id], "profiles": list(profile_ids)} @@ -329,11 +685,80 @@ def _build_suite(cfg: dict[str, Any], args: argparse.Namespace, profile_ids: lis } +def _prepare_run_local_testbed_bundle( + *, + source: str, + workdir: Path, + start_config_relpath: str, +) -> Path: + src = _resolve_user_path(source) + if not src.is_dir(): + raise SystemExit(f"--testbed-bundle-source must be an existing directory: {src}") + dst = (workdir / "testbed_bundle").resolve() + src_root_for_relocation = src.resolve() + if src == dst: + pass + else: + if src in dst.parents: + raise SystemExit(f"--testbed-bundle-source cannot contain the run-local destination: src={src} dst={dst}") + if dst in src.parents: + raise SystemExit(f"--testbed-bundle-source cannot be inside the run-local destination: src={src} dst={dst}") + if dst.exists(): + shutil.rmtree(dst) + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copytree(src, dst, symlinks=True) + + return _normalize_run_local_testbed_bundle( + src_root=src_root_for_relocation, + dst_root=dst, + start_config_relpath=start_config_relpath, + ) + + +def _single_host_anchor_ip_from_start_config(start_cfg: Path) -> str: + start_payload = yaml.safe_load(start_cfg.read_text(encoding="utf-8")) + start = _require_dict(start_payload, f"start config {start_cfg}") + raw_deployconf = start.get("deployconf_path") + if not isinstance(raw_deployconf, str) or not raw_deployconf.strip(): + raise SystemExit(f"start config {start_cfg} must define deployconf_path") + deployconf_path = Path(raw_deployconf).expanduser() + if not deployconf_path.is_absolute(): + deployconf_path = (start_cfg.parent / deployconf_path).resolve() + if not deployconf_path.is_file(): + raise SystemExit(f"start config deployconf_path is missing: {deployconf_path}") + deployconf_payload = yaml.safe_load(deployconf_path.read_text(encoding="utf-8")) + deployconf = _require_dict(deployconf_payload, f"deployconf {deployconf_path}") + cluster_nodes = deployconf.get("cluster_nodes") + if 
not isinstance(cluster_nodes, list) or not cluster_nodes: + raise SystemExit(f"deployconf {deployconf_path} must define non-empty cluster_nodes") + for index, raw_node in enumerate(cluster_nodes): + node = _require_dict(raw_node, f"deployconf.cluster_nodes[{index}]") + hostname = node.get("hostname") + if isinstance(hostname, str) and "bastion" in hostname.lower(): + continue + node_ip = node.get("ip") + if isinstance(node_ip, str) and node_ip.strip(): + return node_ip.strip() + raise SystemExit(f"deployconf {deployconf_path} has no non-bastion cluster node IP") + + +def _controller_port_from_start_config(start_cfg: Path) -> int: + start_payload = yaml.safe_load(start_cfg.read_text(encoding="utf-8")) + start = _require_dict(start_payload, f"start config {start_cfg}") + raw_url = start.get("controller_url") + if not isinstance(raw_url, str) or not raw_url.strip(): + raise SystemExit(f"start config {start_cfg} must define controller_url") + parsed = urlparse(raw_url.strip()) + if parsed.port is None: + raise SystemExit(f"start config {start_cfg} controller_url must include an explicit port: {raw_url}") + return int(parsed.port) + + def main() -> int: parser = argparse.ArgumentParser( description=( "Flat index entry for the TEST_STACK large-scale MQ benchmark " - "(default: 30 owners at 5GiB, 300 producers, 8 consumers)." + "(default: 4 owners at 1GiB, 160 producers, 8 consumers, 256-byte values)." 
) ) parser.add_argument("--python", default=os.environ.get("PYTHON", sys.executable)) @@ -343,13 +768,33 @@ def main() -> int: parser.add_argument("--profile", action="append", dest="profiles", help="Profile id to run; repeat or comma-separate.") parser.add_argument("--action", choices=["run", "clean"], default="run") parser.add_argument("--generate-only", action="store_true", help="Write the generated suite YAML and do not invoke test_runner.") - parser.add_argument("--owner-count", type=int, default=30) - parser.add_argument("--owner-dram-gib", type=int, default=5) - parser.add_argument("--producer-count", type=int, default=300) + parser.add_argument( + "--testbed-bundle-source", + help="Existing TEST_STACK testbed bundle directory copied to /testbed_bundle before a real run.", + ) + parser.add_argument( + "--start-config-relpath", + default="start_test_bed.runner.yaml", + help="Start-testbed config path inside the run-local testbed bundle.", + ) + parser.add_argument( + "--single-host-logical-targets", + action="store_true", + help="Generate node-1..N logical TEST_STACK targets on the first usable target IP of each selected profile.", + ) + parser.add_argument("--owner-count", type=int, default=4) + parser.add_argument("--owner-dram-gib", type=int, default=1) + parser.add_argument("--producer-count", type=int, default=160) parser.add_argument("--consumer-count", type=int, default=8) parser.add_argument("--duration-seconds", type=int, default=60) - parser.add_argument("--value-size", type=int, default=16384) + parser.add_argument("--value-size", type=int, default=256) parser.add_argument("--metric-warmup-seconds", type=int, default=0) + parser.add_argument( + "--threads-per-process", + type=int, + default=int(DEFAULT_BENCHMARK["threads_per_process"]), + help="Worker threads per benchmark process.", + ) parser.add_argument("--op-timeout-seconds", type=int, default=30) parser.add_argument("--cluster-ready-timeout-seconds", type=int, default=1800) 
parser.add_argument("--consumer-sim-min-ms", type=int, default=700) @@ -360,6 +805,21 @@ def main() -> int: if args.action == "clean": return call([args.python, "-u", str(RUNNER), "--workdir", str(workdir), "--action", "clean"]) + start_cfg: Path | None = None + single_host_anchor_ip: str | None = None + local_controller_port: int | None = None + if not args.generate_only: + if not args.testbed_bundle_source: + raise SystemExit("--testbed-bundle-source is required unless --generate-only is set") + start_cfg = _prepare_run_local_testbed_bundle( + source=args.testbed_bundle_source, + workdir=workdir, + start_config_relpath=args.start_config_relpath, + ) + local_controller_port = _controller_port_from_start_config(start_cfg) + if bool(args.single_host_logical_targets): + single_host_anchor_ip = _single_host_anchor_ip_from_start_config(start_cfg) + config_path = _repo_path(args.config) if not config_path.exists(): raise SystemExit(f"--config not found: {config_path}") @@ -368,7 +828,18 @@ def main() -> int: cfg = _require_dict(yaml.safe_load(fh), f"config file {config_path}") profile_ids = _split_ids(args.profiles, default=DEFAULT_PROFILE_ID) - suite = _build_suite(cfg, args, profile_ids) + _ensure_ci_public_profile(cfg, profile_ids) + suite = _build_suite( + cfg, + args, + profile_ids, + single_host_anchor_ip=single_host_anchor_ip, + ) + if local_controller_port is not None: + _rewrite_test_stack_coordinator_ports_for_local_controller( + suite, + controller_port=local_controller_port, + ) suite_out = _repo_path(args.suite_out) if args.suite_out else (workdir / "largescale_mq_suite.yaml") suite_out.parent.mkdir(parents=True, exist_ok=True) @@ -378,7 +849,13 @@ def main() -> int: print(f"generated suite: {suite_out}", flush=True) if args.generate_only: return 0 - return call([args.python, "-u", str(RUNNER), "--config", str(suite_out), "--workdir", str(workdir), "--action", "run"]) + assert start_cfg is not None + env = os.environ.copy() + 
env["FLUXON_TEST_STACK_START_TEST_BED_CONFIG"] = str(start_cfg) + return call( + [args.python, "-u", str(RUNNER), "--config", str(suite_out), "--workdir", str(workdir), "--action", "run"], + env=env, + ) if __name__ == "__main__": diff --git a/setup_and_pack/tests/test_doc_site_builder_image_workflow.py b/setup_and_pack/tests/test_doc_site_builder_image_workflow.py index e91244c..159408a 100644 --- a/setup_and_pack/tests/test_doc_site_builder_image_workflow.py +++ b/setup_and_pack/tests/test_doc_site_builder_image_workflow.py @@ -10,6 +10,7 @@ WORKFLOW_PATH = REPO_ROOT / ".github" / "workflows" / "doc-site-builder-image.yml" ALL_TEST_WORKFLOW_PATH = REPO_ROOT / ".github" / "workflows" / "all_test.yml" DOCS_PAGES_WORKFLOW_PATH = REPO_ROOT / ".github" / "workflows" / "docs-pages.yml" +LARGESCALE_MQ_WORKFLOW_PATH = REPO_ROOT / ".github" / "workflows" / "largescale-mq.yml" class DocSiteBuilderImageWorkflowTest(unittest.TestCase): @@ -46,6 +47,18 @@ def test_main_testbed_workflow_keeps_suite_generation_in_workflow(self) -> None: self.assertIn("ci_top_attention_bin_kvtest", workflow_text) self.assertIn("ci_top_attention_doc_page_build", workflow_text) self.assertIn("ci_top_attention_mq_core", workflow_text) + self.assertIn("ci_top_attention_largescale_mq", workflow_text) + self.assertIn("_{suffix}.py", workflow_text) + self.assertIn("--single-host-logical-targets", workflow_text) + self.assertIn("--testbed-bundle-source", workflow_text) + self.assertIn("__TEST_BED_BUNDLE_ROOT__", workflow_text) + self.assertIn("largescale_mq_ci_single_host", workflow_text) + self.assertIn("--owner-count", workflow_text) + self.assertIn('"4"', workflow_text) + self.assertIn("--value-size", workflow_text) + self.assertIn('"256"', workflow_text) + self.assertIn("for producer_count, consumer_count in ((8, 8), (32, 32), (160, 8))", workflow_text) + self.assertIn('"30"', workflow_text) self.assertIn("doc_site_base_url", workflow_text) self.assertIn("rather_no_git_submodule.py", workflow_text) 
@@ -60,6 +73,30 @@ def test_docs_pages_uses_container_entrypoint(self) -> None: self.assertNotIn("doc-site-npm", workflow_text) self.assertNotIn("doc-site-plugins", workflow_text) + def test_largescale_mq_workflow_runs_manual_self_hosted_entrypoint(self) -> None: + workflow_text = LARGESCALE_MQ_WORKFLOW_PATH.read_text(encoding="utf-8") + yaml.load(workflow_text, Loader=yaml.BaseLoader) + + self.assertIn("workflow_dispatch", workflow_text) + self.assertIn("self-hosted", workflow_text) + self.assertIn("fluxon_test_stack/top_attention_test_index/_largescale_mq.py", workflow_text) + self.assertIn("--generate-only", workflow_text) + self.assertIn("Run large-scale MQ benchmark", workflow_text) + self.assertIn("inputs.run_mode == 'run'", workflow_text) + self.assertIn("testbed_bundle_path", workflow_text) + self.assertIn("--testbed-bundle-source", workflow_text) + self.assertIn("${{ toJSON(inputs.testbed_bundle_path) }}", workflow_text) + self.assertNotIn("FLUXON_TEST_STACK_START_TEST_BED_CONFIG", workflow_text) + self.assertIn("owner_count", workflow_text) + self.assertIn('default: "4"', workflow_text) + self.assertIn("owner_dram_gib", workflow_text) + self.assertIn('default: "1"', workflow_text) + self.assertIn("value_size", workflow_text) + self.assertIn('default: "256"', workflow_text) + self.assertIn("--producer-count", workflow_text) + self.assertIn("--consumer-count", workflow_text) + self.assertIn("actions/upload-artifact@v4", workflow_text) + if __name__ == "__main__": unittest.main() From 325816171c6185a4c4dfcd1542fa0d8fdec9e2bc Mon Sep 17 00:00:00 2001 From: ActivePeter <1020401660@qq.com> Date: Fri, 3 Jul 2026 00:06:29 +0800 Subject: [PATCH 2/2] test --- .github/workflows/all_test.yml | 42 +++++--- fluxon_test_stack/test_runner.py | 98 ++++++++++++++++++- .../test_test_runner_testbed_contract.py | 6 ++ .../tests/test_test_runner_ui_contract.py | 23 +++++ .../test_doc_site_builder_image_workflow.py | 3 + 5 files changed, 160 insertions(+), 12 deletions(-) diff 
--git a/.github/workflows/all_test.yml b/.github/workflows/all_test.yml index 1404293..f41cd84 100644 --- a/.github/workflows/all_test.yml +++ b/.github/workflows/all_test.yml @@ -308,6 +308,21 @@ jobs: print(f"missing {case_runs_path}") raise SystemExit(0) + def print_file(path): + print(f"=== {path} ===") + if path.exists(): + print(path.read_text(encoding="utf-8", errors="replace")) + else: + print(f"missing {path}") + + def print_tail(path, *, line_count=240): + print(f"=== {path} tail ===") + if path.exists(): + lines = path.read_text(encoding="utf-8", errors="replace").splitlines() + print("\n".join(lines[-line_count:])) + else: + print(f"missing {path}") + case_runs = yaml.safe_load(case_runs_path.read_text(encoding="utf-8")) print("=== case_runs.yaml ===") print(yaml.safe_dump(case_runs, sort_keys=False, allow_unicode=False)) @@ -325,19 +340,24 @@ jobs: "logs/ci_runner/exit_code.txt", ): path = run_dir / rel - print(f"=== {path} ===") - if path.exists(): - print(path.read_text(encoding="utf-8", errors="replace")) - else: - print(f"missing {path}") + print_file(path) stdout_path = run_dir / "logs" / "ci_runner" / "stdout.log" - print(f"=== {stdout_path} tail ===") - if stdout_path.exists(): - lines = stdout_path.read_text(encoding="utf-8", errors="replace").splitlines() - print("\n".join(lines[-240:])) - else: - print(f"missing {stdout_path}") + print_tail(stdout_path) + + nested_root = workdir / "largescale_mq_ci_single_host" + print(f"=== nested largescale MQ diagnostics: {nested_root} ===") + if not nested_root.exists(): + print(f"missing {nested_root}") + else: + for path in sorted(nested_root.glob("*/case_runs.yaml")): + print_file(path) + for path in sorted(nested_root.glob("*/results/*/run_*/summary.yaml")): + print_file(path) + for path in sorted(nested_root.glob("*/results/*/run_*/exception.txt")): + print_file(path) + for path in sorted(nested_root.glob("*/test_runner.log")): + print_tail(path) PY - name: Normalize ci_2_virt_node debug artifact 
permissions diff --git a/fluxon_test_stack/test_runner.py b/fluxon_test_stack/test_runner.py index 58657a4..4291208 100644 --- a/fluxon_test_stack/test_runner.py +++ b/fluxon_test_stack/test_runner.py @@ -448,6 +448,7 @@ def _scene_id_is_allowed(scene_id: str) -> bool: _RUNNER_STDIO_MIRROR_THREAD: Optional[threading.Thread] = None _RUNNER_STDIO_ROUTER_THREAD: Optional[threading.Thread] = None _CI_WAIT_HEARTBEAT_INTERVAL_SECONDS = 15.0 +_CI_WAIT_STATUS_SNAPSHOT_INTERVAL_SECONDS = 60.0 _CI_WAIT_TAIL_MAX_CHARS = 8000 _TEST_RUNNER_UI_MAX_LOG_CHUNK_BYTES = 1024 * 1024 _TEST_RUNNER_UI_HISTORY_SCHEMA_VERSION = 1 @@ -14126,6 +14127,11 @@ def _write_ci_runner_script( cmd_lines.append(f"echo {_shell_quote('=' * 80)}") cmd_lines.append(f"echo {_shell_quote(f'STEP {idx}: {step_label} :: {cmd}')}") cmd_lines.append(f"echo {_shell_quote('=' * 80)}") + cmd_lines.append("step_started_at=$(date +%s)") + cmd_lines.append( + f"echo {_shell_quote(f'[ci_runner] STEP {idx} start label={step_label}')} " + '"started_at_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)"' + ) if timeout_seconds is None: cmd_lines.append(f"{cmd}") else: @@ -14134,6 +14140,12 @@ def _write_ci_runner_script( # - Timeout must be configured explicitly per command in suite config (no hidden defaults). 
cmd_lines.append(f"timeout --preserve-status --signal=KILL {int(timeout_seconds)} {cmd}") cmd_lines.append("rc=$?") + cmd_lines.append("step_finished_at=$(date +%s)") + cmd_lines.append( + f"echo {_shell_quote(f'[ci_runner] STEP {idx} finish label={step_label}')} " + '"rc=$rc elapsed_s=$((step_finished_at - step_started_at)) ' + 'finished_at_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)"' + ) cmd_lines.append('if [ "$rc" -ne 0 ]; then') cmd_lines.append(' echo "[ci_runner] FAILED rc=$rc"') cmd_lines.append(' fail_and_exit "$rc"') @@ -14210,6 +14222,20 @@ def _write_ci_runner_script( log_dir="{run_dir.as_posix()}/logs/ci_runner" mkdir -p "$log_dir" exit_code_path="$log_dir/exit_code.txt" + restart_count_path="$log_dir/restart_count.txt" + exec >>"$log_dir/stdout.log" 2>&1 + if [ -f "$restart_count_path" ]; then + restart_count="$(cat "$restart_count_path" 2>/dev/null || echo 0)" + else + restart_count=0 + fi + case "$restart_count" in + ''|*[!0-9]*) restart_count=0 ;; + esac + restart_count=$((restart_count + 1)) + printf '%s\n' "$restart_count" > "$restart_count_path" + echo + echo "[ci_runner] start attempt=$restart_count pid=$$ ppid=$PPID host=$(hostname) started_at_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)" # The CI runner workload is a Deployment and may be restarted. # Once exit_code.txt is written, the run is terminal. If we restart, we must not delete it # or re-run tests, otherwise the runner can never converge. 
@@ -14236,7 +14262,6 @@ def _write_ci_runner_script( hold_pid="" done }} -exec >"$log_dir/stdout.log" 2>&1 prepare_env_path="{_ci_prepare_env_path(run_dir=run_dir).as_posix()}" if [ -f "$prepare_env_path" ]; then @@ -14738,6 +14763,56 @@ def _print_ci_wait_progress( return next_offset, next_heartbeat_at +def _ci_wait_observed_file_state_debug(state: Optional[_ObservedFileState]) -> str: + if state is None: + return "missing" + return f"size={state.size} mtime_ns={state.mtime_ns}" + + +def _ci_wait_status_debug(status: Dict[str, Any]) -> str: + preferred_keys = ( + "ok", + "running", + "exit_code", + "pid", + "started_at", + "finished_at", + "message", + "error", + "stderr", + ) + summary = {key: status.get(key) for key in preferred_keys if key in status} + if not summary: + summary = dict(status) + text = json.dumps(summary, ensure_ascii=True, sort_keys=True, default=str) + max_chars = 1200 + if len(text) > max_chars: + return text[:max_chars] + "..." + return text + + +def _print_ci_wait_status_snapshot( + *, + run_dir: Path, + status: Optional[Dict[str, Any]], + baseline_state: Optional[_ObservedFileState], + current_state: Optional[_ObservedFileState], + last_status_err: Optional[str], +) -> None: + now = time.time() + status_text = "unavailable" if status is None else _ci_wait_status_debug(status) + print( + f"{_ci_log_timestamp_prefix(now)} " + "[CI wait exit_code] status_snapshot " + f"status={status_text} " + f"baseline_exit_code_state={_ci_wait_observed_file_state_debug(baseline_state)} " + f"current_exit_code_state={_ci_wait_observed_file_state_debug(current_state)} " + f"last_status_err={last_status_err} " + f"log={str((run_dir / 'logs' / 'ci_runner' / 'stdout.log').resolve())}", + flush=True, + ) + + def _instance_file_exists( resolved_case: Dict[str, Any], *, instance_id: str, path: Path ) -> bool: @@ -15190,6 +15265,7 @@ def _wait_ci_runner_exit_code( last_status_err: str | None = None log_offset = 0 next_heartbeat_at = 0.0 + next_status_snapshot_at = 
0.0 while True: log_offset, next_heartbeat_at = _print_ci_wait_progress( resolved_case, @@ -15226,6 +15302,16 @@ def _wait_ci_runner_exit_code( status = _instance_status(resolved_case, instance_id="ci_runner") except _HttpGetJsonTransientError as exc: last_status_err = str(exc) + now = time.time() + if now >= next_status_snapshot_at: + _print_ci_wait_status_snapshot( + run_dir=run_dir, + status=None, + baseline_state=baseline_state, + current_state=current_state, + last_status_err=last_status_err, + ) + next_status_snapshot_at = now + _CI_WAIT_STATUS_SNAPSHOT_INTERVAL_SECONDS if time.time() >= deadline: raise ValueError( "ci_runner.exit_code wait timeout with transient controller errors: " @@ -15233,6 +15319,16 @@ def _wait_ci_runner_exit_code( ) from exc time.sleep(2.0) continue + now = time.time() + if now >= next_status_snapshot_at: + _print_ci_wait_status_snapshot( + run_dir=run_dir, + status=status, + baseline_state=baseline_state, + current_state=current_state, + last_status_err=last_status_err, + ) + next_status_snapshot_at = now + _CI_WAIT_STATUS_SNAPSHOT_INTERVAL_SECONDS status_exit_code = status.get("exit_code") if status.get("ok") is True and status.get("running") is False and isinstance(status_exit_code, int): return _require_int(status_exit_code, "ci_runner.status.exit_code", min_v=-255) diff --git a/fluxon_test_stack/tests/test_test_runner_testbed_contract.py b/fluxon_test_stack/tests/test_test_runner_testbed_contract.py index 58bbf95..7d95928 100644 --- a/fluxon_test_stack/tests/test_test_runner_testbed_contract.py +++ b/fluxon_test_stack/tests/test_test_runner_testbed_contract.py @@ -1182,6 +1182,12 @@ def test_ci_runner_script_sources_prepare_env_when_present(self) -> None: script_text = script_path.read_text(encoding="utf-8") self.assertIn('prepare_env_path="', script_text) self.assertIn('. 
"$prepare_env_path"', script_text) + self.assertIn('restart_count_path="$log_dir/restart_count.txt"', script_text) + self.assertIn('exec >>"$log_dir/stdout.log" 2>&1', script_text) + self.assertIn("[ci_runner] start attempt=$restart_count", script_text) + self.assertIn("[ci_runner] STEP 1 start label=", script_text) + self.assertIn("[ci_runner] STEP 1 finish label=", script_text) + subprocess.run(["bash", "-n", str(script_path)], check=True) def test_ci_prepare_exports_testbed_bundle_and_release_authority(self) -> None: with tempfile.TemporaryDirectory() as td: diff --git a/fluxon_test_stack/tests/test_test_runner_ui_contract.py b/fluxon_test_stack/tests/test_test_runner_ui_contract.py index 2abc4ec..4f727be 100644 --- a/fluxon_test_stack/tests/test_test_runner_ui_contract.py +++ b/fluxon_test_stack/tests/test_test_runner_ui_contract.py @@ -106,6 +106,29 @@ def test_print_ci_wait_progress_emits_new_tail(self) -> None: "[1970-01-01 00:01:40 UTC] a\n[1970-01-01 00:01:40 UTC] b\n", ) + def test_print_ci_wait_status_snapshot_includes_status_and_file_states(self) -> None: + buf = io.StringIO() + with tempfile.TemporaryDirectory() as td: + run_dir = Path(td) + current_state = _RUNNER._ObservedFileState(size=12, mtime_ns=34) + with mock.patch.object(_RUNNER.time, "time", return_value=100.0): + with mock.patch.object(_RUNNER.sys, "stdout", buf): + _RUNNER._print_ci_wait_status_snapshot( + run_dir=run_dir, + status={"ok": True, "running": True, "detail": "ignored"}, + baseline_state=None, + current_state=current_state, + last_status_err="transient", + ) + + text = buf.getvalue() + self.assertIn("[CI wait exit_code] status_snapshot", text) + self.assertIn('"ok": true', text) + self.assertIn('"running": true', text) + self.assertIn("baseline_exit_code_state=missing", text) + self.assertIn("current_exit_code_state=size=12 mtime_ns=34", text) + self.assertIn("last_status_err=transient", text) + def test_runner_stdio_mirror_enabled_only_for_github_actions(self) -> None: with 
mock.patch.dict(os.environ, {"GITHUB_ACTIONS": "true"}, clear=True): self.assertTrue(_RUNNER._runner_stdio_mirror_enabled()) diff --git a/setup_and_pack/tests/test_doc_site_builder_image_workflow.py b/setup_and_pack/tests/test_doc_site_builder_image_workflow.py index 159408a..6807c5d 100644 --- a/setup_and_pack/tests/test_doc_site_builder_image_workflow.py +++ b/setup_and_pack/tests/test_doc_site_builder_image_workflow.py @@ -55,10 +55,13 @@ def test_main_testbed_workflow_keeps_suite_generation_in_workflow(self) -> None: self.assertIn("largescale_mq_ci_single_host", workflow_text) self.assertIn("--owner-count", workflow_text) self.assertIn('"4"', workflow_text) + self.assertNotIn("--threads-per-process", workflow_text) + self.assertNotIn('"timeout_seconds": 3600', workflow_text) self.assertIn("--value-size", workflow_text) self.assertIn('"256"', workflow_text) self.assertIn("for producer_count, consumer_count in ((8, 8), (32, 32), (160, 8))", workflow_text) self.assertIn('"30"', workflow_text) + self.assertIn("nested largescale MQ diagnostics", workflow_text) self.assertIn("doc_site_base_url", workflow_text) self.assertIn("rather_no_git_submodule.py", workflow_text)